Outlook with Python: Mail Classification Training (1)

coding

by 서티제 2022. 2. 18. 22:00

Automatic classification (categorization) of mail.
Category classification training was carried out using 1,200 work Outlook e-mails.

Key facts about the training and the model
1) Total number of mails used for training and prediction: 1,200
2) Categories (classification): 11 in total
: 공유, 기타-기타, 일정-회의, 일정-교육, 일정-요청, 기타-인사, 기타-사외, 일정-세미나, 소식지, 일정-기타, 협조전
3) Category labels: 700 of the mails were categorized by hand in advance.
4) Split into train, test, and predict data: of the 700 labeled mails, 400 were used for training and
the remaining 300 for testing; to maximize accuracy, all 700 were ultimately used as training data
to build the final model.
5) Preprocessing of the mail body (mail.Body)
- Removed content judged unnecessary for training, such as people's names, the sender's signature block
(name, title, company, e-mail address, website), Korean particles, and so on.
- For forwarded mail, the content from before the forward was removed.
- This preprocessing is handled by a separate function (see the sketch after this list).
6) Training was done with TensorFlow and Keras.
7) The mail length fed to the model was capped at 400 words, which covers roughly 95% of mails
(assuming a normal distribution of mail lengths).
8) The vocabulary used by the model was limited to 1,165 words, selected by frequency of occurrence
and similar criteria, and a word-to-integer index for the mail body, ordered by frequency, was saved separately.
9) Separate programs were written for training (model creation) and for prediction
(a sketch of the predict side is given at the end of this post).
10) The word integer index from item 8 and the category integer index are saved and reloaded
with the shelve module.
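The MailControlFunc module and the saved training data are not shared in this post, so here is a minimal sketch of what the body preprocessing in item 5 might look like. The function name preprocess_mail_body, the marker strings, and the regular expressions are illustrative assumptions, not the actual implementation.

import re

def preprocess_mail_body(body, sign_words, name_list):
    # Hypothetical sketch of the preprocessing in item 5 (assumed, not the shared code)
    # Cut off the quoted original below a forwarded/replied-mail header
    body = re.split(r'-+ ?Original Message ?-+|보낸 사람:|From:', body)[0]
    kept_lines = []
    for line in body.splitlines():
        # Drop lines that look like part of a signature block
        if any(word in line for word in sign_words):
            continue
        kept_lines.append(line)
    text = '\n'.join(kept_lines)
    # Strip personal names judged unnecessary for training
    for name in name_list:
        text = text.replace(name, '')
    # Strip e-mail addresses and web addresses left over from signatures
    text = re.sub(r'\S+@\S+|https?://\S+|www\.\S+', '', text)
    return text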
Example training program  # Some of the function modules and the saved training data are not shared; please refer to this for the program structure only.
import MailControlFunc
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.models import load_model
import numpy as np
import shelve
import datetime


def first_second_largest_number(arr):
    # Return the largest and the second largest value in arr
    second = largest = -float('inf')
    for n in arr:
        if n > largest:
            second = largest
            largest = n
        elif second < n < largest:
            second = n
    return largest, second

Mails_Train = []
Mails_Predict = []
Mails_Test = []
f, n = MailControlFunc.makeKoreanNameStdList()
# keywords used to detect signature-block sentences
file = open('kindOfEmailSignWord_v1.txt', 'r')
fileRead = file.read()
kindOfSignWordSet = []
for word in fileRead.split(sep='\n'):
    kindOfSignWordSet.append(word)
file.close()
''' for train and predict '''
Mails = MailControlFunc.ReadMailFile('whole_inbox_mail_220104_v6_new_and_old_기타일정.txt',kindOfSignWordSet, f, n)
''' for Test '''
Mails_fr_0104_to_0117 = MailControlFunc.ReadMailFile('whole_inbox_mail_220104_21_v12_기타일정.txt',kindOfSignWordSet, f, n)
X_Train, Y_Train, X_Test, Y_Test, X_Predict, Y_Predict = [], [], [], [], [], []
X_Train_txt, Y_Train_txt, X_Test_txt, Y_Test_txt, X_Predict_txt, Y_Predict_txt = [], [], [], [], [], []
X_Train_int, Y_Train_int, X_Test_int, Y_Test_int, X_Predict_int, Y_Predict_int = [], [], [], [], [], []
mail_body_all =''
for mail in Mails:
    if mail.categories != '':
        mail_body_all = mail_body_all + mail.subject + '\n' + mail.mailbody + '\n'
        Mails_Train.append(mail)
        X_Train_txt.append(mail.subject + '\n' + mail.mailbody)
        Y_Train_txt.append(mail.categories)
    else:
        Mails_Predict.append(mail)
        X_Predict_txt.append(mail.subject + mail.mailbody)
        Y_Predict_txt.append(mail.categories)

for mail in Mails_fr_0104_to_0117:
    if mail.categories != '':
        Mails_Test.append(mail)
        X_Test_txt.append(mail.subject + '\n' + mail.mailbody)
        Y_Test_txt.append(mail.categories)
        # add more train data
        Mails_Train.append(mail)
        mail_body_all = mail_body_all + mail.subject + '\n' + mail.mailbody + '\n'
        X_Train_txt.append(mail.subject + '\n' + mail.mailbody)
        Y_Train_txt.append(mail.categories)

word_index_table = MailControlFunc.make_word_index_of_text(mail_body_all,kindOfSignWordSet, f, n)

for txt in X_Train_txt:
    temp = MailControlFunc.text_to_integer_encoding(txt, word_index_table)
    X_Train_int.append(temp)

for txt in X_Test_txt:
    temp = MailControlFunc.text_to_integer_encoding(txt, word_index_table)
    X_Test_int.append(temp)

for txt_1 in X_Predict_txt:
    temp = MailControlFunc.text_to_integer_encoding(txt_1, word_index_table)
    X_Predict_int.append(temp)

category_list = set(Y_Train_txt + Y_Test_txt)
cate_index_table = MailControlFunc.make_category_index(category_list)
for cate in Y_Train_txt:
    Y_Train_int.append(cate_index_table[cate])
for cate in Y_Test_txt:
    # print('test category: ', cate)
    Y_Test_int.append(cate_index_table[cate])

vocab_size = int(len(word_index_table)) + 1 # len is 4696 , 3896
print('index word limit quantity: ', vocab_size)
print('index word quantity(len(word_index_for_train): ', len(word_index_table))
max_len = 400 # mail length covering about 95% of mails (normal distribution)
embedding_dim = 128
hidden_units = 128
# preprocess mail body, padding
X_Train = pad_sequences(X_Train_int, maxlen=max_len)
X_Test = pad_sequences(X_Test_int, maxlen=max_len)
X_Predict = pad_sequences(X_Predict_int, maxlen=max_len)
Y_Train = to_categorical(Y_Train_int)
Y_Test = to_categorical(Y_Test_int)
num_classes = len(set(Y_Train_txt))
# modeling
model = Sequential()
print(num_classes)
model.add(Embedding(vocab_size, embedding_dim))
model.add(LSTM(hidden_units))
model.add(Dense(num_classes, activation='softmax'))
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=4)
mc = ModelCheckpoint('best_model_210.h5', monitor='val_acc', mode='max',
                     verbose=1, save_best_only=True)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])
history = model.fit(X_Train, Y_Train, batch_size=128, epochs=40, callbacks=[es, mc],
                    validation_data=(X_Test, Y_Test))
loaded_model = load_model('best_model_210.h5')
print("\n 테스트 정확도: %.4f" % (loaded_model.evaluate(X_Test, Y_Test)[1]))
epochs = range(1, len(history.history['acc']) + 1)

# change predict array to category string ..
cate_index_table_key, cate_index_table_value = [], []
cate_index_table_key = list(cate_index_table.keys())
cate_index_table_value = list(cate_index_table.values())
predicted_cate_count = {}

with shelve.open('word_cate_index_table_210', flag='c') as data_file:
    data_file['word_index_table'] = word_index_table
    data_file['cate_index_table'] = cate_index_table

# predict train category by loaded model
try:
    temp_Train_loaded_model = loaded_model.predict(np.array(X_Train))
    print("Loaded_model.predict : ")
except:
    # fall back to the in-memory model; keep the same variable name so the loop below still works
    temp_Train_loaded_model = model.predict(X_Train)
    print("model.predict : ")

Y_Train_for_predict_1st_txt = []
Y_Train_for_predict_2nd_txt = []
for idx, cate_int in enumerate(temp_Train_loaded_model):
    cate_int = list(cate_int)
    max_value, second_value = first_second_largest_number(cate_int)
    max_cate_temp = cate_index_table_key[cate_int.index(max_value)]
    second_cate_temp = cate_index_table_key[cate_int.index(second_value)]
    Y_Train_for_predict_1st_txt.append(max_cate_temp)
    Y_Train_for_predict_2nd_txt.append(second_cate_temp)

pd_Train_cate_int = pd.DataFrame(temp_Train_loaded_model.__array__())
# pd_predict_cate.to_excel('predict_cate_220119.xlsx')
pd_Train_txt = pd.DataFrame(X_Train_txt)
pd_Train_cate_txt = pd.DataFrame(Y_Train_txt)
pd_Y_Train_for_predict_1st_txt = pd.DataFrame(Y_Train_for_predict_1st_txt)
pd_Y_Train_for_predict_2nd_txt = pd.DataFrame(Y_Train_for_predict_2nd_txt)
pd_word_index_table = pd.DataFrame(word_index_table.keys())
pd_cate_index_table = pd.DataFrame(cate_index_table.keys())

Train_all_data = []
for idx in range(len(X_Train)):
    Train_all_data.append([Y_Train_txt[idx], Y_Train_for_predict_1st_txt[idx],
                           Y_Train_for_predict_2nd_txt[idx], X_Train_txt[idx]])
pd_Train_all_data = pd.DataFrame(Train_all_data)

todays = str(datetime.date.today()) + '-' + str(datetime.datetime.today().hour) + '-' \
         + str(datetime.datetime.today().minute)
# pd_predict_txt.to_excel('predict_txt_220119.xlsx')

with pd.ExcelWriter('Train_mail_mailbody_cate_no_reply_' + todays + '.xlsx') as writer:
    pd_Train_cate_txt.to_excel(writer, sheet_name="cate")
    pd_Train_txt.to_excel(writer, sheet_name="mailbody")
    pd_Train_cate_int.to_excel(writer, sheet_name="predicted_cate_int")
    pd_Y_Train_for_predict_1st_txt.to_excel(writer, sheet_name="predicted_cate_txt_1st")
    pd_Y_Train_for_predict_2nd_txt.to_excel(writer, sheet_name="predicted_cate_txt_2nd")
    pd_word_index_table.to_excel(writer, sheet_name="word_index")
    pd_cate_index_table.to_excel(writer, sheet_name="cate_index")
    pd_Train_all_data.to_excel(writer, sheet_name="all data")
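
The separate prediction program mentioned in item 9 is not included in this post. Below is a minimal sketch of how it might reload the index tables saved with shelve (item 10) and the saved model; the file names match the training script above, but the helper predict_category and the overall flow are assumptions, not the author's code.

# Hypothetical sketch of the separate predict program (item 9)
import shelve
import numpy as np
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
import MailControlFunc

# Reload the word and category integer indexes saved with shelve (item 10)
with shelve.open('word_cate_index_table_210', flag='r') as data_file:
    word_index_table = data_file['word_index_table']
    cate_index_table = data_file['cate_index_table']
index_to_cate = {v: k for k, v in cate_index_table.items()}

model = load_model('best_model_210.h5')

def predict_category(mail_text, max_len=400):
    # Encode one mail, pad to the training length, and return the top predicted category string
    encoded = MailControlFunc.text_to_integer_encoding(mail_text, word_index_table)
    padded = pad_sequences([encoded], maxlen=max_len)
    probs = model.predict(padded)[0]
    return index_to_cate[int(np.argmax(probs))]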
