A 사전 모델 학습: 한국어 감정 정보가 포함된 단발성 대화 데이터셋.py
225 lines (191 loc) · 8.6 KB
#1. Colab environment setup
from google.colab import drive
drive.mount('/content/drive')
# Move into the working directory (placeholder path kept from the original)
%cd '/content/drive/Shareddrives/파일명'
!git clone https://github.com/SKTBrain/KoBERT.git
%cd 'KoBERT'
!pip install -r requirements.txt
!pip install .
# KoBERT encoder configuration (kept from the original for reference; not used
# directly below)
predefined_args = {
    'attention_cell': 'multi_head',
    'num_layers': 12,
    'units': 768,
    'hidden_size': 3072,
    'max_length': 512,
    'num_heads': 12,
    'scaled': True,
    'dropout': 0.1,
    'use_residual': True,
    'embed_size': 768,
    'embed_dropout': 0.1,
    'token_type_vocab_size': 2,
    'word_embed': None,
}
!pip install torch==1.7.0
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import gluonnlp as nlp
import numpy as np
from tqdm import tqdm, tqdm_notebook
# Reference: https://github.com/SKTBrain/KoBERT/tree/master/kobert_hf
import torch
from kobert.pytorch_kobert import get_kobert_model, get_pytorch_kobert_model
!pip install 'git+https://github.com/SKTBrain/KoBERT.git#egg=kobert_tokenizer&subdirectory=kobert_hf'
from kobert_tokenizer import KoBERTTokenizer
tokenizer = KoBERTTokenizer.from_pretrained('skt/kobert-base-v1')
bertmodel, vocab = get_kobert_model('skt/kobert-base-v1',tokenizer.vocab_file)
# Dummy inputs from the KoBERT README, kept as a quick sanity check of the encoder
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
print(bertmodel, vocab)
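# A minimal forward-pass sketch using the dummy tensors above (assuming bertmodel
# follows the tuple-returning BertModel interface that BERTClassifier relies on below):
# sequence_output, pooled_output = bertmodel(input_ids=input_ids,
#                                            attention_mask=input_mask,
#                                            token_type_ids=token_type_ids)
# sequence_output.shape  # -> (2, 3, 768); pooled_output.shape -> (2, 768)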
#2. Dataset preprocessing
# Load the data
import pandas as pd
chatbot_data = pd.read_excel('/content/drive/파일저장경로/한국어_단발성_대화_데이터셋.xlsx')
# Drop entries with NaN values, then map the 7 emotion words in the Emotion column to integers
chatbot_data = chatbot_data.dropna(subset=['Sentence', 'Emotion'])  # assumed column names, matching their use below
chatbot_data.loc[(chatbot_data['Emotion'] == "공포"), 'Emotion'] = 0  # fear      => 0
chatbot_data.loc[(chatbot_data['Emotion'] == "놀람"), 'Emotion'] = 1  # surprise  => 1
chatbot_data.loc[(chatbot_data['Emotion'] == "분노"), 'Emotion'] = 2  # anger     => 2
chatbot_data.loc[(chatbot_data['Emotion'] == "슬픔"), 'Emotion'] = 3  # sadness   => 3
chatbot_data.loc[(chatbot_data['Emotion'] == "중립"), 'Emotion'] = 4  # neutral   => 4
chatbot_data.loc[(chatbot_data['Emotion'] == "행복"), 'Emotion'] = 5  # happiness => 5
chatbot_data.loc[(chatbot_data['Emotion'] == "혐오"), 'Emotion'] = 6  # disgust   => 6
# data_list holds (sentence, emotion label) pairs
data_list = []
for q, label in zip(chatbot_data['Sentence'], chatbot_data['Emotion']):
    data = []
    data.append(q)
    data.append(str(label))
    data_list.append(data)
#3. Train Data & Test Data
#dataset_train:dataset_test = 7:3
from sklearn.model_selection import train_test_split
dataset_train, dataset_test = train_test_split(data_list, test_size=0.3, random_state=1234)
print(len(dataset_train)) # 27015
print(len(dataset_test)) # 11579
#4. Building KoBERT input data
# Define a BERTDataset class that tokenizes, integer-encodes, and pads each example
# into the form the KoBERT model expects as input.
# nlp.data.BERTSentenceTransform handles the tokenization and padding.
# Note (fix #2 from the original): vocab was added to __init__ to resolve a string error.
class BERTDataset(Dataset):
    def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer, vocab, max_len,
                 pad, pair):
        transform = nlp.data.BERTSentenceTransform(
            bert_tokenizer, max_seq_length=max_len, vocab=vocab, pad=pad, pair=pair)
        self.sentences = [transform([i[sent_idx]]) for i in dataset]
        self.labels = [np.int32(i[label_idx]) for i in dataset]

    def __getitem__(self, i):
        return (self.sentences[i] + (self.labels[i], ))

    def __len__(self):
        return (len(self.labels))
# Hyperparameter settings
# There is no single "correct" value for these; tune them toward whatever gives
# the best performance.
max_len = 64
batch_size = 64
warmup_ratio = 0.1
num_epochs = 10
max_grad_norm = 1
log_interval = 200
learning_rate = 5e-5
# Tokenization
# Passing vocab explicitly resolves the string error mentioned above
tok = tokenizer.tokenize
data_train = BERTDataset(dataset_train, 0, 1, tok, vocab, max_len, True, False)
data_test = BERTDataset(dataset_test, 0, 1, tok, vocab, max_len, True, False)
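# Quick structure check (illustrative): each dataset item is a 4-tuple of
# (token_ids, valid_length, segment_ids, label), which the training loop below unpacks.
# token_ids, valid_length, segment_ids, label = data_train[0]
# token_ids.shape  # -> (64,) after padding/truncation to max_len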
# Wrap the datasets in torch DataLoaders
train_dataloader = torch.utils.data.DataLoader(data_train, batch_size=batch_size, num_workers=5)
test_dataloader = torch.utils.data.DataLoader(data_test, batch_size=batch_size, num_workers=5)
#5. Building the KoBERT classifier
class BERTClassifier(nn.Module):
    def __init__(self,
                 bert,
                 hidden_size=768,
                 num_classes=7,  # adjust to the number of classes
                 dr_rate=None,
                 params=None):
        super(BERTClassifier, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate
        self.classifier = nn.Linear(hidden_size, num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)

    def gen_attention_mask(self, token_ids, valid_length):
        # Mask out padding positions beyond each sequence's valid length
        attention_mask = torch.zeros_like(token_ids)
        for i, v in enumerate(valid_length):
            attention_mask[i][:v] = 1
        return attention_mask.float()

    def forward(self, token_ids, valid_length, segment_ids):
        attention_mask = self.gen_attention_mask(token_ids, valid_length)
        _, pooler = self.bert(input_ids=token_ids, token_type_ids=segment_ids.long(),
                              attention_mask=attention_mask.float().to(token_ids.device))
        # Fix: fall back to the raw pooled output when no dropout rate is set,
        # so `out` is always defined
        out = self.dropout(pooler) if self.dr_rate else pooler
        return self.classifier(out)
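# Minimal shape check (an illustrative sketch, not in the original): the classifier
# maps the pooled (batch, 768) BERT output to (batch, 7) emotion logits.
# m = BERTClassifier(bertmodel, dr_rate=0.5)
# ids = torch.zeros(2, max_len, dtype=torch.long)
# vl = torch.tensor([10, 20])
# seg = torch.zeros(2, max_len, dtype=torch.long)
# m(ids, vl, seg).shape  # -> torch.Size([2, 7])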
# Use the GPU
device = torch.device("cuda:0")
#transformers
from transformers import AdamW
from transformers.optimization import get_cosine_schedule_with_warmup
# Instantiate the classifier on top of the BERT model
model = BERTClassifier(bertmodel, dr_rate=0.5).to(device)
# Set up the optimizer and learning-rate schedule
# Exclude biases and LayerNorm weights from weight decay
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
{'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
{'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
loss_fn = nn.CrossEntropyLoss()
t_total = len(train_dataloader) * num_epochs
warmup_step = int(t_total * warmup_ratio)
scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_step, num_training_steps=t_total)
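# Worked example (assuming the 27,015-sample training split above): with
# batch_size = 64, len(train_dataloader) = ceil(27015 / 64) = 423 batches, so
# t_total = 423 * 10 = 4230 steps and warmup_step = int(4230 * 0.1) = 423.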
# Accuracy helper: fraction of argmax predictions that match the labels
def calc_accuracy(X, Y):
    max_vals, max_indices = torch.max(X, 1)
    train_acc = (max_indices == Y).sum().data.cpu().numpy() / max_indices.size()[0]
    return train_acc
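# Quick check (illustrative): two samples over 7 classes, argmax picks classes
# 2 and 5, the labels are 2 and 3, so accuracy is 0.5.
# _X = torch.tensor([[0., 0., 9., 0., 0., 0., 0.],
#                    [0., 0., 0., 0., 0., 9., 0.]])
# _Y = torch.tensor([2, 3])
# calc_accuracy(_X, _Y)  # -> 0.5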
#6. Training the KoBERT model
for e in range(num_epochs):
    train_acc = 0.0
    test_acc = 0.0
    model.train()
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(train_dataloader)):
        optimizer.zero_grad()
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        label = label.long().to(device)
        out = model(token_ids, valid_length, segment_ids)
        loss = loss_fn(out, label)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        scheduler.step()  # update the learning-rate schedule
        train_acc += calc_accuracy(out, label)
        if batch_id % log_interval == 0:
            print("epoch {} batch id {} loss {} train acc {}".format(e+1, batch_id+1, loss.data.cpu().numpy(), train_acc / (batch_id+1)))
    print("epoch {} train acc {}".format(e+1, train_acc / (batch_id+1)))

    model.eval()
    with torch.no_grad():  # no gradients needed for evaluation
        for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):
            token_ids = token_ids.long().to(device)
            segment_ids = segment_ids.long().to(device)
            label = label.long().to(device)
            out = model(token_ids, valid_length, segment_ids)
            test_acc += calc_accuracy(out, label)
    print("epoch {} test acc {}".format(e+1, test_acc / (batch_id+1)))
# Save the trained model to a file (joblib pickles the whole model object;
# torch.save(model.state_dict(), ...) is the more common PyTorch idiom)
import joblib
joblib.dump(model, './모델명.pkl')
print(model)
# Load the saved model back from the file
import joblib
loaded_model = joblib.load('모델명.pkl')
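# Inference sketch (an assumed usage example, not part of the original file):
# classify a new sentence with the loaded model, reusing tok/vocab/max_len from above.
emotion_labels = ["공포", "놀람", "분노", "슬픔", "중립", "행복", "혐오"]  # index order matches the mapping in section 2

def predict_emotion(sentence):
    # BERTDataset expects (sentence, label) pairs, so attach a dummy label
    data = BERTDataset([[sentence, '0']], 0, 1, tok, vocab, max_len, True, False)
    loader = torch.utils.data.DataLoader(data, batch_size=1)
    loaded_model.eval()
    with torch.no_grad():
        for token_ids, valid_length, segment_ids, _ in loader:
            out = loaded_model(token_ids.long().to(device), valid_length,
                               segment_ids.long().to(device))
            return emotion_labels[int(torch.argmax(out, dim=-1))]

# print(predict_emotion("오늘 정말 행복한 하루였어!"))  # hypothetical example sentence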