train.py
import logging
import os
import sys

from arguments import DataTrainingArguments, ModelArguments
from datasets import DatasetDict, load_from_disk
import evaluate
from trainer_qa import QuestionAnsweringTrainer
from transformers import (
    AutoConfig,
    AutoModelForQuestionAnswering,
    AutoTokenizer,
    DataCollatorWithPadding,
    EvalPrediction,
    TrainingArguments,
    set_seed,
)
from utils_qa import post_processing_function
from preprocessing.dataset_preprocessing import dataset_preprocessing

logger = logging.getLogger(__name__)

def train(model_args, data_args, training_args):
    print("do train:", training_args.do_train)
    print("do eval:", training_args.do_eval)

    # Set up logging.
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )

    # Verbosity setting: used as the info level for the Transformers logger (on the main process only).
    logger.info("Training/evaluation parameters %s", training_args)

    # Fix the random seed before initializing the model.
    set_seed(training_args.seed)

    datasets = load_from_disk(data_args.dataset_name)

    config = AutoConfig.from_pretrained(model_args.model_name_or_path)
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.model_name_or_path,
        # With use_fast=True the Rust-implemented tokenizer is used;
        # with False the Python implementation is used instead.
        # The Rust version is comparatively faster.
        use_fast=True,
    )
    # model = BaseQAModel(model_name, model_config)
    model = AutoModelForQuestionAnswering.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
    )
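    # Note: from_tf above makes from_pretrained() load TensorFlow checkpoint
    # weights when the model path points at a ".ckpt" file; otherwise the
    # regular PyTorch weights are loaded.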

    print(datasets)
    print(
        type(training_args),
        type(model_args),
        type(datasets),
        type(tokenizer),
        type(model),
    )

    run_mrc(data_args, training_args, model_args, datasets, tokenizer, model)

    # ############## Retrieval training
    # if data_args.train_retrieval:
    #     retriever = SparseRetrieval(
    #         tokenize_fn=tokenizer.tokenize,
    #         data_path="./data",
    #         context_path="wikipeida_documents.json"
    #     )
    #     retriever.get_sparse_embedding()
    # ##############

def run_mrc(
    data_args: DataTrainingArguments,
    training_args: TrainingArguments,
    model_args: ModelArguments,
    datasets: DatasetDict,
    tokenizer,
    model,
) -> None:
    train_dataset, eval_dataset, last_checkpoint = dataset_preprocessing(
        data_args, training_args, datasets, tokenizer
    )

    # Data collator
    # If the flag is True, the inputs are already padded to max length;
    # otherwise, the data collator has to apply the padding.
    data_collator = DataCollatorWithPadding(
        tokenizer, pad_to_multiple_of=8 if training_args.fp16 else None
    )
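    # pad_to_multiple_of=8 aligns sequence lengths with the tile sizes that
    # fp16 tensor cores prefer, which tends to speed up mixed-precision training.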

    metric = evaluate.load("squad")

    def compute_metrics(p: EvalPrediction):
        return metric.compute(predictions=p.predictions, references=p.label_ids)
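    # The "squad" metric expects predictions shaped like
    # [{"id": ..., "prediction_text": ...}] and references shaped like
    # [{"id": ..., "answers": {"text": [...], "answer_start": [...]}}].
    # post_processing_function (from utils_qa) is assumed to turn the model's
    # start/end logits into that format before compute_metrics runs.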
    # Initialize the Trainer
    trainer = QuestionAnsweringTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset if training_args.do_train else None,
        eval_dataset=eval_dataset if training_args.do_eval else None,
        eval_examples=datasets["validation"] if training_args.do_eval else None,
        tokenizer=tokenizer,
        data_collator=data_collator,
        post_process_function=post_processing_function,
        compute_metrics=compute_metrics,
        data_args=data_args,
    )

    # Training
    if training_args.do_train:
        if last_checkpoint is not None:
            checkpoint = last_checkpoint
        elif os.path.isdir(model_args.model_name_or_path):
            checkpoint = model_args.model_name_or_path
        else:
            checkpoint = None
        train_result = trainer.train(resume_from_checkpoint=checkpoint)
        trainer.save_model()  # Saves the tokenizer too for easy upload

        metrics = train_result.metrics
        metrics["train_samples"] = len(train_dataset)

        trainer.log_metrics("train", metrics)
        trainer.save_metrics("train", metrics)
        trainer.save_state()

        output_train_file = os.path.join(training_args.output_dir, "train_results.txt")
        with open(output_train_file, "w") as writer:
            logger.info("***** Train results *****")
            for key, value in sorted(train_result.metrics.items()):
                logger.info(f"  {key} = {value}")
                writer.write(f"{key} = {value}\n")

        # Save the Trainer state.
        trainer.state.save_to_json(
            os.path.join(training_args.output_dir, "trainer_state.json")
        )

    # Evaluation
    if training_args.do_eval:
        logger.info("*** Evaluate ***")
        metrics = trainer.evaluate()
        metrics["eval_samples"] = len(eval_dataset)

        trainer.log_metrics("eval", metrics)
        trainer.save_metrics("eval", metrics)