diff --git a/main.py b/main.py
index 64e8b42..804da04 100644
--- a/main.py
+++ b/main.py
@@ -1,8 +1,6 @@
 import os
 import pandas as pd
-from soynlp.normalizer import repeat_normalize
-from soynlp.tokenizer import RegexTokenizer
 from tqdm.auto import tqdm
 import transformers
 import torch
@@ -15,11 +13,14 @@
 import numpy as np
 from scipy.stats import pearsonr
 import random
-import nltk
-from nltk.corpus import stopwords
+# import nltk
+# from nltk.corpus import stopwords
+import re
+from soynlp.normalizer import repeat_normalize
+from soynlp.tokenizer import RegexTokenizer
+from datetime import datetime
+# from konlpy.tag import Hannanum  # TODO
 
-stopwords = pd.read_csv('./data/stopwords.csv',encoding='cp949')
-Regextokenizer = RegexTokenizer()
 def compute_pearson_correlation(pred):
     preds = pred.predictions.flatten()
     labels = pred.label_ids.flatten()
@@ -27,24 +28,35 @@ def compute_pearson_correlation(pred):
 
 
 def seed_everything(seed):
-    random.seed(seed)
-    os.environ['PYTHONHASHSEED'] = str(seed)
-    np.random.seed(seed)
-    torch.manual_seed(seed)
-    torch.cuda.manual_seed(seed)
-    torch.backends.cudnn.deterministic = True
+    random.seed(seed)  # seed Python's random module
+    os.environ['PYTHONHASHSEED'] = str(seed)  # seed Python's hash function
+    np.random.seed(seed)  # seed NumPy's random module
+    torch.manual_seed(seed)  # seed PyTorch's random number generators
+    torch.cuda.manual_seed(seed)  # seed PyTorch's CUDA (Compute Unified Device Architecture) operations
+    torch.backends.cudnn.deterministic = True  # make PyTorch's cuDNN (CUDA Deep Neural Network) library behave reproducibly
+    # benchmark lets cuDNN pick the best convolution algorithm for the input size;
+    # with deterministic=True, the same algorithm is chosen for identical inputs,
+    # so the same results can be reproduced.
    torch.backends.cudnn.benchmark = True
 
 class Train_val_TextDataset(torch.utils.data.Dataset):
-    def __init__(self,state,data_file, text_columns, target_columns=None, delete_columns=None, max_length=128, model_name='klue/roberta-small'):
+    def __init__(self, state, data_file, text_columns, target_columns=None, delete_columns=None, max_length=512, model_name='klue/roberta-small'):
+        """
+        Class constructor; initializes the dataset.
+        Args:
+            state (string): state of the dataset ('train' or 'val')
+            data_file (string): path to the data file
+            text_columns (list): names of the columns that hold the text data
+            target_columns (string or list, optional): name of the column that holds the labels
+            delete_columns (string or list, optional): name of the column to drop
+        """
         self.state = state
-        if self.state == 'train':
+        if state == 'train':
             self.data = pd.read_csv(data_file)
             #self.add_data = pd.read_csv('./data/preprocessed_data_sin_v2_filter.csv')
             #self.data = pd.concat([self.data,self.add_data])
-        else:
+        else:  # state == 'val'
             self.data = pd.read_csv(data_file)
         self.text_columns = text_columns
         self.target_columns = target_columns if target_columns is not None else []
@@ -52,25 +64,24 @@ def __init__(self,state,data_file, text_columns, target_columns=None, delete_col
         self.max_length = max_length
         self.tokenizer = AutoTokenizer.from_pretrained(model_name)
         self.inputs, self.targets = self.preprocessing(self.data)
-        self.stopwords = pd.read_csv('./data/stopwords.csv', encoding='cp949')
+
+        # create a Korean stemmer and lemmatizer
+        # self.stemmer = Hannanum()  # TODO
+
+        # download a Korean stopword file:
+        # wget -O korean_stopwords.txt https://www.ranks.nl/stopwords/korean
+
+        # read the file to build the stopword list
+        # with open('korean_stopwords.txt', 'r', encoding='utf-8') as f:
+        #     stopwords = f.read().splitlines()
+
+        # self.stopwords = stopwords
 
     def __getitem__(self, idx):
         if len(self.targets) == 0:
             return torch.tensor(self.inputs[idx])
         else:
-            if self.state=='train':
-                target_val = self.targets[idx]
-                if self.delete_columns is not None and self.data.iloc[idx][self.delete_columns] == 1:
-                    if random.random() <= 0.3:
-                        target_val += random.uniform(0.1, 0.4)
-                else:
-                    if random.random() <= 0.3:
-                        target_val -= random.uniform(0.1, 0.4)
-
-                target_val = max(min(target_val, 5.0), 0.0)
-                return {"input_ids": torch.tensor(self.inputs[idx]), "labels": torch.tensor(target_val)}
-            else:
-                return {"input_ids": torch.tensor(self.inputs[idx]), "labels": torch.tensor(self.targets[idx])}
+            return {"input_ids": torch.tensor(self.inputs[idx]), "labels": torch.tensor(self.targets[idx])}
 
     def __len__(self):
         return len(self.inputs)
@@ -80,69 +91,86 @@ def remove_stopwords(self, text):
         words = [word for word in words if word not in stopwords]
         return ' '.join(words)
 
+    def preprocess_text_wrapper(self, text_list):
+        text1, text2 = text_list
+        return self.preprocess_text(text1), self.preprocess_text(text2)
+
+    # def preprocess_text(self, text):
+    #     # create Korean tokenizer using soynlp library
+    #     # tokenizer = RegexTokenizer()
+
+    #     # normalize characters repeated two or more times
+    #     text = repeat_normalize(text, num_repeats=2)
+    #     # remove stopwords
+    #     # text = ' '.join([token for token in text.split() if not token in stopwords])
+    #     # lowercase the text
+    #     text = text.lower()
+    #     # replace "<PERSON>" with "사람" ("person")
+    #     text = re.sub('<PERSON>', '사람', text)
+    #     # remove every character except Korean characters, lowercase English letters, and whitespace
+    #     text = re.sub('[^가-힣a-z\\s]', '', text)
+    #     # split the text into tokens, e.g. "안녕하세요" -> "안녕", "하", "세요"
+    #     # tokens = tokenizer.tokenize(text)
+    #     # stemming
+    #     # tokens = [self.stemmer.morphs(token)[0] for token in text.split()]
+    #     # join tokens back into sentence
+    #     # text = ' '.join(tokens)
+    #     return text
+
     def tokenizing(self, dataframe):
         data = []
         for idx, item in tqdm(dataframe.iterrows(), desc='Tokenizing', total=len(dataframe)):
-            # text = '[SEP]'.join([item[text_column] for text_column in self.text_columns])
-            text = '[SEP]'.join([self.preprocess_text(item[text_column]) for text_column in self.text_columns])
-
+
+            text = '[SEP]'.join([item[text_column] for text_column in self.text_columns])
+            ## stopword removal
             outputs = self.tokenizer(text,
                                      add_special_tokens=True,
                                      padding='max_length',
                                      truncation=True,
                                      max_length=self.max_length)
-
             data.append(outputs['input_ids'])
         return data
 
     def preprocessing(self, data):
         data = data.drop(columns=self.delete_columns)
         try:
-            targets = data[self.target_columns].values.tolist()
+            if self.state == "train":
+                targets = data[self.target_columns].values.tolist()
+            else:
+                targets = data[self.target_columns].values.tolist()
         except:
             targets = []
-
         inputs = self.tokenizing(data)
         return inputs, targets
 
-    def preprocess_text(self,text):
-        # normalize repeated characters using soynlp library
-        text = str(text)
-        # text = repeat_normalize(str(text), num_repeats=2)
-        # remove stopwords
-        #text = ' '.join([token for token in text.split() if not token in stopwords])
-        # remove special characters and numbers
-        # text = re.sub('[^가-힣 ]', '', text)
-        # text = re.sub('[^a-zA-Zㄱ-ㅎ가-힣]', '', text)
-        # tokenize text using soynlp tokenizer
-        # tokens = Regextokenizer.tokenize(text)
-        # lowercase all tokens
-        # tokens = [token.lower() for token in tokens]
-        # join tokens back into sentence
-        # text = ' '.join(tokens)
-        # kospacing_sent = spacing(text)
-        return text
+
 
 if __name__ == '__main__':
-    seed_everything(43)
-    model = AutoModelForSequenceClassification.from_pretrained("monologg/koelectra-base-v3-discriminator",num_labels=1,ignore_mismatched_sizes=True)
+    seed_everything(42)
+    model_name = "snunlp/KR-ELECTRA-discriminator"
+    train_data_name = './data/best_data_v1.csv'
+    dev_data_name = './data/dev.csv'
+
+    model = AutoModelForSequenceClassification.from_pretrained(model_name,num_labels=1,ignore_mismatched_sizes=True)
     #model = AutoModelForSequenceClassification.from_pretrained("E:/nlp/checkpoint/best_acc/checkpoint-16317",num_labels=1,ignore_mismatched_sizes=True)
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    model.to(device)
+    Train_textDataset = Train_val_TextDataset('train',train_data_name,['sentence_1', 'sentence_2'],'label','binary-label',max_length=256,model_name=model_name)
+    Val_textDataset = Train_val_TextDataset('val',dev_data_name,['sentence_1', 'sentence_2'],'label','binary-label',max_length=256,model_name=model_name)
 
-    Train_textDataset = Train_val_TextDataset('train','./data/train_arg_hanspell_shuffle.csv',['sentence_1', 'sentence_2'],'label','binary-label',max_length=128,model_name="monologg/koelectra-base-v3-discriminator")
-    Val_textDataset = Train_val_TextDataset('val','./data/dev.csv',['sentence_1', 'sentence_2'],'label','binary-label',max_length=128,model_name="monologg/koelectra-base-v3-discriminator")
 
     args = TrainingArguments(
-        "E:/nlp/checkpoint/best_acc_/monologg/koelectra-base-v3-discriminator_train_arg_hanspell_shuffle",
+        f'checkpoint/{model_name}/{train_data_name}/{datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")}',
         evaluation_strategy = "epoch",
         save_strategy = "epoch",
-        learning_rate=0.00002860270719188072, #0.000005
-        per_device_train_batch_size=16,
-        per_device_eval_batch_size=16,
-        num_train_epochs=10,
+        learning_rate = 0.000018234535374473915,
+        per_device_train_batch_size=4,
+        per_device_eval_batch_size=4,
+        num_train_epochs=8,
         weight_decay=0.5,
         load_best_model_at_end=True,
         dataloader_num_workers = 4,
         logging_steps=200,
-        seed = 43
+        seed = 42
     )
 
     trainer = Trainer(
diff --git a/wandb_tuning.py b/wandb_tuning.py
index 79a1eb5..7f9fcdf 100644
--- a/wandb_tuning.py
+++ b/wandb_tuning.py
@@ -1,6 +1,3 @@
-import os
-import random
-
 import pandas as pd
 from tqdm.auto import tqdm
 import transformers
@@ -10,11 +7,15 @@ import torch
 import pandas as pd
 from tqdm import tqdm
-from transformers import AutoTokenizer
+from transformers import AutoTokenizer, AutoModelForMaskedLM
 import numpy as np
 from scipy.stats import pearsonr
+import random
+import os
+from datetime import datetime
 import wandb
+
 def seed_everything(seed):
     random.seed(seed)
     os.environ['PYTHONHASHSEED'] = str(seed)
@@ -23,6 +24,7 @@ def seed_everything(seed):
     torch.cuda.manual_seed(seed)
     torch.backends.cudnn.deterministic = True
     torch.backends.cudnn.benchmark = True
+
 def compute_pearson_correlation(pred):
     preds = pred.predictions.flatten()
     labels = pred.label_ids.flatten()
@@ -41,6 +43,7 @@ def __init__(self, data_file, text_columns, target_columns=None, delete_columns=
     def __getitem__(self, idx):
         if len(self.targets) == 0:
             return torch.tensor(self.inputs[idx])
+
         else:
             return {"input_ids": torch.tensor(self.inputs[idx]), "labels": torch.tensor(self.targets[idx])}
 
@@ -72,39 +75,61 @@ def preprocessing(self, data):
     wandb.login(key=key_wandb)
 
     sweep_config = {
-        'method': 'random'
+        'method': 'bayes',
+        'metric': {
+            'goal': 'maximize',
+            'name': 'val_pearson'
+        },
     }
 
     # hyperparameters
     parameters_dict = {
         'epochs': {
-            'value': 8
+            'values': [8]
         },
         'batch_size': {
-            'values': [2,4,8,16]
+            'values': [4]
         },
+        # 'learning_rate': {
+        #     'distribution': 'log_uniform_values',
+        #     'min': 0.000018234535374473915,  # 0.00002
+        #     'max': 0.000018234535374473915   # 0.00003
+        #     # 4~4.5
+        # },
         'learning_rate': {
-            'distribution': 'log_uniform_values',
-            'min': 8e-6,
-            'max': 5e-5
+            'values': [0.000018234535374473915]
         },
+        # 'warmup_steps': {
+        #     'values': [0, 400, 800]
+        # },
         'weight_decay': {
-            'values': [ i/10 for i in range(2,6)]
+            'values': [0.5]
+            # 'values': ['linear', 'cosine']
         },
     }
     sweep_config['parameters'] = parameters_dict
-    sweep_id = wandb.sweep(sweep_config, project="nlp04/korean_sentiment_analysis_dataset3")
+    sweep_id = wandb.sweep(sweep_config, project="snunlp_KR-ELECTRA-discriminator")
+
+    model_name = "snunlp/KR-ELECTRA-discriminator"
+    # model_name = "snunlp/KR-ELECTRA-discriminator"
+    # model_name = "monologg/koelectra-base-v3-discriminator"
+    # model_name = "lighthouse/mdeberta-v3-base-kor-further"
+    # model_name = "jhn9803/Contract-new-tokenizer-mDeBERTa-v3-kor-further"
+
+    train_data_name = 'best_data_v1.csv'
+    max_length = 256  # 512
+
+    # model = AutoModelForSequenceClassification.from_pretrained(model_name,num_labels=1,ignore_mismatched_sizes=True)
 
-    model = AutoModelForSequenceClassification.from_pretrained("nlp04/korean_sentiment_analysis_dataset3",num_labels=1,ignore_mismatched_sizes=True)
     #model = transformers.AutoModelForSequenceClassification.from_pretrained(
     #    'C:/Users/tm011/PycharmProjects/NLP_COMP/checkpoint/checkpoint-6993')
 
-    Train_textDataset = Train_val_TextDataset('./data/train.csv',['sentence_1', 'sentence_2'],'label','binary-label',max_length=512,model_name="nlp04/korean_sentiment_analysis_dataset3")
-    Val_textDataset = Train_val_TextDataset('./data/dev.csv',['sentence_1', 'sentence_2'],'label','binary-label',max_length=512,model_name="nlp04/korean_sentiment_analysis_dataset3")
+    Train_textDataset = Train_val_TextDataset(f'./data/{train_data_name}',['sentence_1', 'sentence_2'],'label','binary-label',max_length=max_length,model_name=model_name)
+    Val_textDataset = Train_val_TextDataset('./data/dev.csv',['sentence_1', 'sentence_2'],'label','binary-label',max_length=max_length,model_name=model_name)
 
     def model_init():
-        model = AutoModelForSequenceClassification.from_pretrained("nlp04/korean_sentiment_analysis_dataset3",num_labels=1,ignore_mismatched_sizes=True)
+        model = AutoModelForSequenceClassification.from_pretrained(model_name,num_labels=1,ignore_mismatched_sizes=True)
         return model
 
@@ -113,18 +138,20 @@ def train(config=None):
         config = wandb.config
         t = config.learning_rate
         args = TrainingArguments(
-            f"E:/nlp/checkpoint/baseline_Test_fine_{t}",
+            f'param_sweep/checkpoint/snunlp/KR-ELECTRA-discriminator',
             evaluation_strategy="epoch",
-            save_strategy="epoch",
+            save_strategy='epoch',
+            # save_strategy="no",
             report_to='wandb',
             learning_rate=config.learning_rate,
             per_device_train_batch_size=config.batch_size,
             per_device_eval_batch_size=config.batch_size,
             num_train_epochs=config.epochs,
             weight_decay=config.weight_decay,
-            load_best_model_at_end=True,
+            # load_best_model_at_end=True,
             dataloader_num_workers=4,
             logging_steps=200,
+            seed=42
         )
 
         trainer = Trainer(
             model_init = model_init,
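
Note on the metric shared by both files: every hunk above cuts off compute_pearson_correlation before its return statement. Below is a minimal sketch of what the visible lines plus the scipy import imply it computes; the metric dict key "pearson_correlation" is an assumption, not confirmed by this diff.

    import numpy as np
    from scipy.stats import pearsonr

    def compute_pearson_correlation(pred):
        # flatten model outputs and gold labels to 1-D, as in the diff
        preds = pred.predictions.flatten()
        labels = pred.label_ids.flatten()
        # pearsonr returns (correlation, p-value); Trainer expects a dict of metrics
        return {"pearson_correlation": pearsonr(preds, labels)[0]}

    if __name__ == "__main__":
        # quick sanity check with a dummy prediction object
        class DummyPred:
            predictions = np.array([[0.2], [2.4], [4.8]])
            label_ids = np.array([0.0, 2.5, 5.0])
        print(compute_pearson_correlation(DummyPred()))

For the bayes sweep to optimize on this metric, the 'name' in sweep_config['metric'] ('val_pearson' above) has to match the key actually logged to wandb; the Trainer logs eval metrics under an "eval/" prefix (e.g. "eval/pearson_correlation" for the key sketched here), so the two names may need to be reconciled.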