Commit
[score 0.9302] snunlp/KR-Medium / best dataset #46
jun048098 committed Apr 19, 2023
1 parent 4ce20cb commit cde4500
Showing 2 changed files with 136 additions and 81 deletions.
152 changes: 90 additions & 62 deletions main.py
@@ -1,8 +1,6 @@
import os

import pandas as pd
from soynlp.normalizer import repeat_normalize
from soynlp.tokenizer import RegexTokenizer
from tqdm.auto import tqdm
import transformers
import torch
@@ -15,62 +13,75 @@
import numpy as np
from scipy.stats import pearsonr
import random
import nltk
from nltk.corpus import stopwords
# import nltk
# from nltk.corpus import stopwords
import re
from soynlp.normalizer import repeat_normalize
from soynlp.tokenizer import RegexTokenizer
from datetime import datetime
# from konlpy.tag import Hannanum TODO

stopwords = pd.read_csv('./data/stopwords.csv',encoding='cp949')
Regextokenizer = RegexTokenizer()
def compute_pearson_correlation(pred):
preds = pred.predictions.flatten()
labels = pred.label_ids.flatten()
return {"pearson_correlation": pearsonr(preds, labels)[0]}


def seed_everything(seed):
random.seed(seed)
os.environ['PYTHONHASHSEED'] = str(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cudnn.deterministic = True
random.seed(seed) # seed Python's random module
os.environ['PYTHONHASHSEED'] = str(seed) # seed Python's hash function
np.random.seed(seed) # seed NumPy's random module
torch.manual_seed(seed) # seed PyTorch's CPU RNG
torch.cuda.manual_seed(seed) # seed PyTorch's CUDA RNG
torch.backends.cudnn.deterministic = True # make cuDNN use deterministic algorithms so runs are reproducible
# Let cuDNN benchmark convolution algorithms and pick the fastest one for the observed input sizes.
# This speeds things up when input shapes are fixed, though it can work against strict reproducibility.
torch.backends.cudnn.benchmark = True



class Train_val_TextDataset(torch.utils.data.Dataset):
def __init__(self,state,data_file, text_columns, target_columns=None, delete_columns=None, max_length=128, model_name='klue/roberta-small'):
def __init__(self, state, data_file, text_columns, target_columns=None, delete_columns=None, max_length=512, model_name='klue/roberta-small'):
"""
클래스의 생성자 메서드로, 데이터셋을 초기화한다.
Args:
state (string): 데이터 셋의 상태
data_file (string): 데이터 파일의 경로
text_columns (list): 텍스트 데이터를 갖는 column들의 이름
target_columns (string or list, optional): 레이블 데이터를 갖는 column의 이름
delete_columns (string or list, optional): 제거할 column의 이름
"""
self.state = state
if self.state == 'train':
if state == 'train':
self.data = pd.read_csv(data_file)
#self.add_data = pd.read_csv('./data/preprocessed_data_sin_v2_filter.csv')
#self.data = pd.concat([self.data,self.add_data])
else:
else: # state == val
self.data = pd.read_csv(data_file)
self.text_columns = text_columns
self.target_columns = target_columns if target_columns is not None else []
self.delete_columns = delete_columns if delete_columns is not None else []
self.max_length = max_length
self.tokenizer = AutoTokenizer.from_pretrained(model_name)
self.inputs, self.targets = self.preprocessing(self.data)
self.stopwords = pd.read_csv('./data/stopwords.csv', encoding='cp949')

# create Korean stemmer and lemmatizer
# self.stemmer = Hannanum() TODO

# Download a Korean stopwords file
# wget -O korean_stopwords.txt https://www.ranks.nl/stopwords/korean

# Read the file contents and build the stopwords list
# with open('korean_stopwords.txt', 'r', encoding='utf-8') as f:
# stopwords = f.read().splitlines()

# self.stopwords = stopwords

def __getitem__(self, idx):
if len(self.targets) == 0:
return torch.tensor(self.inputs[idx])
else:
if self.state=='train':
target_val = self.targets[idx]
if self.delete_columns is not None and self.data.iloc[idx][self.delete_columns] == 1:
if random.random() <= 0.3:
target_val += random.uniform(0.1, 0.4)
else:
if random.random() <= 0.3:
target_val -= random.uniform(0.1, 0.4)

target_val = max(min(target_val, 5.0), 0.0)
return {"input_ids": torch.tensor(self.inputs[idx]), "labels": torch.tensor(target_val)}
else:
return {"input_ids": torch.tensor(self.inputs[idx]), "labels": torch.tensor(self.targets[idx])}
return {"input_ids": torch.tensor(self.inputs[idx]), "labels": torch.tensor(self.targets[idx])}

def __len__(self):
return len(self.inputs)
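As an aside, a minimal standalone sketch of the label-jitter idea used in __getitem__ above, assuming labels sit on the 0-5 STS scale; the helper name jitter_label is illustrative and not part of this commit:

import random

def jitter_label(label, is_binary_positive, p=0.3):
    """With probability p, nudge the label up for binary-positive pairs and down otherwise,
    then clip back into the 0..5 range (mirrors the augmentation in __getitem__)."""
    if random.random() <= p:
        delta = random.uniform(0.1, 0.4)
        label = label + delta if is_binary_positive else label - delta
    return max(min(label, 5.0), 0.0)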
@@ -80,69 +91,86 @@ def remove_stopwords(self, text):
words = [word for word in words if word not in stopwords]
return ' '.join(words)

def preprocess_text_wrapper(self, text_list):
text1, text2 = text_list
return self.preprocess_text(text1), self.preprocess_text(text2)

# def preprocess_text(self, text):
# # create Korean tokenizer using soynlp library
# # tokenizer = RegexTokenizer()

# # Normalize characters repeated more than twice
# text = repeat_normalize(text, num_repeats=2)
# # Remove stopwords
# # text = ' '.join([token for token in text.split() if not token in stopwords])
# # Convert uppercase to lowercase
# text = text.lower()
# # Replace "<PERSON>" with "사람"
# text = re.sub('<PERSON>', '사람', text)
# # Remove everything except Hangul characters, lowercase Latin letters, and whitespace
# text = re.sub('[^가-힣a-z\\s]', '', text)
# # Split the text into tokens, e.g. "안녕하세요" -> "안녕", "하", "세요"
# # tokens = tokenizer.tokenize(text)
# # Stemming
# # tokens = [self.stemmer.morphs(token)[0] for token in text.split()]
# # join tokens back into sentence
# # text = ' '.join(tokens)
# return text

def tokenizing(self, dataframe):
data = []
for idx, item in tqdm(dataframe.iterrows(), desc='Tokenizing', total=len(dataframe)):
# text = '[SEP]'.join([item[text_column] for text_column in self.text_columns])
text = '[SEP]'.join([self.preprocess_text(item[text_column]) for text_column in self.text_columns])


text = '[SEP]'.join([item[text_column] for text_column in self.text_columns])
## remove stopwords
outputs = self.tokenizer(text, add_special_tokens=True, padding='max_length', truncation=True,
max_length=self.max_length)

data.append(outputs['input_ids'])
return data
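Note that tokenizing joins the two sentences with a literal '[SEP]' string before encoding. A hedged alternative, not what this commit does, is to pass the pair to the tokenizer and let it insert the model's own special tokens; the sentences below are placeholders:

from transformers import AutoTokenizer

# Sketch only: pair encoding via the tokenizer's two-argument form.
tok = AutoTokenizer.from_pretrained("snunlp/KR-ELECTRA-discriminator")
enc = tok("첫 번째 문장입니다.", "두 번째 문장입니다.",
          add_special_tokens=True, padding='max_length',
          truncation=True, max_length=256)
input_ids = enc["input_ids"]  # [CLS] sentence_1 [SEP] sentence_2 [SEP] + padding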

def preprocessing(self, data):
data = data.drop(columns=self.delete_columns)
try:
targets = data[self.target_columns].values.tolist()
if self.state == "train":
targets = data[self.target_columns].values.tolist()
else:
targets = data[self.target_columns].values.tolist()
except:
targets = []

inputs = self.tokenizing(data)
return inputs, targets

def preprocess_text(self,text):
# normalize repeated characters using soynlp library
text = str(text)
# text = repeat_normalize(str(text), num_repeats=2)
# remove stopwords
#text = ' '.join([token for token in text.split() if not token in stopwords])
# remove special characters and numbers
# text = re.sub('[^가-힣 ]', '', text)
# text = re.sub('[^a-zA-Zㄱ-ㅎ가-힣]', '', text)
# tokenize text using soynlp tokenizer
# tokens = Regextokenizer.tokenize(text)
# lowercase all tokens
# tokens = [token.lower() for token in tokens]
# join tokens back into sentence
# text = ' '.join(tokens)
# kospacing_sent = spacing(text)
return text
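For context, a hedged sketch of what the currently commented-out preprocessing would look like if re-enabled (soynlp repeat normalization plus regex tokenization); this pipeline is not active in the committed preprocess_text:

import re
from soynlp.normalizer import repeat_normalize
from soynlp.tokenizer import RegexTokenizer

_regex_tokenizer = RegexTokenizer()

def preprocess_text_full(text):
    """Sketch of the full pipeline hinted at by the commented-out lines above."""
    text = repeat_normalize(str(text), num_repeats=2)  # collapse runs of repeated characters to two
    text = text.lower()                                # lowercase Latin characters
    text = re.sub('<PERSON>', '사람', text)             # replace the anonymization tag
    text = re.sub('[^가-힣a-z\\s]', '', text)           # keep Hangul, lowercase Latin, whitespace
    tokens = _regex_tokenizer.tokenize(text)           # soynlp regex tokenization
    return ' '.join(tokens)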


if __name__ == '__main__':

seed_everything(43)
model = AutoModelForSequenceClassification.from_pretrained("monologg/koelectra-base-v3-discriminator",num_labels=1,ignore_mismatched_sizes=True)
seed_everything(42)
model_name = "snunlp/KR-ELECTRA-discriminator"
train_data_name = './data/best_data_v1.csv'
dev_data_name = './data/dev.csv'

model = AutoModelForSequenceClassification.from_pretrained(model_name,num_labels=1,ignore_mismatched_sizes=True)
#model = AutoModelForSequenceClassification.from_pretrained("E:/nlp/checkpoint/best_acc/checkpoint-16317",num_labels=1,ignore_mismatched_sizes=True)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

Train_textDataset = Train_val_TextDataset('train',train_data_name,['sentence_1', 'sentence_2'],'label','binary-label',max_length=256,model_name=model_name)
Val_textDataset = Train_val_TextDataset('val',dev_data_name,['sentence_1', 'sentence_2'],'label','binary-label',max_length=256,model_name=model_name)

Train_textDataset = Train_val_TextDataset('train','./data/train_arg_hanspell_shuffle.csv',['sentence_1', 'sentence_2'],'label','binary-label',max_length=128,model_name="monologg/koelectra-base-v3-discriminator")
Val_textDataset = Train_val_TextDataset('val','./data/dev.csv',['sentence_1', 'sentence_2'],'label','binary-label',max_length=128,model_name="monologg/koelectra-base-v3-discriminator")

args = TrainingArguments(
"E:/nlp/checkpoint/best_acc_/monologg/koelectra-base-v3-discriminator_train_arg_hanspell_shuffle",
f'checkpoint/{model_name}/{train_data_name}/{datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")}',
evaluation_strategy = "epoch",
save_strategy = "epoch",
learning_rate=0.00002860270719188072, #0.000005
per_device_train_batch_size=16,
per_device_eval_batch_size=16,
num_train_epochs=10,
learning_rate = 0.000018234535374473915,
per_device_train_batch_size=4,
per_device_eval_batch_size=4,
num_train_epochs=8,
weight_decay=0.5,
load_best_model_at_end=True,
dataloader_num_workers = 4,
logging_steps=200,
seed = 43
seed = 42
)

trainer = Trainer(
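The Trainer construction is truncated in this diff. A minimal sketch of how it is typically assembled from the objects defined above, using the standard transformers Trainer keywords; the exact kwargs in the commit may differ:

# Sketch only -- the actual call is cut off in the diff view above.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=Train_textDataset,
    eval_dataset=Val_textDataset,
    compute_metrics=compute_pearson_correlation,
)
trainer.train()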
65 changes: 46 additions & 19 deletions wandb_tuning.py
@@ -1,6 +1,3 @@
import os
import random

import pandas as pd
from tqdm.auto import tqdm
import transformers
@@ -10,11 +7,15 @@
import torch
import pandas as pd
from tqdm import tqdm
from transformers import AutoTokenizer
from transformers import AutoTokenizer, AutoModelForMaskedLM
import numpy as np
from scipy.stats import pearsonr
import random
import os
from datetime import datetime
import wandb


def seed_everything(seed):
random.seed(seed)
os.environ['PYTHONHASHSEED'] = str(seed)
@@ -23,6 +24,7 @@ def seed_everything(seed):
torch.cuda.manual_seed(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = True

def compute_pearson_correlation(pred):
preds = pred.predictions.flatten()
labels = pred.label_ids.flatten()
@@ -41,6 +43,7 @@ def __init__(self, data_file, text_columns, target_columns=None, delete_columns=
def __getitem__(self, idx):
if len(self.targets) == 0:
return torch.tensor(self.inputs[idx])

else:
return {"input_ids": torch.tensor(self.inputs[idx]), "labels": torch.tensor(self.targets[idx])}

@@ -72,39 +75,61 @@ def preprocessing(self, data):
wandb.login(key=key_wandb)

sweep_config = {
'method': 'random'
'method': 'bayes',
'metric': {
'goal': 'maximize',
'name': 'val_pearson'
},
}

# hyperparameters
parameters_dict = {
'epochs': {
'value': 8
'values': [8]
},
'batch_size': {
'values': [2,4,8,16]
'values': [4]
},
# 'learning_rate': {
# 'distribution': 'log_uniform_values',
# 'min': 0.000018234535374473915, # 0.00002
# 'max': 0.000018234535374473915 # 0.00003
# # 4~4.5
# },
'learning_rate': {
'distribution': 'log_uniform_values',
'min': 8e-6,
'max': 5e-5
'values': [0.000018234535374473915]
},
# 'warmup_steps': {
# 'values': [0, 400, 800]
# },
'weight_decay': {
'values': [ i/10 for i in range(2,6)]
'values': [0.5]
# 'values': ['linear', 'cosine']
},
}
sweep_config['parameters'] = parameters_dict
sweep_id = wandb.sweep(sweep_config, project="nlp04/korean_sentiment_analysis_dataset3")
sweep_id = wandb.sweep(sweep_config, project="snunlp_KR-ELECTRA-discriminator")
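The call that actually launches the sweep is further down in the file (truncated here). A hedged sketch of the usual launch, with an illustrative run count:

# Sketch only: start an agent that repeatedly calls the train() function defined later in this file.
# count=10 is an illustrative value, not taken from the commit.
wandb.agent(sweep_id, function=train, count=10)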

model_name = "snunlp/KR-ELECTRA-discriminator"
# model_name = "snunlp/KR-ELECTRA-discriminator"
# model_name = "monologg/koelectra-base-v3-discriminator"
# model_name = "lighthouse/mdeberta-v3-base-kor-further"
# model_name = "jhn9803/Contract-new-tokenizer-mDeBERTa-v3-kor-further"

train_data_name = 'best_data_v1.csv'
max_length = 256 # 512

# model = AutoModelForSequenceClassification.from_pretrained(model_name,num_labels=1,ignore_mismatched_sizes=True)

model = AutoModelForSequenceClassification.from_pretrained("nlp04/korean_sentiment_analysis_dataset3",num_labels=1,ignore_mismatched_sizes=True)

#model = transformers.AutoModelForSequenceClassification.from_pretrained(
# 'C:/Users/tm011/PycharmProjects/NLP_COMP/checkpoint/checkpoint-6993')
Train_textDataset = Train_val_TextDataset('./data/train.csv',['sentence_1', 'sentence_2'],'label','binary-label',max_length=512,model_name="nlp04/korean_sentiment_analysis_dataset3")
Val_textDataset = Train_val_TextDataset('./data/dev.csv',['sentence_1', 'sentence_2'],'label','binary-label',max_length=512,model_name="nlp04/korean_sentiment_analysis_dataset3")
Train_textDataset = Train_val_TextDataset(f'./data/{train_data_name}',['sentence_1', 'sentence_2'],'label','binary-label',max_length=max_length,model_name=model_name)
Val_textDataset = Train_val_TextDataset('./data/dev.csv',['sentence_1', 'sentence_2'],'label','binary-label',max_length=max_length,model_name=model_name)


def model_init():
model = AutoModelForSequenceClassification.from_pretrained("nlp04/korean_sentiment_analysis_dataset3",num_labels=1,ignore_mismatched_sizes=True)
model = AutoModelForSequenceClassification.from_pretrained(model_name,num_labels=1,ignore_mismatched_sizes=True)
return model


@@ -113,18 +138,20 @@ def train(config=None):
config = wandb.config
t = config.learning_rate
args = TrainingArguments(
f"E:/nlp/checkpoint/baseline_Test_fine_{t}",
f'param_sweep/checkpoint/snunlp/KR-ELECTRA-discriminator',
evaluation_strategy="epoch",
save_strategy="epoch",
save_strategy= 'epoch',
# save_strategy="no",
report_to='wandb',
learning_rate=config.learning_rate,
per_device_train_batch_size=config.batch_size,
per_device_eval_batch_size=config.batch_size,
num_train_epochs=config.epochs,
weight_decay=config.weight_decay,
load_best_model_at_end=True,
# load_best_model_at_end=True,
dataloader_num_workers=4,
logging_steps=200,
seed=42
)
trainer = Trainer(
model_init = model_init,
