diff --git a/esnb.py b/esnb.py
index 8ee8d41..9787aa9 100644
--- a/esnb.py
+++ b/esnb.py
@@ -1,8 +1,8 @@
 import pandas as pd
 import numpy as np

-df1 = pd.read_csv('./data/output_sota.csv')
-df2 = pd.read_csv('./data/output_2d.csv')
+df1 = pd.read_csv('./output_dis_preprocess_re.csv')
+df2 = pd.read_csv('./output_mdeberta_preprocess_re.csv')

 a = np.array(df1)
 b = np.array(df2)
@@ -17,6 +17,6 @@


 df3 = pd.DataFrame({'id':a[:,0], 'target':total})


-df3.to_csv('./data/esnb.csv')
+df3.to_csv('./data/esnb_dis_mdeberta.csv')
diff --git a/infer.py b/infer.py
index 08fe656..385ecb9 100644
--- a/infer.py
+++ b/infer.py
@@ -4,7 +4,29 @@
 from tqdm import tqdm
 from transformers import AutoTokenizer
 from torch.utils.data import DataLoader
+from soynlp.normalizer import repeat_normalize
+from soynlp.tokenizer import RegexTokenizer
+tokenizer = RegexTokenizer()
+stopwords = pd.read_csv('./data/stopwords.csv',encoding='cp949')
+
+
+def preprocess_text(text):
+    # normalize repeated characters using soynlp library
+    text = repeat_normalize(text, num_repeats=2)
+    # remove stopwords
+    text = ' '.join([token for token in text.split() if not token in stopwords])
+    # remove special characters and numbers
+    #text = re.sub('[^가-힣 ]', '', text)
+    #text = re.sub('[^a-zA-Zㄱ-ㅎ가-힣]', '', text)
+    # tokenize text using soynlp tokenizer
+    tokens = tokenizer.tokenize(text)
+    # lowercase all tokens
+    tokens = [token.lower() for token in tokens]
+    # join tokens back into sentence
+    text = ' '.join(tokens)
+    #kospacing_sent = spacing(text)
+    return text

 class Infer_TextDataset(torch.utils.data.Dataset):
     def __init__(self, data_file, text_columns, target_columns=None, delete_columns=None, max_length=512, model_name='klue/roberta-small'):
         self.data = pd.read_csv(data_file)
@@ -23,7 +45,8 @@ def __len__(self):
     def tokenizing(self, dataframe):
         data=[]
         for idx, item in tqdm(dataframe.iterrows(), desc='Tokenizing', total=len(dataframe)):
-            text = '[SEP]'.join([item[text_column] for text_column in self.text_columns])
+            text = '[SEP]'.join([preprocess_text(item[text_column]) for text_column in self.text_columns])
+            print(text)
             outputs = self.tokenizer(text, add_special_tokens=True, padding='max_length', truncation=True, max_length=self.max_length)
             data.append(outputs['input_ids'])
         return data
@@ -35,7 +58,7 @@ def preprocessing(self, data):

 if __name__ == '__main__':
     device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-    model = transformers.AutoModelForSequenceClassification.from_pretrained('E:/nlp/checkpoint/best_acc_add_data/checkpoint-23625')
+    model = transformers.AutoModelForSequenceClassification.from_pretrained('E:/nlp/checkpoint/best_acc_mdeberta_preproceess_include_en/checkpoint-8162')
     model.to(device)
     test_textDataset = Infer_TextDataset('./data/test.csv',['sentence_1', 'sentence_2'],'label','binary-label',max_length=512,model_name="lighthouse/mdeberta-v3-base-kor-further")
     test_dataloader = DataLoader(dataset=test_textDataset,
@@ -51,8 +74,8 @@ def preprocessing(self, data):
             logits = y_pred.logits
             y_pred = logits.detach().cpu().numpy()
             score.extend(y_pred)
-    score = list(round(float(i), 1) for i in score)
+    score = list(float(i) for i in score)
     #predictions = list(round(float(i), 1) for i in torch.cat(output))
     output = pd.read_csv('./data/sample_submission.csv')
     output['target'] = score
-    output.to_csv('output_add.csv', index=False)
\ No newline at end of file
+    output.to_csv('pretest.csv', index=False)
\ No newline at end of file
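
Note on the preprocess_text added to infer.py: stopwords is loaded with pd.read_csv, so it is a DataFrame, and the test "token in stopwords" checks the DataFrame's column labels, not the stopword values; only a token that happens to equal the header would be removed. Below is a minimal sketch of set-based filtering, assuming ./data/stopwords.csv is a single column of words (the file's layout is not shown in this diff); the remove_stopwords name is hypothetical.

import pandas as pd

# Hypothetical sketch, not part of the patch: collect the stopword strings into a set
# so membership tests compare against the words themselves, not the column labels.
stopword_df = pd.read_csv('./data/stopwords.csv', encoding='cp949')  # assumed single-column file
stopwords = set(stopword_df.iloc[:, 0].astype(str))

def remove_stopwords(text):
    # keep whitespace-separated tokens that are not in the stopword set
    return ' '.join(token for token in text.split() if token not in stopwords)
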
diff --git a/main.py b/main.py
index f911045..f627810 100644
--- a/main.py
+++ b/main.py
@@ -1,6 +1,8 @@
 import os
 import pandas as pd
+from soynlp.normalizer import repeat_normalize
+from soynlp.tokenizer import RegexTokenizer
 from tqdm.auto import tqdm
 import transformers
 import torch

@@ -46,7 +48,8 @@ def __init__(self,state,data_file, text_columns, target_columns=None, delete_col
         self.max_length = max_length
         self.tokenizer = AutoTokenizer.from_pretrained(model_name)
         self.inputs, self.targets = self.preprocessing(self.data)
-
+        self.stopwords = pd.read_csv('./data/stopwords.csv', encoding='cp949')
+        self.Regextokenizer = RegexTokenizer()
     def __getitem__(self, idx):
         if len(self.targets) == 0:
             return torch.tensor(self.inputs[idx])
@@ -65,7 +68,7 @@ def tokenizing(self, dataframe):
         data = []
         for idx, item in tqdm(dataframe.iterrows(), desc='Tokenizing', total=len(dataframe)):
-            text = '[SEP]'.join([item[text_column] for text_column in self.text_columns])
+            text = '[SEP]'.join([self.preprocess_text(item[text_column]) for text_column in self.text_columns]) ## remove stopwords
             outputs = self.tokenizer(text, add_special_tokens=True,
                                      padding='max_length',
                                      truncation=True,
                                      max_length=self.max_length)
@@ -81,31 +84,52 @@ def preprocessing(self, data):
         inputs = self.tokenizing(data)
         return inputs, targets
-
+    def preprocess_text(self,text):
+        # normalize repeated characters using soynlp library
+        text = repeat_normalize(text, num_repeats=2)
+        # remove stopwords
+        text = ' '.join([token for token in text.split() if not token in stopwords])
+        # remove special characters and numbers
+        # text = re.sub('[^가-힣 ]', '', text)
+        # text = re.sub('[^a-zA-Zㄱ-ㅎ가-힣]', '', text)
+        # tokenize text using soynlp tokenizer
+        tokens = self.Regextokenizer.tokenize(text)
+        # lowercase all tokens
+        tokens = [token.lower() for token in tokens]
+        # join tokens back into sentence
+        text = ' '.join(tokens)
+        # kospacing_sent = spacing(text)
+        return text

 if __name__ == '__main__':
-    seed_everything(42)
-    model = AutoModelForSequenceClassification.from_pretrained("lighthouse/mdeberta-v3-base-kor-further",num_labels=1,ignore_mismatched_sizes=True)
+    seed_everything(43)
+    model = AutoModelForSequenceClassification.from_pretrained("monologg/koelectra-base-v3-discriminator",num_labels=1,ignore_mismatched_sizes=True)
     #model = AutoModelForSequenceClassification.from_pretrained("E:/nlp/checkpoint/best_acc/checkpoint-16317",num_labels=1,ignore_mismatched_sizes=True)
-    Train_textDataset = Train_val_TextDataset('train','./data/train.csv',['sentence_1', 'sentence_2'],'label','binary-label',max_length=512,model_name="lighthouse/mdeberta-v3-base-kor-further")
-    Val_textDataset = Train_val_TextDataset('val','./data/dev.csv',['sentence_1', 'sentence_2'],'label','binary-label',max_length=512,model_name="lighthouse/mdeberta-v3-base-kor-further")
+
+
+
+    Train_textDataset = Train_val_TextDataset('train','./data/train.csv',['sentence_1', 'sentence_2'],'label','binary-label',max_length=512,model_name="monologg/koelectra-base-v3-discriminator")
+    Val_textDataset = Train_val_TextDataset('val','./data/val.csv',['sentence_1', 'sentence_2'],'label','binary-label',max_length=512,model_name="monologg/koelectra-base-v3-discriminator")
+
+
+
     args = TrainingArguments(
-        "E:/nlp/checkpoint/best_acc_mdeberta",
+        "E:/nlp/checkpoint/best_acc_/discriminator_include_en_10epoch",
         evaluation_strategy = "epoch",
         save_strategy = "epoch",
-        learning_rate=0.00002340865224868444, #0.000005
-        per_device_train_batch_size=8,
-        per_device_eval_batch_size=8,
-        num_train_epochs=8,
+        learning_rate=0.00002860270719188072, #0.000005
+        per_device_train_batch_size=16,
+        per_device_eval_batch_size=16,
+        num_train_epochs=10,
         weight_decay=0.5,
         load_best_model_at_end=True,
         dataloader_num_workers = 4,
         logging_steps=200,
-        seed = 42
+        seed = 43
     )

     trainer = Trainer(
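
Note on the Train_val_TextDataset changes in main.py: self.stopwords and self.Regextokenizer are assigned after self.preprocessing(self.data) has already run in __init__, while the new preprocess_text method (called from tokenizing during that preprocessing) uses self.Regextokenizer and a bare stopwords name. As far as these hunks show, constructing the dataset would raise an AttributeError, and the bare stopwords would be a NameError unless main.py defines it at module level elsewhere. A minimal ordering sketch follows, with a hypothetical class name and the same single-column stopword-file assumption as above.

import pandas as pd
from soynlp.normalizer import repeat_normalize
from soynlp.tokenizer import RegexTokenizer


class PreprocessOrderingSketch:
    # Illustration only: helpers used during preprocessing are created first,
    # and preprocess_text reads the stopwords through self.
    def __init__(self, texts):
        self.Regextokenizer = RegexTokenizer()
        self.stopwords = set(pd.read_csv('./data/stopwords.csv', encoding='cp949').iloc[:, 0].astype(str))
        # both attributes above exist by the time preprocessing runs
        self.inputs = [self.preprocess_text(t) for t in texts]

    def preprocess_text(self, text):
        text = repeat_normalize(text, num_repeats=2)
        text = ' '.join(t for t in text.split() if t not in self.stopwords)
        tokens = self.Regextokenizer.tokenize(text)
        return ' '.join(t.lower() for t in tokens)
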
diff --git a/preprocess.py b/preprocess.py
index 360cea6..bf9fe26 100644
--- a/preprocess.py
+++ b/preprocess.py
@@ -1,39 +1,54 @@
 import pandas as pd
 import re
-import urllib.request
+import requests
 from soynlp.normalizer import repeat_normalize
 from soynlp.tokenizer import RegexTokenizer
-
+from konlpy.tag import Hannanum
+from pykospacing import Spacing
 # download Korean stopwords file from provided link
-stopword_url = 'https://www.ranks.nl/stopwords/korean'
-with urllib.request.urlopen(stopword_url) as response:
-    stopwords = response.read().decode().splitlines()
+
+
 # create Korean tokenizer using soynlp library
 tokenizer = RegexTokenizer()
+spacing = Spacing()
+# create Korean stemmer
+
+stopwords = pd.read_csv('./data/stopwords.csv',encoding='cp949')

 def preprocess_text(text):
     # normalize repeated characters using soynlp library
     text = repeat_normalize(text, num_repeats=2)
+    # remove stopwords
+    text = ' '.join([token for token in text.split() if not token in stopwords])
     # remove special characters and numbers
-    text = re.sub('[^가-힣ㄱ-ㅎㅏ-ㅣ\\s]', '', text)
+    #text = re.sub('[^가-힣 ]', '', text)
+    #text = re.sub('[^a-zA-Zㄱ-ㅎ가-힣]', '', text)
     # tokenize text using soynlp tokenizer
     tokens = tokenizer.tokenize(text)
-    # remove stopwords
-    tokens = [token for token in tokens if not token in stopwords]
+    # lowercase all tokens
+    tokens = [token.lower() for token in tokens]
     # join tokens back into sentence
     text = ' '.join(tokens)
+    #kospacing_sent = spacing(text)
     return text

 # load csv data
 data = pd.read_csv('./data/train.csv')
-# remove null values
-data = data.dropna()
+# drop rows with NaN values in sentence_1 column

 # preprocess sentence_1 and sentence_2 columns
 data['sentence_1'] = data['sentence_1'].apply(lambda x: preprocess_text(x))
 data['sentence_2'] = data['sentence_2'].apply(lambda x: preprocess_text(x))
+data = data.dropna(subset=['sentence_1'])
+data = data.dropna(subset=['sentence_2'])
+
 # save preprocessed data to csv
-data.to_csv('preprocessed_data.csv', index=False)
\ No newline at end of file
+data.to_csv('./data/preprocessed_train_data_sin_v2.csv', index=False)
+
+data = pd.read_csv('./data/preprocessed_train_data_sin_v2.csv')
+data = data.dropna(subset=['sentence_1'])
+data = data.dropna(subset=['sentence_2'])
+data.to_csv('./data/preprocessed_train_data_sin_v2_filter_.csv', index=False)
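
Note on preprocess.py: the dropna calls now run after preprocess_text has been applied, so any empty cell in train.csv would reach preprocess_text first (repeat_normalize expects a string); and rows whose sentences become empty strings after stopword removal are not caught by dropna until the file is written and re-read (empty CSV fields come back as NaN), which appears to be what the final read/dropna/write pass is compensating for. A sketch of the same outcome in one pass, assuming the preprocess_text defined in this file is in scope:

import pandas as pd

# Sketch, not part of the patch: drop missing rows before preprocessing, then drop
# rows whose sentences end up empty, without the intermediate write/read round trip.
data = pd.read_csv('./data/train.csv')
data = data.dropna(subset=['sentence_1', 'sentence_2'])
data['sentence_1'] = data['sentence_1'].apply(preprocess_text)
data['sentence_2'] = data['sentence_2'].apply(preprocess_text)
kept = (data['sentence_1'].str.strip() != '') & (data['sentence_2'].str.strip() != '')
data[kept].to_csv('./data/preprocessed_train_data_sin_v2_filter_.csv', index=False)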