Commit

[Test 0.92222] Baseline in preprocessing code, monologg/koelectra-base-v3-discriminator #23
Kim-Ju-won authored Apr 13, 2023
2 parents b9b89c0 + fd6030b commit 07f771f
Showing 4 changed files with 93 additions and 31 deletions.
6 changes: 3 additions & 3 deletions esnb.py
@@ -1,8 +1,8 @@
import pandas as pd
import numpy as np

df1 = pd.read_csv('./data/output_sota.csv')
df2 = pd.read_csv('./data/output_2d.csv')
df1 = pd.read_csv('./output_dis_preprocess_re.csv')
df2 = pd.read_csv('./output_mdeberta_preprocess_re.csv')

a = np.array(df1)
b = np.array(df2)
@@ -17,6 +17,6 @@
df3 = pd.DataFrame({'id':a[:,0],
'target':total})

df3.to_csv('./data/esnb.csv')
df3.to_csv('./data/esnb_dis_mdeberta.csv')


31 changes: 27 additions & 4 deletions infer.py
@@ -4,7 +4,29 @@
from tqdm import tqdm
from transformers import AutoTokenizer
from torch.utils.data import DataLoader
from soynlp.normalizer import repeat_normalize
from soynlp.tokenizer import RegexTokenizer

tokenizer = RegexTokenizer()
# Build a set of stopword strings (assumed to be in the first column of the
# CSV) so the membership test below checks the words, not DataFrame column labels.
stopwords = set(pd.read_csv('./data/stopwords.csv', encoding='cp949').iloc[:, 0].astype(str))


def preprocess_text(text):
# normalize repeated characters using soynlp library
text = repeat_normalize(text, num_repeats=2)
# remove stopwords
text = ' '.join([token for token in text.split() if not token in stopwords])
# remove special characters and numbers
#text = re.sub('[^가-힣 ]', '', text)
#text = re.sub('[^a-zA-Zㄱ-ㅎ가-힣]', '', text)
# tokenize text using soynlp tokenizer
tokens = tokenizer.tokenize(text)
# lowercase all tokens
tokens = [token.lower() for token in tokens]
# join tokens back into sentence
text = ' '.join(tokens)
#kospacing_sent = spacing(text)
return text
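
# Illustration (added for clarity, not from the original commit): with
# num_repeats=2, soynlp's repeat_normalize collapses long character
# repetitions before stopword removal and tokenization, e.g.
# repeat_normalize('와하하하하하하하하핫', num_repeats=2) -> '와하하핫'.
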
class Infer_TextDataset(torch.utils.data.Dataset):
def __init__(self, data_file, text_columns, target_columns=None, delete_columns=None, max_length=512, model_name='klue/roberta-small'):
self.data = pd.read_csv(data_file)
@@ -23,7 +45,8 @@ def __len__(self):
def tokenizing(self, dataframe):
data=[]
for idx, item in tqdm(dataframe.iterrows(), desc='Tokenizing', total=len(dataframe)):
text = '[SEP]'.join([item[text_column] for text_column in self.text_columns])
text = '[SEP]'.join([preprocess_text(item[text_column]) for text_column in self.text_columns])
print(text)
outputs = self.tokenizer(text, add_special_tokens=True, padding='max_length', truncation=True, max_length=self.max_length)
data.append(outputs['input_ids'])
return data
@@ -35,7 +58,7 @@ def preprocessing(self, data):
if __name__ == '__main__':
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = transformers.AutoModelForSequenceClassification.from_pretrained('E:/nlp/checkpoint/best_acc_add_data/checkpoint-23625')
model = transformers.AutoModelForSequenceClassification.from_pretrained('E:/nlp/checkpoint/best_acc_mdeberta_preproceess_include_en/checkpoint-8162')
model.to(device)
test_textDataset = Infer_TextDataset('./data/test.csv',['sentence_1', 'sentence_2'],'label','binary-label',max_length=512,model_name="lighthouse/mdeberta-v3-base-kor-further")
test_dataloader = DataLoader(dataset=test_textDataset,
@@ -51,8 +74,8 @@ def preprocessing(self, data):
logits = y_pred.logits
y_pred = logits.detach().cpu().numpy()
score.extend(y_pred)
score = list(round(float(i), 1) for i in score)
score = list(float(i) for i in score)
#predictions = list(round(float(i), 1) for i in torch.cat(output))
output = pd.read_csv('./data/sample_submission.csv')
output['target'] = score
output.to_csv('output_add.csv', index=False)
output.to_csv('pretest.csv', index=False)
50 changes: 37 additions & 13 deletions main.py
@@ -1,6 +1,8 @@
import os

import pandas as pd
from soynlp.normalizer import repeat_normalize
from soynlp.tokenizer import RegexTokenizer
from tqdm.auto import tqdm
import transformers
import torch
@@ -46,7 +48,8 @@ def __init__(self,state,data_file, text_columns, target_columns=None, delete_col
self.max_length = max_length
self.tokenizer = AutoTokenizer.from_pretrained(model_name)
# Build the stopword set and the soynlp tokenizer before preprocessing(),
# which calls preprocess_text() and therefore needs both of them
# (the stopwords are assumed to be in the first column of the CSV).
self.stopwords = set(pd.read_csv('./data/stopwords.csv', encoding='cp949').iloc[:, 0].astype(str))
self.Regextokenizer = RegexTokenizer()
self.inputs, self.targets = self.preprocessing(self.data)
def __getitem__(self, idx):
if len(self.targets) == 0:
return torch.tensor(self.inputs[idx])
@@ -65,7 +68,7 @@ def tokenizing(self, dataframe):
data = []
for idx, item in tqdm(dataframe.iterrows(), desc='Tokenizing', total=len(dataframe)):

text = '[SEP]'.join([item[text_column] for text_column in self.text_columns])
text = '[SEP]'.join([self.preprocess_text(item[text_column]) for text_column in self.text_columns])
## remove stopwords
outputs = self.tokenizer(text, add_special_tokens=True, padding='max_length', truncation=True,
max_length=self.max_length)
@@ -81,31 +84,52 @@ def preprocessing(self, data):
inputs = self.tokenizing(data)
return inputs, targets


def preprocess_text(self,text):
# normalize repeated characters using soynlp library
text = repeat_normalize(text, num_repeats=2)
# remove stopwords
text = ' '.join([token for token in text.split() if token not in self.stopwords])
# remove special characters and numbers
# text = re.sub('[^가-힣 ]', '', text)
# text = re.sub('[^a-zA-Zㄱ-ㅎ가-힣]', '', text)
# tokenize text using soynlp tokenizer
tokens = self.Regextokenizer.tokenize(text)
# lowercase all tokens
tokens = [token.lower() for token in tokens]
# join tokens back into sentence
text = ' '.join(tokens)
# kospacing_sent = spacing(text)
return text

if __name__ == '__main__':

seed_everything(42)
model = AutoModelForSequenceClassification.from_pretrained("lighthouse/mdeberta-v3-base-kor-further",num_labels=1,ignore_mismatched_sizes=True)
seed_everything(43)
model = AutoModelForSequenceClassification.from_pretrained("monologg/koelectra-base-v3-discriminator",num_labels=1,ignore_mismatched_sizes=True)
#model = AutoModelForSequenceClassification.from_pretrained("E:/nlp/checkpoint/best_acc/checkpoint-16317",num_labels=1,ignore_mismatched_sizes=True)

Train_textDataset = Train_val_TextDataset('train','./data/train.csv',['sentence_1', 'sentence_2'],'label','binary-label',max_length=512,model_name="lighthouse/mdeberta-v3-base-kor-further")
Val_textDataset = Train_val_TextDataset('val','./data/dev.csv',['sentence_1', 'sentence_2'],'label','binary-label',max_length=512,model_name="lighthouse/mdeberta-v3-base-kor-further")



Train_textDataset = Train_val_TextDataset('train','./data/train.csv',['sentence_1', 'sentence_2'],'label','binary-label',max_length=512,model_name="monologg/koelectra-base-v3-discriminator")
Val_textDataset = Train_val_TextDataset('val','./data/val.csv',['sentence_1', 'sentence_2'],'label','binary-label',max_length=512,model_name="monologg/koelectra-base-v3-discriminator")





args = TrainingArguments(
"E:/nlp/checkpoint/best_acc_mdeberta",
"E:/nlp/checkpoint/best_acc_/discriminator_include_en_10epoch",
evaluation_strategy = "epoch",
save_strategy = "epoch",
learning_rate=0.00002340865224868444, #0.000005
per_device_train_batch_size=8,
per_device_eval_batch_size=8,
num_train_epochs=8,
learning_rate=0.00002860270719188072, #0.000005
per_device_train_batch_size=16,
per_device_eval_batch_size=16,
num_train_epochs=10,
weight_decay=0.5,
load_best_model_at_end=True,
dataloader_num_workers = 4,
logging_steps=200,
seed = 42
seed = 43
)
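# Note (added for clarity, not from the original commit): with evaluation and
# save strategies both set to "epoch" and load_best_model_at_end=True, the
# Trainer reloads the checkpoint with the lowest eval loss at the end of
# training, since metric_for_best_model is left at its default.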

trainer = Trainer(
37 changes: 26 additions & 11 deletions preprocess.py
@@ -1,39 +1,54 @@
import pandas as pd
import re
import urllib.request
import requests
from soynlp.normalizer import repeat_normalize
from soynlp.tokenizer import RegexTokenizer

from konlpy.tag import Hannanum
from pykospacing import Spacing
# download Korean stopwords file from provided link
stopword_url = 'https://www.ranks.nl/stopwords/korean'
with urllib.request.urlopen(stopword_url) as response:
stopwords = response.read().decode().splitlines()



# create Korean tokenizer using soynlp library
tokenizer = RegexTokenizer()
spacing = Spacing()
# create Korean stemmer

# Build a set of stopword strings (assumed to be in the first column of the
# CSV) so the membership tests below check the words, not DataFrame column labels.
stopwords = set(pd.read_csv('./data/stopwords.csv', encoding='cp949').iloc[:, 0].astype(str))

def preprocess_text(text):
# normalize repeated characters using soynlp library
text = repeat_normalize(text, num_repeats=2)
# remove stopwords
text = ' '.join([token for token in text.split() if not token in stopwords])
# remove special characters and numbers
text = re.sub('[^가-힣ㄱ-ㅎㅏ-ㅣ\\s]', '', text)
#text = re.sub('[^가-힣 ]', '', text)
#text = re.sub('[^a-zA-Zㄱ-ㅎ가-힣]', '', text)
# tokenize text using soynlp tokenizer
tokens = tokenizer.tokenize(text)
# remove stopwords
tokens = [token for token in tokens if not token in stopwords]
# lowercase all tokens
tokens = [token.lower() for token in tokens]
# join tokens back into sentence
text = ' '.join(tokens)
#kospacing_sent = spacing(text)
return text
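
# Illustration (added for clarity, not from the original commit): the character
# class [^가-힣ㄱ-ㅎㅏ-ㅣ\s] keeps only Hangul syllables, Hangul jamo and
# whitespace, so the substitution above strips Latin letters, digits and
# punctuation, e.g. 'GPT-4 모델 출시!!' -> ' 모델 출시'.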

# load csv data
data = pd.read_csv('./data/train.csv')

# remove null values
data = data.dropna()
# drop rows with NaN values in sentence_1 column

# preprocess sentence_1 and sentence_2 columns
data['sentence_1'] = data['sentence_1'].apply(lambda x: preprocess_text(x))
data['sentence_2'] = data['sentence_2'].apply(lambda x: preprocess_text(x))

data = data.dropna(subset=['sentence_1'])
data = data.dropna(subset=['sentence_2'])

# save preprocessed data to csv
data.to_csv('preprocessed_data.csv', index=False)
data.to_csv('./data/preprocessed_train_data_sin_v2.csv', index=False)

data = pd.read_csv('./data/preprocessed_train_data_sin_v2.csv')
data = data.dropna(subset=['sentence_1'])
data = data.dropna(subset=['sentence_2'])
data.to_csv('./data/preprocessed_train_data_sin_v2_filter_.csv', index=False)
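# Note (added for clarity, not from the original commit): re-reading the saved
# CSV and dropping NaNs again presumably filters out sentences that became
# empty strings during preprocessing, since pandas reads empty fields back in
# as NaN.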
