Commit fb3094a: label smoothing #22
Kim-Ju-won authored Apr 13, 2023
2 parents 07f771f + df9a596 commit fb3094a
Showing 4 changed files with 176 additions and 21 deletions.
6 changes: 3 additions & 3 deletions esnb.py
@@ -1,8 +1,8 @@
 import pandas as pd
 import numpy as np
 
-df1 = pd.read_csv('./output_dis_preprocess_re.csv')
-df2 = pd.read_csv('./output_mdeberta_preprocess_re.csv')
+df1 = pd.read_csv('./dev_add_dis_preproceess_include_en.csv')
+df2 = pd.read_csv('./dev_add_deberta_preproceess_include_en.csv')
 
 a = np.array(df1)
 b = np.array(df2)
Expand All @@ -17,6 +17,6 @@
 df3 = pd.DataFrame({'id':a[:,0],
                     'target':total})
 
-df3.to_csv('./data/esnb_dis_mdeberta.csv')
+df3.to_csv('./data/esnb_dis_mdeberta_add_dev.csv')
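
For orientation, esnb.py blends the per-pair predictions of the two fine-tuned models (the ELECTRA discriminator run and the mDeBERTa run) into a single submission file; this commit only repoints it at the dev-augmented prediction files. The collapsed middle of the file is not shown, so the blending rule in the sketch below is an assumption, as is the presence of 'id' and 'target' columns in the input CSVs.

# Hypothetical sketch of the two-file ensemble around the visible lines of esnb.py.
# The element-wise mean is an assumed blending rule, not taken from the diff.
import numpy as np
import pandas as pd

df1 = pd.read_csv('./dev_add_dis_preproceess_include_en.csv')
df2 = pd.read_csv('./dev_add_deberta_preproceess_include_en.csv')

pred1 = df1['target'].to_numpy(dtype=float)
pred2 = df2['target'].to_numpy(dtype=float)
total = (pred1 + pred2) / 2          # simple unweighted average of the two models

df3 = pd.DataFrame({'id': df1['id'], 'target': total})
df3.to_csv('./data/esnb_dis_mdeberta_add_dev.csv')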


7 changes: 4 additions & 3 deletions infer.py
@@ -11,6 +11,7 @@
 stopwords = pd.read_csv('./data/stopwords.csv',encoding='cp949')
 
 
+
 def preprocess_text(text):
     # normalize repeated characters using soynlp library
     text = repeat_normalize(text, num_repeats=2)
@@ -58,9 +59,9 @@ def preprocessing(self, data):
 if __name__ == '__main__':
     device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 
-    model = transformers.AutoModelForSequenceClassification.from_pretrained('E:/nlp/checkpoint/best_acc_mdeberta_preproceess_include_en/checkpoint-8162')
+    model = transformers.AutoModelForSequenceClassification.from_pretrained('E:/nlp/checkpoint/add_dev/discriminator_include_en_10epoch/checkpoint-70')
     model.to(device)
-    test_textDataset = Infer_TextDataset('./data/test.csv',['sentence_1', 'sentence_2'],'label','binary-label',max_length=512,model_name="lighthouse/mdeberta-v3-base-kor-further")
+    test_textDataset = Infer_TextDataset('./data/test.csv',['sentence_1', 'sentence_2'],None,None,max_length=512,model_name="monologg/koelectra-base-v3-discriminator")
     test_dataloader = DataLoader(dataset=test_textDataset,
                                  batch_size=4,
                                  num_workers=0,
@@ -78,4 +79,4 @@ def preprocessing(self, data):
     #predictions = list(round(float(i), 1) for i in torch.cat(output))
     output = pd.read_csv('./data/sample_submission.csv')
     output['target'] = score
-    output.to_csv('pretest.csv', index=False)
+    output.to_csv('dev_add_dis_preproceess_include_en.csv', index=False)
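
The visible tail of infer.py loads a KoELECTRA discriminator checkpoint, scores each test sentence pair, and writes the scores into the sample-submission template. The collapsed middle of the file is not shown, so the end-to-end sketch below is an assumption: it tokenizes the pairs directly instead of going through Infer_TextDataset, and it presumes a single regression output per pair.

# Hypothetical inference sketch consistent with the visible lines of infer.py.
# Paths and model name come from the diff; the loop itself is an assumption.
import pandas as pd
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
tokenizer = AutoTokenizer.from_pretrained("monologg/koelectra-base-v3-discriminator")
model = AutoModelForSequenceClassification.from_pretrained(
    'E:/nlp/checkpoint/add_dev/discriminator_include_en_10epoch/checkpoint-70')
model.to(device)
model.eval()

test = pd.read_csv('./data/test.csv')
score = []
with torch.no_grad():
    for s1, s2 in zip(test['sentence_1'], test['sentence_2']):
        enc = tokenizer(s1 + '[SEP]' + s2, truncation=True, max_length=512,
                        return_tensors='pt').to(device)
        logit = model(**enc).logits.squeeze()    # one similarity score per pair
        score.append(round(float(logit), 1))

output = pd.read_csv('./data/sample_submission.csv')
output['target'] = score
output.to_csv('dev_add_dis_preproceess_include_en.csv', index=False)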
39 changes: 24 additions & 15 deletions main.py
@@ -17,6 +17,9 @@
 import random
 import nltk
 from nltk.corpus import stopwords
+
+stopwords = pd.read_csv('./data/stopwords.csv',encoding='cp949')
+Regextokenizer = RegexTokenizer()
 def compute_pearson_correlation(pred):
     preds = pred.predictions.flatten()
     labels = pred.label_ids.flatten()
@@ -36,7 +39,8 @@ def seed_everything(seed):

 class Train_val_TextDataset(torch.utils.data.Dataset):
     def __init__(self,state,data_file, text_columns, target_columns=None, delete_columns=None, max_length=512, model_name='klue/roberta-small'):
-        if state == 'train':
+        self.state = state
+        if self.state == 'train':
             self.data = pd.read_csv(data_file)
             #self.add_data = pd.read_csv('./data/preprocessed_data_sin_v2_filter.csv')
             #self.data = pd.concat([self.data,self.add_data])
@@ -49,12 +53,24 @@ def __init__(self,state,data_file, text_columns, target_columns=None, delete_col
         self.tokenizer = AutoTokenizer.from_pretrained(model_name)
         self.inputs, self.targets = self.preprocessing(self.data)
         self.stopwords = pd.read_csv('./data/stopwords.csv', encoding='cp949')
-        self.Regextokenizer = RegexTokenizer()
 
     def __getitem__(self, idx):
         if len(self.targets) == 0:
             return torch.tensor(self.inputs[idx])
         else:
-            return {"input_ids": torch.tensor(self.inputs[idx]), "labels": torch.tensor(self.targets[idx])}
+            if self.state=='train':
+                target_val = self.targets[idx]
+                if self.delete_columns is not None and self.data.iloc[idx][self.delete_columns] == 1:
+                    if random.random() <= 0.2:
+                        target_val += random.uniform(0.0, 0.1)
+                else:
+                    if random.random() <= 0.2:
+                        target_val -= random.uniform(0.0, 0.1)
+
+                target_val = max(min(target_val, 5.0), 0.0)
+                return {"input_ids": torch.tensor(self.inputs[idx]), "labels": torch.tensor(target_val)}
+            else:
+                return {"input_ids": torch.tensor(self.inputs[idx]), "labels": torch.tensor(self.targets[idx])}
 
     def __len__(self):
         return len(self.inputs)
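
This __getitem__ change is the label smoothing the PR title refers to: during training, with probability 0.2 a random offset drawn from [0, 0.1] is added to the gold similarity score when the example's delete_columns field (binary-label) equals 1, and subtracted from it otherwise, then the result is clamped back into the valid 0-5 STS range. Unlike classification-style label smoothing, which mixes a one-hot target with a uniform distribution, this variant injects small random noise into a regression target. A standalone restatement of the rule, with an invented helper name and toy values:

# Illustrative restatement of the target-perturbation rule added above;
# smooth_target is a hypothetical helper, not a function in the repository.
import random

def smooth_target(target, binary_label, p=0.2, max_shift=0.1):
    if binary_label == 1:
        if random.random() <= p:
            target += random.uniform(0.0, max_shift)   # nudge high-similarity pairs up
    else:
        if random.random() <= p:
            target -= random.uniform(0.0, max_shift)   # nudge low-similarity pairs down
    return max(min(target, 5.0), 0.0)                  # keep the score inside [0, 5]

# Example: a pair labelled 3.8 with binary-label 1 stays at 3.8 about 80% of
# the time and lands somewhere in [3.8, 3.9] otherwise.
print(smooth_target(3.8, 1))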
@@ -67,9 +83,8 @@ def remove_stopwords(self, text):
     def tokenizing(self, dataframe):
         data = []
         for idx, item in tqdm(dataframe.iterrows(), desc='Tokenizing', total=len(dataframe)):
-
             text = '[SEP]'.join([self.preprocess_text(item[text_column]) for text_column in self.text_columns])
-            ## remove stopwords
+
             outputs = self.tokenizer(text, add_special_tokens=True, padding='max_length', truncation=True,
                                      max_length=self.max_length)
             data.append(outputs['input_ids'])
@@ -88,12 +103,12 @@ def preprocess_text(self,text):
         # normalize repeated characters using soynlp library
         text = repeat_normalize(text, num_repeats=2)
         # remove stopwords
-        text = ' '.join([token for token in text.split() if not token in stopwords])
+        #text = ' '.join([token for token in text.split() if not token in stopwords])
         # remove special characters and numbers
         # text = re.sub('[^가-힣 ]', '', text)
         # text = re.sub('[^a-zA-Zㄱ-ㅎ가-힣]', '', text)
         # tokenize text using soynlp tokenizer
-        tokens = self.Regextokenizer.tokenize(text)
+        tokens = Regextokenizer.tokenize(text)
         # lowercase all tokens
         tokens = [token.lower() for token in tokens]
         # join tokens back into sentence
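
preprocess_text now skips the stopword filter and calls the module-level RegexTokenizer, while keeping soynlp's repeat normalization. A short demonstration of the two soynlp utilities, with invented example strings:

# Small demo of the soynlp calls used in preprocess_text; inputs are made up.
from soynlp.normalizer import repeat_normalize
from soynlp.tokenizer import RegexTokenizer

print(repeat_normalize('와하하하하하하하하하핫', num_repeats=2))   # -> '와하하핫'
tokenizer = RegexTokenizer()
print(tokenizer.tokenize('재밌다ㅋㅋ 진짜 good'))                  # splits runs of Korean, jamo, and Latin characters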
@@ -108,17 +123,11 @@ def preprocess_text(self,text):
     #model = AutoModelForSequenceClassification.from_pretrained("E:/nlp/checkpoint/best_acc/checkpoint-16317",num_labels=1,ignore_mismatched_sizes=True)
 
 
-
-
     Train_textDataset = Train_val_TextDataset('train','./data/train.csv',['sentence_1', 'sentence_2'],'label','binary-label',max_length=512,model_name="monologg/koelectra-base-v3-discriminator")
-    Val_textDataset = Train_val_TextDataset('val','./data/val.csv',['sentence_1', 'sentence_2'],'label','binary-label',max_length=512,model_name="monologg/koelectra-base-v3-discriminator")
-
-
-
-
+    Val_textDataset = Train_val_TextDataset('val','./data/dev.csv',['sentence_1', 'sentence_2'],'label','binary-label',max_length=512,model_name="monologg/koelectra-base-v3-discriminator")
 
     args = TrainingArguments(
-        "E:/nlp/checkpoint/best_acc_/discriminator_include_en_10epoch",
+        "E:/nlp/checkpoint/best_acc_/koelectra-labelsoomthing_0.4_0.1_0.2",
         evaluation_strategy = "epoch",
         save_strategy = "epoch",
         learning_rate=0.00002860270719188072, #0.000005
145 changes: 145 additions & 0 deletions train_dev.py
@@ -0,0 +1,145 @@
import os

import pandas as pd
from soynlp.normalizer import repeat_normalize
from soynlp.tokenizer import RegexTokenizer
from tqdm.auto import tqdm
import transformers
import torch
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer, AutoModelForPreTraining
from transformers import ElectraModel, ElectraTokenizer
import torch
import pandas as pd
from tqdm import tqdm
from transformers import AutoTokenizer
import numpy as np
from scipy.stats import pearsonr
import random
import nltk
from nltk.corpus import stopwords
def compute_pearson_correlation(pred):
    preds = pred.predictions.flatten()
    labels = pred.label_ids.flatten()
    return {"pearson_correlation": pearsonr(preds, labels)[0]}

stopwords = pd.read_csv('./data/stopwords.csv',encoding='cp949')
RegexTokenizer = RegexTokenizer()
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True



class Train_val_TextDataset(torch.utils.data.Dataset):
    def __init__(self,state,data_file, text_columns, target_columns=None, delete_columns=None, max_length=512, model_name='klue/roberta-small'):
        self.state = state
        if self.state == 'train':
            self.data = pd.read_csv(data_file)
            #self.add_data = pd.read_csv('./data/preprocessed_data_sin_v2_filter.csv')
            #self.data = pd.concat([self.data,self.add_data])
        else:
            self.data = pd.read_csv(data_file)
        self.text_columns = text_columns
        self.target_columns = target_columns if target_columns is not None else []
        self.delete_columns = delete_columns if delete_columns is not None else []
        self.max_length = max_length
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.inputs, self.targets = self.preprocessing(self.data)
        self.stopwords = pd.read_csv('./data/stopwords.csv', encoding='cp949')

    def __getitem__(self, idx):
        if len(self.targets) == 0:
            return torch.tensor(self.inputs[idx])
        else:
            return {"input_ids": torch.tensor(self.inputs[idx]), "labels": torch.tensor(self.targets[idx])}

    def __len__(self):
        return len(self.inputs)

    def remove_stopwords(self, text):
        words = text.split()
        words = [word for word in words if word not in stopwords]
        return ' '.join(words)

    def tokenizing(self, dataframe):
        data = []
        for idx, item in tqdm(dataframe.iterrows(), desc='Tokenizing', total=len(dataframe)):
            text = '[SEP]'.join([self.preprocess_text(item[text_column]) for text_column in self.text_columns])
            ## remove stopwords
            outputs = self.tokenizer(text, add_special_tokens=True, padding='max_length', truncation=True,
                                     max_length=self.max_length)
            data.append(outputs['input_ids'])
        return data

    def preprocessing(self, data):
        data = data.drop(columns=self.delete_columns)
        try:
            targets = data[self.target_columns].values.tolist()
        except:
            targets = []
        inputs = self.tokenizing(data)
        return inputs, targets

    def preprocess_text(self,text):
        # normalize repeated characters using soynlp library
        text = repeat_normalize(text, num_repeats=2)
        # remove stopwords
        text = ' '.join([token for token in text.split() if not token in stopwords])
        # remove special characters and numbers
        # text = re.sub('[^가-힣 ]', '', text)
        # text = re.sub('[^a-zA-Zㄱ-ㅎ가-힣]', '', text)
        # tokenize text using soynlp tokenizer
        tokens = RegexTokenizer.tokenize(text)
        # lowercase all tokens
        tokens = [token.lower() for token in tokens]
        # join tokens back into sentence
        text = ' '.join(tokens)
        # kospacing_sent = spacing(text)
        return text

if __name__ == '__main__':

    seed_everything(42)
    model = AutoModelForSequenceClassification.from_pretrained("E:/nlp/checkpoint/best_acc_mdeberta_preproceess_include_en/checkpoint-8162",num_labels=1,ignore_mismatched_sizes=True)
    #model = AutoModelForSequenceClassification.from_pretrained("E:/nlp/checkpoint/best_acc/checkpoint-16317",num_labels=1,ignore_mismatched_sizes=True)




    Train_textDataset = Train_val_TextDataset('train','./data/train.csv',['sentence_1', 'sentence_2'],'label','binary-label',max_length=512,model_name="lighthouse/mdeberta-v3-base-kor-further")
    Val_textDataset = Train_val_TextDataset('val','./data/dev.csv',['sentence_1', 'sentence_2'],'label','binary-label',max_length=512,model_name="lighthouse/mdeberta-v3-base-kor-further")





    args = TrainingArguments(
        "E:/nlp/checkpoint/add_dev/deberta_preproceess_include_en",
        evaluation_strategy = "epoch",
        save_strategy = "epoch",
        learning_rate=0.000002340865224868444, #0.000005
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=2,
        weight_decay=0.5,
        load_best_model_at_end=True,
        dataloader_num_workers = 4,
        logging_steps=200,
        seed = 42
    )

    trainer = Trainer(
        model,
        args,
        train_dataset=Val_textDataset,
        eval_dataset=Val_textDataset,
        #tokenizer=tokenizer,
        compute_metrics=compute_pearson_correlation
    )

    trainer.train()
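
train_dev.py mirrors the training setup in main.py but resumes from the earlier mDeBERTa checkpoint and, judging by train_dataset=Val_textDataset, deliberately fine-tunes on the dev split (./data/dev.csv) for two more epochs. The quality metric is the Pearson correlation between predicted and gold similarity scores; the toy check below shows how compute_pearson_correlation behaves, using an invented stand-in for the EvalPrediction object that Trainer passes in.

# Toy check of the evaluation metric defined above; values are invented.
import numpy as np
from scipy.stats import pearsonr

class FakePred:                                   # stand-in for transformers' EvalPrediction
    predictions = np.array([[0.1], [2.4], [4.9]]) # model outputs, one value per pair
    label_ids = np.array([0.0, 2.5, 5.0])         # gold similarity scores

def compute_pearson_correlation(pred):
    preds = pred.predictions.flatten()
    labels = pred.label_ids.flatten()
    return {"pearson_correlation": pearsonr(preds, labels)[0]}

print(compute_pearson_correlation(FakePred()))    # ~0.9997 for these nearly linear values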
