Showing 4 changed files with 176 additions and 21 deletions.
@@ -0,0 +1,145 @@
import os
import random

import numpy as np
import pandas as pd
import torch
from scipy.stats import pearsonr
from soynlp.normalizer import repeat_normalize
from soynlp.tokenizer import RegexTokenizer
from tqdm.auto import tqdm
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments

def compute_pearson_correlation(pred):
    # Flatten predictions and gold labels, then report Pearson's r as the eval metric.
    preds = pred.predictions.flatten()
    labels = pred.label_ids.flatten()
    return {"pearson_correlation": pearsonr(preds, labels)[0]}

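
# A hypothetical, standalone sanity check for the metric above; the helper name and
# the sample values are illustrative only, and nothing in the training flow calls it.
def _metric_sanity_check():
    preds = np.array([0.1, 2.4, 3.0, 4.8])
    labels = np.array([0.0, 2.5, 3.2, 5.0])
    # pearsonr returns (statistic, p-value); only the statistic matters here.
    return pearsonr(preds, labels)[0]  # close to 1.0 for strongly correlated scores
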
# Keep the stopword list as a set of words: membership tests against the raw
# DataFrame would check column names rather than the words themselves.
stopwords = set(pd.read_csv('./data/stopwords.csv', encoding='cp949').values.ravel())
regex_tokenizer = RegexTokenizer()


def seed_everything(seed):
    # Seed every RNG in play so runs are reproducible.
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    # benchmark must stay off for deterministic cuDNN behaviour.
    torch.backends.cudnn.benchmark = False


class Train_val_TextDataset(torch.utils.data.Dataset):
    def __init__(self, state, data_file, text_columns, target_columns=None,
                 delete_columns=None, max_length=512, model_name='klue/roberta-small'):
        self.state = state
        self.data = pd.read_csv(data_file)
        # Extra training data can be concatenated here for the 'train' split, e.g.:
        # if self.state == 'train':
        #     add_data = pd.read_csv('./data/preprocessed_data_sin_v2_filter.csv')
        #     self.data = pd.concat([self.data, add_data])
        self.text_columns = text_columns
        self.target_columns = target_columns if target_columns is not None else []
        self.delete_columns = delete_columns if delete_columns is not None else []
        self.max_length = max_length
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.inputs, self.targets = self.preprocessing(self.data)

    def __getitem__(self, idx):
        if len(self.targets) == 0:
            # Inference mode: no labels available.
            return torch.tensor(self.inputs[idx])
        else:
            return {"input_ids": torch.tensor(self.inputs[idx]), "labels": torch.tensor(self.targets[idx])}

    def __len__(self):
        return len(self.inputs)

    def remove_stopwords(self, text):
        words = text.split()
        words = [word for word in words if word not in stopwords]
        return ' '.join(words)

    def tokenizing(self, dataframe):
        data = []
        for idx, item in tqdm(dataframe.iterrows(), desc='Tokenizing', total=len(dataframe)):
            # Join the sentence pair with [SEP] after stopword removal and normalization.
            text = '[SEP]'.join([self.preprocess_text(item[text_column]) for text_column in self.text_columns])
            outputs = self.tokenizer(text, add_special_tokens=True, padding='max_length', truncation=True,
                                     max_length=self.max_length)
            data.append(outputs['input_ids'])
        return data

    def preprocessing(self, data):
        data = data.drop(columns=self.delete_columns)
        try:
            targets = data[self.target_columns].values.tolist()
        except KeyError:
            # No target column (e.g. unlabeled test data): return inputs only.
            targets = []
        inputs = self.tokenizing(data)
        return inputs, targets

    def preprocess_text(self, text):
        # Normalize repeated characters with soynlp.
        text = repeat_normalize(text, num_repeats=2)
        # Remove stopwords.
        text = ' '.join([token for token in text.split() if token not in stopwords])
        # Optionally strip special characters and numbers:
        # text = re.sub('[^a-zA-Zㄱ-ㅎ가-힣 ]', '', text)
        # Tokenize with the soynlp regex tokenizer and lowercase every token.
        tokens = regex_tokenizer.tokenize(text)
        tokens = [token.lower() for token in tokens]
        # Join the tokens back into a single sentence.
        text = ' '.join(tokens)
        # Optional spacing correction (requires pykospacing):
        # text = spacing(text)
        return text

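
# Illustrative helper, not called anywhere in the script: decode one tokenized item
# back to text to verify the '[SEP]'-joined, stopword-filtered input. The helper name
# is an assumption, and it expects a single regression label per row.
def _inspect_example(dataset, idx=0):
    item = dataset[idx]
    ids = item["input_ids"]
    # Drop padding before decoding so the printout stays readable.
    print(dataset.tokenizer.decode(ids[ids != dataset.tokenizer.pad_token_id]))
    print("label:", item["labels"].item())
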

if __name__ == '__main__':
    seed_everything(42)

    # Continue fine-tuning from an earlier mdeberta checkpoint; num_labels=1 gives a
    # single regression output for the sentence-pair score.
    model = AutoModelForSequenceClassification.from_pretrained(
        "E:/nlp/checkpoint/best_acc_mdeberta_preproceess_include_en/checkpoint-8162",
        num_labels=1, ignore_mismatched_sizes=True)
    # model = AutoModelForSequenceClassification.from_pretrained(
    #     "E:/nlp/checkpoint/best_acc/checkpoint-16317", num_labels=1, ignore_mismatched_sizes=True)

    Train_textDataset = Train_val_TextDataset('train', './data/train.csv', ['sentence_1', 'sentence_2'],
                                               'label', 'binary-label', max_length=512,
                                               model_name="lighthouse/mdeberta-v3-base-kor-further")
    Val_textDataset = Train_val_TextDataset('val', './data/dev.csv', ['sentence_1', 'sentence_2'],
                                            'label', 'binary-label', max_length=512,
                                            model_name="lighthouse/mdeberta-v3-base-kor-further")

    args = TrainingArguments(
        "E:/nlp/checkpoint/add_dev/deberta_preproceess_include_en",
        evaluation_strategy="epoch",
        save_strategy="epoch",
        learning_rate=0.000002340865224868444,  # 0.000005
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=2,
        weight_decay=0.5,
        load_best_model_at_end=True,
        dataloader_num_workers=4,
        logging_steps=200,
        seed=42,
    )

    trainer = Trainer(
        model,
        args,
        # Note: this run continues training on the dev split (matching the 'add_dev'
        # output directory); pass Train_textDataset here to train on the training split.
        train_dataset=Val_textDataset,
        eval_dataset=Val_textDataset,
        # tokenizer=tokenizer,
        compute_metrics=compute_pearson_correlation,
    )

    trainer.train()
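
    # Optional follow-up, a minimal sketch: report the dev-set Pearson once training
    # finishes. Trainer.evaluate() re-runs the eval loop on eval_dataset and prefixes
    # each metric name with "eval_".
    metrics = trainer.evaluate()
    print("dev pearson:", metrics["eval_pearson_correlation"])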