From 594ea68f668ec077bdb1bc5aa152687f6d8a1d80 Mon Sep 17 00:00:00 2001
From: Advaith Rao
Date: Sun, 26 Nov 2023 00:05:43 -0500
Subject: [PATCH] Add use_aug parameter to 3 model trainer scripts

---
 pipelines/distilbert_trainer.py | 41 ++++++++++++++++++---------------
 pipelines/roberta_trainer.py    | 41 ++++++++++++++++++---------------
 pipelines/svm_trainer.py        | 29 ++++++++++++++++++++---
 3 files changed, 70 insertions(+), 41 deletions(-)

diff --git a/pipelines/distilbert_trainer.py b/pipelines/distilbert_trainer.py
index 60898ca..23af8c3 100644
--- a/pipelines/distilbert_trainer.py
+++ b/pipelines/distilbert_trainer.py
@@ -1,4 +1,4 @@
-#usage: python3 -m pipelines.distilbert_trainer --num_epochs 20 --batch_size 8 --num_labels 2 --device 'cuda' --save_path '/tmp' --model_name 'distilbert-base-uncased'
+#usage: python3 -m pipelines.distilbert_trainer --num_epochs 20 --batch_size 8 --num_labels 2 --device 'cuda' --save_path '/tmp' --model_name 'distilbert-base-uncased' --use_aug True
 
 import sys
 sys.path.append('..')
@@ -35,6 +35,7 @@ def parse_args():
     parser.add_argument("--num_epochs", "-e", type=int, default=40, help="Number of epochs")
     parser.add_argument("--batch_size", "-b", type=int, default=128, help="Batch size")
     parser.add_argument("--device", "-d", type=str, default='cpu', help="Device to train the model on: 'cpu', 'cuda' or 'gpu'")
+    parser.add_argument("--use_aug", "-u", type=bool, default=False, help="Whether to use data augmentation or not for training data balancing")
 
     return parser.parse_args()
 
 def load_data():
@@ -132,30 +133,31 @@ def train_model(train_data, hyper_params):
     # train_data = train_data[~((train_data['Label'] == 1) & (train_data['Body'].str.split().str.len() < 4))]
     # train_data = train_data.reset_index(drop=True)
 
-    augmentor = Augmentor()
+    if hyper_params['use_aug']:
+        augmentor = Augmentor()
 
-    train_body, train_labels = augmentor(
-        train_data['Body'].tolist(),
-        train_data['Label'].tolist(),
-        aug_label=1,
-        num_aug_per_label_1=9,
-        shuffle=True
-    )
+        train_body, train_labels = augmentor(
+            train_data['Body'].tolist(),
+            train_data['Label'].tolist(),
+            aug_label=1,
+            num_aug_per_label_1=9,
+            shuffle=True
+        )
 
-    _train_data = pd.DataFrame(
-        {
-            'Body': train_body,
-            'Label': train_labels
-        }
-    )
+        train_data = pd.DataFrame(
+            {
+                'Body': train_body,
+                'Label': train_labels
+            }
+        )
 
-    _train_data.drop_duplicates(subset=['Body'], inplace=True)
-    _train_data.reset_index(drop=True, inplace=True)
+        train_data.drop_duplicates(subset=['Body'], inplace=True)
+        train_data.reset_index(drop=True, inplace=True)
 
     # Call your code that produces output
     model.train(
-        body=_train_data['Body'],
-        label=_train_data['Label'],
+        body=train_data['Body'],
+        label=train_data['Label'],
         validation_size=0.2,
         wandb=run
     )
@@ -246,6 +248,7 @@ def dump_logs_to_wandb(hyper_params, f1_scores, save_path):
         'num_epochs': args.num_epochs,
         'batch_size': args.batch_size,
         'device': args.device,
+        'use_aug': args.use_aug,
     }
 
     # Log in to Weights and Biases
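Note on the new flag: argparse applies type=bool as a plain callable, and bool() of any non-empty string is True, so an invocation like "--use_aug False" would still enable augmentation; only omitting the flag keeps the default of False. Below is a minimal sketch of the store_true alternative, shown as an option rather than as what this patch does:

    import argparse

    # With store_true, the flag's presence means True and its absence means
    # False, so there is no string-to-bool conversion to get wrong.
    parser = argparse.ArgumentParser()
    parser.add_argument("--use_aug", "-u", action="store_true",
                        help="Whether to use data augmentation for training data balancing")

    print(parser.parse_args(["--use_aug"]).use_aug)  # True
    print(parser.parse_args([]).use_aug)             # False

Under this variant the usage lines would pass --use_aug with no value.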
diff --git a/pipelines/roberta_trainer.py b/pipelines/roberta_trainer.py
index 7d7997f..392bbcc 100644
--- a/pipelines/roberta_trainer.py
+++ b/pipelines/roberta_trainer.py
@@ -1,4 +1,4 @@
-#usage: python3 -m pipelines.roberta_trainer --num_epochs 20 --batch_size 8 --num_labels 2 --device 'cuda' --save_path '/tmp' --model_name 'roberta-base'
+#usage: python3 -m pipelines.roberta_trainer --num_epochs 20 --batch_size 8 --num_labels 2 --device 'cuda' --save_path '/tmp' --model_name 'roberta-base' --use_aug True
 
 import sys
 sys.path.append('..')
@@ -35,6 +35,7 @@ def parse_args():
     parser.add_argument("--num_epochs", "-e", type=int, default=40, help="Number of epochs")
     parser.add_argument("--batch_size", "-b", type=int, default=128, help="Batch size")
     parser.add_argument("--device", "-d", type=str, default='cpu', help="Device to train the model on: 'cpu', 'cuda' or 'gpu'")
+    parser.add_argument("--use_aug", "-u", type=bool, default=False, help="Whether to use data augmentation or not for training data balancing")
 
     return parser.parse_args()
 
 def load_data():
@@ -128,30 +129,31 @@ def train_model(train_data, hyper_params):
     run = wandb.init(config=hyper_params)
     model = RobertaModel(**hyper_params)
 
-    augmentor = Augmentor()
+    if hyper_params['use_aug']:
+        augmentor = Augmentor()
 
-    train_body, train_labels = augmentor(
-        train_data['Body'].tolist(),
-        train_data['Label'].tolist(),
-        aug_label=1,
-        num_aug_per_label_1=9,
-        shuffle=True
-    )
+        train_body, train_labels = augmentor(
+            train_data['Body'].tolist(),
+            train_data['Label'].tolist(),
+            aug_label=1,
+            num_aug_per_label_1=9,
+            shuffle=True
+        )
 
-    _train_data = pd.DataFrame(
-        {
-            'Body': train_body,
-            'Label': train_labels
-        }
-    )
+        train_data = pd.DataFrame(
+            {
+                'Body': train_body,
+                'Label': train_labels
+            }
+        )
 
-    _train_data.drop_duplicates(subset=['Body'], inplace=True)
-    _train_data.reset_index(drop=True, inplace=True)
+        train_data.drop_duplicates(subset=['Body'], inplace=True)
+        train_data.reset_index(drop=True, inplace=True)
 
     # Call your code that produces output
     model.train(
-        body=_train_data['Body'],
-        label=_train_data['Label'],
+        body=train_data['Body'],
+        label=train_data['Label'],
         validation_size=0.2,
         wandb=run
     )
@@ -242,6 +244,7 @@ def dump_logs_to_wandb(hyper_params, f1_scores, save_path):
         'num_epochs': args.num_epochs,
         'batch_size': args.batch_size,
         'device': args.device,
+        'use_aug': args.use_aug,
     }
 
     # Log in to Weights and Biases
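Note: hyper_params now carries use_aug, and the context above shows train_model expanding the whole dict into the model constructor (model = RobertaModel(**hyper_params)). If RobertaModel.__init__ does not accept a use_aug keyword (its signature is not visible in this patch), that call will raise a TypeError. A small, hypothetical guard that keeps the flag available to wandb.init(config=hyper_params) but drops it before **-expansion:

    # use_aug drives the branching in train_model and belongs in the wandb
    # config, but it may not be a constructor argument, so filter it out.
    hyper_params = {'num_epochs': 20, 'batch_size': 8, 'device': 'cuda', 'use_aug': True}
    model_params = {k: v for k, v in hyper_params.items() if k != 'use_aug'}
    # model = RobertaModel(**model_params)  # RobertaModel as imported in this file
    print(model_params)  # {'num_epochs': 20, 'batch_size': 8, 'device': 'cuda'}

The same consideration applies to the DistilBERT trainer if its model is constructed the same way.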
diff --git a/pipelines/svm_trainer.py b/pipelines/svm_trainer.py
index e4be1ff..0462267 100644
--- a/pipelines/svm_trainer.py
+++ b/pipelines/svm_trainer.py
@@ -1,4 +1,4 @@
-#usage: python3 -m pipelines.svm_trainer --num_labels 2 --C 10 --kernel 'rbf' --save_path '/tmp/model'
+#usage: python3 -m pipelines.svm_trainer --num_labels 2 --C 10 --kernel 'rbf' --save_path '/tmp/model' --use_aug True
 
 import sys
 sys.path.append('..')
@@ -14,7 +14,7 @@ from detector.labeler import EnronLabeler, MismatchLabeler
 from detector.modeler import SVMModel
 from detector.preprocessor import Preprocessor
-from utils.util_modeler import evaluate_and_log, get_f1_score
+from utils.util_modeler import evaluate_and_log, get_f1_score, Augmentor
 
 import wandb
 import argparse
@@ -33,6 +33,7 @@ def parse_args():
     parser.add_argument("--num_labels", "-l", type=int, default=2, help="Number of labels")
     parser.add_argument("--C", "-C", type=int, default=1, help="Regularization parameter")
     parser.add_argument("--kernel", "-k", type=str, default='rbf', help="Kernel to use in the algorithm ('linear', 'poly', 'rbf', 'sigmoid', 'precomputed')")
+    parser.add_argument("--use_aug", "-u", type=bool, default=False, help="Whether to use data augmentation or not for training data balancing")
 
     return parser.parse_args()
 
 def load_data():
@@ -130,6 +131,27 @@ def train_model(train_data, hyper_params):
     # train_data = train_data[~((train_data['Label'] == 1) & (train_data['Body'].str.split().str.len() < 4))]
     # train_data = train_data.reset_index(drop=True)
 
+    if hyper_params['use_aug']:
+        augmentor = Augmentor()
+
+        train_body, train_labels = augmentor(
+            train_data['Body'].tolist(),
+            train_data['Label'].tolist(),
+            aug_label=1,
+            num_aug_per_label_1=9,
+            shuffle=True
+        )
+
+        train_data = pd.DataFrame(
+            {
+                'Body': train_body,
+                'Label': train_labels
+            }
+        )
+
+        train_data.drop_duplicates(subset=['Body'], inplace=True)
+        train_data.reset_index(drop=True, inplace=True)
+
     # Call your code that produces output
     model.train(body=train_data['Body'], label=train_data['Label'])
 
     return model
@@ -209,7 +231,8 @@ def dump_logs_to_wandb(hyper_params, f1_scores, save_path):
     hyper_params = {
         'num_labels': args.num_labels,
         'C': args.C,
-        'kernel': args.kernel
+        'kernel': args.kernel,
+        'use_aug': args.use_aug,
     }
 
     # Log in to Weights and Biases
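Note: the augmentation block added here is the third verbatim copy of the same logic. One way to remove the duplication is a shared helper; the sketch below assumes it would live next to Augmentor in utils/util_modeler.py, and the name balance_with_augmentation is hypothetical, not part of this patch:

    import pandas as pd

    from utils.util_modeler import Augmentor  # same import path svm_trainer.py uses

    def balance_with_augmentation(train_data: pd.DataFrame) -> pd.DataFrame:
        """Oversample label 1 via text augmentation, then drop duplicate bodies."""
        augmentor = Augmentor()

        # Identical call and arguments to the block added in all three trainers.
        train_body, train_labels = augmentor(
            train_data['Body'].tolist(),
            train_data['Label'].tolist(),
            aug_label=1,
            num_aug_per_label_1=9,
            shuffle=True
        )

        balanced = pd.DataFrame({'Body': train_body, 'Label': train_labels})
        balanced.drop_duplicates(subset=['Body'], inplace=True)
        balanced.reset_index(drop=True, inplace=True)
        return balanced

Each train_model would then reduce the added block to a single line: if hyper_params['use_aug']: train_data = balance_with_augmentation(train_data).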