Added use_aug parameter to 3 model trainer code
advaithsrao committed Nov 26, 2023
1 parent cc0f719 commit 594ea68
Showing 3 changed files with 70 additions and 41 deletions.
pipelines/distilbert_trainer.py (22 additions, 19 deletions)
@@ -1,4 +1,4 @@
-#usage: python3 -m pipelines.distilbert_trainer --num_epochs 20 --batch_size 8 --num_labels 2 --device 'cuda' --save_path '/tmp' --model_name 'distilbert-base-uncased'
+#usage: python3 -m pipelines.distilbert_trainer --num_epochs 20 --batch_size 8 --num_labels 2 --device 'cuda' --save_path '/tmp' --model_name 'distilbert-base-uncased' --use_aug True
 import sys
 sys.path.append('..')
 
@@ -35,6 +35,7 @@ def parse_args():
     parser.add_argument("--num_epochs", "-e", type=int, default=40, help="Number of epochs")
     parser.add_argument("--batch_size", "-b", type=int, default=128, help="Batch size")
     parser.add_argument("--device", "-d", type=str, default='cpu', help="Device to train the model on: 'cpu', 'cuda' or 'gpu'")
+    parser.add_argument("--use_aug", "-u", type=bool, default=False, help="Whether to use data augmentation or not for training data balancing")
     return parser.parse_args()
 
 def load_data():
@@ -132,30 +133,31 @@ def train_model(train_data, hyper_params):
     # train_data = train_data[~((train_data['Label'] == 1) & (train_data['Body'].str.split().str.len() < 4))]
     # train_data = train_data.reset_index(drop=True)
 
-    augmentor = Augmentor()
+    if hyper_params['use_aug']:
+        augmentor = Augmentor()
 
-    train_body, train_labels = augmentor(
-        train_data['Body'].tolist(),
-        train_data['Label'].tolist(),
-        aug_label=1,
-        num_aug_per_label_1=9,
-        shuffle=True
-    )
+        train_body, train_labels = augmentor(
+            train_data['Body'].tolist(),
+            train_data['Label'].tolist(),
+            aug_label=1,
+            num_aug_per_label_1=9,
+            shuffle=True
+        )
 
-    _train_data = pd.DataFrame(
-        {
-            'Body': train_body,
-            'Label': train_labels
-        }
-    )
+        train_data = pd.DataFrame(
+            {
+                'Body': train_body,
+                'Label': train_labels
+            }
+        )
 
-    _train_data.drop_duplicates(subset=['Body'], inplace=True)
-    _train_data.reset_index(drop=True, inplace=True)
+        train_data.drop_duplicates(subset=['Body'], inplace=True)
+        train_data.reset_index(drop=True, inplace=True)
 
     # Call your code that produces output
     model.train(
-        body=_train_data['Body'],
-        label=_train_data['Label'],
+        body=train_data['Body'],
+        label=train_data['Label'],
         validation_size=0.2,
         wandb=run
     )
@@ -246,6 +248,7 @@ def dump_logs_to_wandb(hyper_params, f1_scores, save_path):
         'num_epochs': args.num_epochs,
         'batch_size': args.batch_size,
         'device': args.device,
+        'use_aug': args.use_aug,
     }
 
     # Log in to Weights and Biases
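A caveat on the new flag, common to all three trainers: argparse's type=bool does not parse boolean strings. bool() on any non-empty string is True, so --use_aug False still enables augmentation; only omitting the flag keeps the default of False. A minimal sketch of a stricter parser that preserves the --use_aug True invocation shown in the usage lines (the str2bool helper is illustrative, not part of this repo):

import argparse

def str2bool(value: str) -> bool:
    # argparse's type=bool treats any non-empty string as True,
    # so map the usual spellings explicitly instead.
    if value.lower() in ('true', '1', 'yes'):
        return True
    if value.lower() in ('false', '0', 'no'):
        return False
    raise argparse.ArgumentTypeError(f"expected a boolean, got {value!r}")

parser = argparse.ArgumentParser()
parser.add_argument("--use_aug", "-u", type=str2bool, default=False,
                    help="Whether to use data augmentation for training data balancing")
# Alternative: a plain presence flag, passed as just --use_aug
# parser.add_argument("--use_aug", action="store_true")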
pipelines/roberta_trainer.py (22 additions, 19 deletions)
@@ -1,4 +1,4 @@
-#usage: python3 -m pipelines.roberta_trainer --num_epochs 20 --batch_size 8 --num_labels 2 --device 'cuda' --save_path '/tmp' --model_name 'roberta-base'
+#usage: python3 -m pipelines.roberta_trainer --num_epochs 20 --batch_size 8 --num_labels 2 --device 'cuda' --save_path '/tmp' --model_name 'roberta-base' --use_aug True
 import sys
 sys.path.append('..')
 
@@ -35,6 +35,7 @@ def parse_args():
     parser.add_argument("--num_epochs", "-e", type=int, default=40, help="Number of epochs")
     parser.add_argument("--batch_size", "-b", type=int, default=128, help="Batch size")
     parser.add_argument("--device", "-d", type=str, default='cpu', help="Device to train the model on: 'cpu', 'cuda' or 'gpu'")
+    parser.add_argument("--use_aug", "-u", type=bool, default=False, help="Whether to use data augmentation or not for training data balancing")
     return parser.parse_args()
 
 def load_data():
@@ -128,30 +129,31 @@ def train_model(train_data, hyper_params):
     run = wandb.init(config=hyper_params)
     model = RobertaModel(**hyper_params)
 
-    augmentor = Augmentor()
+    if hyper_params['use_aug']:
+        augmentor = Augmentor()
 
-    train_body, train_labels = augmentor(
-        train_data['Body'].tolist(),
-        train_data['Label'].tolist(),
-        aug_label=1,
-        num_aug_per_label_1=9,
-        shuffle=True
-    )
+        train_body, train_labels = augmentor(
+            train_data['Body'].tolist(),
+            train_data['Label'].tolist(),
+            aug_label=1,
+            num_aug_per_label_1=9,
+            shuffle=True
+        )
 
-    _train_data = pd.DataFrame(
-        {
-            'Body': train_body,
-            'Label': train_labels
-        }
-    )
+        train_data = pd.DataFrame(
+            {
+                'Body': train_body,
+                'Label': train_labels
+            }
+        )
 
-    _train_data.drop_duplicates(subset=['Body'], inplace=True)
-    _train_data.reset_index(drop=True, inplace=True)
+        train_data.drop_duplicates(subset=['Body'], inplace=True)
+        train_data.reset_index(drop=True, inplace=True)
 
     # Call your code that produces output
     model.train(
-        body=_train_data['Body'],
-        label=_train_data['Label'],
+        body=train_data['Body'],
+        label=train_data['Label'],
         validation_size=0.2,
         wandb=run
     )
@@ -242,6 +244,7 @@ def dump_logs_to_wandb(hyper_params, f1_scores, save_path):
         'num_epochs': args.num_epochs,
         'batch_size': args.batch_size,
         'device': args.device,
+        'use_aug': args.use_aug,
     }
 
     # Log in to Weights and Biases
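The gated augmentation block above is identical in the DistilBERT and RoBERTa trainers, and the SVM trainer below adds it a third time. The roughly twenty duplicated lines could be factored into one shared helper. A sketch, assuming Augmentor is callable exactly as at these call sites and returns parallel lists of texts and labels (the maybe_augment name and its placement are hypothetical):

import pandas as pd

from utils.util_modeler import Augmentor

def maybe_augment(train_data: pd.DataFrame, use_aug: bool) -> pd.DataFrame:
    # Mirrors the block added in this commit: oversample label 1, then dedupe.
    if not use_aug:
        # Flag off: train on the data exactly as loaded.
        return train_data

    augmentor = Augmentor()
    # Generate 9 augmented copies per minority-class (label 1) sample.
    train_body, train_labels = augmentor(
        train_data['Body'].tolist(),
        train_data['Label'].tolist(),
        aug_label=1,
        num_aug_per_label_1=9,
        shuffle=True
    )

    augmented = pd.DataFrame({'Body': train_body, 'Label': train_labels})
    # Augmentation can emit repeated texts; keep one copy of each body.
    return augmented.drop_duplicates(subset=['Body']).reset_index(drop=True)

Each train_model would then reduce to train_data = maybe_augment(train_data, hyper_params['use_aug']) ahead of the model.train call.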
pipelines/svm_trainer.py (26 additions, 3 deletions)
@@ -1,4 +1,4 @@
-#usage: python3 -m pipelines.svm_trainer --num_labels 2 --C 10 --kernel 'rbf' --save_path '/tmp/model'
+#usage: python3 -m pipelines.svm_trainer --num_labels 2 --C 10 --kernel 'rbf' --save_path '/tmp/model' --use_aug True
 import sys
 sys.path.append('..')
 
@@ -14,7 +14,7 @@
 from detector.labeler import EnronLabeler, MismatchLabeler
 from detector.modeler import SVMModel
 from detector.preprocessor import Preprocessor
-from utils.util_modeler import evaluate_and_log, get_f1_score
+from utils.util_modeler import evaluate_and_log, get_f1_score, Augmentor
 
 import wandb
 import argparse
@@ -33,6 +33,7 @@ def parse_args():
     parser.add_argument("--num_labels", "-l", type=int, default=2, help="Number of labels")
     parser.add_argument("--C", "-C", type=int, default=1, help="Regularization parameter")
     parser.add_argument("--kernel", "-k", type=str, default='rbf', help="Kernel to use in the algorithm ('linear', 'poly', 'rbf', 'sigmoid', 'precomputed')")
+    parser.add_argument("--use_aug", "-u", type=bool, default=False, help="Whether to use data augmentation or not for training data balancing")
     return parser.parse_args()
 
 def load_data():
@@ -130,6 +131,27 @@ def train_model(train_data, hyper_params):
     # train_data = train_data[~((train_data['Label'] == 1) & (train_data['Body'].str.split().str.len() < 4))]
     # train_data = train_data.reset_index(drop=True)
 
+    if hyper_params['use_aug']:
+        augmentor = Augmentor()
+
+        train_body, train_labels = augmentor(
+            train_data['Body'].tolist(),
+            train_data['Label'].tolist(),
+            aug_label=1,
+            num_aug_per_label_1=9,
+            shuffle=True
+        )
+
+        train_data = pd.DataFrame(
+            {
+                'Body': train_body,
+                'Label': train_labels
+            }
+        )
+
+        train_data.drop_duplicates(subset=['Body'], inplace=True)
+        train_data.reset_index(drop=True, inplace=True)
+
     # Call your code that produces output
     model.train(body=train_data['Body'], label=train_data['Label'])
     return model
@@ -209,7 +231,8 @@ def dump_logs_to_wandb(hyper_params, f1_scores, save_path):
     hyper_params = {
         'num_labels': args.num_labels,
         'C': args.C,
-        'kernel': args.kernel
+        'kernel': args.kernel,
+        'use_aug': args.use_aug,
     }
 
     # Log in to Weights and Biases
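One more thing worth checking: in the RoBERTa trainer the same hyper_params dict is unpacked straight into the model constructor (model = RobertaModel(**hyper_params) in the context above), and the other two trainers appear to follow the same pattern. The new use_aug key therefore reaches those constructors as an extra keyword argument. Whether that is harmless depends on signatures outside this diff; if the constructors reject unknown kwargs, a defensive split along these lines (illustrative only) keeps the pipeline-level flag out of the model:

# Keep pipeline-only settings out of the model constructor call.
PIPELINE_ONLY_KEYS = {'use_aug'}

model_params = {k: v for k, v in hyper_params.items()
                if k not in PIPELINE_ONLY_KEYS}
model = SVMModel(**model_params)  # constructor assumed from detector.modeler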
