Added nlpaug augmentation for our bert modeling (#30)
* Added nlpaug augmentation for our bert modeling

* Added cudatoolkit to github actions and updated torch to 2.1.1

* Fixed issues with augmentation testing
advaithsrao authored Nov 26, 2023
1 parent 9c4ed1f commit 3c3dd6b
Showing 8 changed files with 984 additions and 608 deletions.
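For context before the diffs: the augmentation is built on nlpaug's word-level augmenters. A minimal sketch of the underlying call, assuming a WordNet synonym augmenter (the commit's actual augmenter choice lives in utils/util_modeler.py, which is collapsed on this page):

import nlpaug.augmenter.word as naw

# Assumed augmenter; the real choice is in the collapsed utils/util_modeler.py diff.
aug = naw.SynonymAug(aug_src='wordnet')
augmented = aug.augment("Please wire the funds before the quarterly audit.")
print(augmented)  # recent nlpaug releases return a list of augmented strings

A fuller sketch of the Augmentor wrapper appears after the pipelines/distilbert_trainer.py diff below.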
5 changes: 5 additions & 0 deletions .github/workflows/pipeline.yml
@@ -23,6 +23,11 @@ jobs:
      with:
        python-version: 3.9

+     - name: Install NVIDIA CUDA Toolkit and GCC for CUDA
+       run: |
+         sudo apt-get update
+         sudo apt-get install -y nvidia-cuda-toolkit nvidia-cuda-toolkit-gcc
      - name: Set environment variable for scikit-learn deprecated package
        run: echo "SKLEARN_ALLOW_DEPRECATED_SKLEARN_PACKAGE_INSTALL=True" >> $GITHUB_ENV
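If one wanted to verify the resulting environment, a hypothetical CI sanity check (not part of this commit) could look like the following; note that GitHub-hosted runners have no GPU, so the toolkit mainly supplies nvcc and a compatible GCC for building CUDA extensions:

import torch  # hypothetical sanity check, not part of this commit

print(torch.__version__)          # expected: 2.1.1 after the dependency bump
print(torch.cuda.is_available())  # False on CPU-only GitHub-hosted runners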

45 changes: 33 additions & 12 deletions detector/modeler.py
@@ -154,11 +154,16 @@ def train(
outputs = self.model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
logits = outputs.logits

-# Convert labels to one-hot encoding
-b_labels_one_hot = F.one_hot(b_labels, num_classes=2).float()
+sigmoid_output = torch.sigmoid(logits[:, 1])
+
+# Thresholding to convert probabilities to binary values (0 or 1)
+binary_output = (sigmoid_output > 0.5).to(torch.int)
+
+# # Convert labels to one-hot encoding
+# b_labels_one_hot = F.one_hot(b_labels, num_classes=2).float()

# Calculate the loss using the weighted loss function
-loss = loss_function(logits, b_labels_one_hot)
+loss = loss_function(binary_output, b_labels)
total_train_loss += loss.item()

# Backward pass
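For readers skimming the hunks, a standalone illustration of the new loss path (toy tensors; the weighted loss_function itself is defined elsewhere in modeler.py and is not shown in this diff):

import torch

# Toy batch: 4 examples, 2 classes, mirroring the train/eval loops above.
logits = torch.randn(4, 2)
b_labels = torch.tensor([0, 1, 1, 0])

sigmoid_output = torch.sigmoid(logits[:, 1])          # probability of class 1
binary_output = (sigmoid_output > 0.5).to(torch.int)  # hard 0/1 predictions

# Caveat: the > comparison is non-differentiable, so a loss computed on
# binary_output has no gradient path back to the model's parameters; losses
# taken on logits (or sigmoid_output) keep that path intact.
print(binary_output, b_labels)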
@@ -192,11 +197,16 @@ def train(
# loss = outputs[0]
logits = outputs.logits

-# Convert labels to one-hot encoding
-b_labels_one_hot = F.one_hot(b_labels, num_classes=2).float()
+sigmoid_output = torch.sigmoid(logits[:, 1])
+
+# Thresholding to convert probabilities to binary values (0 or 1)
+binary_output = (sigmoid_output > 0.5).to(torch.int)
+
+# # Convert labels to one-hot encoding
+# b_labels_one_hot = F.one_hot(b_labels, num_classes=2).float()

# Calculate the loss using the weighted loss function
-loss = loss_function(logits, b_labels_one_hot)
+loss = loss_function(binary_output, b_labels)
total_eval_loss += loss.item()
logits = logits.detach().cpu().numpy()
label_ids = b_labels.detach().cpu().numpy()
@@ -451,11 +461,17 @@ def train(
outputs = self.model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
logits = outputs.logits

-# Convert labels to one-hot encoding
-b_labels_one_hot = F.one_hot(b_labels, num_classes=2).float()
+sigmoid_output = torch.sigmoid(logits[:, 1])
+
+# Thresholding to convert probabilities to binary values (0 or 1)
+binary_output = (sigmoid_output > 0.5).to(torch.int)
+
+# # Convert labels to one-hot encoding
+# b_labels_one_hot = F.one_hot(b_labels, num_classes=2).float()

# Calculate the loss using the weighted loss function
-loss = loss_function(logits, b_labels_one_hot)
+loss = loss_function(binary_output, b_labels)

total_train_loss += loss.item()

# Backward pass
@@ -488,11 +504,16 @@ def train(
outputs = self.model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
logits = outputs.logits

-# Convert labels to one-hot encoding
-b_labels_one_hot = F.one_hot(b_labels, num_classes=2).float()
+sigmoid_output = torch.sigmoid(logits[:, 1])
+
+# Thresholding to convert probabilities to binary values (0 or 1)
+binary_output = (sigmoid_output > 0.5).to(torch.int)
+
+# # Convert labels to one-hot encoding
+# b_labels_one_hot = F.one_hot(b_labels, num_classes=2).float()

# Calculate the loss using the weighted loss function
-loss = loss_function(logits, b_labels_one_hot)
+loss = loss_function(binary_output, b_labels)
total_eval_loss += loss.item()
logits = logits.detach().cpu().numpy()
label_ids = b_labels.detach().cpu().numpy()
29 changes: 27 additions & 2 deletions pipelines/distilbert_trainer.py
@@ -14,7 +14,7 @@
from detector.labeler import EnronLabeler, MismatchLabeler
from detector.modeler import DistilbertModel
from detector.preprocessor import Preprocessor
-from utils.util_modeler import evaluate_and_log, get_f1_score
+from utils.util_modeler import evaluate_and_log, get_f1_score, Augmentor

import wandb
import argparse
@@ -132,8 +132,33 @@ def train_model(train_data, hyper_params):
# train_data = train_data[~((train_data['Label'] == 1) & (train_data['Body'].str.split().str.len() < 4))]
# train_data = train_data.reset_index(drop=True)

+augmentor = Augmentor()
+
+train_body, train_labels = augmentor(
+    train_data['Body'].tolist(),
+    train_data['Label'].tolist(),
+    aug_label=1,
+    num_aug_per_label_1=9,
+    shuffle=True
+)
+
+_train_data = pd.DataFrame(
+    {
+        'Body': train_body,
+        'Label': train_labels
+    }
+)
+
+_train_data.drop_duplicates(subset=['Body'], inplace=True)
+_train_data.reset_index(drop=True, inplace=True)

# Call your code that produces output
-model.train(body=train_data['Body'], label=train_data['Label'], validation_size=0.2, wandb=run)
+model.train(
+    body=_train_data['Body'],
+    label=_train_data['Label'],
+    validation_size=0.2,
+    wandb=run
+)

# Restore the original stdout
# sys.stdout = sys.__stdout__
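Augmentor is imported from utils/util_modeler.py, whose diff is not expanded on this page. A minimal sketch consistent with the call above, assuming the nlpaug synonym augmenter shown near the top (the real implementation, and any length or quality filtering it applies, may differ):

import random

import nlpaug.augmenter.word as naw


class Augmentor:
    """Sketch of utils.util_modeler.Augmentor; implementation details are assumptions."""

    def __init__(self):
        # Assumed: WordNet-based synonym replacement, as sketched earlier.
        self.augmenter = naw.SynonymAug(aug_src='wordnet')

    def __call__(self, bodies, labels, aug_label=1, num_aug_per_label_1=9, shuffle=True):
        aug_bodies, aug_labels = list(bodies), list(labels)
        for text, label in zip(bodies, labels):
            if label != aug_label:
                continue
            for _ in range(num_aug_per_label_1):  # 9 extra copies per label-1 example
                out = self.augmenter.augment(text)
                # Recent nlpaug releases return a list of strings.
                aug_bodies.append(out[0] if isinstance(out, list) else out)
                aug_labels.append(label)
        if shuffle:
            pairs = list(zip(aug_bodies, aug_labels))
            random.shuffle(pairs)
            aug_bodies, aug_labels = map(list, zip(*pairs))
        return aug_bodies, aug_labels

Note the drop_duplicates(subset=['Body']) call in the trainer: synonym replacement on short texts can return the input unchanged, so deduplication keeps the augmented frame from carrying exact copies.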
36 changes: 23 additions & 13 deletions pipelines/roberta_trainer.py
@@ -14,7 +14,7 @@
from detector.labeler import EnronLabeler, MismatchLabeler
from detector.modeler import RobertaModel
from detector.preprocessor import Preprocessor
-from utils.util_modeler import evaluate_and_log, get_f1_score
+from utils.util_modeler import evaluate_and_log, get_f1_score, Augmentor

import wandb
import argparse
@@ -128,23 +128,33 @@ def train_model(train_data, hyper_params):
run = wandb.init(config=hyper_params)
model = RobertaModel(**hyper_params)

-# os.makedirs(f'/tmp/{date}/logs', exist_ok=True)
+augmentor = Augmentor()

-# # Define a log file path
-# log_filename = f"/tmp/{date}/logs/model_training.log"
+train_body, train_labels = augmentor(
+    train_data['Body'].tolist(),
+    train_data['Label'].tolist(),
+    aug_label=1,
+    num_aug_per_label_1=9,
+    shuffle=True
+)

-# # Create or open the log file in write mode
-# log_file = open(log_filename, "w")
+_train_data = pd.DataFrame(
+    {
+        'Body': train_body,
+        'Label': train_labels
+    }
+)

-# # Redirect stdout to the log file
-# sys.stdout = log_file
+_train_data.drop_duplicates(subset=['Body'], inplace=True)
+_train_data.reset_index(drop=True, inplace=True)

-# #drop train examples with Label=1 and Body less than 4 words
-# train_data = train_data[~((train_data['Label'] == 1) & (train_data['Body'].str.split().str.len() < 4))]
-# train_data = train_data.reset_index(drop=True)
-
# Call your code that produces output
-model.train(body=train_data['Body'], label=train_data['Label'], validation_size=0.2, wandb=run)
+model.train(
+    body=_train_data['Body'],
+    label=_train_data['Label'],
+    validation_size=0.2,
+    wandb=run
+)

# Restore the original stdout
# sys.stdout = sys.__stdout__
(Diffs for the remaining 4 changed files are collapsed and not shown here.)
