Added RoBERTa model training and W&B model tracking and logging #25

Merged · 2 commits · Nov 3, 2023
10 changes: 5 additions & 5 deletions README.md
@@ -5,7 +5,7 @@ Fraud Detection Package with fine-tuned RoBERTa model, equipped with ethical con
- [Fraud-Detector](#fraud-detector)
- [Team](#team)
- [CI/CD Pipeline Status](#cicd-pipeline-status)
- - [Steps to run](#steps-to-run)
+ - [Installation and Run Instructions](#installation-and-run-instructions)
- [Abstract](#abstract)
- [Dataset Description](#dataset-description)
- [Proposed Methodology](#proposed-methodology)
@@ -31,15 +31,15 @@ Fraud Detection Package with fine-tuned RoBERTa model, equipped with ethical con

***

-## Steps to run
+## Installation and Run Instructions

**All helper functions and run steps can be found in the wiki pages.**

| Helper | Page |
| ------ | ------ |
-| Setup Environment | [Wiki](https://github.com/advaithsrao/Fraud-Detector/wiki/Setup-Environment) |
-| Processing and Labeler | [Wiki](https://github.com/advaithsrao/Fraud-Detector/wiki/Load-Preprocessed-and-Labeled-Data) |
-| | |
+| Setup Environment and Integrations | [Wiki](https://github.com/advaithsrao/Fraud-Detector/wiki/Setup-Environment) |
+| Standalone - Processing and Labeler | [Wiki](https://github.com/advaithsrao/Fraud-Detector/wiki/Load-Preprocessed-and-Labeled-Data) |
+| Model Training Pipeline | [Wiki](https://github.com/advaithsrao/Fraud-Detector/wiki/Model-Training-Pipeline) |

***

2 changes: 1 addition & 1 deletion config.ini
@@ -25,4 +25,4 @@ emails = [email protected] & [email protected] & [email protected] &
names = Lay, Kenneth & Skilling, Jeffrey & Howard, Kevin & Krautz, Michael & Yeager, Scott & Hirko, Joseph & Shelby, Rex & Bermingham, David & Darby, Giles & Mulgrew, Gary & Bayley, Daniel & Brown, James & Furst, Robert & Fuhs, William & Causey, Richard & Calger, Christopher & DeSpain, Timothy & Hannon, Kevin & Koenig, Mark & Forney, John & Rice, Kenneth & Rieker, Paula & Fastow, Lea & Fastow, Andrew & Delainey, David & Glisan, Ben & Richter, Jeffrey & Lawyer, Larry & Belden, Timothy & Kopper, Michael & Duncan, David & Bowen, Raymond & Colwell, Wesley & Boyle, Dan & Loehr, Christopher

[folders.possible_fraud]
-folders = junk & junk_e_mail & junk_mail & insurance_risk & risk & deleted_items
+folders = junk & junk_e_mail & junk_mail & insurance_risk & risk & deleted_items
278 changes: 278 additions & 0 deletions detector/modeler.py
@@ -0,0 +1,278 @@
import os
import shutil
import pandas as pd
import numpy as np
import torch
from transformers import RobertaTokenizer, RobertaForSequenceClassification, get_linear_schedule_with_warmup
from torch.optim import AdamW  # transformers' AdamW is deprecated in favor of the torch implementation
from torch.utils.data import DataLoader, TensorDataset
import wandb

class RobertaModel:
def __init__(
self,
num_labels: int = 2,
path: str = '',
model_name='roberta-base',
learning_rate=2e-5,
epsilon=1e-8,
num_epochs=40,
batch_size=128
):
self.num_labels = num_labels
self.path = path
self.model_name = model_name
self.learning_rate = learning_rate
self.epsilon = epsilon
self.num_epochs = num_epochs
self.batch_size = batch_size

self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
self.tokenizer = RobertaTokenizer.from_pretrained(self.model_name)

if self.path != '':
self.model = RobertaForSequenceClassification.from_pretrained(self.path, num_labels=self.num_labels).to(self.device)
else:
self.model = RobertaForSequenceClassification.from_pretrained(self.model_name, num_labels=self.num_labels).to(self.device)

def train(
self,
body: pd.Series | list[str],
label: pd.Series | list[int],
validation_size=0.2,
        wandb=None
):
"""Trains the model using the given data.

Args:
body (pd.Series | list[str]): The body of the email.
label (pd.Series | list[int]): The label of the email.
validation_size (float, optional): The size of the validation set. Defaults to 0.2.
            wandb (optional): An active Weights & Biases run (or the wandb
                module itself). If given, per-epoch training metrics are
                logged to it. Defaults to None.

Raises:
ValueError: If the body and label are not of the same size.
"""

if isinstance(body, pd.Series):
body = body.tolist()
if isinstance(label, pd.Series):
label = label.tolist()

# Tokenize input texts and convert labels to tensors
input_ids = []
attention_masks = []
label_ids = []

for _body, _label in zip(body, label):
# Tokenize the input text using the Roberta tokenizer
inputs = self.tokenizer.encode_plus(
_body,
add_special_tokens=True,
max_length=512,
padding='max_length',
return_attention_mask=True,
return_tensors='pt',
truncation=True
)

input_ids.append(inputs['input_ids'])
attention_masks.append(inputs['attention_mask'])
label_ids.append(torch.tensor(_label)) # Convert the label to a tensor

# Convert lists to tensors
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
label_ids = torch.stack(label_ids) # Create a 1D tensor for label_ids

# Split the data into train and validation sets
dataset = TensorDataset(input_ids, attention_masks, label_ids)
dataset_size = len(dataset)
val_size = int(validation_size * dataset_size)
train_size = dataset_size - val_size
train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])

# Create data loaders for training and validation data
train_dataloader = DataLoader(train_dataset, batch_size=self.batch_size, shuffle=True)
validation_dataloader = DataLoader(val_dataset, batch_size=self.batch_size)

# Initialize the optimizer and learning rate scheduler
optimizer = AdamW(self.model.parameters(), lr=self.learning_rate, eps=self.epsilon)
total_steps = len(train_dataloader) * self.num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)
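        # num_warmup_steps=0 makes this a plain linear decay from the initial learning rate to zero over total_steps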

# Initialize variables for early stopping
best_validation_loss = float("inf")
patience = 3 # Number of epochs to wait for improvement
wait = 0

# Training loop
for epoch in range(self.num_epochs):
print(f'{"="*20} Epoch {epoch + 1}/{self.num_epochs} {"="*20}')

# Training loop
self.model.train()
total_train_loss = 0

for step, batch in enumerate(train_dataloader):
b_input_ids = batch[0].to(self.device)
b_input_mask = batch[1].to(self.device)
b_labels = batch[2].to(self.device)

self.model.zero_grad()
outputs = self.model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
loss = outputs[0]
total_train_loss += loss.item()

loss.backward()
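                # Clip the gradient L2 norm at 1.0 to guard against exploding gradients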
torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
optimizer.step()
scheduler.step()

                # Print a running average every 100 steps; the epoch total is
                # not reset here, so the final average below stays correct
                if step % 100 == 0 and step != 0:
                    running_avg = total_train_loss / (step + 1)
                    print(f'Step {step}/{len(train_dataloader)} - Average training loss: {running_avg:.4f}')

avg_train_loss = total_train_loss / len(train_dataloader)
print(f'Training loss: {avg_train_loss:.4f}')

# Evaluation loop
self.model.eval()
total_eval_accuracy = 0
total_eval_loss = 0
nb_eval_steps = 0

for batch in validation_dataloader:
b_input_ids = batch[0].to(self.device)
b_input_mask = batch[1].to(self.device)
b_labels = batch[2].to(self.device)

with torch.no_grad():
outputs = self.model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
loss = outputs[0]
logits = outputs[1]

total_eval_loss += loss.item()
logits = logits.detach().cpu().numpy()
label_ids = b_labels.to('cpu').numpy()
total_eval_accuracy += self.accuracy(logits, label_ids)

avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
print(f'Validation Accuracy: {avg_val_accuracy:.4f}')

avg_val_loss = total_eval_loss / len(validation_dataloader)
print(f'Validation Loss: {avg_val_loss:.4f}')

            if wandb is not None:
wandb.log({
'epoch': epoch,
'train_loss': avg_train_loss,
'val_loss': avg_val_loss,
'val_accuracy': avg_val_accuracy,
})

# Early stopping check
if avg_val_loss < best_validation_loss:
best_validation_loss = avg_val_loss
wait = 0
else:
wait += 1

if wait >= patience:
print(f'Early stopping after {patience} epochs without improvement.')
break

def predict(
self,
body: pd.Series | list[str]
):
"""Predicts the labels of the given data.

Args:
body (pd.Series | list[str]): The body of the email.

Returns:
            list[int]: The predicted class index for each input text.
"""

# If input_texts is a Pandas Series, convert it to a list
if isinstance(body, pd.Series):
body = body.tolist()

input_ids = []
attention_masks = []

for _body in body:
inputs = self.tokenizer.encode_plus(
_body,
add_special_tokens=True,
max_length=512,
padding='max_length',
return_attention_mask=True,
return_tensors='pt',
truncation=True
)

input_ids.append(inputs['input_ids'])
attention_masks.append(inputs['attention_mask'])

input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)

dataset = TensorDataset(input_ids, attention_masks)
dataloader = DataLoader(dataset, batch_size=self.batch_size)

self.model.eval()
predictions = []

for batch in dataloader:
b_input_ids = batch[0].to(self.device)
b_input_mask = batch[1].to(self.device)

with torch.no_grad():
outputs = self.model(b_input_ids, attention_mask=b_input_mask)
logits = outputs[0]

logits = logits.detach().cpu().numpy()

            # Take the argmax over the class logits to get the predicted class
class_predictions = np.argmax(logits, axis=1)
predictions.extend(class_predictions.tolist())

return predictions

def save_model(
self,
path: str
):
"""Saves the model to the given path.

Args:
path (str): The path to save the model to.
"""

# Check if the directory exists, and if not, create it
if not os.path.exists(path):
os.makedirs(path, exist_ok=True)

self.model.save_pretrained(path)

def accuracy(
self,
preds,
labels
):
"""Calculates the accuracy of the model.

Args:
            preds (np.array): Model output logits of shape (batch_size, num_labels).
labels (np.array): The labels of the data.

Returns:
float: The accuracy of the model.
"""

pred_flat = np.argmax(preds, axis=1).flatten()
labels_flat = labels.flatten()
return np.sum(pred_flat == labels_flat) / len(labels_flat)
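
A minimal usage sketch of the `RobertaModel` API added above, assuming the package layout in this diff (`detector/modeler.py`). The toy emails, labels, and W&B project name are placeholders, not part of the PR:

```python
import wandb
from detector.modeler import RobertaModel

# Placeholder data; the real pipeline supplies preprocessed, labeled emails.
# Enough rows are repeated so the 20% validation split fills at least one batch.
bodies = ["Please wire the funds to this account today.",
          "Attached is the Q3 report for your review."] * 16
labels = [1, 0] * 16

# Optional: an active W&B run to receive the per-epoch metrics from train()
run = wandb.init(project="fraud-detector", config={"model": "roberta-base"})

model = RobertaModel(num_labels=2, num_epochs=2, batch_size=8)
model.train(bodies, labels, validation_size=0.2, wandb=run)

preds = model.predict(bodies)       # list of class indices, e.g. [1, 0, ...]
model.save_model("models/roberta-fraud")

run.finish()
```

Since `train` only calls `.log(...)` on whatever is passed as `wandb`, either the module itself or a `wandb.init(...)` run handle works.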