Skip to content

Commit

Permalink
Modified RF DP training code
Browse files Browse the repository at this point in the history
  • Loading branch information
advaithsrao committed Dec 6, 2023
1 parent 7151af7 commit 19b2cc4
Show file tree
Hide file tree
Showing 3 changed files with 288 additions and 3 deletions.
2 changes: 1 addition & 1 deletion ethics/differential_privacy.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ def train(
epsilons = [1e-8, 1e-2, 1, 7.5, 20]
accuracies = []

body_train, body_val, label_train, label_val = train_test_split(body, label, test_size=0.2, random_state=42)
body_train, body_val, label_train, label_val = train_test_split(body, label, test_size=0.2, random_state=42, stratify=label)

for eps in epsilons:
self.model = Pipeline([
Expand Down
4 changes: 2 additions & 2 deletions pipelines/random_forest_trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
)

def parse_args():
parser = argparse.ArgumentParser(description="SVM Model Fraud Detector Pipeline")
parser = argparse.ArgumentParser(description="Random Model Fraud Detector Pipeline")
parser.add_argument("--save_path", "-s", type=str, default='/tmp/', help="Output save path")
parser.add_argument("--num_labels", "-l", type=int, default=2, help="Number of labels")
parser.add_argument("--n_estimators", "-n", type=int, default=100, help="Number of trees in the forest")
Expand Down Expand Up @@ -106,7 +106,7 @@ def data_split(data):
# For sanity_set, take first 5000 emails with Sender-Type = 'Internal'
sanity = data[
(data['Sender-Type'] == 'Internal') & (data['Source'] == 'Enron Data')
].head(5000)
].head(250000)
sanity['Split'] = 'Sanity'

# For train_set, take all data not in gold_fraud_set and sanity_set
Expand Down
285 changes: 285 additions & 0 deletions pipelines/rf_differential_privacy_trainer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,285 @@
# Usage: python3 -m pipelines.rf_differential_privacy_trainer --num_labels 2 --n_estimators 100 --criterion gini --use_aug True
import sys
sys.path.append('..')  # allow project-root imports when run as a script

# Silence library warnings (e.g. pandas SettingWithCopyWarning) for cleaner logs
import warnings
warnings.filterwarnings("ignore")

from datetime import datetime
import pandas as pd
import sys  # NOTE(review): duplicate of the `import sys` above — harmless but redundant
import os

# Project-local data loaders, labelers, the DP random-forest model and utilities
from detector.data_loader import LoadEnronData, LoadPhishingData, LoadSocEnggData
from detector.labeler import EnronLabeler, MismatchLabeler
from ethics.differential_privacy import RandomForestPrivacyModel
from detector.preprocessor import Preprocessor
from utils.util_modeler import evaluate_and_log, get_f1_score, Augmentor

import wandb
import argparse
import configparser

# Read repository-level configuration from ../config.ini (relative to this file)
config = configparser.ConfigParser()
config.read(
    os.path.join(
        os.path.dirname(os.path.abspath(__file__)),
        '../config.ini'
    )
)

def parse_args():
    """Parse command-line arguments for the RF differential-privacy trainer.

    Returns:
        argparse.Namespace with save_path, num_labels, n_estimators,
        criterion and use_aug.
    """
    parser = argparse.ArgumentParser(description="Random Forest Model Fraud Detector Pipeline")
    parser.add_argument("--save_path", "-s", type=str, default='/tmp/', help="Output save path")
    parser.add_argument("--num_labels", "-l", type=int, default=2, help="Number of labels")
    parser.add_argument("--n_estimators", "-n", type=int, default=100, help="Number of trees in the forest")
    parser.add_argument("--criterion", "-c", type=str, default='gini', help="Function to measure the quality of a split")
    # BUG FIX: `type=bool` is broken for CLI flags — bool('False') is True, so
    # `--use_aug False` used to *enable* augmentation.  Accept the raw string
    # instead; the __main__ block already normalizes 'true'/'false' strings.
    # Default stays False (a bool) so behavior without the flag is unchanged.
    parser.add_argument("--use_aug", "-u", type=str, default=False, help="Whether to use data augmentation or not for training data balancing (True/False)")
    return parser.parse_args()

def load_data():
    """Load the combined fraud dataset.

    Prefers the cached CSV at ../data/fraud_detector_data.csv; when absent,
    runs each raw data loader and returns a dict keyed by loader class name.
    """
    cached_csv = os.path.join(
        os.path.dirname(__file__),
        '../data/fraud_detector_data.csv'
    )

    if os.path.exists(cached_csv):
        return pd.read_csv(cached_csv)

    # No cached CSV yet: invoke every raw loader and key results by its name.
    loaders = [LoadEnronData, LoadPhishingData, LoadSocEnggData]
    return {loader.__name__: loader()() for loader in loaders}

def label_and_preprocess_data(data):
    """Label raw loader outputs and merge them into one DataFrame.

    When the cached CSV exists, `data` is already a combined DataFrame and
    only the MismatchLabeler pass plus an index reset is applied.
    """
    cached_csv = os.path.join(
        os.path.dirname(__file__),
        '../data/fraud_detector_data.csv'
    )

    if not os.path.exists(cached_csv):
        # Enron data has a dedicated labeler which also preprocesses bodies.
        data['LoadEnronData'] = EnronLabeler(data['LoadEnronData'], needs_preprocessing=True)()

        # The other two sources only need body-text preprocessing.
        for source in ('LoadPhishingData', 'LoadSocEnggData'):
            data[source]['Body'] = data[source]['Body'].swifter.apply(Preprocessor())

        # Stack the three sources into a single frame.
        data = pd.concat(list(data.values()), axis=0, ignore_index=True)

    # Final pass that repairs mismatched labels, then renumber rows.
    data = MismatchLabeler(data)()
    data.reset_index(drop=True, inplace=True)

    return data

def data_split(data):
    """Split the dataset into train / sanity / gold-fraud sets.

    If the cached CSV was loaded, the frame already carries a 'Split'
    column and is simply partitioned by it; otherwise the splits are
    derived from the Source / Label / Sender-Type columns.

    Returns:
        Tuple of (train, sanity, gold_fraud) DataFrames.
    """
    cached_csv = os.path.join(
        os.path.dirname(__file__),
        '../data/fraud_detector_data.csv'
    )

    if not os.path.exists(cached_csv):
        # Gold-fraud set: first 500 fraud (Label==1) emails each from the
        # Phishing and Social Engineering sources.  FIX: use one combined
        # boolean mask — the previous chained `df[m1][m2]` form triggers
        # pandas' "Boolean Series key will be reindexed" warning.
        gold_fraud = pd.concat(
            [
                data[(data['Source'] == 'Phishing Data') & (data['Label'] == 1)].head(500),
                data[(data['Source'] == 'Social Engineering Data') & (data['Label'] == 1)].head(500)
            ],
            axis=0,
            ignore_index=True
        )
        gold_fraud['Split'] = 'Gold Fraud'

        # Sanity set: up to 250000 internal Enron emails (comment previously
        # said 5000, stale after the cap was raised).  `.copy()` avoids
        # SettingWithCopyWarning when assigning the Split column below.
        sanity = data[
            (data['Sender-Type'] == 'Internal') & (data['Source'] == 'Enron Data')
        ].head(250000).copy()
        sanity['Split'] = 'Sanity'

        # Train set: everything not already claimed by gold-fraud or sanity.
        train = data[
            ~data['Mail-ID'].isin(gold_fraud['Mail-ID']) & ~data['Mail-ID'].isin(sanity['Mail-ID'])
        ].copy()
        train['Split'] = 'Train'

    else:
        train = data[data['Split'] == 'Train']
        gold_fraud = data[data['Split'] == 'Gold Fraud']
        sanity = data[data['Split'] == 'Sanity']

    return train, sanity, gold_fraud

def train_model(train_data, hyper_params, use_aug=False):
    """Train the differentially-private random forest model.

    Args:
        train_data: DataFrame with 'Body' and 'Label' columns.
        hyper_params: keyword args forwarded to the model constructor
            (num_labels, n_estimators, criterion).
        use_aug: when True, oversample label-1 (fraud) examples via Augmentor.

    Returns:
        The trained model instance.
    """
    run = wandb.init(config=hyper_params)
    # BUG FIX: this module imports RandomForestPrivacyModel (the DP model);
    # `RandomForestFraudModel` is not defined here and raised a NameError.
    model = RandomForestPrivacyModel(**hyper_params)

    if use_aug:
        # Augment fraud (label 1) examples 9x and shuffle the result.
        augmentor = Augmentor()

        train_body, train_labels = augmentor(
            train_data['Body'].tolist(),
            train_data['Label'].tolist(),
            aug_label=1,
            num_aug_per_label_1=9,
            shuffle=True
        )

        train_data = pd.DataFrame(
            {
                'Body': train_body,
                'Label': train_labels
            }
        )

        # Augmentation can emit duplicate bodies; keep one copy of each.
        train_data.drop_duplicates(subset=['Body'], inplace=True)
        train_data.reset_index(drop=True, inplace=True)

    model.train(body=train_data['Body'], label=train_data['Label'])
    return model

def test_and_save_model(train_data, sanity_data, gold_fraud_data, save_path):
    """Evaluate the trained model on all three splits and persist artifacts.

    Relies on the module-level `model` and `run` globals set in __main__.
    Predictions are written into each frame in-place; misclassified rows
    from every split are dumped to logs/mismatch_data.csv.

    Returns:
        Dict mapping split name ('train'/'sanity'/'gold_fraud') to F1 score.
    """
    f1_scores = {}

    os.makedirs(os.path.join(save_path, 'logs'), exist_ok=True)

    # Persist the trained model alongside the logs.
    model.save_model(os.path.join(save_path, 'model'))

    # Score each split in order: predict, log the evaluation, record F1.
    splits = [
        ('train', train_data),
        ('sanity', sanity_data),
        ('gold_fraud', gold_fraud_data),
    ]
    for split_name, frame in splits:
        frame['Prediction'] = model.predict(body=frame['Body'])
        evaluate_and_log(
            x=frame['Body'].tolist(),
            y_true=frame['Label'].tolist(),
            y_pred=frame['Prediction'].tolist(),
            filename=os.path.join(save_path, f'logs/{split_name}.log'),
            experiment=run,
            id=frame['Mail-ID'].tolist()
        )
        f1_scores[split_name] = get_f1_score(
            y_true=frame['Label'].tolist(),
            y_pred=frame['Prediction'].tolist()
        )

    # Collect every misclassified row across the three splits for inspection.
    mismatch_data = pd.concat(
        [frame[frame['Prediction'] != frame['Label']] for _, frame in splits],
        axis=0,
        ignore_index=True
    )
    mismatch_data.to_csv(os.path.join(save_path, 'logs/mismatch_data.csv'), index=False)

    return f1_scores

def dump_logs_to_wandb(hyper_params, f1_scores, save_path):
    """Push hyperparameters, F1 scores, model and log artifacts to W&B.

    Relies on the module-level `run` created in the __main__ block.
    """
    # Merge hyperparameters and scores into the run's config.
    run.config.update({**hyper_params, **f1_scores})

    # Attach the saved model directory as a model artifact.
    model_artifact = wandb.Artifact("fraud-detector-model", type="model")
    model_artifact.add_dir(os.path.join(save_path, 'model'))
    run.use_artifact(model_artifact)

    # Attach the log directory as a logs artifact.
    log_artifact = wandb.Artifact("fraud-detector-logs", type="logs")
    log_artifact.add_dir(os.path.join(save_path, 'logs'))
    run.use_artifact(log_artifact)

if __name__ == '__main__':
    # Parse the command-line arguments
    args = parse_args()

    # Normalize use_aug when it arrives as a string.
    # NOTE(review): parse_args declares --use_aug with type=bool, so argparse
    # yields a bool for any provided value and this branch may be dead — verify.
    if type(args.use_aug) == str:
        if args.use_aug.lower() == 'true':
            args.use_aug = True
        elif args.use_aug.lower() == 'false':
            args.use_aug = False
        else:
            raise ValueError("Invalid value for use_aug. Please enter True or False.")

    # Define model hyperparameters forwarded to the model constructor
    hyper_params = {
        'num_labels': args.num_labels,
        'n_estimators': args.n_estimators,
        'criterion': args.criterion
    }

    # Log in to Weights and Biases using environment-variable credentials
    wandbdict = {
        'key': os.getenv('WANDB_API_KEY'),
        'entity': os.getenv('WANDB_ENTITY'),
        'project': os.getenv('WANDB_PROJECT'),
    }
    wandb.login(key=wandbdict['key'])
    # `run` must stay a module-level global: test_and_save_model and
    # dump_logs_to_wandb read it directly rather than taking it as a parameter.
    run = wandb.init(project=wandbdict['project'], entity=wandbdict['entity'])

    # Define a variable to store the trained model.
    # Like `run`, `model` is read as a global by test_and_save_model.
    model = None

    # Get the current date (used as the output folder name)
    date = datetime.now().strftime("%Y-%m-%d")

    # Create date folder in save path
    save_path = args.save_path
    save_path = os.path.join(save_path, f'{date}')
    os.makedirs(save_path, exist_ok=True)

    # Load the data (cached CSV or raw loaders)
    data = load_data()

    # Label and preprocess the data into a single DataFrame
    data = label_and_preprocess_data(data)

    # Split the data into train, sanity, and gold_fraud sets
    train_data, sanity_data, gold_fraud_data = data_split(data)

    # Train the model
    # NOTE(review): train_model also calls wandb.init, creating a second run
    # on top of the one started above — confirm this is intended.
    model = train_model(train_data, hyper_params, use_aug=args.use_aug)

    # Test the model on all three splits and save model + logs
    f1_scores = test_and_save_model(train_data, sanity_data, gold_fraud_data, save_path)

    # Dump the logs to Weights and Biases
    dump_logs_to_wandb(hyper_params, f1_scores, save_path)

    # Close the Weights and Biases run
    run.finish()

0 comments on commit 19b2cc4

Please sign in to comment.