diff --git a/ethics/differential_privacy.py b/ethics/differential_privacy.py
index 4810724..ee9ee50 100644
--- a/ethics/differential_privacy.py
+++ b/ethics/differential_privacy.py
@@ -76,7 +76,7 @@ def train(
     epsilons = [1e-8, 1e-2, 1, 7.5, 20]
     accuracies = []
 
-    body_train, body_val, label_train, label_val = train_test_split(body, label, test_size=0.2, random_state=42)
+    body_train, body_val, label_train, label_val = train_test_split(body, label, test_size=0.2, random_state=42, stratify=label)
 
     for eps in epsilons:
         self.model = Pipeline([
diff --git a/pipelines/random_forest_trainer.py b/pipelines/random_forest_trainer.py
index 3086e60..bc575f3 100644
--- a/pipelines/random_forest_trainer.py
+++ b/pipelines/random_forest_trainer.py
@@ -28,7 +28,7 @@
 )
 
 def parse_args():
-    parser = argparse.ArgumentParser(description="SVM Model Fraud Detector Pipeline")
+    parser = argparse.ArgumentParser(description="Random Forest Model Fraud Detector Pipeline")
     parser.add_argument("--save_path", "-s", type=str, default='/tmp/', help="Output save path")
     parser.add_argument("--num_labels", "-l", type=int, default=2, help="Number of labels")
    parser.add_argument("--n_estimators", "-n", type=int, default=100, help="Number of trees in the forest")
@@ -106,7 +106,7 @@ def data_split(data):
-    # For sanity_set, take first 5000 emails with Sender-Type = 'Internal'
+    # For sanity_set, take first 250000 emails with Sender-Type = 'Internal'
     sanity = data[
         (data['Sender-Type'] == 'Internal') & (data['Source'] == 'Enron Data')
-    ].head(5000)
+    ].head(250000)
     sanity['Split'] = 'Sanity'
 
     # For train_set, take all data not in gold_fraud_set and sanity_set
diff --git a/pipelines/rf_differential_privacy_trainer.py b/pipelines/rf_differential_privacy_trainer.py
new file mode 100644
index 0000000..7a9dfdf
--- /dev/null
+++ b/pipelines/rf_differential_privacy_trainer.py
@@ -0,0 +1,285 @@
+# usage: python3 -m pipelines.rf_differential_privacy_trainer --num_labels 2 --n_estimators 100 --criterion gini --use_aug True
+import sys
+sys.path.append('..')
+
+import warnings
+warnings.filterwarnings("ignore")
+
+from datetime import datetime
+import pandas as pd
+import os
+
+from detector.data_loader import LoadEnronData, LoadPhishingData, LoadSocEnggData
+from detector.labeler import EnronLabeler, MismatchLabeler
+from ethics.differential_privacy import RandomForestPrivacyModel
+from detector.preprocessor import Preprocessor
+from utils.util_modeler import evaluate_and_log, get_f1_score, Augmentor
+
+import wandb
+import argparse
+import configparser
+config = configparser.ConfigParser()
+config.read(
+    os.path.join(
+        os.path.dirname(os.path.abspath(__file__)),
+        '../config.ini'
+    )
+)
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="Random Forest Differential Privacy Fraud Detector Pipeline")
+    parser.add_argument("--save_path", "-s", type=str, default='/tmp/', help="Output save path")
+    parser.add_argument("--num_labels", "-l", type=int, default=2, help="Number of labels")
+    parser.add_argument("--n_estimators", "-n", type=int, default=100, help="Number of trees in the forest")
+    parser.add_argument("--criterion", "-c", type=str, default='gini', help="Function to measure the quality of a split")
+    # Parsed as a string and converted to bool in __main__: argparse's type=bool
+    # would treat any non-empty string (including "False") as True
+    parser.add_argument("--use_aug", "-u", type=str, default='False', help="Whether to use data augmentation for training data balancing (True/False)")
+    return parser.parse_args()
+
+def load_data():
+    if os.path.exists(
+        os.path.join(
+            os.path.dirname(__file__),
+            '../data/fraud_detector_data.csv'
+        )
+    ):
+        data = pd.read_csv(
+            os.path.join(
+                os.path.dirname(__file__),
+                '../data/fraud_detector_data.csv'
+            )
+        )
+    else:
+        data = {
+            loader.__name__: loader().__call__() for loader in [LoadEnronData, LoadPhishingData, LoadSocEnggData]
+        }
+    return data
+
+def label_and_preprocess_data(data):
+    if not os.path.exists(
+        os.path.join(
+            os.path.dirname(__file__),
+            '../data/fraud_detector_data.csv'
+        )
+    ):
+        # Run Enron Labeler
+        data['LoadEnronData'] = EnronLabeler(data['LoadEnronData'], needs_preprocessing=True)()
+
+        # Preprocess the other 2 datasets
+        data['LoadPhishingData']['Body'] = data['LoadPhishingData']['Body'].swifter.apply(Preprocessor())
+        data['LoadSocEnggData']['Body'] = data['LoadSocEnggData']['Body'].swifter.apply(Preprocessor())
+
+        # Concatenate the 3 data sources into 1
+        data = pd.concat(
+            [
+                df for df in data.values()
+            ],
+            axis=0,
+            ignore_index=True
+        )
+
+        # Run Mismatch Labeler
+        data = MismatchLabeler(data)()
+
+    data.reset_index(drop=True, inplace=True)
+
+    return data
+
+def data_split(data):
+    if not os.path.exists(
+        os.path.join(
+            os.path.dirname(__file__),
+            '../data/fraud_detector_data.csv'
+        )
+    ):
+        # For gold_fraud_set, take first 500 fraud emails from Phishing Data and 500 from Social Engineering Data
+        gold_fraud = pd.concat(
+            [
+                data[(data['Source'] == 'Phishing Data') & (data['Label'] == 1)].head(500),
+                data[(data['Source'] == 'Social Engineering Data') & (data['Label'] == 1)].head(500)
+            ],
+            axis=0,
+            ignore_index=True
+        )
+        gold_fraud['Split'] = 'Gold Fraud'
+
+        # For sanity_set, take first 250000 emails with Sender-Type = 'Internal'
+        sanity = data[
+            (data['Sender-Type'] == 'Internal') & (data['Source'] == 'Enron Data')
+        ].head(250000).copy()
+        sanity['Split'] = 'Sanity'
+
+        # For train_set, take all data not in gold_fraud_set and sanity_set
+        train = data[
+            ~data['Mail-ID'].isin(gold_fraud['Mail-ID']) & ~data['Mail-ID'].isin(sanity['Mail-ID'])
+        ].copy()
+
+        train['Split'] = 'Train'
+
+    else:
+        train = data[data['Split'] == 'Train']
+        gold_fraud = data[data['Split'] == 'Gold Fraud']
+        sanity = data[data['Split'] == 'Sanity']
+
+    return train, sanity, gold_fraud
+
+def train_model(train_data, hyper_params, use_aug=False):
+    model = RandomForestPrivacyModel(**hyper_params)
+
+    # #drop train examples with Label=1 and Body less than 4 words
+    # train_data = train_data[~((train_data['Label'] == 1) & (train_data['Body'].str.split().str.len() < 4))]
+    # train_data = train_data.reset_index(drop=True)
+
+    if use_aug:
+        augmentor = Augmentor()
+
+        train_body, train_labels = augmentor(
+            train_data['Body'].tolist(),
+            train_data['Label'].tolist(),
+            aug_label=1,
+            num_aug_per_label_1=9,
+            shuffle=True
+        )
+
+        train_data = pd.DataFrame(
+            {
+                'Body': train_body,
+                'Label': train_labels
+            }
+        )
+
+        train_data.drop_duplicates(subset=['Body'], inplace=True)
+        train_data.reset_index(drop=True, inplace=True)
+
+    # Fit the differentially private random forest on the (optionally augmented) training data
+    model.train(body=train_data['Body'], label=train_data['Label'])
+    return model
+
+def test_and_save_model(train_data, sanity_data, gold_fraud_data, save_path):
+    # Define a dictionary to store the f1 scores
+    f1_scores = {}
+
+    # # Define a dictionary to store the predictions, true labels for each dataset
+    # true_pred_map = {
+    #     'train':{},
+    #     'sanity':{},
+    #     'gold_fraud':{}
+    # }
+
+    os.makedirs(os.path.join(save_path,'logs'), exist_ok=True)
+
+    # Save the model and logs to the date folder ('model' and 'run' are module-level globals set in __main__)
+    model.save_model(os.path.join(save_path,'model'))
+
+    train_data['Prediction'] = model.predict(body=train_data['Body'])
+    evaluate_and_log(x=train_data['Body'].tolist(), y_true=train_data['Label'].tolist(), y_pred=train_data['Prediction'].tolist(), filename=os.path.join(save_path,'logs/train.log'), experiment=run, id=train_data['Mail-ID'].tolist())
+    f1_scores['train'] = get_f1_score(y_true=train_data['Label'].tolist(), y_pred=train_data['Prediction'].tolist())
+
+    sanity_data['Prediction'] = model.predict(body=sanity_data['Body'])
+    evaluate_and_log(x=sanity_data['Body'].tolist(), y_true=sanity_data['Label'].tolist(), y_pred=sanity_data['Prediction'].tolist(), filename=os.path.join(save_path,'logs/sanity.log'), experiment=run, id=sanity_data['Mail-ID'].tolist())
+    f1_scores['sanity'] = get_f1_score(y_true=sanity_data['Label'].tolist(), y_pred=sanity_data['Prediction'].tolist())
+
+    gold_fraud_data['Prediction'] = model.predict(body=gold_fraud_data['Body'])
+    evaluate_and_log(x=gold_fraud_data['Body'].tolist(), y_true=gold_fraud_data['Label'].tolist(), y_pred=gold_fraud_data['Prediction'].tolist(), filename=os.path.join(save_path,'logs/gold_fraud.log'), experiment=run, id=gold_fraud_data['Mail-ID'].tolist())
+    f1_scores['gold_fraud'] = get_f1_score(y_true=gold_fraud_data['Label'].tolist(), y_pred=gold_fraud_data['Prediction'].tolist())
+
+    # Save misclassified examples from all three splits into a csv file
+    mismatch_data = pd.concat(
+        [
+            train_data[train_data['Prediction'] != train_data['Label']],
+            sanity_data[sanity_data['Prediction'] != sanity_data['Label']],
+            gold_fraud_data[gold_fraud_data['Prediction'] != gold_fraud_data['Label']]
+        ],
+        axis=0,
+        ignore_index=True
+    )
+
+    mismatch_data.to_csv(os.path.join(save_path,'logs/mismatch_data.csv'), index=False)
+
+    return f1_scores
+
+def dump_logs_to_wandb(hyper_params, f1_scores, save_path):
+    # Log the hyperparameters and f1 scores to Weights and Biases
+    all_params = {**hyper_params, **f1_scores}
+    run.config.update(all_params)
+
+    # Log the model to Weights and Biases (log_artifact uploads the new artifact; use_artifact only declares a dependency)
+    model_path = os.path.join(save_path, 'model')
+    model_artifact = wandb.Artifact("fraud-detector-model", type="model")
+    model_artifact.add_dir(model_path)
+    run.log_artifact(model_artifact)
+
+    # Log the log files to Weights and Biases
+    logs_path = os.path.join(save_path,'logs')
+    log_artifact = wandb.Artifact("fraud-detector-logs", type="logs")
+    log_artifact.add_dir(logs_path)
+    run.log_artifact(log_artifact)
+
+    # Log confusion matrices
+    # run.log({
+    #     "train_confusion_matrix": wandb.plot.confusion_matrix(true_pred_map['train']['true'], true_pred_map['train']['pred']),
+    #     "sanity_confusion_matrix": wandb.plot.confusion_matrix(true_pred_map['sanity']['true'], true_pred_map['sanity']['pred']),
+    #     "gold_fraud_confusion_matrix": wandb.plot.confusion_matrix(true_pred_map['gold_fraud']['true'], true_pred_map['gold_fraud']['pred'])
+    # })
+
+if __name__ == '__main__':
+    # Parse the arguments
+    args = parse_args()
+
+    if isinstance(args.use_aug, str):
+        if args.use_aug.lower() == 'true':
+            args.use_aug = True
+        elif args.use_aug.lower() == 'false':
+            args.use_aug = False
+        else:
+            raise ValueError("Invalid value for use_aug. Please enter True or False.")
+
+    # Define model hyperparameters
+    hyper_params = {
+        'num_labels': args.num_labels,
+        'n_estimators': args.n_estimators,
+        'criterion': args.criterion
+    }
+
+    # Log in to Weights and Biases
+    wandbdict = {
+        'key': os.getenv('WANDB_API_KEY'),
+        'entity': os.getenv('WANDB_ENTITY'),
+        'project': os.getenv('WANDB_PROJECT'),
+    }
+    wandb.login(key=wandbdict['key'])
+    run = wandb.init(project=wandbdict['project'], entity=wandbdict['entity'])
+
+    # Define a variable to store the trained model
+    model = None
+
+    # Get the current date
+    date = datetime.now().strftime("%Y-%m-%d")
+
+    # Create date folder in save path
+    save_path = args.save_path
+    save_path = os.path.join(save_path, f'{date}')
+    os.makedirs(save_path, exist_ok=True)
+
+    # Load the data
+    data = load_data()
+
+    # Label and preprocess the data
+    data = label_and_preprocess_data(data)
+
+    # Split the data into train, sanity, and gold_fraud sets
+    train_data, sanity_data, gold_fraud_data = data_split(data)
+
+    # Train the model
+    model = train_model(train_data, hyper_params, use_aug=args.use_aug)
+
+    # Evaluate the model on all three splits and save it with its logs
+    f1_scores = test_and_save_model(train_data, sanity_data, gold_fraud_data, save_path)
+
+    # Dump the logs to Weights and Biases
+    dump_logs_to_wandb(hyper_params, f1_scores, save_path)
+
+    # Close the Weights and Biases run
+    run.finish()
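
Note on the stratify=label change in ethics/differential_privacy.py: the hunk only shows the edges of train(), so the sketch below is a minimal, self-contained reconstruction of the stratified epsilon sweep it implies, not the module's actual code. sweep_epsilons is a hypothetical name, and sklearn's ordinary RandomForestClassifier stands in for the differentially private forest that RandomForestPrivacyModel presumably wraps.

```python
# Minimal sketch of a stratified epsilon sweep, assuming train() follows the
# structure visible in the hunk (TfidfVectorizer + classifier Pipeline).
from sklearn.ensemble import RandomForestClassifier  # stand-in for the DP forest
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

def sweep_epsilons(body, label, epsilons=(1e-8, 1e-2, 1, 7.5, 20)):
    # stratify=label (the change in the first hunk) keeps the fraud/non-fraud
    # ratio identical across train and validation, so the per-epsilon
    # accuracies are comparable despite heavy label imbalance.
    body_train, body_val, label_train, label_val = train_test_split(
        body, label, test_size=0.2, random_state=42, stratify=label
    )
    accuracies = []
    for eps in epsilons:
        model = Pipeline([
            ('tfidf', TfidfVectorizer()),
            # the real pipeline would train a forest under privacy budget
            # epsilon=eps; a plain RandomForestClassifier is used here only
            # to keep the sketch runnable
            ('clf', RandomForestClassifier(n_estimators=100, random_state=42)),
        ])
        model.fit(body_train, label_train)
        accuracies.append(accuracy_score(label_val, model.predict(body_val)))
    return dict(zip(epsilons, accuracies))
```

The motivation for the change: with an unstratified 20% slice of a heavily imbalanced label column, the validation set can end up with few or no fraud examples, making the accuracy measured at each epsilon noisy and the points of the sweep mutually incomparable; fixing the class ratio in both splits removes that source of variance.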