From b170bed62f50766d1572f58ef08b1c3fb02f61c7 Mon Sep 17 00:00:00 2001
From: Advaith Rao
Date: Tue, 5 Dec 2023 10:54:07 -0500
Subject: [PATCH] Added random forest model

---
 detector/modeler.py                  |  83 ++++++++
 notebooks/differential_privacy.ipynb |  12 --
 pipelines/random_forest_trainer.py   | 285 +++++++++++++++++++++++++++
 3 files changed, 368 insertions(+), 12 deletions(-)
 create mode 100644 pipelines/random_forest_trainer.py

diff --git a/detector/modeler.py b/detector/modeler.py
index dc1c2cb..fda9200 100644
--- a/detector/modeler.py
+++ b/detector/modeler.py
@@ -5,6 +5,7 @@
 import pandas as pd
 import numpy as np
 from sklearn.svm import SVC
+from sklearn.ensemble import RandomForestClassifier
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.model_selection import train_test_split
 from sklearn.metrics import accuracy_score
@@ -1061,3 +1062,85 @@ def save_model(
         os.makedirs(path, exist_ok=True)
 
     save_model(self.model, path)
+
+
+class RandomForestFraudModel:
+    def __init__(
+        self,
+        num_labels: int = 2,
+        n_estimators: int = 100,
+        criterion: str = 'gini',
+        n_jobs: int = -1
+    ):
+        self.num_labels = num_labels
+        self.n_estimators = n_estimators
+        self.criterion = criterion
+        self.n_jobs = n_jobs
+
+        # Embed email bodies with Word2Vec, then classify with a random forest
+        self.vectorizer = Word2VecEmbedder()
+        self.model = Pipeline([
+            ('vectorizer', self.vectorizer),
+            ('classifier', RandomForestClassifier(
+                n_estimators=self.n_estimators,
+                criterion=self.criterion,
+                n_jobs=self.n_jobs
+            ))
+        ])
+
+    def train(
+        self,
+        body: pd.Series | list[str],
+        label: pd.Series | list[int],
+    ):
+        """Trains the random forest model.
+
+        Args:
+            body (pd.Series | list[str]): The body of the email.
+            label (pd.Series | list[int]): The label of the email.
+
+        Raises:
+            ValueError: If the body and label are not of the same size.
+        """
+        if isinstance(body, pd.Series):
+            body = body.tolist()
+        if isinstance(label, pd.Series):
+            label = label.tolist()
+
+        if len(body) != len(label):
+            raise ValueError('body and label must be of the same size.')
+
+        # Train the random forest model
+        self.model.fit(body, label)
+
+        print(f'{"="*20} Training Done {"="*20}')
+
+    def predict(
+        self,
+        body: pd.Series | list[str],
+    ):
+        """Predicts the labels of the given data.
+
+        Args:
+            body (pd.Series | list[str]): The body of the email.
+
+        Returns:
+            list[int]: The predictions of the model.
+        """
+        if isinstance(body, pd.Series):
+            body = body.tolist()
+
+        # Make predictions using the trained random forest model
+        predictions = self.model.predict(body)
+
+        if isinstance(predictions, np.ndarray):
+            predictions = predictions.tolist()
+
+        return predictions
+
+    def save_model(
+        self,
+        path: str,
+    ):
+        """Saves the model to the given path.
+
+        Args:
+            path (str): The path to save the model to.
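+
+        Example (illustrative sketch; the save path is hypothetical):
+            >>> model = RandomForestFraudModel(n_estimators=100, criterion='gini')
+            >>> model.train(
+            ...     body=['Urgent: verify your account now', 'Meeting moved to 3pm'],
+            ...     label=[1, 0]
+            ... )
+            >>> model.save_model('/tmp/rf_fraud_model')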
+ """ + + if not os.path.exists(path): + os.makedirs(path, exist_ok=True) + + save_model(self.model, path) diff --git a/notebooks/differential_privacy.ipynb b/notebooks/differential_privacy.ipynb index b3dc825..4bdbaaa 100644 --- a/notebooks/differential_privacy.ipynb +++ b/notebooks/differential_privacy.ipynb @@ -627,18 +627,6 @@ "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.12" } }, "nbformat": 4, diff --git a/pipelines/random_forest_trainer.py b/pipelines/random_forest_trainer.py new file mode 100644 index 0000000..3086e60 --- /dev/null +++ b/pipelines/random_forest_trainer.py @@ -0,0 +1,285 @@ +#usage: python3 -m pipelines.svm_trainer --num_labels 2 --n_estimators 100 --criterion gini --use_aug True +import sys +sys.path.append('..') + +import warnings +warnings.filterwarnings("ignore") + +from datetime import datetime +import pandas as pd +import sys +import os + +from detector.data_loader import LoadEnronData, LoadPhishingData, LoadSocEnggData +from detector.labeler import EnronLabeler, MismatchLabeler +from detector.modeler import RandomForestFraudModel +from detector.preprocessor import Preprocessor +from utils.util_modeler import evaluate_and_log, get_f1_score, Augmentor + +import wandb +import argparse +import configparser +config = configparser.ConfigParser() +config.read( + os.path.join( + os.path.dirname(os.path.abspath(__file__)), + '../config.ini' + ) +) + +def parse_args(): + parser = argparse.ArgumentParser(description="SVM Model Fraud Detector Pipeline") + parser.add_argument("--save_path", "-s", type=str, default='/tmp/', help="Output save path") + parser.add_argument("--num_labels", "-l", type=int, default=2, help="Number of labels") + parser.add_argument("--n_estimators", "-n", type=int, default=100, help="Number of trees in the forest") + parser.add_argument("--criterion", "-c", type=str, default='gini', help="Function to measure the quality of a split") + parser.add_argument("--use_aug", "-u", type=bool, default=False, help="Whether to use data augmentation or not for training data balancing") + return parser.parse_args() + +def load_data(): + if os.path.exists( + os.path.join( + os.path.dirname(__file__), + '../data/fraud_detector_data.csv' + ) + ): + data = pd.read_csv( + os.path.join( + os.path.dirname(__file__), + '../data/fraud_detector_data.csv' + ) + ) + else: + data = { + loader.__name__: loader().__call__() for loader in [LoadEnronData, LoadPhishingData, LoadSocEnggData] + } + return data + +def label_and_preprocess_data(data): + if not os.path.exists( + os.path.join( + os.path.dirname(__file__), + '../data/fraud_detector_data.csv' + ) + ): + # Run Enron Labeler + data['LoadEnronData'] = EnronLabeler(data['LoadEnronData'], needs_preprocessing=True)() + + # Preprocess the other 2 datasets + data['LoadPhishingData']['Body'] = data['LoadPhishingData']['Body'].swifter.apply(Preprocessor()) + data['LoadSocEnggData']['Body'] = data['LoadSocEnggData']['Body'].swifter.apply(Preprocessor()) + + # Concatenate the 3 data sources into 1 + data = pd.concat( + [ + df for df in data.values() + ], + axis=0, + ignore_index=True + ) + + # Run Mismatch Labeler + data = MismatchLabeler(data)() + + data.reset_index(drop=True, inplace=True) + + return data + +def data_split(data): + if not 
+def data_split(data):
+    if not os.path.exists(
+        os.path.join(
+            os.path.dirname(__file__),
+            '../data/fraud_detector_data.csv'
+        )
+    ):
+        # For gold_fraud_set, take the first 500 fraud emails each from Phishing Data and Social Engineering Data
+        gold_fraud = pd.concat(
+            [
+                data[(data['Source'] == 'Phishing Data') & (data['Label'] == 1)].head(500),
+                data[(data['Source'] == 'Social Engineering Data') & (data['Label'] == 1)].head(500)
+            ],
+            axis=0,
+            ignore_index=True
+        )
+        gold_fraud['Split'] = 'Gold Fraud'
+
+        # For sanity_set, take the first 5000 Enron emails with Sender-Type = 'Internal'
+        sanity = data[
+            (data['Sender-Type'] == 'Internal') & (data['Source'] == 'Enron Data')
+        ].head(5000).copy()
+        sanity['Split'] = 'Sanity'
+
+        # For train_set, take all data not in gold_fraud_set or sanity_set
+        train = data[
+            ~data['Mail-ID'].isin(gold_fraud['Mail-ID']) & ~data['Mail-ID'].isin(sanity['Mail-ID'])
+        ].copy()
+        train['Split'] = 'Train'
+
+    else:
+        train = data[data['Split'] == 'Train']
+        gold_fraud = data[data['Split'] == 'Gold Fraud']
+        sanity = data[data['Split'] == 'Sanity']
+
+    return train, sanity, gold_fraud
+
+def train_model(train_data, hyper_params, use_aug=False):
+    model = RandomForestFraudModel(**hyper_params)
+
+    if use_aug:
+        # Oversample the fraud class (Label = 1) to balance the training data
+        augmentor = Augmentor()
+
+        train_body, train_labels = augmentor(
+            train_data['Body'].tolist(),
+            train_data['Label'].tolist(),
+            aug_label=1,
+            num_aug_per_label_1=9,
+            shuffle=True
+        )
+
+        train_data = pd.DataFrame(
+            {
+                'Body': train_body,
+                'Label': train_labels
+            }
+        )
+
+        train_data.drop_duplicates(subset=['Body'], inplace=True)
+        train_data.reset_index(drop=True, inplace=True)
+
+    # Fit the Word2Vec + random forest pipeline
+    model.train(body=train_data['Body'], label=train_data['Label'])
+    return model
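+
+# Evaluation (see test_and_save_model below): the trained pipeline is scored on
+# all three splits; F1 on 'Gold Fraud' tracks how well known fraud is caught,
+# while F1 on 'Sanity' guards against false positives on ordinary internal mail.
+# Misclassified rows from every split are also dumped to a csv for inspection.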
+def test_and_save_model(train_data, sanity_data, gold_fraud_data, save_path):
+    # Uses the module-level `model` and `run` defined in __main__
+    # Define a dictionary to store the f1 scores
+    f1_scores = {}
+
+    os.makedirs(os.path.join(save_path, 'logs'), exist_ok=True)
+
+    # Save the model and logs to the date folder
+    model.save_model(os.path.join(save_path, 'model'))
+
+    train_data['Prediction'] = model.predict(body=train_data['Body'])
+    evaluate_and_log(
+        x=train_data['Body'].tolist(),
+        y_true=train_data['Label'].tolist(),
+        y_pred=train_data['Prediction'].tolist(),
+        filename=os.path.join(save_path, 'logs/train.log'),
+        experiment=run,
+        id=train_data['Mail-ID'].tolist()
+    )
+    f1_scores['train'] = get_f1_score(y_true=train_data['Label'].tolist(), y_pred=train_data['Prediction'].tolist())
+
+    sanity_data['Prediction'] = model.predict(body=sanity_data['Body'])
+    evaluate_and_log(
+        x=sanity_data['Body'].tolist(),
+        y_true=sanity_data['Label'].tolist(),
+        y_pred=sanity_data['Prediction'].tolist(),
+        filename=os.path.join(save_path, 'logs/sanity.log'),
+        experiment=run,
+        id=sanity_data['Mail-ID'].tolist()
+    )
+    f1_scores['sanity'] = get_f1_score(y_true=sanity_data['Label'].tolist(), y_pred=sanity_data['Prediction'].tolist())
+
+    gold_fraud_data['Prediction'] = model.predict(body=gold_fraud_data['Body'])
+    evaluate_and_log(
+        x=gold_fraud_data['Body'].tolist(),
+        y_true=gold_fraud_data['Label'].tolist(),
+        y_pred=gold_fraud_data['Prediction'].tolist(),
+        filename=os.path.join(save_path, 'logs/gold_fraud.log'),
+        experiment=run,
+        id=gold_fraud_data['Mail-ID'].tolist()
+    )
+    f1_scores['gold_fraud'] = get_f1_score(y_true=gold_fraud_data['Label'].tolist(), y_pred=gold_fraud_data['Prediction'].tolist())
+
+    # Save misclassified examples from all three splits into a csv file
+    mismatch_data = pd.concat(
+        [
+            train_data[train_data['Prediction'] != train_data['Label']],
+            sanity_data[sanity_data['Prediction'] != sanity_data['Label']],
+            gold_fraud_data[gold_fraud_data['Prediction'] != gold_fraud_data['Label']]
+        ],
+        axis=0,
+        ignore_index=True
+    )
+
+    mismatch_data.to_csv(os.path.join(save_path, 'logs/mismatch_data.csv'), index=False)
+
+    return f1_scores
+
+def dump_logs_to_wandb(hyper_params, f1_scores, save_path):
+    # Log the hyperparameters and f1 scores to Weights and Biases
+    all_params = {**hyper_params, **f1_scores}
+    run.config.update(all_params)
+
+    # Log the model to Weights and Biases
+    model_path = os.path.join(save_path, 'model')
+    model_artifact = wandb.Artifact("fraud-detector-model", type="model")
+    model_artifact.add_dir(model_path)
+    run.log_artifact(model_artifact)
+
+    # Log the log files to Weights and Biases
+    logs_path = os.path.join(save_path, 'logs')
+    log_artifact = wandb.Artifact("fraud-detector-logs", type="logs")
+    log_artifact.add_dir(logs_path)
+    run.log_artifact(log_artifact)
+
+if __name__ == '__main__':
+    # Parse the arguments
+    args = parse_args()
+
+    # --use_aug arrives as a string; convert it to a bool
+    if args.use_aug.lower() == 'true':
+        args.use_aug = True
+    elif args.use_aug.lower() == 'false':
+        args.use_aug = False
+    else:
+        raise ValueError("Invalid value for use_aug. Please enter True or False.")
+
+    # Define model hyperparameters
+    hyper_params = {
+        'num_labels': args.num_labels,
+        'n_estimators': args.n_estimators,
+        'criterion': args.criterion
+    }
+
+    # Log in to Weights and Biases
+    wandbdict = {
+        'key': os.getenv('WANDB_API_KEY'),
+        'entity': os.getenv('WANDB_ENTITY'),
+        'project': os.getenv('WANDB_PROJECT'),
+    }
+    wandb.login(key=wandbdict['key'])
+    run = wandb.init(project=wandbdict['project'], entity=wandbdict['entity'], config=hyper_params)
+
+    # Get the current date
+    date = datetime.now().strftime("%Y-%m-%d")
+
+    # Create date folder in save path
+    save_path = os.path.join(args.save_path, f'{date}')
+    os.makedirs(save_path, exist_ok=True)
+
+    # Load the data
+    data = load_data()
+
+    # Label and preprocess the data
+    data = label_and_preprocess_data(data)
+
+    # Split the data into train, sanity, and gold_fraud sets
+    train_data, sanity_data, gold_fraud_data = data_split(data)
+
+    # Train the model
+    model = train_model(train_data, hyper_params, use_aug=args.use_aug)
+
+    # Test the model and save it with the logs
+    f1_scores = test_and_save_model(train_data, sanity_data, gold_fraud_data, save_path)
+
+    # Dump the logs to Weights and Biases
+    dump_logs_to_wandb(hyper_params, f1_scores, save_path)
+
+    # Close the Weights and Biases run
+    run.finish()