From b170bed62f50766d1572f58ef08b1c3fb02f61c7 Mon Sep 17 00:00:00 2001
From: Advaith Rao
Date: Tue, 5 Dec 2023 10:54:07 -0500
Subject: [PATCH] Added random forest model

---
 detector/modeler.py                  |  83 ++++++++
 notebooks/differential_privacy.ipynb |  12 --
 pipelines/random_forest_trainer.py   | 285 +++++++++++++++++++++++++++
 3 files changed, 368 insertions(+), 12 deletions(-)
 create mode 100644 pipelines/random_forest_trainer.py

diff --git a/detector/modeler.py b/detector/modeler.py
index dc1c2cb..fda9200 100644
--- a/detector/modeler.py
+++ b/detector/modeler.py
@@ -5,6 +5,7 @@
 import pandas as pd
 import numpy as np
 from sklearn.svm import SVC
+from sklearn.ensemble import RandomForestClassifier
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.model_selection import train_test_split
 from sklearn.metrics import accuracy_score
@@ -1061,3 +1062,85 @@ def save_model(
         os.makedirs(path, exist_ok=True)
 
     save_model(self.model, path)
+
+
+class RandomForestFraudModel:
+    def __init__(
+        self,
+        num_labels: int = 2,
+        n_estimators: int = 100,
+        criterion: str = 'gini',
+        n_jobs: int = -1
+    ):
+        self.num_labels = num_labels
+        self.n_estimators = n_estimators
+        self.criterion = criterion
+        self.n_jobs = n_jobs
+
+        # Embed email bodies with Word2Vec, then classify with a random forest
+        self.vectorizer = Word2VecEmbedder()
+        self.model = Pipeline([
+            ('vectorizer', self.vectorizer),
+            ('classifier', RandomForestClassifier(
+                n_estimators=self.n_estimators,
+                criterion=self.criterion,
+                n_jobs=self.n_jobs
+            ))
+        ])
+
+    def train(
+        self,
+        body: pd.Series | list[str],
+        label: pd.Series | list[int],
+    ):
+        """Trains the random forest model.
+
+        Args:
+            body (pd.Series | list[str]): The body of the email.
+            label (pd.Series | list[int]): The label of the email.
+
+        Raises:
+            ValueError: If the body and label are not of the same size.
+        """
+        if isinstance(body, pd.Series):
+            body = body.tolist()
+        if isinstance(label, pd.Series):
+            label = label.tolist()
+
+        if len(body) != len(label):
+            raise ValueError('body and label must be of the same size.')
+
+        # Train the random forest model
+        self.model.fit(body, label)
+
+        print(f'{"="*20} Training Done {"="*20}')
+
+    def predict(
+        self,
+        body: pd.Series | list[str],
+    ):
+        """Predicts the labels of the given data.
+
+        Args:
+            body (pd.Series | list[str]): The body of the email.
+
+        Returns:
+            list[int]: The predictions of the model.
+        """
+        if isinstance(body, pd.Series):
+            body = body.tolist()
+
+        # Make predictions using the trained random forest model
+        predictions = self.model.predict(body)
+
+        if isinstance(predictions, np.ndarray):
+            predictions = predictions.tolist()
+
+        return predictions
+
+    def save_model(
+        self,
+        path: str,
+    ):
+        """Saves the model to the given path.
+
+        Args:
+            path (str): The path to save the model to.
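+
+        Example (illustrative sketch; the save path is hypothetical):
+            >>> model = RandomForestFraudModel(n_estimators=100, criterion='gini')
+            >>> model.train(
+            ...     body=['Urgent: verify your account now', 'Meeting moved to 3pm'],
+            ...     label=[1, 0]
+            ... )
+            >>> model.save_model('/tmp/rf_fraud_model')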
+ """ + + if not os.path.exists(path): + os.makedirs(path, exist_ok=True) + + save_model(self.model, path) diff --git a/notebooks/differential_privacy.ipynb b/notebooks/differential_privacy.ipynb index b3dc825..4bdbaaa 100644 --- a/notebooks/differential_privacy.ipynb +++ b/notebooks/differential_privacy.ipynb @@ -627,18 +627,6 @@ "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.12" } }, "nbformat": 4, diff --git a/pipelines/random_forest_trainer.py b/pipelines/random_forest_trainer.py new file mode 100644 index 0000000..3086e60 --- /dev/null +++ b/pipelines/random_forest_trainer.py @@ -0,0 +1,285 @@ +#usage: python3 -m pipelines.svm_trainer --num_labels 2 --n_estimators 100 --criterion gini --use_aug True +import sys +sys.path.append('..') + +import warnings +warnings.filterwarnings("ignore") + +from datetime import datetime +import pandas as pd +import sys +import os + +from detector.data_loader import LoadEnronData, LoadPhishingData, LoadSocEnggData +from detector.labeler import EnronLabeler, MismatchLabeler +from detector.modeler import RandomForestFraudModel +from detector.preprocessor import Preprocessor +from utils.util_modeler import evaluate_and_log, get_f1_score, Augmentor + +import wandb +import argparse +import configparser +config = configparser.ConfigParser() +config.read( + os.path.join( + os.path.dirname(os.path.abspath(__file__)), + '../config.ini' + ) +) + +def parse_args(): + parser = argparse.ArgumentParser(description="SVM Model Fraud Detector Pipeline") + parser.add_argument("--save_path", "-s", type=str, default='/tmp/', help="Output save path") + parser.add_argument("--num_labels", "-l", type=int, default=2, help="Number of labels") + parser.add_argument("--n_estimators", "-n", type=int, default=100, help="Number of trees in the forest") + parser.add_argument("--criterion", "-c", type=str, default='gini', help="Function to measure the quality of a split") + parser.add_argument("--use_aug", "-u", type=bool, default=False, help="Whether to use data augmentation or not for training data balancing") + return parser.parse_args() + +def load_data(): + if os.path.exists( + os.path.join( + os.path.dirname(__file__), + '../data/fraud_detector_data.csv' + ) + ): + data = pd.read_csv( + os.path.join( + os.path.dirname(__file__), + '../data/fraud_detector_data.csv' + ) + ) + else: + data = { + loader.__name__: loader().__call__() for loader in [LoadEnronData, LoadPhishingData, LoadSocEnggData] + } + return data + +def label_and_preprocess_data(data): + if not os.path.exists( + os.path.join( + os.path.dirname(__file__), + '../data/fraud_detector_data.csv' + ) + ): + # Run Enron Labeler + data['LoadEnronData'] = EnronLabeler(data['LoadEnronData'], needs_preprocessing=True)() + + # Preprocess the other 2 datasets + data['LoadPhishingData']['Body'] = data['LoadPhishingData']['Body'].swifter.apply(Preprocessor()) + data['LoadSocEnggData']['Body'] = data['LoadSocEnggData']['Body'].swifter.apply(Preprocessor()) + + # Concatenate the 3 data sources into 1 + data = pd.concat( + [ + df for df in data.values() + ], + axis=0, + ignore_index=True + ) + + # Run Mismatch Labeler + data = MismatchLabeler(data)() + + data.reset_index(drop=True, inplace=True) + + return data + +def data_split(data): + if not 
+def data_split(data):
+    if not os.path.exists(
+        os.path.join(
+            os.path.dirname(__file__),
+            '../data/fraud_detector_data.csv'
+        )
+    ):
+        # For gold_fraud_set, take the first 500 fraud emails each from Phishing Data and Social Engineering Data
+        gold_fraud = pd.concat(
+            [
+                data[(data['Source'] == 'Phishing Data') & (data['Label'] == 1)].head(500),
+                data[(data['Source'] == 'Social Engineering Data') & (data['Label'] == 1)].head(500)
+            ],
+            axis=0,
+            ignore_index=True
+        )
+        gold_fraud['Split'] = 'Gold Fraud'
+
+        # For sanity_set, take the first 5000 Enron emails with Sender-Type = 'Internal'
+        sanity = data[
+            (data['Sender-Type'] == 'Internal') & (data['Source'] == 'Enron Data')
+        ].head(5000).copy()
+        sanity['Split'] = 'Sanity'
+
+        # For train_set, take all data not in gold_fraud_set or sanity_set
+        train = data[
+            ~data['Mail-ID'].isin(gold_fraud['Mail-ID']) & ~data['Mail-ID'].isin(sanity['Mail-ID'])
+        ].copy()
+        train['Split'] = 'Train'
+
+    else:
+        train = data[data['Split'] == 'Train']
+        gold_fraud = data[data['Split'] == 'Gold Fraud']
+        sanity = data[data['Split'] == 'Sanity']
+
+    return train, sanity, gold_fraud
+
+def train_model(train_data, hyper_params, use_aug=False):
+    model = RandomForestFraudModel(**hyper_params)
+
+    if use_aug:
+        # Oversample the fraud class (Label = 1) to balance the training data
+        augmentor = Augmentor()
+
+        train_body, train_labels = augmentor(
+            train_data['Body'].tolist(),
+            train_data['Label'].tolist(),
+            aug_label=1,
+            num_aug_per_label_1=9,
+            shuffle=True
+        )
+
+        train_data = pd.DataFrame(
+            {
+                'Body': train_body,
+                'Label': train_labels
+            }
+        )
+
+        train_data.drop_duplicates(subset=['Body'], inplace=True)
+        train_data.reset_index(drop=True, inplace=True)
+
+    # Fit the Word2Vec + random forest pipeline
+    model.train(body=train_data['Body'], label=train_data['Label'])
+    return model
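+
+# Evaluation (see test_and_save_model below): the trained pipeline is scored on
+# all three splits; F1 on 'Gold Fraud' tracks how well known fraud is caught,
+# while F1 on 'Sanity' guards against false positives on ordinary internal mail.
+# Misclassified rows from every split are also dumped to a csv for inspection.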
+def test_and_save_model(train_data, sanity_data, gold_fraud_data, save_path):
+    # Uses the module-level `model` and `run` defined in __main__
+    # Define a dictionary to store the f1 scores
+    f1_scores = {}
+
+    os.makedirs(os.path.join(save_path, 'logs'), exist_ok=True)
+
+    # Save the model and logs to the date folder
+    model.save_model(os.path.join(save_path, 'model'))
+
+    train_data['Prediction'] = model.predict(body=train_data['Body'])
+    evaluate_and_log(
+        x=train_data['Body'].tolist(),
+        y_true=train_data['Label'].tolist(),
+        y_pred=train_data['Prediction'].tolist(),
+        filename=os.path.join(save_path, 'logs/train.log'),
+        experiment=run,
+        id=train_data['Mail-ID'].tolist()
+    )
+    f1_scores['train'] = get_f1_score(y_true=train_data['Label'].tolist(), y_pred=train_data['Prediction'].tolist())
+
+    sanity_data['Prediction'] = model.predict(body=sanity_data['Body'])
+    evaluate_and_log(
+        x=sanity_data['Body'].tolist(),
+        y_true=sanity_data['Label'].tolist(),
+        y_pred=sanity_data['Prediction'].tolist(),
+        filename=os.path.join(save_path, 'logs/sanity.log'),
+        experiment=run,
+        id=sanity_data['Mail-ID'].tolist()
+    )
+    f1_scores['sanity'] = get_f1_score(y_true=sanity_data['Label'].tolist(), y_pred=sanity_data['Prediction'].tolist())
+
+    gold_fraud_data['Prediction'] = model.predict(body=gold_fraud_data['Body'])
+    evaluate_and_log(
+        x=gold_fraud_data['Body'].tolist(),
+        y_true=gold_fraud_data['Label'].tolist(),
+        y_pred=gold_fraud_data['Prediction'].tolist(),
+        filename=os.path.join(save_path, 'logs/gold_fraud.log'),
+        experiment=run,
+        id=gold_fraud_data['Mail-ID'].tolist()
+    )
+    f1_scores['gold_fraud'] = get_f1_score(y_true=gold_fraud_data['Label'].tolist(), y_pred=gold_fraud_data['Prediction'].tolist())
+
+    # Save misclassified examples from all three splits into a csv file
+    mismatch_data = pd.concat(
+        [
+            train_data[train_data['Prediction'] != train_data['Label']],
+            sanity_data[sanity_data['Prediction'] != sanity_data['Label']],
+            gold_fraud_data[gold_fraud_data['Prediction'] != gold_fraud_data['Label']]
+        ],
+        axis=0,
+        ignore_index=True
+    )
+
+    mismatch_data.to_csv(os.path.join(save_path, 'logs/mismatch_data.csv'), index=False)
+
+    return f1_scores
+
+def dump_logs_to_wandb(hyper_params, f1_scores, save_path):
+    # Log the hyperparameters and f1 scores to Weights and Biases
+    all_params = {**hyper_params, **f1_scores}
+    run.config.update(all_params)
+
+    # Log the model to Weights and Biases
+    model_path = os.path.join(save_path, 'model')
+    model_artifact = wandb.Artifact("fraud-detector-model", type="model")
+    model_artifact.add_dir(model_path)
+    run.log_artifact(model_artifact)
+
+    # Log the log files to Weights and Biases
+    logs_path = os.path.join(save_path, 'logs')
+    log_artifact = wandb.Artifact("fraud-detector-logs", type="logs")
+    log_artifact.add_dir(logs_path)
+    run.log_artifact(log_artifact)
+
+if __name__ == '__main__':
+    # Parse the arguments
+    args = parse_args()
+
+    # --use_aug arrives as a string; convert it to a bool
+    if args.use_aug.lower() == 'true':
+        args.use_aug = True
+    elif args.use_aug.lower() == 'false':
+        args.use_aug = False
+    else:
+        raise ValueError("Invalid value for use_aug. Please enter True or False.")
+
+    # Define model hyperparameters
+    hyper_params = {
+        'num_labels': args.num_labels,
+        'n_estimators': args.n_estimators,
+        'criterion': args.criterion
+    }
+
+    # Log in to Weights and Biases
+    wandbdict = {
+        'key': os.getenv('WANDB_API_KEY'),
+        'entity': os.getenv('WANDB_ENTITY'),
+        'project': os.getenv('WANDB_PROJECT'),
+    }
+    wandb.login(key=wandbdict['key'])
+    run = wandb.init(project=wandbdict['project'], entity=wandbdict['entity'], config=hyper_params)
+
+    # Get the current date
+    date = datetime.now().strftime("%Y-%m-%d")
+
+    # Create date folder in save path
+    save_path = os.path.join(args.save_path, f'{date}')
+    os.makedirs(save_path, exist_ok=True)
+
+    # Load the data
+    data = load_data()
+
+    # Label and preprocess the data
+    data = label_and_preprocess_data(data)
+
+    # Split the data into train, sanity, and gold_fraud sets
+    train_data, sanity_data, gold_fraud_data = data_split(data)
+
+    # Train the model
+    model = train_model(train_data, hyper_params, use_aug=args.use_aug)
+
+    # Test the model and save it with the logs
+    f1_scores = test_and_save_model(train_data, sanity_data, gold_fraud_data, save_path)
+
+    # Dump the logs to Weights and Biases
+    dump_logs_to_wandb(hyper_params, f1_scores, save_path)
+
+    # Close the Weights and Biases run
+    run.finish()