From 3ddc6304769d579814405b9c7d112770e5a49245 Mon Sep 17 00:00:00 2001
From: Advaith Rao
Date: Tue, 5 Dec 2023 23:35:20 -0500
Subject: [PATCH] Updated rf differential privacy to use f1 score for validation

---
 ethics/differential_privacy.py               |  27 +-
 notebooks/differential_privacy.ipynb         | 634 -------------------
 notebooks/rf_differential_privacy.ipynb      | 595 +++++++++++++++++
 pipelines/differential_privacy_trainer.py    |   1 +
 pipelines/random_forest_trainer.py           |   2 +-
 pipelines/rf_differential_privacy_trainer.py |   8 +-
 pipelines/roberta_trainer.py                 |   2 +-
 7 files changed, 616 insertions(+), 653 deletions(-)
 delete mode 100644 notebooks/differential_privacy.ipynb
 create mode 100644 notebooks/rf_differential_privacy.ipynb

diff --git a/ethics/differential_privacy.py b/ethics/differential_privacy.py
index ee9ee50..f336c3a 100644
--- a/ethics/differential_privacy.py
+++ b/ethics/differential_privacy.py
@@ -29,7 +29,7 @@ from scipy.sparse import hstack
 
 from ethics.base import BaseDistilbertModel
 
-from utils.util_modeler import Word2VecEmbedder, TPSampler
+from utils.util_modeler import Word2VecEmbedder, TPSampler, get_f1_score
 
 from opacus import PrivacyEngine
 from opacus.utils.batch_memory_manager import BatchMemoryManager
@@ -50,6 +50,11 @@ def __init__(
         self.criterion = criterion
         self.njobs = njobs
 
+        self.model = Pipeline([
+            ('vectorizer', self.vectorizer),
+            ('classifier', RandomForestClassifier(n_estimators=self.n_estimators, criterion=self.criterion, n_jobs=self.njobs))
+        ])
+
         self.vectorizer = Word2VecEmbedder()
 
     def train(
@@ -72,29 +77,25 @@ def train(
         if isinstance(label, pd.Series):
             label = label.tolist()
 
+        body_train, body_val, label_train, label_val = train_test_split(body, label, test_size=0.2, random_state=42, stratify=label)
+
         # Train the RF model
         epsilons = [1e-8, 1e-2, 1, 7.5, 20]
         accuracies = []
 
-        body_train, body_val, label_train, label_val = train_test_split(body, label, test_size=0.2, random_state=42, stratify=label)
-
         for eps in epsilons:
-            self.model = Pipeline([
-                ('vectorizer', self.vectorizer),
-                ('classifier', RandomForestClassifier(n_estimators=self.n_estimators, epsilon=eps, criterion=self.criterion, n_jobs=self.njobs))
-            ])
-
+            self.model.set_params(classifier__epsilon=eps)
             self.model.fit(body_train, label_train)
 
-            accuracy = self.model.score(body_val, label_val)
-            print('********* \n Epsilon %.2f - Accuracy %.5f \n *********' % (eps, accuracy))
+            accuracy = get_f1_score(label_val, self.model.predict(body_val), average = 'macro')
+            print('********* \n Epsilon %.2f - Validation F1 Score %.5f \n *********' % (eps, accuracy))
 
             accuracies.append(accuracy)
 
         plt.plot(epsilons, accuracies, marker='o')
         plt.xscale('log')  # Use a logarithmic scale for better visibility
         plt.xlabel('Epsilon')
-        plt.ylabel('Accuracy')
-        plt.title('Accuracy vs Epsilon')
+        plt.ylabel('F1 Score')
+        plt.title('F1 Score vs Epsilon')
         plt.grid(True)
 
         plt.savefig("rf_dp_accuracy_vs_epsilon_plot.png")
@@ -102,7 +103,7 @@
         # Log the plot to wandb
         wandb.log({"Accuracy vs Epsilon": plt})
 
-        print(f'{"="*20} \n Best Model for Epsilon = {epsilons[np.argmax(accuracies)]} with Accuracy = {np.max(accuracies)} \n {"="*20}')
+        print(f'{"="*20} \n Best Model for Epsilon = {epsilons[np.argmax(accuracies)]} with Validation F1 Score = {np.max(accuracies)} \n {"="*20}')
 
         #Fit model with best epsilon
         self.model = Pipeline([
diff --git a/notebooks/differential_privacy.ipynb b/notebooks/differential_privacy.ipynb
deleted file mode 100644
index 4bdbaaa..0000000
--- a/notebooks/differential_privacy.ipynb
+++ /dev/null
@@
-1,634 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "# !poetry run pip3 install --force-reinstall opacus==0.13.0\n", - "# !poetry update" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "# !git clone https://github.com/woodyx218/opacus_global_clipping.git\n", - "# !mv opacus_global_clipping ./ethics/\n", - "# !pip3 install -e ./ethics/opacus_global_clipping\n", - "# !rm -rf ./ethics/opacus" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "# !ls ./ethics/opacus" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "# !poetry run pip3 install --upgrade opacus" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[nltk_data] Downloading package stopwords to\n", - "[nltk_data] /common/home/ps1279/nltk_data...\n", - "[nltk_data] Package stopwords is already up-to-date!\n" - ] - } - ], - "source": [ - "from ethics.differential_privacy import DistilbertPrivacyModel" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "os.environ[\"PYTORCH_CUDA_ALLOC_CONF\"] = \"max_split_size_mb:512\"\n", - "\n", - "import shutil\n", - "import pandas as pd\n", - "import numpy as np\n", - "from sklearn.svm import SVC\n", - "from sklearn.feature_extraction.text import TfidfVectorizer\n", - "from sklearn.model_selection import train_test_split\n", - "from sklearn.metrics import accuracy_score\n", - "from sklearn.utils.class_weight import compute_class_weight\n", - "from sklearn.pipeline import Pipeline\n", - "import torch\n", - "from torch import nn\n", - "from torch.optim import AdamW\n", - "from torch.utils.data import DataLoader, RandomSampler, SequentialSampler\n", - "\n", - "from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, DistilBertModel\n", - "from transformers import get_linear_schedule_with_warmup\n", - "\n", - "from torch.utils.data import DataLoader, TensorDataset\n", - "import torch.nn.functional as F\n", - "\n", - "import wandb\n", - "from mlflow.sklearn import save_model\n", - "from scipy.sparse import hstack\n", - "\n", - "from utils.util_modeler import Word2VecEmbedder, TPSampler\n", - "\n", - "# import sys\n", - "# sys.path.append(\n", - "# '../ethics'\n", - "# )\n", - "\n", - "from opacus import PrivacyEngine\n", - "# from opacus.utils.uniform_sampler import UniformWithReplacementSampler\n", - "from opacus.utils.batch_memory_manager import BatchMemoryManager\n", - "from opacus.utils.uniform_sampler import UniformWithReplacementSampler\n", - "# from opacus.grad_sample.utils import register_grad_sampler\n", - "\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "class BaseModel(nn.Module):\n", - " def __init__(self, num_labels, model_name='distilbert-base-uncased', device = 'cuda'):\n", - " super(BaseModel, self).__init__()\n", - "\n", - " # Load pre-trained RobertaModel\n", - " self.model = DistilBertModel.from_pretrained(model_name).to(device)\n", - "\n", - " for param in self.model.parameters():\n", - " 
param.requires_grad = False\n", - "\n", - " # Define classification head\n", - " self.classification_head = nn.Sequential(\n", - " nn.Linear(self.model.config.hidden_size, 128),\n", - " nn.ReLU(),\n", - " nn.Linear(128, num_labels)\n", - " )\n", - "\n", - " def forward(self, input_ids, attention_mask, labels=None):\n", - " # Get model outputs\n", - " outputs = self.model(input_ids, attention_mask=attention_mask)\n", - " last_hidden_states = outputs.last_hidden_state\n", - "\n", - " # Apply classification head\n", - " logits = self.classification_head(last_hidden_states[:, 0, :])\n", - "\n", - " return logits" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "metadata": {}, - "outputs": [], - "source": [ - "class DistilbertPrivacyModel:\n", - " def __init__(\n", - " self, \n", - " num_labels=2, \n", - " path='', \n", - " model_name='distilbert-base-uncased', \n", - " learning_rate=2e-5, \n", - " epsilon=1e-8, \n", - " num_epochs=40, \n", - " batch_size=128, \n", - " device=None\n", - " ):\n", - " self.num_labels = num_labels\n", - " self.path = path\n", - " self.model_name = model_name\n", - " self.learning_rate = learning_rate\n", - " self.epsilon = epsilon\n", - " self.num_epochs = num_epochs\n", - " self.batch_size = batch_size\n", - " self.device = device\n", - "\n", - " if not self.device and torch.cuda.is_available():\n", - " self.device = 'cuda'\n", - " elif not self.device:\n", - " self.device = 'cpu'\n", - "\n", - " self.device = torch.device(self.device)\n", - " self.tokenizer = DistilBertTokenizer.from_pretrained(self.model_name)\n", - "\n", - " if self.path != '':\n", - " raise NotImplementedError('Loading model from path is not implemented yet.')\n", - " else:\n", - " self.model = BaseModel(num_labels=self.num_labels, model_name=self.model_name)\n", - " self.model.to(self.device)\n", - " \n", - " self.privacy_engine = PrivacyEngine()\n", - " \n", - " def train(\n", - " self, \n", - " body: pd.Series | list[str], \n", - " label: pd.Series | list[int], \n", - " validation_size=0.2,\n", - " wandb=None\n", - " ):\n", - " \"\"\"Trains the model using the given data.\n", - "\n", - " Args:\n", - " body (pd.Series | list[str]): The body of the email.\n", - " label (pd.Series | list[int]): The label of the email.\n", - " validation_size (float, optional): The size of the validation set. Defaults to 0.2.\n", - " wandb (wandb, optional): The wandb object. Defaults to None. 
If given, logs the training process to wandb.\n", - "\n", - " Raises:\n", - " ValueError: If the body and label are not of the same size.\n", - " \"\"\"\n", - "\n", - " if isinstance(body, pd.Series):\n", - " body = body.tolist()\n", - " if isinstance(label, pd.Series):\n", - " label = label.tolist()\n", - "\n", - " # Tokenize input texts and convert labels to tensors\n", - " input_ids = []\n", - " attention_masks = []\n", - " label_ids = []\n", - "\n", - " for _body, _label in zip(body, label):\n", - " # Tokenize the input text using the Roberta tokenizer\n", - " inputs = self.tokenizer.encode_plus(\n", - " _body,\n", - " add_special_tokens=True,\n", - " max_length=512,\n", - " padding='max_length',\n", - " return_attention_mask=True,\n", - " return_tensors='pt',\n", - " truncation=True\n", - " )\n", - "\n", - " input_ids.append(inputs['input_ids'])\n", - " attention_masks.append(inputs['attention_mask'])\n", - " label_ids.append(torch.tensor(_label)) # Convert the label to a tensor\n", - "\n", - " # Convert lists to tensors\n", - " input_ids = torch.cat(input_ids, dim=0)\n", - " attention_masks = torch.cat(attention_masks, dim=0)\n", - " label_ids = torch.stack(label_ids)\n", - "\n", - " # Split the data into train and validation sets\n", - " dataset = TensorDataset(input_ids, attention_masks, label_ids)\n", - " dataset_size = len(dataset)\n", - " val_size = int(validation_size * dataset_size)\n", - " train_size = dataset_size - val_size\n", - " train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])\n", - "\n", - " # Create data loaders for training and validation data\n", - " train_dataloader = DataLoader(train_dataset, batch_size=self.batch_size, shuffle=True)\n", - " validation_dataloader = DataLoader(val_dataset, batch_size=self.batch_size)\n", - "\n", - " # Initialize the optimizer and learning rate scheduler\n", - " optimizer = AdamW(list(self.model.parameters()),\n", - " lr=self.learning_rate, eps=self.epsilon)\n", - " total_steps = len(train_dataloader) * self.num_epochs\n", - " scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)\n", - "\n", - " MAX_GRAD_NORM = 0.1\n", - "\n", - " # self.model, optimizer, _ = self.privacy_engine.make_private_with_epsilon(\n", - " # module=self.model,\n", - " # optimizer=optimizer,\n", - " # data_loader=train_dataloader,\n", - " # target_delta=1/total_steps,\n", - " # target_epsilon=self.epsilon, \n", - " # epochs=self.num_epochs,\n", - " # max_grad_norm=MAX_GRAD_NORM,\n", - " # )\n", - "\n", - " # print(f\"******** Using sigma={optimizer.noise_multiplier} and C={MAX_GRAD_NORM} ********\")\n", - "\n", - " # Initialize variables for early stopping\n", - " best_validation_loss = float(\"inf\")\n", - " patience = 5 # Number of epochs to wait for improvement\n", - " wait = 0\n", - "\n", - " for epoch in range(self.num_epochs):\n", - " print(f'{\"=\"*20} Epoch {epoch + 1}/{self.num_epochs} {\"=\"*20}')\n", - "\n", - " # Training loop\n", - " self.model.train()\n", - " total_train_loss = 0\n", - "\n", - " # with BatchMemoryManager(\n", - " # data_loader=train_dataloader, \n", - " # max_physical_batch_size=self.batch_size, \n", - " # optimizer=optimizer\n", - " # ) as memory_safe_data_loader:\n", - " # for step, batch in enumerate(memory_safe_data_loader):\n", - "\n", - " for step, batch in enumerate(train_dataloader):\n", - " optimizer.zero_grad()\n", - " \n", - " b_input_ids = batch[0].to(self.device)\n", - " b_input_mask = batch[1].to(self.device)\n", - " 
b_labels = batch[2].to(self.device)\n", - "\n", - " # Forward pass\n", - " logits = self.model(b_input_ids, attention_mask=b_input_mask)\n", - " \n", - " loss = F.cross_entropy(logits, b_labels)\n", - "\n", - " total_train_loss += loss.item()\n", - "\n", - " # Backward pass\n", - " loss.backward()\n", - "\n", - " # torch.nn.utils.clip_grad_norm_(list(self.model.parameters()), 1.0)\n", - "\n", - " # Update the model parameters\n", - " optimizer.step()\n", - "\n", - " # Update the learning rate\n", - " scheduler.step()\n", - "\n", - " if step % 100 == 0 and step != 0:\n", - " avg_train_loss = total_train_loss / 100\n", - " print(f'Step {step}/{len(train_dataloader)} - Average training loss: {avg_train_loss:.4f}')\n", - "\n", - " total_train_loss = 0\n", - "\n", - " avg_train_loss = total_train_loss / len(train_dataloader)\n", - " print(f'Training loss: {avg_train_loss:.4f}')\n", - "\n", - " # Evaluation loop\n", - " self.model.eval()\n", - " total_eval_accuracy = 0\n", - " total_eval_loss = 0\n", - "\n", - " for batch in validation_dataloader:\n", - " b_input_ids = batch[0].to(self.device)\n", - " b_input_mask = batch[1].to(self.device)\n", - " b_labels = batch[2].to(self.device)\n", - "\n", - " with torch.no_grad():\n", - " logits = self.model(b_input_ids, attention_mask=b_input_mask)\n", - " loss = F.cross_entropy(logits, b_labels)\n", - "\n", - " total_eval_loss += loss.item()\n", - " total_eval_accuracy += self.accuracy(logits, b_labels)\n", - "\n", - " total_eval_accuracy += self.accuracy(logits, b_labels)\n", - "\n", - " if len(validation_dataloader) > 0:\n", - " avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)\n", - " print(f'Validation Accuracy: {avg_val_accuracy:.4f}')\n", - "\n", - " avg_val_loss = total_eval_loss / len(validation_dataloader)\n", - " print(f'Validation Loss: {avg_val_loss:.4f}')\n", - "\n", - " # Early stopping check\n", - " if avg_val_loss < best_validation_loss:\n", - " best_validation_loss = avg_val_loss\n", - " wait = 0\n", - " else:\n", - " wait += 1\n", - "\n", - " if wait >= patience:\n", - " print(f'Early stopping after {patience} epochs without improvement.')\n", - " break\n", - " else:\n", - " print('No validation data provided.')\n", - " avg_val_accuracy = 0\n", - " avg_val_loss = 0\n", - "\n", - " if wandb is not None:\n", - " wandb.log({\n", - " 'epoch': epoch, \n", - " 'train_loss': avg_train_loss, \n", - " 'val_loss': avg_val_loss,\n", - " 'val_accuracy': avg_val_accuracy,\n", - " })\n", - "\n", - " def predict(\n", - " self, \n", - " body: pd.Series | list[str]\n", - " ):\n", - " \"\"\"Predicts the labels of the given data.\n", - "\n", - " Args:\n", - " body (pd.Series | list[str]): The body of the email.\n", - "\n", - " Returns:\n", - " np.array: The predictions of the model.\n", - " \"\"\"\n", - "\n", - " # If input_texts is a Pandas Series, convert it to a list\n", - " if isinstance(body, pd.Series):\n", - " body = body.tolist()\n", - "\n", - " input_ids = []\n", - " attention_masks = []\n", - "\n", - " for _body in body:\n", - " inputs = self.tokenizer.encode_plus(\n", - " _body,\n", - " add_special_tokens=True,\n", - " max_length=512,\n", - " padding='max_length',\n", - " return_attention_mask=True,\n", - " return_tensors='pt',\n", - " truncation=True\n", - " )\n", - "\n", - " input_ids.append(inputs['input_ids'])\n", - " attention_masks.append(inputs['attention_mask'])\n", - "\n", - " input_ids = torch.cat(input_ids, dim=0)\n", - " attention_masks = torch.cat(attention_masks, dim=0)\n", - "\n", - " dataset = 
TensorDataset(input_ids, attention_masks)\n", - " dataloader = DataLoader(dataset, batch_size=self.batch_size)\n", - "\n", - " self.model.eval()\n", - " predictions = []\n", - "\n", - " for batch in dataloader:\n", - " b_input_ids = batch[0].to(self.device)\n", - " b_input_mask = batch[1].to(self.device)\n", - "\n", - " with torch.no_grad():\n", - " logits = self.model(b_input_ids, attention_mask=b_input_mask)\n", - "\n", - " logits = logits.detach().cpu().numpy()\n", - "\n", - " # Apply a threshold (e.g., 0.5) to convert logits to class predictions\n", - " class_predictions = np.argmax(logits, axis=1)\n", - " \n", - " predictions.extend(class_predictions.tolist())\n", - "\n", - " return predictions\n", - " \n", - " def save_model(\n", - " self,\n", - " path: str\n", - " ):\n", - " \"\"\"Saves the model to the given path.\n", - "\n", - " Args:\n", - " path (str): The path to save the model to.\n", - " \"\"\"\n", - "\n", - " # Check if the directory exists, and if not, create it\n", - " if not os.path.exists(path):\n", - " os.makedirs(path, exist_ok=True)\n", - "\n", - " # Save the transformer model and the classification head\n", - " self.model.save_pretrained(path)\n", - " torch.save(self.classification_head.state_dict(), os.path.join(path, 'classification_head.pth'))\n", - " \n", - " def accuracy(\n", - " self, \n", - " preds, \n", - " labels\n", - " ):\n", - " \"\"\"Calculates the accuracy of the model.\n", - "\n", - " Args:\n", - " preds (torch.Tensor|numpy.ndarray): The predictions of the model.\n", - " labels (torch.Tensor|numpy.ndarray): The labels of the data.\n", - "\n", - " Returns:\n", - " float: The accuracy of the model.\n", - " \"\"\"\n", - "\n", - " if isinstance(preds, np.ndarray):\n", - " preds = torch.from_numpy(preds)\n", - " if isinstance(labels, np.ndarray):\n", - " labels = torch.from_numpy(labels)\n", - " \n", - " _, preds = torch.max(preds, dim=1)\n", - " \n", - " return torch.tensor(torch.sum(preds == labels).item() / len(preds))" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "metadata": {}, - "outputs": [], - "source": [ - "# data = pd.read_csv('./data/fraud_detector_data.csv')" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "metadata": {}, - "outputs": [], - "source": [ - "data = data[data.Source == 'Phishing Data']\n", - "\n", - "data = pd.concat(\n", - " [data[data.Label == 1].head(100),\n", - " data[data.Label == 0].head(100)]\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/common/home/ps1279/.cache/pypoetry/virtualenvs/ethical-fraud-detector-qnRNkJHZ-py3.10/lib/python3.10/site-packages/opacus/privacy_engine.py:142: UserWarning: Secure RNG turned off. 
This is perfectly fine for experimentation as it allows for much faster training performance, but remember to turn it on and retrain one last time before production with ``secure_mode`` turned on.\n", - " warnings.warn(\n" - ] - } - ], - "source": [ - "# model = DistilbertPrivacyModel(\n", - "# num_epochs=1,\n", - "# epsilon=1e-8,\n", - "# batch_size=2,\n", - "# device='cuda'\n", - "# )\n", - "\n", - "model = DistilbertPrivacyModel(\n", - " num_epochs=1,\n", - " epsilon=1e-8,\n", - " batch_size=2,\n", - " device='cuda'\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "==================== Epoch 1/1 ====================\n", - "Training loss: 0.6902\n", - "Validation Accuracy: 0.7500\n", - "Validation Loss: 0.7111\n" - ] - } - ], - "source": [ - "model.train(\n", - " body=data.Body,\n", - " label=data.Label,\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 36, - "metadata": {}, - "outputs": [ - { - "ename": "ValueError", - "evalue": "Per sample gradient is not initialized. Not updated in backward pass?", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", - "\u001b[1;32m/Users/arao/Local/Github/Fraud-Detector/notebooks/differential_privacy.ipynb Cell 9\u001b[0m line \u001b[0;36m1\n\u001b[0;32m----> 1\u001b[0m e\u001b[39m.\u001b[39;49mstep()\n", - "File \u001b[0;32m~/.cache/pypoetry/virtualenvs/ethical-fraud-detector-qnRNkJHZ-py3.10/lib/python3.10/site-packages/opacus/optimizers/optimizer.py:513\u001b[0m, in \u001b[0;36mDPOptimizer.step\u001b[0;34m(self, closure)\u001b[0m\n\u001b[1;32m 510\u001b[0m \u001b[39mwith\u001b[39;00m torch\u001b[39m.\u001b[39menable_grad():\n\u001b[1;32m 511\u001b[0m closure()\n\u001b[0;32m--> 513\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mpre_step():\n\u001b[1;32m 514\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39moriginal_optimizer\u001b[39m.\u001b[39mstep()\n\u001b[1;32m 515\u001b[0m \u001b[39melse\u001b[39;00m:\n", - "File \u001b[0;32m~/.cache/pypoetry/virtualenvs/ethical-fraud-detector-qnRNkJHZ-py3.10/lib/python3.10/site-packages/opacus/optimizers/optimizer.py:494\u001b[0m, in \u001b[0;36mDPOptimizer.pre_step\u001b[0;34m(self, closure)\u001b[0m\n\u001b[1;32m 483\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mpre_step\u001b[39m(\n\u001b[1;32m 484\u001b[0m \u001b[39mself\u001b[39m, closure: Optional[Callable[[], \u001b[39mfloat\u001b[39m]] \u001b[39m=\u001b[39m \u001b[39mNone\u001b[39;00m\n\u001b[1;32m 485\u001b[0m ) \u001b[39m-\u001b[39m\u001b[39m>\u001b[39m Optional[\u001b[39mfloat\u001b[39m]:\n\u001b[1;32m 486\u001b[0m \u001b[39m \u001b[39m\u001b[39m\"\"\"\u001b[39;00m\n\u001b[1;32m 487\u001b[0m \u001b[39m Perform actions specific to ``DPOptimizer`` before calling\u001b[39;00m\n\u001b[1;32m 488\u001b[0m \u001b[39m underlying ``optimizer.step()``\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 492\u001b[0m \u001b[39m returns the loss. 
Optional for most optimizers.\u001b[39;00m\n\u001b[1;32m 493\u001b[0m \u001b[39m \"\"\"\u001b[39;00m\n\u001b[0;32m--> 494\u001b[0m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mclip_and_accumulate()\n\u001b[1;32m 495\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_check_skip_next_step():\n\u001b[1;32m 496\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_is_last_step_skipped \u001b[39m=\u001b[39m \u001b[39mTrue\u001b[39;00m\n", - "File \u001b[0;32m~/.cache/pypoetry/virtualenvs/ethical-fraud-detector-qnRNkJHZ-py3.10/lib/python3.10/site-packages/opacus/optimizers/optimizer.py:397\u001b[0m, in \u001b[0;36mDPOptimizer.clip_and_accumulate\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 391\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mclip_and_accumulate\u001b[39m(\u001b[39mself\u001b[39m):\n\u001b[1;32m 392\u001b[0m \u001b[39m \u001b[39m\u001b[39m\"\"\"\u001b[39;00m\n\u001b[1;32m 393\u001b[0m \u001b[39m Performs gradient clipping.\u001b[39;00m\n\u001b[1;32m 394\u001b[0m \u001b[39m Stores clipped and aggregated gradients into `p.summed_grad```\u001b[39;00m\n\u001b[1;32m 395\u001b[0m \u001b[39m \"\"\"\u001b[39;00m\n\u001b[0;32m--> 397\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mlen\u001b[39m(\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mgrad_samples[\u001b[39m0\u001b[39m]) \u001b[39m==\u001b[39m \u001b[39m0\u001b[39m:\n\u001b[1;32m 398\u001b[0m \u001b[39m# Empty batch\u001b[39;00m\n\u001b[1;32m 399\u001b[0m per_sample_clip_factor \u001b[39m=\u001b[39m torch\u001b[39m.\u001b[39mzeros((\u001b[39m0\u001b[39m,))\n\u001b[1;32m 400\u001b[0m \u001b[39melse\u001b[39;00m:\n", - "File \u001b[0;32m~/.cache/pypoetry/virtualenvs/ethical-fraud-detector-qnRNkJHZ-py3.10/lib/python3.10/site-packages/opacus/optimizers/optimizer.py:345\u001b[0m, in \u001b[0;36mDPOptimizer.grad_samples\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 343\u001b[0m ret \u001b[39m=\u001b[39m []\n\u001b[1;32m 344\u001b[0m \u001b[39mfor\u001b[39;00m p \u001b[39min\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mparams:\n\u001b[0;32m--> 345\u001b[0m ret\u001b[39m.\u001b[39mappend(\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_get_flat_grad_sample(p))\n\u001b[1;32m 346\u001b[0m \u001b[39mreturn\u001b[39;00m ret\n", - "File \u001b[0;32m~/.cache/pypoetry/virtualenvs/ethical-fraud-detector-qnRNkJHZ-py3.10/lib/python3.10/site-packages/opacus/optimizers/optimizer.py:282\u001b[0m, in \u001b[0;36mDPOptimizer._get_flat_grad_sample\u001b[0;34m(self, p)\u001b[0m\n\u001b[1;32m 278\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mValueError\u001b[39;00m(\n\u001b[1;32m 279\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mPer sample gradient not found. Are you using GradSampleModule?\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m 280\u001b[0m )\n\u001b[1;32m 281\u001b[0m \u001b[39mif\u001b[39;00m p\u001b[39m.\u001b[39mgrad_sample \u001b[39mis\u001b[39;00m \u001b[39mNone\u001b[39;00m:\n\u001b[0;32m--> 282\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mValueError\u001b[39;00m(\n\u001b[1;32m 283\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mPer sample gradient is not initialized. Not updated in backward pass?\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m 284\u001b[0m )\n\u001b[1;32m 285\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39misinstance\u001b[39m(p\u001b[39m.\u001b[39mgrad_sample, torch\u001b[39m.\u001b[39mTensor):\n\u001b[1;32m 286\u001b[0m ret \u001b[39m=\u001b[39m p\u001b[39m.\u001b[39mgrad_sample\n", - "\u001b[0;31mValueError\u001b[0m: Per sample gradient is not initialized. Not updated in backward pass?" 
- ] - } - ], - "source": [ - "e.step()" - ] - }, - { - "cell_type": "code", - "execution_count": 38, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 38, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "f" - ] - }, - { - "cell_type": "code", - "execution_count": 40, - "metadata": {}, - "outputs": [], - "source": [ - "model = BaseModel()\n", - "# list(model.parameters())\n", - "# torch.nn.utils.clip_grad_norm_(list(model.parameters()), 1.0)\n", - "# list(model.parameters())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/notebooks/rf_differential_privacy.ipynb b/notebooks/rf_differential_privacy.ipynb new file mode 100644 index 0000000..b824488 --- /dev/null +++ b/notebooks/rf_differential_privacy.ipynb @@ -0,0 +1,595 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Import Packages" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "sys.path.append('..')\n", + "\n", + "import warnings\n", + "warnings.filterwarnings(\"ignore\")\n", + "\n", + "from datetime import datetime\n", + "import pandas as pd\n", + "import sys\n", + "import os\n", + "\n", + "from detector.data_loader import LoadEnronData, LoadPhishingData, LoadSocEnggData\n", + "from detector.labeler import EnronLabeler, MismatchLabeler\n", + "from ethics.differential_privacy import RandomForestPrivacyModel\n", + "from detector.preprocessor import Preprocessor\n", + "from utils.util_modeler import evaluate_and_log, get_f1_score, Augmentor\n", + "\n", + "import wandb\n", + "import argparse\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Init wandb for model tracking" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "12/05/2023 21:48:20:ERROR:Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[34m\u001b[1mwandb\u001b[0m: Currently logged in as: \u001b[33madvaithrao\u001b[0m (\u001b[33mregressors\u001b[0m). Use \u001b[1m`wandb login --relogin`\u001b[0m to force relogin\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: \u001b[33mWARNING\u001b[0m If you're specifying your api key in code, ensure this code is not shared publicly.\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: \u001b[33mWARNING\u001b[0m Consider setting the WANDB_API_KEY environment variable, or running `wandb login` from the command line.\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: Appending key for api.wandb.ai to your netrc file: /common/home/ps1279/.netrc\n" + ] + }, + { + "data": { + "text/html": [ + "wandb version 0.16.1 is available! 
To upgrade, please run:\n", + " $ pip install wandb --upgrade" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Tracking run with wandb version 0.16.0" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Run data is saved locally in /common/home/ps1279/Fraud-Detector/wandb/run-20231205_214822-lnh081ou" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Syncing run autumn-sky-109 to Weights & Biases (docs)
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + " View project at https://wandb.ai/regressors/Fraud-Detector" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + " View run at https://wandb.ai/regressors/Fraud-Detector/runs/lnh081ou" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "wandbdict = {\n", + " 'key': os.getenv('WANDB_API_KEY'),\n", + " 'entity': os.getenv('WANDB_ENTITY'),\n", + " 'project': os.getenv('WANDB_PROJECT'),\n", + "}\n", + "wandb.login(key=wandbdict['key'])\n", + "run = wandb.init(project=wandbdict['project'], entity=wandbdict['entity'])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load Data" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "data = pd.read_csv('./data/fraud_detector_data.csv')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Data Splits" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "train_data = data[data.Split == 'Train']\n", + "sanity_data = data[data.Split == 'Sanity']\n", + "gold_fraud_data = data[data.Split == 'Gold Fraud']" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load Model" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "hyper_params = {\n", + " 'num_labels': 2,\n", + " 'n_estimators': 100,\n", + " 'criterion': 'gini'\n", + "}\n", + "\n", + "model = RandomForestPrivacyModel()" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Finishing last run (ID:lnh081ou) before initializing another..." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "wandb: WARNING Source type is set to 'repo' but some required information is missing from the environment. A job will not be created from this run. See https://docs.wandb.ai/guides/launch/create-job\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "2f74cfedf89a4095aee568bfa7cbcfd2", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "VBox(children=(Label(value='0.022 MB of 0.022 MB uploaded\\r'), FloatProgress(value=1.0, max=1.0)))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + " View run autumn-sky-109 at: https://wandb.ai/regressors/Fraud-Detector/runs/lnh081ou
Synced 5 W&B file(s), 0 media file(s), 0 artifact file(s) and 1 other file(s)" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Find logs at: ./wandb/run-20231205_214822-lnh081ou/logs" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Successfully finished last run (ID:lnh081ou). Initializing new run:
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "40d6313528d041969c9df9845b906388", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "VBox(children=(Label(value='Waiting for wandb.init()...\\r'), FloatProgress(value=0.011112813154856365, max=1.0…" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "wandb version 0.16.1 is available! To upgrade, please run:\n", + " $ pip install wandb --upgrade" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Tracking run with wandb version 0.16.0" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Run data is saved locally in /common/home/ps1279/Fraud-Detector/wandb/run-20231205_215125-4jzx14ww" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Syncing run generous-energy-110 to Weights & Biases (docs)
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + " View project at https://wandb.ai/regressors/Fraud-Detector" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + " View run at https://wandb.ai/regressors/Fraud-Detector/runs/4jzx14ww" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# run = wandb.init(config=hyper_params)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Run Augmentation" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "augmentor = Augmentor()\n", + "\n", + "train_body, train_labels = augmentor(\n", + " train_data['Body'].tolist(),\n", + " train_data['Label'].tolist(),\n", + " aug_label=1,\n", + " num_aug_per_label_1=9,\n", + " shuffle=True\n", + ")\n", + "\n", + "train_data = pd.DataFrame(\n", + " {\n", + " 'Body': train_body,\n", + " 'Label': train_labels\n", + " }\n", + ")\n", + "\n", + "train_data.drop_duplicates(subset=['Body'], inplace=True)\n", + "train_data.reset_index(drop=True, inplace=True)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "train_data.to_csv('./data/augmented_train_data.csv', index=None)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Train Model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "# Call your code that produces output\n", + "model.train(train_data['Body'], train_data['Label'])\n" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "f1_scores = {}\n", + "os.makedirs('/common/home/ps1279/models/rf_diff_privacy/2023-12-05/rf_diff_privacy/logs', exist_ok=True)\n", + "save_path='/common/home/ps1279/models/rf_diff_privacy/2023-12-05/rf_diff_privacy/'" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Predict on all datasets and generate logs + mismatch_data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "train_data['Prediction'] = model.predict(body=train_data['Body'])\n", + "evaluate_and_log(x=train_data['Body'].tolist(), y_true=train_data['Label'].tolist(), y_pred=train_data['Prediction'].tolist(), filename=os.path.join(save_path,'logs/train.log'), experiment=run, id = train_data['Mail-ID'].tolist())\n", + "f1_scores['train'] = get_f1_score(y_true=train_data['Label'].tolist(), y_pred=train_data['Prediction'].tolist())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sanity_data['Prediction'] = model.predict(body=sanity_data['Body'])\n", + "evaluate_and_log(x=sanity_data['Body'].tolist(), y_true=sanity_data['Label'].tolist(), y_pred=sanity_data['Prediction'].tolist(), filename=os.path.join(save_path,'logs/sanity.log'), experiment=run, id = sanity_data['Mail-ID'].tolist())\n", + "f1_scores['sanity'] = get_f1_score(y_true=sanity_data['Label'].tolist(), y_pred=sanity_data['Prediction'].tolist())\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "gold_fraud_data['Prediction'] = model.predict(body=gold_fraud_data['Body'])\n", + 
"evaluate_and_log(x=gold_fraud_data['Body'].tolist(), y_true=gold_fraud_data['Label'].tolist(), y_pred=gold_fraud_data['Prediction'].tolist(), filename=os.path.join(save_path,'logs/gold_fraud.log'), experiment=run, id = gold_fraud_data['Mail-ID'].tolist())\n", + "f1_scores['gold_fraud'] = get_f1_score(y_true=gold_fraud_data['Label'].tolist(), y_pred=gold_fraud_data['Prediction'].tolist())\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#save mismatch data into a csv file\n", + "mismatch_data = pd.concat(\n", + " [\n", + " train_data[train_data['Prediction'] != train_data['Label']],\n", + " sanity_data[sanity_data['Prediction'] != sanity_data['Label']],\n", + " gold_fraud_data[gold_fraud_data['Prediction'] != gold_fraud_data['Label']]\n", + " ],\n", + " axis=0,\n", + " ignore_index=True\n", + ")\n", + "\n", + "mismatch_data.to_csv(os.path.join(save_path,'logs/mismatch_data.csv'), index=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Save Logs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "all_params = {**hyper_params, **f1_scores}\n", + "run.config.update(all_params)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "logs_path = os.path.join(save_path,'logs')\n", + "log_artifact = wandb.Artifact(\"fraud-detector-logs\", type=\"logs\")\n", + "log_artifact.add_dir(logs_path)\n", + "run.use_artifact(log_artifact)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Save Model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model.save_model(os.path.join(save_path,'model'))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model_path = os.path.join(save_path, 'model')\n", + "model_artifact = wandb.Artifact(\"fraud-detector-model\", type=\"model\")\n", + "model_artifact.add_dir(model_path)\n", + "run.use_artifact(model_artifact)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run.finish()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/pipelines/differential_privacy_trainer.py b/pipelines/differential_privacy_trainer.py index 208bc4f..d98bf80 100644 --- a/pipelines/differential_privacy_trainer.py +++ b/pipelines/differential_privacy_trainer.py @@ -10,6 +10,7 @@ import sys import os + from detector.data_loader import LoadEnronData, LoadPhishingData, LoadSocEnggData from detector.labeler import EnronLabeler, MismatchLabeler from ethics.differential_privacy import DistilbertPrivacyModel, RandomForestPrivacyModel diff --git a/pipelines/random_forest_trainer.py b/pipelines/random_forest_trainer.py index bc575f3..f2ab8e8 100644 --- a/pipelines/random_forest_trainer.py +++ b/pipelines/random_forest_trainer.py @@ -124,7 +124,7 @@ def data_split(data): return train, sanity, gold_fraud def train_model(train_data, hyper_params, 
use_aug=False):
-    run = wandb.init(config=hyper_params)
+    # run = wandb.init(config=hyper_params)
     model = RandomForestFraudModel(**hyper_params)
 
     # #drop train examples with Label=1 and Body less than 4 words
diff --git a/pipelines/rf_differential_privacy_trainer.py b/pipelines/rf_differential_privacy_trainer.py
index 7fe3380..f2f6a73 100644
--- a/pipelines/rf_differential_privacy_trainer.py
+++ b/pipelines/rf_differential_privacy_trainer.py
@@ -123,8 +123,8 @@ def data_split(data):
     return train, sanity, gold_fraud
 
 
-def train_model(train_data, hyper_params, use_aug=False):
-    run = wandb.init(config=hyper_params)
+def train_model(train_data, hyper_params, use_aug=False, wandb = None):
+    # run = wandb.init(config=hyper_params)
     model = RandomForestPrivacyModel(**hyper_params)
 
     # #drop train examples with Label=1 and Body less than 4 words
@@ -153,7 +153,7 @@ def train_model(train_data, hyper_params, use_aug=False):
         train_data.reset_index(drop=True, inplace=True)
 
     # Call your code that produces output
-    model.train(body=train_data['Body'], label=train_data['Label'])
+    model.train(body=train_data['Body'], label=train_data['Label'], wandb = wandb)
 
     return model
 
 def test_and_save_model(train_data, sanity_data, gold_fraud_data, save_path):
@@ -272,7 +272,7 @@ def dump_logs_to_wandb(hyper_params, f1_scores, save_path):
     train_data, sanity_data, gold_fraud_data = data_split(data)
 
     # Train the model
-    model = train_model(train_data, hyper_params, use_aug=args.use_aug)
+    model = train_model(train_data, hyper_params, use_aug=args.use_aug, wandb = run)
 
     # Test the model
     f1_scores = test_and_save_model(train_data, sanity_data, gold_fraud_data, save_path)
diff --git a/pipelines/roberta_trainer.py b/pipelines/roberta_trainer.py
index a326cbc..cacf137 100644
--- a/pipelines/roberta_trainer.py
+++ b/pipelines/roberta_trainer.py
@@ -127,7 +127,7 @@ def data_split(data):
     return train, sanity, gold_fraud
 
 def train_model(train_data, hyper_params, use_aug=False):
-    run = wandb.init(config=hyper_params)
+    # run = wandb.init(config=hyper_params)
     model = RobertaFraudModel(**hyper_params)
 
     if use_aug:
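
The heart of this patch is the epsilon sweep in `RandomForestPrivacyModel.train`: one differentially private random forest is fit per privacy budget, and the budget with the best macro-averaged F1 score on a held-out validation split (rather than raw accuracy) is kept. The sketch below is a minimal, self-contained illustration of that selection loop only; the `make_dp_classifier` factory is a hypothetical stand-in for the repo's DP random forest, and `sklearn.metrics.f1_score` stands in for the repo's `get_f1_score` helper.

```python
import numpy as np
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline


def select_epsilon(body, label, vectorizer, make_dp_classifier,
                   epsilons=(1e-8, 1e-2, 1, 7.5, 20)):
    """Pick the privacy budget whose model scores the best macro F1 on a held-out split.

    `make_dp_classifier(eps)` is a hypothetical factory returning an estimator whose
    training is differentially private at budget `eps` (the patch uses a DP
    RandomForestClassifier exposing an `epsilon` parameter).
    """
    # Same 80/20 stratified split the patch uses for validation.
    body_train, body_val, label_train, label_val = train_test_split(
        body, label, test_size=0.2, random_state=42, stratify=label)

    scores = []
    for eps in epsilons:
        # One fresh pipeline per budget: vectorize the email bodies, then fit the DP forest.
        model = Pipeline([('vectorizer', vectorizer),
                          ('classifier', make_dp_classifier(eps))])
        model.fit(body_train, label_train)
        scores.append(f1_score(label_val, model.predict(body_val), average='macro'))

    best = int(np.argmax(scores))
    return epsilons[best], scores[best]
```

Macro F1 weights the fraud and non-fraud classes equally, which is why selecting on it is more informative than plain accuracy when the validation split is imbalanced.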