diff --git a/ethics/differential_privacy.py b/ethics/differential_privacy.py
index 19bf8cc..15fab6a 100644
--- a/ethics/differential_privacy.py
+++ b/ethics/differential_privacy.py
@@ -84,7 +84,8 @@ def train(
accuracies = []
for eps in epsilons:
- self.model.set_params(classifier__epsilon=eps)
+            # Swap the classifier inside `steps`: `named_steps` is a read-only view rebuilt on each access, so assigning into it never reaches the pipeline, and set_params(classifier__epsilon=...) fails inside Word2VecEmbedder.get_params.
+            self.model.steps[-1] = ('classifier', RandomForestClassifier(n_estimators=self.n_estimators, criterion=self.criterion, n_jobs=self.njobs, epsilon=eps))
self.model.fit(body_train, label_train)
accuracy = get_f1_score(label_val, self.model.predict(body_val), average = 'macro')
@@ -106,10 +107,7 @@ def train(
print(f'{"="*20} \n Best Model for Epsilon = {epsilons[np.argmax(accuracies)]} with Validation F1 Score = {np.max(accuracies)} \n {"="*20}')
#Fit model with best epsilon
- self.model = Pipeline([
- ('vectorizer', self.vectorizer),
- ('classifier', RandomForestClassifier(n_estimators=self.n_estimators, epsilon=epsilons[np.argmax(accuracies)], criterion=self.criterion, n_jobs=self.njobs))
- ])
+        self.model.steps[-1] = ('classifier', RandomForestClassifier(n_estimators=self.n_estimators, criterion=self.criterion, n_jobs=self.njobs, epsilon=epsilons[np.argmax(accuracies)]))  # replace via `steps`; `named_steps` is a read-only view
self.model.fit(body, label)
diff --git a/notebooks/rf_differential_privacy.ipynb b/notebooks/rf_differential_privacy.ipynb
index b824488..ae31228 100644
--- a/notebooks/rf_differential_privacy.ipynb
+++ b/notebooks/rf_differential_privacy.ipynb
@@ -9,9 +9,19 @@
},
{
"cell_type": "code",
- "execution_count": 4,
+ "execution_count": 1,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[nltk_data] Downloading package stopwords to\n",
+ "[nltk_data] /common/home/ps1279/nltk_data...\n",
+ "[nltk_data] Package stopwords is already up-to-date!\n"
+ ]
+ }
+ ],
"source": [
"import sys\n",
"sys.path.append('..')\n",
@@ -43,20 +53,14 @@
},
{
"cell_type": "code",
- "execution_count": 5,
+ "execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
- "12/05/2023 21:48:20:ERROR:Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
+ "12/05/2023 23:39:06:ERROR:Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.\n",
"\u001b[34m\u001b[1mwandb\u001b[0m: Currently logged in as: \u001b[33madvaithrao\u001b[0m (\u001b[33mregressors\u001b[0m). Use \u001b[1m`wandb login --relogin`\u001b[0m to force relogin\n",
"\u001b[34m\u001b[1mwandb\u001b[0m: \u001b[33mWARNING\u001b[0m If you're specifying your api key in code, ensure this code is not shared publicly.\n",
"\u001b[34m\u001b[1mwandb\u001b[0m: \u001b[33mWARNING\u001b[0m Consider setting the WANDB_API_KEY environment variable, or running `wandb login` from the command line.\n",
@@ -91,7 +95,7 @@
{
"data": {
"text/html": [
- "Run data is saved locally in /common/home/ps1279/Fraud-Detector/wandb/run-20231205_214822-lnh081ou
"
+ "Run data is saved locally in /common/home/ps1279/Fraud-Detector/wandb/run-20231205_233909-rptqmnfr
"
],
"text/plain": [
""
@@ -103,7 +107,7 @@
{
"data": {
"text/html": [
- "Syncing run autumn-sky-109 to Weights & Biases (docs)
"
+ "Syncing run iconic-breeze-112 to Weights & Biases (docs)
"
],
"text/plain": [
""
@@ -127,7 +131,7 @@
{
"data": {
"text/html": [
- " View run at https://wandb.ai/regressors/Fraud-Detector/runs/lnh081ou"
+ " View run at https://wandb.ai/regressors/Fraud-Detector/runs/rptqmnfr"
],
"text/plain": [
""
@@ -156,7 +160,7 @@
},
{
"cell_type": "code",
- "execution_count": 6,
+ "execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
@@ -172,7 +176,7 @@
},
{
"cell_type": "code",
- "execution_count": 7,
+ "execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
@@ -190,7 +194,7 @@
},
{
"cell_type": "code",
- "execution_count": 10,
+ "execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
@@ -422,13 +426,209 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 6,
"metadata": {},
"outputs": [],
+ "source": [
+ "train_data = pd.read_csv('./data/augmented_train_data.csv')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "os.environ[\"PYTORCH_CUDA_ALLOC_CONF\"] = \"max_split_size_mb:512\"\n",
+ "\n",
+ "import sys\n",
+ "sys.path.append('..')\n",
+ "\n",
+ "import matplotlib.pyplot as plt\n",
+ "\n",
+ "import shutil\n",
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "from sklearn.svm import SVC\n",
+ "from sklearn.feature_extraction.text import TfidfVectorizer\n",
+ "from sklearn.model_selection import train_test_split\n",
+ "from sklearn.metrics import accuracy_score\n",
+ "from sklearn.utils.class_weight import compute_class_weight\n",
+ "from sklearn.pipeline import Pipeline\n",
+ "import torch\n",
+ "from torch import nn\n",
+ "\n",
+ "from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, DistilBertModel\n",
+ "from transformers import AdamW,get_linear_schedule_with_warmup\n",
+ "\n",
+ "from torch.utils.data import DataLoader, TensorDataset\n",
+ "import torch.nn.functional as F\n",
+ "\n",
+ "from mlflow.sklearn import save_model\n",
+ "from scipy.sparse import hstack\n",
+ "\n",
+ "from ethics.base import BaseDistilbertModel\n",
+ "from utils.util_modeler import Word2VecEmbedder, TPSampler, get_f1_score\n",
+ "\n",
+ "from opacus import PrivacyEngine\n",
+ "from opacus.utils.batch_memory_manager import BatchMemoryManager\n",
+ "\n",
+ "from diffprivlib.models.forest import RandomForestClassifier\n",
+ "\n",
+ "\n",
+ "class RandomForestPrivacyModel:\n",
+ " def __init__(\n",
+ " self,\n",
+ " num_labels: int = 2,\n",
+ " n_estimators = 100,\n",
+ " criterion = 'gini',\n",
+ " njobs = -1\n",
+ " ):\n",
+ " self.num_labels = num_labels\n",
+ " self.n_estimators = n_estimators\n",
+ " self.criterion = criterion\n",
+ " self.njobs = njobs\n",
+ "\n",
+ " self.vectorizer = Word2VecEmbedder()\n",
+ " \n",
+ " self.model = Pipeline([\n",
+ " ('vectorizer', self.vectorizer),\n",
+ " ('classifier', RandomForestClassifier(n_estimators=self.n_estimators, criterion=self.criterion, n_jobs=self.njobs))\n",
+ " ])\n",
+ "\n",
+ " def train(\n",
+ " self,\n",
+ " body: pd.Series | list[str],\n",
+ " label: pd.Series | list[int],\n",
+ " wandb: wandb = None,\n",
+ " ):\n",
+    "        \"\"\"Trains the differentially private random forest model.\n",
+ "\n",
+ " Args:\n",
+ " body (pd.Series | list[str]): The body of the email.\n",
+ " label (pd.Series | list[int]): The label of the email.\n",
+ "\n",
+ " Raises:\n",
+ " ValueError: If the body and label are not of the same size.\n",
+ " \"\"\"\n",
+ " if isinstance(body, pd.Series):\n",
+ " body = body.tolist()\n",
+ " if isinstance(label, pd.Series):\n",
+ " label = label.tolist()\n",
+ "\n",
+ " body_train, body_val, label_train, label_val = train_test_split(body, label, test_size=0.2, random_state=42, stratify=label)\n",
+ "\n",
+ " # Train the RF model\n",
+ " epsilons = [1e-8, 1e-2, 1, 7.5, 20]\n",
+ " accuracies = []\n",
+ "\n",
+ " for eps in epsilons:\n",
+    "            # Swap the classifier inside `steps`: `named_steps` is a read-only view rebuilt on each access, so assigning into it never reaches the pipeline.\n",
+    "            self.model.steps[-1] = ('classifier', RandomForestClassifier(n_estimators=self.n_estimators, criterion=self.criterion, n_jobs=self.njobs, epsilon=eps))\n",
+ " self.model.fit(body_train, label_train)\n",
+ "\n",
+ " accuracy = get_f1_score(label_val, self.model.predict(body_val), average = 'macro')\n",
+ " print('********* \\n Epsilon %.2f - Validation F1 Score %.5f \\n *********' % (eps, accuracy))\n",
+ " accuracies.append(accuracy)\n",
+ " \n",
+ " plt.plot(epsilons, accuracies, marker='o')\n",
+ " plt.xscale('log') # Use a logarithmic scale for better visibility\n",
+ " plt.xlabel('Epsilon')\n",
+ " plt.ylabel('F1 Score')\n",
+ " plt.title('F1 Score vs Epsilon')\n",
+ " plt.grid(True)\n",
+ "\n",
+ " plt.savefig(\"rf_dp_accuracy_vs_epsilon_plot.png\")\n",
+ "\n",
+ " # Log the plot to wandb\n",
+ " wandb.log({\"Accuracy vs Epsilon\": plt})\n",
+ " \n",
+ " print(f'{\"=\"*20} \\n Best Model for Epsilon = {epsilons[np.argmax(accuracies)]} with Validation F1 Score = {np.max(accuracies)} \\n {\"=\"*20}')\n",
+ " \n",
+ " #Fit model with best epsilon\n",
+    "        self.model.steps[-1] = ('classifier', RandomForestClassifier(n_estimators=self.n_estimators, criterion=self.criterion, n_jobs=self.njobs, epsilon=epsilons[np.argmax(accuracies)]))  # replace via `steps`; `named_steps` is a read-only view\n",
+ " \n",
+ " self.model.fit(body, label)\n",
+ "\n",
+ " print(f'{\"=\"*20} Training Done {\"=\"*20}')\n",
+ "\n",
+ " def predict(\n",
+ " self,\n",
+ " body: pd.Series | list[str],\n",
+ " ):\n",
+ " \"\"\"Predicts the labels of the given data.\n",
+ "\n",
+ " Args:\n",
+ " body (pd.Series | list[str]): The body of the email.\n",
+ "\n",
+ " Returns:\n",
+ " np.array: The predictions of the model.\n",
+ " \"\"\"\n",
+ " if isinstance(body, pd.Series):\n",
+ " body = body.tolist()\n",
+ "\n",
+    "        # Make predictions using the trained random forest pipeline\n",
+ " predictions = self.model.predict(body)\n",
+ "\n",
+ " if isinstance(predictions, np.ndarray):\n",
+ " predictions = predictions.tolist()\n",
+ "\n",
+ " return predictions\n",
+ "\n",
+ " def save_model(\n",
+ " self,\n",
+ " path: str,\n",
+ " ):\n",
+ " \"\"\"Saves the model to the given path.\n",
+ "\n",
+ " Args:\n",
+ " path (str): The path to save the model to.\n",
+ " \"\"\"\n",
+ "\n",
+ " if not os.path.exists(path):\n",
+ " os.makedirs(path, exist_ok=True)\n",
+ " \n",
+ " save_model(self.model, path)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "model = RandomForestPrivacyModel()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "ename": "AttributeError",
+ "evalue": "'Word2VecEmbedder' object has no attribute 'model_name'",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+ "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)",
+ "\u001b[1;32m/Users/arao/Local/Github/Fraud-Detector/notebooks/rf_differential_privacy.ipynb Cell 17\u001b[0m line \u001b[0;36m2\n\u001b[1;32m 1\u001b[0m \u001b[39m# Call your code that produces output\u001b[39;00m\n\u001b[0;32m----> 2\u001b[0m model\u001b[39m.\u001b[39;49mtrain(train_data[\u001b[39m'\u001b[39;49m\u001b[39mBody\u001b[39;49m\u001b[39m'\u001b[39;49m], train_data[\u001b[39m'\u001b[39;49m\u001b[39mLabel\u001b[39;49m\u001b[39m'\u001b[39;49m], wandb \u001b[39m=\u001b[39;49m run)\n",
+ "File \u001b[0;32m~/Fraud-Detector/ethics/differential_privacy.py:87\u001b[0m, in \u001b[0;36mRandomForestPrivacyModel.train\u001b[0;34m(self, body, label, wandb)\u001b[0m\n\u001b[1;32m 84\u001b[0m accuracies \u001b[39m=\u001b[39m []\n\u001b[1;32m 86\u001b[0m \u001b[39mfor\u001b[39;00m eps \u001b[39min\u001b[39;00m epsilons:\n\u001b[0;32m---> 87\u001b[0m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mmodel\u001b[39m.\u001b[39;49mset_params(classifier__epsilon\u001b[39m=\u001b[39;49meps)\n\u001b[1;32m 88\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mmodel\u001b[39m.\u001b[39mfit(body_train, label_train)\n\u001b[1;32m 90\u001b[0m accuracy \u001b[39m=\u001b[39m get_f1_score(label_val, \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mmodel\u001b[39m.\u001b[39mpredict(body_val), average \u001b[39m=\u001b[39m \u001b[39m'\u001b[39m\u001b[39mmacro\u001b[39m\u001b[39m'\u001b[39m)\n",
+ "File \u001b[0;32m~/.cache/pypoetry/virtualenvs/ethical-fraud-detector-qnRNkJHZ-py3.10/lib/python3.10/site-packages/sklearn/pipeline.py:222\u001b[0m, in \u001b[0;36mPipeline.set_params\u001b[0;34m(self, **kwargs)\u001b[0m\n\u001b[1;32m 203\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mset_params\u001b[39m(\u001b[39mself\u001b[39m, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mkwargs):\n\u001b[1;32m 204\u001b[0m \u001b[39m \u001b[39m\u001b[39m\"\"\"Set the parameters of this estimator.\u001b[39;00m\n\u001b[1;32m 205\u001b[0m \n\u001b[1;32m 206\u001b[0m \u001b[39m Valid parameter keys can be listed with ``get_params()``. Note that\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 220\u001b[0m \u001b[39m Pipeline class instance.\u001b[39;00m\n\u001b[1;32m 221\u001b[0m \u001b[39m \"\"\"\u001b[39;00m\n\u001b[0;32m--> 222\u001b[0m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_set_params(\u001b[39m\"\u001b[39;49m\u001b[39msteps\u001b[39;49m\u001b[39m\"\u001b[39;49m, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n\u001b[1;32m 223\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39m\n",
+ "File \u001b[0;32m~/.cache/pypoetry/virtualenvs/ethical-fraud-detector-qnRNkJHZ-py3.10/lib/python3.10/site-packages/sklearn/utils/metaestimators.py:68\u001b[0m, in \u001b[0;36m_BaseComposition._set_params\u001b[0;34m(self, attr, **params)\u001b[0m\n\u001b[1;32m 65\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_replace_estimator(attr, name, params\u001b[39m.\u001b[39mpop(name))\n\u001b[1;32m 67\u001b[0m \u001b[39m# 3. Step parameters and other initialisation arguments\u001b[39;00m\n\u001b[0;32m---> 68\u001b[0m \u001b[39msuper\u001b[39;49m()\u001b[39m.\u001b[39;49mset_params(\u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mparams)\n\u001b[1;32m 69\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39m\n",
+ "File \u001b[0;32m~/.cache/pypoetry/virtualenvs/ethical-fraud-detector-qnRNkJHZ-py3.10/lib/python3.10/site-packages/sklearn/base.py:223\u001b[0m, in \u001b[0;36mBaseEstimator.set_params\u001b[0;34m(self, **params)\u001b[0m\n\u001b[1;32m 220\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m params:\n\u001b[1;32m 221\u001b[0m \u001b[39m# Simple optimization to gain speed (inspect is slow)\u001b[39;00m\n\u001b[1;32m 222\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39m\n\u001b[0;32m--> 223\u001b[0m valid_params \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mget_params(deep\u001b[39m=\u001b[39;49m\u001b[39mTrue\u001b[39;49;00m)\n\u001b[1;32m 225\u001b[0m nested_params \u001b[39m=\u001b[39m defaultdict(\u001b[39mdict\u001b[39m) \u001b[39m# grouped by prefix\u001b[39;00m\n\u001b[1;32m 226\u001b[0m \u001b[39mfor\u001b[39;00m key, value \u001b[39min\u001b[39;00m params\u001b[39m.\u001b[39mitems():\n",
+ "File \u001b[0;32m~/.cache/pypoetry/virtualenvs/ethical-fraud-detector-qnRNkJHZ-py3.10/lib/python3.10/site-packages/sklearn/pipeline.py:201\u001b[0m, in \u001b[0;36mPipeline.get_params\u001b[0;34m(self, deep)\u001b[0m\n\u001b[1;32m 184\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mget_params\u001b[39m(\u001b[39mself\u001b[39m, deep\u001b[39m=\u001b[39m\u001b[39mTrue\u001b[39;00m):\n\u001b[1;32m 185\u001b[0m \u001b[39m \u001b[39m\u001b[39m\"\"\"Get parameters for this estimator.\u001b[39;00m\n\u001b[1;32m 186\u001b[0m \n\u001b[1;32m 187\u001b[0m \u001b[39m Returns the parameters given in the constructor as well as the\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 199\u001b[0m \u001b[39m Parameter names mapped to their values.\u001b[39;00m\n\u001b[1;32m 200\u001b[0m \u001b[39m \"\"\"\u001b[39;00m\n\u001b[0;32m--> 201\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_get_params(\u001b[39m\"\u001b[39;49m\u001b[39msteps\u001b[39;49m\u001b[39m\"\u001b[39;49m, deep\u001b[39m=\u001b[39;49mdeep)\n",
+ "File \u001b[0;32m~/.cache/pypoetry/virtualenvs/ethical-fraud-detector-qnRNkJHZ-py3.10/lib/python3.10/site-packages/sklearn/utils/metaestimators.py:46\u001b[0m, in \u001b[0;36m_BaseComposition._get_params\u001b[0;34m(self, attr, deep)\u001b[0m\n\u001b[1;32m 44\u001b[0m \u001b[39mfor\u001b[39;00m name, estimator \u001b[39min\u001b[39;00m estimators:\n\u001b[1;32m 45\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mhasattr\u001b[39m(estimator, \u001b[39m\"\u001b[39m\u001b[39mget_params\u001b[39m\u001b[39m\"\u001b[39m):\n\u001b[0;32m---> 46\u001b[0m \u001b[39mfor\u001b[39;00m key, value \u001b[39min\u001b[39;00m estimator\u001b[39m.\u001b[39;49mget_params(deep\u001b[39m=\u001b[39;49m\u001b[39mTrue\u001b[39;49;00m)\u001b[39m.\u001b[39mitems():\n\u001b[1;32m 47\u001b[0m out[\u001b[39m\"\u001b[39m\u001b[39m%s\u001b[39;00m\u001b[39m__\u001b[39m\u001b[39m%s\u001b[39;00m\u001b[39m\"\u001b[39m \u001b[39m%\u001b[39m (name, key)] \u001b[39m=\u001b[39m value\n\u001b[1;32m 48\u001b[0m \u001b[39mreturn\u001b[39;00m out\n",
+ "File \u001b[0;32m~/.cache/pypoetry/virtualenvs/ethical-fraud-detector-qnRNkJHZ-py3.10/lib/python3.10/site-packages/sklearn/base.py:195\u001b[0m, in \u001b[0;36mBaseEstimator.get_params\u001b[0;34m(self, deep)\u001b[0m\n\u001b[1;32m 193\u001b[0m out \u001b[39m=\u001b[39m \u001b[39mdict\u001b[39m()\n\u001b[1;32m 194\u001b[0m \u001b[39mfor\u001b[39;00m key \u001b[39min\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_get_param_names():\n\u001b[0;32m--> 195\u001b[0m value \u001b[39m=\u001b[39m \u001b[39mgetattr\u001b[39;49m(\u001b[39mself\u001b[39;49m, key)\n\u001b[1;32m 196\u001b[0m \u001b[39mif\u001b[39;00m deep \u001b[39mand\u001b[39;00m \u001b[39mhasattr\u001b[39m(value, \u001b[39m\"\u001b[39m\u001b[39mget_params\u001b[39m\u001b[39m\"\u001b[39m) \u001b[39mand\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39misinstance\u001b[39m(value, \u001b[39mtype\u001b[39m):\n\u001b[1;32m 197\u001b[0m deep_items \u001b[39m=\u001b[39m value\u001b[39m.\u001b[39mget_params()\u001b[39m.\u001b[39mitems()\n",
+ "\u001b[0;31mAttributeError\u001b[0m: 'Word2VecEmbedder' object has no attribute 'model_name'"
+ ]
+ }
+ ],
"source": [
"\n",
"# Call your code that produces output\n",
- "model.train(train_data['Body'], train_data['Label'])\n"
+ "model.train(train_data['Body'], train_data['Label'], wandb = run)\n"
]
},
{