diff --git a/ethics/differential_privacy.py b/ethics/differential_privacy.py
index 19bf8cc..15fab6a 100644
--- a/ethics/differential_privacy.py
+++ b/ethics/differential_privacy.py
@@ -84,7 +84,8 @@ def train(
         accuracies = []
 
         for eps in epsilons:
-            self.model.set_params(classifier__epsilon=eps)
+            self.model.named_steps['classifier'] = RandomForestClassifier(n_estimators=self.n_estimators, criterion=self.criterion, n_jobs=self.njobs, epsilon=eps)
+            # self.model.set_params(classifier__epsilon=eps)
             self.model.fit(body_train, label_train)
 
             accuracy = get_f1_score(label_val, self.model.predict(body_val), average = 'macro')
@@ -106,10 +107,7 @@ def train(
         print(f'{"="*20} \n Best Model for Epsilon = {epsilons[np.argmax(accuracies)]} with Validation F1 Score = {np.max(accuracies)} \n {"="*20}')
 
         #Fit model with best epsilon
-        self.model = Pipeline([
-            ('vectorizer', self.vectorizer),
-            ('classifier', RandomForestClassifier(n_estimators=self.n_estimators, epsilon=epsilons[np.argmax(accuracies)], criterion=self.criterion, n_jobs=self.njobs))
-        ])
+        self.model.named_steps['classifier'] = RandomForestClassifier(n_estimators=self.n_estimators, criterion=self.criterion, n_jobs=self.njobs, epsilon=epsilons[np.argmax(accuracies)])
 
         self.model.fit(body, label)
 
diff --git a/notebooks/rf_differential_privacy.ipynb b/notebooks/rf_differential_privacy.ipynb
index b824488..ae31228 100644
--- a/notebooks/rf_differential_privacy.ipynb
+++ b/notebooks/rf_differential_privacy.ipynb
@@ -9,9 +9,19 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 1,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[nltk_data] Downloading package stopwords to\n",
+      "[nltk_data] /common/home/ps1279/nltk_data...\n",
+      "[nltk_data] Package stopwords is already up-to-date!\n"
+     ]
+    }
+   ],
    "source": [
     "import sys\n",
     "sys.path.append('..')\n",
@@ -43,20 +53,14 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 2,
    "metadata": {},
    "outputs": [
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "12/05/2023 21:48:20:ERROR:Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
+      "12/05/2023 23:39:06:ERROR:Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.\n",
       "\u001b[34m\u001b[1mwandb\u001b[0m: Currently logged in as: \u001b[33madvaithrao\u001b[0m (\u001b[33mregressors\u001b[0m). Use \u001b[1m`wandb login --relogin`\u001b[0m to force relogin\n",
       "\u001b[34m\u001b[1mwandb\u001b[0m: \u001b[33mWARNING\u001b[0m If you're specifying your api key in code, ensure this code is not shared publicly.\n",
       "\u001b[34m\u001b[1mwandb\u001b[0m: \u001b[33mWARNING\u001b[0m Consider setting the WANDB_API_KEY environment variable, or running `wandb login` from the command line.\n",
@@ -91,7 +95,7 @@
    {
     "data": {
      "text/html": [
-      "Run data is saved locally in /common/home/ps1279/Fraud-Detector/wandb/run-20231205_214822-lnh081ou"
+      "Run data is saved locally in /common/home/ps1279/Fraud-Detector/wandb/run-20231205_233909-rptqmnfr"
      ],
      "text/plain": [
       ""
@@ -103,7 +107,7 @@
    {
     "data": {
      "text/html": [
-      "Syncing run autumn-sky-109 to Weights & Biases (docs)"
+      "Syncing run iconic-breeze-112 to Weights & Biases (docs)
" ], "text/plain": [ "" @@ -127,7 +131,7 @@ { "data": { "text/html": [ - " View run at https://wandb.ai/regressors/Fraud-Detector/runs/lnh081ou" + " View run at https://wandb.ai/regressors/Fraud-Detector/runs/rptqmnfr" ], "text/plain": [ "" @@ -156,7 +160,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -172,7 +176,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -190,7 +194,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -422,13 +426,209 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": {}, "outputs": [], + "source": [ + "train_data = pd.read_csv('./data/augmented_train_data.csv')" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "os.environ[\"PYTORCH_CUDA_ALLOC_CONF\"] = \"max_split_size_mb:512\"\n", + "\n", + "import sys\n", + "sys.path.append('..')\n", + "\n", + "import matplotlib.pyplot as plt\n", + "\n", + "import shutil\n", + "import pandas as pd\n", + "import numpy as np\n", + "from sklearn.svm import SVC\n", + "from sklearn.feature_extraction.text import TfidfVectorizer\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.metrics import accuracy_score\n", + "from sklearn.utils.class_weight import compute_class_weight\n", + "from sklearn.pipeline import Pipeline\n", + "import torch\n", + "from torch import nn\n", + "\n", + "from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, DistilBertModel\n", + "from transformers import AdamW,get_linear_schedule_with_warmup\n", + "\n", + "from torch.utils.data import DataLoader, TensorDataset\n", + "import torch.nn.functional as F\n", + "\n", + "from mlflow.sklearn import save_model\n", + "from scipy.sparse import hstack\n", + "\n", + "from ethics.base import BaseDistilbertModel\n", + "from utils.util_modeler import Word2VecEmbedder, TPSampler, get_f1_score\n", + "\n", + "from opacus import PrivacyEngine\n", + "from opacus.utils.batch_memory_manager import BatchMemoryManager\n", + "\n", + "from diffprivlib.models.forest import RandomForestClassifier\n", + "\n", + "\n", + "class RandomForestPrivacyModel:\n", + " def __init__(\n", + " self,\n", + " num_labels: int = 2,\n", + " n_estimators = 100,\n", + " criterion = 'gini',\n", + " njobs = -1\n", + " ):\n", + " self.num_labels = num_labels\n", + " self.n_estimators = n_estimators\n", + " self.criterion = criterion\n", + " self.njobs = njobs\n", + "\n", + " self.vectorizer = Word2VecEmbedder()\n", + " \n", + " self.model = Pipeline([\n", + " ('vectorizer', self.vectorizer),\n", + " ('classifier', RandomForestClassifier(n_estimators=self.n_estimators, criterion=self.criterion, n_jobs=self.njobs))\n", + " ])\n", + "\n", + " def train(\n", + " self,\n", + " body: pd.Series | list[str],\n", + " label: pd.Series | list[int],\n", + " wandb: wandb = None,\n", + " ):\n", + " \"\"\"Trains the SVM model.\n", + "\n", + " Args:\n", + " body (pd.Series | list[str]): The body of the email.\n", + " label (pd.Series | list[int]): The label of the email.\n", + "\n", + " Raises:\n", + " ValueError: If the body and label are not of the same size.\n", + " \"\"\"\n", + " if isinstance(body, pd.Series):\n", + " body = body.tolist()\n", + " if isinstance(label, pd.Series):\n", + " label = label.tolist()\n", + "\n", + " 
body_train, body_val, label_train, label_val = train_test_split(body, label, test_size=0.2, random_state=42, stratify=label)\n", + "\n", + " # Train the RF model\n", + " epsilons = [1e-8, 1e-2, 1, 7.5, 20]\n", + " accuracies = []\n", + "\n", + " for eps in epsilons:\n", + " self.model.named_steps['classifier'] = RandomForestClassifier(n_estimators=self.n_estimators, criterion=self.criterion, n_jobs=self.njobs, epsilon=eps)\n", + " # self.model.set_params(classifier__epsilon=eps)\n", + " self.model.fit(body_train, label_train)\n", + "\n", + " accuracy = get_f1_score(label_val, self.model.predict(body_val), average = 'macro')\n", + " print('********* \\n Epsilon %.2f - Validation F1 Score %.5f \\n *********' % (eps, accuracy))\n", + " accuracies.append(accuracy)\n", + " \n", + " plt.plot(epsilons, accuracies, marker='o')\n", + " plt.xscale('log') # Use a logarithmic scale for better visibility\n", + " plt.xlabel('Epsilon')\n", + " plt.ylabel('F1 Score')\n", + " plt.title('F1 Score vs Epsilon')\n", + " plt.grid(True)\n", + "\n", + " plt.savefig(\"rf_dp_accuracy_vs_epsilon_plot.png\")\n", + "\n", + " # Log the plot to wandb\n", + " wandb.log({\"Accuracy vs Epsilon\": plt})\n", + " \n", + " print(f'{\"=\"*20} \\n Best Model for Epsilon = {epsilons[np.argmax(accuracies)]} with Validation F1 Score = {np.max(accuracies)} \\n {\"=\"*20}')\n", + " \n", + " #Fit model with best epsilon\n", + " self.model.named_steps['classifier'] = RandomForestClassifier(n_estimators=self.n_estimators, criterion=self.criterion, n_jobs=self.njobs, epsilon=epsilons[np.argmax(accuracies)])\n", + " \n", + " self.model.fit(body, label)\n", + "\n", + " print(f'{\"=\"*20} Training Done {\"=\"*20}')\n", + "\n", + " def predict(\n", + " self,\n", + " body: pd.Series | list[str],\n", + " ):\n", + " \"\"\"Predicts the labels of the given data.\n", + "\n", + " Args:\n", + " body (pd.Series | list[str]): The body of the email.\n", + "\n", + " Returns:\n", + " np.array: The predictions of the model.\n", + " \"\"\"\n", + " if isinstance(body, pd.Series):\n", + " body = body.tolist()\n", + "\n", + " # Make predictions using the trained SVM model\n", + " predictions = self.model.predict(body)\n", + "\n", + " if isinstance(predictions, np.ndarray):\n", + " predictions = predictions.tolist()\n", + "\n", + " return predictions\n", + "\n", + " def save_model(\n", + " self,\n", + " path: str,\n", + " ):\n", + " \"\"\"Saves the model to the given path.\n", + "\n", + " Args:\n", + " path (str): The path to save the model to.\n", + " \"\"\"\n", + "\n", + " if not os.path.exists(path):\n", + " os.makedirs(path, exist_ok=True)\n", + " \n", + " save_model(self.model, path)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "model = RandomForestPrivacyModel()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "ename": "AttributeError", + "evalue": "'Word2VecEmbedder' object has no attribute 'model_name'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m/Users/arao/Local/Github/Fraud-Detector/notebooks/rf_differential_privacy.ipynb Cell 17\u001b[0m line \u001b[0;36m2\n\u001b[1;32m 1\u001b[0m \u001b[39m# Call your code that produces output\u001b[39;00m\n\u001b[0;32m----> 2\u001b[0m 
model\u001b[39m.\u001b[39;49mtrain(train_data[\u001b[39m'\u001b[39;49m\u001b[39mBody\u001b[39;49m\u001b[39m'\u001b[39;49m], train_data[\u001b[39m'\u001b[39;49m\u001b[39mLabel\u001b[39;49m\u001b[39m'\u001b[39;49m], wandb \u001b[39m=\u001b[39;49m run)\n", + "File \u001b[0;32m~/Fraud-Detector/ethics/differential_privacy.py:87\u001b[0m, in \u001b[0;36mRandomForestPrivacyModel.train\u001b[0;34m(self, body, label, wandb)\u001b[0m\n\u001b[1;32m 84\u001b[0m accuracies \u001b[39m=\u001b[39m []\n\u001b[1;32m 86\u001b[0m \u001b[39mfor\u001b[39;00m eps \u001b[39min\u001b[39;00m epsilons:\n\u001b[0;32m---> 87\u001b[0m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mmodel\u001b[39m.\u001b[39;49mset_params(classifier__epsilon\u001b[39m=\u001b[39;49meps)\n\u001b[1;32m 88\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mmodel\u001b[39m.\u001b[39mfit(body_train, label_train)\n\u001b[1;32m 90\u001b[0m accuracy \u001b[39m=\u001b[39m get_f1_score(label_val, \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mmodel\u001b[39m.\u001b[39mpredict(body_val), average \u001b[39m=\u001b[39m \u001b[39m'\u001b[39m\u001b[39mmacro\u001b[39m\u001b[39m'\u001b[39m)\n", + "File \u001b[0;32m~/.cache/pypoetry/virtualenvs/ethical-fraud-detector-qnRNkJHZ-py3.10/lib/python3.10/site-packages/sklearn/pipeline.py:222\u001b[0m, in \u001b[0;36mPipeline.set_params\u001b[0;34m(self, **kwargs)\u001b[0m\n\u001b[1;32m 203\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mset_params\u001b[39m(\u001b[39mself\u001b[39m, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mkwargs):\n\u001b[1;32m 204\u001b[0m \u001b[39m \u001b[39m\u001b[39m\"\"\"Set the parameters of this estimator.\u001b[39;00m\n\u001b[1;32m 205\u001b[0m \n\u001b[1;32m 206\u001b[0m \u001b[39m Valid parameter keys can be listed with ``get_params()``. Note that\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 220\u001b[0m \u001b[39m Pipeline class instance.\u001b[39;00m\n\u001b[1;32m 221\u001b[0m \u001b[39m \"\"\"\u001b[39;00m\n\u001b[0;32m--> 222\u001b[0m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_set_params(\u001b[39m\"\u001b[39;49m\u001b[39msteps\u001b[39;49m\u001b[39m\"\u001b[39;49m, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n\u001b[1;32m 223\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39m\n", + "File \u001b[0;32m~/.cache/pypoetry/virtualenvs/ethical-fraud-detector-qnRNkJHZ-py3.10/lib/python3.10/site-packages/sklearn/utils/metaestimators.py:68\u001b[0m, in \u001b[0;36m_BaseComposition._set_params\u001b[0;34m(self, attr, **params)\u001b[0m\n\u001b[1;32m 65\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_replace_estimator(attr, name, params\u001b[39m.\u001b[39mpop(name))\n\u001b[1;32m 67\u001b[0m \u001b[39m# 3. 
Step parameters and other initialisation arguments\u001b[39;00m\n\u001b[0;32m---> 68\u001b[0m \u001b[39msuper\u001b[39;49m()\u001b[39m.\u001b[39;49mset_params(\u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mparams)\n\u001b[1;32m 69\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39m\n", + "File \u001b[0;32m~/.cache/pypoetry/virtualenvs/ethical-fraud-detector-qnRNkJHZ-py3.10/lib/python3.10/site-packages/sklearn/base.py:223\u001b[0m, in \u001b[0;36mBaseEstimator.set_params\u001b[0;34m(self, **params)\u001b[0m\n\u001b[1;32m 220\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m params:\n\u001b[1;32m 221\u001b[0m \u001b[39m# Simple optimization to gain speed (inspect is slow)\u001b[39;00m\n\u001b[1;32m 222\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39m\n\u001b[0;32m--> 223\u001b[0m valid_params \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mget_params(deep\u001b[39m=\u001b[39;49m\u001b[39mTrue\u001b[39;49;00m)\n\u001b[1;32m 225\u001b[0m nested_params \u001b[39m=\u001b[39m defaultdict(\u001b[39mdict\u001b[39m) \u001b[39m# grouped by prefix\u001b[39;00m\n\u001b[1;32m 226\u001b[0m \u001b[39mfor\u001b[39;00m key, value \u001b[39min\u001b[39;00m params\u001b[39m.\u001b[39mitems():\n", + "File \u001b[0;32m~/.cache/pypoetry/virtualenvs/ethical-fraud-detector-qnRNkJHZ-py3.10/lib/python3.10/site-packages/sklearn/pipeline.py:201\u001b[0m, in \u001b[0;36mPipeline.get_params\u001b[0;34m(self, deep)\u001b[0m\n\u001b[1;32m 184\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mget_params\u001b[39m(\u001b[39mself\u001b[39m, deep\u001b[39m=\u001b[39m\u001b[39mTrue\u001b[39;00m):\n\u001b[1;32m 185\u001b[0m \u001b[39m \u001b[39m\u001b[39m\"\"\"Get parameters for this estimator.\u001b[39;00m\n\u001b[1;32m 186\u001b[0m \n\u001b[1;32m 187\u001b[0m \u001b[39m Returns the parameters given in the constructor as well as the\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 199\u001b[0m \u001b[39m Parameter names mapped to their values.\u001b[39;00m\n\u001b[1;32m 200\u001b[0m \u001b[39m \"\"\"\u001b[39;00m\n\u001b[0;32m--> 201\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_get_params(\u001b[39m\"\u001b[39;49m\u001b[39msteps\u001b[39;49m\u001b[39m\"\u001b[39;49m, deep\u001b[39m=\u001b[39;49mdeep)\n", + "File \u001b[0;32m~/.cache/pypoetry/virtualenvs/ethical-fraud-detector-qnRNkJHZ-py3.10/lib/python3.10/site-packages/sklearn/utils/metaestimators.py:46\u001b[0m, in \u001b[0;36m_BaseComposition._get_params\u001b[0;34m(self, attr, deep)\u001b[0m\n\u001b[1;32m 44\u001b[0m \u001b[39mfor\u001b[39;00m name, estimator \u001b[39min\u001b[39;00m estimators:\n\u001b[1;32m 45\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mhasattr\u001b[39m(estimator, \u001b[39m\"\u001b[39m\u001b[39mget_params\u001b[39m\u001b[39m\"\u001b[39m):\n\u001b[0;32m---> 46\u001b[0m \u001b[39mfor\u001b[39;00m key, value \u001b[39min\u001b[39;00m estimator\u001b[39m.\u001b[39;49mget_params(deep\u001b[39m=\u001b[39;49m\u001b[39mTrue\u001b[39;49;00m)\u001b[39m.\u001b[39mitems():\n\u001b[1;32m 47\u001b[0m out[\u001b[39m\"\u001b[39m\u001b[39m%s\u001b[39;00m\u001b[39m__\u001b[39m\u001b[39m%s\u001b[39;00m\u001b[39m\"\u001b[39m \u001b[39m%\u001b[39m (name, key)] \u001b[39m=\u001b[39m value\n\u001b[1;32m 48\u001b[0m \u001b[39mreturn\u001b[39;00m out\n", + "File \u001b[0;32m~/.cache/pypoetry/virtualenvs/ethical-fraud-detector-qnRNkJHZ-py3.10/lib/python3.10/site-packages/sklearn/base.py:195\u001b[0m, in \u001b[0;36mBaseEstimator.get_params\u001b[0;34m(self, 
deep)\u001b[0m\n\u001b[1;32m 193\u001b[0m out \u001b[39m=\u001b[39m \u001b[39mdict\u001b[39m()\n\u001b[1;32m 194\u001b[0m \u001b[39mfor\u001b[39;00m key \u001b[39min\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_get_param_names():\n\u001b[0;32m--> 195\u001b[0m value \u001b[39m=\u001b[39m \u001b[39mgetattr\u001b[39;49m(\u001b[39mself\u001b[39;49m, key)\n\u001b[1;32m 196\u001b[0m \u001b[39mif\u001b[39;00m deep \u001b[39mand\u001b[39;00m \u001b[39mhasattr\u001b[39m(value, \u001b[39m\"\u001b[39m\u001b[39mget_params\u001b[39m\u001b[39m\"\u001b[39m) \u001b[39mand\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39misinstance\u001b[39m(value, \u001b[39mtype\u001b[39m):\n\u001b[1;32m 197\u001b[0m deep_items \u001b[39m=\u001b[39m value\u001b[39m.\u001b[39mget_params()\u001b[39m.\u001b[39mitems()\n", + "\u001b[0;31mAttributeError\u001b[0m: 'Word2VecEmbedder' object has no attribute 'model_name'" + ] + } + ], "source": [ "\n", "# Call your code that produces output\n", - "model.train(train_data['Body'], train_data['Label'])\n" + "model.train(train_data['Body'], train_data['Label'], wandb = run)\n" ] }, {
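The AttributeError above is what motivated the change in ethics/differential_privacy.py: `Pipeline.set_params(classifier__epsilon=eps)` calls `get_params(deep=True)` on every pipeline step, and scikit-learn's `BaseEstimator.get_params` resolves each `__init__` parameter with `getattr(self, name)`. A step whose constructor does not store its arguments verbatim as attributes (here `Word2VecEmbedder` and its `model_name`) therefore breaks any nested `set_params` call. The sketch below illustrates that contract with two hypothetical stand-in transformers; it is not the project's `Word2VecEmbedder`.

```python
# Hedged sketch of the scikit-learn estimator contract behind the traceback above.
# Both classes are illustrative stand-ins, not the project's Word2VecEmbedder.
from sklearn.base import BaseEstimator, TransformerMixin


class BrokenEmbedder(BaseEstimator, TransformerMixin):
    """Violates the contract: `model_name` is consumed but never stored verbatim."""

    def __init__(self, model_name="word2vec-google-news-300"):
        self._name = model_name.upper()  # derived value kept, raw argument discarded

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X


class FixedEmbedder(BaseEstimator, TransformerMixin):
    """Follows the contract: every __init__ argument is stored under its own name."""

    def __init__(self, model_name="word2vec-google-news-300"):
        self.model_name = model_name  # raw argument kept; heavy loading belongs in fit()

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X


try:
    BrokenEmbedder().get_params()  # getattr(self, "model_name") -> AttributeError
except AttributeError as err:
    print("broken:", err)

print("fixed:", FixedEmbedder().get_params())  # {'model_name': 'word2vec-google-news-300'}
```

Storing `model_name` on the embedder would make the original `set_params(classifier__epsilon=...)` call usable again; the workaround in this patch side-steps the issue by replacing the classifier step instead.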
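The patch's epsilon sweep can be reproduced in isolation. The sketch below makes several assumptions that are not in the diff: synthetic numeric features stand in for the Word2Vec embeddings, `sklearn.metrics.f1_score` stands in for the project's `get_f1_score` helper, `StandardScaler` stands in for the embedder step, and the classifier step is swapped through `pipeline.steps[-1]` rather than through `named_steps`, because `Pipeline.named_steps` is a read-only view built from `steps` and assigning into it does not alter the pipeline.

```python
# Minimal sketch of the per-epsilon sweep in train(), under the assumptions stated above.
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from diffprivlib.models.forest import RandomForestClassifier  # accepts an `epsilon` budget

rng = np.random.default_rng(42)
X = rng.normal(size=(400, 16))                  # stand-in for embedded email bodies
y = (X[:, 0] + 0.5 * rng.normal(size=400) > 0).astype(int)
X_tr, X_val, y_tr, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

pipeline = Pipeline([
    ('vectorizer', StandardScaler()),           # placeholder for the embedder step
    ('classifier', RandomForestClassifier(n_estimators=100)),
])

epsilons = [1e-8, 1e-2, 1, 7.5, 20]
scores = []
for eps in epsilons:
    # Rebuild the classifier step for each privacy budget instead of routing epsilon
    # through set_params, which would call get_params() on every step.
    pipeline.steps[-1] = ('classifier', RandomForestClassifier(n_estimators=100, epsilon=eps))
    pipeline.fit(X_tr, y_tr)                    # diffprivlib warns that it infers feature bounds from the data
    scores.append(f1_score(y_val, pipeline.predict(X_val), average='macro'))

best_eps = epsilons[int(np.argmax(scores))]     # refit on all data with the best budget
pipeline.steps[-1] = ('classifier', RandomForestClassifier(n_estimators=100, epsilon=best_eps))
pipeline.fit(X, y)
print(f'best epsilon {best_eps}: validation macro-F1 {max(scores):.3f}')
```

Setting the whole step with `pipeline.set_params(classifier=RandomForestClassifier(...))` should also avoid the failure, since whole-step replacement is handled before the nested-parameter path that calls `get_params` on each step.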