From 3ddc6304769d579814405b9c7d112770e5a49245 Mon Sep 17 00:00:00 2001
From: Advaith Rao
Date: Tue, 5 Dec 2023 23:35:20 -0500
Subject: [PATCH] Updated rf differential privacy to use f1 score for validation

---
 ethics/differential_privacy.py               |  27 +-
 notebooks/differential_privacy.ipynb         | 634 -------------------
 notebooks/rf_differential_privacy.ipynb      | 595 +++++++++++++++++
 pipelines/differential_privacy_trainer.py    |   1 +
 pipelines/random_forest_trainer.py           |   2 +-
 pipelines/rf_differential_privacy_trainer.py |   8 +-
 pipelines/roberta_trainer.py                 |   2 +-
 7 files changed, 616 insertions(+), 653 deletions(-)
 delete mode 100644 notebooks/differential_privacy.ipynb
 create mode 100644 notebooks/rf_differential_privacy.ipynb

diff --git a/ethics/differential_privacy.py b/ethics/differential_privacy.py
index ee9ee50..f336c3a 100644
--- a/ethics/differential_privacy.py
+++ b/ethics/differential_privacy.py
@@ -29,7 +29,7 @@ from scipy.sparse import hstack
 
 from ethics.base import BaseDistilbertModel
 
-from utils.util_modeler import Word2VecEmbedder, TPSampler
+from utils.util_modeler import Word2VecEmbedder, TPSampler, get_f1_score
 
 from opacus import PrivacyEngine
 from opacus.utils.batch_memory_manager import BatchMemoryManager
@@ -50,6 +50,11 @@ def __init__(
         self.criterion = criterion
         self.njobs = njobs
 
+        self.model = Pipeline([
+            ('vectorizer', self.vectorizer),
+            ('classifier', RandomForestClassifier(n_estimators=self.n_estimators, criterion=self.criterion, n_jobs=self.njobs))
+        ])
+
         self.vectorizer = Word2VecEmbedder()
 
     def train(
@@ -72,29 +77,25 @@ def train(
         if isinstance(label, pd.Series):
             label = label.tolist()
 
+        body_train, body_val, label_train, label_val = train_test_split(body, label, test_size=0.2, random_state=42, stratify=label)
+
         # Train the RF model
         epsilons = [1e-8, 1e-2, 1, 7.5, 20]
         accuracies = []
 
-        body_train, body_val, label_train, label_val = train_test_split(body, label, test_size=0.2, random_state=42, stratify=label)
-
         for eps in epsilons:
-            self.model = Pipeline([
-                ('vectorizer', self.vectorizer),
-                ('classifier', RandomForestClassifier(n_estimators=self.n_estimators, epsilon=eps, criterion=self.criterion, n_jobs=self.njobs))
-            ])
-
+            self.model.set_params(classifier__epsilon=eps)
             self.model.fit(body_train, label_train)
 
-            accuracy = self.model.score(body_val, label_val)
-            print('********* \n Epsilon %.2f - Accuracy %.5f \n *********' % (eps, accuracy))
+            accuracy = get_f1_score(label_val, self.model.predict(body_val), average = 'macro')
+            print('********* \n Epsilon %.2f - Validation F1 Score %.5f \n *********' % (eps, accuracy))
 
             accuracies.append(accuracy)
 
         plt.plot(epsilons, accuracies, marker='o')
         plt.xscale('log')  # Use a logarithmic scale for better visibility
         plt.xlabel('Epsilon')
-        plt.ylabel('Accuracy')
-        plt.title('Accuracy vs Epsilon')
+        plt.ylabel('F1 Score')
+        plt.title('F1 Score vs Epsilon')
         plt.grid(True)
 
         plt.savefig("rf_dp_accuracy_vs_epsilon_plot.png")
@@ -102,7 +103,7 @@
         # Log the plot to wandb
         wandb.log({"Accuracy vs Epsilon": plt})
 
-        print(f'{"="*20} \n Best Model for Epsilon = {epsilons[np.argmax(accuracies)]} with Accuracy = {np.max(accuracies)} \n {"="*20}')
+        print(f'{"="*20} \n Best Model for Epsilon = {epsilons[np.argmax(accuracies)]} with Validation F1 Score = {np.max(accuracies)} \n {"="*20}')
 
         #Fit model with best epsilon
         self.model = Pipeline([
diff --git a/notebooks/differential_privacy.ipynb b/notebooks/differential_privacy.ipynb
deleted file mode 100644
index 4bdbaaa..0000000
--- a/notebooks/differential_privacy.ipynb
+++ /dev/null
@@
-1,634 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "# !poetry run pip3 install --force-reinstall opacus==0.13.0\n", - "# !poetry update" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "# !git clone https://github.com/woodyx218/opacus_global_clipping.git\n", - "# !mv opacus_global_clipping ./ethics/\n", - "# !pip3 install -e ./ethics/opacus_global_clipping\n", - "# !rm -rf ./ethics/opacus" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "# !ls ./ethics/opacus" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "# !poetry run pip3 install --upgrade opacus" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[nltk_data] Downloading package stopwords to\n", - "[nltk_data] /common/home/ps1279/nltk_data...\n", - "[nltk_data] Package stopwords is already up-to-date!\n" - ] - } - ], - "source": [ - "from ethics.differential_privacy import DistilbertPrivacyModel" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "os.environ[\"PYTORCH_CUDA_ALLOC_CONF\"] = \"max_split_size_mb:512\"\n", - "\n", - "import shutil\n", - "import pandas as pd\n", - "import numpy as np\n", - "from sklearn.svm import SVC\n", - "from sklearn.feature_extraction.text import TfidfVectorizer\n", - "from sklearn.model_selection import train_test_split\n", - "from sklearn.metrics import accuracy_score\n", - "from sklearn.utils.class_weight import compute_class_weight\n", - "from sklearn.pipeline import Pipeline\n", - "import torch\n", - "from torch import nn\n", - "from torch.optim import AdamW\n", - "from torch.utils.data import DataLoader, RandomSampler, SequentialSampler\n", - "\n", - "from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, DistilBertModel\n", - "from transformers import get_linear_schedule_with_warmup\n", - "\n", - "from torch.utils.data import DataLoader, TensorDataset\n", - "import torch.nn.functional as F\n", - "\n", - "import wandb\n", - "from mlflow.sklearn import save_model\n", - "from scipy.sparse import hstack\n", - "\n", - "from utils.util_modeler import Word2VecEmbedder, TPSampler\n", - "\n", - "# import sys\n", - "# sys.path.append(\n", - "# '../ethics'\n", - "# )\n", - "\n", - "from opacus import PrivacyEngine\n", - "# from opacus.utils.uniform_sampler import UniformWithReplacementSampler\n", - "from opacus.utils.batch_memory_manager import BatchMemoryManager\n", - "from opacus.utils.uniform_sampler import UniformWithReplacementSampler\n", - "# from opacus.grad_sample.utils import register_grad_sampler\n", - "\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "class BaseModel(nn.Module):\n", - " def __init__(self, num_labels, model_name='distilbert-base-uncased', device = 'cuda'):\n", - " super(BaseModel, self).__init__()\n", - "\n", - " # Load pre-trained RobertaModel\n", - " self.model = DistilBertModel.from_pretrained(model_name).to(device)\n", - "\n", - " for param in self.model.parameters():\n", - " 
param.requires_grad = False\n", - "\n", - " # Define classification head\n", - " self.classification_head = nn.Sequential(\n", - " nn.Linear(self.model.config.hidden_size, 128),\n", - " nn.ReLU(),\n", - " nn.Linear(128, num_labels)\n", - " )\n", - "\n", - " def forward(self, input_ids, attention_mask, labels=None):\n", - " # Get model outputs\n", - " outputs = self.model(input_ids, attention_mask=attention_mask)\n", - " last_hidden_states = outputs.last_hidden_state\n", - "\n", - " # Apply classification head\n", - " logits = self.classification_head(last_hidden_states[:, 0, :])\n", - "\n", - " return logits" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "metadata": {}, - "outputs": [], - "source": [ - "class DistilbertPrivacyModel:\n", - " def __init__(\n", - " self, \n", - " num_labels=2, \n", - " path='', \n", - " model_name='distilbert-base-uncased', \n", - " learning_rate=2e-5, \n", - " epsilon=1e-8, \n", - " num_epochs=40, \n", - " batch_size=128, \n", - " device=None\n", - " ):\n", - " self.num_labels = num_labels\n", - " self.path = path\n", - " self.model_name = model_name\n", - " self.learning_rate = learning_rate\n", - " self.epsilon = epsilon\n", - " self.num_epochs = num_epochs\n", - " self.batch_size = batch_size\n", - " self.device = device\n", - "\n", - " if not self.device and torch.cuda.is_available():\n", - " self.device = 'cuda'\n", - " elif not self.device:\n", - " self.device = 'cpu'\n", - "\n", - " self.device = torch.device(self.device)\n", - " self.tokenizer = DistilBertTokenizer.from_pretrained(self.model_name)\n", - "\n", - " if self.path != '':\n", - " raise NotImplementedError('Loading model from path is not implemented yet.')\n", - " else:\n", - " self.model = BaseModel(num_labels=self.num_labels, model_name=self.model_name)\n", - " self.model.to(self.device)\n", - " \n", - " self.privacy_engine = PrivacyEngine()\n", - " \n", - " def train(\n", - " self, \n", - " body: pd.Series | list[str], \n", - " label: pd.Series | list[int], \n", - " validation_size=0.2,\n", - " wandb=None\n", - " ):\n", - " \"\"\"Trains the model using the given data.\n", - "\n", - " Args:\n", - " body (pd.Series | list[str]): The body of the email.\n", - " label (pd.Series | list[int]): The label of the email.\n", - " validation_size (float, optional): The size of the validation set. Defaults to 0.2.\n", - " wandb (wandb, optional): The wandb object. Defaults to None. 
If given, logs the training process to wandb.\n", - "\n", - " Raises:\n", - " ValueError: If the body and label are not of the same size.\n", - " \"\"\"\n", - "\n", - " if isinstance(body, pd.Series):\n", - " body = body.tolist()\n", - " if isinstance(label, pd.Series):\n", - " label = label.tolist()\n", - "\n", - " # Tokenize input texts and convert labels to tensors\n", - " input_ids = []\n", - " attention_masks = []\n", - " label_ids = []\n", - "\n", - " for _body, _label in zip(body, label):\n", - " # Tokenize the input text using the Roberta tokenizer\n", - " inputs = self.tokenizer.encode_plus(\n", - " _body,\n", - " add_special_tokens=True,\n", - " max_length=512,\n", - " padding='max_length',\n", - " return_attention_mask=True,\n", - " return_tensors='pt',\n", - " truncation=True\n", - " )\n", - "\n", - " input_ids.append(inputs['input_ids'])\n", - " attention_masks.append(inputs['attention_mask'])\n", - " label_ids.append(torch.tensor(_label)) # Convert the label to a tensor\n", - "\n", - " # Convert lists to tensors\n", - " input_ids = torch.cat(input_ids, dim=0)\n", - " attention_masks = torch.cat(attention_masks, dim=0)\n", - " label_ids = torch.stack(label_ids)\n", - "\n", - " # Split the data into train and validation sets\n", - " dataset = TensorDataset(input_ids, attention_masks, label_ids)\n", - " dataset_size = len(dataset)\n", - " val_size = int(validation_size * dataset_size)\n", - " train_size = dataset_size - val_size\n", - " train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])\n", - "\n", - " # Create data loaders for training and validation data\n", - " train_dataloader = DataLoader(train_dataset, batch_size=self.batch_size, shuffle=True)\n", - " validation_dataloader = DataLoader(val_dataset, batch_size=self.batch_size)\n", - "\n", - " # Initialize the optimizer and learning rate scheduler\n", - " optimizer = AdamW(list(self.model.parameters()),\n", - " lr=self.learning_rate, eps=self.epsilon)\n", - " total_steps = len(train_dataloader) * self.num_epochs\n", - " scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)\n", - "\n", - " MAX_GRAD_NORM = 0.1\n", - "\n", - " # self.model, optimizer, _ = self.privacy_engine.make_private_with_epsilon(\n", - " # module=self.model,\n", - " # optimizer=optimizer,\n", - " # data_loader=train_dataloader,\n", - " # target_delta=1/total_steps,\n", - " # target_epsilon=self.epsilon, \n", - " # epochs=self.num_epochs,\n", - " # max_grad_norm=MAX_GRAD_NORM,\n", - " # )\n", - "\n", - " # print(f\"******** Using sigma={optimizer.noise_multiplier} and C={MAX_GRAD_NORM} ********\")\n", - "\n", - " # Initialize variables for early stopping\n", - " best_validation_loss = float(\"inf\")\n", - " patience = 5 # Number of epochs to wait for improvement\n", - " wait = 0\n", - "\n", - " for epoch in range(self.num_epochs):\n", - " print(f'{\"=\"*20} Epoch {epoch + 1}/{self.num_epochs} {\"=\"*20}')\n", - "\n", - " # Training loop\n", - " self.model.train()\n", - " total_train_loss = 0\n", - "\n", - " # with BatchMemoryManager(\n", - " # data_loader=train_dataloader, \n", - " # max_physical_batch_size=self.batch_size, \n", - " # optimizer=optimizer\n", - " # ) as memory_safe_data_loader:\n", - " # for step, batch in enumerate(memory_safe_data_loader):\n", - "\n", - " for step, batch in enumerate(train_dataloader):\n", - " optimizer.zero_grad()\n", - " \n", - " b_input_ids = batch[0].to(self.device)\n", - " b_input_mask = batch[1].to(self.device)\n", - " 
b_labels = batch[2].to(self.device)\n", - "\n", - " # Forward pass\n", - " logits = self.model(b_input_ids, attention_mask=b_input_mask)\n", - " \n", - " loss = F.cross_entropy(logits, b_labels)\n", - "\n", - " total_train_loss += loss.item()\n", - "\n", - " # Backward pass\n", - " loss.backward()\n", - "\n", - " # torch.nn.utils.clip_grad_norm_(list(self.model.parameters()), 1.0)\n", - "\n", - " # Update the model parameters\n", - " optimizer.step()\n", - "\n", - " # Update the learning rate\n", - " scheduler.step()\n", - "\n", - " if step % 100 == 0 and step != 0:\n", - " avg_train_loss = total_train_loss / 100\n", - " print(f'Step {step}/{len(train_dataloader)} - Average training loss: {avg_train_loss:.4f}')\n", - "\n", - " total_train_loss = 0\n", - "\n", - " avg_train_loss = total_train_loss / len(train_dataloader)\n", - " print(f'Training loss: {avg_train_loss:.4f}')\n", - "\n", - " # Evaluation loop\n", - " self.model.eval()\n", - " total_eval_accuracy = 0\n", - " total_eval_loss = 0\n", - "\n", - " for batch in validation_dataloader:\n", - " b_input_ids = batch[0].to(self.device)\n", - " b_input_mask = batch[1].to(self.device)\n", - " b_labels = batch[2].to(self.device)\n", - "\n", - " with torch.no_grad():\n", - " logits = self.model(b_input_ids, attention_mask=b_input_mask)\n", - " loss = F.cross_entropy(logits, b_labels)\n", - "\n", - " total_eval_loss += loss.item()\n", - " total_eval_accuracy += self.accuracy(logits, b_labels)\n", - "\n", - " total_eval_accuracy += self.accuracy(logits, b_labels)\n", - "\n", - " if len(validation_dataloader) > 0:\n", - " avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)\n", - " print(f'Validation Accuracy: {avg_val_accuracy:.4f}')\n", - "\n", - " avg_val_loss = total_eval_loss / len(validation_dataloader)\n", - " print(f'Validation Loss: {avg_val_loss:.4f}')\n", - "\n", - " # Early stopping check\n", - " if avg_val_loss < best_validation_loss:\n", - " best_validation_loss = avg_val_loss\n", - " wait = 0\n", - " else:\n", - " wait += 1\n", - "\n", - " if wait >= patience:\n", - " print(f'Early stopping after {patience} epochs without improvement.')\n", - " break\n", - " else:\n", - " print('No validation data provided.')\n", - " avg_val_accuracy = 0\n", - " avg_val_loss = 0\n", - "\n", - " if wandb is not None:\n", - " wandb.log({\n", - " 'epoch': epoch, \n", - " 'train_loss': avg_train_loss, \n", - " 'val_loss': avg_val_loss,\n", - " 'val_accuracy': avg_val_accuracy,\n", - " })\n", - "\n", - " def predict(\n", - " self, \n", - " body: pd.Series | list[str]\n", - " ):\n", - " \"\"\"Predicts the labels of the given data.\n", - "\n", - " Args:\n", - " body (pd.Series | list[str]): The body of the email.\n", - "\n", - " Returns:\n", - " np.array: The predictions of the model.\n", - " \"\"\"\n", - "\n", - " # If input_texts is a Pandas Series, convert it to a list\n", - " if isinstance(body, pd.Series):\n", - " body = body.tolist()\n", - "\n", - " input_ids = []\n", - " attention_masks = []\n", - "\n", - " for _body in body:\n", - " inputs = self.tokenizer.encode_plus(\n", - " _body,\n", - " add_special_tokens=True,\n", - " max_length=512,\n", - " padding='max_length',\n", - " return_attention_mask=True,\n", - " return_tensors='pt',\n", - " truncation=True\n", - " )\n", - "\n", - " input_ids.append(inputs['input_ids'])\n", - " attention_masks.append(inputs['attention_mask'])\n", - "\n", - " input_ids = torch.cat(input_ids, dim=0)\n", - " attention_masks = torch.cat(attention_masks, dim=0)\n", - "\n", - " dataset = 
TensorDataset(input_ids, attention_masks)\n", - " dataloader = DataLoader(dataset, batch_size=self.batch_size)\n", - "\n", - " self.model.eval()\n", - " predictions = []\n", - "\n", - " for batch in dataloader:\n", - " b_input_ids = batch[0].to(self.device)\n", - " b_input_mask = batch[1].to(self.device)\n", - "\n", - " with torch.no_grad():\n", - " logits = self.model(b_input_ids, attention_mask=b_input_mask)\n", - "\n", - " logits = logits.detach().cpu().numpy()\n", - "\n", - " # Apply a threshold (e.g., 0.5) to convert logits to class predictions\n", - " class_predictions = np.argmax(logits, axis=1)\n", - " \n", - " predictions.extend(class_predictions.tolist())\n", - "\n", - " return predictions\n", - " \n", - " def save_model(\n", - " self,\n", - " path: str\n", - " ):\n", - " \"\"\"Saves the model to the given path.\n", - "\n", - " Args:\n", - " path (str): The path to save the model to.\n", - " \"\"\"\n", - "\n", - " # Check if the directory exists, and if not, create it\n", - " if not os.path.exists(path):\n", - " os.makedirs(path, exist_ok=True)\n", - "\n", - " # Save the transformer model and the classification head\n", - " self.model.save_pretrained(path)\n", - " torch.save(self.classification_head.state_dict(), os.path.join(path, 'classification_head.pth'))\n", - " \n", - " def accuracy(\n", - " self, \n", - " preds, \n", - " labels\n", - " ):\n", - " \"\"\"Calculates the accuracy of the model.\n", - "\n", - " Args:\n", - " preds (torch.Tensor|numpy.ndarray): The predictions of the model.\n", - " labels (torch.Tensor|numpy.ndarray): The labels of the data.\n", - "\n", - " Returns:\n", - " float: The accuracy of the model.\n", - " \"\"\"\n", - "\n", - " if isinstance(preds, np.ndarray):\n", - " preds = torch.from_numpy(preds)\n", - " if isinstance(labels, np.ndarray):\n", - " labels = torch.from_numpy(labels)\n", - " \n", - " _, preds = torch.max(preds, dim=1)\n", - " \n", - " return torch.tensor(torch.sum(preds == labels).item() / len(preds))" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "metadata": {}, - "outputs": [], - "source": [ - "# data = pd.read_csv('./data/fraud_detector_data.csv')" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "metadata": {}, - "outputs": [], - "source": [ - "data = data[data.Source == 'Phishing Data']\n", - "\n", - "data = pd.concat(\n", - " [data[data.Label == 1].head(100),\n", - " data[data.Label == 0].head(100)]\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/common/home/ps1279/.cache/pypoetry/virtualenvs/ethical-fraud-detector-qnRNkJHZ-py3.10/lib/python3.10/site-packages/opacus/privacy_engine.py:142: UserWarning: Secure RNG turned off. 
This is perfectly fine for experimentation as it allows for much faster training performance, but remember to turn it on and retrain one last time before production with ``secure_mode`` turned on.\n", - " warnings.warn(\n" - ] - } - ], - "source": [ - "# model = DistilbertPrivacyModel(\n", - "# num_epochs=1,\n", - "# epsilon=1e-8,\n", - "# batch_size=2,\n", - "# device='cuda'\n", - "# )\n", - "\n", - "model = DistilbertPrivacyModel(\n", - " num_epochs=1,\n", - " epsilon=1e-8,\n", - " batch_size=2,\n", - " device='cuda'\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "==================== Epoch 1/1 ====================\n", - "Training loss: 0.6902\n", - "Validation Accuracy: 0.7500\n", - "Validation Loss: 0.7111\n" - ] - } - ], - "source": [ - "model.train(\n", - " body=data.Body,\n", - " label=data.Label,\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 36, - "metadata": {}, - "outputs": [ - { - "ename": "ValueError", - "evalue": "Per sample gradient is not initialized. Not updated in backward pass?", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", - "\u001b[1;32m/Users/arao/Local/Github/Fraud-Detector/notebooks/differential_privacy.ipynb Cell 9\u001b[0m line \u001b[0;36m1\n\u001b[0;32m----> 1\u001b[0m e\u001b[39m.\u001b[39;49mstep()\n", - "File \u001b[0;32m~/.cache/pypoetry/virtualenvs/ethical-fraud-detector-qnRNkJHZ-py3.10/lib/python3.10/site-packages/opacus/optimizers/optimizer.py:513\u001b[0m, in \u001b[0;36mDPOptimizer.step\u001b[0;34m(self, closure)\u001b[0m\n\u001b[1;32m 510\u001b[0m \u001b[39mwith\u001b[39;00m torch\u001b[39m.\u001b[39menable_grad():\n\u001b[1;32m 511\u001b[0m closure()\n\u001b[0;32m--> 513\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mpre_step():\n\u001b[1;32m 514\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39moriginal_optimizer\u001b[39m.\u001b[39mstep()\n\u001b[1;32m 515\u001b[0m \u001b[39melse\u001b[39;00m:\n", - "File \u001b[0;32m~/.cache/pypoetry/virtualenvs/ethical-fraud-detector-qnRNkJHZ-py3.10/lib/python3.10/site-packages/opacus/optimizers/optimizer.py:494\u001b[0m, in \u001b[0;36mDPOptimizer.pre_step\u001b[0;34m(self, closure)\u001b[0m\n\u001b[1;32m 483\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mpre_step\u001b[39m(\n\u001b[1;32m 484\u001b[0m \u001b[39mself\u001b[39m, closure: Optional[Callable[[], \u001b[39mfloat\u001b[39m]] \u001b[39m=\u001b[39m \u001b[39mNone\u001b[39;00m\n\u001b[1;32m 485\u001b[0m ) \u001b[39m-\u001b[39m\u001b[39m>\u001b[39m Optional[\u001b[39mfloat\u001b[39m]:\n\u001b[1;32m 486\u001b[0m \u001b[39m \u001b[39m\u001b[39m\"\"\"\u001b[39;00m\n\u001b[1;32m 487\u001b[0m \u001b[39m Perform actions specific to ``DPOptimizer`` before calling\u001b[39;00m\n\u001b[1;32m 488\u001b[0m \u001b[39m underlying ``optimizer.step()``\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 492\u001b[0m \u001b[39m returns the loss. 
Optional for most optimizers.\u001b[39;00m\n\u001b[1;32m 493\u001b[0m \u001b[39m \"\"\"\u001b[39;00m\n\u001b[0;32m--> 494\u001b[0m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mclip_and_accumulate()\n\u001b[1;32m 495\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_check_skip_next_step():\n\u001b[1;32m 496\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_is_last_step_skipped \u001b[39m=\u001b[39m \u001b[39mTrue\u001b[39;00m\n", - "File \u001b[0;32m~/.cache/pypoetry/virtualenvs/ethical-fraud-detector-qnRNkJHZ-py3.10/lib/python3.10/site-packages/opacus/optimizers/optimizer.py:397\u001b[0m, in \u001b[0;36mDPOptimizer.clip_and_accumulate\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 391\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mclip_and_accumulate\u001b[39m(\u001b[39mself\u001b[39m):\n\u001b[1;32m 392\u001b[0m \u001b[39m \u001b[39m\u001b[39m\"\"\"\u001b[39;00m\n\u001b[1;32m 393\u001b[0m \u001b[39m Performs gradient clipping.\u001b[39;00m\n\u001b[1;32m 394\u001b[0m \u001b[39m Stores clipped and aggregated gradients into `p.summed_grad```\u001b[39;00m\n\u001b[1;32m 395\u001b[0m \u001b[39m \"\"\"\u001b[39;00m\n\u001b[0;32m--> 397\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mlen\u001b[39m(\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mgrad_samples[\u001b[39m0\u001b[39m]) \u001b[39m==\u001b[39m \u001b[39m0\u001b[39m:\n\u001b[1;32m 398\u001b[0m \u001b[39m# Empty batch\u001b[39;00m\n\u001b[1;32m 399\u001b[0m per_sample_clip_factor \u001b[39m=\u001b[39m torch\u001b[39m.\u001b[39mzeros((\u001b[39m0\u001b[39m,))\n\u001b[1;32m 400\u001b[0m \u001b[39melse\u001b[39;00m:\n", - "File \u001b[0;32m~/.cache/pypoetry/virtualenvs/ethical-fraud-detector-qnRNkJHZ-py3.10/lib/python3.10/site-packages/opacus/optimizers/optimizer.py:345\u001b[0m, in \u001b[0;36mDPOptimizer.grad_samples\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 343\u001b[0m ret \u001b[39m=\u001b[39m []\n\u001b[1;32m 344\u001b[0m \u001b[39mfor\u001b[39;00m p \u001b[39min\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mparams:\n\u001b[0;32m--> 345\u001b[0m ret\u001b[39m.\u001b[39mappend(\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_get_flat_grad_sample(p))\n\u001b[1;32m 346\u001b[0m \u001b[39mreturn\u001b[39;00m ret\n", - "File \u001b[0;32m~/.cache/pypoetry/virtualenvs/ethical-fraud-detector-qnRNkJHZ-py3.10/lib/python3.10/site-packages/opacus/optimizers/optimizer.py:282\u001b[0m, in \u001b[0;36mDPOptimizer._get_flat_grad_sample\u001b[0;34m(self, p)\u001b[0m\n\u001b[1;32m 278\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mValueError\u001b[39;00m(\n\u001b[1;32m 279\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mPer sample gradient not found. Are you using GradSampleModule?\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m 280\u001b[0m )\n\u001b[1;32m 281\u001b[0m \u001b[39mif\u001b[39;00m p\u001b[39m.\u001b[39mgrad_sample \u001b[39mis\u001b[39;00m \u001b[39mNone\u001b[39;00m:\n\u001b[0;32m--> 282\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mValueError\u001b[39;00m(\n\u001b[1;32m 283\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mPer sample gradient is not initialized. Not updated in backward pass?\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m 284\u001b[0m )\n\u001b[1;32m 285\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39misinstance\u001b[39m(p\u001b[39m.\u001b[39mgrad_sample, torch\u001b[39m.\u001b[39mTensor):\n\u001b[1;32m 286\u001b[0m ret \u001b[39m=\u001b[39m p\u001b[39m.\u001b[39mgrad_sample\n", - "\u001b[0;31mValueError\u001b[0m: Per sample gradient is not initialized. Not updated in backward pass?" 
- ] - } - ], - "source": [ - "e.step()" - ] - }, - { - "cell_type": "code", - "execution_count": 38, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 38, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "f" - ] - }, - { - "cell_type": "code", - "execution_count": 40, - "metadata": {}, - "outputs": [], - "source": [ - "model = BaseModel()\n", - "# list(model.parameters())\n", - "# torch.nn.utils.clip_grad_norm_(list(model.parameters()), 1.0)\n", - "# list(model.parameters())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/notebooks/rf_differential_privacy.ipynb b/notebooks/rf_differential_privacy.ipynb new file mode 100644 index 0000000..b824488 --- /dev/null +++ b/notebooks/rf_differential_privacy.ipynb @@ -0,0 +1,595 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Import Packages" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "sys.path.append('..')\n", + "\n", + "import warnings\n", + "warnings.filterwarnings(\"ignore\")\n", + "\n", + "from datetime import datetime\n", + "import pandas as pd\n", + "import sys\n", + "import os\n", + "\n", + "from detector.data_loader import LoadEnronData, LoadPhishingData, LoadSocEnggData\n", + "from detector.labeler import EnronLabeler, MismatchLabeler\n", + "from ethics.differential_privacy import RandomForestPrivacyModel\n", + "from detector.preprocessor import Preprocessor\n", + "from utils.util_modeler import evaluate_and_log, get_f1_score, Augmentor\n", + "\n", + "import wandb\n", + "import argparse\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Init wandb for model tracking" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "12/05/2023 21:48:20:ERROR:Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[34m\u001b[1mwandb\u001b[0m: Currently logged in as: \u001b[33madvaithrao\u001b[0m (\u001b[33mregressors\u001b[0m). Use \u001b[1m`wandb login --relogin`\u001b[0m to force relogin\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: \u001b[33mWARNING\u001b[0m If you're specifying your api key in code, ensure this code is not shared publicly.\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: \u001b[33mWARNING\u001b[0m Consider setting the WANDB_API_KEY environment variable, or running `wandb login` from the command line.\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: Appending key for api.wandb.ai to your netrc file: /common/home/ps1279/.netrc\n" + ] + }, + { + "data": { + "text/html": [ + "wandb version 0.16.1 is available! 
To upgrade, please run:\n", + " $ pip install wandb --upgrade" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Tracking run with wandb version 0.16.0" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Run data is saved locally in /common/home/ps1279/Fraud-Detector/wandb/run-20231205_214822-lnh081ou" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Syncing run autumn-sky-109 to Weights & Biases (docs)
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + " View project at https://wandb.ai/regressors/Fraud-Detector" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + " View run at https://wandb.ai/regressors/Fraud-Detector/runs/lnh081ou" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "wandbdict = {\n", + " 'key': os.getenv('WANDB_API_KEY'),\n", + " 'entity': os.getenv('WANDB_ENTITY'),\n", + " 'project': os.getenv('WANDB_PROJECT'),\n", + "}\n", + "wandb.login(key=wandbdict['key'])\n", + "run = wandb.init(project=wandbdict['project'], entity=wandbdict['entity'])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load Data" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "data = pd.read_csv('./data/fraud_detector_data.csv')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Data Splits" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "train_data = data[data.Split == 'Train']\n", + "sanity_data = data[data.Split == 'Sanity']\n", + "gold_fraud_data = data[data.Split == 'Gold Fraud']" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load Model" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "hyper_params = {\n", + " 'num_labels': 2,\n", + " 'n_estimators': 100,\n", + " 'criterion': 'gini'\n", + "}\n", + "\n", + "model = RandomForestPrivacyModel()" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Finishing last run (ID:lnh081ou) before initializing another..." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "wandb: WARNING Source type is set to 'repo' but some required information is missing from the environment. A job will not be created from this run. See https://docs.wandb.ai/guides/launch/create-job\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "2f74cfedf89a4095aee568bfa7cbcfd2", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "VBox(children=(Label(value='0.022 MB of 0.022 MB uploaded\\r'), FloatProgress(value=1.0, max=1.0)))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + " View run autumn-sky-109 at: https://wandb.ai/regressors/Fraud-Detector/runs/lnh081ou
Synced 5 W&B file(s), 0 media file(s), 0 artifact file(s) and 1 other file(s)" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Find logs at: ./wandb/run-20231205_214822-lnh081ou/logs" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Successfully finished last run (ID:lnh081ou). Initializing new run:
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "40d6313528d041969c9df9845b906388", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "VBox(children=(Label(value='Waiting for wandb.init()...\\r'), FloatProgress(value=0.011112813154856365, max=1.0…" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "wandb version 0.16.1 is available! To upgrade, please run:\n", + " $ pip install wandb --upgrade" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Tracking run with wandb version 0.16.0" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Run data is saved locally in /common/home/ps1279/Fraud-Detector/wandb/run-20231205_215125-4jzx14ww" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Syncing run generous-energy-110 to Weights & Biases (docs)
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + " View project at https://wandb.ai/regressors/Fraud-Detector" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + " View run at https://wandb.ai/regressors/Fraud-Detector/runs/4jzx14ww" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# run = wandb.init(config=hyper_params)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Run Augmentation" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "augmentor = Augmentor()\n", + "\n", + "train_body, train_labels = augmentor(\n", + " train_data['Body'].tolist(),\n", + " train_data['Label'].tolist(),\n", + " aug_label=1,\n", + " num_aug_per_label_1=9,\n", + " shuffle=True\n", + ")\n", + "\n", + "train_data = pd.DataFrame(\n", + " {\n", + " 'Body': train_body,\n", + " 'Label': train_labels\n", + " }\n", + ")\n", + "\n", + "train_data.drop_duplicates(subset=['Body'], inplace=True)\n", + "train_data.reset_index(drop=True, inplace=True)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "train_data.to_csv('./data/augmented_train_data.csv', index=None)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Train Model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "# Call your code that produces output\n", + "model.train(train_data['Body'], train_data['Label'])\n" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "f1_scores = {}\n", + "os.makedirs('/common/home/ps1279/models/rf_diff_privacy/2023-12-05/rf_diff_privacy/logs', exist_ok=True)\n", + "save_path='/common/home/ps1279/models/rf_diff_privacy/2023-12-05/rf_diff_privacy/'" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Predict on all datasets and generate logs + mismatch_data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "train_data['Prediction'] = model.predict(body=train_data['Body'])\n", + "evaluate_and_log(x=train_data['Body'].tolist(), y_true=train_data['Label'].tolist(), y_pred=train_data['Prediction'].tolist(), filename=os.path.join(save_path,'logs/train.log'), experiment=run, id = train_data['Mail-ID'].tolist())\n", + "f1_scores['train'] = get_f1_score(y_true=train_data['Label'].tolist(), y_pred=train_data['Prediction'].tolist())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sanity_data['Prediction'] = model.predict(body=sanity_data['Body'])\n", + "evaluate_and_log(x=sanity_data['Body'].tolist(), y_true=sanity_data['Label'].tolist(), y_pred=sanity_data['Prediction'].tolist(), filename=os.path.join(save_path,'logs/sanity.log'), experiment=run, id = sanity_data['Mail-ID'].tolist())\n", + "f1_scores['sanity'] = get_f1_score(y_true=sanity_data['Label'].tolist(), y_pred=sanity_data['Prediction'].tolist())\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "gold_fraud_data['Prediction'] = model.predict(body=gold_fraud_data['Body'])\n", + 
"evaluate_and_log(x=gold_fraud_data['Body'].tolist(), y_true=gold_fraud_data['Label'].tolist(), y_pred=gold_fraud_data['Prediction'].tolist(), filename=os.path.join(save_path,'logs/gold_fraud.log'), experiment=run, id = gold_fraud_data['Mail-ID'].tolist())\n", + "f1_scores['gold_fraud'] = get_f1_score(y_true=gold_fraud_data['Label'].tolist(), y_pred=gold_fraud_data['Prediction'].tolist())\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#save mismatch data into a csv file\n", + "mismatch_data = pd.concat(\n", + " [\n", + " train_data[train_data['Prediction'] != train_data['Label']],\n", + " sanity_data[sanity_data['Prediction'] != sanity_data['Label']],\n", + " gold_fraud_data[gold_fraud_data['Prediction'] != gold_fraud_data['Label']]\n", + " ],\n", + " axis=0,\n", + " ignore_index=True\n", + ")\n", + "\n", + "mismatch_data.to_csv(os.path.join(save_path,'logs/mismatch_data.csv'), index=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Save Logs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "all_params = {**hyper_params, **f1_scores}\n", + "run.config.update(all_params)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "logs_path = os.path.join(save_path,'logs')\n", + "log_artifact = wandb.Artifact(\"fraud-detector-logs\", type=\"logs\")\n", + "log_artifact.add_dir(logs_path)\n", + "run.use_artifact(log_artifact)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Save Model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model.save_model(os.path.join(save_path,'model'))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model_path = os.path.join(save_path, 'model')\n", + "model_artifact = wandb.Artifact(\"fraud-detector-model\", type=\"model\")\n", + "model_artifact.add_dir(model_path)\n", + "run.use_artifact(model_artifact)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run.finish()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/pipelines/differential_privacy_trainer.py b/pipelines/differential_privacy_trainer.py index 208bc4f..d98bf80 100644 --- a/pipelines/differential_privacy_trainer.py +++ b/pipelines/differential_privacy_trainer.py @@ -10,6 +10,7 @@ import sys import os + from detector.data_loader import LoadEnronData, LoadPhishingData, LoadSocEnggData from detector.labeler import EnronLabeler, MismatchLabeler from ethics.differential_privacy import DistilbertPrivacyModel, RandomForestPrivacyModel diff --git a/pipelines/random_forest_trainer.py b/pipelines/random_forest_trainer.py index bc575f3..f2ab8e8 100644 --- a/pipelines/random_forest_trainer.py +++ b/pipelines/random_forest_trainer.py @@ -124,7 +124,7 @@ def data_split(data): return train, sanity, gold_fraud def train_model(train_data, hyper_params, 
use_aug=False):
-    run = wandb.init(config=hyper_params)
+    # run = wandb.init(config=hyper_params)
     model = RandomForestFraudModel(**hyper_params)
 
     # #drop train examples with Label=1 and Body less than 4 words
diff --git a/pipelines/rf_differential_privacy_trainer.py b/pipelines/rf_differential_privacy_trainer.py
index 7fe3380..f2f6a73 100644
--- a/pipelines/rf_differential_privacy_trainer.py
+++ b/pipelines/rf_differential_privacy_trainer.py
@@ -123,8 +123,8 @@ def data_split(data):
     return train, sanity, gold_fraud
 
 
-def train_model(train_data, hyper_params, use_aug=False):
-    run = wandb.init(config=hyper_params)
+def train_model(train_data, hyper_params, use_aug=False, wandb = None):
+    # run = wandb.init(config=hyper_params)
     model = RandomForestPrivacyModel(**hyper_params)
 
     # #drop train examples with Label=1 and Body less than 4 words
@@ -153,7 +153,7 @@ def train_model(train_data, hyper_params, use_aug=False):
         train_data.reset_index(drop=True, inplace=True)
 
     # Call your code that produces output
-    model.train(body=train_data['Body'], label=train_data['Label'])
+    model.train(body=train_data['Body'], label=train_data['Label'], wandb = wandb)
 
     return model
 
 def test_and_save_model(train_data, sanity_data, gold_fraud_data, save_path):
@@ -272,7 +272,7 @@ def dump_logs_to_wandb(hyper_params, f1_scores, save_path):
     train_data, sanity_data, gold_fraud_data = data_split(data)
 
     # Train the model
-    model = train_model(train_data, hyper_params, use_aug=args.use_aug)
+    model = train_model(train_data, hyper_params, use_aug=args.use_aug, wandb = run)
 
     # Test the model
     f1_scores = test_and_save_model(train_data, sanity_data, gold_fraud_data, save_path)
diff --git a/pipelines/roberta_trainer.py b/pipelines/roberta_trainer.py
index a326cbc..cacf137 100644
--- a/pipelines/roberta_trainer.py
+++ b/pipelines/roberta_trainer.py
@@ -127,7 +127,7 @@ def data_split(data):
     return train, sanity, gold_fraud
 
 def train_model(train_data, hyper_params, use_aug=False):
-    run = wandb.init(config=hyper_params)
+    # run = wandb.init(config=hyper_params)
     model = RobertaFraudModel(**hyper_params)
 
     if use_aug:
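
The heart of this patch is the epsilon sweep in `RandomForestPrivacyModel.train`: one differentially private random forest is fit per privacy budget, and the budget with the best macro-averaged F1 score on a held-out validation split (rather than raw accuracy) is kept. The sketch below is a minimal, self-contained illustration of that selection loop only; the `make_dp_classifier` factory is a hypothetical stand-in for the repo's DP random forest, and `sklearn.metrics.f1_score` stands in for the repo's `get_f1_score` helper.

```python
import numpy as np
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline


def select_epsilon(body, label, vectorizer, make_dp_classifier,
                   epsilons=(1e-8, 1e-2, 1, 7.5, 20)):
    """Pick the privacy budget whose model scores the best macro F1 on a held-out split.

    `make_dp_classifier(eps)` is a hypothetical factory returning an estimator whose
    training is differentially private at budget `eps` (the patch uses a DP
    RandomForestClassifier exposing an `epsilon` parameter).
    """
    # Same 80/20 stratified split the patch uses for validation.
    body_train, body_val, label_train, label_val = train_test_split(
        body, label, test_size=0.2, random_state=42, stratify=label)

    scores = []
    for eps in epsilons:
        # One fresh pipeline per budget: vectorize the email bodies, then fit the DP forest.
        model = Pipeline([('vectorizer', vectorizer),
                          ('classifier', make_dp_classifier(eps))])
        model.fit(body_train, label_train)
        scores.append(f1_score(label_val, model.predict(body_val), average='macro'))

    best = int(np.argmax(scores))
    return epsilons[best], scores[best]
```

Macro F1 weights the fraud and non-fraud classes equally, which is why selecting on it is more informative than plain accuracy when the validation split is imbalanced.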