diff --git a/notebooks/homomorphic (1).ipynb b/notebooks/homomorphic (1).ipynb deleted file mode 100644 index aa0b71e..0000000 --- a/notebooks/homomorphic (1).ipynb +++ /dev/null @@ -1,705 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "source": [ - "# !pip3 install concrete-ml" - ], - "metadata": { - "id": "rQ53bof5WlEL" - }, - "execution_count": 2, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "# !pip3 install nlpaug\n", - "# !pip3 install wandb" - ], - "metadata": { - "id": "u2YAx99DY1JG" - }, - "execution_count": 11, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "import gensim.downloader\n", - "import nltk\n", - "from nltk.tokenize import RegexpTokenizer\n", - "from nltk.corpus import stopwords\n", - "import string\n", - "import wandb\n", - "# from torch.utils.data import Sampler\n", - "from sklearn.utils.class_weight import compute_sample_weight\n", - "from sklearn.base import BaseEstimator, TransformerMixin\n", - "from sklearn.utils import shuffle as shuffler\n", - "import random\n", - "\n", - "import nlpaug.augmenter.word as naw" - ], - "metadata": { - "id": "x_hMPWSvXYmc" - }, - "execution_count": 3, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "def get_f1_score(\n", - " y_true: list[int],\n", - " y_pred: list[int],\n", - " average: str = 'weighted'\n", - " ):\n", - " \"\"\"Returns the F1 score.\n", - "\n", - " Args:\n", - " y_true (list[int]): The true labels.\n", - " y_pred (list[int]): The predicted labels.\n", - " average (str, optional): The averaging method. Defaults to 'weighted'.\n", - "\n", - " Returns:\n", - " float: The F1 score.\n", - " \"\"\"\n", - "\n", - " return f1_score(y_true, y_pred, average='weighted')\n", - "def calculate_document_embedding(doc, model, tokenizer, embed_size):\n", - " \"\"\"Calculates the document embedding for the given document.\n", - "\n", - " Utility function for below class - Word2VecEmbedder\n", - "\n", - " Args:\n", - " doc (str): The document.\n", - " model (gensim.models.keyedvectors.Word2VecKeyedVectors): The Word2Vec model.\n", - " tokenizer (nltk.tokenize.regexp.RegexpTokenizer): The tokenizer.\n", - " embed_size (int): The embedding size.\n", - "\n", - " Returns:\n", - " np.ndarray: The document embedding.\n", - " \"\"\"\n", - "\n", - " doc_embed = np.zeros(embed_size)\n", - " words = tokenizer.tokenize(doc)\n", - " stopset = stopwords.words('english') + list(string.punctuation)\n", - "\n", - " #we lowercase the words specifically for OOV embeddings to be same for same words different case\n", - " words = [word.lower() for word in words]\n", - " words = [word for word in words if word not in stopset]\n", - "\n", - " word_count = 0\n", - " for word in words:\n", - " if word in model:\n", - " doc_embed += model[word]\n", - " word_count += 1\n", - "\n", - " return doc_embed / word_count if word_count != 0 else doc_embed\n", - "def evaluate_and_log(\n", - " x: list[str],\n", - " y_true: list[int],\n", - " y_pred: list[int],\n", - " filename: str,\n", - " experiment: wandb = None,\n", - " id: list[str] = None\n", - " ):\n", - " \"\"\"Evaluates the model's performance and logs the results.\n", - "\n", - " Args:\n", - " x (list[str]): The texts used for evaluation.\n", - " y_true (list[int]): The actual labels.\n", - " y_pred (list[int]): The predicted labels.\n", - " filename (str): The name of the log file.\n", - " \"\"\"\n", - "\n", - " if id is None:\n", - " id = [str(i) for i in range(len(x))]\n", - "\n", - " if len(x) != len(y_true) or len(x) != len(y_pred):\n", - " raise 
ValueError(\"Input lists (x, y_true, and y_pred) must have the same length.\")\n", - "\n", - " # Calculate the classification report and confusion matrix\n", - " class_report, conf_matrix = get_classification_report_confusion_matrix(y_true, y_pred)\n", - "\n", - " # Find mismatched examples -> indices from y_pred and y_true where they are not the same\n", - " mismatched_indices = np.where(np.array(y_true) != np.array(y_pred))[0]\n", - " mismatched_examples = []\n", - "\n", - " if experiment is not None:\n", - " table = wandb.Table(columns=[\"Actual\", \"Predicted\", \"Text\"])\n", - "\n", - " for i in mismatched_indices:\n", - " # Format the mismatched example in a code block\n", - " mismatched_example = f\"\\nMail ID: {id[i]}\\nActual: {y_true[i]}\\nPredicted: {y_pred[i]}\\n\\nText: {x[i]}\\n\\n\"\n", - " mismatched_examples.append(mismatched_example)\n", - "\n", - " if experiment is not None:\n", - " table.add_data(y_true[i], y_pred[i], x[i])\n", - "\n", - " # Format the results for logging\n", - " log_content = f\"---------Classification Report---------\\n{classification_report(y_true, y_pred)}\\n\\n\"\n", - " log_content += f\"---------Confusion Matrix---------\\n{conf_matrix}\\n\\n\"\n", - " log_content += \"---------Mismatched Examples---------\\n\\n\"\n", - " log_content += \"\\n\\n\".join(mismatched_examples)\n", - "\n", - " # Log the table\n", - " if experiment is not None:\n", - " wandb.log({\"Mismatched_Examples\": table})\n", - "\n", - " # Save the results to the log file\n", - " with open(filename, 'w') as log_file:\n", - " log_file.write(log_content)\n", - "\n", - "\n", - "class Word2VecEmbedder(BaseEstimator, TransformerMixin):\n", - " def __init__(\n", - " self,\n", - " model_name: str = 'word2vec-google-news-300',\n", - " tokenizer=RegexpTokenizer(r'\\w+')\n", - " ):\n", - " self.model = gensim.downloader.load(model_name)\n", - " self.tokenizer = tokenizer\n", - " self.embed_size = 300\n", - "\n", - " def fit(\n", - " self,\n", - " X,\n", - " y=None\n", - " ):\n", - " return self\n", - "\n", - "\n", - " def transform(\n", - " self,\n", - " X\n", - " ):\n", - " \"\"\"Calculate Word2Vec embeddings for the given text.\n", - "\n", - " Args:\n", - " X (list): List of text documents.\n", - "\n", - " Returns:\n", - " np.ndarray: Word2Vec embeddings for the input text.\n", - " \"\"\"\n", - "\n", - " if isinstance(X, str):\n", - " X = [X]\n", - "\n", - " return np.vstack([calculate_document_embedding(doc, self.model, self.tokenizer, self.embed_size) for doc in X])\n", - "\n", - "\n", - "class TPSampler:\n", - " def __init__(\n", - " self,\n", - " class_labels,\n", - " tp_ratio=0.1,\n", - " batch_size=32\n", - " ):\n", - " \"\"\"A custom sampler to sample the training data.\n", - "\n", - " Args:\n", - " class_labels (list[int]): The class labels of the training data.\n", - " tp_ratio (float, optional): The ratio of true positives to sample. Defaults to 0.1.\n", - " batch_size (int, optional): The batch size. 
Defaults to 32.\n", - "\n", - " Returns:\n", - " iter: The indices of the sampled data.\n", - " \"\"\"\n", - "\n", - " self.tp_indices = [i for i, label in enumerate(class_labels) if label == 1]\n", - " self.non_tp_indices = [i for i, label in enumerate(class_labels) if label == 0]\n", - " self.tp_ratio = tp_ratio\n", - " self.batch_size = batch_size\n", - "\n", - " def __iter__(self):\n", - " \"\"\"Iterate through the sampled indices.\n", - "\n", - " Returns:\n", - " iter: The indices of the sampled data.\n", - " \"\"\"\n", - "\n", - " num_samples = len(self.tp_indices)\n", - " tp_batch_size = int(self.tp_ratio * self.batch_size)\n", - " non_tp_batch_size = self.batch_size - tp_batch_size\n", - " sampled_indices = []\n", - "\n", - " while len(sampled_indices) < num_samples:\n", - " tp_indices = np.random.choice(self.tp_indices, tp_batch_size, replace=False)\n", - " non_tp_indices = np.random.choice(self.non_tp_indices, non_tp_batch_size, replace=False)\n", - " batch_indices = np.concatenate((tp_indices, non_tp_indices))\n", - " np.random.shuffle(batch_indices)\n", - " sampled_indices.extend(batch_indices)\n", - "\n", - " return iter(sampled_indices)\n", - "\n", - " def __len__(\n", - " self\n", - " ):\n", - " \"\"\"Returns the total number of samples for the dataloader.\n", - "\n", - " Returns:\n", - " int: The total number of samples for the dataloader.\n", - " \"\"\"\n", - "\n", - " return len(self.tp_indices) # This defines the total number of samples for the dataloader\n", - "\n", - "\n", - "class Augmentor:\n", - " def __init__(\n", - " self,\n", - " augmentor = None\n", - " ):\n", - " \"\"\"A custom augmentor to augment the training data.\n", - "\n", - " Args:\n", - " augmentor (albumentations.core.composition.Compose): The augmentor to use.\n", - " \"\"\"\n", - "\n", - " if augmentor is None:\n", - " augmentor = naw.SynonymAug()\n", - "\n", - " def __call__(\n", - " self,\n", - " X,\n", - " y,\n", - " aug_label = 1,\n", - " num_aug_per_label_1 = 10,\n", - " shuffle=True\n", - " ):\n", - " \"\"\"Augment the training data.\n", - "\n", - " Args:\n", - " X (list): The input data.\n", - " y (list): The labels.\n", - " aug_label (int, optional): The label to augment. Defaults to 1.\n", - " num_aug_per_label_1 (int, optional): The number of augmentations to apply to the label. Defaults to 10.\n", - " shuffle (bool, optional): Whether to shuffle the data. 
Defaults to True.\n", - "\n", - " Returns:\n", - " tuple: The augmented data and labels.\n", - " \"\"\"\n", - "\n", - " if isinstance(X, str):\n", - " X = [X]\n", - " elif isinstance(X, pd.Series):\n", - " X = X.tolist()\n", - "\n", - " if isinstance(y, str):\n", - " y = [y]\n", - " elif isinstance(y, pd.Series):\n", - " y = y.tolist()\n", - "\n", - " X, y = self.augment_data(X, y, aug_label, num_aug_per_label_1=num_aug_per_label_1)\n", - "\n", - " if shuffle:\n", - " X, y = shuffler(X, y, random_state=42)\n", - "\n", - " return X, y\n", - "\n", - " def augment_data(\n", - " self,\n", - " input_text,\n", - " input_labels,\n", - " aug_label=1,\n", - " num_aug_per_label_1=10\n", - " ):\n", - "\n", - " augmented_texts = []\n", - " augmented_labels = []\n", - "\n", - " for text, lbl in zip(input_text, input_labels):\n", - " augmented_texts.append(text)\n", - " augmented_labels.append(lbl)\n", - "\n", - " # Apply augmentation only to instances with label 1\n", - " if float(lbl) == float(aug_label):\n", - " for _ in range(num_aug_per_label_1):\n", - " augmented_text = self.apply_augmentation(text)\n", - " augmented_texts.append(augmented_text)\n", - " augmented_labels.append(lbl)\n", - "\n", - " return augmented_texts, augmented_labels\n", - "\n", - " def apply_augmentation(\n", - " self,\n", - " text\n", - " ):\n", - "\n", - " # Choose an augmentation technique (you can explore different techniques)\n", - " aug = naw.SynonymAug()\n", - "\n", - " # Augment the text\n", - " augmented_text = aug.augment(text)[0]\n", - "\n", - " return augmented_text\n", - "\n", - "" - ], - "metadata": { - "id": "orw9VDGjXPQw" - }, - "execution_count": 4, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "\n", - "\n", - "from google.colab import drive\n", - "drive.mount('/content/drive/', force_remount=True)\n" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "DlZaYdEAYFKL", - "outputId": "4f00a84a-dfdc-4611-cd7e-ec33e0388426" - }, - "execution_count": 5, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Mounted at /content/drive/\n" - ] - } - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "QsT7TK2gWfRc", - "outputId": "6928619d-0f12-4bfd-97c5-c0f5a3086058" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stderr", - "text": [ - "No CUDA runtime is found, using CUDA_HOME='/usr/local/cuda'\n" - ] - } - ], - "source": [ - "import pandas as pd\n", - "import sys\n", - "import numpy as np\n", - "from sklearn.model_selection import train_test_split\n", - "from sklearn.pipeline import Pipeline\n", - "from concrete.ml.sklearn.rf import RandomForestClassifier\n", - "# sys.path.append('../')\n", - "# from utils.util_modeler import Word2VecEmbedder" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": { - "id": "JVNJyVqUWfRd" - }, - "outputs": [], - "source": [ - "data = pd.read_csv('/content/drive/MyDrive/fraud_detector_data.csv', sep=',' , lineterminator='\\n')" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": { - "id": "Rl5316KsWfRd" - }, - "outputs": [], - "source": [ - "train_data = data[data.Split == 'Train']\n", - "sanity_data = data[data.Split == 'Sanity']\n", - "gold_fraud_data = data[data.Split == 'Gold Fraud']" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "id": "dcupfyUeWfRd" - }, - "outputs": [], - "source": [ - "# X = train_data.Body\n", - "# y = 
train_data.Label\n", - "\n", - "# X = X.values\n", - "# y = y.values" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": { - "id": "suiVfemxWfRd" - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "source": [], - "metadata": { - "id": "L6HbnTxT1O3R" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": { - "id": "4732BAuHWfRe" - }, - "outputs": [], - "source": [ - "init_params_rf = {\"max_depth\": 7, \"n_estimators\": 100}\n", - "init_params_cml = {\"n_bits\": 3}" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "nOW_AyIvWfRe", - "outputId": "22b296f8-a5c4-4742-866b-d0fb7aa4568c" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "[==================================================] 100.0% 1662.8/1662.8MB downloaded\n" - ] - } - ], - "source": [ - "model = Pipeline([\n", - " ('vectorizer', Word2VecEmbedder()),\n", - " ('classifier', RandomForestClassifier(**init_params_rf, **init_params_cml))\n", - "])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "H-fcjO82WfRe" - }, - "outputs": [], - "source": [ - "augmentor = Augmentor()\n", - "\n", - "train_body, train_labels = augmentor(\n", - " train_data['Body'].tolist(),\n", - " train_data['Label'].tolist(),\n", - " aug_label=1,\n", - " num_aug_per_label_1=9,\n", - " shuffle=True\n", - ")\n", - "\n", - "train_data = pd.DataFrame(\n", - " {\n", - " 'Body': train_body,\n", - " 'Label': train_labels\n", - " }\n", - ")\n", - "\n", - "train_data.drop_duplicates(subset=['Body'], inplace=True)\n", - "train_data.reset_index(drop=True, inplace=True)\n", - "\n", - "# Call your code that produces output\n", - "model.fit(train_data['Body'], train_data['Label'])\n" - ] - }, - { - "cell_type": "code", - "source": [ - "!pip3 install mlflow\n", - "import os\n", - "from mlflow.sklearn import save_model\n", - "save_model(model,'/content/drive/MyDrive/')" - ], - "metadata": { - "id": "uMgQvxnC9WIJ" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "WANDB_API_KEY=\"f2341bb05edcb6d412894698013186646aefa0e2\"\n", - "WANDB_PROJECT=\"Fraud-Detector\"\n", - "WANDB_ENTITY=\"regressors\"\n", - "\n", - "wandbdict = {\n", - " 'key': WANDB_API_KEY,\n", - " 'entity': WANDB_PROJECT,\n", - " 'project': WANDB_ENTITY,\n", - "}\n", - "wandb.login(key=wandbdict['key'])\n", - "run = wandb.init(project=wandbdict['project'], entity=wandbdict['entity'])" - ], - "metadata": { - "id": "6mKt3tUE-U5q" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "f1_scores = {}\n", - "os.makedirs('/content/drive/MyDrive/logs')\n", - "save_path='/content/drive/MyDrive/'\n" - ], - "metadata": { - "id": "D_c9C2Mq3Qld" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [], - "metadata": { - "id": "pH2qWaNw6dxB" - } - }, - { - "cell_type": "code", - "execution_count": 31, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 1000 - }, - "id": "hngi7nsOWfRe", - "outputId": "cc909b03-3ffe-407c-82b0-58fa63cf22ec" - }, - "outputs": [ - { - "output_type": "error", - "ename": "LookupError", - "evalue": "ignored", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mLookupError\u001b[0m Traceback 
(most recent call last)", - "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/nltk/corpus/util.py\u001b[0m in \u001b[0;36m__load\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 83\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 84\u001b[0;31m \u001b[0mroot\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnltk\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfind\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34mf\"{self.subdir}/{zip_name}\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 85\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mLookupError\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/nltk/data.py\u001b[0m in \u001b[0;36mfind\u001b[0;34m(resource_name, paths)\u001b[0m\n\u001b[1;32m 582\u001b[0m \u001b[0mresource_not_found\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34mf\"\\n{sep}\\n{msg}\\n{sep}\\n\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 583\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mLookupError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mresource_not_found\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 584\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mLookupError\u001b[0m: \n**********************************************************************\n Resource \u001b[93mstopwords\u001b[0m not found.\n Please use the NLTK Downloader to obtain the resource:\n\n \u001b[31m>>> import nltk\n >>> nltk.download('stopwords')\n \u001b[0m\n For more information see: https://www.nltk.org/data.html\n\n Attempted to load \u001b[93mcorpora/stopwords.zip/stopwords/\u001b[0m\n\n Searched in:\n - '/root/nltk_data'\n - '/usr/nltk_data'\n - '/usr/share/nltk_data'\n - '/usr/lib/nltk_data'\n - '/usr/share/nltk_data'\n - '/usr/local/share/nltk_data'\n - '/usr/lib/nltk_data'\n - '/usr/local/lib/nltk_data'\n**********************************************************************\n", - "\nDuring handling of the above exception, another exception occurred:\n", - "\u001b[0;31mLookupError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mmodel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX_train\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_train\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/sklearn/pipeline.py\u001b[0m in \u001b[0;36mfit\u001b[0;34m(self, X, y, **fit_params)\u001b[0m\n\u001b[1;32m 376\u001b[0m \"\"\"\n\u001b[1;32m 377\u001b[0m \u001b[0mfit_params_steps\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_check_fit_params\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m**\u001b[0m\u001b[0mfit_params\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 378\u001b[0;31m \u001b[0mXt\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_fit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mfit_params_steps\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 379\u001b[0m 
\u001b[0;32mwith\u001b[0m \u001b[0m_print_elapsed_time\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Pipeline\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_log_message\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msteps\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m-\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 380\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_final_estimator\u001b[0m \u001b[0;34m!=\u001b[0m \u001b[0;34m\"passthrough\"\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/sklearn/pipeline.py\u001b[0m in \u001b[0;36m_fit\u001b[0;34m(self, X, y, **fit_params_steps)\u001b[0m\n\u001b[1;32m 334\u001b[0m \u001b[0mcloned_transformer\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mclone\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtransformer\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 335\u001b[0m \u001b[0;31m# Fit or load from cache the current transformer\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 336\u001b[0;31m X, fitted_transformer = fit_transform_one_cached(\n\u001b[0m\u001b[1;32m 337\u001b[0m \u001b[0mcloned_transformer\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 338\u001b[0m \u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/joblib/memory.py\u001b[0m in \u001b[0;36m__call__\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 351\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 352\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m__call__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 353\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 354\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 355\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mcall_and_shelve\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/sklearn/pipeline.py\u001b[0m in \u001b[0;36m_fit_transform_one\u001b[0;34m(transformer, X, y, weight, message_clsname, message, **fit_params)\u001b[0m\n\u001b[1;32m 868\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0m_print_elapsed_time\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmessage_clsname\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmessage\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 869\u001b[0m 
\u001b[0;32mif\u001b[0m \u001b[0mhasattr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtransformer\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"fit_transform\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 870\u001b[0;31m \u001b[0mres\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtransformer\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit_transform\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mfit_params\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 871\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 872\u001b[0m \u001b[0mres\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtransformer\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mfit_params\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtransform\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/sklearn/base.py\u001b[0m in \u001b[0;36mfit_transform\u001b[0;34m(self, X, y, **fit_params)\u001b[0m\n\u001b[1;32m 868\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 869\u001b[0m \u001b[0;31m# fit method of arity 2 (supervised transformation)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 870\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mfit_params\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtransform\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 871\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 872\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m\u001b[0m in \u001b[0;36mtransform\u001b[0;34m(self, X)\u001b[0m\n\u001b[1;32m 32\u001b[0m \u001b[0mX\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 33\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 34\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvstack\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mcalculate_document_embedding\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdoc\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmodel\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtokenizer\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0membed_size\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mdoc\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mX\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 35\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 36\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m\u001b[0m in 
\u001b[0;36m\u001b[0;34m(.0)\u001b[0m\n\u001b[1;32m 32\u001b[0m \u001b[0mX\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 33\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 34\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvstack\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mcalculate_document_embedding\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdoc\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmodel\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtokenizer\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0membed_size\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mdoc\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mX\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 35\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 36\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m\u001b[0m in \u001b[0;36mcalculate_document_embedding\u001b[0;34m(doc, model, tokenizer, embed_size)\u001b[0m\n\u001b[1;32m 16\u001b[0m \u001b[0mdoc_embed\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mzeros\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0membed_size\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 17\u001b[0m \u001b[0mwords\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtokenizer\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtokenize\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdoc\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 18\u001b[0;31m \u001b[0mstopset\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mstopwords\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwords\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'english'\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mlist\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mstring\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpunctuation\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 19\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 20\u001b[0m \u001b[0;31m#we lowercase the words specifically for OOV embeddings to be same for same words different case\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/nltk/corpus/util.py\u001b[0m in \u001b[0;36m__getattr__\u001b[0;34m(self, attr)\u001b[0m\n\u001b[1;32m 119\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mAttributeError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"LazyCorpusLoader object has no attribute '__bases__'\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 120\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 121\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__load\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 122\u001b[0m \u001b[0;31m# This looks circular, but its not, since __load() changes our\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 123\u001b[0m \u001b[0;31m# __class__ to something new:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - 
"\u001b[0;32m/usr/local/lib/python3.10/dist-packages/nltk/corpus/util.py\u001b[0m in \u001b[0;36m__load\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 84\u001b[0m \u001b[0mroot\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnltk\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfind\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34mf\"{self.subdir}/{zip_name}\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 85\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mLookupError\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 86\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 87\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 88\u001b[0m \u001b[0;31m# Load the corpus.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/nltk/corpus/util.py\u001b[0m in \u001b[0;36m__load\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 79\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 80\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 81\u001b[0;31m \u001b[0mroot\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnltk\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfind\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34mf\"{self.subdir}/{self.__name}\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 82\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mLookupError\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 83\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/nltk/data.py\u001b[0m in \u001b[0;36mfind\u001b[0;34m(resource_name, paths)\u001b[0m\n\u001b[1;32m 581\u001b[0m \u001b[0msep\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"*\"\u001b[0m \u001b[0;34m*\u001b[0m \u001b[0;36m70\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 582\u001b[0m \u001b[0mresource_not_found\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34mf\"\\n{sep}\\n{msg}\\n{sep}\\n\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 583\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mLookupError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mresource_not_found\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 584\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 585\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mLookupError\u001b[0m: \n**********************************************************************\n Resource \u001b[93mstopwords\u001b[0m not found.\n Please use the NLTK Downloader to obtain the resource:\n\n \u001b[31m>>> import nltk\n >>> nltk.download('stopwords')\n \u001b[0m\n For more information see: https://www.nltk.org/data.html\n\n Attempted to load \u001b[93mcorpora/stopwords\u001b[0m\n\n Searched in:\n - '/root/nltk_data'\n - '/usr/nltk_data'\n - '/usr/share/nltk_data'\n - '/usr/lib/nltk_data'\n - '/usr/share/nltk_data'\n - '/usr/local/share/nltk_data'\n - '/usr/lib/nltk_data'\n - 
'/usr/local/lib/nltk_data'\n**********************************************************************\n" - ] - } - ], - "source": [ - "train_data['Prediction'] = model.predict(body=train_data['Body'])\n", - "evaluate_and_log(x=train_data['Body'].tolist(), y_true=train_data['Label'].tolist(), y_pred=train_data['Prediction'].tolist(), filename=os.path.join(save_path,'logs/train.log'), experiment=run, id = train_data['Mail-ID'].tolist())\n", - "f1_scores['train'] = get_f1_score(y_true=train_data['Label'].tolist(), y_pred=train_data['Prediction'].tolist())" - ] - }, - { - "cell_type": "code", - "source": [ - "sanity_data['Prediction'] = model.predict(body=sanity_data['Body'])\n", - "evaluate_and_log(x=sanity_data['Body'].tolist(), y_true=sanity_data['Label'].tolist(), y_pred=sanity_data['Prediction'].tolist(), filename=os.path.join(save_path,'logs/sanity.log'), experiment=run, id = sanity_data['Mail-ID'].tolist())\n", - "f1_scores['sanity'] = get_f1_score(y_true=sanity_data['Label'].tolist(), y_pred=sanity_data['Prediction'].tolist())\n", - "\n" - ], - "metadata": { - "id": "zCaQWTjf_UQ8" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "MOXRQFjZWfRe" - }, - "outputs": [], - "source": [ - "gold_fraud_data['Prediction'] = model.predict(body=gold_fraud_data['Body'])\n", - "evaluate_and_log(x=gold_fraud_data['Body'].tolist(), y_true=gold_fraud_data['Label'].tolist(), y_pred=gold_fraud_data['Prediction'].tolist(), filename=os.path.join(save_path,'logs/gold_fraud.log'), experiment=run, id = gold_fraud_data['Mail-ID'].tolist())\n", - "f1_scores['gold_fraud'] = get_f1_score(y_true=gold_fraud_data['Label'].tolist(), y_pred=gold_fraud_data['Prediction'].tolist())\n", - "\n" - ] - }, - { - "cell_type": "code", - "source": [ - "#save mismatch data into a csv file\n", - "mismatch_data = pd.concat(\n", - " [\n", - " train_data[train_data['Prediction'] != train_data['Label']],\n", - " sanity_data[sanity_data['Prediction'] != sanity_data['Label']],\n", - " gold_fraud_data[gold_fraud_data['Prediction'] != gold_fraud_data['Label']]\n", - " ],\n", - " axis=0,\n", - " ignore_index=True\n", - ")\n", - "\n", - "mismatch_data.to_csv(os.path.join(save_path,'logs/mismatch_data.csv'), index=False)" - ], - "metadata": { - "id": "G4s7RYRV_a9p" - }, - "execution_count": null, - "outputs": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Fraud-Detector-env", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.12" - }, - "colab": { - "provenance": [] - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} \ No newline at end of file
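
Note on the removed notebook: the error output captured in its final executed cell shows the run failing inside calculate_document_embedding when stopwords.words('english') raised a LookupError, because the NLTK 'stopwords' corpus was never downloaded in the Colab runtime. A minimal setup-cell sketch that would have avoided this is below; the stopwords download is exactly what the traceback itself prescribes, while the wordnet and averaged_perceptron_tagger downloads are an assumption, added only because the notebook's nlpaug SynonymAug augmenter is WordNet-based.

    # Hedged sketch of a setup cell for the removed notebook: fetch the NLTK
    # resources its helper functions depend on before calling model.fit().
    import nltk

    nltk.download("stopwords")                   # required by calculate_document_embedding;
                                                 # this is the resource the traceback reports missing
    nltk.download("wordnet")                     # assumption: WordNet synonyms for nlpaug SynonymAug
    nltk.download("averaged_perceptron_tagger")  # assumption: POS tagging used by SynonymAug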