From 8b55baf3b76108015ca204c339941425a0886dfe Mon Sep 17 00:00:00 2001 From: advaithsrao Date: Thu, 19 Oct 2023 16:10:18 -0400 Subject: [PATCH] Updated README with wiki, updated data_explorer.ipynb --- README.md | 13 +- notebooks/data_explorer.ipynb | 1302 ++++++++++++++++++++++++++++++++- 2 files changed, 1294 insertions(+), 21 deletions(-) diff --git a/README.md b/README.md index 0b17a6a..8c322b4 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,6 @@ Fraud Detection Package with fine-tuned RoBERTa model, equipped with ethical con - [Fraud-Detector](#fraud-detector) - [Team](#team) - [Steps to run](#steps-to-run) - - [Step 1](#step-1) - [Abstract](#abstract) - [Dataset Description](#dataset-description) - [Proposed Methodology](#proposed-methodology) @@ -21,17 +20,7 @@ Fraud Detection Package with fine-tuned RoBERTa model, equipped with ethical con ## Steps to run -### Step 1 - -Create an environment with poetry file - -```shell -pip3 install poetry - -poetry install - -poetry shell -``` +**All helper functions and run steps can be found here: https://github.com/advaithsrao/Fraud-Detector/wiki/Repository-Helpers** ## Abstract In today's data-driven landscape, the detection of fraud emails within corporate communications is critical. With email communication still being the most used mode of communication in organizations, hackers over time have found creative ways to bypass several security layers. In 2022 alone, email-based scams have led to losses of over $2.7 billion. diff --git a/notebooks/data_explorer.ipynb b/notebooks/data_explorer.ipynb index 8bcc5ef..844b42d 100644 --- a/notebooks/data_explorer.ipynb +++ b/notebooks/data_explorer.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 2, + "execution_count": 75, "metadata": {}, "outputs": [ { @@ -11,29 +11,50 @@ "['../config.ini']" ] }, - "execution_count": 2, + "execution_count": 75, "metadata": {}, "output_type": "execute_result" } ], "source": [ + "import os\n", "import pandas as pd\n", "import glob\n", "import re\n", "from concurrent.futures import ThreadPoolExecutor\n", "import email\n", "\n", - "from utils.cleanup import remove_new_lines\n", - "from utils.data_fetch import LoadEnronData" + "import numpy as np\n", + "from sklearn.pipeline import Pipeline\n", + "from sklearn.cluster import KMeans\n", + "from sklearn.preprocessing import FunctionTransformer\n", + "from sklearn.feature_extraction.text import TfidfVectorizer\n", + "from transformers import AutoTokenizer, AutoModel\n", + "from sklearn.base import BaseEstimator, TransformerMixin\n", + "import matplotlib.pyplot as plt\n", + "import torch\n", + "from joblib import Parallel, delayed\n", + "\n", + "import sys\n", + "\n", + "sys.path.append(\"..\")\n", + "\n", + "from utils.cleanup import Preprocessor\n", + "from utils.data_fetch import LoadEnronData, PersonOfInterest\n", + "\n", + "#read config.ini file\n", + "import configparser\n", + "config = configparser.ConfigParser()\n", + "config.read('../config.ini')" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ - "pois = ['kenneth.lay@enron.com', 'ken.rice@enron.com', 'raymond.bowen@enron.com', 'kevin.hannon@enron.com', 'jeff.skilling@enron.com', 'paula.rieker@enron.com', 'david.delainey@enron.com', 'scott.yeager@enron.com', 'rex.shelby@enron.com', 'tim.belden@enron.com', 'ben.glisan@enron.com', 'andrew.fastow@enron.com', 'richard.causey@enron.com', 'wes.colwell@enron.com', 'joe.hirko@enron.com', 'michael.kopper@enron.com', 'mark.koenig@enron.com', 'christopher.calger@enron.com']" + "# pois = ['kenneth.lay@enron.com', 'ken.rice@enron.com', 'raymond.bowen@enron.com', 'kevin.hannon@enron.com', 'jeff.skilling@enron.com', 'paula.rieker@enron.com', 'david.delainey@enron.com', 'scott.yeager@enron.com', 'rex.shelby@enron.com', 'tim.belden@enron.com', 'ben.glisan@enron.com', 'andrew.fastow@enron.com', 'richard.causey@enron.com', 'wes.colwell@enron.com', 'joe.hirko@enron.com', 'michael.kopper@enron.com', 'mark.koenig@enron.com', 'christopher.calger@enron.com']" ] }, { @@ -42,7 +63,10 @@ "metadata": {}, "outputs": [], "source": [ - "data = LoadEnronData()" + "#only run if you do not have enron_data.csv saved locally\n", + "#else skip\n", + "data_loader = LoadEnronData()\n", + "data = data_loader()" ] }, { @@ -588,7 +612,7 @@ "metadata": {}, "outputs": [], "source": [ - "data.to_csv('~/Local/datasets/enron_data.csv', index=False, sep=',', lineterminator='\\n')" + "# data.to_csv('~/Local/datasets/enron_data.csv', index=False, sep=',', lineterminator='\\n')" ] }, { @@ -1042,11 +1066,1271 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['scholtes-d',\n", + " 'stf',\n", + " '<9368661.1075840029659.JavaMail.evans@thyme>',\n", + " 'Mon, 9 Apr 2001 21:55:00 -0700 (PDT)',\n", + " 'khara@avistaenergy.com',\n", + " 'cara.semperger@enron.com, demetrios.fotiou@bchydro.bc.ca, \\n\\tdiana.scholtes@enron.com, gjcarter@bpa.gov, jtaffe@ci.tacoma.wa.us, \\n\\tjhughes@sppc.com, paul.kroger@pacificorp.com, kmpe@dynegy.com',\n", + " 'FW: BCHA Automatic Denial/Approval',\n", + " 1.0,\n", + " 'text/plain; charset=us-ascii',\n", + " '7bit',\n", + " 'Hara, Kathy ',\n", + " 'Cara Semperger (E-mail) , Demetrios Fotiou (E-mail) , Diana Scholtes (E-mail) , Gloria Carter (E-mail) , Joe Taffe (E-mail) , John Hughes (E-mail) , Paul Kroger (E-mail) , Kimberly Peck (E-mail) ',\n", + " nan,\n", + " nan,\n", + " '\\\\ExMerge - Scholtes, Diana\\\\STF\\\\E-TAG',\n", + " 'SCHOLTES-D',\n", + " nan,\n", + " '\\n\\n> -----Original Message-----\\n> From: \\tHara, Kathy\\n> Sent:\\tMonday, April 09, 2001 11:53\\n> To:\\tMark Hackney (E-mail)\\n> Cc:\\tAllred, Penny; Cimino, Tony; Fewel, George; Holland, Kevin; Johnson,\\n> Rob; Pearson, Tom; Rozelle, Dana; Begalman, Buppha; Downing, Staci;\\n> \\'Heather Bare\\'; Locke, Kathy\\n> Subject:\\tBCHA Automatic Denial/Approval\\n>\\n> Mark\\n>\\n> We have been told by one of our Transmission Provider\\'s that they do not\\n> need to give us an OASIS number until half-past. If we wait until\\n> half-past to receive a valid oasis number, we cannot avoid launching late\\n> tags. I think that this places too much pressure on the merchant.\\n>\\n> We are also encountering problems with BC Hydro\\'s automatic\\n> approval/denial software. What happens if a VALID tag is denied in the\\n> \"No Tag, No Flow\" period, the control are cannot withdraw the denial, and\\n> it is too late to launch another tag? Which entity takes responsibility\\n> for inadvertents and schedule cuts?\\n>\\n> I would like to get some of the timing issues resolved prior to\\n> implementing \"No Tag, No Flow.\" The problems seem to be isolated, but it\\n> only takes a single entity to create huge problems for everyone involved.\\n>\\n>\\n> Thanks,\\n> Kathy Hara',\n", + " nan,\n", + " nan,\n", + " nan,\n", + " nan,\n", + " nan]" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data.values.tolist()[-1]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## kmeans" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/var/folders/zj/8wm19rjs2zzf750trg99dd1h0000gn/T/ipykernel_5310/30082975.py:1: DtypeWarning: Columns (13,20,21,22) have mixed types. Specify dtype option on import or set low_memory=False.\n", + " data = pd.read_csv('~/Local/datasets/enron_data.csv', sep=',', lineterminator='\\n')\n" + ] + } + ], + "source": [ + "# data = pd.read_csv('~/Local/datasets/enron_data.csv', sep=',', lineterminator='\\n')" + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "metadata": {}, + "outputs": [], + "source": [ + "# data.rename(columns={'folder_name': 'Folder-Name' , 'folder_user': 'Folder-User'}, inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": {}, + "outputs": [], + "source": [ + "# preprocessor = Preprocessor()" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": {}, + "outputs": [], + "source": [ + "# def Cc_to_list(text) -> list[str] | None:\n", + "# if type(text) != str:\n", + "# return text\n", + "# text = preprocessor(text)\n", + "# text = text.split(',')\n", + "# return [item.strip() for item in text]\n", + "\n", + "# # data['Cc'] = data['Cc'].map(Cc_to_list)\n", + "\n", + "# # data['Body'] = data['Body'].apply(lambda x: preprocessor(x))" + ] + }, + { + "cell_type": "code", + "execution_count": 82, + "metadata": {}, + "outputs": [], + "source": [ + "# poi = PersonOfInterest().return_person_of_interest()\n", + "# poi_emails = poi['emails']\n", + "# poi_names = poi['names']\n", + "\n", + "# data['Poi_present'] = data.apply(\n", + "# lambda row:\n", + "# True if row['To'] in poi_emails \\\n", + "# or (\n", + "# type(row['Cc']) == list \\\n", + "# and \\\n", + "# bool(\n", + "# [\n", + "# email for email in row['Cc'] if email in poi_emails\n", + "# ]\n", + "# ) \\\n", + "# ) \\\n", + "# else False,\n", + "# axis=1\n", + "# )\n", + "\n", + "# suspicious_folders = config['folders.possible_fraud']['folders'].split(' & ')\n", + "# suspicious_folders = [folder.strip() for folder in suspicious_folders]\n", + "# data['Suspicious_folders'] = data['Folder-Name'].apply(lambda x: True if x in suspicious_folders else False)" + ] + }, + { + "cell_type": "code", + "execution_count": 87, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "False 507589\n", + "True 9811\n", + "Name: Poi_present, dtype: int64" + ] + }, + "execution_count": 87, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# data['Poi_present'].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 88, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "False 465961\n", + "True 51439\n", + "Name: Suspicious_folders, dtype: int64" + ] + }, + "execution_count": 88, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# data['Suspicious_folders'].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 89, + "metadata": {}, + "outputs": [], + "source": [ + "# data.to_csv('~/Local/datasets/enron_preprocessed_data.csv', index=False, sep=',', lineterminator='\\n')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Data Load" + ] + }, + { + "cell_type": "code", + "execution_count": 114, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/var/folders/zj/8wm19rjs2zzf750trg99dd1h0000gn/T/ipykernel_5310/517494740.py:1: DtypeWarning: Columns (13,20,21,22) have mixed types. Specify dtype option on import or set low_memory=False.\n", + " data = pd.read_csv('~/Local/datasets/enron_preprocessed_data.csv', sep=',', lineterminator='\\n')\n" + ] + } + ], + "source": [ + "data = pd.read_csv('~/Local/datasets/enron_preprocessed_data.csv', sep=',', lineterminator='\\n')" + ] + }, + { + "cell_type": "code", + "execution_count": 115, + "metadata": {}, + "outputs": [], + "source": [ + "#get rows where poi_present or Suspicious Folders is True\n", + "data = data.loc[(data['Poi_present'] == True) | (data['Suspicious_folders'] == True), :]\n" + ] + }, + { + "cell_type": "code", + "execution_count": 116, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Folder-UserFolder-NameMessage-IDDateFromToSubjectMime-VersionContent-TypeContent-Transfer-Encoding...X-OriginX-FileNameBodyCcBccTimeAttendeesRePoi_presentSuspicious_folders
96arnold-jdeleted_items<25351532.1075852689302.JavaMail.evans@thyme>Fri, 5 Oct 2001 07:56:38 -0700 (PDT)soblander@carrfut.comsoblander@carrfut.comoption candlesticks as a hot link 10/51.0text/plain; charset=ANSI_X3.4-19687bit...Arnold-JJARNOLD (Non-Privileged).pstThe information contained herein is based on s...NaNNaNNaNNaNNaNFalseTrue
97arnold-jdeleted_items<22134312.1075861665211.JavaMail.evans@thyme>Tue, 20 Nov 2001 16:08:27 -0800 (PST)errol.mclaughlin@enron.comjohn.arnold@enron.com, bilal.bajwa@enron.com, ...TRV Notification: (NG - PROPT P/L - 11/20/2001)1.0text/plain; charset=us-ascii7bit...Arnold-JJARNOLD (Non-Privileged).pstThe report named: NG - PROPT P/L <http://trv.c...NaNNaNNaNNaNNaNFalseTrue
98arnold-jdeleted_items<17391691.1075861672096.JavaMail.evans@thyme>Tue, 27 Nov 2001 05:02:04 -0800 (PST)carrfuturesenergy@carrfut.comrvujtech@carrfut.comrevised unleaded chart 11/271.0text/plain; charset=us-ascii7bit...Arnold-JJARNOLD (Non-Privileged).pst\\n\\n\\nUnleaded http://www.carrfut.com/rese...NaNNaNNaNNaNNaNFalseTrue
99arnold-jdeleted_items<27157097.1075852698801.JavaMail.evans@thyme>Wed, 17 Oct 2001 12:16:42 -0700 (PDT)dailyquote@smtp.quote.comjarnold@enron.comThe Daily Quote1.0text/plain; charset=ANSI_X3.4-1968quoted-printable...Arnold-JJARNOLD (Non-Privileged).pst\\n[IMAGE]=09Quote.com =09 Log In | Sign Up |...NaNNaNNaNNaNNaNFalseTrue
100arnold-jdeleted_items<21604342.1075861668678.JavaMail.evans@thyme>Thu, 15 Nov 2001 15:25:51 -0800 (PST)ina.rangel@enron.comdutch.quigley@enron.com, john.arnold@enron.com...FW: Move Related Issues1.0text/plain; charset=us-ascii7bit...Arnold-JJARNOLD (Non-Privileged).pst\\n\\n\\n\\nPLEASE MAKE SURE YOU ARE COMPLETELY PA...NaNNaNNaNNaNNaNFalseTrue
..................................................................
517276scholtes-dwest_bank<18141583.1075840031469.JavaMail.evans@thyme>Tue, 29 Jan 2002 20:28:41 -0800 (PST)sean.crandall@enron.comstephen.thome@enron.comP&L associated with Avista transaction to cove...1.0text/plain; charset=us-ascii7bit...SCHOLTES-DNaN\\t\\t\\t\\t\\t\\t\\t\\t6th Week\\t \\t Gross Revenue\\t...['diana.scholtes@enron.com', 'tim.belden@enron...diana.scholtes@enron.com, tim.belden@enron.comNaNNaNNaNTrueFalse
517284scholtes-dwest_bank<15463408.1075840032052.JavaMail.evans@thyme>Mon, 14 Jan 2002 14:51:46 -0800 (PST)stephen.thome@enron.comharlan.murphy@enron.com, dale.rasmussen@enron....RE: Conference Call with Houston1.0text/plain; charset=us-ascii7bit...SCHOLTES-DNaNHere is Houston's list:\\n\\n \\n\\n -----Original...['edward.baughman@enron.com', 'jim.brysch@enro...edward.baughman@enron.com, jim.brysch@enron.co...NaNNaNNaNTrueFalse
517285scholtes-dwest_bank<32685607.1075840032291.JavaMail.evans@thyme>Wed, 9 Jan 2002 12:50:50 -0800 (PST)w..white@enron.comtim.belden@enron.comRE: Dec and Jan liquidations1.0text/plain; charset=us-ascii7bit...SCHOLTES-DNaNTim,\\n\\nJohn's communication regarding not upd...['john.postlethwaite@enron.com', 'leslie.reeve...john.postlethwaite@enron.com, leslie.reeves@en...NaNNaNNaNTrueFalse
517290scholtes-dwest_bank<8891751.1075840031841.JavaMail.evans@thyme>Thu, 17 Jan 2002 12:49:30 -0800 (PST)sean.crandall@enron.comstephen.thome@enron.comP&L associated with Avista transaction to cove...1.0text/plain; charset=us-ascii7bit...SCHOLTES-DNaN\\t\\t\\t\\t\\t\\t\\t\\t4th Week\\t \\t Gross Revenue\\t...['diana.scholtes@enron.com', 'tim.belden@enron...diana.scholtes@enron.com, tim.belden@enron.comNaNNaNNaNTrueFalse
517292scholtes-dwest_bank<12256262.1075840031552.JavaMail.evans@thyme>Thu, 24 Jan 2002 20:29:08 -0800 (PST)sean.crandall@enron.comstephen.thome@enron.comP&L associated with Avista transaction to cove...1.0text/plain; charset=us-ascii7bit...SCHOLTES-DNaN\\t\\t\\t\\t\\t\\t\\t\\t4th Week\\t \\t Gross Revenue\\t...['diana.scholtes@enron.com', 'tim.belden@enron...diana.scholtes@enron.com, tim.belden@enron.comNaNNaNNaNTrueFalse
\n", + "

59689 rows × 25 columns

\n", + "
" + ], + "text/plain": [ + " Folder-User Folder-Name \\\n", + "96 arnold-j deleted_items \n", + "97 arnold-j deleted_items \n", + "98 arnold-j deleted_items \n", + "99 arnold-j deleted_items \n", + "100 arnold-j deleted_items \n", + "... ... ... \n", + "517276 scholtes-d west_bank \n", + "517284 scholtes-d west_bank \n", + "517285 scholtes-d west_bank \n", + "517290 scholtes-d west_bank \n", + "517292 scholtes-d west_bank \n", + "\n", + " Message-ID \\\n", + "96 <25351532.1075852689302.JavaMail.evans@thyme> \n", + "97 <22134312.1075861665211.JavaMail.evans@thyme> \n", + "98 <17391691.1075861672096.JavaMail.evans@thyme> \n", + "99 <27157097.1075852698801.JavaMail.evans@thyme> \n", + "100 <21604342.1075861668678.JavaMail.evans@thyme> \n", + "... ... \n", + "517276 <18141583.1075840031469.JavaMail.evans@thyme> \n", + "517284 <15463408.1075840032052.JavaMail.evans@thyme> \n", + "517285 <32685607.1075840032291.JavaMail.evans@thyme> \n", + "517290 <8891751.1075840031841.JavaMail.evans@thyme> \n", + "517292 <12256262.1075840031552.JavaMail.evans@thyme> \n", + "\n", + " Date From \\\n", + "96 Fri, 5 Oct 2001 07:56:38 -0700 (PDT) soblander@carrfut.com \n", + "97 Tue, 20 Nov 2001 16:08:27 -0800 (PST) errol.mclaughlin@enron.com \n", + "98 Tue, 27 Nov 2001 05:02:04 -0800 (PST) carrfuturesenergy@carrfut.com \n", + "99 Wed, 17 Oct 2001 12:16:42 -0700 (PDT) dailyquote@smtp.quote.com \n", + "100 Thu, 15 Nov 2001 15:25:51 -0800 (PST) ina.rangel@enron.com \n", + "... ... ... \n", + "517276 Tue, 29 Jan 2002 20:28:41 -0800 (PST) sean.crandall@enron.com \n", + "517284 Mon, 14 Jan 2002 14:51:46 -0800 (PST) stephen.thome@enron.com \n", + "517285 Wed, 9 Jan 2002 12:50:50 -0800 (PST) w..white@enron.com \n", + "517290 Thu, 17 Jan 2002 12:49:30 -0800 (PST) sean.crandall@enron.com \n", + "517292 Thu, 24 Jan 2002 20:29:08 -0800 (PST) sean.crandall@enron.com \n", + "\n", + " To \\\n", + "96 soblander@carrfut.com \n", + "97 john.arnold@enron.com, bilal.bajwa@enron.com, ... \n", + "98 rvujtech@carrfut.com \n", + "99 jarnold@enron.com \n", + "100 dutch.quigley@enron.com, john.arnold@enron.com... \n", + "... ... \n", + "517276 stephen.thome@enron.com \n", + "517284 harlan.murphy@enron.com, dale.rasmussen@enron.... \n", + "517285 tim.belden@enron.com \n", + "517290 stephen.thome@enron.com \n", + "517292 stephen.thome@enron.com \n", + "\n", + " Subject Mime-Version \\\n", + "96 option candlesticks as a hot link 10/5 1.0 \n", + "97 TRV Notification: (NG - PROPT P/L - 11/20/2001) 1.0 \n", + "98 revised unleaded chart 11/27 1.0 \n", + "99 The Daily Quote 1.0 \n", + "100 FW: Move Related Issues 1.0 \n", + "... ... ... \n", + "517276 P&L associated with Avista transaction to cove... 1.0 \n", + "517284 RE: Conference Call with Houston 1.0 \n", + "517285 RE: Dec and Jan liquidations 1.0 \n", + "517290 P&L associated with Avista transaction to cove... 1.0 \n", + "517292 P&L associated with Avista transaction to cove... 1.0 \n", + "\n", + " Content-Type Content-Transfer-Encoding ... \\\n", + "96 text/plain; charset=ANSI_X3.4-1968 7bit ... \n", + "97 text/plain; charset=us-ascii 7bit ... \n", + "98 text/plain; charset=us-ascii 7bit ... \n", + "99 text/plain; charset=ANSI_X3.4-1968 quoted-printable ... \n", + "100 text/plain; charset=us-ascii 7bit ... \n", + "... ... ... ... \n", + "517276 text/plain; charset=us-ascii 7bit ... \n", + "517284 text/plain; charset=us-ascii 7bit ... \n", + "517285 text/plain; charset=us-ascii 7bit ... \n", + "517290 text/plain; charset=us-ascii 7bit ... \n", + "517292 text/plain; charset=us-ascii 7bit ... \n", + "\n", + " X-Origin X-FileName \\\n", + "96 Arnold-J JARNOLD (Non-Privileged).pst \n", + "97 Arnold-J JARNOLD (Non-Privileged).pst \n", + "98 Arnold-J JARNOLD (Non-Privileged).pst \n", + "99 Arnold-J JARNOLD (Non-Privileged).pst \n", + "100 Arnold-J JARNOLD (Non-Privileged).pst \n", + "... ... ... \n", + "517276 SCHOLTES-D NaN \n", + "517284 SCHOLTES-D NaN \n", + "517285 SCHOLTES-D NaN \n", + "517290 SCHOLTES-D NaN \n", + "517292 SCHOLTES-D NaN \n", + "\n", + " Body \\\n", + "96 The information contained herein is based on s... \n", + "97 The report named: NG - PROPT P/L " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "\n", + "# Plot the elbow method graph\n", + "plt.figure(figsize=(8, 6))\n", + "plt.plot(n_clusters_range, inertia, marker='o')\n", + "plt.xlabel('Number of Clusters')\n", + "plt.ylabel('Inertia (Within-Cluster Sum of Squares)')\n", + "plt.title('Elbow Method for Optimal Number of Clusters')\n", + "plt.grid()\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 130, + "metadata": {}, + "outputs": [], + "source": [ + "#explore clusters\n", + "pipeline = Pipeline([\n", + " ('tfidf', TfidfVectorizer(stop_words=None, max_df = 500, min_df = 1, max_features=10000)),\n", + " ('kmeans', KMeans(n_clusters=100))\n", + "])" + ] + }, + { + "cell_type": "code", + "execution_count": 131, + "metadata": {}, + "outputs": [], + "source": [ + "# Function to compute inertia for a specific n_clusters value\n", + "\n", + "def compute_inertia(n_clusters, mails):\n", + " pipeline = Pipeline([\n", + " ('tfidf', TfidfVectorizer(stop_words='english', max_df = 10, min_df = 1, max_features=10000)),\n", + " ('kmeans', KMeans(n_clusters=n_clusters))\n", + " ])\n", + " pipeline.fit(mails)\n", + " inertia = pipeline.named_steps['kmeans'].inertia_\n", + " with open(f'../resources/kmeans/run_1_w_poi_and_sus_folders_10maxdf.txt', 'a') as f:\n", + " f.write(f\"N_Clusters: {n_clusters}\\nInertia: {inertia}\\n\")\n", + " \n", + " print(f'Calculated Inertia for {n_clusters} : {inertia}')\n", + " return inertia\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 132, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/arao/Library/Caches/pypoetry/virtualenvs/ethical-fraud-detector-V2FuArT3-py3.10/lib/python3.10/site-packages/sklearn/cluster/_kmeans.py:1416: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning\n", + " super()._check_params_vs_input(X, default_n_init=10)\n", + "/Users/arao/Library/Caches/pypoetry/virtualenvs/ethical-fraud-detector-V2FuArT3-py3.10/lib/python3.10/site-packages/sklearn/cluster/_kmeans.py:1416: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning\n", + " super()._check_params_vs_input(X, default_n_init=10)\n", + "/Users/arao/Library/Caches/pypoetry/virtualenvs/ethical-fraud-detector-V2FuArT3-py3.10/lib/python3.10/site-packages/sklearn/cluster/_kmeans.py:1416: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning\n", + " super()._check_params_vs_input(X, default_n_init=10)\n", + "/Users/arao/Library/Caches/pypoetry/virtualenvs/ethical-fraud-detector-V2FuArT3-py3.10/lib/python3.10/site-packages/sklearn/cluster/_kmeans.py:1416: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning\n", + " super()._check_params_vs_input(X, default_n_init=10)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Calculated Inertia for 500 : 15019.191537138831\n", + "Calculated Inertia for 550 : 14684.17130385475\n", + "Calculated Inertia for 600 : 14374.848991797124\n", + "Calculated Inertia for 650 : 14057.157131674114\n", + "Calculated Inertia for 700 : 13783.555039895571\n", + "Calculated Inertia for 750 : 13501.249491940187\n", + "Calculated Inertia for 800 : 13219.06166046982\n", + "Calculated Inertia for 850 : 12949.590380005708\n", + "Calculated Inertia for 900 : 12681.947831370551\n", + "Calculated Inertia for 950 : 12428.61135676556\n", + "Calculated Inertia for 1000 : 12180.732582529852\n", + "Calculated Inertia for 1050 : 11933.07012198666\n", + "Calculated Inertia for 1100 : 11640.306944145479\n", + "Calculated Inertia for 1150 : 11397.618406434694\n", + "Calculated Inertia for 1200 : 11166.884081568513\n", + "Calculated Inertia for 1250 : 10940.118150779646\n", + "Calculated Inertia for 1300 : 10730.724701958083\n", + "Calculated Inertia for 1350 : 10509.937639144522\n", + "Calculated Inertia for 1400 : 10303.41797895611\n", + "Calculated Inertia for 1450 : 10106.121368295855\n", + "Calculated Inertia for 1500 : 9904.194739004715\n", + "Calculated Inertia for 1550 : 9702.610206195923\n", + "Calculated Inertia for 1600 : 9510.580481966546\n", + "Calculated Inertia for 1650 : 9334.247621474364\n", + "Calculated Inertia for 1700 : 9146.687866573158\n", + "Calculated Inertia for 1750 : 8976.882866259715\n", + "Calculated Inertia for 1800 : 8791.535902250873\n", + "Calculated Inertia for 1850 : 8624.110337703369\n", + "Calculated Inertia for 1900 : 8436.86471451316\n", + "Calculated Inertia for 1950 : 8273.33411089806\n", + "Calculated Inertia for 2000 : 8112.062823700676\n", + "Calculated Inertia for 2050 : 7944.2491326685895\n", + "Calculated Inertia for 2100 : 7809.046621159516\n", + "Calculated Inertia for 2150 : 7653.851824522634\n", + "Calculated Inertia for 2200 : 7499.978817813828\n", + "Calculated Inertia for 2250 : 7344.636875910398\n", + "Calculated Inertia for 2300 : 7199.843699698313\n", + "Calculated Inertia for 2350 : 7071.828086714928\n", + "Calculated Inertia for 2400 : 6929.354089857098\n", + "Calculated Inertia for 2450 : 6777.656852917541\n", + "Calculated Inertia for 2500 : 6642.394188121581\n", + "Calculated Inertia for 2550 : 6515.6980203186495\n", + "Calculated Inertia for 2600 : 6394.976171896597\n", + "Calculated Inertia for 2650 : 6266.477392944355\n", + "Calculated Inertia for 2700 : 6146.927239839926\n", + "Calculated Inertia for 2750 : 6026.1101895267475\n", + "Calculated Inertia for 2800 : 5910.219214607946\n", + "Calculated Inertia for 2850 : 5802.9117479636425\n", + "Calculated Inertia for 2900 : 5680.139423752435\n", + "Calculated Inertia for 2950 : 5558.06209006871\n", + "Calculated Inertia for 3000 : 5449.357621340889\n", + "Calculated Inertia for 3050 : 5344.939750382897\n", + "Calculated Inertia for 3100 : 5244.578815685514\n", + "Calculated Inertia for 3150 : 5139.664643961472\n", + "Calculated Inertia for 3200 : 5046.110559375751\n", + "Calculated Inertia for 3250 : 4944.283358427517\n", + "Calculated Inertia for 3300 : 4830.689623163455\n", + "Calculated Inertia for 3350 : 4749.257333117264\n", + "Calculated Inertia for 3400 : 4647.883801361033\n", + "Calculated Inertia for 3450 : 4557.73353056649\n" + ] + }, + { + "ename": "KeyboardInterrupt", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m/Users/arao/Local/Github/Fraud-Detector/notebooks/data_explorer.ipynb Cell 36\u001b[0m line \u001b[0;36m4\n\u001b[1;32m 1\u001b[0m \u001b[39m# Determine the optimal number of clusters using the elbow method in parallel\u001b[39;00m\n\u001b[1;32m 2\u001b[0m n_clusters_range \u001b[39m=\u001b[39m \u001b[39mrange\u001b[39m(\u001b[39m500\u001b[39m, \u001b[39m5000\u001b[39m, \u001b[39m50\u001b[39m)\n\u001b[0;32m----> 4\u001b[0m inertia \u001b[39m=\u001b[39m Parallel(n_jobs\u001b[39m=\u001b[39;49m\u001b[39m4\u001b[39;49m)(delayed(compute_inertia)(n, data[\u001b[39m'\u001b[39;49m\u001b[39mBody\u001b[39;49m\u001b[39m'\u001b[39;49m]\u001b[39m.\u001b[39;49mtolist()) \u001b[39mfor\u001b[39;49;00m n \u001b[39min\u001b[39;49;00m n_clusters_range)\n\u001b[1;32m 5\u001b[0m \u001b[39m# inertia = [compute_inertia(n, data['Body'].tolist()) for n in n_clusters_range]\u001b[39;00m\n", + "File \u001b[0;32m~/Library/Caches/pypoetry/virtualenvs/ethical-fraud-detector-V2FuArT3-py3.10/lib/python3.10/site-packages/joblib/parallel.py:1952\u001b[0m, in \u001b[0;36mParallel.__call__\u001b[0;34m(self, iterable)\u001b[0m\n\u001b[1;32m 1946\u001b[0m \u001b[39m# The first item from the output is blank, but it makes the interpreter\u001b[39;00m\n\u001b[1;32m 1947\u001b[0m \u001b[39m# progress until it enters the Try/Except block of the generator and\u001b[39;00m\n\u001b[1;32m 1948\u001b[0m \u001b[39m# reach the first `yield` statement. This starts the aynchronous\u001b[39;00m\n\u001b[1;32m 1949\u001b[0m \u001b[39m# dispatch of the tasks to the workers.\u001b[39;00m\n\u001b[1;32m 1950\u001b[0m \u001b[39mnext\u001b[39m(output)\n\u001b[0;32m-> 1952\u001b[0m \u001b[39mreturn\u001b[39;00m output \u001b[39mif\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mreturn_generator \u001b[39melse\u001b[39;00m \u001b[39mlist\u001b[39;49m(output)\n", + "File \u001b[0;32m~/Library/Caches/pypoetry/virtualenvs/ethical-fraud-detector-V2FuArT3-py3.10/lib/python3.10/site-packages/joblib/parallel.py:1595\u001b[0m, in \u001b[0;36mParallel._get_outputs\u001b[0;34m(self, iterator, pre_dispatch)\u001b[0m\n\u001b[1;32m 1592\u001b[0m \u001b[39myield\u001b[39;00m\n\u001b[1;32m 1594\u001b[0m \u001b[39mwith\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_backend\u001b[39m.\u001b[39mretrieval_context():\n\u001b[0;32m-> 1595\u001b[0m \u001b[39myield from\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_retrieve()\n\u001b[1;32m 1597\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mGeneratorExit\u001b[39;00m:\n\u001b[1;32m 1598\u001b[0m \u001b[39m# The generator has been garbage collected before being fully\u001b[39;00m\n\u001b[1;32m 1599\u001b[0m \u001b[39m# consumed. This aborts the remaining tasks if possible and warn\u001b[39;00m\n\u001b[1;32m 1600\u001b[0m \u001b[39m# the user if necessary.\u001b[39;00m\n\u001b[1;32m 1601\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_exception \u001b[39m=\u001b[39m \u001b[39mTrue\u001b[39;00m\n", + "File \u001b[0;32m~/Library/Caches/pypoetry/virtualenvs/ethical-fraud-detector-V2FuArT3-py3.10/lib/python3.10/site-packages/joblib/parallel.py:1707\u001b[0m, in \u001b[0;36mParallel._retrieve\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 1702\u001b[0m \u001b[39m# If the next job is not ready for retrieval yet, we just wait for\u001b[39;00m\n\u001b[1;32m 1703\u001b[0m \u001b[39m# async callbacks to progress.\u001b[39;00m\n\u001b[1;32m 1704\u001b[0m \u001b[39mif\u001b[39;00m ((\u001b[39mlen\u001b[39m(\u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_jobs) \u001b[39m==\u001b[39m \u001b[39m0\u001b[39m) \u001b[39mor\u001b[39;00m\n\u001b[1;32m 1705\u001b[0m (\u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_jobs[\u001b[39m0\u001b[39m]\u001b[39m.\u001b[39mget_status(\n\u001b[1;32m 1706\u001b[0m timeout\u001b[39m=\u001b[39m\u001b[39mself\u001b[39m\u001b[39m.\u001b[39mtimeout) \u001b[39m==\u001b[39m TASK_PENDING)):\n\u001b[0;32m-> 1707\u001b[0m time\u001b[39m.\u001b[39;49msleep(\u001b[39m0.01\u001b[39;49m)\n\u001b[1;32m 1708\u001b[0m \u001b[39mcontinue\u001b[39;00m\n\u001b[1;32m 1710\u001b[0m \u001b[39m# We need to be careful: the job list can be filling up as\u001b[39;00m\n\u001b[1;32m 1711\u001b[0m \u001b[39m# we empty it and Python list are not thread-safe by\u001b[39;00m\n\u001b[1;32m 1712\u001b[0m \u001b[39m# default hence the use of the lock\u001b[39;00m\n", + "\u001b[0;31mKeyboardInterrupt\u001b[0m: " + ] + } + ], + "source": [ + "# Determine the optimal number of clusters using the elbow method in parallel\n", + "n_clusters_range = range(500, 5000, 50)\n", + "\n", + "inertia = Parallel(n_jobs=4)(delayed(compute_inertia)(n, data['Body'].tolist()) for n in n_clusters_range)\n", + "# inertia = [compute_inertia(n, data['Body'].tolist()) for n in n_clusters_range]" + ] + }, { "cell_type": "code", "execution_count": null,