diff --git a/config.ini b/config.ini new file mode 100644 index 0000000..5b7f968 --- /dev/null +++ b/config.ini @@ -0,0 +1,3 @@ +[data] +; path to the local enron dataset +; enron = <> \ No newline at end of file diff --git a/notebooks/data_explorer.ipynb b/notebooks/data_explorer.ipynb new file mode 100644 index 0000000..8bcc5ef --- /dev/null +++ b/notebooks/data_explorer.ipynb @@ -0,0 +1,1079 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['../config.ini']" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pandas as pd\n", + "import glob\n", + "import re\n", + "from concurrent.futures import ThreadPoolExecutor\n", + "import email\n", + "\n", + "from utils.cleanup import remove_new_lines\n", + "from utils.data_fetch import LoadEnronData" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "pois = ['kenneth.lay@enron.com', 'ken.rice@enron.com', 'raymond.bowen@enron.com', 'kevin.hannon@enron.com', 'jeff.skilling@enron.com', 'paula.rieker@enron.com', 'david.delainey@enron.com', 'scott.yeager@enron.com', 'rex.shelby@enron.com', 'tim.belden@enron.com', 'ben.glisan@enron.com', 'andrew.fastow@enron.com', 'richard.causey@enron.com', 'wes.colwell@enron.com', 'joe.hirko@enron.com', 'michael.kopper@enron.com', 'mark.koenig@enron.com', 'christopher.calger@enron.com']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "data = LoadEnronData()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "# #get path from config -> [data] -> path assignment\n", + "# #make sure to change the path in config.ini to your local path\n", + "\n", + "# datapath = config['data']['enron']\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "# files = glob.glob(datapath + \"/**/*.\", recursive=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "# def process_email(file):\n", + "# email_fields = {}\n", + "# folder_user = file.split(datapath)[1].split('/')[0]\n", + "# folder_name = file.split(datapath)[1].split('/')[1]\n", + "\n", + "# email_fields['Folder-user'] = folder_user\n", + "# email_fields['Folder-name'] = folder_name\n", + "\n", + "# with open(file, \"rb\") as binary_file:\n", + "# msg = email.message_from_binary_file(binary_file)\n", + "\n", + "# # Extract fields from the email\n", + "# for field in msg.keys():\n", + "# email_fields[field] = msg[field]\n", + "\n", + "# # Extract the email body\n", + "# email_fields['Body'] = msg.get_payload()\n", + "\n", + "# # print(f'Done with user {folder_user} and folder {folder_name}')\n", + "# return email_fields\n", + "\n", + "# def get_email_df(files):\n", + "# emails = []\n", + "# with ThreadPoolExecutor(max_workers=8) as executor: # Adjust max_workers as needed\n", + "# results = list(executor.map(process_email, files))\n", + "# emails.extend(results)\n", + "\n", + "# return pd.DataFrame(emails)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "# data = get_email_df(files)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
folder_userfolder_nameMessage-IDDateFromToSubjectMime-VersionContent-TypeContent-Transfer-Encoding...X-bccX-FolderX-OriginX-FileNameBodyCcBccTimeAttendeesRe
0arnold-jnotes_inbox<17334447.1075857585446.JavaMail.evans@thyme>Thu, 16 Nov 2000 09:30:00 -0800 (PST)msagel@home.comjarnold@enron.comStatus1.0text/plain; charset=ANSI_X3.4-19687bit...\\John_Arnold_Dec2000\\Notes Folders\\Notes inboxArnold-JJarnold.nsfJohn:\\n?\\nI'm not really sure what happened be...NaNNaNNaNNaNNaN
1arnold-jnotes_inbox<19171686.1075857585034.JavaMail.evans@thyme>Fri, 8 Dec 2000 05:05:00 -0800 (PST)slafontaine@globalp.comjohn.arnold@enron.comre:summer inverses1.0text/plain; charset=us-ascii7bit...\\John_Arnold_Dec2000\\Notes Folders\\Notes inboxArnold-JJarnold.nsfi suck-hope youve made more money in natgas la...NaNNaNNaNNaNNaN
2arnold-jnotes_inbox<29887033.1075857630725.JavaMail.evans@thyme>Tue, 15 May 2001 09:43:00 -0700 (PDT)iceoperations@intcx.comicehelpdesk@intcx.com, internalmarketing@intcx...The WTI Bullet swap contracts1.0text/plain; charset=us-ascii7bit...\\John_Arnold_Jun2001\\Notes Folders\\Notes inboxArnold-JJarnold.nsfHi,\\n\\n\\n Following the e-mail you have rece...NaNNaNNaNNaNNaN
3arnold-jnotes_inbox<29084893.1075849630138.JavaMail.evans@thyme>Mon, 27 Nov 2000 01:49:00 -0800 (PST)jeff.youngflesh@enron.comanthony.gilmore@enron.com, colleen.koenig@enro...Invitation: EBS/GSS Meeting w/Bristol Babcock ...1.0text/plain; charset=us-ascii7bit...\\John_Arnold_Nov2001\\Notes Folders\\Notes inboxARNOLD-Jjarnold.nsfConference Room TBD. \\n\\nThis meeting will be...NaNNaNNaNNaNNaN
4arnold-jnotes_inbox<30248874.1075857584813.JavaMail.evans@thyme>Tue, 12 Dec 2000 09:33:00 -0800 (PST)caroline.abramo@enron.commike.grigsby@enron.comHarvard Mgmt1.0text/plain; charset=us-ascii7bit...\\John_Arnold_Dec2000\\Notes Folders\\Notes inboxArnold-JJarnold.nsfMike- I have their trader coming into the offi...john.arnold@enron.comjohn.arnold@enron.comNaNNaNNaN
..................................................................
517395scholtes-dstf<18618854.1075840028791.JavaMail.evans@thyme>Thu, 11 Oct 2001 09:39:47 -0700 (PDT)jodi.droll@xcelenergy.comisas@wscc.comRE: Scheduling Time constant1.0text/plain; charset=us-ascii7bit...\\ExMerge - Scholtes, Diana\\STF\\Current issuesSCHOLTES-DI (PSCO TP & CA) agree with Don. I thought we...NaNNaNNaNNaNNaN
517396scholtes-dstf<14350892.1075840028690.JavaMail.evans@thyme>Wed, 17 Oct 2001 16:11:39 -0700 (PDT)mark.hackney@aps.comisas@wscc.comFW: Standards Announcement - October 17, 20011.0text/plain; charset=us-ascii7bit...\\ExMerge - Scholtes, Diana\\STF\\Current issuesSCHOLTES-DFYI!!\\n\\n-----Original Message-----\\nFrom: Bar...NaNNaNNaNNaNNaN
517397scholtes-dstf<21704474.1075840029683.JavaMail.evans@thyme>Fri, 6 Apr 2001 01:42:00 -0700 (PDT)paul.rice@pacificorp.comisas@wscc.comLate tags1.0text/plain; charset=us-ascii7bit...\\ExMerge - Scholtes, Diana\\STF\\E-TAGSCHOLTES-DNothing is easy is it?? At the risk of the cr...NaNNaNNaNNaNNaN
517398scholtes-dstf<9367927.1075840029633.JavaMail.evans@thyme>Mon, 9 Apr 2001 23:11:00 -0700 (PDT)gjcarter@bpa.gov'hara@enron.com, khara@avistaenergy.com, cara....RE: BCHA Automatic Denial/Approval1.0text/plain; charset=us-ascii7bit...\\ExMerge - Scholtes, Diana\\STF\\E-TAGSCHOLTES-DI think you are right on! In addition, I would...NaNNaNNaNNaNNaN
517399scholtes-dstf<9368661.1075840029659.JavaMail.evans@thyme>Mon, 9 Apr 2001 21:55:00 -0700 (PDT)khara@avistaenergy.comcara.semperger@enron.com, demetrios.fotiou@bch...FW: BCHA Automatic Denial/Approval1.0text/plain; charset=us-ascii7bit...\\ExMerge - Scholtes, Diana\\STF\\E-TAGSCHOLTES-D\\n\\n> -----Original Message-----\\n> From: \\tH...NaNNaNNaNNaNNaN
\n", + "

517400 rows × 23 columns

\n", + "
" + ], + "text/plain": [ + " folder_user folder_name \\\n", + "0 arnold-j notes_inbox \n", + "1 arnold-j notes_inbox \n", + "2 arnold-j notes_inbox \n", + "3 arnold-j notes_inbox \n", + "4 arnold-j notes_inbox \n", + "... ... ... \n", + "517395 scholtes-d stf \n", + "517396 scholtes-d stf \n", + "517397 scholtes-d stf \n", + "517398 scholtes-d stf \n", + "517399 scholtes-d stf \n", + "\n", + " Message-ID \\\n", + "0 <17334447.1075857585446.JavaMail.evans@thyme> \n", + "1 <19171686.1075857585034.JavaMail.evans@thyme> \n", + "2 <29887033.1075857630725.JavaMail.evans@thyme> \n", + "3 <29084893.1075849630138.JavaMail.evans@thyme> \n", + "4 <30248874.1075857584813.JavaMail.evans@thyme> \n", + "... ... \n", + "517395 <18618854.1075840028791.JavaMail.evans@thyme> \n", + "517396 <14350892.1075840028690.JavaMail.evans@thyme> \n", + "517397 <21704474.1075840029683.JavaMail.evans@thyme> \n", + "517398 <9367927.1075840029633.JavaMail.evans@thyme> \n", + "517399 <9368661.1075840029659.JavaMail.evans@thyme> \n", + "\n", + " Date From \\\n", + "0 Thu, 16 Nov 2000 09:30:00 -0800 (PST) msagel@home.com \n", + "1 Fri, 8 Dec 2000 05:05:00 -0800 (PST) slafontaine@globalp.com \n", + "2 Tue, 15 May 2001 09:43:00 -0700 (PDT) iceoperations@intcx.com \n", + "3 Mon, 27 Nov 2000 01:49:00 -0800 (PST) jeff.youngflesh@enron.com \n", + "4 Tue, 12 Dec 2000 09:33:00 -0800 (PST) caroline.abramo@enron.com \n", + "... ... ... \n", + "517395 Thu, 11 Oct 2001 09:39:47 -0700 (PDT) jodi.droll@xcelenergy.com \n", + "517396 Wed, 17 Oct 2001 16:11:39 -0700 (PDT) mark.hackney@aps.com \n", + "517397 Fri, 6 Apr 2001 01:42:00 -0700 (PDT) paul.rice@pacificorp.com \n", + "517398 Mon, 9 Apr 2001 23:11:00 -0700 (PDT) gjcarter@bpa.gov \n", + "517399 Mon, 9 Apr 2001 21:55:00 -0700 (PDT) khara@avistaenergy.com \n", + "\n", + " To \\\n", + "0 jarnold@enron.com \n", + "1 john.arnold@enron.com \n", + "2 icehelpdesk@intcx.com, internalmarketing@intcx... \n", + "3 anthony.gilmore@enron.com, colleen.koenig@enro... \n", + "4 mike.grigsby@enron.com \n", + "... ... \n", + "517395 isas@wscc.com \n", + "517396 isas@wscc.com \n", + "517397 isas@wscc.com \n", + "517398 'hara@enron.com, khara@avistaenergy.com, cara.... \n", + "517399 cara.semperger@enron.com, demetrios.fotiou@bch... \n", + "\n", + " Subject Mime-Version \\\n", + "0 Status 1.0 \n", + "1 re:summer inverses 1.0 \n", + "2 The WTI Bullet swap contracts 1.0 \n", + "3 Invitation: EBS/GSS Meeting w/Bristol Babcock ... 1.0 \n", + "4 Harvard Mgmt 1.0 \n", + "... ... ... \n", + "517395 RE: Scheduling Time constant 1.0 \n", + "517396 FW: Standards Announcement - October 17, 2001 1.0 \n", + "517397 Late tags 1.0 \n", + "517398 RE: BCHA Automatic Denial/Approval 1.0 \n", + "517399 FW: BCHA Automatic Denial/Approval 1.0 \n", + "\n", + " Content-Type Content-Transfer-Encoding ... \\\n", + "0 text/plain; charset=ANSI_X3.4-1968 7bit ... \n", + "1 text/plain; charset=us-ascii 7bit ... \n", + "2 text/plain; charset=us-ascii 7bit ... \n", + "3 text/plain; charset=us-ascii 7bit ... \n", + "4 text/plain; charset=us-ascii 7bit ... \n", + "... ... ... ... \n", + "517395 text/plain; charset=us-ascii 7bit ... \n", + "517396 text/plain; charset=us-ascii 7bit ... \n", + "517397 text/plain; charset=us-ascii 7bit ... \n", + "517398 text/plain; charset=us-ascii 7bit ... \n", + "517399 text/plain; charset=us-ascii 7bit ... \n", + "\n", + " X-bcc X-Folder X-Origin \\\n", + "0 \\John_Arnold_Dec2000\\Notes Folders\\Notes inbox Arnold-J \n", + "1 \\John_Arnold_Dec2000\\Notes Folders\\Notes inbox Arnold-J \n", + "2 \\John_Arnold_Jun2001\\Notes Folders\\Notes inbox Arnold-J \n", + "3 \\John_Arnold_Nov2001\\Notes Folders\\Notes inbox ARNOLD-J \n", + "4 \\John_Arnold_Dec2000\\Notes Folders\\Notes inbox Arnold-J \n", + "... ... ... ... \n", + "517395 \\ExMerge - Scholtes, Diana\\STF\\Current issues SCHOLTES-D \n", + "517396 \\ExMerge - Scholtes, Diana\\STF\\Current issues SCHOLTES-D \n", + "517397 \\ExMerge - Scholtes, Diana\\STF\\E-TAG SCHOLTES-D \n", + "517398 \\ExMerge - Scholtes, Diana\\STF\\E-TAG SCHOLTES-D \n", + "517399 \\ExMerge - Scholtes, Diana\\STF\\E-TAG SCHOLTES-D \n", + "\n", + " X-FileName Body \\\n", + "0 Jarnold.nsf John:\\n?\\nI'm not really sure what happened be... \n", + "1 Jarnold.nsf i suck-hope youve made more money in natgas la... \n", + "2 Jarnold.nsf Hi,\\n\\n\\n Following the e-mail you have rece... \n", + "3 jarnold.nsf Conference Room TBD. \\n\\nThis meeting will be... \n", + "4 Jarnold.nsf Mike- I have their trader coming into the offi... \n", + "... ... ... \n", + "517395 I (PSCO TP & CA) agree with Don. I thought we... \n", + "517396 FYI!!\\n\\n-----Original Message-----\\nFrom: Bar... \n", + "517397 Nothing is easy is it?? At the risk of the cr... \n", + "517398 I think you are right on! In addition, I would... \n", + "517399 \\n\\n> -----Original Message-----\\n> From: \\tH... \n", + "\n", + " Cc Bcc Time Attendees Re \n", + "0 NaN NaN NaN NaN NaN \n", + "1 NaN NaN NaN NaN NaN \n", + "2 NaN NaN NaN NaN NaN \n", + "3 NaN NaN NaN NaN NaN \n", + "4 john.arnold@enron.com john.arnold@enron.com NaN NaN NaN \n", + "... ... ... ... ... ... \n", + "517395 NaN NaN NaN NaN NaN \n", + "517396 NaN NaN NaN NaN NaN \n", + "517397 NaN NaN NaN NaN NaN \n", + "517398 NaN NaN NaN NaN NaN \n", + "517399 NaN NaN NaN NaN NaN \n", + "\n", + "[517400 rows x 23 columns]" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# data" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'<19171686.1075857585034.JavaMail.evans@thyme>\\r\\nDate: Fri, 8 Dec 2000 05:05:00 -0800 (PST)\\r\\nFrom: slafontaine@globalp.com\\r\\nTo: john.arnold@enron.com\\r\\nSubject: re:summer inverses\\r\\nMime-Version: 1.0\\r\\nContent-Type: text/plain; charset=us-ascii\\r\\nContent-Transfer-Encoding: 7bit\\r\\nX-From: slafontaine@globalp.com\\r\\nX-To: John.Arnold@enron.com\\r\\nX-cc: \\r\\nX-bcc: \\r\\nX-Folder: \\\\John_Arnold_Dec2000\\\\Notes Folders\\\\Notes inbox\\r\\nX-Origin: Arnold-J\\r\\nX-FileName: Jarnold.nsf\\r\\n\\r\\ni suck-hope youve made more money in natgas last 3 weeks than i have. mkt shud\\nbe getting bearish feb forward-cuz we already have the weather upon us-fuel\\nswitching and the rest shud invert the whole curve not just dec cash to jan \\nand\\nfeb forward???? have a good weekend john\\n'" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# data['ID'].tolist()[1]" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "data.to_csv('~/Local/datasets/enron_data.csv', index=False, sep=',', lineterminator='\\n')" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
folder_userfolder_nameMessage-IDDateFromToSubjectMime-VersionContent-TypeContent-Transfer-Encoding...X-bccX-FolderX-OriginX-FileNameBodyCcBccTimeAttendeesRe
0arnold-jnotes_inbox<17334447.1075857585446.JavaMail.evans@thyme>Thu, 16 Nov 2000 09:30:00 -0800 (PST)msagel@home.comjarnold@enron.comStatus1.0text/plain; charset=ANSI_X3.4-19687bit...\\John_Arnold_Dec2000\\Notes Folders\\Notes inboxArnold-JJarnold.nsfJohn:\\n?\\nI'm not really sure what happened be...NaNNaNNaNNaNNaN
1arnold-jnotes_inbox<19171686.1075857585034.JavaMail.evans@thyme>Fri, 8 Dec 2000 05:05:00 -0800 (PST)slafontaine@globalp.comjohn.arnold@enron.comre:summer inverses1.0text/plain; charset=us-ascii7bit...\\John_Arnold_Dec2000\\Notes Folders\\Notes inboxArnold-JJarnold.nsfi suck-hope youve made more money in natgas la...NaNNaNNaNNaNNaN
2arnold-jnotes_inbox<29887033.1075857630725.JavaMail.evans@thyme>Tue, 15 May 2001 09:43:00 -0700 (PDT)iceoperations@intcx.comicehelpdesk@intcx.com, internalmarketing@intcx...The WTI Bullet swap contracts1.0text/plain; charset=us-ascii7bit...\\John_Arnold_Jun2001\\Notes Folders\\Notes inboxArnold-JJarnold.nsfHi,\\n\\n\\n Following the e-mail you have rece...NaNNaNNaNNaNNaN
3arnold-jnotes_inbox<29084893.1075849630138.JavaMail.evans@thyme>Mon, 27 Nov 2000 01:49:00 -0800 (PST)jeff.youngflesh@enron.comanthony.gilmore@enron.com, colleen.koenig@enro...Invitation: EBS/GSS Meeting w/Bristol Babcock ...1.0text/plain; charset=us-ascii7bit...\\John_Arnold_Nov2001\\Notes Folders\\Notes inboxARNOLD-Jjarnold.nsfConference Room TBD. \\n\\nThis meeting will be...NaNNaNNaNNaNNaN
4arnold-jnotes_inbox<30248874.1075857584813.JavaMail.evans@thyme>Tue, 12 Dec 2000 09:33:00 -0800 (PST)caroline.abramo@enron.commike.grigsby@enron.comHarvard Mgmt1.0text/plain; charset=us-ascii7bit...\\John_Arnold_Dec2000\\Notes Folders\\Notes inboxArnold-JJarnold.nsfMike- I have their trader coming into the offi...john.arnold@enron.comjohn.arnold@enron.comNaNNaNNaN
..................................................................
517395scholtes-dstf<18618854.1075840028791.JavaMail.evans@thyme>Thu, 11 Oct 2001 09:39:47 -0700 (PDT)jodi.droll@xcelenergy.comisas@wscc.comRE: Scheduling Time constant1.0text/plain; charset=us-ascii7bit...\\ExMerge - Scholtes, Diana\\STF\\Current issuesSCHOLTES-DI (PSCO TP & CA) agree with Don. I thought we...NaNNaNNaNNaNNaN
517396scholtes-dstf<14350892.1075840028690.JavaMail.evans@thyme>Wed, 17 Oct 2001 16:11:39 -0700 (PDT)mark.hackney@aps.comisas@wscc.comFW: Standards Announcement - October 17, 20011.0text/plain; charset=us-ascii7bit...\\ExMerge - Scholtes, Diana\\STF\\Current issuesSCHOLTES-DFYI!!\\n\\n-----Original Message-----\\nFrom: Bar...NaNNaNNaNNaNNaN
517397scholtes-dstf<21704474.1075840029683.JavaMail.evans@thyme>Fri, 6 Apr 2001 01:42:00 -0700 (PDT)paul.rice@pacificorp.comisas@wscc.comLate tags1.0text/plain; charset=us-ascii7bit...\\ExMerge - Scholtes, Diana\\STF\\E-TAGSCHOLTES-DNothing is easy is it?? At the risk of the cr...NaNNaNNaNNaNNaN
517398scholtes-dstf<9367927.1075840029633.JavaMail.evans@thyme>Mon, 9 Apr 2001 23:11:00 -0700 (PDT)gjcarter@bpa.gov'hara@enron.com, khara@avistaenergy.com, cara....RE: BCHA Automatic Denial/Approval1.0text/plain; charset=us-ascii7bit...\\ExMerge - Scholtes, Diana\\STF\\E-TAGSCHOLTES-DI think you are right on! In addition, I would...NaNNaNNaNNaNNaN
517399scholtes-dstf<9368661.1075840029659.JavaMail.evans@thyme>Mon, 9 Apr 2001 21:55:00 -0700 (PDT)khara@avistaenergy.comcara.semperger@enron.com, demetrios.fotiou@bch...FW: BCHA Automatic Denial/Approval1.0text/plain; charset=us-ascii7bit...\\ExMerge - Scholtes, Diana\\STF\\E-TAGSCHOLTES-D\\n\\n> -----Original Message-----\\n> From: \\tH...NaNNaNNaNNaNNaN
\n", + "

517400 rows × 23 columns

\n", + "
" + ], + "text/plain": [ + " folder_user folder_name \\\n", + "0 arnold-j notes_inbox \n", + "1 arnold-j notes_inbox \n", + "2 arnold-j notes_inbox \n", + "3 arnold-j notes_inbox \n", + "4 arnold-j notes_inbox \n", + "... ... ... \n", + "517395 scholtes-d stf \n", + "517396 scholtes-d stf \n", + "517397 scholtes-d stf \n", + "517398 scholtes-d stf \n", + "517399 scholtes-d stf \n", + "\n", + " Message-ID \\\n", + "0 <17334447.1075857585446.JavaMail.evans@thyme> \n", + "1 <19171686.1075857585034.JavaMail.evans@thyme> \n", + "2 <29887033.1075857630725.JavaMail.evans@thyme> \n", + "3 <29084893.1075849630138.JavaMail.evans@thyme> \n", + "4 <30248874.1075857584813.JavaMail.evans@thyme> \n", + "... ... \n", + "517395 <18618854.1075840028791.JavaMail.evans@thyme> \n", + "517396 <14350892.1075840028690.JavaMail.evans@thyme> \n", + "517397 <21704474.1075840029683.JavaMail.evans@thyme> \n", + "517398 <9367927.1075840029633.JavaMail.evans@thyme> \n", + "517399 <9368661.1075840029659.JavaMail.evans@thyme> \n", + "\n", + " Date From \\\n", + "0 Thu, 16 Nov 2000 09:30:00 -0800 (PST) msagel@home.com \n", + "1 Fri, 8 Dec 2000 05:05:00 -0800 (PST) slafontaine@globalp.com \n", + "2 Tue, 15 May 2001 09:43:00 -0700 (PDT) iceoperations@intcx.com \n", + "3 Mon, 27 Nov 2000 01:49:00 -0800 (PST) jeff.youngflesh@enron.com \n", + "4 Tue, 12 Dec 2000 09:33:00 -0800 (PST) caroline.abramo@enron.com \n", + "... ... ... \n", + "517395 Thu, 11 Oct 2001 09:39:47 -0700 (PDT) jodi.droll@xcelenergy.com \n", + "517396 Wed, 17 Oct 2001 16:11:39 -0700 (PDT) mark.hackney@aps.com \n", + "517397 Fri, 6 Apr 2001 01:42:00 -0700 (PDT) paul.rice@pacificorp.com \n", + "517398 Mon, 9 Apr 2001 23:11:00 -0700 (PDT) gjcarter@bpa.gov \n", + "517399 Mon, 9 Apr 2001 21:55:00 -0700 (PDT) khara@avistaenergy.com \n", + "\n", + " To \\\n", + "0 jarnold@enron.com \n", + "1 john.arnold@enron.com \n", + "2 icehelpdesk@intcx.com, internalmarketing@intcx... \n", + "3 anthony.gilmore@enron.com, colleen.koenig@enro... \n", + "4 mike.grigsby@enron.com \n", + "... ... \n", + "517395 isas@wscc.com \n", + "517396 isas@wscc.com \n", + "517397 isas@wscc.com \n", + "517398 'hara@enron.com, khara@avistaenergy.com, cara.... \n", + "517399 cara.semperger@enron.com, demetrios.fotiou@bch... \n", + "\n", + " Subject Mime-Version \\\n", + "0 Status 1.0 \n", + "1 re:summer inverses 1.0 \n", + "2 The WTI Bullet swap contracts 1.0 \n", + "3 Invitation: EBS/GSS Meeting w/Bristol Babcock ... 1.0 \n", + "4 Harvard Mgmt 1.0 \n", + "... ... ... \n", + "517395 RE: Scheduling Time constant 1.0 \n", + "517396 FW: Standards Announcement - October 17, 2001 1.0 \n", + "517397 Late tags 1.0 \n", + "517398 RE: BCHA Automatic Denial/Approval 1.0 \n", + "517399 FW: BCHA Automatic Denial/Approval 1.0 \n", + "\n", + " Content-Type Content-Transfer-Encoding ... \\\n", + "0 text/plain; charset=ANSI_X3.4-1968 7bit ... \n", + "1 text/plain; charset=us-ascii 7bit ... \n", + "2 text/plain; charset=us-ascii 7bit ... \n", + "3 text/plain; charset=us-ascii 7bit ... \n", + "4 text/plain; charset=us-ascii 7bit ... \n", + "... ... ... ... \n", + "517395 text/plain; charset=us-ascii 7bit ... \n", + "517396 text/plain; charset=us-ascii 7bit ... \n", + "517397 text/plain; charset=us-ascii 7bit ... \n", + "517398 text/plain; charset=us-ascii 7bit ... \n", + "517399 text/plain; charset=us-ascii 7bit ... \n", + "\n", + " X-bcc X-Folder X-Origin \\\n", + "0 \\John_Arnold_Dec2000\\Notes Folders\\Notes inbox Arnold-J \n", + "1 \\John_Arnold_Dec2000\\Notes Folders\\Notes inbox Arnold-J \n", + "2 \\John_Arnold_Jun2001\\Notes Folders\\Notes inbox Arnold-J \n", + "3 \\John_Arnold_Nov2001\\Notes Folders\\Notes inbox ARNOLD-J \n", + "4 \\John_Arnold_Dec2000\\Notes Folders\\Notes inbox Arnold-J \n", + "... ... ... ... \n", + "517395 \\ExMerge - Scholtes, Diana\\STF\\Current issues SCHOLTES-D \n", + "517396 \\ExMerge - Scholtes, Diana\\STF\\Current issues SCHOLTES-D \n", + "517397 \\ExMerge - Scholtes, Diana\\STF\\E-TAG SCHOLTES-D \n", + "517398 \\ExMerge - Scholtes, Diana\\STF\\E-TAG SCHOLTES-D \n", + "517399 \\ExMerge - Scholtes, Diana\\STF\\E-TAG SCHOLTES-D \n", + "\n", + " X-FileName Body \\\n", + "0 Jarnold.nsf John:\\n?\\nI'm not really sure what happened be... \n", + "1 Jarnold.nsf i suck-hope youve made more money in natgas la... \n", + "2 Jarnold.nsf Hi,\\n\\n\\n Following the e-mail you have rece... \n", + "3 jarnold.nsf Conference Room TBD. \\n\\nThis meeting will be... \n", + "4 Jarnold.nsf Mike- I have their trader coming into the offi... \n", + "... ... ... \n", + "517395 I (PSCO TP & CA) agree with Don. I thought we... \n", + "517396 FYI!!\\n\\n-----Original Message-----\\nFrom: Bar... \n", + "517397 Nothing is easy is it?? At the risk of the cr... \n", + "517398 I think you are right on! In addition, I would... \n", + "517399 \\n\\n> -----Original Message-----\\n> From: \\tH... \n", + "\n", + " Cc Bcc Time Attendees Re \n", + "0 NaN NaN NaN NaN NaN \n", + "1 NaN NaN NaN NaN NaN \n", + "2 NaN NaN NaN NaN NaN \n", + "3 NaN NaN NaN NaN NaN \n", + "4 john.arnold@enron.com john.arnold@enron.com NaN NaN NaN \n", + "... ... ... ... ... ... \n", + "517395 NaN NaN NaN NaN NaN \n", + "517396 NaN NaN NaN NaN NaN \n", + "517397 NaN NaN NaN NaN NaN \n", + "517398 NaN NaN NaN NaN NaN \n", + "517399 NaN NaN NaN NaN NaN \n", + "\n", + "[517400 rows x 23 columns]" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "ethical-fraud-detector-V2FuArT3-py3.10", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.8" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/utils/__init__,py b/utils/__init__,py new file mode 100644 index 0000000..e69de29 diff --git a/utils/cleanup.py b/utils/cleanup.py new file mode 100644 index 0000000..2c3f505 --- /dev/null +++ b/utils/cleanup.py @@ -0,0 +1,18 @@ +import re + +def remove_new_lines( + text: str, +): + """Remove new lines from text + + Args: + text (str): text to remove new lines from + + Returns: + text (str): text with new lines removed + """ + text = re.sub(r'\r\n', ' ', text) + text = re.sub(r'\n', ' ', text) + text = re.sub(r'\r', ' ', text) + return text + \ No newline at end of file diff --git a/utils/data_fetch.py b/utils/data_fetch.py new file mode 100644 index 0000000..a8f456f --- /dev/null +++ b/utils/data_fetch.py @@ -0,0 +1,76 @@ +import pandas as pd +import glob +import email +from concurrent.futures import ThreadPoolExecutor + +#read config.ini file +import configparser +config = configparser.ConfigParser() +config.read('../config.ini') + + +class LoadEnronData: + def __call__( + self, + datapath: str | None = None, + ): + """Load the Enron email data + + Note: + To run this, please specify the local path to enron dataset in config.ini. + Download path for enron dataset: https://www.cs.cmu.edu/~enron/enron_mail_20150507.tar.gz + + Args: + datapath (str, optional): Path to the Enron email data. Defaults to None. + + Returns: + email_df (pd.DataFrame): DataFrame containing the email data + """ + self.datapath = datapath + + if self.datapath is None: + self.datapath = config['data']['enron'] + + # Get all the email files + files = glob.glob(datapath + "/**/*.", recursive=True) + + # Get the email fields + email_df = self.get_email_df(files) + + return email_df + + def process_email( + self, + file: str, + ): + email_fields = {} + folder_user = file.split(self.datapath)[1].split('/')[0] + folder_name = file.split(self.datapath)[1].split('/')[1] + + email_fields['Folder-User'] = folder_user + email_fields['Folder_Name'] = folder_name + + with open(file, "rb") as binary_file: + msg = email.message_from_binary_file(binary_file) + + # Extract fields from the email + for field in msg.keys(): + email_fields[field] = msg[field] + + # Extract the email body + email_fields['Body'] = msg.get_payload() + + # print(f'Done with user {folder_user} and folder {folder_name}') + return email_fields + + def get_email_df( + self, + files + ): + emails = [] + + with ThreadPoolExecutor(max_workers=8) as executor: # Adjust max_workers as needed + results = list(executor.map(self.process_email, files)) + emails.extend(results) + + return pd.DataFrame(emails)