diff --git a/config.ini b/config.ini new file mode 100644 index 0000000..5b7f968 --- /dev/null +++ b/config.ini @@ -0,0 +1,3 @@ +[data] +; path to the local enron dataset +; enron = <> \ No newline at end of file diff --git a/notebooks/data_explorer.ipynb b/notebooks/data_explorer.ipynb new file mode 100644 index 0000000..8bcc5ef --- /dev/null +++ b/notebooks/data_explorer.ipynb @@ -0,0 +1,1079 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['../config.ini']" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pandas as pd\n", + "import glob\n", + "import re\n", + "from concurrent.futures import ThreadPoolExecutor\n", + "import email\n", + "\n", + "from utils.cleanup import remove_new_lines\n", + "from utils.data_fetch import LoadEnronData" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "pois = ['kenneth.lay@enron.com', 'ken.rice@enron.com', 'raymond.bowen@enron.com', 'kevin.hannon@enron.com', 'jeff.skilling@enron.com', 'paula.rieker@enron.com', 'david.delainey@enron.com', 'scott.yeager@enron.com', 'rex.shelby@enron.com', 'tim.belden@enron.com', 'ben.glisan@enron.com', 'andrew.fastow@enron.com', 'richard.causey@enron.com', 'wes.colwell@enron.com', 'joe.hirko@enron.com', 'michael.kopper@enron.com', 'mark.koenig@enron.com', 'christopher.calger@enron.com']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "data = LoadEnronData()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "# #get path from config -> [data] -> path assignment\n", + "# #make sure to change the path in config.ini to your local path\n", + "\n", + "# datapath = config['data']['enron']\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "# files = glob.glob(datapath + \"/**/*.\", recursive=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "# def process_email(file):\n", + "# email_fields = {}\n", + "# folder_user = file.split(datapath)[1].split('/')[0]\n", + "# folder_name = file.split(datapath)[1].split('/')[1]\n", + "\n", + "# email_fields['Folder-user'] = folder_user\n", + "# email_fields['Folder-name'] = folder_name\n", + "\n", + "# with open(file, \"rb\") as binary_file:\n", + "# msg = email.message_from_binary_file(binary_file)\n", + "\n", + "# # Extract fields from the email\n", + "# for field in msg.keys():\n", + "# email_fields[field] = msg[field]\n", + "\n", + "# # Extract the email body\n", + "# email_fields['Body'] = msg.get_payload()\n", + "\n", + "# # print(f'Done with user {folder_user} and folder {folder_name}')\n", + "# return email_fields\n", + "\n", + "# def get_email_df(files):\n", + "# emails = []\n", + "# with ThreadPoolExecutor(max_workers=8) as executor: # Adjust max_workers as needed\n", + "# results = list(executor.map(process_email, files))\n", + "# emails.extend(results)\n", + "\n", + "# return pd.DataFrame(emails)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "# data = get_email_df(files)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + " | folder_user | \n", + "folder_name | \n", + "Message-ID | \n", + "Date | \n", + "From | \n", + "To | \n", + "Subject | \n", + "Mime-Version | \n", + "Content-Type | \n", + "Content-Transfer-Encoding | \n", + "... | \n", + "X-bcc | \n", + "X-Folder | \n", + "X-Origin | \n", + "X-FileName | \n", + "Body | \n", + "Cc | \n", + "Bcc | \n", + "Time | \n", + "Attendees | \n", + "Re | \n", + "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", + "arnold-j | \n", + "notes_inbox | \n", + "<17334447.1075857585446.JavaMail.evans@thyme> | \n", + "Thu, 16 Nov 2000 09:30:00 -0800 (PST) | \n", + "msagel@home.com | \n", + "jarnold@enron.com | \n", + "Status | \n", + "1.0 | \n", + "text/plain; charset=ANSI_X3.4-1968 | \n", + "7bit | \n", + "... | \n", + "\n", + " | \\John_Arnold_Dec2000\\Notes Folders\\Notes inbox | \n", + "Arnold-J | \n", + "Jarnold.nsf | \n", + "John:\\n?\\nI'm not really sure what happened be... | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "
1 | \n", + "arnold-j | \n", + "notes_inbox | \n", + "<19171686.1075857585034.JavaMail.evans@thyme> | \n", + "Fri, 8 Dec 2000 05:05:00 -0800 (PST) | \n", + "slafontaine@globalp.com | \n", + "john.arnold@enron.com | \n", + "re:summer inverses | \n", + "1.0 | \n", + "text/plain; charset=us-ascii | \n", + "7bit | \n", + "... | \n", + "\n", + " | \\John_Arnold_Dec2000\\Notes Folders\\Notes inbox | \n", + "Arnold-J | \n", + "Jarnold.nsf | \n", + "i suck-hope youve made more money in natgas la... | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "
2 | \n", + "arnold-j | \n", + "notes_inbox | \n", + "<29887033.1075857630725.JavaMail.evans@thyme> | \n", + "Tue, 15 May 2001 09:43:00 -0700 (PDT) | \n", + "iceoperations@intcx.com | \n", + "icehelpdesk@intcx.com, internalmarketing@intcx... | \n", + "The WTI Bullet swap contracts | \n", + "1.0 | \n", + "text/plain; charset=us-ascii | \n", + "7bit | \n", + "... | \n", + "\n", + " | \\John_Arnold_Jun2001\\Notes Folders\\Notes inbox | \n", + "Arnold-J | \n", + "Jarnold.nsf | \n", + "Hi,\\n\\n\\n Following the e-mail you have rece... | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "
3 | \n", + "arnold-j | \n", + "notes_inbox | \n", + "<29084893.1075849630138.JavaMail.evans@thyme> | \n", + "Mon, 27 Nov 2000 01:49:00 -0800 (PST) | \n", + "jeff.youngflesh@enron.com | \n", + "anthony.gilmore@enron.com, colleen.koenig@enro... | \n", + "Invitation: EBS/GSS Meeting w/Bristol Babcock ... | \n", + "1.0 | \n", + "text/plain; charset=us-ascii | \n", + "7bit | \n", + "... | \n", + "\n", + " | \\John_Arnold_Nov2001\\Notes Folders\\Notes inbox | \n", + "ARNOLD-J | \n", + "jarnold.nsf | \n", + "Conference Room TBD. \\n\\nThis meeting will be... | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "
4 | \n", + "arnold-j | \n", + "notes_inbox | \n", + "<30248874.1075857584813.JavaMail.evans@thyme> | \n", + "Tue, 12 Dec 2000 09:33:00 -0800 (PST) | \n", + "caroline.abramo@enron.com | \n", + "mike.grigsby@enron.com | \n", + "Harvard Mgmt | \n", + "1.0 | \n", + "text/plain; charset=us-ascii | \n", + "7bit | \n", + "... | \n", + "\n", + " | \\John_Arnold_Dec2000\\Notes Folders\\Notes inbox | \n", + "Arnold-J | \n", + "Jarnold.nsf | \n", + "Mike- I have their trader coming into the offi... | \n", + "john.arnold@enron.com | \n", + "john.arnold@enron.com | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "
... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "
517395 | \n", + "scholtes-d | \n", + "stf | \n", + "<18618854.1075840028791.JavaMail.evans@thyme> | \n", + "Thu, 11 Oct 2001 09:39:47 -0700 (PDT) | \n", + "jodi.droll@xcelenergy.com | \n", + "isas@wscc.com | \n", + "RE: Scheduling Time constant | \n", + "1.0 | \n", + "text/plain; charset=us-ascii | \n", + "7bit | \n", + "... | \n", + "\n", + " | \\ExMerge - Scholtes, Diana\\STF\\Current issues | \n", + "SCHOLTES-D | \n", + "\n", + " | I (PSCO TP & CA) agree with Don. I thought we... | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "
517396 | \n", + "scholtes-d | \n", + "stf | \n", + "<14350892.1075840028690.JavaMail.evans@thyme> | \n", + "Wed, 17 Oct 2001 16:11:39 -0700 (PDT) | \n", + "mark.hackney@aps.com | \n", + "isas@wscc.com | \n", + "FW: Standards Announcement - October 17, 2001 | \n", + "1.0 | \n", + "text/plain; charset=us-ascii | \n", + "7bit | \n", + "... | \n", + "\n", + " | \\ExMerge - Scholtes, Diana\\STF\\Current issues | \n", + "SCHOLTES-D | \n", + "\n", + " | FYI!!\\n\\n-----Original Message-----\\nFrom: Bar... | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "
517397 | \n", + "scholtes-d | \n", + "stf | \n", + "<21704474.1075840029683.JavaMail.evans@thyme> | \n", + "Fri, 6 Apr 2001 01:42:00 -0700 (PDT) | \n", + "paul.rice@pacificorp.com | \n", + "isas@wscc.com | \n", + "Late tags | \n", + "1.0 | \n", + "text/plain; charset=us-ascii | \n", + "7bit | \n", + "... | \n", + "\n", + " | \\ExMerge - Scholtes, Diana\\STF\\E-TAG | \n", + "SCHOLTES-D | \n", + "\n", + " | Nothing is easy is it?? At the risk of the cr... | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "
517398 | \n", + "scholtes-d | \n", + "stf | \n", + "<9367927.1075840029633.JavaMail.evans@thyme> | \n", + "Mon, 9 Apr 2001 23:11:00 -0700 (PDT) | \n", + "gjcarter@bpa.gov | \n", + "'hara@enron.com, khara@avistaenergy.com, cara.... | \n", + "RE: BCHA Automatic Denial/Approval | \n", + "1.0 | \n", + "text/plain; charset=us-ascii | \n", + "7bit | \n", + "... | \n", + "\n", + " | \\ExMerge - Scholtes, Diana\\STF\\E-TAG | \n", + "SCHOLTES-D | \n", + "\n", + " | I think you are right on! In addition, I would... | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "
517399 | \n", + "scholtes-d | \n", + "stf | \n", + "<9368661.1075840029659.JavaMail.evans@thyme> | \n", + "Mon, 9 Apr 2001 21:55:00 -0700 (PDT) | \n", + "khara@avistaenergy.com | \n", + "cara.semperger@enron.com, demetrios.fotiou@bch... | \n", + "FW: BCHA Automatic Denial/Approval | \n", + "1.0 | \n", + "text/plain; charset=us-ascii | \n", + "7bit | \n", + "... | \n", + "\n", + " | \\ExMerge - Scholtes, Diana\\STF\\E-TAG | \n", + "SCHOLTES-D | \n", + "\n", + " | \\n\\n> -----Original Message-----\\n> From: \\tH... | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "
517400 rows × 23 columns
\n", + "\n", + " | folder_user | \n", + "folder_name | \n", + "Message-ID | \n", + "Date | \n", + "From | \n", + "To | \n", + "Subject | \n", + "Mime-Version | \n", + "Content-Type | \n", + "Content-Transfer-Encoding | \n", + "... | \n", + "X-bcc | \n", + "X-Folder | \n", + "X-Origin | \n", + "X-FileName | \n", + "Body | \n", + "Cc | \n", + "Bcc | \n", + "Time | \n", + "Attendees | \n", + "Re | \n", + "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", + "arnold-j | \n", + "notes_inbox | \n", + "<17334447.1075857585446.JavaMail.evans@thyme> | \n", + "Thu, 16 Nov 2000 09:30:00 -0800 (PST) | \n", + "msagel@home.com | \n", + "jarnold@enron.com | \n", + "Status | \n", + "1.0 | \n", + "text/plain; charset=ANSI_X3.4-1968 | \n", + "7bit | \n", + "... | \n", + "\n", + " | \\John_Arnold_Dec2000\\Notes Folders\\Notes inbox | \n", + "Arnold-J | \n", + "Jarnold.nsf | \n", + "John:\\n?\\nI'm not really sure what happened be... | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "
1 | \n", + "arnold-j | \n", + "notes_inbox | \n", + "<19171686.1075857585034.JavaMail.evans@thyme> | \n", + "Fri, 8 Dec 2000 05:05:00 -0800 (PST) | \n", + "slafontaine@globalp.com | \n", + "john.arnold@enron.com | \n", + "re:summer inverses | \n", + "1.0 | \n", + "text/plain; charset=us-ascii | \n", + "7bit | \n", + "... | \n", + "\n", + " | \\John_Arnold_Dec2000\\Notes Folders\\Notes inbox | \n", + "Arnold-J | \n", + "Jarnold.nsf | \n", + "i suck-hope youve made more money in natgas la... | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "
2 | \n", + "arnold-j | \n", + "notes_inbox | \n", + "<29887033.1075857630725.JavaMail.evans@thyme> | \n", + "Tue, 15 May 2001 09:43:00 -0700 (PDT) | \n", + "iceoperations@intcx.com | \n", + "icehelpdesk@intcx.com, internalmarketing@intcx... | \n", + "The WTI Bullet swap contracts | \n", + "1.0 | \n", + "text/plain; charset=us-ascii | \n", + "7bit | \n", + "... | \n", + "\n", + " | \\John_Arnold_Jun2001\\Notes Folders\\Notes inbox | \n", + "Arnold-J | \n", + "Jarnold.nsf | \n", + "Hi,\\n\\n\\n Following the e-mail you have rece... | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "
3 | \n", + "arnold-j | \n", + "notes_inbox | \n", + "<29084893.1075849630138.JavaMail.evans@thyme> | \n", + "Mon, 27 Nov 2000 01:49:00 -0800 (PST) | \n", + "jeff.youngflesh@enron.com | \n", + "anthony.gilmore@enron.com, colleen.koenig@enro... | \n", + "Invitation: EBS/GSS Meeting w/Bristol Babcock ... | \n", + "1.0 | \n", + "text/plain; charset=us-ascii | \n", + "7bit | \n", + "... | \n", + "\n", + " | \\John_Arnold_Nov2001\\Notes Folders\\Notes inbox | \n", + "ARNOLD-J | \n", + "jarnold.nsf | \n", + "Conference Room TBD. \\n\\nThis meeting will be... | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "
4 | \n", + "arnold-j | \n", + "notes_inbox | \n", + "<30248874.1075857584813.JavaMail.evans@thyme> | \n", + "Tue, 12 Dec 2000 09:33:00 -0800 (PST) | \n", + "caroline.abramo@enron.com | \n", + "mike.grigsby@enron.com | \n", + "Harvard Mgmt | \n", + "1.0 | \n", + "text/plain; charset=us-ascii | \n", + "7bit | \n", + "... | \n", + "\n", + " | \\John_Arnold_Dec2000\\Notes Folders\\Notes inbox | \n", + "Arnold-J | \n", + "Jarnold.nsf | \n", + "Mike- I have their trader coming into the offi... | \n", + "john.arnold@enron.com | \n", + "john.arnold@enron.com | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "
... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "
517395 | \n", + "scholtes-d | \n", + "stf | \n", + "<18618854.1075840028791.JavaMail.evans@thyme> | \n", + "Thu, 11 Oct 2001 09:39:47 -0700 (PDT) | \n", + "jodi.droll@xcelenergy.com | \n", + "isas@wscc.com | \n", + "RE: Scheduling Time constant | \n", + "1.0 | \n", + "text/plain; charset=us-ascii | \n", + "7bit | \n", + "... | \n", + "\n", + " | \\ExMerge - Scholtes, Diana\\STF\\Current issues | \n", + "SCHOLTES-D | \n", + "\n", + " | I (PSCO TP & CA) agree with Don. I thought we... | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "
517396 | \n", + "scholtes-d | \n", + "stf | \n", + "<14350892.1075840028690.JavaMail.evans@thyme> | \n", + "Wed, 17 Oct 2001 16:11:39 -0700 (PDT) | \n", + "mark.hackney@aps.com | \n", + "isas@wscc.com | \n", + "FW: Standards Announcement - October 17, 2001 | \n", + "1.0 | \n", + "text/plain; charset=us-ascii | \n", + "7bit | \n", + "... | \n", + "\n", + " | \\ExMerge - Scholtes, Diana\\STF\\Current issues | \n", + "SCHOLTES-D | \n", + "\n", + " | FYI!!\\n\\n-----Original Message-----\\nFrom: Bar... | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "
517397 | \n", + "scholtes-d | \n", + "stf | \n", + "<21704474.1075840029683.JavaMail.evans@thyme> | \n", + "Fri, 6 Apr 2001 01:42:00 -0700 (PDT) | \n", + "paul.rice@pacificorp.com | \n", + "isas@wscc.com | \n", + "Late tags | \n", + "1.0 | \n", + "text/plain; charset=us-ascii | \n", + "7bit | \n", + "... | \n", + "\n", + " | \\ExMerge - Scholtes, Diana\\STF\\E-TAG | \n", + "SCHOLTES-D | \n", + "\n", + " | Nothing is easy is it?? At the risk of the cr... | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "
517398 | \n", + "scholtes-d | \n", + "stf | \n", + "<9367927.1075840029633.JavaMail.evans@thyme> | \n", + "Mon, 9 Apr 2001 23:11:00 -0700 (PDT) | \n", + "gjcarter@bpa.gov | \n", + "'hara@enron.com, khara@avistaenergy.com, cara.... | \n", + "RE: BCHA Automatic Denial/Approval | \n", + "1.0 | \n", + "text/plain; charset=us-ascii | \n", + "7bit | \n", + "... | \n", + "\n", + " | \\ExMerge - Scholtes, Diana\\STF\\E-TAG | \n", + "SCHOLTES-D | \n", + "\n", + " | I think you are right on! In addition, I would... | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "
517399 | \n", + "scholtes-d | \n", + "stf | \n", + "<9368661.1075840029659.JavaMail.evans@thyme> | \n", + "Mon, 9 Apr 2001 21:55:00 -0700 (PDT) | \n", + "khara@avistaenergy.com | \n", + "cara.semperger@enron.com, demetrios.fotiou@bch... | \n", + "FW: BCHA Automatic Denial/Approval | \n", + "1.0 | \n", + "text/plain; charset=us-ascii | \n", + "7bit | \n", + "... | \n", + "\n", + " | \\ExMerge - Scholtes, Diana\\STF\\E-TAG | \n", + "SCHOLTES-D | \n", + "\n", + " | \\n\\n> -----Original Message-----\\n> From: \\tH... | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "
517400 rows × 23 columns
\n", + "