From b4b92531fcb3a8a8c8227286f4b1d2752b14021a Mon Sep 17 00:00:00 2001 From: advaithsrao Date: Tue, 17 Oct 2023 17:15:39 -0400 Subject: [PATCH] Updated config.ini with POIs and data_fetch script to load enron dataframe conversion and poi load --- config.ini | 147 ++------------------------------------------ utils/data_fetch.py | 56 ++++++++++++++++- 2 files changed, 59 insertions(+), 144 deletions(-) diff --git a/config.ini b/config.ini index cfb53f4..d0ea3d9 100644 --- a/config.ini +++ b/config.ini @@ -1,144 +1,9 @@ [data] -; path to the local enron dataset -; enron = <> +;replace enron = in the below line to make the utils->data_fetch.py->LoadEnronData() work to pull enron data as a DataFrame +enron = https://www.cs.cmu.edu/~enron/enron_mail_20150507.tar.gz -[person_of_interest_email] -email = kenneth_lay@enron.net -email = kenneth_lay@enron.com -email = klay.enron@enron.com -email = kenneth.lay@enron.com -email = klay@enron.com -email = layk@enron.com -email = chairman.ken@enron.com -email = jeffreyskilling@yahoo.com -email = jeff_skilling@enron.com -email = jskilling@enron.com -email = effrey.skilling@enron.com -email = skilling@enron.com -email = jeffrey.k.skilling@enron.com -email = jeff.skilling@enron.com -email = kevin_a_howard.enronxgate.enron@enron.net -email = kevin.hannon@enron.com -email = kevin.hannon@enron.net -email = kevin.hannon@gcm.com -email = michael.krautz@enron.com -email = scott.yeager@enron.com -email = syeager@fyi-net.com -email = scott_yeager@enron.net -email = syeager@flash.net -email = joe'.'hirko@enron.com -email = joe.hirko@enron.com -email = rex.shelby@enron.com -email = rex.shelby@enron.nt -email = rex_shelby@enron.net -email = jbrown@enron.com -email = james.brown@enron.com -email = rick.causey@enron.com -email = richard.causey@enron.com -email = rcausey@enron.com -email = calger@enron.com -email = chris.calger@enron.com -email = christopher.calger@enron.com -email = ccalger@enron.com -email = tim_despain.enronxgate.enron@enron.net 
-email = tim.despain@enron.com -email = kevin_hannon@enron.com -email = kevin'.'hannon@enron.com -email = kevin_hannon@enron.net -email = kevin.hannon@enron.com -email = mkoenig@enron.com -email = mark.koenig@enron.com -email = m..forney@enron.com -email = ken'.'rice@enron.com -email = ken.rice@enron.com -email = ken_rice@enron.com -email = ken_rice@enron.net -email = paula.rieker@enron.com -email = prieker@enron.com -email = andrew.fastow@enron.com -email = lfastow@pdq.net -email = andrew.s.fastow@enron.com -email = lfastow@pop.pdq.net -email = andy.fastow@enron.com -email = david.w.delainey@enron.com -email = delainey.dave@enron.com -email = 'delainey@enron.com -email = david.delainey@enron.com -email = 'david.delainey'@enron.com -email = dave.delainey@enron.com -email = delainey'.'david@enron.com -email = ben.glisan@enron.com -email = bglisan@enron.com -email = ben_f_glisan@enron.com -email = ben'.'glisan@enron.com -email = jeff.richter@enron.com -email = jrichter@nwlink.com -email = lawrencelawyer@aol.com -email = lawyer'.'larry@enron.com -email = larry_lawyer@enron.com -email = llawyer@enron.com -email = larry.lawyer@enron.com -email = lawrence.lawyer@enron.com -email = tbelden@enron.com -email = tim.belden@enron.com -email = tim_belden@pgn.com -email = tbelden@ect.enron.com -email = michael.kopper@enron.com -email = dave.duncan@enron.com -email = dave.duncan@cipco.org -email = duncan.dave@enron.com -email = ray.bowen@enron.com -email = raymond.bowen@enron.com -email = 'bowen@enron.com -email = wes.colwell@enron.com -email = dan.boyle@enron.com -email = cloehr@enron.com -email = chris.loehr@enron.com -email = joe.hirko@enron.com -email = kevin.hannon@enron.com -email = mforney@enron.com -email = ken.rice@enron.com -email = delainey@enron.com -email = david.delainey@enron.com -email = delainey.david@enron.com -email = ben.glisan@enron.com -email = lawyer.larry@enron.com -email = bowen@enron.com +[person_of_interest.emails] +emails = kenneth_lay@enron.net & 
kenneth_lay@enron.com & klay.enron@enron.com & kenneth.lay@enron.com & klay@enron.com & layk@enron.com & chairman.ken@enron.com & jeffreyskilling@yahoo.com & jeff_skilling@enron.com & jskilling@enron.com & effrey.skilling@enron.com & skilling@enron.com & jeffrey.k.skilling@enron.com & jeff.skilling@enron.com & kevin_a_howard.enronxgate.enron@enron.net & kevin.hannon@enron.com & kevin.hannon@enron.net & kevin.hannon@gcm.com & michael.krautz@enron.com & scott.yeager@enron.com & syeager@fyi-net.com & scott_yeager@enron.net & syeager@flash.net & joe'.'hirko@enron.com & joe.hirko@enron.com & rex.shelby@enron.com & rex.shelby@enron.nt & rex_shelby@enron.net & jbrown@enron.com & james.brown@enron.com & rick.causey@enron.com & richard.causey@enron.com & rcausey@enron.com & calger@enron.com & chris.calger@enron.com & christopher.calger@enron.com & ccalger@enron.com & tim_despain.enronxgate.enron@enron.net & tim.despain@enron.com & kevin_hannon@enron.com & kevin'.'hannon@enron.com & kevin_hannon@enron.net & kevin.hannon@enron.com & mkoenig@enron.com & mark.koenig@enron.com & m..forney@enron.com & ken'.'rice@enron.com & ken.rice@enron.com & ken_rice@enron.com & ken_rice@enron.net & paula.rieker@enron.com & prieker@enron.com & andrew.fastow@enron.com & lfastow@pdq.net & andrew.s.fastow@enron.com & lfastow@pop.pdq.net & andy.fastow@enron.com & david.w.delainey@enron.com & delainey.dave@enron.com & 'delainey@enron.com & david.delainey@enron.com & 'david.delainey'@enron.com & dave.delainey@enron.com & delainey'.'david@enron.com & ben.glisan@enron.com & bglisan@enron.com & ben_f_glisan@enron.com & ben'.'glisan@enron.com & jeff.richter@enron.com & jrichter@nwlink.com & lawrencelawyer@aol.com & lawyer'.'larry@enron.com & larry_lawyer@enron.com & llawyer@enron.com & larry.lawyer@enron.com & lawrence.lawyer@enron.com & tbelden@enron.com & tim.belden@enron.com & tim_belden@pgn.com & tbelden@ect.enron.com & michael.kopper@enron.com & dave.duncan@enron.com & dave.duncan@cipco.org & 
duncan.dave@enron.com & ray.bowen@enron.com & raymond.bowen@enron.com & 'bowen@enron.com & wes.colwell@enron.com & dan.boyle@enron.com & cloehr@enron.com & chris.loehr@enron.com & joe.hirko@enron.com & kevin.hannon@enron.com & mforney@enron.com & ken.rice@enron.com & delainey@enron.com & david.delainey@enron.com & delainey.david@enron.com & ben.glisan@enron.com & lawyer.larry@enron.com & bowen@enron.com -[person_of_interest_name] -[Names] -name = Lay, Kenneth -name = Skilling, Jeffrey -name = Howard, Kevin -name = Krautz, Michael -name = Yeager, Scott -name = Hirko, Joseph -name = Shelby, Rex -name = Bermingham, David -name = Darby, Giles -name = Mulgrew, Gary -name = Bayley, Daniel -name = Brown, James -name = Furst, Robert -name = Fuhs, William -name = Causey, Richard -name = Calger, Christopher -name = DeSpain, Timothy -name = Hannon, Kevin -name = Koenig, Mark -name = Forney, John -name = Rice, Kenneth -name = Rieker, Paula -name = Fastow, Lea -name = Fastow, Andrew -name = Delainey, David -name = Glisan, Ben -name = Richter, Jeffrey -name = Lawyer, Larry -name = Belden, Timothy -name = Kopper, Michael -name = Duncan, David -name = Bowen, Raymond -name = Colwell, Wesley -name = Boyle, Dan -name = Loehr, Christopher \ No newline at end of file +[person_of_interest.names] +names = Lay, Kenneth & Skilling, Jeffrey & Howard, Kevin & Krautz, Michael & Yeager, Scott & Hirko, Joseph & Shelby, Rex & Bermingham, David & Darby, Giles & Mulgrew, Gary & Bayley, Daniel & Brown, James & Furst, Robert & Fuhs, William & Causey, Richard & Calger, Christopher & DeSpain, Timothy & Hannon, Kevin & Koenig, Mark & Forney, John & Rice, Kenneth & Rieker, Paula & Fastow, Lea & Fastow, Andrew & Delainey, David & Glisan, Ben & Richter, Jeffrey & Lawyer, Larry & Belden, Timothy & Kopper, Michael & Duncan, David & Bowen, Raymond & Colwell, Wesley & Boyle, Dan & Loehr, Christopher diff --git a/utils/data_fetch.py b/utils/data_fetch.py index a8f456f..73cb6a0 100644 --- a/utils/data_fetch.py 
+++ b/utils/data_fetch.py @@ -1,3 +1,4 @@ +import os import pandas as pd import glob import email @@ -6,7 +7,57 @@ #read config.ini file import configparser config = configparser.ConfigParser() -config.read('../config.ini') +config.read( + os.path.join( + os.path.dirname(os.path.abspath(__file__)), + '../config.ini' + ) +) + + +class PersonOfInterest: + def __init__( + self, + name_list: list[str] | None = None, + email_list: list[str] | None = None, + ): + """Class to operate with the person of interest data from config.ini file + """ + self.poi = {} + + #read the [person_of_interest.names] and [person_of_interest.emails] sections from config.ini file if not given explicitly + if name_list is None: + self.poi['names'] = config['person_of_interest.names']['names'] + else: + self.poi['names'] = name_list + + if email_list is None: + self.poi['emails'] = config['person_of_interest.emails']['emails'] + else: + self.poi['emails'] = email_list + + #convert the values to lists + self.poi['names'] = [name.strip() for name in self.poi['names'].split('&')] + self.poi['emails'] = [email.strip() for email in self.poi['emails'].split('&')] + + def check_person_of_interest_name( + self, + name: str + ): + if name in self.poi['names']: + return True + + def check_person_of_interest_email( + self, + email: str + ): + if email in self.poi['emails']: + return True + + def return_person_of_interest( + self, + ): + return self.poi class LoadEnronData: @@ -17,8 +68,7 @@ def __call__( """Load the Enron email data Note: - To run this, please specify the local path to enron dataset in config.ini. - Download path for enron dataset: https://www.cs.cmu.edu/~enron/enron_mail_20150507.tar.gz + To run this locally Args: datapath (str, optional): Path to the Enron email data. Defaults to None.