From d662b9d7d13612dfa37249865695b4353340bee9 Mon Sep 17 00:00:00 2001 From: Advaith S Rao Date: Wed, 18 Oct 2023 01:06:19 -0400 Subject: [PATCH] Added basic test for utils and updated preprocessor (#11) * Added basic tests and updated preprocessor * Updated pipeline.yml * Updated pipeline.yml with v3.9 python * Updated data_fetch.py for issue fix with poi * Updated data_fetch.py for issue fix with LoadEnronData --- .github/workflows/pipeline.yml | 9 +- tests/__init__.py | 0 tests/__pycache__/__init__.cpython-310.pyc | Bin 0 -> 150 bytes .../test_utils.cpython-310-pytest-6.2.5.pyc | Bin 0 -> 5414 bytes tests/test_utils.py | 69 +++++++++++++ utils/cleanup.py | 93 +++++++++++++++--- utils/data_fetch.py | 7 +- 7 files changed, 159 insertions(+), 19 deletions(-) create mode 100644 tests/__init__.py create mode 100644 tests/__pycache__/__init__.cpython-310.pyc create mode 100644 tests/__pycache__/test_utils.cpython-310-pytest-6.2.5.pyc create mode 100644 tests/test_utils.py diff --git a/.github/workflows/pipeline.yml b/.github/workflows/pipeline.yml index 180900a..bc5155f 100644 --- a/.github/workflows/pipeline.yml +++ b/.github/workflows/pipeline.yml @@ -1,4 +1,4 @@ -name: CICD +name: Fraud Detector CI/CD on: push: @@ -19,7 +19,7 @@ jobs: - name: Set up Python uses: actions/setup-python@v2 with: - python-version: 3.9 # Change to your desired Python version + python-version: 3.9 - name: Install Poetry run: | @@ -32,3 +32,8 @@ jobs: run: | poetry install working-directory: ${{ github.workspace }} + + - name: Run tests + run: | + poetry run pytest -v tests/ + working-directory: ${{ github.workspace }} diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/__pycache__/__init__.cpython-310.pyc b/tests/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..35d2e88f7f75e5a9ee643891b291f72c8ba502e3 GIT binary patch literal 150 zcmd1j<>g`kg2f^FsUZ3>h(HF6K#l_t7qb9~6oz01O-8?!3`HPe1o6vKKeRZts8~O- zC^28(CqFqcN8dfOB%?G*->oRIG)31XwInsUB)>?%B(=DtSU)~KGcU6wK3=b&@)n0p TZhlH>PO2Tq$YLfS!NLFl->e~- literal 0 HcmV?d00001 diff --git a/tests/__pycache__/test_utils.cpython-310-pytest-6.2.5.pyc b/tests/__pycache__/test_utils.cpython-310-pytest-6.2.5.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7127d17115e4506bff533ae310c4c4aaa31e284c GIT binary patch literal 5414 zcmcIoTXP&o6`q-0?OhjNg9(AgRUzJrtc$O)g`LQ^*u-&Mv51R>s;$;`uXe22=}q^n z1Gc!3ulsNx|!z;}9f*CXYEP{Bys)6>8Z+~xIo9~}}CBKLvwzFRvi z@}ls8Q#&GxqJ;ZVc}$eoJ$*}-w~y~Yr*I$GU$ycVpmOf@y}7xn8&~dY+0m*YjZwPl z#WVM%HY$9hc_)mdmL`fP?@;0(Xn)NQdB9$WD!wLdV>0F{yQNk8GArKZn$NP=c+~2RdLGCyx?Fr+g@X5H*>Xqw z0b95<%g!t;EV9ML%jeIIMy@D~<9eOl=8g4G_SmVLr`y~Qs-pqd8<&gAL7=6WW%p$m z_LkUnzwL)=maQmyx+OPdFw0(*O0UT!c3TAj4>9;Px#5RP>^rIz!Zouad2g!KB?#i= zFA~vSVsEOx3<5dJZt##Vv1@XT2W^htYu!$#g_av?GxWo?S@r;~^p{4WUbrn`wAAbx z*YbrWc2_mlA!WBGx~q?51J1a1{q{0j?nbK3Bfr6J$k6A({Bj4&xWR*=Ncf$c@$~7T zWG%iS*{YNwi&P-kYLBTk?Qz#ayXaqTw zQa}z{^C7fr7!TPS%Mb26U}3kt3QM|Sgr#cnpn1B(O*F^YohI9qY?J#@(m9ee9z)`^ zY)F#G8Eo+dQ{4DwD07zPH+TpXjBhCP2D}?KT5u?8ksJw(_?jVofbECtBMTs=6L4&F zs8OulZM0Yi8{Bj?d;piUBx_6EXz?(b+vPSa3FCoLOoj~wv!o`4@b+pT+lFoWQHx!> z&TjXFR&W>}t;`4xAFj;2uq(xkYDSw73WyKMxh)Kkp<#X#R=>4;_s$Il3mEXy1bErE z=tkmB-cxLaV+-MaD^Qy!SqF#|_9>arP+_E1zzFovRRmCIzG?dt46|4+F%pRNfG%}M zQgj1=>PF_};ZSVSJ8V;Ru^9t@-Kt)b5ggRE+mh^w=!B9pOgB-ef(^I?JOC|`8tL@g z9V`Zc!{EIhnmftZ3S^C!%t+ZLJ69zHE3z8iTkXORPwjq{2F z)m=+TCA%ez?$N38!wrk{=~jZVE7)A?W=tUVc!RDqw`Tmz(V)##@#)h1cMa%x9wXE| zI047}RV-$Ab^ey-U2*z`jKBy=&qp9?W6OFM$Ii@kdU0Mp=H!(kk+X*)>*+jh4Pxxk zMGf?`4F1=Q<3!H;&XzlH2M8)#o^ZD^gUrC&aJRC9ta&+dw>{wvvQ1am_NKR!MH}in zuE<8&?TH;{!!f7OmKz|T>C?6swGjPc;)!?M*+R$?`N4$mqMnQL7*QDHcHH;f_Yfqr z&Q@kCD+;^xzJoFQ(JRJ!{R8SP?$Z0~ed^sF>;35g^_F(&{rf)k{(h|Yj|bFSPV4>9 zHD~sx86%$^>Hg5gL8LwK->-6IkM_6st-Y4gUO%Ar$vxVCzHjY6Olkk|0ku!<(f+r6 zYyTvr{SOD!?jTQ_z8AZts{3h<;;A22-Tv|CPjz~WRm5A{!szjA)xg;wQO1L_^U8EYgIs-Nzjkp8}g@FS$zlvIsYQ*NY?(Vquj#>6jabRLR^DVR$BDCr6 zQ!mvp6a4=qXX5}~S;ldogc0Rkall0^@RE2iaJJnYM-)#uTY1=+hmD0nA*#T};-F}L zhq+{95pk(R?R#wek;Oqa{>0)K+DP?5eu%sPxZewdVRcaFybL$q5T3ZKOii- z_D;(4wy!u`i7Zy)$dieE>=5f`__~N+1?ZedQ^0)mOE7qwBe^5d-V8&Ne!jJ}}DARJwxg6OA)wu{tp zsXv|K|3&DlWf2+{;|Zc#149-F?YR*&RQ4EhrfO0UE)aykH&HdKzXnC5;|vw&sJO5z z+Pnm1XnYA@PU!vK^ecDi+rVphrI%4_1-cug8zK|8pfc1AokZEW>*B($r!#R%#A5p4Y+A(mq$d$^G3iMjE~STE+#9T)NrLrf z4$nn^H*_9y?w{R{+~2sr@;-L;x6lw5>UHeLuwJh|6BkB#iFIpyqMjr*bBU2iG;lmO zwI3m&88^lGVVhl)IAeM^6CHdPigWf0fvDv=c$MZR_BtZYB@%0s$qQd2kUZ>mYz|eM zN!kRy*VUUcYP9Tz(Da2a&NuzXAO>mk9Ghf}(bLhb*W)Z%k@AOqDYY3);oEIRFY#CD g71GdRo^&d1!7I2GcgD?nCAaWw;c#KH;%5K#ZwOcN%>V!Z literal 0 HcmV?d00001 diff --git a/tests/test_utils.py b/tests/test_utils.py new file mode 100644 index 0000000..30efb6b --- /dev/null +++ b/tests/test_utils.py @@ -0,0 +1,69 @@ +import sys +sys.path.append("..") + +import pandas as pd +import pytest +from utils.cleanup import Preprocessor +from utils.data_fetch import PersonOfInterest, LoadEnronData + +@pytest.fixture +def example(): + return """ + \n\n> -----Original Message----- + \n> From: \tHara, Kathy + \n> Sent:\tMonday, April 09, 2001 11:53 + \n> To:\tMark Hackney (E-mail) + \n> Cc:\tAllred, Penny; Cimino, Tony; Fewel, George; Holland, Kevin; Johnson, + \n> Rob; Pearson, Tom; Rozelle, Dana; Begalman, Buppha; Downing, Staci; + \n> \'Heather Bare\'; Locke, Kathy + \n> Subject:\tBCHA Automatic Denial/Approval + \n>\n> Mark + \n>\n> We have been told by one of our Transmission Provider\'s that they do not + \n> need to give us an OASIS number until half-past. If we wait until + \n> half-past to receive a valid oasis number, we cannot avoid launching late + \n> tags. I think that this places too much pressure on the merchant. + \n>\n> We are also encountering problems with BC Hydro\'s automatic + \n> approval/denial software. What happens if a VALID tag is denied in the + \n> "No Tag, No Flow" period, the control are cannot withdraw the denial, and + \n> it is too late to launch another tag? Which entity takes responsibility + \n> for inadvertents and schedule cuts? + \n>\n> I would like to get some of the timing issues resolved prior to + \n> implementing "No Tag, No Flow." The problems seem to be isolated, but it + \n> only takes a single entity to create huge problems for everyone involved. + \n>\n>\n> Thanks, + \n> Kathy Hara + """ + +def test_preprocesor(example): + preprocess = Preprocessor() + result = preprocess(example) + + #remove new lines + assert '\n' not in result + + #remove specific patterns + assert '-+Original Message-+' not in result + assert 'From:' not in result + assert 'Sent:' not in result + assert 'To:' not in result + assert 'Cc:' not in result + assert 'Subject:' not in result + + #remove multiple whitespace + assert ' ' not in result + +def test_person_of_interest(): + poi = PersonOfInterest() + assert type(poi.return_person_of_interest()) == dict + assert type(poi.return_person_of_interest()['names']) == list + assert type(poi.return_person_of_interest()['emails']) == list + + assert poi.check_person_of_interest_name('Lay, Kenneth') == True + assert poi.check_person_of_interest_email('kenneth_lay@enron.net') == True + +# def test_load_enron_data(): +# data_loader = LoadEnronData() +# assert type(data_loader()) == pd.DataFrame + +if __name__ == "__main__": + pytest.main() diff --git a/utils/cleanup.py b/utils/cleanup.py index 2c3f505..9b86167 100644 --- a/utils/cleanup.py +++ b/utils/cleanup.py @@ -1,18 +1,83 @@ import re +from typing import Any -def remove_new_lines( - text: str, -): - """Remove new lines from text - Args: - text (str): text to remove new lines from +class Preprocessor: + def __call__( + self, + text: str, + ) -> str: + """Preprocess text + + Args: + text (str): text to preprocess + + Returns: + text (str): preprocessed text + """ + text = self.remove_new_lines(text) + text = self.remove_specific_patterns(text) + text = self.remove_multiple_whitespace(text) + return text + + def remove_new_lines( + self, + text: str, + ) -> str: + """Remove new lines from text + + Args: + text (str): text to remove new lines from + + Returns: + text (str): text with new lines removed + """ + text = re.sub(r'\r\n', ' ', text) + text = re.sub(r'\n', ' ', text) + text = re.sub(r'\r', ' ', text) + return text + + def remove_specific_patterns( + self, + text: str, + ) -> str: + """Remove specific patterns from text + + Args: + text (str): text to remove patterns from + + Returns: + text (str): text with patterns removed + """ + message_type = [ + r'-+Original Message-+' + ] + + header_type = [ + r'From:.+?(?=Sent:)', + r'Sent:.+?(?=To:)', + r'To:.+?(?=Cc:)', + r'Cc:.+?(?=Subject:)', + r'Subject:.+?(\n|$)' + ] + + for pattern in message_type + header_type: + text = re.sub(pattern, ' ', text, flags = re.DOTALL | re.IGNORECASE) + + return text - Returns: - text (str): text with new lines removed - """ - text = re.sub(r'\r\n', ' ', text) - text = re.sub(r'\n', ' ', text) - text = re.sub(r'\r', ' ', text) - return text - \ No newline at end of file + def remove_multiple_whitespace( + self, + text: str, + ) -> str: + """Remove multiple whitespace from text + + Args: + text (str): text to remove multiple whitespace from + + Returns: + text (str): text with multiple whitespace removed + """ + text = re.sub(r'\s+', ' ', text) + return text + \ No newline at end of file diff --git a/utils/data_fetch.py b/utils/data_fetch.py index 73cb6a0..6a1f8c9 100644 --- a/utils/data_fetch.py +++ b/utils/data_fetch.py @@ -1,3 +1,4 @@ +from typing import List, Optional import os import pandas as pd import glob @@ -18,8 +19,8 @@ class PersonOfInterest: def __init__( self, - name_list: list[str] | None = None, - email_list: list[str] | None = None, + name_list: Optional[List[str]] = None, + email_list: Optional[List[str]] = None, ): """Class to operate with the person of interest data from config.ini file """ @@ -63,7 +64,7 @@ def return_person_of_interest( class LoadEnronData: def __call__( self, - datapath: str | None = None, + datapath: Optional[str] = None, ): """Load the Enron email data