Skip to content

Commit

Permalink
Added basic test for utils and updated preprocessor (#11)
Browse files Browse the repository at this point in the history
* Added basic tests and updated preprocessor

* Updated pipeline.yml

* Updated pipeline.yml with v3.9 python

* Updated data_fetch.py for issue fix with poi

* Updated data_fetch.py for issue fix with LoadEnronData
  • Loading branch information
advaithsrao authored Oct 18, 2023
1 parent 6bfff8c commit d662b9d
Show file tree
Hide file tree
Showing 7 changed files with 159 additions and 19 deletions.
9 changes: 7 additions & 2 deletions .github/workflows/pipeline.yml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
name: CICD
name: Fraud Detector CI/CD

on:
push:
Expand All @@ -19,7 +19,7 @@ jobs:
- name: Set up Python
uses: actions/setup-python@v2
with:
python-version: 3.9 # Change to your desired Python version
python-version: 3.9

- name: Install Poetry
run: |
Expand All @@ -32,3 +32,8 @@ jobs:
run: |
poetry install
working-directory: ${{ github.workspace }}

- name: Run tests
run: |
poetry run pytest -v tests/
working-directory: ${{ github.workspace }}
Empty file added tests/__init__.py
Empty file.
Binary file added tests/__pycache__/__init__.cpython-310.pyc
Binary file not shown.
Binary file not shown.
69 changes: 69 additions & 0 deletions tests/test_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
import sys
sys.path.append("..")

import pandas as pd
import pytest
from utils.cleanup import Preprocessor
from utils.data_fetch import PersonOfInterest, LoadEnronData

@pytest.fixture
def example():
    """Provide a sample quoted e-mail body for the preprocessing tests.

    The text carries an "Original Message" marker, the From/Sent/To/Cc/Subject
    header fields, '>' quote prefixes, tabs and many line breaks.

    Returns:
        str: the raw e-mail text
    """
    sample_email = """
\n\n> -----Original Message-----
\n> From: \tHara, Kathy
\n> Sent:\tMonday, April 09, 2001 11:53
\n> To:\tMark Hackney (E-mail)
\n> Cc:\tAllred, Penny; Cimino, Tony; Fewel, George; Holland, Kevin; Johnson,
\n> Rob; Pearson, Tom; Rozelle, Dana; Begalman, Buppha; Downing, Staci;
\n> \'Heather Bare\'; Locke, Kathy
\n> Subject:\tBCHA Automatic Denial/Approval
\n>\n> Mark
\n>\n> We have been told by one of our Transmission Provider\'s that they do not
\n> need to give us an OASIS number until half-past. If we wait until
\n> half-past to receive a valid oasis number, we cannot avoid launching late
\n> tags. I think that this places too much pressure on the merchant.
\n>\n> We are also encountering problems with BC Hydro\'s automatic
\n> approval/denial software. What happens if a VALID tag is denied in the
\n> "No Tag, No Flow" period, the control are cannot withdraw the denial, and
\n> it is too late to launch another tag? Which entity takes responsibility
\n> for inadvertents and schedule cuts?
\n>\n> I would like to get some of the timing issues resolved prior to
\n> implementing "No Tag, No Flow." The problems seem to be isolated, but it
\n> only takes a single entity to create huge problems for everyone involved.
\n>\n>\n> Thanks,
\n> Kathy Hara
"""
    return sample_email

def test_preprocesor(example):
    """Check that Preprocessor strips line breaks, header patterns and whitespace runs.

    Args:
        example (str): raw e-mail text supplied by the ``example`` fixture
    """
    preprocess = Preprocessor()
    result = preprocess(example)

    # line breaks are removed
    assert '\n' not in result

    # The forwarded-message marker is checked by its literal text: the regex
    # source '-+Original Message-+' can never occur verbatim in the output,
    # so asserting on it would always pass.
    assert 'Original Message' not in result

    # header field labels are removed
    for header in ('From:', 'Sent:', 'To:', 'Cc:', 'Subject:'):
        assert header not in result, header

    # whitespace runs are collapsed: no two consecutive spaces may remain
    assert '  ' not in result

def test_person_of_interest():
    """Check the shape of the person-of-interest data and the two lookup helpers."""
    poi = PersonOfInterest()

    # Fetch once and inspect the same object in every structural assertion.
    # NOTE(review): assumes repeated calls return equivalent data - confirm.
    persons = poi.return_person_of_interest()

    assert isinstance(persons, dict)
    assert isinstance(persons['names'], list)
    assert isinstance(persons['emails'], list)

    assert poi.check_person_of_interest_name('Lay, Kenneth')
    # NOTE(review): the address literal below looks like an e-mail-protection
    # placeholder rather than a real address - confirm against the repository.
    assert poi.check_person_of_interest_email('[email protected]')

# def test_load_enron_data():
# data_loader = LoadEnronData()
# assert type(data_loader()) == pd.DataFrame

# Invoke pytest's entry point when this file is executed as a script.
if __name__ == "__main__":
pytest.main()
93 changes: 79 additions & 14 deletions utils/cleanup.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,83 @@
import re
from typing import Any

def remove_new_lines(
text: str,
):
"""Remove new lines from text

Args:
text (str): text to remove new lines from
class Preprocessor:
def __call__(
    self,
    text: str,
) -> str:
    """Preprocess text

    Header patterns are stripped first, while the text still has its line
    breaks. The ``Subject:`` pattern in ``remove_specific_patterns`` ends at
    the first newline and otherwise runs to the end of the string, so
    flattening the text beforehand would discard the whole message body.

    Args:
        text (str): text to preprocess

    Returns:
        text (str): preprocessed text
    """
    text = self.remove_specific_patterns(text)
    text = self.remove_new_lines(text)
    text = self.remove_multiple_whitespace(text)
    return text

def remove_new_lines(
    self,
    text: str,
) -> str:
    """Replace every line break in text with a single space

    Args:
        text (str): text that may contain CRLF, LF or CR line breaks

    Returns:
        text (str): text in which each line break has become one space
    """
    # CRLF is listed first so a Windows line ending yields one space, not two.
    return re.sub(r'\r\n|\r|\n', ' ', text)

def remove_specific_patterns(
    self,
    text: str,
) -> str:
    """Blank out the forwarded-message marker and e-mail header fields

    The patterns are applied one after another, case-insensitively and with
    ``.`` allowed to cross line breaks; every match becomes a single space.
    The From/Sent/To/Cc fields are only matched when the field they look
    ahead to occurs later in the text. The ``Subject:`` match ends at the
    first newline or, when none follows, at the end of the text.

    Args:
        text (str): text to remove patterns from

    Returns:
        text (str): text with patterns removed
    """
    flags = re.DOTALL | re.IGNORECASE
    patterns = (
        # separator line of a forwarded / replied message
        r'-+Original Message-+',
        # header fields, each running up to the start of the next field
        r'From:.+?(?=Sent:)',
        r'Sent:.+?(?=To:)',
        r'To:.+?(?=Cc:)',
        r'Cc:.+?(?=Subject:)',
        r'Subject:.+?(\n|$)',
    )

    cleaned = text
    for regex in patterns:
        cleaned = re.sub(regex, ' ', cleaned, flags=flags)
    return cleaned

Returns:
text (str): text with new lines removed
"""
text = re.sub(r'\r\n', ' ', text)
text = re.sub(r'\n', ' ', text)
text = re.sub(r'\r', ' ', text)
return text

def remove_multiple_whitespace(
    self,
    text: str,
) -> str:
    """Collapse each run of whitespace in text into a single space

    Any whitespace character counts (spaces, tabs, line breaks), and a run of
    length one is also replaced, so a lone tab becomes a space. Leading and
    trailing whitespace is reduced to one space, not stripped.

    Args:
        text (str): text to normalise

    Returns:
        text (str): text with whitespace runs collapsed
    """
    whitespace_run = re.compile(r'\s+')
    return whitespace_run.sub(' ', text)

7 changes: 4 additions & 3 deletions utils/data_fetch.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from typing import List, Optional
import os
import pandas as pd
import glob
Expand All @@ -18,8 +19,8 @@
class PersonOfInterest:
def __init__(
self,
name_list: list[str] | None = None,
email_list: list[str] | None = None,
name_list: Optional[List[str]] = None,
email_list: Optional[List[str]] = None,
):
"""Class to operate with the person of interest data from config.ini file
"""
Expand Down Expand Up @@ -63,7 +64,7 @@ def return_person_of_interest(
class LoadEnronData:
def __call__(
self,
datapath: str | None = None,
datapath: Optional[str] = None,
):
"""Load the Enron email data
Expand Down

0 comments on commit d662b9d

Please sign in to comment.