diff --git a/.github/workflows/pipeline.yml b/.github/workflows/pipeline.yml index 180900a..bc5155f 100644 --- a/.github/workflows/pipeline.yml +++ b/.github/workflows/pipeline.yml @@ -1,4 +1,4 @@ -name: CICD +name: Fraud Detector CI/CD on: push: @@ -19,7 +19,7 @@ jobs: - name: Set up Python uses: actions/setup-python@v2 with: - python-version: 3.9 # Change to your desired Python version + python-version: 3.9 - name: Install Poetry run: | @@ -32,3 +32,8 @@ jobs: run: | poetry install working-directory: ${{ github.workspace }} + + - name: Run tests + run: | + poetry run pytest -v tests/ + working-directory: ${{ github.workspace }} diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/__pycache__/__init__.cpython-310.pyc b/tests/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000..35d2e88 Binary files /dev/null and b/tests/__pycache__/__init__.cpython-310.pyc differ diff --git a/tests/__pycache__/test_utils.cpython-310-pytest-6.2.5.pyc b/tests/__pycache__/test_utils.cpython-310-pytest-6.2.5.pyc new file mode 100644 index 0000000..7127d17 Binary files /dev/null and b/tests/__pycache__/test_utils.cpython-310-pytest-6.2.5.pyc differ diff --git a/tests/test_utils.py b/tests/test_utils.py new file mode 100644 index 0000000..30efb6b --- /dev/null +++ b/tests/test_utils.py @@ -0,0 +1,69 @@ +import sys +sys.path.append("..") + +import pandas as pd +import pytest +from utils.cleanup import Preprocessor +from utils.data_fetch import PersonOfInterest, LoadEnronData + +@pytest.fixture +def example(): + return """ + \n\n> -----Original Message----- + \n> From: \tHara, Kathy + \n> Sent:\tMonday, April 09, 2001 11:53 + \n> To:\tMark Hackney (E-mail) + \n> Cc:\tAllred, Penny; Cimino, Tony; Fewel, George; Holland, Kevin; Johnson, + \n> Rob; Pearson, Tom; Rozelle, Dana; Begalman, Buppha; Downing, Staci; + \n> \'Heather Bare\'; Locke, Kathy + \n> Subject:\tBCHA Automatic Denial/Approval + \n>\n> Mark + 
\n>\n> We have been told by one of our Transmission Provider\'s that they do not + \n> need to give us an OASIS number until half-past. If we wait until + \n> half-past to receive a valid oasis number, we cannot avoid launching late + \n> tags. I think that this places too much pressure on the merchant. + \n>\n> We are also encountering problems with BC Hydro\'s automatic + \n> approval/denial software. What happens if a VALID tag is denied in the + \n> "No Tag, No Flow" period, the control are cannot withdraw the denial, and + \n> it is too late to launch another tag? Which entity takes responsibility + \n> for inadvertents and schedule cuts? + \n>\n> I would like to get some of the timing issues resolved prior to + \n> implementing "No Tag, No Flow." The problems seem to be isolated, but it + \n> only takes a single entity to create huge problems for everyone involved. + \n>\n>\n> Thanks, + \n> Kathy Hara + """ + +def test_preprocesor(example): + preprocess = Preprocessor() + result = preprocess(example) + + #remove new lines + assert '\n' not in result + + #remove specific patterns + assert '-+Original Message-+' not in result + assert 'From:' not in result + assert 'Sent:' not in result + assert 'To:' not in result + assert 'Cc:' not in result + assert 'Subject:' not in result + + #remove multiple whitespace + assert ' ' not in result + +def test_person_of_interest(): + poi = PersonOfInterest() + assert type(poi.return_person_of_interest()) == dict + assert type(poi.return_person_of_interest()['names']) == list + assert type(poi.return_person_of_interest()['emails']) == list + + assert poi.check_person_of_interest_name('Lay, Kenneth') == True + assert poi.check_person_of_interest_email('kenneth_lay@enron.net') == True + +# def test_load_enron_data(): +# data_loader = LoadEnronData() +# assert type(data_loader()) == pd.DataFrame + +if __name__ == "__main__": + pytest.main() diff --git a/utils/cleanup.py b/utils/cleanup.py index 2c3f505..9b86167 100644 --- 
import re
from typing import Any


class Preprocessor:
    """Callable cleaner that strips quoted-mail headers and flattens whitespace.

    Calling an instance runs three steps in order: header/banner removal,
    newline removal, and whitespace collapsing. Each step is also available
    as a separate method.
    """

    # Banner that precedes a quoted message, e.g. "-----Original Message-----".
    _MESSAGE_PATTERNS = (
        r'-+Original Message-+',
    )

    # Each header is removed up to (but not including) the next header name;
    # "Subject:" has no following header, so it is removed up to the end of
    # its line (or the end of the text when no newline follows).
    _HEADER_PATTERNS = (
        r'From:.+?(?=Sent:)',
        r'Sent:.+?(?=To:)',
        r'To:.+?(?=Cc:)',
        r'Cc:.+?(?=Subject:)',
        r'Subject:.+?(\n|$)',
    )

    # DOTALL lets a header value span wrapped lines; IGNORECASE matches any
    # capitalisation of the header names.
    _PATTERN_FLAGS = re.DOTALL | re.IGNORECASE

    def __call__(
        self,
        text: str,
    ) -> str:
        """Preprocess text

        Args:
            text (str): text to preprocess

        Returns:
            text (str): preprocessed text
        """
        # Headers must be stripped while line breaks are still present: the
        # "Subject:" pattern stops at the first newline, and on text that has
        # already been flattened it would run to the end of the string and
        # delete the whole message body.
        text = self.remove_specific_patterns(text)
        text = self.remove_new_lines(text)
        text = self.remove_multiple_whitespace(text)
        return text

    def remove_new_lines(
        self,
        text: str,
    ) -> str:
        """Replace every line break with a single space

        Args:
            text (str): text to remove new lines from

        Returns:
            text (str): text with new lines removed
        """
        # "\r\n" is listed first so a Windows line ending becomes one space,
        # not two.
        return re.sub(r'\r\n|\n|\r', ' ', text)

    def remove_specific_patterns(
        self,
        text: str,
    ) -> str:
        """Replace the quoted-message banner and header fields with a space

        The text should still contain its line breaks when this is called;
        without them the "Subject:" pattern extends to the end of the text.

        Args:
            text (str): text to remove patterns from

        Returns:
            text (str): text with patterns removed
        """
        for pattern in self._MESSAGE_PATTERNS + self._HEADER_PATTERNS:
            text = re.sub(pattern, ' ', text, flags=self._PATTERN_FLAGS)

        return text

    def remove_multiple_whitespace(
        self,
        text: str,
    ) -> str:
        """Collapse every run of whitespace into a single space

        Leading and trailing whitespace is collapsed too, not stripped.

        Args:
            text (str): text to remove multiple whitespace from

        Returns:
            text (str): text with multiple whitespace removed
        """
        return re.sub(r'\s+', ' ', text)
index 73cb6a0..6a1f8c9 100644 --- a/utils/data_fetch.py +++ b/utils/data_fetch.py @@ -1,3 +1,4 @@ +from typing import List, Optional import os import pandas as pd import glob @@ -18,8 +19,8 @@ class PersonOfInterest: def __init__( self, - name_list: list[str] | None = None, - email_list: list[str] | None = None, + name_list: Optional[List[str]] = None, + email_list: Optional[List[str]] = None, ): """Class to operate with the person of interest data from config.ini file """ @@ -63,7 +64,7 @@ def return_person_of_interest( class LoadEnronData: def __call__( self, - datapath: str | None = None, + datapath: Optional[str] = None, ): """Load the Enron email data