Added basic test for utils and updated preprocessor (#11)

* Added basic tests and updated the preprocessor
* Updated pipeline.yml
* Updated pipeline.yml to use Python 3.9
* Updated data_fetch.py to fix an issue with PersonOfInterest (poi)
* Updated data_fetch.py to fix an issue with LoadEnronData
advaithsrao · Oct 18, 2023 · d662b9d · d662b9d
1 parent 6bfff8c
commit d662b9d
Show file tree

Hide file tree

Showing 7 changed files with 159 additions and 19 deletions.
diff --git a/.github/workflows/pipeline.yml b/.github/workflows/pipeline.yml
@@ -1,4 +1,4 @@
-name: CICD
+name: Fraud Detector CI/CD
 
 on:
   push:
@@ -19,7 +19,7 @@ jobs:
     - name: Set up Python
       uses: actions/setup-python@v2
       with:
-        python-version: 3.9  # Change to your desired Python version
+        python-version: 3.9
 
     - name: Install Poetry
       run: |
@@ -32,3 +32,8 @@ jobs:
       run: |
         poetry install
       working-directory: ${{ github.workspace }}
+
+    - name: Run tests
+      run: |
+        poetry run pytest -v tests/
+      working-directory: ${{ github.workspace }}
diff --git a/tests/__init__.py b/tests/__init__.py
diff --git a/tests/__pycache__/__init__.cpython-310.pyc b/tests/__pycache__/__init__.cpython-310.pyc
diff --git a/tests/__pycache__/test_utils.cpython-310-pytest-6.2.5.pyc b/tests/__pycache__/test_utils.cpython-310-pytest-6.2.5.pyc
diff --git a/tests/test_utils.py b/tests/test_utils.py
@@ -0,0 +1,69 @@
+import sys
+sys.path.append("..")
+
+import pandas as pd
+import pytest
+from utils.cleanup import Preprocessor
+from utils.data_fetch import PersonOfInterest, LoadEnronData
+
+@pytest.fixture
+def example():
+    return """
+    \n\n>  -----Original Message-----
+    \n> From: \tHara, Kathy
+    \n> Sent:\tMonday, April 09, 2001 11:53
+    \n> To:\tMark Hackney (E-mail)
+    \n> Cc:\tAllred, Penny; Cimino, Tony; Fewel, George; Holland, Kevin; Johnson,
+    \n> Rob; Pearson, Tom; Rozelle, Dana; Begalman, Buppha; Downing, Staci;
+    \n> \'Heather Bare\'; Locke, Kathy
+    \n> Subject:\tBCHA Automatic Denial/Approval
+    \n>\n> Mark
+    \n>\n> We have been told by one of our Transmission Provider\'s that they do not
+    \n> need to give us an OASIS number until half-past.  If we wait until
+    \n> half-past to receive a valid oasis number, we cannot avoid launching late
+    \n> tags.  I think that this places too much pressure on the merchant.
+    \n>\n> We are also encountering problems with BC Hydro\'s automatic
+    \n> approval/denial software.  What happens if a VALID tag is denied in the
+    \n> "No Tag, No Flow" period, the control are cannot withdraw the denial, and
+    \n> it is too late to launch another tag?  Which entity takes responsibility
+    \n> for inadvertents and schedule cuts?
+    \n>\n> I would like to get some of the timing issues resolved prior to
+    \n> implementing "No Tag, No Flow."  The problems seem to be isolated, but it
+    \n> only takes a single entity to create huge problems for everyone involved.
+    \n>\n>\n> Thanks,
+    \n> Kathy Hara
+    """
+
+def test_preprocesor(example):
+    preprocess = Preprocessor()
+    result = preprocess(example)
+
+    #remove new lines
+    assert '\n' not in result
+
+    #remove specific patterns
+    assert '-+Original Message-+' not in result
+    assert 'From:' not in result
+    assert 'Sent:' not in result
+    assert 'To:' not in result
+    assert 'Cc:' not in result
+    assert 'Subject:' not in result
+
+    #remove multiple whitespace
+    assert '  ' not in result
+
+def test_person_of_interest():
+    poi = PersonOfInterest()
+    assert type(poi.return_person_of_interest()) == dict
+    assert type(poi.return_person_of_interest()['names']) == list
+    assert type(poi.return_person_of_interest()['emails']) == list
+
+    assert poi.check_person_of_interest_name('Lay, Kenneth') == True
+    assert poi.check_person_of_interest_email('[email protected]') == True
+
+# def test_load_enron_data():
+#     data_loader = LoadEnronData()
+#     assert type(data_loader()) == pd.DataFrame
+
+if __name__ == "__main__":
+    pytest.main()
diff --git a/utils/cleanup.py b/utils/cleanup.py
@@ -1,18 +1,83 @@
 import re
+from typing import Any
 
-def remove_new_lines(
-    text: str,
-):
-    """Remove new lines from text
 
-    Args:
-        text (str): text to remove new lines from
+class Preprocessor:
+    def __call__(
+        self, 
+        text: str,
+    ) -> str:
+        """Preprocess text
+
+        Args:
+            text (str): text to preprocess
+        
+        Returns:
+            text (str): preprocessed text
+        """
+        text = self.remove_new_lines(text)
+        text = self.remove_specific_patterns(text)
+        text = self.remove_multiple_whitespace(text)
+        return text
+
+    def remove_new_lines(
+        self,
+        text: str,
+    ) -> str:
+        """Remove new lines from text
+
+        Args:
+            text (str): text to remove new lines from
+        
+        Returns:
+            text (str): text with new lines removed
+        """
+        text = re.sub(r'\r\n', ' ', text)
+        text = re.sub(r'\n', ' ', text)
+        text = re.sub(r'\r', ' ', text)
+        return text
+
+    def remove_specific_patterns(
+        self,
+        text: str,
+    ) -> str:
+        """Remove specific patterns from text
+
+        Args:
+            text (str): text to remove patterns from
+        
+        Returns:
+            text (str): text with patterns removed
+        """
+        message_type = [
+            r'-+Original Message-+'
+        ]
+
+        header_type = [
+            r'From:.+?(?=Sent:)',
+            r'Sent:.+?(?=To:)',
+            r'To:.+?(?=Cc:)',
+            r'Cc:.+?(?=Subject:)',
+            r'Subject:.+?(\n|$)'
+        ]
+
+        for pattern in message_type + header_type:
+            text = re.sub(pattern, ' ', text, flags = re.DOTALL | re.IGNORECASE)
+
+        return text
 
-    Returns:
-        text (str): text with new lines removed
-    """
-    text = re.sub(r'\r\n', ' ', text)
-    text = re.sub(r'\n', ' ', text)
-    text = re.sub(r'\r', ' ', text)
-    return text
-
+    def remove_multiple_whitespace(
+        self,
+        text: str,
+    ) -> str:
+        """Remove multiple whitespace from text
+
+        Args:
+            text (str): text to remove multiple whitespace from
+        
+        Returns:
+            text (str): text with multiple whitespace removed
+        """
+        text = re.sub(r'\s+', ' ', text)
+        return text
+
diff --git a/utils/data_fetch.py b/utils/data_fetch.py
@@ -1,3 +1,4 @@
+from typing import List, Optional
 import os
 import pandas as pd
 import glob
@@ -18,8 +19,8 @@
 class PersonOfInterest:
     def __init__(
         self,
-        name_list: list[str] | None = None,
-        email_list: list[str] | None = None,
+        name_list: Optional[List[str]] = None,
+        email_list: Optional[List[str]] = None,
     ):
         """Class to operate with the person of interest data from config.ini file
         """
@@ -63,7 +64,7 @@ def return_person_of_interest(
 class LoadEnronData:
     def __call__(
         self,
-        datapath: str | None = None,
+        datapath: Optional[str] = None,
     ):
         """Load the Enron email data