-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Added basic test for utils and updated preprocessor (#11)
* Added basic tests and updated preprocessor * Updated pipeline.yml * Updated pipeline.yml with v3.9 python * Updated data_fetch.py for issue fix with poi * Updated data_fetch.py for issue fix with LoadEnronData
- Loading branch information
1 parent
6bfff8c
commit d662b9d
Showing
7 changed files
with
159 additions
and
19 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Empty file.
Binary file not shown.
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,69 @@ | ||
import sys | ||
sys.path.append("..") | ||
|
||
import pandas as pd | ||
import pytest | ||
from utils.cleanup import Preprocessor | ||
from utils.data_fetch import PersonOfInterest, LoadEnronData | ||
|
||
@pytest.fixture | ||
def example(): | ||
# Sample input for the Preprocessor test below: a quoted
# "-----Original Message-----" block with From/Sent/To/Cc/Subject header
# lines followed by the message body. The text mixes explicit "\n" escapes
# with the literal line breaks of the triple-quoted string.
return """ | ||
\n\n> -----Original Message----- | ||
\n> From: \tHara, Kathy | ||
\n> Sent:\tMonday, April 09, 2001 11:53 | ||
\n> To:\tMark Hackney (E-mail) | ||
\n> Cc:\tAllred, Penny; Cimino, Tony; Fewel, George; Holland, Kevin; Johnson, | ||
\n> Rob; Pearson, Tom; Rozelle, Dana; Begalman, Buppha; Downing, Staci; | ||
\n> \'Heather Bare\'; Locke, Kathy | ||
\n> Subject:\tBCHA Automatic Denial/Approval | ||
\n>\n> Mark | ||
\n>\n> We have been told by one of our Transmission Provider\'s that they do not | ||
\n> need to give us an OASIS number until half-past. If we wait until | ||
\n> half-past to receive a valid oasis number, we cannot avoid launching late | ||
\n> tags. I think that this places too much pressure on the merchant. | ||
\n>\n> We are also encountering problems with BC Hydro\'s automatic | ||
\n> approval/denial software. What happens if a VALID tag is denied in the | ||
\n> "No Tag, No Flow" period, the control are cannot withdraw the denial, and | ||
\n> it is too late to launch another tag? Which entity takes responsibility | ||
\n> for inadvertents and schedule cuts? | ||
\n>\n> I would like to get some of the timing issues resolved prior to | ||
\n> implementing "No Tag, No Flow." The problems seem to be isolated, but it | ||
\n> only takes a single entity to create huge problems for everyone involved. | ||
\n>\n>\n> Thanks, | ||
\n> Kathy Hara | ||
""" | ||
|
||
def test_preprocesor(example):
    """Check that Preprocessor strips line breaks, headers and whitespace runs.

    Args:
        example (str): sample quoted e-mail supplied by the ``example`` fixture
    """
    preprocess = Preprocessor()
    result = preprocess(example)

    # remove new lines
    assert '\n' not in result

    # remove specific patterns
    # The separator is removed via a regular expression; assert on the literal
    # marker text, because the regex source ('-+Original Message-+') can never
    # occur in the output and would make this check vacuous.
    assert 'Original Message' not in result
    for header in ('From:', 'Sent:', 'To:', 'Cc:', 'Subject:'):
        assert header not in result

    # remove multiple whitespace
    # Two consecutive spaces is the only form a whitespace run can still take
    # once line breaks have been ruled out above.
    assert ' ' * 2 not in result
|
||
def test_person_of_interest():
    """Check the shape of the person-of-interest data and the lookup helpers."""
    poi = PersonOfInterest()

    # Fetch once and inspect the same object for all structural checks.
    persons = poi.return_person_of_interest()
    assert isinstance(persons, dict)
    assert isinstance(persons['names'], list)
    assert isinstance(persons['emails'], list)

    assert poi.check_person_of_interest_name('Lay, Kenneth')
    # NOTE(review): the address below looks like e-mail-obfuscation residue
    # rather than a real address -- confirm and restore the intended value.
    assert poi.check_person_of_interest_email('[email protected]')
|
||
# def test_load_enron_data(): | ||
# data_loader = LoadEnronData() | ||
# assert type(data_loader()) == pd.DataFrame | ||
|
||
if __name__ == "__main__":
    # pytest.main() returns the session's exit code; hand it to the
    # interpreter so that running this file directly reports failures
    # through the process exit status instead of always exiting with 0.
    raise SystemExit(pytest.main())
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,18 +1,83 @@ | ||
import re | ||
from typing import Any | ||
|
||
def remove_new_lines(
    text: str,
):
    """Remove new lines from text

    Module-level variant of ``Preprocessor.remove_new_lines``.

    Args:
        text (str): text to remove new lines from

    Returns:
        text (str): text with new lines removed
    """
    # "\r\n" is listed first so a Windows line ending becomes one space.
    return re.sub(r'\r\n|\r|\n', ' ', text)


class Preprocessor:
    """Callable cleaner for raw e-mail text.

    Calling an instance removes forwarded-message separators and header
    fields, replaces line breaks with spaces and collapses whitespace runs.
    """

    # "-----Original Message-----" style separators (any number of dashes).
    _MESSAGE_PATTERNS = (
        r'-+Original Message-+',
    )

    # Header fields. Each of the first four is removed up to (not including)
    # the label of the following field; the subject is removed up to the end
    # of its line.
    _HEADER_PATTERNS = (
        r'From:.+?(?=Sent:)',
        r'Sent:.+?(?=To:)',
        r'To:.+?(?=Cc:)',
        r'Cc:.+?(?=Subject:)',
        r'Subject:.+?(\n|$)',
    )

    def __call__(
        self,
        text: str,
    ) -> str:
        """Preprocess text

        Args:
            text (str): text to preprocess

        Returns:
            text (str): preprocessed text
        """
        # Header stripping has to run while the line breaks are still there:
        # the subject pattern ends at the first "\n". On already-flattened
        # text it would find none and, because "." also matches everything
        # under DOTALL, swallow the whole message body after "Subject:".
        text = self.remove_specific_patterns(text)
        text = self.remove_new_lines(text)
        text = self.remove_multiple_whitespace(text)
        return text

    def remove_new_lines(
        self,
        text: str,
    ) -> str:
        """Remove new lines from text

        Args:
            text (str): text to remove new lines from

        Returns:
            text (str): text with new lines removed
        """
        # "\r\n" is listed first so a Windows line ending becomes one space.
        return re.sub(r'\r\n|\r|\n', ' ', text)

    def remove_specific_patterns(
        self,
        text: str,
    ) -> str:
        """Remove specific patterns from text

        Every match of a separator or header pattern is replaced by a single
        space. Matching is case-insensitive and may span line breaks.

        Args:
            text (str): text to remove patterns from

        Returns:
            text (str): text with patterns removed
        """
        for pattern in self._MESSAGE_PATTERNS + self._HEADER_PATTERNS:
            text = re.sub(pattern, ' ', text, flags=re.DOTALL | re.IGNORECASE)
        return text

    def remove_multiple_whitespace(
        self,
        text: str,
    ) -> str:
        """Remove multiple whitespace from text

        Args:
            text (str): text to remove multiple whitespace from

        Returns:
            text (str): text with multiple whitespace removed
        """
        # Any run of whitespace (spaces, tabs, line breaks) becomes one space.
        return re.sub(r'\s+', ' ', text)
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters