Perform heuristics-based relabeling for our fraud set (#29)
* Perform heuristics-based relabeling for our fraud set - marketing relabeled to non-fraud; signatures and metadata dropped from fraud

* Test fix for mismatch labeler function

* Final dataframe updated after applying the mismatch labeling class

* Last fix for mislabeled data labeler
advaithsrao authored Nov 23, 2023
1 parent e618e40 commit ac12834
Showing 11 changed files with 448 additions and 145 deletions.
8 changes: 4 additions & 4 deletions README.md
@@ -60,16 +60,16 @@ In the early 2000s, Leslie Kaelbling at MIT purchased the dataset and noted that

| Set | Emails |
| --- | --- |
- | Train | 304235 |
- | Sanity | 200000 |
+ | Train | 224543 |
+ | Sanity | 250000 |
| Gold Fraud | 1000 |

**Training Label Split:**

| Label | Emails |
| --- | --- |
- | 0 | 288428 |
- | 1 | 15807 |
+ | 0 | 214080 |
+ | 1 | 10463 |


***
38 changes: 38 additions & 0 deletions config.ini
@@ -26,3 +26,41 @@ names = Lay, Kenneth & Skilling, Jeffrey & Howard, Kevin & Krautz, Michael & Yea

[folders.possible_fraud]
folders = junk & junk_e_mail & junk_mail & insurance_risk & risk & deleted_items

[preprocessor.patterns]
;unicode patterns
unicode = [^\x00-\x7F]+
;specific header and message patterns
message = -+Original Message-+
forward = -+Forwarded by-+
from = From:.+?(?=Sent:)
sent = Sent:.+?(?=To:)
to = To:.+?(?=Cc:)
cc = Cc:.+?(?=Subject:)
subject = Subject:.+?(\n|$)

[labeler.mismatch]
;min & max number of words in a sentence for fraud label
drop_threshold = 4 & 1500
;patterns to drop examples from fraud label
best_regards = Best Regards
sincerely = Sincerely
regards = Regards
your_sincerely = Your Sincerely
yours_sincerely = Yours Sincerely
yours_truly = Yours Truly
yours_faithfully = Yours Faithfully
thanks = Thanks
thank_you = Thank You
message_id = Message-ID:
from = From:
sent = Sent:
to = To:
cc = Cc:
undelivery = Undelivered Mail Returned to Sender
undeliverable = Undeliverable:
missed_reply = re\s
;reply patterns
replies = re\:|Re\:|RE\:|Fw\:|FW\:|Fwd\:|FWD\:|fwd\:
;marketing patterns
marketing = unsubscribe|read our Privacy Policy|update your(?: |)(?:communication|)(?: |)preferences|future(?: |)(?:promotional|)(?: |)(?:e-mail|e-mails|emails|email)|receive(?: |)(?:these notices|)(?: |)in the future|above for more information|mailing list|please click here and you will be removed|your name removed|remove yourself from this list|your (?:email|e-mail) removed|from our (?:email|e-mail) list|To be REMOVED from (?:this|our) list|To view our privacy policy|just let us know by clicking here|All prices and product availability subject to change without notice|(?:opt-out|opt out)|(?:opt in|opt-in|opted in|opted-in) to receive|if you no longer wish to receive|thank you for shopping with us|newsletter
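
For illustration only (not part of the commit): these new values can be read back with Python's standard configparser. The file path and sample string below are assumptions for the sketch.

```python
# Minimal sketch: consume the new [labeler.mismatch] values (assumes config.ini is in the working directory)
import configparser
import re

config = configparser.ConfigParser()
config.read('config.ini')

# "4 & 1500" -> minimum and maximum word counts kept for fraud-labeled emails
min_len, max_len = [int(v.strip()) for v in config.get('labeler.mismatch', 'drop_threshold').split('&')]
print(min_len, max_len)  # 4 1500

# The marketing value is itself a regex alternation, so it can be applied directly
marketing = config.get('labeler.mismatch', 'marketing')
print(bool(re.search(marketing, 'Click here to unsubscribe from our newsletter', flags=re.IGNORECASE)))  # True
```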
4 changes: 2 additions & 2 deletions detector/data_loader.py
@@ -182,8 +182,8 @@ def process_email(

        email_fields = {}

-        folder_user = file.split(self.localpath)[1].split('/')[0]
-        folder_name = file.split(self.localpath)[1].split('/')[1]
+        folder_user = file.split(self.localpath)[1].split('/')[1]
+        folder_name = file.split(self.localpath)[1].split('/')[2]

        email_fields['Folder-User'] = folder_user
        email_fields['Folder-Name'] = folder_name
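
The index shift is easier to see with a concrete path; the values below are invented for illustration (the real localpath comes from the loader's configuration). Splitting on a localpath with no trailing slash leaves an empty first element, so the user and folder names sit at indices 1 and 2.

```python
# Hypothetical example of the path split used above
localpath = '/data/enron/maildir'             # assumed value for illustration
file = '/data/enron/maildir/lay-k/inbox/42.'  # assumed value for illustration

parts = file.split(localpath)[1].split('/')   # ['', 'lay-k', 'inbox', '42.']
folder_user, folder_name = parts[1], parts[2]
print(folder_user, folder_name)               # lay-k inbox
```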
178 changes: 163 additions & 15 deletions detector/labeler.py
@@ -328,21 +328,17 @@ def contains_replies_forwards(
        if data is None:
            data = self.data

+        reply_patterns = self.config.get('labeler.mismatch','replies')
+        pattern = fr'\b(?:{reply_patterns})\b'
+
        data['Contains-Reply-Forwards'] = data['Body'].swifter.apply(
-            lambda x: True \
-                if \
-                    'Re:' in x \
-                    or \
-                    'RE:' in x \
-                    or \
-                    'Fw:' in x \
-                    or \
-                    'FW:' in x \
-                    or \
-                    'Fwd:' in x \
-                    or \
-                    'FWD:' in x \
-                else False
+            lambda x: bool(
+                re.search(
+                    pattern,
+                    x,
+                    flags=re.IGNORECASE
+                )
+            )
        )

        return data
@@ -511,4 +507,156 @@ def get_labels(
axis = 1
)

return data
return data

class MismatchLabeler:
    """Class to relabel the mismatch examples from our dataset
    Args:
        data (pd.DataFrame): DataFrame
        cfg (configparser.ConfigParser): ConfigParser object to read config.ini file
    Returns:
        data (pd.DataFrame): DataFrame containing the relabeled data with labeling updates
    """

    def __init__(
        self,
        data: pd.DataFrame = None,
        cfg: configparser.ConfigParser = None,
    ):

        self.data = data
        self.config = cfg

        if self.data is None:
            raise ValueError('data not provided')

        if self.config is None:
            self.config = config

    def __call__(
        self
    ) -> pd.DataFrame:

        """Call the Pipeline to label the enron data
        Returns:
            pd.DataFrame: DataFrame containing the enron data with labels
        """

        self.data = self.drop_by_length(self.data)
        print(f'\x1b[4mMismatchLabeler\x1b[0m: Dropped examples with body length less than 4 words and more than 600 words')

        self.data = self.drop_by_pattern(self.data)
        print(f'\x1b[4mMismatchLabeler\x1b[0m: Dropped examples with body containing the given pattern')

        self.data = self.relabel_marketing_frauds(self.data)
        print(f'\x1b[4mMismatchLabeler\x1b[0m: Relabeled marketing examples with label 1 to label 0 using marketing keywords')

        return self.data
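
For context, a minimal usage sketch of the class (not part of the commit): the DataFrame contents are invented, and it assumes the repo's config.ini plus dependencies such as pandas and swifter are available.

```python
import configparser
import pandas as pd

from detector.labeler import MismatchLabeler

cfg = configparser.ConfigParser()
cfg.read('config.ini')

# Toy fraud-labeled examples: the first is clearly marketing, the second is not
df = pd.DataFrame({
    'Body': ['Click here to unsubscribe from our mailing list',
             'Please wire the funds to the account below today'],
    'Subject': ['Weekly offers', 'Urgent request'],
    'Label': [1, 1],
})

# Runs drop_by_length, drop_by_pattern, and relabel_marketing_frauds in order
relabeled = MismatchLabeler(data=df, cfg=cfg)()
print(relabeled[['Body', 'Label']])  # the marketing row should end up with Label 0
```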

    def drop_by_length(
        self,
        data: pd.DataFrame = None,
    ) -> pd.DataFrame:
        """Drop the fraud examples with body length less than 4 words and more than 600 words
        Args:
            data (pd.DataFrame, optional): DataFrame containing the enron data. Defaults to None.
        Returns:
            data (pd.DataFrame): DataFrame containing the enron data with examples dropped
        """

        if data is None:
            data = self.data

        drop_threshold = self.config.get('labeler.mismatch','drop_threshold')
        min_length, max_length = convert_string_to_list(drop_threshold, sep = '&')
        min_length, max_length = int(min_length), int(max_length)

        data = data[~((data['Label'] == 1) & (data['Body'].str.split().str.len() < min_length))]
        data = data[~((data['Label'] == 1) & (data['Body'].str.split().str.len() > max_length))]

        return data

    def drop_by_pattern(
        self,
        data: pd.DataFrame = None,
    ) -> pd.DataFrame:
        """Drop the fraud examples with body containing the given pattern
        Args:
            data (pd.DataFrame, optional): DataFrame containing the enron data. Defaults to None.
        Returns:
            data (pd.DataFrame): DataFrame containing the enron data with examples dropped
        """

        if data is None:
            data = self.data

        patterns = [
            r'' + config.get('labeler.mismatch', 'best_regards'),
            r'' + config.get('labeler.mismatch', 'sincerely'),
            r'' + config.get('labeler.mismatch', 'regards'),
            r'' + config.get('labeler.mismatch', 'your_sincerely'),
            r'' + config.get('labeler.mismatch', 'yours_sincerely'),
            r'' + config.get('labeler.mismatch', 'yours_truly'),
            r'' + config.get('labeler.mismatch', 'yours_faithfully'),
            r'' + config.get('labeler.mismatch', 'thanks'),
            r'' + config.get('labeler.mismatch', 'thank_you'),
            r'' + config.get('labeler.mismatch', 'message_id'),
            r'' + config.get('labeler.mismatch', 'from'),
            r'' + config.get('labeler.mismatch', 'sent'),
            r'' + config.get('labeler.mismatch', 'to'),
            r'' + config.get('labeler.mismatch', 'cc'),
            r'' + config.get('labeler.mismatch', 'undelivery'),
            r'' + config.get('labeler.mismatch', 'undeliverable'),
            r'' + config.get('labeler.mismatch', 'missed_reply')
        ]

        # Create a temporary column without Subject
        data['Temp_Body'] = data.swifter.apply(lambda row: row['Body'].replace(row['Subject'], '') if pd.notna(row['Subject']) else row['Body'], axis=1)

        combined_pattern = '|'.join(f'(?:^|^\s|^>|^ >)(?: |){pattern}' for pattern in patterns)

        # Filter out rows where Label is 1 and any pattern matches
        data = data[~((data['Label'] == 1) & data['Temp_Body'].str.contains(combined_pattern, case=False, regex=True))]

        # Drop the temporary column
        data = data.drop(columns=['Temp_Body'])

        return data
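
An illustrative check of what the anchored prefix in combined_pattern does (the sample strings are invented): because pandas' str.contains uses re.search without re.MULTILINE, the ^ alternatives only match at the very start of the subject-stripped body, so a fraud-labeled email is dropped only when it effectively begins with one of the signature, header, or bounce patterns.

```python
import re

# One of the configured patterns, wrapped the same way drop_by_pattern wraps it
anchored = r'(?:^|^\s|^>|^ >)(?: |)Thanks'

print(bool(re.search(anchored, 'Thanks for the update', flags=re.IGNORECASE)))          # True: pattern at string start
print(bool(re.search(anchored, 'We will send our thanks later', flags=re.IGNORECASE)))  # False: without re.MULTILINE, ^ is string start only
```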

    def relabel_marketing_frauds(
        self,
        data: pd.DataFrame = None,
    ) -> pd.DataFrame:
        """Relabel the marketing examples with label 1 to label 0 using marketing keywords
        Args:
            data (pd.DataFrame, optional): DataFrame containing the enron data. Defaults to None.
        Returns:
            data (pd.DataFrame): DataFrame containing the enron data with new column 'Label'
                -> Label of the email
        """

        if data is None:
            data = self.data

        marketing_keywords = self.config.get('labeler.mismatch','marketing')

        data.loc[
            (data['Label'] == 1) & \
            data['Body'].str.contains(
                marketing_keywords,
                case=False, regex=True
            ),
            'Label'
        ] = 0

        return data

58 changes: 46 additions & 12 deletions detector/preprocessor.py
@@ -1,14 +1,39 @@
import sys
sys.path.append("..")

import os
import re
import html2text
from typing import Any
import numpy as np

from utils.util_preprocessor import add_subject_to_body

#read config.ini file
import configparser
config = configparser.ConfigParser()
config.read(
    os.path.join(
        os.path.dirname(os.path.abspath(__file__)),
        '../config.ini'
    )
)

class Preprocessor:
    def __init__(
        self,
        cfg: configparser.ConfigParser = None,
    ) -> None:
        """Preprocessor class
        Args:
            cfg (configparser.ConfigParser, optional): ConfigParser object. Defaults to None.
        """

        self.config = cfg

        if self.config is None:
            self.config = config

    def __call__(
        self,
        text: str,
@@ -115,7 +140,9 @@ def remove_unicode_characters(
text (str): text with unicode characters removed
"""

-        return re.sub(r'[^\x00-\x7F]+', ' ', text)
+        unicode_pattern = r'' + config.get('preprocessor.patterns', 'unicode')
+
+        return re.sub(unicode_pattern, ' ', text)

def remove_specific_patterns(
self,
@@ -130,19 +157,26 @@ def remove_specific_patterns(
text (str): text with patterns removed
"""

-        message_type = [
-            r'-+Original Message-+'
-        ]
-
-        header_type = [
-            r'From:.+?(?=Sent:)',
-            r'Sent:.+?(?=To:)',
-            r'To:.+?(?=Cc:)',
-            r'Cc:.+?(?=Subject:)',
-            r'Subject:.+?(\n|$)'
+        # Extract patterns from the [preprocessor.patterns] section
+        message_pattern = r'' + config.get('preprocessor.patterns', 'message')
+        forward_pattern = r'' + config.get('preprocessor.patterns', 'forward')
+        from_pattern = r'' + config.get('preprocessor.patterns', 'from')
+        sent_pattern = r'' + config.get('preprocessor.patterns', 'sent')
+        to_pattern = r'' + config.get('preprocessor.patterns', 'to')
+        cc_pattern = r'' + config.get('preprocessor.patterns', 'cc')
+        subject_pattern = r'' + config.get('preprocessor.patterns', 'subject')
+
+        patterns = [
+            message_pattern,
+            forward_pattern,
+            from_pattern,
+            sent_pattern,
+            to_pattern,
+            cc_pattern,
+            subject_pattern
        ]

-        for pattern in message_type + header_type:
+        for pattern in patterns:
            text = re.sub(pattern, ' ', text, flags = re.DOTALL | re.IGNORECASE)

        return text
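
For reference, the same header stripping can be reproduced standalone with the regex values now stored in [preprocessor.patterns]; the sample email text below is invented for illustration.

```python
import re

# The configured pattern values, applied in the same order as the loop above
patterns = [
    r'-+Original Message-+',
    r'-+Forwarded by-+',
    r'From:.+?(?=Sent:)',
    r'Sent:.+?(?=To:)',
    r'To:.+?(?=Cc:)',
    r'Cc:.+?(?=Subject:)',
    r'Subject:.+?(\n|$)',
]

text = (
    "-----Original Message-----\n"
    "From: Alice Example Sent: Monday To: Bob Example Cc: Carol Example Subject: Q3 numbers\n"
    "Please review the attached figures."
)

for pattern in patterns:
    text = re.sub(pattern, ' ', text, flags=re.DOTALL | re.IGNORECASE)

print(text.strip())  # header block collapsed to whitespace; only the body sentence remains
```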