Perform heuristics-based relabeling for our fraud set (#29)
* Perform heuristics-based relabeling for our fraud set - marketing relabeled to non-fraud; signatures and metadata dropped from fraud

* Test fix for mismatch labeler function

* Final dataframe updated after applying the mismatch labeling class

* Last fix for mislabeled data labeler
advaithsrao authored Nov 23, 2023
1 parent e618e40 commit ac12834
Showing 11 changed files with 448 additions and 145 deletions.
8 changes: 4 additions & 4 deletions README.md
@@ -60,16 +60,16 @@ In the early 2000s, Leslie Kaelbling at MIT purchased the dataset and noted that

| Set | Emails |
| --- | --- |
- | Train | 304235 |
- | Sanity | 200000 |
+ | Train | 224543 |
+ | Sanity | 250000 |
| Gold Fraud | 1000 |

**Training Label Split:**

| Label | Emails |
| --- | --- |
- | 0 | 288428 |
- | 1 | 15807 |
+ | 0 | 214080 |
+ | 1 | 10463 |


***
38 changes: 38 additions & 0 deletions config.ini
@@ -26,3 +26,41 @@ names = Lay, Kenneth & Skilling, Jeffrey & Howard, Kevin & Krautz, Michael & Yea

[folders.possible_fraud]
folders = junk & junk_e_mail & junk_mail & insurance_risk & risk & deleted_items

[preprocessor.patterns]
;unicode patterns
unicode = [^\x00-\x7F]+
;specific header and message patterns
message = -+Original Message-+
forward = -+Forwarded by-+
from = From:.+?(?=Sent:)
sent = Sent:.+?(?=To:)
to = To:.+?(?=Cc:)
cc = Cc:.+?(?=Subject:)
subject = Subject:.+?(\n|$)

[labeler.mismatch]
;min & max number of words in a sentence for fraud label
drop_threshold = 4 & 1500
;patterns to drop examples from fraud label
best_regards = Best Regards
sincerely = Sincerely
regards = Regards
your_sincerely = Your Sincerely
yours_sincerely = Yours Sincerely
yours_truly = Yours Truly
yours_faithfully = Yours Faithfully
thanks = Thanks
thank_you = Thank You
message_id = Message-ID:
from = From:
sent = Sent:
to = To:
cc = Cc:
undelivery = Undelivered Mail Returned to Sender
undeliverable = Undeliverable:
missed_reply = re\s
;reply patterns
replies = re\:|Re\:|RE\:|Fw\:|FW\:|Fwd\:|FWD\:|fwd\:
;marketing patterns
marketing = unsubscribe|read our Privacy Policy|update your(?: |)(?:communication|)(?: |)preferences|future(?: |)(?:promotional|)(?: |)(?:e-mail|e-mails|emails|email)|receive(?: |)(?:these notices|)(?: |)in the future|above for more information|mailing list|please click here and you will be removed|your name removed|remove yourself from this list|your (?:email|e-mail) removed|from our (?:email|e-mail) list|To be REMOVED from (?:this|our) list|To view our privacy policy|just let us know by clicking here|All prices and product availability subject to change without notice|(?:opt-out|opt out)|(?:opt in|opt-in|opted in|opted-in) to receive|if you no longer wish to receive|thank you for shopping with us|newsletter
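
For illustration only (not part of the commit): these new values can be read back with Python's standard configparser. The file path and sample string below are assumptions for the sketch.

```python
# Minimal sketch: consume the new [labeler.mismatch] values (assumes config.ini is in the working directory)
import configparser
import re

config = configparser.ConfigParser()
config.read('config.ini')

# "4 & 1500" -> minimum and maximum word counts kept for fraud-labeled emails
min_len, max_len = [int(v.strip()) for v in config.get('labeler.mismatch', 'drop_threshold').split('&')]
print(min_len, max_len)  # 4 1500

# The marketing value is itself a regex alternation, so it can be applied directly
marketing = config.get('labeler.mismatch', 'marketing')
print(bool(re.search(marketing, 'Click here to unsubscribe from our newsletter', flags=re.IGNORECASE)))  # True
```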
4 changes: 2 additions & 2 deletions detector/data_loader.py
@@ -182,8 +182,8 @@ def process_email(

        email_fields = {}

-        folder_user = file.split(self.localpath)[1].split('/')[0]
-        folder_name = file.split(self.localpath)[1].split('/')[1]
+        folder_user = file.split(self.localpath)[1].split('/')[1]
+        folder_name = file.split(self.localpath)[1].split('/')[2]

        email_fields['Folder-User'] = folder_user
        email_fields['Folder-Name'] = folder_name
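
The index shift is easier to see with a concrete path; the values below are invented for illustration (the real localpath comes from the loader's configuration). Splitting on a localpath with no trailing slash leaves an empty first element, so the user and folder names sit at indices 1 and 2.

```python
# Hypothetical example of the path split used above
localpath = '/data/enron/maildir'             # assumed value for illustration
file = '/data/enron/maildir/lay-k/inbox/42.'  # assumed value for illustration

parts = file.split(localpath)[1].split('/')   # ['', 'lay-k', 'inbox', '42.']
folder_user, folder_name = parts[1], parts[2]
print(folder_user, folder_name)               # lay-k inbox
```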
178 changes: 163 additions & 15 deletions detector/labeler.py
@@ -328,21 +328,17 @@ def contains_replies_forwards(
        if data is None:
            data = self.data

+        reply_patterns = self.config.get('labeler.mismatch','replies')
+        pattern = fr'\b(?:{reply_patterns})\b'
+
        data['Contains-Reply-Forwards'] = data['Body'].swifter.apply(
-            lambda x: True \
-                if \
-                    'Re:' in x \
-                    or \
-                    'RE:' in x \
-                    or \
-                    'Fw:' in x \
-                    or \
-                    'FW:' in x \
-                    or \
-                    'Fwd:' in x \
-                    or \
-                    'FWD:' in x \
-                else False
+            lambda x: bool(
+                re.search(
+                    pattern,
+                    x,
+                    flags=re.IGNORECASE
+                )
+            )
        )

        return data
@@ -511,4 +507,156 @@ def get_labels(
axis = 1
)

return data
return data

class MismatchLabeler:
    """Class to relabel the mismatch examples from our dataset
    Args:
        data (pd.DataFrame): DataFrame
        cfg (configparser.ConfigParser): ConfigParser object to read config.ini file
    Returns:
        data (pd.DataFrame): DataFrame containing the relabeled data with labeling updates
    """

    def __init__(
        self,
        data: pd.DataFrame = None,
        cfg: configparser.ConfigParser = None,
    ):

        self.data = data
        self.config = cfg

        if self.data is None:
            raise ValueError('data not provided')

        if self.config is None:
            self.config = config

    def __call__(
        self
    ) -> pd.DataFrame:

        """Call the Pipeline to label the enron data
        Returns:
            pd.DataFrame: DataFrame containing the enron data with labels
        """

        self.data = self.drop_by_length(self.data)
        print(f'\x1b[4mMismatchLabeler\x1b[0m: Dropped examples with body length less than 4 words and more than 600 words')

        self.data = self.drop_by_pattern(self.data)
        print(f'\x1b[4mMismatchLabeler\x1b[0m: Dropped examples with body containing the given pattern')

        self.data = self.relabel_marketing_frauds(self.data)
        print(f'\x1b[4mMismatchLabeler\x1b[0m: Relabeled marketing examples with label 1 to label 0 using marketing keywords')

        return self.data
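
For context, a minimal usage sketch of the class (not part of the commit): the DataFrame contents are invented, and it assumes the repo's config.ini plus dependencies such as pandas and swifter are available.

```python
import configparser
import pandas as pd

from detector.labeler import MismatchLabeler

cfg = configparser.ConfigParser()
cfg.read('config.ini')

# Toy fraud-labeled examples: the first is clearly marketing, the second is not
df = pd.DataFrame({
    'Body': ['Click here to unsubscribe from our mailing list',
             'Please wire the funds to the account below today'],
    'Subject': ['Weekly offers', 'Urgent request'],
    'Label': [1, 1],
})

# Runs drop_by_length, drop_by_pattern, and relabel_marketing_frauds in order
relabeled = MismatchLabeler(data=df, cfg=cfg)()
print(relabeled[['Body', 'Label']])  # the marketing row should end up with Label 0
```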

    def drop_by_length(
        self,
        data: pd.DataFrame = None,
    ) -> pd.DataFrame:
        """Drop the fraud examples with body length less than 4 words and more than 600 words
        Args:
            data (pd.DataFrame, optional): DataFrame containing the enron data. Defaults to None.
        Returns:
            data (pd.DataFrame): DataFrame containing the enron data with examples dropped
        """

        if data is None:
            data = self.data

        drop_threshold = self.config.get('labeler.mismatch','drop_threshold')
        min_length, max_length = convert_string_to_list(drop_threshold, sep = '&')
        min_length, max_length = int(min_length), int(max_length)

        data = data[~((data['Label'] == 1) & (data['Body'].str.split().str.len() < min_length))]
        data = data[~((data['Label'] == 1) & (data['Body'].str.split().str.len() > max_length))]

        return data

    def drop_by_pattern(
        self,
        data: pd.DataFrame = None,
    ) -> pd.DataFrame:
        """Drop the fraud examples with body containing the given pattern
        Args:
            data (pd.DataFrame, optional): DataFrame containing the enron data. Defaults to None.
        Returns:
            data (pd.DataFrame): DataFrame containing the enron data with examples dropped
        """

        if data is None:
            data = self.data

        patterns = [
            r'' + config.get('labeler.mismatch', 'best_regards'),
            r'' + config.get('labeler.mismatch', 'sincerely'),
            r'' + config.get('labeler.mismatch', 'regards'),
            r'' + config.get('labeler.mismatch', 'your_sincerely'),
            r'' + config.get('labeler.mismatch', 'yours_sincerely'),
            r'' + config.get('labeler.mismatch', 'yours_truly'),
            r'' + config.get('labeler.mismatch', 'yours_faithfully'),
            r'' + config.get('labeler.mismatch', 'thanks'),
            r'' + config.get('labeler.mismatch', 'thank_you'),
            r'' + config.get('labeler.mismatch', 'message_id'),
            r'' + config.get('labeler.mismatch', 'from'),
            r'' + config.get('labeler.mismatch', 'sent'),
            r'' + config.get('labeler.mismatch', 'to'),
            r'' + config.get('labeler.mismatch', 'cc'),
            r'' + config.get('labeler.mismatch', 'undelivery'),
            r'' + config.get('labeler.mismatch', 'undeliverable'),
            r'' + config.get('labeler.mismatch', 'missed_reply')
        ]

        # Create a temporary column without Subject
        data['Temp_Body'] = data.swifter.apply(lambda row: row['Body'].replace(row['Subject'], '') if pd.notna(row['Subject']) else row['Body'], axis=1)

        combined_pattern = '|'.join(f'(?:^|^\s|^>|^ >)(?: |){pattern}' for pattern in patterns)

        # Filter out rows where Label is 1 and any pattern matches
        data = data[~((data['Label'] == 1) & data['Temp_Body'].str.contains(combined_pattern, case=False, regex=True))]

        # Drop the temporary column
        data = data.drop(columns=['Temp_Body'])

        return data
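
An illustrative check of what the anchored prefix in combined_pattern does (the sample strings are invented): because pandas' str.contains uses re.search without re.MULTILINE, the ^ alternatives only match at the very start of the subject-stripped body, so a fraud-labeled email is dropped only when it effectively begins with one of the signature, header, or bounce patterns.

```python
import re

# One of the configured patterns, wrapped the same way drop_by_pattern wraps it
anchored = r'(?:^|^\s|^>|^ >)(?: |)Thanks'

print(bool(re.search(anchored, 'Thanks for the update', flags=re.IGNORECASE)))          # True: pattern at string start
print(bool(re.search(anchored, 'We will send our thanks later', flags=re.IGNORECASE)))  # False: without re.MULTILINE, ^ is string start only
```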

    def relabel_marketing_frauds(
        self,
        data: pd.DataFrame = None,
    ) -> pd.DataFrame:
        """Relabel the marketing examples with label 1 to label 0 using marketing keywords
        Args:
            data (pd.DataFrame, optional): DataFrame containing the enron data. Defaults to None.
        Returns:
            data (pd.DataFrame): DataFrame containing the enron data with new column 'Label'
                -> Label of the email
        """

        if data is None:
            data = self.data

        marketing_keywords = self.config.get('labeler.mismatch','marketing')

        data.loc[
            (data['Label'] == 1) & \
            data['Body'].str.contains(
                marketing_keywords,
                case=False, regex=True
            ),
            'Label'
        ] = 0

        return data

58 changes: 46 additions & 12 deletions detector/preprocessor.py
@@ -1,14 +1,39 @@
import sys
sys.path.append("..")

import os
import re
import html2text
from typing import Any
import numpy as np

from utils.util_preprocessor import add_subject_to_body

#read config.ini file
import configparser
config = configparser.ConfigParser()
config.read(
    os.path.join(
        os.path.dirname(os.path.abspath(__file__)),
        '../config.ini'
    )
)

class Preprocessor:
    def __init__(
        self,
        cfg: configparser.ConfigParser = None,
    ) -> None:
        """Preprocessor class
        Args:
            cfg (configparser.ConfigParser, optional): ConfigParser object. Defaults to None.
        """

        self.config = cfg

        if self.config is None:
            self.config = config

    def __call__(
        self,
        text: str,
@@ -115,7 +140,9 @@ def remove_unicode_characters(
text (str): text with unicode characters removed
"""

-        return re.sub(r'[^\x00-\x7F]+', ' ', text)
+        unicode_pattern = r'' + config.get('preprocessor.patterns', 'unicode')
+
+        return re.sub(unicode_pattern, ' ', text)

def remove_specific_patterns(
self,
@@ -130,19 +157,26 @@ def remove_specific_patterns(
text (str): text with patterns removed
"""

-        message_type = [
-            r'-+Original Message-+'
-        ]
-
-        header_type = [
-            r'From:.+?(?=Sent:)',
-            r'Sent:.+?(?=To:)',
-            r'To:.+?(?=Cc:)',
-            r'Cc:.+?(?=Subject:)',
-            r'Subject:.+?(\n|$)'
+        # Extract patterns from the [preprocessor.patterns] section
+        message_pattern = r'' + config.get('preprocessor.patterns', 'message')
+        forward_pattern = r'' + config.get('preprocessor.patterns', 'forward')
+        from_pattern = r'' + config.get('preprocessor.patterns', 'from')
+        sent_pattern = r'' + config.get('preprocessor.patterns', 'sent')
+        to_pattern = r'' + config.get('preprocessor.patterns', 'to')
+        cc_pattern = r'' + config.get('preprocessor.patterns', 'cc')
+        subject_pattern = r'' + config.get('preprocessor.patterns', 'subject')
+
+        patterns = [
+            message_pattern,
+            forward_pattern,
+            from_pattern,
+            sent_pattern,
+            to_pattern,
+            cc_pattern,
+            subject_pattern
        ]

-        for pattern in message_type + header_type:
+        for pattern in patterns:
            text = re.sub(pattern, ' ', text, flags = re.DOTALL | re.IGNORECASE)

        return text
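
For reference, the same header stripping can be reproduced standalone with the regex values now stored in [preprocessor.patterns]; the sample email text below is invented for illustration.

```python
import re

# The configured pattern values, applied in the same order as the loop above
patterns = [
    r'-+Original Message-+',
    r'-+Forwarded by-+',
    r'From:.+?(?=Sent:)',
    r'Sent:.+?(?=To:)',
    r'To:.+?(?=Cc:)',
    r'Cc:.+?(?=Subject:)',
    r'Subject:.+?(\n|$)',
]

text = (
    "-----Original Message-----\n"
    "From: Alice Example Sent: Monday To: Bob Example Cc: Carol Example Subject: Q3 numbers\n"
    "Please review the attached figures."
)

for pattern in patterns:
    text = re.sub(pattern, ' ', text, flags=re.DOTALL | re.IGNORECASE)

print(text.strip())  # header block collapsed to whitespace; only the body sentence remains
```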