From ac1283458f4bea996f29d43be2fee7b34e39b7f7 Mon Sep 17 00:00:00 2001 From: Advaith S Rao Date: Thu, 23 Nov 2023 18:28:49 -0500 Subject: [PATCH] Perform Heuristics based relabeling for our fraud set (#29) * Perform Heuristics based relabeling for our fraud set - marketing to non-fraud, signatures, metadata dropped from fraud * Test fix for mismatch labeler function * Final dataframe updated after mismatch labeling class * Last fix for mislabeled data labeler --- README.md | 8 +- config.ini | 38 +++++++ detector/data_loader.py | 4 +- detector/labeler.py | 178 +++++++++++++++++++++++++++++--- detector/preprocessor.py | 58 ++++++++--- pipelines/distilbert_trainer.py | 80 +++++++------- pipelines/roberta_trainer.py | 75 ++++++++------ pipelines/svm_trainer.py | 74 +++++++------ tests/test_labeler.py | 48 ++++++++- tests/test_modeler.py | 16 ++- utils/util_modeler.py | 14 ++- 11 files changed, 448 insertions(+), 145 deletions(-) diff --git a/README.md b/README.md index 7adf4ec..248f600 100644 --- a/README.md +++ b/README.md @@ -60,16 +60,16 @@ In the early 2000s, Leslie Kaelbling at MIT purchased the dataset and noted that | Set | Emails | | --- | --- | -| Train | 304235 | -| Sanity | 200000 | +| Train | 224543 | +| Sanity | 250000 | | Gold Fraud | 1000 | **Training Label Split:** | Label | Emails | | --- | --- | -| 0 | 288428 | -| 1 | 15807 | +| 0 | 214080 | +| 1 | 10463 | *** diff --git a/config.ini b/config.ini index 08d4e59..d8152b8 100644 --- a/config.ini +++ b/config.ini @@ -26,3 +26,41 @@ names = Lay, Kenneth & Skilling, Jeffrey & Howard, Kevin & Krautz, Michael & Yea [folders.possible_fraud] folders = junk & junk_e_mail & junk_mail & insurance_risk & risk & deleted_items + +[preprocessor.patterns] +;unicode patterns +unicode = [^\x00-\x7F]+ +;specific header and message patterns +message = -+Original Message-+ +forward = -+Forwarded by-+ +from = From:.+?(?=Sent:) +sent = Sent:.+?(?=To:) +to = To:.+?(?=Cc:) +cc = Cc:.+?(?=Subject:) +subject = Subject:.+?(\n|$) + +[labeler.mismatch] +;min & max number of words in a sentence for fraud label +drop_threshold = 4 & 1500 +;patterns to drop examples from fraud label +best_regards = Best Regards +sincerely = Sincerely +regards = Regards +your_sincerely = Your Sincerely +yours_sincerely = Yours Sincerely +yours_truly = Yours Truly +yours_faithfully = Yours Faithfully +thanks = Thanks +thank_you = Thank You +message_id = Message-ID: +from = From: +sent = Sent: +to = To: +cc = Cc: +undelivery = Undelivered Mail Returned to Sender +undeliverable = Undeliverable: +missed_reply = re\s +;reply patterns +replies = re\:|Re\:|RE\:|Fw\:|FW\:|Fwd\:|FWD\:|fwd\: +;marketing patterns +marketing = unsubscribe|read our Privacy Policy|update your(?: |)(?:communication|)(?: |)preferences|future(?: |)(?:promotional|)(?: |)(?:e-mail|e-mails|emails|email)|receive(?: |)(?:these notices|)(?: |)in the future|above for more information|mailing list|please click here and you will be removed|your name removed|remove yourself from this list|your (?:email|e-mail) removed|from our (?:email|e-mail) list|To be REMOVED from (?:this|our) list|To view our privacy policy|just let us know by clicking here|All prices and product availability subject to change without notice|(?:opt-out|opt out)|(?:opt in|opt-in|opted in|opted-in) to receive|if you no longer wish to receive|thank you for shopping with us|newsletter diff --git a/detector/data_loader.py b/detector/data_loader.py index dab0cd3..15c050d 100644 --- a/detector/data_loader.py +++ b/detector/data_loader.py @@ -182,8 +182,8 @@ 
def process_email( email_fields = {} - folder_user = file.split(self.localpath)[1].split('/')[0] - folder_name = file.split(self.localpath)[1].split('/')[1] + folder_user = file.split(self.localpath)[1].split('/')[1] + folder_name = file.split(self.localpath)[1].split('/')[2] email_fields['Folder-User'] = folder_user email_fields['Folder-Name'] = folder_name diff --git a/detector/labeler.py b/detector/labeler.py index d4466d3..ebf814d 100644 --- a/detector/labeler.py +++ b/detector/labeler.py @@ -328,21 +328,17 @@ def contains_replies_forwards( if data is None: data = self.data + reply_patterns = self.config.get('labeler.mismatch','replies') + pattern = fr'\b(?:{reply_patterns})\b' + data['Contains-Reply-Forwards'] = data['Body'].swifter.apply( - lambda x: True \ - if \ - 'Re:' in x \ - or \ - 'RE:' in x \ - or \ - 'Fw:' in x \ - or \ - 'FW:' in x \ - or \ - 'Fwd:' in x \ - or \ - 'FWD:' in x \ - else False + lambda x: bool( + re.search( + pattern, + x, + flags=re.IGNORECASE + ) + ) ) return data @@ -511,4 +507,156 @@ def get_labels( axis = 1 ) - return data \ No newline at end of file + return data + +class MismatchLabeler: + """Class to relabel the mismatch examples from our dataset + + Args: + data (pd.DataFrame): DataFrame + cfg (configparser.ConfigParser): ConfigParser object to read config.ini file + + Returns: + data (pd.DataFrame): DataFrame containing the relabeled data with labeling updates + """ + + def __init__( + self, + data: pd.DataFrame = None, + cfg: configparser.ConfigParser = None, + ): + + self.data = data + self.config = cfg + + if self.data is None: + raise ValueError('data not provided') + + if self.config is None: + self.config = config + + def __call__( + self + ) -> pd.DataFrame: + + """Call the Pipeline to label the enron data + + Returns: + pd.DataFrame: DataFrame containing the enron data with labels + """ + + self.data = self.drop_by_length(self.data) + print(f'\x1b[4mMismatchLabeler\x1b[0m: Dropped examples with body length less than 4 words and more than 600 words') + + self.data = self.drop_by_pattern(self.data) + print(f'\x1b[4mMismatchLabeler\x1b[0m: Dropped examples with body containing the given pattern') + + self.data = self.relabel_marketing_frauds(self.data) + print(f'\x1b[4mMismatchLabeler\x1b[0m: Relabeled marketing examples with label 1 to label 0 using marketing keywords') + + return self.data + + def drop_by_length( + self, + data: pd.DataFrame = None, + ) -> pd.DataFrame: + """Drop the fraud examples with body length less than 4 words and more than 600 words + + Args: + data (pd.DataFrame, optional): DataFrame containing the enron data. Defaults to None. + + Returns: + data (pd.DataFrame): DataFrame containing the enron data with examples dropped + """ + + if data is None: + data = self.data + + drop_threshold = self.config.get('labeler.mismatch','drop_threshold') + min_length, max_length = convert_string_to_list(drop_threshold, sep = '&') + min_length, max_length = int(min_length), int(max_length) + + data = data[~((data['Label'] == 1) & (data['Body'].str.split().str.len() < min_length))] + data = data[~((data['Label'] == 1) & (data['Body'].str.split().str.len() > max_length))] + + return data + + def drop_by_pattern( + self, + data: pd.DataFrame = None, + ) -> pd.DataFrame: + """Drop the fraud examples with body containing the given pattern + + Args: + data (pd.DataFrame, optional): DataFrame containing the enron data. Defaults to None. 
+ + Returns: + data (pd.DataFrame): DataFrame containing the enron data with examples dropped + """ + + if data is None: + data = self.data + + patterns = [ + r'' + config.get('labeler.mismatch', 'best_regards'), + r'' + config.get('labeler.mismatch', 'sincerely'), + r'' + config.get('labeler.mismatch', 'regards'), + r'' + config.get('labeler.mismatch', 'your_sincerely'), + r'' + config.get('labeler.mismatch', 'yours_sincerely'), + r'' + config.get('labeler.mismatch', 'yours_truly'), + r'' + config.get('labeler.mismatch', 'yours_faithfully'), + r'' + config.get('labeler.mismatch', 'thanks'), + r'' + config.get('labeler.mismatch', 'thank_you'), + r'' + config.get('labeler.mismatch', 'message_id'), + r'' + config.get('labeler.mismatch', 'from'), + r'' + config.get('labeler.mismatch', 'sent'), + r'' + config.get('labeler.mismatch', 'to'), + r'' + config.get('labeler.mismatch', 'cc'), + r'' + config.get('labeler.mismatch', 'undelivery'), + r'' + config.get('labeler.mismatch', 'undeliverable'), + r'' + config.get('labeler.mismatch', 'missed_reply') + ] + + # Create a temporary column without Subject + data['Temp_Body'] = data.swifter.apply(lambda row: row['Body'].replace(row['Subject'], '') if pd.notna(row['Subject']) else row['Body'], axis=1) + + combined_pattern = '|'.join(f'(?:^|^\s|^>|^ >)(?: |){pattern}' for pattern in patterns) + + # Filter out rows where Label is 1 and any pattern matches + data = data[~((data['Label'] == 1) & data['Temp_Body'].str.contains(combined_pattern, case=False, regex=True))] + + # Drop the temporary column + data = data.drop(columns=['Temp_Body']) + + return data + + def relabel_marketing_frauds( + self, + data: pd.DataFrame = None, + ) -> pd.DataFrame: + """Relabel the marketing examples with label 1 to label 0 using marketing keywords + + Args: + data (pd.DataFrame, optional): DataFrame containing the enron data. Defaults to None. + + Returns: + data (pd.DataFrame): DataFrame containing the enron data with new column 'Label' + -> Label of the email + """ + + if data is None: + data = self.data + + marketing_keywords = self.config.get('labeler.mismatch','marketing') + + data.loc[ + (data['Label'] == 1) & \ + data['Body'].str.contains( + marketing_keywords, + case=False, regex=True + ), + 'Label' + ] = 0 + + return data + diff --git a/detector/preprocessor.py b/detector/preprocessor.py index 58208f6..95411bf 100644 --- a/detector/preprocessor.py +++ b/detector/preprocessor.py @@ -1,6 +1,7 @@ import sys sys.path.append("..") +import os import re import html2text from typing import Any @@ -8,7 +9,31 @@ from utils.util_preprocessor import add_subject_to_body +#read config.ini file +import configparser +config = configparser.ConfigParser() +config.read( + os.path.join( + os.path.dirname(os.path.abspath(__file__)), + '../config.ini' + ) +) class Preprocessor: + def __init__( + self, + cfg: configparser.ConfigParser = None, + ) -> None: + """Preprocessor class + + Args: + cfg (configparser.ConfigParser, optional): ConfigParser object. Defaults to None. 
+ """ + + self.config = cfg + + if self.config is None: + self.config = config + def __call__( self, text: str, @@ -115,7 +140,9 @@ def remove_unicode_characters( text (str): text with unicode characters removed """ - return re.sub(r'[^\x00-\x7F]+', ' ', text) + unicode_pattern = r'' + config.get('preprocessor.patterns', 'unicode') + + return re.sub(unicode_pattern, ' ', text) def remove_specific_patterns( self, @@ -130,19 +157,26 @@ def remove_specific_patterns( text (str): text with patterns removed """ - message_type = [ - r'-+Original Message-+' - ] - - header_type = [ - r'From:.+?(?=Sent:)', - r'Sent:.+?(?=To:)', - r'To:.+?(?=Cc:)', - r'Cc:.+?(?=Subject:)', - r'Subject:.+?(\n|$)' + # Extract patterns from the [preprocessor.patterns] section + message_pattern = r'' + config.get('preprocessor.patterns', 'message') + forward_pattern = r'' + config.get('preprocessor.patterns', 'forward') + from_pattern = r'' + config.get('preprocessor.patterns', 'from') + sent_pattern = r'' + config.get('preprocessor.patterns', 'sent') + to_pattern = r'' + config.get('preprocessor.patterns', 'to') + cc_pattern = r'' + config.get('preprocessor.patterns', 'cc') + subject_pattern = r'' + config.get('preprocessor.patterns', 'subject') + + patterns = [ + message_pattern, + forward_pattern, + from_pattern, + sent_pattern, + to_pattern, + cc_pattern, + subject_pattern ] - for pattern in message_type + header_type: + for pattern in patterns: text = re.sub(pattern, ' ', text, flags = re.DOTALL | re.IGNORECASE) return text diff --git a/pipelines/distilbert_trainer.py b/pipelines/distilbert_trainer.py index d89a93b..899dcc8 100644 --- a/pipelines/distilbert_trainer.py +++ b/pipelines/distilbert_trainer.py @@ -11,7 +11,7 @@ import os from detector.data_loader import LoadEnronData, LoadPhishingData, LoadSocEnggData -from detector.labeler import EnronLabeler +from detector.labeler import EnronLabeler, MismatchLabeler from detector.modeler import DistilbertModel from detector.preprocessor import Preprocessor from utils.util_modeler import evaluate_and_log, get_f1_score @@ -79,6 +79,11 @@ def label_and_preprocess_data(data): ignore_index=True ) + # Run Mismatch Labeler + data = MismatchLabeler(data)() + + data.reset_index(drop=True, inplace=True) + return data def data_split(data): @@ -111,11 +116,6 @@ def data_split(data): ] train['Split'] = 'Train' - - #drop train examples with Label=1 and Body less than 4 words - train = train[~((train['Label'] == 1) & (train['Body'].str.split().str.len() < 4))] - - train = train.reset_index(drop=True) else: train = data[data['Split'] == 'Train'] @@ -128,16 +128,9 @@ def train_model(train_data, hyper_params): run = wandb.init(config=hyper_params) model = DistilbertModel(**hyper_params) - # os.makedirs(f'/tmp/{date}/logs', exist_ok=True) - - # # Define a log file path - # log_filename = f"/tmp/{date}/logs/model_training.log" - - # # Create or open the log file in write mode - # log_file = open(log_filename, "w") - - # # Redirect stdout to the log file - # sys.stdout = log_file + # #drop train examples with Label=1 and Body less than 4 words + # train_data = train_data[~((train_data['Label'] == 1) & (train_data['Body'].str.split().str.len() < 4))] + # train_data = train_data.reset_index(drop=True) # Call your code that produces output model.train(body=train_data['Body'], label=train_data['Label'], validation_size=0.2, wandb=run) @@ -154,36 +147,45 @@ def test_model(train_data, sanity_data, gold_fraud_data, save_path): f1_scores = {} # Define a dictionary to store the predictions, true 
labels for each dataset - true_pred_map = { - 'train':{}, - 'sanity':{}, - 'gold_fraud':{} - } + # true_pred_map = { + # 'train':{}, + # 'sanity':{}, + # 'gold_fraud':{} + # } os.makedirs(os.path.join(save_path,'logs'), exist_ok=True) # Save the model and logs to the date folder model.save_model(os.path.join(save_path,'model')) - true_pred_map['train']['true'] = train_data['Label'].tolist() - true_pred_map['train']['pred'] = model.predict(body=train_data['Body']) - - evaluate_and_log(x=train_data['Body'].tolist(), y_true=true_pred_map['train']['true'], y_pred=true_pred_map['train']['pred'], filename=os.path.join(save_path,'logs/train.log'), experiment=run) - f1_scores['train'] = get_f1_score(y_true=true_pred_map['train']['true'], y_pred=true_pred_map['train']['pred']) - - true_pred_map['sanity']['true'] = sanity_data['Label'].tolist() - true_pred_map['sanity']['pred'] = model.predict(body=sanity_data['Body']) - evaluate_and_log(x=sanity_data['Body'].tolist(), y_true=true_pred_map['sanity']['true'], y_pred=true_pred_map['sanity']['pred'], filename=os.path.join(save_path,'logs/sanity.log'), experiment=run) - f1_scores['sanity'] = get_f1_score(y_true=true_pred_map['sanity']['true'], y_pred=true_pred_map['sanity']['pred']) + train_data['Prediction'] = model.predict(body=train_data['Body']) + evaluate_and_log(x=train_data['Body'].tolist(), y_true=train_data['Label'].tolist(), y_pred=train_data['Prediction'].tolist(), filename=os.path.join(save_path,'logs/train.log'), experiment=run, id = train_data['Mail-ID'].tolist()) + f1_scores['train'] = get_f1_score(y_true=train_data['Label'].tolist(), y_pred=train_data['Prediction'].tolist()) + + sanity_data['Prediction'] = model.predict(body=sanity_data['Body']) + evaluate_and_log(x=sanity_data['Body'].tolist(), y_true=sanity_data['Label'].tolist(), y_pred=sanity_data['Prediction'].tolist(), filename=os.path.join(save_path,'logs/sanity.log'), experiment=run, id = sanity_data['Mail-ID'].tolist()) + f1_scores['sanity'] = get_f1_score(y_true=sanity_data['Label'].tolist(), y_pred=sanity_data['Prediction'].tolist()) + + gold_fraud_data['Prediction'] = model.predict(body=gold_fraud_data['Body']) + evaluate_and_log(x=gold_fraud_data['Body'].tolist(), y_true=gold_fraud_data['Label'].tolist(), y_pred=gold_fraud_data['Prediction'].tolist(), filename=os.path.join(save_path,'logs/gold_fraud.log'), experiment=run, id = gold_fraud_data['Mail-ID'].tolist()) + f1_scores['gold_fraud'] = get_f1_score(y_true=gold_fraud_data['Label'].tolist(), y_pred=gold_fraud_data['Prediction'].tolist()) + + #save mismatch data into a csv file + mismatch_data = pd.concat( + [ + train_data[train_data['Prediction'] != train_data['Label']], + sanity_data[sanity_data['Prediction'] != sanity_data['Label']], + gold_fraud_data[gold_fraud_data['Prediction'] != gold_fraud_data['Label']] + ], + axis=0, + ignore_index=True + ) - true_pred_map['gold_fraud']['true'] = gold_fraud_data['Label'].tolist() - true_pred_map['gold_fraud']['pred'] = model.predict(body=gold_fraud_data['Body']) - evaluate_and_log(x=gold_fraud_data['Body'].tolist(), y_true=true_pred_map['gold_fraud']['true'], y_pred=true_pred_map['gold_fraud']['pred'], filename=os.path.join(save_path,'logs/gold_fraud.log'), experiment=run) - f1_scores['gold_fraud'] = get_f1_score(y_true=true_pred_map['gold_fraud']['true'], y_pred=true_pred_map['gold_fraud']['pred']) + mismatch_data.to_csv(os.path.join(save_path,'logs/mismatch_data.csv'), index=False) - return f1_scores, true_pred_map + return f1_scores -def dump_logs_to_wandb(hyper_params, 
f1_scores, true_pred_map, save_path): +def dump_logs_to_wandb(hyper_params, f1_scores, save_path): # Log the hyperparameters and f1 scores to Weights and Biases all_params = {**hyper_params, **f1_scores} run.config.update(all_params) @@ -254,10 +256,10 @@ def dump_logs_to_wandb(hyper_params, f1_scores, true_pred_map, save_path): model = train_model(train_data, hyper_params) # Test the model - f1_scores, true_pred_map = test_model(train_data, sanity_data, gold_fraud_data, save_path) + f1_scores = test_model(train_data, sanity_data, gold_fraud_data, save_path) # Dump the logs to Weights and Biases - dump_logs_to_wandb(hyper_params, f1_scores, true_pred_map, save_path) + dump_logs_to_wandb(hyper_params, f1_scores, save_path) # Close the Weights and Biases run run.finish() diff --git a/pipelines/roberta_trainer.py b/pipelines/roberta_trainer.py index 91b927f..d867dd3 100644 --- a/pipelines/roberta_trainer.py +++ b/pipelines/roberta_trainer.py @@ -11,7 +11,7 @@ import os from detector.data_loader import LoadEnronData, LoadPhishingData, LoadSocEnggData -from detector.labeler import EnronLabeler +from detector.labeler import EnronLabeler, MismatchLabeler from detector.modeler import RobertaModel from detector.preprocessor import Preprocessor from utils.util_modeler import evaluate_and_log, get_f1_score @@ -79,6 +79,11 @@ def label_and_preprocess_data(data): ignore_index=True ) + # Run Mismatch Labeler + data = MismatchLabeler(data)() + + data.reset_index(drop=True, inplace=True) + return data def data_split(data): @@ -111,11 +116,6 @@ def data_split(data): ] train['Split'] = 'Train' - - #drop train examples with Label=1 and Body less than 4 words - train = train[~((train['Label'] == 1) & (train['Body'].str.split().str.len() < 4))] - - train = train.reset_index(drop=True) else: train = data[data['Split'] == 'Train'] @@ -139,6 +139,10 @@ def train_model(train_data, hyper_params): # # Redirect stdout to the log file # sys.stdout = log_file + # #drop train examples with Label=1 and Body less than 4 words + # train_data = train_data[~((train_data['Label'] == 1) & (train_data['Body'].str.split().str.len() < 4))] + # train_data = train_data.reset_index(drop=True) + # Call your code that produces output model.train(body=train_data['Body'], label=train_data['Label'], validation_size=0.2, wandb=run) @@ -153,37 +157,46 @@ def test_model(train_data, sanity_data, gold_fraud_data, save_path): # Define a dictionary to store the f1 scores f1_scores = {} - # Define a dictionary to store the predictions, true labels for each dataset - true_pred_map = { - 'train':{}, - 'sanity':{}, - 'gold_fraud':{} - } + # # Define a dictionary to store the predictions, true labels for each dataset + # true_pred_map = { + # 'train':{}, + # 'sanity':{}, + # 'gold_fraud':{} + # } os.makedirs(os.path.join(save_path,'logs'), exist_ok=True) # Save the model and logs to the date folder model.save_model(os.path.join(save_path,'model')) - true_pred_map['train']['true'] = train_data['Label'].tolist() - true_pred_map['train']['pred'] = model.predict(body=train_data['Body']) - - evaluate_and_log(x=train_data['Body'].tolist(), y_true=true_pred_map['train']['true'], y_pred=true_pred_map['train']['pred'], filename=os.path.join(save_path,'logs/train.log'), experiment=run) - f1_scores['train'] = get_f1_score(y_true=true_pred_map['train']['true'], y_pred=true_pred_map['train']['pred']) - - true_pred_map['sanity']['true'] = sanity_data['Label'].tolist() - true_pred_map['sanity']['pred'] = model.predict(body=sanity_data['Body']) - 
evaluate_and_log(x=sanity_data['Body'].tolist(), y_true=true_pred_map['sanity']['true'], y_pred=true_pred_map['sanity']['pred'], filename=os.path.join(save_path,'logs/sanity.log'), experiment=run) - f1_scores['sanity'] = get_f1_score(y_true=true_pred_map['sanity']['true'], y_pred=true_pred_map['sanity']['pred']) - - true_pred_map['gold_fraud']['true'] = gold_fraud_data['Label'].tolist() - true_pred_map['gold_fraud']['pred'] = model.predict(body=gold_fraud_data['Body']) - evaluate_and_log(x=gold_fraud_data['Body'].tolist(), y_true=true_pred_map['gold_fraud']['true'], y_pred=true_pred_map['gold_fraud']['pred'], filename=os.path.join(save_path,'logs/gold_fraud.log'), experiment=run) - f1_scores['gold_fraud'] = get_f1_score(y_true=true_pred_map['gold_fraud']['true'], y_pred=true_pred_map['gold_fraud']['pred']) + train_data['Prediction'] = model.predict(body=train_data['Body']) + evaluate_and_log(x=train_data['Body'].tolist(), y_true=train_data['Label'].tolist(), y_pred=train_data['Prediction'].tolist(), filename=os.path.join(save_path,'logs/train.log'), experiment=run, id = train_data['Mail-ID'].tolist()) + f1_scores['train'] = get_f1_score(y_true=train_data['Label'].tolist(), y_pred=train_data['Prediction'].tolist()) + + sanity_data['Prediction'] = model.predict(body=sanity_data['Body']) + evaluate_and_log(x=sanity_data['Body'].tolist(), y_true=sanity_data['Label'].tolist(), y_pred=sanity_data['Prediction'].tolist(), filename=os.path.join(save_path,'logs/sanity.log'), experiment=run, id = sanity_data['Mail-ID'].tolist()) + f1_scores['sanity'] = get_f1_score(y_true=sanity_data['Label'].tolist(), y_pred=sanity_data['Prediction'].tolist()) + + gold_fraud_data['Prediction'] = model.predict(body=gold_fraud_data['Body']) + evaluate_and_log(x=gold_fraud_data['Body'].tolist(), y_true=gold_fraud_data['Label'].tolist(), y_pred=gold_fraud_data['Prediction'].tolist(), filename=os.path.join(save_path,'logs/gold_fraud.log'), experiment=run, id = gold_fraud_data['Mail-ID'].tolist()) + f1_scores['gold_fraud'] = get_f1_score(y_true=gold_fraud_data['Label'].tolist(), y_pred=gold_fraud_data['Prediction'].tolist()) + + #save mismatch data into a csv file + mismatch_data = pd.concat( + [ + train_data[train_data['Prediction'] != train_data['Label']], + sanity_data[sanity_data['Prediction'] != sanity_data['Label']], + gold_fraud_data[gold_fraud_data['Prediction'] != gold_fraud_data['Label']] + ], + axis=0, + ignore_index=True + ) - return f1_scores, true_pred_map + mismatch_data.to_csv(os.path.join(save_path,'logs/mismatch_data.csv'), index=False) + + return f1_scores -def dump_logs_to_wandb(hyper_params, f1_scores, true_pred_map, save_path): +def dump_logs_to_wandb(hyper_params, f1_scores, save_path): # Log the hyperparameters and f1 scores to Weights and Biases all_params = {**hyper_params, **f1_scores} run.config.update(all_params) @@ -254,10 +267,10 @@ def dump_logs_to_wandb(hyper_params, f1_scores, true_pred_map, save_path): model = train_model(train_data, hyper_params) # Test the model - f1_scores, true_pred_map = test_model(train_data, sanity_data, gold_fraud_data, save_path) + f1_scores = test_model(train_data, sanity_data, gold_fraud_data, save_path) # Dump the logs to Weights and Biases - dump_logs_to_wandb(hyper_params, f1_scores, true_pred_map, save_path) + dump_logs_to_wandb(hyper_params, f1_scores, save_path) # Close the Weights and Biases run run.finish() diff --git a/pipelines/svm_trainer.py b/pipelines/svm_trainer.py index 54e0bfd..e4be1ff 100644 --- a/pipelines/svm_trainer.py +++ 
b/pipelines/svm_trainer.py @@ -11,7 +11,7 @@ import os from detector.data_loader import LoadEnronData, LoadPhishingData, LoadSocEnggData -from detector.labeler import EnronLabeler +from detector.labeler import EnronLabeler, MismatchLabeler from detector.modeler import SVMModel from detector.preprocessor import Preprocessor from utils.util_modeler import evaluate_and_log, get_f1_score @@ -76,6 +76,12 @@ def label_and_preprocess_data(data): axis=0, ignore_index=True ) + + # Run Mismatch Labeler + data = MismatchLabeler(data)() + + data.reset_index(drop=True, inplace=True) + return data def data_split(data): @@ -109,11 +115,6 @@ def data_split(data): train['Split'] = 'Train' - #drop train examples with Label=1 and Body less than 4 words - train = train[~((train['Label'] == 1) & (train['Body'].str.split().str.len() < 4))] - - train = train.reset_index(drop=True) - else: train = data[data['Split'] == 'Train'] gold_fraud = data[data['Split'] == 'Gold Fraud'] @@ -125,6 +126,10 @@ def train_model(train_data, hyper_params): run = wandb.init(config=hyper_params) model = SVMModel(**hyper_params) + # #drop train examples with Label=1 and Body less than 4 words + # train_data = train_data[~((train_data['Label'] == 1) & (train_data['Body'].str.split().str.len() < 4))] + # train_data = train_data.reset_index(drop=True) + # Call your code that produces output model.train(body=train_data['Body'], label=train_data['Label']) return model @@ -133,37 +138,46 @@ def test_and_save_model(train_data, sanity_data, gold_fraud_data, save_path): # Define a dictionary to store the f1 scores f1_scores = {} - # Define a dictionary to store the predictions, true labels for each dataset - true_pred_map = { - 'train':{}, - 'sanity':{}, - 'gold_fraud':{} - } + # # Define a dictionary to store the predictions, true labels for each dataset + # true_pred_map = { + # 'train':{}, + # 'sanity':{}, + # 'gold_fraud':{} + # } os.makedirs(os.path.join(save_path,'logs'), exist_ok=True) # Save the model and logs to the date folder model.save_model(os.path.join(save_path,'model')) - true_pred_map['train']['true'] = train_data['Label'].tolist() - true_pred_map['train']['pred'] = model.predict(body=train_data['Body']) - - evaluate_and_log(x=train_data['Body'].tolist(), y_true=true_pred_map['train']['true'], y_pred=true_pred_map['train']['pred'], filename=os.path.join(save_path,'logs/train.log'), experiment=run) - f1_scores['train'] = get_f1_score(y_true=true_pred_map['train']['true'], y_pred=true_pred_map['train']['pred']) - - true_pred_map['sanity']['true'] = sanity_data['Label'].tolist() - true_pred_map['sanity']['pred'] = model.predict(body=sanity_data['Body']) - evaluate_and_log(x=sanity_data['Body'].tolist(), y_true=true_pred_map['sanity']['true'], y_pred=true_pred_map['sanity']['pred'], filename=os.path.join(save_path,'logs/sanity.log'), experiment=run) - f1_scores['sanity'] = get_f1_score(y_true=true_pred_map['sanity']['true'], y_pred=true_pred_map['sanity']['pred']) + train_data['Prediction'] = model.predict(body=train_data['Body']) + evaluate_and_log(x=train_data['Body'].tolist(), y_true=train_data['Label'].tolist(), y_pred=train_data['Prediction'].tolist(), filename=os.path.join(save_path,'logs/train.log'), experiment=run, id = train_data['Mail-ID'].tolist()) + f1_scores['train'] = get_f1_score(y_true=train_data['Label'].tolist(), y_pred=train_data['Prediction'].tolist()) + + sanity_data['Prediction'] = model.predict(body=sanity_data['Body']) + evaluate_and_log(x=sanity_data['Body'].tolist(), y_true=sanity_data['Label'].tolist(), 
y_pred=sanity_data['Prediction'].tolist(), filename=os.path.join(save_path,'logs/sanity.log'), experiment=run, id = sanity_data['Mail-ID'].tolist()) + f1_scores['sanity'] = get_f1_score(y_true=sanity_data['Label'].tolist(), y_pred=sanity_data['Prediction'].tolist()) + + gold_fraud_data['Prediction'] = model.predict(body=gold_fraud_data['Body']) + evaluate_and_log(x=gold_fraud_data['Body'].tolist(), y_true=gold_fraud_data['Label'].tolist(), y_pred=gold_fraud_data['Prediction'].tolist(), filename=os.path.join(save_path,'logs/gold_fraud.log'), experiment=run, id = gold_fraud_data['Mail-ID'].tolist()) + f1_scores['gold_fraud'] = get_f1_score(y_true=gold_fraud_data['Label'].tolist(), y_pred=gold_fraud_data['Prediction'].tolist()) + + #save mismatch data into a csv file + mismatch_data = pd.concat( + [ + train_data[train_data['Prediction'] != train_data['Label']], + sanity_data[sanity_data['Prediction'] != sanity_data['Label']], + gold_fraud_data[gold_fraud_data['Prediction'] != gold_fraud_data['Label']] + ], + axis=0, + ignore_index=True + ) - true_pred_map['gold_fraud']['true'] = gold_fraud_data['Label'].tolist() - true_pred_map['gold_fraud']['pred'] = model.predict(body=gold_fraud_data['Body']) - evaluate_and_log(x=gold_fraud_data['Body'].tolist(), y_true=true_pred_map['gold_fraud']['true'], y_pred=true_pred_map['gold_fraud']['pred'], filename=os.path.join(save_path,'logs/gold_fraud.log'), experiment=run) - f1_scores['gold_fraud'] = get_f1_score(y_true=true_pred_map['gold_fraud']['true'], y_pred=true_pred_map['gold_fraud']['pred']) + mismatch_data.to_csv(os.path.join(save_path,'logs/mismatch_data.csv'), index=False) - return f1_scores, true_pred_map + return f1_scores -def dump_logs_to_wandb(hyper_params, f1_scores, true_pred_map, save_path): +def dump_logs_to_wandb(hyper_params, f1_scores, save_path): # Log the hyperparameters and f1 scores to Weights and Biases all_params = {**hyper_params, **f1_scores} run.config.update(all_params) @@ -231,10 +245,10 @@ def dump_logs_to_wandb(hyper_params, f1_scores, true_pred_map, save_path): model = train_model(train_data, hyper_params) # Test the model - f1_scores, true_pred_map = test_and_save_model(train_data, sanity_data, gold_fraud_data, save_path) + f1_scores = test_and_save_model(train_data, sanity_data, gold_fraud_data, save_path) # Dump the logs to Weights and Biases - dump_logs_to_wandb(hyper_params, f1_scores, true_pred_map, save_path) + dump_logs_to_wandb(hyper_params, f1_scores, save_path) # Close the Weights and Biases run run.finish() diff --git a/tests/test_labeler.py b/tests/test_labeler.py index 7d60826..6524a80 100644 --- a/tests/test_labeler.py +++ b/tests/test_labeler.py @@ -1,13 +1,25 @@ -import os import sys sys.path.append("..") +import os +# import random import pandas as pd import pytest from detector.data_loader import LoadEnronData -from detector.labeler import EnronLabeler +from detector.labeler import EnronLabeler, MismatchLabeler from utils.util_data_loader import sha256_hash +from utils.util_preprocessor import convert_string_to_list + +#read config.ini file +import configparser +config = configparser.ConfigParser() +config.read( + os.path.join( + os.path.dirname(os.path.abspath(__file__)), + '../config.ini' + ) +) @pytest.fixture def dataframe(): @@ -23,7 +35,6 @@ def dataframe(): return data - def test_enron_labeler(dataframe): pipeline = EnronLabeler(dataframe) # assert type(pipeline()) == pd.DataFrame @@ -42,5 +53,36 @@ def test_enron_labeler(dataframe): # assert type(pipeline.get_social_engineering_annotation()) == 
pd.DataFrame
    # assert type(pipeline.get_labels()) == pd.DataFrame
 
+def test_mismatch_labeler(dataframe):
+    idx_of_body = dataframe.columns.tolist().index('Body')
+    dataframe.iloc[0, idx_of_body] = 'Best Regards' + dataframe.iloc[0, idx_of_body]
+    dataframe.iloc[1, idx_of_body] = '>From:' + dataframe.iloc[1, idx_of_body]
+    dataframe.iloc[2, idx_of_body] = dataframe.iloc[2, idx_of_body] + 'Unsubscribe'
+    dataframe.iloc[3, idx_of_body] = dataframe.iloc[3, idx_of_body] + 'update your preferences'
+    dataframe['Label'] = [1] * len(dataframe)
+
+    drop_threshold = config.get('labeler.mismatch','drop_threshold')
+    min_length, max_length = convert_string_to_list(drop_threshold, sep = '&')
+    min_length, max_length = int(min_length), int(max_length)
+
+    pipeline = MismatchLabeler(dataframe)
+    # assert type(pipeline()) == pd.DataFrame
+
+    data = pipeline()
+
+    assert type(data) == pd.DataFrame
+
+    #drop by length function
+    assert len(data[((data['Label'] == 1) & (data['Body'].str.split().str.len() < min_length))]) == 0
+    assert len(data[((data['Label'] == 1) & (data['Body'].str.split().str.len() > max_length))]) == 0
+
+    #drop by pattern function
+    assert len(data[((data['Label'] == 1) & data['Body'].str.contains(r'(?:^|^\s|^>|^ >)(?: |)best regards', case=False, regex=True))]) == 0
+    assert len(data[((data['Label'] == 1) & data['Body'].str.contains(r'(?:^|^\s|^>|^ >)(?: |)from:', case=False, regex=True))]) == 0
+
+    #relabel marketing function
+    assert len(data[((data['Label'] == 1) & data['Body'].str.contains('unsubscribe', case=False, regex=True))]) == 0
+
 if __name__ == "__main__":
     pytest.main()
diff --git a/tests/test_modeler.py b/tests/test_modeler.py
index dabe15f..38033b4 100644
--- a/tests/test_modeler.py
+++ b/tests/test_modeler.py
@@ -11,7 +11,7 @@ import pytest
 
 from utils.util_modeler import evaluate_and_log, get_f1_score, get_classification_report_confusion_matrix, Word2VecEmbedder, TPSampler
-
+from utils.util_data_loader import sha256_hash
 
 @pytest.fixture
 def x():
@@ -25,6 +25,10 @@ def y_true():
 def y_pred():
     return [0, 1]
 
+@pytest.fixture
+def id():
+    return [sha256_hash('Give me your account number quick'), sha256_hash('Give me your account number quick')]
+
 @pytest.fixture
 def mail():
     return """
@@ -38,8 +42,10 @@ def mail():
     """
 
 def test_get_f1_score(y_true, y_pred):
-    f1_score = get_f1_score(y_true, y_pred)
-    assert round(f1_score,3) == 0.667
+    macro_f1_score = get_f1_score(y_true, y_pred, average='macro')
+    weighted_f1_score = get_f1_score(y_true, y_pred, average='weighted')
+    assert round(macro_f1_score,3) == 0.333
+    assert round(weighted_f1_score,3) == 0.667
 
 def test_get_classification_report_confusion_matrix(y_true, y_pred):
     class_report, conf_matrix = get_classification_report_confusion_matrix(y_true, y_pred)
@@ -74,8 +80,8 @@ def test_get_classification_report_confusion_matrix(y_true, y_pred):
 
     assert (conf_matrix == np.array([[0, 0], [1, 1]])).all()
 
-def test_evaluate_and_log(x, y_true, y_pred):
-    evaluate_and_log(x, y_true, y_pred, '/tmp/test.log')
+def test_evaluate_and_log(x, y_true, y_pred, id):
+    evaluate_and_log(x=x, y_true=y_true, y_pred=y_pred, filename='/tmp/test.log', id=id)
     assert os.path.exists('/tmp/test.log')
 
 def test_word2vec_embedding(mail):
diff --git a/utils/util_modeler.py b/utils/util_modeler.py
index 4dd5e22..4b2bdd8 100644
--- a/utils/util_modeler.py
+++ b/utils/util_modeler.py
@@ -15,19 +15,21 @@
 
 def get_f1_score(
     y_true: list[int],
-    y_pred: list[int]
+    y_pred: list[int],
+    average: str = 'weighted'
 ):
     """Returns the F1 score.
 
     Args:
         y_true (list[int]): The true labels.
         y_pred (list[int]): The predicted labels.
+        average (str, optional): The averaging method passed to f1_score. Defaults to 'weighted'.
 
     Returns:
         float: The F1 score.
     """
 
-    return f1_score(y_true, y_pred)
+    return f1_score(y_true, y_pred, average=average)
 
 def get_classification_report_confusion_matrix(
     y_true: list[int],
@@ -50,7 +52,8 @@ def evaluate_and_log(
     y_true: list[int],
     y_pred: list[int],
     filename: str,
-    experiment: wandb = None
+    experiment: wandb = None,
+    id: list[str] = None
 ):
     """Evaluates the model's performance and logs the results.
 
@@ -61,6 +64,9 @@
         filename (str): The name of the log file.
     """
 
+    if id is None:
+        id = [str(i) for i in range(len(x))]
+
     if len(x) != len(y_true) or len(x) != len(y_pred):
         raise ValueError("Input lists (x, y_true, and y_pred) must have the same length.")
 
@@ -76,7 +82,7 @@
 
     for i in mismatched_indices:
         # Format the mismatched example in a code block
-        mismatched_example = f"\nActual: {y_true[i]}\nPredicted: {y_pred[i]}\n\nText: {x[i]}\n\n"
+        mismatched_example = f"\nMail ID: {id[i]}\nActual: {y_true[i]}\nPredicted: {y_pred[i]}\n\nText: {x[i]}\n\n"
         mismatched_examples.append(mismatched_example)
 
     if experiment is not None:
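
For reference, the relabeling heuristics introduced in this patch can be exercised on a small toy frame. The sketch below mirrors the three passes that `MismatchLabeler` applies in order (length-based drop, signature/header drop, marketing relabel); the thresholds and regexes are abbreviated, illustrative stand-ins for the full values read from `config.ini`, and the sample rows are invented.

```python
import pandas as pd

# Invented sample rows standing in for labeled Enron mail (Label 1 = fraud).
df = pd.DataFrame({
    'Body': [
        'urgent wire the funds to this account before friday or the deal is off',
        'thanks',
        'Best Regards, John Doe, Enron Broadband Services',
        'click here to unsubscribe from our mailing list and update your preferences',
    ],
    'Label': [1, 1, 1, 1],
})

# 1) drop_by_length: drop fraud rows whose body falls outside the word-count
#    window (the patch reads "4 & 1500" from [labeler.mismatch] drop_threshold).
min_len, max_len = 4, 1500
word_counts = df['Body'].str.split().str.len()
df = df[~((df['Label'] == 1) & ((word_counts < min_len) | (word_counts > max_len)))]

# 2) drop_by_pattern: drop fraud rows that are little more than signatures or
#    forwarded headers (a small subset of the [labeler.mismatch] patterns).
signature_pattern = r'(?:^|^\s|^>|^ >)(?: |)(?:best regards|from:|message-id:)'
df = df[~((df['Label'] == 1)
          & df['Body'].str.contains(signature_pattern, case=False, regex=True))]

# 3) relabel_marketing_frauds: flip marketing mail from fraud (1) to non-fraud (0)
#    using a slice of the marketing alternation defined in config.ini.
marketing_pattern = r'unsubscribe|mailing list|update your(?: |)(?:communication|)(?: |)preferences'
df.loc[(df['Label'] == 1)
       & df['Body'].str.contains(marketing_pattern, case=False, regex=True), 'Label'] = 0

print(df[['Body', 'Label']])
# The wire-transfer row keeps Label 1, the one-word body and the bare signature
# are dropped, and the unsubscribe mail is relabeled to 0.
```

In the trainer pipelines, `MismatchLabeler(data)()` runs right after `EnronLabeler` and the `Preprocessor`, before the train/sanity/gold-fraud split. Separately, the new `average` argument on `get_f1_score` matters for the two-sample fixture in `tests/test_modeler.py` (`y_true=[1, 1]`, `y_pred=[0, 1]`): class 0 never occurs in `y_true`, so the macro score is pulled down while the weighted score is not, as the scikit-learn check below shows.

```python
from sklearn.metrics import f1_score

y_true, y_pred = [1, 1], [0, 1]
print(round(f1_score(y_true, y_pred, average='macro'), 3))     # 0.333: class 0 has no support in y_true
print(round(f1_score(y_true, y_pred, average='weighted'), 3))  # 0.667: weights follow class support
```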