From ac1283458f4bea996f29d43be2fee7b34e39b7f7 Mon Sep 17 00:00:00 2001 From: Advaith S Rao Date: Thu, 23 Nov 2023 18:28:49 -0500 Subject: [PATCH] Perform Heuristics based relabeling for our fraud set (#29) * Perform Heuristics based relabeling for our fraud set - marketing to non-fraud, signatures, metadata dropped from fraud * Test fix for mismatch labeler function * Final dataframe updated after mismatch labeling class * Last fix for mislabeled data labeler --- README.md | 8 +- config.ini | 38 +++++++ detector/data_loader.py | 4 +- detector/labeler.py | 178 +++++++++++++++++++++++++++++--- detector/preprocessor.py | 58 ++++++++--- pipelines/distilbert_trainer.py | 80 +++++++------- pipelines/roberta_trainer.py | 75 ++++++++------ pipelines/svm_trainer.py | 74 +++++++------ tests/test_labeler.py | 48 ++++++++- tests/test_modeler.py | 16 ++- utils/util_modeler.py | 14 ++- 11 files changed, 448 insertions(+), 145 deletions(-) diff --git a/README.md b/README.md index 7adf4ec..248f600 100644 --- a/README.md +++ b/README.md @@ -60,16 +60,16 @@ In the early 2000s, Leslie Kaelbling at MIT purchased the dataset and noted that | Set | Emails | | --- | --- | -| Train | 304235 | -| Sanity | 200000 | +| Train | 224543 | +| Sanity | 250000 | | Gold Fraud | 1000 | **Training Label Split:** | Label | Emails | | --- | --- | -| 0 | 288428 | -| 1 | 15807 | +| 0 | 214080 | +| 1 | 10463 | *** diff --git a/config.ini b/config.ini index 08d4e59..d8152b8 100644 --- a/config.ini +++ b/config.ini @@ -26,3 +26,41 @@ names = Lay, Kenneth & Skilling, Jeffrey & Howard, Kevin & Krautz, Michael & Yea [folders.possible_fraud] folders = junk & junk_e_mail & junk_mail & insurance_risk & risk & deleted_items + +[preprocessor.patterns] +;unicode patterns +unicode = [^\x00-\x7F]+ +;specific header and message patterns +message = -+Original Message-+ +forward = -+Forwarded by-+ +from = From:.+?(?=Sent:) +sent = Sent:.+?(?=To:) +to = To:.+?(?=Cc:) +cc = Cc:.+?(?=Subject:) +subject = Subject:.+?(\n|$) + +[labeler.mismatch] +;min & max number of words in a sentence for fraud label +drop_threshold = 4 & 1500 +;patterns to drop examples from fraud label +best_regards = Best Regards +sincerely = Sincerely +regards = Regards +your_sincerely = Your Sincerely +yours_sincerely = Yours Sincerely +yours_truly = Yours Truly +yours_faithfully = Yours Faithfully +thanks = Thanks +thank_you = Thank You +message_id = Message-ID: +from = From: +sent = Sent: +to = To: +cc = Cc: +undelivery = Undelivered Mail Returned to Sender +undeliverable = Undeliverable: +missed_reply = re\s +;reply patterns +replies = re\:|Re\:|RE\:|Fw\:|FW\:|Fwd\:|FWD\:|fwd\: +;marketing patterns +marketing = unsubscribe|read our Privacy Policy|update your(?: |)(?:communication|)(?: |)preferences|future(?: |)(?:promotional|)(?: |)(?:e-mail|e-mails|emails|email)|receive(?: |)(?:these notices|)(?: |)in the future|above for more information|mailing list|please click here and you will be removed|your name removed|remove yourself from this list|your (?:email|e-mail) removed|from our (?:email|e-mail) list|To be REMOVED from (?:this|our) list|To view our privacy policy|just let us know by clicking here|All prices and product availability subject to change without notice|(?:opt-out|opt out)|(?:opt in|opt-in|opted in|opted-in) to receive|if you no longer wish to receive|thank you for shopping with us|newsletter diff --git a/detector/data_loader.py b/detector/data_loader.py index dab0cd3..15c050d 100644 --- a/detector/data_loader.py +++ b/detector/data_loader.py @@ -182,8 +182,8 @@ 
def process_email( email_fields = {} - folder_user = file.split(self.localpath)[1].split('/')[0] - folder_name = file.split(self.localpath)[1].split('/')[1] + folder_user = file.split(self.localpath)[1].split('/')[1] + folder_name = file.split(self.localpath)[1].split('/')[2] email_fields['Folder-User'] = folder_user email_fields['Folder-Name'] = folder_name diff --git a/detector/labeler.py b/detector/labeler.py index d4466d3..ebf814d 100644 --- a/detector/labeler.py +++ b/detector/labeler.py @@ -328,21 +328,17 @@ def contains_replies_forwards( if data is None: data = self.data + reply_patterns = self.config.get('labeler.mismatch','replies') + pattern = fr'\b(?:{reply_patterns})\b' + data['Contains-Reply-Forwards'] = data['Body'].swifter.apply( - lambda x: True \ - if \ - 'Re:' in x \ - or \ - 'RE:' in x \ - or \ - 'Fw:' in x \ - or \ - 'FW:' in x \ - or \ - 'Fwd:' in x \ - or \ - 'FWD:' in x \ - else False + lambda x: bool( + re.search( + pattern, + x, + flags=re.IGNORECASE + ) + ) ) return data @@ -511,4 +507,156 @@ def get_labels( axis = 1 ) - return data \ No newline at end of file + return data + +class MismatchLabeler: + """Class to relabel the mismatch examples from our dataset + + Args: + data (pd.DataFrame): DataFrame + cfg (configparser.ConfigParser): ConfigParser object to read config.ini file + + Returns: + data (pd.DataFrame): DataFrame containing the relabeled data with labeling updates + """ + + def __init__( + self, + data: pd.DataFrame = None, + cfg: configparser.ConfigParser = None, + ): + + self.data = data + self.config = cfg + + if self.data is None: + raise ValueError('data not provided') + + if self.config is None: + self.config = config + + def __call__( + self + ) -> pd.DataFrame: + + """Call the Pipeline to label the enron data + + Returns: + pd.DataFrame: DataFrame containing the enron data with labels + """ + + self.data = self.drop_by_length(self.data) + print(f'\x1b[4mMismatchLabeler\x1b[0m: Dropped examples with body length less than 4 words and more than 600 words') + + self.data = self.drop_by_pattern(self.data) + print(f'\x1b[4mMismatchLabeler\x1b[0m: Dropped examples with body containing the given pattern') + + self.data = self.relabel_marketing_frauds(self.data) + print(f'\x1b[4mMismatchLabeler\x1b[0m: Relabeled marketing examples with label 1 to label 0 using marketing keywords') + + return self.data + + def drop_by_length( + self, + data: pd.DataFrame = None, + ) -> pd.DataFrame: + """Drop the fraud examples with body length less than 4 words and more than 600 words + + Args: + data (pd.DataFrame, optional): DataFrame containing the enron data. Defaults to None. + + Returns: + data (pd.DataFrame): DataFrame containing the enron data with examples dropped + """ + + if data is None: + data = self.data + + drop_threshold = self.config.get('labeler.mismatch','drop_threshold') + min_length, max_length = convert_string_to_list(drop_threshold, sep = '&') + min_length, max_length = int(min_length), int(max_length) + + data = data[~((data['Label'] == 1) & (data['Body'].str.split().str.len() < min_length))] + data = data[~((data['Label'] == 1) & (data['Body'].str.split().str.len() > max_length))] + + return data + + def drop_by_pattern( + self, + data: pd.DataFrame = None, + ) -> pd.DataFrame: + """Drop the fraud examples with body containing the given pattern + + Args: + data (pd.DataFrame, optional): DataFrame containing the enron data. Defaults to None. 
+ + Returns: + data (pd.DataFrame): DataFrame containing the enron data with examples dropped + """ + + if data is None: + data = self.data + + patterns = [ + r'' + config.get('labeler.mismatch', 'best_regards'), + r'' + config.get('labeler.mismatch', 'sincerely'), + r'' + config.get('labeler.mismatch', 'regards'), + r'' + config.get('labeler.mismatch', 'your_sincerely'), + r'' + config.get('labeler.mismatch', 'yours_sincerely'), + r'' + config.get('labeler.mismatch', 'yours_truly'), + r'' + config.get('labeler.mismatch', 'yours_faithfully'), + r'' + config.get('labeler.mismatch', 'thanks'), + r'' + config.get('labeler.mismatch', 'thank_you'), + r'' + config.get('labeler.mismatch', 'message_id'), + r'' + config.get('labeler.mismatch', 'from'), + r'' + config.get('labeler.mismatch', 'sent'), + r'' + config.get('labeler.mismatch', 'to'), + r'' + config.get('labeler.mismatch', 'cc'), + r'' + config.get('labeler.mismatch', 'undelivery'), + r'' + config.get('labeler.mismatch', 'undeliverable'), + r'' + config.get('labeler.mismatch', 'missed_reply') + ] + + # Create a temporary column without Subject + data['Temp_Body'] = data.swifter.apply(lambda row: row['Body'].replace(row['Subject'], '') if pd.notna(row['Subject']) else row['Body'], axis=1) + + combined_pattern = '|'.join(f'(?:^|^\s|^>|^ >)(?: |){pattern}' for pattern in patterns) + + # Filter out rows where Label is 1 and any pattern matches + data = data[~((data['Label'] == 1) & data['Temp_Body'].str.contains(combined_pattern, case=False, regex=True))] + + # Drop the temporary column + data = data.drop(columns=['Temp_Body']) + + return data + + def relabel_marketing_frauds( + self, + data: pd.DataFrame = None, + ) -> pd.DataFrame: + """Relabel the marketing examples with label 1 to label 0 using marketing keywords + + Args: + data (pd.DataFrame, optional): DataFrame containing the enron data. Defaults to None. + + Returns: + data (pd.DataFrame): DataFrame containing the enron data with new column 'Label' + -> Label of the email + """ + + if data is None: + data = self.data + + marketing_keywords = self.config.get('labeler.mismatch','marketing') + + data.loc[ + (data['Label'] == 1) & \ + data['Body'].str.contains( + marketing_keywords, + case=False, regex=True + ), + 'Label' + ] = 0 + + return data + diff --git a/detector/preprocessor.py b/detector/preprocessor.py index 58208f6..95411bf 100644 --- a/detector/preprocessor.py +++ b/detector/preprocessor.py @@ -1,6 +1,7 @@ import sys sys.path.append("..") +import os import re import html2text from typing import Any @@ -8,7 +9,31 @@ from utils.util_preprocessor import add_subject_to_body +#read config.ini file +import configparser +config = configparser.ConfigParser() +config.read( + os.path.join( + os.path.dirname(os.path.abspath(__file__)), + '../config.ini' + ) +) class Preprocessor: + def __init__( + self, + cfg: configparser.ConfigParser = None, + ) -> None: + """Preprocessor class + + Args: + cfg (configparser.ConfigParser, optional): ConfigParser object. Defaults to None. 
+ """ + + self.config = cfg + + if self.config is None: + self.config = config + def __call__( self, text: str, @@ -115,7 +140,9 @@ def remove_unicode_characters( text (str): text with unicode characters removed """ - return re.sub(r'[^\x00-\x7F]+', ' ', text) + unicode_pattern = r'' + config.get('preprocessor.patterns', 'unicode') + + return re.sub(unicode_pattern, ' ', text) def remove_specific_patterns( self, @@ -130,19 +157,26 @@ def remove_specific_patterns( text (str): text with patterns removed """ - message_type = [ - r'-+Original Message-+' - ] - - header_type = [ - r'From:.+?(?=Sent:)', - r'Sent:.+?(?=To:)', - r'To:.+?(?=Cc:)', - r'Cc:.+?(?=Subject:)', - r'Subject:.+?(\n|$)' + # Extract patterns from the [preprocessor.patterns] section + message_pattern = r'' + config.get('preprocessor.patterns', 'message') + forward_pattern = r'' + config.get('preprocessor.patterns', 'forward') + from_pattern = r'' + config.get('preprocessor.patterns', 'from') + sent_pattern = r'' + config.get('preprocessor.patterns', 'sent') + to_pattern = r'' + config.get('preprocessor.patterns', 'to') + cc_pattern = r'' + config.get('preprocessor.patterns', 'cc') + subject_pattern = r'' + config.get('preprocessor.patterns', 'subject') + + patterns = [ + message_pattern, + forward_pattern, + from_pattern, + sent_pattern, + to_pattern, + cc_pattern, + subject_pattern ] - for pattern in message_type + header_type: + for pattern in patterns: text = re.sub(pattern, ' ', text, flags = re.DOTALL | re.IGNORECASE) return text diff --git a/pipelines/distilbert_trainer.py b/pipelines/distilbert_trainer.py index d89a93b..899dcc8 100644 --- a/pipelines/distilbert_trainer.py +++ b/pipelines/distilbert_trainer.py @@ -11,7 +11,7 @@ import os from detector.data_loader import LoadEnronData, LoadPhishingData, LoadSocEnggData -from detector.labeler import EnronLabeler +from detector.labeler import EnronLabeler, MismatchLabeler from detector.modeler import DistilbertModel from detector.preprocessor import Preprocessor from utils.util_modeler import evaluate_and_log, get_f1_score @@ -79,6 +79,11 @@ def label_and_preprocess_data(data): ignore_index=True ) + # Run Mismatch Labeler + data = MismatchLabeler(data)() + + data.reset_index(drop=True, inplace=True) + return data def data_split(data): @@ -111,11 +116,6 @@ def data_split(data): ] train['Split'] = 'Train' - - #drop train examples with Label=1 and Body less than 4 words - train = train[~((train['Label'] == 1) & (train['Body'].str.split().str.len() < 4))] - - train = train.reset_index(drop=True) else: train = data[data['Split'] == 'Train'] @@ -128,16 +128,9 @@ def train_model(train_data, hyper_params): run = wandb.init(config=hyper_params) model = DistilbertModel(**hyper_params) - # os.makedirs(f'/tmp/{date}/logs', exist_ok=True) - - # # Define a log file path - # log_filename = f"/tmp/{date}/logs/model_training.log" - - # # Create or open the log file in write mode - # log_file = open(log_filename, "w") - - # # Redirect stdout to the log file - # sys.stdout = log_file + # #drop train examples with Label=1 and Body less than 4 words + # train_data = train_data[~((train_data['Label'] == 1) & (train_data['Body'].str.split().str.len() < 4))] + # train_data = train_data.reset_index(drop=True) # Call your code that produces output model.train(body=train_data['Body'], label=train_data['Label'], validation_size=0.2, wandb=run) @@ -154,36 +147,45 @@ def test_model(train_data, sanity_data, gold_fraud_data, save_path): f1_scores = {} # Define a dictionary to store the predictions, true 
labels for each dataset - true_pred_map = { - 'train':{}, - 'sanity':{}, - 'gold_fraud':{} - } + # true_pred_map = { + # 'train':{}, + # 'sanity':{}, + # 'gold_fraud':{} + # } os.makedirs(os.path.join(save_path,'logs'), exist_ok=True) # Save the model and logs to the date folder model.save_model(os.path.join(save_path,'model')) - true_pred_map['train']['true'] = train_data['Label'].tolist() - true_pred_map['train']['pred'] = model.predict(body=train_data['Body']) - - evaluate_and_log(x=train_data['Body'].tolist(), y_true=true_pred_map['train']['true'], y_pred=true_pred_map['train']['pred'], filename=os.path.join(save_path,'logs/train.log'), experiment=run) - f1_scores['train'] = get_f1_score(y_true=true_pred_map['train']['true'], y_pred=true_pred_map['train']['pred']) - - true_pred_map['sanity']['true'] = sanity_data['Label'].tolist() - true_pred_map['sanity']['pred'] = model.predict(body=sanity_data['Body']) - evaluate_and_log(x=sanity_data['Body'].tolist(), y_true=true_pred_map['sanity']['true'], y_pred=true_pred_map['sanity']['pred'], filename=os.path.join(save_path,'logs/sanity.log'), experiment=run) - f1_scores['sanity'] = get_f1_score(y_true=true_pred_map['sanity']['true'], y_pred=true_pred_map['sanity']['pred']) + train_data['Prediction'] = model.predict(body=train_data['Body']) + evaluate_and_log(x=train_data['Body'].tolist(), y_true=train_data['Label'].tolist(), y_pred=train_data['Prediction'].tolist(), filename=os.path.join(save_path,'logs/train.log'), experiment=run, id = train_data['Mail-ID'].tolist()) + f1_scores['train'] = get_f1_score(y_true=train_data['Label'].tolist(), y_pred=train_data['Prediction'].tolist()) + + sanity_data['Prediction'] = model.predict(body=sanity_data['Body']) + evaluate_and_log(x=sanity_data['Body'].tolist(), y_true=sanity_data['Label'].tolist(), y_pred=sanity_data['Prediction'].tolist(), filename=os.path.join(save_path,'logs/sanity.log'), experiment=run, id = sanity_data['Mail-ID'].tolist()) + f1_scores['sanity'] = get_f1_score(y_true=sanity_data['Label'].tolist(), y_pred=sanity_data['Prediction'].tolist()) + + gold_fraud_data['Prediction'] = model.predict(body=gold_fraud_data['Body']) + evaluate_and_log(x=gold_fraud_data['Body'].tolist(), y_true=gold_fraud_data['Label'].tolist(), y_pred=gold_fraud_data['Prediction'].tolist(), filename=os.path.join(save_path,'logs/gold_fraud.log'), experiment=run, id = gold_fraud_data['Mail-ID'].tolist()) + f1_scores['gold_fraud'] = get_f1_score(y_true=gold_fraud_data['Label'].tolist(), y_pred=gold_fraud_data['Prediction'].tolist()) + + #save mismatch data into a csv file + mismatch_data = pd.concat( + [ + train_data[train_data['Prediction'] != train_data['Label']], + sanity_data[sanity_data['Prediction'] != sanity_data['Label']], + gold_fraud_data[gold_fraud_data['Prediction'] != gold_fraud_data['Label']] + ], + axis=0, + ignore_index=True + ) - true_pred_map['gold_fraud']['true'] = gold_fraud_data['Label'].tolist() - true_pred_map['gold_fraud']['pred'] = model.predict(body=gold_fraud_data['Body']) - evaluate_and_log(x=gold_fraud_data['Body'].tolist(), y_true=true_pred_map['gold_fraud']['true'], y_pred=true_pred_map['gold_fraud']['pred'], filename=os.path.join(save_path,'logs/gold_fraud.log'), experiment=run) - f1_scores['gold_fraud'] = get_f1_score(y_true=true_pred_map['gold_fraud']['true'], y_pred=true_pred_map['gold_fraud']['pred']) + mismatch_data.to_csv(os.path.join(save_path,'logs/mismatch_data.csv'), index=False) - return f1_scores, true_pred_map + return f1_scores -def dump_logs_to_wandb(hyper_params, 
f1_scores, true_pred_map, save_path): +def dump_logs_to_wandb(hyper_params, f1_scores, save_path): # Log the hyperparameters and f1 scores to Weights and Biases all_params = {**hyper_params, **f1_scores} run.config.update(all_params) @@ -254,10 +256,10 @@ def dump_logs_to_wandb(hyper_params, f1_scores, true_pred_map, save_path): model = train_model(train_data, hyper_params) # Test the model - f1_scores, true_pred_map = test_model(train_data, sanity_data, gold_fraud_data, save_path) + f1_scores = test_model(train_data, sanity_data, gold_fraud_data, save_path) # Dump the logs to Weights and Biases - dump_logs_to_wandb(hyper_params, f1_scores, true_pred_map, save_path) + dump_logs_to_wandb(hyper_params, f1_scores, save_path) # Close the Weights and Biases run run.finish() diff --git a/pipelines/roberta_trainer.py b/pipelines/roberta_trainer.py index 91b927f..d867dd3 100644 --- a/pipelines/roberta_trainer.py +++ b/pipelines/roberta_trainer.py @@ -11,7 +11,7 @@ import os from detector.data_loader import LoadEnronData, LoadPhishingData, LoadSocEnggData -from detector.labeler import EnronLabeler +from detector.labeler import EnronLabeler, MismatchLabeler from detector.modeler import RobertaModel from detector.preprocessor import Preprocessor from utils.util_modeler import evaluate_and_log, get_f1_score @@ -79,6 +79,11 @@ def label_and_preprocess_data(data): ignore_index=True ) + # Run Mismatch Labeler + data = MismatchLabeler(data)() + + data.reset_index(drop=True, inplace=True) + return data def data_split(data): @@ -111,11 +116,6 @@ def data_split(data): ] train['Split'] = 'Train' - - #drop train examples with Label=1 and Body less than 4 words - train = train[~((train['Label'] == 1) & (train['Body'].str.split().str.len() < 4))] - - train = train.reset_index(drop=True) else: train = data[data['Split'] == 'Train'] @@ -139,6 +139,10 @@ def train_model(train_data, hyper_params): # # Redirect stdout to the log file # sys.stdout = log_file + # #drop train examples with Label=1 and Body less than 4 words + # train_data = train_data[~((train_data['Label'] == 1) & (train_data['Body'].str.split().str.len() < 4))] + # train_data = train_data.reset_index(drop=True) + # Call your code that produces output model.train(body=train_data['Body'], label=train_data['Label'], validation_size=0.2, wandb=run) @@ -153,37 +157,46 @@ def test_model(train_data, sanity_data, gold_fraud_data, save_path): # Define a dictionary to store the f1 scores f1_scores = {} - # Define a dictionary to store the predictions, true labels for each dataset - true_pred_map = { - 'train':{}, - 'sanity':{}, - 'gold_fraud':{} - } + # # Define a dictionary to store the predictions, true labels for each dataset + # true_pred_map = { + # 'train':{}, + # 'sanity':{}, + # 'gold_fraud':{} + # } os.makedirs(os.path.join(save_path,'logs'), exist_ok=True) # Save the model and logs to the date folder model.save_model(os.path.join(save_path,'model')) - true_pred_map['train']['true'] = train_data['Label'].tolist() - true_pred_map['train']['pred'] = model.predict(body=train_data['Body']) - - evaluate_and_log(x=train_data['Body'].tolist(), y_true=true_pred_map['train']['true'], y_pred=true_pred_map['train']['pred'], filename=os.path.join(save_path,'logs/train.log'), experiment=run) - f1_scores['train'] = get_f1_score(y_true=true_pred_map['train']['true'], y_pred=true_pred_map['train']['pred']) - - true_pred_map['sanity']['true'] = sanity_data['Label'].tolist() - true_pred_map['sanity']['pred'] = model.predict(body=sanity_data['Body']) - 
evaluate_and_log(x=sanity_data['Body'].tolist(), y_true=true_pred_map['sanity']['true'], y_pred=true_pred_map['sanity']['pred'], filename=os.path.join(save_path,'logs/sanity.log'), experiment=run) - f1_scores['sanity'] = get_f1_score(y_true=true_pred_map['sanity']['true'], y_pred=true_pred_map['sanity']['pred']) - - true_pred_map['gold_fraud']['true'] = gold_fraud_data['Label'].tolist() - true_pred_map['gold_fraud']['pred'] = model.predict(body=gold_fraud_data['Body']) - evaluate_and_log(x=gold_fraud_data['Body'].tolist(), y_true=true_pred_map['gold_fraud']['true'], y_pred=true_pred_map['gold_fraud']['pred'], filename=os.path.join(save_path,'logs/gold_fraud.log'), experiment=run) - f1_scores['gold_fraud'] = get_f1_score(y_true=true_pred_map['gold_fraud']['true'], y_pred=true_pred_map['gold_fraud']['pred']) + train_data['Prediction'] = model.predict(body=train_data['Body']) + evaluate_and_log(x=train_data['Body'].tolist(), y_true=train_data['Label'].tolist(), y_pred=train_data['Prediction'].tolist(), filename=os.path.join(save_path,'logs/train.log'), experiment=run, id = train_data['Mail-ID'].tolist()) + f1_scores['train'] = get_f1_score(y_true=train_data['Label'].tolist(), y_pred=train_data['Prediction'].tolist()) + + sanity_data['Prediction'] = model.predict(body=sanity_data['Body']) + evaluate_and_log(x=sanity_data['Body'].tolist(), y_true=sanity_data['Label'].tolist(), y_pred=sanity_data['Prediction'].tolist(), filename=os.path.join(save_path,'logs/sanity.log'), experiment=run, id = sanity_data['Mail-ID'].tolist()) + f1_scores['sanity'] = get_f1_score(y_true=sanity_data['Label'].tolist(), y_pred=sanity_data['Prediction'].tolist()) + + gold_fraud_data['Prediction'] = model.predict(body=gold_fraud_data['Body']) + evaluate_and_log(x=gold_fraud_data['Body'].tolist(), y_true=gold_fraud_data['Label'].tolist(), y_pred=gold_fraud_data['Prediction'].tolist(), filename=os.path.join(save_path,'logs/gold_fraud.log'), experiment=run, id = gold_fraud_data['Mail-ID'].tolist()) + f1_scores['gold_fraud'] = get_f1_score(y_true=gold_fraud_data['Label'].tolist(), y_pred=gold_fraud_data['Prediction'].tolist()) + + #save mismatch data into a csv file + mismatch_data = pd.concat( + [ + train_data[train_data['Prediction'] != train_data['Label']], + sanity_data[sanity_data['Prediction'] != sanity_data['Label']], + gold_fraud_data[gold_fraud_data['Prediction'] != gold_fraud_data['Label']] + ], + axis=0, + ignore_index=True + ) - return f1_scores, true_pred_map + mismatch_data.to_csv(os.path.join(save_path,'logs/mismatch_data.csv'), index=False) + + return f1_scores -def dump_logs_to_wandb(hyper_params, f1_scores, true_pred_map, save_path): +def dump_logs_to_wandb(hyper_params, f1_scores, save_path): # Log the hyperparameters and f1 scores to Weights and Biases all_params = {**hyper_params, **f1_scores} run.config.update(all_params) @@ -254,10 +267,10 @@ def dump_logs_to_wandb(hyper_params, f1_scores, true_pred_map, save_path): model = train_model(train_data, hyper_params) # Test the model - f1_scores, true_pred_map = test_model(train_data, sanity_data, gold_fraud_data, save_path) + f1_scores = test_model(train_data, sanity_data, gold_fraud_data, save_path) # Dump the logs to Weights and Biases - dump_logs_to_wandb(hyper_params, f1_scores, true_pred_map, save_path) + dump_logs_to_wandb(hyper_params, f1_scores, save_path) # Close the Weights and Biases run run.finish() diff --git a/pipelines/svm_trainer.py b/pipelines/svm_trainer.py index 54e0bfd..e4be1ff 100644 --- a/pipelines/svm_trainer.py +++ 
b/pipelines/svm_trainer.py @@ -11,7 +11,7 @@ import os from detector.data_loader import LoadEnronData, LoadPhishingData, LoadSocEnggData -from detector.labeler import EnronLabeler +from detector.labeler import EnronLabeler, MismatchLabeler from detector.modeler import SVMModel from detector.preprocessor import Preprocessor from utils.util_modeler import evaluate_and_log, get_f1_score @@ -76,6 +76,12 @@ def label_and_preprocess_data(data): axis=0, ignore_index=True ) + + # Run Mismatch Labeler + data = MismatchLabeler(data)() + + data.reset_index(drop=True, inplace=True) + return data def data_split(data): @@ -109,11 +115,6 @@ def data_split(data): train['Split'] = 'Train' - #drop train examples with Label=1 and Body less than 4 words - train = train[~((train['Label'] == 1) & (train['Body'].str.split().str.len() < 4))] - - train = train.reset_index(drop=True) - else: train = data[data['Split'] == 'Train'] gold_fraud = data[data['Split'] == 'Gold Fraud'] @@ -125,6 +126,10 @@ def train_model(train_data, hyper_params): run = wandb.init(config=hyper_params) model = SVMModel(**hyper_params) + # #drop train examples with Label=1 and Body less than 4 words + # train_data = train_data[~((train_data['Label'] == 1) & (train_data['Body'].str.split().str.len() < 4))] + # train_data = train_data.reset_index(drop=True) + # Call your code that produces output model.train(body=train_data['Body'], label=train_data['Label']) return model @@ -133,37 +138,46 @@ def test_and_save_model(train_data, sanity_data, gold_fraud_data, save_path): # Define a dictionary to store the f1 scores f1_scores = {} - # Define a dictionary to store the predictions, true labels for each dataset - true_pred_map = { - 'train':{}, - 'sanity':{}, - 'gold_fraud':{} - } + # # Define a dictionary to store the predictions, true labels for each dataset + # true_pred_map = { + # 'train':{}, + # 'sanity':{}, + # 'gold_fraud':{} + # } os.makedirs(os.path.join(save_path,'logs'), exist_ok=True) # Save the model and logs to the date folder model.save_model(os.path.join(save_path,'model')) - true_pred_map['train']['true'] = train_data['Label'].tolist() - true_pred_map['train']['pred'] = model.predict(body=train_data['Body']) - - evaluate_and_log(x=train_data['Body'].tolist(), y_true=true_pred_map['train']['true'], y_pred=true_pred_map['train']['pred'], filename=os.path.join(save_path,'logs/train.log'), experiment=run) - f1_scores['train'] = get_f1_score(y_true=true_pred_map['train']['true'], y_pred=true_pred_map['train']['pred']) - - true_pred_map['sanity']['true'] = sanity_data['Label'].tolist() - true_pred_map['sanity']['pred'] = model.predict(body=sanity_data['Body']) - evaluate_and_log(x=sanity_data['Body'].tolist(), y_true=true_pred_map['sanity']['true'], y_pred=true_pred_map['sanity']['pred'], filename=os.path.join(save_path,'logs/sanity.log'), experiment=run) - f1_scores['sanity'] = get_f1_score(y_true=true_pred_map['sanity']['true'], y_pred=true_pred_map['sanity']['pred']) + train_data['Prediction'] = model.predict(body=train_data['Body']) + evaluate_and_log(x=train_data['Body'].tolist(), y_true=train_data['Label'].tolist(), y_pred=train_data['Prediction'].tolist(), filename=os.path.join(save_path,'logs/train.log'), experiment=run, id = train_data['Mail-ID'].tolist()) + f1_scores['train'] = get_f1_score(y_true=train_data['Label'].tolist(), y_pred=train_data['Prediction'].tolist()) + + sanity_data['Prediction'] = model.predict(body=sanity_data['Body']) + evaluate_and_log(x=sanity_data['Body'].tolist(), y_true=sanity_data['Label'].tolist(), 
y_pred=sanity_data['Prediction'].tolist(), filename=os.path.join(save_path,'logs/sanity.log'), experiment=run, id = sanity_data['Mail-ID'].tolist()) + f1_scores['sanity'] = get_f1_score(y_true=sanity_data['Label'].tolist(), y_pred=sanity_data['Prediction'].tolist()) + + gold_fraud_data['Prediction'] = model.predict(body=gold_fraud_data['Body']) + evaluate_and_log(x=gold_fraud_data['Body'].tolist(), y_true=gold_fraud_data['Label'].tolist(), y_pred=gold_fraud_data['Prediction'].tolist(), filename=os.path.join(save_path,'logs/gold_fraud.log'), experiment=run, id = gold_fraud_data['Mail-ID'].tolist()) + f1_scores['gold_fraud'] = get_f1_score(y_true=gold_fraud_data['Label'].tolist(), y_pred=gold_fraud_data['Prediction'].tolist()) + + #save mismatch data into a csv file + mismatch_data = pd.concat( + [ + train_data[train_data['Prediction'] != train_data['Label']], + sanity_data[sanity_data['Prediction'] != sanity_data['Label']], + gold_fraud_data[gold_fraud_data['Prediction'] != gold_fraud_data['Label']] + ], + axis=0, + ignore_index=True + ) - true_pred_map['gold_fraud']['true'] = gold_fraud_data['Label'].tolist() - true_pred_map['gold_fraud']['pred'] = model.predict(body=gold_fraud_data['Body']) - evaluate_and_log(x=gold_fraud_data['Body'].tolist(), y_true=true_pred_map['gold_fraud']['true'], y_pred=true_pred_map['gold_fraud']['pred'], filename=os.path.join(save_path,'logs/gold_fraud.log'), experiment=run) - f1_scores['gold_fraud'] = get_f1_score(y_true=true_pred_map['gold_fraud']['true'], y_pred=true_pred_map['gold_fraud']['pred']) + mismatch_data.to_csv(os.path.join(save_path,'logs/mismatch_data.csv'), index=False) - return f1_scores, true_pred_map + return f1_scores -def dump_logs_to_wandb(hyper_params, f1_scores, true_pred_map, save_path): +def dump_logs_to_wandb(hyper_params, f1_scores, save_path): # Log the hyperparameters and f1 scores to Weights and Biases all_params = {**hyper_params, **f1_scores} run.config.update(all_params) @@ -231,10 +245,10 @@ def dump_logs_to_wandb(hyper_params, f1_scores, true_pred_map, save_path): model = train_model(train_data, hyper_params) # Test the model - f1_scores, true_pred_map = test_and_save_model(train_data, sanity_data, gold_fraud_data, save_path) + f1_scores = test_and_save_model(train_data, sanity_data, gold_fraud_data, save_path) # Dump the logs to Weights and Biases - dump_logs_to_wandb(hyper_params, f1_scores, true_pred_map, save_path) + dump_logs_to_wandb(hyper_params, f1_scores, save_path) # Close the Weights and Biases run run.finish() diff --git a/tests/test_labeler.py b/tests/test_labeler.py index 7d60826..6524a80 100644 --- a/tests/test_labeler.py +++ b/tests/test_labeler.py @@ -1,13 +1,25 @@ -import os import sys sys.path.append("..") +import os +# import random import pandas as pd import pytest from detector.data_loader import LoadEnronData -from detector.labeler import EnronLabeler +from detector.labeler import EnronLabeler, MismatchLabeler from utils.util_data_loader import sha256_hash +from utils.util_preprocessor import convert_string_to_list + +#read config.ini file +import configparser +config = configparser.ConfigParser() +config.read( + os.path.join( + os.path.dirname(os.path.abspath(__file__)), + '../config.ini' + ) +) @pytest.fixture def dataframe(): @@ -23,7 +35,6 @@ def dataframe(): return data - def test_enron_labeler(dataframe): pipeline = EnronLabeler(dataframe) # assert type(pipeline()) == pd.DataFrame @@ -42,5 +53,36 @@ def test_enron_labeler(dataframe): # assert type(pipeline.get_social_engineering_annotation()) == 
pd.DataFrame
    # assert type(pipeline.get_labels()) == pd.DataFrame
 
+def test_mismatch_labeler(dataframe):
+    idx_of_body = dataframe.columns.tolist().index('Body')
+    dataframe.iloc[0, idx_of_body] = 'Best Regards' + dataframe.iloc[0, idx_of_body]
+    dataframe.iloc[1, idx_of_body] = '>From:' + dataframe.iloc[1, idx_of_body]
+    dataframe.iloc[2, idx_of_body] = dataframe.iloc[2, idx_of_body] + 'Unsubscribe'
+    dataframe.iloc[3, idx_of_body] = dataframe.iloc[3, idx_of_body] + 'update your preferences'
+    dataframe['Label'] = [1] * len(dataframe)
+
+    drop_threshold = config.get('labeler.mismatch','drop_threshold')
+    min_length, max_length = convert_string_to_list(drop_threshold, sep = '&')
+    min_length, max_length = int(min_length), int(max_length)
+
+    pipeline = MismatchLabeler(dataframe)
+    # assert type(pipeline()) == pd.DataFrame
+
+    data = pipeline()
+
+    assert type(data) == pd.DataFrame
+
+    #drop by length function
+    assert len(data[((data['Label'] == 1) & (data['Body'].str.split().str.len() < min_length))]) == 0
+    assert len(data[((data['Label'] == 1) & (data['Body'].str.split().str.len() > max_length))]) == 0
+
+    #drop by pattern function
+    assert len(data[((data['Label'] == 1) & data['Body'].str.contains(r'(?:^|^\s|^>|^ >)(?: |)best regards', case=False, regex=True))]) == 0
+    assert len(data[((data['Label'] == 1) & data['Body'].str.contains(r'(?:^|^\s|^>|^ >)(?: |)from:', case=False, regex=True))]) == 0
+
+    #relabel marketing function
+    assert len(data[((data['Label'] == 1) & data['Body'].str.contains('unsubscribe', case=False, regex=True))]) == 0
+
 if __name__ == "__main__":
     pytest.main()
diff --git a/tests/test_modeler.py b/tests/test_modeler.py
index dabe15f..38033b4 100644
--- a/tests/test_modeler.py
+++ b/tests/test_modeler.py
@@ -11,7 +11,7 @@ import pytest
 
 from utils.util_modeler import evaluate_and_log, get_f1_score, get_classification_report_confusion_matrix, Word2VecEmbedder, TPSampler
-
+from utils.util_data_loader import sha256_hash
 
 @pytest.fixture
 def x():
@@ -25,6 +25,10 @@ def y_true():
 def y_pred():
     return [0, 1]
 
+@pytest.fixture
+def id():
+    return [sha256_hash('Give me your account number quick'), sha256_hash('Give me your account number quick')]
+
 @pytest.fixture
 def mail():
     return """
@@ -38,8 +42,10 @@ def mail():
     """
 
 def test_get_f1_score(y_true, y_pred):
-    f1_score = get_f1_score(y_true, y_pred)
-    assert round(f1_score,3) == 0.667
+    macro_f1_score = get_f1_score(y_true, y_pred, average='macro')
+    weighted_f1_score = get_f1_score(y_true, y_pred, average='weighted')
+    assert round(macro_f1_score,3) == 0.333
+    assert round(weighted_f1_score,3) == 0.667
 
 def test_get_classification_report_confusion_matrix(y_true, y_pred):
     class_report, conf_matrix = get_classification_report_confusion_matrix(y_true, y_pred)
@@ -74,8 +80,8 @@ def test_get_classification_report_confusion_matrix(y_true, y_pred):
 
     assert (conf_matrix == np.array([[0, 0], [1, 1]])).all()
 
-def test_evaluate_and_log(x, y_true, y_pred):
-    evaluate_and_log(x, y_true, y_pred, '/tmp/test.log')
+def test_evaluate_and_log(x, y_true, y_pred, id):
+    evaluate_and_log(x=x, y_true=y_true, y_pred=y_pred, filename='/tmp/test.log', id=id)
     assert os.path.exists('/tmp/test.log')
 
 def test_word2vec_embedding(mail):
diff --git a/utils/util_modeler.py b/utils/util_modeler.py
index 4dd5e22..4b2bdd8 100644
--- a/utils/util_modeler.py
+++ b/utils/util_modeler.py
@@ -15,19 +15,21 @@
 
 def get_f1_score(
     y_true: list[int],
-    y_pred: list[int]
+    y_pred: list[int],
+    average: str = 'weighted'
 ):
     """Returns the F1 score.
 
     Args:
         y_true (list[int]): The true labels.
         y_pred (list[int]): The predicted labels.
+        average (str, optional): The averaging method passed to f1_score. Defaults to 'weighted'.
 
     Returns:
         float: The F1 score.
     """
 
-    return f1_score(y_true, y_pred)
+    return f1_score(y_true, y_pred, average=average)
 
 def get_classification_report_confusion_matrix(
     y_true: list[int],
@@ -50,7 +52,8 @@ def evaluate_and_log(
     y_true: list[int],
     y_pred: list[int],
     filename: str,
-    experiment: wandb = None
+    experiment: wandb = None,
+    id: list[str] = None
 ):
     """Evaluates the model's performance and logs the results.
 
@@ -61,6 +64,9 @@
         filename (str): The name of the log file.
     """
 
+    if id is None:
+        id = [str(i) for i in range(len(x))]
+
     if len(x) != len(y_true) or len(x) != len(y_pred):
         raise ValueError("Input lists (x, y_true, and y_pred) must have the same length.")
 
@@ -76,7 +82,7 @@
 
     for i in mismatched_indices:
         # Format the mismatched example in a code block
-        mismatched_example = f"\nActual: {y_true[i]}\nPredicted: {y_pred[i]}\n\nText: {x[i]}\n\n"
+        mismatched_example = f"\nMail ID: {id[i]}\nActual: {y_true[i]}\nPredicted: {y_pred[i]}\n\nText: {x[i]}\n\n"
         mismatched_examples.append(mismatched_example)
 
     if experiment is not None:
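
For reference, the relabeling heuristics introduced in this patch can be exercised on a small toy frame. The sketch below mirrors the three passes that `MismatchLabeler` applies in order (length-based drop, signature/header drop, marketing relabel); the thresholds and regexes are abbreviated, illustrative stand-ins for the full values read from `config.ini`, and the sample rows are invented.

```python
import pandas as pd

# Invented sample rows standing in for labeled Enron mail (Label 1 = fraud).
df = pd.DataFrame({
    'Body': [
        'urgent wire the funds to this account before friday or the deal is off',
        'thanks',
        'Best Regards, John Doe, Enron Broadband Services',
        'click here to unsubscribe from our mailing list and update your preferences',
    ],
    'Label': [1, 1, 1, 1],
})

# 1) drop_by_length: drop fraud rows whose body falls outside the word-count
#    window (the patch reads "4 & 1500" from [labeler.mismatch] drop_threshold).
min_len, max_len = 4, 1500
word_counts = df['Body'].str.split().str.len()
df = df[~((df['Label'] == 1) & ((word_counts < min_len) | (word_counts > max_len)))]

# 2) drop_by_pattern: drop fraud rows that are little more than signatures or
#    forwarded headers (a small subset of the [labeler.mismatch] patterns).
signature_pattern = r'(?:^|^\s|^>|^ >)(?: |)(?:best regards|from:|message-id:)'
df = df[~((df['Label'] == 1)
          & df['Body'].str.contains(signature_pattern, case=False, regex=True))]

# 3) relabel_marketing_frauds: flip marketing mail from fraud (1) to non-fraud (0)
#    using a slice of the marketing alternation defined in config.ini.
marketing_pattern = r'unsubscribe|mailing list|update your(?: |)(?:communication|)(?: |)preferences'
df.loc[(df['Label'] == 1)
       & df['Body'].str.contains(marketing_pattern, case=False, regex=True), 'Label'] = 0

print(df[['Body', 'Label']])
# The wire-transfer row keeps Label 1, the one-word body and the bare signature
# are dropped, and the unsubscribe mail is relabeled to 0.
```

In the trainer pipelines, `MismatchLabeler(data)()` runs right after `EnronLabeler` and the `Preprocessor`, before the train/sanity/gold-fraud split. Separately, the new `average` argument on `get_f1_score` matters for the two-sample fixture in `tests/test_modeler.py` (`y_true=[1, 1]`, `y_pred=[0, 1]`): class 0 never occurs in `y_true`, so the macro score is pulled down while the weighted score is not, as the scikit-learn check below shows.

```python
from sklearn.metrics import f1_score

y_true, y_pred = [1, 1], [0, 1]
print(round(f1_score(y_true, y_pred, average='macro'), 3))     # 0.333: class 0 has no support in y_true
print(round(f1_score(y_true, y_pred, average='weighted'), 3))  # 0.667: weights follow class support
```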