From 100b49ad17335b7c114b47f0c50c28dba02748ed Mon Sep 17 00:00:00 2001
From: Aiko
Date: Thu, 11 Jan 2024 17:57:36 -0700
Subject: [PATCH] fix: make prediction on every user in DB instead of filtering labelled_by_curator=None

- used get_all_users_df() instead of get_unlabelled_by_curator_df() to obtain the dataframe, because a user previously labelled as ham may later turn into spam.
- improved the management command messages.
- added exception handling for file operations.
- replaced MultinomialNB with XGBoost.
---
 .../commands/curator_spam_detection.py |  26 +-
 django/curator/models.py               |   2 +-
 django/curator/spam.py                 | 240 +++++++++---------
 django/curator/spam_classifiers.py     |  74 ++++--
 django/curator/spam_processor.py       | 115 ++++++---
 django/curator/tests/test_spam.py      | 221 +++++++++++-----
 6 files changed, 419 insertions(+), 259 deletions(-)

diff --git a/django/curator/management/commands/curator_spam_detection.py b/django/curator/management/commands/curator_spam_detection.py
index e3c90903d..6dd70f8cf 100644
--- a/django/curator/management/commands/curator_spam_detection.py
+++ b/django/curator/management/commands/curator_spam_detection.py
@@ -17,12 +17,19 @@ def __init__(self):
     def add_arguments(self, parser):
         parser.add_argument(
-            "--exe",
-            "-e",
+            "--predict",
+            "-p",
             action="store_true",
             default=False,
             help="Print user_ids of spam users and the metrics of the models used to obtain the predictions.",
         )
+        parser.add_argument(
+            "--fit",
+            "-f",
+            action="store_true",
+            default=False,
+            help="Fit all models based on user data labelled by curator.",
+        )
         parser.add_argument(
             "--get_model_metrics",
             "-m",
@@ -35,7 +42,7 @@ def add_arguments(self, parser):
             "-l",
             action="store_true",
             default=False,
-            help="Store manually annotated spam labels to the DB.",
+            help="Store bootstrap spam labels to the DB.",
         )
         parser.add_argument(
             "--fit_usermeta_model", "-fu", action="store_true", default=False
         )
@@ -58,8 +65,11 @@ def add_arguments(self, parser):
             help="Print user_ids of all the evaluated users and spam users using the Text model",
         )

-    def handle_exe(self):
-        self.detection.execute()
+    def handle_predict(self):
+        self.detection.predict()
+
+    def handle_fit(self):
+        self.detection.fit_classifiers()

     def handle_get_model_metrics(self):
         self.detection.get_model_metrics()
@@ -80,8 +90,10 @@ def handle_predict_text_model(self):
         self.detection.predict_text_spam_classifier()

     def handle(self, *args, **options):
-        if options["exe"]:
-            action = "exe"
+        if options["predict"]:
+            action = "predict"
+        elif options["fit"]:
+            action = "fit"
         elif options["get_model_metrics"]:
             action = "get_model_metrics"
         elif options["load_labels"]:
diff --git a/django/curator/models.py b/django/curator/models.py
index 09c430676..b7bc301ca 100644
--- a/django/curator/models.py
+++ b/django/curator/models.py
@@ -375,7 +375,7 @@ class UserSpamStatus(models.Model):
     member_profile = models.OneToOneField(
         MemberProfile, on_delete=models.CASCADE, primary_key=True
     )
-    # FIXME: add help_text
+    # TODO: add help_text
     # None = not processed yet
     # True = bio_classifier considered this user to be spam
     # False = bio_classifier did not consider this user to be spam
diff --git a/django/curator/spam.py b/django/curator/spam.py
index 42e9714d8..c17796e0f 100644
--- a/django/curator/spam.py
+++ b/django/curator/spam.py
@@ -1,5 +1,6 @@
 import json
 import os
+import sys

 from django.conf import settings

@@ -25,9 +26,6 @@ def __init__(self):
         SpamDetection Initialization Steps:
         1. Initializes UserSpamStatusProcessor and the classifier classes
         2. 
If no data has been labelled by a curator, load dataset.csv
-        3. If no model pickle file is found, call fit() of Classifier classes
-           - if all users have None in labelled_by_curator, load to DB by calling Pipeline.load_labels_from_csv()
-           - additionally, if no labels file, throw exception
         """
         self.processor = UserSpamStatusProcessor()
         self.usermeta_classifier = UserMetadataSpamClassifier()
         self.text_classifier = TextSpamClassifier()
         if not self.processor.labelled_by_curator_exist():
             self.processor.load_labels_from_csv()
+
+    def _check_model_instance_files(self):
         # Check whether UserMetadataSpamClassifier model file exists
-        if os.path.exists(self.usermeta_classifier.MODEL_METRICS_FILE_PATH):
-            with open(self.usermeta_classifier.MODEL_METRICS_FILE_PATH) as json_file:
-                self.usermeta_classifier_metrics = json.load(json_file)
-        else:
-            # If model metrics and instance file don't exist, call fit()
-            self.usermeta_classifier_metrics = self.usermeta_classifier.fit()
+        try:
+            json_file = open(self.usermeta_classifier.MODEL_METRICS_FILE_PATH)
+        except OSError:
+            print("Could not open/read file:", self.usermeta_classifier.MODEL_METRICS_FILE_PATH)
+            print("Please run fit_classifiers() to create model instance and metrics files.")
+            sys.exit()
+        with json_file:
+            self.usermeta_classifier_metrics = json.load(json_file)
+
+        # Check whether TextSpamClassifier model file exists
+        try:
+            json_file = open(self.text_classifier.MODEL_METRICS_FILE_PATH)
+        except OSError:
+            print("Could not open/read file:", self.text_classifier.MODEL_METRICS_FILE_PATH)
+            print("Please run fit_classifiers() to create model instance and metrics files.")
+            sys.exit()
+        with json_file:
+            self.text_classifier_metrics = json.load(json_file)

-        # Check whether TextSpamClassifier model file exists
-        if os.path.exists(self.text_classifier.MODEL_METRICS_FILE_PATH):
-            with open(self.text_classifier.MODEL_METRICS_FILE_PATH) as json_file:
-                self.text_classifier_metrics = json.load(json_file)
-        else:
-            # If model metrics and instance file don't exist, call fit()
-            self.text_classifier_metrics = self.text_classifier.fit()

-    def execute(self):
+    def predict(self):
         """
         A default function to obtain the list of spam users and the metrics of the models used to obtain the predictions.

         Execution Steps:
-        1. Check if there exists user data that should be labelled by the classifier models
-        2. If there exists, class predict() of the classifier classes. This function will store the result in DB at the end.
-        3. Print resluts
+        1. Call predict() of the classifier classes. This function will store the result in DB at the end.
+        2. Print results
         3. Return the detection results stored in DB.
         """
         print("Executing spam detection...")
-        # 1. Check DB for unlabelled users (None in all labelled_by_curator, labelled_by_user_classifier, and labelled_by_text_classifier)
-        if len(self.processor.get_unlabelled_users()) != 0:
-            # 2. if there are some unlabelled users, predict
-            print("Models are making predictions...")
-            self.usermeta_classifier.predict()
-            self.text_classifier.predict()
-            print("Successfully made predictions!")
+        self._check_model_instance_files()
+
+        print("Models are making predictions...")
+        self.usermeta_classifier.predict()
+        self.text_classifier.predict()
+        print("Successfully made predictions!")

         result = {
             "spam_users": self.processor.get_spam_users(),
...
         }

         # 2. Print results
-        metadata_model_tested_ids = result["usermeta_spam_classifier"]["test_user_ids"]
-        result["usermeta_spam_classifier"].pop("test_user_ids")
-        metadata_model_metrics = result["usermeta_spam_classifier"]
+        self._print_model_metrics_usermeta(self.usermeta_classifier_metrics)
+        self._print_model_metrics_text(self.text_classifier_metrics)

-        text_model_tested_ids = result["text_spam_classifier"]["test_user_ids"]
-        result["text_spam_classifier"].pop("test_user_ids")
-        text_model_metrics = result["text_spam_classifier"]
-
-        print("IDs of Detected Spam User :\n", result["spam_users"])
-        print("\n------------------------------------\n")
-        print(
-            "UserMetadataSpamClassifier Metrics :\n",
-            metadata_model_metrics,
-        )
-        print(
-            "Metrics was calculated based on users with following IDs ....\n",
-            metadata_model_tested_ids,
-        )
-        print("\n------------------------------------\n")
-        print(
-            "TextSpamClassifier Metrics :\n",
-            text_model_metrics,
-        )
-        print(
-            "Metrics was calculated based on users with following IDs ....\n",
-            text_model_tested_ids,
-        )

         # 3. Return spam user_ids and metrics of the model
         return result

     def get_model_metrics(self):
         """
         load the JSON metrics files as dictionary and return it.

         Execution Steps:
-        1. load the model metrics files from the saving directory
-        2. print the metrics
-        3. return a dictionary of the scores of existing machine learning model instances.
+        1. Load the model metrics files from the saving directory
+        2. Print the metrics
+        3. Return a dictionary of the scores of existing machine learning model instances.

         return: { "usermeta_spam_classifier": {"Accuracy": float,
                                                "Precision": float,
                                                "Recall": float,
                                                "F1": float,
                                                "test_user_ids": list of user_ids },
                   "text_spam_classifier": { same as above }
                 }
         """
+        # 1. Load the model metrics files from the saving directory
         print("Loading model metric files...")
-        # We can assume that model and model metrics files exist after __init__
-        with open(self.usermeta_classifier.MODEL_METRICS_FILE_PATH) as json_file:
-            self.usermeta_classifier_metrics = json.load(json_file)
-
-        with open(self.text_classifier.MODEL_METRICS_FILE_PATH) as json_file:
-            self.text_classifier_metrics = json.load(json_file)
+        self._check_model_instance_files()
         print("Successfully loaded model metrics!")

         result = {
             "usermeta_spam_classifier": self.usermeta_classifier_metrics,
             "text_spam_classifier": self.text_classifier_metrics,
         }

-        # Print the model metrics
-        metadata_model_tested_ids = result["usermeta_spam_classifier"]["test_user_ids"]
-        result["usermeta_spam_classifier"].pop("test_user_ids")
-        metadata_model_metrics = result["usermeta_spam_classifier"]
-
-        text_model_tested_ids = result["text_spam_classifier"]["test_user_ids"]
-        result["text_spam_classifier"].pop("test_user_ids")
-        text_model_metrics = result["text_spam_classifier"]
+        # 2. Print the model metrics
+        self._print_model_metrics_usermeta(self.usermeta_classifier_metrics)
+        self._print_model_metrics_text(self.text_classifier_metrics)

-        print("\n------------------------------------\n")
-        print(
-            "UserMetadataSpamClassifier Metrics :\n",
-            metadata_model_metrics,
-        )
-        print(
-            "Metrics was calculated based on users with following IDs ....\n",
-            metadata_model_tested_ids,
-        )
-        print("\n------------------------------------\n")
-        print(
-            "TextSpamClassifier Metrics :\n",
-            text_model_metrics,
-        )
-        print(
-            "Metrics was calculated based on users with following IDs ....\n",
-            text_model_tested_ids,
-        )
+        # 3. Return a dictionary of the scores of existing machine learning model instances.
         return result
+
+    def fit_classifiers(self):
+        self.fit_usermeta_spam_classifier()
+        self.fit_text_spam_classifier()
+
     def fit_usermeta_spam_classifier(self):
         """
         This function is a wrapper of the fit() function in UserMetadataSpamClassifier.
...
         2. print the model metrics and the user_ids of the users used to calculate the scores.
         """
         model_metrics = self.usermeta_classifier.fit()
-        metadata_model_tested_ids = model_metrics["test_user_ids"]
-        model_metrics.pop("test_user_ids")
-        metadata_model_metrics = model_metrics
-        print("\n------------------------------------\n")
-        print(
-            "UserMetadataSpamClassifier Metrics :\n",
-            metadata_model_metrics,
-        )
-        print(
-            "Metrics was calculated based on users with following IDs ....\n",
-            metadata_model_tested_ids,
-        )
+        self._print_model_metrics_usermeta(model_metrics)

     def fit_text_spam_classifier(self):
         """
         This function is a wrapper of the fit() function in TextSpamClassifier.
...
         2. print the model metrics and the user_ids of the users used to calculate the scores.
         """
         model_metrics = self.text_classifier.fit()
-        text_model_tested_ids = model_metrics["test_user_ids"]
-        model_metrics.pop("test_user_ids")
-        text_model_metrics = model_metrics
-        print("\n------------------------------------\n")
-        print(
-            "TextSpamClassifier Metrics :\n",
-            text_model_metrics,
-        )
-        print(
-            "Metrics was calculated based on users with following IDs ....\n",
-            text_model_tested_ids,
-        )
+        self._print_model_metrics_text(model_metrics)

     def predict_usermeta_spam_classifier(self):
         """
...
                 "Since all users were labelled by curator, classifier prediction was not executed.\n"
             )
         else:
-            print("UserMetadataSpamClassifier evaluated users:\n", evaluated_user_ids)
-            print("Spam Users :\n", spam_user_ids)
+            result_filepath = (
+                "/shared/curator/spam/user_meta_classifier_prediction.json"
+            )
+            result = {
+                "spam_user_ids": spam_user_ids,
+                "evaluated_user_ids": evaluated_user_ids,
+            }
+            self._save_prediction_result(result_filepath, result)
+            print("\n------------------------------------")
+            print(
+                "Number of spam users detected by UserMetadataSpamClassifier: %d / %d"
+                % (len(spam_user_ids), len(evaluated_user_ids))
+            )
+            print(
+                "You can find the lists of detected spam and evaluated user ids in",
+                result_filepath,
+            )

     def predict_text_spam_classifier(self):
         """
...
                 "Since all users were labelled by curator, classifier prediction was not executed.\n"
             )
         else:
-            print("TextSpamClassifier evaluated users:\n", evaluated_user_ids)
-            print("Spam Users :\n", spam_user_ids)
+            result_filepath = "/shared/curator/spam/text_classifier_prediction.json"
+            result = {
+                "spam_user_ids": spam_user_ids,
+                "evaluated_user_ids": evaluated_user_ids,
+            }
+            self._save_prediction_result(result_filepath, result)
+            print("\n------------------------------------")
+            print(
+                "Number of spam users detected by TextSpamClassifier: %d / %d"
+                % (len(spam_user_ids), len(evaluated_user_ids))
+            )
+            print(
+                "You can find the lists of detected spam and evaluated user ids in",
+                result_filepath,
+            )
+
+    def _save_prediction_result(self, filepath, data: dict):
+        with open(filepath, "w") as f:
+            json.dump(data, f, indent=4)
+
+    def _print_model_metrics_usermeta(self, result: dict):
+        metadata_model_tested_ids = result["test_user_ids"]
+        result.pop("test_user_ids")
+        metadata_model_metrics = result
+        print("\n------------------------------------")
+        print(
+            "UserMetadataSpamClassifier Metrics :\n",
+            metadata_model_metrics,
+        )
+        print(
+            "Metrics were calculated based on a test dataset of size",
+            len(metadata_model_tested_ids),
+        )
+        print(
+            "Here are some of the user IDs in the test dataset:\n",
+            metadata_model_tested_ids[:10],
+        )
+        print(
+            "You can find the rest in /shared/curator/spam/user_meta_classifier_metrics.json"
+        )
+
+    def _print_model_metrics_text(self, result: dict):
+        text_model_tested_ids = result["test_user_ids"]
+        result.pop("test_user_ids")
+        text_model_metrics = result
+        print("\n------------------------------------")
+        print(
+            "TextSpamClassifier Metrics :\n",
+            text_model_metrics,
+        )
+        print(
+            "Metrics were calculated based on a test dataset of size",
+            len(text_model_tested_ids),
+        )
+        print(
+            "Here are some of the user IDs in the test dataset:\n",
+            text_model_tested_ids[:10],
+        )
+        print(
+            "You can find the rest in /shared/curator/spam/text_classifier_metrics.json"
+        )
diff --git a/django/curator/spam_classifiers.py b/django/curator/spam_classifiers.py
index f88c36f0f..21c4b6bb8 100644
--- a/django/curator/spam_classifiers.py
+++ b/django/curator/spam_classifiers.py
@@ -11,7 +11,7 @@
 from typing import List
 from sklearn.feature_extraction.text import CountVectorizer
 from sklearn.preprocessing import FunctionTransformer
-from sklearn.naive_bayes import MultinomialNB
+# from sklearn.naive_bayes import MultinomialNB
 from sklearn.pipeline import Pipeline
 from sklearn.model_selection import train_test_split
 from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
@@ -108,9 +108,11 @@ def load_model(self, file_path: str):
         Params : file_path ... Path to the saved model instance
         Returns : Loaded model instance
         """
-        if not os.path.isfile(file_path):
-            self.fit()
-        with open(file_path, "rb") as file:
+        try:
+            file = open(file_path, "rb")
+        except OSError:
+            print("Could not open/read file:", file_path)
+            raise
+        with file:
             return pickle.load(file)

     def save_model(self, model, file_path: str):
@@ -175,28 +177,39 @@ def __init__(self):
         self.MODEL_METRICS_FILE_PATH = SPAM_DIR_PATH / "text_classifier_metrics.json"

     def fit(self):
+        """
+        return:
+        - model_metrics ... Dictionary of the following format containing model scores and user_ids
+                            that were used for the validation.
+            { "Accuracy": accuracy,
+              "Precision": precision,
+              "Recall": recall,
+              "F1": f1,
+              "test_user_ids": test_user_ids,}
+        If training cannot be conducted, an empty dictionary is returned (model_metrics = {}).
+        """
         print("Training TextSpamClassifier...")
-        model_metrics = None
+        model_metrics = {}
         model = Pipeline(
             [
                 ("cleaner", FunctionTransformer(self.preprocess)),
                 ("countvectorizer", CountVectorizer(lowercase=True)),
-                ("classifier", MultinomialNB()),
+                ("classifier", xgb.XGBClassifier()),
             ]
         )
-        all_df = self.processor.get_all_users_df()
-
-        if all_df.empty:
-            return model_metrics  # = None
+        # Use labelled samples as training data
+        df = self.processor.get_labelled_by_curator_df()
+        if df.empty:
+            return model_metrics

-        data_x, data_y = self.concat_pd(all_df)
+        data_x, data_y = self.concat_pd(df)
         if data_x.empty:
-            return model_metrics  # = None
+            return model_metrics

         if len(data_y.value_counts()) != 2:
             print("Cannot create a binary classifier!!")
-            return model_metrics  # = None
+            return model_metrics

         (
             train_x,
@@ -218,10 +231,15 @@ def fit(self):
         return model_metrics

     def predict(self):
+        """
+        return:
+        - evaluated_user_ids ... List of user_ids of all users the model evaluated
+        - spam_user_ids ... List of user_ids of users predicted to be spam
+ """ print("TextSpamClassifier is making predictions...") evaluated_user_ids = [] spam_user_ids = [] - df = self.processor.get_unlabelled_by_curator_df() + df = self.processor.get_all_users_df() if df.empty: # no-op if no data found return evaluated_user_ids, spam_user_ids @@ -241,10 +259,12 @@ def predict(self): df = pd.DataFrame(result).replace(np.nan, None) self.processor.update_predictions(df, isTextClassifier=True) - evaluated_user_ids = df["user_id"].values.flatten() - spam_user_ids = df["user_id"][ - df["labelled_by_text_classifier"] == 1 - ].values.flatten() + evaluated_user_ids = df["user_id"].values.flatten().tolist() + spam_user_ids = ( + df["user_id"][df["labelled_by_text_classifier"] == 1] + .values.flatten() + .tolist() + ) print("Successfully made predictions!") return evaluated_user_ids, spam_user_ids @@ -293,8 +313,8 @@ def fit(self): ("classifier", xgb.XGBClassifier()), ] ) - # obtain df from pipleline - df = self.processor.get_all_users_df() + # Use labelled samples as training data + df = self.processor.get_labelled_by_curator_df() if df.empty: return model_metrics # None if no untrained data found @@ -327,7 +347,7 @@ def predict(self): print("UserMetadataSpamClassifier is making predictions...") evaluated_user_ids = [] spam_user_ids = [] - df = self.processor.get_unlabelled_by_curator_df() + df = self.processor.get_all_users_df() if df.empty: # no-op if no data found return evaluated_user_ids, spam_user_ids @@ -346,10 +366,12 @@ def predict(self): df = pd.DataFrame(result).replace(np.nan, None) self.processor.update_predictions(df, isTextClassifier=False) - evaluated_user_ids = df["user_id"].values.flatten() - spam_user_ids = df["user_id"][ - df["labelled_by_user_classifier"] == 1 - ].values.flatten() + evaluated_user_ids = df["user_id"].values.flatten().tolist() + spam_user_ids = ( + df["user_id"][df["labelled_by_user_classifier"] == 1] + .values.flatten() + .tolist() + ) print("Successfully made predictions!") return evaluated_user_ids, spam_user_ids @@ -402,7 +424,7 @@ def __input_df_transformation(self, df: pd.DataFrame): ].fillna( "" ) - + df.loc[:, ["user_id", "labelled_by_curator"]] = df[ ["user_id", "labelled_by_curator"] ].fillna(0) diff --git a/django/curator/spam_processor.py b/django/curator/spam_processor.py index b5089a9ca..edf7aaf64 100644 --- a/django/curator/spam_processor.py +++ b/django/curator/spam_processor.py @@ -1,5 +1,6 @@ import pandas as pd import re +import sys from django.db.models import Q from curator.models import UserSpamStatus from django.conf import settings @@ -84,7 +85,7 @@ def __convert_df_markup_to_string(self, df): for col in df.columns: if col in self.type_int_bool_column_names: df[col] = df[col].fillna(0).astype(int) - # It is safe to set Nan as 0 because: + # It is safe to set None as 0 because: # for training, all values with labelled_by_curator=None are exclueded before passed to this function. # for prediction, the labelled_by_curator column is not used during prediction process. 
             else:
@@ -98,60 +99,76 @@ def get_all_users_df(self):
         return self.__rename_columns(
             self.__convert_df_markup_to_string(
                 pd.DataFrame(
                     list(
-                        UserSpamStatus.objects.all()
-                        .exclude(member_profile__user_id=None, labelled_by_curator=None)
-                        .values(*self.db_column_names)
+                        UserSpamStatus.objects.exclude(
+                            member_profile__user_id=None
+                        ).values(*self.db_column_names)
                     )
                 )
             )
         )

-    def get_unlabelled_by_curator_df(self):
-        # return : DataFrame of user data that haven't been labeled by curator
+    def get_labelled_by_curator_df(self):
         return self.__rename_columns(
             self.__convert_df_markup_to_string(
                 pd.DataFrame(
                     list(
-                        UserSpamStatus.objects.all()
-                        .exclude(member_profile__user_id=None)
-                        .filter(labelled_by_curator=None)
-                        .values(*self.db_column_names)
+                        UserSpamStatus.objects.exclude(
+                            Q(member_profile__user_id=None)
+                            | Q(labelled_by_curator=None)
+                        ).values(*self.db_column_names)
                     )
                 )
            )
        )

-    def get_untrained_df(self):
-        # return : DataFrame of user data that haven't been used for train previously
-        return self.__rename_columns(
-            self.__convert_df_markup_to_string(
-                pd.DataFrame(
-                    list(
-                        UserSpamStatus.objects.all()
-                        .exclude(member_profile__user_id=None, labelled_by_curator=None)
-                        .filter(is_training_data=False)
-                        .values(*self.db_column_names)
-                    )
-                )
-            )
-        )
+    # Currently unused
+    # def get_unlabelled_by_curator_df(self):
+    #     # return : DataFrame of user data that haven't been labeled by curator
+    #     return self.__rename_columns(
+    #         self.__convert_df_markup_to_string(
+    #             pd.DataFrame(
+    #                 list(
+    #                     UserSpamStatus.objects
+    #                     .exclude(member_profile__user_id=None)
+    #                     .filter(labelled_by_curator=None)
+    #                     .values(*self.db_column_names)
+    #                 )
+    #             )
+    #         )
+    #     )

-    def get_unlabelled_users(self):
-        unlabelled_users = list(
-            UserSpamStatus.objects.filter(
-                Q(labelled_by_curator=None)
-                & Q(labelled_by_text_classifier=None)
-                & Q(labelled_by_user_classifier=None)
-            )
-        )
-        return unlabelled_users
+    # Currently unused
+    # def get_untrained_df(self):
+    #     # return : DataFrame of user data that haven't been used for train previously
+    #     return self.__rename_columns(
+    #         self.__convert_df_markup_to_string(
+    #             pd.DataFrame(
+    #                 list(
+    #                     UserSpamStatus.objects
+    #                     .exclude(Q(member_profile__user_id=None) | Q(labelled_by_curator=None))
+    #                     .filter(is_training_data=False)
+    #                     .values(*self.db_column_names)
+    #                 )
+    #             )
+    #         )
+    #     )

+    # def get_unlabelled_users(self):
+    #     unlabelled_users = list(
+    #         UserSpamStatus.objects.filter(
+    #             Q(labelled_by_curator=None)
+    #             & Q(labelled_by_text_classifier=None)
+    #             & Q(labelled_by_user_classifier=None)
+    #         )
+    #     )
+    #     return unlabelled_users
+
-    # FIXME: tune confidence threshold later
+    # TODO: tune confidence threshold later
     def get_spam_users(self, confidence_threshold=0.5):
         """
-        This functions will first filter out the users with labelled_by_curator==True,
-        but the ones with None, only get users with labelled_by_user_classifier == True
-        or labelled_by_text_classifier == True with a specific confidence level.
+        This function first selects the users with labelled_by_curator==True;
+        among the ones with labelled_by_curator==None, it only selects users with
+        labelled_by_user_classifier==True or labelled_by_text_classifier==True
+        at a specific confidence level.
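+        Params : confidence_threshold ... confidence level required for classifier-labelled users to count as spam (default 0.5)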
""" spam_users = list( UserSpamStatus.objects.filter( @@ -191,17 +208,37 @@ def load_labels_from_csv(self, filepath=DATASET_FILE_PATH): return : list of user_ids which labelled_by_curator was updated """ print("Loading labels CSV...") - label_df = pd.read_csv(filepath) # TODO add exception + try: + label_df = pd.read_csv(filepath) + except Exception: + print("Could not open/read file:", filepath) + print("Please locate a dataset with labels at the path of ./curator/spam_dataset.csv") + sys.exit() + + # Use when batch updating of labelled_by_curator is ready + # spam_user_ids = label_df[label_df['is_spam']==1]['user_id'].values + # ham_user_ids = label_df[label_df['is_spam']==0]['user_id'].values + + # is_spam = True + # flag = self.update_labelled_by_curator(spam_user_ids, is_spam) + # if flag == 1: + # user_id_list.append(spam_user_ids) + + # is_spam = False + # flag = self.update_labelled_by_curator(ham_user_ids, is_spam) + # if flag == 1: + # user_id_list.append(ham_user_ids) + user_id_list = [] for idx, row in label_df.iterrows(): flag = self.update_labelled_by_curator(row["user_id"], bool(row["is_spam"])) if flag == 1: user_id_list.append(row["user_id"]) print("Successfully loaded labels from CSV!") - print("List of user ids of which label was loaded :\n", user_id_list) + print("Number of user ids whose label was loaded: ", len(user_id_list)) return user_id_list - def update_labelled_by_curator(self, user_id, label): + def update_labelled_by_curator(self, user_id, label): #TODO update with batch return UserSpamStatus.objects.filter(member_profile__user_id=user_id).update( labelled_by_curator=label ) # return 0(fail) or 1(success) diff --git a/django/curator/tests/test_spam.py b/django/curator/tests/test_spam.py index d722aefde..de791053d 100644 --- a/django/curator/tests/test_spam.py +++ b/django/curator/tests/test_spam.py @@ -43,6 +43,16 @@ def delete_new_users(self, user_ids): user = User.objects.filter(id=user_id) user.delete() + # def update_labels(self, user_ids): + # middle_idx = len(user_ids)/2 - 1 + # label = True + # self.processor.update_labelled_by_curator(user_ids[0:middle_idx], label) + # label = False + # self.processor.update_labelled_by_curator(user_ids[middle_idx:-1], label) + + # def delete_labels(self, user_ids): + # self.processor.update_labelled_by_curator(user_ids, None) + def update_labels(self, user_ids): for user_id in user_ids: label = random.randint(0, 1) @@ -52,7 +62,7 @@ def delete_labels(self, user_ids): for user_id in user_ids: self.processor.update_labelled_by_curator(user_id, None) - def get_existing_users(self): + def get_existing_user_ids(self): user_ids = list( UserSpamStatus.objects.all() .exclude(member_profile__user_id=None) @@ -79,93 +89,75 @@ def test_load_labels_from_csv(self): self.assertTrue(self.processor.labelled_by_curator_exist()) # ============== Tests for UserSpamStatusProcessor.get_unlabelled_by_curator_df() ============== - def test_get_unlabelled_by_curator_df__new_users_added(self): + def test_get_labelled_by_curator_df__new_users_added(self): """ case1 : new user data added stub data/requirements ... new user data with labelled_by_curator==None - assertion ... df with the specific columns with the correct user_ids + assertion ... 
         """
-        existing_users = self.user_ids
-        user_ids = self.create_new_users()  # default labelled_by_curator==None
-        user_size = len(user_ids) + len(existing_users)
+        existing_user_ids = self.user_ids
+        self.update_labels(
+            existing_user_ids
+        )  # make sure all the existing users have labels
+        existing_df = self.processor.get_labelled_by_curator_df()

-        df = self.processor.get_unlabelled_by_curator_df()
-
-        self.assertIsInstance(df, pd.DataFrame)
-        self.assertTrue(
-            set(self.processor.column_names).issubset(set(df.columns.unique()))
-        )
-        self.assertEqual(user_size, len(df["user_id"].values))
-        self.assertTrue(set(user_ids).issubset(set(df["user_id"].values)))
-        self.delete_new_users(user_ids)
+        new_user_ids = self.create_new_users()  # default labelled_by_curator==None

-    def test_get_unlabelled_by_curator_df__no_users_added(self):
-        """
-        case2 : no new user data added
-        stub data/requirements ... all data in DB with correct user_id has values
-        with labelled_by_curator!=None
-        assertion ... empty df
-        """
-        existing_users = self.user_ids
-        self.update_labels(existing_users)  # simulate a curator labelling the users
+        user_size = len(new_user_ids) + len(existing_user_ids)
+        new_df = self.processor.get_labelled_by_curator_df()

-        df = self.processor.get_unlabelled_by_curator_df()
-        self.assertEqual(len(df), 0)
+        self.assertEqual(
+            len(existing_df["user_id"].values), len(new_df["user_id"].values)
+        )
+        self.assertGreater(user_size, len(existing_df["user_id"].values))
+        self.assertFalse(set(new_user_ids).issubset(set(new_df["user_id"].values)))
+        self.delete_new_users(new_user_ids)

-        self.delete_labels(existing_users)
+    def test_get_labelled_by_curator_df__label_added(self):
+        """
+        case2 : new user data added and labelled
+        stub data/requirements ... new user data with labelled_by_curator==True/False
+        assertion ... df with the specific columns with the additional user data that were labelled.
+        """
+        existing_user_ids = self.user_ids
+        self.update_labels(
+            existing_user_ids
+        )  # make sure all the existing users have labels

-    # ======================== Tests for UserSpamStatusProcessor.get_untrained_df() ========================
-    def test_get_untrained_df__dataset_loaded(self):
+        new_user_ids = self.create_new_users()  # default labelled_by_curator==None
+        user_size = len(new_user_ids) + len(existing_user_ids)
-        """
-        case1 : Just uploaded spam_dataset.csv
-        stub data/requirements ... just called load_labels_from_csv().
-        assertion ... df with the specific columns with the correct user_ids
-        """
-        user_ids = self.processor.load_labels_from_csv()
+        self.update_labels(new_user_ids)
+        df = self.processor.get_labelled_by_curator_df()

-        df = self.processor.get_untrained_df()
-        self.assertIsInstance(df, pd.DataFrame)
-        self.assertTrue(
-            set(self.processor.column_names).issubset(set(df.columns.unique()))
-        )
-        # self.assertListEqual(user_ids, list(df["user_id"].values))
-        self.assertTrue(set(user_ids) == set(list(df["user_id"].values)))
-        self.delete_labels(user_ids)
+        self.assertIsInstance(df, pd.DataFrame)
+        self.assertTrue(
+            set(self.processor.column_names).issubset(set(df.columns.unique()))
+        )
+        self.assertEqual(user_size, len(df["user_id"].values))
+        self.assertTrue(set(new_user_ids).issubset(set(df["user_id"].values)))
+        self.delete_new_users(new_user_ids)

-    def test_get_untrained_df__labels_updated(self):
+    def test_get_labelled_by_curator_df__no_users_added(self):
         """
-        case2 : labelled_by_curator updated
-        stub data/requirements ... new labells by curator (users with abelled_by_curator!=None and is_training_data=False)
-        assertion ... 
df with the specific columns with the correct user_ids - """ - existing_users = self.user_ids - self.update_labels(existing_users) # update labels of exisiting users + self.update_labels(new_user_ids) + df = self.processor.get_labelled_by_curator_df() - df = self.processor.get_untrained_df() self.assertIsInstance(df, pd.DataFrame) self.assertTrue( set(self.processor.column_names).issubset(set(df.columns.unique())) ) - # self.assertListEqual(existing_users, list(df["user_id"].values)) - self.assertTrue(set(existing_users) == set(list(df["user_id"].values))) - self.delete_labels(existing_users) + self.assertEqual(user_size, len(df["user_id"].values)) + self.assertTrue(set(new_user_ids).issubset(set(df["user_id"].values))) + self.delete_new_users(new_user_ids) - def test_get_untrained_df__no_labels_updated(self): + def test_get_labelled_by_curator_df__no_users_added(self): """ - case3 : no label updates - stub data/requirements ... all data in DB with labelled_by_curator!=None has is_training_data=True - assertion ... empty df + case3 : no new user data added + stub data/requirements ... all data in DB with correct user_id has values + with labelled_by_curator==True/False + assertion ... no change in df """ - existing_users = self.user_ids - self.update_labels(existing_users) # update labels of exisiting users - self.mark_as_training_data(existing_users) # mark the user as training data + existing_user_ids = self.user_ids + self.update_labels( + existing_user_ids + ) # make sure all the existing users have labels + existing_df = self.processor.get_labelled_by_curator_df() - df = self.processor.get_untrained_df() - self.assertEqual(len(df), 0) + self.update_labels(existing_user_ids) # making sure all users are labelled - self.delete_labels(existing_users) - self.unmark_as_training_data(existing_users) + new_df = self.processor.get_labelled_by_curator_df() + self.assertEqual( + len(existing_df["user_id"].values), len(new_df["user_id"].values) + ) # ================================ Tests for UserMetadataSpamClassifier ================================ def test_user_meta_classifier_fit(self): @@ -205,8 +197,8 @@ def test_user_meta_classifier_prediction(self): self.processor.load_labels_from_csv() self.user_meta_classifier.fit() - existing_users = self.user_ids - self.update_labels(existing_users) + existing_user_ids = self.user_ids + self.update_labels(existing_user_ids) new_user_ids = self.create_new_users() # default labelled_by_curator==None labelled_user_ids, _ = self.user_meta_classifier.predict() @@ -238,12 +230,101 @@ def test_text_classifier_prediction(self): # self.add_texts_to_users(user_ids) self.text_classifier.fit() - existing_users = self.user_ids - self.update_labels(existing_users) + existing_user_ids = self.user_ids + self.update_labels(existing_user_ids) new_user_ids = self.create_new_users() # default labelled_by_curator==None - self.add_texts_to_users(existing_users) + self.add_texts_to_users(existing_user_ids) self.add_texts_to_users(new_user_ids) labelled_user_ids, _ = self.text_classifier.predict() self.assertTrue(self.processor.all_have_labels()) self.assertTrue(bool(set(new_user_ids) & set(labelled_user_ids))) + + # # ============== Tests for UserSpamStatusProcessor.get_unlabelled_by_curator_df() ============== + # def test_get_unlabelled_by_curator_df__new_users_added(self): + # """ + # case1 : new user data added + # stub data/requirements ... new user data with labelled_by_curator==None + # assertion ... 
df with the specific columns with the correct user_ids
    # """

    # existing_user_ids = self.user_ids
    # user_ids = self.create_new_users() # default labelled_by_curator==None
    # user_size = len(user_ids) + len(existing_user_ids)

    # df = self.processor.get_unlabelled_by_curator_df()

    # self.assertIsInstance(df, pd.DataFrame)
    # self.assertTrue(
    #     set(self.processor.column_names).issubset(set(df.columns.unique()))
    # )
    # self.assertEqual(user_size, len(df["user_id"].values))
    # self.assertTrue(set(user_ids).issubset(set(df["user_id"].values)))
    # self.delete_new_users(user_ids)

    # def test_get_unlabelled_by_curator_df__no_users_added(self):
    #     """
    #     case2 : no new user data added
    #     stub data/requirements ... all data in DB with correct user_id has values
    #     with labelled_by_curator!=None
    #     assertion ... empty df
    #     """
    #     existing_user_ids = self.user_ids
    #     self.update_labels(existing_user_ids) # simulate a curator labelling the users

    #     df = self.processor.get_unlabelled_by_curator_df()
    #     self.assertEqual(len(df), 0)

    #     self.delete_labels(existing_user_ids)

    # ======================== Tests for UserSpamStatusProcessor.get_untrained_df() ========================
    # def test_get_untrained_df__dataset_loaded(self):
    #     """
    #     case1 : Just uploaded spam_dataset.csv
    #     stub data/requirements ... just called load_labels_from_csv().
    #     assertion ... df with the specific columns with the correct user_ids
    #     """
    #     user_ids = self.processor.load_labels_from_csv()

    #     df = self.processor.get_untrained_df()
    #     self.assertIsInstance(df, pd.DataFrame)
    #     self.assertTrue(
    #         set(self.processor.column_names).issubset(set(df.columns.unique()))
    #     )
    #     # self.assertListEqual(user_ids, list(df["user_id"].values))
    #     self.assertTrue(set(user_ids) == set(list(df["user_id"].values)))
    #     self.delete_labels(user_ids)

    # def test_get_untrained_df__labels_updated(self):
    #     """
    #     case2 : labelled_by_curator updated
    #     stub data/requirements ... new labels by curator (users with labelled_by_curator!=None and is_training_data=False)
    #     assertion ... df with the specific columns with the correct user_ids
    #     """
    #     existing_user_ids = self.user_ids
    #     self.update_labels(existing_user_ids) # update labels of existing users

    #     df = self.processor.get_untrained_df()
    #     self.assertIsInstance(df, pd.DataFrame)
    #     self.assertTrue(
    #         set(self.processor.column_names).issubset(set(df.columns.unique()))
    #     )
    #     # self.assertListEqual(existing_user_ids, list(df["user_id"].values))
    #     self.assertTrue(set(existing_user_ids) == set(list(df["user_id"].values)))
    #     self.delete_labels(existing_user_ids)

    # def test_get_untrained_df__no_labels_updated(self):
    #     """
    #     case3 : no label updates
    #     stub data/requirements ... all data in DB with labelled_by_curator!=None has is_training_data=True
    #     assertion ... empty df
    #     """
    #     existing_user_ids = self.user_ids
    #     self.update_labels(existing_user_ids) # update labels of existing users
    #     self.mark_as_training_data(existing_user_ids) # mark the user as training data

    #     df = self.processor.get_untrained_df()
    #     self.assertEqual(len(df), 0)

    #     self.delete_labels(existing_user_ids)
    #     self.unmark_as_training_data(existing_user_ids)
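
Reviewer note: with the renamed flags, the workflow runs through the usual manage.py entry point (the command and flag names come from the diff above; the invocation itself is an assumption about the deployment setup):

    python manage.py curator_spam_detection --fit      # train both classifiers on curator labels
    python manage.py curator_spam_detection --predict  # classify every user currently in the DB

And a minimal, self-contained sketch of the pipeline shape this patch switches to (MultinomialNB -> xgb.XGBClassifier). The toy texts and labels are hypothetical stand-ins for the dataframe that get_labelled_by_curator_df() provides, and the real pipeline has an extra FunctionTransformer cleaning step omitted here:

    import xgboost as xgb
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.metrics import accuracy_score, f1_score
    from sklearn.model_selection import train_test_split
    from sklearn.pipeline import Pipeline

    # hypothetical stand-in data; 1 = spam, 0 = ham (XGBClassifier expects 0/1 integer labels)
    toy_texts = [
        "buy cheap watches now",
        "click here for free money",
        "agent-based model of land use",
        "replication data for our ABM paper",
    ]
    toy_labels = [1, 1, 0, 0]

    model = Pipeline(
        [
            # CountVectorizer emits a scipy sparse matrix, which XGBClassifier
            # accepts directly, so it is a drop-in replacement for MultinomialNB
            ("countvectorizer", CountVectorizer(lowercase=True)),
            ("classifier", xgb.XGBClassifier()),
        ]
    )

    train_x, test_x, train_y, test_y = train_test_split(
        toy_texts, toy_labels, test_size=0.5, stratify=toy_labels, random_state=0
    )
    model.fit(train_x, train_y)
    pred_y = model.predict(test_x)
    print({"Accuracy": accuracy_score(test_y, pred_y), "F1": f1_score(test_y, pred_y)})

Unlike MultinomialNB, XGBClassifier does not assume non-negative count features, so the same estimator also handles the mixed-scale metadata features used by UserMetadataSpamClassifier.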