From 100b49ad17335b7c114b47f0c50c28dba02748ed Mon Sep 17 00:00:00 2001
From: Aiko
Date: Thu, 11 Jan 2024 17:57:36 -0700
Subject: [PATCH] fix: make prediction on every user in DB instead of filtering labelled_by_curator=None

- used get_all_users_df() instead of get_unlabelled_by_curator_df() to obtain the dataframe, because a user previously labelled as ham may later turn into spam.
- improved the management command messages.
- added exception handling for file operations.
- replaced MultinomialNB with XGBoost.
---
 .../commands/curator_spam_detection.py |  26 +-
 django/curator/models.py               |   2 +-
 django/curator/spam.py                 | 240 +++++++++---------
 django/curator/spam_classifiers.py     |  74 ++++--
 django/curator/spam_processor.py       | 115 ++++++---
 django/curator/tests/test_spam.py      | 221 +++++++++++-----
 6 files changed, 419 insertions(+), 259 deletions(-)

diff --git a/django/curator/management/commands/curator_spam_detection.py b/django/curator/management/commands/curator_spam_detection.py
index e3c90903d..6dd70f8cf 100644
--- a/django/curator/management/commands/curator_spam_detection.py
+++ b/django/curator/management/commands/curator_spam_detection.py
@@ -17,12 +17,19 @@ def __init__(self):
     def add_arguments(self, parser):
         parser.add_argument(
-            "--exe",
-            "-e",
+            "--predict",
+            "-p",
             action="store_true",
             default=False,
             help="Print user_ids of spam users and the metrics of the models used to obtain the predictions.",
         )
+        parser.add_argument(
+            "--fit",
+            "-f",
+            action="store_true",
+            default=False,
+            help="Fit all models based on user data labelled by curator.",
+        )
         parser.add_argument(
             "--get_model_metrics",
             "-m",
@@ -35,7 +42,7 @@ def add_arguments(self, parser):
             "-l",
             action="store_true",
             default=False,
-            help="Store manually annotated spam labels to the DB.",
+            help="Store bootstrap spam labels to the DB.",
         )
         parser.add_argument(
             "--fit_usermeta_model", "-fu", action="store_true", default=False
         )
@@ -58,8 +65,11 @@ def add_arguments(self, parser):
             help="Print user_ids of all the evaluated users and spam users using the Text model",
         )

-    def handle_exe(self):
-        self.detection.execute()
+    def handle_predict(self):
+        self.detection.predict()
+
+    def handle_fit(self):
+        self.detection.fit_classifiers()

     def handle_get_model_metrics(self):
         self.detection.get_model_metrics()
@@ -80,8 +90,10 @@ def handle_predict_text_model(self):
         self.detection.predict_text_spam_classifier()

     def handle(self, *args, **options):
-        if options["exe"]:
-            action = "exe"
+        if options["predict"]:
+            action = "predict"
+        elif options["fit"]:
+            action = "fit"
         elif options["get_model_metrics"]:
             action = "get_model_metrics"
         elif options["load_labels"]:
diff --git a/django/curator/models.py b/django/curator/models.py
index 09c430676..b7bc301ca 100644
--- a/django/curator/models.py
+++ b/django/curator/models.py
@@ -375,7 +375,7 @@ class UserSpamStatus(models.Model):
     member_profile = models.OneToOneField(
         MemberProfile, on_delete=models.CASCADE, primary_key=True
     )
-    # FIXME: add help_text
+    # TODO: add help_text
     # None = not processed yet
     # True = bio_classifier considered this user to be spam
     # False = bio_classifier did not consider this user to be spam
diff --git a/django/curator/spam.py b/django/curator/spam.py
index 42e9714d8..c17796e0f 100644
--- a/django/curator/spam.py
+++ b/django/curator/spam.py
@@ -1,5 +1,6 @@
 import json
 import os
+import sys

 from django.conf import settings

@@ -25,9 +26,6 @@ def __init__(self):
         SpamDetection Initialization Steps:
         1. Initializes UserSpamStatusProcessor and the classifier classes
         2. 
If no data has been labelled by a curator, load dataset.csv
-        3. If no model pickle file is found, call fit() of Classifier classes
-           - if all users have None in labelled_by_curator, load to DB by calling Pipeline.load_labels_from_csv()
-           - additionally, if no labels file, throw exception
         """
         self.processor = UserSpamStatusProcessor()
         self.usermeta_classifier = UserMetadataSpamClassifier()
         self.text_classifier = TextSpamClassifier()
         if not self.processor.labelled_by_curator_exist():
             self.processor.load_labels_from_csv()
+
+    def _check_model_instance_files(self):
         # Check whether UserMetadataSpamClassifier model file exists
-        if os.path.exists(self.usermeta_classifier.MODEL_METRICS_FILE_PATH):
-            with open(self.usermeta_classifier.MODEL_METRICS_FILE_PATH) as json_file:
-                self.usermeta_classifier_metrics = json.load(json_file)
-        else:
-            # If model metrics and instance file don't exist, call fit()
-            self.usermeta_classifier_metrics = self.usermeta_classifier.fit()
+        try:
+            json_file = open(self.usermeta_classifier.MODEL_METRICS_FILE_PATH)
+        except OSError:
+            print("Could not open/read file:", self.usermeta_classifier.MODEL_METRICS_FILE_PATH)
+            print("Please run fit_classifiers() to create model instance and metrics files.")
+            sys.exit()
+        with json_file:
+            self.usermeta_classifier_metrics = json.load(json_file)
+
+        # Check whether TextSpamClassifier model file exists
+        try:
+            json_file = open(self.text_classifier.MODEL_METRICS_FILE_PATH)
+        except OSError:
+            print("Could not open/read file:", self.text_classifier.MODEL_METRICS_FILE_PATH)
+            print("Please run fit_classifiers() to create model instance and metrics files.")
+            sys.exit()
+        with json_file:
+            self.text_classifier_metrics = json.load(json_file)

-        # Check whether TextSpamClassifier model file exists
-        if os.path.exists(self.text_classifier.MODEL_METRICS_FILE_PATH):
-            with open(self.text_classifier.MODEL_METRICS_FILE_PATH) as json_file:
-                self.text_classifier_metrics = json.load(json_file)
-        else:
-            # If model metrics and instance file don't exist, call fit()
-            self.text_classifier_metrics = self.text_classifier.fit()

-    def execute(self):
+    def predict(self):
         """
         A default function to obtain the list of spam users and the metrics of the models used to obtain the predictions.

         Execution Steps:
-        1. Check if there exists user data that should be labelled by the classifier models
-        2. If there exists, class predict() of the classifier classes. This function will store the result in DB at the end.
-        3. Print resluts
+        1. Call predict() of the classifier classes. This function will store the result in DB at the end.
+        2. Print results
         3. Return the detection results stored in DB.
         """
         print("Executing spam detection...")
-        # 1. Check DB for unlabelled users (None in all labelled_by_curator, labelled_by_user_classifier, and labelled_by_text_classifier)
-        if len(self.processor.get_unlabelled_users()) != 0:
-            # 2. if there are some unlabelled users, predict
-            print("Models are making predictions...")
-            self.usermeta_classifier.predict()
-            self.text_classifier.predict()
-            print("Successfully made predictions!")
+        self._check_model_instance_files()
+
+        print("Models are making predictions...")
+        self.usermeta_classifier.predict()
+        self.text_classifier.predict()
+        print("Successfully made predictions!")

         result = {
             "spam_users": self.processor.get_spam_users(),
...
         }

         # 2. Print results
-        metadata_model_tested_ids = result["usermeta_spam_classifier"]["test_user_ids"]
-        result["usermeta_spam_classifier"].pop("test_user_ids")
-        metadata_model_metrics = result["usermeta_spam_classifier"]
+        self._print_model_metrics_usermeta(self.usermeta_classifier_metrics)
+        self._print_model_metrics_text(self.text_classifier_metrics)

-        text_model_tested_ids = result["text_spam_classifier"]["test_user_ids"]
-        result["text_spam_classifier"].pop("test_user_ids")
-        text_model_metrics = result["text_spam_classifier"]
-
-        print("IDs of Detected Spam User :\n", result["spam_users"])
-        print("\n------------------------------------\n")
-        print(
-            "UserMetadataSpamClassifier Metrics :\n",
-            metadata_model_metrics,
-        )
-        print(
-            "Metrics was calculated based on users with following IDs ....\n",
-            metadata_model_tested_ids,
-        )
-        print("\n------------------------------------\n")
-        print(
-            "TextSpamClassifier Metrics :\n",
-            text_model_metrics,
-        )
-        print(
-            "Metrics was calculated based on users with following IDs ....\n",
-            text_model_tested_ids,
-        )

         # 3. Return spam user_ids and metrics of the model
         return result

     def get_model_metrics(self):
         """
         load the JSON metrics files as dictionary and return it.

         Execution Steps:
-        1. load the model metrics files from the saving directory
-        2. print the metrics
-        3. return a dictionary of the scores of existing machine learning model instances.
+        1. Load the model metrics files from the saving directory
+        2. Print the metrics
+        3. Return a dictionary of the scores of existing machine learning model instances.

         return: { "usermeta_spam_classifier": {"Accuracy": float,
                                                "Precision": float,
                                                "Recall": float,
                                                "F1": float,
                                                "test_user_ids": list of user_ids },
                   "text_spam_classifier": { same as above }
                 }
         """
+        # 1. Load the model metrics files from the saving directory
         print("Loading model metric files...")
-        # We can assume that model and model metrics files exist after __init__
-        with open(self.usermeta_classifier.MODEL_METRICS_FILE_PATH) as json_file:
-            self.usermeta_classifier_metrics = json.load(json_file)
-
-        with open(self.text_classifier.MODEL_METRICS_FILE_PATH) as json_file:
-            self.text_classifier_metrics = json.load(json_file)
+        self._check_model_instance_files()
         print("Successfully loaded model metrics!")

         result = {
             "usermeta_spam_classifier": self.usermeta_classifier_metrics,
             "text_spam_classifier": self.text_classifier_metrics,
         }

-        # Print the model metrics
-        metadata_model_tested_ids = result["usermeta_spam_classifier"]["test_user_ids"]
-        result["usermeta_spam_classifier"].pop("test_user_ids")
-        metadata_model_metrics = result["usermeta_spam_classifier"]
-
-        text_model_tested_ids = result["text_spam_classifier"]["test_user_ids"]
-        result["text_spam_classifier"].pop("test_user_ids")
-        text_model_metrics = result["text_spam_classifier"]
+        # 2. Print the model metrics
+        self._print_model_metrics_usermeta(self.usermeta_classifier_metrics)
+        self._print_model_metrics_text(self.text_classifier_metrics)

-        print("\n------------------------------------\n")
-        print(
-            "UserMetadataSpamClassifier Metrics :\n",
-            metadata_model_metrics,
-        )
-        print(
-            "Metrics was calculated based on users with following IDs ....\n",
-            metadata_model_tested_ids,
-        )
-        print("\n------------------------------------\n")
-        print(
-            "TextSpamClassifier Metrics :\n",
-            text_model_metrics,
-        )
-        print(
-            "Metrics was calculated based on users with following IDs ....\n",
-            text_model_tested_ids,
-        )
+        # 3. Return a dictionary of the scores of existing machine learning model instances.
         return result
+
+    def fit_classifiers(self):
+        self.fit_usermeta_spam_classifier()
+        self.fit_text_spam_classifier()
+
     def fit_usermeta_spam_classifier(self):
         """
         This function is a wrapper of the fit() function in UserMetadataSpamClassifier.
...
         2. print the model metrics and the user_ids of the users used to calculate the scores.
         """
         model_metrics = self.usermeta_classifier.fit()
-        metadata_model_tested_ids = model_metrics["test_user_ids"]
-        model_metrics.pop("test_user_ids")
-        metadata_model_metrics = model_metrics
-        print("\n------------------------------------\n")
-        print(
-            "UserMetadataSpamClassifier Metrics :\n",
-            metadata_model_metrics,
-        )
-        print(
-            "Metrics was calculated based on users with following IDs ....\n",
-            metadata_model_tested_ids,
-        )
+        self._print_model_metrics_usermeta(model_metrics)

     def fit_text_spam_classifier(self):
         """
         This function is a wrapper of the fit() function in TextSpamClassifier.
...
         2. print the model metrics and the user_ids of the users used to calculate the scores.
         """
         model_metrics = self.text_classifier.fit()
-        text_model_tested_ids = model_metrics["test_user_ids"]
-        model_metrics.pop("test_user_ids")
-        text_model_metrics = model_metrics
-        print("\n------------------------------------\n")
-        print(
-            "TextSpamClassifier Metrics :\n",
-            text_model_metrics,
-        )
-        print(
-            "Metrics was calculated based on users with following IDs ....\n",
-            text_model_tested_ids,
-        )
+        self._print_model_metrics_text(model_metrics)

     def predict_usermeta_spam_classifier(self):
         """
...
                 "Since all users were labelled by curator, classifier prediction was not executed.\n"
             )
         else:
-            print("UserMetadataSpamClassifier evaluated users:\n", evaluated_user_ids)
-            print("Spam Users :\n", spam_user_ids)
+            result_filepath = (
+                "/shared/curator/spam/user_meta_classifier_prediction.json"
+            )
+            result = {
+                "spam_user_ids": spam_user_ids,
+                "evaluated_user_ids": evaluated_user_ids,
+            }
+            self._save_prediction_result(result_filepath, result)
+            print("\n------------------------------------")
+            print(
+                "Number of spam users detected by UserMetadataSpamClassifier: %d / %d"
+                % (len(spam_user_ids), len(evaluated_user_ids))
+            )
+            print(
+                "You can find the lists of detected spam and evaluated user ids in",
+                result_filepath,
+            )

     def predict_text_spam_classifier(self):
         """
...
                 "Since all users were labelled by curator, classifier prediction was not executed.\n"
             )
         else:
-            print("TextSpamClassifier evaluated users:\n", evaluated_user_ids)
-            print("Spam Users :\n", spam_user_ids)
+            result_filepath = "/shared/curator/spam/text_classifier_prediction.json"
+            result = {
+                "spam_user_ids": spam_user_ids,
+                "evaluated_user_ids": evaluated_user_ids,
+            }
+            self._save_prediction_result(result_filepath, result)
+            print("\n------------------------------------")
+            print(
+                "Number of spam users detected by TextSpamClassifier: %d / %d"
+                % (len(spam_user_ids), len(evaluated_user_ids))
+            )
+            print(
+                "You can find the lists of detected spam and evaluated user ids in",
+                result_filepath,
+            )
+
+    def _save_prediction_result(self, filepath, data: dict):
+        with open(filepath, "w") as f:
+            json.dump(data, f, indent=4)
+
+    def _print_model_metrics_usermeta(self, result: dict):
+        metadata_model_tested_ids = result["test_user_ids"]
+        result.pop("test_user_ids")
+        metadata_model_metrics = result
+        print("\n------------------------------------")
+        print(
+            "UserMetadataSpamClassifier Metrics :\n",
+            metadata_model_metrics,
+        )
+        print(
+            "Metrics were calculated based on a test dataset of size",
+            len(metadata_model_tested_ids),
+        )
+        print(
+            "Here are some of the user IDs in the test dataset:\n",
+            metadata_model_tested_ids[:10],
+        )
+        print(
+            "You can find the rest in /shared/curator/spam/user_meta_classifier_metrics.json"
+        )
+
+    def _print_model_metrics_text(self, result: dict):
+        text_model_tested_ids = result["test_user_ids"]
+        result.pop("test_user_ids")
+        text_model_metrics = result
+        print("\n------------------------------------")
+        print(
+            "TextSpamClassifier Metrics :\n",
+            text_model_metrics,
+        )
+        print(
+            "Metrics were calculated based on a test dataset of size",
+            len(text_model_tested_ids),
+        )
+        print(
+            "Here are some of the user IDs in the test dataset:\n",
+            text_model_tested_ids[:10],
+        )
+        print(
+            "You can find the rest in /shared/curator/spam/text_classifier_metrics.json"
+        )
diff --git a/django/curator/spam_classifiers.py b/django/curator/spam_classifiers.py
index f88c36f0f..21c4b6bb8 100644
--- a/django/curator/spam_classifiers.py
+++ b/django/curator/spam_classifiers.py
@@ -11,7 +11,7 @@
 from typing import List
 from sklearn.feature_extraction.text import CountVectorizer
 from sklearn.preprocessing import FunctionTransformer
-from sklearn.naive_bayes import MultinomialNB
+# from sklearn.naive_bayes import MultinomialNB
 from sklearn.pipeline import Pipeline
 from sklearn.model_selection import train_test_split
 from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
@@ -108,9 +108,11 @@ def load_model(self, file_path: str):
         Params : file_path ... Path to the saved model instance
         Returns : Loaded model instance
         """
-        if not os.path.isfile(file_path):
-            self.fit()
-        with open(file_path, "rb") as file:
+        try:
+            file = open(file_path, "rb")
+        except OSError:
+            print("Could not open/read file:", file_path)
+            raise
+        with file:
             return pickle.load(file)

     def save_model(self, model, file_path: str):
@@ -175,28 +177,39 @@ def __init__(self):
         self.MODEL_METRICS_FILE_PATH = SPAM_DIR_PATH / "text_classifier_metrics.json"

     def fit(self):
+        """
+        return:
+        - model_metrics ... Dictionary of the following format containing model scores and user_ids
+                            that were used for the validation.
+            { "Accuracy": accuracy,
+              "Precision": precision,
+              "Recall": recall,
+              "F1": f1,
+              "test_user_ids": test_user_ids,}
+        If training cannot be conducted, an empty dictionary is returned (model_metrics = {}).
+        """
         print("Training TextSpamClassifier...")
-        model_metrics = None
+        model_metrics = {}
         model = Pipeline(
             [
                 ("cleaner", FunctionTransformer(self.preprocess)),
                 ("countvectorizer", CountVectorizer(lowercase=True)),
-                ("classifier", MultinomialNB()),
+                ("classifier", xgb.XGBClassifier()),
             ]
         )
-        all_df = self.processor.get_all_users_df()
-
-        if all_df.empty:
-            return model_metrics  # = None
+        # Use labelled samples as training data
+        df = self.processor.get_labelled_by_curator_df()
+        if df.empty:
+            return model_metrics

-        data_x, data_y = self.concat_pd(all_df)
+        data_x, data_y = self.concat_pd(df)
         if data_x.empty:
-            return model_metrics  # = None
+            return model_metrics

         if len(data_y.value_counts()) != 2:
             print("Cannot create a binary classifier!!")
-            return model_metrics  # = None
+            return model_metrics

         (
             train_x,
@@ -218,10 +231,15 @@ def fit(self):
         return model_metrics

     def predict(self):
+        """
+        return:
+        - evaluated_user_ids ... List of user_ids of all users the model evaluated
+        - spam_user_ids ... List of user_ids of users predicted to be spam
+ """ print("TextSpamClassifier is making predictions...") evaluated_user_ids = [] spam_user_ids = [] - df = self.processor.get_unlabelled_by_curator_df() + df = self.processor.get_all_users_df() if df.empty: # no-op if no data found return evaluated_user_ids, spam_user_ids @@ -241,10 +259,12 @@ def predict(self): df = pd.DataFrame(result).replace(np.nan, None) self.processor.update_predictions(df, isTextClassifier=True) - evaluated_user_ids = df["user_id"].values.flatten() - spam_user_ids = df["user_id"][ - df["labelled_by_text_classifier"] == 1 - ].values.flatten() + evaluated_user_ids = df["user_id"].values.flatten().tolist() + spam_user_ids = ( + df["user_id"][df["labelled_by_text_classifier"] == 1] + .values.flatten() + .tolist() + ) print("Successfully made predictions!") return evaluated_user_ids, spam_user_ids @@ -293,8 +313,8 @@ def fit(self): ("classifier", xgb.XGBClassifier()), ] ) - # obtain df from pipleline - df = self.processor.get_all_users_df() + # Use labelled samples as training data + df = self.processor.get_labelled_by_curator_df() if df.empty: return model_metrics # None if no untrained data found @@ -327,7 +347,7 @@ def predict(self): print("UserMetadataSpamClassifier is making predictions...") evaluated_user_ids = [] spam_user_ids = [] - df = self.processor.get_unlabelled_by_curator_df() + df = self.processor.get_all_users_df() if df.empty: # no-op if no data found return evaluated_user_ids, spam_user_ids @@ -346,10 +366,12 @@ def predict(self): df = pd.DataFrame(result).replace(np.nan, None) self.processor.update_predictions(df, isTextClassifier=False) - evaluated_user_ids = df["user_id"].values.flatten() - spam_user_ids = df["user_id"][ - df["labelled_by_user_classifier"] == 1 - ].values.flatten() + evaluated_user_ids = df["user_id"].values.flatten().tolist() + spam_user_ids = ( + df["user_id"][df["labelled_by_user_classifier"] == 1] + .values.flatten() + .tolist() + ) print("Successfully made predictions!") return evaluated_user_ids, spam_user_ids @@ -402,7 +424,7 @@ def __input_df_transformation(self, df: pd.DataFrame): ].fillna( "" ) - + df.loc[:, ["user_id", "labelled_by_curator"]] = df[ ["user_id", "labelled_by_curator"] ].fillna(0) diff --git a/django/curator/spam_processor.py b/django/curator/spam_processor.py index b5089a9ca..edf7aaf64 100644 --- a/django/curator/spam_processor.py +++ b/django/curator/spam_processor.py @@ -1,5 +1,6 @@ import pandas as pd import re +import sys from django.db.models import Q from curator.models import UserSpamStatus from django.conf import settings @@ -84,7 +85,7 @@ def __convert_df_markup_to_string(self, df): for col in df.columns: if col in self.type_int_bool_column_names: df[col] = df[col].fillna(0).astype(int) - # It is safe to set Nan as 0 because: + # It is safe to set None as 0 because: # for training, all values with labelled_by_curator=None are exclueded before passed to this function. # for prediction, the labelled_by_curator column is not used during prediction process. 
             else:
@@ -98,60 +99,76 @@ def get_all_users_df(self):
         return self.__rename_columns(
             self.__convert_df_markup_to_string(
                 pd.DataFrame(
                     list(
-                        UserSpamStatus.objects.all()
-                        .exclude(member_profile__user_id=None, labelled_by_curator=None)
-                        .values(*self.db_column_names)
+                        UserSpamStatus.objects.exclude(
+                            member_profile__user_id=None
+                        ).values(*self.db_column_names)
                     )
                 )
             )
         )

-    def get_unlabelled_by_curator_df(self):
-        # return : DataFrame of user data that haven't been labeled by curator
+    def get_labelled_by_curator_df(self):
         return self.__rename_columns(
             self.__convert_df_markup_to_string(
                 pd.DataFrame(
                     list(
-                        UserSpamStatus.objects.all()
-                        .exclude(member_profile__user_id=None)
-                        .filter(labelled_by_curator=None)
-                        .values(*self.db_column_names)
+                        UserSpamStatus.objects.exclude(
+                            Q(member_profile__user_id=None)
+                            | Q(labelled_by_curator=None)
+                        ).values(*self.db_column_names)
                     )
                 )
            )
        )

-    def get_untrained_df(self):
-        # return : DataFrame of user data that haven't been used for train previously
-        return self.__rename_columns(
-            self.__convert_df_markup_to_string(
-                pd.DataFrame(
-                    list(
-                        UserSpamStatus.objects.all()
-                        .exclude(member_profile__user_id=None, labelled_by_curator=None)
-                        .filter(is_training_data=False)
-                        .values(*self.db_column_names)
-                    )
-                )
-            )
-        )
+    # Currently unused
+    # def get_unlabelled_by_curator_df(self):
+    #     # return : DataFrame of user data that haven't been labeled by curator
+    #     return self.__rename_columns(
+    #         self.__convert_df_markup_to_string(
+    #             pd.DataFrame(
+    #                 list(
+    #                     UserSpamStatus.objects
+    #                     .exclude(member_profile__user_id=None)
+    #                     .filter(labelled_by_curator=None)
+    #                     .values(*self.db_column_names)
+    #                 )
+    #             )
+    #         )
+    #     )

-    def get_unlabelled_users(self):
-        unlabelled_users = list(
-            UserSpamStatus.objects.filter(
-                Q(labelled_by_curator=None)
-                & Q(labelled_by_text_classifier=None)
-                & Q(labelled_by_user_classifier=None)
-            )
-        )
-        return unlabelled_users
+    # Currently unused
+    # def get_untrained_df(self):
+    #     # return : DataFrame of user data that haven't been used for train previously
+    #     return self.__rename_columns(
+    #         self.__convert_df_markup_to_string(
+    #             pd.DataFrame(
+    #                 list(
+    #                     UserSpamStatus.objects
+    #                     .exclude(Q(member_profile__user_id=None) | Q(labelled_by_curator=None))
+    #                     .filter(is_training_data=False)
+    #                     .values(*self.db_column_names)
+    #                 )
+    #             )
+    #         )
+    #     )

+    # def get_unlabelled_users(self):
+    #     unlabelled_users = list(
+    #         UserSpamStatus.objects.filter(
+    #             Q(labelled_by_curator=None)
+    #             & Q(labelled_by_text_classifier=None)
+    #             & Q(labelled_by_user_classifier=None)
+    #         )
+    #     )
+    #     return unlabelled_users
+
-    # FIXME: tune confidence threshold later
+    # TODO: tune confidence threshold later
     def get_spam_users(self, confidence_threshold=0.5):
         """
-        This functions will first filter out the users with labelled_by_curator==True,
-        but the ones with None, only get users with labelled_by_user_classifier == True
-        or labelled_by_text_classifier == True with a specific confidence level.
+        This function first selects the users with labelled_by_curator==True;
+        among the ones with labelled_by_curator==None, it only selects users with
+        labelled_by_user_classifier==True or labelled_by_text_classifier==True
+        at a specific confidence level.
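+        Params : confidence_threshold ... confidence level required for classifier-labelled users to count as spam (default 0.5)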
""" spam_users = list( UserSpamStatus.objects.filter( @@ -191,17 +208,37 @@ def load_labels_from_csv(self, filepath=DATASET_FILE_PATH): return : list of user_ids which labelled_by_curator was updated """ print("Loading labels CSV...") - label_df = pd.read_csv(filepath) # TODO add exception + try: + label_df = pd.read_csv(filepath) + except Exception: + print("Could not open/read file:", filepath) + print("Please locate a dataset with labels at the path of ./curator/spam_dataset.csv") + sys.exit() + + # Use when batch updating of labelled_by_curator is ready + # spam_user_ids = label_df[label_df['is_spam']==1]['user_id'].values + # ham_user_ids = label_df[label_df['is_spam']==0]['user_id'].values + + # is_spam = True + # flag = self.update_labelled_by_curator(spam_user_ids, is_spam) + # if flag == 1: + # user_id_list.append(spam_user_ids) + + # is_spam = False + # flag = self.update_labelled_by_curator(ham_user_ids, is_spam) + # if flag == 1: + # user_id_list.append(ham_user_ids) + user_id_list = [] for idx, row in label_df.iterrows(): flag = self.update_labelled_by_curator(row["user_id"], bool(row["is_spam"])) if flag == 1: user_id_list.append(row["user_id"]) print("Successfully loaded labels from CSV!") - print("List of user ids of which label was loaded :\n", user_id_list) + print("Number of user ids whose label was loaded: ", len(user_id_list)) return user_id_list - def update_labelled_by_curator(self, user_id, label): + def update_labelled_by_curator(self, user_id, label): #TODO update with batch return UserSpamStatus.objects.filter(member_profile__user_id=user_id).update( labelled_by_curator=label ) # return 0(fail) or 1(success) diff --git a/django/curator/tests/test_spam.py b/django/curator/tests/test_spam.py index d722aefde..de791053d 100644 --- a/django/curator/tests/test_spam.py +++ b/django/curator/tests/test_spam.py @@ -43,6 +43,16 @@ def delete_new_users(self, user_ids): user = User.objects.filter(id=user_id) user.delete() + # def update_labels(self, user_ids): + # middle_idx = len(user_ids)/2 - 1 + # label = True + # self.processor.update_labelled_by_curator(user_ids[0:middle_idx], label) + # label = False + # self.processor.update_labelled_by_curator(user_ids[middle_idx:-1], label) + + # def delete_labels(self, user_ids): + # self.processor.update_labelled_by_curator(user_ids, None) + def update_labels(self, user_ids): for user_id in user_ids: label = random.randint(0, 1) @@ -52,7 +62,7 @@ def delete_labels(self, user_ids): for user_id in user_ids: self.processor.update_labelled_by_curator(user_id, None) - def get_existing_users(self): + def get_existing_user_ids(self): user_ids = list( UserSpamStatus.objects.all() .exclude(member_profile__user_id=None) @@ -79,93 +89,75 @@ def test_load_labels_from_csv(self): self.assertTrue(self.processor.labelled_by_curator_exist()) # ============== Tests for UserSpamStatusProcessor.get_unlabelled_by_curator_df() ============== - def test_get_unlabelled_by_curator_df__new_users_added(self): + def test_get_labelled_by_curator_df__new_users_added(self): """ case1 : new user data added stub data/requirements ... new user data with labelled_by_curator==None - assertion ... df with the specific columns with the correct user_ids + assertion ... 
         """
-        existing_users = self.user_ids
-        user_ids = self.create_new_users()  # default labelled_by_curator==None
-        user_size = len(user_ids) + len(existing_users)
+        existing_user_ids = self.user_ids
+        self.update_labels(
+            existing_user_ids
+        )  # make sure all the existing users have labels
+        existing_df = self.processor.get_labelled_by_curator_df()

-        df = self.processor.get_unlabelled_by_curator_df()
-
-        self.assertIsInstance(df, pd.DataFrame)
-        self.assertTrue(
-            set(self.processor.column_names).issubset(set(df.columns.unique()))
-        )
-        self.assertEqual(user_size, len(df["user_id"].values))
-        self.assertTrue(set(user_ids).issubset(set(df["user_id"].values)))
-        self.delete_new_users(user_ids)
+        new_user_ids = self.create_new_users()  # default labelled_by_curator==None

-    def test_get_unlabelled_by_curator_df__no_users_added(self):
-        """
-        case2 : no new user data added
-        stub data/requirements ... all data in DB with correct user_id has values
-        with labelled_by_curator!=None
-        assertion ... empty df
-        """
-        existing_users = self.user_ids
-        self.update_labels(existing_users)  # simulate a curator labelling the users
+        user_size = len(new_user_ids) + len(existing_user_ids)
+        new_df = self.processor.get_labelled_by_curator_df()

-        df = self.processor.get_unlabelled_by_curator_df()
-        self.assertEqual(len(df), 0)
+        self.assertEqual(
+            len(existing_df["user_id"].values), len(new_df["user_id"].values)
+        )
+        self.assertGreater(user_size, len(existing_df["user_id"].values))
+        self.assertFalse(set(new_user_ids).issubset(set(new_df["user_id"].values)))
+        self.delete_new_users(new_user_ids)

-        self.delete_labels(existing_users)
+    def test_get_labelled_by_curator_df__label_added(self):
+        """
+        case2 : new user data added and labelled
+        stub data/requirements ... new user data with labelled_by_curator==True/False
+        assertion ... df with the specific columns with the additional user data that were labelled.
+        """
+        existing_user_ids = self.user_ids
+        self.update_labels(
+            existing_user_ids
+        )  # make sure all the existing users have labels

-    # ======================== Tests for UserSpamStatusProcessor.get_untrained_df() ========================
-    def test_get_untrained_df__dataset_loaded(self):
+        new_user_ids = self.create_new_users()  # default labelled_by_curator==None
+        user_size = len(new_user_ids) + len(existing_user_ids)
-        """
-        case1 : Just uploaded spam_dataset.csv
-        stub data/requirements ... just called load_labels_from_csv().
-        assertion ... df with the specific columns with the correct user_ids
-        """
-        user_ids = self.processor.load_labels_from_csv()
+        self.update_labels(new_user_ids)
+        df = self.processor.get_labelled_by_curator_df()

-        df = self.processor.get_untrained_df()
-        self.assertIsInstance(df, pd.DataFrame)
-        self.assertTrue(
-            set(self.processor.column_names).issubset(set(df.columns.unique()))
-        )
-        # self.assertListEqual(user_ids, list(df["user_id"].values))
-        self.assertTrue(set(user_ids) == set(list(df["user_id"].values)))
-        self.delete_labels(user_ids)
+        self.assertIsInstance(df, pd.DataFrame)
+        self.assertTrue(
+            set(self.processor.column_names).issubset(set(df.columns.unique()))
+        )
+        self.assertEqual(user_size, len(df["user_id"].values))
+        self.assertTrue(set(new_user_ids).issubset(set(df["user_id"].values)))
+        self.delete_new_users(new_user_ids)

-    def test_get_untrained_df__labels_updated(self):
+    def test_get_labelled_by_curator_df__no_users_added(self):
         """
-        case2 : labelled_by_curator updated
-        stub data/requirements ... new labells by curator (users with abelled_by_curator!=None and is_training_data=False)
-        assertion ... 
df with the specific columns with the correct user_ids - """ - existing_users = self.user_ids - self.update_labels(existing_users) # update labels of exisiting users + self.update_labels(new_user_ids) + df = self.processor.get_labelled_by_curator_df() - df = self.processor.get_untrained_df() self.assertIsInstance(df, pd.DataFrame) self.assertTrue( set(self.processor.column_names).issubset(set(df.columns.unique())) ) - # self.assertListEqual(existing_users, list(df["user_id"].values)) - self.assertTrue(set(existing_users) == set(list(df["user_id"].values))) - self.delete_labels(existing_users) + self.assertEqual(user_size, len(df["user_id"].values)) + self.assertTrue(set(new_user_ids).issubset(set(df["user_id"].values))) + self.delete_new_users(new_user_ids) - def test_get_untrained_df__no_labels_updated(self): + def test_get_labelled_by_curator_df__no_users_added(self): """ - case3 : no label updates - stub data/requirements ... all data in DB with labelled_by_curator!=None has is_training_data=True - assertion ... empty df + case3 : no new user data added + stub data/requirements ... all data in DB with correct user_id has values + with labelled_by_curator==True/False + assertion ... no change in df """ - existing_users = self.user_ids - self.update_labels(existing_users) # update labels of exisiting users - self.mark_as_training_data(existing_users) # mark the user as training data + existing_user_ids = self.user_ids + self.update_labels( + existing_user_ids + ) # make sure all the existing users have labels + existing_df = self.processor.get_labelled_by_curator_df() - df = self.processor.get_untrained_df() - self.assertEqual(len(df), 0) + self.update_labels(existing_user_ids) # making sure all users are labelled - self.delete_labels(existing_users) - self.unmark_as_training_data(existing_users) + new_df = self.processor.get_labelled_by_curator_df() + self.assertEqual( + len(existing_df["user_id"].values), len(new_df["user_id"].values) + ) # ================================ Tests for UserMetadataSpamClassifier ================================ def test_user_meta_classifier_fit(self): @@ -205,8 +197,8 @@ def test_user_meta_classifier_prediction(self): self.processor.load_labels_from_csv() self.user_meta_classifier.fit() - existing_users = self.user_ids - self.update_labels(existing_users) + existing_user_ids = self.user_ids + self.update_labels(existing_user_ids) new_user_ids = self.create_new_users() # default labelled_by_curator==None labelled_user_ids, _ = self.user_meta_classifier.predict() @@ -238,12 +230,101 @@ def test_text_classifier_prediction(self): # self.add_texts_to_users(user_ids) self.text_classifier.fit() - existing_users = self.user_ids - self.update_labels(existing_users) + existing_user_ids = self.user_ids + self.update_labels(existing_user_ids) new_user_ids = self.create_new_users() # default labelled_by_curator==None - self.add_texts_to_users(existing_users) + self.add_texts_to_users(existing_user_ids) self.add_texts_to_users(new_user_ids) labelled_user_ids, _ = self.text_classifier.predict() self.assertTrue(self.processor.all_have_labels()) self.assertTrue(bool(set(new_user_ids) & set(labelled_user_ids))) + + # # ============== Tests for UserSpamStatusProcessor.get_unlabelled_by_curator_df() ============== + # def test_get_unlabelled_by_curator_df__new_users_added(self): + # """ + # case1 : new user data added + # stub data/requirements ... new user data with labelled_by_curator==None + # assertion ... 
df with the specific columns with the correct user_ids
    # """

    # existing_user_ids = self.user_ids
    # user_ids = self.create_new_users() # default labelled_by_curator==None
    # user_size = len(user_ids) + len(existing_user_ids)

    # df = self.processor.get_unlabelled_by_curator_df()

    # self.assertIsInstance(df, pd.DataFrame)
    # self.assertTrue(
    #     set(self.processor.column_names).issubset(set(df.columns.unique()))
    # )
    # self.assertEqual(user_size, len(df["user_id"].values))
    # self.assertTrue(set(user_ids).issubset(set(df["user_id"].values)))
    # self.delete_new_users(user_ids)

    # def test_get_unlabelled_by_curator_df__no_users_added(self):
    #     """
    #     case2 : no new user data added
    #     stub data/requirements ... all data in DB with correct user_id has values
    #     with labelled_by_curator!=None
    #     assertion ... empty df
    #     """
    #     existing_user_ids = self.user_ids
    #     self.update_labels(existing_user_ids) # simulate a curator labelling the users

    #     df = self.processor.get_unlabelled_by_curator_df()
    #     self.assertEqual(len(df), 0)

    #     self.delete_labels(existing_user_ids)

    # ======================== Tests for UserSpamStatusProcessor.get_untrained_df() ========================
    # def test_get_untrained_df__dataset_loaded(self):
    #     """
    #     case1 : Just uploaded spam_dataset.csv
    #     stub data/requirements ... just called load_labels_from_csv().
    #     assertion ... df with the specific columns with the correct user_ids
    #     """
    #     user_ids = self.processor.load_labels_from_csv()

    #     df = self.processor.get_untrained_df()
    #     self.assertIsInstance(df, pd.DataFrame)
    #     self.assertTrue(
    #         set(self.processor.column_names).issubset(set(df.columns.unique()))
    #     )
    #     # self.assertListEqual(user_ids, list(df["user_id"].values))
    #     self.assertTrue(set(user_ids) == set(list(df["user_id"].values)))
    #     self.delete_labels(user_ids)

    # def test_get_untrained_df__labels_updated(self):
    #     """
    #     case2 : labelled_by_curator updated
    #     stub data/requirements ... new labels by curator (users with labelled_by_curator!=None and is_training_data=False)
    #     assertion ... df with the specific columns with the correct user_ids
    #     """
    #     existing_user_ids = self.user_ids
    #     self.update_labels(existing_user_ids) # update labels of existing users

    #     df = self.processor.get_untrained_df()
    #     self.assertIsInstance(df, pd.DataFrame)
    #     self.assertTrue(
    #         set(self.processor.column_names).issubset(set(df.columns.unique()))
    #     )
    #     # self.assertListEqual(existing_user_ids, list(df["user_id"].values))
    #     self.assertTrue(set(existing_user_ids) == set(list(df["user_id"].values)))
    #     self.delete_labels(existing_user_ids)

    # def test_get_untrained_df__no_labels_updated(self):
    #     """
    #     case3 : no label updates
    #     stub data/requirements ... all data in DB with labelled_by_curator!=None has is_training_data=True
    #     assertion ... empty df
    #     """
    #     existing_user_ids = self.user_ids
    #     self.update_labels(existing_user_ids) # update labels of existing users
    #     self.mark_as_training_data(existing_user_ids) # mark the user as training data

    #     df = self.processor.get_untrained_df()
    #     self.assertEqual(len(df), 0)

    #     self.delete_labels(existing_user_ids)
    #     self.unmark_as_training_data(existing_user_ids)
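
Reviewer note: with the renamed flags, the workflow runs through the usual manage.py entry point (the command and flag names come from the diff above; the invocation itself is an assumption about the deployment setup):

    python manage.py curator_spam_detection --fit      # train both classifiers on curator labels
    python manage.py curator_spam_detection --predict  # classify every user currently in the DB

And a minimal, self-contained sketch of the pipeline shape this patch switches to (MultinomialNB -> xgb.XGBClassifier). The toy texts and labels are hypothetical stand-ins for the dataframe that get_labelled_by_curator_df() provides, and the real pipeline has an extra FunctionTransformer cleaning step omitted here:

    import xgboost as xgb
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.metrics import accuracy_score, f1_score
    from sklearn.model_selection import train_test_split
    from sklearn.pipeline import Pipeline

    # hypothetical stand-in data; 1 = spam, 0 = ham (XGBClassifier expects 0/1 integer labels)
    toy_texts = [
        "buy cheap watches now",
        "click here for free money",
        "agent-based model of land use",
        "replication data for our ABM paper",
    ]
    toy_labels = [1, 1, 0, 0]

    model = Pipeline(
        [
            # CountVectorizer emits a scipy sparse matrix, which XGBClassifier
            # accepts directly, so it is a drop-in replacement for MultinomialNB
            ("countvectorizer", CountVectorizer(lowercase=True)),
            ("classifier", xgb.XGBClassifier()),
        ]
    )

    train_x, test_x, train_y, test_y = train_test_split(
        toy_texts, toy_labels, test_size=0.5, stratify=toy_labels, random_state=0
    )
    model.fit(train_x, train_y)
    pred_y = model.predict(test_x)
    print({"Accuracy": accuracy_score(test_y, pred_y), "F1": f1_score(test_y, pred_y)})

Unlike MultinomialNB, XGBClassifier does not assume non-negative count features, so the same estimator also handles the mixed-scale metadata features used by UserMetadataSpamClassifier.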