fix: make prediction on every user in DB instead of filtering labelled_by_curator=None

 - use get_all_users_df() instead of get_unlabelled_by_curator_df() to obtain the dataframe, because a user previously labelled as ham may later turn into spam.

 - improved the management command messages.

 - added exception handling for file operations.

 - replaced MultinomialNB with XGBoost (see the sketch below).
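
A minimal sketch of the classifier swap named in the last bullet, assuming a scikit-learn style text pipeline; the pipeline shape and variable names are illustrative, not the repository's actual code:

# Illustrative only: the kind of change "replaced MultinomialNB with XGBoost" implies.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier

# Before: model = Pipeline([("tfidf", TfidfVectorizer()), ("clf", MultinomialNB())])
# After: gradient-boosted trees via xgboost's scikit-learn wrapper, which accepts the
# sparse matrices TfidfVectorizer produces and exposes the same fit()/predict() API.
model = Pipeline([("tfidf", TfidfVectorizer()), ("clf", XGBClassifier())])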
Aiko committed Jan 12, 2024
1 parent 46605dc commit 100b49a
Showing 6 changed files with 419 additions and 259 deletions.
26 changes: 19 additions & 7 deletions django/curator/management/commands/curator_spam_detection.py
@@ -17,12 +17,19 @@ def __init__(self):

def add_arguments(self, parser):
parser.add_argument(
"--exe",
"-e",
"--predict",
"-p",
action="store_true",
default=False,
help="Print user_ids of spam users and the metrics of the models used to obtain the predictions.",
)
parser.add_argument(
"--fit",
"-f",
action="store_true",
default=False,
help="Fit all models based on user data labelled by curator.",
)
parser.add_argument(
"--get_model_metrics",
"-m",
@@ -35,7 +42,7 @@ def add_arguments(self, parser):
"-l",
action="store_true",
default=False,
help="Store manually annotated spam labels to the DB.",
help="Store bootstrap spam labels to the DB.",
)
parser.add_argument(
"--fit_usermeta_model", "-fu", action="store_true", default=False
@@ -58,8 +65,11 @@ def add_arguments(self, parser):
help="Print user_ids of all the evaluated users and spam users using the Text model",
)

def handle_exe(self):
self.detection.execute()
def handle_predict(self):
self.detection.predict()

def handle_fit(self):
self.detection.fit_classifiers()

def handle_get_model_metrics(self):
self.detection.get_model_metrics()
@@ -80,8 +90,10 @@ def handle_predict_text_model(self):
self.detection.predict_text_spam_classifier()

def handle(self, *args, **options):
if options["exe"]:
action = "exe"
if options["predict"]:
action = "predict"
elif options["fit"]:
action = "fit"
elif options["get_model_metrics"]:
action = "get_model_metrics"
elif options["load_labels"]:
2 changes: 1 addition & 1 deletion django/curator/models.py
@@ -375,7 +375,7 @@ class UserSpamStatus(models.Model):
member_profile = models.OneToOneField(
MemberProfile, on_delete=models.CASCADE, primary_key=True
)
# FIXME: add help_text
# TODO: add help_text
# None = not processed yet
# True = bio_classifier considered this user to be spam
# False = bio_classifier did not consider this user to be spam
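
A hedged sketch of how the TODO above might be resolved; the field name and type below are assumptions, since the actual field definition falls outside this hunk:

# Hypothetical fragment of UserSpamStatus; the field name and type are assumptions.
labelled_by_bio_classifier = models.BooleanField(
    null=True,
    default=None,
    help_text="None = not processed yet; True = bio_classifier considered this user "
    "to be spam; False = bio_classifier did not consider this user to be spam",
)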
240 changes: 124 additions & 116 deletions django/curator/spam.py
@@ -1,5 +1,6 @@
import json
import os
import sys

from django.conf import settings

@@ -25,51 +26,52 @@ def __init__(self):
SpamDetection Initialization Steps:
1. Initializes UserSpamStatusProcessor and the classifier classes
2. If no data has been labelled by a curator, load dataset.csv
3. If no model pickle file is found, call fit() of Classifier classes
- if all users have None in labelled_by_curator, load labels into the DB by calling Pipeline.load_labels_from_csv()
- additionally, if no labels file exists, raise an exception
"""
self.processor = UserSpamStatusProcessor()
self.usermeta_classifier = UserMetadataSpamClassifier()
self.text_classifier = TextSpamClassifier()
if not self.processor.labelled_by_curator_exist():
self.processor.load_labels_from_csv()


def _check_model_instance_files(self):
# Check whether UserMetadataSpamClassifier model file exists
if os.path.exists(self.usermeta_classifier.MODEL_METRICS_FILE_PATH):
with open(self.usermeta_classifier.MODEL_METRICS_FILE_PATH) as json_file:
self.usermeta_classifier_metrics = json.load(json_file)
else:
# If model metrics and instance file don't exist, call fit()
self.usermeta_classifier_metrics = self.usermeta_classifier.fit()
try:
json_file = open(self.usermeta_classifier.MODEL_METRICS_FILE_PATH)
except OSError:
print("Could not open/read file:", self.usermeta_classifier.MODEL_METRICS_FILE_PATH)
print("Please run fit_classifiers() to create model instance and metrics files.")
sys.exit()
with json_file:
self.usermeta_classifier_metrics = json.load(json_file)

try:
json_file = open(self.text_classifier.MODEL_METRICS_FILE_PATH)
except OSError:
print("Could not open/read file:", self.text_classifier.MODEL_METRICS_FILE_PATH)
print("Please run fit_classifiers() to create model instance and metrics files.")
sys.exit()
with json_file:
self.text_classifier_metrics = json.load(json_file)

# Check whether TextSpamClassifier model file exists
if os.path.exists(self.text_classifier.MODEL_METRICS_FILE_PATH):
with open(self.text_classifier.MODEL_METRICS_FILE_PATH) as json_file:
self.text_classifier_metrics = json.load(json_file)
else:
# If model metrics and instance file don't exist, call fit()
self.text_classifier_metrics = self.text_classifier.fit()

def execute(self):
def predict(self):
"""
A default function to obtain the list of spam users and the metrics of the models used to
obtain the predictions.
Execution Steps:
1. Check if there exists user data that should be labelled by the classifier models
2. If there exists, call predict() of the classifier classes. This function will store the result in the DB at the end.
3. Print results
1. Call predict() of the classifier classes. This function will store the result in the DB at the end.
2. Print results
3. Return the detection results stored in the DB.
"""
print("Executing spam dectection...")
# 1. Check DB for unlabelled users (None in all labelled_by_curator, labelled_by_user_classifier, and labelled_by_text_classifier)
if len(self.processor.get_unlabelled_users()) != 0:
# 2. if there are some unlabelled users, predict
print("Models are making predictions...")
self.usermeta_classifier.predict()
self.text_classifier.predict()
print("Successfully made predictions!")
self._check_model_instance_files()

print("Models are making predictions...")
self.usermeta_classifier.predict()
self.text_classifier.predict()
print("Successfully made predictions!")

result = {
"spam_users": self.processor.get_spam_users(),
@@ -78,33 +80,9 @@ def execute(self):
}

# 3. Print results
metadata_model_tested_ids = result["usermeta_spam_classifier"]["test_user_ids"]
result["usermeta_spam_classifier"].pop("test_user_ids")
metadata_model_metrics = result["usermeta_spam_classifier"]
self._print_model_metrics_usermeta(self.usermeta_classifier_metrics)
self._print_model_metrics_text(self.text_classifier_metrics)

text_model_tested_ids = result["text_spam_classifier"]["test_user_ids"]
result["text_spam_classifier"].pop("test_user_ids")
text_model_metrics = result["text_spam_classifier"]

print("IDs of Detected Spam User :\n", result["spam_users"])
print("\n------------------------------------\n")
print(
"UserMetadataSpamClassifier Metrics :\n",
metadata_model_metrics,
)
print(
"Metrics was calculated based on users with following IDs ....\n",
metadata_model_tested_ids,
)
print("\n------------------------------------\n")
print(
"TextSpamClassifier Metrics :\n",
text_model_metrics,
)
print(
"Metrics was calculated based on users with following IDs ....\n",
text_model_tested_ids,
)
# 4. Return spam user_ids and metrics of the model
return result

@@ -115,9 +93,9 @@ def get_model_metrics(self):
load the JSON metrics files as a dictionary and return it.
Execution Steps:
1. load the model metrics files from the saving directory
2. print the metrics
3. return a dictionary of the scores of existing machine learning model instances.
1. Load the model metrics files from the saving directory
2. Print the metrics
3. Return a dictionary of the scores of existing machine learning model instances.
return:
{ "usermeta_spam_classifier": {"Accuracy": float,
"Precision": float,
@@ -128,49 +106,29 @@
"text_spam_classifier": { same as above }
}
"""
# 1. Load the model metrics files from the saving directory
print("Loading model metric files...")
# We can assume that model and model metrics files exist after __init__
with open(self.usermeta_classifier.MODEL_METRICS_FILE_PATH) as json_file:
self.usermeta_classifier_metrics = json.load(json_file)

with open(self.text_classifier.MODEL_METRICS_FILE_PATH) as json_file:
self.text_classifier_metrics = json.load(json_file)
self._check_model_instance_files()

print("Successfully loaded model metrics!")
result = {
"usermeta_spam_classifier": self.usermeta_classifier_metrics,
"text_spam_classifier": self.text_classifier_metrics,
}

# Print the model metrics
metadata_model_tested_ids = result["usermeta_spam_classifier"]["test_user_ids"]
result["usermeta_spam_classifier"].pop("test_user_ids")
metadata_model_metrics = result["usermeta_spam_classifier"]

text_model_tested_ids = result["text_spam_classifier"]["test_user_ids"]
result["text_spam_classifier"].pop("test_user_ids")
text_model_metrics = result["text_spam_classifier"]
# 2. Print the model metrics
self._print_model_metrics_usermeta(self.usermeta_classifier_metrics)
self._print_model_metrics_text(self.text_classifier_metrics)

print("\n------------------------------------\n")
print(
"UserMetadataSpamClassifier Metrics :\n",
metadata_model_metrics,
)
print(
"Metrics was calculated based on users with following IDs ....\n",
metadata_model_tested_ids,
)
print("\n------------------------------------\n")
print(
"TextSpamClassifier Metrics :\n",
text_model_metrics,
)
print(
"Metrics was calculated based on users with following IDs ....\n",
text_model_tested_ids,
)
# 3. Return a dictionary of the scores of existing machine learning model instances.
return result


def fit_classifiers(self):
self.fit_usermeta_spam_classifier()
self.fit_text_spam_classifier()


def fit_usermeta_spam_classifier(self):
"""
This function is a wrapper of the fit() function in UserMetadataSpamClassifier.
@@ -181,18 +139,7 @@ def fit_usermeta_spam_classifier(self):
2. print the model metrics and the user_ids of the users used to calculate the scores.
"""
model_metrics = self.usermeta_classifier.fit()
metadata_model_tested_ids = model_metrics["test_user_ids"]
model_metrics.pop("test_user_ids")
metadata_model_metrics = model_metrics
print("\n------------------------------------\n")
print(
"UserMetadataSpamClassifier Metrics :\n",
metadata_model_metrics,
)
print(
"Metrics was calculated based on users with following IDs ....\n",
metadata_model_tested_ids,
)
self._print_model_metrics_usermeta(model_metrics)

def fit_text_spam_classifier(self):
"""
@@ -204,18 +151,7 @@ def fit_text_spam_classifier(self):
2. print the model metrics and the user_ids of the users used to calculate the scores.
"""
model_metrics = self.text_classifier.fit()
text_model_tested_ids = model_metrics["test_user_ids"]
model_metrics.pop("test_user_ids")
text_model_metrics = model_metrics
print("\n------------------------------------\n")
print(
"TextSpamClassifier Metrics :\n",
text_model_metrics,
)
print(
"Metrics was calculated based on users with following IDs ....\n",
text_model_tested_ids,
)
self._print_model_metrics_text(model_metrics)

def predict_usermeta_spam_classifier(self):
"""
@@ -232,8 +168,22 @@ def predict_usermeta_spam_classifier(self):
"Since all users were labelled by curator, classifier prediction was not executed.\n"
)
else:
print("UserMetadataSpamClassifier evaluated users:\n", evaluated_user_ids)
print("Spam Users :\n", spam_user_ids)
result_filepath = (
"/shared/curator/spam/user_meta_classifier_prediction.json"
)
result = {
"spam_user_ids": spam_user_ids,
"evaluated_user_ids": evaluated_user_ids,
}
self._save_prediction_result(result_filepath, result)
print("\n------------------------------------")
print(
"Number of spam users detected by UserMetadataSpamClassifier: %d / %d"
% (len(spam_user_ids), len(evaluated_user_ids))
)
print(
"You can find the full list of detected spam and evaluated user IDs in",
result_filepath,
)

def predict_text_spam_classifier(self):
"""
@@ -251,5 +201,63 @@ def predict_text_spam_classifier(self):
"Since all users were labelled by curator, classifier prediction was not executed.\n"
)
else:
print("TextSpamClassifier evaluated users:\n", evaluated_user_ids)
print("Spam Users :\n", spam_user_ids)
result_filepath = "/shared/curator/spam/text_classifier_prediction.json"
result = {
"spam_user_ids": spam_user_ids,
"evaluated_user_ids": evaluated_user_ids,
}
self._save_prediction_result(result_filepath, result)
print("\n------------------------------------")
print(
"Number of spam users detected by TextSpamClassifier: %d / %d"
% (len(spam_user_ids), len(evaluated_user_ids))
)
print(
"You can find the full list of detected spam and evaluated user IDs in",
result_filepath,
)

def _save_prediction_result(self, filepath, data: dict):
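# Write the result as pretty-printed JSON, overwriting any existing file at filepath.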
with open(filepath, "w") as f:
json.dump(data, f, indent=4)

def _print_model_metrics_usermeta(self, result: dict):
metadata_model_tested_ids = result["test_user_ids"]
result.pop("test_user_ids")
metadata_model_metrics = result
print("\n------------------------------------")
print(
"UserMetadataSpamClassifier Metrics :\n",
metadata_model_metrics,
)
print(
"Metrics was calculated based on a test dataset of size ",
len(metadata_model_tested_ids),
)
print(
"Here's some of the user IDs in the test dataset.\n",
metadata_model_tested_ids[:10],
)
print(
"You can find the rest in /shared/curator/spam/user_meta_classifier_metrics.json"
)

def _print_model_metrics_text(self, result: dict):
text_model_tested_ids = result["test_user_ids"]
result.pop("test_user_ids")
text_model_metrics = result
print("\n------------------------------------")
print(
"TextSpamClassifier Metrics :\n",
text_model_metrics,
)
print(
"Metrics was calculated based on a test dataset of size ",
len(text_model_tested_ids),
)
print(
"Here's some of the user IDs in the test dataset.\n",
text_model_tested_ids[:10],
)
print(
"You can find the rest in /shared/curator/spam/text_classifier_metrics.json"
)
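
For reference, a minimal sketch of driving the renamed command flags programmatically rather than from the shell, assuming a configured Django environment; the flag names come from add_arguments above, everything else is illustrative:

# Sketch only: requires Django settings to be configured and the curator app installed.
from django.core.management import call_command

# Equivalent to `manage.py curator_spam_detection --fit`: fit both classifiers and
# write the model instance and metrics files.
call_command("curator_spam_detection", fit=True)

# Equivalent to `manage.py curator_spam_detection --predict`: predict on every user
# in the DB and print spam user_ids plus model metrics.
call_command("curator_spam_detection", predict=True)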