refactor: Cleaning and adding more comments for the functions related to the spam feature.

 - Adding headline comments for the functions.

 - Cleaning up the management command code and clarifying code responsibilities.

 - Improving execution messages.
Aiko committed Nov 21, 2023
1 parent 25f548e commit b99c8b0
Showing 5 changed files with 322 additions and 78 deletions.
83 changes: 42 additions & 41 deletions django/curator/management/commands/curator_spam_detection.py
@@ -21,62 +21,63 @@ def add_arguments(self, parser):
"-e",
action="store_true",
default=False,
help="returns spam user_ids and model metrics",
help="Print user_ids of spam users and the metrics of the models used to obtain the predictions.",
)
parser.add_argument(
"--get_model_metrics",
"-g",
"-m",
action="store_true",
default=False,
help="gets model accuracy, precision, recall and f1 scores",
help="Print the accuracy, precision, recall and f1 scores of the models used to obtain the predictions.",
)
parser.add_argument(
"--load_labels",
"-l",
action="store_true",
default=False,
help="save initial dataset to the DB. ",
help="Store manually annotated spam labels to the DB.",
)
parser.add_argument(
"--fit_usermeta_model", "-fu", action="store_true", default=False
)
parser.add_argument(
"--fit_text_model", "-ft", action="store_true", default=False
)
parser.add_argument(
"--predict_usermeta_model",
"-pu",
action="store_true",
default=False,
help="Print user_ids of all the evaluated users and spam users using the UserMetadata model",
)
parser.add_argument(
"--predict_text_model",
"-pt",
action="store_true",
default=False,
help="Print user_ids of all the evaluated users and spam users using the Text model",
)
parser.add_argument("--train_user", "-tu", action="store_true", default=False)
parser.add_argument("--predict_user", "-pu", action="store_true", default=False)
parser.add_argument("--train_text", "-tt", action="store_true", default=False)
parser.add_argument("--predict_text", "-pt", action="store_true", default=False)

def handle_exe(self):
result = self.detection.execute()
print("Spam Users :\n", result["spam_users"])
print(
"UserMetadataSpamClassifier Metrics :\n",
result["user_metadata_spam_classifier"],
)
print("TextSpamClassifier Metrics:\n", result["text_spam_classifier"])
self.detection.execute()

def handle_get_model_metrics(self):
metrics = self.detection.get_model_metrics()
print(
"UserMetadataSpamClassifier Metrics:\n",
metrics["user_metadata_spam_classifier"],
)
print("TextSpamClassifier Metrics:\n", metrics["text_spam_classifier"])
self.detection.get_model_metrics()

def handle_load_labels(self):
self.processor.load_labels_from_csv()

def handle_train_user(self):
model_metrics = self.user_meta_classifier.fit()
print("UserMetadataSpamClassifier Metrics:\n", model_metrics)
def handle_fit_usermeta_model(self):
self.detection.fit_usermeta_spam_classifier()

def handle_predict_user(self):
user_ids = self.user_meta_classifier.predict()
print("UserMetadataSpamClassifier predicted users:\n", user_ids)
def handle_fit_text_model(self):
self.detection.fit_text_spam_classifier()

def handle_train_text(self):
model_metrics = self.text_classifier.fit()
print("TextSpamClassifier Metrics:\n", model_metrics)
def handle_predict_usermeta_model(self):
self.detection.predict_usermeta_spam_classifier()

def handle_predict_text(self):
user_ids = self.text_classifier.predict()
print("TextSpamClassifier predicted users:\n", user_ids)
def handle_predict_text_model(self):
self.detection.predict_text_spam_classifier()

def handle(self, *args, **options):
if options["exe"]:
@@ -85,13 +86,13 @@ def handle(self, *args, **options):
action = "get_model_metrics"
elif options["load_labels"]:
action = "load_labels"
elif options["train_user"]:
action = "train_user"
elif options["predict_user"]:
action = "predict_user"
elif options["train_text"]:
action = "train_text"
elif options["predict_text"]:
action = "predict_text"
elif options["fit_usermeta_model"]:
action = "fit_usermeta_model"
elif options["fit_text_model"]:
action = "fit_text_model"
elif options["predict_usermeta_model"]:
action = "predict_usermeta_model"
elif options["predict_text_model"]:
action = "predict_text_model"

getattr(self, f"handle_{action}")()
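For orientation only (not part of the commit), here is a minimal sketch of how the renamed options might be exercised, assuming a configured Django environment for this project; the command name is inferred from the file path above, and Django's call_command() maps keyword arguments onto the argparse dest names defined in add_arguments():

# Hypothetical usage sketch; assumes the project's Django settings are configured.
from django.core.management import call_command

# Retrain the user-metadata classifier, then classify any unlabelled users with it.
call_command("curator_spam_detection", fit_usermeta_model=True)
call_command("curator_spam_detection", predict_usermeta_model=True)

# Equivalent shell invocations, mirroring the flags defined in add_arguments():
#   python manage.py curator_spam_detection --fit_usermeta_model
#   python manage.py curator_spam_detection --predict_usermeta_model

Because every flag is a store_true option and handle() dispatches through an if/elif chain, only the first matching option takes effect when several are passed.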
213 changes: 190 additions & 23 deletions django/curator/spam.py
@@ -18,28 +18,30 @@ class SpamDetector:

def __init__(self):
"""
SpamDetection Initialization Steps
This function makes sure that both models, UserMetadataSpamClassifier and TextSpamClassifier,
exist and saved in a saving directory for the later use. If the model instance files do not exist in
the directory, this function calls fit() to train the models and create the files.
SpamDetection Initialization Steps:
1. Initializes UserSpamStatusProcessor and the classifier classes
2. If no data has been labelled by a curator, load dataset.csv
3. If no model pickle file is found, call fit() of Classifier classes
- if all users have None in labelled_by_curator, load to DB by calling Pipeline.load_labels_from_csv()
- additionally, if there is no labels file, raise an exception
"""
self.processor = UserSpamStatusProcessor()
self.user_metadata_classifier = UserMetadataSpamClassifier()
self.usermeta_classifier = UserMetadataSpamClassifier()
self.text_classifier = TextSpamClassifier()
if not self.processor.labelled_by_curator_exist():
self.processor.load_labels_from_csv()

# Check whether UserMetadataSpamClassifier model file exists
if os.path.exists(self.user_metadata_classifier.MODEL_METRICS_FILE_PATH):
with open(
self.user_metadata_classifier.MODEL_METRICS_FILE_PATH
) as json_file:
self.user_meta_classifier_metrics = json.load(json_file)
if os.path.exists(self.usermeta_classifier.MODEL_METRICS_FILE_PATH):
with open(self.usermeta_classifier.MODEL_METRICS_FILE_PATH) as json_file:
self.usermeta_classifier_metrics = json.load(json_file)
else:
# If model metrics and instance file don't exist, call fit()
self.user_meta_classifier_metrics = self.user_meta_classifier.fit()
self.usermeta_classifier_metrics = self.usermeta_classifier.fit()

# Check whether TextSpamClassifier model file exists
if os.path.exists(self.text_classifier.MODEL_METRICS_FILE_PATH):
@@ -51,38 +53,203 @@ def __init__(self):

def execute(self):
"""
Execution Steps
1. Check if there exists user data that should be labelled by the classifier models
2. If there exists, class predict() of the classifier classes. This function will store the result in DB at the end.
3. Return the detection results stored in DB.
"""
A default function to obtain the list of spam users and the metrics of the models used to
obtain the predictions.
Execution Steps:
1. Check if there exists user data that should be labelled by the classifier models
2. If there are, call predict() of the classifier classes. This function will store the results in the DB at the end.
3. Print results
4. Return the detection results stored in DB.
"""
print("Executing spam dectection...")
# 1. Check DB for unlabelled users (None in all labelled_by_curator, labelled_by_user_classifier, and labelled_by_text_classifier)
if len(self.processor.get_unlabelled_users()) != 0:
# 2. if there are some unlabelled users, predict
self.user_metadata_classifier.predict()
print("Models are making predictions...")
self.usermeta_classifier.predict()
self.text_classifier.predict()
print("Successfully made predictions!")

# 3. Return spam user_ids and metrics of the model
return {
result = {
"spam_users": self.processor.get_spam_users(),
"user_metadata_spam_classifier": self.user_meta_classifier_metrics,
"usermeta_spam_classifier": self.usermeta_classifier_metrics,
"text_spam_classifier": self.text_classifier_metrics,
}

# 3. Print results
metadata_model_tested_ids = result["usermeta_spam_classifier"]["test_user_ids"]
result["usermeta_spam_classifier"].pop("test_user_ids")
metadata_model_metrics = result["usermeta_spam_classifier"]

text_model_tested_ids = result["text_spam_classifier"]["test_user_ids"]
result["text_spam_classifier"].pop("test_user_ids")
text_model_metrics = result["text_spam_classifier"]

print("IDs of Detected Spam User :\n", result["spam_users"])
print("\n------------------------------------\n")
print(
"UserMetadataSpamClassifier Metrics :\n",
metadata_model_metrics,
)
print(
"Metrics was calculated based on users with following IDs ....\n",
metadata_model_tested_ids,
)
print("\n------------------------------------\n")
print(
"TextSpamClassifier Metrics :\n",
text_model_metrics,
)
print(
"Metrics was calculated based on users with following IDs ....\n",
text_model_tested_ids,
)
# 4. Return spam user_ids and metrics of the model
return result

def get_model_metrics(self):
"""
return: a dictionary of the scores of existing machine learning model instances.
"""
A function that retrieves the model metrics used for spam detection. __init__() makes sure that
the models and the model metrics files exist; therefore, the role of this function is to
load the JSON metrics files as dictionaries and return them.
Execution Steps:
1. Load the model metrics files from the saving directory
2. Print the metrics
3. Return a dictionary of the scores of the existing machine learning model instances.
return:
{ "usermeta_spam_classifier": {"Accuracy": float,
"Precision": float,
"Recall": float,
"F1": float,
"test_user_ids": list of the user_ids of the
users used to calculate the metrics},
"text_spam_classifier": { same as above }
}
"""
print("Loading model metric files...")
# We can assume that model and model metrics files exist after __init__
with open(self.user_metadata_classifier.MODEL_METRICS_FILE_PATH) as json_file:
self.user_meta_classifier_metrics = json.load(json_file)
with open(self.usermeta_classifier.MODEL_METRICS_FILE_PATH) as json_file:
self.usermeta_classifier_metrics = json.load(json_file)

with open(self.text_classifier.MODEL_METRICS_FILE_PATH) as json_file:
self.text_classifier_metrics = json.load(json_file)

return {
"user_metadata_spam_classifier": self.user_meta_classifier_metrics,
print("Successfully loaded model metrics!")
result = {
"usermeta_spam_classifier": self.usermeta_classifier_metrics,
"text_spam_classifier": self.text_classifier_metrics,
}

# Print the model metrics
metadata_model_tested_ids = result["usermeta_spam_classifier"]["test_user_ids"]
result["usermeta_spam_classifier"].pop("test_user_ids")
metadata_model_metrics = result["usermeta_spam_classifier"]

text_model_tested_ids = result["text_spam_classifier"]["test_user_ids"]
result["text_spam_classifier"].pop("test_user_ids")
text_model_metrics = result["text_spam_classifier"]

print("\n------------------------------------\n")
print(
"UserMetadataSpamClassifier Metrics :\n",
metadata_model_metrics,
)
print(
"Metrics was calculated based on users with following IDs ....\n",
metadata_model_tested_ids,
)
print("\n------------------------------------\n")
print(
"TextSpamClassifier Metrics :\n",
text_model_metrics,
)
print(
"Metrics was calculated based on users with following IDs ....\n",
text_model_tested_ids,
)
return result

def fit_usermeta_spam_classifier(self):
"""
This function is a wrapper around the fit() function in UserMetadataSpamClassifier.
It prints the model metrics returned by fit().
Execution Steps:
1. Call fit() in UserMetadataSpamClassifier
2. Print the model metrics and the user_ids of the users used to calculate the scores.
"""
model_metrics = self.usermeta_classifier.fit()
metadata_model_tested_ids = model_metrics["test_user_ids"]
model_metrics.pop("test_user_ids")
metadata_model_metrics = model_metrics
print("\n------------------------------------\n")
print(
"UserMetadataSpamClassifier Metrics :\n",
metadata_model_metrics,
)
print(
"Metrics was calculated based on users with following IDs ....\n",
metadata_model_tested_ids,
)

def fit_text_spam_classifier(self):
"""
This function is a wrapper around the fit() function in TextSpamClassifier.
It prints the model metrics returned by fit().
Execution Steps:
1. Call fit() in TextSpamClassifier
2. Print the model metrics and the user_ids of the users used to calculate the scores.
"""
model_metrics = self.text_classifier.fit()
text_model_tested_ids = model_metrics["test_user_ids"]
model_metrics.pop("test_user_ids")
text_model_metrics = model_metrics
print("\n------------------------------------\n")
print(
"TextSpamClassifier Metrics :\n",
text_model_metrics,
)
print(
"Metrics was calculated based on users with following IDs ....\n",
text_model_tested_ids,
)

def predict_usermeta_spam_classifier(self):
"""
This function is a wrapper around the predict() function in UserMetadataSpamClassifier.
It prints the prediction results returned by predict().
Execution Steps:
1. Call predict() in UserMetadataSpamClassifier
2. Print the evaluated users and the users that were detected as spam.
"""
evaluated_user_ids, spam_user_ids = self.usermeta_classifier.predict()
if len(evaluated_user_ids) == 0:
print(
"Since all users were labelled by curator, classifier prediction was not executed.\n"
)
else:
print("UserMetadataSpamClassifier evaluated users:\n", evaluated_user_ids)
print("Spam Users :\n", spam_user_ids)

def predict_text_spam_classifier(self):
"""
This function is a wrapper around the predict() function in TextSpamClassifier.
It prints the prediction results returned by predict().
Execution Steps:
1. Call predict() in TextSpamClassifier
2. Print the evaluated users and the users that were detected as spam.
"""
evaluated_user_ids, spam_user_ids = self.text_classifier.predict()

if len(evaluated_user_ids) == 0:
print(
"Since all users were labelled by curator, classifier prediction was not executed.\n"
)
else:
print("TextSpamClassifier evaluated users:\n", evaluated_user_ids)
print("Spam Users :\n", spam_user_ids)