
Fix running issues. #6

Open: wants to merge 14 commits into master
8 changes: 5 additions & 3 deletions DataReader/wiki_data_reader.py
@@ -23,7 +23,8 @@ def get_collection(self, ):
temp_address = root+"/"+directory+"/"
if not os.path.isdir(temp_address):
continue
temp_address = temp_address +"/Articles"
if os.path.exists(temp_address + "/Articles"):
@norbertstrzelecki commented on Sep 10, 2021:


looks like this `/` might be one too many here (or the one at the end of the `temp_address = root+"/"+directory+"/"` line), since together they produce `temp_address` ending in `//Articles`, and there is an error when running `python3 tangent_cft_front_end.py -ds "/NTCIR12_MathIR_WikiCorpus_v2.1.0/MathTagArticles" ...`
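A side note on the comment above: building paths by string concatenation is what makes the duplicated separator possible in the first place. A possible alternative, sketched here as a suggestion rather than part of this PR, is `os.path.join`, which adds separators only where needed:

```python
import os

root = "/NTCIR12_MathIR_WikiCorpus_v2.1.0/MathTagArticles"
directory = "wpmath0000001"  # hypothetical article sub-directory name

# os.path.join never doubles separators, so no "//Articles" can appear:
articles_dir = os.path.join(root, directory, "Articles")
print(articles_dir)
# -> /NTCIR12_MathIR_WikiCorpus_v2.1.0/MathTagArticles/wpmath0000001/Articles
```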

temp_address = temp_address + "/Articles"
for filename in os.listdir(temp_address):
file_path = temp_address + '/' + filename
parts = filename.split('/')
@@ -38,9 +39,10 @@ def get_collection(self, ):
for key in formulas:
tuples = formulas[key].get_pairs(window=2, eob=True)
dictionary_formula_tuples[file_name + ":" + str(key)] = tuples
except:
except Exception as e:
except_count += 1
print(file_name)
print('Reader Exception:', e)
print(file_path)
return dictionary_formula_tuples

def get_query(self,):
12 changes: 9 additions & 3 deletions README.md
@@ -6,7 +6,13 @@ We introduce a new formula embedding model that we use with two hierarchical rep
The codebase is implemented in Python 3.6. Package versions used for development are listed in the [requirements.txt](https://github.com/BehroozMansouri/TangentCFT/blob/master/requirements.txt) file.

# Dataset
To evaluate our embedding model we used [NTCIR-12 dataset](https://www.cs.rit.edu/~rlaz/NTCIR-12_MathIR_Wikipedia_Corpus.zip), focusing on formula retrieval task. The collection contains over 590000 mathematical formulas from Wikipedia with 20 formula queries with their relevant formulas. For comparison with previous approaches we used bpref score to evaluate the top-1000 relevant formulas.
To evaluate our embedding model we used the [NTCIR-12 dataset](https://www.cs.rit.edu/~rlaz/NTCIR-12_MathIR_Wikipedia_Corpus.zip) (see the TestQueries directory for example topics), focusing on the formula retrieval task. The collection contains over 590,000 mathematical formulas from Wikipedia, along with 20 formula queries and their relevant formulas. For comparison with previous approaches, we used the bpref score to evaluate the top-1000 results.
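For readers unfamiliar with the metric: for a query with R judged relevant and N judged non-relevant documents, bpref is commonly defined (Buckley and Voorhees, 2004) as

```
\mathrm{bpref} = \frac{1}{R} \sum_{r} \left( 1 - \frac{\lvert n \text{ ranked higher than } r \rvert}{\min(R, N)} \right)
```

where r ranges over the retrieved relevant documents and n over the judged non-relevant documents ranked above r.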

After downloading the NTCIR-12 dataset, extract all tarballs under the sub-directories. Example shell command:
```sh
for x in *.tar.bz2; do echo "$x"; tar xjf "$x"; done
```

One can also easily use any dataset, such as [Math Stack Exchange](https://math.stackexchange.com/), in the form of a CSV file of LaTeX formulas and formula ids (separated by the $$ sign) to train a new model.

@norbertstrzelecki commented on Sep 13, 2021:


It could be helpful to provide either a more specific link to the mentioned dataset or an example of such a .csv file.

The author replied:

What I mentioned refers to the link above: https://www.cs.rit.edu/~rlaz/NTCIR-12_MathIR_Wikipedia_Corpus.zip
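Since the expected CSV layout is not pinned down anywhere in the repository, here is a minimal sketch of a reader for such a file, assuming one formula per line with the formula id and the LaTeX string separated by the `$$` sign; the file name, the column order, and the helper itself are hypothetical:

```python
def read_formula_csv(path):
    """Parse a '$$'-separated file of formula ids and LaTeX strings.

    Assumed (hypothetical) line format: <formula_id>$$<latex_formula>
    """
    formulas = {}
    with open(path, encoding="utf-8") as handle:
        for line in handle:
            line = line.strip()
            if not line:
                continue
            formula_id, latex = line.split("$$", 1)
            formulas[formula_id] = latex
    return formulas

# Example usage with a made-up file name:
# formulas = read_formula_csv("mse_formulas.csv")
```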


# Running TangentCFT
@@ -28,15 +34,15 @@ vector_size,300
```
The next step is to decide whether to train a new model or load a previously trained model saved in the Saved_model directory. To train a new model, one can simply set the directory of NTCIR-12 (or another dataset) and the configuration file id. Here is an example that trains the model with configuration file id 1 and saves the vector representations in the directory specified in the configuration file:
```
python3 tangent_cft_front_end.py -cid 1 -ds '/NTCIR-12/MathTagArticles' --slt True -em 'encoder.csv'
python3 tangent_cft_front_end.py -cid 1 -ds '/NTCIR-12/MathTagArticles' --slt True -em 'encoder.csv' --mp 'slt_model'
```
The command above uses the configuration file with id 1 and the NTCIR-12 dataset to train a model based on the SLT representation, saving the encoding map in the encoder.csv file and the trained model in slt_model. To train and save a model based on the OPT representation instead, one can use the command:
```
python3 tangent_cft_front_end.py -cid 2 -ds '/NTCIR-12/MathTagArticles' --slt False -em 'encoder.csv' --mp 'opt_model'
```
With this command, a model is trained on the OPT representation of the NTCIR-12 dataset and the result is saved in opt_model. Finally, to load a saved model, one can use the following command:
```
python3 tangent_cft_front_end.py -cid 2 -ds '/NTCIR-12/MathTagArticles' --slt False -em 'encoder.csv' --mp 'opt_model' --t False --rf res_1
python3 tangent_cft_front_end.py -cid 2 -ds '/NTCIR-12/MathTagArticles' --slt False -em 'encoder.csv' --mp 'opt_model' --t False --qd TestQueries/ --rf res_1
```
With this command, training is disabled (--t False), the model is loaded from opt_model, the queries are read from the TestQueries directory, and the retrieval result is saved in the res_1 file in the Retrieval_Results directory.

1,651 changes: 1,568 additions & 83 deletions Retrieval_Results/judge.dat

Large diffs are not rendered by default.

Empty file.
2 changes: 2 additions & 0 deletions requirements.txt
@@ -2,3 +2,5 @@ gensim==3.4.0
matplotlib==3.1.0
numpy==1.17.2
torch==1.3.0
beautifulsoup4==4.9.3
lxml==4.6.3
6 changes: 3 additions & 3 deletions tangent_cft_back_end.py
@@ -60,7 +60,7 @@ def train_model(self, map_file_path, model_file_path=None,
embedding_type=TupleTokenizationMode.Both_Separated, ignore_full_relative_path=True,
tokenize_all=False,
tokenize_number=True):
self.module = TangentCFTModule()
self.module = TangentCFTModule(self.config)
dictionary_formula_tuples_collection = self.__encode_train_tuples(embedding_type, ignore_full_relative_path,
tokenize_all, tokenize_number)
print("training the fast text model...")
@@ -76,8 +76,8 @@ def load_model(self, map_file_path, model_file_path,
tokenize_all=False,
tokenize_number=True
):
self.module = TangentCFTModule(model_file_path)
self.__load_encoder_map(map_file_path)
self.module = TangentCFTModule(self.config, model_file_path)
self.__load_encoder_map("Saved_model/" + map_file_path)
dictionary_formula_tuples_collection = self.__encode_train_tuples(embedding_type, ignore_full_relative_path,
tokenize_all, tokenize_number)
self.__save_encoder_map(map_file_path)
34 changes: 24 additions & 10 deletions tangent_cft_front_end.py
@@ -2,31 +2,36 @@

from Embedding_Preprocessing.encoder_tuple_level import TupleTokenizationMode
from tangent_cft_back_end import TangentCFTBackEnd
from distutils import util


def strtobool(v):
return bool(util.strtobool(v))
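Context for this new helper: argparse's `type=bool` calls `bool()` on the raw string, and any non-empty string (including "False") is truthy, so `--t False` used to parse as `True`. `distutils.util.strtobool` interprets the string's meaning instead. A quick demonstration:

```python
from distutils import util  # note: distutils is deprecated since Python 3.10

print(bool("False"))                  # True  -- the old type=bool behavior
print(bool(util.strtobool("False")))  # False -- what the new helper returns
print(bool(util.strtobool("true")))   # True  (accepts y/yes/t/true/on/1, etc.)
```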


def main():
parser = argparse.ArgumentParser(description='Given the configuration file for training the Tangent_CFT '
'model, this function trains the model and then does the retrieval '
'on the NTCIR-12 formula retrieval task.')

parser.add_argument('--t', type=bool, help="Value True for training a new model and False for loading a model",
parser.add_argument('--t', type=strtobool, help="Value True for training a new model and False for loading a model",
default=True)
parser.add_argument('--r', type=bool, help="Value True to do the retrieval on NTCIR12 dataset",
parser.add_argument('--r', type=strtobool, help="Value True to do the retrieval on NTCIR12 dataset",
default=True)
parser.add_argument('-ds', type=str, help="File path of training data. If using NTCIR12 dataset, "
"it should be MathTagArticles directory. If using the MSE dataset, it"
"should be csv file of formula", required=True)
parser.add_argument('-cid', metavar='cid', type=int, help='Configuration file.', required=True)
parser.add_argument('--wiki', type=bool, help="Determines if the dataset is wiki or not.", default=True)
parser.add_argument('--slt', type=bool, help="Determines to use slt (True) or opt(False)", default=True)
parser.add_argument('--wiki', type=strtobool, help="Determines if the dataset is wiki or not.", default=True)
parser.add_argument('--slt', type=strtobool, help="Determines to use slt (True) or opt(False)", default=True)
parser.add_argument('-em', type=str, help="File path for encoder map.", required=True)
parser.add_argument('--mp', type=str, help="Model file path.", default=None)
parser.add_argument('--qd', type=str, help="NTCIR12 query directory.", default=None)
parser.add_argument('--rf', type=str, help="Retrieval result file path.", default="ret_res")
parser.add_argument('--ri', type=int, help="Run Id for Retrieval.", default=1)
parser.add_argument('--frp', type=bool, help="Determines to ignore full relative path", default=True)
parser.add_argument('--ta', type=bool, help="Determines to tokenize all", default=False)
parser.add_argument('--tn', type=bool, help="Determines to tokenize numbers", default=True)
parser.add_argument('--frp', type=strtobool, help="Determines to ignore full relative path", default=True)
parser.add_argument('--ta', type=strtobool, help="Determines to tokenize all", default=False)
parser.add_argument('--tn', type=strtobool, help="Determines to tokenize numbers", default=True)
parser.add_argument('--et', help='Embedding type; 1:Value, 2:Type, 3:Type and Value separated and'
' 4: Type and Value Not Separated, 2 for formula level', choices=range(1, 5),
default=3, type=int)
@@ -62,10 +67,14 @@ def main():
tokenize_number=tokenize_number
)
if do_retrieval:
retrieval_result = system.retrieval(dictionary_formula_tuples_collection)
retrieval_result = system.retrieval(dictionary_formula_tuples_collection,
embedding_type=embedding_type,
ignore_full_relative_path=ignore_full_relative_path,
tokenize_all=tokenize_all,
tokenize_number=tokenize_number
)
system.create_result_file(retrieval_result, "Retrieval_Results/" + res_file, run_id)
else:

dictionary_formula_tuples_collection = system.load_model(
map_file_path=map_file_path,
model_file_path=model_file_path,
@@ -74,7 +83,12 @@
tokenize_number=tokenize_number
)
if do_retrieval:
retrieval_result = system.retrieval(dictionary_formula_tuples_collection)
retrieval_result = system.retrieval(dictionary_formula_tuples_collection,
embedding_type=embedding_type,
ignore_full_relative_path=ignore_full_relative_path,
tokenize_all=tokenize_all,
tokenize_number=tokenize_number
)
system.create_result_file(retrieval_result, "Retrieval_Results/" + res_file, run_id)


22 changes: 14 additions & 8 deletions tangent_cft_module.py
@@ -12,14 +12,15 @@


class TangentCFTModule:
def __init__(self, model_file_path=None):
def __init__(self, config, model_file_path=None):
"""
Takes the configuration (and, optionally, a saved model file path). The configuration defines where the
tangent_fasttext formulas are (the tangent tuples encoded as chars to be fed to fastText). Both the query
and collection datasets are in the same location. The destination where the query vectors and all the
other Wikipedia formula vectors should be saved is also defined there, as are the hyper-parameter
settings for fastText.
"""
self.config = config
self.model = TangentCftModel()
if model_file_path is not None:
print("Loading the model")
@@ -38,8 +39,12 @@ def index_collection(self, dictionary_formula_lst_encoded_tuples):
index_formula_id = {}
idx = 0
for formula in dictionary_formula_lst_encoded_tuples:
numpy_lst.append(self.__get_vector_representation(dictionary_formula_lst_encoded_tuples[formula]))
index_formula_id[idx] = formula
encoded_tuples = dictionary_formula_lst_encoded_tuples[formula]
if len(encoded_tuples) > 0:
vector = self.__get_vector_representation(encoded_tuples)
numpy_lst.append(vector)
index_formula_id[idx] = formula
idx += 1
temp = numpy.concatenate(numpy_lst, axis=0)
tensor_values = Variable(torch.tensor(temp).double()).cuda()
return tensor_values, index_formula_id
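One caveat with this hunk (and the matching `.cuda()` call added to `formula_retrieval` below): both hard-require a CUDA device. A device-agnostic variant, sketched here as a suggestion with stand-in data rather than as part of the PR:

```python
import numpy
import torch

# Fall back to CPU when no CUDA device is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

temp = numpy.random.rand(5, 300)          # stand-in for the concatenated formula vectors
query_vector = numpy.random.rand(1, 300)  # stand-in for one query embedding

tensor_values = torch.tensor(temp).double().to(device)
query_vec = torch.from_numpy(query_vector).to(device)
print(tensor_values.device, query_vec.device)
```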
@@ -49,17 +54,17 @@ def get_query_vector(self, lst_encoded_tuples):

@staticmethod
def formula_retrieval(collection_tensor, formula_index, query_vector):
query_vec = torch.from_numpy(query_vector)
query_vec = torch.from_numpy(query_vector).cuda()
dist = F.cosine_similarity(collection_tensor, query_vec)
index_sorted = torch.sort(dist, descending=True)[1]
top_1000 = index_sorted[:1000]
top_1000 = top_1000.data.cpu().numpy()
cos_values = torch.sort(dist, descending=True)[0][:1000].data.cpu().numpy()
result = {}
count = 1
count = 0
for x in top_1000:
doc_id = formula_index[x]
score = cos_values[count - 1]
score = cos_values[count]
result[doc_id] = score
count += 1
return result
@@ -85,5 +90,6 @@ def __get_vector_representation(self, lst_encoded_tuples):
temp_vector = temp_vector + self.model.get_vector_representation(encoded_tuple)
counter = counter + 1
except Exception as e:
logging.exception(e)
return (temp_vector / counter).reshape(1, self.vector_size)
pass

return (temp_vector / counter).reshape(1, self.config.vector_size)
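One edge case the rewritten method still leaves open: if every tuple fails to encode, `counter` stays 0 and the final division is undefined (a `ZeroDivisionError`, or a NaN/inf vector if `temp_vector` is a NumPy array). A possible guard around the return, as a sketch only:

```python
# Hypothetical guard (not in this PR): fail loudly instead of dividing by zero
# when not a single tuple of the formula could be encoded.
if counter == 0:
    raise ValueError("no tuple in this formula could be encoded")
return (temp_vector / counter).reshape(1, self.config.vector_size)
```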