Fixes to allow running keras and crf_baseline models on Python 3.7 #7

Open · wants to merge 11 commits into base: master
15 changes: 10 additions & 5 deletions crf_baseline/README.md
@@ -22,8 +22,13 @@ The data is expected to be in a *dataset* folder, in the main repository directory
* [main_threeTasks](main_threeTasks.py) python script to train one CRF model for each task.
* [validation](validation.py) python script to compute classification score on validation dataset for the three tasks.

## Dependencies
* Numpy: 1.13.3
* Sklearn : 0.19.1
* [Sklearn crfsuite](https://sklearn-crfsuite.readthedocs.io/en/latest/index.html) Sklearn crfsuite : 0.3.6
* Python 3.5
## Dependencies

* Python 3.7

See [requirements.txt](./requirements.txt) for a complete list of dependencies.

For setup, first create a new virtual environment using your favoured method, then install dependencies with:

```
pip3 install -r requirements.txt
```
77 changes: 69 additions & 8 deletions crf_baseline/code/utils.py
@@ -12,16 +12,42 @@ def closePrintToFile(f, stdout_original):
sys.stdout = stdout_original
f.close()

def load_data(filepath):
"""
Load and return the data stored in the given path.
The data is structured as follows:
Each line contains four columns separated by a single space.
Each word has been put on a separate line and there is an empty line after each sentence.
The first item on each line is a word, the second, third and fourth are tags related to the word.
Example:
The sentence "L. Antonielli, Iprefetti dell' Italia napoleonica, Bologna 1983." is represented in the dataset as:
L author b-secondary b-r
. author i-secondary i-r
Antonielli author i-secondary i-r
, author i-secondary i-r
Iprefetti title i-secondary i-r
dell title i-secondary i-r
’ title i-secondary i-r
Italia title i-secondary i-r
napoleonica title i-secondary i-r
, title i-secondary i-r
Bologna publicationplace i-secondary i-r
1983 year e-secondary i-r
. year e-secondary e-r

def load_data(file):
:param filepath: Path to the data
:return: Four arrays: the first contains sentences (one array of words per sentence) and the other three are arrays of tags.

"""

# Arrays to return
words = []
tags_1 = []
tags_2 = []
tags_3 = []
tags_4 = []

word = tags1 = tags2 = tags3 = tags4 = []
with open (file, "r") as file:
word, tags1, tags2, tags3 = [], [], [], []  # separate lists (chained assignment would alias them)
with open (filepath, "r") as file:
for line in file:
if 'DOCSTART' not in line: #Do not take the first line into consideration
# Check if empty line
@@ -31,14 +57,12 @@ def load_data(file):
tags_1.append(tags1)
tags_2.append(tags2)
tags_3.append(tags3)
tags_4.append(tags4)

# Reset
word = []
tags1 = []
tags2 = []
tags3 = []
tags4 = []

else:
# Split the line into words, tag #1, tag #2, tag #3
@@ -47,6 +71,43 @@
tags1.append(w[1])
tags2.append(w[2])
tags3.append(w[3])
tags4.append(w[4])

return words,tags_1,tags_2,tags_3,tags_4
return words,tags_1,tags_2,tags_3

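As a quick sanity check on the updated loader, something like the following should work (a minimal sketch; `dataset/train.txt` is a hypothetical path, not one fixed by this PR):

```
# Sketch: exercise the three-tag loader and check array alignment.
# The path is hypothetical; point it at your own dataset file.
words, tags_1, tags_2, tags_3 = load_data("dataset/train.txt")

assert len(words) == len(tags_1) == len(tags_2) == len(tags_3)
print(len(words), "sentences loaded")
print(words[0][:5], tags_1[0][:5])  # first tokens of the first sentence and their tags
```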
2 changes: 1 addition & 1 deletion crf_baseline/main_finetune.py
@@ -12,8 +12,8 @@
import sklearn_crfsuite
from sklearn_crfsuite import scorers, metrics
from sklearn.metrics import make_scorer, confusion_matrix
from sklearn.externals import joblib
from sklearn.model_selection import RandomizedSearchCV
import joblib

# For model validation
import scipy
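The import swap above assumes scikit-learn 0.21+, where `sklearn.externals.joblib` is deprecated (it was removed entirely in 0.23). For code that must still run on older releases, a guarded import is a common pattern — a sketch, not part of this PR:

```
# Compatibility sketch: prefer the standalone joblib package and
# fall back to the copy vendored inside older scikit-learn releases.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib  # removed in scikit-learn 0.23
```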
2 changes: 1 addition & 1 deletion crf_baseline/main_threeTasks.py
@@ -9,7 +9,7 @@
import sklearn_crfsuite
from sklearn_crfsuite import scorers, metrics
from sklearn.metrics import make_scorer, confusion_matrix
from sklearn.externals import joblib
import joblib


# Utils functions
Empty file added crf_baseline/models/.gitkeep
Empty file.
15 changes: 15 additions & 0 deletions crf_baseline/requirements.txt
@@ -0,0 +1,15 @@
cycler==0.10.0
joblib==0.13.2
kiwisolver==1.1.0
matplotlib==3.1.1
numpy==1.17.0
pyparsing==2.4.2
python-crfsuite==0.9.6
python-dateutil==2.8.0
scikit-learn==0.21.3
scipy==1.3.1
six==1.12.0
sklearn==0.0
sklearn-crfsuite==0.3.6
tabulate==0.8.3
tqdm==4.33.0
Empty file added crf_baseline/results/.gitkeep
Empty file.
18 changes: 11 additions & 7 deletions keras/README.md
@@ -18,10 +18,14 @@ The results will be stored into the *model_results* folder, with one directory c
* [main_threeTasks](main_threeTasks.py) python script to train one NN model for each task.

## Dependencies
* Keras : version 2.1.1
* TensorFlow: 1.4.0
* Numpy: 1.13.3
* [Keras contrib](https://github.com/keras-team/keras-contrib) Keras contrib : 0.0.2
* Sklearn : 0.19.1
* [Sklearn crfsuite](https://sklearn-crfsuite.readthedocs.io/en/latest/index.html) Sklearn crfsuite : 0.3.6
* Python 3.5

* Python 3.7

See [requirements.txt](./requirements.txt) for a complete list of dependencies.

For setup, first create a new virtual environment using your favoured method, then install dependencies with:

```
pip3 install -r requirements.txt

```
14 changes: 7 additions & 7 deletions keras/code/utils.py
@@ -378,8 +378,8 @@ def on_train_begin(self, logs={}):
self.params['metrics'].append("val_f1")

# In case of multiple outputs
if len(self.model.output_layers) > 1:
for output_layer in self.model.output_layers:
if len(self.model.layers) > 1:
for output_layer in self.model.layers:
self.params['metrics'].append("val_"+output_layer.name+"_f1")


@@ -403,8 +403,8 @@ def compute_epoch_training_F1(self):
"""
Compute and save the F1 score for the training data
"""
in_length = len(self.model.input_layers)
out_length = len(self.model.output_layers)
in_length = len(self.model._input_layers)
out_length = len(self.model.layers)
predictions = self.model.predict(self.train_data[0])
if len(predictions) != out_length:
predictions = [predictions]
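The `len(predictions) != out_length` guard compensates for `predict` returning a bare array for single-output models but a list of arrays for multi-output ones. A minimal sketch of that normalization (assuming tf.keras semantics; `model` and `x` are placeholders):

```
# Sketch: normalize predict() output to one array per output head.
predictions = model.predict(x)
if not isinstance(predictions, list):  # single-output model returns a bare array
    predictions = [predictions]        # wrap so downstream code can iterate heads
```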
@@ -464,8 +464,8 @@ def on_epoch_end(self, epoch, logs={}):
Same model's weights for the best epoch.
"""
self.compute_epoch_training_F1()
in_length = len(self.model.input_layers) # X data - to predict from
out_length = len(self.model.output_layers) # Number of tasks
in_length = len(self.model._input_layers) # X data - to predict from
out_length = len(self.model.layers) # Number of tasks

# Compute the model predictions
predictions = self.model.predict(self.validation_data[:in_length])
@@ -493,7 +493,7 @@ def on_epoch_end(self, epoch, logs={}):
vals_f1.append(_val_f1)

# Add the F1 score to the logs
f1_name = "val_"+self.model.output_layers[i].name+"_f1"
f1_name = "val_"+self.model.layers[i].name+"_f1"
logs[f1_name] = _val_f1


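One caveat on the replacement: `model.layers` counts every layer in the graph, not only output heads, so treating it as the number of tasks only holds for models whose final layers are all outputs. On recent Keras/TensorFlow versions, the public way to count inputs and outputs is `model.inputs` / `model.outputs` — a sketch of that alternative, not what this PR does:

```
# Sketch: count inputs and outputs through the public tf.keras API.
in_length = len(model.inputs)    # number of input tensors
out_length = len(model.outputs)  # number of output heads (tasks)
```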
36 changes: 36 additions & 0 deletions keras/requirements.txt
@@ -0,0 +1,36 @@
absl-py==0.7.1
astor==0.8.0
bleach==1.5.0
cycler==0.10.0
gast==0.2.2
google-pasta==0.1.7
grpcio==1.22.0
h5py==2.9.0
html5lib==0.9999999
joblib==0.13.2
Keras==2.2.4
Keras-Applications==1.0.8
keras-contrib==2.0.8
Keras-Preprocessing==1.1.0
kiwisolver==1.1.0
Markdown==3.1.1
matplotlib==3.1.1
numpy==1.17.0
protobuf==3.9.1
pyparsing==2.4.2
python-crfsuite==0.9.6
python-dateutil==2.8.0
PyYAML==5.1.2
scikit-learn==0.21.3
scipy==1.3.0
six==1.12.0
sklearn-crfsuite==0.3.6
tabulate==0.8.3
tensorboard==1.14.0
tensorflow==1.14.0
tensorflow-estimator==1.14.0
tensorflow-tensorboard==1.5.1
termcolor==1.1.0
tqdm==4.32.2
Werkzeug==0.15.5
wrapt==1.11.2