Fixes to allow running keras and crf_baseline models on Python 3.7 #7

Open · wants to merge 11 commits into base: master
15 changes: 10 additions & 5 deletions crf_baseline/README.md
@@ -22,8 +22,13 @@ The data is expected to be in a *dataset* folder, in the main repository directory
* [main_threeTasks](main_threeTasks.py) python script to train one CRF model for each task.
* [validation](validation.py) python script to compute classification score on validation dataset for the three tasks.

## Dependencies
* Numpy: 1.13.3
* Sklearn : 0.19.1
* [Sklearn crfsuite](https://sklearn-crfsuite.readthedocs.io/en/latest/index.html) Sklearn crfsuite : 0.3.6
* Python 3.5
## Dependencies

* Python 3.7

See [requirements.txt](./requirements.txt) for a complete list of dependencies.

For setup, first create a new virtual environment using your favoured method, then install dependencies with:

```
pip3 install -r requirements.txt
```
77 changes: 69 additions & 8 deletions crf_baseline/code/utils.py
@@ -12,16 +12,42 @@ def closePrintToFile(f, stdout_original):
sys.stdout = stdout_original
f.close()

def load_data(filepath):
"""
Load and return the data stored in the given path.
The data is structured as follows:
Each line contains four columns separated by a single space.
Each word has been put on a separate line and there is an empty line after each sentence.
The first item on each line is a word, the second, third and fourth are tags related to the word.
Example:
The sentence "L. Antonielli, Iprefetti dell' Italia napoleonica, Bologna 1983." is represented in the dataset as:
L author b-secondary b-r
. author i-secondary i-r
Antonielli author i-secondary i-r
, author i-secondary i-r
Iprefetti title i-secondary i-r
dell title i-secondary i-r
’ title i-secondary i-r
Italia title i-secondary i-r
napoleonica title i-secondary i-r
, title i-secondary i-r
Bologna publicationplace i-secondary i-r
1983 year e-secondary i-r
. year e-secondary e-r

def load_data(file):
:param filepath: Path to the data
:return: Four arrays: the first contains sentences (one array of words per sentence) and the other three are arrays of tags.

"""

# Arrays to return
words = []
tags_1 = []
tags_2 = []
tags_3 = []
tags_4 = []

word = tags1 = tags2 = tags3 = tags4 = []
with open (file, "r") as file:
word, tags1, tags2, tags3 = [], [], [], []  # separate lists (chained assignment would alias them)
with open (filepath, "r") as file:
for line in file:
if 'DOCSTART' not in line: #Do not take the first line into consideration
# Check if empty line
@@ -31,14 +57,12 @@ def load_data(file):
tags_1.append(tags1)
tags_2.append(tags2)
tags_3.append(tags3)
tags_4.append(tags4)

# Reset
word = []
tags1 = []
tags2 = []
tags3 = []
tags4 = []

else:
# Split the line into words, tag #1, tag #2, tag #3
@@ -47,6 +71,43 @@
tags1.append(w[1])
tags2.append(w[2])
tags3.append(w[3])
tags4.append(w[4])

return words,tags_1,tags_2,tags_3,tags_4
return words,tags_1,tags_2,tags_3

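As a quick sanity check on the updated loader, something like the following should work (a minimal sketch; `dataset/train.txt` is a hypothetical path, not one fixed by this PR):

```
# Sketch: exercise the three-tag loader and check array alignment.
# The path is hypothetical; point it at your own dataset file.
words, tags_1, tags_2, tags_3 = load_data("dataset/train.txt")

assert len(words) == len(tags_1) == len(tags_2) == len(tags_3)
print(len(words), "sentences loaded")
print(words[0][:5], tags_1[0][:5])  # first tokens of the first sentence and their tags
```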
2 changes: 1 addition & 1 deletion crf_baseline/main_finetune.py
@@ -12,8 +12,8 @@
import sklearn_crfsuite
from sklearn_crfsuite import scorers, metrics
from sklearn.metrics import make_scorer, confusion_matrix
from sklearn.externals import joblib
from sklearn.model_selection import RandomizedSearchCV
import joblib

# For model validation
import scipy
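The import swap above assumes scikit-learn 0.21+, where `sklearn.externals.joblib` is deprecated (it was removed entirely in 0.23). For code that must still run on older releases, a guarded import is a common pattern — a sketch, not part of this PR:

```
# Compatibility sketch: prefer the standalone joblib package and
# fall back to the copy vendored inside older scikit-learn releases.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib  # removed in scikit-learn 0.23
```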
2 changes: 1 addition & 1 deletion crf_baseline/main_threeTasks.py
@@ -9,7 +9,7 @@
import sklearn_crfsuite
from sklearn_crfsuite import scorers, metrics
from sklearn.metrics import make_scorer, confusion_matrix
from sklearn.externals import joblib
import joblib


# Utils functions
Empty file added crf_baseline/models/.gitkeep
Empty file.
15 changes: 15 additions & 0 deletions crf_baseline/requirements.txt
@@ -0,0 +1,15 @@
cycler==0.10.0
joblib==0.13.2
kiwisolver==1.1.0
matplotlib==3.1.1
numpy==1.17.0
pyparsing==2.4.2
python-crfsuite==0.9.6
python-dateutil==2.8.0
scikit-learn==0.21.3
scipy==1.3.1
six==1.12.0
sklearn==0.0
sklearn-crfsuite==0.3.6
tabulate==0.8.3
tqdm==4.33.0
Empty file added crf_baseline/results/.gitkeep
Empty file.
18 changes: 11 additions & 7 deletions keras/README.md
@@ -18,10 +18,14 @@ The results will be stored into the *model_results* folder, with one directory c
* [main_threeTasks](main_threeTasks.py) python script to train one NN model for each task.

## Dependencies
* Keras : version 2.1.1
* TensorFlow: 1.4.0
* Numpy: 1.13.3
* [Keras contrib](https://github.com/keras-team/keras-contrib) Keras contrib : 0.0.2
* Sklearn : 0.19.1
* [Sklearn crfsuite](https://sklearn-crfsuite.readthedocs.io/en/latest/index.html) Sklearn crfsuite : 0.3.6
* Python 3.5

* Python 3.7

See [requirements.txt](./requirements.txt) for a complete list of dependencies.

For setup, first create a new virtual environment using your favoured method, then install dependencies with:

```
pip3 install -r requirements.txt

```
14 changes: 7 additions & 7 deletions keras/code/utils.py
@@ -378,8 +378,8 @@ def on_train_begin(self, logs={}):
self.params['metrics'].append("val_f1")

# In case of multiple outputs
if len(self.model.output_layers) > 1:
for output_layer in self.model.output_layers:
if len(self.model.layers) > 1:
for output_layer in self.model.layers:
self.params['metrics'].append("val_"+output_layer.name+"_f1")


@@ -403,8 +403,8 @@ def compute_epoch_training_F1(self):
"""
Compute and save the F1 score for the training data
"""
in_length = len(self.model.input_layers)
out_length = len(self.model.output_layers)
in_length = len(self.model._input_layers)
out_length = len(self.model.layers)
predictions = self.model.predict(self.train_data[0])
if len(predictions) != out_length:
predictions = [predictions]
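The `len(predictions) != out_length` guard compensates for `predict` returning a bare array for single-output models but a list of arrays for multi-output ones. A minimal sketch of that normalization (assuming tf.keras semantics; `model` and `x` are placeholders):

```
# Sketch: normalize predict() output to one array per output head.
predictions = model.predict(x)
if not isinstance(predictions, list):  # single-output model returns a bare array
    predictions = [predictions]        # wrap so downstream code can iterate heads
```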
@@ -464,8 +464,8 @@ def on_epoch_end(self, epoch, logs={}):
Same model's weights for the best epoch.
"""
self.compute_epoch_training_F1()
in_length = len(self.model.input_layers) # X data - to predict from
out_length = len(self.model.output_layers) # Number of tasks
in_length = len(self.model._input_layers) # X data - to predict from
out_length = len(self.model.layers) # Number of tasks

# Compute the model predictions
predictions = self.model.predict(self.validation_data[:in_length])
@@ -493,7 +493,7 @@ def on_epoch_end(self, epoch, logs={}):
vals_f1.append(_val_f1)

# Add the F1 score to the logs
f1_name = "val_"+self.model.output_layers[i].name+"_f1"
f1_name = "val_"+self.model.layers[i].name+"_f1"
logs[f1_name] = _val_f1


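One caveat on the replacement: `model.layers` counts every layer in the graph, not only output heads, so treating it as the number of tasks only holds for models whose final layers are all outputs. On recent Keras/TensorFlow versions, the public way to count inputs and outputs is `model.inputs` / `model.outputs` — a sketch of that alternative, not what this PR does:

```
# Sketch: count inputs and outputs through the public tf.keras API.
in_length = len(model.inputs)    # number of input tensors
out_length = len(model.outputs)  # number of output heads (tasks)
```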
36 changes: 36 additions & 0 deletions keras/requirements.txt
@@ -0,0 +1,36 @@
absl-py==0.7.1
astor==0.8.0
bleach==1.5.0
cycler==0.10.0
gast==0.2.2
google-pasta==0.1.7
grpcio==1.22.0
h5py==2.9.0
html5lib==0.9999999
joblib==0.13.2
Keras==2.2.4
Keras-Applications==1.0.8
keras-contrib==2.0.8
Keras-Preprocessing==1.1.0
kiwisolver==1.1.0
Markdown==3.1.1
matplotlib==3.1.1
numpy==1.17.0
protobuf==3.9.1
pyparsing==2.4.2
python-crfsuite==0.9.6
python-dateutil==2.8.0
PyYAML==5.1.2
scikit-learn==0.21.3
scipy==1.3.0
six==1.12.0
sklearn-crfsuite==0.3.6
tabulate==0.8.3
tensorboard==1.14.0
tensorflow==1.14.0
tensorflow-estimator==1.14.0
tensorflow-tensorboard==1.5.1
termcolor==1.1.0
tqdm==4.32.2
Werkzeug==0.15.5
wrapt==1.11.2