# ner_parse.py
# Tony DiPadova and Steven Jiang, May 2018
# Written in Python 3
try:
    print("Checking dependencies...")
    from gensim.models import KeyedVectors
    from nltk.corpus.reader.conll import ConllCorpusReader
    import nltk
    from nltk import conlltags2tree
    from langdetect import detect
    import re
    from math import floor
    import numpy as np
    import sys
    import pandas as pd
    import itertools
    import matplotlib.pyplot as plt
    from keras.layers.recurrent import LSTM
    from keras.models import Sequential, Model, load_model
    from keras.layers import Dense, Bidirectional, Flatten, Dropout, TimeDistributed
    from keras.wrappers.scikit_learn import KerasClassifier
    from keras.layers.normalization import BatchNormalization
    from keras.utils import to_categorical
    import keras.backend as K
    from keras_contrib.layers import CRF
    from keras_contrib.utils import save_load_utils
    from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score, train_test_split
    from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score
except Exception as e:
    raise Exception("You do not have the necessary dependencies installed. Try 'pip install -r requirements.txt' first.\n", e)
# global variables
max_length = 70
num_features = 300
classes = ['B-MISC', 'I-MISC', 'B-LOC', 'I-LOC', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'O']
num_classes = len(classes)
encoded_classes = range(num_classes)
class2idx = {classes[enc]: enc for enc in encoded_classes}
dropout = 0.1
recurrent_dropout = 0.3
hidden_nodes = 100
window_size = 70
nums_regex = re.compile(r'0+')
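# clean_sents: normalizes tokenized sentences before feature extraction.
# Sentences shorter than 5 words or longer than max_length are dropped; each
# remaining token is lowercased, digits are mapped to '0', and runs of zeros
# are collapsed to a single '0'.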
def clean_sents(sents, max_length):
    cleaned = []
    # keep only sentences between 5 and max_length words long
    for sent in sents:
        if len(sent) > 4 and len(sent) <= max_length:
            new_sent = []
            # clean the words
            for word in sent:
                this_word = word.lower()
                new_word = ''
                # keep letters, replace digits with 0, drop other characters
                for char in this_word:
                    if char.isalpha():
                        new_word = new_word + char
                    elif char.isdigit():
                        new_word = new_word + '0'
                # collapse runs of zeros into a single 0
                new_word = nums_regex.sub('0', new_word)
                new_sent.append(new_word)
            cleaned.append(new_sent)
    return cleaned
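# arr2label: converts the model's per-token class scores back into string
# labels by taking the argmax over the class dimension. The result is a flat
# list of labels across all sentences passed in.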
def arr2label(cats, labels):
    new_labels = []
    for i in range(len(cats)):
        for j in range(len(cats[i])):
            label = np.argmax(cats[i][j])
            label = labels[label]
            new_labels.append(label)
    return new_labels
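# tag2tree: builds a bracketed tree string from (word, IOB-tag) pairs and
# parses it with nltk.Tree.fromstring. For example, an input like
#   [('Angela', 'B-PER'), ('Merkel', 'I-PER'), ('visited', 'O'), ('Paris', 'B-LOC')]
# would produce a tree of the form (S (PER Angela Merkel) visited (LOC Paris)).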
def tag2tree(tags):
    string = '(S'
    for i in range(len(tags)):
        # beginning case
        if i == 0 and tags[i][1] != 'O':
            string += ' (' + tags[i][1][2:] + ' ' + tags[i][0]  # beginning is B-
            if i + 1 < len(tags) and tags[i+1][1] == 'O':
                string += ')'
        elif tags[i][1] == 'O':
            string += ' ' + tags[i][0]
        else:
            # middle cases
            if i < len(tags) - 1:
                if tags[i+1][1] == 'O' and tags[i][1][0] == 'I':
                    string += ' ' + tags[i][0] + ')'
                elif tags[i+1][1][0] == 'I' and tags[i][1][0] == 'I':
                    string += ' ' + tags[i][0]
                elif tags[i][1][0] == 'B' and tags[i+1][1] == 'O':
                    string += ' (' + tags[i][1][2:] + ' ' + tags[i][0] + ')'
                elif tags[i][1][0] == 'B' and tags[i+1][1][0] == 'I':
                    string += ' (' + tags[i][1][2:] + ' ' + tags[i][0]
            else:
                # end cases
                if tags[i][1][0] == 'B':
                    string += ' (' + tags[i][1][2:] + ' ' + tags[i][0] + ')'
                elif tags[i][1][0] == 'I':
                    string += ' ' + tags[i][0] + ')'
    string += ')'
    tree = nltk.Tree.fromstring(string)
    return tree
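# get_padded_sentence_features: maps each sentence to a (max_length, num_features)
# matrix of word vectors. Known words use their embedding, unknown words get a
# small random vector, and positions past the end of the sentence are zero-padded.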
def get_padded_sentence_features(sentences, num_features, max_length, wv):
    features = np.empty((0, max_length, num_features))
    for i in range(len(sentences)):
        sent = sentences[i]
        new_sent = []
        for j in range(max_length):
            if 0 <= j < len(sent):
                this_word = sent[j]
                if this_word in wv.vocab:
                    new_sent.append(wv.get_vector(this_word))
                elif this_word == '':
                    new_sent.append(np.zeros(num_features))
                else:
                    new_sent.append(np.random.uniform(-0.25, 0.25, num_features))  # random vector for unknown
            else:
                new_sent.append(np.zeros(num_features))
        feature_stack = np.dstack([[new_sent]])
        features = np.vstack([features, feature_stack])
    return features
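# create_model: a bidirectional LSTM over the word-vector sequence, a
# time-distributed dense layer, and a CRF output layer (from keras_contrib)
# that scores whole label sequences so the predicted tags follow IOB structure.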
def create_model(num_classes, num_features, hidden_nodes=100):
    model = Sequential()
    model.add(Bidirectional(
        LSTM(units=num_features, return_sequences=True, dropout=dropout, recurrent_dropout=recurrent_dropout),
        input_shape=(window_size, num_features,),
        merge_mode='concat'))
    model.add(TimeDistributed(Dense(hidden_nodes, activation='relu')))
    # add a CRF layer to enforce NER IOB rules
    crf = CRF(num_classes, sparse_target=False)
    model.add(crf)
    model.compile(optimizer='rmsprop', loss=crf.loss_function, metrics=[crf.accuracy])
    return model
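# load_resources: loads the multilingual word embeddings (the wiki.multi.*.vec.txt
# files under data/) for each supported language and restores the trained
# BiLSTM-CRF weights from models/full_train.h5.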
def load_resources():
    print("Loading embeddings...")
    embeddings = {
        'en': KeyedVectors.load_word2vec_format('data/wiki.multi.en.vec.txt', binary=False),
        'es': KeyedVectors.load_word2vec_format('data/wiki.multi.es.vec.txt', binary=False),
        'de': KeyedVectors.load_word2vec_format('data/wiki.multi.de.vec.txt', binary=False),
        'it': KeyedVectors.load_word2vec_format('data/wiki.multi.it.vec.txt', binary=False)
    }
    print("Loading model...")
    model = create_model(num_classes, num_features)
    save_load_utils.load_all_weights(model, 'models/full_train.h5', include_optimizer=False)
    return embeddings, model
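# parse: tokenizes a raw sentence, cleans and featurizes it, runs the model,
# strips the padding from the predictions, and returns an nltk.Tree of the
# named entities.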
def parse(sentence, model, wordvecs):
    sent = nltk.word_tokenize(sentence)  # tokenize the sentence
    if len(sent) > max_length:
        print("Sentence must be no more than", max_length, "words long.")
        return tag2tree([('', 'O')])
    elif len(sent) <= 4:
        print("Sentence must be at least 5 words long.")
        return tag2tree([('', 'O')])
    cleaned = clean_sents([sent], max_length)  # normalize the sentence
    X = get_padded_sentence_features(cleaned, num_features, max_length, wordvecs)  # process the sentence
    pred = model.predict(X)  # predict
    labels = arr2label(pred, classes)  # get labels
    labels = labels[:len(sent)]  # remove padding
    tags = [(sent[i], labels[i]) for i in range(len(sent))]  # get the tags
    tree = tag2tree(tags)  # convert to tree
    return tree
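# Interactive entry point: detect the language of each typed sentence with
# langdetect, pick the matching embeddings (falling back to English), and print
# the parsed entity tree until the user types 'q'.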
if __name__ == '__main__':
    print("Loading resources, this may take several minutes...")
    embeddings, model = load_resources()
    print("Type any sentence in English, German, Spanish, or Italian to parse it into a Named Entity Tree.")
    print("Type 'q' to quit.")
    ipt = input('Sentence: ')
    while ipt != 'q':
        try:
            language = detect(ipt)
        except Exception as e:
            print("Unable to detect language, defaulting to English")
            language = 'en'
        if language in embeddings:
            tree = parse(ipt, model, embeddings[language])
            tree.pprint()
            print()
        else:
            print("Detected language:", language)
            print("Defaulting to English")
            tree = parse(ipt, model, embeddings['en'])
            tree.pprint()
            print()
        ipt = input('Sentence: ')
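# A minimal sketch of using this module programmatically rather than through the
# interactive loop (assuming the embedding files and models/full_train.h5
# referenced above are present):
#
#   embeddings, model = load_resources()
#   tree = parse("Angela Merkel met the Italian delegation in Paris.", model, embeddings['en'])
#   tree.pprint()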