Fix Sequence Labelling Models, fixes #178 (#180)
* Don't export Flux and Tracker

* Fix NER with new weights

* Update Checksum for NER Weights

* Minor fix in datadeps

* Fix POS DataDeps

* Fix Shasum for POS

* Update weights and checksum for POS

* Fix bugs and datadeps checksums

* Fixing 32-bit tests for NER

* Restrict Flux to 0.8.3 and NNlib to 0.6.0

* Switch to JSON for Sequence Model Dicts, addressing 32-bit test failures.

* Switch NER Dicts to JSON to address 32-bit failures

* Switch to JSON for POS model dicts and update checksum

* Set lower bound for BSON.jl version in Project.toml

* Remove Duplicates in Project.toml

* Versions for BSON.jl

* Fix version clashes in Manifest.toml

* Update CRF tests

* Update Manifest.toml

* Remove Manifest

* Update manifest

* Delete Manifest.toml
Ayushk4 authored Sep 4, 2020
1 parent ed8c771 commit 3ff0cb2
Showing 8 changed files with 82 additions and 29 deletions.
2 changes: 2 additions & 0 deletions Project.toml
@@ -16,6 +16,7 @@ JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
Languages = "8ef0a80b-9436-5d2c-a485-80b904378c43"
Libdl = "8f399da3-3557-5675-b5ff-fb832c97cbdb"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd"
Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
@@ -25,6 +26,7 @@ Tracker = "9f7883ad-71c0-57eb-9f7f-b5c9e6d3789c"
WordTokenizers = "796a5d58-b03d-544a-977e-18100b691f6e"

[compat]
BSON = ">= 0.2.5"
Flux = "< 0.10"
julia = "1"

2 changes: 1 addition & 1 deletion src/TextAnalysis.jl
@@ -66,7 +66,7 @@ module TextAnalysis

export CRF, viterbi_decode, crf_loss

export NERTagger, PoSTagger, Tracker, Flux
export NERTagger, PoSTagger

export Vocabulary, lookup, update
export everygram, padding_ngram
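
Since `Flux` and `Tracker` are no longer re-exported, downstream code that previously picked them up via `using TextAnalysis` must now load them itself. A minimal before/after sketch, assuming the usual no-argument tagger constructors:

```julia
using TextAnalysis
using Flux, Tracker   # must now be imported explicitly; no longer re-exported

ner = NERTagger()     # still exported by TextAnalysis
pos = PoSTagger()     # still exported by TextAnalysis
```
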
16 changes: 13 additions & 3 deletions src/sequence/ner.jl
@@ -1,4 +1,4 @@
using BSON, Tracker
using BSON, JSON

const NER_Char_UNK = '¿'
const NER_Word_UNK = "<UNK>"
@@ -11,8 +11,18 @@ load_model_dicts(filepath) = load_model_dicts(filepath, true)

function load_model_dicts(filepath, remove_tag_prefix)
labels = BSON.load(joinpath(filepath, "labels.bson"))[:labels]
chars_idx = BSON.load(joinpath(filepath, "char_to_embed_idx.bson"))[:get_char_index]
words_idx = BSON.load(joinpath(filepath, "word_to_embed_idx.bson"))[:get_word_index]

chars_idx_json = JSON.parsefile(joinpath(filepath, "char_to_embed_idx.json"),
dicttype = Dict{String, Int32},
inttype = Int32
)
# JSON keys are Strings and can't be converted to Char directly, but
# every key here has length 1, so take its first (only) character.
chars_idx = Dict(key[1] => chars_idx_json[key] for key in keys(chars_idx_json)) # Dict{Char, Int32}

words_idx = JSON.parsefile(joinpath(filepath, "word_to_embed_idx.json"),
dicttype = Dict{String, Int32},
inttype = Int32
)

remove_tag_prefix || return [labels...], chars_idx, words_idx

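
The BSON-to-JSON switch is what fixes the 32-bit failures: `JSON.parsefile` can pin the integer type via `inttype`, so the embedding indices come back as `Int32` on every platform. A small round-trip sketch of the `Char`-keyed dict, using a hypothetical file name and mirroring `load_model_dicts`:

```julia
using JSON

chars_idx = Dict('a' => Int32(1), 'b' => Int32(2))

# JSON object keys must be Strings, so each Char key is written out
# as a length-1 String.
open("char_to_embed_idx.json", "w") do io
    JSON.print(io, Dict(string(k) => v for (k, v) in chars_idx))
end

# Pin both the Dict type and the integer type so the parse is
# identical on 32-bit and 64-bit Julia.
parsed = JSON.parsefile("char_to_embed_idx.json",
                        dicttype = Dict{String, Int32},
                        inttype = Int32)

# Recover the Char keys: every String key has length 1.
recovered = Dict(key[1] => parsed[key] for key in keys(parsed))
@assert recovered == chars_idx
```
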
10 changes: 5 additions & 5 deletions src/sequence/ner_datadeps.jl
@@ -3,11 +3,11 @@ function ner_datadep_register()
"""
The weights for NER Sequence Labelling Model.
""",
"https://github.com/JuliaText/TextAnalysis.jl/releases/download/v0.6.0/ner_weights.tar.xz",
"6290353b66c9bdbb794ddcb6063ab52c30145d3918f2f115f19e21fa994282e6",
"https://github.com/Ayushk4/NER.jl/releases/download/0.0.0.1/ner_weights.tar.xz",
"6eda5cd778af99f57a0a0b7eb4d5bc46a5a61c214e3e515e620b7db6b76ce3aa",
post_fetch_method = function(fn)
unpack(fn)
dir = "weights"
dir = "ner_weights"
innerfiles = readdir(dir)
mv.(joinpath.(dir, innerfiles), innerfiles)
rm(dir)
@@ -18,8 +18,8 @@ function ner_datadep_register()
"""
The character and words dict for NER Sequence Labelling Model.
""",
"https://github.com/JuliaText/TextAnalysis.jl/releases/download/v0.6.0/ner_dicts.tar.xz",
"40cfa37da216b990eb9c257aa7994e34d7a7a59d69b2506c6f39120f2688dc11",
"https://github.com/Ayushk4/NER.jl/releases/download/0.0.0.1/ner_dicts.tar.xz",
"49619d793a5974dd41859e68d73eae68e58f8b264d49ba98489ab6ed74bf5f86",
post_fetch_method = function(fn)
unpack(fn)
dir = "model_dicts"
10 changes: 5 additions & 5 deletions src/sequence/pos_datadeps.jl
@@ -3,11 +3,11 @@ function pos_datadep_register()
"""
The weights for POS Sequence Labelling Model.
""",
"https://github.com/JuliaText/TextAnalysis.jl/releases/download/v0.6.0/pos_weights.tar.xz",
"b02e891ea913be6834ff67d6ecf2ddae6754d55509bb3d9c078dbfc7eed27988";
"https://github.com/Ayushk4/POS.jl/releases/download/v0.0.1/pos_weights.tar.xz",
"74759f446aeaec3f46ba44de1d82c2324f26c8f1f65790187067973d3aefc054";
post_fetch_method = function(fn)
unpack(fn)
dir = "weights"
dir = "pos_weights"
innerfiles = readdir(dir)
mv.(joinpath.(dir, innerfiles), innerfiles)
rm(dir)
@@ -18,8 +18,8 @@ function pos_datadep_register()
"""
The character and words dict for POS Sequence Labelling Model.
""",
"https://github.com/JuliaText/TextAnalysis.jl/releases/download/v0.6.0/pos_model_dicts.tar.xz",
"4d7fe8238ff0cfb92d195dfa745b4ed08f916d4707e3dbe27a1b3144c9282f41";
"https://github.com/Ayushk4/POS.jl/releases/download/v0.0.1/pos_model_dicts.tar.xz",
"8c79089a4aecd09444143b833da49e7a4529612f5447e607dc77aa45968b3858";
post_fetch_method = function(fn)
unpack(fn)
dir = "model_dicts"
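
Both registrations follow the same DataDeps.jl pattern: declare a `DataDep` with the tarball URL and its SHA-256, unpack on fetch, then flatten the archive's top-level directory into the datadep root. A sketch of that pattern with a hypothetical name, URL, and placeholder checksum:

```julia
using DataDeps

register(DataDep(
    "Example Sequence Weights",                    # hypothetical name
    "Weights for an example sequence labelling model.",
    "https://example.com/example_weights.tar.xz",  # hypothetical URL
    "0000000000000000000000000000000000000000000000000000000000000000";  # placeholder SHA-256
    post_fetch_method = function(fn)
        unpack(fn)                                   # extract the tar.xz
        dir = "example_weights"                      # directory inside the archive
        innerfiles = readdir(dir)
        mv.(joinpath.(dir, innerfiles), innerfiles)  # flatten into the datadep root
        rm(dir)
    end
))

# The dependency is fetched and verified lazily, on first use:
weights_path = datadep"Example Sequence Weights"
```
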
59 changes: 50 additions & 9 deletions src/sequence/sequence_models.jl
@@ -1,8 +1,8 @@
using BSON, Tracker
mutable struct BiLSTM_CNN_CRF_Model{C, W, L, D, O, A}
labels::Array{String, 1} # List of Labels
chars_idx::Dict{Char, Int64} # Dict that maps chars to indices in W_Char_Embed
words_idx::Dict{String, Int64} # Dict that maps words to indices in W_word_Embed
chars_idx#::Dict{Char, Integer} # Dict that maps chars to indices in W_Char_Embed
words_idx#::Dict{String, Integer} # Dict that maps words to indices in W_word_Embed
conv1::C # Convolution Layer over W_Char_Embed to give character representation
W_Char_Embed::W # Weights for character embeddings
W_word_Embed::W # Further trained GloVe Embeddings
@@ -32,13 +32,54 @@ function BiLSTM_CNN_CRF_Model(labels, chars_idx, words_idx, UNK_char_idx,UNK_Wor
init_α = fill(-10000, (n + 2, 1))
init_α[n + 1] = 0

W_word_Embed = BSON.load(joinpath(weights_path, "W_word_cpu.bson"))[:W_word_cpu].data
W_Char_Embed = BSON.load(joinpath(weights_path, "W_char_cpu.bson"))[:W_char_cpu].data
forward_lstm = BSON.load(joinpath(weights_path, "forward_lstm.bson"))[:forward_lstm_cpu]
backward = BSON.load(joinpath(weights_path, "backward_lstm.bson"))[:backward_lstm_cpu]
d_out = BSON.load(joinpath(weights_path, "d_cpu.bson"))[:d_cpu]
c = BSON.load(joinpath(weights_path, "crf.bson"))[:crf_cpu]
conv1 = BSON.load(joinpath(weights_path, "conv_cpu.bson"))[:conv_cpu]
# Word and Character Embeddings.
W_word_Embed = BSON.load(joinpath(weights_path, "W_word_cpu.bson"))[:W_word_cpu]
W_Char_Embed = BSON.load(joinpath(weights_path, "W_char_cpu.bson"))[:W_char_cpu]

# Forward_LSTM
forward_wts = BSON.load(joinpath(weights_path, "forward_lstm.bson"))
forward_lstm = Flux.Recur(Flux.LSTMCell(forward_wts[:lstm_2], # Wi
forward_wts[:lstm_1], # Wh
forward_wts[:lstm_3], # b
forward_wts[:lstm_4], # h
forward_wts[:lstm_5] # c
),
forward_wts[:lstm_init],
forward_wts[:lstm_state]
)

# Backward_LSTM
backward_wts = BSON.load(joinpath(weights_path, "backward_lstm.bson"))
backward = Flux.Recur(Flux.LSTMCell(backward_wts[:lstm_2], # Wi
backward_wts[:lstm_1], # Wh
backward_wts[:lstm_3], # b
backward_wts[:lstm_4], # h
backward_wts[:lstm_5] # c
),
backward_wts[:lstm_init],
backward_wts[:lstm_state]
)

# Dense
d_weights_bias = BSON.load(joinpath(weights_path, "d_cpu.bson"))
d_out = Flux.Dense(d_weights_bias[:d_weight],
d_weights_bias[:d_bias],
Flux.identity
)

# Load CRF.
crf_wt = BSON.load(joinpath(weights_path, "crf_cpu.bson"))[:crf_Weights]
c = TextAnalysis.CRF(crf_wt, size(crf_wt)[1] - 2)

# Load Conv
conv_wt_bias = BSON.load(joinpath(weights_path, "conv_cpu.bson"))
conv1 = Flux.Conv(Flux.identity, # activation
conv_wt_bias[:conv_weight], # weights
conv_wt_bias[:conv_bias], # bias
(1, 1), # stride
(0, 2), # pad
(1, 1), # dilation
)

BiLSTM_CNN_CRF_Model(labels, chars_idx, words_idx, conv1, W_Char_Embed, W_word_Embed,
forward_lstm, backward, d_out, c, init_α, UNK_Word_idx, UNK_char_idx)
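
The pattern here replaces BSON-serialized Flux layers, whose internal layout shifts between Flux/Tracker versions, with raw weight arrays that are reassembled through the plain struct constructors. A hypothetical save-side sketch for the dense layer, matching the `:d_weight`/`:d_bias` keys read above (Flux 0.8-era API):

```julia
using BSON, Flux, Tracker

# Save side (hypothetical): strip the Tracker wrappers and store plain
# arrays, so loading never depends on Flux/Tracker internals.
d = Flux.Dense(10, 5)
d_weight = Tracker.data(d.W)
d_bias   = Tracker.data(d.b)
BSON.@save "d_cpu.bson" d_weight d_bias

# Load side, as in the diff above: rebuild with the struct constructor.
wb = BSON.load("d_cpu.bson")
d_out = Flux.Dense(wb[:d_weight], wb[:d_bias], Flux.identity)
```
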
8 changes: 4 additions & 4 deletions test/crf.jl
@@ -1,5 +1,5 @@
using Flux
using Flux: gradient, LSTM, Dense, reset!, onehot
using Flux: gradient, LSTM, Dense, reset!, onehot, RNN
using TextAnalysis: score_sequence, forward_score

@testset "crf" begin
@@ -101,7 +101,7 @@ using TextAnalysis: score_sequence, forward_score

LSTM_STATE_SIZE = 5
d_out = Dense(LSTM_STATE_SIZE, num_labels + 2)
lstm = LSTM(num_features, LSTM_STATE_SIZE)
lstm = RNN(num_features, LSTM_STATE_SIZE)
m(x) = d_out.(lstm.(x))

c = CRF(num_labels)
@@ -127,8 +127,8 @@ using TextAnalysis: score_sequence, forward_score
reset!(lstm)
loss(d[1], d[2])
end

l1 = sum([find_loss(d) for d in data])
to_sum = [find_loss(d) for d in data]
l1 = sum(to_sum)
dense_param_1 = deepcopy(Tracker.data(d_out.W))
lstm_param_1 = deepcopy(Tracker.data(lstm.cell.Wh))
crf_param_1 = deepcopy(Tracker.data(c.W))
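
For context, a minimal sketch of the CRF API these tests exercise, following the shapes used above: per-token score vectors of length `num_labels + 2` to cover the START and STOP tags. The three-argument `viterbi_decode` and four-argument `crf_loss` forms are assumptions based on the package docs, not part of this diff:

```julia
using TextAnalysis
using Flux: onehot

num_labels = 3
c = CRF(num_labels)

# Initial log-potentials: only the START tag (index n + 1) is allowed
# at position zero, as in BiLSTM_CNN_CRF_Model.
init_α = fill(-10000, (num_labels + 2, 1))
init_α[num_labels + 1] = 0

# Four tokens of per-label scores plus their one-hot gold labels.
seq    = [rand(num_labels + 2) for _ in 1:4]
labels = [onehot(rand(1:num_labels), 1:num_labels + 2) for _ in 1:4]

l    = crf_loss(c, seq, labels, init_α)  # negative log-likelihood (assumed signature)
path = viterbi_decode(c, seq, init_α)    # most likely tag sequence (assumed signature)
```
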
4 changes: 2 additions & 2 deletions test/runtests.jl
@@ -6,10 +6,10 @@ using WordTokenizers

println("Running tests:")

include("pos.jl")
include("crf.jl")
include("ner.jl")
include("pos.jl")
include("coom.jl")
include("crf.jl")
include("tokenizer.jl")
include("ngramizer.jl")
include("document.jl")
