Fix Sequence Labelling Models, fixes #178 (#180)
* Don't export Flux and Tracker

* Fix NER with new weights

* Update Checksum for NER Weights

* Minor fix in datadeps

* Fix POS DataDeps

* Fix Shasum for POS

* Update weights and checksum for POS

* Fix bugs and datadeps checksums

* Fixing 32-bit tests for NER

* Restrict Flux to 0.8.3 and NNlib to 0.6.0

* Switch to JSON for Sequence Model Dicts, addressing 32-bit test failures.

* Switch NER Dicts to JSON to address 32-bit failures

* Switch to JSON for POS model dicts and update checksum

* Set lower bound for BSON.jl version in Project.toml

* Remove Duplicates in Project.toml

* Versions for BSON.jl

* Fix version clashes in Manifest.toml

* Update CRF tests

* Update Manifest.toml

* Remove Manifest

* Update manifest

* Delete Manifest.toml
Ayushk4 authored Sep 4, 2020
1 parent ed8c771 commit 3ff0cb2
Showing 8 changed files with 82 additions and 29 deletions.
2 changes: 2 additions & 0 deletions Project.toml
@@ -16,6 +16,7 @@ JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
Languages = "8ef0a80b-9436-5d2c-a485-80b904378c43"
Libdl = "8f399da3-3557-5675-b5ff-fb832c97cbdb"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd"
Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
@@ -25,6 +26,7 @@ Tracker = "9f7883ad-71c0-57eb-9f7f-b5c9e6d3789c"
WordTokenizers = "796a5d58-b03d-544a-977e-18100b691f6e"

[compat]
BSON = ">= 0.2.5"
Flux = "< 0.10"
julia = "1"

2 changes: 1 addition & 1 deletion src/TextAnalysis.jl
@@ -66,7 +66,7 @@ module TextAnalysis

export CRF, viterbi_decode, crf_loss

export NERTagger, PoSTagger, Tracker, Flux
export NERTagger, PoSTagger

export Vocabulary, lookup, update
export everygram, padding_ngram
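
Since `Flux` and `Tracker` are no longer re-exported, downstream code that previously picked them up via `using TextAnalysis` must now load them itself. A minimal before/after sketch, assuming the usual no-argument tagger constructors:

```julia
using TextAnalysis
using Flux, Tracker   # must now be imported explicitly; no longer re-exported

ner = NERTagger()     # still exported by TextAnalysis
pos = PoSTagger()     # still exported by TextAnalysis
```
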
16 changes: 13 additions & 3 deletions src/sequence/ner.jl
@@ -1,4 +1,4 @@
using BSON, Tracker
using BSON, JSON

const NER_Char_UNK = '¿'
const NER_Word_UNK = "<UNK>"
@@ -11,8 +11,18 @@ load_model_dicts(filepath) = load_model_dicts(filepath, true)

function load_model_dicts(filepath, remove_tag_prefix)
labels = BSON.load(joinpath(filepath, "labels.bson"))[:labels]
chars_idx = BSON.load(joinpath(filepath, "char_to_embed_idx.bson"))[:get_char_index]
words_idx = BSON.load(joinpath(filepath, "word_to_embed_idx.bson"))[:get_word_index]

chars_idx_json = JSON.parsefile(joinpath(filepath, "char_to_embed_idx.json"),
dicttype = Dict{String, Int32},
inttype = Int32
)
# JSON keys are Strings and can't be converted to Char directly, but
# every key here has length 1, so take its first (only) character.
chars_idx = Dict(key[1] => chars_idx_json[key] for key in keys(chars_idx_json)) # Dict{Char, Int32}

words_idx = JSON.parsefile(joinpath(filepath, "word_to_embed_idx.json"),
dicttype = Dict{String, Int32},
inttype = Int32
)

remove_tag_prefix || return [labels...], chars_idx, words_idx

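
The BSON-to-JSON switch is what fixes the 32-bit failures: `JSON.parsefile` can pin the integer type via `inttype`, so the embedding indices come back as `Int32` on every platform. A small round-trip sketch of the `Char`-keyed dict, using a hypothetical file name and mirroring `load_model_dicts`:

```julia
using JSON

chars_idx = Dict('a' => Int32(1), 'b' => Int32(2))

# JSON object keys must be Strings, so each Char key is written out
# as a length-1 String.
open("char_to_embed_idx.json", "w") do io
    JSON.print(io, Dict(string(k) => v for (k, v) in chars_idx))
end

# Pin both the Dict type and the integer type so the parse is
# identical on 32-bit and 64-bit Julia.
parsed = JSON.parsefile("char_to_embed_idx.json",
                        dicttype = Dict{String, Int32},
                        inttype = Int32)

# Recover the Char keys: every String key has length 1.
recovered = Dict(key[1] => parsed[key] for key in keys(parsed))
@assert recovered == chars_idx
```
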
10 changes: 5 additions & 5 deletions src/sequence/ner_datadeps.jl
@@ -3,11 +3,11 @@ function ner_datadep_register()
"""
The weights for NER Sequence Labelling Model.
""",
"https://github.com/JuliaText/TextAnalysis.jl/releases/download/v0.6.0/ner_weights.tar.xz",
"6290353b66c9bdbb794ddcb6063ab52c30145d3918f2f115f19e21fa994282e6",
"https://github.com/Ayushk4/NER.jl/releases/download/0.0.0.1/ner_weights.tar.xz",
"6eda5cd778af99f57a0a0b7eb4d5bc46a5a61c214e3e515e620b7db6b76ce3aa",
post_fetch_method = function(fn)
unpack(fn)
dir = "weights"
dir = "ner_weights"
innerfiles = readdir(dir)
mv.(joinpath.(dir, innerfiles), innerfiles)
rm(dir)
@@ -18,8 +18,8 @@ function ner_datadep_register()
"""
The character and words dict for NER Sequence Labelling Model.
""",
"https://github.com/JuliaText/TextAnalysis.jl/releases/download/v0.6.0/ner_dicts.tar.xz",
"40cfa37da216b990eb9c257aa7994e34d7a7a59d69b2506c6f39120f2688dc11",
"https://github.com/Ayushk4/NER.jl/releases/download/0.0.0.1/ner_dicts.tar.xz",
"49619d793a5974dd41859e68d73eae68e58f8b264d49ba98489ab6ed74bf5f86",
post_fetch_method = function(fn)
unpack(fn)
dir = "model_dicts"
10 changes: 5 additions & 5 deletions src/sequence/pos_datadeps.jl
@@ -3,11 +3,11 @@ function pos_datadep_register()
"""
The weights for POS Sequence Labelling Model.
""",
"https://github.com/JuliaText/TextAnalysis.jl/releases/download/v0.6.0/pos_weights.tar.xz",
"b02e891ea913be6834ff67d6ecf2ddae6754d55509bb3d9c078dbfc7eed27988";
"https://github.com/Ayushk4/POS.jl/releases/download/v0.0.1/pos_weights.tar.xz",
"74759f446aeaec3f46ba44de1d82c2324f26c8f1f65790187067973d3aefc054";
post_fetch_method = function(fn)
unpack(fn)
dir = "weights"
dir = "pos_weights"
innerfiles = readdir(dir)
mv.(joinpath.(dir, innerfiles), innerfiles)
rm(dir)
@@ -18,8 +18,8 @@ function pos_datadep_register()
"""
The character and words dict for POS Sequence Labelling Model.
""",
"https://github.com/JuliaText/TextAnalysis.jl/releases/download/v0.6.0/pos_model_dicts.tar.xz",
"4d7fe8238ff0cfb92d195dfa745b4ed08f916d4707e3dbe27a1b3144c9282f41";
"https://github.com/Ayushk4/POS.jl/releases/download/v0.0.1/pos_model_dicts.tar.xz",
"8c79089a4aecd09444143b833da49e7a4529612f5447e607dc77aa45968b3858";
post_fetch_method = function(fn)
unpack(fn)
dir = "model_dicts"
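
Both registrations follow the same DataDeps.jl pattern: declare a `DataDep` with the tarball URL and its SHA-256, unpack on fetch, then flatten the archive's top-level directory into the datadep root. A sketch of that pattern with a hypothetical name, URL, and placeholder checksum:

```julia
using DataDeps

register(DataDep(
    "Example Sequence Weights",                    # hypothetical name
    "Weights for an example sequence labelling model.",
    "https://example.com/example_weights.tar.xz",  # hypothetical URL
    "0000000000000000000000000000000000000000000000000000000000000000";  # placeholder SHA-256
    post_fetch_method = function(fn)
        unpack(fn)                                   # extract the tar.xz
        dir = "example_weights"                      # directory inside the archive
        innerfiles = readdir(dir)
        mv.(joinpath.(dir, innerfiles), innerfiles)  # flatten into the datadep root
        rm(dir)
    end
))

# The dependency is fetched and verified lazily, on first use:
weights_path = datadep"Example Sequence Weights"
```
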
59 changes: 50 additions & 9 deletions src/sequence/sequence_models.jl
@@ -1,8 +1,8 @@
using BSON, Tracker
mutable struct BiLSTM_CNN_CRF_Model{C, W, L, D, O, A}
labels::Array{String, 1} # List of Labels
chars_idx::Dict{Char, Int64} # Dict that maps chars to indices in W_Char_Embed
words_idx::Dict{String, Int64} # Dict that maps words to indices in W_word_Embed
chars_idx#::Dict{Char, Integer} # Dict that maps chars to indices in W_Char_Embed
words_idx#::Dict{String, Integer} # Dict that maps words to indices in W_word_Embed
conv1::C # Convolution Layer over W_Char_Embed to give character representation
W_Char_Embed::W # Weights for character embeddings
W_word_Embed::W # Further trained GloVe Embeddings
@@ -32,13 +32,54 @@ function BiLSTM_CNN_CRF_Model(labels, chars_idx, words_idx, UNK_char_idx,UNK_Wor
init_α = fill(-10000, (n + 2, 1))
init_α[n + 1] = 0

W_word_Embed = BSON.load(joinpath(weights_path, "W_word_cpu.bson"))[:W_word_cpu].data
W_Char_Embed = BSON.load(joinpath(weights_path, "W_char_cpu.bson"))[:W_char_cpu].data
forward_lstm = BSON.load(joinpath(weights_path, "forward_lstm.bson"))[:forward_lstm_cpu]
backward = BSON.load(joinpath(weights_path, "backward_lstm.bson"))[:backward_lstm_cpu]
d_out = BSON.load(joinpath(weights_path, "d_cpu.bson"))[:d_cpu]
c = BSON.load(joinpath(weights_path, "crf.bson"))[:crf_cpu]
conv1 = BSON.load(joinpath(weights_path, "conv_cpu.bson"))[:conv_cpu]
# Word and Character Embeddings.
W_word_Embed = BSON.load(joinpath(weights_path, "W_word_cpu.bson"))[:W_word_cpu]
W_Char_Embed = BSON.load(joinpath(weights_path, "W_char_cpu.bson"))[:W_char_cpu]

# Forward_LSTM
forward_wts = BSON.load(joinpath(weights_path, "forward_lstm.bson"))
forward_lstm = Flux.Recur(Flux.LSTMCell(forward_wts[:lstm_2], # Wi
forward_wts[:lstm_1], # Wh
forward_wts[:lstm_3], # b
forward_wts[:lstm_4], # h
forward_wts[:lstm_5] # c
),
forward_wts[:lstm_init],
forward_wts[:lstm_state]
)

# Backward_LSTM
backward_wts = BSON.load(joinpath(weights_path, "backward_lstm.bson"))
backward = Flux.Recur(Flux.LSTMCell(backward_wts[:lstm_2], # Wi
backward_wts[:lstm_1], # Wh
backward_wts[:lstm_3], # b
backward_wts[:lstm_4], # h
backward_wts[:lstm_5] # c
),
backward_wts[:lstm_init],
backward_wts[:lstm_state]
)

# Dense
d_weights_bias = BSON.load(joinpath(weights_path, "d_cpu.bson"))
d_out = Flux.Dense(d_weights_bias[:d_weight],
d_weights_bias[:d_bias],
Flux.identity
)

# Load CRF.
crf_wt = BSON.load(joinpath(weights_path, "crf_cpu.bson"))[:crf_Weights]
c = TextAnalysis.CRF(crf_wt, size(crf_wt)[1] - 2)

# Load Conv
conv_wt_bias = BSON.load(joinpath(weights_path, "conv_cpu.bson"))
conv1 = Flux.Conv(Flux.identity, # activation
conv_wt_bias[:conv_weight], # weights
conv_wt_bias[:conv_bias], # bias
(1, 1), # stride
(0, 2), # pad
(1, 1), # dilation
)

BiLSTM_CNN_CRF_Model(labels, chars_idx, words_idx, conv1, W_Char_Embed, W_word_Embed,
forward_lstm, backward, d_out, c, init_α, UNK_Word_idx, UNK_char_idx)
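
The pattern here replaces BSON-serialized Flux layers, whose internal layout shifts between Flux/Tracker versions, with raw weight arrays that are reassembled through the plain struct constructors. A hypothetical save-side sketch for the dense layer, matching the `:d_weight`/`:d_bias` keys read above (Flux 0.8-era API):

```julia
using BSON, Flux, Tracker

# Save side (hypothetical): strip the Tracker wrappers and store plain
# arrays, so loading never depends on Flux/Tracker internals.
d = Flux.Dense(10, 5)
d_weight = Tracker.data(d.W)
d_bias   = Tracker.data(d.b)
BSON.@save "d_cpu.bson" d_weight d_bias

# Load side, as in the diff above: rebuild with the struct constructor.
wb = BSON.load("d_cpu.bson")
d_out = Flux.Dense(wb[:d_weight], wb[:d_bias], Flux.identity)
```
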
8 changes: 4 additions & 4 deletions test/crf.jl
@@ -1,5 +1,5 @@
using Flux
using Flux: gradient, LSTM, Dense, reset!, onehot
using Flux: gradient, LSTM, Dense, reset!, onehot, RNN
using TextAnalysis: score_sequence, forward_score

@testset "crf" begin
@@ -101,7 +101,7 @@ using TextAnalysis: score_sequence, forward_score

LSTM_STATE_SIZE = 5
d_out = Dense(LSTM_STATE_SIZE, num_labels + 2)
lstm = LSTM(num_features, LSTM_STATE_SIZE)
lstm = RNN(num_features, LSTM_STATE_SIZE)
m(x) = d_out.(lstm.(x))

c = CRF(num_labels)
@@ -127,8 +127,8 @@ using TextAnalysis: score_sequence, forward_score
reset!(lstm)
loss(d[1], d[2])
end

l1 = sum([find_loss(d) for d in data])
to_sum = [find_loss(d) for d in data]
l1 = sum(to_sum)
dense_param_1 = deepcopy(Tracker.data(d_out.W))
lstm_param_1 = deepcopy(Tracker.data(lstm.cell.Wh))
crf_param_1 = deepcopy(Tracker.data(c.W))
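
For context, a minimal sketch of the CRF API these tests exercise, following the shapes used above: per-token score vectors of length `num_labels + 2` to cover the START and STOP tags. The three-argument `viterbi_decode` and four-argument `crf_loss` forms are assumptions based on the package docs, not part of this diff:

```julia
using TextAnalysis
using Flux: onehot

num_labels = 3
c = CRF(num_labels)

# Initial log-potentials: only the START tag (index n + 1) is allowed
# at position zero, as in BiLSTM_CNN_CRF_Model.
init_α = fill(-10000, (num_labels + 2, 1))
init_α[num_labels + 1] = 0

# Four tokens of per-label scores plus their one-hot gold labels.
seq    = [rand(num_labels + 2) for _ in 1:4]
labels = [onehot(rand(1:num_labels), 1:num_labels + 2) for _ in 1:4]

l    = crf_loss(c, seq, labels, init_α)  # negative log-likelihood (assumed signature)
path = viterbi_decode(c, seq, init_α)    # most likely tag sequence (assumed signature)
```
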
4 changes: 2 additions & 2 deletions test/runtests.jl
@@ -6,10 +6,10 @@ using WordTokenizers

println("Running tests:")

include("pos.jl")
include("crf.jl")
include("ner.jl")
include("pos.jl")
include("coom.jl")
include("crf.jl")
include("tokenizer.jl")
include("ngramizer.jl")
include("document.jl")
