JuliaText · tejasvaidhyadev · Mar 8, 2020 · Mar 8, 2020 · Mar 8, 2020 · Mar 8, 2020
diff --git a/README.md b/README.md
@@ -40,3 +40,4 @@ Follow the links below for full docs on the usage of the corpora.
  - [IMDB movie reviews](docs/src/IMDB.md)
  - [Twitter sentiment dataset](docs/src/Twitter.md)
  - [Stanford Sentiment Treebank](docs/src/SST.md)
+ - [GMB](docs/src/GMB.md)
diff --git a/docs/make.jl b/docs/make.jl
@@ -17,7 +17,8 @@ makedocs(modules = [CorpusLoaders],
              "Twitter" => "Twitter.md",
              "WikiCorpus" => "WikiCorpus.md",
              "WikiGold" => "WikiGold.md",
-             "API References" => "APIReference.md"
+             "API References" => "APIReference.md",
+             "GMB" => "GMB.md"
         ])
 
 

diff --git a/docs/src/GMB.md b/docs/src/GMB.md
@@ -0,0 +1,58 @@
+# GMB
+The dataset is an extract from the GMB (Groningen Meaning Bank) corpus, which is tagged, annotated,
+and built specifically to train classifiers to predict named entities such as names, locations, etc.
+
+GMB is a fairly large corpus with a lot of annotations.
+Unfortunately, GMB is not perfect. It is not a gold-standard corpus, meaning that it is not completely human-annotated and is not considered 100% correct.
+The corpus was created using existing annotators and then corrected by humans where needed.
+
+
+```julia
+
+julia> corp = load(GMB())
+37789-element Array{Any,1}:
+ CorpusLoaders.NerOnlyTaggedWord[NerOnlyTaggedWord("NNS", "Families"), NerOnlyTaggedWord("IN", "of"), 
+NerOnlyTaggedWord("NNS", "soldiers"), NerOnlyTaggedWord("VBN", "killed"), NerOnlyTaggedWord("IN", "in"), 
+NerOnlyTaggedWord("DT", "the"), NerOnlyTaggedWord("NN", "conflict"), NerOnlyTaggedWord("VBD", "joined"), 
+NerOnlyTaggedWord("DT", "the"), NerOnlyTaggedWord("NNS", "protesters")  …  NerOnlyTaggedWord("CD", "One"), 
+NerOnlyTaggedWord("NN", "Terrorist"), NerOnlyTaggedWord("RQU", "\""), NerOnlyTaggedWord("CC", "and"), 
+NerOnlyTaggedWord("LQU", "\""), NerOnlyTaggedWord("VB", "Stop"), NerOnlyTaggedWord("DT", "the"), 
+NerOnlyTaggedWord("NNS", "Bombings"), NerOnlyTaggedWord(".", "."), NerOnlyTaggedWord("LQU", "\"")]
+
+ CorpusLoaders.NerOnlyTaggedWord[NerOnlyTaggedWord("PRP", "They"), NerOnlyTaggedWord("VBD", "marched"), 
+NerOnlyTaggedWord("IN", "from"), NerOnlyTaggedWord("DT", "the"), NerOnlyTaggedWord("NNS", "Houses"), 
+NerOnlyTaggedWord("IN", "of"), NerOnlyTaggedWord("NN", "Parliament"), NerOnlyTaggedWord("TO", "to"), 
+NerOnlyTaggedWord("DT", "a"), NerOnlyTaggedWord("NN", "rally"), NerOnlyTaggedWord("IN", "in"), 
+NerOnlyTaggedWord("NNP", "Hyde"), NerOnlyTaggedWord("NNP", "Park"), NerOnlyTaggedWord(".", ".")]
+
+ CorpusLoaders.NerOnlyTaggedWord[NerOnlyTaggedWord("NNS", "Police"), NerOnlyTaggedWord("VBD", "put"), 
+NerOnlyTaggedWord("DT", "the"), NerOnlyTaggedWord("NN", "number"), NerOnlyTaggedWord("IN", "of"), 
+NerOnlyTaggedWord("NNS", "marchers"), NerOnlyTaggedWord("IN", "at"), NerOnlyTaggedWord("CD", "10,000"), 
+NerOnlyTaggedWord("IN", "while"), NerOnlyTaggedWord("NNS", "organizers"), NerOnlyTaggedWord("VBD", "claimed"), 
+NerOnlyTaggedWord("PRP", "it"), NerOnlyTaggedWord("VBD", "was"), NerOnlyTaggedWord("CD", "100,000"), 
+NerOnlyTaggedWord(".", ".")]
+
+  ⋮
+
+ CorpusLoaders.NerOnlyTaggedWord[NerOnlyTaggedWord("IN", "At"), NerOnlyTaggedWord("JJ", "last"), 
+NerOnlyTaggedWord("DT", "the"), NerOnlyTaggedWord("NNP", "Goatherd"), NerOnlyTaggedWord("VBD", "threw"), 
+NerOnlyTaggedWord("DT", "a"), NerOnlyTaggedWord("NN", "stone"), NerOnlyTaggedWord(",", ","), NerOnlyTaggedWord("CC", 
+"and"), NerOnlyTaggedWord("VBG", "breaking")  …  NerOnlyTaggedWord(",", ","), NerOnlyTaggedWord("VBD", "begged"), 
+NerOnlyTaggedWord("DT", "the"), NerOnlyTaggedWord("NNP", "Goat"), NerOnlyTaggedWord("RB", "not"), 
+NerOnlyTaggedWord("TO", "to"), NerOnlyTaggedWord("VB", "tell"), NerOnlyTaggedWord("PRP\$", "his"), 
+NerOnlyTaggedWord("NN", "master"), NerOnlyTaggedWord(".", ".")]
+
+ CorpusLoaders.NerOnlyTaggedWord[NerOnlyTaggedWord("DT", "The"), NerOnlyTaggedWord("NNP", "Goat"),
+ NerOnlyTaggedWord("VBD", "replied"), NerOnlyTaggedWord(",", ","), NerOnlyTaggedWord("LQU", "\""),
+ NerOnlyTaggedWord("WRB", "Why"), NerOnlyTaggedWord(",", ","), NerOnlyTaggedWord("PRP", "you"), 
+NerOnlyTaggedWord("JJ", "silly"), NerOnlyTaggedWord("NN", "fellow")  …  NerOnlyTaggedWord("DT", "the"), 
+NerOnlyTaggedWord("NN", "horn"), NerOnlyTaggedWord("MD", "will"), NerOnlyTaggedWord("VB", "speak"), 
+NerOnlyTaggedWord("IN", "though"), NerOnlyTaggedWord("PRP", "I"), NerOnlyTaggedWord("VB", "be"), 
+NerOnlyTaggedWord("JJ", "silent"), NerOnlyTaggedWord(".", "."), NerOnlyTaggedWord("LQU", "\"")]
+
+ CorpusLoaders.NerOnlyTaggedWord[NerOnlyTaggedWord("VBP", "Do"), NerOnlyTaggedWord("RB", "not"), 
+NerOnlyTaggedWord("VB", "attempt"), NerOnlyTaggedWord("TO", "to"), NerOnlyTaggedWord("VB", "hide"), 
+NerOnlyTaggedWord("NNS", "things"), NerOnlyTaggedWord("WDT", "which"), NerOnlyTaggedWord("MD", "can"), 
+NerOnlyTaggedWord("RB", "not"), NerOnlyTaggedWord("VB", "be"), NerOnlyTaggedWord("JJ", "hid"), NerOnlyTaggedWord(".", ".")]
+
+```
diff --git a/src/CorpusLoaders.jl b/src/CorpusLoaders.jl
@@ -11,7 +11,7 @@ export Document, TaggedWord, SenseAnnotatedWord, PosTaggedWord
 export title, sensekey, word, named_entity, part_of_speech
 export load
 
-export WikiCorpus, SemCor, Senseval3, CoNLL, IMDB, Twitter, StanfordSentimentTreebank, WikiGold, CoNLL2000
+export WikiCorpus, SemCor, Senseval3, CoNLL, IMDB, Twitter, StanfordSentimentTreebank, WikiGold, CoNLL2000, GMB
 
 function __init__()
     include(joinpath(@__DIR__, "WikiCorpus_DataDeps.jl"))
@@ -24,6 +24,7 @@ function __init__()
     include(joinpath(@__DIR__, "StanfordSentimentTreebank_DataDeps.jl"))
     include(joinpath(@__DIR__, "WikiGold_DataDeps.jl"))
     include(joinpath(@__DIR__, "CoNLL2000_DataDeps.jl"))
+    include(joinpath(@__DIR__, "GMB_DataDeps.jl"))
 end
 
 include("types.jl")
@@ -38,5 +39,5 @@ include("Twitter.jl")
 include("StanfordSentimentTreebank.jl")
 include("WikiGold.jl")
 include("CoNLL2000.jl")
-
+include("GMB.jl")
 end
diff --git a/src/GMB.jl b/src/GMB.jl
@@ -0,0 +1,87 @@
+"""
+    GMB(filepath::Vector)
+    GMB(dirpath)
+    GMB()
+
+Handle on the Groningen Meaning Bank (GMB) corpus files to be read by `load`.
+
+`filepath` holds the paths of the `en.tags` files. `GMB(dirpath)` collects them by
+globbing `data/*/*/en.tags` below `dirpath`; `GMB()` uses the `GMB 2.2.0` data
+dependency as `dirpath`.
+"""
+struct GMB{S}
+    filepath :: Vector{S}
+end
+
+function GMB(dirpath)
+    # Fail early, reporting the offending path, when `dirpath` is not a directory.
+    @assert(isdir(dirpath), dirpath)
+    paths = glob("data/*/*/en.tags", dirpath)
+    return GMB(paths)
+end
+
+GMB() = GMB(datadep"GMB 2.2.0")
+
+# Names accepted for each nesting level (method of MultiResolutionIterators.levelname_map).
+MultiResolutionIterators.levelname_map(::Type{GMB}) = [
+    :doc=>1, :contextfile=>1, :context=>1, :document=>1,
+    :para=>2, :paragraph=>2,
+    :sent=>3, :sentence=>3,
+    :word=>4, :token=>4,
+    :char=>5, :character=>5
+    ]
+
+"""
+    parse_gmb_tagged_word(line::AbstractString)
+
+Split one tab-separated `line` and return a `NerOnlyTaggedWord` built from its second
+field (the tag) and its first field (the word). Any further fields are ignored.
+"""
+function parse_gmb_tagged_word(line::AbstractString)
+    tokens_tags = split(line, '\t')
+    return NerOnlyTaggedWord(tokens_tags[2], tokens_tags[1])
+end
+
+"""
+    parse_gmb(filename)
+
+Read `filename` line by line and return its sentences as a vector of vectors of
+`NerOnlyTaggedWord`.
+
+Every non-empty line contributes one word to the current sentence; an empty line ends
+the current sentence. A sentence is opened lazily by the first word that follows the
+start of the file or an empty line, so the first sentence of a file is kept even though
+no empty line precedes it, and runs of empty lines never yield empty sentences.
+"""
+function parse_gmb(filename)
+    sents = @NestedVector(NerOnlyTaggedWord, 2)()
+    sent = nothing  # sentence currently being filled; `nothing` between sentences
+
+    for line in eachline(filename)
+        if isempty(line)
+            sent = nothing
+        else
+            if sent === nothing
+                sent = @NestedVector(NerOnlyTaggedWord, 1)()
+                push!(sents, sent)
+            end
+            push!(sent, parse_gmb_tagged_word(line))
+        end
+    end
+    return sents
+end
+
+"""
+    load(corpus::GMB)
+
+Parse every file listed in `corpus.filepath` and return all of their sentences
+concatenated, in file order, in a single vector.
+"""
+function load(corpus::GMB)
+    # Left untyped on purpose so the result stays a `Vector{Any}`, as before.
+    sentences = []
+    for fn in corpus.filepath
+        append!(sentences, parse_gmb(fn))
+    end
+    return sentences
+end
diff --git a/src/GMB_DataDeps.jl b/src/GMB_DataDeps.jl
@@ -0,0 +1,37 @@
+using DataDeps
+
+
+# Register one DataDep per listed GMB version; each entry pairs a version string with
+# the checksum handed to DataDeps for that version's zip archive.
+for (ver, checksum) in [("1.0.0", "e151d953a0316c5712a52d56a5702f24cc1dc8f22425955821113437ec43a3b8"),
+            ("1.1.0", "3830e7071e43ca9e659d51f2f7c5e5afea9e233993251e9f45d628caa6a372c6"),
+            ("2.0.0", "30a700e2509eb1a484357a1f1e5f7f06ef8e9516267413061b7dfccdf8ba4215"),
+            ("2.1.0", "e4bd7d43f7b2c1618f896784c2b7df3acde3bfe93ef4fd6e5a7a196f54b6a4f9"),
+            ("2.2.0", "dd12f2617f745ea3cafa348c60ee374c804be238d184bcf91db7bd9f90261625")]
+
+    register(DataDep("GMB $ver",
+        """
+        Website: https://gmb.let.rug.nl/data.php
+        Orignal Author: Bos, Johan and Basile, Valerio and Evang, Kilian and Venhuizen, Noortje and Bjerva, Johannes
+
+        The Groningen Meaning Bank (GMB) consists of public domain English texts with corresponding syntactic and semantic representations.
+        The GMB is developed at the University of Groningen. A multi-lingual version of the GMB is the Parallel Meaning Bank. 
+        A thorough description of the GMB can be found in the Handbook of Linguistic Annotation.
+
+        Please cite the following publication if you use the corpora:
+        Bos, Johan and Basile, Valerio and Evang, Kilian and Venhuizen, Noortje and Bjerva, Johannes. " Handbook of Linguistic Annotation, Publisher: Springer Netherlands, Editors: Nancy Ide, James Pustejovsky, pp.463-496."
+        """,
+        "https://gmb.let.rug.nl/releases/gmb-$(ver).zip",
+        checksum;
+        post_fetch_method = function (fn)
+            unpack(fn)
+            # The archive is named gmb-<ver>.zip; its unpacked top-level directory is
+            # expected to carry the same gmb-<ver> name (TODO confirm for every release).
+            innerdir = "gmb-$(ver)"
+            innerfiles = readdir(innerdir)
+            # Move everything to current directory, under same name
+            mv.(joinpath.(innerdir, innerfiles), innerfiles)
+            rm(innerdir)
+        end
+    ))
+end
diff --git a/test/test_GMB.jl b/test/test_GMB.jl
@@ -0,0 +1,24 @@
+using CorpusLoaders
+using Test
+using Base.Iterators
+using MultiResolutionIterators
+using DataDeps
+
+# Run the same checks against every registered GMB release.
+@testset "Using flatten_levels" for path in [datadep"GMB 1.0.0", datadep"GMB 1.1.0", datadep"GMB 2.0.0", datadep"GMB 2.1.0", datadep"GMB 2.2.0"]
+    # Load from the release under test; `GMB()` would always read the 2.2.0 data.
+    train = load(GMB(path))
+    docs = train[1:5]
+
+    # NOTE(review): the level map of CoNLL, not GMB, is used here -- confirm intended.
+    words = full_consolidate(flatten_levels(docs, (!lvls)(CoNLL, :word)))
+    @test length(words) > length(docs)
+    @test typeof(words) == Vector{CorpusLoaders.NerOnlyTaggedWord}
+
+    plain_words = word.(words)
+    @test typeof(plain_words) <: Vector{String}
+
+    ner_tags = named_entity.(words)
+    @test typeof(ner_tags) <: Vector{String}
+
+end