Skip to content

Commit

Permalink
Merge pull request #128 from amir-zeldes/dev
Browse files Browse the repository at this point in the history
V8.1.0
  • Loading branch information
amir-zeldes authored Jan 6, 2023
2 parents fd91f06 + 66e0dc7 commit aa6621a
Show file tree
Hide file tree
Showing 2,077 changed files with 5,435,195 additions and 5,284,632 deletions.
48 changes: 26 additions & 22 deletions _build/build_gum.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ def setup_directories(gum_source, gum_target):
parser.add_argument("-i",dest="increment_version",action="store",help="A new version number to assign",default="DEVELOP")
parser.add_argument("--pepper_only",action="store_true", help="Just rerun pepper on generated targets")
parser.add_argument("--skip_ptb_labels",action="store_true", help="Skip projecting function labels to PTB trees")
parser.add_argument("--skip_ontogum",action="store_true", help="Skip building OntoGUM version of coref data")

options = parser.parse_args()

Expand Down Expand Up @@ -194,20 +195,6 @@ def check_diff(xml, ptb, docname):
#proof(gum_source)

if not options.pepper_only:
# Add annotations to dep/:
# * fresh token strings, POS tags and lemmas from xml/
# * generates vanilla tags in CPOS column from POS
# * creates speaker and s_type comments from xml/
# Returns pre_annotated, a dictionary giving pre-annotated fields in src/dep/ which overwrite annotation values
print("\nEnriching Dependencies:\n" + "="*23)
pre_annotated = enrich_dep(gum_source, pepper_tmp, reddit)

# Add annotations to xml/:
# * add CLAWS tags in fourth column
# * add fifth column after lemma containing tok_func from dep/
print("\n\nEnriching XML files:\n" + "="*23)
enrich_xml(gum_source, gum_target, add_claws=options.claws, reddit=reddit)

# Token and sentence border adjustments
print("\nAdjusting token and sentence borders:\n" + "="*37)
# Adjust tsv/ files:
Expand All @@ -216,13 +203,28 @@ def check_diff(xml, ptb, docname):
# * find instances of "'s" that are not included in any immediately preceding
# markables and merge them into those markables if genitive_s is True
# * return conllu-a style bracket information to add entity data to conllu files later
conllua_data = fix_tsv(gum_source, gum_target, reddit=reddit)
conllua_data, centering_data = fix_tsv(gum_source, gum_target, reddit=reddit)

# Adjust rst/ files:
# * refresh token strings in case of inconsistency
# * note that segment borders are not automatically adjusted around xml/ <s> elements
fix_rst(gum_source, gum_target, reddit=reddit)

# Add annotations to xml/:
# * add CLAWS tags in fourth column
# * add fifth column after lemma containing tok_func from dep/
# * add Centering Theory transition types to sentences
print("\n\nEnriching XML files:\n" + "="*23)
enrich_xml(gum_source, gum_target, centering_data, add_claws=options.claws, reddit=reddit)

# Add annotations to dep/:
# * fresh token strings, POS tags and lemmas from xml/
# * generates vanilla tags in CPOS column from POS
# * creates speaker, s_type and centering transition comments from xml/
# Returns pre_annotated, a dictionary giving pre-annotated fields in src/dep/ which overwrite annotation values
print("\nEnriching Dependencies:\n" + "="*23)
pre_annotated = enrich_dep(gum_source, gum_target, pepper_tmp, reddit)

# Create fresh constituent parses in const/ if desired
# (either reparse or use dep2const conversion, e.g. https://github.com/ikekonglp/PAD)
if options.parse:
Expand All @@ -245,9 +247,10 @@ def check_diff(xml, ptb, docname):
print("\nCompiling Universal Dependencies version:\n" + "=" * 40)
compile_ud(pepper_tmp, gum_target, pre_annotated, reddit=reddit)

# Create OntoGUM data (OntoNotes schema version of coref annotations)
print("\n\nCreating alternate OntoGUM version of coref annotations:\n" + "="*37)
make_ontogum(gum_target, reddit=reddit)
if not options.skip_ontogum:
# Create OntoGUM data (OntoNotes schema version of coref annotations)
print("\n\nCreating alternate OntoGUM version of coref annotations:\n" + "="*37)
make_ontogum(gum_target, reddit=reddit)

# Add labels to PTB trees
if not options.skip_ptb_labels:
Expand Down Expand Up @@ -329,10 +332,11 @@ def check_diff(xml, ptb, docname):
from utils.propagate import add_entities_to_conllu, add_rsd_to_conllu, add_bridging_to_conllu, add_xml_to_conllu

add_entities_to_conllu(gum_target, reddit=reddit, ontogum=False, conllua_data=conllua_data)
if options.no_pepper:
sys.__stdout__.write("\ni Not adding entity information to UD parses in OntoGUM version since Pepper conversion was skipped\n")
else:
add_entities_to_conllu(gum_target,reddit=reddit,ontogum=True)
if not options.skip_ontogum:
if options.no_pepper:
sys.__stdout__.write("\ni Not adding entity information to UD parses in OntoGUM version since Pepper conversion was skipped\n")
else:
add_entities_to_conllu(gum_target,reddit=reddit,ontogum=True)
add_bridging_to_conllu(gum_target,reddit=reddit)

sys.__stdout__.write("\no Added entities, coreference and bridging to UD parses\n")
Expand Down
2 changes: 1 addition & 1 deletion _build/src/const/GUM_academic_theropod.ptb
Original file line number Diff line number Diff line change
Expand Up @@ -416,7 +416,7 @@
(PP
(IN than)
(NP
(NP (CD one))
(NP (NN one))
(ADJP
(JJ parallel)
(PP (IN to) (NP (DT the) (NN GRF))))))))))))
Expand Down
2 changes: 1 addition & 1 deletion _build/src/const/GUM_bio_bernoulli.ptb
Original file line number Diff line number Diff line change
Expand Up @@ -978,7 +978,7 @@
(VP
(VBG sticking)
(NP
(ADJP (JJ point) (HYPH -) (JJ ended))
(ADJP (NN point) (HYPH -) (JJ ended))
(NN glass)
(NNS tubes))
(PP
Expand Down
2 changes: 1 addition & 1 deletion _build/src/const/GUM_bio_marbles.ptb
Original file line number Diff line number Diff line change
Expand Up @@ -237,7 +237,7 @@
(NP (PRP She))
(VP
(VBD paid)
(NP (ADJP (SYM $) (CD 800)) (NN rent))
(NP (ADJP ($ $) (CD 800)) (NN rent))
(PP
(IN by)
(S
Expand Down
13 changes: 7 additions & 6 deletions _build/src/const/GUM_conversation_atoms.ptb
Original file line number Diff line number Diff line change
Expand Up @@ -266,7 +266,7 @@
(NP (DT The) (NN temperature))
(PP
(IN of)
(NP (NP (DT this) (NN w-) (NN water)) (ADVP (RB here)))))
(NP (NP (DT this) (UH w-) (NN water)) (ADVP (RB here)))))
(, ,)
(VP
(VBZ is)
Expand Down Expand Up @@ -353,7 +353,8 @@

(ROOT
(S
(S (NP (PRP You)) (VP (VP (MD should)) (VP (VB g-))))
(NP (PRP You))
(VP (VP (MD should)) (UH g-))
(: —)
(S
(NP (PRP you))
Expand Down Expand Up @@ -498,7 +499,7 @@
(VP
(TO to)
(VP
(VB f-)
(UH f-)
(ADVP (RB completely))
(VB fill)
(NP
Expand Down Expand Up @@ -702,7 +703,7 @@
(CC but)
(, ,)
(S
(INTJ (VB l-))
(INTJ (UH l-))
(VP
(VB let)
(S
Expand Down Expand Up @@ -745,7 +746,7 @@
(ROOT
(S
(NP (EX There))
(VP (VBZ 's) (NP (DT a-)) (PP (IN in) (NP (RB there))))
(VP (VBZ 's) (NP (UH a-)) (PP (IN in) (NP (RB there))))
(. .)))

(ROOT
Expand Down Expand Up @@ -1145,7 +1146,7 @@
(FRAG
(SBAR
(WHNP (WP What))
(S (NP (PRP you)) (VP (VBD had) (PP (IN for) (NP (NN b-))))))
(S (NP (PRP you)) (VP (VBD had) (PP (IN for) (NP (UH b-))))))
(: —)
(CC But)
(VP (ADVP (INTJ (UH no)) (, ,) (NP (FW huit))))
Expand Down
6 changes: 3 additions & 3 deletions _build/src/const/GUM_conversation_blacksmithing.ptb
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@
(SBAR
(IN cause)
(S
(PRP y-)
(UH y-)
(NP (PRP I))
(VP
(VBP mean)
Expand Down Expand Up @@ -329,7 +329,7 @@
(ADVP (RB kinda))
(VBD had)
(NP
(NP (DT a) (JJ b-) (JJ general) (NN idea))
(NP (DT a) (UH b-) (JJ general) (NN idea))
(, ,)
(PP
(IN of)
Expand Down Expand Up @@ -925,7 +925,7 @@
(S (NP (DT a) (NN horse)) (ADJP (RB really) (JJ bad)))))
(. .)))

(ROOT (S (CC And) (NP (PRP they)) (VP (VBP m-))))
(ROOT (S (CC And) (NP (PRP they)) (VP (UH m-))))

(ROOT
(S
Expand Down
6 changes: 3 additions & 3 deletions _build/src/const/GUM_conversation_christmas.ptb
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,7 @@
(VBP 'm)
(ADVP (RB just))
(ADJP
(VBN interested)
(JJ interested)
(PP
(IN in)
(SBAR
Expand Down Expand Up @@ -269,7 +269,7 @@
(S (NP (DT this) (NN one)) (VP (VBZ is))))))))))
(. .)))

(ROOT (S (ADVP (RB There)) (NP (PRP we)) (VP (VBP go)) (. .)))
(ROOT (S (ADVP (EX There)) (NP (PRP we)) (VP (VBP go)) (. .)))

(ROOT (INTJ (UH Okay) (. .)))

Expand Down Expand Up @@ -1111,7 +1111,7 @@
(VP
(MD 'll)
(ADVP (RB just))
(VP (VB wait) (, ,) (PP (IN on) (NP (PRP$ ours)))))
(VP (VB wait) (, ,) (PP (IN on) (NP (PRP ours)))))
(. .)))

(ROOT
Expand Down
4 changes: 2 additions & 2 deletions _build/src/const/GUM_conversation_grounded.ptb
Original file line number Diff line number Diff line change
Expand Up @@ -364,7 +364,7 @@
(ROOT
(S (NP (PRP$ Her) (NN mom)) (VP (VB call) (NP (PRP you))) (, ,)))

(ROOT (FRAG (UH Right) (, ,) (UH right) (. .)))
(ROOT (FRAG (JJ Right) (, ,) (JJ right) (. .)))

(ROOT
(S
Expand All @@ -374,7 +374,7 @@

(ROOT
(S
(INTJ (UH Right))
(INTJ (JJ Right))
(, ,)
(NP (NNP Melanie))
(VP
Expand Down
6 changes: 3 additions & 3 deletions _build/src/const/GUM_conversation_retirement.ptb
Original file line number Diff line number Diff line change
Expand Up @@ -682,7 +682,7 @@
(VP (MD will) (RB not) (VP (VB take) (NP (DT any) (NN frost))))
(. .)))

(ROOT (ADVP (RB Soon) (PP (IN as) (NP (DT the) (NN b-)))))
(ROOT (ADVP (RB Soon) (PP (IN as) (NP (DT the) (UH b-)))))

(ROOT (FRAG (INTJ (UH Yeah)) (NP (PRP I)) (: –)))

Expand All @@ -693,14 +693,14 @@
(PP (IN of) (NP (NN frost))))
(, ,)
(NP (PRP it))
(VP (VBZ 's) (ADJP (VBN gone)))
(VP (VBZ 's) (ADJP (JJ gone)))
(. .)))

(ROOT
(S
(NP (PRP I))
(VP
(VBD l-)
(UH l-)
(NP (PRP I))
(VP
(VBD learned)
Expand Down
2 changes: 1 addition & 1 deletion _build/src/const/GUM_fiction_frankenstein.ptb
Original file line number Diff line number Diff line change
Expand Up @@ -1333,6 +1333,6 @@
(VBP ’m)
(ADJP
(JJ glad)
(SBAR (S (NP (PRP they)) (VP (VBP ’re) (ADJP (VBN gone)))))))
(SBAR (S (NP (PRP they)) (VP (VBP ’re) (ADJP (JJ gone)))))))
(. .)))

2 changes: 1 addition & 1 deletion _build/src/const/GUM_fiction_garden.ptb
Original file line number Diff line number Diff line change
Expand Up @@ -969,7 +969,7 @@
(NP (DT a) (JJ dead) (NN man) (POS ’s))
(NNS fingers))))
(, ,)
(NP (NP (VBD tarred) (NNS timbers)) (VP (VBG groaning)))
(NP (NP (VBN tarred) (NNS timbers)) (VP (VBG groaning)))
(CC and)
(S
(NP (DT a) (NN mermaid))
Expand Down
4 changes: 2 additions & 2 deletions _build/src/const/GUM_fiction_giants.ptb
Original file line number Diff line number Diff line change
Expand Up @@ -400,7 +400,7 @@
(VBG laughing)
(PP
(PP (IN at) (NP (DT the) (NN chemist)))
(CONJP (IN as) (RB well) (IN as))
(CONJP (RB as) (RB well) (IN as))
(PP (IN at) (NP (DT the) (NN traveler))))))
(. .)))

Expand Down Expand Up @@ -572,7 +572,7 @@
(NP (DT the) (NN traveler))
(CC and)
(NP (PRP$ her) (NN pack)))
(VP (VBD were) (ADJP (VBN gone))))
(VP (VBD were) (ADJP (JJ gone))))
(. .)))

(ROOT (S (NP (PRP I)) (VP (VBD hurried) (ADVP (RB home))) (. .)))
Expand Down
2 changes: 1 addition & 1 deletion _build/src/const/GUM_fiction_pag.ptb
Original file line number Diff line number Diff line change
Expand Up @@ -374,7 +374,7 @@
(CC and)
(SBAR
(IN with)
(S (NP (PDT half) (PRP$ my) (NN mind)) (VP (VBN gone)))))
(S (NP (PDT half) (PRP$ my) (NN mind)) (VP (JJ gone)))))
(, ,)
(NP (PRP I))
(VP
Expand Down
2 changes: 1 addition & 1 deletion _build/src/const/GUM_fiction_rose.ptb
Original file line number Diff line number Diff line change
Expand Up @@ -659,7 +659,7 @@
(VP
(VBD turned)
(NP (PRP$ her) (NN hair))
(PP (TO to) (NP (VB fire)))))
(PP (IN to) (NP (NN fire)))))
(CC and)
(S
(NP (NNP Carroll))
Expand Down
2 changes: 1 addition & 1 deletion _build/src/const/GUM_fiction_veronique.ptb
Original file line number Diff line number Diff line change
Expand Up @@ -761,7 +761,7 @@
(IN that)
(S
(NP (DT the) (NNP Scathalos) (NNS outrunners))
(VP (VBD were) (ADJP (VBN gone))))))))))))
(VP (VBD were) (ADJP (JJ gone))))))))))))
(. .)))

(ROOT
Expand Down
2 changes: 1 addition & 1 deletion _build/src/const/GUM_fiction_wedding.ptb
Original file line number Diff line number Diff line change
Expand Up @@ -449,7 +449,7 @@
(VP
(VP (VBD took) (NP (NN heart)))
(CC and)
(VP (VBD went) (PRT (RB on)) (S (VP (VBG meditating))))))
(VP (VBD went) (PRT (RP on)) (S (VP (VBG meditating))))))
(. .)))

(ROOT
Expand Down
2 changes: 1 addition & 1 deletion _build/src/const/GUM_interview_brotherhood.ptb
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
(VP
(VBZ interviews)
(NP
(NP (NN President))
(NP (NNP President))
(PP
(IN of)
(NP
Expand Down
2 changes: 1 addition & 1 deletion _build/src/const/GUM_interview_chomsky.ptb
Original file line number Diff line number Diff line change
Expand Up @@ -344,7 +344,7 @@
(CC or)
(RB not))
(NP (DT that))
(VP (VBZ ’s) (ADJP (VBN gone))))
(VP (VBZ ’s) (ADJP (JJ gone))))
(. .)))

(ROOT
Expand Down
6 changes: 2 additions & 4 deletions _build/src/const/GUM_interview_cocktail.ptb
Original file line number Diff line number Diff line change
Expand Up @@ -318,9 +318,7 @@
(NP (DT The) (NN beverage))
(VP
(VBZ is)
(VP
(VBN priced)
(PP (IN at) (NP (NNP US) (SYM $) (CD 8.00))))))
(VP (VBN priced) (PP (IN at) (NP (NNP US) ($ $) (CD 8.00))))))
(, ,)
(CC and)
(S
Expand Down Expand Up @@ -1067,7 +1065,7 @@

(ROOT
(SBARQ
(WHNP (WP What) (JJ culinary) (NNS dishes))
(WHNP (WDT What) (JJ culinary) (NNS dishes))
(SQ
(MD would)
(NP (PRP you))
Expand Down
6 changes: 3 additions & 3 deletions _build/src/const/GUM_interview_cyclone.ptb
Original file line number Diff line number Diff line change
Expand Up @@ -113,8 +113,8 @@
(, ,)
(S
(VP
(VBN headed)
(PP (IN on) (NP (DT a) (JJ northwest) (NN track))))))))
(JJ headed)
(PP (IN on) (NP (DT a) (NN northwest) (NN track))))))))
(. .)))

(ROOT
Expand Down Expand Up @@ -589,7 +589,7 @@
(PP
(IN as)
(NP
(QP (NNP US) (QP (SYM $) (CD 10) (CD billion))))))))))))
(QP (NNP US) (QP ($ $) (CD 10) (CD billion))))))))))))
(. .)))

(ROOT
Expand Down
Loading

0 comments on commit aa6621a

Please sign in to comment.