Skip to content

Commit

Permalink
Merge pull request #28 from amir-zeldes/develop
Browse files Browse the repository at this point in the history
V2.2.0
  • Loading branch information
amir-zeldes authored Feb 25, 2020
2 parents 3174ca6 + bff40ca commit 85b682b
Show file tree
Hide file tree
Showing 4 changed files with 53 additions and 17 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -45,4 +45,4 @@ Batch mode options:
.depedit)
```

For more information see https://corpling.uis.georgetown.edu/depedit/ and read the included User Guide PDF.
For more information see https://corpling.uis.georgetown.edu/depedit/ and read the included User Guide PDF in docs/.
64 changes: 50 additions & 14 deletions depedit/depedit.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
import io
from six import iteritems

__version__ = "2.1.7"
__version__ = "2.2.0"

ALIASES = {"form":"text","upostag":"pos","xpostag":"cpos","feats":"morph","deprel":"func","deps":"head2","misc":"func2",
"xpos": "cpos","upos":"pos"}
Expand Down Expand Up @@ -55,6 +55,16 @@ def __init__(self, tok_id, text, lemma, pos, cpos, morph, head, func, head2, fun
self.position = position
self.is_super_tok = is_super_tok

def __getattr__(self, item):
    """Resolve sentence-level pseudo-attributes of the form ``#S:<key>``.

    Python only calls this hook after normal attribute lookup has failed.

    For ``#S:<key>`` the value is looked up first in the sentence annotations
    added by DepEdit rules (``self.sentence.annotations``), then in the
    annotations read from the input (``self.sentence.input_annotations``).
    If the key is in neither, an empty string is returned.

    :param item: name of the attribute being looked up
    :return: the sentence annotation value, or "" if the key is unknown
    :raises AttributeError: if ``item`` is not a ``#S:`` sentence annotation name
    """
    if not item.startswith("#S:"):
        # Report genuinely missing attributes. Falling through and returning None
        # would make hasattr() succeed for every name and mislead code that probes
        # for optional special methods (e.g. copy/pickle looking up __setstate__).
        raise AttributeError("'%s' object has no attribute '%s'" % (type(self).__name__, item))

    key = item.split(":", 1)[1]
    sentence = self.sentence
    if key in sentence.annotations:  # Rule-added annotations take precedence
        return sentence.annotations[key]
    if key in sentence.input_annotations:
        return sentence.input_annotations[key]
    return ""

def __repr__(self):
    """Render the token as ``text (pos/lemma) <-func`` for debugging output."""
    form, tag, lemma, relation = (str(self.text), str(self.pos), str(self.lemma), str(self.func))
    return "".join((form, " (", tag, "/", lemma, ") ", "<-", relation))

Expand All @@ -64,7 +74,8 @@ class Sentence:
def __init__(self, sentence_string="", sent_num=0,tokoffset=0):
self.sentence_string = sentence_string
self.length = 0
self.annotations = {}
self.annotations = {} # Dictionary to keep sentence annotations added by DepEdit rules
self.input_annotations = {} # Dictionary with original sentence annotations (i.e. comment lines) in input conll
self.sent_num = sent_num
self.offset = tokoffset

Expand Down Expand Up @@ -113,6 +124,7 @@ def normalize_shorthand(criterion_string):
return criterion_string

def __init__(self, transformation_text, line):
self.transformation_text = transformation_text
instructions = self.parse_transformation(transformation_text)
if instructions is None:
sys.stderr.write("Depedit says: error in configuration file\n"
Expand All @@ -132,7 +144,8 @@ def validate(self):
for criterion in criteria:
if re.match(r"(text|pos|cpos|lemma|morph|func|head|func2|head2|num|form|upos|upostag|xpos|xpostag|feats|deprel|deps|misc)!?=/[^/=]*/", criterion) is None:
if re.match(r"position!?=/(first|last|mid)/", criterion) is None:
report += "Invalid node definition in column 1: " + criterion
if re.match(r"#S:[A-Za-z_]+!?=/[^/\t]+/",criterion) is None:
report += "Invalid node definition in column 1: " + criterion
for relation in self.relations:
if relation == "none" and len(self.relations) == 1:
if len(self.definitions) > 1:
Expand Down Expand Up @@ -164,6 +177,9 @@ def __init__(self, def_text, def_index):
self.def_index = def_index
self.groups = []
self.defs = []
self.sent_def = False
if def_text.startswith("#S:"):
self.sent_def = True

def_items = self.def_text.split("&")
for def_item in def_items:
Expand Down Expand Up @@ -213,7 +229,10 @@ class Definition:

def __init__(self, criterion, value, negative=False):
# Handle conllu criterion aliases:
self.criterion = ALIASES.get(criterion, criterion)
if criterion.startswith("#S:"): # Sentence annotation
self.criterion = criterion
else:
self.criterion = ALIASES.get(criterion, criterion)
self.value = value
self.match_type = ""
self.compiled_re = None
Expand Down Expand Up @@ -259,6 +278,7 @@ def __init__(self, def_index, token, groups):
self.def_index = def_index
self.token = token
self.groups = groups
self.sent_def = False # Whether this is a sentence annotation match

def __repr__(self):
    """Return a debug string of the form ``#<def_index>: <repr of the matched token>``."""
    # repr() has to be *called* here: concatenating the bound method
    # ``self.token.__repr__`` to a str raises TypeError.
    return "#" + str(self.def_index) + ": " + repr(self.token)
Expand Down Expand Up @@ -302,8 +322,9 @@ def read_config_file(self, config_file, clear_transformations=False):
line_num = 0
for instruction in config_file:
line_num += 1
if len(instruction)>0 and not instruction.startswith(";") and not instruction.startswith("#") and not instruction.strip() =="":
self.transformations.append(Transformation(instruction, line_num))
if len(instruction)>0 and not instruction.startswith(";") and not instruction.startswith("#") \
or instruction.startswith("#S:"):
self.transformations.append(Transformation(instruction, line_num))

trans_report = ""
for transformation in self.transformations:
Expand All @@ -327,7 +348,12 @@ def process_sentence(self, conll_tokens, stepwise=False):
for def_matcher in transformation.definitions:
for token in conll_tokens:
if not token.is_super_tok and def_matcher.match(token):
node_matches[def_matcher.def_index].append(Match(def_matcher.def_index, token, def_matcher.groups))
if def_matcher.sent_def:
if len(node_matches[def_matcher.def_index])==0: # Only add a sentence anno definition once
node_matches[def_matcher.def_index] = [Match(def_matcher.def_index, token, def_matcher.groups)]
node_matches[def_matcher.def_index][0].sent_def = True
else:
node_matches[def_matcher.def_index].append(Match(def_matcher.def_index, token, def_matcher.groups))
result_sets = []
for relation in transformation.relations:
if not self.matches_relation(node_matches, relation, result_sets):
Expand All @@ -336,7 +362,7 @@ def process_sentence(self, conll_tokens, stepwise=False):
self.add_groups(result_sets)
if len(result_sets) > 0:
for action in transformation.actions:
retval = self.execute_action(result_sets, action)
retval = self.execute_action(result_sets, action, transformation)
if retval == "last": # Explicit instruction to cease processing
return
if stepwise:
Expand Down Expand Up @@ -373,6 +399,7 @@ def matches_relation(self, node_matches, relation, result_sets):
result[node1] = tok1
result["rel"] = relation
result["matchers"] = [matcher1]
result["ID2matcher"] = {node1:matcher1}
result_sets.append(result)
elif "==" in relation:
node1 = relation.split(operator)[0]
Expand All @@ -385,7 +412,8 @@ def matches_relation(self, node_matches, relation, result_sets):
for matcher2 in node_matches[node2]:
tok2 = matcher2.token
if self.test_relation(tok1, tok2, field):
result_sets.append({node1: tok1, node2: tok2, "rel": relation, "matchers": [matcher1, matcher2]})
result_sets.append({node1: tok1, node2: tok2, "rel": relation, "matchers": [matcher1, matcher2],
"ID2matcher":{node1:matcher1, node2:matcher2}})
matches[node1].append(tok1)
matches[node2].append(tok2)
hits += 1
Expand All @@ -407,8 +435,9 @@ def matches_relation(self, node_matches, relation, result_sets):
tok1 = matcher1.token
for matcher2 in node_matches[node2]:
tok2 = matcher2.token
if self.test_relation(tok1, tok2, operator):
result_sets.append({node1: tok1, node2: tok2, "rel": relation, "matchers": [matcher1, matcher2]})
if self.test_relation(tok1, tok2, operator) or matcher1.sent_def: # Sentence dominance always True
result_sets.append({node1: tok1, node2: tok2, "rel": relation, "matchers": [matcher1, matcher2],
"ID2matcher":{node1:matcher1, node2:matcher2}})
matches[node1].append(tok1)
matches[node2].append(tok2)
hits += 1
Expand Down Expand Up @@ -482,7 +511,7 @@ def merge_sets(self, sets, node_count, rel_count):
bins.append(copy(new_set))

for my_bin in bins:
if len(my_bin) == node_count + 2:
if len(my_bin) == node_count + 3:
if len(my_bin["rels"]) == rel_count: # All required relations have been fulfilled
solutions.append(my_bin)
else: # Some node pair has multiple relations, check that all are fulfilled
Expand Down Expand Up @@ -603,8 +632,7 @@ def add_groups(result_sets):
groups.append(g)
result["groups"] = groups[:]

@staticmethod
def execute_action(result_sets, action_list):
def execute_action(self, result_sets, action_list, transformation):
actions = action_list.split(";")
for result in result_sets:
if len(result) > 0:
Expand All @@ -618,6 +646,11 @@ def execute_action(result_sets, action_list):
result[1].sentence.annotations[key] = val
else: # node instruction
node_position = int(action[1:action.find(":")])
if not self.quiet:
if result["ID2matcher"][node_position].sent_def:
sys.stdout.write("! Warning: Rule is applying a *token* transformation to a *sentence* annotation node:\n")
sys.stdout.write(" " + transformation.transformation_text + "\n")
sys.stdout.write(" Applying the transformation to first token in sentence.\n")
prop = action[action.find(":") + 1:action.find("=")]
value = action[action.find("=") + 1:].strip()
group_num_matches = re.findall(r"(\$[0-9]+[LU]?)", value)
Expand Down Expand Up @@ -762,6 +795,9 @@ def _process_sentence(stepwise=False):
if myline.startswith("#"): # Preserve comment lines unless kill requested
if self.kill not in ["comments", "both"]:
output_lines.append(myline.strip())
if "=" in myline:
key, val = myline[1:].split("=",1)
current_sentence.input_annotations[key.strip()] = val.strip()
elif not myline:
output_lines.append("")
elif myline.find("\t") > 0: # Only process lines that contain tabs (i.e. conll tokens)
Expand Down
Binary file modified docs/DepEdit_user_guide.pdf
Binary file not shown.
4 changes: 2 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,14 @@
setup(
name = 'depedit',
packages = ['depedit'],
version = '2.1.7',
version = '2.2.0',
description = 'A simple configurable tool for manipulating dependency trees',
author = 'Amir Zeldes',
author_email = '[email protected]',
url = 'https://github.com/amir-zeldes/depedit',
install_requires=["six"],
license='Apache License, Version 2.0',
download_url = 'https://github.com/amir-zeldes/depedit/releases/tag/2.1.7',
download_url = 'https://github.com/amir-zeldes/depedit/releases/tag/2.2.0',
keywords = ['NLP', 'parsing', 'syntax', 'dependencies', 'dependency', 'tree', 'treebank', 'conll', 'conllu', 'ud'],
classifiers = ['Programming Language :: Python',
'Programming Language :: Python :: 2',
Expand Down

0 comments on commit 85b682b

Please sign in to comment.