Skip to content

Commit

Permalink
If the words' text adds up to the text of the whole token, we can mark …
Browse files Browse the repository at this point in the history
…start_char and end_char on it.

Note that there will still be no start_char and end_char annotations
on words if the words' text doesn't add up to the token's text. So even in a
language like English, where the standard is to annotate the datasets
so that words correspond to pieces of the real text rather than to the
word being represented, unusual separations in the MWT processor may
still result in words with no start_char/end_char.

Fix a unit test error

#1361
  • Loading branch information
AngledLuffa authored and Jemoka committed Jul 16, 2024
1 parent 12b08ae commit 84b4b10
Show file tree
Hide file tree
Showing 2 changed files with 19 additions and 4 deletions.
15 changes: 15 additions & 0 deletions stanza/models/common/doc.py
Original file line number Diff line number Diff line change
Expand Up @@ -388,6 +388,13 @@ def set_mwt_expansions(self, expansions,
word.sent = sentence
word.parent = token
sentence.words.append(word)
if len(token.words) > 1 and token.start_char is not None and token.end_char is not None and "".join(word.text for word in token.words) == token.text:
start_char = token.start_char
for word in token.words:
end_char = start_char + len(word.text)
word.start_char = start_char
word.end_char = end_char
start_char = end_char

if fake_dependencies:
sentence.build_fake_dependencies()
Expand Down Expand Up @@ -1463,11 +1470,19 @@ def start_char(self):
""" Access the start character index for this token in the raw text. """
return self._start_char

@start_char.setter
def start_char(self, value):
""" Set the start character index in the raw text (stored as `_start_char`). """
self._start_char = value

@property
def end_char(self):
""" Access the end character index in the raw text (the stored `_end_char` value). """
# NOTE(review): the original docstring said "for this token", but the neighbouring
# `parent` property describes a word - confirm which class this belongs to.
return self._end_char

@end_char.setter
def end_char(self, value):
""" Set the end character index in the raw text (stored as `_end_char`). """
self._end_char = value

@property
def parent(self):
""" Access the parent token of this word. In the case of a multi-word token, a token can be the parent of
Expand Down
8 changes: 4 additions & 4 deletions stanza/tests/tokenization/test_spaces.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,8 +39,8 @@ def test_spaces_no_mwt():
# text = She's not a nice person.
# sent_id = 0
1-2 She's _ _ _ _ _ _ _ start_char=2|end_char=7|SpacesBefore=\\s\\s
1 She _ _ _ _ 0 _ _ _
2 's _ _ _ _ 1 _ _ _
1 She _ _ _ _ 0 _ _ start_char=2|end_char=5
2 's _ _ _ _ 1 _ _ start_char=5|end_char=7
3 not _ _ _ _ 2 _ _ start_char=8|end_char=11
4 a _ _ _ _ 3 _ _ start_char=12|end_char=13
5 nice _ _ _ _ 4 _ _ start_char=14|end_char=18
Expand All @@ -59,8 +59,8 @@ def test_spaces_no_mwt():
8 Cerritos _ _ _ _ 7 _ _ start_char=62|end_char=70
9 are _ _ _ _ 8 _ _ start_char=71|end_char=74
10-11 Jennifer's _ _ _ _ _ _ _ start_char=75|end_char=85|SpaceAfter=No
10 Jennifer _ _ _ _ 9 _ _ _
11 's _ _ _ _ 10 _ _ _
10 Jennifer _ _ _ _ 9 _ _ start_char=75|end_char=83
11 's _ _ _ _ 10 _ _ start_char=83|end_char=85
12 . _ _ _ _ 11 _ _ start_char=85|end_char=86|SpacesAfter=\\s\\s
""".strip()

Expand Down

0 comments on commit 84b4b10

Please sign in to comment.