If the text of a token's words adds up to the text of the whole token, we can mark start_char and end_char on each word. Note that there will still be no start_char and end_char annotations on words if the words don't add up to the token's text. So even in a language like English, where the standard is to annotate the datasets so that the words correspond to the pieces of the real text instead of the word being represented, there may be unusual separations in the MWT processor that result in no start/end char annotations.

Fix a unit test error #1361
stanfordnlp · Jul 16, 2024 · 84b4b10 · 84b4b10
1 parent 12b08ae
commit 84b4b10
Show file tree

Hide file tree

Showing 2 changed files with 19 additions and 4 deletions.
diff --git a/stanza/models/common/doc.py b/stanza/models/common/doc.py
@@ -388,6 +388,13 @@ def set_mwt_expansions(self, expansions,
                     word.sent = sentence
                     word.parent = token
                     sentence.words.append(word)
+                if len(token.words) > 1 and token.start_char is not None and token.end_char is not None and "".join(word.text for word in token.words) == token.text:
+                    start_char = token.start_char
+                    for word in token.words:
+                        end_char = start_char + len(word.text)
+                        word.start_char = start_char
+                        word.end_char = end_char
+                        start_char = end_char
 
             if fake_dependencies:
                 sentence.build_fake_dependencies()
@@ -1463,11 +1470,19 @@ def start_char(self):
         """ Access the start character index for this token in the raw text. """
         return self._start_char
 
+    @start_char.setter
+    def start_char(self, value):
+        self._start_char = value
+
     @property
     def end_char(self):
         """ Access the end character index for this token in the raw text. """
         return self._end_char
 
+    @end_char.setter
+    def end_char(self, value):
+        self._end_char = value
+
     @property
     def parent(self):
         """ Access the parent token of this word. In the case of a multi-word token, a token can be the parent of

diff --git a/stanza/tests/tokenization/test_spaces.py b/stanza/tests/tokenization/test_spaces.py
@@ -39,8 +39,8 @@ def test_spaces_no_mwt():
 # text = She's not a nice person.
 # sent_id = 0
 1-2	She's	_	_	_	_	_	_	_	start_char=2|end_char=7|SpacesBefore=\\s\\s
-1	She	_	_	_	_	0	_	_	_
-2	's	_	_	_	_	1	_	_	_
+1	She	_	_	_	_	0	_	_	start_char=2|end_char=5
+2	's	_	_	_	_	1	_	_	start_char=5|end_char=7
 3	not	_	_	_	_	2	_	_	start_char=8|end_char=11
 4	a	_	_	_	_	3	_	_	start_char=12|end_char=13
 5	nice	_	_	_	_	4	_	_	start_char=14|end_char=18
@@ -59,8 +59,8 @@ def test_spaces_no_mwt():
 8	Cerritos	_	_	_	_	7	_	_	start_char=62|end_char=70
 9	are	_	_	_	_	8	_	_	start_char=71|end_char=74
 10-11	Jennifer's	_	_	_	_	_	_	_	start_char=75|end_char=85|SpaceAfter=No
-10	Jennifer	_	_	_	_	9	_	_	_
-11	's	_	_	_	_	10	_	_	_
+10	Jennifer	_	_	_	_	9	_	_	start_char=75|end_char=83
+11	's	_	_	_	_	10	_	_	start_char=83|end_char=85
 12	.	_	_	_	_	11	_	_	start_char=85|end_char=86|SpacesAfter=\\s\\s
 """.strip()