Skip to content

Commit

Permalink
Fix #25 (dates at the end of sentences)
Browse files Browse the repository at this point in the history
  • Loading branch information
tsproisl committed Aug 14, 2023
1 parent af9a5c2 commit 8c08187
Show file tree
Hide file tree
Showing 3 changed files with 10 additions and 4 deletions.
8 changes: 4 additions & 4 deletions src/somajo/tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -277,10 +277,10 @@ def __init__(self, split_camel_case=False, token_classes=False, extra_info=False
self.underline = re.compile(r"(?<!\w)(?P<left>_)(?P<middle>\w[^_]+\w)(?P<right>_)(?!\w)")

# DATE, TIME, NUMBERS
self.three_part_date_year_first = re.compile(r'(?<![\d.]) (?P<a_year>\d{4}) (?P<b_month_or_day>([/-])\d{1,2}) (?P<c_day_or_month>\3\d{1,2}) (?![\d.])', re.VERBOSE)
self.three_part_date_dmy = re.compile(r'(?<![\d.]) (?P<a_day>(?:0?[1-9]|1[0-9]|2[0-9]|3[01])([./-])) (?P<b_month>(?:0?[1-9]|1[0-2])\2) (?P<c_year>(?:\d\d){1,2}) (?![\d.])', re.VERBOSE)
self.three_part_date_mdy = re.compile(r'(?<![\d.]) (?P<a_month>(?:0?[1-9]|1[0-2])([./-])) (?P<b_day>(?:0?[1-9]|1[0-9]|2[0-9]|3[01])\2) (?P<c_year>(?:\d\d){1,2}) (?![\d.])', re.VERBOSE)
self.two_part_date = re.compile(r'(?<![\d.]) (?P<a_day_or_month>\d{1,2}([./-])) (?P<b_day_or_month>\d{1,2}\2) (?![\d.])', re.VERBOSE)
self.three_part_date_year_first = re.compile(r'(?<![\d.]) (?P<a_year>\d{4}) (?P<b_month_or_day>([/-])\d{1,2}) (?P<c_day_or_month>\3\d{1,2}) (?!\d)', re.VERBOSE)
self.three_part_date_dmy = re.compile(r'(?<![\d.]) (?P<a_day>(?:0?[1-9]|1[0-9]|2[0-9]|3[01])([./-])) (?P<b_month>(?:0?[1-9]|1[0-2])\2) (?P<c_year>(?:\d\d){1,2}) (?!\d)', re.VERBOSE)
self.three_part_date_mdy = re.compile(r'(?<![\d.]) (?P<a_month>(?:0?[1-9]|1[0-2])([./-])) (?P<b_day>(?:0?[1-9]|1[0-9]|2[0-9]|3[01])\2) (?P<c_year>(?:\d\d){1,2}) (?!\d)', re.VERBOSE)
self.two_part_date = re.compile(r'(?<![\d.]) (?P<a_day_or_month>\d{1,2}([./-])) (?P<b_day_or_month>\d{1,2}\2) (?!\d)', re.VERBOSE)
self.time = re.compile(r'(?<!\w)\d{1,2}(?:(?::\d{2}){1,2}){1,2}(?![\d:])')
self.en_time = re.compile(r'(?<![\w])(?P<a_time>\d{1,2}(?:(?:[.:]\d{2})){0,2}) ?(?P<b_am_pm>(?:[ap]m\b|[ap]\.m\.(?!\w)))', re.IGNORECASE)
self.en_us_phone_number = re.compile(r"(?<![\d-])(?:[2-9]\d{2}[/-])?\d{3}-\d{4}(?![\d-])")
Expand Down
3 changes: 3 additions & 0 deletions tests/test_sentence_splitter.py
Original file line number Diff line number Diff line change
Expand Up @@ -195,6 +195,9 @@ def test_misc_22(self):
def test_misc_23(self):
    """Keep a sentence containing "III." followed by a capitalised word in one piece.

    The expected result is a single-element list, i.e. the period after
    "III" must not be treated as a sentence boundary.
    """
    text = "Auf drei Schiffen des III. Geschwaders"
    expected_sentences = ["Auf drei Schiffen des III. Geschwaders"]
    self._equal(text, expected_sentences)

def test_misc_24(self):
    """Split two sentences that each end with a dotted date (issue #25).

    Each expected sentence ends with the date parts "12.", "03.", "2023"
    followed by a separate "." token, so the sentence-final period is not
    absorbed into the date.
    """
    text = "Am Ende dieses Satzes steht 12.03.2023. Am Ende dieses Satzes steht 12.03.2023."
    expected_sentences = [
        "Am Ende dieses Satzes steht 12. 03. 2023 .",
        "Am Ende dieses Satzes steht 12. 03. 2023 .",
    ]
    self._equal(text, expected_sentences)


class TestMiscEnglish(TestSentenceSplitterEnglish):
""""""
Expand Down
3 changes: 3 additions & 0 deletions tests/test_tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -950,6 +950,9 @@ def test_punctuation_71(self):
def test_punctuation_72(self):
    """Tokenize a numeric range that is directly followed by a sentence-final period.

    The expected output separates "2-4." into "2", "-", "4" and ".".
    """
    text = "Punkte 2-4. Das System"
    expected_tokens = "Punkte 2 - 4 . Das System"
    self._equal(text, expected_tokens)

def test_punctuation_73(self):
    """Tokenize a dotted date at the very end of a sentence (issue #25).

    The expected output keeps "12.", "03." and "2023" as date parts and
    emits the trailing period as its own token.
    """
    text = "Am Ende dieses Satzes steht 12.03.2023."
    expected_tokens = "Am Ende dieses Satzes steht 12. 03. 2023 ."
    self._equal(text, expected_tokens)


class TestEmailsURLs(TestTokenizer):
def test_emails_urls_01(self):
Expand Down

0 comments on commit 8c08187

Please sign in to comment.