From 8c0818707cbbb5e54b87d1d9b6e26ac84ecf7a7c Mon Sep 17 00:00:00 2001 From: Thomas Proisl Date: Mon, 14 Aug 2023 20:48:05 +0200 Subject: [PATCH] Fix #25 (dates at the end of sentences) --- src/somajo/tokenizer.py | 8 ++++---- tests/test_sentence_splitter.py | 3 +++ tests/test_tokenizer.py | 3 +++ 3 files changed, 10 insertions(+), 4 deletions(-) diff --git a/src/somajo/tokenizer.py b/src/somajo/tokenizer.py index 0c72981..ad63cad 100644 --- a/src/somajo/tokenizer.py +++ b/src/somajo/tokenizer.py @@ -277,10 +277,10 @@ def __init__(self, split_camel_case=False, token_classes=False, extra_info=False self.underline = re.compile(r"(?_)(?P\w[^_]+\w)(?P_)(?!\w)") # DATE, TIME, NUMBERS - self.three_part_date_year_first = re.compile(r'(?\d{4}) (?P([/-])\d{1,2}) (?P\3\d{1,2}) (?![\d.])', re.VERBOSE) - self.three_part_date_dmy = re.compile(r'(?(?:0?[1-9]|1[0-9]|2[0-9]|3[01])([./-])) (?P(?:0?[1-9]|1[0-2])\2) (?P(?:\d\d){1,2}) (?![\d.])', re.VERBOSE) - self.three_part_date_mdy = re.compile(r'(?(?:0?[1-9]|1[0-2])([./-])) (?P(?:0?[1-9]|1[0-9]|2[0-9]|3[01])\2) (?P(?:\d\d){1,2}) (?![\d.])', re.VERBOSE) - self.two_part_date = re.compile(r'(?\d{1,2}([./-])) (?P\d{1,2}\2) (?![\d.])', re.VERBOSE) + self.three_part_date_year_first = re.compile(r'(?\d{4}) (?P([/-])\d{1,2}) (?P\3\d{1,2}) (?!\d)', re.VERBOSE) + self.three_part_date_dmy = re.compile(r'(?(?:0?[1-9]|1[0-9]|2[0-9]|3[01])([./-])) (?P(?:0?[1-9]|1[0-2])\2) (?P(?:\d\d){1,2}) (?!\d)', re.VERBOSE) + self.three_part_date_mdy = re.compile(r'(?(?:0?[1-9]|1[0-2])([./-])) (?P(?:0?[1-9]|1[0-9]|2[0-9]|3[01])\2) (?P(?:\d\d){1,2}) (?!\d)', re.VERBOSE) + self.two_part_date = re.compile(r'(?\d{1,2}([./-])) (?P\d{1,2}\2) (?!\d)', re.VERBOSE) self.time = re.compile(r'(?\d{1,2}(?:(?:[.:]\d{2})){0,2}) ?(?P(?:[ap]m\b|[ap]\.m\.(?!\w)))', re.IGNORECASE) self.en_us_phone_number = re.compile(r"(?