From fb788e65027e1b15cb3b5717063e2182cb3a120b Mon Sep 17 00:00:00 2001
From: Thomas Proisl <thomas.proisl@fau.de>
Date: Fri, 9 Feb 2024 09:38:33 +0100
Subject: [PATCH] Correctly tokenize URLs in angle brackets (#27)

---
 CHANGES.txt             | 4 ++++
 pyproject.toml          | 2 +-
 src/somajo/tokenizer.py | 4 ++--
 tests/test_tokenizer.py | 3 +++
 4 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index 28f2e47..32fa07c 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,9 @@
 # CHANGELOG #
 
+## Version 2.4.1, 2024-02-09 ##
+
+- Fix issue #27 (URLs in angle brackets).
+
 ## Version 2.4.0, 2023-12-23 ##
 
 - New feature: SoMaJo can output character offsets for tokens,
diff --git a/pyproject.toml b/pyproject.toml
index 475d7b2..c8c2282 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -11,7 +11,7 @@
 # a new release
 [project]
 name = "SoMaJo"
-version = "2.4.0"
+version = "2.4.1"
 description = "A tokenizer and sentence splitter for German and English web and social media texts."
 readme = "README.md"
 requires-python = ">=3.8"
diff --git a/src/somajo/tokenizer.py b/src/somajo/tokenizer.py
index 4e99896..4d13eef 100644
--- a/src/somajo/tokenizer.py
+++ b/src/somajo/tokenizer.py
@@ -88,8 +88,8 @@ def __init__(self, split_camel_case=False, token_classes=False, extra_info=False
         self.email = re.compile(r"\b[\w.%+-]+(?:@| \[at\] )[\w.-]+(?:\.| \[?dot\]? )\p{L}{2,}\b")
         # simple regex for urls that start with http or www
         # no square brackets and spaces in URL: [^][ ]
-        self.simple_url_with_brackets = re.compile(r'\b(?:(?:https?|ftp|svn)://|(?:https?://)?www\.)[^][ ]+?\(\S*?\)[^][ ]*(?=$|[\'. "!?,;])', re.IGNORECASE)
-        self.simple_url = re.compile(r'\b(?:(?:https?|ftp|svn)://|(?:https?://)?www\.)[^][ ]+[^][\'. "!?,;:()]', re.IGNORECASE)
+        self.simple_url_with_brackets = re.compile(r'\b(?:(?:https?|ftp|svn)://|(?:https?://)?www\.)[^][<> ]+?\(\S*?\)[^][<> ]*(?=$|[\'. "!?,;])', re.IGNORECASE)
+        self.simple_url = re.compile(r'\b(?:(?:https?|ftp|svn)://|(?:https?://)?www\.)[^][<> ]+[^][<>\'. "!?,;:()]', re.IGNORECASE)
         self.doi = re.compile(r'\bdoi:10\.\d+/\S+', re.IGNORECASE)
         self.doi_with_space = re.compile(r'(?<=\bdoi: )10\.\d+/\S+', re.IGNORECASE)
         # regex for ISBNs adapted from:
diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py
index bc69365..0218f4d 100644
--- a/tests/test_tokenizer.py
+++ b/tests/test_tokenizer.py
@@ -1040,6 +1040,9 @@ def test_emails_urls_26(self):
     def test_emails_urls_27(self):
         self._equal("link: [Linktext „viel“ Text](https://other_link.com).", "link : [ Linktext „ viel “ Text ] ( https://other_link.com ) .")
 
+    def test_emails_urls_28(self):
+        self._equal("link: <https://one_link.com>.", "link : < https://one_link.com > .")
+
 class TestAbbreviations(TestTokenizer):
 
     def test_abbreviations_01(self):