From fb788e65027e1b15cb3b5717063e2182cb3a120b Mon Sep 17 00:00:00 2001
From: Thomas Proisl <thomas.proisl@fau.de>
Date: Fri, 9 Feb 2024 09:38:33 +0100
Subject: [PATCH] Correctly tokenize URLs in angle brackets (#27)

---
 CHANGES.txt             | 4 ++++
 pyproject.toml          | 2 +-
 src/somajo/tokenizer.py | 4 ++--
 tests/test_tokenizer.py | 3 +++
 4 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index 28f2e47..32fa07c 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,9 @@
 # CHANGELOG #
 
+## Version 2.4.1, 2024-02-09 ##
+
+- Fix issue #27 (URLs in angle brackets).
+
 ## Version 2.4.0, 2023-12-23 ##
 
 - New feature: SoMaJo can output character offsets for tokens,
diff --git a/pyproject.toml b/pyproject.toml
index 475d7b2..c8c2282 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -11,7 +11,7 @@
 # a new release
 [project]
 name = "SoMaJo"
-version = "2.4.0"
+version = "2.4.1"
 description = "A tokenizer and sentence splitter for German and English web and social media texts."
 readme = "README.md"
 requires-python = ">=3.8"
diff --git a/src/somajo/tokenizer.py b/src/somajo/tokenizer.py
index 4e99896..4d13eef 100644
--- a/src/somajo/tokenizer.py
+++ b/src/somajo/tokenizer.py
@@ -88,8 +88,8 @@ def __init__(self, split_camel_case=False, token_classes=False, extra_info=False
         self.email = re.compile(r"\b[\w.%+-]+(?:@| \[at\] )[\w.-]+(?:\.| \[?dot\]? )\p{L}{2,}\b")
         # simple regex for urls that start with http or www
         # no square brackets and spaces in URL: [^][ ]
-        self.simple_url_with_brackets = re.compile(r'\b(?:(?:https?|ftp|svn)://|(?:https?://)?www\.)[^][ ]+?\(\S*?\)[^][ ]*(?=$|[\'. "!?,;])', re.IGNORECASE)
-        self.simple_url = re.compile(r'\b(?:(?:https?|ftp|svn)://|(?:https?://)?www\.)[^][ ]+[^][\'. "!?,;:()]', re.IGNORECASE)
+        self.simple_url_with_brackets = re.compile(r'\b(?:(?:https?|ftp|svn)://|(?:https?://)?www\.)[^][<> ]+?\(\S*?\)[^][<> ]*(?=$|[\'. "!?,;])', re.IGNORECASE)
+        self.simple_url = re.compile(r'\b(?:(?:https?|ftp|svn)://|(?:https?://)?www\.)[^][<> ]+[^][<>\'. "!?,;:()]', re.IGNORECASE)
         self.doi = re.compile(r'\bdoi:10\.\d+/\S+', re.IGNORECASE)
         self.doi_with_space = re.compile(r'(?<=\bdoi: )10\.\d+/\S+', re.IGNORECASE)
         # regex for ISBNs adapted from:
diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py
index bc69365..0218f4d 100644
--- a/tests/test_tokenizer.py
+++ b/tests/test_tokenizer.py
@@ -1040,6 +1040,9 @@ def test_emails_urls_26(self):
     def test_emails_urls_27(self):
         self._equal("link: [Linktext „viel“ Text](https://other_link.com).", "link : [ Linktext „ viel “ Text ] ( https://other_link.com ) .")
 
+    def test_emails_urls_28(self):
+        self._equal("link: <https://one_link.com>.", "link : < https://one_link.com > .")
+
 class TestAbbreviations(TestTokenizer):
 
     def test_abbreviations_01(self):