Skip to content

Commit

Permalink
support custom abbreviation
Browse files Browse the repository at this point in the history
  • Loading branch information
Kai Kramer committed May 2, 2024
1 parent 5fc88d2 commit 91b7651
Show file tree
Hide file tree
Showing 3 changed files with 13 additions and 3 deletions.
4 changes: 2 additions & 2 deletions src/somajo/somajo.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,14 +43,14 @@ class SoMaJo:
paragraph_separators = {"empty_lines", "single_newlines"}
_default_parsep = "empty_lines"

def __init__(self, language, *, split_camel_case=False, split_sentences=True, xml_sentences=None, character_offsets=False, custom_abbreviations=None):
    """Configure the tokenizer and, optionally, the sentence splitter.

    Args:
        language: Language code; must be a member of
            ``self.supported_languages``.
        split_camel_case: Stored on the instance and forwarded to
            ``Tokenizer``.
        split_sentences: When true, a ``SentenceSplitter`` for
            ``language`` is created as well.
        xml_sentences: Stored on the instance unchanged.
        character_offsets: Stored on the instance unchanged.
        custom_abbreviations: Optional iterable of extra abbreviation
            strings forwarded to ``Tokenizer``. ``None`` (the default)
            means no extra abbreviations.
    """
    assert language in self.supported_languages
    self.language = language
    self.split_camel_case = split_camel_case
    self.split_sentences = split_sentences
    self.xml_sentences = xml_sentences
    self.character_offsets = character_offsets
    # A ``[]`` default would be one list object shared by every call, so
    # the default is ``None`` and a fresh list is built per instance. The
    # copy also keeps the caller's own list from being aliased downstream.
    extra_abbreviations = [] if custom_abbreviations is None else list(custom_abbreviations)
    self._tokenizer = Tokenizer(split_camel_case=self.split_camel_case, language=self.language, custom_abbreviations=extra_abbreviations)
    if self.split_sentences:
        self._sentence_splitter = SentenceSplitter(language=self.language)

Expand Down
4 changes: 3 additions & 1 deletion src/somajo/tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ class Tokenizer():
_supported_languages = {"de", "de_CMC", "en", "en_PTB"}
_default_language = "de_CMC"

def __init__(self, split_camel_case=False, token_classes=False, extra_info=False, language="de_CMC"):
def __init__(self, split_camel_case=False, token_classes=False, extra_info=False, language="de_CMC",custom_abbreviations=[]):
"""Create a Tokenizer object. If split_camel_case is set to True,
tokens written in CamelCase will be split. If token_classes is
set to true, the tokenizer will output the token class for
Expand Down Expand Up @@ -287,6 +287,8 @@ def __init__(self, split_camel_case=False, token_classes=False, extra_info=False
self.multipart_abbreviation = re.compile(r'(?:\p{L}+\.){2,}')
# only abbreviations that are not matched by (?:\p{L}\.)+
abbreviation_list = utils.read_abbreviation_file("abbreviations_%s.txt" % self.language[:2], to_lower=True)
if custom_abbreviations:
abbreviation_list += custom_abbreviations
# abbrev_simple = [(a, re.search(r"^\p{L}{2,}\.$", a)) for a in abbreviation_list]
# self.simple_abbreviations = set([a[0].lower() for a in abbrev_simple if a[1]])
# self.simple_abbreviation_candidates = re.compile(r"(?<![\w.])\p{L}{2,}\.(?!\p{L}{1,3}\.)")
Expand Down
8 changes: 8 additions & 0 deletions tests/test_tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,11 @@ class TestEnglishTokenizer(TestTokenizer):
def setUp(self):
    """Create the ``en_PTB`` tokenizer (CamelCase splitting enabled) for each test."""
    tokenizer_options = {"language": "en_PTB", "split_camel_case": True}
    self.tokenizer = Tokenizer(**tokenizer_options)

class TestCustomTokenizer(TestTokenizer):
    """Base fixture for tests that run with user-supplied abbreviations."""

    def setUp(self):
        """Create a ``de_CMC`` tokenizer that is also given "Brem." and "GBl."."""
        extra_abbreviations = ["Brem.", "GBl."]
        self.tokenizer = Tokenizer(
            language="de_CMC",
            split_camel_case=True,
            custom_abbreviations=extra_abbreviations,
        )


class TestTokenizerDeprecated(TestTokenizer):
Expand Down Expand Up @@ -1438,6 +1443,9 @@ def test_english_30(self):
def test_english_31(self):
    # Input and expected output are the same text: the apostrophes in
    # "l'Enfer" and "L'Éternelle" must not cause a split.
    sentence = "I prefer La Porte de l'Enfer to L'Éternelle idole"
    self._equal(sentence, sentence)

# NOTE(review): the class name has a doubled "t" ("Testt"); it is kept
# unchanged here so any existing test selection by name keeps working.
class TesttCustomAbbreviation(TestCustomTokenizer):
    """Tests that run against the tokenizer built by ``TestCustomTokenizer``."""

    def test_abbreviations_custom(self):
        # Two custom abbreviations written back to back, with no space.
        raw_text = "Brem.GBl."
        expected_tokens = "Brem. GBl."
        self._equal(raw_text, expected_tokens)

class TestDeprecated(TestTokenizerDeprecated):
def test_deprecated_01(self):
Expand Down

0 comments on commit 91b7651

Please sign in to comment.