From 91b76515b800afcaec8ebbb3083520f6104b6851 Mon Sep 17 00:00:00 2001 From: Kai Kramer Date: Thu, 2 May 2024 13:05:11 +0200 Subject: [PATCH] support custom abbreviation --- src/somajo/somajo.py | 4 ++-- src/somajo/tokenizer.py | 4 +++- tests/test_tokenizer.py | 8 ++++++++ 3 files changed, 13 insertions(+), 3 deletions(-) diff --git a/src/somajo/somajo.py b/src/somajo/somajo.py index b8c0cb8..59a5f32 100644 --- a/src/somajo/somajo.py +++ b/src/somajo/somajo.py @@ -43,14 +43,14 @@ class SoMaJo: paragraph_separators = {"empty_lines", "single_newlines"} _default_parsep = "empty_lines" - def __init__(self, language, *, split_camel_case=False, split_sentences=True, xml_sentences=None, character_offsets=False): + def __init__(self, language, *, split_camel_case=False, split_sentences=True, xml_sentences=None, character_offsets=False,custom_abbreviations=[]): assert language in self.supported_languages self.language = language self.split_camel_case = split_camel_case self.split_sentences = split_sentences self.xml_sentences = xml_sentences self.character_offsets = character_offsets - self._tokenizer = Tokenizer(split_camel_case=self.split_camel_case, language=self.language) + self._tokenizer = Tokenizer(split_camel_case=self.split_camel_case, language=self.language,custom_abbreviations=custom_abbreviations) if self.split_sentences: self._sentence_splitter = SentenceSplitter(language=self.language) diff --git a/src/somajo/tokenizer.py b/src/somajo/tokenizer.py index d9b11c8..892581c 100644 --- a/src/somajo/tokenizer.py +++ b/src/somajo/tokenizer.py @@ -19,7 +19,7 @@ class Tokenizer(): _supported_languages = {"de", "de_CMC", "en", "en_PTB"} _default_language = "de_CMC" - def __init__(self, split_camel_case=False, token_classes=False, extra_info=False, language="de_CMC"): + def __init__(self, split_camel_case=False, token_classes=False, extra_info=False, language="de_CMC",custom_abbreviations=[]): """Create a Tokenizer object. If split_camel_case is set to True, tokens written in CamelCase will be split. If token_classes is set to true, the tokenizer will output the token class for @@ -287,6 +287,8 @@ def __init__(self, split_camel_case=False, token_classes=False, extra_info=False self.multipart_abbreviation = re.compile(r'(?:\p{L}+\.){2,}') # only abbreviations that are not matched by (?:\p{L}\.)+ abbreviation_list = utils.read_abbreviation_file("abbreviations_%s.txt" % self.language[:2], to_lower=True) + if custom_abbreviations: + abbreviation_list += custom_abbreviations # abbrev_simple = [(a, re.search(r"^\p{L}{2,}\.$", a)) for a in abbreviation_list] # self.simple_abbreviations = set([a[0].lower() for a in abbrev_simple if a[1]]) # self.simple_abbreviation_candidates = re.compile(r"(?