From 9425be0310c2ed992892bca5ae80c688b48d8b70 Mon Sep 17 00:00:00 2001 From: Mika Date: Mon, 23 Aug 2021 22:15:37 +0300 Subject: [PATCH] apertium fix --- test_uralicnlp.py | 4 ++++ uralicNLP/uralicApi.py | 34 +++++++++++++++++++++++++--------- 2 files changed, 29 insertions(+), 9 deletions(-) diff --git a/test_uralicnlp.py b/test_uralicnlp.py index 9ee0a14..98df275 100644 --- a/test_uralicnlp.py +++ b/test_uralicnlp.py @@ -11,9 +11,13 @@ #print(uralicApi.get_all_forms("kissa", "N", "fin")) #uralicApi.get_transducer("spa", analyzer=True).lookup_optimize() +print(uralicApi.analyze("segiz", "kaa")) print(uralicApi.analyze("como", "spa")) print(uralicApi.generate("perro", "spa")) +print(uralicApi.generate("segiz+e", "kaa")) print(uralicApi.lemmatize("como", "spa")) +print(uralicApi.lemmatize("segiz", "kaa")) +print(uralicApi.lemmatize("segiz", "kaa",word_boundaries=True)) #print(type(uralicApi.get_transducer("spa", analyzer=True))) #print() #print(uralicApi.supported_languages()) diff --git a/uralicNLP/uralicApi.py b/uralicNLP/uralicApi.py index b43ef38..b698f8e 100644 --- a/uralicNLP/uralicApi.py +++ b/uralicNLP/uralicApi.py @@ -130,11 +130,14 @@ def __generator_model_name(descriptive, dictionary_forms): return "generator-norm" def __generate_locally(query, language, cache=True, descriptive=False, dictionary_forms=True,filename=None): - generator = get_transducer(language,cache=cache, analyzer=False, descriptive=descriptive, dictionary_forms=dictionary_forms,filename=filename) - r = generator.lookup(query) + generator = get_transducer(language,cache=cache, analyzer=False, descriptive=descriptive, dictionary_forms=dictionary_forms,filename=filename,force_no_list=False) + if not isinstance(generator, list): + generator = [generator] + r = [] + [r.extend(x.lookup(query)) for x in generator] return r -def get_transducer(language, cache=True, analyzer=True, descriptive=True, dictionary_forms=True, convert_to_openfst=False, filename=None): +def get_transducer(language, cache=True, analyzer=True, descriptive=True, dictionary_forms=True, convert_to_openfst=False, filename=None, force_no_list=True): conversion_type = hfst.ImplementationType.TROPICAL_OPENFST_TYPE if not analyzer: #generator @@ -145,7 +148,10 @@ def get_transducer(language, cache=True, analyzer=True, descriptive=True, dictio else: generator = _load_transducer(filename, True) if convert_to_openfst: - generator.convert(conversion_type) + if isinstance(generator, list): + [x.convert(conversion_type) for x in generator] + else: + generator.convert(conversion_type) generator_cache[filename] = generator else: if filename is None: @@ -155,8 +161,13 @@ def get_transducer(language, cache=True, analyzer=True, descriptive=True, dictio else: generator = _load_transducer(filename, False) if convert_to_openfst: - generator.convert(conversion_type) + if isinstance(generator, list): + [x.convert(conversion_type) for x in generator] + else: + generator.convert(conversion_type) analyzer_cache[filename] = generator + if force_no_list and isinstance(generator, list): + generator = generator[0] return generator def _load_transducer(filename, invert): @@ -172,7 +183,7 @@ def _load_transducer(filename, invert): return hfst.AttReader(mikatools.open_read(filename)).read() elif "apertium" in metadata and metadata["apertium"] == True: input_stream = hfst.HfstInputStream(filename) - return input_stream.read_all()[1] + return input_stream.read_all() else: input_stream = hfst.HfstInputStream(filename) return input_stream.read() @@ -188,8 +199,11 @@ def __analyzer_model_name(descriptive, dictionary): return "analyser-norm" def __analyze_locally(query, language, cache=True, descriptive=True, dictionary_forms=False, filename=None): - generator = get_transducer(language,cache=cache, analyzer=True, descriptive=descriptive, dictionary_forms=dictionary_forms,filename=filename) - r = generator.lookup(query) + generator = get_transducer(language,cache=cache, analyzer=True, descriptive=descriptive, dictionary_forms=dictionary_forms,filename=filename, force_no_list=False) + if not isinstance(generator, list): + generator = [generator] + r = [] + [r.extend(x.lookup(query)) for x in generator] return r def __encode_query(query): @@ -303,7 +317,9 @@ def lemmatize(word, language, force_local=True, descriptive=True, word_boundarie lemmas.append(lemma) elif "<" in an and ">" in an: #apertium - lemmas.append(an.split("<")[0]) + parts = an.split("+") + lemma = bound.join([x.split("<")[0] for x in parts]) + lemmas.append(lemma) else: if not "+Cmp#" in an and "#" in an: an = an.replace("#", "+Cmp#")