From 63400e5ebc9dc6562749e95f6a8445b18c12c8bb Mon Sep 17 00:00:00 2001 From: yokomotod Date: Thu, 24 Jun 2021 12:12:46 +0900 Subject: [PATCH 1/2] fix: slow ubuild with word info split --- sudachipy/dictionarylib/doublearraylexicon.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sudachipy/dictionarylib/doublearraylexicon.py b/sudachipy/dictionarylib/doublearraylexicon.py index 6e1a7ce..39337f8 100644 --- a/sudachipy/dictionarylib/doublearraylexicon.py +++ b/sudachipy/dictionarylib/doublearraylexicon.py @@ -78,7 +78,7 @@ def size(self) -> int: return self.word_params.size def get_word_id(self, headword: str, pos_id: int, reading_form: str) -> int: - for wid in range(self.word_infos.size()): + for wid, _ in self.lookup(headword.encode('utf-8'), 0): info = self.word_infos.get_word_info(wid) if info.surface == headword \ and info.pos_id == pos_id \ From fd4561c7dbc09ab9958942dd9db7a66291f2f075 Mon Sep 17 00:00:00 2001 From: yokomotod Date: Thu, 24 Jun 2021 14:21:01 +0900 Subject: [PATCH 2/2] fix: add fallback for not in trie words --- sudachipy/dictionarylib/doublearraylexicon.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/sudachipy/dictionarylib/doublearraylexicon.py b/sudachipy/dictionarylib/doublearraylexicon.py index 39337f8..fb77ee8 100644 --- a/sudachipy/dictionarylib/doublearraylexicon.py +++ b/sudachipy/dictionarylib/doublearraylexicon.py @@ -79,13 +79,21 @@ def size(self) -> int: def get_word_id(self, headword: str, pos_id: int, reading_form: str) -> int: for wid, _ in self.lookup(headword.encode('utf-8'), 0): - info = self.word_infos.get_word_info(wid) - if info.surface == headword \ - and info.pos_id == pos_id \ - and info.reading_form == reading_form: + if self._compare_word_id(wid, headword, pos_id, reading_form): return wid + + for wid in range(self.word_infos.size()): + if self._compare_word_id(wid, headword, pos_id, reading_form): + return wid + return -1 + def _compare_word_id(self, wid: int, headword: str, pos_id: int, reading_form: str) -> bool: + info = self.word_infos.get_word_info(wid) + return info.surface == headword \ + and info.pos_id == pos_id \ + and info.reading_form == reading_form + def get_dictionary_id(self, word_id: int) -> int: return 0