From 5d57e7dd66b6c45113e6355f4a3b8d1fdfbc8bc9 Mon Sep 17 00:00:00 2001 From: mh-northlander Date: Mon, 2 Dec 2024 09:46:42 +0900 Subject: [PATCH 1/2] mv dicId application from wordIdTable to lexiconSet/WordLookup/DictPrinter --- .../sudachi/dictionary/DictionaryPrinter.java | 12 +++-- .../dictionary/DoubleArrayLexicon.java | 54 +++++++++++-------- .../nlp/sudachi/dictionary/LexiconSet.java | 28 +++++++--- .../nlp/sudachi/dictionary/WordIdTable.java | 25 +++++---- .../nlp/sudachi/dictionary/WordLookup.java | 9 +++- 5 files changed, 80 insertions(+), 48 deletions(-) diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinter.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinter.java index 56ceea92..077ec735 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinter.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinter.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Works Applications Co., Ltd. + * Copyright (c) 2021-2024 Works Applications Co., Ltd. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -78,15 +78,18 @@ public enum WordRefMode { this.output = output; - if (base == null) { + int dicIdMask; + if (base == null) { // system grammar = dic.getGrammar(); lex = new LexiconSet(dic.getLexicon(), grammar.getSystemPartOfSpeechSize()); - } else { + dicIdMask = WordId.dicIdMask(0); + } else { // user grammar = base.getGrammar(); lex = new LexiconSet(base.getLexicon(), grammar.getSystemPartOfSpeechSize()); lex.add(dic.getLexicon(), (short) grammar.getPartOfSpeechSize()); grammar.addPosList(dic.getGrammar()); + dicIdMask = WordId.dicIdMask(1); } // set default char category for text normalizer @@ -104,6 +107,9 @@ public enum WordRefMode { allIds.appendAll(ids.next()); } allIds.sort(); + for (int i = 0; i < allIds.length(); i++) { + allIds.set(i, WordId.applyMask(allIds.get(i), dicIdMask)); + } wordIds = allIds; } diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/DoubleArrayLexicon.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/DoubleArrayLexicon.java index 9f3677a8..418f6ce6 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/DoubleArrayLexicon.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/DoubleArrayLexicon.java @@ -25,22 +25,34 @@ import com.worksap.nlp.sudachi.MorphemeList; import com.worksap.nlp.sudachi.Tokenizer; +/** + * The main lexicon implementation. + * + * In V1 format, it consists of followings. {@link DoubleArray} (TRIE): Mapping + * from index form to WordIdTable offset. {@link WordIdTable}: Table of list of + * word ids that have same index form. + * {@link WordParameters}/{@link WordInfoList}: List of word information, for + * analysis/non-analysis respectively. Word id represents offset in them. + * {@link CompactedStrings}: Storage of strings such as headword, reading form, + * etc. 
+ */ public class DoubleArrayLexicon implements Lexicon { static final int USER_DICT_COST_PAR_MORPH = -20; - private final WordInfoList wordInfos; + + private final Description description; private final DoubleArray trie; + private final WordInfoList wordInfos; private final WordParameters parameters; - private final Description description; private final WordIdTable wordIdTable; private final CompactedStrings strings; public DoubleArrayLexicon(Description description, WordIdTable wordIdTable, WordParameters wordParams, WordInfoList wordInfos, DoubleArray trie, CompactedStrings strings) { this.description = description; + this.trie = trie; this.wordIdTable = wordIdTable; this.parameters = wordParams; this.wordInfos = wordInfos; - this.trie = trie; this.strings = strings; } @@ -86,29 +98,16 @@ public Iterator lookup(byte[] text, int offset) { if (!iterator.hasNext()) { return iterator; } - return new Itr(iterator); - } - - public IntBuffer getTrieArray() { - return trie.array(); - } - - public WordIdTable getWordIdTable() { - return wordIdTable; - } - - @Override - public long parameters(int wordId) { - return parameters.loadParams(wordId); + return new LookupItr(iterator); } - private class Itr implements Iterator { + private class LookupItr implements Iterator { private final Iterator iterator; private int[] wordIds; private int length; private int index; - Itr(Iterator iterator) { + LookupItr(Iterator iterator) { this.iterator = iterator; index = -1; } @@ -134,6 +133,19 @@ public int[] next() { } } + public IntBuffer getTrieArray() { + return trie.array(); + } + + public WordIdTable getWordIdTable() { + return wordIdTable; + } + + @Override + public long parameters(int wordId) { + return parameters.loadParams(wordId); + } + @Override public String string(int dic, int stringPtr) { return strings.string(stringPtr); @@ -220,10 +232,6 @@ public void calculateDynamicCosts(Tokenizer tokenizer) { } } - public void setDictionaryId(int id) { - 
wordIdTable.setDictionaryId(id); - } - @Override public WordInfoList wordInfos(int dic) { return wordInfos; diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/LexiconSet.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/LexiconSet.java index 033abc4d..bc1d7954 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/LexiconSet.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/LexiconSet.java @@ -20,6 +20,14 @@ import java.util.*; +/** + * A lexicon that contains multiple lexicons inside. + * + * It only accepts {@link DoubleArrayLexicon} now. This lexicon cannot be + * nested. + * + * Handles dictionary part of the word id. + */ public class LexiconSet implements Lexicon { static final int MAX_DICTIONARIES = 15; @@ -33,9 +41,7 @@ public LexiconSet(Lexicon systemLexicon, short systemPartOfSpeechSize) { } public void add(Lexicon lexicon, short posOffset) { - DoubleArrayLexicon daLexicon = (DoubleArrayLexicon) lexicon; - daLexicon.setDictionaryId(lexicons.size()); - lexicons.add(daLexicon); + lexicons.add((DoubleArrayLexicon) lexicon); posOffsets.add(posOffset); } @@ -51,7 +57,7 @@ public Iterator lookup(byte[] text, int offset) { if (lexicons.size() == 1) { return lexicons.get(0).lookup(text, offset); } - return new Itr(text, offset, lexicons.size() - 1); + return new LookupItr(text, offset, lexicons.size() - 1); } /** @@ -63,16 +69,18 @@ public Iterator lookup(byte[] text, int offset) { * * Dictionaries have their word weights prioritized in the same manner */ - private class Itr implements Iterator { + private class LookupItr implements Iterator { byte[] text; int offset; int dictId; + int dictMask; Iterator iterator; - Itr(byte[] text, int offset, int start) { + LookupItr(byte[] text, int offset, int start) { this.text = text; this.offset = offset; dictId = start; + dictMask = WordId.dicIdMask(start); iterator = lexicons.get(dictId).lookup(text, offset); } @@ -85,6 +93,7 @@ public boolean hasNext() { } iterator = 
lexicons.get(nextId).lookup(text, offset); dictId = nextId; + dictMask = WordId.dicIdMask(nextId); } return true; } @@ -93,7 +102,7 @@ public boolean hasNext() { public int[] next() { if (hasNext()) { int[] r = iterator.next(); - r[0] = buildWordId(dictId, r[0]); + r[0] = WordId.applyMask(r[0], dictMask); return r; } throw new NoSuchElementException(); @@ -177,10 +186,12 @@ public Iterator wordIds() { private class WordIdItr implements Iterator { private int dictId; + private int dictMask; private Iterator iterator; WordIdItr() { this.dictId = 0; + this.dictMask = WordId.dicIdMask(dictId); this.iterator = lexicons.get(dictId).wordIds(); } @@ -192,6 +203,7 @@ public boolean hasNext() { return false; } dictId = nextDictId; + dictMask = WordId.dicIdMask(nextDictId); iterator = lexicons.get(nextDictId).wordIds(); } return true; @@ -202,7 +214,7 @@ public Integer next() { if (!hasNext()) { throw new NoSuchElementException(); } - return iterator.next(); + return WordId.applyMask(iterator.next(), dictMask); } } } diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/WordIdTable.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/WordIdTable.java index 7c812d4e..45841cb1 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/WordIdTable.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/WordIdTable.java @@ -24,14 +24,17 @@ import java.util.Iterator; /** - * Table which contains the list of (internal) word ids that has same index - * form. + * Lexicon parts that contains the list of (internal) word ids that have the + * same index form. * - * Automatically fills dict parts of word id using the dicId set. + * DoubleArray has mapping from indexForm to offset in this table, and + * {@link WordInfoList} has actual data for each words. + * + * In V1 format, each word ids in a list in this table are sorted (and + * compressed using varint-32), but they are not sorted between lists. 
*/ public class WordIdTable { private final ByteBuffer bytes; - private int dicIdMask = 0; WordIdTable(ByteBuffer bytes) { this.bytes = bytes; @@ -43,7 +46,7 @@ int[] get(int index) { BufReader reader = new BufReader(dup); int length = reader.readVarint32(); int[] result = new int[length]; - readDeltaCompressed(result, length, this.dicIdMask, reader); + readDeltaCompressed(result, length, reader); return result; } @@ -62,23 +65,19 @@ int readWordIds(int index, WordLookup lookup) { BufReader reader = new BufReader(dup); int length = reader.readVarint32(); int[] result = lookup.outputBuffer(length); - readDeltaCompressed(result, length, this.dicIdMask, reader); + readDeltaCompressed(result, length, reader); return length; } - private static void readDeltaCompressed(int[] result, int count, int mask, BufReader reader) { + private static void readDeltaCompressed(int[] result, int count, BufReader reader) { int sum = 0; for (int i = 0; i < count; ++i) { int v = reader.readVarint32(); - result[i] = WordId.applyMask(v + sum, mask); + result[i] = v + sum; sum += v; } } - void setDictionaryId(int dictId) { - dicIdMask = WordId.dicIdMask(dictId); - } - /** * Iterates over all valid word ids in the dictionary. Iteration order is not * the same as the original dictionary order, but dictionary ids, when sorted, @@ -109,7 +108,7 @@ public Ints next() { } ints.clear(); int[] data = ints.prepare(size); - readDeltaCompressed(data, size, dicIdMask, r); + readDeltaCompressed(data, size, r); return ints; } }; diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/WordLookup.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/WordLookup.java index 16dbd07f..414fc82e 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/WordLookup.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/WordLookup.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022 Works Applications Co., Ltd. + * Copyright (c) 2022-2024 Works Applications Co., Ltd. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -36,6 +36,7 @@ public final class WordLookup { private int numWords; private final List lexicons; private int currentLexicon = -1; + private int dictMask; public WordLookup(List lexicons) { this.lexicons = lexicons; @@ -58,6 +59,7 @@ private void rebind(DoubleArrayLexicon lexicon) { */ public void reset(byte[] key, int offset, int limit) { currentLexicon = lexicons.size() - 1; + dictMask = WordId.dicIdMask(currentLexicon); rebind(lexicons.get(currentLexicon)); lookup.reset(key, offset, limit); } @@ -90,9 +92,14 @@ public boolean next() { } rebind(lexicons.get(nextLexicon)); currentLexicon = nextLexicon; + dictMask = WordId.dicIdMask(nextLexicon); } int wordGroupId = lookup.getValue(); numWords = words.readWordIds(wordGroupId, this); + for (int i = 0; i < numWords; ++i) { + int internalId = wordIds[i]; + wordIds[i] = WordId.applyMask(internalId, dictMask); + } return true; } From d458f88b5a8e1bf75f496274e47c260f92e36a67 Mon Sep 17 00:00:00 2001 From: mh-northlander Date: Mon, 2 Dec 2024 10:50:49 +0900 Subject: [PATCH 2/2] use public api to get word id list --- .../sudachi/dictionary/DictionaryPrinter.java | 23 ++++++++----------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinter.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinter.java index 077ec735..96c7d9d2 100644 --- a/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinter.java +++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinter.java @@ -42,7 +42,7 @@ public class DictionaryPrinter { private final LexiconSet lex; private final TextNormalizer textNormalizer; // sorted raw word ids taken from the target dict. 
- private final Ints wordIds; + private final int[] wordIds; private POSMode posMode = POSMode.DEFAULT; private WordRefMode wordRefMode = WordRefMode.DEFAULT; @@ -98,18 +98,15 @@ public enum WordRefMode { // In order to output dictionary entries in in-dictionary order we need to sort // them. Iterator over them will get them not in the sorted order, but grouped - // by index-form. Here we assume DoubleArrayLexicon and use WordIdTable.wordIds - // for the performance. - DoubleArrayLexicon targetLex = dic.getLexicon(); - Ints allIds = new Ints(targetLex.size()); - Iterator ids = targetLex.getWordIdTable().wordIds(); + // by index-form. + Lexicon targetLexicon = dic.getLexicon(); + int[] allIds = new int[targetLexicon.size()]; + int idx = 0; + Iterator ids = targetLexicon.wordIds(); while (ids.hasNext()) { - allIds.appendAll(ids.next()); - } - allIds.sort(); - for (int i = 0; i < allIds.length(); i++) { - allIds.set(i, WordId.applyMask(allIds.get(i), dicIdMask)); + allIds[idx++] = WordId.applyMask(ids.next(), dicIdMask); } + Arrays.sort(allIds); wordIds = allIds; } @@ -176,9 +173,9 @@ void printColumnHeaders(List headers) { private void printEntries() { progress.startBlock("Entries", System.nanoTime(), Progress.Kind.ENTRY); - long size = wordIds.length(); + long size = wordIds.length; for (int i = 0; i < size; ++i) { - printEntry(wordIds.get(i)); + printEntry(wordIds[i]); progress.progress(i, size); } progress.endBlock(size, System.nanoTime());