From 5d57e7dd66b6c45113e6355f4a3b8d1fdfbc8bc9 Mon Sep 17 00:00:00 2001
From: mh-northlander <mh.northlander+github@gmail.com>
Date: Mon, 2 Dec 2024 09:46:42 +0900
Subject: [PATCH 1/2] mv dicId application from wordIdTable to
 lexiconSet/WordLookup/DictPrinter

---
 .../sudachi/dictionary/DictionaryPrinter.java | 12 +++--
 .../dictionary/DoubleArrayLexicon.java        | 54 +++++++++++--------
 .../nlp/sudachi/dictionary/LexiconSet.java    | 28 +++++++---
 .../nlp/sudachi/dictionary/WordIdTable.java   | 25 +++++----
 .../nlp/sudachi/dictionary/WordLookup.java    |  9 +++-
 5 files changed, 80 insertions(+), 48 deletions(-)

diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinter.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinter.java
index 56ceea92..077ec735 100644
--- a/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinter.java
+++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinter.java
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021 Works Applications Co., Ltd.
+ * Copyright (c) 2021-2024 Works Applications Co., Ltd.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -78,15 +78,18 @@ public enum WordRefMode {
 
         this.output = output;
 
-        if (base == null) {
+        int dicIdMask;
+        if (base == null) { // system
             grammar = dic.getGrammar();
             lex = new LexiconSet(dic.getLexicon(), grammar.getSystemPartOfSpeechSize());
-        } else {
+            dicIdMask = WordId.dicIdMask(0);
+        } else { // user
             grammar = base.getGrammar();
             lex = new LexiconSet(base.getLexicon(), grammar.getSystemPartOfSpeechSize());
 
             lex.add(dic.getLexicon(), (short) grammar.getPartOfSpeechSize());
             grammar.addPosList(dic.getGrammar());
+            dicIdMask = WordId.dicIdMask(1);
         }
 
         // set default char category for text normalizer
@@ -104,6 +107,9 @@ public enum WordRefMode {
             allIds.appendAll(ids.next());
         }
         allIds.sort();
+        for (int i = 0; i < allIds.length(); i++) {
+            allIds.set(i, WordId.applyMask(allIds.get(i), dicIdMask));
+        }
         wordIds = allIds;
     }
 
diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/DoubleArrayLexicon.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/DoubleArrayLexicon.java
index 9f3677a8..418f6ce6 100644
--- a/src/main/java/com/worksap/nlp/sudachi/dictionary/DoubleArrayLexicon.java
+++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/DoubleArrayLexicon.java
@@ -25,22 +25,34 @@
 import com.worksap.nlp.sudachi.MorphemeList;
 import com.worksap.nlp.sudachi.Tokenizer;
 
+/**
+ * The main lexicon implementation.
+ * 
+ * In V1 format, it consists of the following parts. {@link DoubleArray}
+ * (TRIE): mapping from index form to WordIdTable offset. {@link WordIdTable}:
+ * table of lists of word ids that have the same index form.
+ * {@link WordParameters}/{@link WordInfoList}: lists of word information, for
+ * analysis/non-analysis respectively. A word id represents an offset in them.
+ * {@link CompactedStrings}: storage of strings such as headword, reading form,
+ * etc.
+ */
 public class DoubleArrayLexicon implements Lexicon {
     static final int USER_DICT_COST_PAR_MORPH = -20;
-    private final WordInfoList wordInfos;
+
+    private final Description description;
     private final DoubleArray trie;
+    private final WordInfoList wordInfos;
     private final WordParameters parameters;
-    private final Description description;
     private final WordIdTable wordIdTable;
     private final CompactedStrings strings;
 
     public DoubleArrayLexicon(Description description, WordIdTable wordIdTable, WordParameters wordParams,
             WordInfoList wordInfos, DoubleArray trie, CompactedStrings strings) {
         this.description = description;
+        this.trie = trie;
         this.wordIdTable = wordIdTable;
         this.parameters = wordParams;
         this.wordInfos = wordInfos;
-        this.trie = trie;
         this.strings = strings;
     }
 
@@ -86,29 +98,16 @@ public Iterator<int[]> lookup(byte[] text, int offset) {
         if (!iterator.hasNext()) {
             return iterator;
         }
-        return new Itr(iterator);
-    }
-
-    public IntBuffer getTrieArray() {
-        return trie.array();
-    }
-
-    public WordIdTable getWordIdTable() {
-        return wordIdTable;
-    }
-
-    @Override
-    public long parameters(int wordId) {
-        return parameters.loadParams(wordId);
+        return new LookupItr(iterator);
     }
 
-    private class Itr implements Iterator<int[]> {
+    private class LookupItr implements Iterator<int[]> {
         private final Iterator<int[]> iterator;
         private int[] wordIds;
         private int length;
         private int index;
 
-        Itr(Iterator<int[]> iterator) {
+        LookupItr(Iterator<int[]> iterator) {
             this.iterator = iterator;
             index = -1;
         }
@@ -134,6 +133,19 @@ public int[] next() {
         }
     }
 
+    public IntBuffer getTrieArray() {
+        return trie.array();
+    }
+
+    public WordIdTable getWordIdTable() {
+        return wordIdTable;
+    }
+
+    @Override
+    public long parameters(int wordId) {
+        return parameters.loadParams(wordId);
+    }
+
     @Override
     public String string(int dic, int stringPtr) {
         return strings.string(stringPtr);
@@ -220,10 +232,6 @@ public void calculateDynamicCosts(Tokenizer tokenizer) {
         }
     }
 
-    public void setDictionaryId(int id) {
-        wordIdTable.setDictionaryId(id);
-    }
-
     @Override
     public WordInfoList wordInfos(int dic) {
         return wordInfos;
diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/LexiconSet.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/LexiconSet.java
index 033abc4d..bc1d7954 100644
--- a/src/main/java/com/worksap/nlp/sudachi/dictionary/LexiconSet.java
+++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/LexiconSet.java
@@ -20,6 +20,14 @@
 
 import java.util.*;
 
+/**
+ * A lexicon that contains multiple lexicons inside.
+ * 
+ * It only accepts {@link DoubleArrayLexicon} now. This lexicon cannot be
+ * nested.
+ * 
+ * Handles the dictionary part of the word id.
+ */
 public class LexiconSet implements Lexicon {
     static final int MAX_DICTIONARIES = 15;
 
@@ -33,9 +41,7 @@ public LexiconSet(Lexicon systemLexicon, short systemPartOfSpeechSize) {
     }
 
     public void add(Lexicon lexicon, short posOffset) {
-        DoubleArrayLexicon daLexicon = (DoubleArrayLexicon) lexicon;
-        daLexicon.setDictionaryId(lexicons.size());
-        lexicons.add(daLexicon);
+        lexicons.add((DoubleArrayLexicon) lexicon);
         posOffsets.add(posOffset);
     }
 
@@ -51,7 +57,7 @@ public Iterator<int[]> lookup(byte[] text, int offset) {
         if (lexicons.size() == 1) {
             return lexicons.get(0).lookup(text, offset);
         }
-        return new Itr(text, offset, lexicons.size() - 1);
+        return new LookupItr(text, offset, lexicons.size() - 1);
     }
 
     /**
@@ -63,16 +69,18 @@ public Iterator<int[]> lookup(byte[] text, int offset) {
      *
      * Dictionaries have their word weights prioritized in the same manner
      */
-    private class Itr implements Iterator<int[]> {
+    private class LookupItr implements Iterator<int[]> {
         byte[] text;
         int offset;
         int dictId;
+        int dictMask;
         Iterator<int[]> iterator;
 
-        Itr(byte[] text, int offset, int start) {
+        LookupItr(byte[] text, int offset, int start) {
             this.text = text;
             this.offset = offset;
             dictId = start;
+            dictMask = WordId.dicIdMask(start);
             iterator = lexicons.get(dictId).lookup(text, offset);
         }
 
@@ -85,6 +93,7 @@ public boolean hasNext() {
                 }
                 iterator = lexicons.get(nextId).lookup(text, offset);
                 dictId = nextId;
+                dictMask = WordId.dicIdMask(nextId);
             }
             return true;
         }
@@ -93,7 +102,7 @@ public boolean hasNext() {
         public int[] next() {
             if (hasNext()) {
                 int[] r = iterator.next();
-                r[0] = buildWordId(dictId, r[0]);
+                r[0] = WordId.applyMask(r[0], dictMask);
                 return r;
             }
             throw new NoSuchElementException();
@@ -177,10 +186,12 @@ public Iterator<Integer> wordIds() {
 
     private class WordIdItr implements Iterator<Integer> {
         private int dictId;
+        private int dictMask;
         private Iterator<Integer> iterator;
 
         WordIdItr() {
             this.dictId = 0;
+            this.dictMask = WordId.dicIdMask(dictId);
             this.iterator = lexicons.get(dictId).wordIds();
         }
 
@@ -192,6 +203,7 @@ public boolean hasNext() {
                     return false;
                 }
                 dictId = nextDictId;
+                dictMask = WordId.dicIdMask(nextDictId);
                 iterator = lexicons.get(nextDictId).wordIds();
             }
             return true;
@@ -202,7 +214,7 @@ public Integer next() {
             if (!hasNext()) {
                 throw new NoSuchElementException();
             }
-            return iterator.next();
+            return WordId.applyMask(iterator.next(), dictMask);
         }
     }
 }
diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/WordIdTable.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/WordIdTable.java
index 7c812d4e..45841cb1 100644
--- a/src/main/java/com/worksap/nlp/sudachi/dictionary/WordIdTable.java
+++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/WordIdTable.java
@@ -24,14 +24,17 @@
 import java.util.Iterator;
 
 /**
- * Table which contains the list of (internal) word ids that has same index
- * form.
+ * Lexicon part that contains the lists of (internal) word ids that have the
+ * same index form.
  * 
- * Automatically fills dict parts of word id using the dicId set.
+ * DoubleArray has a mapping from index form to an offset in this table, and
+ * {@link WordInfoList} has the actual data for each word.
+ * 
+ * In V1 format, the word ids within each list in this table are sorted (and
+ * compressed using varint-32), but they are not sorted between lists.
  */
 public class WordIdTable {
     private final ByteBuffer bytes;
-    private int dicIdMask = 0;
 
     WordIdTable(ByteBuffer bytes) {
         this.bytes = bytes;
@@ -43,7 +46,7 @@ int[] get(int index) {
         BufReader reader = new BufReader(dup);
         int length = reader.readVarint32();
         int[] result = new int[length];
-        readDeltaCompressed(result, length, this.dicIdMask, reader);
+        readDeltaCompressed(result, length, reader);
         return result;
     }
 
@@ -62,23 +65,19 @@ int readWordIds(int index, WordLookup lookup) {
         BufReader reader = new BufReader(dup);
         int length = reader.readVarint32();
         int[] result = lookup.outputBuffer(length);
-        readDeltaCompressed(result, length, this.dicIdMask, reader);
+        readDeltaCompressed(result, length, reader);
         return length;
     }
 
-    private static void readDeltaCompressed(int[] result, int count, int mask, BufReader reader) {
+    private static void readDeltaCompressed(int[] result, int count, BufReader reader) {
         int sum = 0;
         for (int i = 0; i < count; ++i) {
             int v = reader.readVarint32();
-            result[i] = WordId.applyMask(v + sum, mask);
+            result[i] = v + sum;
             sum += v;
         }
     }
 
-    void setDictionaryId(int dictId) {
-        dicIdMask = WordId.dicIdMask(dictId);
-    }
-
     /**
      * Iterates over all valid word ids in the dictionary. Iteration order is not
      * the same as the original dictionary order, but dictionary ids, when sorted,
@@ -109,7 +108,7 @@ public Ints next() {
                 }
                 ints.clear();
                 int[] data = ints.prepare(size);
-                readDeltaCompressed(data, size, dicIdMask, r);
+                readDeltaCompressed(data, size, r);
                 return ints;
             }
         };
diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/WordLookup.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/WordLookup.java
index 16dbd07f..414fc82e 100644
--- a/src/main/java/com/worksap/nlp/sudachi/dictionary/WordLookup.java
+++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/WordLookup.java
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022 Works Applications Co., Ltd.
+ * Copyright (c) 2022-2024 Works Applications Co., Ltd.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -36,6 +36,7 @@ public final class WordLookup {
     private int numWords;
     private final List<DoubleArrayLexicon> lexicons;
     private int currentLexicon = -1;
+    private int dictMask;
 
     public WordLookup(List<DoubleArrayLexicon> lexicons) {
         this.lexicons = lexicons;
@@ -58,6 +59,7 @@ private void rebind(DoubleArrayLexicon lexicon) {
      */
     public void reset(byte[] key, int offset, int limit) {
         currentLexicon = lexicons.size() - 1;
+        dictMask = WordId.dicIdMask(currentLexicon);
         rebind(lexicons.get(currentLexicon));
         lookup.reset(key, offset, limit);
     }
@@ -90,9 +92,14 @@ public boolean next() {
             }
             rebind(lexicons.get(nextLexicon));
             currentLexicon = nextLexicon;
+            dictMask = WordId.dicIdMask(nextLexicon);
         }
         int wordGroupId = lookup.getValue();
         numWords = words.readWordIds(wordGroupId, this);
+        for (int i = 0; i < numWords; ++i) {
+            int internalId = wordIds[i];
+            wordIds[i] = WordId.applyMask(internalId, dictMask);
+        }
         return true;
     }
 

From d458f88b5a8e1bf75f496274e47c260f92e36a67 Mon Sep 17 00:00:00 2001
From: mh-northlander <mh.northlander+github@gmail.com>
Date: Mon, 2 Dec 2024 10:50:49 +0900
Subject: [PATCH 2/2] use public api to get word id list

---
 .../sudachi/dictionary/DictionaryPrinter.java | 23 ++++++++-----------
 1 file changed, 10 insertions(+), 13 deletions(-)

diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinter.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinter.java
index 077ec735..96c7d9d2 100644
--- a/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinter.java
+++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinter.java
@@ -42,7 +42,7 @@ public class DictionaryPrinter {
     private final LexiconSet lex;
     private final TextNormalizer textNormalizer;
     // sorted raw word ids taken from the target dict.
-    private final Ints wordIds;
+    private final int[] wordIds;
 
     private POSMode posMode = POSMode.DEFAULT;
     private WordRefMode wordRefMode = WordRefMode.DEFAULT;
@@ -98,18 +98,15 @@ public enum WordRefMode {
 
         // In order to output dictionary entries in in-dictionary order we need to sort
         // them. Iterator over them will get them not in the sorted order, but grouped
-        // by index-form. Here we assume DoubleArrayLexicon and use WordIdTable.wordIds
-        // for the performance.
-        DoubleArrayLexicon targetLex = dic.getLexicon();
-        Ints allIds = new Ints(targetLex.size());
-        Iterator<Ints> ids = targetLex.getWordIdTable().wordIds();
+        // by index-form.
+        Lexicon targetLexicon = dic.getLexicon();
+        int[] allIds = new int[targetLexicon.size()];
+        int idx = 0;
+        Iterator<Integer> ids = targetLexicon.wordIds();
         while (ids.hasNext()) {
-            allIds.appendAll(ids.next());
-        }
-        allIds.sort();
-        for (int i = 0; i < allIds.length(); i++) {
-            allIds.set(i, WordId.applyMask(allIds.get(i), dicIdMask));
+            allIds[idx++] = WordId.applyMask(ids.next(), dicIdMask);
         }
+        Arrays.sort(allIds);
         wordIds = allIds;
     }
 
@@ -176,9 +173,9 @@ void printColumnHeaders(List<Column> headers) {
 
     private void printEntries() {
         progress.startBlock("Entries", System.nanoTime(), Progress.Kind.ENTRY);
-        long size = wordIds.length();
+        long size = wordIds.length;
         for (int i = 0; i < size; ++i) {
-            printEntry(wordIds.get(i));
+            printEntry(wordIds[i]);
             progress.progress(i, size);
         }
         progress.endBlock(size, System.nanoTime());