Merge pull request #256 from WorksApplications/feature/252-dictid-lexiconset

Handle dictid by lexiconset
mh-northlander authored Dec 3, 2024
2 parents 239d02b + d458f88 commit adde77d
Showing 5 changed files with 87 additions and 58 deletions.
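
The theme of the diff below: DoubleArrayLexicon and WordIdTable now return plain internal word ids, and the dictionary part of each id is applied afterwards by the callers (DictionaryPrinter, LexiconSet, WordLookup) via WordId.applyMask. As a rough sketch of how that masking plausibly works — assuming the usual Sudachi packing of a 4-bit dictionary id above a 28-bit internal id, which this diff does not itself show — the two helpers behave roughly like this:

public final class WordIdSketch {
    // Assumed behaviour of WordId.dicIdMask: dictionary id placed in the upper bits.
    static int dicIdMask(int dicId) {
        return dicId << 28;
    }

    // Assumed behaviour of WordId.applyMask: combine the dictionary part with an internal id.
    static int applyMask(int internalId, int mask) {
        return internalId | mask;
    }

    public static void main(String[] args) {
        int userDictMask = dicIdMask(1);                 // first user dictionary
        int wordId = applyMask(12345, userDictMask);     // internal id 12345 in that dictionary
        System.out.printf("dic=%d, internal=%d%n", wordId >>> 28, wordId & 0x0FFFFFFF);
    }
}

Because the mask only touches the upper bits, it can be applied after the raw ids are produced, which is exactly what the changed call sites below do.
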
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021 Works Applications Co., Ltd.
* Copyright (c) 2021-2024 Works Applications Co., Ltd.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -42,7 +42,7 @@ public class DictionaryPrinter {
private final LexiconSet lex;
private final TextNormalizer textNormalizer;
// sorted raw word ids taken from the target dict.
private final Ints wordIds;
private final int[] wordIds;

private POSMode posMode = POSMode.DEFAULT;
private WordRefMode wordRefMode = WordRefMode.DEFAULT;
@@ -78,15 +78,18 @@ public enum WordRefMode {

this.output = output;

if (base == null) {
int dicIdMask;
if (base == null) { // system
grammar = dic.getGrammar();
lex = new LexiconSet(dic.getLexicon(), grammar.getSystemPartOfSpeechSize());
} else {
dicIdMask = WordId.dicIdMask(0);
} else { // user
grammar = base.getGrammar();
lex = new LexiconSet(base.getLexicon(), grammar.getSystemPartOfSpeechSize());

lex.add(dic.getLexicon(), (short) grammar.getPartOfSpeechSize());
grammar.addPosList(dic.getGrammar());
dicIdMask = WordId.dicIdMask(1);
}

// set default char category for text normalizer
@@ -95,15 +98,15 @@ public enum WordRefMode {

// In order to output dictionary entries in in-dictionary order we need to sort
// them. Iterator over them will get them not in the sorted order, but grouped
// by index-form. Here we assume DoubleArrayLexicon and use WordIdTable.wordIds
// for the performance.
DoubleArrayLexicon targetLex = dic.getLexicon();
Ints allIds = new Ints(targetLex.size());
Iterator<Ints> ids = targetLex.getWordIdTable().wordIds();
// by index-form.
Lexicon targetLexicon = dic.getLexicon();
int[] allIds = new int[targetLexicon.size()];
int idx = 0;
Iterator<Integer> ids = targetLexicon.wordIds();
while (ids.hasNext()) {
allIds.appendAll(ids.next());
allIds[idx++] = WordId.applyMask(ids.next(), dicIdMask);
}
allIds.sort();
Arrays.sort(allIds);
wordIds = allIds;
}

@@ -170,9 +173,9 @@ void printColumnHeaders(List<Column> headers) {

private void printEntries() {
progress.startBlock("Entries", System.nanoTime(), Progress.Kind.ENTRY);
long size = wordIds.length();
long size = wordIds.length;
for (int i = 0; i < size; ++i) {
printEntry(wordIds.get(i));
printEntry(wordIds[i]);
progress.progress(i, size);
}
progress.endBlock(size, System.nanoTime());
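
A condensed paraphrase of the new DictionaryPrinter setup above: collect the raw ids from the target lexicon, apply the dictionary part chosen by the system/user branch, and sort to restore in-dictionary order. LexiconView is a hypothetical stand-in mirroring only the two Lexicon members the constructor uses, and the 28-bit shift mirrors the assumed WordId.dicIdMask layout:

import java.util.Arrays;
import java.util.Iterator;

interface LexiconView {              // stand-in: only the members the sketch needs
    int size();
    Iterator<Integer> wordIds();
}

final class PrinterSetupSketch {
    static int[] sortedWordIds(LexiconView targetLexicon, boolean isUserDict) {
        int dicIdMask = (isUserDict ? 1 : 0) << 28;      // system dict is 0, first user dict is 1
        int[] allIds = new int[targetLexicon.size()];
        int idx = 0;
        Iterator<Integer> ids = targetLexicon.wordIds();
        while (ids.hasNext()) {
            allIds[idx++] = ids.next() | dicIdMask;      // assumed WordId.applyMask behaviour
        }
        Arrays.sort(allIds);                             // restore in-dictionary order
        return allIds;
    }
}

As the constructor comment notes, the iterator only groups ids by index form; once the dictionary bits are fixed, a plain numeric sort restores in-dictionary order.
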
@@ -25,22 +25,34 @@
import com.worksap.nlp.sudachi.MorphemeList;
import com.worksap.nlp.sudachi.Tokenizer;

/**
* The main lexicon implementation.
*
* In the V1 format it consists of the following parts. {@link DoubleArray} (TRIE): mapping
* from index form to WordIdTable offset. {@link WordIdTable}: table of the lists of
* word ids that share the same index form.
* {@link WordParameters}/{@link WordInfoList}: lists of word information, for
* analysis and non-analysis respectively. A word id is an offset into them.
* {@link CompactedStrings}: storage of strings such as the headword, reading form,
* etc.
*/
public class DoubleArrayLexicon implements Lexicon {
static final int USER_DICT_COST_PAR_MORPH = -20;
private final WordInfoList wordInfos;

private final Description description;
private final DoubleArray trie;
private final WordInfoList wordInfos;
private final WordParameters parameters;
private final Description description;
private final WordIdTable wordIdTable;
private final CompactedStrings strings;

public DoubleArrayLexicon(Description description, WordIdTable wordIdTable, WordParameters wordParams,
WordInfoList wordInfos, DoubleArray trie, CompactedStrings strings) {
this.description = description;
this.trie = trie;
this.wordIdTable = wordIdTable;
this.parameters = wordParams;
this.wordInfos = wordInfos;
this.trie = trie;
this.strings = strings;
}

@@ -86,29 +98,16 @@ public Iterator<int[]> lookup(byte[] text, int offset) {
if (!iterator.hasNext()) {
return iterator;
}
return new Itr(iterator);
}

public IntBuffer getTrieArray() {
return trie.array();
}

public WordIdTable getWordIdTable() {
return wordIdTable;
}

@Override
public long parameters(int wordId) {
return parameters.loadParams(wordId);
return new LookupItr(iterator);
}

private class Itr implements Iterator<int[]> {
private class LookupItr implements Iterator<int[]> {
private final Iterator<int[]> iterator;
private int[] wordIds;
private int length;
private int index;

Itr(Iterator<int[]> iterator) {
LookupItr(Iterator<int[]> iterator) {
this.iterator = iterator;
index = -1;
}
@@ -134,6 +133,19 @@ public int[] next() {
}
}

public IntBuffer getTrieArray() {
return trie.array();
}

public WordIdTable getWordIdTable() {
return wordIdTable;
}

@Override
public long parameters(int wordId) {
return parameters.loadParams(wordId);
}

@Override
public String string(int dic, int stringPtr) {
return strings.string(stringPtr);
@@ -220,10 +232,6 @@ public void calculateDynamicCosts(Tokenizer tokenizer) {
}
}

public void setDictionaryId(int id) {
wordIdTable.setDictionaryId(id);
}

@Override
public WordInfoList wordInfos(int dic) {
return wordInfos;
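
The new class Javadoc above describes the V1 lexicon layout: the TRIE maps an index form to an offset in the WordIdTable, which lists the word ids sharing that surface, and those ids then index into WordParameters/WordInfoList. A toy model of that indirection, using plain collections as stand-ins for the real structures:

import java.util.List;
import java.util.Map;

final class LookupFlowSketch {
    private final Map<String, Integer> trie;    // index form -> offset into the word-id table
    private final List<int[]> wordIdTable;      // offset -> internal word ids with that index form

    LookupFlowSketch(Map<String, Integer> trie, List<int[]> wordIdTable) {
        this.trie = trie;
        this.wordIdTable = wordIdTable;
    }

    int[] lookup(String indexForm) {
        Integer offset = trie.get(indexForm);
        return offset == null ? new int[0] : wordIdTable.get(offset);
    }
}

After this change the ids coming out of this layer are purely internal; setDictionaryId is gone, and the dictionary part is added by LexiconSet and WordLookup.
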
28 changes: 20 additions & 8 deletions src/main/java/com/worksap/nlp/sudachi/dictionary/LexiconSet.java
@@ -20,6 +20,14 @@

import java.util.*;

/**
* A lexicon that contains multiple lexicons inside.
*
* It currently accepts only {@link DoubleArrayLexicon}. This lexicon cannot be
* nested.
*
* Handles the dictionary part of the word id.
*/
public class LexiconSet implements Lexicon {
static final int MAX_DICTIONARIES = 15;

@@ -33,9 +41,7 @@ public LexiconSet(Lexicon systemLexicon, short systemPartOfSpeechSize) {
}

public void add(Lexicon lexicon, short posOffset) {
DoubleArrayLexicon daLexicon = (DoubleArrayLexicon) lexicon;
daLexicon.setDictionaryId(lexicons.size());
lexicons.add(daLexicon);
lexicons.add((DoubleArrayLexicon) lexicon);
posOffsets.add(posOffset);
}

@@ -51,7 +57,7 @@ public Iterator<int[]> lookup(byte[] text, int offset) {
if (lexicons.size() == 1) {
return lexicons.get(0).lookup(text, offset);
}
return new Itr(text, offset, lexicons.size() - 1);
return new LookupItr(text, offset, lexicons.size() - 1);
}

/**
@@ -63,16 +69,18 @@ public Iterator<int[]> lookup(byte[] text, int offset) {
*
* Dictionaries have their word weights prioritized in the same manner
*/
private class Itr implements Iterator<int[]> {
private class LookupItr implements Iterator<int[]> {
byte[] text;
int offset;
int dictId;
int dictMask;
Iterator<int[]> iterator;

Itr(byte[] text, int offset, int start) {
LookupItr(byte[] text, int offset, int start) {
this.text = text;
this.offset = offset;
dictId = start;
dictMask = WordId.dicIdMask(start);
iterator = lexicons.get(dictId).lookup(text, offset);
}

@@ -85,6 +93,7 @@ public boolean hasNext() {
}
iterator = lexicons.get(nextId).lookup(text, offset);
dictId = nextId;
dictMask = WordId.dicIdMask(nextId);
}
return true;
}
@@ -93,7 +102,7 @@ public boolean hasNext() {
public int[] next() {
if (hasNext()) {
int[] r = iterator.next();
r[0] = buildWordId(dictId, r[0]);
r[0] = WordId.applyMask(r[0], dictMask);
return r;
}
throw new NoSuchElementException();
@@ -177,10 +186,12 @@ public Iterator<Integer> wordIds() {

private class WordIdItr implements Iterator<Integer> {
private int dictId;
private int dictMask;
private Iterator<Integer> iterator;

WordIdItr() {
this.dictId = 0;
this.dictMask = WordId.dicIdMask(dictId);
this.iterator = lexicons.get(dictId).wordIds();
}

@@ -192,6 +203,7 @@ public boolean hasNext() {
return false;
}
dictId = nextDictId;
dictMask = WordId.dicIdMask(nextDictId);
iterator = lexicons.get(nextDictId).wordIds();
}
return true;
@@ -202,7 +214,7 @@ public Integer next() {
if (!hasNext()) {
throw new NoSuchElementException();
}
return iterator.next();
return WordId.applyMask(iterator.next(), dictMask);
}
}
}
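
The WordIdItr change above is the core of the iteration side of this PR: each per-dictionary iterator still yields internal ids, and the set-level iterator adds the bits of the dictionary it is currently draining. A condensed, self-contained version of that pattern (the 28-bit shift again stands in for the assumed WordId.dicIdMask):

import java.util.Iterator;
import java.util.List;
import java.util.NoSuchElementException;

final class ChainedWordIds implements Iterator<Integer> {
    private final List<Iterator<Integer>> perDictionary;   // one iterator per lexicon, system first
    private int dictId = 0;

    ChainedWordIds(List<Iterator<Integer>> perDictionary) {
        this.perDictionary = perDictionary;                 // assumed to hold at least one lexicon
    }

    @Override
    public boolean hasNext() {
        while (!perDictionary.get(dictId).hasNext()) {
            if (dictId + 1 >= perDictionary.size()) {
                return false;
            }
            dictId++;                                        // move on to the next dictionary
        }
        return true;
    }

    @Override
    public Integer next() {
        if (!hasNext()) {
            throw new NoSuchElementException();
        }
        return perDictionary.get(dictId).next() | (dictId << 28);   // add the dictionary bits
    }
}

LookupItr in the diff follows the same shape for lookup results, except that it starts from the last-added dictionary, in line with the priority comment above.
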
25 changes: 12 additions & 13 deletions src/main/java/com/worksap/nlp/sudachi/dictionary/WordIdTable.java
@@ -24,14 +24,17 @@
import java.util.Iterator;

/**
* Table which contains the list of (internal) word ids that has same index
* form.
* The lexicon part that contains the lists of (internal) word ids that share the
* same index form.
*
* Automatically fills dict parts of word id using the dicId set.
* The DoubleArray maps an index form to an offset in this table, and
* {@link WordInfoList} holds the actual data for each word.
*
* In the V1 format, the word ids within each list in this table are sorted (and
* compressed using varint-32), but they are not sorted across lists.
*/
public class WordIdTable {
private final ByteBuffer bytes;
private int dicIdMask = 0;

WordIdTable(ByteBuffer bytes) {
this.bytes = bytes;
@@ -43,7 +46,7 @@ int[] get(int index) {
BufReader reader = new BufReader(dup);
int length = reader.readVarint32();
int[] result = new int[length];
readDeltaCompressed(result, length, this.dicIdMask, reader);
readDeltaCompressed(result, length, reader);
return result;
}

@@ -62,23 +65,19 @@ int readWordIds(int index, WordLookup lookup) {
BufReader reader = new BufReader(dup);
int length = reader.readVarint32();
int[] result = lookup.outputBuffer(length);
readDeltaCompressed(result, length, this.dicIdMask, reader);
readDeltaCompressed(result, length, reader);
return length;
}

private static void readDeltaCompressed(int[] result, int count, int mask, BufReader reader) {
private static void readDeltaCompressed(int[] result, int count, BufReader reader) {
int sum = 0;
for (int i = 0; i < count; ++i) {
int v = reader.readVarint32();
result[i] = WordId.applyMask(v + sum, mask);
result[i] = v + sum;
sum += v;
}
}

void setDictionaryId(int dictId) {
dicIdMask = WordId.dicIdMask(dictId);
}

/**
* Iterates over all valid word ids in the dictionary. Iteration order is not
* the same as the original dictionary order, but dictionary ids, when sorted,
@@ -109,7 +108,7 @@ public Ints next() {
}
ints.clear();
int[] data = ints.prepare(size);
readDeltaCompressed(data, size, dicIdMask, r);
readDeltaCompressed(data, size, r);
return ints;
}
};
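
readDeltaCompressed above now decodes plain internal ids; the only change is that the dicIdMask parameter (and the setDictionaryId setter) is gone. For reference, a self-contained version of the delta + varint-32 decoding the Javadoc mentions — the LEB128-style 7-bits-per-byte encoding is an assumption about BufReader, not something this diff shows:

import java.nio.ByteBuffer;

final class DeltaVarintSketch {
    // Assumed varint-32 format: 7 payload bits per byte, high bit set on continuation bytes.
    static int readVarint32(ByteBuffer in) {
        int value = 0;
        int shift = 0;
        while (true) {
            byte b = in.get();
            value |= (b & 0x7F) << shift;
            if ((b & 0x80) == 0) {
                return value;
            }
            shift += 7;
        }
    }

    static int[] readDeltaCompressed(ByteBuffer in, int count) {
        int[] result = new int[count];
        int sum = 0;
        for (int i = 0; i < count; ++i) {
            int v = readVarint32(in);
            result[i] = v + sum;   // each stored value is the delta from the previous id
            sum += v;
        }
        return result;
    }
}

Since the old code only applied the mask to each decoded value while the running sum stayed unmasked, applying the dictionary bits later in LexiconSet or WordLookup yields the same word ids.
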
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2022 Works Applications Co., Ltd.
* Copyright (c) 2022-2024 Works Applications Co., Ltd.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -36,6 +37,7 @@ public final class WordLookup {
private int numWords;
private final List<DoubleArrayLexicon> lexicons;
private int currentLexicon = -1;
private int dictMask;

public WordLookup(List<DoubleArrayLexicon> lexicons) {
this.lexicons = lexicons;
@@ -58,6 +59,7 @@ private void rebind(DoubleArrayLexicon lexicon) {
*/
public void reset(byte[] key, int offset, int limit) {
currentLexicon = lexicons.size() - 1;
dictMask = WordId.dicIdMask(currentLexicon);
rebind(lexicons.get(currentLexicon));
lookup.reset(key, offset, limit);
}
@@ -90,9 +92,14 @@ public boolean next() {
}
rebind(lexicons.get(nextLexicon));
currentLexicon = nextLexicon;
dictMask = WordId.dicIdMask(nextLexicon);
}
int wordGroupId = lookup.getValue();
numWords = words.readWordIds(wordGroupId, this);
for (int i = 0; i < numWords; ++i) {
int internalId = wordIds[i];
wordIds[i] = WordId.applyMask(internalId, dictMask);
}
return true;
}

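
WordLookup mirrors the same move: readWordIds now fills the buffer with internal ids, and the new loop in next() promotes them to full word ids using the mask of the lexicon currently being scanned. Stripped of the surrounding class, the added step amounts to this (dicIdMask again being the assumed WordId helper, not shown in the diff):

final class MaskAfterLookupSketch {
    static int dicIdMask(int dicId) {
        return dicId << 28;                      // assumed: dictionary id in the upper bits
    }

    static void promoteToFullIds(int[] wordIds, int numWords, int currentLexicon) {
        int dictMask = dicIdMask(currentLexicon);
        for (int i = 0; i < numWords; ++i) {
            wordIds[i] = wordIds[i] | dictMask;  // same effect as WordId.applyMask in the diff
        }
    }
}
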
