Merge pull request #256 from WorksApplications/feature/252-dictid-lexiconset

Handle dictid by lexiconset
mh-northlander authored Dec 3, 2024
2 parents 239d02b + d458f88 commit adde77d
Showing 5 changed files with 87 additions and 58 deletions.
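
The theme of the diff below: DoubleArrayLexicon and WordIdTable now return plain internal word ids, and the dictionary part of each id is applied afterwards by the callers (DictionaryPrinter, LexiconSet, WordLookup) via WordId.applyMask. As a rough sketch of how that masking plausibly works — assuming the usual Sudachi packing of a 4-bit dictionary id above a 28-bit internal id, which this diff does not itself show — the two helpers behave roughly like this:

public final class WordIdSketch {
    // Assumed behaviour of WordId.dicIdMask: dictionary id placed in the upper bits.
    static int dicIdMask(int dicId) {
        return dicId << 28;
    }

    // Assumed behaviour of WordId.applyMask: combine the dictionary part with an internal id.
    static int applyMask(int internalId, int mask) {
        return internalId | mask;
    }

    public static void main(String[] args) {
        int userDictMask = dicIdMask(1);                 // first user dictionary
        int wordId = applyMask(12345, userDictMask);     // internal id 12345 in that dictionary
        System.out.printf("dic=%d, internal=%d%n", wordId >>> 28, wordId & 0x0FFFFFFF);
    }
}

Because the mask only touches the upper bits, it can be applied after the raw ids are produced, which is exactly what the changed call sites below do.
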
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021 Works Applications Co., Ltd.
* Copyright (c) 2021-2024 Works Applications Co., Ltd.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -42,7 +42,7 @@ public class DictionaryPrinter {
private final LexiconSet lex;
private final TextNormalizer textNormalizer;
// sorted raw word ids taken from the target dict.
private final Ints wordIds;
private final int[] wordIds;

private POSMode posMode = POSMode.DEFAULT;
private WordRefMode wordRefMode = WordRefMode.DEFAULT;
@@ -78,15 +78,18 @@ public enum WordRefMode {

this.output = output;

if (base == null) {
int dicIdMask;
if (base == null) { // system
grammar = dic.getGrammar();
lex = new LexiconSet(dic.getLexicon(), grammar.getSystemPartOfSpeechSize());
} else {
dicIdMask = WordId.dicIdMask(0);
} else { // user
grammar = base.getGrammar();
lex = new LexiconSet(base.getLexicon(), grammar.getSystemPartOfSpeechSize());

lex.add(dic.getLexicon(), (short) grammar.getPartOfSpeechSize());
grammar.addPosList(dic.getGrammar());
dicIdMask = WordId.dicIdMask(1);
}

// set default char category for text normalizer
@@ -95,15 +98,15 @@ public enum WordRefMode {

// In order to output dictionary entries in in-dictionary order we need to sort
// them. Iterator over them will get them not in the sorted order, but grouped
// by index-form. Here we assume DoubleArrayLexicon and use WordIdTable.wordIds
// for the performance.
DoubleArrayLexicon targetLex = dic.getLexicon();
Ints allIds = new Ints(targetLex.size());
Iterator<Ints> ids = targetLex.getWordIdTable().wordIds();
// by index-form.
Lexicon targetLexicon = dic.getLexicon();
int[] allIds = new int[targetLexicon.size()];
int idx = 0;
Iterator<Integer> ids = targetLexicon.wordIds();
while (ids.hasNext()) {
allIds.appendAll(ids.next());
allIds[idx++] = WordId.applyMask(ids.next(), dicIdMask);
}
allIds.sort();
Arrays.sort(allIds);
wordIds = allIds;
}

@@ -170,9 +173,9 @@ void printColumnHeaders(List<Column> headers) {

private void printEntries() {
progress.startBlock("Entries", System.nanoTime(), Progress.Kind.ENTRY);
long size = wordIds.length();
long size = wordIds.length;
for (int i = 0; i < size; ++i) {
printEntry(wordIds.get(i));
printEntry(wordIds[i]);
progress.progress(i, size);
}
progress.endBlock(size, System.nanoTime());
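
A condensed paraphrase of the new DictionaryPrinter setup above: collect the raw ids from the target lexicon, apply the dictionary part chosen by the system/user branch, and sort to restore in-dictionary order. LexiconView is a hypothetical stand-in mirroring only the two Lexicon members the constructor uses, and the 28-bit shift mirrors the assumed WordId.dicIdMask layout:

import java.util.Arrays;
import java.util.Iterator;

interface LexiconView {              // stand-in: only the members the sketch needs
    int size();
    Iterator<Integer> wordIds();
}

final class PrinterSetupSketch {
    static int[] sortedWordIds(LexiconView targetLexicon, boolean isUserDict) {
        int dicIdMask = (isUserDict ? 1 : 0) << 28;      // system dict is 0, first user dict is 1
        int[] allIds = new int[targetLexicon.size()];
        int idx = 0;
        Iterator<Integer> ids = targetLexicon.wordIds();
        while (ids.hasNext()) {
            allIds[idx++] = ids.next() | dicIdMask;      // assumed WordId.applyMask behaviour
        }
        Arrays.sort(allIds);                             // restore in-dictionary order
        return allIds;
    }
}

As the constructor comment notes, the iterator only groups ids by index form; once the dictionary bits are fixed, a plain numeric sort restores in-dictionary order.
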
@@ -25,22 +25,34 @@
import com.worksap.nlp.sudachi.MorphemeList;
import com.worksap.nlp.sudachi.Tokenizer;

/**
* The main lexicon implementation.
*
* In the V1 format it consists of the following parts. {@link DoubleArray} (TRIE): mapping
* from index form to WordIdTable offset. {@link WordIdTable}: table of the lists of
* word ids that share the same index form.
* {@link WordParameters}/{@link WordInfoList}: lists of word information, for
* analysis and non-analysis respectively. A word id is an offset into them.
* {@link CompactedStrings}: storage of strings such as the headword, reading form,
* etc.
*/
public class DoubleArrayLexicon implements Lexicon {
static final int USER_DICT_COST_PAR_MORPH = -20;
private final WordInfoList wordInfos;

private final Description description;
private final DoubleArray trie;
private final WordInfoList wordInfos;
private final WordParameters parameters;
private final Description description;
private final WordIdTable wordIdTable;
private final CompactedStrings strings;

public DoubleArrayLexicon(Description description, WordIdTable wordIdTable, WordParameters wordParams,
WordInfoList wordInfos, DoubleArray trie, CompactedStrings strings) {
this.description = description;
this.trie = trie;
this.wordIdTable = wordIdTable;
this.parameters = wordParams;
this.wordInfos = wordInfos;
this.trie = trie;
this.strings = strings;
}

@@ -86,29 +98,16 @@ public Iterator<int[]> lookup(byte[] text, int offset) {
if (!iterator.hasNext()) {
return iterator;
}
return new Itr(iterator);
}

public IntBuffer getTrieArray() {
return trie.array();
}

public WordIdTable getWordIdTable() {
return wordIdTable;
}

@Override
public long parameters(int wordId) {
return parameters.loadParams(wordId);
return new LookupItr(iterator);
}

private class Itr implements Iterator<int[]> {
private class LookupItr implements Iterator<int[]> {
private final Iterator<int[]> iterator;
private int[] wordIds;
private int length;
private int index;

Itr(Iterator<int[]> iterator) {
LookupItr(Iterator<int[]> iterator) {
this.iterator = iterator;
index = -1;
}
@@ -134,6 +133,19 @@ public int[] next() {
}
}

public IntBuffer getTrieArray() {
return trie.array();
}

public WordIdTable getWordIdTable() {
return wordIdTable;
}

@Override
public long parameters(int wordId) {
return parameters.loadParams(wordId);
}

@Override
public String string(int dic, int stringPtr) {
return strings.string(stringPtr);
@@ -220,10 +232,6 @@ public void calculateDynamicCosts(Tokenizer tokenizer) {
}
}

public void setDictionaryId(int id) {
wordIdTable.setDictionaryId(id);
}

@Override
public WordInfoList wordInfos(int dic) {
return wordInfos;
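
The new class Javadoc above describes the V1 lexicon layout: the TRIE maps an index form to an offset in the WordIdTable, which lists the word ids sharing that surface, and those ids then index into WordParameters/WordInfoList. A toy model of that indirection, using plain collections as stand-ins for the real structures:

import java.util.List;
import java.util.Map;

final class LookupFlowSketch {
    private final Map<String, Integer> trie;    // index form -> offset into the word-id table
    private final List<int[]> wordIdTable;      // offset -> internal word ids with that index form

    LookupFlowSketch(Map<String, Integer> trie, List<int[]> wordIdTable) {
        this.trie = trie;
        this.wordIdTable = wordIdTable;
    }

    int[] lookup(String indexForm) {
        Integer offset = trie.get(indexForm);
        return offset == null ? new int[0] : wordIdTable.get(offset);
    }
}

After this change the ids coming out of this layer are purely internal; setDictionaryId is gone, and the dictionary part is added by LexiconSet and WordLookup.
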
28 changes: 20 additions & 8 deletions src/main/java/com/worksap/nlp/sudachi/dictionary/LexiconSet.java
@@ -20,6 +20,14 @@

import java.util.*;

/**
* A lexicon that contains multiple lexicons inside.
*
* It currently accepts only {@link DoubleArrayLexicon}. This lexicon cannot be
* nested.
*
* Handles the dictionary part of the word id.
*/
public class LexiconSet implements Lexicon {
static final int MAX_DICTIONARIES = 15;

@@ -33,9 +41,7 @@ public LexiconSet(Lexicon systemLexicon, short systemPartOfSpeechSize) {
}

public void add(Lexicon lexicon, short posOffset) {
DoubleArrayLexicon daLexicon = (DoubleArrayLexicon) lexicon;
daLexicon.setDictionaryId(lexicons.size());
lexicons.add(daLexicon);
lexicons.add((DoubleArrayLexicon) lexicon);
posOffsets.add(posOffset);
}

@@ -51,7 +57,7 @@ public Iterator<int[]> lookup(byte[] text, int offset) {
if (lexicons.size() == 1) {
return lexicons.get(0).lookup(text, offset);
}
return new Itr(text, offset, lexicons.size() - 1);
return new LookupItr(text, offset, lexicons.size() - 1);
}

/**
@@ -63,16 +69,18 @@ public Iterator<int[]> lookup(byte[] text, int offset) {
*
* Dictionaries have their word weights prioritized in the same manner
*/
private class Itr implements Iterator<int[]> {
private class LookupItr implements Iterator<int[]> {
byte[] text;
int offset;
int dictId;
int dictMask;
Iterator<int[]> iterator;

Itr(byte[] text, int offset, int start) {
LookupItr(byte[] text, int offset, int start) {
this.text = text;
this.offset = offset;
dictId = start;
dictMask = WordId.dicIdMask(start);
iterator = lexicons.get(dictId).lookup(text, offset);
}

@@ -85,6 +93,7 @@ public boolean hasNext() {
}
iterator = lexicons.get(nextId).lookup(text, offset);
dictId = nextId;
dictMask = WordId.dicIdMask(nextId);
}
return true;
}
@@ -93,7 +102,7 @@ public boolean hasNext() {
public int[] next() {
if (hasNext()) {
int[] r = iterator.next();
r[0] = buildWordId(dictId, r[0]);
r[0] = WordId.applyMask(r[0], dictMask);
return r;
}
throw new NoSuchElementException();
@@ -177,10 +186,12 @@ public Iterator<Integer> wordIds() {

private class WordIdItr implements Iterator<Integer> {
private int dictId;
private int dictMask;
private Iterator<Integer> iterator;

WordIdItr() {
this.dictId = 0;
this.dictMask = WordId.dicIdMask(dictId);
this.iterator = lexicons.get(dictId).wordIds();
}

@@ -192,6 +203,7 @@ public boolean hasNext() {
return false;
}
dictId = nextDictId;
dictMask = WordId.dicIdMask(nextDictId);
iterator = lexicons.get(nextDictId).wordIds();
}
return true;
@@ -202,7 +214,7 @@ public Integer next() {
if (!hasNext()) {
throw new NoSuchElementException();
}
return iterator.next();
return WordId.applyMask(iterator.next(), dictMask);
}
}
}
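
The WordIdItr change above is the core of the iteration side of this PR: each per-dictionary iterator still yields internal ids, and the set-level iterator adds the bits of the dictionary it is currently draining. A condensed, self-contained version of that pattern (the 28-bit shift again stands in for the assumed WordId.dicIdMask):

import java.util.Iterator;
import java.util.List;
import java.util.NoSuchElementException;

final class ChainedWordIds implements Iterator<Integer> {
    private final List<Iterator<Integer>> perDictionary;   // one iterator per lexicon, system first
    private int dictId = 0;

    ChainedWordIds(List<Iterator<Integer>> perDictionary) {
        this.perDictionary = perDictionary;                 // assumed to hold at least one lexicon
    }

    @Override
    public boolean hasNext() {
        while (!perDictionary.get(dictId).hasNext()) {
            if (dictId + 1 >= perDictionary.size()) {
                return false;
            }
            dictId++;                                        // move on to the next dictionary
        }
        return true;
    }

    @Override
    public Integer next() {
        if (!hasNext()) {
            throw new NoSuchElementException();
        }
        return perDictionary.get(dictId).next() | (dictId << 28);   // add the dictionary bits
    }
}

LookupItr in the diff follows the same shape for lookup results, except that it starts from the last-added dictionary, in line with the priority comment above.
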
25 changes: 12 additions & 13 deletions src/main/java/com/worksap/nlp/sudachi/dictionary/WordIdTable.java
@@ -24,14 +24,17 @@
import java.util.Iterator;

/**
* Table which contains the list of (internal) word ids that has same index
* form.
* The lexicon part that contains the lists of (internal) word ids that share the
* same index form.
*
* Automatically fills dict parts of word id using the dicId set.
* The DoubleArray maps an index form to an offset in this table, and
* {@link WordInfoList} holds the actual data for each word.
*
* In the V1 format, the word ids within each list in this table are sorted (and
* compressed using varint-32), but they are not sorted across lists.
*/
public class WordIdTable {
private final ByteBuffer bytes;
private int dicIdMask = 0;

WordIdTable(ByteBuffer bytes) {
this.bytes = bytes;
@@ -43,7 +46,7 @@ int[] get(int index) {
BufReader reader = new BufReader(dup);
int length = reader.readVarint32();
int[] result = new int[length];
readDeltaCompressed(result, length, this.dicIdMask, reader);
readDeltaCompressed(result, length, reader);
return result;
}

@@ -62,23 +65,19 @@ int readWordIds(int index, WordLookup lookup) {
BufReader reader = new BufReader(dup);
int length = reader.readVarint32();
int[] result = lookup.outputBuffer(length);
readDeltaCompressed(result, length, this.dicIdMask, reader);
readDeltaCompressed(result, length, reader);
return length;
}

private static void readDeltaCompressed(int[] result, int count, int mask, BufReader reader) {
private static void readDeltaCompressed(int[] result, int count, BufReader reader) {
int sum = 0;
for (int i = 0; i < count; ++i) {
int v = reader.readVarint32();
result[i] = WordId.applyMask(v + sum, mask);
result[i] = v + sum;
sum += v;
}
}

void setDictionaryId(int dictId) {
dicIdMask = WordId.dicIdMask(dictId);
}

/**
* Iterates over all valid word ids in the dictionary. Iteration order is not
* the same as the original dictionary order, but dictionary ids, when sorted,
@@ -109,7 +108,7 @@ public Ints next() {
}
ints.clear();
int[] data = ints.prepare(size);
readDeltaCompressed(data, size, dicIdMask, r);
readDeltaCompressed(data, size, r);
return ints;
}
};
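
readDeltaCompressed above now decodes plain internal ids; the only change is that the dicIdMask parameter (and the setDictionaryId setter) is gone. For reference, a self-contained version of the delta + varint-32 decoding the Javadoc mentions — the LEB128-style 7-bits-per-byte encoding is an assumption about BufReader, not something this diff shows:

import java.nio.ByteBuffer;

final class DeltaVarintSketch {
    // Assumed varint-32 format: 7 payload bits per byte, high bit set on continuation bytes.
    static int readVarint32(ByteBuffer in) {
        int value = 0;
        int shift = 0;
        while (true) {
            byte b = in.get();
            value |= (b & 0x7F) << shift;
            if ((b & 0x80) == 0) {
                return value;
            }
            shift += 7;
        }
    }

    static int[] readDeltaCompressed(ByteBuffer in, int count) {
        int[] result = new int[count];
        int sum = 0;
        for (int i = 0; i < count; ++i) {
            int v = readVarint32(in);
            result[i] = v + sum;   // each stored value is the delta from the previous id
            sum += v;
        }
        return result;
    }
}

Since the old code only applied the mask to each decoded value while the running sum stayed unmasked, applying the dictionary bits later in LexiconSet or WordLookup yields the same word ids.
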
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2022 Works Applications Co., Ltd.
* Copyright (c) 2022-2024 Works Applications Co., Ltd.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -36,6 +37,7 @@ public final class WordLookup {
private int numWords;
private final List<DoubleArrayLexicon> lexicons;
private int currentLexicon = -1;
private int dictMask;

public WordLookup(List<DoubleArrayLexicon> lexicons) {
this.lexicons = lexicons;
@@ -58,6 +59,7 @@ private void rebind(DoubleArrayLexicon lexicon) {
*/
public void reset(byte[] key, int offset, int limit) {
currentLexicon = lexicons.size() - 1;
dictMask = WordId.dicIdMask(currentLexicon);
rebind(lexicons.get(currentLexicon));
lookup.reset(key, offset, limit);
}
@@ -90,9 +92,14 @@ public boolean next() {
}
rebind(lexicons.get(nextLexicon));
currentLexicon = nextLexicon;
dictMask = WordId.dicIdMask(nextLexicon);
}
int wordGroupId = lookup.getValue();
numWords = words.readWordIds(wordGroupId, this);
for (int i = 0; i < numWords; ++i) {
int internalId = wordIds[i];
wordIds[i] = WordId.applyMask(internalId, dictMask);
}
return true;
}

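
WordLookup mirrors the same move: readWordIds now fills the buffer with internal ids, and the new loop in next() promotes them to full word ids using the mask of the lexicon currently being scanned. Stripped of the surrounding class, the added step amounts to this (dicIdMask again being the assumed WordId helper, not shown in the diff):

final class MaskAfterLookupSketch {
    static int dicIdMask(int dicId) {
        return dicId << 28;                      // assumed: dictionary id in the upper bits
    }

    static void promoteToFullIds(int[] wordIds, int numWords, int currentLexicon) {
        int dictMask = dicIdMask(currentLexicon);
        for (int i = 0; i < numWords; ++i) {
            wordIds[i] = wordIds[i] | dictMask;  // same effect as WordId.applyMask in the diff
        }
    }
}
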
