Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Handle dictid by lexiconset #256

Merged
merged 2 commits into from
Dec 3, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021 Works Applications Co., Ltd.
* Copyright (c) 2021-2024 Works Applications Co., Ltd.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -42,7 +42,7 @@ public class DictionaryPrinter {
private final LexiconSet lex;
private final TextNormalizer textNormalizer;
// sorted raw word ids taken from the target dict.
private final Ints wordIds;
private final int[] wordIds;

private POSMode posMode = POSMode.DEFAULT;
private WordRefMode wordRefMode = WordRefMode.DEFAULT;
Expand Down Expand Up @@ -78,15 +78,18 @@ public enum WordRefMode {

this.output = output;

if (base == null) {
int dicIdMask;
if (base == null) { // system
grammar = dic.getGrammar();
lex = new LexiconSet(dic.getLexicon(), grammar.getSystemPartOfSpeechSize());
} else {
dicIdMask = WordId.dicIdMask(0);
} else { // user
grammar = base.getGrammar();
lex = new LexiconSet(base.getLexicon(), grammar.getSystemPartOfSpeechSize());

lex.add(dic.getLexicon(), (short) grammar.getPartOfSpeechSize());
grammar.addPosList(dic.getGrammar());
dicIdMask = WordId.dicIdMask(1);
}

// set default char category for text normalizer
Expand All @@ -95,15 +98,15 @@ public enum WordRefMode {

// In order to output dictionary entries in in-dictionary order we need to sort
// them. Iterating over them does not yield sorted order; they come grouped
// by index-form. Here we assume DoubleArrayLexicon and use WordIdTable.wordIds
// for the performance.
DoubleArrayLexicon targetLex = dic.getLexicon();
Ints allIds = new Ints(targetLex.size());
Iterator<Ints> ids = targetLex.getWordIdTable().wordIds();
// by index-form.
Lexicon targetLexicon = dic.getLexicon();
int[] allIds = new int[targetLexicon.size()];
int idx = 0;
Iterator<Integer> ids = targetLexicon.wordIds();
while (ids.hasNext()) {
allIds.appendAll(ids.next());
allIds[idx++] = WordId.applyMask(ids.next(), dicIdMask);
}
allIds.sort();
Arrays.sort(allIds);
wordIds = allIds;
}

Expand Down Expand Up @@ -170,9 +173,9 @@ void printColumnHeaders(List<Column> headers) {

private void printEntries() {
progress.startBlock("Entries", System.nanoTime(), Progress.Kind.ENTRY);
long size = wordIds.length();
long size = wordIds.length;
for (int i = 0; i < size; ++i) {
printEntry(wordIds.get(i));
printEntry(wordIds[i]);
progress.progress(i, size);
}
progress.endBlock(size, System.nanoTime());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,22 +25,34 @@
import com.worksap.nlp.sudachi.MorphemeList;
import com.worksap.nlp.sudachi.Tokenizer;

/**
* The main lexicon implementation.
*
* In V1 format, it consists of the following. {@link DoubleArray} (TRIE): mapping
* from index form to WordIdTable offset. {@link WordIdTable}: table of lists of
* word ids that have the same index form.
* {@link WordParameters}/{@link WordInfoList}: lists of word information, for
* analysis/non-analysis respectively; a word id represents an offset in them.
* {@link CompactedStrings}: storage of strings such as headword, reading form,
* etc.
*/
public class DoubleArrayLexicon implements Lexicon {
static final int USER_DICT_COST_PAR_MORPH = -20;
private final WordInfoList wordInfos;

private final Description description;
private final DoubleArray trie;
private final WordInfoList wordInfos;
private final WordParameters parameters;
private final Description description;
private final WordIdTable wordIdTable;
private final CompactedStrings strings;

public DoubleArrayLexicon(Description description, WordIdTable wordIdTable, WordParameters wordParams,
WordInfoList wordInfos, DoubleArray trie, CompactedStrings strings) {
this.description = description;
this.trie = trie;
this.wordIdTable = wordIdTable;
this.parameters = wordParams;
this.wordInfos = wordInfos;
this.trie = trie;
this.strings = strings;
}

Expand Down Expand Up @@ -86,29 +98,16 @@ public Iterator<int[]> lookup(byte[] text, int offset) {
if (!iterator.hasNext()) {
return iterator;
}
return new Itr(iterator);
}

public IntBuffer getTrieArray() {
return trie.array();
}

public WordIdTable getWordIdTable() {
return wordIdTable;
}

@Override
public long parameters(int wordId) {
return parameters.loadParams(wordId);
return new LookupItr(iterator);
}

private class Itr implements Iterator<int[]> {
private class LookupItr implements Iterator<int[]> {
private final Iterator<int[]> iterator;
private int[] wordIds;
private int length;
private int index;

Itr(Iterator<int[]> iterator) {
LookupItr(Iterator<int[]> iterator) {
this.iterator = iterator;
index = -1;
}
Expand All @@ -134,6 +133,19 @@ public int[] next() {
}
}

/**
 * Returns the array of the double-array trie, as exposed by
 * {@code trie.array()}.
 *
 * @return the {@link IntBuffer} returned by the trie
 */
public IntBuffer getTrieArray() {
return trie.array();
}

/**
 * Returns the word id table this lexicon was constructed with.
 *
 * @return the {@link WordIdTable} held by this lexicon
 */
public WordIdTable getWordIdTable() {
return wordIdTable;
}

/**
 * Returns the parameter value stored for the given word id by delegating to
 * {@code parameters.loadParams(wordId)}.
 *
 * @param wordId
 *            the word id to look up; it is passed to the parameter storage
 *            unchanged
 * @return the {@code long} value loaded for {@code wordId}
 */
@Override
public long parameters(int wordId) {
return parameters.loadParams(wordId);
}

@Override
public String string(int dic, int stringPtr) {
return strings.string(stringPtr);
Expand Down Expand Up @@ -220,10 +232,6 @@ public void calculateDynamicCosts(Tokenizer tokenizer) {
}
}

public void setDictionaryId(int id) {
wordIdTable.setDictionaryId(id);
}

@Override
public WordInfoList wordInfos(int dic) {
return wordInfos;
Expand Down
28 changes: 20 additions & 8 deletions src/main/java/com/worksap/nlp/sudachi/dictionary/LexiconSet.java
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,14 @@

import java.util.*;

/**
* A lexicon that contains multiple lexicons inside.
*
* It currently accepts only {@link DoubleArrayLexicon}. This lexicon cannot be
* nested.
*
* Handles the dictionary part of the word id.
*/
public class LexiconSet implements Lexicon {
static final int MAX_DICTIONARIES = 15;

Expand All @@ -33,9 +41,7 @@ public LexiconSet(Lexicon systemLexicon, short systemPartOfSpeechSize) {
}

public void add(Lexicon lexicon, short posOffset) {
DoubleArrayLexicon daLexicon = (DoubleArrayLexicon) lexicon;
daLexicon.setDictionaryId(lexicons.size());
lexicons.add(daLexicon);
lexicons.add((DoubleArrayLexicon) lexicon);
posOffsets.add(posOffset);
}

Expand All @@ -51,7 +57,7 @@ public Iterator<int[]> lookup(byte[] text, int offset) {
if (lexicons.size() == 1) {
return lexicons.get(0).lookup(text, offset);
}
return new Itr(text, offset, lexicons.size() - 1);
return new LookupItr(text, offset, lexicons.size() - 1);
}

/**
Expand All @@ -63,16 +69,18 @@ public Iterator<int[]> lookup(byte[] text, int offset) {
*
* Dictionaries have their word weights prioritized in the same manner
*/
private class Itr implements Iterator<int[]> {
private class LookupItr implements Iterator<int[]> {
byte[] text;
int offset;
int dictId;
int dictMask;
Iterator<int[]> iterator;

Itr(byte[] text, int offset, int start) {
LookupItr(byte[] text, int offset, int start) {
this.text = text;
this.offset = offset;
dictId = start;
dictMask = WordId.dicIdMask(start);
iterator = lexicons.get(dictId).lookup(text, offset);
}

Expand All @@ -85,6 +93,7 @@ public boolean hasNext() {
}
iterator = lexicons.get(nextId).lookup(text, offset);
dictId = nextId;
dictMask = WordId.dicIdMask(nextId);
}
return true;
}
Expand All @@ -93,7 +102,7 @@ public boolean hasNext() {
public int[] next() {
if (hasNext()) {
int[] r = iterator.next();
r[0] = buildWordId(dictId, r[0]);
r[0] = WordId.applyMask(r[0], dictMask);
return r;
}
throw new NoSuchElementException();
Expand Down Expand Up @@ -177,10 +186,12 @@ public Iterator<Integer> wordIds() {

private class WordIdItr implements Iterator<Integer> {
private int dictId;
private int dictMask;
private Iterator<Integer> iterator;

WordIdItr() {
this.dictId = 0;
this.dictMask = WordId.dicIdMask(dictId);
this.iterator = lexicons.get(dictId).wordIds();
}

Expand All @@ -192,6 +203,7 @@ public boolean hasNext() {
return false;
}
dictId = nextDictId;
dictMask = WordId.dicIdMask(nextDictId);
iterator = lexicons.get(nextDictId).wordIds();
}
return true;
Expand All @@ -202,7 +214,7 @@ public Integer next() {
if (!hasNext()) {
throw new NoSuchElementException();
}
return iterator.next();
return WordId.applyMask(iterator.next(), dictMask);
}
}
}
25 changes: 12 additions & 13 deletions src/main/java/com/worksap/nlp/sudachi/dictionary/WordIdTable.java
Original file line number Diff line number Diff line change
Expand Up @@ -24,14 +24,17 @@
import java.util.Iterator;

/**
* Table which contains the list of (internal) word ids that has same index
* form.
* Lexicon part that contains the lists of (internal) word ids that have the
* same index form.
*
* Automatically fills dict parts of word id using the dicId set.
* DoubleArray has the mapping from indexForm to an offset in this table, and
* {@link WordInfoList} has the actual data for each word.
*
* In V1 format, the word ids within each list in this table are sorted (and
* compressed using varint-32), but there is no sort order across lists.
*/
public class WordIdTable {
private final ByteBuffer bytes;
private int dicIdMask = 0;

WordIdTable(ByteBuffer bytes) {
this.bytes = bytes;
Expand All @@ -43,7 +46,7 @@ int[] get(int index) {
BufReader reader = new BufReader(dup);
int length = reader.readVarint32();
int[] result = new int[length];
readDeltaCompressed(result, length, this.dicIdMask, reader);
readDeltaCompressed(result, length, reader);
return result;
}

Expand All @@ -62,23 +65,19 @@ int readWordIds(int index, WordLookup lookup) {
BufReader reader = new BufReader(dup);
int length = reader.readVarint32();
int[] result = lookup.outputBuffer(length);
readDeltaCompressed(result, length, this.dicIdMask, reader);
readDeltaCompressed(result, length, reader);
return length;
}

private static void readDeltaCompressed(int[] result, int count, int mask, BufReader reader) {
private static void readDeltaCompressed(int[] result, int count, BufReader reader) {
int sum = 0;
for (int i = 0; i < count; ++i) {
int v = reader.readVarint32();
result[i] = WordId.applyMask(v + sum, mask);
result[i] = v + sum;
sum += v;
}
}

void setDictionaryId(int dictId) {
dicIdMask = WordId.dicIdMask(dictId);
}

/**
* Iterates over all valid word ids in the dictionary. Iteration order is not
* the same as the original dictionary order, but dictionary ids, when sorted,
Expand Down Expand Up @@ -109,7 +108,7 @@ public Ints next() {
}
ints.clear();
int[] data = ints.prepare(size);
readDeltaCompressed(data, size, dicIdMask, r);
readDeltaCompressed(data, size, r);
return ints;
}
};
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2022 Works Applications Co., Ltd.
* Copyright (c) 2022-2024 Works Applications Co., Ltd.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -36,6 +36,7 @@ public final class WordLookup {
private int numWords;
private final List<DoubleArrayLexicon> lexicons;
private int currentLexicon = -1;
private int dictMask;

public WordLookup(List<DoubleArrayLexicon> lexicons) {
this.lexicons = lexicons;
Expand All @@ -58,6 +59,7 @@ private void rebind(DoubleArrayLexicon lexicon) {
*/
public void reset(byte[] key, int offset, int limit) {
// Begin with the last lexicon in the list and remember its dictionary mask.
final int last = lexicons.size() - 1;
currentLexicon = last;
dictMask = WordId.dicIdMask(last);
// Bind to that lexicon before resetting the underlying lookup.
final DoubleArrayLexicon target = lexicons.get(last);
rebind(target);
lookup.reset(key, offset, limit);
}
Expand Down Expand Up @@ -90,9 +92,14 @@ public boolean next() {
}
rebind(lexicons.get(nextLexicon));
currentLexicon = nextLexicon;
dictMask = WordId.dicIdMask(nextLexicon);
}
int wordGroupId = lookup.getValue();
numWords = words.readWordIds(wordGroupId, this);
for (int i = 0; i < numWords; ++i) {
int internalId = wordIds[i];
wordIds[i] = WordId.applyMask(internalId, dictMask);
}
return true;
}

Expand Down