Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add option for embedded config and fallback resources #262

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions sudachi/src/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ use thiserror::Error;

/// Name of the default resource directory
const DEFAULT_RESOURCE_DIR: &str = "resources";
/// File name of the default settings file
const DEFAULT_SETTING_FILE: &str = "sudachi.json";
/// Contents of the bundled `sudachi.json`, embedded into the binary at compile time.
/// The `include_bytes!` path is resolved relative to this source file.
const DEFAULT_SETTING_BYTES: &[u8] = include_bytes!("../../resources/sudachi.json");
/// File name of the default character definition file
const DEFAULT_CHAR_DEF_FILE: &str = "char.def";

/// Sudachi Error
Expand Down Expand Up @@ -343,6 +344,12 @@ impl Config {
Ok(raw_config.build())
}

/// Creates a config from the default settings embedded in the binary
///
/// The settings are parsed from `DEFAULT_SETTING_BYTES`; no settings file is
/// opened by this function itself.
///
/// # Errors
/// Propagates the error returned by `ConfigBuilder::from_bytes`.
pub fn new_embedded() -> Result<Self, ConfigError> {
    Ok(ConfigBuilder::from_bytes(DEFAULT_SETTING_BYTES)?.build())
}

/// Creates a minimal config with the provided resource directory
pub fn minimal_at(resource_dir: impl Into<PathBuf>) -> Config {
let mut cfg = Config::default();
Expand Down
5 changes: 5 additions & 0 deletions sudachi/src/dic/character_category.rs
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,11 @@ impl CharacterCategory {
Self::from_reader(reader)
}

/// Creates a character category from in-memory character definition bytes
///
/// `&[u8]` implements `BufRead` directly, so the slice is handed to
/// `from_reader` as-is; wrapping it in a `BufReader` would only add a
/// redundant intermediate buffer and copy.
///
/// # Errors
/// Propagates any error returned by `from_reader`.
pub fn from_bytes(bytes: &[u8]) -> SudachiResult<CharacterCategory> {
    Self::from_reader(bytes)
}

pub fn from_reader<T: BufRead>(data: T) -> SudachiResult<CharacterCategory> {
let ranges = Self::read_character_definition(data)?;
Ok(Self::compile(&ranges))
Expand Down
41 changes: 40 additions & 1 deletion sudachi/src/dic/dictionary.rs
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ impl JapaneseDictionary {
Self::from_cfg_storage(cfg, sb)
}

/// Creats a dictionary from the specified configuration and storage
/// Creates a dictionary from the specified configuration and storage
pub fn from_cfg_storage(
cfg: &Config,
storage: SudachiDicData,
Expand Down Expand Up @@ -115,6 +115,45 @@ impl JapaneseDictionary {
Ok(dic)
}

/// Builds a dictionary from a configuration and dictionary storage, taking the
/// character definition from the data embedded in the binary
///
/// The system dictionary is loaded with
/// `LoadedDictionary::from_system_dictionary_embedded`, plugins are loaded from
/// `cfg`, connection cost plugins are applied to the grammar, and every user
/// dictionary found in `storage` is merged in.
///
/// # Errors
/// Returns `SudachiError::NoOOVPluginProvided` when no OOV plugin was loaded, and
/// propagates errors from dictionary loading, plugin loading and user dictionary
/// merging.
pub fn from_cfg_storage_with_embedded_chardef(
    cfg: &Config,
    storage: SudachiDicData,
) -> SudachiResult<JapaneseDictionary> {
    // NOTE(review): the slice is presumably kept valid by moving `storage` into
    // the returned dictionary below - confirm against `SudachiDicData`
    let system_bytes = unsafe { storage.system_static_slice() };
    let mut loaded = LoadedDictionary::from_system_dictionary_embedded(system_bytes)?;

    let plugins = Plugins::load(cfg, &mut loaded.grammar)?;
    if plugins.oov.is_empty() {
        return Err(SudachiError::NoOOVPluginProvided);
    }

    // let connection cost plugins adjust the grammar before it is stored
    for plugin in plugins.connect_cost.plugins() {
        plugin.edit(&mut loaded.grammar);
    }

    let mut merged = JapaneseDictionary {
        storage,
        plugins,
        _grammar: loaded.grammar,
        _lexicon: loaded.lexicon_set,
    };

    // collected up front so that `merged` is not borrowed while it is being replaced
    let user_slices: Vec<_> = merged.storage.user_static_slice();
    for slice in user_slices {
        merged = merged.merge_user_dictionary(slice)?;
    }

    Ok(merged)
}

/// Returns grammar with the correct lifetime
pub fn grammar<'a>(&'a self) -> &Grammar<'a> {
&self._grammar
Expand Down
31 changes: 27 additions & 4 deletions sudachi/src/dic/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ pub mod storage;
pub mod subset;
pub mod word_id;

/// Contents of the bundled `char.def`, embedded into the binary at compile time.
/// The `include_bytes!` path is resolved relative to this source file.
const DEFAULT_CHAR_DEF_BYTES: &[u8] = include_bytes!("../../../resources/char.def");
/// NOTE(review): presumably the number of part-of-speech levels - confirm against usage
const POS_DEPTH: usize = 6;

/// A dictionary consists of one system_dict and zero or more user_dicts
Expand All @@ -51,14 +52,13 @@ pub struct LoadedDictionary<'a> {
}

impl<'a> LoadedDictionary<'a> {
/// Creates a system dictionary from bytes, and load a character category from file
pub fn from_system_dictionary(
/// Creates a system dictionary from bytes, and preloaded character category
pub fn from_system_dictionary_and_chardef(
dictionary_bytes: &'a [u8],
character_category_file: &Path,
character_category: CharacterCategory,
) -> SudachiResult<LoadedDictionary<'a>> {
let system_dict = DictionaryLoader::read_system_dictionary(dictionary_bytes)?;

let character_category = CharacterCategory::from_file(character_category_file)?;
let mut grammar = system_dict
.grammar
.ok_or(SudachiError::InvalidDictionaryGrammar)?;
Expand All @@ -71,6 +71,29 @@ impl<'a> LoadedDictionary<'a> {
})
}

/// Creates a system dictionary from bytes, and loads a character category from file
///
/// # Errors
/// Propagates the error from `CharacterCategory::from_file` when the character
/// category cannot be loaded, and any error from
/// `from_system_dictionary_and_chardef`.
pub fn from_system_dictionary(
    dictionary_bytes: &'a [u8],
    character_category_file: &Path,
) -> SudachiResult<LoadedDictionary<'a>> {
    let character_category = CharacterCategory::from_file(character_category_file)?;
    // the delegate has the same return type, so its result is returned as-is
    Self::from_system_dictionary_and_chardef(dictionary_bytes, character_category)
}

/// Creates a system dictionary from bytes, and loads the embedded default character category
///
/// The character category is parsed from `DEFAULT_CHAR_DEF_BYTES` instead of
/// being read from a file.
///
/// # Errors
/// Propagates the error from `CharacterCategory::from_bytes` and any error from
/// `from_system_dictionary_and_chardef`.
pub fn from_system_dictionary_embedded(
    dictionary_bytes: &'a [u8],
) -> SudachiResult<LoadedDictionary<'a>> {
    let character_category = CharacterCategory::from_bytes(DEFAULT_CHAR_DEF_BYTES)?;
    // the delegate has the same return type, so its result is returned as-is
    Self::from_system_dictionary_and_chardef(dictionary_bytes, character_category)
}

#[cfg(test)]
pub(crate) fn merge_dictionary(
mut self,
Expand Down
12 changes: 9 additions & 3 deletions sudachi/src/plugin/input_text/default_input_text/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ use crate::prelude::*;
mod tests;

/// File name used when the settings do not specify `rewriteDef`
const DEFAULT_REWRITE_DEF_FILE: &str = "rewrite.def";
/// Contents of the bundled `rewrite.def`, embedded into the binary at compile time.
/// The `include_bytes!` path is resolved relative to this source file.
const DEFAULT_REWRITE_DEF_BYTES: &[u8] = include_bytes!("../../../../../resources/rewrite.def");

/// Provides basic normalization of the input text
#[derive(Default)]
Expand Down Expand Up @@ -262,10 +263,15 @@ impl InputTextPlugin for DefaultInputTextPlugin {
settings
.rewriteDef
.unwrap_or_else(|| DEFAULT_REWRITE_DEF_FILE.into()),
)?;
);

let reader = BufReader::new(fs::File::open(&rewrite_file_path)?);
self.read_rewrite_lists(reader)?;
if rewrite_file_path.is_ok() {
let reader = BufReader::new(fs::File::open(&rewrite_file_path?)?);
self.read_rewrite_lists(reader)?;
} else {
let reader = BufReader::new(DEFAULT_REWRITE_DEF_BYTES);
self.read_rewrite_lists(reader)?;
}

Ok(())
}
Expand Down
26 changes: 20 additions & 6 deletions sudachi/src/plugin/oov/mecab_oov/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,9 @@ use crate::prelude::*;
mod test;

/// File name used when the settings do not specify `charDef`
const DEFAULT_CHAR_DEF_FILE: &str = "char.def";
/// Contents of the bundled `char.def`, embedded into the binary at compile time;
/// read instead of a file when the `charDef` path cannot be resolved
const DEFAULT_CHAR_DEF_BYTES: &[u8] = include_bytes!("../../../../../resources/char.def");
/// File name used when the settings do not specify `unkDef`
const DEFAULT_UNK_DEF_FILE: &str = "unk.def";
/// Contents of the bundled `unk.def`, embedded into the binary at compile time;
/// read instead of a file when the `unkDef` path cannot be resolved
const DEFAULT_UNK_DEF_BYTES: &[u8] = include_bytes!("../../../../../resources/unk.def");

/// provides MeCab oov nodes
#[derive(Default)]
Expand Down Expand Up @@ -257,17 +259,29 @@ impl OovProviderPlugin for MeCabOovPlugin {
settings
.charDef
.unwrap_or_else(|| PathBuf::from(DEFAULT_CHAR_DEF_FILE)),
)?;
let reader = BufReader::new(fs::File::open(&char_def_path)?);
let categories = MeCabOovPlugin::read_character_property(reader)?;
);

let categories = if char_def_path.is_ok() {
let reader = BufReader::new(fs::File::open(&char_def_path?)?);
MeCabOovPlugin::read_character_property(reader)?
} else {
let reader = BufReader::new(DEFAULT_CHAR_DEF_BYTES);
MeCabOovPlugin::read_character_property(reader)?
};

let unk_def_path = config.complete_path(
settings
.unkDef
.unwrap_or_else(|| PathBuf::from(DEFAULT_UNK_DEF_FILE)),
)?;
let reader = BufReader::new(fs::File::open(&unk_def_path)?);
let oov_list = MeCabOovPlugin::read_oov(reader, &categories, grammar, settings.userPOS)?;
);

let oov_list = if unk_def_path.is_ok() {
let reader = BufReader::new(fs::File::open(&unk_def_path?)?);
MeCabOovPlugin::read_oov(reader, &categories, grammar, settings.userPOS)?
} else {
let reader = BufReader::new(DEFAULT_UNK_DEF_BYTES);
MeCabOovPlugin::read_oov(reader, &categories, grammar, settings.userPOS)?
};

self.categories = categories;
self.oov_list = oov_list;
Expand Down