Skip to content

Commit

Permalink
Merge pull request #262 from Kuuuube/embedded-config-resources
Browse files Browse the repository at this point in the history
Add option for embedded config and fallback resources
  • Loading branch information
mh-northlander authored Jun 28, 2024
2 parents 89279f9 + ae095ab commit d78bf49
Show file tree
Hide file tree
Showing 6 changed files with 108 additions and 14 deletions.
7 changes: 7 additions & 0 deletions sudachi/src/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ use thiserror::Error;

const DEFAULT_RESOURCE_DIR: &str = "resources";
const DEFAULT_SETTING_FILE: &str = "sudachi.json";
/// Contents of `resources/sudachi.json`, embedded into the binary at compile time
/// (the `include_bytes!` path is resolved relative to this source file).
/// Parsed by `Config::new_embedded`.
const DEFAULT_SETTING_BYTES: &[u8] = include_bytes!("../../resources/sudachi.json");
const DEFAULT_CHAR_DEF_FILE: &str = "char.def";

/// Sudachi Error
Expand Down Expand Up @@ -343,6 +344,12 @@ impl Config {
Ok(raw_config.build())
}

/// Creates a config from the default settings embedded in the binary
/// (`DEFAULT_SETTING_BYTES`) instead of reading a settings file.
///
/// # Errors
/// Returns the `ConfigError` reported by `ConfigBuilder::from_bytes` when it
/// fails on the embedded settings.
pub fn new_embedded() -> Result<Self, ConfigError> {
    Ok(ConfigBuilder::from_bytes(DEFAULT_SETTING_BYTES)?.build())
}

/// Creates a minimal config with the provided resource directory
pub fn minimal_at(resource_dir: impl Into<PathBuf>) -> Config {
let mut cfg = Config::default();
Expand Down
5 changes: 5 additions & 0 deletions sudachi/src/dic/character_category.rs
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,11 @@ impl CharacterCategory {
Self::from_reader(reader)
}

/// Builds a `CharacterCategory` from an in-memory character definition.
///
/// `&[u8]` implements `BufRead` on its own, so the slice is handed to
/// `from_reader` directly; wrapping it in a `BufReader` would only add a
/// redundant buffer allocation and copy.
///
/// # Errors
/// Returns whatever error `from_reader` reports for the given data.
pub fn from_bytes(bytes: &[u8]) -> SudachiResult<CharacterCategory> {
    Self::from_reader(bytes)
}

pub fn from_reader<T: BufRead>(data: T) -> SudachiResult<CharacterCategory> {
let ranges = Self::read_character_definition(data)?;
Ok(Self::compile(&ranges))
Expand Down
41 changes: 40 additions & 1 deletion sudachi/src/dic/dictionary.rs
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ impl JapaneseDictionary {
Self::from_cfg_storage(cfg, sb)
}

/// Creats a dictionary from the specified configuration and storage
/// Creates a dictionary from the specified configuration and storage
pub fn from_cfg_storage(
cfg: &Config,
storage: SudachiDicData,
Expand Down Expand Up @@ -115,6 +115,45 @@ impl JapaneseDictionary {
Ok(dic)
}

/// Builds a dictionary from the given configuration and storage, using the
/// character definition embedded in the binary for the system dictionary.
///
/// # Errors
/// Propagates failures from loading the system dictionary, loading plugins and
/// merging user dictionaries. Returns `SudachiError::NoOOVPluginProvided` when
/// the loaded plugin set contains no OOV plugin.
pub fn from_cfg_storage_with_embedded_chardef(
    cfg: &Config,
    storage: SudachiDicData,
) -> SudachiResult<JapaneseDictionary> {
    // SAFETY: NOTE(review): `storage` is moved into the returned dictionary below,
    // which presumably keeps the referenced bytes alive as long as the borrowers
    // -- confirm against the contract of `system_static_slice`.
    let system_bytes = unsafe { storage.system_static_slice() };
    let mut loaded = LoadedDictionary::from_system_dictionary_embedded(system_bytes)?;

    let plugins = Plugins::load(cfg, &mut loaded.grammar)?;
    if plugins.oov.is_empty() {
        return Err(SudachiError::NoOOVPluginProvided);
    }

    // let connection-cost plugins adjust the grammar before it is stored
    for plugin in plugins.connect_cost.plugins() {
        plugin.edit(&mut loaded.grammar);
    }

    let dictionary = JapaneseDictionary {
        storage,
        plugins,
        _grammar: loaded.grammar,
        _lexicon: loaded.lexicon_set,
    };

    // collect the slices first so the dictionary is not borrowed while it is
    // being consumed and rebuilt by each merge
    let user_dicts: Vec<_> = dictionary.storage.user_static_slice();
    user_dicts
        .into_iter()
        .try_fold(dictionary, |dic, udic| dic.merge_user_dictionary(udic))
}

/// Returns grammar with the correct lifetime
pub fn grammar<'a>(&'a self) -> &Grammar<'a> {
&self._grammar
Expand Down
31 changes: 27 additions & 4 deletions sudachi/src/dic/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ pub mod storage;
pub mod subset;
pub mod word_id;

/// Contents of `resources/char.def`, embedded into the binary at compile time
/// (the `include_bytes!` path is resolved relative to this source file).
/// Used by `LoadedDictionary::from_system_dictionary_embedded`.
const DEFAULT_CHAR_DEF_BYTES: &[u8] = include_bytes!("../../../resources/char.def");
const POS_DEPTH: usize = 6;

/// A dictionary consists of one system_dict and zero or more user_dicts
Expand All @@ -51,14 +52,13 @@ pub struct LoadedDictionary<'a> {
}

impl<'a> LoadedDictionary<'a> {
/// Creates a system dictionary from bytes, and load a character category from file
pub fn from_system_dictionary(
/// Creates a system dictionary from bytes, and preloaded character category
pub fn from_system_dictionary_and_chardef(
dictionary_bytes: &'a [u8],
character_category_file: &Path,
character_category: CharacterCategory,
) -> SudachiResult<LoadedDictionary<'a>> {
let system_dict = DictionaryLoader::read_system_dictionary(dictionary_bytes)?;

let character_category = CharacterCategory::from_file(character_category_file)?;
let mut grammar = system_dict
.grammar
.ok_or(SudachiError::InvalidDictionaryGrammar)?;
Expand All @@ -71,6 +71,29 @@ impl<'a> LoadedDictionary<'a> {
})
}

/// Creates a system dictionary from bytes, loading the character category from a file.
///
/// # Errors
/// Returns an error if `CharacterCategory::from_file` fails for
/// `character_category_file`, or if `from_system_dictionary_and_chardef`
/// fails for `dictionary_bytes`.
pub fn from_system_dictionary(
    dictionary_bytes: &'a [u8],
    character_category_file: &Path,
) -> SudachiResult<LoadedDictionary<'a>> {
    let character_category = CharacterCategory::from_file(character_category_file)?;
    // The callee already returns `SudachiResult<LoadedDictionary<'a>>`;
    // forward it as-is instead of unwrapping and re-wrapping with `Ok(..?)`.
    Self::from_system_dictionary_and_chardef(dictionary_bytes, character_category)
}

/// Creates a system dictionary from bytes, using the default character category
/// embedded in the binary (`DEFAULT_CHAR_DEF_BYTES`).
///
/// # Errors
/// Returns an error if `CharacterCategory::from_bytes` fails on the embedded
/// definition, or if `from_system_dictionary_and_chardef` fails for
/// `dictionary_bytes`.
pub fn from_system_dictionary_embedded(
    dictionary_bytes: &'a [u8],
) -> SudachiResult<LoadedDictionary<'a>> {
    let character_category = CharacterCategory::from_bytes(DEFAULT_CHAR_DEF_BYTES)?;
    // The callee already returns `SudachiResult<LoadedDictionary<'a>>`;
    // forward it as-is instead of unwrapping and re-wrapping with `Ok(..?)`.
    Self::from_system_dictionary_and_chardef(dictionary_bytes, character_category)
}

#[cfg(test)]
pub(crate) fn merge_dictionary(
mut self,
Expand Down
12 changes: 9 additions & 3 deletions sudachi/src/plugin/input_text/default_input_text/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ use crate::prelude::*;
mod tests;

/// File name used when the plugin settings do not specify `rewriteDef`.
const DEFAULT_REWRITE_DEF_FILE: &str = "rewrite.def";
/// Contents of `resources/rewrite.def`, embedded into the binary at compile time
/// (the `include_bytes!` path is resolved relative to this source file).
/// Read instead of a file when the rewrite definition path cannot be resolved.
const DEFAULT_REWRITE_DEF_BYTES: &[u8] = include_bytes!("../../../../../resources/rewrite.def");

/// Provides basic normalization of the input text
#[derive(Default)]
Expand Down Expand Up @@ -262,10 +263,15 @@ impl InputTextPlugin for DefaultInputTextPlugin {
settings
.rewriteDef
.unwrap_or_else(|| DEFAULT_REWRITE_DEF_FILE.into()),
)?;
);

let reader = BufReader::new(fs::File::open(&rewrite_file_path)?);
self.read_rewrite_lists(reader)?;
if rewrite_file_path.is_ok() {
let reader = BufReader::new(fs::File::open(&rewrite_file_path?)?);
self.read_rewrite_lists(reader)?;
} else {
let reader = BufReader::new(DEFAULT_REWRITE_DEF_BYTES);
self.read_rewrite_lists(reader)?;
}

Ok(())
}
Expand Down
26 changes: 20 additions & 6 deletions sudachi/src/plugin/oov/mecab_oov/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,9 @@ use crate::prelude::*;
mod test;

/// File name used when the plugin settings do not specify `charDef`.
const DEFAULT_CHAR_DEF_FILE: &str = "char.def";
/// Contents of `resources/char.def`, embedded into the binary at compile time
/// (the `include_bytes!` path is resolved relative to this source file).
/// Read instead of a file when the character definition path cannot be resolved.
const DEFAULT_CHAR_DEF_BYTES: &[u8] = include_bytes!("../../../../../resources/char.def");
/// File name used when the plugin settings do not specify `unkDef`.
const DEFAULT_UNK_DEF_FILE: &str = "unk.def";
/// Contents of `resources/unk.def`, embedded into the binary at compile time.
/// Read instead of a file when the unknown-word definition path cannot be resolved.
const DEFAULT_UNK_DEF_BYTES: &[u8] = include_bytes!("../../../../../resources/unk.def");

/// provides MeCab oov nodes
#[derive(Default)]
Expand Down Expand Up @@ -257,17 +259,29 @@ impl OovProviderPlugin for MeCabOovPlugin {
settings
.charDef
.unwrap_or_else(|| PathBuf::from(DEFAULT_CHAR_DEF_FILE)),
)?;
let reader = BufReader::new(fs::File::open(&char_def_path)?);
let categories = MeCabOovPlugin::read_character_property(reader)?;
);

let categories = if char_def_path.is_ok() {
let reader = BufReader::new(fs::File::open(&char_def_path?)?);
MeCabOovPlugin::read_character_property(reader)?
} else {
let reader = BufReader::new(DEFAULT_CHAR_DEF_BYTES);
MeCabOovPlugin::read_character_property(reader)?
};

let unk_def_path = config.complete_path(
settings
.unkDef
.unwrap_or_else(|| PathBuf::from(DEFAULT_UNK_DEF_FILE)),
)?;
let reader = BufReader::new(fs::File::open(&unk_def_path)?);
let oov_list = MeCabOovPlugin::read_oov(reader, &categories, grammar, settings.userPOS)?;
);

let oov_list = if unk_def_path.is_ok() {
let reader = BufReader::new(fs::File::open(&unk_def_path?)?);
MeCabOovPlugin::read_oov(reader, &categories, grammar, settings.userPOS)?
} else {
let reader = BufReader::new(DEFAULT_UNK_DEF_BYTES);
MeCabOovPlugin::read_oov(reader, &categories, grammar, settings.userPOS)?
};

self.categories = categories;
self.oov_list = oov_list;
Expand Down

0 comments on commit d78bf49

Please sign in to comment.