From fc765829a82f8b4cb748b3987170465ab4fbde4a Mon Sep 17 00:00:00 2001 From: Kan-Ru Chen Date: Tue, 2 Jan 2024 22:44:25 +0900 Subject: [PATCH] feat(dict): implement CDB based user dictionary --- Cargo.lock | 21 ++ Cargo.toml | 1 + src/dictionary/cdb.rs | 357 +++++++++++++++++++++++++++++++++ src/dictionary/loader.rs | 8 +- src/dictionary/mod.rs | 33 +++ src/dictionary/sqlite.rs | 10 +- src/zhuyin/syllable.rs | 4 +- tools/src/bin/init_database.rs | 6 +- 8 files changed, 430 insertions(+), 10 deletions(-) create mode 100644 src/dictionary/cdb.rs diff --git a/Cargo.lock b/Cargo.lock index ce9485795..71a88659a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -107,6 +107,16 @@ dependencies = [ "libc", ] +[[package]] +name = "cdb" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d40365487fcb26c3c826eb57802b33bd767e401771e71a69608edc9b95b320a5" +dependencies = [ + "filebuffer", + "libc", +] + [[package]] name = "cfg-if" version = "1.0.0" @@ -118,6 +128,7 @@ name = "chewing" version = "0.6.0-alpha.1" dependencies = [ "bytemuck", + "cdb", "directories", "indexmap", "riff", @@ -194,6 +205,16 @@ version = "2.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "25cbce373ec4653f1a01a31e8a5e5ec0c622dc27ff9c4e6606eefef5cbbed4a5" +[[package]] +name = "filebuffer" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b41bfe1d74263ea9d084be951077614b3b98b4e59a9dafab1467645a9e52305" +dependencies = [ + "libc", + "winapi", +] + [[package]] name = "getrandom" version = "0.2.11" diff --git a/Cargo.toml b/Cargo.toml index 95dde9f8f..898ee4559 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -11,6 +11,7 @@ include = ["src/**/*.rs", "Cargo.toml", "AUTHORS", "COPYING", "NEWS"] [dependencies] bytemuck = { version = "1.14.0", features = ["derive"] } +cdb = "0.6.0" directories = "5.0.0" indexmap = "2.1.0" riff = "2.0.0" diff --git a/src/dictionary/cdb.rs b/src/dictionary/cdb.rs new file mode 100644 index 000000000..d133fe91c --- /dev/null +++ b/src/dictionary/cdb.rs @@ -0,0 +1,357 @@ +use std::{ + borrow::Cow, + collections::{HashMap, HashSet}, + fmt::Debug, + fs::File, + io::{self, Write}, + iter, mem, + path::{Path, PathBuf}, +}; + +use cdb::{CDBMake, CDBWriter, CDB}; +use thiserror::Error; + +use crate::zhuyin::{IntoSyllablesBytes, Syllable}; + +use super::{ + BuildDictionaryError, DictEntries, Dictionary, DictionaryBuilder, DictionaryInfo, + DictionaryUpdateError, Phrase, Phrases, +}; + +mod serde { + use std::str; + + use bytemuck; + + use super::Phrase; + + pub(crate) struct PhraseData(T); + + impl<'a> PhraseData<&'a [u8]> { + pub(crate) fn frequency(&self) -> u32 { + bytemuck::pod_read_unaligned(&self.0[..4]) + } + pub(crate) fn last_used(&self) -> u64 { + bytemuck::pod_read_unaligned(&self.0[4..12]) + } + pub(crate) fn phrase_str(&self) -> &'a str { + let len = self.0[12] as usize; + let data = &self.0[13..]; + str::from_utf8(&data[..len]).expect("should be utf8 encoded string") + } + pub(crate) fn len(&self) -> usize { + 13 + self.0[12] as usize + } + } + + pub(crate) struct PhrasesIter<'a> { + bytes: &'a [u8], + } + + impl<'a> PhrasesIter<'a> { + pub(crate) fn new(bytes: &'a [u8]) -> PhrasesIter<'a> { + PhrasesIter { bytes } + } + + pub(crate) fn empty() -> PhrasesIter<'static> { + PhrasesIter { bytes: &[] } + } + } + + impl Iterator for PhrasesIter<'_> { + type Item = Phrase; + + fn next(&mut self) -> Option { + if self.bytes.is_empty() { + return None; + } + let phrase_data = PhraseData(self.bytes); + self.bytes = &self.bytes[phrase_data.len()..]; + Some( + Phrase::new(phrase_data.phrase_str(), phrase_data.frequency()) + .with_time(phrase_data.last_used()), + ) + } + } +} + +use serde::PhrasesIter; + +pub struct CdbDictionary { + path: PathBuf, + base: CDB, + added: HashMap, Vec>, + updated: HashMap, + graveyard: HashSet, +} + +type PhraseKey = (Cow<'static, [u8]>, Cow<'static, str>); + +#[derive(Debug, Error)] +#[error("cdb error")] +pub struct CdbDictionaryError { + #[from] + source: io::Error, +} + +type Error = CdbDictionaryError; + +impl Debug for CdbDictionary { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("CdbDictionary") + .field("base", &"CDB { /* opaque */ }") + .field("added", &self.added) + .field("updated", &self.updated) + .field("graveyard", &self.graveyard) + .finish() + } +} + +impl From for DictionaryUpdateError { + fn from(value: CdbDictionaryError) -> Self { + DictionaryUpdateError { + source: Some(value.into()), + } + } +} + +impl From for CdbDictionaryError { + fn from(value: BuildDictionaryError) -> Self { + CdbDictionaryError { + source: io::Error::other(value), + } + } +} + +impl CdbDictionary { + pub fn open>(path: P) -> Result { + match path.as_ref().try_exists() { + Ok(exists) => { + if !exists { + let mut builder = CdbDictionaryBuilder::new(); + builder + .set_info(DictionaryInfo::default()) + .map_err(Error::from)?; + builder.build(path.as_ref())?; + } + } + Err(_) => todo!(), + } + let base = CDB::open(&path)?; + let path = path.as_ref().to_path_buf(); + Ok(CdbDictionary { + path, + base, + added: Default::default(), + updated: Default::default(), + graveyard: Default::default(), + }) + } +} + +impl Dictionary for CdbDictionary { + fn lookup_phrase>(&self, syllables: &[Syl]) -> Phrases<'_> { + let syllable_bytes = syllables.into_bytes(); + let base_bytes = self.base.get(&syllable_bytes); + let base_phrases = match &base_bytes { + Some(record) => PhrasesIter::new(record.as_deref().unwrap_or(&[])), + None => PhrasesIter::empty(), + }; + let added_phrases = match self.added.get(&syllable_bytes) { + Some(phrases) => phrases.clone().into_iter(), + None => vec![].into_iter(), + }; + let phrases = base_phrases + .chain(added_phrases) + .filter(|it| { + let phrase_key = (syllable_bytes.as_slice().into(), it.as_str().into()); + !self.graveyard.contains(&phrase_key) + }) + .map(|it| { + let phrase_key = (syllable_bytes.as_slice().into(), it.as_str().into()); + match self.updated.get(&phrase_key) { + Some(value) => Phrase::new(it.as_str(), value.0).with_time(value.1), + None => it, + } + }) + .collect::>(); + Box::new(phrases.into_iter()) + } + + fn entries(&self) -> DictEntries { + Box::new(iter::empty()) + } + + fn about(&self) -> DictionaryInfo { + todo!() + } + + fn reopen(&mut self) -> Result<(), DictionaryUpdateError> { + self.base = CDB::open(&self.path).map_err(Error::from)?; + Ok(()) + } + + fn flush(&mut self) -> Result<(), DictionaryUpdateError> { + fn write_phrase(data_buf: &mut Vec, phrase: &Phrase) -> Result<(), io::Error> { + data_buf.write_all(&phrase.freq().to_le_bytes())?; + data_buf.write_all(&phrase.last_used().unwrap_or_default().to_le_bytes())?; + data_buf.write_all(&[phrase.as_str().len() as u8])?; + data_buf.write_all(phrase.as_str().as_bytes()) + } + // FIXME fix in CDB crate to use only PathBuf + let mut writer = + CDBWriter::create(dbg!(&self.path.display().to_string())).map_err(Error::from)?; + // FIXME reuse entries() + // FIXME fix CDB to provide key iter + for entry in self.base.iter() { + // FIXME skip info entry + let (key, value) = entry.map_err(Error::from)?; + let syllable_bytes = key; + let base_bytes = value; + let base_phrases = PhrasesIter::new(&base_bytes); + let added_phrases = match self.added.get(&syllable_bytes) { + Some(phrases) => phrases.clone().into_iter(), + None => vec![].into_iter(), + }; + let mut data_buf = vec![]; + for phrase in base_phrases + .chain(added_phrases) + .filter(|it| { + let phrase_key = (syllable_bytes.as_slice().into(), it.as_str().into()); + !self.graveyard.contains(&phrase_key) + }) + .map(|it| { + let phrase_key = (syllable_bytes.as_slice().into(), it.as_str().into()); + match self.updated.get(&phrase_key) { + Some(value) => Phrase::new(it.as_str(), value.0).with_time(value.1), + None => it, + } + }) + { + write_phrase(&mut data_buf, &phrase).map_err(Error::from)?; + } + self.added.remove(&syllable_bytes); + writer + .add(&syllable_bytes, &data_buf) + .map_err(Error::from)?; + } + for (syllable_bytes, phrases) in &self.added { + let mut data_buf = vec![]; + for phrase in phrases { + write_phrase(&mut data_buf, &phrase).map_err(Error::from)?; + } + writer + .add(&syllable_bytes, &data_buf) + .map_err(Error::from)?; + } + writer.finish().map_err(Error::from)?; + self.added.clear(); + self.updated.clear(); + self.graveyard.clear(); + dbg!(self.reopen()) + } + + fn insert>( + &mut self, + syllables: &[Syl], + phrase: Phrase, + ) -> Result<(), DictionaryUpdateError> { + let syllable_bytes = syllables.into_bytes(); + let phrase_key = (syllable_bytes.into(), phrase.to_string().into()); + if self.updated.contains_key(&phrase_key) { + return Err(DictionaryUpdateError { source: None }); + } + self.graveyard.remove(&phrase_key); + self.added + .entry(phrase_key.0.into_owned()) + .or_default() + .push(phrase); + Ok(()) + } + + fn update>( + &mut self, + syllables: &[Syl], + phrase: Phrase, + user_freq: u32, + time: u64, + ) -> Result<(), DictionaryUpdateError> { + let syllable_bytes = syllables.into_bytes(); + let phrase_key = (syllable_bytes.into(), String::from(phrase).into()); + self.graveyard.remove(&phrase_key); + self.updated.insert(phrase_key, (user_freq, time)); + Ok(()) + } + + fn remove>( + &mut self, + syllables: &[Syl], + phrase_str: &str, + ) -> Result<(), DictionaryUpdateError> { + let syllable_bytes = syllables.into_bytes(); + let phrase_key = (syllable_bytes.into(), phrase_str.to_owned().into()); + self.graveyard.insert(phrase_key); + Ok(()) + } +} + +#[derive(Debug)] +pub struct CdbDictionaryBuilder { + added: HashMap, Vec>, + info: DictionaryInfo, +} + +impl CdbDictionaryBuilder { + pub fn new() -> CdbDictionaryBuilder { + CdbDictionaryBuilder { + added: Default::default(), + info: Default::default(), + } + } +} + +impl From for BuildDictionaryError { + fn from(value: CdbDictionaryError) -> Self { + BuildDictionaryError { + source: Box::new(value), + } + } +} + +impl From for BuildDictionaryError { + fn from(value: DictionaryUpdateError) -> Self { + BuildDictionaryError { + source: Box::new(value), + } + } +} + +impl DictionaryBuilder for CdbDictionaryBuilder { + fn set_info(&mut self, info: DictionaryInfo) -> Result<(), BuildDictionaryError> { + // TODO + Ok(()) + } + + fn insert( + &mut self, + syllables: &[Syllable], + phrase: Phrase, + ) -> Result<(), BuildDictionaryError> { + self.added + .entry(syllables.into_bytes()) + .or_default() + .push(phrase); + Ok(()) + } + + fn build(&mut self, path: &Path) -> Result<(), BuildDictionaryError> { + let mut maker = CDBMake::new(File::create(path)?)?; + // FIXME cannot create empty db. Insert info? + maker.add(b"INFO", &[])?; + maker.finish()?; + let mut dict = CdbDictionary::open(path)?; + mem::swap(&mut dict.added, &mut self.added); + dict.flush()?; + Ok(()) + } +} diff --git a/src/dictionary/loader.rs b/src/dictionary/loader.rs index 5f63f1407..8a87d6dff 100644 --- a/src/dictionary/loader.rs +++ b/src/dictionary/loader.rs @@ -5,7 +5,7 @@ use crate::{ path::{find_path_by_files, sys_path_from_env_var, userphrase_path}, }; -use super::{AnyDictionary, SqliteDictionary, TrieDictionary}; +use super::{AnyDictionary, CdbDictionary, SqliteDictionary, TrieDictionary}; #[derive(Debug)] pub struct SystemDictionaryLoader { @@ -34,6 +34,8 @@ impl SystemDictionaryLoader { db.into() } else if let Ok(db) = TrieDictionary::open(&tsi_db_path) { db.into() + } else if let Ok(db) = CdbDictionary::open(&tsi_db_path) { + db.into() } else { return None; }; @@ -44,6 +46,8 @@ impl SystemDictionaryLoader { db.into() } else if let Ok(db) = TrieDictionary::open(&word_db_path) { db.into() + } else if let Ok(db) = CdbDictionary::open(&word_db_path) { + db.into() } else { return None; }; @@ -76,6 +80,8 @@ impl UserDictionaryLoader { db.into() } else if let Ok(db) = TrieDictionary::open(&data_path) { db.into() + } else if let Ok(db) = CdbDictionary::open(&data_path) { + db.into() } else { return None; }; diff --git a/src/dictionary/mod.rs b/src/dictionary/mod.rs index 66c9697e7..a48918578 100644 --- a/src/dictionary/mod.rs +++ b/src/dictionary/mod.rs @@ -12,11 +12,13 @@ use thiserror::Error; use crate::zhuyin::Syllable; +pub use cdb::{CdbDictionary, CdbDictionaryBuilder, CdbDictionaryError}; pub use layered::LayeredDictionary; pub use loader::{SystemDictionaryLoader, UserDictionaryLoader, UserFreqEstimateLoader}; pub use sqlite::{SqliteDictionary, SqliteDictionaryBuilder, SqliteDictionaryError}; pub use trie::{TrieDictionary, TrieDictionaryBuilder, TrieDictionaryStatistics}; +mod cdb; mod layered; mod loader; mod sqlite; @@ -488,6 +490,7 @@ impl BlockList for () { #[derive(Debug)] pub enum AnyDictionary { + CdbDictionary(CdbDictionary), SqliteDictionary(SqliteDictionary), TrieDictionary(TrieDictionary), HashMapDictionary(HashMap, Vec>), @@ -496,6 +499,7 @@ pub enum AnyDictionary { impl Dictionary for AnyDictionary { fn lookup_phrase>(&self, syllables: &[Syl]) -> Phrases<'_> { match self { + AnyDictionary::CdbDictionary(dict) => dict.lookup_phrase(syllables), AnyDictionary::SqliteDictionary(dict) => dict.lookup_phrase(syllables), AnyDictionary::TrieDictionary(dict) => dict.lookup_phrase(syllables), AnyDictionary::HashMapDictionary(dict) => dict.lookup_phrase(syllables), @@ -504,6 +508,7 @@ impl Dictionary for AnyDictionary { fn entries(&self) -> DictEntries { match self { + AnyDictionary::CdbDictionary(dict) => dict.entries(), AnyDictionary::SqliteDictionary(dict) => dict.entries(), AnyDictionary::TrieDictionary(dict) => dict.entries(), AnyDictionary::HashMapDictionary(dict) => dict.entries(), @@ -512,18 +517,38 @@ impl Dictionary for AnyDictionary { fn about(&self) -> DictionaryInfo { match self { + AnyDictionary::CdbDictionary(dict) => dict.about(), AnyDictionary::SqliteDictionary(dict) => dict.about(), AnyDictionary::TrieDictionary(dict) => dict.about(), AnyDictionary::HashMapDictionary(dict) => dict.about(), } } + fn reopen(&mut self) -> Result<(), DictionaryUpdateError> { + match self { + AnyDictionary::CdbDictionary(dict) => dict.reopen(), + AnyDictionary::SqliteDictionary(dict) => dict.reopen(), + AnyDictionary::TrieDictionary(dict) => dict.reopen(), + AnyDictionary::HashMapDictionary(dict) => dict.reopen(), + } + } + + fn flush(&mut self) -> Result<(), DictionaryUpdateError> { + match self { + AnyDictionary::CdbDictionary(dict) => dict.flush(), + AnyDictionary::SqliteDictionary(dict) => dict.flush(), + AnyDictionary::TrieDictionary(dict) => dict.flush(), + AnyDictionary::HashMapDictionary(dict) => dict.flush(), + } + } + fn insert>( &mut self, syllables: &[Syl], phrase: Phrase, ) -> Result<(), DictionaryUpdateError> { match self { + AnyDictionary::CdbDictionary(dict) => dict.insert(syllables, phrase), AnyDictionary::SqliteDictionary(dict) => dict.insert(syllables, phrase), AnyDictionary::TrieDictionary(dict) => dict.insert(syllables, phrase), AnyDictionary::HashMapDictionary(dict) => { @@ -540,6 +565,7 @@ impl Dictionary for AnyDictionary { time: u64, ) -> Result<(), DictionaryUpdateError> { match self { + AnyDictionary::CdbDictionary(dict) => dict.update(syllables, phrase, user_freq, time), AnyDictionary::SqliteDictionary(dict) => { dict.update(syllables, phrase, user_freq, time) } @@ -556,6 +582,7 @@ impl Dictionary for AnyDictionary { phrase_str: &str, ) -> Result<(), DictionaryUpdateError> { match self { + AnyDictionary::CdbDictionary(dict) => dict.remove(syllables, phrase_str), AnyDictionary::SqliteDictionary(dict) => dict.remove(syllables, phrase_str), AnyDictionary::TrieDictionary(dict) => dict.remove(syllables, phrase_str), AnyDictionary::HashMapDictionary(dict) => { @@ -565,6 +592,12 @@ impl Dictionary for AnyDictionary { } } +impl From for AnyDictionary { + fn from(value: CdbDictionary) -> Self { + Self::CdbDictionary(value) + } +} + impl From for AnyDictionary { fn from(value: SqliteDictionary) -> Self { Self::SqliteDictionary(value) diff --git a/src/dictionary/sqlite.rs b/src/dictionary/sqlite.rs index a660286de..5465d1bae 100644 --- a/src/dictionary/sqlite.rs +++ b/src/dictionary/sqlite.rs @@ -270,7 +270,7 @@ impl From for DictionaryUpdateError { impl Dictionary for SqliteDictionary { fn lookup_phrase>(&self, syllables: &[Syl]) -> Phrases<'static> { - let syllables_bytes = syllables.into_syllables_bytes(); + let syllables_bytes = syllables.into_bytes(); let mut stmt = self .conn .prepare_cached( @@ -353,7 +353,7 @@ impl Dictionary for SqliteDictionary { source: Some(Box::new(SqliteDictionaryError::ReadOnly)), }); } - let syllables_bytes = syllables.into_syllables_bytes(); + let syllables_bytes = syllables.into_bytes(); let mut stmt = self.conn.prepare_cached( "INSERT OR REPLACE INTO dictionary_v1 ( syllables, @@ -377,7 +377,7 @@ impl Dictionary for SqliteDictionary { source: Some(Box::new(SqliteDictionaryError::ReadOnly)), }); } - let syllables_bytes = syllables.into_syllables_bytes(); + let syllables_bytes = syllables.into_bytes(); let tx = self.conn.transaction()?; { let mut stmt = tx.prepare_cached( @@ -424,7 +424,7 @@ impl Dictionary for SqliteDictionary { syllables: &[Syl], phrase_str: &str, ) -> Result<(), DictionaryUpdateError> { - let syllables_bytes = syllables.into_syllables_bytes(); + let syllables_bytes = syllables.into_bytes(); let mut stmt = self .conn .prepare_cached("DELETE FROM dictionary_v1 WHERE syllables = ? AND phrase = ?")?; @@ -507,7 +507,7 @@ impl DictionaryBuilder for SqliteDictionaryBuilder { } else { 0 }; - let syllables_bytes = syllables.into_syllables_bytes(); + let syllables_bytes = syllables.into_bytes(); let mut stmt = self.dict.conn.prepare_cached( "INSERT OR REPLACE INTO dictionary_v1 ( syllables, diff --git a/src/zhuyin/syllable.rs b/src/zhuyin/syllable.rs index 4d8c43ee2..2e1c30918 100644 --- a/src/zhuyin/syllable.rs +++ b/src/zhuyin/syllable.rs @@ -281,11 +281,11 @@ impl AsRef for Syllable { /// TODO: docs pub trait IntoSyllablesBytes { /// TODO: docs - fn into_syllables_bytes(&self) -> Vec; + fn into_bytes(&self) -> Vec; } impl> IntoSyllablesBytes for &[Syl] { - fn into_syllables_bytes(&self) -> Vec { + fn into_bytes(&self) -> Vec { let mut syllables_bytes = vec![]; self.iter() .for_each(|syl| syllables_bytes.extend_from_slice(&syl.as_ref().to_le_bytes())); diff --git a/tools/src/bin/init_database.rs b/tools/src/bin/init_database.rs index 0e3707cd5..740386fd8 100644 --- a/tools/src/bin/init_database.rs +++ b/tools/src/bin/init_database.rs @@ -2,7 +2,8 @@ use anyhow::{bail, Context, Result}; use argh::FromArgs; use chewing::{ dictionary::{ - DictionaryBuilder, DictionaryInfo, SqliteDictionaryBuilder, TrieDictionaryBuilder, + CdbDictionaryBuilder, DictionaryBuilder, DictionaryInfo, SqliteDictionaryBuilder, + TrieDictionaryBuilder, }, zhuyin::{Bopomofo, Syllable}, }; @@ -40,7 +41,7 @@ impl IntoParseError for Result { #[derive(FromArgs)] /// This program creates a new chewing phrase dictionary file. pub struct Args { - /// choose the underlying database implementation, must be either "trie" or "sqlite" + /// choose the underlying database implementation, must be either "trie", "cdb", or "sqlite" #[argh(option, short = 't', default = "String::from(\"trie\")")] pub db_type: String, @@ -79,6 +80,7 @@ fn main() -> Result<()> { let mut builder: Box = match args.db_type.as_str() { "sqlite" => Box::new(SqliteDictionaryBuilder::new()), "trie" => Box::new(TrieDictionaryBuilder::new()), + "cdb" => Box::new(CdbDictionaryBuilder::new()), _ => bail!("Unknown database type {}", args.db_type), };