diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index b4b55af78..7f6469fa4 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -20,16 +20,22 @@ jobs: matrix: os: [ubuntu-latest, windows-latest, macos-latest] with_rust: ['true', 'false'] + with_hash: ['true', 'false'] rust_version: ['1.70'] include: - os: ubuntu-latest with_rust: true rust_version: stable + with_hash: true + - os: ubuntu-latest + with_rust: true + rust_version: stable + with_hash: false exclude: - with_rust: 'false' rust_version: 'stable' runs-on: ${{ matrix.os }} - name: ${{ matrix.os }}, with_rust=${{ matrix.with_rust }}, rust_version=${{ matrix.rust_version }} + name: ${{ matrix.os }}, with_rust=${{ matrix.with_rust }}, with_hash=${{ matrix.with_hash }} rust_version=${{ matrix.rust_version }} steps: - uses: actions/checkout@v3 @@ -51,23 +57,7 @@ jobs: rustup update - name: Build - run: cargo xtask build --build-type ${{env.BUILD_TYPE}} --with-rust ${{matrix.with_rust}} --verbose true - - - name: Test - run: cargo xtask test --build-type ${{env.BUILD_TYPE}} - - build_with_hash: - runs-on: ubuntu-latest - - steps: - - uses: actions/checkout@v3 - with: - submodules: 'true' - - - run: sudo apt-get install ninja-build - - - name: Build - run: cargo xtask build --build-type ${{env.BUILD_TYPE}} --with-rust false --with-hash true --verbose true + run: cargo xtask build --build-type ${{env.BUILD_TYPE}} --with-rust ${{matrix.with_rust}} --with-hash ${{matrix.with_hash}} --verbose true - name: Test run: cargo xtask test --build-type ${{env.BUILD_TYPE}} @@ -76,8 +66,9 @@ jobs: strategy: matrix: with_rust: ['true', 'false'] + with_hash: ['true', 'false'] runs-on: ubuntu-latest - name: Coverage with_rust=${{ matrix.with_rust }} + name: Coverage with_rust=${{ matrix.with_rust }} with_hash=${{ matrix.with_hash }} steps: - uses: actions/checkout@v3 @@ -101,7 +92,7 @@ jobs: - name: Build env: CC: clang - run: cargo xtask build --build-type Debug --with-rust ${{matrix.with_rust}} --with-coverage true --verbose true + run: cargo xtask build --build-type Debug --with-rust ${{matrix.with_rust}} --with-hash ${{matrix.with_hash}} --with-coverage true --verbose true - name: Test run: | diff --git a/CMakeLists.txt b/CMakeLists.txt index 1e28b38c2..6fb45c6d2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -82,13 +82,16 @@ option(USE_VALGRIND "Use valgrind when testing" true) option(WITH_RUST "Use rust implemented internals (experimental)" false) if(WITH_RUST) add_subdirectory(cmake/corrosion) - if(CMAKE_BUILD_TYPE MATCHES DEBUG) - corrosion_import_crate(MANIFEST_PATH Cargo.toml CRATES chewing FEATURES capi test-tracing) - else() - corrosion_import_crate(MANIFEST_PATH Cargo.toml CRATES chewing FEATURES capi) - endif() + corrosion_import_crate(MANIFEST_PATH Cargo.toml CRATES chewing CRATE_TYPES staticlib FEATURES capi) corrosion_import_crate(MANIFEST_PATH Cargo.toml CRATES chewing-tools) add_compile_definitions(WITH_RUST) + if(CMAKE_BUILD_TYPE MATCHES Debug) + corrosion_set_features(chewing FEATURES test-tracing) + endif() + if(WITH_SQLITE3) + corrosion_set_features(chewing FEATURES sqlite) + endif() + corrosion_add_target_local_rustflags(chewing -Ccodegen-units=1) if(ENABLE_GCOV) corrosion_set_env_vars(chewing CARGO_INCREMENTAL=0) corrosion_add_target_local_rustflags(chewing -Cinstrument-coverage -Ccodegen-units=1 -Cinline-threshold=0 -Clink-dead-code -Coverflow-checks=off -Cpanic=abort) diff --git a/Cargo.toml b/Cargo.toml index 2f74e96f9..c12c1387c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -14,7 +14,7 @@ bytemuck = { version = "1.14.0", features = ["derive"] } cdb = "0.6.0" directories = "5.0.0" riff = "2.0.0" -rusqlite = "0.30.0" +rusqlite = { version = "0.30.0", optional = true } thiserror = "1.0.0" tracing = { version = "0.1.40", features = [ "max_level_trace", @@ -25,7 +25,7 @@ tracing-subscriber = { version = "0.3.18", features = [ ], optional = true } [target.'cfg(windows)'.dependencies] -rusqlite = { version = "0.30.0", features = ["bundled"] } +rusqlite = { version = "0.30.0", features = ["bundled"], optional = true } [lib] crate-type = ["rlib", "staticlib"] @@ -33,6 +33,7 @@ crate-type = ["rlib", "staticlib"] [features] default = [] capi = [] +sqlite = ["rusqlite"] test-tracing = ["tracing-subscriber"] [dev-dependencies] @@ -40,6 +41,7 @@ tempfile = "3" [workspace] members = ["tools", "xtask"] +resolver = "2" [profile.release] lto = true diff --git a/src/dictionary/cdb.rs b/src/dictionary/cdb.rs index b26fa8d95..ff52964c8 100644 --- a/src/dictionary/cdb.rs +++ b/src/dictionary/cdb.rs @@ -1,6 +1,4 @@ use std::{ - borrow::Cow, - collections::{HashMap, HashSet}, fmt::Debug, fs::File, io::{self, Write}, @@ -8,86 +6,23 @@ use std::{ path::{Path, PathBuf}, }; -use cdb::{CDBMake, CDBWriter, CDB}; +use cdb::{CDBKeyValueIter, CDBMake, CDBValueIter, CDBWriter, CDB}; use thiserror::Error; use crate::zhuyin::{Syllable, SyllableSlice}; use super::{ + kv::{KVDictionary, KVStore}, BuildDictionaryError, DictEntries, Dictionary, DictionaryBuilder, DictionaryInfo, DictionaryUpdateError, Phrase, }; -mod serde { - use std::str; - - use bytemuck; - - use super::Phrase; - - pub(crate) struct PhraseData(T); - - impl<'a> PhraseData<&'a [u8]> { - pub(crate) fn frequency(&self) -> u32 { - bytemuck::pod_read_unaligned(&self.0[..4]) - } - pub(crate) fn last_used(&self) -> u64 { - bytemuck::pod_read_unaligned(&self.0[4..12]) - } - pub(crate) fn phrase_str(&self) -> &'a str { - let len = self.0[12] as usize; - let data = &self.0[13..]; - str::from_utf8(&data[..len]).expect("should be utf8 encoded string") - } - pub(crate) fn len(&self) -> usize { - 13 + self.0[12] as usize - } - } - - pub(crate) struct PhrasesIter<'a> { - bytes: &'a [u8], - } - - impl<'a> PhrasesIter<'a> { - pub(crate) fn new(bytes: &'a [u8]) -> PhrasesIter<'a> { - PhrasesIter { bytes } - } - - pub(crate) fn empty() -> PhrasesIter<'static> { - PhrasesIter { bytes: &[] } - } - } - - impl Iterator for PhrasesIter<'_> { - type Item = Phrase; - - #[inline(always)] - fn next(&mut self) -> Option { - if self.bytes.is_empty() { - return None; - } - let phrase_data = PhraseData(self.bytes); - self.bytes = &self.bytes[phrase_data.len()..]; - Some( - Phrase::new(phrase_data.phrase_str(), phrase_data.frequency()) - .with_time(phrase_data.last_used()), - ) - } - } -} - -use serde::PhrasesIter; - pub struct CdbDictionary { path: PathBuf, - base: CDB, - added: HashMap, Vec>, - updated: HashMap, - graveyard: HashSet, + inner: KVDictionary, + info: DictionaryInfo, } -type PhraseKey = (Cow<'static, [u8]>, Cow<'static, str>); - #[derive(Debug, Error)] #[error("cdb error")] pub struct CdbDictionaryError { @@ -100,10 +35,7 @@ type Error = CdbDictionaryError; impl Debug for CdbDictionary { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { f.debug_struct("CdbDictionary") - .field("base", &"CDB { /* opaque */ }") - .field("added", &self.added) - .field("updated", &self.updated) - .field("graveyard", &self.graveyard) + .field("inner", &"CDB { /* opaque */ }") .finish() } } @@ -124,6 +56,46 @@ impl From for CdbDictionaryError { } } +pub(crate) struct OkCDBValueIter<'a>(CDBValueIter<'a>); +pub(crate) struct OkCDBKeyValueIter<'a>(CDBKeyValueIter<'a>); + +impl Iterator for OkCDBValueIter<'_> { + type Item = Vec; + + fn next(&mut self) -> Option { + if let Some(it) = self.0.next() { + it.ok() + } else { + None + } + } +} + +impl Iterator for OkCDBKeyValueIter<'_> { + type Item = (Vec, Vec); + + fn next(&mut self) -> Option { + if let Some(it) = self.0.next() { + it.ok() + } else { + None + } + } +} + +impl<'a> KVStore<'a> for CDB { + type ValueIter = OkCDBValueIter<'a>; + type KeyValueIter = OkCDBKeyValueIter<'a>; + + fn find(&'a self, key: &[u8]) -> Self::ValueIter { + OkCDBValueIter(self.find(key)) + } + + fn iter(&'a self) -> Self::KeyValueIter { + OkCDBKeyValueIter(self.iter()) + } +} + impl CdbDictionary { pub fn open>(path: P) -> Result { match path.as_ref().try_exists() { @@ -142,53 +114,28 @@ impl CdbDictionary { let path = path.as_ref().to_path_buf(); Ok(CdbDictionary { path, - base, - added: Default::default(), - updated: Default::default(), - graveyard: Default::default(), + inner: KVDictionary::new(base), + info: Default::default(), }) } } impl Dictionary for CdbDictionary { fn lookup_first_n_phrases(&self, syllables: &dyn SyllableSlice, first: usize) -> Vec { - let syllable_bytes = syllables.get_bytes(); - let base_bytes = self.base.get(&syllable_bytes); - let base_phrases = match &base_bytes { - Some(record) => PhrasesIter::new(record.as_deref().unwrap_or(&[])), - None => PhrasesIter::empty(), - }; - let added_phrases = match self.added.get(&syllable_bytes) { - Some(phrases) => phrases.clone().into_iter(), - None => vec![].into_iter(), - }; - base_phrases - .chain(added_phrases) - .filter(|it| { - let phrase_key = (syllable_bytes.as_slice().into(), it.as_str().into()); - !self.graveyard.contains(&phrase_key) - }) - .map(|it| { - let phrase_key = (syllable_bytes.as_slice().into(), it.as_str().into()); - match self.updated.get(&phrase_key) { - Some(value) => Phrase::new(it.as_str(), value.0).with_time(value.1), - None => it, - } - }) - .take(first) - .collect() + self.inner.lookup_first_n_phrases(syllables, first) } fn entries(&self) -> Option { - None + self.inner.entries() } fn about(&self) -> DictionaryInfo { - todo!() + self.info.clone() } fn reopen(&mut self) -> Result<(), DictionaryUpdateError> { - self.base = CDB::open(&self.path).map_err(Error::from)?; + self.inner + .reopen(CDB::open(&self.path).map_err(Error::from)?); Ok(()) } @@ -200,57 +147,18 @@ impl Dictionary for CdbDictionary { data_buf.write_all(&[phrase.as_str().len() as u8])?; data_buf.write_all(phrase.as_str().as_bytes()) } - // FIXME fix in CDB crate to use only PathBuf let mut writer = - CDBWriter::create(dbg!(&self.path.display().to_string())).map_err(Error::from)?; - // FIXME reuse entries() - // FIXME fix CDB to provide key iter - for entry in self.base.iter() { - // FIXME skip info entry - let (key, value) = entry.map_err(Error::from)?; - let syllable_bytes = key; - let base_bytes = value; - let base_phrases = PhrasesIter::new(&base_bytes); - let added_phrases = match self.added.get(&syllable_bytes) { - Some(phrases) => phrases.clone().into_iter(), - None => vec![].into_iter(), - }; + CDBWriter::create(&self.path.display().to_string()).map_err(Error::from)?; + writer.add(b"INFO", &[]).map_err(Error::from)?; + for entry in self.entries().unwrap() { let mut data_buf = vec![]; - for phrase in base_phrases - .chain(added_phrases) - .filter(|it| { - let phrase_key = (syllable_bytes.as_slice().into(), it.as_str().into()); - !self.graveyard.contains(&phrase_key) - }) - .map(|it| { - let phrase_key = (syllable_bytes.as_slice().into(), it.as_str().into()); - match self.updated.get(&phrase_key) { - Some(value) => Phrase::new(it.as_str(), value.0).with_time(value.1), - None => it, - } - }) - { - write_phrase(&mut data_buf, &phrase).map_err(Error::from)?; - } - self.added.remove(&syllable_bytes); + write_phrase(&mut data_buf, &entry.1).map_err(Error::from)?; writer - .add(&syllable_bytes, &data_buf) - .map_err(Error::from)?; - } - for (syllable_bytes, phrases) in &self.added { - let mut data_buf = vec![]; - for phrase in phrases { - write_phrase(&mut data_buf, &phrase).map_err(Error::from)?; - } - writer - .add(&syllable_bytes, &data_buf) + .add(&entry.0.get_bytes(), &data_buf) .map_err(Error::from)?; } writer.finish().map_err(Error::from)?; - self.added.clear(); - self.updated.clear(); - self.graveyard.clear(); - dbg!(self.reopen()) + self.reopen() } fn add_phrase( @@ -258,17 +166,7 @@ impl Dictionary for CdbDictionary { syllables: &dyn SyllableSlice, phrase: Phrase, ) -> Result<(), DictionaryUpdateError> { - let syllable_bytes = syllables.get_bytes(); - let phrase_key = (syllable_bytes.into(), phrase.to_string().into()); - if self.updated.contains_key(&phrase_key) { - return Err(DictionaryUpdateError { source: None }); - } - self.graveyard.remove(&phrase_key); - self.added - .entry(phrase_key.0.into_owned()) - .or_default() - .push(phrase); - Ok(()) + self.inner.add_phrase(syllables, phrase) } fn update_phrase( @@ -278,11 +176,7 @@ impl Dictionary for CdbDictionary { user_freq: u32, time: u64, ) -> Result<(), DictionaryUpdateError> { - let syllable_bytes = syllables.get_bytes(); - let phrase_key = (syllable_bytes.into(), String::from(phrase).into()); - self.graveyard.remove(&phrase_key); - self.updated.insert(phrase_key, (user_freq, time)); - Ok(()) + self.inner.update_phrase(syllables, phrase, user_freq, time) } fn remove_phrase( @@ -290,23 +184,26 @@ impl Dictionary for CdbDictionary { syllables: &dyn SyllableSlice, phrase_str: &str, ) -> Result<(), DictionaryUpdateError> { - let syllable_bytes = syllables.get_bytes(); - let phrase_key = (syllable_bytes.into(), phrase_str.to_owned().into()); - self.graveyard.insert(phrase_key); - Ok(()) + self.inner.remove_phrase(syllables, phrase_str) + } +} + +impl Drop for CdbDictionary { + fn drop(&mut self) { + let _ = self.flush(); } } #[derive(Debug)] pub struct CdbDictionaryBuilder { - added: HashMap, Vec>, + inner: KVDictionary<()>, info: DictionaryInfo, } impl CdbDictionaryBuilder { pub fn new() -> CdbDictionaryBuilder { CdbDictionaryBuilder { - added: Default::default(), + inner: KVDictionary::<()>::new_in_memory(), info: Default::default(), } } @@ -330,7 +227,7 @@ impl From for BuildDictionaryError { impl DictionaryBuilder for CdbDictionaryBuilder { fn set_info(&mut self, info: DictionaryInfo) -> Result<(), BuildDictionaryError> { - // TODO + self.info = info; Ok(()) } @@ -339,21 +236,85 @@ impl DictionaryBuilder for CdbDictionaryBuilder { syllables: &[Syllable], phrase: Phrase, ) -> Result<(), BuildDictionaryError> { - self.added - .entry(syllables.get_bytes()) - .or_default() - .push(phrase); + self.inner.add_phrase(&syllables, phrase)?; Ok(()) } fn build(&mut self, path: &Path) -> Result<(), BuildDictionaryError> { let mut maker = CDBMake::new(File::create(path)?)?; - // FIXME cannot create empty db. Insert info? maker.add(b"INFO", &[])?; maker.finish()?; - let mut dict = CdbDictionary::open(path)?; - mem::swap(&mut dict.added, &mut self.added); + let cdb = CDB::open(path)?; + let inner = mem::replace(&mut self.inner, KVDictionary::<()>::new_in_memory()); + let inner = inner.take(cdb); + let mut dict = CdbDictionary { + path: path.to_path_buf(), + inner, + info: self.info.clone(), + }; + dict.flush()?; + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use std::error::Error; + + use crate::{dictionary::Phrase, syl, zhuyin::Bopomofo::*}; + + use super::{CdbDictionary, Dictionary}; + + #[test] + fn create_new_dictionary_in_memory_and_query() -> Result<(), Box> { + let tmp_dir = tempfile::tempdir()?; + let file_path = tmp_dir.path().join("chewing.cdb"); + let mut dict = CdbDictionary::open(file_path)?; + dict.add_phrase( + &[syl![Z, TONE4], syl![D, I, AN, TONE3]], + ("dict", 1, 2).into(), + )?; + assert_eq!( + Some(("dict", 1, 2).into()), + dict.lookup_first_phrase(&[syl![Z, TONE4], syl![D, I, AN, TONE3]]) + ); + Ok(()) + } + + #[test] + fn create_new_dictionary_and_query() -> Result<(), Box> { + let tmp_dir = tempfile::tempdir()?; + let file_path = tmp_dir.path().join("chewing.cdb"); + let mut dict = CdbDictionary::open(file_path)?; + dict.add_phrase( + &[syl![Z, TONE4], syl![D, I, AN, TONE3]], + ("dict", 1, 2).into(), + )?; + dict.flush()?; + assert_eq!( + Some(("dict", 1, 2).into()), + dict.lookup_first_phrase(&[syl![Z, TONE4], syl![D, I, AN, TONE3]]) + ); + Ok(()) + } + + #[test] + fn create_new_dictionary_and_enumerate() -> Result<(), Box> { + let tmp_dir = tempfile::tempdir()?; + let file_path = tmp_dir.path().join("chewing.cdb"); + let mut dict = CdbDictionary::open(file_path)?; + dict.add_phrase( + &[syl![Z, TONE4], syl![D, I, AN, TONE3]], + ("dict", 1, 2).into(), + )?; dict.flush()?; + assert_eq!( + vec![( + vec![syl![Z, TONE4], syl![D, I, AN, TONE3]], + Phrase::from(("dict", 1, 2)) + )], + dict.entries().unwrap().collect::>() + ); Ok(()) } } diff --git a/src/dictionary/kv.rs b/src/dictionary/kv.rs new file mode 100644 index 000000000..3032abbd0 --- /dev/null +++ b/src/dictionary/kv.rs @@ -0,0 +1,393 @@ +use std::{ + borrow::Cow, + cmp, + collections::{btree_map::Entry, BTreeMap, BTreeSet}, + fmt::Debug, + io, + iter::{self, Empty}, + path::Path, + str, +}; + +use crate::zhuyin::{Syllable, SyllableSlice}; + +use super::{DictEntries, DictionaryUpdateError, Phrase}; + +pub(crate) trait KVStore<'a> { + type ValueIter: Iterator>; + type KeyValueIter: Iterator, Vec)>; + + fn find(&'a self, key: &[u8]) -> Self::ValueIter; + fn iter(&'a self) -> Self::KeyValueIter; +} + +pub(crate) trait KVStoreBuilder { + fn create>(filename: P) -> io::Result + where + Self: Sized; + fn add(&mut self, key: &[u8], data: &[u8]) -> io::Result<()>; + fn finish(self) -> io::Result<()>; +} + +type PhraseKey = (Cow<'static, [u8]>, Cow<'static, str>); + +pub(crate) struct KVDictionary { + store: T, + btree: BTreeMap, + graveyard: BTreeSet, +} + +impl Debug for KVDictionary { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("KVDictionary") + .field("store", &"/* private fields */") + .field("btree", &self.btree) + .finish() + } +} + +const MIN_PHRASE: &str = ""; +const MAX_PHRASE: &str = "\u{10FFFF}"; + +fn phrase_from_bytes(bytes: &[u8]) -> Vec { + bytes + .chunks_exact(2) + .map(|bytes| { + let mut u16_bytes = [0; 2]; + u16_bytes.copy_from_slice(bytes); + let syl_u16 = u16::from_le_bytes(u16_bytes); + Syllable::try_from(syl_u16).unwrap() + }) + .collect::>() +} + +impl KVDictionary +where + T: for<'a> KVStore<'a>, +{ + pub(crate) fn new(store: T) -> KVDictionary { + KVDictionary { + store, + btree: BTreeMap::new(), + graveyard: BTreeSet::new(), + } + } + + pub(crate) fn new_in_memory() -> KVDictionary<()> { + KVDictionary { + store: (), + btree: BTreeMap::new(), + graveyard: BTreeSet::new(), + } + } + + pub(crate) fn take(self, store: S) -> KVDictionary + where + S: for<'a> KVStore<'a>, + { + KVDictionary { + store, + btree: self.btree, + graveyard: self.graveyard, + } + } + + pub(crate) fn reopen(&mut self, store: T) { + self.store = store; + } + + pub(crate) fn entries_iter_for<'a>( + &'a self, + syllable_bytes: &'a [u8], + ) -> impl Iterator + 'a { + let syllable_key = Cow::from(syllable_bytes); + let min_key = (syllable_key.clone(), Cow::from(MIN_PHRASE)); + let max_key = (syllable_key.clone(), Cow::from(MAX_PHRASE)); + + self.store + .find(&syllable_bytes) + .map(|bytes| Phrase::from(PhraseData(&bytes))) + .chain( + self.btree + .range(min_key..max_key) + .map(|(key, value)| Phrase { + phrase: key.1.as_ref().to_owned(), + freq: value.0, + last_used: Some(value.1), + }), + ) + .filter(move |it| { + !self + .graveyard + .contains(&(syllable_key.clone(), Cow::from(it.as_str()))) + }) + } + + pub(crate) fn entries_iter(&self) -> impl Iterator, Phrase)> + '_ { + let mut store_iter = self + .store + .iter() + .filter(|it| it.0 != b"INFO") + .map(|(syllable_bytes, phrase_bytes)| { + (syllable_bytes, Phrase::from(PhraseData(&phrase_bytes))) + }) + .peekable(); + let mut btree_iter = self + .btree + .iter() + .map(|(key, value)| { + ( + key.0.clone().into_owned(), + Phrase { + phrase: key.1.as_ref().to_owned(), + freq: value.0, + last_used: Some(value.1), + }, + ) + }) + .peekable(); + iter::from_fn(move || { + let a = store_iter.peek(); + let b = btree_iter.peek(); + match (a, b) { + (None, Some(_)) => btree_iter.next(), + (Some(_), None) => store_iter.next(), + (Some(x), Some(y)) => match (&x.0, x.1.as_str()).cmp(&(&y.0, y.1.as_str())) { + cmp::Ordering::Less => store_iter.next(), + cmp::Ordering::Equal => match x.1.freq.cmp(&y.1.freq) { + cmp::Ordering::Less | cmp::Ordering::Equal => { + let _ = store_iter.next(); + btree_iter.next() + } + cmp::Ordering::Greater => { + let _ = btree_iter.next(); + store_iter.next() + } + }, + cmp::Ordering::Greater => btree_iter.next(), + }, + (None, None) => None, + } + }) + .filter(|it| { + !self + .graveyard + .contains(&(Cow::from(it.0.as_slice()), Cow::from(it.1.as_str()))) + }) + } + + pub(crate) fn lookup_first_n_phrases( + &self, + syllables: &dyn SyllableSlice, + first: usize, + ) -> Vec { + let syllable_bytes = syllables.get_bytes(); + let mut sort_map = BTreeMap::new(); + let mut phrases: Vec = Vec::new(); + + for phrase in self.entries_iter_for(&syllable_bytes) { + match sort_map.entry(phrase.to_string()) { + Entry::Occupied(entry) => { + let index = *entry.get(); + phrases[index] = cmp::max(&phrase, &phrases[index]).clone(); + } + Entry::Vacant(entry) => { + entry.insert(phrases.len()); + phrases.push(phrase); + } + } + } + phrases.truncate(first); + phrases + } + + pub(crate) fn entries(&self) -> Option { + Some(Box::new( + self.entries_iter() + .map(|it| (phrase_from_bytes(&it.0), it.1)) + .collect::>() + .into_iter(), + )) + } + + pub(crate) fn add_phrase( + &mut self, + syllables: &dyn SyllableSlice, + phrase: Phrase, + ) -> Result<(), DictionaryUpdateError> { + let syllable_bytes = syllables.get_bytes(); + if self + .entries_iter_for(&syllable_bytes) + .any(|ph| ph.as_str() == phrase.as_str()) + { + return Err(DictionaryUpdateError { source: None }); + } + + self.btree.insert( + (Cow::from(syllable_bytes), Cow::from(phrase.phrase)), + (phrase.freq, phrase.last_used.unwrap_or_default()), + ); + + Ok(()) + } + + pub(crate) fn update_phrase( + &mut self, + syllables: &dyn SyllableSlice, + phrase: Phrase, + user_freq: u32, + time: u64, + ) -> Result<(), DictionaryUpdateError> { + let syllable_bytes = syllables.get_bytes(); + self.btree.insert( + (Cow::from(syllable_bytes), Cow::from(phrase.phrase)), + (user_freq, time), + ); + + Ok(()) + } + + pub(crate) fn remove_phrase( + &mut self, + syllables: &dyn SyllableSlice, + phrase_str: &str, + ) -> Result<(), DictionaryUpdateError> { + let syllable_bytes = syllables.get_bytes(); + self.btree.remove(&( + Cow::from(syllable_bytes.clone()), + Cow::from(phrase_str.to_owned()), + )); + self.graveyard + .insert((syllable_bytes.into(), phrase_str.to_owned().into())); + Ok(()) + } +} + +impl KVStore<'_> for () { + type ValueIter = Empty>; + type KeyValueIter = Empty<(Vec, Vec)>; + + fn find(&self, _key: &[u8]) -> Self::ValueIter { + iter::empty() + } + + fn iter(&self) -> Self::KeyValueIter { + iter::empty() + } +} + +#[derive(Debug, Clone, Copy)] +struct PhraseData<'a>(&'a [u8]); + +impl<'a> PhraseData<'a> { + fn frequency(&self) -> u32 { + bytemuck::pod_read_unaligned(&self.0[..4]) + } + fn last_used(&self) -> u64 { + bytemuck::pod_read_unaligned(&self.0[4..12]) + } + fn phrase_str(&self) -> &'a str { + let len = self.0[12] as usize; + let data = &self.0[13..]; + str::from_utf8(&data[..len]).expect("should be utf8 encoded string") + } + fn _len(&self) -> usize { + 13 + self.0[12] as usize + } +} + +impl From> for Phrase { + fn from(value: PhraseData<'_>) -> Self { + Phrase { + phrase: value.phrase_str().to_owned(), + freq: value.frequency(), + last_used: Some(value.last_used()), + } + } +} + +#[cfg(test)] +mod tests { + use std::error::Error; + + use crate::{dictionary::Phrase, syl, zhuyin::Bopomofo::*}; + + use super::KVDictionary; + + #[test] + fn create_new_dictionary_in_memory_and_query() -> Result<(), Box> { + let mut dict = KVDictionary::<()>::new_in_memory(); + dict.add_phrase( + &[syl![Z, TONE4], syl![D, I, AN, TONE3]], + ("dict", 1, 2).into(), + )?; + assert_eq!( + vec![Phrase::from(("dict", 1, 2))], + dict.lookup_first_n_phrases(&[syl![Z, TONE4], syl![D, I, AN, TONE3]], 1) + ); + Ok(()) + } + + #[test] + fn create_new_dictionary_in_memory_all_entries() -> Result<(), Box> { + let mut dict = KVDictionary::<()>::new_in_memory(); + dict.add_phrase( + &[syl![Z, TONE4], syl![D, I, AN, TONE3]], + ("dict", 1, 2).into(), + )?; + dict.add_phrase( + &[syl![Z, TONE4], syl![D, I, AN, TONE3]], + ("dict2", 1, 2).into(), + )?; + dict.add_phrase( + &[syl![Z, TONE4], syl![D, I, AN, TONE3]], + ("dict3", 1, 2).into(), + )?; + assert_eq!( + vec![ + Phrase::from(("dict", 1, 2)), + Phrase::from(("dict2", 1, 2)), + Phrase::from(("dict3", 1, 2)) + ], + dict.entries_iter().map(|it| it.1).collect::>() + ); + Ok(()) + } + + #[test] + fn create_new_dictionary_in_memory_add_remove_entries() -> Result<(), Box> { + let mut dict = KVDictionary::<()>::new_in_memory(); + dict.add_phrase( + &[syl![Z, TONE4], syl![D, I, AN, TONE3]], + ("dict", 1, 2).into(), + )?; + dict.add_phrase( + &[syl![Z, TONE4], syl![D, I, AN, TONE3]], + ("dict2", 1, 2).into(), + )?; + dict.add_phrase( + &[syl![Z, TONE4], syl![D, I, AN, TONE3]], + ("dict3", 1, 2).into(), + )?; + dict.remove_phrase(&[syl![Z, TONE4], syl![D, I, AN, TONE3]], "dict3")?; + assert_eq!( + vec![Phrase::from(("dict", 1, 2)), Phrase::from(("dict2", 1, 2)),], + dict.entries_iter().map(|it| it.1).collect::>() + ); + Ok(()) + } + + #[test] + fn create_new_dictionary_empty_and_query() -> Result<(), Box> { + let mut dict = KVDictionary::new(()); + dict.add_phrase( + &[syl![Z, TONE4], syl![D, I, AN, TONE3]], + ("dict", 1, 2).into(), + )?; + assert_eq!( + vec![Phrase::from(("dict", 1, 2))], + dict.lookup_first_n_phrases(&[syl![Z, TONE4], syl![D, I, AN, TONE3]], 1) + ); + Ok(()) + } +} diff --git a/src/dictionary/loader.rs b/src/dictionary/loader.rs index c3de3dcfa..a7ed4601b 100644 --- a/src/dictionary/loader.rs +++ b/src/dictionary/loader.rs @@ -2,7 +2,9 @@ use std::path::{Path, PathBuf}; use crate::path::{find_path_by_files, sys_path_from_env_var, userphrase_path}; -use super::{CdbDictionary, Dictionary, SqliteDictionary, TrieDictionary}; +#[cfg(feature = "sqlite")] +use super::SqliteDictionary; +use super::{CdbDictionary, Dictionary, TrieDictionary}; #[derive(Debug)] pub struct SystemDictionaryLoader { @@ -27,29 +29,35 @@ impl SystemDictionaryLoader { let mut tsi_db_path = sys_path.clone(); tsi_db_path.push("tsi.dat"); - let tsi_db = if let Ok(db) = SqliteDictionary::open_read_only(&tsi_db_path) { - Box::new(db) as Box - } else if let Ok(db) = TrieDictionary::open(&tsi_db_path) { - Box::new(db) as Box - } else if let Ok(db) = CdbDictionary::open(&tsi_db_path) { - Box::new(db) as Box - } else { - return None; - }; + let mut tsi_db = None; + #[cfg(feature = "sqlite")] + { + tsi_db = SqliteDictionary::open_read_only(&tsi_db_path) + .map(|db| Box::new(db) as Box) + .ok(); + } + if tsi_db.is_none() { + tsi_db = TrieDictionary::open(&tsi_db_path) + .map(|db| Box::new(db) as Box) + .ok(); + } let mut word_db_path = sys_path; word_db_path.push("word.dat"); - let word_db = if let Ok(db) = SqliteDictionary::open_read_only(&word_db_path) { - Box::new(db) as Box - } else if let Ok(db) = TrieDictionary::open(&word_db_path) { - Box::new(db) as Box - } else if let Ok(db) = CdbDictionary::open(&word_db_path) { - Box::new(db) as Box - } else { - return None; - }; + let mut word_db = None; + #[cfg(feature = "sqlite")] + { + word_db = SqliteDictionary::open_read_only(&word_db_path) + .map(|db| Box::new(db) as Box) + .ok(); + } + if word_db.is_none() { + word_db = TrieDictionary::open(&word_db_path) + .map(|db| Box::new(db) as Box) + .ok(); + } - Some(vec![word_db, tsi_db]) + Some(vec![word_db.unwrap(), tsi_db.unwrap()]) } } @@ -73,16 +81,19 @@ impl UserDictionaryLoader { userphrase_path()? }; - let dict = if let Ok(db) = SqliteDictionary::open(&data_path) { - Box::new(db) as Box - } else if let Ok(db) = TrieDictionary::open(&data_path) { - Box::new(db) as Box - } else if let Ok(db) = CdbDictionary::open(&data_path) { - Box::new(db) as Box - } else { - return None; - }; + let mut dict = None; + #[cfg(feature = "sqlite")] + { + dict = dbg!(SqliteDictionary::open(&data_path)) + .map(|db| Box::new(db) as Box) + .ok(); + } + if dict.is_none() { + dict = CdbDictionary::open(&data_path) + .map(|db| Box::new(db) as Box) + .ok(); + } - Some(dict) + dict } } diff --git a/src/dictionary/mod.rs b/src/dictionary/mod.rs index 77ce6f81b..16cec8789 100644 --- a/src/dictionary/mod.rs +++ b/src/dictionary/mod.rs @@ -16,12 +16,15 @@ use crate::zhuyin::{Syllable, SyllableSlice}; pub use self::cdb::{CdbDictionary, CdbDictionaryBuilder, CdbDictionaryError}; pub use layered::LayeredDictionary; pub use loader::{SystemDictionaryLoader, UserDictionaryLoader}; +#[cfg(feature = "sqlite")] pub use sqlite::{SqliteDictionary, SqliteDictionaryBuilder, SqliteDictionaryError}; pub use trie::{TrieDictionary, TrieDictionaryBuilder, TrieDictionaryStatistics}; mod cdb; +mod kv; mod layered; mod loader; +#[cfg(feature = "sqlite")] mod sqlite; mod trie; diff --git a/src/editor/selection/symbol.rs b/src/editor/selection/symbol.rs index c8ab961fd..29c3e5233 100644 --- a/src/editor/selection/symbol.rs +++ b/src/editor/selection/symbol.rs @@ -14,7 +14,7 @@ pub(crate) struct SymbolSelector { } impl SymbolSelector { - pub(crate) fn load>(path: P) -> Result { + pub(crate) fn _load>(path: P) -> Result { let file = File::open(path)?; let reader = BufReader::new(file); SymbolSelector::new(reader) diff --git a/src/path.rs b/src/path.rs index 205ad561d..5eed3b61c 100644 --- a/src/path.rs +++ b/src/path.rs @@ -106,5 +106,12 @@ fn legacy_data_dir() -> Option { /// and also respects the `CHEWING_USER_PATH` environment variable. pub fn userphrase_path() -> Option { // TODO support uhash.dat - data_dir().map(|path| path.join("chewing.sqlite3")) + #[cfg(feature = "sqlite")] + { + data_dir().map(|path| path.join("chewing.sqlite3")) + } + #[cfg(not(feature = "sqlite"))] + { + data_dir().map(|path| path.join("uhash.dat")) + } } diff --git a/src/zhuyin/syllable.rs b/src/zhuyin/syllable.rs index bacba75eb..df84267f0 100644 --- a/src/zhuyin/syllable.rs +++ b/src/zhuyin/syllable.rs @@ -1,6 +1,6 @@ use std::{ borrow::Cow, - fmt::{Display, Write}, + fmt::{Debug, Display, Write}, num::NonZeroU16, str::FromStr, }; @@ -18,7 +18,7 @@ pub struct Syllable { value: NonZeroU16, } -impl core::fmt::Debug for Syllable { +impl Debug for Syllable { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { f.debug_struct("Syllable") .field("value", &self.value) @@ -279,7 +279,7 @@ impl AsRef for Syllable { } } -pub trait SyllableSlice { +pub trait SyllableSlice: Debug { fn as_slice(&self) -> Cow<'_, [Syllable]>; fn get_bytes(&self) -> Vec { let mut syllables_bytes = vec![]; diff --git a/tools/Cargo.toml b/tools/Cargo.toml index 2aa125793..249f26e43 100644 --- a/tools/Cargo.toml +++ b/tools/Cargo.toml @@ -6,7 +6,7 @@ version = "0.6.0-alpha.1" edition = "2021" [dependencies] -chewing = { version = "0.6.0-alpha.1", path = ".." } +chewing = { version = "0.6.0-alpha.1", path = "..", features = ["sqlite"] } thiserror = "1.0.0" anyhow = "1.0.0" argh = "0.1.10" diff --git a/xtask/src/cli.rs b/xtask/src/cli.rs index 57837765c..d1a5c89da 100644 --- a/xtask/src/cli.rs +++ b/xtask/src/cli.rs @@ -22,7 +22,7 @@ pub struct CmdBuild { #[argh(option, default = "true")] pub with_rust: bool, - /// using hash implementation for user-dictionary, not compatible with --with-rust + /// using hash implementation for user-dictionary #[argh(option, default = "false")] pub with_hash: bool, diff --git a/xtask/src/main.rs b/xtask/src/main.rs index 8c4e9f8f9..88ccc62d5 100644 --- a/xtask/src/main.rs +++ b/xtask/src/main.rs @@ -64,11 +64,6 @@ fn main() -> Result<()> { match app.cmd { Cmds::Build(ref cmd) => { - if cmd.with_hash && cmd.with_rust { - return Err(anyhow!( - "--with_hash and --with_rust should not be used together" - )); - } BuildOpts { with_rust: cmd.with_rust, with_hash: cmd.with_hash,