From aabf0b3bc21aec3a0d62223207ba6977f93d4777 Mon Sep 17 00:00:00 2001
From: Shane Celis
Date: Mon, 22 Apr 2024 07:02:57 -0400
Subject: [PATCH] test: Add memsize test for first 100 words.

Right now the raw words are smaller than our trie, which surprised me.
Perhaps carrying around values in the label is increasing our size.
Something to try and suss out.
---
 src/trie/trie_impl.rs | 61 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 61 insertions(+)

diff --git a/src/trie/trie_impl.rs b/src/trie/trie_impl.rs
index facb37a..42ed460 100644
--- a/src/trie/trie_impl.rs
+++ b/src/trie/trie_impl.rs
@@ -243,6 +243,67 @@ mod search_tests {
         let _ = trie.common_prefix_search::("").next();
     }
 
+    /// Measures how much memory a trie built from up to `COUNT` dictionary
+    /// lines occupies, and compares it with the raw line bytes and with the
+    /// keys collected back out of the trie into a `Vec`.
+    ///
+    /// ```sh
+    /// cargo test --features mem_dbg memsize -- --nocapture
+    /// ```
+    #[cfg(feature = "mem_dbg")]
+    #[test]
+    fn memsize() {
+        use mem_dbg::*;
+        use std::{
+            env,
+            fs::File,
+            io::{BufRead, BufReader},
+            path::Path,
+        };
+
+        const COUNT: usize = 100;
+
+        let repo_root = env::var("CARGO_MANIFEST_DIR")
+            .expect("CARGO_MANIFEST_DIR environment variable must be set.");
+        // Join path components instead of formatting them into a string.
+        let edict2_path = Path::new(&repo_root).join("benches").join("edict.furigana");
+        println!("Reading dictionary file from: {}", edict2_path.display());
+        // Name the file in the panic so a missing fixture is obvious.
+        let file = File::open(&edict2_path)
+            .unwrap_or_else(|e| panic!("cannot open {}: {e}", edict2_path.display()));
+
+        let mut builder = TrieBuilder::new();
+        let mut n_words = 0;
+        let mut accum = 0;
+        for line in BufReader::new(file).lines().take(COUNT) {
+            let word = line.expect("dictionary line should be readable as UTF-8");
+            accum += word.len();
+            builder.push(word);
+            n_words += 1;
+        }
+        println!("Read {} words, {} bytes.", n_words, accum);
+
+        let trie = builder.build();
+        let trie_size = trie.mem_size(SizeFlags::default());
+        println!("Trie size {trie_size}");
+        // NOTE(review): the element type was missing from the patch text; `String`
+        // is assumed because the keys are pushed as `String` -- confirm.
+        let uncompressed: Vec<String> = trie.iter().collect();
+        let uncompressed_size = uncompressed.mem_size(SizeFlags::default());
+        println!("Uncompressed size {uncompressed_size}");
+
+        // The raw bytes are currently smaller than the trie, which is
+        // unexpected; this pins the observed ordering while it is investigated.
+        assert!(
+            accum < trie_size,
+            "raw bytes ({accum}) should be smaller than the trie ({trie_size})"
+        );
+        assert!(
+            trie_size < uncompressed_size,
+            "trie ({trie_size}) should be smaller than collected keys ({uncompressed_size})"
+        );
+    }
+
     mod exact_match_tests {
         macro_rules!
parameterized_tests { ($($name:ident: $value:expr,)*) => {