From 66e3ee7c460272e1d24420b99a3e28f3d80f5630 Mon Sep 17 00:00:00 2001
From: Mikkel Denker
Date: Thu, 7 Mar 2024 12:42:47 +0100
Subject: [PATCH] add indexer example used for benchmark

---
 crates/core/Cargo.toml          |  3 ++
 crates/core/examples/indexer.rs | 71 +++++++++++++++++++++++++++++++++
 crates/core/src/models/bert.rs  |  4 +--
 crates/core/src/warc.rs         |  3 +-
 4 files changed, 78 insertions(+), 3 deletions(-)
 create mode 100644 crates/core/examples/indexer.rs

diff --git a/crates/core/Cargo.toml b/crates/core/Cargo.toml
index 01cd8663..8e65aaa6 100644
--- a/crates/core/Cargo.toml
+++ b/crates/core/Cargo.toml
@@ -145,3 +145,6 @@ name = "search_preindexed"
 
 [[example]]
 name = "bitvec_similarity"
+
+[[example]]
+name = "indexer"
diff --git a/crates/core/examples/indexer.rs b/crates/core/examples/indexer.rs
new file mode 100644
index 00000000..e840e12b
--- /dev/null
+++ b/crates/core/examples/indexer.rs
@@ -0,0 +1,71 @@
+//! Benchmark example: indexes `data/sample.warc.gz` into `data/example_index`
+//! and prints how long the indexing run took.
+//!
+//! The index directory is removed before and after the run.
+
+use std::path::Path;
+
+use clap::Parser;
+use stract::config::IndexingLocalConfig;
+
+#[derive(Parser)]
+struct Args {
+    // Forwarded unchanged as `dual_encoder_model_path` in the indexing config.
+    dual_encoder_path: Option<String>,
+}
+
+fn main() -> anyhow::Result<()> {
+    let args = Args::parse();
+
+    let data_path = Path::new("data");
+    let path = data_path.join("example_index");
+
+    // Remove any index left behind by a previous run.
+    if path.exists() {
+        std::fs::remove_dir_all(&path)?;
+    }
+
+    println!("Indexing...");
+    let start = std::time::Instant::now();
+    stract::entrypoint::indexer::run(&IndexingLocalConfig {
+        output_path: path.to_str().unwrap().to_string(),
+        limit_warc_files: None,
+        skip_warc_files: None,
+        warc_source: stract::config::WarcSource::Local(stract::config::LocalConfig {
+            folder: ".".to_string(),
+            names: vec![data_path
+                .join("sample.warc.gz")
+                .to_str()
+                .unwrap()
+                .to_string()],
+        }),
+        page_webgraph_path: Some(
+            data_path
+                .join("webgraph_page")
+                .to_str()
+                .unwrap()
+                .to_string(),
+        ),
+        host_centrality_threshold: None,
+        topics_path: None,
+        host_centrality_store_path: data_path.join("centrality/").to_str().unwrap().to_string(),
+        page_centrality_store_path: Some(
+            data_path
+                .join("centrality_page")
+                .to_str()
+                .unwrap()
+                .to_string(),
+        ),
+        safety_classifier_path: None,
+        minimum_clean_words: None,
+        batch_size: 512,
+        dual_encoder_model_path: args.dual_encoder_path,
+    })?;
+
+    println!("Indexing took {:?}", start.elapsed());
+
+    // `remove_dir` only deletes empty directories; the indexer was pointed at
+    // this path as its output, so it presumably holds files by now.
+    std::fs::remove_dir_all(path)?;
+    Ok(())
+}
diff --git a/crates/core/src/models/bert.rs b/crates/core/src/models/bert.rs
index 9d2c7096..ab952904 100644
--- a/crates/core/src/models/bert.rs
+++ b/crates/core/src/models/bert.rs
@@ -78,7 +78,7 @@ impl Default for Config {
             num_hidden_layers: 12,
             num_attention_heads: 12,
             intermediate_size: 3072,
-            hidden_act: HiddenAct::Gelu,
+            hidden_act: HiddenAct::GeluApproximate,
             hidden_dropout_prob: 0.1,
             max_position_embeddings: 512,
             type_vocab_size: 2,
@@ -102,7 +102,7 @@ impl Config {
             num_hidden_layers: 6,
             num_attention_heads: 12,
             intermediate_size: 1536,
-            hidden_act: HiddenAct::Gelu,
+            hidden_act: HiddenAct::GeluApproximate,
             hidden_dropout_prob: 0.1,
             max_position_embeddings: 512,
             type_vocab_size: 2,
diff --git a/crates/core/src/warc.rs b/crates/core/src/warc.rs
index f8874bcb..7035ce0f 100644
--- a/crates/core/src/warc.rs
+++ b/crates/core/src/warc.rs
@@ -47,7 +47,8 @@ fn decode_string(raw: &[u8]) -> String {
         res
     } else {
         let mut detector = chardetng::EncodingDetector::new();
-        detector.feed(raw, true);
+        let end = std::cmp::min(64, raw.len());
+        detector.feed(&raw[..end], false);
         let (enc, conf) = detector.guess_assess(None, true);
 
         if conf {