Skip to content

Commit

Permalink
add indexer example used for benchmark
Browse files Browse the repository at this point in the history
  • Loading branch information
mikkeldenker committed Mar 7, 2024
1 parent 0ab0c63 commit 66e3ee7
Show file tree
Hide file tree
Showing 4 changed files with 69 additions and 3 deletions.
3 changes: 3 additions & 0 deletions crates/core/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -145,3 +145,6 @@ name = "search_preindexed"

[[example]]
name = "bitvec_similarity"

[[example]]
name = "indexer"
62 changes: 62 additions & 0 deletions crates/core/examples/indexer.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
use std::path::Path;

use clap::Parser;
use stract::config::IndexingLocalConfig;

// Command-line arguments for the indexer benchmark example, parsed with clap's derive API.
// (Plain `//` comments are used on purpose: `///` doc comments on a `Parser` struct are
// picked up by clap as help text — NOTE(review): confirm before converting these.)
#[derive(Parser)]
struct Args {
// Optional path that `main` forwards unchanged as `dual_encoder_model_path` in the
// indexing config; when it is not supplied the config field is `None`.
dual_encoder_path: Option<String>,
}

/// Runs the local indexer once over `data/sample.warc.gz`, prints how long the
/// run took, and removes the produced index again.
///
/// The index is written to `data/example_index`. Any directory already present
/// at that location is deleted before indexing starts, and the freshly built
/// index is deleted after the timing has been printed.
///
/// # Errors
///
/// Returns an error if removing the index directory fails (before or after the
/// run) or if `stract::entrypoint::indexer::run` itself fails.
fn main() -> anyhow::Result<()> {
    /// Converts a path assembled from UTF-8 string literals into an owned `String`.
    fn path_string(p: &Path) -> String {
        p.to_str()
            .expect("path is built from UTF-8 string literals")
            .to_string()
    }

    let args = Args::parse();

    let data_path = Path::new("data");
    let path = data_path.join("example_index");

    // Remove any index left behind by a previous run.
    if path.exists() {
        std::fs::remove_dir_all(&path)?;
    }

    println!("Indexing...");
    let start = std::time::Instant::now();
    stract::entrypoint::indexer::run(&IndexingLocalConfig {
        output_path: path_string(&path),
        limit_warc_files: None,
        skip_warc_files: None,
        warc_source: stract::config::WarcSource::Local(stract::config::LocalConfig {
            folder: ".".to_string(),
            names: vec![path_string(&data_path.join("sample.warc.gz"))],
        }),
        page_webgraph_path: Some(path_string(&data_path.join("webgraph_page"))),
        host_centrality_threshold: None,
        topics_path: None,
        host_centrality_store_path: path_string(&data_path.join("centrality/")),
        page_centrality_store_path: Some(path_string(&data_path.join("centrality_page"))),
        safety_classifier_path: None,
        minimum_clean_words: None,
        batch_size: 512,
        dual_encoder_model_path: args.dual_encoder_path,
    })?;

    println!("Indexing took {:?}", start.elapsed());

    // `std::fs::remove_dir` only removes *empty* directories, so it would fail on
    // the index that was just written; remove the directory together with its contents.
    std::fs::remove_dir_all(&path)?;
    Ok(())
}
4 changes: 2 additions & 2 deletions crates/core/src/models/bert.rs
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ impl Default for Config {
num_hidden_layers: 12,
num_attention_heads: 12,
intermediate_size: 3072,
hidden_act: HiddenAct::Gelu,
hidden_act: HiddenAct::GeluApproximate,
hidden_dropout_prob: 0.1,
max_position_embeddings: 512,
type_vocab_size: 2,
Expand All @@ -102,7 +102,7 @@ impl Config {
num_hidden_layers: 6,
num_attention_heads: 12,
intermediate_size: 1536,
hidden_act: HiddenAct::Gelu,
hidden_act: HiddenAct::GeluApproximate,
hidden_dropout_prob: 0.1,
max_position_embeddings: 512,
type_vocab_size: 2,
Expand Down
3 changes: 2 additions & 1 deletion crates/core/src/warc.rs
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,8 @@ fn decode_string(raw: &[u8]) -> String {
res
} else {
let mut detector = chardetng::EncodingDetector::new();
detector.feed(raw, true);
let end = std::cmp::min(64, raw.len());
detector.feed(&raw[..end], false);
let (enc, conf) = detector.guess_assess(None, true);

if conf {
Expand Down

0 comments on commit 66e3ee7

Please sign in to comment.