Skip to content

Commit

Permalink
change indexer to prepare webpages in batches
Browse files Browse the repository at this point in the history
  • Loading branch information
mikkeldenker committed Mar 6, 2024
1 parent 519ecb5 commit 0f91ac8
Show file tree
Hide file tree
Showing 27 changed files with 793 additions and 612 deletions.
8 changes: 8 additions & 0 deletions crates/core/src/config/defaults.rs
Original file line number Diff line number Diff line change
Expand Up @@ -169,3 +169,11 @@ impl Widgets {
true
}
}

/// Namespace for the default values of indexing-related configuration fields.
pub struct Indexing;

impl Indexing {
    /// Returns the default indexing batch size.
    ///
    /// Referenced by path from a `#[serde(default = "...")]` attribute, so it
    /// must remain a zero-argument function returning `usize`.
    pub fn batch_size() -> usize {
        const DEFAULT_BATCH_SIZE: usize = 512;

        DEFAULT_BATCH_SIZE
    }
}
4 changes: 3 additions & 1 deletion crates/core/src/config/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -30,14 +30,16 @@ pub struct IndexingLocalConfig {
pub limit_warc_files: Option<usize>,
pub skip_warc_files: Option<usize>,
pub warc_source: WarcSource,
pub batch_size: Option<usize>,
pub page_webgraph_path: Option<String>,
pub host_centrality_threshold: Option<f64>,
pub topics_path: Option<String>,
pub host_centrality_store_path: String,
pub page_centrality_store_path: Option<String>,
pub safety_classifier_path: Option<String>,
pub minimum_clean_words: Option<usize>,

#[serde(default = "defaults::Indexing::batch_size")]
pub batch_size: usize,
}

#[derive(Debug, Deserialize, Clone)]
Expand Down
7 changes: 4 additions & 3 deletions crates/core/src/entrypoint/configure.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ use tokio::io;
use tokio_stream::StreamExt;
use tracing::{debug, info};

use crate::config::{LocalConfig, WebSpellConfig};
use crate::config::{defaults, LocalConfig, WebSpellConfig};
use crate::entrypoint::indexer::JobSettings;
use crate::entrypoint::{dmoz_parser, indexer};
use crate::Result;
Expand Down Expand Up @@ -169,11 +169,12 @@ fn create_inverted_index() -> Result<()> {
folder: ".".to_string(),
names: vec![warc_path.to_str().unwrap().to_string()],
}),
warc_paths: vec![warc_path.to_str().unwrap().to_string()],
warc_path: warc_path.to_str().unwrap().to_string(),
base_path: out_path_tmp.to_str().unwrap().to_string(),
settings: JobSettings {
host_centrality_threshold: None,
minimum_clean_words: None,
batch_size: defaults::Indexing::batch_size(),
},
};

Expand All @@ -195,7 +196,7 @@ fn create_inverted_index() -> Result<()> {
None,
);

let index = indexer::process_job(&job, &worker);
let index = job.process(&worker);
std::fs::rename(index.path, out_path)?;
std::fs::remove_dir_all(&out_path_tmp)?;

Expand Down
Loading

0 comments on commit 0f91ac8

Please sign in to comment.