refactor crawl planner
* make sure the crawl plan includes the frontpage of every crawled site
* order the crawl plan so that sites with high harmonic centrality are crawled first (see the sketch below)
mikkeldenker committed Mar 4, 2024
1 parent 26eb164 commit c7e596a
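
As a rough illustration of the ordering described in the commit message — a sketch only, not the actual stract implementation; `Host` and the centrality map are stand-in types — a planner that crawls high-harmonic-centrality hosts first and always seeds each host with its frontpage might look like this:

use std::collections::HashMap;

// Hypothetical stand-in for the real host type.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
struct Host(String);

// Order hosts so the highest harmonic centrality is crawled first, and
// pair every host with its frontpage so the frontpage is always planned.
fn plan_order(centrality: &HashMap<Host, f64>) -> Vec<(Host, String)> {
    let mut hosts: Vec<(Host, f64)> = centrality
        .iter()
        .map(|(h, c)| (h.clone(), *c))
        .collect();

    // f64 is not Ord; total_cmp provides a total order. Comparing b to a
    // sorts in descending centrality.
    hosts.sort_by(|a, b| b.1.total_cmp(&a.1));

    hosts
        .into_iter()
        .map(|(host, _)| {
            // Every crawled site gets its frontpage in the plan.
            let frontpage = format!("https://{}/", host.0);
            (host, frontpage)
        })
        .collect()
}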
Showing 11 changed files with 547 additions and 229 deletions.
2 changes: 2 additions & 0 deletions crates/core/src/config/mod.rs
@@ -394,6 +394,8 @@ pub struct CrawlPlannerConfig {
     pub top_host_fraction: f64,
     pub wander_fraction: f64,
     pub top_n_hosts_surplus: usize,
+
+    pub num_threads: Option<usize>,
 }
 
 #[derive(Debug, Serialize, Deserialize, Clone)]
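The new `num_threads` knob is optional. One plausible way to consume an `Option<usize>` like this — an assumption about usage, not something shown in the diff — is to fall back to the machine's available parallelism:

// Assumed usage of a `num_threads: Option<usize>` setting; the fallback
// choice is illustrative, not necessarily what the crawl planner does.
fn resolve_num_threads(configured: Option<usize>) -> usize {
    configured.unwrap_or_else(|| {
        std::thread::available_parallelism()
            .map(|n| n.get())
            .unwrap_or(1)
    })
}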
2 changes: 1 addition & 1 deletion crates/core/src/crawler/coordinator.rs
@@ -24,7 +24,7 @@ pub struct CrawlCoordinator {
 impl CrawlCoordinator {
     pub fn new<P: AsRef<Path>>(jobs_queue: P) -> Result<Self> {
         Ok(Self {
-            jobs: Mutex::new(FileQueue::new(jobs_queue)?),
+            jobs: Mutex::new(FileQueue::open(jobs_queue)?),
         })
     }
 
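Apart from the constructor rename, the coordinator keeps serializing access to a single on-disk queue behind a Mutex. A stripped-down sketch of that pattern — with a plain Vec standing in for the real FileQueue<Job> — looks like this:

use std::sync::Mutex;

// Stand-in sketch: a Vec<String> replaces the real FileQueue<Job>.
struct Coordinator {
    jobs: Mutex<Vec<String>>,
}

impl Coordinator {
    // Workers contend on the lock, so each job is handed out exactly once.
    fn dequeue(&self) -> Option<String> {
        self.jobs.lock().ok()?.pop()
    }
}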
8 changes: 4 additions & 4 deletions crates/core/src/crawler/file_queue.rs
@@ -85,7 +85,7 @@ where
         let file = self.writer.into_inner()?;
 
         Ok(FileQueue {
-            pointer: FilePointer::new(self.path)?,
+            pointer: FilePointer::open(self.path)?,
             file: unsafe { Mmap::map(&file)? },
             _marker: std::marker::PhantomData,
         })
@@ -97,7 +97,7 @@ struct FilePointer {
 }
 
 impl FilePointer {
-    fn new<P: AsRef<std::path::Path>>(path: P) -> Result<Self> {
+    fn open<P: AsRef<std::path::Path>>(path: P) -> Result<Self> {
         if !path.as_ref().exists() {
             std::fs::create_dir_all(path.as_ref())?;
         }
@@ -142,7 +142,7 @@ impl<T> FileQueue<T>
 where
     T: serde::Serialize + serde::de::DeserializeOwned,
 {
-    pub fn new<P: AsRef<std::path::Path>>(path: P) -> Result<Self> {
+    pub fn open<P: AsRef<std::path::Path>>(path: P) -> Result<Self> {
         if !path.as_ref().exists() {
             std::fs::create_dir_all(path.as_ref())?;
         }
@@ -151,7 +151,7 @@ where
         let file = unsafe { Mmap::map(&file)? };
 
         Ok(Self {
-            pointer: FilePointer::new(path)?,
+            pointer: FilePointer::open(path)?,
             file,
             _marker: std::marker::PhantomData,
         })
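The new-to-open renames in this file match the pattern visible in the hunks: the constructor attaches to existing on-disk state, creating the directory only when it is missing. In isolation the idiom looks roughly like this (the "queue" file name is an assumption for illustration):

use std::fs::OpenOptions;
use std::path::Path;

// Open-or-create idiom behind the `new` -> `open` rename: reuse state on
// disk if present, create it otherwise.
fn open_queue_file(dir: &Path) -> std::io::Result<std::fs::File> {
    if !dir.exists() {
        std::fs::create_dir_all(dir)?;
    }

    OpenOptions::new()
        .read(true)
        .append(true)
        .create(true)
        .open(dir.join("queue"))
}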
14 changes: 14 additions & 0 deletions crates/core/src/crawler/mod.rs
@@ -102,6 +102,20 @@ pub struct WeightedUrl {
     pub weight: f64,
 }
 
+impl PartialEq for WeightedUrl {
+    fn eq(&self, other: &Self) -> bool {
+        self.url == other.url
+    }
+}
+
+impl Eq for WeightedUrl {}
+
+impl std::hash::Hash for WeightedUrl {
+    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
+        self.url.hash(state);
+    }
+}
+
 /// All urls in a job must be from the same domain, and only one job per
 /// domain may run at a time. This ensures that we stay polite when crawling.
 #[derive(serde::Serialize, serde::Deserialize, Debug, Clone)]
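Because the new impls compare and hash only the url, two WeightedUrls that differ only in weight count as the same entry. With a String standing in for the real Url type, the effect on a HashSet is:

use std::collections::HashSet;

// Demonstration of the impls above: equality and hashing ignore weight,
// so a set deduplicates by url. `String` stands in for stract's Url type.
#[derive(Debug, Clone)]
struct WeightedUrl {
    url: String,
    weight: f64,
}

impl PartialEq for WeightedUrl {
    fn eq(&self, other: &Self) -> bool {
        self.url == other.url
    }
}

impl Eq for WeightedUrl {}

impl std::hash::Hash for WeightedUrl {
    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
        self.url.hash(state);
    }
}

fn main() {
    let mut urls = HashSet::new();
    urls.insert(WeightedUrl { url: "https://example.com/".into(), weight: 1.0 });
    urls.insert(WeightedUrl { url: "https://example.com/".into(), weight: 9.0 });

    // Only one entry survives: the second insert is equal to the first.
    assert_eq!(urls.len(), 1);
}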
(Diffs for the remaining 7 changed files are not shown.)