Skip to content

Commit

Permalink
merge main
Browse files Browse the repository at this point in the history
  • Loading branch information
mikkeldenker committed Feb 28, 2024
2 parents db3dce4 + 6b9d514 commit 0b787e5
Show file tree
Hide file tree
Showing 8 changed files with 96 additions and 38 deletions.
2 changes: 1 addition & 1 deletion configs/api.toml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ correction_threshold = 3.0
lm_prob_weight = 1.0

[llm]
api_base= "http://localhost:4000/v1"
api_base = "http://localhost:4000/v1"
model = "data/mistral-7b-instruct-v0.2.Q4_K_M.gguf"
# model = "TheBloke/Mistral-7B-Instruct-v0.2-AWQ"
# model = "mistralai/Mixtral-8x7B-Instruct-v0.1"
27 changes: 16 additions & 11 deletions crates/core/src/query/optic.rs
Original file line number Diff line number Diff line change
Expand Up @@ -262,7 +262,10 @@ impl AsTantivyQuery for Matching {

#[cfg(test)]
mod tests {
use optics::{HostRankings, Optic};
use optics::{
ast::{RankingCoeff, RankingTarget},
HostRankings, Optic,
};

use crate::{
gen_temp_path,
Expand Down Expand Up @@ -726,16 +729,18 @@ mod tests {
let res = searcher
.search(&SearchQuery {
query: "website".to_string(),
optic: Some(
Optic::parse(
r#"
Like(Site("www.a.com"));
Like(Site("www.b.com"));
Dislike(Site("www.c.com"));
"#,
)
.unwrap(),
),
optic: Some(Optic {
rankings: vec![RankingCoeff {
target: RankingTarget::Signal("inbound_similarity".to_string()),
value: 100_000.0,
}],
host_rankings: HostRankings {
liked: vec!["www.a.com".to_string(), "www.b.com".to_string()],
disliked: vec!["www.c.com".to_string()],
..Default::default()
},
..Default::default()
}),
..Default::default()
})
.unwrap()
Expand Down
12 changes: 11 additions & 1 deletion crates/core/src/ranking/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,10 @@ impl Ranker {
#[cfg(test)]
mod tests {

use optics::Optic;
use optics::{
ast::{RankingCoeff, RankingTarget},
Optic,
};

use crate::{
index::Index,
Expand Down Expand Up @@ -638,6 +641,13 @@ mod tests {
let result = searcher
.search(&SearchQuery {
query: "test".to_string(),
optic: Some(Optic {
rankings: vec![RankingCoeff {
target: RankingTarget::Signal("fetch_time_ms".to_string()),
value: 100_000.0,
}],
..Default::default()
}),
..Default::default()
})
.expect("Search failed");
Expand Down
18 changes: 18 additions & 0 deletions crates/core/src/tokenizer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -804,6 +804,18 @@ mod tests {
res
}

/// Feeds `s` through the `Identity` tokenizer and collects the text of
/// every emitted token into an owned `Vec<String>`.
fn tokenize_identity(s: &str) -> Vec<String> {
    let mut tokenizer = Identity {};
    let mut stream = tokenizer.token_stream(s);
    let mut out: Vec<String> = Vec::new();

    // Drain the token stream, keeping a copy of each token's text.
    loop {
        match stream.next() {
            Some(token) => out.push(token.text.clone()),
            None => break,
        }
    }

    out
}

#[test]
fn simple_tokenization() {
assert_eq!(
Expand Down Expand Up @@ -1057,4 +1069,10 @@ key1.key2="this\" is @ a # test""#;

assert_eq!(tokenize_url(".com"), vec![".", "com ",])
}

#[test]
fn identity() {
    // The identity tokenizer must return its input unchanged as a single token.
    let cases = [
        ("this is a test", vec!["this is a test"]),
        ("a-b", vec!["a-b"]),
    ];
    for (input, expected) in cases {
        assert_eq!(tokenize_identity(input), expected);
    }
}
}
27 changes: 16 additions & 11 deletions crates/core/src/webgraph/centrality/approx_harmonic.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,9 @@
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <https://www.gnu.org/licenses/>.

use std::{path::Path, sync::Mutex};
use std::path::Path;

use dashmap::DashMap;
use indicatif::ParallelProgressIterator;
use rayon::prelude::*;

Expand Down Expand Up @@ -44,31 +45,35 @@ impl ApproxHarmonic {

tracing::info!("sampling {} nodes", num_samples);

let sampled = graph.random_nodes(num_samples);
let sampled = graph.random_nodes_with_outgoing(num_samples);

let res = Mutex::new(Self {
inner: RocksDbStore::open(output),
});
let centralities: DashMap<NodeID, f32> = DashMap::new();

let norm = num_nodes as f64 / (num_samples as f64 * (num_nodes as f64 - 1.0));
let norm = num_nodes as f32 / (num_samples as f32 * (num_nodes as f32 - 1.0));

sampled.into_par_iter().progress().for_each(|source| {
let dists = graph.raw_distances_with_max(source, 5);

let res = res.lock().unwrap();
for (target, dist) in dists {
if dist == 0 {
continue;
}

let dist = dist as f64;
let dist = dist as f32;

let old = res.inner.get(&target).unwrap_or(0.0);
res.inner.insert(target, old + ((1.0 / dist) * norm));
*centralities.entry(target).or_default() += (1.0 / dist) * norm;
}
});

res.into_inner().unwrap()
let res = Self {
inner: RocksDbStore::open(output),
};

for (node, centrality) in centralities {
res.inner.insert(node, centrality as f64);
}

res
}

pub fn get(&self, node: &NodeID) -> Option<f64> {
Expand Down
10 changes: 8 additions & 2 deletions crates/core/src/webgraph/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ use std::path::Path;
use std::sync::Arc;
use std::{cmp, fs};

use itertools::Itertools;
use rand::seq::SliceRandom;
use rayon::prelude::*;
use url::Url;
Expand Down Expand Up @@ -864,9 +865,14 @@ impl Webgraph {
self.id2node.keys()
}

pub fn random_nodes(&self, num: usize) -> Vec<NodeID> {
pub fn random_nodes_with_outgoing(&self, num: usize) -> Vec<NodeID> {
let mut rng = rand::thread_rng();
let mut nodes = self.nodes().take(num).collect::<Vec<_>>();
let mut nodes = self
.edges()
.map(|e| e.from)
.unique()
.take(num)
.collect::<Vec<_>>();
nodes.shuffle(&mut rng);
nodes.into_iter().take(num).collect()
}
Expand Down
34 changes: 23 additions & 11 deletions crates/core/src/webpage/html/into_tantivy.rs
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ impl Html {
}
let title = title.unwrap();

Ok(self.pretokenize_string(title))
Ok(self.pretokenize_string(title, TextField::Title))
}

fn pretokenize_all_text(&self) -> Result<PreTokenizedString> {
Expand All @@ -52,35 +52,35 @@ impl Html {
}
let all_text = all_text.unwrap();

Ok(self.pretokenize_string(all_text))
Ok(self.pretokenize_string(all_text, TextField::AllBody))
}

fn pretokenize_clean_text(&self) -> PreTokenizedString {
let clean_text = self.clean_text().cloned().unwrap_or_default();
self.pretokenize_string(clean_text)
self.pretokenize_string(clean_text, TextField::CleanBody)
}

fn pretokenize_url(&self) -> PreTokenizedString {
let url = self.url().to_string();
self.pretokenize_string(url)
self.pretokenize_string(url, TextField::Url)
}

fn pretokenize_domain(&self) -> PreTokenizedString {
let domain = self.url().root_domain().unwrap_or_default().to_string();

self.pretokenize_string(domain)
self.pretokenize_string(domain, TextField::Domain)
}

fn pretokenize_site(&self) -> PreTokenizedString {
let site = self.url().normalized_host().unwrap_or_default().to_string();

self.pretokenize_string(site)
self.pretokenize_string(site, TextField::SiteWithout)
}

fn pretokenize_description(&self) -> PreTokenizedString {
let text = self.description().unwrap_or_default();

self.pretokenize_string(text)
self.pretokenize_string(text, TextField::Description)
}

fn pretokenize_microformats(&self) -> PreTokenizedString {
Expand All @@ -91,11 +91,11 @@ impl Html {
text.push(' ');
}

self.pretokenize_string(text)
self.pretokenize_string(text, TextField::MicroformatTags)
}

fn pretokenize_string(&self, text: String) -> PreTokenizedString {
self.pretokenize_string_with(text, tokenizer::Tokenizer::default())
fn pretokenize_string(&self, text: String, field: TextField) -> PreTokenizedString {
self.pretokenize_string_with(text, field.indexing_tokenizer())
}

fn pretokenize_string_with(
Expand Down Expand Up @@ -381,7 +381,19 @@ impl Html {
}
Field::Text(TextField::DomainIfHomepageNoTokenizer) => {
if self.is_homepage() {
doc.add_pre_tokenized_text(tantivy_field, domain.clone());
doc.add_pre_tokenized_text(
tantivy_field,
PreTokenizedString {
text: domain.text.clone(),
tokens: vec![tantivy::tokenizer::Token {
offset_from: 0,
offset_to: domain.text.len(),
position: 0,
text: domain.text.clone(),
position_length: 1,
}],
},
);
} else {
doc.add_text(tantivy_field, "");
}
Expand Down
4 changes: 3 additions & 1 deletion scripts/ci/check
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,6 @@ cargo install wasm-pack

cargo check
cargo check --no-default-features
cd frontend && npm run wasm && npm install && npm run check

# skip frontend check until https://github.com/sveltejs/kit/issues/11906 is fixed
# cd frontend && npm run wasm && npm install && npm run check

0 comments on commit 0b787e5

Please sign in to comment.