Skip to content

Commit

Permalink
cleanup template, replace with selection
Browse files Browse the repository at this point in the history
  • Loading branch information
luizirber committed Sep 17, 2023
1 parent 713d7bf commit c76eec3
Show file tree
Hide file tree
Showing 8 changed files with 142 additions and 195 deletions.
4 changes: 4 additions & 0 deletions src/core/src/collection.rs
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,10 @@ impl CollectionSet {
/// Consumes this `CollectionSet` and returns the `Collection` stored in its
/// `collection` field.
pub fn into_inner(self) -> Collection {
self.collection
}

/// Returns the `Selection` describing this set.
///
/// # Panics
///
/// Not implemented yet: the body is a `todo!`, so every call panics.
/// The placeholder message says the value is meant to be extracted from
/// the first signature.
pub fn selection(&self) -> Selection {
todo!("Extract selection from first sig")
}
}

impl Collection {
Expand Down
25 changes: 23 additions & 2 deletions src/core/src/ffi/index/revindex.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ use crate::ffi::signature::SourmashSignature;
use crate::ffi::utils::{ForeignObject, SourmashStr};
use crate::index::revindex::mem_revindex::RevIndex;
use crate::index::Index;
use crate::prelude::*;
use crate::signature::{Signature, SigsTrait};
use crate::sketch::minhash::KmerMinHash;
use crate::sketch::Sketch;
Expand All @@ -18,6 +19,21 @@ impl ForeignObject for SourmashRevIndex {
type RustObject = RevIndex;
}

/// Translates a template `Sketch` received over the FFI into a `Selection`.
///
/// Only the k-mer size, `num` and `scaled` values are carried over from the
/// template; no other selection field is set here.
///
/// # Panics
///
/// Panics via `unimplemented!()` for any sketch that is neither
/// `Sketch::MinHash` nor `Sketch::LargeMinHash`.
// TODO: remove this once a Selection can be passed through the FFI directly
fn from_template(template: &Sketch) -> Selection {
    // Both supported variants expose the same pair of accessors.
    // NOTE(review): the `as u32` casts narrow without a range check —
    // confirm real `scaled`/`ksize` values always fit.
    let params = match template {
        Sketch::MinHash(sketch) => (sketch.num(), sketch.scaled() as u32),
        Sketch::LargeMinHash(sketch) => (sketch.num(), sketch.scaled() as u32),
        _ => unimplemented!(),
    };
    let (num_hashes, scaled_value) = params;

    Selection::builder()
        .ksize(template.ksize() as u32)
        .num(num_hashes)
        .scaled(scaled_value)
        .build()
}

ffi_fn! {
unsafe fn revindex_new_with_paths(
search_sigs_ptr: *const *const SourmashStr,
Expand Down Expand Up @@ -58,9 +74,12 @@ unsafe fn revindex_new_with_paths(
.collect();
Some(queries_vec.as_ref())
};

let selection = from_template(&template);

let revindex = RevIndex::new(
search_sigs.as_ref(),
&template,
&selection,
threshold,
queries,
keep_sigs,
Expand Down Expand Up @@ -105,7 +124,9 @@ unsafe fn revindex_new_with_sigs(
.collect();
Some(queries_vec.as_ref())
};
let revindex = RevIndex::new_with_sigs(search_sigs, &template, threshold, queries)?;

let selection = from_template(&template);
let revindex = RevIndex::new_with_sigs(search_sigs, &selection, threshold, queries)?;
Ok(SourmashRevIndex::from_rust(revindex))
}
}
Expand Down
18 changes: 9 additions & 9 deletions src/core/src/index/revindex/disk_revindex.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ use crate::index::revindex::{
};
use crate::index::{GatherResult, SigCounter};
use crate::manifest::Manifest;
use crate::prelude::*;
use crate::signature::SigsTrait;
use crate::sketch::minhash::KmerMinHash;
use crate::sketch::Sketch;
Expand Down Expand Up @@ -283,11 +284,12 @@ impl RevIndexOps for RevIndex {
hash_to_color: HashToColor,
threshold: usize,
orig_query: &KmerMinHash,
template: &Sketch,
selection: Option<Selection>,
) -> Result<Vec<GatherResult>> {
let mut match_size = usize::max_value();
let mut matches = vec![];
//let mut query: KmerMinHashBTree = orig_query.clone().into();
let selection = selection.unwrap_or_else(|| self.collection.selection());

while match_size > threshold && !counter.is_empty() {
trace!("counter len: {}", counter.len());
Expand All @@ -298,22 +300,20 @@ impl RevIndexOps for RevIndex {

let match_sig = self.collection.sig_for_dataset(dataset_id)?;

let match_mh =
prepare_query(&match_sig, template).expect("Couldn't find a compatible MinHash");

// Calculate stats
let f_orig_query = match_size as f64 / orig_query.size() as f64;
let f_match = match_size as f64 / match_mh.size() as f64;
let name = match_sig.name();
let unique_intersect_bp = match_mh.scaled() as usize * match_size;
let gather_result_rank = matches.len();
let match_ = match_sig.clone();
let md5 = match_sig.md5sum();

let match_mh = prepare_query(match_sig.into(), &selection)
.expect("Couldn't find a compatible MinHash");
let f_match = match_size as f64 / match_mh.size() as f64;
let unique_intersect_bp = match_mh.scaled() as usize * match_size;
let (intersect_orig, _) = match_mh.intersection_size(orig_query)?;
let intersect_bp = (match_mh.scaled() * intersect_orig) as usize;

let f_unique_to_query = intersect_orig as f64 / orig_query.size() as f64;
let match_ = match_sig.clone();
let md5 = match_sig.md5sum();

// TODO: all of these
let filename = "".into();
Expand Down
94 changes: 30 additions & 64 deletions src/core/src/index/revindex/mem_revindex.rs
Original file line number Diff line number Diff line change
Expand Up @@ -84,33 +84,31 @@ impl LinearIndex {
impl RevIndex {
/// Builds a `RevIndex` from the signature files listed in `search_sigs`.
///
/// The paths are loaded into a `Collection`, narrowed with `select`, wrapped
/// in a `LinearIndex`, and then indexed with `threshold`, the merged query
/// (if any) and `queries`. `_keep_sigs` is not used in this body.
///
/// Errors from loading, selecting or converting the collection are
/// propagated with `?`.
///
/// NOTE(review): this text comes from a rendered commit diff, so the old
/// (`template`) and new (`selection`) forms of the parameter and of the
/// collection construction both appear below; presumably only the
/// `selection` lines are live — confirm against the repository.
pub fn new(
search_sigs: &[PathBuf],
template: &Sketch,
selection: &Selection,
threshold: usize,
queries: Option<&[KmerMinHash]>,
_keep_sigs: bool,
) -> Result<RevIndex> {
// If threshold is zero, let's merge all queries and save time later
let merged_query = queries.and_then(|qs| Self::merge_queries(qs, threshold));

let collection =
Collection::from_paths(search_sigs)?.select(&Selection::from_template(template))?;
// NOTE(review): `selection` is already a `&Selection`, so `&selection` adds
// a redundant borrow; `new_with_sigs` passes `selection` directly.
let collection = Collection::from_paths(search_sigs)?.select(&selection)?;
let linear = LinearIndex::from_collection(collection.try_into()?);

Ok(linear.index(threshold, merged_query, queries))
}

pub fn from_zipfile<P: AsRef<Path>>(
zipfile: P,
template: &Sketch,
selection: &Selection,
threshold: usize,
queries: Option<&[KmerMinHash]>,
_keep_sigs: bool,
) -> Result<RevIndex> {
// If threshold is zero, let's merge all queries and save time later
let merged_query = queries.and_then(|qs| Self::merge_queries(qs, threshold));

let collection =
Collection::from_zipfile(zipfile)?.select(&Selection::from_template(template))?;
let collection = Collection::from_zipfile(zipfile)?.select(&selection)?;
let linear = LinearIndex::from_collection(collection.try_into()?);

Ok(linear.index(threshold, merged_query, queries))
Expand All @@ -130,15 +128,14 @@ impl RevIndex {

pub fn new_with_sigs(
search_sigs: Vec<Signature>,
template: &Sketch,
selection: &Selection,
threshold: usize,
queries: Option<&[KmerMinHash]>,
) -> Result<RevIndex> {
// If threshold is zero, let's merge all queries and save time later
let merged_query = queries.and_then(|qs| Self::merge_queries(qs, threshold));

let collection =
Collection::from_sigs(search_sigs)?.select(&Selection::from_template(template))?;
let collection = Collection::from_sigs(search_sigs)?.select(selection)?;
let linear = LinearIndex::from_collection(collection.try_into()?);

let idx = linear.index(threshold, merged_query, queries);
Expand Down Expand Up @@ -338,46 +335,33 @@ impl<'a> Index<'a> for RevIndex {
mod test {
use super::*;

use crate::index::revindex::prepare_query;
use crate::sketch::minhash::max_hash_for_scaled;
use crate::Result;

// Builds a RevIndex from two gather test signatures (ksize 31, scaled 10000,
// threshold 0, no queries) and checks that the resulting index holds 3 colors.
//
// NOTE(review): this text comes from a rendered commit diff, so the old
// `template` setup and the new `selection` setup (and both `RevIndex::new`
// calls) appear together; presumably only the `selection` lines are live —
// confirm against the repository.
#[test]
fn revindex_new() -> Result<()> {
let max_hash = max_hash_for_scaled(10000);
let template = Sketch::MinHash(
KmerMinHash::builder()
.num(0u32)
.ksize(31)
.max_hash(max_hash)
.build(),
);
let selection = Selection::builder().ksize(31).scaled(10000).build();
let search_sigs = [
"../../tests/test-data/gather/GCF_000006945.2_ASM694v2_genomic.fna.gz.sig".into(),
"../../tests/test-data/gather/GCF_000007545.1_ASM754v1_genomic.fna.gz.sig".into(),
];
let index = RevIndex::new(&search_sigs, &template, 0, None, false)?;
let index = RevIndex::new(&search_sigs, &selection, 0, None, false)?;
assert_eq!(index.colors.len(), 3);

Ok(())
}

#[test]
fn revindex_many() -> Result<()> {
let max_hash = max_hash_for_scaled(10000);
let template = Sketch::MinHash(
KmerMinHash::builder()
.num(0u32)
.ksize(31)
.max_hash(max_hash)
.build(),
);
let selection = Selection::builder().ksize(31).scaled(10000).build();
let search_sigs = [
"../../tests/test-data/gather/GCF_000006945.2_ASM694v2_genomic.fna.gz.sig".into(),
"../../tests/test-data/gather/GCF_000007545.1_ASM754v1_genomic.fna.gz.sig".into(),
"../../tests/test-data/gather/GCF_000008105.1_ASM810v1_genomic.fna.gz.sig".into(),
];

let index = RevIndex::new(&search_sigs, &template, 0, None, false)?;
let index = RevIndex::new(&search_sigs, &selection, 0, None, false)?;
//dbg!(&index.linear.collection().manifest);
/*
dbg!(&index.colors.colors);
Expand All @@ -399,14 +383,7 @@ mod test {

#[test]
fn revindex_from_sigs() -> Result<()> {
let max_hash = max_hash_for_scaled(10000);
let template = Sketch::MinHash(
KmerMinHash::builder()
.num(0u32)
.ksize(31)
.max_hash(max_hash)
.build(),
);
let selection = Selection::builder().ksize(31).scaled(10000).build();
let search_sigs: Vec<Signature> = [
"../../tests/test-data/gather/GCF_000006945.2_ASM694v2_genomic.fna.gz.sig",
"../../tests/test-data/gather/GCF_000007545.1_ASM754v1_genomic.fna.gz.sig",
Expand All @@ -416,7 +393,7 @@ mod test {
.map(|path| Signature::from_path(path).unwrap().swap_remove(0))
.collect();

let index = RevIndex::new_with_sigs(search_sigs, &template, 0, None)?;
let index = RevIndex::new_with_sigs(search_sigs, &selection, 0, None)?;
/*
dbg!(&index.colors.colors);
0: 86
Expand All @@ -436,18 +413,14 @@ mod test {

#[test]
fn revindex_from_zipstorage() -> Result<()> {
let max_hash = max_hash_for_scaled(100);
let template = Sketch::MinHash(
KmerMinHash::builder()
.num(0u32)
.ksize(19)
.hash_function(crate::encodings::HashFunctions::murmur64_protein)
.max_hash(max_hash)
.build(),
);
let selection = Selection::builder()
.ksize(19)
.scaled(100)
.moltype(crate::encodings::HashFunctions::murmur64_protein)
.build();
let index = RevIndex::from_zipfile(
"../../tests/test-data/prot/protein.zip",
&template,
&selection,
0,
None,
false,
Expand All @@ -460,34 +433,27 @@ mod test {
"../../tests/test-data/prot/protein/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig",
)
.expect("Error processing query")
.swap_remove(0);

let template = Sketch::MinHash(
KmerMinHash::builder()
.num(0u32)
.ksize(57)
.hash_function(crate::encodings::HashFunctions::murmur64_protein)
.max_hash(max_hash)
.build(),
);
.swap_remove(0)
.select(&selection)?;

let mut query_mh = None;
if let Some(Sketch::MinHash(mh)) = query_sig.select_sketch(&template) {
query_mh = Some(mh);
if let Some(q) = prepare_query(query_sig, &selection) {
query_mh = Some(q);
}
let query_mh = query_mh.expect("Couldn't find a compatible MinHash");

let counter_rev = index.counter_for_query(query_mh);
let counter_lin = index.linear.counter_for_query(query_mh);
let counter_rev = index.counter_for_query(&query_mh);
let counter_lin = index.linear.counter_for_query(&query_mh);

let results_rev = index.search(counter_rev, false, 0).unwrap();
let results_linear = index.linear.search(counter_lin, false, 0).unwrap();
assert_eq!(results_rev, results_linear);

let counter_rev = index.counter_for_query(query_mh);
let counter_lin = index.linear.counter_for_query(query_mh);
let counter_rev = index.counter_for_query(&query_mh);
let counter_lin = index.linear.counter_for_query(&query_mh);

let results_rev = index.gather(counter_rev, 0, query_mh).unwrap();
let results_linear = index.linear.gather(counter_lin, 0, query_mh).unwrap();
let results_rev = index.gather(counter_rev, 0, &query_mh).unwrap();
let results_linear = index.linear.gather(counter_lin, 0, &query_mh).unwrap();
assert_eq!(results_rev.len(), 1);
assert_eq!(results_rev, results_linear);

Expand Down
Loading

0 comments on commit c76eec3

Please sign in to comment.