Skip to content

Commit

Permalink
use recursion to load paths into a MultiCollection => mf support
Browse files Browse the repository at this point in the history
  • Loading branch information
ctb committed Sep 8, 2024
1 parent 89b1c08 commit 88d009a
Show file tree
Hide file tree
Showing 2 changed files with 78 additions and 12 deletions.
29 changes: 29 additions & 0 deletions src/python/tests/test_multisearch.py
Original file line number Diff line number Diff line change
Expand Up @@ -256,6 +256,35 @@ def test_simple_ani_list_of_zips(runtmp):
assert max_ani == 0.9772


def test_simple_ani_list_of_csv(runtmp):
    """Run `multisearch --ani` with pathlists whose entries are CSV manifests."""
    query_list = runtmp.output('query.txt')
    against_list = runtmp.output('against.txt')

    # (zipped signature file, CSV manifest to build from it)
    sources = [
        ('2.sig.zip', 'sig2.mf.csv'),
        ('47.sig.zip', 'sig47.mf.csv'),
        ('63.sig.zip', 'sig63.mf.csv'),
    ]
    sig_paths = [get_test_data(zipname) for zipname, _ in sources]
    manifests = [mf_name for _, mf_name in sources]

    # Build one CSV manifest per zipfile with `sig collect`.
    for sig_path, mf_name in zip(sig_paths, manifests):
        runtmp.sourmash('sig', 'collect', sig_path, '-o', mf_name, '-F', 'csv')

    # The query and against pathlists both name the same three manifests.
    for pathlist in (query_list, against_list):
        make_file_list(pathlist, list(manifests))

    output = runtmp.output('out.csv')
    runtmp.sourmash('scripts', 'multisearch', query_list, against_list,
                    '-o', output, '--ani')
    assert os.path.exists(output)

    results = pandas.read_csv(output)
    assert len(results) == 5

    # Print the result rows for inspection.
    print(results.to_dict(orient='index'))


def test_simple_ani_standalone_manifest(runtmp):
# test basic execution of a standalone manifest
against_list = runtmp.output('against.sig.zip')
Expand Down
61 changes: 49 additions & 12 deletions src/utils/multicollection.rs
Original file line number Diff line number Diff line change
Expand Up @@ -37,17 +37,26 @@ impl MultiCollection {
}

// Turn a set of paths into a single MultiCollection, plus a failure count.
fn load_set_of_paths(paths: HashSet<String>) -> (Vec<Collection>, usize) {
fn load_set_of_paths(paths: HashSet<String>) -> (MultiCollection, usize) {
let n_failed = AtomicUsize::new(0);

// could just use a variant of load_collection here?
let colls: Vec<_> = paths
let colls: Vec<MultiCollection> = paths
.par_iter()
.filter_map(|iloc| match iloc {
// load from zipfile
x if x.ends_with(".zip") => {
debug!("loading sigs from zipfile {}", x);
Some(Collection::from_zipfile(x).unwrap())
let coll = Collection::from_zipfile(x).unwrap();
Some(MultiCollection::from(coll))
}
// load from CSV
x if x.ends_with(".csv") => {
debug!("vec from pathlist of standalone manifests!");

let x: String = x.into();
let utf_path: &Path = x.as_str().into();
MultiCollection::from_standalone_manifest(utf_path).ok()
}
// load from (by default) a sigfile
_ => {
Expand Down Expand Up @@ -77,7 +86,7 @@ impl MultiCollection {
.build(),
),
);
Some(collection)
Some(MultiCollection::from(collection))
}
None => {
eprintln!("WARNING: could not load sketches from path '{}'", iloc);
Expand All @@ -90,7 +99,7 @@ impl MultiCollection {
.collect();

let n_failed = n_failed.load(atomic::Ordering::SeqCst);
(colls, n_failed)
(MultiCollection::from(colls), n_failed)
}

/// Build from a standalone manifest. Note: the tricky bit here
Expand All @@ -112,12 +121,9 @@ impl MultiCollection {
let ilocs: HashSet<_> = manifest.internal_locations().map(String::from).collect();
let (colls, _n_failed) = MultiCollection::load_set_of_paths(ilocs);

let colls = colls
.par_iter()
.map(|c| c.clone().intersect_manifest(&manifest))
.collect();
let multi = colls.intersect_manifest(&manifest);

Ok(MultiCollection::new(colls, false))
Ok(multi)
}
}

Expand Down Expand Up @@ -174,9 +180,9 @@ impl MultiCollection {
})
.collect();

let (colls, n_failed) = MultiCollection::load_set_of_paths(lines);
let (multi, n_failed) = MultiCollection::load_set_of_paths(lines);

Ok((MultiCollection::new(colls, false), n_failed))
Ok((multi, n_failed))
}

// Load from a sig file
Expand Down Expand Up @@ -267,6 +273,15 @@ impl MultiCollection {

Ok(sketchinfo)
}

/// Intersect every contained collection with `manifest`.
///
/// Consumes `self` and returns a new `MultiCollection` built from the
/// result of calling `Collection::intersect_manifest` on each collection.
/// The `contains_revindex` flag is carried over unchanged.
fn intersect_manifest(self, manifest: &Manifest) -> MultiCollection {
    // Copy the flag out first so `self.collections` can be moved below.
    let contains_revindex = self.contains_revindex;
    // `self` is owned, so move each collection into the parallel iterator
    // instead of borrowing and cloning it.
    let colls = self
        .collections
        .into_par_iter()
        .map(|c| c.intersect_manifest(manifest))
        .collect();
    MultiCollection::new(colls, contains_revindex)
}
}

impl Select for MultiCollection {
Expand All @@ -281,6 +296,28 @@ impl Select for MultiCollection {
}
}

// Convert a single Collection into a MultiCollection
impl From<Collection> for MultiCollection {
fn from(coll: Collection) -> Self {
// @CTB check if revindex
MultiCollection::new(vec![coll], false)
}
}

/// Merge several `MultiCollection`s into one.
///
/// The collections of every input are concatenated in input order. The
/// merged `contains_revindex` flag is `true` if it was set on any input,
/// so the flag is no longer dropped by merging.
impl From<Vec<MultiCollection>> for MultiCollection {
    fn from(multi: Vec<MultiCollection>) -> Self {
        // Read the flags before `multi` is consumed below.
        let contains_revindex = multi.iter().any(|mc| mc.contains_revindex);
        let collections: Vec<Collection> = multi
            .into_iter()
            .flat_map(|mc| mc.collections)
            .collect();
        MultiCollection::new(collections, contains_revindex)
    }
}

/// Track a name/minhash.
pub struct SmallSignature {
pub location: String,
Expand Down

0 comments on commit 88d009a

Please sign in to comment.