From 005775be413fe045e0808a4cdd74bb54a79f430c Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sun, 8 Sep 2024 17:27:17 -0700 Subject: [PATCH] MRG: clean up index to use `MultiCollection` (#451) * try making index work with standard code * kinda working * fmt * refactor * clear up the tests * refactor/clean up * cargo fmt * add tests for index warning & error --- src/index.rs | 95 ++++++++++-------------- src/python/tests/test_fastgather.py | 4 + src/python/tests/test_fastmultigather.py | 11 ++- src/python/tests/test_index.py | 59 +++++++++++++-- src/utils/multicollection.rs | 33 ++++++++ 5 files changed, 141 insertions(+), 61 deletions(-) diff --git a/src/index.rs b/src/index.rs index 00202743..c303b09b 100644 --- a/src/index.rs +++ b/src/index.rs @@ -1,12 +1,9 @@ -use anyhow::Context; -use camino::Utf8PathBuf as PathBuf; use sourmash::index::revindex::RevIndex; use sourmash::index::revindex::RevIndexOps; use sourmash::prelude::*; -use std::fs::File; -use std::io::{BufRead, BufReader}; use std::path::Path; +use crate::utils::{load_collection, ReportType}; use sourmash::collection::{Collection, CollectionSet}; pub fn index>( @@ -14,68 +11,56 @@ pub fn index>( selection: &Selection, output: P, colors: bool, - _allow_failed_sigpaths: bool, + allow_failed_sigpaths: bool, use_internal_storage: bool, ) -> Result<(), Box> { - println!("Loading siglist"); + eprintln!("Loading sketches from {}", siglist); - let collection = match siglist { - x if x.ends_with(".zip") => Collection::from_zipfile(x)?, - x if x.ends_with(".sig") || x.ends_with(".sig.gz") => { - let signatures = Signature::from_path(&x) - .with_context(|| format!("Failed to load signatures from: '{}'", x))?; + let multi = match load_collection( + &siglist, + selection, + ReportType::General, + allow_failed_sigpaths, + ) { + Ok(multi) => multi, + Err(err) => return Err(err.into()), + }; + eprintln!("Found {} sketches total.", multi.len()); - Collection::from_sigs(signatures).with_context(|| { - format!( - "Loaded signatures but failed to load as collection: '{}'", - x - ) - })? + // Try to convert it into a Collection and then CollectionSet. + let collection = match Collection::try_from(multi.clone()) { + // conversion worked! + Ok(c) => { + let cs: CollectionSet = c.select(selection)?.try_into()?; + Ok(cs) } - _ => { - let file = File::open(siglist.clone()) - .with_context(|| format!("Failed to open pathlist file: '{}'", siglist))?; - - let reader = BufReader::new(file); - - // load list of paths - let lines: Vec<_> = reader - .lines() - .filter_map(|line| match line { - Ok(path) => { - let mut filename = PathBuf::new(); - filename.push(path); - Some(filename) - } - Err(_err) => None, - }) - .collect(); - - if lines.is_empty() { - return Err(anyhow::anyhow!("Signatures failed to load. Exiting.").into()); + // conversion failed; can we/should we load it into memory? + Err(_) => { + if use_internal_storage { + eprintln!("WARNING: loading all sketches into memory in order to index."); + eprintln!("See 'index' documentation for details."); + let c: Collection = multi.load_all_sigs(selection)?; + let cs: CollectionSet = c.try_into()?; + Ok(cs) } else { - match Collection::from_paths(&lines) { - Ok(collection) => collection, - Err(err) => { - eprintln!("Error in loading from '{}': {}", siglist, err); - return Err(anyhow::anyhow!("Signatures failed to load. Exiting.").into()); - } - } + Err( + anyhow::anyhow!("cannot index this type of collection with external storage") + .into(), + ) } } }; - let collection: CollectionSet = collection.select(selection)?.try_into()?; + match collection { + Ok(collection) => { + eprintln!("Indexing {} sketches.", collection.len()); + let mut index = RevIndex::create(output.as_ref(), collection, colors)?; - if collection.is_empty() { - Err(anyhow::anyhow!("Signatures failed to load. Exiting.").into()) - } else { - eprintln!("Indexing {} sketches.", collection.len()); - let mut index = RevIndex::create(output.as_ref(), collection, colors)?; - - if use_internal_storage { - index.internalize_storage()?; + if use_internal_storage { + index.internalize_storage()?; + } + Ok(()) } - Ok(()) + Err(e) => Err(e), } } diff --git a/src/python/tests/test_fastgather.py b/src/python/tests/test_fastgather.py index f444818f..ae26254f 100644 --- a/src/python/tests/test_fastgather.py +++ b/src/python/tests/test_fastgather.py @@ -16,6 +16,8 @@ def test_installed(runtmp): def test_simple(runtmp, capfd, indexed_query, indexed_against, zip_against, toggle_internal_storage): + if toggle_internal_storage == '--no-internal-storage': + raise pytest.xfail("not implemented") # test basic execution! query = get_test_data('SRR606249.sig.gz') against_list = runtmp.output('against.txt') @@ -60,6 +62,8 @@ def test_simple(runtmp, capfd, indexed_query, indexed_against, zip_against, togg def test_simple_with_prefetch(runtmp, zip_against, indexed, toggle_internal_storage): + if toggle_internal_storage == '--no-internal-storage': + raise pytest.xfail("not implemented") # test basic execution! query = get_test_data('SRR606249.sig.gz') against_list = runtmp.output('against.txt') diff --git a/src/python/tests/test_fastmultigather.py b/src/python/tests/test_fastmultigather.py index 6a5d99e3..f06f190c 100644 --- a/src/python/tests/test_fastmultigather.py +++ b/src/python/tests/test_fastmultigather.py @@ -203,6 +203,9 @@ def test_simple_read_manifests(runtmp): def test_simple_indexed(runtmp, zip_query, toggle_internal_storage): + if toggle_internal_storage == '--no-internal-storage': + raise pytest.xfail("not implemented") + # test basic execution! query = get_test_data('SRR606249.sig.gz') sig2 = get_test_data('2.fa.sig.gz') @@ -239,6 +242,8 @@ def test_simple_indexed(runtmp, zip_query, toggle_internal_storage): def test_simple_indexed_query_manifest(runtmp, toggle_internal_storage): + if toggle_internal_storage == '--no-internal-storage': + raise pytest.xfail("not implemented") # test basic execution! query = get_test_data('SRR606249.sig.gz') sig2 = get_test_data('2.fa.sig.gz') @@ -273,6 +278,8 @@ def test_simple_indexed_query_manifest(runtmp, toggle_internal_storage): def test_missing_querylist(runtmp, capfd, indexed, zip_query, toggle_internal_storage): + if toggle_internal_storage == '--no-internal-storage': + raise pytest.xfail("not implemented") # test missing querylist query_list = runtmp.output('query.txt') against_list = runtmp.output('against.txt') @@ -1174,7 +1181,9 @@ def test_rocksdb_no_internal_storage_gather_fails(runtmp, capfd): "47.fa.sig.gz", "63.fa.sig.gz"]) - # index! + # index! CTB, note this will fail currently. + raise pytest.xfail("not implemented") + runtmp.sourmash('scripts', 'index', against_list, '--no-internal-storage', '-o', 'subdir/against.rocksdb') diff --git a/src/python/tests/test_index.py b/src/python/tests/test_index.py index 6f59425e..638913f4 100644 --- a/src/python/tests/test_index.py +++ b/src/python/tests/test_index.py @@ -16,6 +16,9 @@ def test_installed(runtmp): def test_index(runtmp, toggle_internal_storage): + if toggle_internal_storage == "--no-internal-storage": + raise pytest.xfail("not implemented currently") + # test basic index! siglist = runtmp.output('db-sigs.txt') @@ -35,6 +38,49 @@ def test_index(runtmp, toggle_internal_storage): assert 'index is done' in runtmp.last_result.err +def test_index_warning_message(runtmp, capfd): + # test basic index when it has to load things into memory - see #451. + siglist = runtmp.output('db-sigs.txt') + + sig2 = get_test_data('2.fa.sig.gz') + sig47 = get_test_data('47.fa.sig.gz') + sig63 = get_test_data('63.fa.sig.gz') + + make_file_list(siglist, [sig2, sig47, sig63]) + + output = runtmp.output('db.rocksdb') + + runtmp.sourmash('scripts', 'index', siglist, '-o', output) + assert os.path.exists(output) + print(runtmp.last_result.err) + + assert 'index is done' in runtmp.last_result.err + captured = capfd.readouterr() + print(captured.err) + assert "WARNING: loading all sketches into memory in order to index." in captured.err + + +def test_index_error_message(runtmp, capfd): + # test basic index when it errors out b/c can't load + siglist = runtmp.output('db-sigs.txt') + + sig2 = get_test_data('2.fa.sig.gz') + sig47 = get_test_data('47.fa.sig.gz') + sig63 = get_test_data('63.fa.sig.gz') + + make_file_list(siglist, [sig2, sig47, sig63]) + + output = runtmp.output('db.rocksdb') + + with pytest.raises(utils.SourmashCommandFailed): + runtmp.sourmash('scripts', 'index', siglist, '-o', output, + '--no-internal-storage') + + captured = capfd.readouterr() + print(captured.err) + assert "cannot index this type of collection with external storage" in captured.err + + def test_index_protein(runtmp, toggle_internal_storage): sigs = get_test_data('protein.zip') output = runtmp.output('db.rocksdb') @@ -82,10 +128,9 @@ def test_index_missing_siglist(runtmp, capfd, toggle_internal_storage): captured = capfd.readouterr() print(captured.err) - assert 'Failed to open pathlist file:' in captured.err + assert 'Error: No such file or directory: ' in captured.err -@pytest.mark.xfail(reason="not implemented yet") def test_index_sig(runtmp, capfd, toggle_internal_storage): # test index with a .sig.gz file instead of pathlist # (should work now) @@ -101,7 +146,6 @@ def test_index_sig(runtmp, capfd, toggle_internal_storage): assert 'index is done' in runtmp.last_result.err -@pytest.mark.xfail(reason="not implemented yet") def test_index_manifest(runtmp, capfd, toggle_internal_storage): # test index with a manifest file sig2 = get_test_data('2.fa.sig.gz') @@ -118,7 +162,6 @@ def test_index_manifest(runtmp, capfd, toggle_internal_storage): assert 'index is done' in runtmp.last_result.err -@pytest.mark.xfail(reason="needs more work") def test_index_bad_siglist_2(runtmp, capfd): # test with a bad siglist (containing a missing file) against_list = runtmp.output('against.txt') @@ -139,7 +182,6 @@ def test_index_bad_siglist_2(runtmp, capfd): assert "WARNING: could not load sketches from path 'no-exist'" in captured.err -@pytest.mark.xfail(reason="needs more work") def test_index_empty_siglist(runtmp, capfd): # test empty siglist file siglist = runtmp.output('db-sigs.txt') @@ -347,6 +389,8 @@ def test_index_zipfile_bad(runtmp, capfd): def test_index_check(runtmp, toggle_internal_storage): + if toggle_internal_storage == "--no-internal-storage": + raise pytest.xfail("not implemented currently") # test check index siglist = runtmp.output('db-sigs.txt') @@ -367,6 +411,8 @@ def test_index_check(runtmp, toggle_internal_storage): def test_index_check_quick(runtmp, toggle_internal_storage): + if toggle_internal_storage == "--no-internal-storage": + raise pytest.xfail("not implemented currently") # test check index siglist = runtmp.output('db-sigs.txt') @@ -387,6 +433,9 @@ def test_index_check_quick(runtmp, toggle_internal_storage): def test_index_subdir(runtmp, toggle_internal_storage): + if toggle_internal_storage == "--no-internal-storage": + raise pytest.xfail("not implemented currently") + # test basic index & output to subdir siglist = runtmp.output('db-sigs.txt') diff --git a/src/utils/multicollection.rs b/src/utils/multicollection.rs index 5606243c..da0fedca 100644 --- a/src/utils/multicollection.rs +++ b/src/utils/multicollection.rs @@ -282,6 +282,26 @@ impl MultiCollection { .collect(); MultiCollection::new(colls, self.contains_revindex) } + + pub fn load_all_sigs(self, selection: &Selection) -> Result { + let all_sigs: Vec = self + .par_iter() + .filter_map(|(coll, _idx, record)| match coll.sig_from_record(record) { + Ok(sig) => { + let sig = sig.clone().select(selection).ok()?; + Some(Signature::from(sig)) + } + Err(_) => { + eprintln!( + "FAILED to load sketch from '{}'", + record.internal_location() + ); + None + } + }) + .collect(); + Ok(Collection::from_sigs(all_sigs)?) + } } impl Select for MultiCollection { @@ -318,6 +338,19 @@ impl From> for MultiCollection { } } +// Extract a single Collection from a MultiCollection, if possible +impl TryFrom for Collection { + type Error = &'static str; + + fn try_from(multi: MultiCollection) -> Result { + if multi.collections.len() == 1 { + Ok(multi.collections.into_iter().next().unwrap()) + } else { + Err("More than one Collection in this MultiCollection; cannot convert") + } + } +} + /// Track a name/minhash. pub struct SmallSignature { pub location: String,