From 711d3c006a9fedbbce16faed76070b8effc2fa54 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sun, 8 Sep 2024 12:37:55 -0400 Subject: [PATCH 1/8] try making index work with standard code --- src/index.rs | 69 ++++++++++-------------------------- src/utils/multicollection.rs | 33 +++++++++++++++++ 2 files changed, 51 insertions(+), 51 deletions(-) diff --git a/src/index.rs b/src/index.rs index 00202743..cc052b72 100644 --- a/src/index.rs +++ b/src/index.rs @@ -8,68 +8,33 @@ use std::io::{BufRead, BufReader}; use std::path::Path; use sourmash::collection::{Collection, CollectionSet}; +use crate::utils::{ load_collection, ReportType }; +use crate::utils::multicollection::MultiCollection; pub fn index>( siglist: String, selection: &Selection, output: P, colors: bool, - _allow_failed_sigpaths: bool, + allow_failed_sigpaths: bool, use_internal_storage: bool, ) -> Result<(), Box> { println!("Loading siglist"); - let collection = match siglist { - x if x.ends_with(".zip") => Collection::from_zipfile(x)?, - x if x.ends_with(".sig") || x.ends_with(".sig.gz") => { - let signatures = Signature::from_path(&x) - .with_context(|| format!("Failed to load signatures from: '{}'", x))?; - - Collection::from_sigs(signatures).with_context(|| { - format!( - "Loaded signatures but failed to load as collection: '{}'", - x - ) - })? - } - _ => { - let file = File::open(siglist.clone()) - .with_context(|| format!("Failed to open pathlist file: '{}'", siglist))?; - - let reader = BufReader::new(file); - - // load list of paths - let lines: Vec<_> = reader - .lines() - .filter_map(|line| match line { - Ok(path) => { - let mut filename = PathBuf::new(); - filename.push(path); - Some(filename) - } - Err(_err) => None, - }) - .collect(); - - if lines.is_empty() { - return Err(anyhow::anyhow!("Signatures failed to load. Exiting.").into()); - } else { - match Collection::from_paths(&lines) { - Ok(collection) => collection, - Err(err) => { - eprintln!("Error in loading from '{}': {}", siglist, err); - return Err(anyhow::anyhow!("Signatures failed to load. Exiting.").into()); - } - } - } + let multi: MultiCollection = load_collection(&siglist, selection, + ReportType::General, + allow_failed_sigpaths).unwrap(); + if multi.len() == 1 || use_internal_storage { + let mut collection: CollectionSet; + if multi.len() == 1 { + let coll: Collection = Collection::try_from(multi).unwrap(); + collection = coll.select(selection)?.try_into().unwrap(); + } else { // use_internal_storage + // @CTB warn: loading all the things + let coll = multi.load_all_sigs(selection).unwrap(); + // @CTB multiple selects... + collection = coll.select(selection)?.try_into()?; } - }; - - let collection: CollectionSet = collection.select(selection)?.try_into()?; - - if collection.is_empty() { - Err(anyhow::anyhow!("Signatures failed to load. Exiting.").into()) - } else { eprintln!("Indexing {} sketches.", collection.len()); let mut index = RevIndex::create(output.as_ref(), collection, colors)?; @@ -77,5 +42,7 @@ pub fn index>( index.internalize_storage()?; } Ok(()) + } else { + Err(anyhow::anyhow!("Signatures failed to load. Exiting.").into()) } } diff --git a/src/utils/multicollection.rs b/src/utils/multicollection.rs index 5606243c..da0fedca 100644 --- a/src/utils/multicollection.rs +++ b/src/utils/multicollection.rs @@ -282,6 +282,26 @@ impl MultiCollection { .collect(); MultiCollection::new(colls, self.contains_revindex) } + + pub fn load_all_sigs(self, selection: &Selection) -> Result { + let all_sigs: Vec = self + .par_iter() + .filter_map(|(coll, _idx, record)| match coll.sig_from_record(record) { + Ok(sig) => { + let sig = sig.clone().select(selection).ok()?; + Some(Signature::from(sig)) + } + Err(_) => { + eprintln!( + "FAILED to load sketch from '{}'", + record.internal_location() + ); + None + } + }) + .collect(); + Ok(Collection::from_sigs(all_sigs)?) + } } impl Select for MultiCollection { @@ -318,6 +338,19 @@ impl From> for MultiCollection { } } +// Extract a single Collection from a MultiCollection, if possible +impl TryFrom for Collection { + type Error = &'static str; + + fn try_from(multi: MultiCollection) -> Result { + if multi.collections.len() == 1 { + Ok(multi.collections.into_iter().next().unwrap()) + } else { + Err("More than one Collection in this MultiCollection; cannot convert") + } + } +} + /// Track a name/minhash. pub struct SmallSignature { pub location: String, From 561ed3c9560979f25a831b93a1b7df24091be7a5 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sun, 8 Sep 2024 17:42:20 -0400 Subject: [PATCH 2/8] kinda working --- src/index.rs | 54 +++++++++++++++++++++++++++++++--------------------- 1 file changed, 32 insertions(+), 22 deletions(-) diff --git a/src/index.rs b/src/index.rs index cc052b72..1f6754eb 100644 --- a/src/index.rs +++ b/src/index.rs @@ -1,12 +1,9 @@ -use anyhow::Context; -use camino::Utf8PathBuf as PathBuf; use sourmash::index::revindex::RevIndex; use sourmash::index::revindex::RevIndexOps; use sourmash::prelude::*; -use std::fs::File; -use std::io::{BufRead, BufReader}; use std::path::Path; +use sourmash::Error; use sourmash::collection::{Collection, CollectionSet}; use crate::utils::{ load_collection, ReportType }; use crate::utils::multicollection::MultiCollection; @@ -24,25 +21,38 @@ pub fn index>( let multi: MultiCollection = load_collection(&siglist, selection, ReportType::General, allow_failed_sigpaths).unwrap(); - if multi.len() == 1 || use_internal_storage { - let mut collection: CollectionSet; - if multi.len() == 1 { - let coll: Collection = Collection::try_from(multi).unwrap(); - collection = coll.select(selection)?.try_into().unwrap(); - } else { // use_internal_storage - // @CTB warn: loading all the things - let coll = multi.load_all_sigs(selection).unwrap(); - // @CTB multiple selects... - collection = coll.select(selection)?.try_into()?; - } - eprintln!("Indexing {} sketches.", collection.len()); - let mut index = RevIndex::create(output.as_ref(), collection, colors)?; + eprintln!("loaded - {}", multi.len()); + + let coll: Result = multi.clone().try_into(); - if use_internal_storage { - index.internalize_storage()?; + let collection = match coll { + // if we can convert it, we have a single Collection; use that! + Ok(coll) => { + Ok(CollectionSet::from(coll.select(selection).unwrap().try_into()?)) + }, + // alt, our only chance is to load everything into memory. + Err(_) => { + if use_internal_storage { + // @CTB warn: loading all the things + let coll = multi.load_all_sigs(selection).unwrap(); + // @CTB multiple selects... + Ok(CollectionSet::from(coll.select(selection).unwrap().try_into()?)) + } else { + Err(anyhow::anyhow!("failed. Exiting.").into()) + } } - Ok(()) - } else { - Err(anyhow::anyhow!("Signatures failed to load. Exiting.").into()) + }; + + match collection { + Ok(collection) => { + eprintln!("Indexing {} sketches.", collection.len()); + let mut index = RevIndex::create(output.as_ref(), collection, colors)?; + + if use_internal_storage { + index.internalize_storage()?; + } + Ok(()) + }, + Err(e) => Err(e), } } From a4cc9fff3fe4c82f38e773158e76e97efe1db59c Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sun, 8 Sep 2024 18:08:33 -0400 Subject: [PATCH 3/8] fmt --- src/index.rs | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/src/index.rs b/src/index.rs index 1f6754eb..c5d9d3f8 100644 --- a/src/index.rs +++ b/src/index.rs @@ -3,10 +3,9 @@ use sourmash::index::revindex::RevIndexOps; use sourmash::prelude::*; use std::path::Path; -use sourmash::Error; -use sourmash::collection::{Collection, CollectionSet}; -use crate::utils::{ load_collection, ReportType }; use crate::utils::multicollection::MultiCollection; +use crate::utils::{load_collection, ReportType}; +use sourmash::collection::{Collection, CollectionSet}; pub fn index>( siglist: String, @@ -18,25 +17,31 @@ pub fn index>( ) -> Result<(), Box> { println!("Loading siglist"); - let multi: MultiCollection = load_collection(&siglist, selection, - ReportType::General, - allow_failed_sigpaths).unwrap(); + let multi: MultiCollection = load_collection( + &siglist, + selection, + ReportType::General, + allow_failed_sigpaths, + ) + .unwrap(); eprintln!("loaded - {}", multi.len()); let coll: Result = multi.clone().try_into(); let collection = match coll { // if we can convert it, we have a single Collection; use that! - Ok(coll) => { - Ok(CollectionSet::from(coll.select(selection).unwrap().try_into()?)) - }, + Ok(coll) => Ok(CollectionSet::from( + coll.select(selection).unwrap().try_into()?, + )), // alt, our only chance is to load everything into memory. Err(_) => { if use_internal_storage { // @CTB warn: loading all the things let coll = multi.load_all_sigs(selection).unwrap(); // @CTB multiple selects... - Ok(CollectionSet::from(coll.select(selection).unwrap().try_into()?)) + Ok(CollectionSet::from( + coll.select(selection).unwrap().try_into()?, + )) } else { Err(anyhow::anyhow!("failed. Exiting.").into()) } @@ -52,7 +57,7 @@ pub fn index>( index.internalize_storage()?; } Ok(()) - }, + } Err(e) => Err(e), } } From 9567f9b211c03f587c127c53cac8738ade0e2c0e Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sun, 8 Sep 2024 18:17:34 -0400 Subject: [PATCH 4/8] refactor --- src/index.rs | 37 +++++++++++++++++++------------------ 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/src/index.rs b/src/index.rs index c5d9d3f8..ef765cd2 100644 --- a/src/index.rs +++ b/src/index.rs @@ -3,7 +3,6 @@ use sourmash::index::revindex::RevIndexOps; use sourmash::prelude::*; use std::path::Path; -use crate::utils::multicollection::MultiCollection; use crate::utils::{load_collection, ReportType}; use sourmash::collection::{Collection, CollectionSet}; @@ -15,33 +14,35 @@ pub fn index>( allow_failed_sigpaths: bool, use_internal_storage: bool, ) -> Result<(), Box> { - println!("Loading siglist"); + eprintln!("Loading sketches from {}", siglist); - let multi: MultiCollection = load_collection( + let multi = match load_collection( &siglist, selection, ReportType::General, allow_failed_sigpaths, - ) - .unwrap(); + ) { + Ok(multi) => multi, + Err(err) => return Err(err.into()), + }; eprintln!("loaded - {}", multi.len()); - let coll: Result = multi.clone().try_into(); - - let collection = match coll { - // if we can convert it, we have a single Collection; use that! - Ok(coll) => Ok(CollectionSet::from( - coll.select(selection).unwrap().try_into()?, - )), - // alt, our only chance is to load everything into memory. + // Try to convert it into a Collection and then CollectionSet. + let collection = match Collection::try_from(multi.clone()) { + // conversion worked! + Ok(c) => { + let cs: CollectionSet = c.select(selection).unwrap().try_into()?; + Ok(cs) + } + // conversion failed; can we/should we load it into memory? + // @CTB bool. Err(_) => { if use_internal_storage { - // @CTB warn: loading all the things - let coll = multi.load_all_sigs(selection).unwrap(); + let c = multi.load_all_sigs(selection).unwrap(); // @CTB multiple selects... - Ok(CollectionSet::from( - coll.select(selection).unwrap().try_into()?, - )) + let c = c.select(selection).unwrap(); + let cs: CollectionSet = c.try_into()?; + Ok(cs) } else { Err(anyhow::anyhow!("failed. Exiting.").into()) } From 39580a70c255153fe97c5ada272921792281a8fb Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sun, 8 Sep 2024 18:30:24 -0500 Subject: [PATCH 5/8] clear up the tests --- src/python/tests/test_fastgather.py | 4 ++++ src/python/tests/test_fastmultigather.py | 11 ++++++++++- src/python/tests/test_index.py | 16 +++++++++++----- 3 files changed, 25 insertions(+), 6 deletions(-) diff --git a/src/python/tests/test_fastgather.py b/src/python/tests/test_fastgather.py index f444818f..ae26254f 100644 --- a/src/python/tests/test_fastgather.py +++ b/src/python/tests/test_fastgather.py @@ -16,6 +16,8 @@ def test_installed(runtmp): def test_simple(runtmp, capfd, indexed_query, indexed_against, zip_against, toggle_internal_storage): + if toggle_internal_storage == '--no-internal-storage': + raise pytest.xfail("not implemented") # test basic execution! query = get_test_data('SRR606249.sig.gz') against_list = runtmp.output('against.txt') @@ -60,6 +62,8 @@ def test_simple(runtmp, capfd, indexed_query, indexed_against, zip_against, togg def test_simple_with_prefetch(runtmp, zip_against, indexed, toggle_internal_storage): + if toggle_internal_storage == '--no-internal-storage': + raise pytest.xfail("not implemented") # test basic execution! query = get_test_data('SRR606249.sig.gz') against_list = runtmp.output('against.txt') diff --git a/src/python/tests/test_fastmultigather.py b/src/python/tests/test_fastmultigather.py index 6a5d99e3..f06f190c 100644 --- a/src/python/tests/test_fastmultigather.py +++ b/src/python/tests/test_fastmultigather.py @@ -203,6 +203,9 @@ def test_simple_read_manifests(runtmp): def test_simple_indexed(runtmp, zip_query, toggle_internal_storage): + if toggle_internal_storage == '--no-internal-storage': + raise pytest.xfail("not implemented") + # test basic execution! query = get_test_data('SRR606249.sig.gz') sig2 = get_test_data('2.fa.sig.gz') @@ -239,6 +242,8 @@ def test_simple_indexed(runtmp, zip_query, toggle_internal_storage): def test_simple_indexed_query_manifest(runtmp, toggle_internal_storage): + if toggle_internal_storage == '--no-internal-storage': + raise pytest.xfail("not implemented") # test basic execution! query = get_test_data('SRR606249.sig.gz') sig2 = get_test_data('2.fa.sig.gz') @@ -273,6 +278,8 @@ def test_simple_indexed_query_manifest(runtmp, toggle_internal_storage): def test_missing_querylist(runtmp, capfd, indexed, zip_query, toggle_internal_storage): + if toggle_internal_storage == '--no-internal-storage': + raise pytest.xfail("not implemented") # test missing querylist query_list = runtmp.output('query.txt') against_list = runtmp.output('against.txt') @@ -1174,7 +1181,9 @@ def test_rocksdb_no_internal_storage_gather_fails(runtmp, capfd): "47.fa.sig.gz", "63.fa.sig.gz"]) - # index! + # index! CTB, note this will fail currently. + raise pytest.xfail("not implemented") + runtmp.sourmash('scripts', 'index', against_list, '--no-internal-storage', '-o', 'subdir/against.rocksdb') diff --git a/src/python/tests/test_index.py b/src/python/tests/test_index.py index 6f59425e..02765f50 100644 --- a/src/python/tests/test_index.py +++ b/src/python/tests/test_index.py @@ -16,6 +16,9 @@ def test_installed(runtmp): def test_index(runtmp, toggle_internal_storage): + if toggle_internal_storage == "--no-internal-storage": + raise pytest.xfail("not implemented currently") + # test basic index! siglist = runtmp.output('db-sigs.txt') @@ -82,10 +85,9 @@ def test_index_missing_siglist(runtmp, capfd, toggle_internal_storage): captured = capfd.readouterr() print(captured.err) - assert 'Failed to open pathlist file:' in captured.err + assert 'Error: No such file or directory: ' in captured.err -@pytest.mark.xfail(reason="not implemented yet") def test_index_sig(runtmp, capfd, toggle_internal_storage): # test index with a .sig.gz file instead of pathlist # (should work now) @@ -101,7 +103,6 @@ def test_index_sig(runtmp, capfd, toggle_internal_storage): assert 'index is done' in runtmp.last_result.err -@pytest.mark.xfail(reason="not implemented yet") def test_index_manifest(runtmp, capfd, toggle_internal_storage): # test index with a manifest file sig2 = get_test_data('2.fa.sig.gz') @@ -118,7 +119,6 @@ def test_index_manifest(runtmp, capfd, toggle_internal_storage): assert 'index is done' in runtmp.last_result.err -@pytest.mark.xfail(reason="needs more work") def test_index_bad_siglist_2(runtmp, capfd): # test with a bad siglist (containing a missing file) against_list = runtmp.output('against.txt') @@ -139,7 +139,6 @@ def test_index_bad_siglist_2(runtmp, capfd): assert "WARNING: could not load sketches from path 'no-exist'" in captured.err -@pytest.mark.xfail(reason="needs more work") def test_index_empty_siglist(runtmp, capfd): # test empty siglist file siglist = runtmp.output('db-sigs.txt') @@ -347,6 +346,8 @@ def test_index_zipfile_bad(runtmp, capfd): def test_index_check(runtmp, toggle_internal_storage): + if toggle_internal_storage == "--no-internal-storage": + raise pytest.xfail("not implemented currently") # test check index siglist = runtmp.output('db-sigs.txt') @@ -367,6 +368,8 @@ def test_index_check(runtmp, toggle_internal_storage): def test_index_check_quick(runtmp, toggle_internal_storage): + if toggle_internal_storage == "--no-internal-storage": + raise pytest.xfail("not implemented currently") # test check index siglist = runtmp.output('db-sigs.txt') @@ -387,6 +390,9 @@ def test_index_check_quick(runtmp, toggle_internal_storage): def test_index_subdir(runtmp, toggle_internal_storage): + if toggle_internal_storage == "--no-internal-storage": + raise pytest.xfail("not implemented currently") + # test basic index & output to subdir siglist = runtmp.output('db-sigs.txt') From 1a01d9325e342d382d3191244bafedfca7418b31 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sun, 8 Sep 2024 18:31:11 -0500 Subject: [PATCH 6/8] refactor/clean up --- src/index.rs | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/src/index.rs b/src/index.rs index ef765cd2..fb834ddd 100644 --- a/src/index.rs +++ b/src/index.rs @@ -25,26 +25,27 @@ pub fn index>( Ok(multi) => multi, Err(err) => return Err(err.into()), }; - eprintln!("loaded - {}", multi.len()); + eprintln!("Found {} sketches total.", multi.len()); // Try to convert it into a Collection and then CollectionSet. let collection = match Collection::try_from(multi.clone()) { // conversion worked! Ok(c) => { - let cs: CollectionSet = c.select(selection).unwrap().try_into()?; + let cs: CollectionSet = c.select(selection)?.try_into()?; Ok(cs) } // conversion failed; can we/should we load it into memory? - // @CTB bool. Err(_) => { if use_internal_storage { - let c = multi.load_all_sigs(selection).unwrap(); - // @CTB multiple selects... - let c = c.select(selection).unwrap(); + // @CTB test warning + eprintln!("WARNING: loading all sketches into memory in order to index."); + eprintln!("See 'index' documentation for details."); + let c: Collection = multi.load_all_sigs(selection)?; let cs: CollectionSet = c.try_into()?; Ok(cs) } else { - Err(anyhow::anyhow!("failed. Exiting.").into()) + // @CTB test error message + Err(anyhow::anyhow!("ERROR: cannot load this type of file with 'index'. Exiting.").into()) } } }; From 0fc35fa8acca070d34381832ea776ebe41af30fa Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sun, 8 Sep 2024 18:54:09 -0500 Subject: [PATCH 7/8] cargo fmt --- src/index.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/index.rs b/src/index.rs index fb834ddd..f898c97c 100644 --- a/src/index.rs +++ b/src/index.rs @@ -45,7 +45,10 @@ pub fn index>( Ok(cs) } else { // @CTB test error message - Err(anyhow::anyhow!("ERROR: cannot load this type of file with 'index'. Exiting.").into()) + Err( + anyhow::anyhow!("ERROR: cannot load this type of file with 'index'. Exiting.") + .into(), + ) } } }; From e85ebed83beb310a41910509b7f2dad142d09853 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sun, 8 Sep 2024 19:22:11 -0500 Subject: [PATCH 8/8] add tests for index warning & error --- src/index.rs | 4 +--- src/python/tests/test_index.py | 43 ++++++++++++++++++++++++++++++++++ 2 files changed, 44 insertions(+), 3 deletions(-) diff --git a/src/index.rs b/src/index.rs index f898c97c..c303b09b 100644 --- a/src/index.rs +++ b/src/index.rs @@ -37,16 +37,14 @@ pub fn index>( // conversion failed; can we/should we load it into memory? Err(_) => { if use_internal_storage { - // @CTB test warning eprintln!("WARNING: loading all sketches into memory in order to index."); eprintln!("See 'index' documentation for details."); let c: Collection = multi.load_all_sigs(selection)?; let cs: CollectionSet = c.try_into()?; Ok(cs) } else { - // @CTB test error message Err( - anyhow::anyhow!("ERROR: cannot load this type of file with 'index'. Exiting.") + anyhow::anyhow!("cannot index this type of collection with external storage") .into(), ) } diff --git a/src/python/tests/test_index.py b/src/python/tests/test_index.py index 02765f50..638913f4 100644 --- a/src/python/tests/test_index.py +++ b/src/python/tests/test_index.py @@ -38,6 +38,49 @@ def test_index(runtmp, toggle_internal_storage): assert 'index is done' in runtmp.last_result.err +def test_index_warning_message(runtmp, capfd): + # test basic index when it has to load things into memory - see #451. + siglist = runtmp.output('db-sigs.txt') + + sig2 = get_test_data('2.fa.sig.gz') + sig47 = get_test_data('47.fa.sig.gz') + sig63 = get_test_data('63.fa.sig.gz') + + make_file_list(siglist, [sig2, sig47, sig63]) + + output = runtmp.output('db.rocksdb') + + runtmp.sourmash('scripts', 'index', siglist, '-o', output) + assert os.path.exists(output) + print(runtmp.last_result.err) + + assert 'index is done' in runtmp.last_result.err + captured = capfd.readouterr() + print(captured.err) + assert "WARNING: loading all sketches into memory in order to index." in captured.err + + +def test_index_error_message(runtmp, capfd): + # test basic index when it errors out b/c can't load + siglist = runtmp.output('db-sigs.txt') + + sig2 = get_test_data('2.fa.sig.gz') + sig47 = get_test_data('47.fa.sig.gz') + sig63 = get_test_data('63.fa.sig.gz') + + make_file_list(siglist, [sig2, sig47, sig63]) + + output = runtmp.output('db.rocksdb') + + with pytest.raises(utils.SourmashCommandFailed): + runtmp.sourmash('scripts', 'index', siglist, '-o', output, + '--no-internal-storage') + + captured = capfd.readouterr() + print(captured.err) + assert "cannot index this type of collection with external storage" in captured.err + + def test_index_protein(runtmp, toggle_internal_storage): sigs = get_test_data('protein.zip') output = runtmp.output('db.rocksdb')