From 07470c9bdafc084cfe7a9d00b9bd02dac84c0327 Mon Sep 17 00:00:00 2001 From: Tessa Pierce Ward Date: Wed, 13 Sep 2023 12:56:12 -0700 Subject: [PATCH] MRG: enable zipfile loading for manysearch (#99) --- src/manysearch.rs | 39 +++++------- src/mastiff_manysearch.rs | 15 +++-- src/python/tests/test_index.py | 22 +++++++ src/python/tests/test_search.py | 103 +++++++++++++++++++++++++++----- 4 files changed, 134 insertions(+), 45 deletions(-) diff --git a/src/manysearch.rs b/src/manysearch.rs index 0cd9ae66..37e473d3 100644 --- a/src/manysearch.rs +++ b/src/manysearch.rs @@ -15,7 +15,9 @@ use std::sync::atomic; use std::sync::atomic::AtomicUsize; use crate::utils::{prepare_query, - load_sketchlist_filenames, load_sketches, SearchResult, csvwriter_thread}; + load_sigpaths_from_zip_or_pathlist, SearchResult, + csvwriter_thread, load_sketches_from_zip_or_pathlist, + ReportType}; pub fn manysearch>( querylist: P, @@ -29,29 +31,12 @@ pub fn manysearch>( eprintln!("Reading list of queries from: '{}'", querylist.as_ref().display()); // Load all queries into memory at once. - let querylist_paths = load_sketchlist_filenames(&querylist)?; - - let result = load_sketches(querylist_paths, &template)?; - let (queries, skipped_paths, failed_paths) = result; - - eprintln!("Loaded {} query signatures", queries.len()); - if failed_paths > 0 { - eprintln!("WARNING: {} signature paths failed to load. See error messages above.", - failed_paths); - } - if skipped_paths > 0 { - eprintln!("WARNING: skipped {} paths - no compatible signatures.", - skipped_paths); - } - - if queries.is_empty() { - bail!("No query signatures loaded, exiting."); - } + let queries = load_sketches_from_zip_or_pathlist(querylist, &template, ReportType::Query)?; // Load all _paths_, not signatures, into memory. - eprintln!("Reading search file paths from: '{}'", siglist.as_ref().display()); + let siglist_name = siglist.as_ref().to_string_lossy().to_string(); + let (search_sigs_paths, _temp_dir) = load_sigpaths_from_zip_or_pathlist(siglist)?; - let search_sigs_paths = load_sketchlist_filenames(&siglist)?; if search_sigs_paths.is_empty() { bail!("No signatures to search loaded, exiting."); } @@ -114,8 +99,12 @@ pub fn manysearch>( } } } else { - eprintln!("WARNING: no compatible sketches in path '{}'", - filename.display()); + // for reading zips, this is likely not a useful warning and + // would show up too often (every sig is stored as individual file). + if !siglist_name.ends_with(".zip") { + eprintln!("WARNING: no compatible sketches in path '{}'", + filename.display()); + } let _ = skipped_paths.fetch_add(1, atomic::Ordering::SeqCst); } Some(results) @@ -150,11 +139,11 @@ pub fn manysearch>( let failed_paths = failed_paths.load(atomic::Ordering::SeqCst); if skipped_paths > 0 { - eprintln!("WARNING: skipped {} paths - no compatible signatures.", + eprintln!("WARNING: skipped {} search paths - no compatible signatures.", skipped_paths); } if failed_paths > 0 { - eprintln!("WARNING: {} signature paths failed to load. See error messages above.", + eprintln!("WARNING: {} search paths failed to load. See error messages above.", failed_paths); } diff --git a/src/mastiff_manysearch.rs b/src/mastiff_manysearch.rs index 61c36901..55db1b22 100644 --- a/src/mastiff_manysearch.rs +++ b/src/mastiff_manysearch.rs @@ -13,7 +13,7 @@ use std::sync::atomic; use std::sync::atomic::AtomicUsize; use crate::utils::{prepare_query, is_revindex_database, - load_sketchlist_filenames, SearchResult, csvwriter_thread}; + load_sigpaths_from_zip_or_pathlist, SearchResult, csvwriter_thread}; pub fn mastiff_manysearch>( @@ -32,7 +32,8 @@ pub fn mastiff_manysearch>( println!("Loaded DB"); // Load query paths - let query_paths = load_sketchlist_filenames(&queries_file)?; + let queryfile_name = queries_file.as_ref().to_string_lossy().to_string(); + let (query_paths, temp_dir) = load_sigpaths_from_zip_or_pathlist(&queries_file)?; // if query_paths is empty, exit with error if query_paths.is_empty() { @@ -92,8 +93,12 @@ pub fn mastiff_manysearch>( } } } else { - eprintln!("WARNING: no compatible sketches in path '{}'", + // for reading zips, this is likely not a useful warning and + // would show up too often (every sig is stored as individual file). + if !queryfile_name.ends_with(".zip") { + eprintln!("WARNING: no compatible sketches in path '{}'", filename.display()); + } let _ = skipped_paths.fetch_add(1, atomic::Ordering::SeqCst); } if results.is_empty() { @@ -138,11 +143,11 @@ pub fn mastiff_manysearch>( let failed_paths = failed_paths.load(atomic::Ordering::SeqCst); if skipped_paths > 0 { - eprintln!("WARNING: skipped {} paths - no compatible signatures.", + eprintln!("WARNING: skipped {} query paths - no compatible signatures.", skipped_paths); } if failed_paths > 0 { - eprintln!("WARNING: {} signature paths failed to load. See error messages above.", + eprintln!("WARNING: {} query paths failed to load. See error messages above.", failed_paths); } diff --git a/src/python/tests/test_index.py b/src/python/tests/test_index.py index 66e08e9f..e5869c03 100644 --- a/src/python/tests/test_index.py +++ b/src/python/tests/test_index.py @@ -154,6 +154,28 @@ def test_index_zipfile(runtmp, capfd): assert 'Found 3 filepaths' in captured.err +def test_index_zipfile_bad(runtmp, capfd): + # test with a bad input zipfile (a .sig.gz file renamed as zip file) + sig2 = get_test_data('2.fa.sig.gz') + + query_zip = runtmp.output('query.zip') + # cp sig2 into query_zip + with open(query_zip, 'wb') as fp: + with open(sig2, 'rb') as fp2: + fp.write(fp2.read()) + + output = runtmp.output('out.csv') + + with pytest.raises(utils.SourmashCommandFailed): + runtmp.sourmash('scripts', 'index', query_zip, + '-o', output) + + captured = capfd.readouterr() + print(captured.err) + + assert 'Error: invalid Zip archive: Could not find central directory end' in captured.err + + def test_index_check(runtmp): # test check index siglist = runtmp.output('db-sigs.txt') diff --git a/src/python/tests/test_search.py b/src/python/tests/test_search.py index 44b22364..19cf28df 100644 --- a/src/python/tests/test_search.py +++ b/src/python/tests/test_search.py @@ -23,13 +23,20 @@ def test_installed(runtmp): assert 'usage: manysearch' in runtmp.last_result.err +def zip_siglist(runtmp, siglist, db): + runtmp.sourmash('sig', 'cat', siglist, + '-o', db) + return db + def index_siglist(runtmp, siglist, db): # build index runtmp.sourmash('scripts', 'index', siglist, '-o', db) return db -def test_simple(runtmp): +@pytest.mark.parametrize("zip_query", [False, True]) +@pytest.mark.parametrize("zip_against", [False, True]) +def test_simple(runtmp, zip_query, zip_against): # test basic execution! query_list = runtmp.output('query.txt') against_list = runtmp.output('against.txt') @@ -43,6 +50,11 @@ def test_simple(runtmp): output = runtmp.output('out.csv') + if zip_query: + query_list = zip_siglist(runtmp, query_list, runtmp.output('query.zip')) + if zip_against: + against_list = zip_siglist(runtmp, against_list, runtmp.output('against.zip')) + runtmp.sourmash('scripts', 'manysearch', query_list, against_list, '-o', output) assert os.path.exists(output) @@ -88,7 +100,8 @@ def test_simple(runtmp): assert intersect_hashes == 2529 -def test_simple_indexed(runtmp): +@pytest.mark.parametrize("zip_query", [False, True]) +def test_simple_indexed(runtmp, zip_query): # test basic execution! query_list = runtmp.output('query.txt') against_list = runtmp.output('against.txt') @@ -104,6 +117,9 @@ def test_simple_indexed(runtmp): against_list = index_siglist(runtmp, against_list, runtmp.output('db')) + if zip_query: + query_list = zip_siglist(runtmp, query_list, runtmp.output('query.zip')) + runtmp.sourmash('scripts', 'manysearch', query_list, against_list, '-o', output) assert os.path.exists(output) @@ -138,7 +154,8 @@ def test_simple_indexed(runtmp): @pytest.mark.parametrize("indexed", [False, True]) -def test_simple_with_cores(runtmp, capfd, indexed): +@pytest.mark.parametrize("zip_query", [False, True]) +def test_simple_with_cores(runtmp, capfd, indexed, zip_query): # test basic execution with -c argument (that it runs, at least!) query_list = runtmp.output('query.txt') against_list = runtmp.output('against.txt') @@ -153,6 +170,9 @@ def test_simple_with_cores(runtmp, capfd, indexed): if indexed: against_list = index_siglist(runtmp, against_list, runtmp.output('db')) + if zip_query: + query_list = zip_siglist(runtmp, query_list, runtmp.output('query.zip')) + output = runtmp.output('out.csv') runtmp.sourmash('scripts', 'manysearch', query_list, against_list, @@ -168,7 +188,8 @@ def test_simple_with_cores(runtmp, capfd, indexed): @pytest.mark.parametrize("indexed", [False, True]) -def test_simple_threshold(runtmp, indexed): +@pytest.mark.parametrize("zip_query", [False, True]) +def test_simple_threshold(runtmp, indexed, zip_query): # test with a simple threshold => only 3 results query_list = runtmp.output('query.txt') against_list = runtmp.output('against.txt') @@ -183,6 +204,9 @@ def test_simple_threshold(runtmp, indexed): if indexed: against_list = index_siglist(runtmp, against_list, runtmp.output('db')) + if zip_query: + query_list = zip_siglist(runtmp, query_list, runtmp.output('query.zip')) + output = runtmp.output('out.csv') runtmp.sourmash('scripts', 'manysearch', query_list, against_list, @@ -194,7 +218,8 @@ def test_simple_threshold(runtmp, indexed): @pytest.mark.parametrize("indexed", [False, True]) -def test_missing_query(runtmp, capfd, indexed): +@pytest.mark.parametrize("zip_query", [False, True]) +def test_missing_query(runtmp, capfd, indexed, zip_query): # test with a missing query list query_list = runtmp.output('query.txt') against_list = runtmp.output('against.txt') @@ -209,6 +234,9 @@ def test_missing_query(runtmp, capfd, indexed): if indexed: against_list = index_siglist(runtmp, against_list, runtmp.output('db')) + if zip_query: + query_list = runtmp.output('query.zip') + output = runtmp.output('out.csv') with pytest.raises(utils.SourmashCommandFailed): @@ -270,7 +298,35 @@ def test_bad_query_2(runtmp, capfd, indexed): print(captured.err) assert "WARNING: could not load sketches from path 'no-exist'" in captured.err - assert "WARNING: 1 signature paths failed to load. See error messages above." in captured.err + assert "WARNING: 1 query paths failed to load. See error messages above." in captured.err + + +def test_bad_query_3(runtmp, capfd): + # test with a bad query (a .sig.gz file renamed as zip file) + against_list = runtmp.output('against.txt') + + sig2 = get_test_data('2.fa.sig.gz') + sig47 = get_test_data('47.fa.sig.gz') + sig63 = get_test_data('63.fa.sig.gz') + + query_zip = runtmp.output('query.zip') + # cp sig2 into query_zip + with open(query_zip, 'wb') as fp: + with open(sig2, 'rb') as fp2: + fp.write(fp2.read()) + + make_file_list(against_list, [sig2, sig47, sig63]) + + output = runtmp.output('out.csv') + + with pytest.raises(utils.SourmashCommandFailed): + runtmp.sourmash('scripts', 'multisearch', query_zip, against_list, + '-o', output) + + captured = capfd.readouterr() + print(captured.err) + + assert 'Error: invalid Zip archive: Could not find central directory end' in captured.err @pytest.mark.parametrize("indexed", [False, True]) @@ -342,7 +398,7 @@ def test_bad_against_2(runtmp, capfd): print(captured.err) assert "WARNING: could not load sketches from path 'no-exist'" in captured.err - assert "WARNING: 1 signature paths failed to load. See error messages above." in captured.err + assert "WARNING: 1 search paths failed to load. See error messages above." in captured.err @pytest.mark.parametrize("indexed", [False, True]) @@ -371,7 +427,8 @@ def test_empty_query(runtmp, indexed): @pytest.mark.parametrize("indexed", [False, True]) -def test_nomatch_query(runtmp, capfd, indexed): +@pytest.mark.parametrize("zip_query", [False, True]) +def test_nomatch_query(runtmp, capfd, indexed, zip_query): # test a non-matching (diff ksize) in query; do we get warning message? query_list = runtmp.output('query.txt') against_list = runtmp.output('against.txt') @@ -385,6 +442,8 @@ def test_nomatch_query(runtmp, capfd, indexed): make_file_list(against_list, [sig2, sig47, sig63]) output = runtmp.output('out.csv') + if zip_query: + query_list = zip_siglist(runtmp, query_list, runtmp.output('query.zip')) if indexed: against_list = index_siglist(runtmp, against_list, runtmp.output('db')) @@ -396,11 +455,12 @@ def test_nomatch_query(runtmp, capfd, indexed): captured = capfd.readouterr() print(captured.err) - assert 'WARNING: skipped 1 paths - no compatible signatures.' in captured.err + assert 'WARNING: skipped 1 query paths - no compatible signatures.' in captured.err +@pytest.mark.parametrize("zip_against", [False, True]) @pytest.mark.parametrize("indexed", [False, True]) -def test_load_only_one_bug(runtmp, capfd, indexed): +def test_load_only_one_bug(runtmp, capfd, indexed, zip_against): # check that we behave properly when presented with multiple against # sketches query_list = runtmp.output('query.txt') @@ -417,7 +477,9 @@ def test_load_only_one_bug(runtmp, capfd, indexed): output = runtmp.output('out.csv') - if indexed: + if zip_against: + against_list = zip_siglist(runtmp, against_list, runtmp.output('against.zip')) + elif indexed: against_list = index_siglist(runtmp, against_list, runtmp.output('db')) runtmp.sourmash('scripts', 'manysearch', query_list, against_list, @@ -431,8 +493,9 @@ def test_load_only_one_bug(runtmp, capfd, indexed): assert not 'WARNING: no compatible sketches in path ' in captured.err +@pytest.mark.parametrize("zip_query", [False, True]) @pytest.mark.parametrize("indexed", [False, True]) -def test_load_only_one_bug_as_query(runtmp, capfd, indexed): +def test_load_only_one_bug_as_query(runtmp, capfd, indexed, zip_query): # check that we behave properly when presented with multiple query # sketches in one file, with only one matching. query_list = runtmp.output('query.txt') @@ -447,23 +510,29 @@ def test_load_only_one_bug_as_query(runtmp, capfd, indexed): make_file_list(query_list, [sig1_all]) make_file_list(against_list, [sig1_k31]) + output = runtmp.output('out.csv') + if indexed: against_list = index_siglist(runtmp, against_list, runtmp.output('db')) - output = runtmp.output('out.csv') + if zip_query: + query_list = zip_siglist(runtmp, query_list, runtmp.output('query.zip')) runtmp.sourmash('scripts', 'manysearch', query_list, against_list, '-o', output) + assert os.path.exists(output) captured = capfd.readouterr() print(captured.err) + print(runtmp.last_result.out) assert not 'WARNING: skipped 1 paths - no compatible signatures.' in captured.err assert not 'WARNING: no compatible sketches in path ' in captured.err +@pytest.mark.parametrize("zip_query", [False, True]) @pytest.mark.parametrize("indexed", [False, True]) -def test_md5(runtmp, indexed): +def test_md5(runtmp, indexed, zip_query): # test that md5s match what was in the original files, not downsampled etc. query_list = runtmp.output('query.txt') against_list = runtmp.output('against.txt') @@ -475,9 +544,13 @@ def test_md5(runtmp, indexed): make_file_list(query_list, [sig2, sig47, sig63]) make_file_list(against_list, [sig2, sig47, sig63]) + output = runtmp.output('out.csv') + if indexed: against_list = index_siglist(runtmp, against_list, runtmp.output('db')) - output = runtmp.output('out.csv') + + if zip_query: + query_list = zip_siglist(runtmp, query_list, runtmp.output('query.zip')) runtmp.sourmash('scripts', 'manysearch', query_list, against_list, '-o', output)