From 30a31cae7a13065508cf81598610f63ff0eb5362 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Fri, 8 Sep 2023 14:25:28 -0700 Subject: [PATCH] add manysearch --- src/manysearch.rs | 32 ++++------- src/mastiff_manygather.rs | 4 +- src/mastiff_manysearch.rs | 8 +-- src/python/tests/test_search.py | 95 ++++++++++++++++++++++++++++----- 4 files changed, 97 insertions(+), 42 deletions(-) diff --git a/src/manysearch.rs b/src/manysearch.rs index 0cd9ae66..1f4811e7 100644 --- a/src/manysearch.rs +++ b/src/manysearch.rs @@ -15,7 +15,9 @@ use std::sync::atomic; use std::sync::atomic::AtomicUsize; use crate::utils::{prepare_query, - load_sketchlist_filenames, load_sketches, SearchResult, csvwriter_thread}; + load_sigpaths_from_zip_or_pathlist, SearchResult, + csvwriter_thread, load_sketches_from_zip_or_pathlist, + ReportType, report_on_sketch_loading}; pub fn manysearch>( querylist: P, @@ -29,29 +31,13 @@ pub fn manysearch>( eprintln!("Reading list of queries from: '{}'", querylist.as_ref().display()); // Load all queries into memory at once. - let querylist_paths = load_sketchlist_filenames(&querylist)?; - - let result = load_sketches(querylist_paths, &template)?; - let (queries, skipped_paths, failed_paths) = result; - - eprintln!("Loaded {} query signatures", queries.len()); - if failed_paths > 0 { - eprintln!("WARNING: {} signature paths failed to load. See error messages above.", - failed_paths); - } - if skipped_paths > 0 { - eprintln!("WARNING: skipped {} paths - no compatible signatures.", - skipped_paths); - } - - if queries.is_empty() { - bail!("No query signatures loaded, exiting."); - } + let queries = load_sketches_from_zip_or_pathlist(querylist, &template, ReportType::Query)?; // Load all _paths_, not signatures, into memory. - eprintln!("Reading search file paths from: '{}'", siglist.as_ref().display()); + let (search_sigs_paths, temp_dir) = load_sigpaths_from_zip_or_pathlist(siglist)?; + // eprintln!("Reading search file paths from: '{}'", siglist.as_ref().display()); - let search_sigs_paths = load_sketchlist_filenames(&siglist)?; + // let search_sigs_paths = load_sketchlist_filenames(&siglist)?; if search_sigs_paths.is_empty() { bail!("No signatures to search loaded, exiting."); } @@ -150,11 +136,11 @@ pub fn manysearch>( let failed_paths = failed_paths.load(atomic::Ordering::SeqCst); if skipped_paths > 0 { - eprintln!("WARNING: skipped {} paths - no compatible signatures.", + eprintln!("WARNING: skipped {} search paths - no compatible signatures.", skipped_paths); } if failed_paths > 0 { - eprintln!("WARNING: {} signature paths failed to load. See error messages above.", + eprintln!("WARNING: {} search paths failed to load. See error messages above.", failed_paths); } diff --git a/src/mastiff_manygather.rs b/src/mastiff_manygather.rs index f7bde849..c6c82bc4 100644 --- a/src/mastiff_manygather.rs +++ b/src/mastiff_manygather.rs @@ -17,7 +17,7 @@ use std::fs::File; use crate::utils::{prepare_query, is_revindex_database, - load_sketchlist_filenames}; + load_sigpaths_from_zip_or_pathlist}; pub fn mastiff_manygather>( @@ -35,7 +35,7 @@ pub fn mastiff_manygather>( println!("Loaded DB"); // Load query paths - let query_paths = load_sketchlist_filenames(&queries_file)?; + let (query_paths, temp_dir) = load_sigpaths_from_zip_or_pathlist(&queries_file)?; // set up a multi-producer, single-consumer channel. let (send, recv) = std::sync::mpsc::sync_channel(rayon::current_num_threads()); diff --git a/src/mastiff_manysearch.rs b/src/mastiff_manysearch.rs index 61c36901..39bf20f3 100644 --- a/src/mastiff_manysearch.rs +++ b/src/mastiff_manysearch.rs @@ -13,7 +13,7 @@ use std::sync::atomic; use std::sync::atomic::AtomicUsize; use crate::utils::{prepare_query, is_revindex_database, - load_sketchlist_filenames, SearchResult, csvwriter_thread}; + load_sigpaths_from_zip_or_pathlist, SearchResult, csvwriter_thread}; pub fn mastiff_manysearch>( @@ -32,7 +32,7 @@ pub fn mastiff_manysearch>( println!("Loaded DB"); // Load query paths - let query_paths = load_sketchlist_filenames(&queries_file)?; + let (query_paths, temp_dir) = load_sigpaths_from_zip_or_pathlist(&queries_file)?; // if query_paths is empty, exit with error if query_paths.is_empty() { @@ -138,11 +138,11 @@ pub fn mastiff_manysearch>( let failed_paths = failed_paths.load(atomic::Ordering::SeqCst); if skipped_paths > 0 { - eprintln!("WARNING: skipped {} paths - no compatible signatures.", + eprintln!("WARNING: skipped {} query paths - no compatible signatures.", skipped_paths); } if failed_paths > 0 { - eprintln!("WARNING: {} signature paths failed to load. See error messages above.", + eprintln!("WARNING: {} query paths failed to load. See error messages above.", failed_paths); } diff --git a/src/python/tests/test_search.py b/src/python/tests/test_search.py index 44b22364..9a8bffea 100644 --- a/src/python/tests/test_search.py +++ b/src/python/tests/test_search.py @@ -23,13 +23,19 @@ def test_installed(runtmp): assert 'usage: manysearch' in runtmp.last_result.err +def zip_siglist(runtmp, siglist, db): + runtmp.sourmash('sig', 'cat', siglist, + '-o', db) + return db + def index_siglist(runtmp, siglist, db): # build index runtmp.sourmash('scripts', 'index', siglist, '-o', db) return db -def test_simple(runtmp): +@pytest.mark.parametrize("zip_query", [False, True]) +def test_simple(runtmp, zip_query): # test basic execution! query_list = runtmp.output('query.txt') against_list = runtmp.output('against.txt') @@ -43,6 +49,9 @@ def test_simple(runtmp): output = runtmp.output('out.csv') + if zip_query: + query_list = zip_siglist(runtmp, query_list, runtmp.output('query.zip')) + runtmp.sourmash('scripts', 'manysearch', query_list, against_list, '-o', output) assert os.path.exists(output) @@ -88,7 +97,8 @@ def test_simple(runtmp): assert intersect_hashes == 2529 -def test_simple_indexed(runtmp): +@pytest.mark.parametrize("zip_query", [False, True]) +def test_simple_indexed(runtmp, zip_query): # test basic execution! query_list = runtmp.output('query.txt') against_list = runtmp.output('against.txt') @@ -104,6 +114,9 @@ def test_simple_indexed(runtmp): against_list = index_siglist(runtmp, against_list, runtmp.output('db')) + if zip_query: + query_list = zip_siglist(runtmp, query_list, runtmp.output('query.zip')) + runtmp.sourmash('scripts', 'manysearch', query_list, against_list, '-o', output) assert os.path.exists(output) @@ -138,7 +151,8 @@ def test_simple_indexed(runtmp): @pytest.mark.parametrize("indexed", [False, True]) -def test_simple_with_cores(runtmp, capfd, indexed): +@pytest.mark.parametrize("zip_query", [False, True]) +def test_simple_with_cores(runtmp, capfd, indexed, zip_query): # test basic execution with -c argument (that it runs, at least!) query_list = runtmp.output('query.txt') against_list = runtmp.output('against.txt') @@ -153,6 +167,9 @@ def test_simple_with_cores(runtmp, capfd, indexed): if indexed: against_list = index_siglist(runtmp, against_list, runtmp.output('db')) + if zip_query: + query_list = zip_siglist(runtmp, query_list, runtmp.output('query.zip')) + output = runtmp.output('out.csv') runtmp.sourmash('scripts', 'manysearch', query_list, against_list, @@ -168,7 +185,8 @@ def test_simple_with_cores(runtmp, capfd, indexed): @pytest.mark.parametrize("indexed", [False, True]) -def test_simple_threshold(runtmp, indexed): +@pytest.mark.parametrize("zip_query", [False, True]) +def test_simple_threshold(runtmp, indexed, zip_query): # test with a simple threshold => only 3 results query_list = runtmp.output('query.txt') against_list = runtmp.output('against.txt') @@ -183,6 +201,9 @@ def test_simple_threshold(runtmp, indexed): if indexed: against_list = index_siglist(runtmp, against_list, runtmp.output('db')) + if zip_query: + query_list = zip_siglist(runtmp, query_list, runtmp.output('query.zip')) + output = runtmp.output('out.csv') runtmp.sourmash('scripts', 'manysearch', query_list, against_list, @@ -194,7 +215,8 @@ def test_simple_threshold(runtmp, indexed): @pytest.mark.parametrize("indexed", [False, True]) -def test_missing_query(runtmp, capfd, indexed): +@pytest.mark.parametrize("zip_query", [False, True]) +def test_missing_query(runtmp, capfd, indexed, zip_query): # test with a missing query list query_list = runtmp.output('query.txt') against_list = runtmp.output('against.txt') @@ -209,6 +231,9 @@ def test_missing_query(runtmp, capfd, indexed): if indexed: against_list = index_siglist(runtmp, against_list, runtmp.output('db')) + if zip_query: + query_list = runtmp.output('query.zip') + output = runtmp.output('out.csv') with pytest.raises(utils.SourmashCommandFailed): @@ -270,7 +295,35 @@ def test_bad_query_2(runtmp, capfd, indexed): print(captured.err) assert "WARNING: could not load sketches from path 'no-exist'" in captured.err - assert "WARNING: 1 signature paths failed to load. See error messages above." in captured.err + assert "WARNING: 1 query paths failed to load. See error messages above." in captured.err + + +def test_bad_query_3(runtmp, capfd): + # test with a bad query (a .sig.gz file renamed as zip file) + against_list = runtmp.output('against.txt') + + sig2 = get_test_data('2.fa.sig.gz') + sig47 = get_test_data('47.fa.sig.gz') + sig63 = get_test_data('63.fa.sig.gz') + + query_zip = runtmp.output('query.zip') + # cp sig2 into query_zip + with open(query_zip, 'wb') as fp: + with open(sig2, 'rb') as fp2: + fp.write(fp2.read()) + + make_file_list(against_list, [sig2, sig47, sig63]) + + output = runtmp.output('out.csv') + + with pytest.raises(utils.SourmashCommandFailed): + runtmp.sourmash('scripts', 'multisearch', query_zip, against_list, + '-o', output) + + captured = capfd.readouterr() + print(captured.err) + + assert 'Error: invalid Zip archive: Could not find central directory end' in captured.err @pytest.mark.parametrize("indexed", [False, True]) @@ -342,7 +395,7 @@ def test_bad_against_2(runtmp, capfd): print(captured.err) assert "WARNING: could not load sketches from path 'no-exist'" in captured.err - assert "WARNING: 1 signature paths failed to load. See error messages above." in captured.err + assert "WARNING: 1 search paths failed to load. See error messages above." in captured.err @pytest.mark.parametrize("indexed", [False, True]) @@ -371,7 +424,8 @@ def test_empty_query(runtmp, indexed): @pytest.mark.parametrize("indexed", [False, True]) -def test_nomatch_query(runtmp, capfd, indexed): +@pytest.mark.parametrize("zip_query", [False, True]) +def test_nomatch_query(runtmp, capfd, indexed, zip_query): # test a non-matching (diff ksize) in query; do we get warning message? query_list = runtmp.output('query.txt') against_list = runtmp.output('against.txt') @@ -385,6 +439,8 @@ def test_nomatch_query(runtmp, capfd, indexed): make_file_list(against_list, [sig2, sig47, sig63]) output = runtmp.output('out.csv') + if zip_query: + query_list = zip_siglist(runtmp, query_list, runtmp.output('query.zip')) if indexed: against_list = index_siglist(runtmp, against_list, runtmp.output('db')) @@ -396,7 +452,7 @@ def test_nomatch_query(runtmp, capfd, indexed): captured = capfd.readouterr() print(captured.err) - assert 'WARNING: skipped 1 paths - no compatible signatures.' in captured.err + assert 'WARNING: skipped 1 query paths - no compatible signatures.' in captured.err @pytest.mark.parametrize("indexed", [False, True]) @@ -431,8 +487,9 @@ def test_load_only_one_bug(runtmp, capfd, indexed): assert not 'WARNING: no compatible sketches in path ' in captured.err +@pytest.mark.parametrize("zip_query", [False, True]) @pytest.mark.parametrize("indexed", [False, True]) -def test_load_only_one_bug_as_query(runtmp, capfd, indexed): +def test_load_only_one_bug_as_query(runtmp, capfd, indexed, zip_query): # check that we behave properly when presented with multiple query # sketches in one file, with only one matching. query_list = runtmp.output('query.txt') @@ -447,23 +504,31 @@ def test_load_only_one_bug_as_query(runtmp, capfd, indexed): make_file_list(query_list, [sig1_all]) make_file_list(against_list, [sig1_k31]) + output = runtmp.output('out.csv') + if indexed: against_list = index_siglist(runtmp, against_list, runtmp.output('db')) - output = runtmp.output('out.csv') + if zip_query: + query_list = zip_siglist(runtmp, query_list, runtmp.output('query.zip')) runtmp.sourmash('scripts', 'manysearch', query_list, against_list, '-o', output) + assert os.path.exists(output) captured = capfd.readouterr() print(captured.err) + print(runtmp.last_result.out) assert not 'WARNING: skipped 1 paths - no compatible signatures.' in captured.err + + # this fails with zip input, because they become individual signature files when zipped assert not 'WARNING: no compatible sketches in path ' in captured.err +@pytest.mark.parametrize("zip_query", [False, True]) @pytest.mark.parametrize("indexed", [False, True]) -def test_md5(runtmp, indexed): +def test_md5(runtmp, indexed, zip_query): # test that md5s match what was in the original files, not downsampled etc. query_list = runtmp.output('query.txt') against_list = runtmp.output('against.txt') @@ -475,9 +540,13 @@ def test_md5(runtmp, indexed): make_file_list(query_list, [sig2, sig47, sig63]) make_file_list(against_list, [sig2, sig47, sig63]) + output = runtmp.output('out.csv') + if indexed: against_list = index_siglist(runtmp, against_list, runtmp.output('db')) - output = runtmp.output('out.csv') + + if zip_query: + query_list = zip_siglist(runtmp, query_list, runtmp.output('query.zip')) runtmp.sourmash('scripts', 'manysearch', query_list, against_list, '-o', output)