From 072bb343595338e04ea19a6685d09a032e32eb5d Mon Sep 17 00:00:00 2001 From: Tessa Pierce Ward Date: Wed, 13 Sep 2023 15:36:11 -0700 Subject: [PATCH] MRG: add zip reading to `fastmultigather` (#106) --- src/fastmultigather.rs | 33 +-- src/mastiff_manygather.rs | 11 +- src/python/tests/test_multigather.py | 419 ++++++++++++++++++++------- src/utils.rs | 1 - 4 files changed, 333 insertions(+), 131 deletions(-) diff --git a/src/fastmultigather.rs b/src/fastmultigather.rs index 1338899d..9c262ef6 100644 --- a/src/fastmultigather.rs +++ b/src/fastmultigather.rs @@ -14,7 +14,7 @@ use std::sync::atomic::AtomicUsize; use std::collections::BinaryHeap; use crate::utils::{prepare_query, write_prefetch, PrefetchResult, - load_sketchlist_filenames, load_sketches, consume_query_by_gather}; + consume_query_by_gather, load_sigpaths_from_zip_or_pathlist, load_sketches_from_zip_or_pathlist, ReportType}; pub fn fastmultigather + std::fmt::Debug + Clone>( query_filenames: P, @@ -32,14 +32,10 @@ pub fn fastmultigather + std::fmt::Debug + Clone>( let template = Sketch::MinHash(template_mh); // load the list of query paths - let querylist_paths = load_sketchlist_filenames(&query_filenames)?; + let queryfile_name = query_filenames.as_ref().to_string_lossy().to_string(); + let (querylist_paths, _temp_dir) = load_sigpaths_from_zip_or_pathlist(&query_filenames)?; println!("Loaded {} sig paths in querylist", querylist_paths.len()); - // build the list of paths to match against. 
- println!("Loading matchlist"); - let matchlist_paths = load_sketchlist_filenames(&matchlist_filename)?; - println!("Loaded {} sig paths in matchlist", matchlist_paths.len()); - let threshold_hashes : u64 = { let x = threshold_bp / scaled; if x > 0 { @@ -52,22 +48,7 @@ pub fn fastmultigather + std::fmt::Debug + Clone>( println!("threshold overlap: {} {}", threshold_hashes, threshold_bp); // Load all the against sketches - let result = load_sketches(matchlist_paths, &template)?; - let (sketchlist, skipped_paths, failed_paths) = result; - - eprintln!("Loaded {} sketches to search against.", sketchlist.len()); - if failed_paths > 0 { - eprintln!("WARNING: {} search paths failed to load. See error messages above.", - failed_paths); - } - if skipped_paths > 0 { - eprintln!("WARNING: skipped {} search paths - no compatible signatures.", - skipped_paths); - } - - if sketchlist.is_empty() { - bail!("No sketches loaded to search against!?") - } + let sketchlist = load_sketches_from_zip_or_pathlist(&matchlist_filename, &template, ReportType::Against)?; // Iterate over all queries => do prefetch and gather! 
let processed_queries = AtomicUsize::new(0); @@ -89,8 +70,10 @@ pub fn fastmultigather + std::fmt::Debug + Clone>( let mm = prepare_query(&sigs, &template, &location); if mm.is_none() { - eprintln!("WARNING: no compatible sketches in path '{}'", - q.display()); + if !queryfile_name.ends_with(".zip") { + eprintln!("WARNING: no compatible sketches in path '{}'", + q.display()); + } let _ = skipped_paths.fetch_add(1, atomic::Ordering::SeqCst); } mm diff --git a/src/mastiff_manygather.rs b/src/mastiff_manygather.rs index f7bde849..36d1bc11 100644 --- a/src/mastiff_manygather.rs +++ b/src/mastiff_manygather.rs @@ -17,7 +17,7 @@ use std::fs::File; use crate::utils::{prepare_query, is_revindex_database, - load_sketchlist_filenames}; + load_sigpaths_from_zip_or_pathlist}; pub fn mastiff_manygather>( @@ -35,7 +35,8 @@ pub fn mastiff_manygather>( println!("Loaded DB"); // Load query paths - let query_paths = load_sketchlist_filenames(&queries_file)?; + let queryfile_name = queries_file.as_ref().to_string_lossy().to_string(); + let (query_paths, _temp_dir) = load_sigpaths_from_zip_or_pathlist(&queries_file)?; // set up a multi-producer, single-consumer channel. 
let (send, recv) = std::sync::mpsc::sync_channel(rayon::current_num_threads()); @@ -110,8 +111,10 @@ pub fn mastiff_manygather>( eprintln!("Error gathering matches: {:?}", matches.err()); } } else { - eprintln!("WARNING: no compatible sketches in path '{}'", - filename.display()); + if !queryfile_name.ends_with(".zip") { + eprintln!("WARNING: no compatible sketches in path '{}'", + filename.display()); + } let _ = skipped_paths.fetch_add(1, atomic::Ordering::SeqCst); } if results.is_empty() { diff --git a/src/python/tests/test_multigather.py b/src/python/tests/test_multigather.py index b041b594..d3ef21f9 100644 --- a/src/python/tests/test_multigather.py +++ b/src/python/tests/test_multigather.py @@ -32,8 +32,13 @@ def index_siglist(runtmp, siglist, db): '-o', db) return db -@pytest.mark.parametrize('indexed', [False, True]) -def test_simple(runtmp, indexed): +def zip_siglist(runtmp, siglist, db): + runtmp.sourmash('sig', 'cat', siglist, + '-o', db) + return db + +@pytest.mark.parametrize('zip_against', [False, True]) +def test_simple(runtmp, zip_against): # test basic execution! 
query = get_test_data('SRR606249.sig.gz') sig2 = get_test_data('2.fa.sig.gz') @@ -46,46 +51,112 @@ def test_simple(runtmp, indexed): make_file_list(query_list, [query]) make_file_list(against_list, [sig2, sig47, sig63]) - if indexed: - g_output = runtmp.output('out.csv') - against_db = index_siglist(runtmp, against_list, runtmp.output('db')) - runtmp.sourmash('scripts', 'fastmultigather', query_list, + if zip_against: + against_list = zip_siglist(runtmp, against_list, runtmp.output('against.zip')) + + cwd = os.getcwd() + try: + os.chdir(runtmp.output('')) + runtmp.sourmash('scripts', 'fastmultigather', query_list, against_list, + '-s', '100000', '-t', '0') + finally: + os.chdir(cwd) + + print(os.listdir(runtmp.output(''))) + + g_output = runtmp.output('SRR606249.sig.gz.gather.csv') + p_output = runtmp.output('SRR606249.sig.gz.prefetch.csv') + assert os.path.exists(p_output) + + # check prefetch output (only non-indexed gather) + df = pandas.read_csv(p_output) + assert len(df) == 3 + keys = set(df.keys()) + assert keys == {'query_filename', 'query_name', 'query_md5', 'match_name', 'match_md5', 'intersect_bp'} + + assert os.path.exists(g_output) + df = pandas.read_csv(g_output) + assert len(df) == 3 + keys = set(df.keys()) + assert keys == {'query_filename', 'query_name', 'query_md5', 'match_name', 'match_md5', 'rank', 'intersect_bp'} + + +def test_simple_zip_query(runtmp): + # test basic execution! 
+ query = get_test_data('SRR606249.sig.gz') + sig2 = get_test_data('2.fa.sig.gz') + sig47 = get_test_data('47.fa.sig.gz') + sig63 = get_test_data('63.fa.sig.gz') + + query_list = runtmp.output('query.txt') + against_list = runtmp.output('against.txt') + + make_file_list(query_list, [query]) + make_file_list(against_list, [sig2, sig47, sig63]) + + query_list = zip_siglist(runtmp, query_list, runtmp.output('query.zip')) + + cwd = os.getcwd() + try: + os.chdir(runtmp.output('')) + runtmp.sourmash('scripts', 'fastmultigather', query_list, against_list, + '-s', '100000', '-t', '0') + finally: + os.chdir(cwd) + + print(os.listdir(runtmp.output(''))) + + # outputs are based on md5sum, e.g. "{md5}.sig.gz.gather.csv" + g_output = runtmp.output('dec29ca72e68db0f15de0b1b46f82fc5.sig.gz.gather.csv') + p_output = runtmp.output('dec29ca72e68db0f15de0b1b46f82fc5.sig.gz.prefetch.csv') + + # check prefetch output (only non-indexed gather) + assert os.path.exists(p_output) + df = pandas.read_csv(p_output) + assert len(df) == 3 + keys = set(df.keys()) + assert keys == {'query_filename', 'query_name', 'query_md5', 'match_name', 'match_md5', 'intersect_bp'} + + assert os.path.exists(g_output) + df = pandas.read_csv(g_output) + assert len(df) == 3 + keys = set(df.keys()) + assert keys == {'query_filename', 'query_name', 'query_md5', 'match_name', 'match_md5', 'rank', 'intersect_bp'} + + +@pytest.mark.parametrize('zip_query', [False, True]) +def test_simple_indexed(runtmp, zip_query): + # test basic execution! 
+ query = get_test_data('SRR606249.sig.gz') + sig2 = get_test_data('2.fa.sig.gz') + sig47 = get_test_data('47.fa.sig.gz') + sig63 = get_test_data('63.fa.sig.gz') + + query_list = runtmp.output('query.txt') + against_list = runtmp.output('against.txt') + + make_file_list(query_list, [query]) + make_file_list(against_list, [sig2, sig47, sig63]) + + if zip_query: + query_list = zip_siglist(runtmp, query_list, runtmp.output('query.zip')) + + g_output = runtmp.output('out.csv') + against_db = index_siglist(runtmp, against_list, runtmp.output('db')) + runtmp.sourmash('scripts', 'fastmultigather', query_list, against_db, '-s', '100000', '-t', '0', '-o', g_output) - else: - cwd = os.getcwd() - try: - os.chdir(runtmp.output('')) - runtmp.sourmash('scripts', 'fastmultigather', query_list, against_list, - '-s', '100000', '-t', '0') - finally: - os.chdir(cwd) - - print(os.listdir(runtmp.output(''))) - - g_output = runtmp.output('SRR606249.sig.gz.gather.csv') - p_output = runtmp.output('SRR606249.sig.gz.prefetch.csv') - assert os.path.exists(p_output) - - # check prefetch output (only non-indexed gather) - df = pandas.read_csv(p_output) - assert len(df) == 3 - keys = set(df.keys()) - assert keys == {'query_filename', 'query_name', 'query_md5', 'match_name', 'match_md5', 'intersect_bp'} - # check gather output (mostly same for indexed vs non-indexed version) assert os.path.exists(g_output) df = pandas.read_csv(g_output) assert len(df) == 3 keys = set(df.keys()) - if indexed: - assert keys == {'query_name', 'query_md5', 'match_name', 'match_md5', 'f_match_query', 'intersect_bp'} - else: - assert keys == {'query_filename', 'query_name', 'query_md5', 'match_name', 'match_md5', 'rank', 'intersect_bp' -} + assert keys == {'query_name', 'query_md5', 'match_name', 'match_md5', 'f_match_query', 'intersect_bp'} + +@pytest.mark.parametrize('zip_query', [False, True]) @pytest.mark.parametrize('indexed', [False, True]) -def test_missing_querylist(runtmp, capfd, indexed): +def 
test_missing_querylist(runtmp, capfd, indexed, zip_query): # test missing querylist query_list = runtmp.output('query.txt') against_list = runtmp.output('against.txt') @@ -94,6 +165,8 @@ def test_missing_querylist(runtmp, capfd, indexed): sig47 = get_test_data('47.fa.sig.gz') sig63 = get_test_data('63.fa.sig.gz') + if zip_query: + query_list = runtmp.output('query.zip') # do not make query_list! make_file_list(against_list, [sig2, sig47, sig63]) @@ -134,6 +207,38 @@ def test_bad_query(runtmp, capfd, indexed): assert 'Error: invalid line in fromfile ' in captured.err +@pytest.mark.parametrize('indexed', [False, True]) +def test_bad_query_2(runtmp, capfd, indexed): + # test with a bad query (a .sig.gz file renamed as zip file) + against_list = runtmp.output('against.txt') + + sig2 = get_test_data('2.fa.sig.gz') + sig47 = get_test_data('47.fa.sig.gz') + sig63 = get_test_data('63.fa.sig.gz') + + query_zip = runtmp.output('query.zip') + # cp sig2 into query_zip + with open(query_zip, 'wb') as fp: + with open(sig2, 'rb') as fp2: + fp.write(fp2.read()) + + make_file_list(against_list, [sig2, sig47, sig63]) + + output = runtmp.output('out.csv') + + if indexed: + against_list = index_siglist(runtmp, against_list, runtmp.output('db')) + + with pytest.raises(utils.SourmashCommandFailed): + runtmp.sourmash('scripts', 'fastmultigather', query_zip, against_list, + '-o', output) + + captured = capfd.readouterr() + print(captured.err) + + assert 'Error: invalid Zip archive: Could not find central directory end' in captured.err + + @pytest.mark.parametrize('indexed', [False, True]) def test_missing_query(runtmp, capfd, indexed): # test missingfile in querylist @@ -161,7 +266,8 @@ def test_missing_query(runtmp, capfd, indexed): @pytest.mark.parametrize('indexed', [False, True]) -def test_nomatch_query(runtmp, capfd, indexed): +@pytest.mark.parametrize("zip_query", [False, True]) +def test_nomatch_query(runtmp, capfd, indexed, zip_query): # test nomatch file in querylist query_list = 
runtmp.output('query.txt') against_list = runtmp.output('against.txt') @@ -174,6 +280,8 @@ def test_nomatch_query(runtmp, capfd, indexed): make_file_list(query_list, [sig2, badsig1]) make_file_list(against_list, [sig2, sig47, sig63]) + if zip_query: + query_list = zip_siglist(runtmp, query_list, runtmp.output('query.zip')) if indexed: against_list = index_siglist(runtmp, against_list, runtmp.output('db')) @@ -183,11 +291,15 @@ def test_nomatch_query(runtmp, capfd, indexed): captured = capfd.readouterr() print(captured.err) - assert "WARNING: no compatible sketches in path " in captured.err + if zip_query: + assert "WARNING: no compatible sketches in path " not in captured.err + else: + assert "WARNING: no compatible sketches in path " in captured.err assert "WARNING: skipped 1 query paths - no compatible signatures." in captured.err -def test_missing_against(runtmp, capfd): +@pytest.mark.parametrize('zip_against', [False, True]) +def test_missing_against(runtmp, capfd, zip_against): # test missing against query_list = runtmp.output('query.txt') against_list = runtmp.output('against.txt') @@ -197,6 +309,9 @@ def test_missing_against(runtmp, capfd): sig63 = get_test_data('63.fa.sig.gz') make_file_list(query_list, [sig2, sig47, sig63]) + + if zip_against: + against_list = runtmp.output('against.zip') # do not make against_list with pytest.raises(utils.SourmashCommandFailed): @@ -247,6 +362,34 @@ def test_bad_against_2(runtmp, capfd): assert "WARNING: 1 search paths failed to load. See error messages above." 
in captured.err +@pytest.mark.parametrize('zip_query', [False, True]) +def test_bad_against_3(runtmp, capfd, zip_query): + # test with a bad 'against' file (a .sig.gz file renamed as zip file) + query = get_test_data('SRR606249.sig.gz') + query_list = runtmp.output('query.txt') + make_file_list(query_list, [query]) + + sig2 = get_test_data('2.fa.sig.gz') + against_zip = runtmp.output('against.zip') + # cp sig2 into against_zip + with open(against_zip, 'wb') as fp: + with open(sig2, 'rb') as fp2: + fp.write(fp2.read()) + + output = runtmp.output('out.csv') + if zip_query: + query_list = zip_siglist(runtmp, query_list, runtmp.output('query.zip')) + + with pytest.raises(utils.SourmashCommandFailed): + runtmp.sourmash('scripts', 'fastmultigather', query_list, against_zip, + '-o', output) + + captured = capfd.readouterr() + print(captured.err) + + assert 'Error: invalid Zip archive: Could not find central directory end' in captured.err + + def test_empty_against(runtmp, capfd): # test bad 'against' file - in this case, an empty one query = get_test_data('SRR606249.sig.gz') @@ -263,11 +406,12 @@ def test_empty_against(runtmp, capfd): captured = capfd.readouterr() print(captured.err) - assert "Loaded 0 sketches to search against." in captured.err - assert "Error: No sketches loaded to search against!?" in captured.err + assert "Loaded 0 search signature(s)" in captured.err + assert "Error: No search signatures loaded, exiting." 
in captured.err -def test_nomatch_in_against(runtmp, capfd): +@pytest.mark.parametrize('zip_against', [False, True]) +def test_nomatch_in_against(runtmp, capfd, zip_against): # test an against file that has a non-matching ksize sig in it query = get_test_data('SRR606249.sig.gz') query_list = runtmp.output('query.txt') @@ -279,6 +423,9 @@ def test_nomatch_in_against(runtmp, capfd): sig1 = get_test_data('1.fa.k21.sig.gz') make_file_list(against_list, [sig2, sig1]) + if zip_against: + against_list = zip_siglist(runtmp, against_list, runtmp.output('against.zip')) + runtmp.sourmash('scripts', 'fastmultigather', query_list, against_list, '-s', '100000') @@ -288,8 +435,8 @@ def test_nomatch_in_against(runtmp, capfd): assert 'WARNING: skipped 1 search paths - no compatible signatures.' in captured.err -@pytest.mark.parametrize('indexed', [False, True]) -def test_md5(runtmp, indexed): +@pytest.mark.parametrize('zip_query', [False, True]) +def test_md5(runtmp, zip_query): # test correct md5s present in output query = get_test_data('SRR606249.sig.gz') sig2 = get_test_data('2.fa.sig.gz') @@ -302,47 +449,43 @@ def test_md5(runtmp, indexed): make_file_list(query_list, [query]) make_file_list(against_list, [sig2, sig47, sig63]) - if indexed: - g_output = runtmp.output('out.csv') - against_list = index_siglist(runtmp, against_list, runtmp.output('db')) - runtmp.sourmash('scripts', 'fastmultigather', query_list, - against_list, '-s', '100000', '-t', '0', - '-o', g_output) - else: - cwd = os.getcwd() - try: - os.chdir(runtmp.output('')) - runtmp.sourmash('scripts', 'fastmultigather', query_list, against_list, - '-s', '100000', '-t', '0') - finally: - os.chdir(cwd) - - print(os.listdir(runtmp.output(''))) - - g_output = runtmp.output('SRR606249.sig.gz.gather.csv') - p_output = runtmp.output('SRR606249.sig.gz.prefetch.csv') - assert os.path.exists(p_output) - - # check prefetch output (only non-indexed gather) - df = pandas.read_csv(p_output) - assert len(df) == 3 - keys = 
set(df.keys()) - assert keys == {'query_filename', 'query_name', 'query_md5', 'match_name', 'match_md5', 'intersect_bp'} - - md5s = set(df['match_md5']) - for against_file in (sig2, sig47, sig63): - for ss in sourmash.load_file_as_signatures(against_file, ksize=31): - assert ss.md5sum() in md5s + if zip_query: + query_list = zip_siglist(runtmp, query_list, runtmp.output('query.zip')) + + cwd = os.getcwd() + try: + os.chdir(runtmp.output('')) + runtmp.sourmash('scripts', 'fastmultigather', query_list, against_list, + '-s', '100000', '-t', '0') + finally: + os.chdir(cwd) + + print(os.listdir(runtmp.output(''))) + + g_output = runtmp.output('SRR606249.sig.gz.gather.csv') + p_output = runtmp.output('SRR606249.sig.gz.prefetch.csv') + if zip_query: + g_output = runtmp.output('dec29ca72e68db0f15de0b1b46f82fc5.sig.gz.gather.csv') + p_output = runtmp.output('dec29ca72e68db0f15de0b1b46f82fc5.sig.gz.prefetch.csv') + + # check prefetch output (only non-indexed gather) + assert os.path.exists(p_output) + df = pandas.read_csv(p_output) + assert len(df) == 3 + keys = set(df.keys()) + assert keys == {'query_filename', 'query_name', 'query_md5', 'match_name', 'match_md5', 'intersect_bp'} + + md5s = set(df['match_md5']) + for against_file in (sig2, sig47, sig63): + for ss in sourmash.load_file_as_signatures(against_file, ksize=31): + assert ss.md5sum() in md5s # check gather output (mostly same for indexed vs non-indexed version) assert os.path.exists(g_output) df = pandas.read_csv(g_output) assert len(df) == 3 keys = set(df.keys()) - if indexed: - assert keys == {'query_name', 'query_md5', 'match_name', 'match_md5', 'f_match_query', 'intersect_bp'} - else: - assert keys == {'query_filename', 'query_name', 'query_md5', 'match_name', 'match_md5', 'rank', 'intersect_bp'} + assert keys == {'query_filename', 'query_name', 'query_md5', 'match_name', 'match_md5', 'rank', 'intersect_bp'} md5s = set(df['match_md5']) for against_file in (sig2, sig47, sig63): @@ -350,8 +493,45 @@ def 
test_md5(runtmp, indexed): assert ss.md5sum() in md5s -@pytest.mark.parametrize('indexed', [False, True]) -def test_csv_columns_vs_sourmash_prefetch(runtmp, indexed): +@pytest.mark.parametrize('zip_query', [False, True]) +def test_md5_indexed(runtmp, zip_query): + # test correct md5s present in output + query = get_test_data('SRR606249.sig.gz') + sig2 = get_test_data('2.fa.sig.gz') + sig47 = get_test_data('47.fa.sig.gz') + sig63 = get_test_data('63.fa.sig.gz') + + query_list = runtmp.output('query.txt') + against_list = runtmp.output('against.txt') + + make_file_list(query_list, [query]) + make_file_list(against_list, [sig2, sig47, sig63]) + + if zip_query: + query_list = zip_siglist(runtmp, query_list, runtmp.output('query.zip')) + + g_output = runtmp.output('out.csv') + against_list = index_siglist(runtmp, against_list, runtmp.output('db')) + runtmp.sourmash('scripts', 'fastmultigather', query_list, + against_list, '-s', '100000', '-t', '0', + '-o', g_output) + + # check gather output (mostly same for indexed vs non-indexed version) + assert os.path.exists(g_output) + df = pandas.read_csv(g_output) + assert len(df) == 3 + keys = set(df.keys()) + assert keys == {'query_name', 'query_md5', 'match_name', 'match_md5', 'f_match_query', 'intersect_bp'} + + md5s = set(df['match_md5']) + for against_file in (sig2, sig47, sig63): + for ss in sourmash.load_file_as_signatures(against_file, ksize=31): + assert ss.md5sum() in md5s + + +@pytest.mark.parametrize('zip_query', [False, True]) +@pytest.mark.parametrize('zip_against', [False, True]) +def test_csv_columns_vs_sourmash_prefetch(runtmp, zip_query, zip_against): # the column names should be strict subsets of sourmash prefetch cols query = get_test_data('SRR606249.sig.gz') @@ -364,25 +544,26 @@ def test_csv_columns_vs_sourmash_prefetch(runtmp, indexed): against_list = runtmp.output('against.txt') make_file_list(against_list, [sig2, sig47, sig63]) - if indexed: - g_output = runtmp.output('out.csv') - against_db = 
index_siglist(runtmp, against_list, runtmp.output('db')) - runtmp.sourmash('scripts', 'fastmultigather', query_list, - against_db, '-s', '100000', '-t', '0', - '-o', g_output) - else: - cwd = os.getcwd() - try: - os.chdir(runtmp.output('')) - runtmp.sourmash('scripts', 'fastmultigather', query_list, against_list, - '-s', '100000', '-t', '0') - finally: - os.chdir(cwd) - - g_output = runtmp.output('SRR606249.sig.gz.gather.csv') - p_output = runtmp.output('SRR606249.sig.gz.prefetch.csv') - assert os.path.exists(p_output) + if zip_query: + query_list = zip_siglist(runtmp, query_list, runtmp.output('query.zip')) + if zip_against: + against_list = zip_siglist(runtmp, against_list, runtmp.output('against.zip')) + cwd = os.getcwd() + try: + os.chdir(runtmp.output('')) + runtmp.sourmash('scripts', 'fastmultigather', query_list, against_list, + '-s', '100000', '-t', '0') + finally: + os.chdir(cwd) + + g_output = runtmp.output('SRR606249.sig.gz.gather.csv') + p_output = runtmp.output('SRR606249.sig.gz.prefetch.csv') + if zip_query: + g_output = runtmp.output('dec29ca72e68db0f15de0b1b46f82fc5.sig.gz.gather.csv') + p_output = runtmp.output('dec29ca72e68db0f15de0b1b46f82fc5.sig.gz.prefetch.csv') + + assert os.path.exists(p_output) assert os.path.exists(g_output) # now run sourmash prefetch sp_output = runtmp.output('sourmash-prefetch.csv') @@ -391,11 +572,47 @@ def test_csv_columns_vs_sourmash_prefetch(runtmp, indexed): gather_df = pandas.read_csv(g_output) g_keys = set(gather_df.keys()) - if indexed: - assert g_keys == {'query_name', 'query_md5', 'match_name', 'match_md5', 'f_match_query', 'intersect_bp'} - else: - assert g_keys == {'query_filename', 'query_name', 'query_md5', 'match_name', 'match_md5', 'rank', 'intersect_bp'} - g_keys.remove('rank') # 'rank' is not in sourmash prefetch! + assert g_keys == {'query_filename', 'query_name', 'query_md5', 'match_name', 'match_md5', 'rank', 'intersect_bp'} + g_keys.remove('rank') # 'rank' is not in sourmash prefetch! 
+ + sourmash_prefetch_df = pandas.read_csv(sp_output) + sp_keys = set(sourmash_prefetch_df.keys()) + print(g_keys - sp_keys) + assert not g_keys - sp_keys, g_keys - sp_keys + + +@pytest.mark.parametrize('zip_query', [False, True]) +def test_csv_columns_vs_sourmash_prefetch_indexed(runtmp, zip_query): + # the column names should be strict subsets of sourmash prefetch cols + query = get_test_data('SRR606249.sig.gz') + + sig2 = get_test_data('2.fa.sig.gz') + sig47 = get_test_data('47.fa.sig.gz') + sig63 = get_test_data('63.fa.sig.gz') + + query_list = runtmp.output('query.txt') + make_file_list(query_list, [query]) + against_list = runtmp.output('against.txt') + make_file_list(against_list, [sig2, sig47, sig63]) + + if zip_query: + query_list = zip_siglist(runtmp, query_list, runtmp.output('query.zip')) + + g_output = runtmp.output('out.csv') + against_db = index_siglist(runtmp, against_list, runtmp.output('db')) + runtmp.sourmash('scripts', 'fastmultigather', query_list, + against_db, '-s', '100000', '-t', '0', + '-o', g_output) + + assert os.path.exists(g_output) + # now run sourmash prefetch + sp_output = runtmp.output('sourmash-prefetch.csv') + runtmp.sourmash('prefetch', query, against_list, + '-o', sp_output, '--scaled', '100000') + + gather_df = pandas.read_csv(g_output) + g_keys = set(gather_df.keys()) + assert g_keys == {'query_name', 'query_md5', 'match_name', 'match_md5', 'f_match_query', 'intersect_bp'} sourmash_prefetch_df = pandas.read_csv(sp_output) sp_keys = set(sourmash_prefetch_df.keys()) diff --git a/src/utils.rs b/src/utils.rs index f155baca..eb52073d 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -272,7 +272,6 @@ pub fn load_sigpaths_from_zip>( Ok((signature_paths, temp_dir)) } - pub fn load_fasta_fromfile>(sketchlist_filename: &P) -> Result> { let mut rdr = csv::Reader::from_path(sketchlist_filename)?;