diff --git a/src/fastmultigather.rs b/src/fastmultigather.rs index 5dcb1fe1..9c262ef6 100644 --- a/src/fastmultigather.rs +++ b/src/fastmultigather.rs @@ -70,7 +70,7 @@ pub fn fastmultigather + std::fmt::Debug + Clone>( let mm = prepare_query(&sigs, &template, &location); if mm.is_none() { - if queryfile_name.ends_with(".zip") { + if !queryfile_name.ends_with(".zip") { eprintln!("WARNING: no compatible sketches in path '{}'", q.display()); } diff --git a/src/mastiff_manygather.rs b/src/mastiff_manygather.rs index 49a6ac73..36d1bc11 100644 --- a/src/mastiff_manygather.rs +++ b/src/mastiff_manygather.rs @@ -35,6 +35,7 @@ pub fn mastiff_manygather>( println!("Loaded DB"); // Load query paths + let queryfile_name = queries_file.as_ref().to_string_lossy().to_string(); let (query_paths, _temp_dir) = load_sigpaths_from_zip_or_pathlist(&queries_file)?; // set up a multi-producer, single-consumer channel. @@ -110,8 +111,10 @@ pub fn mastiff_manygather>( eprintln!("Error gathering matches: {:?}", matches.err()); } } else { - eprintln!("WARNING: no compatible sketches in path '{}'", - filename.display()); + if !queryfile_name.ends_with(".zip") { + eprintln!("WARNING: no compatible sketches in path '{}'", + filename.display()); + } let _ = skipped_paths.fetch_add(1, atomic::Ordering::SeqCst); } if results.is_empty() { diff --git a/src/python/tests/test_multigather.py b/src/python/tests/test_multigather.py index 26caa593..d3ef21f9 100644 --- a/src/python/tests/test_multigather.py +++ b/src/python/tests/test_multigather.py @@ -81,6 +81,49 @@ def test_simple(runtmp, zip_against): assert keys == {'query_filename', 'query_name', 'query_md5', 'match_name', 'match_md5', 'rank', 'intersect_bp'} +def test_simple_zip_query(runtmp): + # test basic execution! 
+ query = get_test_data('SRR606249.sig.gz') + sig2 = get_test_data('2.fa.sig.gz') + sig47 = get_test_data('47.fa.sig.gz') + sig63 = get_test_data('63.fa.sig.gz') + + query_list = runtmp.output('query.txt') + against_list = runtmp.output('against.txt') + + make_file_list(query_list, [query]) + make_file_list(against_list, [sig2, sig47, sig63]) + + query_list = zip_siglist(runtmp, query_list, runtmp.output('query.zip')) + + cwd = os.getcwd() + try: + os.chdir(runtmp.output('')) + runtmp.sourmash('scripts', 'fastmultigather', query_list, against_list, + '-s', '100000', '-t', '0') + finally: + os.chdir(cwd) + + print(os.listdir(runtmp.output(''))) + + # outputs are based on md5sum, e.g. "{md5}.sig.gz.gather.csv" + g_output = runtmp.output('dec29ca72e68db0f15de0b1b46f82fc5.sig.gz.gather.csv') + p_output = runtmp.output('dec29ca72e68db0f15de0b1b46f82fc5.sig.gz.prefetch.csv') + + # check prefetch output (only non-indexed gather) + assert os.path.exists(p_output) + df = pandas.read_csv(p_output) + assert len(df) == 3 + keys = set(df.keys()) + assert keys == {'query_filename', 'query_name', 'query_md5', 'match_name', 'match_md5', 'intersect_bp'} + + assert os.path.exists(g_output) + df = pandas.read_csv(g_output) + assert len(df) == 3 + keys = set(df.keys()) + assert keys == {'query_filename', 'query_name', 'query_md5', 'match_name', 'match_md5', 'rank', 'intersect_bp'} + + @pytest.mark.parametrize('zip_query', [False, True]) def test_simple_indexed(runtmp, zip_query): # test basic execution! 
@@ -111,8 +154,9 @@ def test_simple_indexed(runtmp, zip_query): assert keys == {'query_name', 'query_md5', 'match_name', 'match_md5', 'f_match_query', 'intersect_bp'} +@pytest.mark.parametrize('zip_query', [False, True]) @pytest.mark.parametrize('indexed', [False, True]) -def test_missing_querylist(runtmp, capfd, indexed): +def test_missing_querylist(runtmp, capfd, indexed, zip_query): # test missing querylist query_list = runtmp.output('query.txt') against_list = runtmp.output('against.txt') @@ -121,6 +165,8 @@ def test_missing_querylist(runtmp, capfd, indexed): sig47 = get_test_data('47.fa.sig.gz') sig63 = get_test_data('63.fa.sig.gz') + if zip_query: + query_list = runtmp.output('query.zip') # do not make query_list! make_file_list(against_list, [sig2, sig47, sig63]) @@ -161,6 +207,38 @@ def test_bad_query(runtmp, capfd, indexed): assert 'Error: invalid line in fromfile ' in captured.err +@pytest.mark.parametrize('indexed', [False, True]) +def test_bad_query_2(runtmp, capfd, indexed): + # test with a bad query (a .sig.gz file renamed as zip file) + against_list = runtmp.output('against.txt') + + sig2 = get_test_data('2.fa.sig.gz') + sig47 = get_test_data('47.fa.sig.gz') + sig63 = get_test_data('63.fa.sig.gz') + + query_zip = runtmp.output('query.zip') + # cp sig2 into query_zip + with open(query_zip, 'wb') as fp: + with open(sig2, 'rb') as fp2: + fp.write(fp2.read()) + + make_file_list(against_list, [sig2, sig47, sig63]) + + output = runtmp.output('out.csv') + + if indexed: + against_list = index_siglist(runtmp, against_list, runtmp.output('db')) + + with pytest.raises(utils.SourmashCommandFailed): + runtmp.sourmash('scripts', 'fastmultigather', query_zip, against_list, + '-o', output) + + captured = capfd.readouterr() + print(captured.err) + + assert 'Error: invalid Zip archive: Could not find central directory end' in captured.err + + @pytest.mark.parametrize('indexed', [False, True]) def test_missing_query(runtmp, capfd, indexed): # test missingfile in 
querylist @@ -188,7 +266,8 @@ def test_missing_query(runtmp, capfd, indexed): @pytest.mark.parametrize('indexed', [False, True]) -def test_nomatch_query(runtmp, capfd, indexed): +@pytest.mark.parametrize("zip_query", [False, True]) +def test_nomatch_query(runtmp, capfd, indexed, zip_query): # test nomatch file in querylist query_list = runtmp.output('query.txt') against_list = runtmp.output('against.txt') @@ -201,6 +280,8 @@ def test_nomatch_query(runtmp, capfd, indexed): make_file_list(query_list, [sig2, badsig1]) make_file_list(against_list, [sig2, sig47, sig63]) + if zip_query: + query_list = zip_siglist(runtmp, query_list, runtmp.output('query.zip')) if indexed: against_list = index_siglist(runtmp, against_list, runtmp.output('db')) @@ -210,11 +291,15 @@ def test_nomatch_query(runtmp, capfd, indexed): captured = capfd.readouterr() print(captured.err) - assert "WARNING: no compatible sketches in path " in captured.err + if zip_query: + assert "WARNING: no compatible sketches in path " not in captured.err + else: + assert "WARNING: no compatible sketches in path " in captured.err assert "WARNING: skipped 1 query paths - no compatible signatures." in captured.err -def test_missing_against(runtmp, capfd): +@pytest.mark.parametrize('zip_against', [False, True]) +def test_missing_against(runtmp, capfd, zip_against): # test missing against query_list = runtmp.output('query.txt') against_list = runtmp.output('against.txt') @@ -224,6 +309,9 @@ def test_missing_against(runtmp, capfd): sig63 = get_test_data('63.fa.sig.gz') make_file_list(query_list, [sig2, sig47, sig63]) + + if zip_against: + against_list = runtmp.output('against.zip') # do not make against_list with pytest.raises(utils.SourmashCommandFailed): @@ -274,6 +362,34 @@ def test_bad_against_2(runtmp, capfd): assert "WARNING: 1 search paths failed to load. See error messages above." 
in captured.err +@pytest.mark.parametrize('zip_query', [False, True]) +def test_bad_against_3(runtmp, capfd, zip_query): + # test with a bad against (a .sig.gz file renamed as zip file) + query = get_test_data('SRR606249.sig.gz') + query_list = runtmp.output('query.txt') + make_file_list(query_list, [query]) + + sig2 = get_test_data('2.fa.sig.gz') + against_zip = runtmp.output('against.zip') + # cp sig2 into against_zip + with open(against_zip, 'wb') as fp: + with open(sig2, 'rb') as fp2: + fp.write(fp2.read()) + + output = runtmp.output('out.csv') + if zip_query: + query_list = zip_siglist(runtmp, query_list, runtmp.output('query.zip')) + + with pytest.raises(utils.SourmashCommandFailed): + runtmp.sourmash('scripts', 'fastmultigather', query_list, against_zip, + '-o', output) + + captured = capfd.readouterr() + print(captured.err) + + assert 'Error: invalid Zip archive: Could not find central directory end' in captured.err + + def test_empty_against(runtmp, capfd): # test bad 'against' file - in this case, an empty one query = get_test_data('SRR606249.sig.gz') @@ -294,7 +410,8 @@ def test_empty_against(runtmp, capfd): assert "Error: No search signatures loaded, exiting." in captured.err -def test_nomatch_in_against(runtmp, capfd): +@pytest.mark.parametrize('zip_against', [False, True]) +def test_nomatch_in_against(runtmp, capfd, zip_against): # test an against file that has a non-matching ksize sig in it query = get_test_data('SRR606249.sig.gz') query_list = runtmp.output('query.txt') @@ -306,6 +423,9 @@ def test_nomatch_in_against(runtmp, capfd): sig1 = get_test_data('1.fa.k21.sig.gz') make_file_list(against_list, [sig2, sig1]) + if zip_against: + against_list = zip_siglist(runtmp, against_list, runtmp.output('against.zip')) + runtmp.sourmash('scripts', 'fastmultigather', query_list, against_list, '-s', '100000') @@ -315,8 +435,8 @@ def test_nomatch_in_against(runtmp, capfd): assert 'WARNING: skipped 1 search paths - no compatible signatures.'
in captured.err -@pytest.mark.parametrize('indexed', [False, True]) -def test_md5(runtmp, indexed): +@pytest.mark.parametrize('zip_query', [False, True]) +def test_md5(runtmp, zip_query): # test correct md5s present in output query = get_test_data('SRR606249.sig.gz') sig2 = get_test_data('2.fa.sig.gz') @@ -329,47 +449,43 @@ def test_md5(runtmp, indexed): make_file_list(query_list, [query]) make_file_list(against_list, [sig2, sig47, sig63]) - if indexed: - g_output = runtmp.output('out.csv') - against_list = index_siglist(runtmp, against_list, runtmp.output('db')) - runtmp.sourmash('scripts', 'fastmultigather', query_list, - against_list, '-s', '100000', '-t', '0', - '-o', g_output) - else: - cwd = os.getcwd() - try: - os.chdir(runtmp.output('')) - runtmp.sourmash('scripts', 'fastmultigather', query_list, against_list, - '-s', '100000', '-t', '0') - finally: - os.chdir(cwd) - - print(os.listdir(runtmp.output(''))) - - g_output = runtmp.output('SRR606249.sig.gz.gather.csv') - p_output = runtmp.output('SRR606249.sig.gz.prefetch.csv') - assert os.path.exists(p_output) - - # check prefetch output (only non-indexed gather) - df = pandas.read_csv(p_output) - assert len(df) == 3 - keys = set(df.keys()) - assert keys == {'query_filename', 'query_name', 'query_md5', 'match_name', 'match_md5', 'intersect_bp'} - - md5s = set(df['match_md5']) - for against_file in (sig2, sig47, sig63): - for ss in sourmash.load_file_as_signatures(against_file, ksize=31): - assert ss.md5sum() in md5s + if zip_query: + query_list = zip_siglist(runtmp, query_list, runtmp.output('query.zip')) + + cwd = os.getcwd() + try: + os.chdir(runtmp.output('')) + runtmp.sourmash('scripts', 'fastmultigather', query_list, against_list, + '-s', '100000', '-t', '0') + finally: + os.chdir(cwd) + + print(os.listdir(runtmp.output(''))) + + g_output = runtmp.output('SRR606249.sig.gz.gather.csv') + p_output = runtmp.output('SRR606249.sig.gz.prefetch.csv') + if zip_query: + g_output = 
runtmp.output('dec29ca72e68db0f15de0b1b46f82fc5.sig.gz.gather.csv') + p_output = runtmp.output('dec29ca72e68db0f15de0b1b46f82fc5.sig.gz.prefetch.csv') + + # check prefetch output (only non-indexed gather) + assert os.path.exists(p_output) + df = pandas.read_csv(p_output) + assert len(df) == 3 + keys = set(df.keys()) + assert keys == {'query_filename', 'query_name', 'query_md5', 'match_name', 'match_md5', 'intersect_bp'} + + md5s = set(df['match_md5']) + for against_file in (sig2, sig47, sig63): + for ss in sourmash.load_file_as_signatures(against_file, ksize=31): + assert ss.md5sum() in md5s # check gather output (mostly same for indexed vs non-indexed version) assert os.path.exists(g_output) df = pandas.read_csv(g_output) assert len(df) == 3 keys = set(df.keys()) - if indexed: - assert keys == {'query_name', 'query_md5', 'match_name', 'match_md5', 'f_match_query', 'intersect_bp'} - else: - assert keys == {'query_filename', 'query_name', 'query_md5', 'match_name', 'match_md5', 'rank', 'intersect_bp'} + assert keys == {'query_filename', 'query_name', 'query_md5', 'match_name', 'match_md5', 'rank', 'intersect_bp'} md5s = set(df['match_md5']) for against_file in (sig2, sig47, sig63): @@ -377,8 +493,45 @@ def test_md5(runtmp, indexed): assert ss.md5sum() in md5s -@pytest.mark.parametrize('indexed', [False, True]) -def test_csv_columns_vs_sourmash_prefetch(runtmp, indexed): +@pytest.mark.parametrize('zip_query', [False, True]) +def test_md5_indexed(runtmp, zip_query): + # test correct md5s present in output + query = get_test_data('SRR606249.sig.gz') + sig2 = get_test_data('2.fa.sig.gz') + sig47 = get_test_data('47.fa.sig.gz') + sig63 = get_test_data('63.fa.sig.gz') + + query_list = runtmp.output('query.txt') + against_list = runtmp.output('against.txt') + + make_file_list(query_list, [query]) + make_file_list(against_list, [sig2, sig47, sig63]) + + if zip_query: + query_list = zip_siglist(runtmp, query_list, runtmp.output('query.zip')) + + g_output = 
runtmp.output('out.csv') + against_list = index_siglist(runtmp, against_list, runtmp.output('db')) + runtmp.sourmash('scripts', 'fastmultigather', query_list, + against_list, '-s', '100000', '-t', '0', + '-o', g_output) + + # check gather output (mostly same for indexed vs non-indexed version) + assert os.path.exists(g_output) + df = pandas.read_csv(g_output) + assert len(df) == 3 + keys = set(df.keys()) + assert keys == {'query_name', 'query_md5', 'match_name', 'match_md5', 'f_match_query', 'intersect_bp'} + + md5s = set(df['match_md5']) + for against_file in (sig2, sig47, sig63): + for ss in sourmash.load_file_as_signatures(against_file, ksize=31): + assert ss.md5sum() in md5s + + +@pytest.mark.parametrize('zip_query', [False, True]) +@pytest.mark.parametrize('zip_against', [False, True]) +def test_csv_columns_vs_sourmash_prefetch(runtmp, zip_query, zip_against): # the column names should be strict subsets of sourmash prefetch cols query = get_test_data('SRR606249.sig.gz') @@ -391,25 +544,26 @@ def test_csv_columns_vs_sourmash_prefetch(runtmp, indexed): against_list = runtmp.output('against.txt') make_file_list(against_list, [sig2, sig47, sig63]) - if indexed: - g_output = runtmp.output('out.csv') - against_db = index_siglist(runtmp, against_list, runtmp.output('db')) - runtmp.sourmash('scripts', 'fastmultigather', query_list, - against_db, '-s', '100000', '-t', '0', - '-o', g_output) - else: - cwd = os.getcwd() - try: - os.chdir(runtmp.output('')) - runtmp.sourmash('scripts', 'fastmultigather', query_list, against_list, - '-s', '100000', '-t', '0') - finally: - os.chdir(cwd) - - g_output = runtmp.output('SRR606249.sig.gz.gather.csv') - p_output = runtmp.output('SRR606249.sig.gz.prefetch.csv') - assert os.path.exists(p_output) + if zip_query: + query_list = zip_siglist(runtmp, query_list, runtmp.output('query.zip')) + if zip_against: + against_list = zip_siglist(runtmp, against_list, runtmp.output('against.zip')) + + cwd = os.getcwd() + try: + 
os.chdir(runtmp.output('')) + runtmp.sourmash('scripts', 'fastmultigather', query_list, against_list, + '-s', '100000', '-t', '0') + finally: + os.chdir(cwd) + g_output = runtmp.output('SRR606249.sig.gz.gather.csv') + p_output = runtmp.output('SRR606249.sig.gz.prefetch.csv') + if zip_query: + g_output = runtmp.output('dec29ca72e68db0f15de0b1b46f82fc5.sig.gz.gather.csv') + p_output = runtmp.output('dec29ca72e68db0f15de0b1b46f82fc5.sig.gz.prefetch.csv') + + assert os.path.exists(p_output) assert os.path.exists(g_output) # now run sourmash prefetch sp_output = runtmp.output('sourmash-prefetch.csv') @@ -418,11 +572,47 @@ def test_csv_columns_vs_sourmash_prefetch(runtmp, indexed): gather_df = pandas.read_csv(g_output) g_keys = set(gather_df.keys()) - if indexed: - assert g_keys == {'query_name', 'query_md5', 'match_name', 'match_md5', 'f_match_query', 'intersect_bp'} - else: - assert g_keys == {'query_filename', 'query_name', 'query_md5', 'match_name', 'match_md5', 'rank', 'intersect_bp'} - g_keys.remove('rank') # 'rank' is not in sourmash prefetch! + assert g_keys == {'query_filename', 'query_name', 'query_md5', 'match_name', 'match_md5', 'rank', 'intersect_bp'} + g_keys.remove('rank') # 'rank' is not in sourmash prefetch! 
+ + sourmash_prefetch_df = pandas.read_csv(sp_output) + sp_keys = set(sourmash_prefetch_df.keys()) + print(g_keys - sp_keys) + assert not g_keys - sp_keys, g_keys - sp_keys + + +@pytest.mark.parametrize('zip_query', [False, True]) +def test_csv_columns_vs_sourmash_prefetch_indexed(runtmp, zip_query): + # the column names should be strict subsets of sourmash prefetch cols + query = get_test_data('SRR606249.sig.gz') + + sig2 = get_test_data('2.fa.sig.gz') + sig47 = get_test_data('47.fa.sig.gz') + sig63 = get_test_data('63.fa.sig.gz') + + query_list = runtmp.output('query.txt') + make_file_list(query_list, [query]) + against_list = runtmp.output('against.txt') + make_file_list(against_list, [sig2, sig47, sig63]) + + if zip_query: + query_list = zip_siglist(runtmp, query_list, runtmp.output('query.zip')) + + g_output = runtmp.output('out.csv') + against_db = index_siglist(runtmp, against_list, runtmp.output('db')) + runtmp.sourmash('scripts', 'fastmultigather', query_list, + against_db, '-s', '100000', '-t', '0', + '-o', g_output) + + assert os.path.exists(g_output) + # now run sourmash prefetch + sp_output = runtmp.output('sourmash-prefetch.csv') + runtmp.sourmash('prefetch', query, against_list, + '-o', sp_output, '--scaled', '100000') + + gather_df = pandas.read_csv(g_output) + g_keys = set(gather_df.keys()) + assert g_keys == {'query_name', 'query_md5', 'match_name', 'match_md5', 'f_match_query', 'intersect_bp'} sourmash_prefetch_df = pandas.read_csv(sp_output) sp_keys = set(sourmash_prefetch_df.keys())