From b7337e242378dd180503bfe85346397f43c4aabd Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Wed, 13 Sep 2023 17:04:27 -0700 Subject: [PATCH] add tests for zipped search sigs --- src/python/tests/test_gather.py | 109 ++++++++++++++++++++++++++++---- 1 file changed, 95 insertions(+), 14 deletions(-) diff --git a/src/python/tests/test_gather.py b/src/python/tests/test_gather.py index 2b8a91ac..8ebd524f 100644 --- a/src/python/tests/test_gather.py +++ b/src/python/tests/test_gather.py @@ -17,6 +17,12 @@ def make_file_list(filename, paths): fp.write("\n") +def zip_siglist(runtmp, siglist, db): + runtmp.sourmash('sig', 'cat', siglist, + '-o', db) + return db + + def test_installed(runtmp): with pytest.raises(utils.SourmashCommandFailed): runtmp.sourmash('scripts', 'fastgather') @@ -24,7 +30,8 @@ def test_installed(runtmp): assert 'usage: fastgather' in runtmp.last_result.err -def test_simple(runtmp): +@pytest.mark.parametrize('zip_against', [False, True]) +def test_simple(runtmp, zip_against): # test basic execution! query = get_test_data('SRR606249.sig.gz') against_list = runtmp.output('against.txt') @@ -35,6 +42,9 @@ def test_simple(runtmp): make_file_list(against_list, [sig2, sig47, sig63]) + if zip_against: + against_list = zip_siglist(runtmp, against_list, runtmp.output('against.zip')) + g_output = runtmp.output('gather.csv') p_output = runtmp.output('prefetch.csv') @@ -48,7 +58,9 @@ def test_simple(runtmp): assert keys == {'query_filename', 'query_name', 'query_md5', 'match_name', 'match_md5', 'rank', 'intersect_bp'} -def test_simple_with_prefetch(runtmp): + +@pytest.mark.parametrize('zip_against', [False, True]) +def test_simple_with_prefetch(runtmp, zip_against): # test basic execution! query = get_test_data('SRR606249.sig.gz') against_list = runtmp.output('against.txt') @@ -59,6 +71,9 @@ def test_simple_with_prefetch(runtmp): make_file_list(against_list, [sig2, sig47, sig63]) + if zip_against: + against_list = zip_siglist(runtmp, against_list, runtmp.output('against.zip')) + g_output = runtmp.output('gather.csv') p_output = runtmp.output('prefetch.csv') @@ -79,7 +94,8 @@ def test_simple_with_prefetch(runtmp): assert keys == {'query_filename', 'query_name', 'query_md5', 'match_name', 'match_md5', 'intersect_bp'} -def test_missing_query(runtmp, capfd): +@pytest.mark.parametrize('zip_against', [False, True]) +def test_missing_query(runtmp, capfd, zip_against): # test missing query query = runtmp.output('no-such-file') against_list = runtmp.output('against.txt') @@ -90,6 +106,9 @@ def test_missing_query(runtmp, capfd): make_file_list(against_list, [sig2, sig47, sig63]) + if zip_against: + against_list = zip_siglist(runtmp, against_list, runtmp.output('against.zip')) + g_output = runtmp.output('gather.csv') p_output = runtmp.output('prefetch.csv') @@ -103,8 +122,8 @@ def test_missing_query(runtmp, capfd): assert 'Error: No such file or directory ' in captured.err - -def test_bad_query(runtmp, capfd): +@pytest.mark.parametrize('zip_against', [False, True]) +def test_bad_query(runtmp, capfd, zip_against): # test non-sig query query = runtmp.output('no-such-file') against_list = runtmp.output('against.txt') @@ -118,6 +137,9 @@ def test_bad_query(runtmp, capfd): make_file_list(against_list, [sig2, sig47, sig63]) + if zip_against: + against_list = zip_siglist(runtmp, against_list, runtmp.output('against.zip')) + g_output = runtmp.output('gather.csv') p_output = runtmp.output('prefetch.csv') @@ -132,7 +154,8 @@ def test_bad_query(runtmp, capfd): assert 'Error: expected value at line 1' in captured.err -def test_missing_against(runtmp, capfd): +@pytest.mark.parametrize('zip_against', [False, True]) +def test_missing_against(runtmp, capfd, zip_against): # test missing against query = get_test_data('SRR606249.sig.gz') against_list = runtmp.output('against.txt') @@ -142,6 +165,8 @@ def test_missing_against(runtmp, capfd): sig63 = get_test_data('63.fa.sig.gz') #make_file_list(against_list, [sig2, sig47, sig63]) + if zip_against: + against_list = runtmp.output('against.zip') g_output = runtmp.output('gather.csv') p_output = runtmp.output('prefetch.csv') @@ -228,7 +253,33 @@ def test_bad_against_3(runtmp, capfd): assert "WARNING: 1 search paths failed to load. See error messages above." in captured.err -def test_against_multisigfile(runtmp): +def test_bad_against_4(runtmp, capfd): + # test with a bad against (a .sig.gz file renamed as zip file) + query = get_test_data('SRR606249.sig.gz') + + sig2 = get_test_data('2.fa.sig.gz') + against_zip = runtmp.output('against.zip') + # cp sig2 into against_zip + with open(against_zip, 'wb') as fp: + with open(sig2, 'rb') as fp2: + fp.write(fp2.read()) + + g_output = runtmp.output('gather.csv') + p_output = runtmp.output('prefetch.csv') + + with pytest.raises(utils.SourmashCommandFailed): + runtmp.sourmash('scripts', 'fastgather', query, against_zip, + '-o', g_output, '--output-prefetch', p_output, + '-s', '100000') + + captured = capfd.readouterr() + print(captured.err) + + assert 'Error: invalid Zip archive: Could not find central directory end' in captured.err + + +@pytest.mark.parametrize('zip_against', [False, True]) +def test_against_multisigfile(runtmp, zip_against): # test against a sigfile that contains multiple sketches query = get_test_data('SRR606249.sig.gz') against_list = runtmp.output('against.txt') @@ -241,6 +292,9 @@ def test_against_multisigfile(runtmp): runtmp.sourmash('sig', 'cat', sig2, sig47, sig63, '-o', combined) make_file_list(against_list, [combined]) + if zip_against: + against_list = zip_siglist(runtmp, against_list, runtmp.output('against.zip')) + g_output = runtmp.output('gather.csv') p_output = runtmp.output('prefetch.csv') @@ -248,11 +302,15 @@ def test_against_multisigfile(runtmp): '-o', g_output, '--output-prefetch', p_output, '-s', '100000') df = pandas.read_csv(g_output) - assert len(df) == 1 + if zip_against: + assert len(df) == 3 + else: + assert len(df) == 1 # @CTB this is a bug :(. It should load multiple sketches properly! -def test_query_multisigfile(runtmp): +@pytest.mark.parametrize('zip_against', [False, True]) +def test_query_multisigfile(runtmp, zip_against): # test with a sigfile that contains multiple sketches against_list = runtmp.output('against.txt') @@ -265,6 +323,9 @@ def test_query_multisigfile(runtmp): make_file_list(against_list, [sig2, sig47, sig63]) + if zip_against: + against_list = zip_siglist(runtmp, against_list, runtmp.output('against.zip')) + g_output = runtmp.output('gather.csv') p_output = runtmp.output('prefetch.csv') @@ -276,7 +337,8 @@ def test_query_multisigfile(runtmp): assert len(df) == 1 -def test_against_nomatch(runtmp, capfd): +@pytest.mark.parametrize('zip_against', [False, True]) +def test_against_nomatch(runtmp, capfd, zip_against): # test with 'against' file containing a non-matching ksize query = get_test_data('SRR606249.sig.gz') against_list = runtmp.output('against.txt') @@ -288,6 +350,9 @@ def test_against_nomatch(runtmp, capfd): make_file_list(against_list, [sig2, sig1, sig47, sig63]) + if zip_against: + against_list = zip_siglist(runtmp, against_list, runtmp.output('against.zip')) + g_output = runtmp.output('gather.csv') p_output = runtmp.output('prefetch.csv') @@ -301,7 +366,8 @@ def test_against_nomatch(runtmp, capfd): assert 'WARNING: skipped 1 search paths - no compatible signatures.' in captured.err -def test_md5s(runtmp): +@pytest.mark.parametrize('zip_against', [False, True]) +def test_md5s(runtmp, zip_against): # check that the correct md5sums (of the original sketches) are in # the output files query = get_test_data('SRR606249.sig.gz') @@ -313,6 +379,9 @@ def test_md5s(runtmp): make_file_list(against_list, [sig2, sig47, sig63]) + if zip_against: + against_list = zip_siglist(runtmp, against_list, runtmp.output('against.zip')) + g_output = runtmp.output('gather.csv') p_output = runtmp.output('prefetch.csv') @@ -351,7 +420,8 @@ def test_md5s(runtmp): assert ss.md5sum() in md5s -def test_csv_columns_vs_sourmash_prefetch(runtmp): +@pytest.mark.parametrize('zip_against', [False, True]) +def test_csv_columns_vs_sourmash_prefetch(runtmp, zip_against): # the column names should be strict subsets of sourmash prefetch cols query = get_test_data('SRR606249.sig.gz') against_list = runtmp.output('against.txt') @@ -362,6 +432,9 @@ def test_csv_columns_vs_sourmash_prefetch(runtmp): make_file_list(against_list, [sig2, sig47, sig63]) + if zip_against: + against_list = zip_siglist(runtmp, against_list, runtmp.output('against.zip')) + g_output = runtmp.output('gather.csv') p_output = runtmp.output('prefetch.csv') @@ -388,7 +461,8 @@ def test_csv_columns_vs_sourmash_prefetch(runtmp): assert not g_keys - sp_keys, g_keys - sp_keys -def test_fastgather_gatherout_as_picklist(runtmp): +@pytest.mark.parametrize('zip_against', [False, True]) +def test_fastgather_gatherout_as_picklist(runtmp, zip_against): # should be able to use fastgather gather output as picklist query = get_test_data('SRR606249.sig.gz') against_list = runtmp.output('against.txt') @@ -399,6 +473,9 @@ def test_fastgather_gatherout_as_picklist(runtmp): make_file_list(against_list, [sig2, sig47, sig63]) + if zip_against: + against_list = zip_siglist(runtmp, against_list, runtmp.output('against.zip')) + g_output = runtmp.output('gather.csv') p_output = runtmp.output('prefetch.csv') @@ -426,7 +503,8 @@ def test_fastgather_gatherout_as_picklist(runtmp): assert picklist_df.equals(full_df) -def test_fastgather_prefetchout_as_picklist(runtmp): +@pytest.mark.parametrize('zip_against', [False, True]) +def test_fastgather_prefetchout_as_picklist(runtmp, zip_against): # should be able to use fastgather prefetch output as picklist query = get_test_data('SRR606249.sig.gz') against_list = runtmp.output('against.txt') @@ -437,6 +515,9 @@ def test_fastgather_prefetchout_as_picklist(runtmp): make_file_list(against_list, [sig2, sig47, sig63]) + if zip_against: + against_list = zip_siglist(runtmp, against_list, runtmp.output('against.zip')) + g_output = runtmp.output('gather.csv') p_output = runtmp.output('prefetch.csv')