Skip to content

Commit

Permalink
add tests for zipped search sigs
Browse files Browse the repository at this point in the history
  • Loading branch information
bluegenes committed Sep 14, 2023
1 parent 41ee458 commit b7337e2
Showing 1 changed file with 95 additions and 14 deletions.
109 changes: 95 additions & 14 deletions src/python/tests/test_gather.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,14 +17,21 @@ def make_file_list(filename, paths):
fp.write("\n")


def zip_siglist(runtmp, siglist, db):
    """Run ``sourmash sig cat <siglist> -o <db>`` and return ``db``.

    ``runtmp`` is the test runner fixture whose ``sourmash`` method executes
    the command. The tests in this file pass a ``.zip`` path as ``db`` so the
    returned value can be used directly as the 'against' argument.
    """
    cat_args = ('sig', 'cat', siglist, '-o', db)
    runtmp.sourmash(*cat_args)
    return db


def test_installed(runtmp):
    """``fastgather`` invoked without arguments fails and reports its usage."""
    # No query/against arguments are supplied, so the command is expected
    # to fail.
    with pytest.raises(utils.SourmashCommandFailed):
        runtmp.sourmash('scripts', 'fastgather')

    stderr = runtmp.last_result.err
    assert 'usage: fastgather' in stderr


def test_simple(runtmp):
@pytest.mark.parametrize('zip_against', [False, True])
def test_simple(runtmp, zip_against):
# test basic execution!
query = get_test_data('SRR606249.sig.gz')
against_list = runtmp.output('against.txt')
Expand All @@ -35,6 +42,9 @@ def test_simple(runtmp):

make_file_list(against_list, [sig2, sig47, sig63])

if zip_against:
against_list = zip_siglist(runtmp, against_list, runtmp.output('against.zip'))

g_output = runtmp.output('gather.csv')
p_output = runtmp.output('prefetch.csv')

Expand All @@ -48,7 +58,9 @@ def test_simple(runtmp):
assert keys == {'query_filename', 'query_name', 'query_md5', 'match_name', 'match_md5', 'rank', 'intersect_bp'}


def test_simple_with_prefetch(runtmp):

@pytest.mark.parametrize('zip_against', [False, True])
def test_simple_with_prefetch(runtmp, zip_against):
# test basic execution!
query = get_test_data('SRR606249.sig.gz')
against_list = runtmp.output('against.txt')
Expand All @@ -59,6 +71,9 @@ def test_simple_with_prefetch(runtmp):

make_file_list(against_list, [sig2, sig47, sig63])

if zip_against:
against_list = zip_siglist(runtmp, against_list, runtmp.output('against.zip'))

g_output = runtmp.output('gather.csv')
p_output = runtmp.output('prefetch.csv')

Expand All @@ -79,7 +94,8 @@ def test_simple_with_prefetch(runtmp):
assert keys == {'query_filename', 'query_name', 'query_md5', 'match_name', 'match_md5', 'intersect_bp'}


def test_missing_query(runtmp, capfd):
@pytest.mark.parametrize('zip_against', [False, True])
def test_missing_query(runtmp, capfd, zip_against):
# test missing query
query = runtmp.output('no-such-file')
against_list = runtmp.output('against.txt')
Expand All @@ -90,6 +106,9 @@ def test_missing_query(runtmp, capfd):

make_file_list(against_list, [sig2, sig47, sig63])

if zip_against:
against_list = zip_siglist(runtmp, against_list, runtmp.output('against.zip'))

g_output = runtmp.output('gather.csv')
p_output = runtmp.output('prefetch.csv')

Expand All @@ -103,8 +122,8 @@ def test_missing_query(runtmp, capfd):

assert 'Error: No such file or directory ' in captured.err


def test_bad_query(runtmp, capfd):
@pytest.mark.parametrize('zip_against', [False, True])
def test_bad_query(runtmp, capfd, zip_against):
# test non-sig query
query = runtmp.output('no-such-file')
against_list = runtmp.output('against.txt')
Expand All @@ -118,6 +137,9 @@ def test_bad_query(runtmp, capfd):

make_file_list(against_list, [sig2, sig47, sig63])

if zip_against:
against_list = zip_siglist(runtmp, against_list, runtmp.output('against.zip'))

g_output = runtmp.output('gather.csv')
p_output = runtmp.output('prefetch.csv')

Expand All @@ -132,7 +154,8 @@ def test_bad_query(runtmp, capfd):
assert 'Error: expected value at line 1' in captured.err


def test_missing_against(runtmp, capfd):
@pytest.mark.parametrize('zip_against', [False, True])
def test_missing_against(runtmp, capfd, zip_against):
# test missing against
query = get_test_data('SRR606249.sig.gz')
against_list = runtmp.output('against.txt')
Expand All @@ -142,6 +165,8 @@ def test_missing_against(runtmp, capfd):
sig63 = get_test_data('63.fa.sig.gz')

#make_file_list(against_list, [sig2, sig47, sig63])
if zip_against:
against_list = runtmp.output('against.zip')

g_output = runtmp.output('gather.csv')
p_output = runtmp.output('prefetch.csv')
Expand Down Expand Up @@ -228,7 +253,33 @@ def test_bad_against_3(runtmp, capfd):
assert "WARNING: 1 search paths failed to load. See error messages above." in captured.err


def test_against_multisigfile(runtmp):
def test_bad_against_4(runtmp, capfd):
    """A ``.sig.gz`` file that is merely named ``.zip`` must be rejected.

    The 'against' file is a byte-for-byte copy of a gzipped signature saved
    under a ``.zip`` name; ``fastgather`` is expected to fail with an
    invalid-zip-archive error on stderr.
    """
    query = get_test_data('SRR606249.sig.gz')
    sig2 = get_test_data('2.fa.sig.gz')

    # copy the raw bytes of sig2 to a path that ends in .zip
    against_zip = runtmp.output('against.zip')
    with open(against_zip, 'wb') as dst, open(sig2, 'rb') as src:
        dst.write(src.read())

    g_output = runtmp.output('gather.csv')
    p_output = runtmp.output('prefetch.csv')

    fastgather_args = (
        'scripts', 'fastgather', query, against_zip,
        '-o', g_output, '--output-prefetch', p_output,
        '-s', '100000',
    )
    with pytest.raises(utils.SourmashCommandFailed):
        runtmp.sourmash(*fastgather_args)

    stderr = capfd.readouterr().err
    print(stderr)

    expected = 'Error: invalid Zip archive: Could not find central directory end'
    assert expected in stderr


@pytest.mark.parametrize('zip_against', [False, True])
def test_against_multisigfile(runtmp, zip_against):
# test against a sigfile that contains multiple sketches
query = get_test_data('SRR606249.sig.gz')
against_list = runtmp.output('against.txt')
Expand All @@ -241,18 +292,25 @@ def test_against_multisigfile(runtmp):
runtmp.sourmash('sig', 'cat', sig2, sig47, sig63, '-o', combined)
make_file_list(against_list, [combined])

if zip_against:
against_list = zip_siglist(runtmp, against_list, runtmp.output('against.zip'))

g_output = runtmp.output('gather.csv')
p_output = runtmp.output('prefetch.csv')

runtmp.sourmash('scripts', 'fastgather', query, against_list,
'-o', g_output, '--output-prefetch', p_output,
'-s', '100000')
df = pandas.read_csv(g_output)
assert len(df) == 1
if zip_against:
assert len(df) == 3
else:
assert len(df) == 1
# @CTB this is a bug :(. It should load multiple sketches properly!


def test_query_multisigfile(runtmp):
@pytest.mark.parametrize('zip_against', [False, True])
def test_query_multisigfile(runtmp, zip_against):
# test with a sigfile that contains multiple sketches
against_list = runtmp.output('against.txt')

Expand All @@ -265,6 +323,9 @@ def test_query_multisigfile(runtmp):

make_file_list(against_list, [sig2, sig47, sig63])

if zip_against:
against_list = zip_siglist(runtmp, against_list, runtmp.output('against.zip'))

g_output = runtmp.output('gather.csv')
p_output = runtmp.output('prefetch.csv')

Expand All @@ -276,7 +337,8 @@ def test_query_multisigfile(runtmp):
assert len(df) == 1


def test_against_nomatch(runtmp, capfd):
@pytest.mark.parametrize('zip_against', [False, True])
def test_against_nomatch(runtmp, capfd, zip_against):
# test with 'against' file containing a non-matching ksize
query = get_test_data('SRR606249.sig.gz')
against_list = runtmp.output('against.txt')
Expand All @@ -288,6 +350,9 @@ def test_against_nomatch(runtmp, capfd):

make_file_list(against_list, [sig2, sig1, sig47, sig63])

if zip_against:
against_list = zip_siglist(runtmp, against_list, runtmp.output('against.zip'))

g_output = runtmp.output('gather.csv')
p_output = runtmp.output('prefetch.csv')

Expand All @@ -301,7 +366,8 @@ def test_against_nomatch(runtmp, capfd):
assert 'WARNING: skipped 1 search paths - no compatible signatures.' in captured.err


def test_md5s(runtmp):
@pytest.mark.parametrize('zip_against', [False, True])
def test_md5s(runtmp, zip_against):
# check that the correct md5sums (of the original sketches) are in
# the output files
query = get_test_data('SRR606249.sig.gz')
Expand All @@ -313,6 +379,9 @@ def test_md5s(runtmp):

make_file_list(against_list, [sig2, sig47, sig63])

if zip_against:
against_list = zip_siglist(runtmp, against_list, runtmp.output('against.zip'))

g_output = runtmp.output('gather.csv')
p_output = runtmp.output('prefetch.csv')

Expand Down Expand Up @@ -351,7 +420,8 @@ def test_md5s(runtmp):
assert ss.md5sum() in md5s


def test_csv_columns_vs_sourmash_prefetch(runtmp):
@pytest.mark.parametrize('zip_against', [False, True])
def test_csv_columns_vs_sourmash_prefetch(runtmp, zip_against):
# the column names should be strict subsets of sourmash prefetch cols
query = get_test_data('SRR606249.sig.gz')
against_list = runtmp.output('against.txt')
Expand All @@ -362,6 +432,9 @@ def test_csv_columns_vs_sourmash_prefetch(runtmp):

make_file_list(against_list, [sig2, sig47, sig63])

if zip_against:
against_list = zip_siglist(runtmp, against_list, runtmp.output('against.zip'))

g_output = runtmp.output('gather.csv')
p_output = runtmp.output('prefetch.csv')

Expand All @@ -388,7 +461,8 @@ def test_csv_columns_vs_sourmash_prefetch(runtmp):
assert not g_keys - sp_keys, g_keys - sp_keys


def test_fastgather_gatherout_as_picklist(runtmp):
@pytest.mark.parametrize('zip_against', [False, True])
def test_fastgather_gatherout_as_picklist(runtmp, zip_against):
# should be able to use fastgather gather output as picklist
query = get_test_data('SRR606249.sig.gz')
against_list = runtmp.output('against.txt')
Expand All @@ -399,6 +473,9 @@ def test_fastgather_gatherout_as_picklist(runtmp):

make_file_list(against_list, [sig2, sig47, sig63])

if zip_against:
against_list = zip_siglist(runtmp, against_list, runtmp.output('against.zip'))

g_output = runtmp.output('gather.csv')
p_output = runtmp.output('prefetch.csv')

Expand Down Expand Up @@ -426,7 +503,8 @@ def test_fastgather_gatherout_as_picklist(runtmp):
assert picklist_df.equals(full_df)


def test_fastgather_prefetchout_as_picklist(runtmp):
@pytest.mark.parametrize('zip_against', [False, True])
def test_fastgather_prefetchout_as_picklist(runtmp, zip_against):
# should be able to use fastgather prefetch output as picklist
query = get_test_data('SRR606249.sig.gz')
against_list = runtmp.output('against.txt')
Expand All @@ -437,6 +515,9 @@ def test_fastgather_prefetchout_as_picklist(runtmp):

make_file_list(against_list, [sig2, sig47, sig63])

if zip_against:
against_list = zip_siglist(runtmp, against_list, runtmp.output('against.zip'))

g_output = runtmp.output('gather.csv')
p_output = runtmp.output('prefetch.csv')

Expand Down

0 comments on commit b7337e2

Please sign in to comment.