Skip to content

Commit

Permalink
MRG: upgrade sig overlap and sig subtract to load more than JSON …
Browse files Browse the repository at this point in the history
…signatures (#3153)

Fix `sig overlap` and `sig subtract` to take more than just JSON
signatures.

Also adds a function `sourmash_args.load_one_signature` that I think
should (eventually) replace the now-deprecated
`sourmash.signature.load_one_signature`. This will be the topic of a new
PR - for now, I think it's a nice quick fix!

Fixes #3136

Related issues:
* #1062 - will do another
PR to close this issue
* #1877
* #1312
* #1060

TODO:
- [x] test uncovered code
- [x] do a bit more of a search and digest of related issues to see if
there's other low hanging fruit

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
  • Loading branch information
ctb and pre-commit-ci[bot] authored Jun 4, 2024
1 parent fabe76a commit 2542c69
Show file tree
Hide file tree
Showing 6 changed files with 127 additions and 22 deletions.
16 changes: 8 additions & 8 deletions src/sourmash/sig/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -384,10 +384,10 @@ def overlap(args):

moltype = sourmash_args.calculate_moltype(args)

sig1 = sourmash.load_one_signature(
sig1 = sourmash_args.load_one_signature(
args.signature1, ksize=args.ksize, select_moltype=moltype
)
sig2 = sourmash.load_one_signature(
sig2 = sourmash_args.load_one_signature(
args.signature2, ksize=args.ksize, select_moltype=moltype
)

Expand Down Expand Up @@ -573,7 +573,7 @@ def intersect(args):
# borrow abundances from a signature?
if args.abundances_from:
notify(f"loading signature from {args.abundances_from}, keeping abundances")
abund_sig = sourmash.load_one_signature(
abund_sig = sourmash_args.load_one_signature(
args.abundances_from, ksize=args.ksize, select_moltype=moltype
)
if not abund_sig.minhash.track_abundance:
Expand Down Expand Up @@ -646,9 +646,8 @@ def subtract(args):
set_quiet(args.quiet)
moltype = sourmash_args.calculate_moltype(args)

from_sigfile = args.signature_from
from_sigobj = sourmash.load_one_signature(
from_sigfile, ksize=args.ksize, select_moltype=moltype
from_sigobj = sourmash_args.load_one_signature(
args.signature_from, ksize=args.ksize, select_moltype=moltype
)

if args.abundances_from: # it's ok to work with abund signatures if -A.
Expand All @@ -661,7 +660,7 @@ def subtract(args):

subtract_mins = set(from_mh.hashes)

notify(f"loaded signature from {from_sigfile}...", end="\r")
notify(f"loaded signature from {args.signature_from}...", end="\r")

progress = sourmash_args.SignatureLoadingProgress()

Expand Down Expand Up @@ -694,9 +693,10 @@ def subtract(args):
# borrow abundances from somewhere?
if args.abundances_from:
notify(f"loading signature from {args.abundances_from}, keeping abundances")
abund_sig = sourmash.load_one_signature(
abund_sig = sourmash_args.load_one_signature(
args.abundances_from, ksize=args.ksize, select_moltype=moltype
)

if not abund_sig.minhash.track_abundance:
error("--track-abundance not set on loaded signature?! exiting.")
sys.exit(-1)
Expand Down
37 changes: 37 additions & 0 deletions src/sourmash/sourmash_args.py
Original file line number Diff line number Diff line change
Expand Up @@ -810,3 +810,40 @@ def load_file_as_signatures(
return progress.start_file(filename, loader)
else:
return loader


def load_one_signature(
    filename,
    *,
    select_moltype=None,
    ksize=None,
    picklist=None,
    yield_all_files=False,
    pattern=None,
):
    """Load exactly one signature from 'filename' and return it.

    The file is opened with `_load_database`, narrowed with
    `select(moltype=select_moltype, ksize=ksize)`, and then filtered with
    `apply_picklist_and_pattern(db, picklist, pattern)`.

    'yield_all_files' is passed straight through to `_load_database`.

    Raises ValueError if the selection yields no signatures, or if it
    yields more than one.
    """
    db = _load_database(filename, yield_all_files)
    db = db.select(moltype=select_moltype, ksize=ksize)

    # apply pattern search & picklist
    db = apply_picklist_and_pattern(db, picklist, pattern)

    # Build the iterator exactly once. Calling iter() separately for each
    # next() only works when signatures() returns a one-shot iterator; for a
    # list or other re-iterable it would hand back the first item twice and
    # wrongly report "more than one signature".
    sig_iter = iter(db.signatures())

    # load exactly one!
    try:
        ss = next(sig_iter)
    except StopIteration:
        raise ValueError(
            f"no signatures in '{filename}'? expected exactly one."
        ) from None

    # make sure there's not a second one...
    try:
        next(sig_iter)
    except StopIteration:
        return ss

    raise ValueError(f"more than one signature in '{filename}'; expected exactly one")
Binary file added tests/test-data/47.fa.sig.zip
Binary file not shown.
Binary file added tests/test-data/63.fa.sig.zip
Binary file not shown.
68 changes: 54 additions & 14 deletions tests/test_cmd_signature.py
Original file line number Diff line number Diff line change
Expand Up @@ -769,8 +769,8 @@ def test_sig_inflate_5_bad_moltype(runtmp):
assert "no signatures to inflate" in runtmp.last_result.err


@utils.in_tempdir
def test_sig_subtract_1(c):
def test_sig_subtract_1(runtmp):
c = runtmp
# subtract of 63 from 47
sig47 = utils.get_test_data("47.fa.sig")
sig63 = utils.get_test_data("63.fa.sig")
Expand All @@ -789,6 +789,28 @@ def test_sig_subtract_1(c):
assert set(actual_subtract_sig.minhash.hashes.keys()) == set(mins)


def test_sig_subtract_1_sigzip(runtmp):
    """Subtract 63 from 47 with both inputs given as .sig.zip files."""
    from sourmash import sourmash_args

    zip47 = utils.get_test_data("47.fa.sig.zip")
    zip63 = utils.get_test_data("63.fa.sig.zip")
    runtmp.run_sourmash("sig", "subtract", zip47, zip63)

    # the new signature is written to stdout
    result_sig = sourmash.load_one_signature(runtmp.last_result.out)

    # recompute the expected hash set directly from the two inputs
    hashes47 = set(sourmash_args.load_one_signature(zip47).minhash.hashes.keys())
    hashes63 = set(sourmash_args.load_one_signature(zip63).minhash.hashes.keys())
    expected = hashes47 - hashes63

    assert set(result_sig.minhash.hashes.keys()) == expected


def test_sig_subtract_1_abund(runtmp):
# subtract 63 from 47, with abundances borrowed from 47

Expand Down Expand Up @@ -856,8 +878,8 @@ def test_sig_subtract_1_flatten(runtmp):
assert set(actual_subtract_sig.minhash.hashes.keys()) == set(mins)


@utils.in_tempdir
def test_sig_subtract_1_multisig(c):
def test_sig_subtract_1_multisig(runtmp):
c = runtmp
# subtract of everything from 47
sig47 = utils.get_test_data("47.fa.sig")
multisig = utils.get_test_data("47+63-multisig.sig")
Expand All @@ -871,8 +893,8 @@ def test_sig_subtract_1_multisig(c):
assert not set(actual_subtract_sig.minhash.hashes.keys())


@utils.in_tempdir
def test_sig_subtract_2(c):
def test_sig_subtract_2(runtmp):
c = runtmp
# subtract of 63 from 47 should fail if 47 has abund
sig47 = utils.get_test_data("track_abund/47.fa.sig")
sig63 = utils.get_test_data("63.fa.sig")
Expand All @@ -881,8 +903,8 @@ def test_sig_subtract_2(c):
c.run_sourmash("sig", "subtract", sig47, sig63)


@utils.in_tempdir
def test_sig_subtract_3(c):
def test_sig_subtract_3(runtmp):
c = runtmp
# subtract of 63 from 47 should fail if 63 has abund
sig47 = utils.get_test_data("47.fa.sig")
sig63 = utils.get_test_data("track_abund/63.fa.sig")
Expand All @@ -891,8 +913,8 @@ def test_sig_subtract_3(c):
c.run_sourmash("sig", "subtract", sig47, sig63)


@utils.in_tempdir
def test_sig_subtract_4_ksize_fail(c):
def test_sig_subtract_4_ksize_fail(runtmp):
c = runtmp
# subtract of 2 from 47 should fail without -k specified
sig47 = utils.get_test_data("47.fa.sig")
sig2 = utils.get_test_data("2.fa.sig")
Expand All @@ -901,8 +923,8 @@ def test_sig_subtract_4_ksize_fail(c):
c.run_sourmash("sig", "subtract", sig47, sig2)


@utils.in_tempdir
def test_sig_subtract_4_ksize_succeed(c):
def test_sig_subtract_4_ksize_succeed(runtmp):
c = runtmp
# subtract of 2 from 47 should succeed when -k is specified
sig47 = utils.get_test_data("47.fa.sig")
sig2 = utils.get_test_data("2.fa.sig")
Expand Down Expand Up @@ -3839,8 +3861,8 @@ def test_sig_describe_3_manifest_fails_when_moved(runtmp):
runtmp.sourmash("sig", "describe", "mf.csv")


@utils.in_tempdir
def test_sig_overlap(c):
def test_sig_overlap(runtmp):
c = runtmp
# get overlap details
sig47 = utils.get_test_data("47.fa.sig")
sig63 = utils.get_test_data("63.fa.sig")
Expand All @@ -3857,6 +3879,24 @@ def test_sig_overlap(c):
assert "number of hashes in common: 2529" in out


def test_sig_overlap_2(runtmp):
    """Run 'sig overlap' on two .sig.zip inputs and check the report."""
    zip47 = utils.get_test_data("47.fa.sig.zip")
    zip63 = utils.get_test_data("63.fa.sig.zip")
    runtmp.run_sourmash("sig", "overlap", zip47, zip63)

    out = runtmp.last_result.out
    print(out)

    expected_fragments = (
        # md5s
        "09a08691ce52952152f0e866a59f6261",
        "38729c6374925585db28916b82a6f513",
        # overlap details
        "similarity: 0.32069",
        "number of hashes in common: 2529",
    )
    for fragment in expected_fragments:
        assert fragment in out


@utils.in_tempdir
def test_import_export_1(c):
# check to make sure we can import what we've exported!
Expand Down
28 changes: 28 additions & 0 deletions tests/test_sourmash_args.py
Original file line number Diff line number Diff line change
Expand Up @@ -841,3 +841,31 @@ def test_bug_2370(runtmp):
# try running sourmash_args.load_file_as_index
# runtmp.sourmash('sig', 'describe', runtmp.output('not_really_gzipped.gz'))
sourmash_args.load_file_as_index(runtmp.output("not_really_gzipped.gz"))


def test_load_one_signature_1(runtmp):
    """sourmash_args.load_one_signature loads the single k=31 sketch from a zip."""
    zip63 = utils.get_test_data("63.fa.sig.zip")
    loaded = sourmash_args.load_one_signature(zip63, ksize=31)

    name = loaded.name
    assert name.startswith("NC_011663.1 ")


def test_load_one_signature_2_fail(runtmp):
    """sourmash_args.load_one_signature raises ValueError when nothing matches."""
    # test the sourmash_args.load_one_signature function on failure - no sig
    sigfile = utils.get_test_data("63.fa.sig.zip")

    with pytest.raises(ValueError) as exc:
        sourmash_args.load_one_signature(sigfile, ksize=21)

    # inspect the raised exception itself, not the ExceptionInfo wrapper
    assert "expected exactly one." in str(exc.value)


def test_load_one_signature_3_fail(runtmp):
    """sourmash_args.load_one_signature raises ValueError on multiple signatures."""
    # test the sourmash_args.load_one_signature function on failure - many sigs
    sigfile = utils.get_test_data("prot/all.zip")

    with pytest.raises(ValueError) as exc:
        sourmash_args.load_one_signature(sigfile)

    # inspect the raised exception itself, not the ExceptionInfo wrapper
    assert "more than one signature" in str(exc.value)

0 comments on commit 2542c69

Please sign in to comment.