From 8d138b33170034911c7fc174525e7565a1d54d09 Mon Sep 17 00:00:00 2001 From: Tessa Pierce Ward Date: Mon, 19 Feb 2024 08:17:02 -0800 Subject: [PATCH] k/3 for manifest to reflect input ksize (#224) --- src/python/tests/test_sketch.py | 44 +++++++++++++++++++++++++++++++++ src/utils.rs | 7 +++++- 2 files changed, 50 insertions(+), 1 deletion(-) diff --git a/src/python/tests/test_sketch.py b/src/python/tests/test_sketch.py index d6bbc5fa..fdbad0b9 100644 --- a/src/python/tests/test_sketch.py +++ b/src/python/tests/test_sketch.py @@ -405,3 +405,47 @@ def test_zip_manifest(runtmp, capfd): assert sig.minhash.ksize == 31 assert sig.minhash.moltype == 'DNA' assert sig.minhash.scaled == 1 + + +def test_protein_zip_manifest(runtmp, capfd): + # test that manifest rows for protein sketches report the input ksize. + fa_csv = runtmp.output('db-fa.csv') + + fa1 = get_test_data('short.fa') + fa2 = get_test_data('short-protein.fa') + + make_file_csv(fa_csv, [fa1], [fa2]) + output = runtmp.output('db.zip') + + runtmp.sourmash('scripts', 'manysketch', fa_csv, '-o', output, + '--param-str', "protein,k=10,scaled=1") + + loader = sourmash.load_file_as_index(output) + + rows = [] + siglist = [] + # make manifest via sourmash python code + for (sig, loc) in loader._signatures_with_internal(): + row = index.CollectionManifest.make_manifest_row(sig, loc) + rows.append(row) + siglist.append(sig) + + manifest = index.CollectionManifest(rows) + + assert len(manifest) == len(rows) + assert len(manifest) == 1 + + md5_list = [ row['md5'] for row in manifest.rows ] + assert 'eb4467d11e0ecd2dbde4193bfc255310' in md5_list + ksize_list = [ row['ksize'] for row in manifest.rows ] + assert 10 in ksize_list # manifest ksizes are human-readable (k, not k*3) + scaled_list = [ row['scaled'] for row in manifest.rows ] + assert 1 in scaled_list + moltype_list = [ row['moltype'] for row in manifest.rows ] + assert "protein" in moltype_list + + for sig in siglist: + assert sig in manifest + assert sig.minhash.ksize == 10 # minhash 
stores k*3, but does the conversion back for us + assert sig.minhash.moltype == 'protein' + assert sig.minhash.scaled == 1 diff --git a/src/utils.rs b/src/utils.rs index 7342af97..db2c84e4 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -787,11 +787,16 @@ pub fn make_manifest_row( "protein".to_string() }; let sketch = &sig.sketches()[0]; + let ksize: u32 = if is_dna { + sketch.ksize() as u32 + } else { + sketch.ksize() as u32 / 3 + }; ManifestRow { internal_location: internal_location.to_string(), md5: sig.md5sum(), md5short: sig.md5sum()[0..8].to_string(), - ksize: sketch.ksize() as u32, + ksize: ksize, moltype, num, scaled,