From 2bd5cfaf368aa797866bf0cc11db8cfc30ca7018 Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Sun, 2 Jun 2024 11:50:57 -0700 Subject: [PATCH 1/8] Add test for hp, dayhoff --- src/python/tests/test_sketch.py | 41 +++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/src/python/tests/test_sketch.py b/src/python/tests/test_sketch.py index bade5ccc..1e4eb68a 100644 --- a/src/python/tests/test_sketch.py +++ b/src/python/tests/test_sketch.py @@ -171,6 +171,47 @@ def test_manysketch_mult_moltype(runtmp): assert sig.minhash.is_dna assert sig.md5sum() in ["4efeebd26644278e36b9553e018a851a","f85747ac4f473c4a71c1740d009f512b"] +def test_manysketch_mult_moltype_protein(runtmp): + fa_csv = runtmp.output('db-fa.csv') + + protfa1 = get_test_data('short-protein.fa') + + make_assembly_csv(fa_csv, [], [protfa1]) + + output = runtmp.output('db.zip') + + runtmp.sourmash('scripts', 'manysketch', fa_csv, '-o', output, + '--param-str', "dayhoff,k=10,scaled=1", + '--param-str', "hp,k=24,scaled=1") + + assert os.path.exists(output) + assert not runtmp.last_result.out # stdout should be empty + + idx = sourmash.load_file_as_index(output) + sigs = list(idx.signatures()) + print(sigs) + + assert len(sigs) == 2 + # check moltypes, etc! + for sig in sigs: + if sig.name == 'short': + if sig.minhash.is_dna: + assert sig.minhash.ksize == 21 + assert sig.minhash.scaled == 1 + assert sig.md5sum() == "1474578c5c46dd09da4c2df29cf86621" + else: + assert sig.name == 'short' + assert sig.minhash.ksize == 10 + assert sig.minhash.scaled == 1 + assert sig.md5sum() == "eb4467d11e0ecd2dbde4193bfc255310" + else: + assert sig.name in ['short', 'short2', 'short3'] + assert sig.minhash.ksize == 21 + assert sig.minhash.scaled == 1 + assert sig.minhash.is_dna + assert sig.md5sum() in ["4efeebd26644278e36b9553e018a851a","f85747ac4f473c4a71c1740d009f512b"] + + def test_manysketch_only_incompatible_fastas(runtmp, capfd): # provide dna, protein fastas, but only sketch protein (skip protein fastas!) From 9fc1e7d2ddda93bec7a8f3ab929cfaa6e55f3e2f Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Sun, 2 Jun 2024 11:51:06 -0700 Subject: [PATCH 2/8] Add hp, dayhoff to params --- src/manysketch.rs | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/src/manysketch.rs b/src/manysketch.rs index 80226035..7e6b4a95 100644 --- a/src/manysketch.rs +++ b/src/manysketch.rs @@ -24,6 +24,8 @@ fn parse_params_str(params_strs: String) -> Result, String> { let mut seed = 42; let mut is_protein = false; let mut is_dna = true; + let mut hp = false; + let mut dayhoff = false; for item in items.iter() { match *item { @@ -53,10 +55,26 @@ fn parse_params_str(params_strs: String) -> Result, String> { "protein" => { is_protein = true; is_dna = false; + hp = false; + dayhoff = false; } "dna" => { is_protein = false; is_dna = true; + hp = false; + dayhoff = false; + } + "hp" => { + is_protein = true; + is_dna = false; + hp = true; + dayhoff = false; + } + "dayhoff" => { + is_protein = true; + is_dna = false; + hp = false; + dayhoff = true; } _ => return Err(format!("unknown component '{}' in params string", item)), } @@ -71,6 +89,8 @@ fn parse_params_str(params_strs: String) -> Result, String> { seed, is_protein, is_dna, + dayhoff, + hp, }; unique_params.insert(param); } @@ -87,6 +107,8 @@ fn build_siginfo(params: &[Params], moltype: &str) -> Vec { // if dna, only build dna sigs. if protein, only build protein sigs "dna" | "DNA" if !param.is_dna => continue, "protein" if !param.is_protein => continue, + "dayhoff" if !param.dayhoff => continue, + "hp" if !param.hp => continue, _ => (), } From 224ac3e20ae3b78bfeac3cb0eea8aa02dee8779f Mon Sep 17 00:00:00 2001 From: Tessa Pierce Ward Date: Sat, 15 Jun 2024 11:41:18 -0700 Subject: [PATCH 3/8] MRG: add `dayhoff`/`hp` sketching improvements from `directsketch` (#358) * add changes from directsketch * ahh, needed dna in default param str * add err msg and tests for error i just encountered --- src/manysketch.rs | 51 +++++------ .../sourmash_plugin_branchwater/__init__.py | 2 +- src/python/tests/test_sketch.py | 91 +++++++++++++++---- src/utils.rs | 7 +- 4 files changed, 101 insertions(+), 50 deletions(-) diff --git a/src/manysketch.rs b/src/manysketch.rs index 7e6b4a95..72613791 100644 --- a/src/manysketch.rs +++ b/src/manysketch.rs @@ -22,10 +22,10 @@ fn parse_params_str(params_strs: String) -> Result, String> { let mut num = 0; let mut scaled = 1000; let mut seed = 42; + let mut is_dna = false; let mut is_protein = false; - let mut is_dna = true; - let mut hp = false; - let mut dayhoff = false; + let mut is_dayhoff = false; + let mut is_hp = false; for item in items.iter() { match *item { @@ -54,32 +54,27 @@ fn parse_params_str(params_strs: String) -> Result, String> { } "protein" => { is_protein = true; - is_dna = false; - hp = false; - dayhoff = false; } "dna" => { - is_protein = false; is_dna = true; - hp = false; - dayhoff = false; - } - "hp" => { - is_protein = true; - is_dna = false; - hp = true; - dayhoff = false; } "dayhoff" => { - is_protein = true; - is_dna = false; - hp = false; - dayhoff = true; + is_dayhoff = true; + } + "hp" => { + is_hp = true; } _ => return Err(format!("unknown component '{}' in params string", item)), } } + if !is_dna && !is_protein && !is_dayhoff && !is_hp { + return Err(format!("No moltype provided in params string {}", p_str)); + } + if ksizes.is_empty() { + return Err(format!("No ksizes provided in params string {}", p_str)); + } + for &k in &ksizes { let param = Params { ksize: k, @@ -89,8 +84,8 @@ fn parse_params_str(params_strs: String) -> Result, String> { seed, is_protein, is_dna, - dayhoff, - hp, + is_dayhoff, + is_hp, }; unique_params.insert(param); } @@ -99,21 +94,19 @@ fn parse_params_str(params_strs: String) -> Result, String> { Ok(unique_params.into_iter().collect()) } -fn build_siginfo(params: &[Params], moltype: &str) -> Vec { +fn build_siginfo(params: &[Params], input_moltype: &str) -> Vec { let mut sigs = Vec::new(); for param in params.iter().cloned() { - match moltype { - // if dna, only build dna sigs. if protein, only build protein sigs + match input_moltype { + // if dna, only build dna sigs. if protein, only build protein sigs, etc "dna" | "DNA" if !param.is_dna => continue, - "protein" if !param.is_protein => continue, - "dayhoff" if !param.dayhoff => continue, - "hp" if !param.hp => continue, + "protein" if !param.is_protein && !param.is_dayhoff && !param.is_hp => continue, _ => (), } // Adjust ksize value based on the is_protein flag - let adjusted_ksize = if param.is_protein { + let adjusted_ksize = if param.is_protein || param.is_dayhoff || param.is_hp { param.ksize * 3 } else { param.ksize @@ -124,6 +117,8 @@ fn build_siginfo(params: &[Params], moltype: &str) -> Vec { .scaled(param.scaled) .protein(param.is_protein) .dna(param.is_dna) + .dayhoff(param.is_dayhoff) + .hp(param.is_hp) .num_hashes(param.num) .track_abundance(param.track_abundance) .build(); diff --git a/src/python/sourmash_plugin_branchwater/__init__.py b/src/python/sourmash_plugin_branchwater/__init__.py index 39dd6b59..6ca29cd0 100755 --- a/src/python/sourmash_plugin_branchwater/__init__.py +++ b/src/python/sourmash_plugin_branchwater/__init__.py @@ -349,7 +349,7 @@ def __init__(self, p): def main(self, args): print_version() if not args.param_string: - args.param_string = ["k=31,scaled=1000"] + args.param_string = ["dna,k=31,scaled=1000"] notify(f"params: {args.param_string}") # convert to a single string for easier rust handling diff --git a/src/python/tests/test_sketch.py b/src/python/tests/test_sketch.py index 1e4eb68a..eabd8087 100644 --- a/src/python/tests/test_sketch.py +++ b/src/python/tests/test_sketch.py @@ -171,6 +171,7 @@ def test_manysketch_mult_moltype(runtmp): assert sig.minhash.is_dna assert sig.md5sum() in ["4efeebd26644278e36b9553e018a851a","f85747ac4f473c4a71c1740d009f512b"] + def test_manysketch_mult_moltype_protein(runtmp): fa_csv = runtmp.output('db-fa.csv') @@ -193,24 +194,17 @@ def test_manysketch_mult_moltype_protein(runtmp): assert len(sigs) == 2 # check moltypes, etc! + total_checked = 0 for sig in sigs: - if sig.name == 'short': - if sig.minhash.is_dna: - assert sig.minhash.ksize == 21 - assert sig.minhash.scaled == 1 - assert sig.md5sum() == "1474578c5c46dd09da4c2df29cf86621" - else: - assert sig.name == 'short' - assert sig.minhash.ksize == 10 - assert sig.minhash.scaled == 1 - assert sig.md5sum() == "eb4467d11e0ecd2dbde4193bfc255310" - else: - assert sig.name in ['short', 'short2', 'short3'] - assert sig.minhash.ksize == 21 - assert sig.minhash.scaled == 1 - assert sig.minhash.is_dna - assert sig.md5sum() in ["4efeebd26644278e36b9553e018a851a","f85747ac4f473c4a71c1740d009f512b"] - + print(sig.name) + assert sig.name == "short-protein" + if sig.minhash.dayhoff: + assert sig.md5sum() == "320464775fe704d9f938a8c63d8dd722" + total_checked+=1 + elif sig.minhash.hp: + assert sig.md5sum() == "e8ccc6ca7ad560072f51be631d1c39c0" + total_checked+=1 + assert total_checked == 2 def test_manysketch_only_incompatible_fastas(runtmp, capfd): @@ -311,6 +305,24 @@ def test_manysketch_bad_fa_csv(runtmp, capfd): def test_manysketch_bad_fa_csv_2(runtmp, capfd): + # bad file within filelist + siglist = runtmp.output('bad.txt') + + # fa_file = runtmp.output("bad.fa") + make_assembly_csv(siglist, ["bad2.fa"]) + + output = runtmp.output('db.zip') + + with pytest.raises(utils.SourmashCommandFailed): + runtmp.sourmash('scripts', 'manysketch', siglist, '-o', output) + + captured = capfd.readouterr() + print(captured.err) + assert "Could not load fasta files: no signatures created." in captured.err + assert "Error opening file bad2.fa: ParseError" in captured.err + + +def test_manysketch_bad_fa_csv_3(runtmp, capfd): # test sketch with fasta provided instead of fa_csv output = runtmp.output('out.zip') fa1 = get_test_data('short.fa') @@ -326,7 +338,7 @@ def test_manysketch_bad_fa_csv_2(runtmp, capfd): assert "Could not load fromfile csv" in captured.err -def test_manysketch_bad_fa_csv_3(runtmp, capfd): +def test_manysketch_bad_fa_csv_4(runtmp, capfd): # test sketch with improperly formatted fa_csv fa_csv = runtmp.output('db-fa.csv') @@ -359,7 +371,48 @@ def test_manysketch_bad_fa_csv_3(runtmp, capfd): print(captured.err) assert 'found record with 2 fields' in captured.err assert "Could not load fromfile csv" in captured.err - + + +def test_manysketch_bad_param_str_moltype(runtmp, capfd): + # no moltype provided in param str + fa_csv = runtmp.output('db-fa.txt') + + fa1 = get_test_data('short.fa') + fa2 = get_test_data('short2.fa') + fa3 = get_test_data('short3.fa') + + make_assembly_csv(fa_csv, [fa1, fa2, fa3]) + output = runtmp.output('out.zip') + + with pytest.raises(utils.SourmashCommandFailed): + runtmp.sourmash('scripts', 'manysketch', fa_csv, + '-o', output, '-p', 'k=31,scaled=100') + + captured = capfd.readouterr() + print(captured.err) + assert "Error parsing params string: No moltype provided in params string k=31,scaled=100" in captured.err + assert "Failed to parse params string" in captured.err + + +def test_manysketch_bad_param_str_ksize(runtmp, capfd): + # no ksize provided in param str + fa_csv = runtmp.output('db-fa.txt') + + fa1 = get_test_data('short.fa') + fa2 = get_test_data('short2.fa') + fa3 = get_test_data('short3.fa') + + make_assembly_csv(fa_csv, [fa1, fa2, fa3]) + output = runtmp.output('out.zip') + + with pytest.raises(utils.SourmashCommandFailed): + runtmp.sourmash('scripts', 'manysketch', fa_csv, + '-o', output, '-p', 'dna,scaled=100') + + captured = capfd.readouterr() + print(captured.err) + assert "Error parsing params string: No ksizes provided in params string dna,scaled=100" in captured.err + assert "Failed to parse params string" in captured.err def test_manysketch_empty_fa_csv(runtmp, capfd): # test empty fa_csv file diff --git a/src/utils.rs b/src/utils.rs index 67eeaa76..30d8ba3e 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -26,6 +26,7 @@ use sourmash::sketch::minhash::KmerMinHash; use sourmash::storage::{FSStorage, InnerStorage, SigStore}; use stats::{median, stddev}; use std::collections::{HashMap, HashSet}; +use std::hash::{Hash, Hasher}; /// Track a name/minhash. pub struct SmallSignature { @@ -1247,10 +1248,10 @@ pub struct Params { pub scaled: u64, pub seed: u32, pub is_protein: bool, + pub is_dayhoff: bool, + pub is_hp: bool, pub is_dna: bool, } -use std::hash::Hash; -use std::hash::Hasher; impl Hash for Params { fn hash(&self, state: &mut H) { @@ -1260,6 +1261,8 @@ impl Hash for Params { self.scaled.hash(state); self.seed.hash(state); self.is_protein.hash(state); + self.is_dayhoff.hash(state); + self.is_hp.hash(state); self.is_dna.hash(state); } } From 1e0fce0f42f506eb04448a8bf5ec9b1e34185d44 Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Wed, 3 Jul 2024 08:27:21 -0700 Subject: [PATCH 4/8] Add documentation about protein sketching with hp, dayhoff --- doc/README.md | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/doc/README.md b/doc/README.md index 6fdb1683..186730f8 100644 --- a/doc/README.md +++ b/doc/README.md @@ -158,6 +158,27 @@ sourmash sig summarize fa.zip The number of sketches per parameter combination should equal the total number of records in all input FASTA. The `name` column will not be used. Instead, each sketch will be named from the FASTA record name. +#### Protein sketching: hp and dayhoff moltypes + +`manysketch` supports all sourmash moltypes: `protein`, `hp`, and `dayhoff`. See also [`sourmash` protein encoding documentation](https://sourmash.readthedocs.io/en/latest/sourmash-sketch.html#protein-encodings) and [`sourmash` parameter documentation](https://sourmash.readthedocs.io/en/latest/sourmash-sketch.html#default-parameters) for more information about what these "moltypes" mean and their default parameters. + +`manysketch` does not translate DNA to protein, sorry. You'll need to do that ahead of time. + +You can run: + +``` +sourmash scripts manysketch proteins.csv -o proteins.zip -p dayhoff,k=16 -p protein,k=10 -p hp,k=42 +``` +The output will be written to `proteins.zip` + +You can check if all signatures were written properly with + +``` +sourmash sig summarize proteins.zip +``` + +In this case, three sketches of `protein`, `dayhoff`, and `hp` moltypes were made in `proteins.zip`. + ### Running `multisearch` and `pairwise` The `multisearch` command compares one or more query genomes, and one or more subject genomes. It differs from `manysearch` by loading all genomes into memory. From 8186b7cf91a27de12e072c22afb10d4b9de89181 Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Wed, 3 Jul 2024 08:35:19 -0700 Subject: [PATCH 5/8] Add Olga to authors on README --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 3939bf53..e3fd242e 100644 --- a/README.md +++ b/README.md @@ -124,3 +124,4 @@ This software is under the AGPL license. Please see [LICENSE.txt](LICENSE.txt). * C. Titus Brown * Mohamed Abuelanin * N. Tessa Pierce-Ward +* Olga Botvinnik \ No newline at end of file From bfb773c514ce12bd99c72f348323683d1c610409 Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Wed, 3 Jul 2024 08:35:40 -0700 Subject: [PATCH 6/8] Add example `proteins.csv` --- doc/README.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/doc/README.md b/doc/README.md index 186730f8..f85324fd 100644 --- a/doc/README.md +++ b/doc/README.md @@ -164,6 +164,14 @@ The `name` column will not be used. Instead, each sketch will be named from the `manysketch` does not translate DNA to protein, sorry. You'll need to do that ahead of time. +If you have a `proteins.csv` file which looks like: + +``` +name,genome_filename,protein_filename +protein1,,protein1.fa +protein2,,protein2.fa +``` + You can run: ``` From 90396182ed52c89964c380a2579596972b0536e2 Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Wed, 3 Jul 2024 08:35:49 -0700 Subject: [PATCH 7/8] Add Olga to authors on pyproject.toml --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 9598ed9f..9f1332c7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,6 +14,7 @@ authors = [ { name="N. Tessa Pierce-Ward", orcid="0000-0002-2942-5331" }, { name="Luiz Irber", orcid="0000-0003-4371-9659" }, { name="Mohamed Abuelanin", orcid="0000-0002-3419-4785" }, + { name="Olga Botvinnik", orcid="0000-0003-4412-7970" }, { name="C. Titus Brown", orcid="0000-0001-6001-2677" }, ] From cc4abb4d9f63d68b5834abd73c63e208a34ea00f Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Sat, 28 Sep 2024 12:16:37 -0700 Subject: [PATCH 8/8] Update doc/README.md Co-authored-by: C. Titus Brown --- doc/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/README.md b/doc/README.md index 6ae5b9bc..7081cf46 100644 --- a/doc/README.md +++ b/doc/README.md @@ -185,7 +185,7 @@ You can check if all signatures were written properly with sourmash sig summarize proteins.zip ``` -In this case, three sketches of `protein`, `dayhoff`, and `hp` moltypes were made in `proteins.zip`. +In this case, three sketches of `protein`, `dayhoff`, and `hp` moltypes were made for each file in `proteins.csv` and saved to `proteins.zip`. ### Running `multisearch` and `pairwise`