Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add hp, dayhoff options to manysketch #345

Merged
merged 11 commits into from
Sep 29, 2024
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -124,3 +124,4 @@ This software is under the AGPL license. Please see [LICENSE.txt](LICENSE.txt).
* C. Titus Brown
* Mohamed Abuelanin
* N. Tessa Pierce-Ward
* Olga Botvinnik
29 changes: 29 additions & 0 deletions doc/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,35 @@ sourmash sig summarize fa.zip
The number of sketches per parameter combination should equal the total number of records in all input FASTA.
The `name` column will not be used. Instead, each sketch will be named from the FASTA record name.

#### Protein sketching: hp and dayhoff moltypes

`manysketch` supports all sourmash moltypes: `protein`, `hp`, and `dayhoff`. See also [`sourmash` protein encoding documentation](https://sourmash.readthedocs.io/en/latest/sourmash-sketch.html#protein-encodings) and [`sourmash` parameter documentation](https://sourmash.readthedocs.io/en/latest/sourmash-sketch.html#default-parameters) for more information about what these "moltypes" mean and their default parameters.

`manysketch` does not translate DNA to protein, sorry. You'll need to do that ahead of time.

If you have a `proteins.csv` file which looks like:

```
name,genome_filename,protein_filename
protein1,,protein1.fa
protein2,,protein2.fa
```

You can run:

```
sourmash scripts manysketch proteins.csv -o proteins.zip -p dayhoff,k=16 -p protein,k=10 -p hp,k=42
```
The output will be written to `proteins.zip`

You can check if all signatures were written properly with

```
sourmash sig summarize proteins.zip
```

In this case, three sketches of `protein`, `dayhoff`, and `hp` moltypes were made for each file in `proteins.csv` and saved to `proteins.zip`.

### Running `multisearch` and `pairwise`

The `multisearch` command compares one or more query genomes, and one or more subject genomes. It differs from `manysearch` by loading all genomes into memory.
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ authors = [
{ name="N. Tessa Pierce-Ward", orcid="0000-0002-2942-5331" },
{ name="Luiz Irber", orcid="0000-0003-4371-9659" },
{ name="Mohamed Abuelanin", orcid="0000-0002-3419-4785" },
{ name="Olga Botvinnik", orcid="0000-0003-4412-7970" },
{ name="C. Titus Brown", orcid="0000-0001-6001-2677" },
]

Expand Down
33 changes: 25 additions & 8 deletions src/manysketch.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,10 @@ fn parse_params_str(params_strs: String) -> Result<Vec<Params>, String> {
let mut num = 0;
let mut scaled = 1000;
let mut seed = 42;
let mut is_dna = false;
let mut is_protein = false;
let mut is_dna = true;
let mut is_dayhoff = false;
let mut is_hp = false;

for item in items.iter() {
match *item {
Expand Down Expand Up @@ -52,16 +54,27 @@ fn parse_params_str(params_strs: String) -> Result<Vec<Params>, String> {
}
"protein" => {
is_protein = true;
is_dna = false;
}
"dna" => {
is_protein = false;
is_dna = true;
}
"dayhoff" => {
is_dayhoff = true;
}
"hp" => {
is_hp = true;
}
_ => return Err(format!("unknown component '{}' in params string", item)),
}
}

if !is_dna && !is_protein && !is_dayhoff && !is_hp {
return Err(format!("No moltype provided in params string {}", p_str));
}
if ksizes.is_empty() {
return Err(format!("No ksizes provided in params string {}", p_str));
}

for &k in &ksizes {
let param = Params {
ksize: k,
Expand All @@ -71,6 +84,8 @@ fn parse_params_str(params_strs: String) -> Result<Vec<Params>, String> {
seed,
is_protein,
is_dna,
is_dayhoff,
is_hp,
};
unique_params.insert(param);
}
Expand All @@ -79,19 +94,19 @@ fn parse_params_str(params_strs: String) -> Result<Vec<Params>, String> {
Ok(unique_params.into_iter().collect())
}

fn build_siginfo(params: &[Params], moltype: &str) -> Vec<Signature> {
fn build_siginfo(params: &[Params], input_moltype: &str) -> Vec<Signature> {
let mut sigs = Vec::new();

for param in params.iter().cloned() {
match moltype {
// if dna, only build dna sigs. if protein, only build protein sigs
match input_moltype {
// if dna, only build dna sigs. if protein, only build protein sigs, etc
"dna" | "DNA" if !param.is_dna => continue,
"protein" if !param.is_protein => continue,
"protein" if !param.is_protein && !param.is_dayhoff && !param.is_hp => continue,
_ => (),
}

// Adjust ksize value based on the is_protein flag
let adjusted_ksize = if param.is_protein {
let adjusted_ksize = if param.is_protein || param.is_dayhoff || param.is_hp {
param.ksize * 3
} else {
param.ksize
Expand All @@ -102,6 +117,8 @@ fn build_siginfo(params: &[Params], moltype: &str) -> Vec<Signature> {
.scaled(param.scaled)
.protein(param.is_protein)
.dna(param.is_dna)
.dayhoff(param.is_dayhoff)
.hp(param.is_hp)
.num_hashes(param.num)
.track_abundance(param.track_abundance)
.build();
Expand Down
2 changes: 1 addition & 1 deletion src/python/sourmash_plugin_branchwater/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -364,7 +364,7 @@ def __init__(self, p):
def main(self, args):
print_version()
if not args.param_string:
args.param_string = ["k=31,scaled=1000"]
args.param_string = ["dna,k=31,scaled=1000"]
notify(f"params: {args.param_string}")

# convert to a single string for easier rust handling
Expand Down
98 changes: 96 additions & 2 deletions src/python/tests/test_sketch.py
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,41 @@ def test_manysketch_mult_moltype(runtmp):
assert sig.md5sum() in ["4efeebd26644278e36b9553e018a851a","f85747ac4f473c4a71c1740d009f512b"]


def test_manysketch_mult_moltype_protein(runtmp):
fa_csv = runtmp.output('db-fa.csv')

protfa1 = get_test_data('short-protein.fa')

make_assembly_csv(fa_csv, [], [protfa1])

output = runtmp.output('db.zip')

runtmp.sourmash('scripts', 'manysketch', fa_csv, '-o', output,
'--param-str', "dayhoff,k=10,scaled=1",
'--param-str', "hp,k=24,scaled=1")

assert os.path.exists(output)
assert not runtmp.last_result.out # stdout should be empty

idx = sourmash.load_file_as_index(output)
sigs = list(idx.signatures())
print(sigs)

assert len(sigs) == 2
# check moltypes, etc!
total_checked = 0
for sig in sigs:
print(sig.name)
assert sig.name == "short-protein"
if sig.minhash.dayhoff:
assert sig.md5sum() == "320464775fe704d9f938a8c63d8dd722"
total_checked+=1
elif sig.minhash.hp:
assert sig.md5sum() == "e8ccc6ca7ad560072f51be631d1c39c0"
total_checked+=1
assert total_checked == 2


def test_manysketch_only_incompatible_fastas(runtmp, capfd):
# provide dna, protein fastas, but only sketch protein (skip protein fastas!)
fa_csv = runtmp.output('db-fa.csv')
Expand Down Expand Up @@ -270,6 +305,24 @@ def test_manysketch_bad_fa_csv(runtmp, capfd):


def test_manysketch_bad_fa_csv_2(runtmp, capfd):
# bad file within filelist
siglist = runtmp.output('bad.txt')

# fa_file = runtmp.output("bad.fa")
make_assembly_csv(siglist, ["bad2.fa"])

output = runtmp.output('db.zip')

with pytest.raises(utils.SourmashCommandFailed):
runtmp.sourmash('scripts', 'manysketch', siglist, '-o', output)

captured = capfd.readouterr()
print(captured.err)
assert "Could not load fasta files: no signatures created." in captured.err
assert "Error opening file bad2.fa: ParseError" in captured.err


def test_manysketch_bad_fa_csv_3(runtmp, capfd):
# test sketch with fasta provided instead of fa_csv
output = runtmp.output('out.zip')
fa1 = get_test_data('short.fa')
Expand All @@ -285,7 +338,7 @@ def test_manysketch_bad_fa_csv_2(runtmp, capfd):
assert "Could not load fromfile csv" in captured.err


def test_manysketch_bad_fa_csv_3(runtmp, capfd):
def test_manysketch_bad_fa_csv_4(runtmp, capfd):
# test sketch with improperly formatted fa_csv
fa_csv = runtmp.output('db-fa.csv')

Expand Down Expand Up @@ -318,7 +371,48 @@ def test_manysketch_bad_fa_csv_3(runtmp, capfd):
print(captured.err)
assert 'found record with 2 fields' in captured.err
assert "Could not load fromfile csv" in captured.err



def test_manysketch_bad_param_str_moltype(runtmp, capfd):
# no moltype provided in param str
fa_csv = runtmp.output('db-fa.txt')

fa1 = get_test_data('short.fa')
fa2 = get_test_data('short2.fa')
fa3 = get_test_data('short3.fa')

make_assembly_csv(fa_csv, [fa1, fa2, fa3])
output = runtmp.output('out.zip')

with pytest.raises(utils.SourmashCommandFailed):
runtmp.sourmash('scripts', 'manysketch', fa_csv,
'-o', output, '-p', 'k=31,scaled=100')

captured = capfd.readouterr()
print(captured.err)
assert "Error parsing params string: No moltype provided in params string k=31,scaled=100" in captured.err
assert "Failed to parse params string" in captured.err


def test_manysketch_bad_param_str_ksize(runtmp, capfd):
# no ksize provided in param str
fa_csv = runtmp.output('db-fa.txt')

fa1 = get_test_data('short.fa')
fa2 = get_test_data('short2.fa')
fa3 = get_test_data('short3.fa')

make_assembly_csv(fa_csv, [fa1, fa2, fa3])
output = runtmp.output('out.zip')

with pytest.raises(utils.SourmashCommandFailed):
runtmp.sourmash('scripts', 'manysketch', fa_csv,
'-o', output, '-p', 'dna,scaled=100')

captured = capfd.readouterr()
print(captured.err)
assert "Error parsing params string: No ksizes provided in params string dna,scaled=100" in captured.err
assert "Failed to parse params string" in captured.err

def test_manysketch_empty_fa_csv(runtmp, capfd):
# test empty fa_csv file
Expand Down
7 changes: 5 additions & 2 deletions src/utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ use sourmash::sketch::minhash::KmerMinHash;
use sourmash::storage::{FSStorage, InnerStorage, SigStore};
use stats::{median, stddev};
use std::collections::{HashMap, HashSet};
use std::hash::{Hash, Hasher};
/// Track a name/minhash.

pub struct SmallSignature {
Expand Down Expand Up @@ -1269,10 +1270,10 @@ pub struct Params {
pub scaled: u64,
pub seed: u32,
pub is_protein: bool,
pub is_dayhoff: bool,
pub is_hp: bool,
pub is_dna: bool,
}
use std::hash::Hash;
use std::hash::Hasher;

impl Hash for Params {
fn hash<H: Hasher>(&self, state: &mut H) {
Expand All @@ -1282,6 +1283,8 @@ impl Hash for Params {
self.scaled.hash(state);
self.seed.hash(state);
self.is_protein.hash(state);
self.is_dayhoff.hash(state);
self.is_hp.hash(state);
self.is_dna.hash(state);
}
}
Expand Down
Loading