Skip to content

Commit

Permalink
add skipmer tests
Browse files Browse the repository at this point in the history
  • Loading branch information
bluegenes committed Dec 17, 2024
1 parent 6190609 commit 927916c
Show file tree
Hide file tree
Showing 6 changed files with 211 additions and 13 deletions.
15 changes: 8 additions & 7 deletions src/directsketch.rs
Original file line number Diff line number Diff line change
Expand Up @@ -952,7 +952,7 @@ pub async fn gbsketch(
}
};
// Check if we have dna signature templates and not keep_fastas
if sig_templates.dna_size()? == 0 && !keep_fastas {
if sig_templates.anydna_size()? == 0 && !keep_fastas {
eprintln!("No DNA signature templates provided, and --keep-fasta is not set.");
proteomes_only = true;
}
Expand Down Expand Up @@ -1084,6 +1084,8 @@ pub async fn urlsketch(
retry_times: u32,
fasta_location: String,
keep_fastas: bool,
genomes_only: bool,
proteomes_only: bool,
download_only: bool,
batch_size: u32,
n_permits: usize,
Expand Down Expand Up @@ -1174,10 +1176,9 @@ pub async fn urlsketch(
bail!("No accessions to download and sketch.")
}

// todo: add genomes_only / proteomes_only to the input options
let mut sig_templates = BuildCollection::new();
let mut genomes_only = false;
let mut proteomes_only = false;
let mut genomes_only = genomes_only;
let mut proteomes_only = proteomes_only;

if download_only {
if genomes_only {
Expand All @@ -1194,7 +1195,7 @@ pub async fn urlsketch(
}
};
// Check if we have dna signature templates and not keep_fastas
if sig_templates.dna_size()? == 0 && !keep_fastas {
if sig_templates.anydna_size()? == 0 && !keep_fastas {
eprintln!("No DNA signature templates provided, and --keep-fasta is not set.");
proteomes_only = true;
}
Expand All @@ -1205,12 +1206,12 @@ pub async fn urlsketch(
}
if genomes_only {
// select only DNA templates
let multiselection = MultiSelection::from_moltypes(vec!["dna"])?;
let multiselection = MultiSelection::from_input_moltype("DNA")?;
sig_templates.select(&multiselection)?;
eprintln!("Downloading and sketching genomes only.");
} else if proteomes_only {
// select only protein templates
let multiselection = MultiSelection::from_moltypes(vec!["protein", "dayhoff", "hp"])?;
let multiselection = MultiSelection::from_input_moltype("protein")?;
sig_templates.select(&multiselection)?;
eprintln!("Downloading and sketching proteomes only.");
}
Expand Down
6 changes: 5 additions & 1 deletion src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ fn do_gbsketch(

#[pyfunction]
#[allow(clippy::too_many_arguments)]
#[pyo3(signature = (input_csv, param_str, failed_csv, retry_times, fasta_location, keep_fastas, download_only, batch_size, n_permits, output_sigs=None, failed_checksums=None))]
#[pyo3(signature = (input_csv, param_str, failed_csv, retry_times, fasta_location, keep_fastas, genomes_only, proteomes_only, download_only, batch_size, n_permits, output_sigs=None, failed_checksums=None))]
fn do_urlsketch(
py: Python,
input_csv: String,
Expand All @@ -101,6 +101,8 @@ fn do_urlsketch(
retry_times: u32,
fasta_location: String,
keep_fastas: bool,
genomes_only: bool,
proteomes_only: bool,
download_only: bool,
batch_size: u32,
n_permits: usize,
Expand All @@ -115,6 +117,8 @@ fn do_urlsketch(
retry_times,
fasta_location,
keep_fastas,
genomes_only,
proteomes_only,
download_only,
batch_size,
n_permits,
Expand Down
5 changes: 5 additions & 0 deletions src/python/sourmash_plugin_directsketch/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,9 @@ def __init__(self, p):
help='number of times to retry failed downloads')
p.add_argument('-n', '--n-simultaneous-downloads', default=3, type=int, choices = [1, 2, 3],
help='number of simultaneous downloads (default=3)')
group = p.add_mutually_exclusive_group()
group.add_argument('-g', '--genomes-only', action='store_true', help='just download and sketch genome (DNA) files')
group.add_argument('-m', '--proteomes-only', action='store_true', help='just download and sketch proteome (protein) files')


def main(self, args):
Expand Down Expand Up @@ -179,6 +182,8 @@ def main(self, args):
args.retry_times,
args.fastas,
args.keep_fasta,
args.genomes_only,
args.proteomes_only,
args.download_only,
args.batch_size,
args.n_simultaneous_downloads,
Expand Down
15 changes: 10 additions & 5 deletions src/utils/buildutils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -471,6 +471,13 @@ impl BuildCollection {
Ok(mf.records.len())
}

pub fn anydna_size(&self) -> Result<usize, SourmashError> {
let multiselection = MultiSelection::from_moltypes(vec!["DNA", "skipm1n3", "skipm2n3"])?;
let mut mf = self.manifest.clone();
mf.select(&multiselection)?;
Ok(mf.records.len())
}

pub fn protein_size(&self) -> Result<usize, SourmashError> {
let multiselection = MultiSelection::from_moltypes(vec!["protein"])?;
let mut mf = self.manifest.clone();
Expand Down Expand Up @@ -644,7 +651,6 @@ impl BuildCollection {
// Check if the record is already in the set.
if seen_records.insert(record.clone()) {
// Add the record and its associated signature to the collection.
// coll.add_template_sig_from_record(&record, &record.moltype);
coll.add_template_sig_from_record(&record);
}
}
Expand Down Expand Up @@ -755,7 +761,6 @@ impl BuildCollection {
input_moltype: &str,
record: &SequenceRecord,
) -> Result<()> {
// Optionally use `par_iter_mut` for parallel execution
self.iter_mut().try_for_each(|(rec, sig)| {
if input_moltype == "protein"
&& (rec.moltype() == HashFunctions::Murmur64Protein
Expand All @@ -768,9 +773,9 @@ impl BuildCollection {
rec.sequence_added = true;
}
} else if (input_moltype == "DNA" || input_moltype == "dna")
&& rec.moltype() == HashFunctions::Murmur64Dna
|| rec.moltype() == HashFunctions::Murmur64Skipm2n3
|| rec.moltype() == HashFunctions::Murmur64Skipm1n3
&& (rec.moltype() == HashFunctions::Murmur64Dna
|| rec.moltype() == HashFunctions::Murmur64Skipm2n3
|| rec.moltype() == HashFunctions::Murmur64Skipm1n3)
{
sig.add_sequence(&record.seq(), true)
.context("Failed to add sequence")?;
Expand Down
91 changes: 91 additions & 0 deletions tests/test_gbsketch.py
Original file line number Diff line number Diff line change
Expand Up @@ -880,3 +880,94 @@ def test_gbsketch_overwrite(runtmp, capfd):
for sig in sigs:
all_siginfo.add((sig.name, sig.md5sum(), sig.minhash.moltype))
assert all_siginfo == expected_siginfo


def test_gbsketch_simple_skipmer(runtmp, capfd):
acc_csv = get_test_data('acc.csv')
output = runtmp.output('simple.zip')
failed = runtmp.output('failed.csv')
ch_fail = runtmp.output('checksum_dl_failed.csv')

runtmp.sourmash('scripts', 'gbsketch', acc_csv, '-o', output,
'--failed', failed, '-r', '1', '--checksum-fail', ch_fail,
'--param-str', "skipm2n3,k=21,scaled=1000")

assert os.path.exists(output)
assert not runtmp.last_result.out # stdout should be empty
captured = capfd.readouterr()
print(captured.err)
print(f"looking for path: {output}")

# read the file with python and check sigs
import zipfile, gzip, json

with zipfile.ZipFile(output, "r") as zf:
# Check the manifest exists
assert "SOURMASH-MANIFEST.csv" in zf.namelist()

expected_signatures = [
{
"name": "GCA_000961135.2 Candidatus Aramenus sulfurataquae isolate AZ1-454",
"ksize": 21,
"scaled": 1000,
"moltype": "skipm2n3",
"md5sum": "5745400ada0c3a27ddf1e8d5d1a46b7a",
},
{
"name": "GCA_000175535.1 Chlamydia muridarum MopnTet14 (agent of mouse pneumonitis) strain=MopnTet14",
"ksize": 21,
"scaled": 1000,
"moltype": "skipm2n3",
"md5sum": "2e37ca0bb9228bc3f5e1337e8535ab26",
},
]
expected_signatures_dict = {exp["md5sum"]: exp for exp in expected_signatures}

# Read and parse the manifest
with zf.open("SOURMASH-MANIFEST.csv") as manifest_file:
manifest_data = manifest_file.read().decode("utf-8").splitlines()
manifest_data = [line for line in manifest_data if not line.startswith("#")]
manifest_reader = csv.DictReader(manifest_data)

for row in manifest_reader:
if row["moltype"] == "skipm2n3":
print("Manifest Row:", row)

# Validate row fields
md5sum = row["md5"]
assert (
md5sum in expected_signatures_dict
), f"Unexpected md5sum: {md5sum}"
expected = expected_signatures_dict[md5sum]
assert (
row["name"] == expected["name"]
), f"Name mismatch: {row['name']}"
assert (
int(row["ksize"]) == expected["ksize"]
), f"Ksize mismatch: {row['ksize']}"
assert (
row["moltype"] == expected["moltype"]
), f"Moltype mismatch: {row['moltype']}"

sig_path = row["internal_location"]
assert sig_path.startswith("signatures/")

# Extract and read the signature file
with zf.open(sig_path) as sig_gz:
with gzip.open(sig_gz, "rt") as sig_file:
sig_contents = json.load(sig_file)
print("Signature Contents:", sig_contents)

# Validate signature contents
sig_data = sig_contents[0]
print(sig_data)
siginfo = sig_data["signatures"][0]
assert (
siginfo["md5sum"] == md5sum
), f"MD5 mismatch: {siginfo['md5sum']}"
assert (
siginfo["ksize"] == expected["ksize"]
), f"Ksize mismatch: {siginfo['ksize']}"
assert (
siginfo["molecule"] == expected["moltype"]
), f"Moltype mismatch: {siginfo['molecule']}"
92 changes: 92 additions & 0 deletions tests/test_urlsketch.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import os
import pytest

import csv
import sourmash
import sourmash_tst_utils as utils
from sourmash_tst_utils import SourmashCommandFailed
Expand Down Expand Up @@ -654,3 +655,94 @@ def test_urlsketch_simple_batch_restart_with_incomplete_zip(runtmp, capfd):

# Verify that the loaded signatures match the expected signatures, order-independent
assert all_siginfo == expected_siginfo, f"Loaded sigs: {all_siginfo}, expected: {expected_siginfo}"


def test_urlsketch_simple_skipmer(runtmp, capfd):
acc_csv = get_test_data('acc-url.csv')
output = runtmp.output('simple.zip')
failed = runtmp.output('failed.csv')
ch_fail = runtmp.output('checksum_dl_failed.csv')

runtmp.sourmash('scripts', 'urlsketch', acc_csv, '-o', output,
'--failed', failed, '-r', '1', '--checksum-fail', ch_fail,
'--param-str', "skipm2n3,k=21,scaled=1000")

assert os.path.exists(output)
assert not runtmp.last_result.out # stdout should be empty
captured = capfd.readouterr()
print(captured.err)
print(f"looking for path: {output}")

# read the file with python and check sigs
import zipfile, gzip, json

with zipfile.ZipFile(output, "r") as zf:
# Check the manifest exists
assert "SOURMASH-MANIFEST.csv" in zf.namelist()

expected_signatures = [
{
"name": "GCA_000961135.2 Candidatus Aramenus sulfurataquae isolate AZ1-454",
"ksize": 21,
"scaled": 1000,
"moltype": "skipm2n3",
"md5sum": "5745400ada0c3a27ddf1e8d5d1a46b7a",
},
{
"name": "GCA_000175535.1 Chlamydia muridarum MopnTet14 (agent of mouse pneumonitis) strain=MopnTet14",
"ksize": 21,
"scaled": 1000,
"moltype": "skipm2n3",
"md5sum": "2e37ca0bb9228bc3f5e1337e8535ab26",
},
]
expected_signatures_dict = {exp["md5sum"]: exp for exp in expected_signatures}

# Read and parse the manifest
with zf.open("SOURMASH-MANIFEST.csv") as manifest_file:
manifest_data = manifest_file.read().decode("utf-8").splitlines()
manifest_data = [line for line in manifest_data if not line.startswith("#")]
manifest_reader = csv.DictReader(manifest_data)

for row in manifest_reader:
if row["moltype"] == "skipm2n3":
print("Manifest Row:", row)

# Validate row fields
md5sum = row["md5"]
assert (
md5sum in expected_signatures_dict
), f"Unexpected md5sum: {md5sum}"
expected = expected_signatures_dict[md5sum]
assert (
row["name"] == expected["name"]
), f"Name mismatch: {row['name']}"
assert (
int(row["ksize"]) == expected["ksize"]
), f"Ksize mismatch: {row['ksize']}"
assert (
row["moltype"] == expected["moltype"]
), f"Moltype mismatch: {row['moltype']}"

sig_path = row["internal_location"]
assert sig_path.startswith("signatures/")

# Extract and read the signature file
with zf.open(sig_path) as sig_gz:
with gzip.open(sig_gz, "rt") as sig_file:
sig_contents = json.load(sig_file)
print("Signature Contents:", sig_contents)

# Validate signature contents
sig_data = sig_contents[0]
print(sig_data)
siginfo = sig_data["signatures"][0]
assert (
siginfo["md5sum"] == md5sum
), f"MD5 mismatch: {siginfo['md5sum']}"
assert (
siginfo["ksize"] == expected["ksize"]
), f"Ksize mismatch: {siginfo['ksize']}"
assert (
siginfo["molecule"] == expected["moltype"]
), f"Moltype mismatch: {siginfo['molecule']}"

0 comments on commit 927916c

Please sign in to comment.