Skip to content

Commit

Permalink
test singleton
Browse files Browse the repository at this point in the history
  • Loading branch information
bluegenes committed Feb 26, 2024
1 parent cdb0899 commit 3ccf625
Show file tree
Hide file tree
Showing 2 changed files with 60 additions and 18 deletions.
37 changes: 22 additions & 15 deletions src/manysketch.rs
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,22 @@ pub fn manysketch(
.par_iter()
.filter_map(|(name, filenames, moltype)| {
let mut allsigs = Vec::new();
// build sig templates for these sketches from params, check if there are sigs to build
let sig_templates = build_siginfo(&params_vec, moltype);
// if no sigs to build, skip this iteration
if sig_templates.is_empty() {
skipped_paths.fetch_add(filenames.len(), atomic::Ordering::SeqCst);
processed_fastas.fetch_add(1, atomic::Ordering::SeqCst);
return None;
}

let mut sigs = sig_templates.clone();
// have name / filename been set for each sig yet?
let mut set_name = false;
// if merging multiple files, sourmash sets filename as last filename
let last_filename = filenames.last().unwrap();

// to do: consider changing reporting to per-sig, no matter how many fastas? but singleton...
for filename in filenames {
// increment processed_fastas counter; make 1-based for % reporting
let i = processed_fastas.fetch_add(1, atomic::Ordering::SeqCst);
Expand All @@ -182,27 +198,17 @@ pub fn manysketch(
);
}

// build sig templates from params
let sig_templates = build_siginfo(&params_vec, moltype);
let mut sigs = sig_templates.clone();
// if no sigs to build, skip
if sigs.is_empty() {
let _ = skipped_paths.fetch_add(1, atomic::Ordering::SeqCst);
return None;
}

// Open fasta file reader
let mut reader = match parse_fastx_file(filename) {
Ok(r) => r,
Err(err) => {
eprintln!("Error opening file {}: {:?}", filename, err);
let _ = failed_paths.fetch_add(1, atomic::Ordering::SeqCst);
failed_paths.fetch_add(1, atomic::Ordering::SeqCst);
return None;
}
};

// parse fasta and add to signature
let mut set_name = false;
while let Some(record_result) = reader.next() {
match record_result {
Ok(record) => {
Expand All @@ -216,7 +222,8 @@ pub fn manysketch(
sig.set_filename(filename.as_str());
} else if !set_name {
sig.set_name(name);
sig.set_filename(filename.as_str());
// sourmash sets filename to last filename if merging fastas
sig.set_filename(last_filename.as_str());
set_name = true;
};
if moltype == "protein" {
Expand All @@ -236,9 +243,9 @@ pub fn manysketch(
sigs = sig_templates.clone();
}
}
if !singleton {
allsigs.append(&mut sigs);
}
}
if !singleton {
allsigs.append(&mut sigs);
}
Some(allsigs)
})
Expand Down
41 changes: 38 additions & 3 deletions src/python/tests/test_sketch.py
Original file line number Diff line number Diff line change
Expand Up @@ -208,9 +208,6 @@ def test_manysketch_skip_incompatible_fastas(runtmp, capfd):
assert sig.minhash.ksize == 10
assert sig.minhash.scaled == 1
assert sig.md5sum() == "eb4467d11e0ecd2dbde4193bfc255310"
assert 'Starting file 2/4 (50%)' in captured.err
assert 'Starting file 3/4 (75%)' in captured.err
assert 'Starting file 4/4 (100%)' in captured.err
assert 'DONE. Processed 4 fasta files' in captured.err
assert 'WARNING: 3 fasta files skipped - no compatible signatures.' in captured.err

Expand Down Expand Up @@ -452,3 +449,41 @@ def test_protein_zip_manifest(runtmp, capfd):
assert sig.minhash.ksize == 10 # minhash stores k*3, but does the conversion back for us
assert sig.minhash.moltype == 'protein'
assert sig.minhash.scaled == 1


def test_manysketch_singleton(runtmp):
    """Check `manysketch --singleton` output against `sourmash sketch --singleton`.

    Runs `manysketch` with `--singleton` over a CSV listing three FASTA
    files, expects four signatures in the resulting zip, and compares the
    signatures named 'firstname' and 'other' with the first and second
    signatures that `sourmash sketch dna --singleton` writes for short3.fa.
    """
    # Same sketch parameters are used for both commands so results are comparable.
    param_str = "dna,k=31,scaled=1"

    csv_path = runtmp.output('db-fa.txt')
    fastas = [get_test_data(fn) for fn in ('short.fa', 'short2.fa', 'short3.fa')]
    make_file_csv(csv_path, fastas)

    zip_path = runtmp.output('db.zip')
    runtmp.sourmash('scripts', 'manysketch', csv_path, '-o', zip_path,
                    '--param-str', param_str, "--singleton")

    assert os.path.exists(zip_path)
    assert not runtmp.last_result.out  # stdout should be empty

    sigs = list(sourmash.load_file_as_index(zip_path).signatures())
    print(sigs)
    assert len(sigs) == 4

    # Build reference signatures for short3.fa with the stock sketch command.
    reference_path = runtmp.output('short3.sig')
    runtmp.sourmash('sketch', 'dna', fastas[-1], '-o', reference_path,
                    '--param-str', param_str, "--singleton")
    reference_sigs = sourmash.load_signatures(reference_path)
    # Order matters: the first loaded signature is compared with 'firstname',
    # the second with 'other' (presumably the record order in short3.fa --
    # confirm against the test data).
    first_reference = next(reference_sigs)
    second_reference = next(reference_sigs)
    reference_by_name = {
        'firstname': first_reference,
        'other': second_reference,
    }

    expected_signames = ['shortName', 'tr1 4', 'firstname', 'other']
    for sig in sigs:
        assert sig.name in expected_signames
        reference = reference_by_name.get(sig.name)
        if reference is not None:
            assert sig == reference

0 comments on commit 3ccf625

Please sign in to comment.