Skip to content

Commit

Permalink
test for duplicates
Browse files Browse the repository at this point in the history
  • Loading branch information
bluegenes committed Feb 29, 2024
1 parent 5ee5ef8 commit 37dbb8e
Show file tree
Hide file tree
Showing 3 changed files with 85 additions and 6 deletions.
3 changes: 2 additions & 1 deletion src/python/sourmash_plugin_branchwater/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -365,7 +365,8 @@ def main(self, args):
status = sourmash_plugin_branchwater.do_manysketch(args.fromfile_csv,
args.param_string,
args.output,
args.singleton)
args.singleton,
args.force)
if status == 0:
notify(f"...manysketch is done! results in '{args.output}'")
return status
Expand Down
79 changes: 79 additions & 0 deletions src/python/tests/test_sketch.py
Original file line number Diff line number Diff line change
Expand Up @@ -703,3 +703,82 @@ def test_manysketch_prefix2(runtmp, capfd):
assert sig,minhash.hashes == sig1.minhash.hashes
if sig.name == 'short_protein':
assert sig == sig2


def test_manysketch_prefix_duplicated_fail(runtmp, capfd):
    """Duplicate rows (same name + prefix) are silently skipped, but the same
    FASTA file matched under two *different* names must abort without '--force'.
    """
    fa_csv = runtmp.output('db-fa.csv')

    fa1 = get_test_data('short.fa')

    fa_path = os.path.dirname(fa1)
    # test without '*'
    dna_prefix = os.path.join(fa_path, "short")  # need to avoid matching short-protein.fa
    prot_prefix = os.path.join(fa_path, "*protein")
    zip_exclude = os.path.join(fa_path, "*zip")

    # make prefix input file
    with open(fa_csv, 'wt') as fp:
        fp.write("name,input_moltype,prefix,exclude\n")
        fp.write(f"short,DNA,{dna_prefix},{prot_prefix}\n")  # short.fa, short2.fa, short3.fa, short-protein.fa
        fp.write(f"short,DNA,{dna_prefix},{prot_prefix}\n")  # duplicate of row one -- this should just be skipped
        fp.write(f"short_protein,protein,{prot_prefix},{zip_exclude}\n")  # short-protein.fa only
        # ALSO short-protein.fa, but different name. should raise err without force
        fp.write(f"second_protein,protein,{prot_prefix},{zip_exclude}\n")

    output = runtmp.output('prefix.zip')

    with pytest.raises(utils.SourmashCommandFailed):
        runtmp.sourmash('scripts', 'manysketch', fa_csv, '-o', output,
                        '--param-str', "dna,k=31,scaled=1", '-p', "protein,k=10,scaled=1")

    assert not os.path.exists(output)
    assert not runtmp.last_result.out  # stdout should be empty
    captured = capfd.readouterr()
    print(captured.out)
    print(captured.err)
    # derive the expected duplicated path from the local test-data dir instead
    # of a hard-coded absolute path, so the assertion works on any machine/CI
    dup_path = os.path.join(fa_path, "short-protein.fa")
    assert "Found 'prefix' CSV. Using 'glob' to find files based on 'prefix' column." in captured.out
    assert "Found identical FASTA paths in more than one row!" in captured.err
    assert f'Duplicated paths: ["{dup_path}"]' in captured.err
    assert "Duplicated FASTA files found. Please use '--force' to enable this" in captured.err


def test_manysketch_prefix_duplicated_force(runtmp, capfd):
    """With '--force', the duplicated FASTA path is reported but sketching
    proceeds, producing one signature per unique (name, moltype) combination.
    """
    fa_csv = runtmp.output('db-fa.csv')

    fa1 = get_test_data('short.fa')

    fa_path = os.path.dirname(fa1)
    # test without '*'
    dna_prefix = os.path.join(fa_path, "short")  # need to avoid matching short-protein.fa
    prot_prefix = os.path.join(fa_path, "*protein")
    zip_exclude = os.path.join(fa_path, "*zip")

    # make prefix input file
    with open(fa_csv, 'wt') as fp:
        fp.write("name,input_moltype,prefix,exclude\n")
        fp.write(f"short,DNA,{dna_prefix},{prot_prefix}\n")  # short.fa, short2.fa, short3.fa, short-protein.fa
        fp.write(f"short,DNA,{dna_prefix},{prot_prefix}\n")  # duplicate of row one -- this should just be skipped
        fp.write(f"short_protein,protein,{prot_prefix},{zip_exclude}\n")  # short-protein.fa only
        # ALSO short-protein.fa, but different name. should raise err without force
        fp.write(f"second_protein,protein,{prot_prefix},{zip_exclude}\n")

    output = runtmp.output('prefix.zip')

    runtmp.sourmash('scripts', 'manysketch', fa_csv, '-o', output,
                    '--param-str', "dna,k=31,scaled=1", '-p', "protein,k=10,scaled=1",
                    '--force')

    assert os.path.exists(output)
    assert not runtmp.last_result.out  # stdout should be empty
    captured = capfd.readouterr()
    print(captured.out)
    print(captured.err)
    # derive the expected duplicated path from the local test-data dir instead
    # of a hard-coded absolute path, so the assertion works on any machine/CI
    dup_path = os.path.join(fa_path, "short-protein.fa")
    assert "Found 'prefix' CSV. Using 'glob' to find files based on 'prefix' column." in captured.out
    assert "Found identical FASTA paths in more than one row!" in captured.err
    assert f'Duplicated paths: ["{dup_path}"]' in captured.err

    idx = sourmash.load_file_as_index(output)
    sigs = list(idx.signatures())
    print(sigs)

    # short (DNA) + short_protein + second_protein == 3 signatures
    assert len(sigs) == 3
9 changes: 4 additions & 5 deletions src/utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ use sourmash::selection::Selection;
use sourmash::signature::{Signature, SigsTrait};
use sourmash::sketch::minhash::KmerMinHash;
use sourmash::storage::{FSStorage, InnerStorage, SigStore};
use std::collections::HashSet;
use std::collections::{HashMap, HashSet};
/// Track a name/minhash.
pub struct SmallSignature {
Expand Down Expand Up @@ -312,10 +312,10 @@ fn process_prefix_csv(
let mut results = Vec::new();
let mut dna_count = 0;
let mut protein_count = 0;
let mut processed_rows = std::collections::HashSet::new();
let mut processed_rows = HashSet::new();
let mut duplicate_count = 0;
let mut all_paths = HashSet::new(); // track FASTA in use
let mut duplicate_paths_count = std::collections::HashMap::new();
let mut duplicate_paths_count = HashMap::new();

for result in rdr.records() {
let record = result?;
Expand Down Expand Up @@ -1062,8 +1062,7 @@ pub fn sigwriter(
let mut zip = zip::ZipWriter::new(file_writer);
let mut manifest_rows: Vec<Record> = Vec::new();
// keep track of md5sum occurrences to prevent overwriting duplicates
let mut md5sum_occurrences: std::collections::HashMap<String, usize> =
std::collections::HashMap::new();
let mut md5sum_occurrences: HashMap<String, usize> = HashMap::new();

while let Ok(message) = recv.recv() {
match message {
Expand Down

0 comments on commit 37dbb8e

Please sign in to comment.