Skip to content

Commit

Permalink
test for duplicates
Browse files Browse the repository at this point in the history
  • Loading branch information
bluegenes committed Feb 29, 2024
1 parent 5ee5ef8 commit 37dbb8e
Show file tree
Hide file tree
Showing 3 changed files with 85 additions and 6 deletions.
3 changes: 2 additions & 1 deletion src/python/sourmash_plugin_branchwater/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -365,7 +365,8 @@ def main(self, args):
status = sourmash_plugin_branchwater.do_manysketch(args.fromfile_csv,
args.param_string,
args.output,
args.singleton)
args.singleton,
args.force)
if status == 0:
notify(f"...manysketch is done! results in '{args.output}'")
return status
Expand Down
79 changes: 79 additions & 0 deletions src/python/tests/test_sketch.py
Original file line number Diff line number Diff line change
Expand Up @@ -703,3 +703,82 @@ def test_manysketch_prefix2(runtmp, capfd):
assert sig,minhash.hashes == sig1.minhash.hashes
if sig.name == 'short_protein':
assert sig == sig2


def test_manysketch_prefix_duplicated_fail(runtmp, capfd):
    """Duplicate rows (same name + prefix) are silently skipped, but the same
    FASTA file matched under two *different* names must abort without '--force'.
    """
    fa_csv = runtmp.output('db-fa.csv')

    fa1 = get_test_data('short.fa')

    fa_path = os.path.dirname(fa1)
    # test without '*'
    dna_prefix = os.path.join(fa_path, "short")  # need to avoid matching short-protein.fa
    prot_prefix = os.path.join(fa_path, "*protein")
    zip_exclude = os.path.join(fa_path, "*zip")

    # make prefix input file
    with open(fa_csv, 'wt') as fp:
        fp.write("name,input_moltype,prefix,exclude\n")
        fp.write(f"short,DNA,{dna_prefix},{prot_prefix}\n")  # short.fa, short2.fa, short3.fa, short-protein.fa
        fp.write(f"short,DNA,{dna_prefix},{prot_prefix}\n")  # duplicate of row one -- this should just be skipped
        fp.write(f"short_protein,protein,{prot_prefix},{zip_exclude}\n")  # short-protein.fa only
        # ALSO short-protein.fa, but different name. should raise err without force
        fp.write(f"second_protein,protein,{prot_prefix},{zip_exclude}\n")

    output = runtmp.output('prefix.zip')

    with pytest.raises(utils.SourmashCommandFailed):
        runtmp.sourmash('scripts', 'manysketch', fa_csv, '-o', output,
                        '--param-str', "dna,k=31,scaled=1", '-p', "protein,k=10,scaled=1")

    assert not os.path.exists(output)
    assert not runtmp.last_result.out  # stdout should be empty
    captured = capfd.readouterr()
    print(captured.out)
    print(captured.err)
    # derive the expected duplicated path from the local test-data dir instead
    # of a hard-coded absolute path, so the assertion works on any machine/CI
    dup_path = os.path.join(fa_path, "short-protein.fa")
    assert "Found 'prefix' CSV. Using 'glob' to find files based on 'prefix' column." in captured.out
    assert "Found identical FASTA paths in more than one row!" in captured.err
    assert f'Duplicated paths: ["{dup_path}"]' in captured.err
    assert "Duplicated FASTA files found. Please use '--force' to enable this" in captured.err


def test_manysketch_prefix_duplicated_force(runtmp, capfd):
    """With '--force', the duplicated FASTA path is reported but sketching
    proceeds, producing one signature per unique (name, moltype) combination.
    """
    fa_csv = runtmp.output('db-fa.csv')

    fa1 = get_test_data('short.fa')

    fa_path = os.path.dirname(fa1)
    # test without '*'
    dna_prefix = os.path.join(fa_path, "short")  # need to avoid matching short-protein.fa
    prot_prefix = os.path.join(fa_path, "*protein")
    zip_exclude = os.path.join(fa_path, "*zip")

    # make prefix input file
    with open(fa_csv, 'wt') as fp:
        fp.write("name,input_moltype,prefix,exclude\n")
        fp.write(f"short,DNA,{dna_prefix},{prot_prefix}\n")  # short.fa, short2.fa, short3.fa, short-protein.fa
        fp.write(f"short,DNA,{dna_prefix},{prot_prefix}\n")  # duplicate of row one -- this should just be skipped
        fp.write(f"short_protein,protein,{prot_prefix},{zip_exclude}\n")  # short-protein.fa only
        # ALSO short-protein.fa, but different name. should raise err without force
        fp.write(f"second_protein,protein,{prot_prefix},{zip_exclude}\n")

    output = runtmp.output('prefix.zip')

    runtmp.sourmash('scripts', 'manysketch', fa_csv, '-o', output,
                    '--param-str', "dna,k=31,scaled=1", '-p', "protein,k=10,scaled=1",
                    '--force')

    assert os.path.exists(output)
    assert not runtmp.last_result.out  # stdout should be empty
    captured = capfd.readouterr()
    print(captured.out)
    print(captured.err)
    # derive the expected duplicated path from the local test-data dir instead
    # of a hard-coded absolute path, so the assertion works on any machine/CI
    dup_path = os.path.join(fa_path, "short-protein.fa")
    assert "Found 'prefix' CSV. Using 'glob' to find files based on 'prefix' column." in captured.out
    assert "Found identical FASTA paths in more than one row!" in captured.err
    assert f'Duplicated paths: ["{dup_path}"]' in captured.err

    idx = sourmash.load_file_as_index(output)
    sigs = list(idx.signatures())
    print(sigs)

    # short (DNA) + short_protein + second_protein == 3 signatures
    assert len(sigs) == 3
9 changes: 4 additions & 5 deletions src/utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ use sourmash::selection::Selection;
use sourmash::signature::{Signature, SigsTrait};
use sourmash::sketch::minhash::KmerMinHash;
use sourmash::storage::{FSStorage, InnerStorage, SigStore};
use std::collections::HashSet;
use std::collections::{HashMap, HashSet};
/// Track a name/minhash.
pub struct SmallSignature {
Expand Down Expand Up @@ -312,10 +312,10 @@ fn process_prefix_csv(
let mut results = Vec::new();
let mut dna_count = 0;
let mut protein_count = 0;
let mut processed_rows = std::collections::HashSet::new();
let mut processed_rows = HashSet::new();
let mut duplicate_count = 0;
let mut all_paths = HashSet::new(); // track FASTA in use
let mut duplicate_paths_count = std::collections::HashMap::new();
let mut duplicate_paths_count = HashMap::new();

for result in rdr.records() {
let record = result?;
Expand Down Expand Up @@ -1062,8 +1062,7 @@ pub fn sigwriter(
let mut zip = zip::ZipWriter::new(file_writer);
let mut manifest_rows: Vec<Record> = Vec::new();
// keep track of md5sum occurrences to prevent overwriting duplicates
let mut md5sum_occurrences: std::collections::HashMap<String, usize> =
std::collections::HashMap::new();
let mut md5sum_occurrences: HashMap<String, usize> = HashMap::new();

while let Ok(message) = recv.recv() {
match message {
Expand Down

0 comments on commit 37dbb8e

Please sign in to comment.