Skip to content

Commit

Permalink
upd
Browse files Browse the repository at this point in the history
  • Loading branch information
bluegenes committed Sep 13, 2023
1 parent 1028e0d commit 05f82de
Show file tree
Hide file tree
Showing 3 changed files with 66 additions and 36 deletions.
9 changes: 6 additions & 3 deletions src/fastmultigather.rs
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,8 @@ pub fn fastmultigather<P: AsRef<Path> + std::fmt::Debug + Clone>(
let template = Sketch::MinHash(template_mh);

// load the list of query paths
let (querylist_paths, temp_dir) = load_sigpaths_from_zip_or_pathlist(&query_filenames)?;
let queryfile_name = query_filenames.as_ref().to_string_lossy().to_string();
let (querylist_paths, _temp_dir) = load_sigpaths_from_zip_or_pathlist(&query_filenames)?;
println!("Loaded {} sig paths in querylist", querylist_paths.len());

let threshold_hashes : u64 = {
Expand Down Expand Up @@ -69,8 +70,10 @@ pub fn fastmultigather<P: AsRef<Path> + std::fmt::Debug + Clone>(
let mm = prepare_query(&sigs, &template, &location);

if mm.is_none() {
eprintln!("WARNING: no compatible sketches in path '{}'",
q.display());
if queryfile_name.ends_with(".zip") {
eprintln!("WARNING: no compatible sketches in path '{}'",
q.display());
}
let _ = skipped_paths.fetch_add(1, atomic::Ordering::SeqCst);
}
mm
Expand Down
4 changes: 2 additions & 2 deletions src/mastiff_manygather.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ use std::fs::File;


use crate::utils::{prepare_query, is_revindex_database,
load_sketchlist_filenames};
load_sigpaths_from_zip_or_pathlist};


pub fn mastiff_manygather<P: AsRef<Path>>(
Expand All @@ -35,7 +35,7 @@ pub fn mastiff_manygather<P: AsRef<Path>>(
println!("Loaded DB");

// Load query paths
let query_paths = load_sketchlist_filenames(&queries_file)?;
let (query_paths, _temp_dir) = load_sigpaths_from_zip_or_pathlist(&queries_file)?;

// set up a multi-producer, single-consumer channel.
let (send, recv) = std::sync::mpsc::sync_channel(rayon::current_num_threads());
Expand Down
89 changes: 58 additions & 31 deletions src/python/tests/test_multigather.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,13 @@ def index_siglist(runtmp, siglist, db):
'-o', db)
return db

@pytest.mark.parametrize('indexed', [False, True])
def test_simple(runtmp, indexed):
def zip_siglist(runtmp, siglist, db):
    """Run ``sourmash sig cat`` on `siglist`, writing the result to `db`.

    Args:
        runtmp: test fixture whose ``sourmash(*args)`` method is called
            with the subcommand arguments.
        siglist: input handed to ``sig cat``; callers in this file pass a
            text file listing signature paths.
        db: output path given to ``-o``; callers in this file pass a
            ``.zip`` filename.

    Returns:
        `db`, unchanged, so the call can be used inline to rebind a path.
    """
    runtmp.sourmash('sig', 'cat', siglist,
                    '-o', db)
    return db

@pytest.mark.parametrize('zip_against', [False, True])
def test_simple(runtmp, zip_against):
# test basic execution!
query = get_test_data('SRR606249.sig.gz')
sig2 = get_test_data('2.fa.sig.gz')
Expand All @@ -46,43 +51,65 @@ def test_simple(runtmp, indexed):
make_file_list(query_list, [query])
make_file_list(against_list, [sig2, sig47, sig63])

if indexed:
g_output = runtmp.output('out.csv')
against_db = index_siglist(runtmp, against_list, runtmp.output('db'))
runtmp.sourmash('scripts', 'fastmultigather', query_list,
against_db, '-s', '100000', '-t', '0',
'-o', g_output)
else:
cwd = os.getcwd()
try:
os.chdir(runtmp.output(''))
runtmp.sourmash('scripts', 'fastmultigather', query_list, against_list,
'-s', '100000', '-t', '0')
finally:
os.chdir(cwd)
if zip_against:
against_list = zip_siglist(runtmp, against_list, runtmp.output('against.zip'))

print(os.listdir(runtmp.output('')))
cwd = os.getcwd()
try:
os.chdir(runtmp.output(''))
runtmp.sourmash('scripts', 'fastmultigather', query_list, against_list,
'-s', '100000', '-t', '0')
finally:
os.chdir(cwd)

g_output = runtmp.output('SRR606249.sig.gz.gather.csv')
p_output = runtmp.output('SRR606249.sig.gz.prefetch.csv')
assert os.path.exists(p_output)
print(os.listdir(runtmp.output('')))

# check prefetch output (only non-indexed gather)
df = pandas.read_csv(p_output)
assert len(df) == 3
keys = set(df.keys())
assert keys == {'query_filename', 'query_name', 'query_md5', 'match_name', 'match_md5', 'intersect_bp'}
g_output = runtmp.output('SRR606249.sig.gz.gather.csv')
p_output = runtmp.output('SRR606249.sig.gz.prefetch.csv')
assert os.path.exists(p_output)

# check prefetch output (only non-indexed gather)
df = pandas.read_csv(p_output)
assert len(df) == 3
keys = set(df.keys())
assert keys == {'query_filename', 'query_name', 'query_md5', 'match_name', 'match_md5', 'intersect_bp'}

# check gather output (mostly same for indexed vs non-indexed version)
assert os.path.exists(g_output)
df = pandas.read_csv(g_output)
assert len(df) == 3
keys = set(df.keys())
if indexed:
assert keys == {'query_name', 'query_md5', 'match_name', 'match_md5', 'f_match_query', 'intersect_bp'}
else:
assert keys == {'query_filename', 'query_name', 'query_md5', 'match_name', 'match_md5', 'rank', 'intersect_bp'
}
assert keys == {'query_filename', 'query_name', 'query_md5', 'match_name', 'match_md5', 'rank', 'intersect_bp'}


@pytest.mark.parametrize('zip_query', [False, True])
def test_simple_indexed(runtmp, zip_query):
    """Run ``fastmultigather`` against an indexed database and check its CSV.

    Parametrized on `zip_query`: when true, the query file list is first
    passed through ``zip_siglist`` and the resulting ``query.zip`` path is
    given to ``fastmultigather`` in place of the text file list.
    """
    query = get_test_data('SRR606249.sig.gz')
    sig2 = get_test_data('2.fa.sig.gz')
    sig47 = get_test_data('47.fa.sig.gz')
    sig63 = get_test_data('63.fa.sig.gz')

    query_list = runtmp.output('query.txt')
    against_list = runtmp.output('against.txt')

    make_file_list(query_list, [query])
    make_file_list(against_list, [sig2, sig47, sig63])

    if zip_query:
        # swap the plain-text query list for the zip built from it
        query_list = zip_siglist(runtmp, query_list, runtmp.output('query.zip'))

    g_output = runtmp.output('out.csv')
    against_db = index_siglist(runtmp, against_list, runtmp.output('db'))
    runtmp.sourmash('scripts', 'fastmultigather', query_list,
                    against_db, '-s', '100000', '-t', '0',
                    '-o', g_output)

    # the run writes one combined gather CSV to the path given via -o
    assert os.path.exists(g_output)
    df = pandas.read_csv(g_output)
    assert len(df) == 3
    keys = set(df.keys())
    assert keys == {'query_name', 'query_md5', 'match_name', 'match_md5', 'f_match_query', 'intersect_bp'}


@pytest.mark.parametrize('indexed', [False, True])
def test_missing_querylist(runtmp, capfd, indexed):
Expand Down

0 comments on commit 05f82de

Please sign in to comment.