Skip to content

Commit

Permalink
add manysearch
Browse files Browse the repository at this point in the history
  • Loading branch information
bluegenes committed Sep 8, 2023
1 parent d911dcf commit 30a31ca
Show file tree
Hide file tree
Showing 4 changed files with 97 additions and 42 deletions.
32 changes: 9 additions & 23 deletions src/manysearch.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,9 @@ use std::sync::atomic;
use std::sync::atomic::AtomicUsize;

use crate::utils::{prepare_query,
load_sketchlist_filenames, load_sketches, SearchResult, csvwriter_thread};
load_sigpaths_from_zip_or_pathlist, SearchResult,
csvwriter_thread, load_sketches_from_zip_or_pathlist,
ReportType, report_on_sketch_loading};

pub fn manysearch<P: AsRef<Path>>(
querylist: P,
Expand All @@ -29,29 +31,13 @@ pub fn manysearch<P: AsRef<Path>>(
eprintln!("Reading list of queries from: '{}'", querylist.as_ref().display());

// Load all queries into memory at once.
let querylist_paths = load_sketchlist_filenames(&querylist)?;

let result = load_sketches(querylist_paths, &template)?;
let (queries, skipped_paths, failed_paths) = result;

eprintln!("Loaded {} query signatures", queries.len());
if failed_paths > 0 {
eprintln!("WARNING: {} signature paths failed to load. See error messages above.",
failed_paths);
}
if skipped_paths > 0 {
eprintln!("WARNING: skipped {} paths - no compatible signatures.",
skipped_paths);
}

if queries.is_empty() {
bail!("No query signatures loaded, exiting.");
}
let queries = load_sketches_from_zip_or_pathlist(querylist, &template, ReportType::Query)?;

// Load all _paths_, not signatures, into memory.
eprintln!("Reading search file paths from: '{}'", siglist.as_ref().display());
let (search_sigs_paths, temp_dir) = load_sigpaths_from_zip_or_pathlist(siglist)?;
// eprintln!("Reading search file paths from: '{}'", siglist.as_ref().display());

let search_sigs_paths = load_sketchlist_filenames(&siglist)?;
// let search_sigs_paths = load_sketchlist_filenames(&siglist)?;
if search_sigs_paths.is_empty() {
bail!("No signatures to search loaded, exiting.");
}
Expand Down Expand Up @@ -150,11 +136,11 @@ pub fn manysearch<P: AsRef<Path>>(
let failed_paths = failed_paths.load(atomic::Ordering::SeqCst);

if skipped_paths > 0 {
eprintln!("WARNING: skipped {} paths - no compatible signatures.",
eprintln!("WARNING: skipped {} search paths - no compatible signatures.",
skipped_paths);
}
if failed_paths > 0 {
eprintln!("WARNING: {} signature paths failed to load. See error messages above.",
eprintln!("WARNING: {} search paths failed to load. See error messages above.",
failed_paths);
}

Expand Down
4 changes: 2 additions & 2 deletions src/mastiff_manygather.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ use std::fs::File;


use crate::utils::{prepare_query, is_revindex_database,
load_sketchlist_filenames};
load_sigpaths_from_zip_or_pathlist};


pub fn mastiff_manygather<P: AsRef<Path>>(
Expand All @@ -35,7 +35,7 @@ pub fn mastiff_manygather<P: AsRef<Path>>(
println!("Loaded DB");

// Load query paths
let query_paths = load_sketchlist_filenames(&queries_file)?;
let (query_paths, temp_dir) = load_sigpaths_from_zip_or_pathlist(&queries_file)?;

// set up a multi-producer, single-consumer channel.
let (send, recv) = std::sync::mpsc::sync_channel(rayon::current_num_threads());
Expand Down
8 changes: 4 additions & 4 deletions src/mastiff_manysearch.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ use std::sync::atomic;
use std::sync::atomic::AtomicUsize;

use crate::utils::{prepare_query, is_revindex_database,
load_sketchlist_filenames, SearchResult, csvwriter_thread};
load_sigpaths_from_zip_or_pathlist, SearchResult, csvwriter_thread};


pub fn mastiff_manysearch<P: AsRef<Path>>(
Expand All @@ -32,7 +32,7 @@ pub fn mastiff_manysearch<P: AsRef<Path>>(
println!("Loaded DB");

// Load query paths
let query_paths = load_sketchlist_filenames(&queries_file)?;
let (query_paths, temp_dir) = load_sigpaths_from_zip_or_pathlist(&queries_file)?;

// if query_paths is empty, exit with error
if query_paths.is_empty() {
Expand Down Expand Up @@ -138,11 +138,11 @@ pub fn mastiff_manysearch<P: AsRef<Path>>(
let failed_paths = failed_paths.load(atomic::Ordering::SeqCst);

if skipped_paths > 0 {
eprintln!("WARNING: skipped {} paths - no compatible signatures.",
eprintln!("WARNING: skipped {} query paths - no compatible signatures.",
skipped_paths);
}
if failed_paths > 0 {
eprintln!("WARNING: {} signature paths failed to load. See error messages above.",
eprintln!("WARNING: {} query paths failed to load. See error messages above.",
failed_paths);
}

Expand Down
95 changes: 82 additions & 13 deletions src/python/tests/test_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,13 +23,19 @@ def test_installed(runtmp):

assert 'usage: manysearch' in runtmp.last_result.err

def zip_siglist(runtmp, siglist, db):
    """Run `sourmash sig cat` on `siglist`, writing the output to `db`.

    Returns `db` unchanged so the caller can use it directly as the new
    input path.
    """
    cat_args = ('sig', 'cat', siglist, '-o', db)
    runtmp.sourmash(*cat_args)
    return db

def index_siglist(runtmp, siglist, db):
    """Run `sourmash scripts index` on `siglist`, writing the index to `db`.

    Returns `db` unchanged so the caller can use it directly as the new
    input path.
    """
    index_args = ('scripts', 'index', siglist, '-o', db)
    runtmp.sourmash(*index_args)
    return db

def test_simple(runtmp):
@pytest.mark.parametrize("zip_query", [False, True])
def test_simple(runtmp, zip_query):
# test basic execution!
query_list = runtmp.output('query.txt')
against_list = runtmp.output('against.txt')
Expand All @@ -43,6 +49,9 @@ def test_simple(runtmp):

output = runtmp.output('out.csv')

if zip_query:
query_list = zip_siglist(runtmp, query_list, runtmp.output('query.zip'))

runtmp.sourmash('scripts', 'manysearch', query_list, against_list,
'-o', output)
assert os.path.exists(output)
Expand Down Expand Up @@ -88,7 +97,8 @@ def test_simple(runtmp):
assert intersect_hashes == 2529


def test_simple_indexed(runtmp):
@pytest.mark.parametrize("zip_query", [False, True])
def test_simple_indexed(runtmp, zip_query):
# test basic execution!
query_list = runtmp.output('query.txt')
against_list = runtmp.output('against.txt')
Expand All @@ -104,6 +114,9 @@ def test_simple_indexed(runtmp):

against_list = index_siglist(runtmp, against_list, runtmp.output('db'))

if zip_query:
query_list = zip_siglist(runtmp, query_list, runtmp.output('query.zip'))

runtmp.sourmash('scripts', 'manysearch', query_list, against_list,
'-o', output)
assert os.path.exists(output)
Expand Down Expand Up @@ -138,7 +151,8 @@ def test_simple_indexed(runtmp):


@pytest.mark.parametrize("indexed", [False, True])
def test_simple_with_cores(runtmp, capfd, indexed):
@pytest.mark.parametrize("zip_query", [False, True])
def test_simple_with_cores(runtmp, capfd, indexed, zip_query):
# test basic execution with -c argument (that it runs, at least!)
query_list = runtmp.output('query.txt')
against_list = runtmp.output('against.txt')
Expand All @@ -153,6 +167,9 @@ def test_simple_with_cores(runtmp, capfd, indexed):
if indexed:
against_list = index_siglist(runtmp, against_list, runtmp.output('db'))

if zip_query:
query_list = zip_siglist(runtmp, query_list, runtmp.output('query.zip'))

output = runtmp.output('out.csv')

runtmp.sourmash('scripts', 'manysearch', query_list, against_list,
Expand All @@ -168,7 +185,8 @@ def test_simple_with_cores(runtmp, capfd, indexed):


@pytest.mark.parametrize("indexed", [False, True])
def test_simple_threshold(runtmp, indexed):
@pytest.mark.parametrize("zip_query", [False, True])
def test_simple_threshold(runtmp, indexed, zip_query):
# test with a simple threshold => only 3 results
query_list = runtmp.output('query.txt')
against_list = runtmp.output('against.txt')
Expand All @@ -183,6 +201,9 @@ def test_simple_threshold(runtmp, indexed):
if indexed:
against_list = index_siglist(runtmp, against_list, runtmp.output('db'))

if zip_query:
query_list = zip_siglist(runtmp, query_list, runtmp.output('query.zip'))

output = runtmp.output('out.csv')

runtmp.sourmash('scripts', 'manysearch', query_list, against_list,
Expand All @@ -194,7 +215,8 @@ def test_simple_threshold(runtmp, indexed):


@pytest.mark.parametrize("indexed", [False, True])
def test_missing_query(runtmp, capfd, indexed):
@pytest.mark.parametrize("zip_query", [False, True])
def test_missing_query(runtmp, capfd, indexed, zip_query):
# test with a missing query list
query_list = runtmp.output('query.txt')
against_list = runtmp.output('against.txt')
Expand All @@ -209,6 +231,9 @@ def test_missing_query(runtmp, capfd, indexed):
if indexed:
against_list = index_siglist(runtmp, against_list, runtmp.output('db'))

if zip_query:
query_list = runtmp.output('query.zip')

output = runtmp.output('out.csv')

with pytest.raises(utils.SourmashCommandFailed):
Expand Down Expand Up @@ -270,7 +295,35 @@ def test_bad_query_2(runtmp, capfd, indexed):
print(captured.err)

assert "WARNING: could not load sketches from path 'no-exist'" in captured.err
assert "WARNING: 1 signature paths failed to load. See error messages above." in captured.err
assert "WARNING: 1 query paths failed to load. See error messages above." in captured.err


def test_bad_query_3(runtmp, capfd):
    # test with a bad query (a .sig.gz file renamed as zip file)
    against_list = runtmp.output('against.txt')

    sig2 = get_test_data('2.fa.sig.gz')
    sig47 = get_test_data('47.fa.sig.gz')
    sig63 = get_test_data('63.fa.sig.gz')

    # Copy the gzipped signature byte-for-byte into a file that only has a
    # '.zip' name; its contents are still those of the .sig.gz file.
    query_zip = runtmp.output('query.zip')
    with open(query_zip, 'wb') as dst, open(sig2, 'rb') as src:
        dst.write(src.read())

    make_file_list(against_list, [sig2, sig47, sig63])

    output = runtmp.output('out.csv')

    # Run 'manysearch' (not 'multisearch'): the other tests in this file
    # exercise manysearch, and this test is meant to cover its zip query
    # loading path.
    with pytest.raises(utils.SourmashCommandFailed):
        runtmp.sourmash('scripts', 'manysearch', query_zip, against_list,
                        '-o', output)

    captured = capfd.readouterr()
    print(captured.err)

    assert 'Error: invalid Zip archive: Could not find central directory end' in captured.err


@pytest.mark.parametrize("indexed", [False, True])
Expand Down Expand Up @@ -342,7 +395,7 @@ def test_bad_against_2(runtmp, capfd):
print(captured.err)

assert "WARNING: could not load sketches from path 'no-exist'" in captured.err
assert "WARNING: 1 signature paths failed to load. See error messages above." in captured.err
assert "WARNING: 1 search paths failed to load. See error messages above." in captured.err


@pytest.mark.parametrize("indexed", [False, True])
Expand Down Expand Up @@ -371,7 +424,8 @@ def test_empty_query(runtmp, indexed):


@pytest.mark.parametrize("indexed", [False, True])
def test_nomatch_query(runtmp, capfd, indexed):
@pytest.mark.parametrize("zip_query", [False, True])
def test_nomatch_query(runtmp, capfd, indexed, zip_query):
# test a non-matching (diff ksize) in query; do we get warning message?
query_list = runtmp.output('query.txt')
against_list = runtmp.output('against.txt')
Expand All @@ -385,6 +439,8 @@ def test_nomatch_query(runtmp, capfd, indexed):
make_file_list(against_list, [sig2, sig47, sig63])

output = runtmp.output('out.csv')
if zip_query:
query_list = zip_siglist(runtmp, query_list, runtmp.output('query.zip'))

if indexed:
against_list = index_siglist(runtmp, against_list, runtmp.output('db'))
Expand All @@ -396,7 +452,7 @@ def test_nomatch_query(runtmp, capfd, indexed):
captured = capfd.readouterr()
print(captured.err)

assert 'WARNING: skipped 1 paths - no compatible signatures.' in captured.err
assert 'WARNING: skipped 1 query paths - no compatible signatures.' in captured.err


@pytest.mark.parametrize("indexed", [False, True])
Expand Down Expand Up @@ -431,8 +487,9 @@ def test_load_only_one_bug(runtmp, capfd, indexed):
assert not 'WARNING: no compatible sketches in path ' in captured.err


@pytest.mark.parametrize("zip_query", [False, True])
@pytest.mark.parametrize("indexed", [False, True])
def test_load_only_one_bug_as_query(runtmp, capfd, indexed):
def test_load_only_one_bug_as_query(runtmp, capfd, indexed, zip_query):
# check that we behave properly when presented with multiple query
# sketches in one file, with only one matching.
query_list = runtmp.output('query.txt')
Expand All @@ -447,23 +504,31 @@ def test_load_only_one_bug_as_query(runtmp, capfd, indexed):
make_file_list(query_list, [sig1_all])
make_file_list(against_list, [sig1_k31])

output = runtmp.output('out.csv')

if indexed:
against_list = index_siglist(runtmp, against_list, runtmp.output('db'))
output = runtmp.output('out.csv')
if zip_query:
query_list = zip_siglist(runtmp, query_list, runtmp.output('query.zip'))

runtmp.sourmash('scripts', 'manysearch', query_list, against_list,
'-o', output)

assert os.path.exists(output)

captured = capfd.readouterr()
print(captured.err)
print(runtmp.last_result.out)

assert not 'WARNING: skipped 1 paths - no compatible signatures.' in captured.err

# this fails with zip input, because they become individual signature files when zipped
assert not 'WARNING: no compatible sketches in path ' in captured.err


@pytest.mark.parametrize("zip_query", [False, True])
@pytest.mark.parametrize("indexed", [False, True])
def test_md5(runtmp, indexed):
def test_md5(runtmp, indexed, zip_query):
# test that md5s match what was in the original files, not downsampled etc.
query_list = runtmp.output('query.txt')
against_list = runtmp.output('against.txt')
Expand All @@ -475,9 +540,13 @@ def test_md5(runtmp, indexed):
make_file_list(query_list, [sig2, sig47, sig63])
make_file_list(against_list, [sig2, sig47, sig63])

output = runtmp.output('out.csv')

if indexed:
against_list = index_siglist(runtmp, against_list, runtmp.output('db'))
output = runtmp.output('out.csv')

if zip_query:
query_list = zip_siglist(runtmp, query_list, runtmp.output('query.zip'))

runtmp.sourmash('scripts', 'manysearch', query_list, against_list,
'-o', output)
Expand Down

0 comments on commit 30a31ca

Please sign in to comment.