Skip to content

Commit

Permalink
MRG: enable zipfile loading for manysearch (#99)
Browse files Browse the repository at this point in the history
  • Loading branch information
bluegenes authored Sep 13, 2023
1 parent 59cd5a0 commit 07470c9
Show file tree
Hide file tree
Showing 4 changed files with 134 additions and 45 deletions.
39 changes: 14 additions & 25 deletions src/manysearch.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,9 @@ use std::sync::atomic;
use std::sync::atomic::AtomicUsize;

use crate::utils::{prepare_query,
load_sketchlist_filenames, load_sketches, SearchResult, csvwriter_thread};
load_sigpaths_from_zip_or_pathlist, SearchResult,
csvwriter_thread, load_sketches_from_zip_or_pathlist,
ReportType};

pub fn manysearch<P: AsRef<Path>>(
querylist: P,
Expand All @@ -29,29 +31,12 @@ pub fn manysearch<P: AsRef<Path>>(
eprintln!("Reading list of queries from: '{}'", querylist.as_ref().display());

// Load all queries into memory at once.
let querylist_paths = load_sketchlist_filenames(&querylist)?;

let result = load_sketches(querylist_paths, &template)?;
let (queries, skipped_paths, failed_paths) = result;

eprintln!("Loaded {} query signatures", queries.len());
if failed_paths > 0 {
eprintln!("WARNING: {} signature paths failed to load. See error messages above.",
failed_paths);
}
if skipped_paths > 0 {
eprintln!("WARNING: skipped {} paths - no compatible signatures.",
skipped_paths);
}

if queries.is_empty() {
bail!("No query signatures loaded, exiting.");
}
let queries = load_sketches_from_zip_or_pathlist(querylist, &template, ReportType::Query)?;

// Load all _paths_, not signatures, into memory.
eprintln!("Reading search file paths from: '{}'", siglist.as_ref().display());
let siglist_name = siglist.as_ref().to_string_lossy().to_string();
let (search_sigs_paths, _temp_dir) = load_sigpaths_from_zip_or_pathlist(siglist)?;

let search_sigs_paths = load_sketchlist_filenames(&siglist)?;
if search_sigs_paths.is_empty() {
bail!("No signatures to search loaded, exiting.");
}
Expand Down Expand Up @@ -114,8 +99,12 @@ pub fn manysearch<P: AsRef<Path>>(
}
}
} else {
eprintln!("WARNING: no compatible sketches in path '{}'",
filename.display());
// for reading zips, this is likely not a useful warning and
// would show up too often (every sig is stored as individual file).
if !siglist_name.ends_with(".zip") {
eprintln!("WARNING: no compatible sketches in path '{}'",
filename.display());
}
let _ = skipped_paths.fetch_add(1, atomic::Ordering::SeqCst);
}
Some(results)
Expand Down Expand Up @@ -150,11 +139,11 @@ pub fn manysearch<P: AsRef<Path>>(
let failed_paths = failed_paths.load(atomic::Ordering::SeqCst);

if skipped_paths > 0 {
eprintln!("WARNING: skipped {} paths - no compatible signatures.",
eprintln!("WARNING: skipped {} search paths - no compatible signatures.",
skipped_paths);
}
if failed_paths > 0 {
eprintln!("WARNING: {} signature paths failed to load. See error messages above.",
eprintln!("WARNING: {} search paths failed to load. See error messages above.",
failed_paths);
}

Expand Down
15 changes: 10 additions & 5 deletions src/mastiff_manysearch.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ use std::sync::atomic;
use std::sync::atomic::AtomicUsize;

use crate::utils::{prepare_query, is_revindex_database,
load_sketchlist_filenames, SearchResult, csvwriter_thread};
load_sigpaths_from_zip_or_pathlist, SearchResult, csvwriter_thread};


pub fn mastiff_manysearch<P: AsRef<Path>>(
Expand All @@ -32,7 +32,8 @@ pub fn mastiff_manysearch<P: AsRef<Path>>(
println!("Loaded DB");

// Load query paths
let query_paths = load_sketchlist_filenames(&queries_file)?;
let queryfile_name = queries_file.as_ref().to_string_lossy().to_string();
let (query_paths, temp_dir) = load_sigpaths_from_zip_or_pathlist(&queries_file)?;

// if query_paths is empty, exit with error
if query_paths.is_empty() {
Expand Down Expand Up @@ -92,8 +93,12 @@ pub fn mastiff_manysearch<P: AsRef<Path>>(
}
}
} else {
eprintln!("WARNING: no compatible sketches in path '{}'",
// for reading zips, this is likely not a useful warning and
// would show up too often (every sig is stored as individual file).
if !queryfile_name.ends_with(".zip") {
eprintln!("WARNING: no compatible sketches in path '{}'",
filename.display());
}
let _ = skipped_paths.fetch_add(1, atomic::Ordering::SeqCst);
}
if results.is_empty() {
Expand Down Expand Up @@ -138,11 +143,11 @@ pub fn mastiff_manysearch<P: AsRef<Path>>(
let failed_paths = failed_paths.load(atomic::Ordering::SeqCst);

if skipped_paths > 0 {
eprintln!("WARNING: skipped {} paths - no compatible signatures.",
eprintln!("WARNING: skipped {} query paths - no compatible signatures.",
skipped_paths);
}
if failed_paths > 0 {
eprintln!("WARNING: {} signature paths failed to load. See error messages above.",
eprintln!("WARNING: {} query paths failed to load. See error messages above.",
failed_paths);
}

Expand Down
22 changes: 22 additions & 0 deletions src/python/tests/test_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,28 @@ def test_index_zipfile(runtmp, capfd):
assert 'Found 3 filepaths' in captured.err


def test_index_zipfile_bad(runtmp, capfd):
    """Check that `index` fails cleanly on a file that is a zip in name only.

    The input is the bytes of a ``.sig.gz`` test file written to a path ending
    in ``.zip``. The command is expected to raise ``SourmashCommandFailed`` and
    to report an invalid-archive error on stderr.
    """
    gz_sig_path = get_test_data('2.fa.sig.gz')
    fake_zip_path = runtmp.output('query.zip')

    # Copy the raw .sig.gz bytes into query.zip so the extension claims "zip"
    # while the content is not a zip archive.
    with open(fake_zip_path, 'wb') as dst, open(gz_sig_path, 'rb') as src:
        dst.write(src.read())

    out_csv = runtmp.output('out.csv')

    # The command itself must fail; the error text is checked afterwards.
    with pytest.raises(utils.SourmashCommandFailed):
        runtmp.sourmash('scripts', 'index', fake_zip_path, '-o', out_csv)

    stderr_text = capfd.readouterr().err
    # Echo stderr so it shows up in the pytest report when the assertion fails.
    print(stderr_text)

    expected_msg = 'Error: invalid Zip archive: Could not find central directory end'
    assert expected_msg in stderr_text


def test_index_check(runtmp):
# test check index
siglist = runtmp.output('db-sigs.txt')
Expand Down
Loading

0 comments on commit 07470c9

Please sign in to comment.