Skip to content

Commit

Permalink
MRG: add utility functions for zip reading; use in index, multisearch (
Browse files Browse the repository at this point in the history
  • Loading branch information
bluegenes authored Sep 12, 2023
1 parent 8827edf commit 59cd5a0
Show file tree
Hide file tree
Showing 6 changed files with 345 additions and 108 deletions.
22 changes: 3 additions & 19 deletions src/index.rs
Original file line number Diff line number Diff line change
@@ -1,9 +1,8 @@
use std::path::{Path, PathBuf};
use std::path::Path;
use sourmash::sketch::Sketch;
use sourmash::index::revindex::RevIndex;


use crate::utils::{read_signatures_from_zip, load_sketchlist_filenames};
use crate::utils::load_sigpaths_from_zip_or_pathlist;

pub fn index<P: AsRef<Path>>(
siglist: P,
Expand All @@ -12,35 +11,20 @@ pub fn index<P: AsRef<Path>>(
save_paths: bool,
colors: bool,
) -> Result<(), Box<dyn std::error::Error>> {
let mut temp_dir = None;
println!("Loading siglist");

let index_sigs: Vec<PathBuf>;

if siglist.as_ref().extension().map(|ext| ext == "zip").unwrap_or(false) {
let (paths, tempdir) = read_signatures_from_zip(&siglist)?;
temp_dir = Some(tempdir);
index_sigs = paths;
} else {
index_sigs = load_sketchlist_filenames(&siglist)?;
}
let (index_sigs, _temp_dir) = load_sigpaths_from_zip_or_pathlist(&siglist)?;

// if index_sigs pathlist is empty, bail
if index_sigs.is_empty() {
bail!("No signatures to index loaded, exiting.");
}

eprintln!("Loaded {} sig paths in siglist", index_sigs.len());

// Create or open the RevIndex database with the provided output path and colors flag
let db = RevIndex::create(output.as_ref(), colors);

// Index the signatures using the loaded template, threshold, and save_paths option
db.index(index_sigs, &template, 0.0, save_paths);

if let Some(temp_dir) = temp_dir {
temp_dir.close()?;
}

Ok(())
}
4 changes: 2 additions & 2 deletions src/manysketch.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ use rayon::prelude::*;

use std::io::Read;
use std::path::Path;
use crate::utils::{Params, load_sketch_fromfile, ZipMessage, sigwriter};
use crate::utils::{Params, load_fasta_fromfile, ZipMessage, sigwriter};
use sourmash::signature::Signature;
use sourmash::cmd::ComputeParameters;
use std::sync::atomic;
Expand Down Expand Up @@ -130,7 +130,7 @@ pub fn manysketch<P: AsRef<Path> + Sync>(
output: String,
) -> Result<(), Box<dyn std::error::Error>> {

let fileinfo = match load_sketch_fromfile(&filelist) {
let fileinfo = match load_fasta_fromfile(&filelist) {
Ok(result) => result,
Err(e) => bail!("Could not load fromfile csv. Underlying error: {}", e)
};
Expand Down
48 changes: 4 additions & 44 deletions src/multisearch.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ use sourmash::signature::SigsTrait;
use sourmash::sketch::minhash::{max_hash_for_scaled, KmerMinHash};
use sourmash::sketch::Sketch;

use crate::utils::{load_sketchlist_filenames, load_sketches};
use crate::utils::{load_sketches_from_zip_or_pathlist, ReportType};

/// Search many queries against a list of signatures.
///
Expand All @@ -29,7 +29,7 @@ pub fn multisearch<P: AsRef<Path>>(
ksize: u8,
scaled: usize,
output: Option<P>,
) -> Result<()> {
) -> Result<(), Box<dyn std::error::Error>> {
// construct a MinHash template for loading.
let max_hash = max_hash_for_scaled(scaled as u64);
let template_mh = KmerMinHash::builder()
Expand All @@ -39,51 +39,11 @@ let template_mh = KmerMinHash::builder()
.build();
let template = Sketch::MinHash(template_mh);

// Read in list of query paths.
eprintln!("Reading list of queries from: '{}'", querylist.as_ref().display());

// Load all queries into memory at once.
let querylist_paths = load_sketchlist_filenames(&querylist)?;

let result = load_sketches(querylist_paths, &template)?;
let (queries, skipped_paths, failed_paths) = result;

eprintln!("Loaded {} query signatures", queries.len());
if failed_paths > 0 {
eprintln!("WARNING: {} signature paths failed to load. See error messages above.",
failed_paths);
}
if skipped_paths > 0 {
eprintln!("WARNING: skipped {} paths - no compatible signatures.",
skipped_paths);
}

if queries.is_empty() {
bail!("No query signatures loaded, exiting.");
}

// Read in list of against paths.
eprintln!("Reading list of against paths from: '{}'", againstlist.as_ref().display());
let queries = load_sketches_from_zip_or_pathlist(&querylist, &template, ReportType::Query)?;

// Load all against sketches into memory at once.
let againstlist_paths = load_sketchlist_filenames(&againstlist)?;

let result = load_sketches(againstlist_paths, &template)?;
let (against, skipped_paths, failed_paths) = result;

eprintln!("Loaded {} against signatures", against.len());
if failed_paths > 0 {
eprintln!("WARNING: {} signature paths failed to load. See error messages above.",
failed_paths);
}
if skipped_paths > 0 {
eprintln!("WARNING: skipped {} paths - no compatible signatures.",
skipped_paths);
}

if against.is_empty() {
bail!("No query signatures loaded, exiting.");
}
let against = load_sketches_from_zip_or_pathlist(&againstlist, &template, ReportType::Against)?;

// set up a multi-producer, single-consumer channel.
let (send, recv) = std::sync::mpsc::sync_channel(rayon::current_num_threads());
Expand Down
2 changes: 1 addition & 1 deletion src/python/tests/test_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,7 @@ def test_index_zipfile(runtmp, capfd):
assert 'index is done' in runtmp.last_result.err
captured = capfd.readouterr()
print(captured.err)
assert 'Loaded 3 sig paths in siglist' in captured.err
assert 'Found 3 filepaths' in captured.err


def test_index_check(runtmp):
Expand Down
97 changes: 86 additions & 11 deletions src/python/tests/test_multisearch.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,14 @@ def test_installed(runtmp):

assert 'usage: multisearch' in runtmp.last_result.err

def test_simple(runtmp):
# Test helper: runs `sourmash sig cat <siglist> -o <db>` through the `runtmp`
# fixture and returns `db` unchanged, so a test can swap the returned path in
# for the original list file. The tests in this file pass a `.zip` path as `db`.
# NOTE(review): presumably `sig cat` gathers every signature named in `siglist`
# into the single output `db` -- confirm against the sourmash CLI docs.
def zip_siglist(runtmp, siglist, db):
runtmp.sourmash('sig', 'cat', siglist,
'-o', db)
return db

@pytest.mark.parametrize("zip_query", [False, True])
@pytest.mark.parametrize("zip_db", [False, True])
def test_simple(runtmp, zip_query, zip_db):
# test basic execution!
query_list = runtmp.output('query.txt')
against_list = runtmp.output('against.txt')
Expand All @@ -37,6 +44,11 @@ def test_simple(runtmp):

output = runtmp.output('out.csv')

if zip_db:
against_list = zip_siglist(runtmp, against_list, runtmp.output('db.zip'))
if zip_query:
query_list = zip_siglist(runtmp, query_list, runtmp.output('query.zip'))

runtmp.sourmash('scripts', 'multisearch', query_list, against_list,
'-o', output)
assert os.path.exists(output)
Expand Down Expand Up @@ -82,7 +94,9 @@ def test_simple(runtmp):
assert intersect_hashes == 2529


def test_simple_threshold(runtmp):
@pytest.mark.parametrize("zip_query", [False, True])
@pytest.mark.parametrize("zip_db", [False, True])
def test_simple_threshold(runtmp, zip_query, zip_db):
# test with a simple threshold => only 3 results
query_list = runtmp.output('query.txt')
against_list = runtmp.output('against.txt')
Expand All @@ -96,6 +110,11 @@ def test_simple_threshold(runtmp):

output = runtmp.output('out.csv')

if zip_db:
against_list = zip_siglist(runtmp, against_list, runtmp.output('db.zip'))
if zip_query:
query_list = zip_siglist(runtmp, query_list, runtmp.output('query.zip'))

runtmp.sourmash('scripts', 'multisearch', query_list, against_list,
'-o', output, '-t', '0.5')
assert os.path.exists(output)
Expand All @@ -104,7 +123,8 @@ def test_simple_threshold(runtmp):
assert len(df) == 3


def test_missing_query(runtmp, capfd):
@pytest.mark.parametrize("zip_query", [False, True])
def test_missing_query(runtmp, capfd, zip_query):
# test with a missing query list
query_list = runtmp.output('query.txt')
against_list = runtmp.output('against.txt')
Expand All @@ -118,6 +138,9 @@ def test_missing_query(runtmp, capfd):

output = runtmp.output('out.csv')

if zip_query:
query_list = runtmp.output('query.zip')

with pytest.raises(utils.SourmashCommandFailed):
runtmp.sourmash('scripts', 'multisearch', query_list, against_list,
'-o', output)
Expand Down Expand Up @@ -170,10 +193,39 @@ def test_bad_query_2(runtmp, capfd):
print(captured.err)

assert "WARNING: could not load sketches from path 'no-exist'" in captured.err
assert "WARNING: 1 signature paths failed to load. See error messages above." in captured.err
assert "WARNING: 1 query paths failed to load. See error messages above." in captured.err


def test_bad_query_3(runtmp, capfd):
# test with a bad query (a .sig.gz file renamed as zip file)
against_list = runtmp.output('against.txt')

sig2 = get_test_data('2.fa.sig.gz')
sig47 = get_test_data('47.fa.sig.gz')
sig63 = get_test_data('63.fa.sig.gz')

query_zip = runtmp.output('query.zip')
# cp sig2 into query_zip
with open(query_zip, 'wb') as fp:
with open(sig2, 'rb') as fp2:
fp.write(fp2.read())

make_file_list(against_list, [sig2, sig47, sig63])

output = runtmp.output('out.csv')

def test_missing_against(runtmp, capfd):
with pytest.raises(utils.SourmashCommandFailed):
runtmp.sourmash('scripts', 'multisearch', query_zip, against_list,
'-o', output)

captured = capfd.readouterr()
print(captured.err)

assert 'Error: invalid Zip archive: Could not find central directory end' in captured.err


@pytest.mark.parametrize("zip_db", [False, True])
def test_missing_against(runtmp, capfd, zip_db):
# test with a missing against list
query_list = runtmp.output('query.txt')
against_list = runtmp.output('against.txt')
Expand All @@ -185,6 +237,10 @@ def test_missing_against(runtmp, capfd):
make_file_list(query_list, [sig2, sig47, sig63])
# do not create against_list

if zip_db:
# point at a .zip path, but don't create the file
against_list = runtmp.output('db.zip')

output = runtmp.output('out.csv')

with pytest.raises(utils.SourmashCommandFailed):
Expand Down Expand Up @@ -241,7 +297,7 @@ def test_bad_against_2(runtmp, capfd):
print(captured.err)

assert "WARNING: could not load sketches from path 'no-exist'" in captured.err
assert "WARNING: 1 signature paths failed to load. See error messages above." in captured.err
assert "WARNING: 1 search paths failed to load. See error messages above." in captured.err


def test_empty_query(runtmp):
Expand All @@ -266,7 +322,8 @@ def test_empty_query(runtmp):
# @CTB


def test_nomatch_query(runtmp, capfd):
@pytest.mark.parametrize("zip_query", [False, True])
def test_nomatch_query(runtmp, capfd, zip_query):
# test a non-matching (diff ksize) in query; do we get warning message?
query_list = runtmp.output('query.txt')
against_list = runtmp.output('against.txt')
Expand All @@ -281,17 +338,21 @@ def test_nomatch_query(runtmp, capfd):

output = runtmp.output('out.csv')

if zip_query:
query_list = zip_siglist(runtmp, query_list, runtmp.output('query.zip'))

runtmp.sourmash('scripts', 'multisearch', query_list, against_list,
'-o', output)
assert os.path.exists(output)

captured = capfd.readouterr()
print(captured.err)

assert 'WARNING: skipped 1 paths - no compatible signatures.' in captured.err
assert 'WARNING: skipped 1 query paths - no compatible signatures' in captured.err


def test_load_only_one_bug(runtmp, capfd):
@pytest.mark.parametrize("zip_db", [False, True])
def test_load_only_one_bug(runtmp, capfd, zip_db):
# check that we behave properly when presented with multiple against
# sketches
query_list = runtmp.output('query.txt')
Expand All @@ -306,6 +367,9 @@ def test_load_only_one_bug(runtmp, capfd):
make_file_list(query_list, [sig1_k31])
make_file_list(against_list, [sig1_all])

if zip_db:
against_list = zip_siglist(runtmp, against_list, runtmp.output('db.zip'))

output = runtmp.output('out.csv')

runtmp.sourmash('scripts', 'multisearch', query_list, against_list,
Expand All @@ -319,7 +383,8 @@ def test_load_only_one_bug(runtmp, capfd):
assert not 'WARNING: no compatible sketches in path ' in captured.err


def test_load_only_one_bug_as_query(runtmp, capfd):
@pytest.mark.parametrize("zip_query", [False, True])
def test_load_only_one_bug_as_query(runtmp, capfd, zip_query):
# check that we behave properly when presented with multiple query
# sketches in one file, with only one matching.
query_list = runtmp.output('query.txt')
Expand All @@ -334,6 +399,9 @@ def test_load_only_one_bug_as_query(runtmp, capfd):
make_file_list(query_list, [sig1_all])
make_file_list(against_list, [sig1_k31])

if zip_query:
query_list = zip_siglist(runtmp, query_list, runtmp.output('query.zip'))

output = runtmp.output('out.csv')

runtmp.sourmash('scripts', 'multisearch', query_list, against_list,
Expand All @@ -347,7 +415,9 @@ def test_load_only_one_bug_as_query(runtmp, capfd):
assert not 'WARNING: no compatible sketches in path ' in captured.err


def test_md5(runtmp):
@pytest.mark.parametrize("zip_query", [False, True])
@pytest.mark.parametrize("zip_db", [False, True])
def test_md5(runtmp, zip_query, zip_db):
# test that md5s match what was in the original files, not downsampled etc.
query_list = runtmp.output('query.txt')
against_list = runtmp.output('against.txt')
Expand All @@ -361,6 +431,11 @@ def test_md5(runtmp):

output = runtmp.output('out.csv')

if zip_query:
query_list = zip_siglist(runtmp, query_list, runtmp.output('query.zip'))
if zip_db:
against_list = zip_siglist(runtmp, against_list, runtmp.output('db.zip'))

runtmp.sourmash('scripts', 'multisearch', query_list, against_list,
'-o', output)
assert os.path.exists(output)
Expand Down
Loading

0 comments on commit 59cd5a0

Please sign in to comment.