Skip to content

Commit

Permalink
general read_sigpaths fn
Browse files Browse the repository at this point in the history
  • Loading branch information
bluegenes committed Sep 8, 2023
1 parent b0c909e commit 00bc2bc
Show file tree
Hide file tree
Showing 5 changed files with 54 additions and 34 deletions.
20 changes: 2 additions & 18 deletions src/index.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,7 @@ use std::path::{Path, PathBuf};
use sourmash::sketch::Sketch;
use sourmash::index::revindex::RevIndex;


use crate::utils::{load_sigpaths_from_zip, load_sketchlist_filenames};
use crate::utils::load_paths_from_zip_or_pathlist;

pub fn index<P: AsRef<Path>>(
siglist: P,
Expand All @@ -12,35 +11,20 @@ pub fn index<P: AsRef<Path>>(
save_paths: bool,
colors: bool,
) -> Result<(), Box<dyn std::error::Error>> {
let mut temp_dir = None;
println!("Loading siglist");

let index_sigs: Vec<PathBuf>;

if siglist.as_ref().extension().map(|ext| ext == "zip").unwrap_or(false) {
let (paths, tempdir) = load_sigpaths_from_zip(&siglist)?;
temp_dir = Some(tempdir);
index_sigs = paths;
} else {
index_sigs = load_sketchlist_filenames(&siglist)?;
}
let (index_sigs, temp_dir) = load_paths_from_zip_or_pathlist(&siglist)?;

// if index_sigs pathlist is empty, bail
if index_sigs.is_empty() {
bail!("No signatures to index loaded, exiting.");
}

eprintln!("Loaded {} sig paths in siglist", index_sigs.len());

// Create or open the RevIndex database with the provided output path and colors flag
let db = RevIndex::create(output.as_ref(), colors);

// Index the signatures using the loaded template, threshold, and save_paths option
db.index(index_sigs, &template, 0.0, save_paths);

if let Some(temp_dir) = temp_dir {
temp_dir.close()?;
}

Ok(())
}
6 changes: 3 additions & 3 deletions src/multisearch.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ use sourmash::signature::SigsTrait;
use sourmash::sketch::minhash::{max_hash_for_scaled, KmerMinHash};
use sourmash::sketch::Sketch;

use crate::utils::{report_on_sketch_loading, load_sketches_from_zip_or_pathlist};
use crate::utils::{load_sketches_from_zip_or_pathlist, ReportType};

/// Search many queries against a list of signatures.
///
Expand All @@ -40,10 +40,10 @@ let template_mh = KmerMinHash::builder()
let template = Sketch::MinHash(template_mh);

// Load all queries into memory at once.
let queries = load_sketches_from_zip_or_pathlist(&querylist, &template, true)?;
let queries = load_sketches_from_zip_or_pathlist(&querylist, &template, ReportType::Query)?;

// Load all against sketches into memory at once.
let against = load_sketches_from_zip_or_pathlist(&againstlist, &template, false)?;
let against = load_sketches_from_zip_or_pathlist(&againstlist, &template, ReportType::Against)?;

// set up a multi-producer, single-consumer channel.
let (send, recv) = std::sync::mpsc::sync_channel(rayon::current_num_threads());
Expand Down
2 changes: 1 addition & 1 deletion src/python/tests/test_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,7 @@ def test_index_zipfile(runtmp, capfd):
assert 'index is done' in runtmp.last_result.err
captured = capfd.readouterr()
print(captured.err)
assert 'Loaded 3 sig paths in siglist' in captured.err
assert 'Found 3 filepaths' in captured.err


def test_index_check(runtmp):
Expand Down
2 changes: 1 addition & 1 deletion src/python/tests/test_multisearch.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,7 @@ def test_simple_threshold(runtmp, zip_query, zip_db):


@pytest.mark.parametrize("zip_query", [False, True])
def test_missing_query(runtmp, capfd, zip_query, zip_db):
def test_missing_query(runtmp, capfd, zip_query):
# test with a missing query list
query_list = runtmp.output('query.txt')
against_list = runtmp.output('against.txt')
Expand Down
58 changes: 47 additions & 11 deletions src/utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ use sourmash::sketch::Sketch;
use sourmash::prelude::MinHashOps;
use sourmash::prelude::FracMinHashOps;

// use tempfile::tempdir;
/// Track a name/minhash.
pub struct SmallSignature {
Expand Down Expand Up @@ -222,7 +223,7 @@ pub fn load_sketchlist_filenames<P: AsRef<Path>>(sketchlist_filename: &P) ->

pub fn load_sigpaths_from_zip<P: AsRef<Path>>(
zip_path: P,
) -> Result<(Vec<PathBuf>, tempfile::TempDir), Box<dyn std::error::Error>> {
) -> Result<(Vec<PathBuf>, tempfile::TempDir)> {
let mut signature_paths = Vec::new();
let temp_dir = tempdir()?;
let zip_file = File::open(&zip_path)?;
Expand All @@ -238,7 +239,7 @@ pub fn load_sigpaths_from_zip<P: AsRef<Path>>(
let mut new_file = File::create(temp_dir.path().join(file_name))?;
new_file.write_all(&sig)?;

// Push the created path directly to the vector
// add path to signature_paths
signature_paths.push(temp_dir.path().join(file_name));
}
}
Expand Down Expand Up @@ -438,13 +439,51 @@ pub fn load_sketches_from_zip<P: AsRef<Path>>(
}


/// Resolve `sketchlist_path` into a list of signature file paths.
///
/// A path whose extension is exactly `zip` is handed to `load_sigpaths_from_zip`;
/// the temporary directory that call returns is passed back as `Some(..)`.
/// Any other path is handed to `load_sketchlist_filenames` and `None` is
/// returned in the second slot.
///
/// The source path and the number of paths found are reported on stderr.
///
/// # Errors
///
/// Propagates whatever error the selected loader returns. An empty list is
/// NOT treated as an error here; the caller decides how to handle it.
pub fn load_paths_from_zip_or_pathlist<P: AsRef<Path>>(
    sketchlist_path: P,
) -> Result<(Vec<PathBuf>, Option<tempfile::TempDir>)> {
    let source = sketchlist_path.as_ref();
    eprintln!("Reading list of filepaths from: '{}'", source.display());

    // Only an exact, case-sensitive "zip" extension selects the zip loader.
    let is_zip = matches!(source.extension(), Some(ext) if ext == "zip");

    let (paths, temp_dir) = if is_zip {
        // NOTE(review): the returned paths presumably live inside this temp
        // dir, so callers should keep it alive while using them - confirm.
        let (paths, dir) = load_sigpaths_from_zip(&sketchlist_path)?;
        (paths, Some(dir))
    } else {
        (load_sketchlist_filenames(&sketchlist_path)?, None)
    };

    eprintln!("Found {} filepaths", paths.len());
    Ok((paths, temp_dir))
}

/// Which side of a comparison a set of sketches belongs to.
///
/// Used to pick the wording of progress and error messages when loading
/// sketches. The type is a plain tag, so it is `Copy` and comparable.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ReportType {
    /// Sketches loaded from the query list.
    Query,
    /// Sketches loaded from the "against" (database) list.
    Against,
}

impl ReportType {
    /// Return the label for this report type as used in log messages.
    ///
    /// `Query` yields `"queries"` when `plural` is true and `"query"`
    /// otherwise. `Against` yields `"against"` regardless of `plural`.
    fn as_str(&self, plural: bool) -> &'static str {
        match self {
            ReportType::Query if plural => "queries",
            ReportType::Query => "query",
            ReportType::Against => "against",
        }
    }
}

impl std::fmt::Display for ReportType {
    /// Write the singular label for this report type (`"query"` or
    /// `"against"`), as produced by `as_str(false)`.
    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
        let label = self.as_str(false);
        f.write_str(label)
    }
}

pub fn load_sketches_from_zip_or_pathlist<P: AsRef<Path>>(
sketchlist_path: P,
template: &Sketch,
is_query: bool
report_type: ReportType,
) -> Result<Vec<SmallSignature>> {
let report_type = if is_query { "queries" } else { "against" };
eprintln!("Reading list of {} from: '{}'", report_type, sketchlist_path.as_ref().display());
eprintln!("Reading list of {} from: '{}'", report_type.as_str(true), sketchlist_path.as_ref().display());

let (sketchlist, skipped_paths, failed_paths) =
if sketchlist_path.as_ref().extension().map(|ext| ext == "zip").unwrap_or(false) {
Expand All @@ -454,20 +493,17 @@ pub fn load_sketches_from_zip_or_pathlist<P: AsRef<Path>>(
load_sketches(sketch_paths, template)?
};

report_on_sketch_loading(&sketchlist, skipped_paths, failed_paths, is_query)?;
report_on_sketch_loading(&sketchlist, skipped_paths, failed_paths, report_type)?;

Ok(sketchlist)
}



pub fn report_on_sketch_loading(
sketchlist: &[SmallSignature],
skipped_paths: usize,
failed_paths: usize,
is_query: bool,
report_type: ReportType,
) -> Result<()> {
let report_type = if is_query { "query" } else { "against" };

if failed_paths > 0 {
eprintln!(
Expand All @@ -485,7 +521,7 @@ pub fn report_on_sketch_loading(
}

// Validate sketches
eprintln!("Loaded {} {} signatures", sketchlist.len(), report_type);
eprintln!("Loaded {} {} signatures", sketchlist.len(), report_type.as_str(false));
if sketchlist.is_empty() {
bail!("No {} signatures loaded, exiting.", report_type);
}
Expand Down

0 comments on commit 00bc2bc

Please sign in to comment.