Skip to content

Commit

Permalink
MRG: clean up index to use MultiCollection (#451)
Browse files Browse the repository at this point in the history
* try making index work with standard code

* kinda working

* fmt

* refactor

* clear up the tests

* refactor/clean up

* cargo fmt

* add tests for index warning & error
  • Loading branch information
ctb authored Sep 9, 2024
1 parent 88d009a commit 005775b
Show file tree
Hide file tree
Showing 5 changed files with 141 additions and 61 deletions.
95 changes: 40 additions & 55 deletions src/index.rs
Original file line number Diff line number Diff line change
@@ -1,81 +1,66 @@
use anyhow::Context;
use camino::Utf8PathBuf as PathBuf;
use sourmash::index::revindex::RevIndex;
use sourmash::index::revindex::RevIndexOps;
use sourmash::prelude::*;
use std::fs::File;
use std::io::{BufRead, BufReader};
use std::path::Path;

use crate::utils::{load_collection, ReportType};
use sourmash::collection::{Collection, CollectionSet};

pub fn index<P: AsRef<Path>>(
siglist: String,
selection: &Selection,
output: P,
colors: bool,
_allow_failed_sigpaths: bool,
allow_failed_sigpaths: bool,
use_internal_storage: bool,
) -> Result<(), Box<dyn std::error::Error>> {
println!("Loading siglist");
eprintln!("Loading sketches from {}", siglist);

let collection = match siglist {
x if x.ends_with(".zip") => Collection::from_zipfile(x)?,
x if x.ends_with(".sig") || x.ends_with(".sig.gz") => {
let signatures = Signature::from_path(&x)
.with_context(|| format!("Failed to load signatures from: '{}'", x))?;
let multi = match load_collection(
&siglist,
selection,
ReportType::General,
allow_failed_sigpaths,
) {
Ok(multi) => multi,
Err(err) => return Err(err.into()),
};
eprintln!("Found {} sketches total.", multi.len());

Collection::from_sigs(signatures).with_context(|| {
format!(
"Loaded signatures but failed to load as collection: '{}'",
x
)
})?
// Try to convert it into a Collection and then CollectionSet.
let collection = match Collection::try_from(multi.clone()) {
// conversion worked!
Ok(c) => {
let cs: CollectionSet = c.select(selection)?.try_into()?;
Ok(cs)
}
_ => {
let file = File::open(siglist.clone())
.with_context(|| format!("Failed to open pathlist file: '{}'", siglist))?;

let reader = BufReader::new(file);

// load list of paths
let lines: Vec<_> = reader
.lines()
.filter_map(|line| match line {
Ok(path) => {
let mut filename = PathBuf::new();
filename.push(path);
Some(filename)
}
Err(_err) => None,
})
.collect();

if lines.is_empty() {
return Err(anyhow::anyhow!("Signatures failed to load. Exiting.").into());
// conversion failed; can we/should we load it into memory?
Err(_) => {
if use_internal_storage {
eprintln!("WARNING: loading all sketches into memory in order to index.");
eprintln!("See 'index' documentation for details.");
let c: Collection = multi.load_all_sigs(selection)?;
let cs: CollectionSet = c.try_into()?;
Ok(cs)
} else {
match Collection::from_paths(&lines) {
Ok(collection) => collection,
Err(err) => {
eprintln!("Error in loading from '{}': {}", siglist, err);
return Err(anyhow::anyhow!("Signatures failed to load. Exiting.").into());
}
}
Err(
anyhow::anyhow!("cannot index this type of collection with external storage")
.into(),
)
}
}
};

let collection: CollectionSet = collection.select(selection)?.try_into()?;
match collection {
Ok(collection) => {
eprintln!("Indexing {} sketches.", collection.len());
let mut index = RevIndex::create(output.as_ref(), collection, colors)?;

if collection.is_empty() {
Err(anyhow::anyhow!("Signatures failed to load. Exiting.").into())
} else {
eprintln!("Indexing {} sketches.", collection.len());
let mut index = RevIndex::create(output.as_ref(), collection, colors)?;

if use_internal_storage {
index.internalize_storage()?;
if use_internal_storage {
index.internalize_storage()?;
}
Ok(())
}
Ok(())
Err(e) => Err(e),
}
}
4 changes: 4 additions & 0 deletions src/python/tests/test_fastgather.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@ def test_installed(runtmp):


def test_simple(runtmp, capfd, indexed_query, indexed_against, zip_against, toggle_internal_storage):
if toggle_internal_storage == '--no-internal-storage':
raise pytest.xfail("not implemented")
# test basic execution!
query = get_test_data('SRR606249.sig.gz')
against_list = runtmp.output('against.txt')
Expand Down Expand Up @@ -60,6 +62,8 @@ def test_simple(runtmp, capfd, indexed_query, indexed_against, zip_against, togg


def test_simple_with_prefetch(runtmp, zip_against, indexed, toggle_internal_storage):
if toggle_internal_storage == '--no-internal-storage':
raise pytest.xfail("not implemented")
# test basic execution!
query = get_test_data('SRR606249.sig.gz')
against_list = runtmp.output('against.txt')
Expand Down
11 changes: 10 additions & 1 deletion src/python/tests/test_fastmultigather.py
Original file line number Diff line number Diff line change
Expand Up @@ -203,6 +203,9 @@ def test_simple_read_manifests(runtmp):


def test_simple_indexed(runtmp, zip_query, toggle_internal_storage):
if toggle_internal_storage == '--no-internal-storage':
raise pytest.xfail("not implemented")

# test basic execution!
query = get_test_data('SRR606249.sig.gz')
sig2 = get_test_data('2.fa.sig.gz')
Expand Down Expand Up @@ -239,6 +242,8 @@ def test_simple_indexed(runtmp, zip_query, toggle_internal_storage):


def test_simple_indexed_query_manifest(runtmp, toggle_internal_storage):
if toggle_internal_storage == '--no-internal-storage':
raise pytest.xfail("not implemented")
# test basic execution!
query = get_test_data('SRR606249.sig.gz')
sig2 = get_test_data('2.fa.sig.gz')
Expand Down Expand Up @@ -273,6 +278,8 @@ def test_simple_indexed_query_manifest(runtmp, toggle_internal_storage):


def test_missing_querylist(runtmp, capfd, indexed, zip_query, toggle_internal_storage):
if toggle_internal_storage == '--no-internal-storage':
raise pytest.xfail("not implemented")
# test missing querylist
query_list = runtmp.output('query.txt')
against_list = runtmp.output('against.txt')
Expand Down Expand Up @@ -1174,7 +1181,9 @@ def test_rocksdb_no_internal_storage_gather_fails(runtmp, capfd):
"47.fa.sig.gz",
"63.fa.sig.gz"])

# index!
# index! CTB, note this will fail currently.
raise pytest.xfail("not implemented")

runtmp.sourmash('scripts', 'index', against_list, '--no-internal-storage',
'-o', 'subdir/against.rocksdb')

Expand Down
59 changes: 54 additions & 5 deletions src/python/tests/test_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,9 @@ def test_installed(runtmp):


def test_index(runtmp, toggle_internal_storage):
if toggle_internal_storage == "--no-internal-storage":
raise pytest.xfail("not implemented currently")

# test basic index!
siglist = runtmp.output('db-sigs.txt')

Expand All @@ -35,6 +38,49 @@ def test_index(runtmp, toggle_internal_storage):
assert 'index is done' in runtmp.last_result.err


def test_index_warning_message(runtmp, capfd):
# test basic index when it has to load things into memory - see #451.
siglist = runtmp.output('db-sigs.txt')

sig2 = get_test_data('2.fa.sig.gz')
sig47 = get_test_data('47.fa.sig.gz')
sig63 = get_test_data('63.fa.sig.gz')

make_file_list(siglist, [sig2, sig47, sig63])

output = runtmp.output('db.rocksdb')

runtmp.sourmash('scripts', 'index', siglist, '-o', output)
assert os.path.exists(output)
print(runtmp.last_result.err)

assert 'index is done' in runtmp.last_result.err
captured = capfd.readouterr()
print(captured.err)
assert "WARNING: loading all sketches into memory in order to index." in captured.err


def test_index_error_message(runtmp, capfd):
# test basic index when it errors out b/c can't load
siglist = runtmp.output('db-sigs.txt')

sig2 = get_test_data('2.fa.sig.gz')
sig47 = get_test_data('47.fa.sig.gz')
sig63 = get_test_data('63.fa.sig.gz')

make_file_list(siglist, [sig2, sig47, sig63])

output = runtmp.output('db.rocksdb')

with pytest.raises(utils.SourmashCommandFailed):
runtmp.sourmash('scripts', 'index', siglist, '-o', output,
'--no-internal-storage')

captured = capfd.readouterr()
print(captured.err)
assert "cannot index this type of collection with external storage" in captured.err


def test_index_protein(runtmp, toggle_internal_storage):
sigs = get_test_data('protein.zip')
output = runtmp.output('db.rocksdb')
Expand Down Expand Up @@ -82,10 +128,9 @@ def test_index_missing_siglist(runtmp, capfd, toggle_internal_storage):

captured = capfd.readouterr()
print(captured.err)
assert 'Failed to open pathlist file:' in captured.err
assert 'Error: No such file or directory: ' in captured.err


@pytest.mark.xfail(reason="not implemented yet")
def test_index_sig(runtmp, capfd, toggle_internal_storage):
# test index with a .sig.gz file instead of pathlist
# (should work now)
Expand All @@ -101,7 +146,6 @@ def test_index_sig(runtmp, capfd, toggle_internal_storage):
assert 'index is done' in runtmp.last_result.err


@pytest.mark.xfail(reason="not implemented yet")
def test_index_manifest(runtmp, capfd, toggle_internal_storage):
# test index with a manifest file
sig2 = get_test_data('2.fa.sig.gz')
Expand All @@ -118,7 +162,6 @@ def test_index_manifest(runtmp, capfd, toggle_internal_storage):
assert 'index is done' in runtmp.last_result.err


@pytest.mark.xfail(reason="needs more work")
def test_index_bad_siglist_2(runtmp, capfd):
# test with a bad siglist (containing a missing file)
against_list = runtmp.output('against.txt')
Expand All @@ -139,7 +182,6 @@ def test_index_bad_siglist_2(runtmp, capfd):
assert "WARNING: could not load sketches from path 'no-exist'" in captured.err


@pytest.mark.xfail(reason="needs more work")
def test_index_empty_siglist(runtmp, capfd):
# test empty siglist file
siglist = runtmp.output('db-sigs.txt')
Expand Down Expand Up @@ -347,6 +389,8 @@ def test_index_zipfile_bad(runtmp, capfd):


def test_index_check(runtmp, toggle_internal_storage):
if toggle_internal_storage == "--no-internal-storage":
raise pytest.xfail("not implemented currently")
# test check index
siglist = runtmp.output('db-sigs.txt')

Expand All @@ -367,6 +411,8 @@ def test_index_check(runtmp, toggle_internal_storage):


def test_index_check_quick(runtmp, toggle_internal_storage):
if toggle_internal_storage == "--no-internal-storage":
raise pytest.xfail("not implemented currently")
# test check index
siglist = runtmp.output('db-sigs.txt')

Expand All @@ -387,6 +433,9 @@ def test_index_check_quick(runtmp, toggle_internal_storage):


def test_index_subdir(runtmp, toggle_internal_storage):
if toggle_internal_storage == "--no-internal-storage":
raise pytest.xfail("not implemented currently")

# test basic index & output to subdir
siglist = runtmp.output('db-sigs.txt')

Expand Down
33 changes: 33 additions & 0 deletions src/utils/multicollection.rs
Original file line number Diff line number Diff line change
Expand Up @@ -282,6 +282,26 @@ impl MultiCollection {
.collect();
MultiCollection::new(colls, self.contains_revindex)
}

pub fn load_all_sigs(self, selection: &Selection) -> Result<Collection> {
let all_sigs: Vec<Signature> = self
.par_iter()
.filter_map(|(coll, _idx, record)| match coll.sig_from_record(record) {
Ok(sig) => {
let sig = sig.clone().select(selection).ok()?;
Some(Signature::from(sig))
}
Err(_) => {
eprintln!(
"FAILED to load sketch from '{}'",
record.internal_location()
);
None
}
})
.collect();
Ok(Collection::from_sigs(all_sigs)?)
}
}

impl Select for MultiCollection {
Expand Down Expand Up @@ -318,6 +338,19 @@ impl From<Vec<MultiCollection>> for MultiCollection {
}
}

// Extract a single Collection from a MultiCollection, if possible
impl TryFrom<MultiCollection> for Collection {
type Error = &'static str;

fn try_from(multi: MultiCollection) -> Result<Self, Self::Error> {
if multi.collections.len() == 1 {
Ok(multi.collections.into_iter().next().unwrap())
} else {
Err("More than one Collection in this MultiCollection; cannot convert")
}
}
}

/// Track a name/minhash.
pub struct SmallSignature {
pub location: String,
Expand Down

0 comments on commit 005775b

Please sign in to comment.