Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

MRG: change sig_from_record to use scaled from Record to downsample #3387

Merged
merged 10 commits into from
Nov 13, 2024
2 changes: 1 addition & 1 deletion src/core/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
[package]
name = "sourmash"
version = "0.17.1"
authors = ["Luiz Irber <[email protected]>", "N. Tessa Pierce-Ward <[email protected]>"]
authors = ["Luiz Irber <[email protected]>", "N. Tessa Pierce-Ward <[email protected]>", "C. Titus Brown <[email protected]>"]
description = "tools for comparing biological sequences with k-mer sketches"
repository = "https://github.com/sourmash-bio/sourmash"
keywords = ["minhash", "bioinformatics"]
Expand Down
11 changes: 5 additions & 6 deletions src/core/src/collection.rs
Original file line number Diff line number Diff line change
Expand Up @@ -409,8 +409,8 @@
// no sigs should remain
assert_eq!(cl.len(), 6);
for (_idx, rec) in cl.iter() {
// need to pass select again here so we actually downsample
let this_sig = cl.sig_from_record(rec).unwrap().select(&selection).unwrap();
debug!("record scaled is: {}", rec.scaled());

Check failure on line 412 in src/core/src/collection.rs

View workflow job for this annotation

GitHub Actions / test (macos)

cannot find macro `debug` in this scope

Check failure on line 412 in src/core/src/collection.rs

View workflow job for this annotation

GitHub Actions / test (beta)

cannot find macro `debug` in this scope

Check failure on line 412 in src/core/src/collection.rs

View workflow job for this annotation

GitHub Actions / test (stable)

cannot find macro `debug` in this scope

Check failure on line 412 in src/core/src/collection.rs

View workflow job for this annotation

GitHub Actions / test (windows)

cannot find macro `debug` in this scope
let this_sig = cl.sig_from_record(rec).unwrap();
let this_mh = this_sig.minhash().unwrap();
assert_eq!(this_mh.scaled(), 2000);
}
Expand Down Expand Up @@ -440,7 +440,7 @@
filename.push("../../tests/test-data/prot/hp.zip");
// create Selection object
let mut selection = Selection::default();
selection.set_scaled(100);
selection.set_scaled(200);
selection.set_moltype(HashFunctions::Murmur64Hp);
// load sigs into collection + select compatible signatures
let cl = Collection::from_zipfile(&filename)
Expand All @@ -450,10 +450,9 @@
// count collection length
assert_eq!(cl.len(), 2);
for (idx, _rec) in cl.iter() {
// need to pass select again here so we actually downsample
let this_sig = cl.sig_for_dataset(idx).unwrap().select(&selection).unwrap();
let this_sig = cl.sig_for_dataset(idx).unwrap();
let this_mh = this_sig.minhash().unwrap();
assert_eq!(this_mh.scaled(), 100);
assert_eq!(this_mh.scaled(), 200);
}
}

Expand Down
63 changes: 26 additions & 37 deletions src/core/src/manifest.rs
Original file line number Diff line number Diff line change
Expand Up @@ -259,8 +259,13 @@ impl Manifest {
}

impl Select for Manifest {
// select only records that satisfy selection conditions; also update
// scaled value to match.
fn select(self, selection: &Selection) -> Result<Self> {
let rows = self.records.iter().filter(|row| {
let Manifest { mut records } = self;

// TODO: with num as well?
records.retain_mut(|row| {
let mut valid = true;
valid = if let Some(ksize) = selection.ksize() {
row.ksize == ksize
Expand All @@ -279,7 +284,12 @@ impl Select for Manifest {
};
valid = if let Some(scaled) = selection.scaled() {
// num sigs have row.scaled = 0, don't include them
valid && row.scaled != 0 && row.scaled <= scaled
let v = valid && row.scaled != 0 && row.scaled <= scaled;
// if scaled is set, update!
if v {
row.scaled = scaled
};
v
} else {
valid
};
Expand All @@ -291,41 +301,7 @@ impl Select for Manifest {
valid
});

Ok(Manifest {
records: rows.cloned().collect(),
})

/*
matching_rows = self.rows
if ksize:
matching_rows = ( row for row in matching_rows
if row['ksize'] == ksize )
if moltype:
matching_rows = ( row for row in matching_rows
if row['moltype'] == moltype )
if scaled or containment:
if containment and not scaled:
raise ValueError("'containment' requires 'scaled' in Index.select'")

matching_rows = ( row for row in matching_rows
if row['scaled'] and not row['num'] )
if num:
matching_rows = ( row for row in matching_rows
if row['num'] and not row['scaled'] )

if abund:
# only need to concern ourselves if abundance is _required_
matching_rows = ( row for row in matching_rows
if row['with_abundance'] )

if picklist:
matching_rows = ( row for row in matching_rows
if picklist.matches_manifest_row(row) )

# return only the internal filenames!
for row in matching_rows:
yield row
*/
Ok(Manifest { records })
}
}

Expand Down Expand Up @@ -567,6 +543,19 @@ mod test {
selection.set_scaled(100);
let scaled100 = manifest.select(&selection).unwrap();
assert_eq!(scaled100.len(), 6);

// check that 'scaled' is updated
let manifest = collection.manifest().clone();
selection = Selection::default();
selection.set_scaled(400);
let scaled400 = manifest.select(&selection).unwrap();
assert_eq!(scaled400.len(), 6);
let max_scaled = scaled400
.iter()
.map(|r| r.scaled())
.max()
.expect("no records?!");
assert_eq!(*max_scaled, 400);
}

#[test]
Expand Down
2 changes: 1 addition & 1 deletion src/core/src/selection.rs
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,7 @@ impl Selection {
abund: Some(row.with_abundance()),
moltype: Some(row.moltype()),
num: None,
scaled: None,
scaled: Some(*row.scaled()),
containment: None,
picklist: None,
})
Expand Down
Loading