Skip to content

Commit

Permalink
Merge branch 'latest' into try-skipmers
Browse files Browse the repository at this point in the history
  • Loading branch information
bluegenes authored Dec 20, 2024
2 parents 8cb0bea + b69c960 commit 81bab97
Show file tree
Hide file tree
Showing 16 changed files with 166 additions and 88 deletions.
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ repos:
- id: check-toml
- id: debug-statements
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.8.2
rev: v0.8.3
hooks:
- id: ruff-format
- id: ruff
Expand Down
28 changes: 14 additions & 14 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

14 changes: 13 additions & 1 deletion doc/databases.md
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ genomes. Among other uses, they can be used to detect host
contamination in microbial metagenomes.

Each file includes sketches at k=21, k=31, and k=51, at a scaled value of
1000, and is about 110 MB.
1000, and is under 50 MB.

* Human (hg38) - [hg38.sig.zip](https://farm.cse.ucdavis.edu/~ctbrown/sourmash-db/host/hg38.sig.zip)
* Cow (bosTau9) - [bosTau9.sig.zip](https://farm.cse.ucdavis.edu/~ctbrown/sourmash-db/host/bosTau9.sig.zip)
Expand All @@ -49,6 +49,18 @@ Each file includes sketches at k=21, k=31, and k=51, at a scaled of
* Sheep (oviAri4) - [oviAri4.sig.zip](https://farm.cse.ucdavis.edu/~ctbrown/sourmash-db/host/oviAri4.sig.zip)
* Pig (susScr11) - [susScr11.sig.zip](https://farm.cse.ucdavis.edu/~ctbrown/sourmash-db/host/susScr11.sig.zip)

## Sketches for plant genomes

These sketches are for the plant genomes available in GenBank as of 2024-07.

| K-mer size | Zipfile collection |
| -------- | -------- |
| k21 | [download (7G)](https://farm.cse.ucdavis.edu/\~ctbrown/sourmash-db/genbank-plant-2024-07/genbank-plants-2024-07.k21.zip) |
| k31 | [download (8.8G)](https://farm.cse.ucdavis.edu/\~ctbrown/sourmash-db/genbank-plant-2024-07/genbank-plants-2024-07.k31.zip) |
| k51 | [download (11G)](https://farm.cse.ucdavis.edu/\~ctbrown/sourmash-db/genbank-plant-2024-07/genbank-plants-2024-07.k51.zip) |

Lineage spreadsheet for sourmash `tax` commands: [download](https://farm.cse.ucdavis.edu/\~ctbrown/sourmash-db/genbank-plant-2024-07/genbank-plants-2024-07.lineages.csv.gz)

## GTDB R08-RS214 - DNA databases

[GTDB R08-RS214](https://forum.gtdb.ecogenomic.org/t/announcing-gtdb-r08-rs214/456) consists of 402,709 genomes organized into 85,205 species clusters.
Expand Down
6 changes: 3 additions & 3 deletions src/core/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -53,9 +53,9 @@ piz = "0.5.0"
primal-check = "0.3.4"
rayon = { version = "1.10.0", optional = true }
rkyv = { version = "0.7.44", optional = true }
roaring = "0.10.8"
roaring = "0.10.9"
roots = "0.0.8"
serde = { version = "1.0.215", features = ["derive"] }
serde = { version = "1.0.216", features = ["derive"] }
serde_json = "1.0.133"
statrs = "0.18.0"
streaming-stats = "0.2.3"
Expand All @@ -66,7 +66,7 @@ vec-collections = "0.4.3"

[dev-dependencies]
codspeed-criterion-compat = "2.7.2"
proptest = { version = "1.5.0", default-features = false, features = ["std"]}
proptest = { version = "1.6.0", default-features = false, features = ["std"]}
rand = "0.8.2"
tempfile = "3.14.0"

Expand Down
4 changes: 2 additions & 2 deletions src/core/benches/minhash.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ fn intersection(c: &mut Criterion) {
filename.push("../../tests/test-data/gather-abund/genome-s10.fa.gz.sig");
let file = File::open(filename).unwrap();
let reader = BufReader::new(file);
let mut sigs: Vec<Signature> = serde_json::from_reader(reader).expect("Loading error");
let mut sigs = Signature::from_reader(reader).expect("Loading error");
let mh = if let Sketch::MinHash(mh) = &sigs.swap_remove(0).sketches()[0] {
mh.clone()
} else {
Expand All @@ -24,7 +24,7 @@ fn intersection(c: &mut Criterion) {
filename.push("../../tests/test-data/gather-abund/genome-s11.fa.gz.sig");
let file = File::open(filename).unwrap();
let reader = BufReader::new(file);
let mut sigs: Vec<Signature> = serde_json::from_reader(reader).expect("Loading error");
let mut sigs = Signature::from_reader(reader).expect("Loading error");
let mh2 = if let Sketch::MinHash(mh) = &sigs.swap_remove(0).sketches()[0] {
mh.clone()
} else {
Expand Down
16 changes: 8 additions & 8 deletions src/core/src/collection.rs
Original file line number Diff line number Diff line change
Expand Up @@ -267,7 +267,7 @@ mod test {
filename.push("../../tests/test-data/47+63-multisig.sig");
let file = File::open(filename).unwrap();
let reader = BufReader::new(file);
let sigs: Vec<Signature> = serde_json::from_reader(reader).expect("Loading error");
let sigs = Signature::from_reader(reader).expect("Loading error");
// create Selection object
let mut selection = Selection::default();
selection.set_scaled(2000);
Expand All @@ -293,7 +293,7 @@ mod test {
filename.push("../../tests/test-data/47+63-multisig.sig");
let file = File::open(filename).unwrap();
let reader = BufReader::new(file);
let sigs: Vec<Signature> = serde_json::from_reader(reader).expect("Loading error");
let sigs = Signature::from_reader(reader).expect("Loading error");
// create Selection object
let mut selection = Selection::default();
selection.set_scaled(500);
Expand All @@ -314,7 +314,7 @@ mod test {
filename.push("../../tests/test-data/genome-s11.fa.gz.sig");
let file = File::open(filename).unwrap();
let reader = BufReader::new(file);
let sigs: Vec<Signature> = serde_json::from_reader(reader).expect("Loading error");
let sigs = Signature::from_reader(reader).expect("Loading error");
assert_eq!(sigs.len(), 4);
// create Selection object
let mut selection = Selection::default();
Expand All @@ -336,7 +336,7 @@ mod test {
filename.push("../../tests/test-data/genome-s11.fa.gz.sig");
let file = File::open(filename).unwrap();
let reader = BufReader::new(file);
let sigs: Vec<Signature> = serde_json::from_reader(reader).expect("Loading error");
let sigs = Signature::from_reader(reader).expect("Loading error");
let sigs_copy = sigs.clone();
assert_eq!(sigs.len(), 4);
// create Selection object
Expand Down Expand Up @@ -366,7 +366,7 @@ mod test {
filename.push("../../tests/test-data/47+63-multisig.sig");
let file = File::open(filename).unwrap();
let reader = BufReader::new(file);
let sigs: Vec<Signature> = serde_json::from_reader(reader).expect("Loading error");
let sigs = Signature::from_reader(reader).expect("Loading error");
assert_eq!(sigs.len(), 6);
// create Selection object
let mut selection = Selection::default();
Expand All @@ -388,7 +388,7 @@ mod test {
filename.push("../../tests/test-data/genome-s11.fa.gz.sig");
let file = File::open(filename).unwrap();
let reader = BufReader::new(file);
let sigs: Vec<Signature> = serde_json::from_reader(reader).expect("Loading error");
let sigs = Signature::from_reader(reader).expect("Loading error");
assert_eq!(sigs.len(), 4);
// load sigs into collection + select compatible signatures
let mut cl = Collection::from_sigs(sigs).unwrap();
Expand All @@ -413,7 +413,7 @@ mod test {
filename.push("../../tests/test-data/47+63-multisig.sig");
let file = File::open(filename).unwrap();
let reader = BufReader::new(file);
let sigs: Vec<Signature> = serde_json::from_reader(reader).expect("Loading error");
let sigs = Signature::from_reader(reader).expect("Loading error");
// create Selection object
let mut selection = Selection::default();
selection.set_scaled(2000);
Expand Down Expand Up @@ -480,7 +480,7 @@ mod test {
.push("../../tests/test-data/prot/hp/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig");
let file = File::open(filename).unwrap();
let reader = BufReader::new(file);
let sigs: Vec<Signature> = serde_json::from_reader(reader).expect("Loading error");
let sigs = Signature::from_reader(reader).expect("Loading error");
// create Selection object
let mut selection = Selection::default();
selection.set_moltype(HashFunctions::Murmur64Hp);
Expand Down
8 changes: 5 additions & 3 deletions src/core/src/ffi/signature.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ use crate::sketch::Sketch;
use crate::ffi::cmd::compute::SourmashComputeParameters;
use crate::ffi::minhash::SourmashKmerMinHash;
use crate::ffi::utils::{ForeignObject, SourmashStr};
use crate::prelude::ToWriter;

pub struct SourmashSignature;

Expand Down Expand Up @@ -193,8 +194,9 @@ unsafe fn signature_eq(ptr: *const SourmashSignature, other: *const SourmashSign
ffi_fn! {
unsafe fn signature_save_json(ptr: *const SourmashSignature) -> Result<SourmashStr> {
let sig = SourmashSignature::as_rust(ptr);
let st = serde_json::to_string(sig)?;
Ok(SourmashStr::from_string(st))
let mut st: Vec<u8> = vec![];
sig.to_writer(&mut st)?;
Ok(SourmashStr::from_string(String::from_utf8_unchecked(st)))
}
}

Expand Down Expand Up @@ -248,7 +250,7 @@ unsafe fn signatures_save_buffer(ptr: *const *const SourmashSignature, size: usi
} else {
Box::new(&mut buffer)
};
serde_json::to_writer(&mut writer, &rsigs)?;
rsigs.to_writer(&mut writer)?;
}

let b = buffer.into_boxed_slice();
Expand Down
9 changes: 8 additions & 1 deletion src/core/src/index/revindex/disk_revindex.rs
Original file line number Diff line number Diff line change
Expand Up @@ -351,7 +351,14 @@ impl RevIndexOps for RevIndex {
.collection
.record_for_dataset(dataset_id)
.expect("dataset not found");
Some((row.name().into(), size))

let name = [row.name(), row.filename(), row.md5()]
.into_iter()
.skip_while(|v| v.is_empty())
.next()
.unwrap(); // guaranteed to succeed because `md5` always exists

Some((name.into(), size))
} else {
None
}
Expand Down
2 changes: 1 addition & 1 deletion src/core/src/index/revindex/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -558,7 +558,7 @@ mod test {
)?;

assert_eq!(matches.len(), 1);
assert_eq!(matches[0].name(), "../genome-s10.fa.gz");
assert_eq!(matches[0].name(), ""); // signature name is empty
assert_eq!(matches[0].f_match(), 1.0);

Ok(())
Expand Down
2 changes: 1 addition & 1 deletion src/core/src/manifest.rs
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ impl Record {
Self {
internal_location: path.into(),
moltype: moltype.to_string(),
name: sig.name(),
name: sig.name_str(),
ksize,
md5,
md5short,
Expand Down
Loading

0 comments on commit 81bab97

Please sign in to comment.