Skip to content

Commit

Permalink
MRG: add skipmers; switch to reading frame approach for translation, …
Browse files Browse the repository at this point in the history
…skipmers (#3395)

This PR enables skipmers **ONLY in the rust code**.
- enables two skipmer types: m1n3, m2n3
- switches `SeqToHashes` to use reading frame struct, which
simplifies/unifies the code across the different methods. The reading
frame code handles any modifications needed - i.e. translation or
skipping. Then we just kmerize the reading frame as usual. The main
difference for translation is that we no longer need to store a buffer
of all hashes from the reading frames.

Since this changes the `SeqToHashes` strategy a bit, there's one python
test where we now see a different error (modified).

Future thoughts:
- with the new structure, it would be straightforward to add validation
for protein k-mers. I guess I'm not entirely sure what happens to those
atm...


Skipmer References:
- [Skip-mers: increasing entropy and sensitivity to detect conserved
genic regions with simple cyclic
q-grams](https://www.biorxiv.org/content/10.1101/179960.abstract)
- [Extracting and Evaluating Features from RNA Virus Sequences to
Predict Host Species Susceptibility Using Deep
Learning](https://dl.acm.org/doi/abs/10.1145/3473258.3473271)
  • Loading branch information
bluegenes authored Dec 20, 2024
1 parent b69c960 commit 5680efc
Show file tree
Hide file tree
Showing 11 changed files with 1,150 additions and 180 deletions.
2 changes: 2 additions & 0 deletions .github/dependabot.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@ updates:
- dependency-name: "wasm-bindgen"
- dependency-name: "once_cell"
- dependency-name: "chrono"
- dependency-name: "js-sys"
- dependency-name: "web-sys"
- package-ecosystem: "github-actions"
directory: "/"
schedule:
Expand Down
3 changes: 3 additions & 0 deletions include/sourmash.h
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,9 @@ enum SourmashErrorCode {
SOURMASH_ERROR_CODE_INVALID_PROT = 1102,
SOURMASH_ERROR_CODE_INVALID_CODON_LENGTH = 1103,
SOURMASH_ERROR_CODE_INVALID_HASH_FUNCTION = 1104,
SOURMASH_ERROR_CODE_INVALID_SKIPMER_FRAME = 1105,
SOURMASH_ERROR_CODE_INVALID_SKIPMER_SIZE = 1106,
SOURMASH_ERROR_CODE_INVALID_TRANSLATE_FRAME = 1107,
SOURMASH_ERROR_CODE_READ_DATA = 1201,
SOURMASH_ERROR_CODE_STORAGE = 1202,
SOURMASH_ERROR_CODE_HLL_PRECISION_BOUNDS = 1301,
Expand Down
4 changes: 2 additions & 2 deletions src/core/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -96,8 +96,8 @@ skip_feature_sets = [
## Wasm section. Crates only used for WASM, as well as specific configurations

[target.'cfg(all(target_arch = "wasm32", target_os="unknown"))'.dependencies]
js-sys = "0.3.74"
web-sys = { version = "0.3.74", features = ["console", "File", "FileReaderSync"] }
js-sys = "0.3.72"
web-sys = { version = "0.3.72", features = ["console", "File", "FileReaderSync"] }
wasm-bindgen = "0.2.89"
getrandom = { version = "0.2", features = ["js"] }

Expand Down
42 changes: 42 additions & 0 deletions src/core/src/cmd.rs
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,14 @@ pub struct ComputeParameters {
#[builder(default = false)]
hp: bool,

#[getset(get_copy = "pub", set = "pub")]
#[builder(default = false)]
skipm1n3: bool,

#[getset(get_copy = "pub", set = "pub")]
#[builder(default = false)]
skipm2n3: bool,

#[getset(get_copy = "pub", set = "pub")]
#[builder(default = false)]
singleton: bool,
Expand Down Expand Up @@ -165,6 +173,40 @@ pub fn build_template(params: &ComputeParameters) -> Vec<Sketch> {
));
}

if params.skipm1n3 {
ksigs.push(Sketch::LargeMinHash(
KmerMinHashBTree::builder()
.num(params.num_hashes)
.ksize(*k)
.hash_function(HashFunctions::Murmur64Skipm1n3)
.max_hash(max_hash)
.seed(params.seed)
.abunds(if params.track_abundance {
Some(Default::default())
} else {
None
})
.build(),
));
}

if params.skipm2n3 {
ksigs.push(Sketch::LargeMinHash(
KmerMinHashBTree::builder()
.num(params.num_hashes)
.ksize(*k)
.hash_function(HashFunctions::Murmur64Skipm2n3)
.max_hash(max_hash)
.seed(params.seed)
.abunds(if params.track_abundance {
Some(Default::default())
} else {
None
})
.build(),
));
}

if params.dna {
ksigs.push(Sketch::LargeMinHash(
KmerMinHashBTree::builder()
Expand Down
105 changes: 105 additions & 0 deletions src/core/src/encodings.rs
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@ pub enum HashFunctions {
Murmur64Protein,
Murmur64Dayhoff,
Murmur64Hp,
Murmur64Skipm1n3,
Murmur64Skipm2n3,
Custom(String),
}

Expand All @@ -50,6 +52,14 @@ impl HashFunctions {
pub fn hp(&self) -> bool {
*self == HashFunctions::Murmur64Hp
}

pub fn skipm1n3(&self) -> bool {
*self == HashFunctions::Murmur64Skipm1n3
}

pub fn skipm2n3(&self) -> bool {
*self == HashFunctions::Murmur64Skipm2n3
}
}

impl std::fmt::Display for HashFunctions {
Expand All @@ -62,6 +72,8 @@ impl std::fmt::Display for HashFunctions {
HashFunctions::Murmur64Protein => "protein",
HashFunctions::Murmur64Dayhoff => "dayhoff",
HashFunctions::Murmur64Hp => "hp",
HashFunctions::Murmur64Skipm1n3 => "skipm1n3",
HashFunctions::Murmur64Skipm2n3 => "skipm2n3",
HashFunctions::Custom(v) => v,
}
)
Expand All @@ -77,6 +89,8 @@ impl TryFrom<&str> for HashFunctions {
"dayhoff" => Ok(HashFunctions::Murmur64Dayhoff),
"hp" => Ok(HashFunctions::Murmur64Hp),
"protein" => Ok(HashFunctions::Murmur64Protein),
"skipm1n3" => Ok(HashFunctions::Murmur64Skipm1n3),
"skipm2n3" => Ok(HashFunctions::Murmur64Skipm2n3),
v => unimplemented!("{v}"),
}
}
Expand Down Expand Up @@ -506,6 +520,7 @@ impl<'a> Iterator for Indices<'a> {
#[cfg(test)]
mod test {
use super::*;
use std::convert::TryFrom;

#[test]
fn colors_update() {
Expand Down Expand Up @@ -573,4 +588,94 @@ mod test {

assert_eq!(colors.len(), 2);
}

#[test]
fn test_dna_method() {
assert!(HashFunctions::Murmur64Dna.dna());
assert!(!HashFunctions::Murmur64Protein.dna());
assert!(!HashFunctions::Murmur64Dayhoff.dna());
}

#[test]
fn test_protein_method() {
assert!(HashFunctions::Murmur64Protein.protein());
assert!(!HashFunctions::Murmur64Dna.protein());
assert!(!HashFunctions::Murmur64Dayhoff.protein());
}

#[test]
fn test_dayhoff_method() {
assert!(HashFunctions::Murmur64Dayhoff.dayhoff());
assert!(!HashFunctions::Murmur64Dna.dayhoff());
assert!(!HashFunctions::Murmur64Protein.dayhoff());
}

#[test]
fn test_hp_method() {
assert!(HashFunctions::Murmur64Hp.hp());
assert!(!HashFunctions::Murmur64Dna.hp());
assert!(!HashFunctions::Murmur64Protein.hp());
}

#[test]
fn test_skipm1n3_method() {
assert!(HashFunctions::Murmur64Skipm1n3.skipm1n3());
assert!(!HashFunctions::Murmur64Dna.skipm1n3());
assert!(!HashFunctions::Murmur64Protein.skipm1n3());
}

#[test]
fn test_skipm2n3_method() {
assert!(HashFunctions::Murmur64Skipm2n3.skipm2n3());
assert!(!HashFunctions::Murmur64Dna.skipm2n3());
assert!(!HashFunctions::Murmur64Protein.skipm2n3());
}

#[test]
fn test_display_hashfunctions() {
assert_eq!(HashFunctions::Murmur64Dna.to_string(), "DNA");
assert_eq!(HashFunctions::Murmur64Protein.to_string(), "protein");
assert_eq!(HashFunctions::Murmur64Dayhoff.to_string(), "dayhoff");
assert_eq!(HashFunctions::Murmur64Hp.to_string(), "hp");
assert_eq!(HashFunctions::Murmur64Skipm1n3.to_string(), "skipm1n3");
assert_eq!(HashFunctions::Murmur64Skipm2n3.to_string(), "skipm2n3");
assert_eq!(
HashFunctions::Custom("custom_string".into()).to_string(),
"custom_string"
);
}

#[test]
fn test_try_from_str_valid() {
assert_eq!(
HashFunctions::try_from("dna").unwrap(),
HashFunctions::Murmur64Dna
);
assert_eq!(
HashFunctions::try_from("protein").unwrap(),
HashFunctions::Murmur64Protein
);
assert_eq!(
HashFunctions::try_from("dayhoff").unwrap(),
HashFunctions::Murmur64Dayhoff
);
assert_eq!(
HashFunctions::try_from("hp").unwrap(),
HashFunctions::Murmur64Hp
);
assert_eq!(
HashFunctions::try_from("skipm1n3").unwrap(),
HashFunctions::Murmur64Skipm1n3
);
assert_eq!(
HashFunctions::try_from("skipm2n3").unwrap(),
HashFunctions::Murmur64Skipm2n3
);
}

#[test]
#[should_panic(expected = "not implemented: unknown")]
fn test_try_from_str_invalid() {
HashFunctions::try_from("unknown").unwrap();
}
}
15 changes: 15 additions & 0 deletions src/core/src/errors.rs
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,15 @@ pub enum SourmashError {
#[error("Codon is invalid length: {message}")]
InvalidCodonLength { message: String },

#[error("Skipmer ksize must be >= n ({n}), but got ksize: {ksize}")]
InvalidSkipmerSize { ksize: usize, n: usize },

#[error("Skipmer frame number must be < n ({n}), but got start: {start}")]
InvalidSkipmerFrame { start: usize, n: usize },

#[error("Frame number must be 0, 1, or 2, but got {frame_number}")]
InvalidTranslateFrame { frame_number: usize },

#[error("Set error rate to a value smaller than 0.367696 and larger than 0.00203125")]
HLLPrecisionBounds,

Expand Down Expand Up @@ -128,6 +137,9 @@ pub enum SourmashErrorCode {
InvalidProt = 11_02,
InvalidCodonLength = 11_03,
InvalidHashFunction = 11_04,
InvalidSkipmerFrame = 11_05,
InvalidSkipmerSize = 11_06,
InvalidTranslateFrame = 11_07,
// index-related errors
ReadData = 12_01,
Storage = 12_02,
Expand Down Expand Up @@ -170,6 +182,9 @@ impl SourmashErrorCode {
SourmashError::InvalidProt { .. } => SourmashErrorCode::InvalidProt,
SourmashError::InvalidCodonLength { .. } => SourmashErrorCode::InvalidCodonLength,
SourmashError::InvalidHashFunction { .. } => SourmashErrorCode::InvalidHashFunction,
SourmashError::InvalidSkipmerFrame { .. } => SourmashErrorCode::InvalidSkipmerFrame,
SourmashError::InvalidSkipmerSize { .. } => SourmashErrorCode::InvalidSkipmerSize,
SourmashError::InvalidTranslateFrame { .. } => SourmashErrorCode::InvalidTranslateFrame,
SourmashError::ReadDataError { .. } => SourmashErrorCode::ReadData,
SourmashError::StorageError { .. } => SourmashErrorCode::Storage,
SourmashError::HLLPrecisionBounds { .. } => SourmashErrorCode::HLLPrecisionBounds,
Expand Down
15 changes: 13 additions & 2 deletions src/core/src/ffi/minhash.rs
Original file line number Diff line number Diff line change
Expand Up @@ -73,15 +73,26 @@ Result<*const u64> {

let mut output: Vec<u64> = Vec::with_capacity(insize);

// Call SeqToHashes::new and handle errors
let ready_hashes = SeqToHashes::new(
buf,
mh.ksize(),
force,
is_protein,
mh.hash_function(),
mh.seed(),
)?;


if force && bad_kmers_as_zeroes{
for hash_value in SeqToHashes::new(buf, mh.ksize(), force, is_protein, mh.hash_function(), mh.seed()){
for hash_value in ready_hashes{
match hash_value{
Ok(x) => output.push(x),
Err(err) => return Err(err),
}
}
}else{
for hash_value in SeqToHashes::new(buf, mh.ksize(), force, is_protein, mh.hash_function(), mh.seed()){
for hash_value in ready_hashes {
match hash_value{
Ok(0) => continue,
Ok(x) => output.push(x),
Expand Down
Loading

0 comments on commit 5680efc

Please sign in to comment.