Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Get nerd-sniped and do some micro-optimizations. #41

Open
wants to merge 11 commits into
base: master
Choose a base branch
from
4 changes: 2 additions & 2 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ include = ["wrapper.h", "src/lib.rs", "LICENSE", "README.md"]
[dependencies]
anyhow = "1"
libc = "0.2"
rust-htslib = { version = "0.38.2", default-features = false, features = ["serde_feature"] }
rust-htslib = { version = "0.39.5", default-features = false, features = ["serde_feature"] }
star-sys = { version = "0.2", path = "star-sys" }

[profile.release]
Expand Down
16 changes: 14 additions & 2 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ use std::os::raw::c_char;
use std::os::raw::c_int;
use std::path::Path;
use std::sync::Arc;
use std::time::Instant;

use anyhow::{format_err, Error};
use rust_htslib::bam;
Expand Down Expand Up @@ -60,7 +61,7 @@ impl StarReference {

// recover stray CStrings to prevent leaked memory
nvec.into_iter().for_each(|ptr| unsafe {
CString::from_raw(ptr);
drop(CString::from_raw(ptr));
});

let inner = InnerStarReference {
Expand Down Expand Up @@ -253,6 +254,7 @@ impl StarAligner {

/// Aligns a given read and produces BAM records
pub fn align_read(&mut self, name: &[u8], read: &[u8], qual: &[u8]) -> Vec<bam::Record> {
let now = Instant::now();
// STAR will throw an error on empty reads - so just construct an empty record.
if read.len() == 0 {
// Make an unmapped record and return it
Expand All @@ -261,7 +263,17 @@ impl StarAligner {

Self::prepare_fastq(&mut self.fastq1, name, read, qual);
align_read_rust(self.aligner, self.fastq1.as_slice(), &mut self.aln_buf).unwrap();
self.parse_sam_to_records(name)
let record= self.parse_sam_to_records(name);
let new_now = Instant::now();
let (chr, pos, cigar, cigar_ops) = if record.len() > 0 {
let rec = &record[0];
(rec.tid().to_string(), rec.pos().to_string(), format!("{}", rec.cigar()), rec.cigar().len().to_string())
} else {
("NA".to_string(), "NA".to_string(), "NA".to_string(), "NA".to_string())
};
println!("{:?},{:?},{:?},{},{},{},{}", std::str::from_utf8(&read).unwrap(), new_now.duration_since(now), record.len(), chr, pos, cigar, cigar_ops);
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Lots of unnecessary memcpy going on here.

Suggested change
let (chr, pos, cigar, cigar_ops) = if record.len() > 0 {
let rec = &record[0];
(rec.tid().to_string(), rec.pos().to_string(), format!("{}", rec.cigar()), rec.cigar().len().to_string())
} else {
("NA".to_string(), "NA".to_string(), "NA".to_string(), "NA".to_string())
};
println!("{:?},{:?},{:?},{},{},{},{}", std::str::from_utf8(&read).unwrap(), new_now.duration_since(now), record.len(), chr, pos, cigar, cigar_ops);
if record.len() > 0 {
let rec = &record[0];
println!("{:?},{:?},{:?},{},{},{},{}", std::str::from_utf8(&read).unwrap(), new_now.duration_since(now), record.len(), rec.tid(), rec.pos(),rec.cigar(), rec.cigar().len())
} else {
println!("{:?},{:?},{:?},NA,NA,NA,NA", std::str::from_utf8(&read).unwrap(), new_now.duration_since(now), record.len())
};

Though I guess it doesn't really matter if this is just temporary benchmarking code.

But, maybe this should be an actual benchmark?

record

}

/// Aligns a given read and return the resulting SAM string
Expand Down
2 changes: 1 addition & 1 deletion star-sys/STAR/source/InOutStreams.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

template<class charT, class traits = std::char_traits<charT> >
class NullBuf: public std::basic_streambuf<charT, traits> {
inline typename traits::int_type overflow(typename traits::int_type c) {
inline typename traits::int_type overflow(typename traits::int_type c) override {
return traits::not_eof(c);
}
};
Expand Down
5 changes: 2 additions & 3 deletions star-sys/STAR/source/IncludeDefine.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
#include <iostream>
#include <fstream>
#include <sstream>
#include <time.h>
#include <ctime>
#include <iomanip>
#include <vector>
Expand All @@ -18,9 +17,9 @@
#include <sys/mman.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <errno.h>
#include <cerrno>
#include <limits>
#include <stdint.h>
#include <cstdint>

#include "VERSION"

Expand Down
37 changes: 17 additions & 20 deletions star-sys/STAR/source/ParameterInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,17 @@

class ParameterInfoBase {
public:
string nameString; //string that identifies parameter
const char* nameString; //string that identifies parameter
int inputLevel; //where the parameter was defined
int inputLevelAllowed; //at which inpurt level parameter definition is allowed
virtual void inputValues(istringstream &streamIn) =0;
friend std::ostream& operator<< (std::ostream& o, ParameterInfoBase const& b);
virtual ~ParameterInfoBase() {};
ParameterInfoBase(const char* nameString, int inputLevel, int inputLevelAllowed);
virtual ~ParameterInfoBase() = default;
ParameterInfoBase(const ParameterInfoBase&) = default;
ParameterInfoBase &operator=(const ParameterInfoBase&) = delete;
ParameterInfoBase(ParameterInfoBase&&) = default;
ParameterInfoBase& operator=(ParameterInfoBase&&) = default;
protected:
virtual void printValues(std::ostream& o) const = 0;
};
Expand Down Expand Up @@ -68,20 +73,16 @@ class ParameterInfoScalar : public ParameterInfoBase {
parameterType *value;
vector <parameterType> allowedValues;

ParameterInfoScalar(int inputLevelIn, int inputLevelAllowedIn, string nameStringIn, parameterType* valueIn) {
nameString=nameStringIn;
inputLevel=inputLevelIn;
inputLevelAllowed=inputLevelAllowedIn;
value=valueIn;
};
ParameterInfoScalar(int inputLevelIn, int inputLevelAllowedIn, const char* nameStringIn, parameterType* valueIn)
: ParameterInfoBase(nameStringIn, inputLevelIn, inputLevelAllowedIn),
value(valueIn) {};

void inputValues(istringstream &streamIn) {
void inputValues(istringstream &streamIn) override {
*value=inputOneValue <parameterType> (streamIn);
};

~ParameterInfoScalar() {};
protected:
virtual void printValues(std::ostream& outStr) const {
void printValues(std::ostream& outStr) const override {
printOneValue(value, outStr);
};

Expand All @@ -93,24 +94,20 @@ class ParameterInfoVector : public ParameterInfoBase {
vector <parameterType> *value;
vector <parameterType> allowedValues;

ParameterInfoVector(int inputLevelIn, int inputLevelAllowedIn, string nameStringIn, vector <parameterType> *valueIn) {
nameString=nameStringIn;
inputLevel=inputLevelIn;
inputLevelAllowed=inputLevelAllowedIn;
value=valueIn;
};
ParameterInfoVector(int inputLevelIn, int inputLevelAllowedIn, const char* nameStringIn, vector <parameterType> *valueIn)
: ParameterInfoBase(nameStringIn, inputLevelIn, inputLevelAllowedIn),
value(valueIn) {};

void inputValues(istringstream &streamIn) {
void inputValues(istringstream &streamIn) override {
(*value).clear();
while (streamIn.good()) {
(*value).push_back(inputOneValue <parameterType> (streamIn));
streamIn >> ws; //remove white space, may arrive at the end of line
};
};

~ParameterInfoVector() {};
protected:
virtual void printValues(std::ostream& outStr) const {
void printValues(std::ostream& outStr) const override {
for (int ii=0; ii < (int) (*value).size(); ii++) {
printOneValue(&(*value).at(ii),outStr);
outStr<<" ";
Expand Down
5 changes: 4 additions & 1 deletion star-sys/STAR/source/Parameters.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,9 @@

#define PAR_NAME_PRINT_WIDTH 30

ParameterInfoBase::ParameterInfoBase(const char* nameStringIn, int inputLevelIn, int inputLevelAllowedIn)
: nameString(nameStringIn), inputLevel(inputLevelIn), inputLevelAllowed(inputLevelAllowedIn) {}

Parameters::Parameters() {//initalize parameters info

inOut = new InOutStreams;
Expand Down Expand Up @@ -414,7 +417,7 @@ void Parameters::inputParameters (int argInN, char* argIn[]) {//input parameters
for (uint ii=0; ii<parArray.size(); ii++) {
if (parArray[ii]->inputLevel>0) {
inOut->logMain << setw(PAR_NAME_PRINT_WIDTH) << parArray[ii]->nameString <<" "<< *(parArray[ii]) << endl;
if (parArray[ii]->nameString != "parametersFiles" ) {
if (strcmp(parArray[ii]->nameString, "parametersFiles")) {
clFull << " --" << parArray[ii]->nameString << " " << *(parArray[ii]);
};
};
Expand Down
2 changes: 1 addition & 1 deletion star-sys/STAR/source/ReadAlign_peOverlapMergeMap.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,7 @@ void ReadAlign::peMergeMates() {
return;
};

void Transcript::peOverlapSEtoPE(uint* mateStart, Transcript &t) {//convert alignment from merged-SE to PE
void Transcript::peOverlapSEtoPE(const uint* mateStart, const Transcript &t) {//convert alignment from merged-SE to PE

uint mLen[2];
mLen[0]=readLength[t.Str];
Expand Down
3 changes: 1 addition & 2 deletions star-sys/STAR/source/SoloReadFeature_record.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -60,8 +60,7 @@ void SoloReadFeature::record(SoloReadBarcode &soloBar, uint nTr, set<uint32> &re
stats.V[stats.nAmbigFeature]++;
return;
};
bool sjAnnot;
alignOut->extractSpliceJunctions(readSJs, sjAnnot);
bool sjAnnot = alignOut->extractSpliceJunctions(readSJs);
if ( readSJs.empty() || (sjAnnot && readGene.size()==0) ) {//no junctions, or annotated junction buy no gene (i.e. read does not fully match transcript)
stats.V[stats.nNoFeature]++;
return;
Expand Down
4 changes: 2 additions & 2 deletions star-sys/STAR/source/TimeFunctions.h
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
#ifndef TIME_FUNCTIONS_DEF
#define TIME_FUNCTIONS_DEF
#include <string>
#include <time.h>
#include <ctime>

string timeMonthDayTime();
string timeMonthDayTime(time_t &rawTime);
string timeMonthDayTime(std::time_t &rawTime);

#endif
18 changes: 7 additions & 11 deletions star-sys/STAR/source/Transcript.cpp
Original file line number Diff line number Diff line change
@@ -1,10 +1,5 @@
#include "Transcript.h"

Transcript::Transcript()
{
reset();
};

void Transcript::reset() {
extendL=0;

Expand All @@ -23,21 +18,21 @@ void Transcript::reset() {
nGap=0; lGap=0; lDel=0; lIns=0; nDel=0; nIns=0;

nUnique=nAnchor=0;
};
}

void Transcript::add(Transcript *trIn) {
void Transcript::add(const Transcript *trIn) noexcept {
maxScore+=trIn->maxScore;
nMatch+=trIn->nMatch;
nMM+=trIn->nMM;
nGap+=trIn->nGap; lGap+=trIn->lGap;
lDel+=trIn->lDel; nDel+=trIn->nDel;
lIns+=trIn->lIns; nIns+=trIn->nIns;
nUnique+=trIn->nUnique;
};
}

void Transcript::extractSpliceJunctions(vector<array<uint64,2>> &sjOut, bool &annotYes)
bool Transcript::extractSpliceJunctions(vector<array<uint64,2>> &sjOut) const
{
annotYes=true;
bool annotYes=true;
for (uint64 iex=0; iex<nExons-1; iex++) {//record all junctions
if (canonSJ[iex]>=0) {//only record junctions, not indels or mate gap
array<uint64,2> sj;
Expand All @@ -48,5 +43,6 @@ void Transcript::extractSpliceJunctions(vector<array<uint64,2>> &sjOut, bool &an
annotYes=false;//if one of the SJs is unannoated, annotYes=false
};
};
};
return annotYes;
}

63 changes: 33 additions & 30 deletions star-sys/STAR/source/Transcript.h
Original file line number Diff line number Diff line change
@@ -1,20 +1,22 @@
#ifndef CODE_Transcript
#define CODE_Transcript

#include <array>

#include "IncludeDefine.h"
#include "Parameters.h"
#include "Variation.h"
#include "Genome.h"

class Transcript {
public:
uint exons[MAX_N_EXONS][EX_SIZE]; //coordinates of all exons: r-start, g-start, length
uint shiftSJ[MAX_N_EXONS][2]; //shift of the SJ coordinates due to genomic micro-repeats
int canonSJ[MAX_N_EXONS]; //canonicity of each junction
uint8 sjAnnot[MAX_N_EXONS]; //anotated or not
uint8 sjStr[MAX_N_EXONS]; //strand of the junction
std::array<std::array<uint, EX_SIZE>, MAX_N_EXONS> exons; //coordinates of all exons: r-start, g-start, length
std::array<std::array<uint, 2>, MAX_N_EXONS> shiftSJ; //shift of the SJ coordinates due to genomic micro-repeats
std::array<int, MAX_N_EXONS> canonSJ; //canonicity of each junction
std::array<uint8, MAX_N_EXONS> sjAnnot; //anotated or not
std::array<uint8, MAX_N_EXONS> sjStr; //strand of the junction

uint intronMotifs[3];
std::array<int, 3> intronMotifs;
uint8 sjMotifStrand;

uint nExons; //number of exons in the read transcript
Expand All @@ -29,43 +31,44 @@ class Transcript {
int iFrag; //frag number of the transcript, if the the transcript contains only one frag

//loci
uint rStart, roStart, rLength, gStart, gLength, cStart; //read, original read, and genomic start/length, chromosome start
uint rStart = 0; //read
uint roStart = 0; //original read
uint rLength = 0; //read length
uint gStart = 0; //genomic start
uint gLength = 0; //genomic length
uint cStart; //chromosome start
uint Chr,Str,roStr; //chromosome and strand and original read Strand

bool primaryFlag;
bool primaryFlag = false;

uint nMatch;//min number of matches
uint nMM;//max number of mismatches
uint nMatch = 0;//min number of matches
uint nMM = 0;//max number of mismatches
uint mappedLength; //total mapped length, sum of lengths of all blocks(exons)

uint extendL; //extension length
intScore maxScore; //maximum Score
uint extendL = 0; //extension length
intScore maxScore = 0; //maximum Score

uint nGap, lGap; //number of genomic gaps (>alignIntronMin) and their total length
uint nDel; //number of genomic deletions (ie genomic gaps)
uint nIns; //number of (ie read gaps)
uint lDel; //total genomic deletion length
uint lIns; //total genomic insertion length
uint nGap = 0;
uint lGap = 0; //number of genomic gaps (>alignIntronMin) and their total length
uint nDel = 0; //number of genomic deletions (ie genomic gaps)
uint nIns = 0; //number of (ie read gaps)
uint lDel = 0; //total genomic deletion length
uint lIns = 0; //total genomic insertion length

uint nUnique, nAnchor; //number of unique pieces in the alignment, number of anchor pieces in the alignment
uint nUnique = 0; //number of anchor pieces in the alignment
uint nAnchor = 0; //number of unique pieces in the alignment

vector <int32> varInd;
vector <int32> varGenCoord, varReadCoord ;
vector <char> varAllele;

Transcript(); //resets to 0
void reset(); //reset to 0
void resetMapG(); // reset map to 0
void resetMapG(uint); // reset map to 0 for Lread bases
void add(Transcript*); // add
intScore alignScore(char **Read1, char *G, const Parameters &P);
int variationAdjust(const Genome &mapGen, char *R);
string generateCigarP(); //generates CIGAR
void peOverlapSEtoPE(uint* mSta, Transcript &t);
void extractSpliceJunctions(vector<array<uint64,2>> &sjOut, bool &annotYes);

private:

void add(const Transcript*) noexcept; // add
intScore alignScore(const char **Read1, const char *G, const Parameters &P);
int variationAdjust(const Genome &mapGen, const char *R);
string generateCigarP() const; //generates CIGAR
void peOverlapSEtoPE(const uint* mSta, const Transcript &t);
bool extractSpliceJunctions(vector<array<uint64,2>> &sjOut) const;
};

#endif
4 changes: 2 additions & 2 deletions star-sys/STAR/source/Transcript_alignScore.cpp
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
#include <cmath>
#include "Transcript.h"

intScore Transcript::alignScore(char **Read1, char *G, const Parameters &P) {//re-calculates score and number of mismatches
intScore Transcript::alignScore(const char **Read1, const char *G, const Parameters &P) {//re-calculates score and number of mismatches
maxScore=0;
nMM=0;
nMatch=0;
char* R=Read1[roStr==0 ? 0:2];
const char* R=Read1[roStr==0 ? 0:2];
for (uint iex=0;iex<nExons;iex++) {
for (uint ii=0;ii<exons[iex][EX_L];ii++) {
char r1=R[ii+exons[iex][EX_R]];
Expand Down
2 changes: 1 addition & 1 deletion star-sys/STAR/source/Transcript_generateCigarP.cpp
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#include "Transcript.h"
#include "SequenceFuns.h"

string Transcript::generateCigarP()
string Transcript::generateCigarP() const
{//generates CIGARp string for the transcript with "p" operation for PE reads
//p is a special CIGAR operation to encode gap between mates. This gap is negative for overlapping mates.

Expand Down
2 changes: 1 addition & 1 deletion star-sys/STAR/source/Transcript_variationAdjust.cpp
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#include "Transcript.h"
#include "serviceFuns.cpp"

int Transcript::variationAdjust(const Genome &mapGen, char *R)
int Transcript::variationAdjust(const Genome &mapGen, const char *R)
{
Variation &Var=*mapGen.Var;

Expand Down
Loading