Skip to content

Commit

Permalink
Significant performance improvements when dealing with overlapping tr…
Browse files Browse the repository at this point in the history
…anscripts.

Fix *rare* pathalogical performance of hash function.
Improvement to quasi-mapping to reduce ambiguity of reads matching fwd and rc transcripts.
Improvement to fragment length modeling.
  • Loading branch information
rob-p committed Nov 12, 2015
1 parent 3d4809b commit 30fffa4
Show file tree
Hide file tree
Showing 6 changed files with 370 additions and 180 deletions.
4 changes: 2 additions & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,10 @@ enable_testing()

project (Sailfish)

set(CPACK_PACKAGE_VERSION "0.7.6")
set(CPACK_PACKAGE_VERSION "0.7.7")
set(CPACK_PACKAGE_VERSION_MAJOR "0")
set(CPACK_PACKAGE_VERSION_MINOR "7")
set(CPACK_PACKAGE_VERSION_PATCH "6")
set(CPACK_PACKAGE_VERSION_PATCH "7")
set(CPACK_GENERATOR "TGZ")
set(CPACK_SOURCE_GENERATOR "TGZ")
set(CPACK_PACKAGE_VENDOR "Carnegie Mellon University / Stony Brook University")
Expand Down
8 changes: 8 additions & 0 deletions include/EquivalenceClassBuilder.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,14 @@ class EquivalenceClassBuilder {
return true;
}

inline void insertGroup(TranscriptGroup g, uint32_t count) {
std::vector<double> weights(g.txps.size(), 0.0);
//auto updatefn = [count](TGValue& x) { x.count = count; };
TGValue v(weights, count);
countVec_.push_back(std::make_pair(g, v));
//countMap_.upsert(g, updatefn, v);
}

inline void addGroup(TranscriptGroup&& g,
std::vector<double>& weights) {

Expand Down
4 changes: 2 additions & 2 deletions include/SailfishConfig.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,8 @@
namespace sailfish {
constexpr char majorVersion[] = "0";
constexpr char minorVersion[] = "7";
constexpr char patchVersion[] = "6";
constexpr char version[] = "0.7.6";
constexpr char patchVersion[] = "7";
constexpr char version[] = "0.7.7";
constexpr uint32_t indexVersion = 1;
}

Expand Down
148 changes: 80 additions & 68 deletions src/CollapsedEMOptimizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -231,7 +231,7 @@ void EMUpdate_(
assert(alphaIn.size() == alphaOut.size());

tbb::parallel_for(BlockedIndexRange(size_t(0), size_t(eqVec.size())),
[&eqVec, &alphaIn, &alphaOut](const BlockedIndexRange& range) -> void {
[&eqVec, &alphaIn, &transcripts, &alphaOut](const BlockedIndexRange& range) -> void {
for (auto eqID : boost::irange(range.begin(), range.end())) {
auto& kv = eqVec[eqID];

Expand Down Expand Up @@ -467,7 +467,7 @@ bool doBootstrap(
// Do a new bootstrap
msamp(sampCounts.begin(), totalNumFrags, numClasses, sampleWeights.begin());

double totalLen{0.0};
double totalLen{0.0};
for (size_t i = 0; i < transcripts.size(); ++i) {
alphas[i] = transcripts[i].getActive() ? uniformTxpWeight * totalNumFrags : 0.0;
totalLen += effLens(i);
Expand All @@ -477,25 +477,26 @@ bool doBootstrap(
double maxRelDiff = -std::numeric_limits<double>::max();
size_t itNum = 0;

// If we use VBEM, we'll need the prior parameters
double priorAlpha = 0.01;
// If we use VBEM, we'll need the prior parameters
double priorAlpha = 0.01;
double minAlpha = 1e-8;
double alphaCheckCutoff = 1e-2;
double cutoff = (useVBEM) ? (priorAlpha + minAlpha) : minAlpha;

while (itNum < maxIter and !converged) {
while (itNum < maxIter and !converged) {

if (useVBEM) {
VBEMUpdate_(txpGroups, txpGroupWeights, sampCounts, transcripts,
effLens, priorAlpha, totalLen, alphas, alphasPrime, expTheta);
} else {
VBEMUpdate_(txpGroups, txpGroupWeights, sampCounts, transcripts,
effLens, priorAlpha, totalLen, alphas, alphasPrime, expTheta);
} else {
EMUpdate_(txpGroups, txpGroupWeights, sampCounts, transcripts,
effLens, alphas, alphasPrime);
}

converged = true;
maxRelDiff = -std::numeric_limits<double>::max();
for (size_t i = 0; i < transcripts.size(); ++i) {
if (alphas[i] > cutoff) {
if (alphas[i] > alphaCheckCutoff) {
double relDiff = std::abs(alphas[i] - alphasPrime[i]) / alphasPrime[i];
maxRelDiff = (relDiff > maxRelDiff) ? relDiff : maxRelDiff;
if (relDiff > relDiffTolerance) {
Expand Down Expand Up @@ -532,14 +533,23 @@ bool CollapsedEMOptimizer::gatherBootstraps(

std::vector<Transcript>& transcripts = readExp.transcripts();
using VecT = CollapsedEMOptimizer::SerialVecType;
// With atomics

VecT alphas(transcripts.size(), 0.0);
VecT alphasPrime(transcripts.size(), 0.0);
VecT expTheta(transcripts.size());
Eigen::VectorXd effLens(transcripts.size());

uint32_t numBootstraps = sopt.numBootstraps;

// Fill in the effective length vector
double totalLen{0.0};
for (size_t i = 0; i < transcripts.size(); ++i) {
effLens(i) = (sopt.noEffectiveLengthCorrection) ?
transcripts[i].RefLength : transcripts[i].EffectiveLength;
if (effLens(i) <= 1.0) { effLens(i) = 1.0; }
totalLen += effLens(i);
}

std::vector<std::pair<const TranscriptGroup, TGValue>>& eqVec =
readExp.equivalenceClassBuilder().eqVec();

Expand All @@ -562,7 +572,6 @@ bool CollapsedEMOptimizer::gatherBootstraps(
jointLog->info("Optimizing over {} equivalence classes", eqVec.size());

double totalNumFrags{static_cast<double>(readExp.numMappedFragments())};
double totalLen{0.0};

if (activeTranscriptIDs.size() == 0) {
jointLog->error("It seems that no transcripts are expressed; something is likely wrong!");
Expand All @@ -571,11 +580,7 @@ bool CollapsedEMOptimizer::gatherBootstraps(

double scale = 1.0 / activeTranscriptIDs.size();
for (size_t i = 0; i < transcripts.size(); ++i) {
//double m = transcripts[i].mass(false);
alphas[i] = transcripts[i].getActive() ? scale * totalNumFrags : 0.0;
effLens(i) = (sopt.noEffectiveLengthCorrection) ?
transcripts[i].RefLength : transcripts[i].EffectiveLength;
totalLen += effLens(i);
}

auto numRemoved = markDegenerateClasses(eqVec, alphas, sopt.jointLog);
Expand All @@ -601,12 +606,17 @@ bool CollapsedEMOptimizer::gatherBootstraps(

// Iterate over each weight and set it equal to
// 1 / effLen of the corresponding transcript
double wsum{0.0};
for (size_t i = 0; i < classSize; ++i) {
double el = effLens(k.txps[i]);
v.weights[i] = (el <= 0.0) ? 1.0 : (1.0 / el);
}
v.weights[i] = (kv.second.count / effLens(k.txps[i]));
wsum += v.weights[i];
}
});
double wnorm = 1.0 / wsum;
for (size_t i = 0; i < classSize; ++i) {
v.weights[i] *= wnorm;
}
}
});

// Since we will use the same weights and transcript groups for each
// of the bootstrap samples (only the count vector will change), it
Expand Down Expand Up @@ -646,18 +656,6 @@ bool CollapsedEMOptimizer::gatherBootstraps(
std::atomic<uint32_t> bsCounter{0};
std::vector<std::thread> workerThreads;
for (size_t tn = 0; tn < numWorkerThreads; ++tn) {
/*
std::vector<std::vector<uint32_t>>& txpGroups,
std::vector<std::vector<double>>& txpGroupWeights,
std::vector<Transcript>& transcripts,
Eigen::VectorXd& effLens,
std::vector<double>& sampleWeights,
uint64_t totSamples,
std::atomic<uint32_t>& bsNum,
SailfishOpts& sopt,
BootstrapWriter* bootstrapWriter,
double relDiffTolerance,
uint32_t maxIter)*/
workerThreads.emplace_back(doBootstrap,
std::ref(txpGroups),
std::ref(txpGroupWeights),
Expand Down Expand Up @@ -692,19 +690,59 @@ bool CollapsedEMOptimizer::optimize(ReadExperiment& readExp,
VecType alphas(transcripts.size(), 0.0);
VecType alphasPrime(transcripts.size(), 0.0);
VecType expTheta(transcripts.size());
VecType uniqueCounts(transcripts.size(), 0.0);
Eigen::VectorXd effLens(transcripts.size());

// Fill in the effective length vector
double totalLen{0.0};
for (size_t i = 0; i < transcripts.size(); ++i) {
effLens(i) = (sopt.noEffectiveLengthCorrection) ?
transcripts[i].RefLength : transcripts[i].EffectiveLength;
if (effLens(i) <= 1.0) { effLens(i) = 1.0; }
totalLen += effLens(i);
}

std::vector<std::pair<const TranscriptGroup, TGValue>>& eqVec =
readExp.equivalenceClassBuilder().eqVec();

tbb::parallel_for(BlockedIndexRange(size_t(0), size_t(eqVec.size())),
[&eqVec, &effLens, &transcripts]( const BlockedIndexRange& range) -> void {
// For each index in the equivalence class vector
for (auto eqID : boost::irange(range.begin(), range.end())) {
// The vector entry
auto& kv = eqVec[eqID];
// The label of the equivalence class
const TranscriptGroup& k = kv.first;
// The size of the label
size_t classSize = k.txps.size();
// The weights of the label
TGValue& v = kv.second;

// Iterate over each weight and set it equal to
// 1 / effLen of the corresponding transcript
double wsum{0.0};
for (size_t i = 0; i < classSize; ++i) {
v.weights[i] = (kv.second.count / effLens(k.txps[i]));
wsum += v.weights[i];
}

double wnorm = 1.0 / wsum;
for (size_t i = 0; i < classSize; ++i) {
v.weights[i] *= wnorm;
}

}
});

std::unordered_set<uint32_t> activeTranscriptIDs;
for (auto& kv : eqVec) {
auto& tg = kv.first;
for (auto& t : tg.txps) {
for (auto& kv : eqVec) {
auto& tg = kv.first;
for (size_t i = 0; i < tg.txps.size(); ++i) {
auto& t = tg.txps[i];
transcripts[t].setActive();
activeTranscriptIDs.insert(t);
}
}
}
}

bool useVBEM{sopt.useVBOpt};
// If we use VBEM, we'll need the prior parameters
Expand All @@ -715,7 +753,6 @@ bool CollapsedEMOptimizer::optimize(ReadExperiment& readExp,
jointLog->info("Optimizing over {} equivalence classes", eqVec.size());

double totalNumFrags{static_cast<double>(readExp.numMappedFragments())};
double totalLen{0.0};

if (activeTranscriptIDs.size() == 0) {
jointLog->error("It seems that no transcripts are expressed; something is likely wrong!");
Expand All @@ -724,43 +761,18 @@ bool CollapsedEMOptimizer::optimize(ReadExperiment& readExp,

double scale = 1.0 / activeTranscriptIDs.size();
for (size_t i = 0; i < transcripts.size(); ++i) {
//double m = transcripts[i].mass(false);
alphas[i] = transcripts[i].getActive() ? scale * totalNumFrags : 0.0;
effLens(i) = (sopt.noEffectiveLengthCorrection) ?
transcripts[i].RefLength : transcripts[i].EffectiveLength;
totalLen += effLens(i);
}

auto numRemoved = markDegenerateClasses(eqVec, alphas, sopt.jointLog);
sopt.jointLog->info("Marked {} weighted equivalence classes as degenerate",
numRemoved);
//auto numRemoved = markDegenerateClasses(eqVec, alphas, sopt.jointLog);
//sopt.jointLog->info("Marked {} weighted equivalence classes as degenerate",
// numRemoved);

size_t itNum{0};
double minAlpha = 1e-8;
double alphaCheckCutoff = 1e-2;
double cutoff = (useVBEM) ? (priorAlpha + minAlpha) : minAlpha;

tbb::parallel_for(BlockedIndexRange(size_t(0), size_t(eqVec.size())),
[&eqVec, &effLens]( const BlockedIndexRange& range) -> void {
// For each index in the equivalence class vector
for (auto eqID : boost::irange(range.begin(), range.end())) {
// The vector entry
auto& kv = eqVec[eqID];
// The label of the equivalence class
const TranscriptGroup& k = kv.first;
// The size of the label
size_t classSize = k.txps.size();
// The weights of the label
TGValue& v = kv.second;

// Iterate over each weight and set it equal to
// 1 / effLen of the corresponding transcript
for (size_t i = 0; i < classSize; ++i) {
double el = effLens(k.txps[i]);
v.weights[i] = (el <= 0.0) ? 1.0 : (1.0 / el);
}
}
});

bool converged{false};
double maxRelDiff = -std::numeric_limits<double>::max();
while (itNum < maxIter and !converged) {
Expand All @@ -775,8 +787,8 @@ bool CollapsedEMOptimizer::optimize(ReadExperiment& readExp,
converged = true;
maxRelDiff = -std::numeric_limits<double>::max();
for (size_t i = 0; i < transcripts.size(); ++i) {
if (alphas[i] > cutoff) {
double relDiff = std::abs(alphas[i] - alphasPrime[i]) / alphasPrime[i];
if (alphasPrime[i] > alphaCheckCutoff) {
double relDiff = std::fabs(alphas[i] - alphasPrime[i]) / alphasPrime[i];
maxRelDiff = (relDiff > maxRelDiff) ? relDiff : maxRelDiff;
if (relDiff > relDiffTolerance) {
converged = false;
Expand Down
10 changes: 5 additions & 5 deletions src/SailfishIndexer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,7 @@ Builds a Sailfish index
if (!bfs::exists(outputStem)) {
mustRecompute = true;
try {
bool success = bfs::create_directory(outputStem);
bool success = bfs::create_directories(outputStem);
if (!success) { throw std::runtime_error("unspecified error creating file."); }
} catch ( std::exception& e ) {
std::cerr << "Creation of " << outputStem << " failed [" << e.what() << "]\n.";
Expand All @@ -141,7 +141,7 @@ Builds a Sailfish index

// create the directory for log files
bfs::path logDir = outputPath / "logs";
boost::filesystem::create_directory(logDir);
boost::filesystem::create_directories(logDir);

bfs::path logPath = logDir / "sailfish_index.log";
size_t max_q_size = 2097152;
Expand Down Expand Up @@ -184,9 +184,9 @@ Builds a Sailfish index
argVec.push_back("-k");

if (merLen % 2 == 0) {
std::cerr << "k-mer length should be odd to avoid a k-mer being it's own reverse complement\n";
std::cerr << "please specify an odd value of k\n";
jointLog->flush();
std::cerr << "k-mer length should be odd to avoid a k-mer being it's own reverse complement\n";
std::cerr << "please specify an odd value of k\n";
jointLog->flush();
std::exit(1);
}

Expand Down
Loading

0 comments on commit 30fffa4

Please sign in to comment.