Significant performance improvements when dealing with overlapping tr…

…anscripts. Fix *rare* pathalogical performance of hash function. Improvement to quasi-mapping to reduce ambiguity of reads matching fwd and rc transcripts. Improvement to fragment length modeling.
kingsfordgroup · Nov 12, 2015 · 30fffa4 · 30fffa4
1 parent 3d4809b
commit 30fffa4
Show file tree

Hide file tree

Showing 6 changed files with 370 additions and 180 deletions.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -3,10 +3,10 @@ enable_testing()
 
 project (Sailfish)
 
-set(CPACK_PACKAGE_VERSION "0.7.6")
+set(CPACK_PACKAGE_VERSION "0.7.7")
 set(CPACK_PACKAGE_VERSION_MAJOR "0")
 set(CPACK_PACKAGE_VERSION_MINOR "7")
-set(CPACK_PACKAGE_VERSION_PATCH "6")
+set(CPACK_PACKAGE_VERSION_PATCH "7")
 set(CPACK_GENERATOR "TGZ")
 set(CPACK_SOURCE_GENERATOR "TGZ")
 set(CPACK_PACKAGE_VENDOR "Carnegie Mellon University / Stony Brook University")

diff --git a/include/EquivalenceClassBuilder.hpp b/include/EquivalenceClassBuilder.hpp
@@ -79,6 +79,14 @@ class EquivalenceClassBuilder {
             return true;
         }
 
+        inline void insertGroup(TranscriptGroup g, uint32_t count) {
+            std::vector<double> weights(g.txps.size(), 0.0);
+            //auto updatefn = [count](TGValue& x) { x.count = count; };
+            TGValue v(weights, count);
+            countVec_.push_back(std::make_pair(g, v));
+            //countMap_.upsert(g, updatefn, v);
+        }
+
         inline void addGroup(TranscriptGroup&& g,
                              std::vector<double>& weights) {
 

diff --git a/include/SailfishConfig.hpp b/include/SailfishConfig.hpp
@@ -28,8 +28,8 @@
 namespace sailfish {
 	constexpr char majorVersion[] = "0";
 	constexpr char minorVersion[] = "7";
-	constexpr char patchVersion[] = "6";
-	constexpr char version[] = "0.7.6";
+	constexpr char patchVersion[] = "7";
+	constexpr char version[] = "0.7.7";
     constexpr uint32_t indexVersion = 1;
 }
 

diff --git a/src/CollapsedEMOptimizer.cpp b/src/CollapsedEMOptimizer.cpp
@@ -231,7 +231,7 @@ void EMUpdate_(
     assert(alphaIn.size() == alphaOut.size());
 
     tbb::parallel_for(BlockedIndexRange(size_t(0), size_t(eqVec.size())),
-            [&eqVec, &alphaIn, &alphaOut](const BlockedIndexRange& range) -> void {
+            [&eqVec, &alphaIn, &transcripts, &alphaOut](const BlockedIndexRange& range) -> void {
             for (auto eqID : boost::irange(range.begin(), range.end())) {
             auto& kv = eqVec[eqID];
 
@@ -467,7 +467,7 @@ bool doBootstrap(
         // Do a new bootstrap
         msamp(sampCounts.begin(), totalNumFrags, numClasses, sampleWeights.begin());
 
-	double totalLen{0.0};
+        double totalLen{0.0};
         for (size_t i = 0; i < transcripts.size(); ++i) {
             alphas[i] = transcripts[i].getActive() ? uniformTxpWeight * totalNumFrags : 0.0;
             totalLen += effLens(i);
@@ -477,25 +477,26 @@ bool doBootstrap(
         double maxRelDiff = -std::numeric_limits<double>::max();
         size_t itNum = 0;
 
-	// If we use VBEM, we'll need the prior parameters
-	double priorAlpha = 0.01;
+        // If we use VBEM, we'll need the prior parameters
+        double priorAlpha = 0.01;
         double minAlpha = 1e-8;
+        double alphaCheckCutoff = 1e-2;
         double cutoff = (useVBEM) ? (priorAlpha + minAlpha) : minAlpha;
 
-	while (itNum < maxIter and !converged) {
+        while (itNum < maxIter and !converged) {
 
             if (useVBEM) {
-		VBEMUpdate_(txpGroups, txpGroupWeights, sampCounts, transcripts,
-			effLens, priorAlpha, totalLen, alphas, alphasPrime, expTheta);
-	    } else {
+                VBEMUpdate_(txpGroups, txpGroupWeights, sampCounts, transcripts,
+                        effLens, priorAlpha, totalLen, alphas, alphasPrime, expTheta);
+            } else {
                 EMUpdate_(txpGroups, txpGroupWeights, sampCounts, transcripts,
                         effLens, alphas, alphasPrime);
             }
 
             converged = true;
             maxRelDiff = -std::numeric_limits<double>::max();
             for (size_t i = 0; i < transcripts.size(); ++i) {
-                if (alphas[i] > cutoff) {
+                if (alphas[i] > alphaCheckCutoff) {
                     double relDiff = std::abs(alphas[i] - alphasPrime[i]) / alphasPrime[i];
                     maxRelDiff = (relDiff > maxRelDiff) ? relDiff : maxRelDiff;
                     if (relDiff > relDiffTolerance) {
@@ -532,14 +533,23 @@ bool CollapsedEMOptimizer::gatherBootstraps(
 
     std::vector<Transcript>& transcripts = readExp.transcripts();
     using VecT = CollapsedEMOptimizer::SerialVecType;
-    // With atomics
+
     VecT alphas(transcripts.size(), 0.0);
     VecT alphasPrime(transcripts.size(), 0.0);
     VecT expTheta(transcripts.size());
     Eigen::VectorXd effLens(transcripts.size());
 
     uint32_t numBootstraps = sopt.numBootstraps;
 
+    // Fill in the effective length vector
+    double totalLen{0.0};
+    for (size_t i = 0; i < transcripts.size(); ++i) {
+        effLens(i) = (sopt.noEffectiveLengthCorrection) ?
+                        transcripts[i].RefLength : transcripts[i].EffectiveLength;
+        if (effLens(i) <= 1.0) { effLens(i) = 1.0; }
+        totalLen += effLens(i);
+    }
+
     std::vector<std::pair<const TranscriptGroup, TGValue>>& eqVec =
         readExp.equivalenceClassBuilder().eqVec();
 
@@ -562,7 +572,6 @@ bool CollapsedEMOptimizer::gatherBootstraps(
     jointLog->info("Optimizing over {} equivalence classes", eqVec.size());
 
     double totalNumFrags{static_cast<double>(readExp.numMappedFragments())};
-    double totalLen{0.0};
 
     if (activeTranscriptIDs.size() == 0) {
         jointLog->error("It seems that no transcripts are expressed; something is likely wrong!");
@@ -571,11 +580,7 @@ bool CollapsedEMOptimizer::gatherBootstraps(
 
     double scale = 1.0 / activeTranscriptIDs.size();
     for (size_t i = 0; i < transcripts.size(); ++i) {
-        //double m = transcripts[i].mass(false);
         alphas[i] = transcripts[i].getActive() ? scale * totalNumFrags : 0.0;
-        effLens(i) = (sopt.noEffectiveLengthCorrection) ?
-                        transcripts[i].RefLength : transcripts[i].EffectiveLength;
-        totalLen += effLens(i);
     }
 
     auto numRemoved = markDegenerateClasses(eqVec, alphas, sopt.jointLog);
@@ -601,12 +606,17 @@ bool CollapsedEMOptimizer::gatherBootstraps(
 
                     // Iterate over each weight and set it equal to
                     // 1 / effLen of the corresponding transcript
+                    double wsum{0.0};
                     for (size_t i = 0; i < classSize; ++i) {
-                            double el = effLens(k.txps[i]);
-                            v.weights[i] = (el <= 0.0) ? 1.0 : (1.0 / el);
-                        }
+                        v.weights[i] = (kv.second.count / effLens(k.txps[i]));
+                        wsum += v.weights[i];
                     }
-                });
+                    double wnorm = 1.0 / wsum;
+                    for (size_t i = 0; i < classSize; ++i) {
+                        v.weights[i] *= wnorm;
+                    }
+                }
+            });
 
     // Since we will use the same weights and transcript groups for each
     // of the bootstrap samples (only the count vector will change), it
@@ -646,18 +656,6 @@ bool CollapsedEMOptimizer::gatherBootstraps(
     std::atomic<uint32_t> bsCounter{0};
     std::vector<std::thread> workerThreads;
     for (size_t tn = 0; tn < numWorkerThreads; ++tn) {
-        /*
-   std::vector<std::vector<uint32_t>>& txpGroups,
-        std::vector<std::vector<double>>& txpGroupWeights,
-        std::vector<Transcript>& transcripts,
-        Eigen::VectorXd& effLens,
-        std::vector<double>& sampleWeights,
-        uint64_t totSamples,
-        std::atomic<uint32_t>& bsNum,
-        SailfishOpts& sopt,
-        BootstrapWriter* bootstrapWriter,
-        double relDiffTolerance,
-        uint32_t maxIter)*/
         workerThreads.emplace_back(doBootstrap,
                 std::ref(txpGroups),
                 std::ref(txpGroupWeights),
@@ -692,19 +690,59 @@ bool CollapsedEMOptimizer::optimize(ReadExperiment& readExp,
     VecType alphas(transcripts.size(), 0.0);
     VecType alphasPrime(transcripts.size(), 0.0);
     VecType expTheta(transcripts.size());
+    VecType uniqueCounts(transcripts.size(), 0.0);
     Eigen::VectorXd effLens(transcripts.size());
 
+    // Fill in the effective length vector
+    double totalLen{0.0};
+    for (size_t i = 0; i < transcripts.size(); ++i) {
+        effLens(i) = (sopt.noEffectiveLengthCorrection) ?
+                        transcripts[i].RefLength : transcripts[i].EffectiveLength;
+        if (effLens(i) <= 1.0) { effLens(i) = 1.0; }
+        totalLen += effLens(i);
+    }
+
     std::vector<std::pair<const TranscriptGroup, TGValue>>& eqVec =
         readExp.equivalenceClassBuilder().eqVec();
 
+    tbb::parallel_for(BlockedIndexRange(size_t(0), size_t(eqVec.size())),
+            [&eqVec, &effLens, &transcripts]( const BlockedIndexRange& range) -> void {
+                // For each index in the equivalence class vector
+                for (auto eqID : boost::irange(range.begin(), range.end())) {
+                    // The vector entry
+                    auto& kv = eqVec[eqID];
+                    // The label of the equivalence class
+                    const TranscriptGroup& k = kv.first;
+                    // The size of the label
+                    size_t classSize = k.txps.size();
+                    // The weights of the label
+                    TGValue& v = kv.second;
+
+                    // Iterate over each weight and set it equal to
+                    // 1 / effLen of the corresponding transcript
+                    double wsum{0.0};
+                    for (size_t i = 0; i < classSize; ++i) {
+                        v.weights[i] = (kv.second.count / effLens(k.txps[i]));
+                        wsum += v.weights[i];
+                    }
+
+                    double wnorm = 1.0 / wsum;
+                    for (size_t i = 0; i < classSize; ++i) {
+                        v.weights[i] *= wnorm;
+                    }
+
+                }
+            });
+
     std::unordered_set<uint32_t> activeTranscriptIDs;
-   for (auto& kv : eqVec) {
-       auto& tg = kv.first;
-       for (auto& t : tg.txps) {
+    for (auto& kv : eqVec) {
+        auto& tg = kv.first;
+        for (size_t i = 0; i < tg.txps.size(); ++i) {
+            auto& t = tg.txps[i];
             transcripts[t].setActive();
             activeTranscriptIDs.insert(t);
-       }
-   }
+        }
+    }
 
     bool useVBEM{sopt.useVBOpt};
     // If we use VBEM, we'll need the prior parameters
@@ -715,7 +753,6 @@ bool CollapsedEMOptimizer::optimize(ReadExperiment& readExp,
     jointLog->info("Optimizing over {} equivalence classes", eqVec.size());
 
     double totalNumFrags{static_cast<double>(readExp.numMappedFragments())};
-    double totalLen{0.0};
 
     if (activeTranscriptIDs.size() == 0) {
         jointLog->error("It seems that no transcripts are expressed; something is likely wrong!");
@@ -724,43 +761,18 @@ bool CollapsedEMOptimizer::optimize(ReadExperiment& readExp,
 
     double scale = 1.0 / activeTranscriptIDs.size();
     for (size_t i = 0; i < transcripts.size(); ++i) {
-        //double m = transcripts[i].mass(false);
         alphas[i] = transcripts[i].getActive() ? scale * totalNumFrags : 0.0;
-        effLens(i) = (sopt.noEffectiveLengthCorrection) ?
-                        transcripts[i].RefLength : transcripts[i].EffectiveLength;
-        totalLen += effLens(i);
     }
 
-    auto numRemoved = markDegenerateClasses(eqVec, alphas, sopt.jointLog);
-    sopt.jointLog->info("Marked {} weighted equivalence classes as degenerate",
-            numRemoved);
+    //auto numRemoved = markDegenerateClasses(eqVec, alphas, sopt.jointLog);
+    //sopt.jointLog->info("Marked {} weighted equivalence classes as degenerate",
+    //        numRemoved);
 
     size_t itNum{0};
     double minAlpha = 1e-8;
+    double alphaCheckCutoff = 1e-2;
     double cutoff = (useVBEM) ? (priorAlpha + minAlpha) : minAlpha;
 
-    tbb::parallel_for(BlockedIndexRange(size_t(0), size_t(eqVec.size())),
-            [&eqVec, &effLens]( const BlockedIndexRange& range) -> void {
-                // For each index in the equivalence class vector
-                for (auto eqID : boost::irange(range.begin(), range.end())) {
-                    // The vector entry
-                    auto& kv = eqVec[eqID];
-                    // The label of the equivalence class
-                    const TranscriptGroup& k = kv.first;
-                    // The size of the label
-                    size_t classSize = k.txps.size();
-                    // The weights of the label
-                    TGValue& v = kv.second;
-
-                    // Iterate over each weight and set it equal to
-                    // 1 / effLen of the corresponding transcript
-                    for (size_t i = 0; i < classSize; ++i) {
-                            double el = effLens(k.txps[i]);
-                            v.weights[i] = (el <= 0.0) ? 1.0 : (1.0 / el);
-                        }
-                    }
-                });
-
     bool converged{false};
     double maxRelDiff = -std::numeric_limits<double>::max();
     while (itNum < maxIter and !converged) {
@@ -775,8 +787,8 @@ bool CollapsedEMOptimizer::optimize(ReadExperiment& readExp,
         converged = true;
         maxRelDiff = -std::numeric_limits<double>::max();
         for (size_t i = 0; i < transcripts.size(); ++i) {
-            if (alphas[i] > cutoff) {
-                double relDiff = std::abs(alphas[i] - alphasPrime[i]) / alphasPrime[i];
+            if (alphasPrime[i] > alphaCheckCutoff) {
+                double relDiff = std::fabs(alphas[i] - alphasPrime[i]) / alphasPrime[i];
                 maxRelDiff = (relDiff > maxRelDiff) ? relDiff : maxRelDiff;
                 if (relDiff > relDiffTolerance) {
                     converged = false;

diff --git a/src/SailfishIndexer.cpp b/src/SailfishIndexer.cpp
@@ -128,7 +128,7 @@ Builds a Sailfish index
         if (!bfs::exists(outputStem)) {
             mustRecompute = true;
             try {
-                bool success = bfs::create_directory(outputStem);
+                bool success = bfs::create_directories(outputStem);
                 if (!success) { throw std::runtime_error("unspecified error creating file."); }
             } catch ( std::exception& e ) {
                 std::cerr << "Creation of " << outputStem << " failed [" << e.what() << "]\n.";
@@ -141,7 +141,7 @@ Builds a Sailfish index
 
         // create the directory for log files
         bfs::path logDir = outputPath / "logs";
-        boost::filesystem::create_directory(logDir);
+        boost::filesystem::create_directories(logDir);
 
         bfs::path logPath = logDir / "sailfish_index.log";
         size_t max_q_size = 2097152;
@@ -184,9 +184,9 @@ Builds a Sailfish index
             argVec.push_back("-k");
 
             if (merLen % 2 == 0) {
-		std::cerr << "k-mer length should be odd to avoid a k-mer being it's own reverse complement\n";
-		std::cerr << "please specify an odd value of k\n";
-		jointLog->flush();
+                std::cerr << "k-mer length should be odd to avoid a k-mer being it's own reverse complement\n";
+                std::cerr << "please specify an odd value of k\n";
+                jointLog->flush();
                 std::exit(1);
             }