diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6e746ae..9035f96 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -3,10 +3,10 @@ enable_testing()
 
 project (Sailfish)
 
-set(CPACK_PACKAGE_VERSION "0.7.6")
+set(CPACK_PACKAGE_VERSION "0.7.7")
 set(CPACK_PACKAGE_VERSION_MAJOR "0")
 set(CPACK_PACKAGE_VERSION_MINOR "7")
-set(CPACK_PACKAGE_VERSION_PATCH "6")
+set(CPACK_PACKAGE_VERSION_PATCH "7")
 set(CPACK_GENERATOR "TGZ")
 set(CPACK_SOURCE_GENERATOR "TGZ")
 set(CPACK_PACKAGE_VENDOR "Carnegie Mellon University / Stony Brook University")
diff --git a/include/EquivalenceClassBuilder.hpp b/include/EquivalenceClassBuilder.hpp
index 04711f0..f564ea1 100644
--- a/include/EquivalenceClassBuilder.hpp
+++ b/include/EquivalenceClassBuilder.hpp
@@ -79,6 +79,14 @@ class EquivalenceClassBuilder {
             return true;
         }
 
+        inline void insertGroup(TranscriptGroup g, uint32_t count) {
+            std::vector<double> weights(g.txps.size(), 0.0);
+            //auto updatefn = [count](TGValue& x) { x.count = count; };
+            TGValue v(weights, count);
+            countVec_.push_back(std::make_pair(g, v));
+            //countMap_.upsert(g, updatefn, v);
+        }
+
         inline void addGroup(TranscriptGroup&& g,
                              std::vector<double>& weights) {
diff --git a/include/SailfishConfig.hpp b/include/SailfishConfig.hpp
index 78bcde1..a8db38a 100644
--- a/include/SailfishConfig.hpp
+++ b/include/SailfishConfig.hpp
@@ -28,8 +28,8 @@ namespace sailfish {
     constexpr char majorVersion[] = "0";
     constexpr char minorVersion[] = "7";
-    constexpr char patchVersion[] = "6";
-    constexpr char version[] = "0.7.6";
+    constexpr char patchVersion[] = "7";
+    constexpr char version[] = "0.7.7";
 
     constexpr uint32_t indexVersion = 1;
 }
diff --git a/src/CollapsedEMOptimizer.cpp b/src/CollapsedEMOptimizer.cpp
index 69caf6a..d744ccb 100644
--- a/src/CollapsedEMOptimizer.cpp
+++ b/src/CollapsedEMOptimizer.cpp
@@ -231,7 +231,7 @@ void EMUpdate_(
     assert(alphaIn.size() == alphaOut.size());
 
     tbb::parallel_for(BlockedIndexRange(size_t(0), size_t(eqVec.size())),
-        [&eqVec, &alphaIn, &alphaOut](const BlockedIndexRange& range) -> void {
+        [&eqVec, &alphaIn, &transcripts, &alphaOut](const BlockedIndexRange& range) -> void {
             for (auto eqID : boost::irange(range.begin(), range.end())) {
                 auto& kv = eqVec[eqID];
@@ -467,7 +467,7 @@ bool doBootstrap(
         // Do a new bootstrap
         msamp(sampCounts.begin(), totalNumFrags, numClasses, sampleWeights.begin());
 
-        double totalLen{0.0};
+        double totalLen{0.0};
         for (size_t i = 0; i < transcripts.size(); ++i) {
             alphas[i] = transcripts[i].getActive() ? uniformTxpWeight * totalNumFrags : 0.0;
             totalLen += effLens(i);
         }
@@ -477,17 +477,18 @@ bool doBootstrap(
         double maxRelDiff = -std::numeric_limits<double>::max();
         size_t itNum = 0;
 
-        // If we use VBEM, we'll need the prior parameters
-        double priorAlpha = 0.01;
+        // If we use VBEM, we'll need the prior parameters
+        double priorAlpha = 0.01;
         double minAlpha = 1e-8;
+        double alphaCheckCutoff = 1e-2;
         double cutoff = (useVBEM) ? (priorAlpha + minAlpha) : minAlpha;
 
-        while (itNum < maxIter and !converged) {
+        while (itNum < maxIter and !converged) {
             if (useVBEM) {
-                VBEMUpdate_(txpGroups, txpGroupWeights, sampCounts, transcripts,
-                        effLens, priorAlpha, totalLen, alphas, alphasPrime, expTheta);
-            } else {
+                VBEMUpdate_(txpGroups, txpGroupWeights, sampCounts, transcripts,
+                        effLens, priorAlpha, totalLen, alphas, alphasPrime, expTheta);
+            } else {
                 EMUpdate_(txpGroups, txpGroupWeights, sampCounts, transcripts,
                         effLens, alphas, alphasPrime);
             }
@@ -495,7 +496,7 @@ bool doBootstrap(
             converged = true;
             maxRelDiff = -std::numeric_limits<double>::max();
             for (size_t i = 0; i < transcripts.size(); ++i) {
-                if (alphas[i] > cutoff) {
+                if (alphas[i] > alphaCheckCutoff) {
                     double relDiff = std::abs(alphas[i] - alphasPrime[i]) / alphasPrime[i];
                     maxRelDiff = (relDiff > maxRelDiff) ? relDiff : maxRelDiff;
                     if (relDiff > relDiffTolerance) {
@@ -532,7 +533,7 @@ bool CollapsedEMOptimizer::gatherBootstraps(
     std::vector<Transcript>& transcripts = readExp.transcripts();
     using VecT = CollapsedEMOptimizer::SerialVecType;
 
-    // With atomics
+    VecT alphas(transcripts.size(), 0.0);
     VecT alphasPrime(transcripts.size(), 0.0);
     VecT expTheta(transcripts.size());
@@ -540,6 +541,15 @@ bool CollapsedEMOptimizer::gatherBootstraps(
 
     uint32_t numBootstraps = sopt.numBootstraps;
 
+    // Fill in the effective length vector
+    double totalLen{0.0};
+    for (size_t i = 0; i < transcripts.size(); ++i) {
+        effLens(i) = (sopt.noEffectiveLengthCorrection) ?
+            transcripts[i].RefLength : transcripts[i].EffectiveLength;
+        if (effLens(i) <= 1.0) { effLens(i) = 1.0; }
+        totalLen += effLens(i);
+    }
+
     std::vector<std::pair<const TranscriptGroup, TGValue>>& eqVec =
         readExp.equivalenceClassBuilder().eqVec();
@@ -562,7 +572,6 @@ bool CollapsedEMOptimizer::gatherBootstraps(
     jointLog->info("Optimizing over {} equivalence classes", eqVec.size());
 
     double totalNumFrags{static_cast<double>(readExp.numMappedFragments())};
-    double totalLen{0.0};
 
     if (activeTranscriptIDs.size() == 0) {
         jointLog->error("It seems that no transcripts are expressed; something is likely wrong!");
@@ -571,11 +580,7 @@ bool CollapsedEMOptimizer::gatherBootstraps(
 
     double scale = 1.0 / activeTranscriptIDs.size();
     for (size_t i = 0; i < transcripts.size(); ++i) {
-        //double m = transcripts[i].mass(false);
         alphas[i] = transcripts[i].getActive() ? scale * totalNumFrags : 0.0;
-        effLens(i) = (sopt.noEffectiveLengthCorrection) ?
-            transcripts[i].RefLength : transcripts[i].EffectiveLength;
-        totalLen += effLens(i);
     }
 
     auto numRemoved = markDegenerateClasses(eqVec, alphas, sopt.jointLog);
@@ -601,12 +606,17 @@ bool CollapsedEMOptimizer::gatherBootstraps(
 
                 // Iterate over each weight and set it equal to
                 // 1 / effLen of the corresponding transcript
+                double wsum{0.0};
                 for (size_t i = 0; i < classSize; ++i) {
-                    double el = effLens(k.txps[i]);
-                    v.weights[i] = (el <= 0.0) ? 1.0 : (1.0 / el);
-                }
+                    v.weights[i] = (kv.second.count / effLens(k.txps[i]));
+                    wsum += v.weights[i];
                 }
-    });
+                double wnorm = 1.0 / wsum;
+                for (size_t i = 0; i < classSize; ++i) {
+                    v.weights[i] *= wnorm;
+                }
+            }
+    });
 
     // Since we will use the same weights and transcript groups for each
     // of the bootstrap samples (only the count vector will change), it
@@ -646,18 +656,6 @@ bool CollapsedEMOptimizer::gatherBootstraps(
     std::atomic<uint32_t> bsCounter{0};
     std::vector<std::thread> workerThreads;
     for (size_t tn = 0; tn < numWorkerThreads; ++tn) {
-        /*
-           std::vector<std::vector<uint32_t>>& txpGroups,
-           std::vector<std::vector<double>>& txpGroupWeights,
-           std::vector<Transcript>& transcripts,
-           Eigen::VectorXd& effLens,
-           std::vector<double>& sampleWeights,
-           uint64_t totSamples,
-           std::atomic<uint32_t>& bsNum,
-           SailfishOpts& sopt,
-           BootstrapWriter* bootstrapWriter,
-           double relDiffTolerance,
-           uint32_t maxIter)*/
         workerThreads.emplace_back(doBootstrap,
                 std::ref(txpGroups),
                 std::ref(txpGroupWeights),
@@ -692,19 +690,59 @@ bool CollapsedEMOptimizer::optimize(ReadExperiment& readExp,
     VecType alphas(transcripts.size(), 0.0);
     VecType alphasPrime(transcripts.size(), 0.0);
     VecType expTheta(transcripts.size());
+    VecType uniqueCounts(transcripts.size(), 0.0);
     Eigen::VectorXd effLens(transcripts.size());
 
+    // Fill in the effective length vector
+    double totalLen{0.0};
+    for (size_t i = 0; i < transcripts.size(); ++i) {
+        effLens(i) = (sopt.noEffectiveLengthCorrection) ?
+            transcripts[i].RefLength : transcripts[i].EffectiveLength;
+        if (effLens(i) <= 1.0) { effLens(i) = 1.0; }
+        totalLen += effLens(i);
+    }
+
     std::vector<std::pair<const TranscriptGroup, TGValue>>& eqVec =
         readExp.equivalenceClassBuilder().eqVec();
 
+    tbb::parallel_for(BlockedIndexRange(size_t(0), size_t(eqVec.size())),
+        [&eqVec, &effLens, &transcripts]( const BlockedIndexRange& range) -> void {
+            // For each index in the equivalence class vector
+            for (auto eqID : boost::irange(range.begin(), range.end())) {
+                // The vector entry
+                auto& kv = eqVec[eqID];
+                // The label of the equivalence class
+                const TranscriptGroup& k = kv.first;
+                // The size of the label
+                size_t classSize = k.txps.size();
+                // The weights of the label
+                TGValue& v = kv.second;
+
+                // Set each weight to (class count / effLen) of the
+                // corresponding transcript, then normalize the weights
+                double wsum{0.0};
+                for (size_t i = 0; i < classSize; ++i) {
+                    v.weights[i] = (kv.second.count / effLens(k.txps[i]));
+                    wsum += v.weights[i];
+                }
+
+                double wnorm = 1.0 / wsum;
+                for (size_t i = 0; i < classSize; ++i) {
+                    v.weights[i] *= wnorm;
+                }
+
+            }
+    });
+
     std::unordered_set<uint32_t> activeTranscriptIDs;
-    for (auto& kv : eqVec) {
-        auto& tg = kv.first;
-        for (auto& t : tg.txps) {
+    for (auto& kv : eqVec) {
+        auto& tg = kv.first;
+        for (size_t i = 0; i < tg.txps.size(); ++i) {
+            auto& t = tg.txps[i];
             transcripts[t].setActive();
             activeTranscriptIDs.insert(t);
-        }
-    }
+        }
+    }
 
     bool useVBEM{sopt.useVBOpt};
     // If we use VBEM, we'll need the prior parameters
@@ -715,7 +753,6 @@ bool CollapsedEMOptimizer::optimize(ReadExperiment& readExp,
     jointLog->info("Optimizing over {} equivalence classes", eqVec.size());
 
     double totalNumFrags{static_cast<double>(readExp.numMappedFragments())};
-    double totalLen{0.0};
 
     if (activeTranscriptIDs.size() == 0) {
         jointLog->error("It seems that no transcripts are expressed; something is likely wrong!");
@@ -724,43 +761,18 @@ bool CollapsedEMOptimizer::optimize(ReadExperiment& readExp,
 
     double scale = 1.0 / activeTranscriptIDs.size();
     for (size_t i = 0; i < transcripts.size(); ++i) {
-        //double m = transcripts[i].mass(false);
         alphas[i] = transcripts[i].getActive() ? scale * totalNumFrags : 0.0;
-        effLens(i) = (sopt.noEffectiveLengthCorrection) ?
-            transcripts[i].RefLength : transcripts[i].EffectiveLength;
-        totalLen += effLens(i);
     }
 
-    auto numRemoved = markDegenerateClasses(eqVec, alphas, sopt.jointLog);
-    sopt.jointLog->info("Marked {} weighted equivalence classes as degenerate",
-            numRemoved);
+    //auto numRemoved = markDegenerateClasses(eqVec, alphas, sopt.jointLog);
+    //sopt.jointLog->info("Marked {} weighted equivalence classes as degenerate",
+    //        numRemoved);
 
     size_t itNum{0};
     double minAlpha = 1e-8;
+    double alphaCheckCutoff = 1e-2;
     double cutoff = (useVBEM) ? (priorAlpha + minAlpha) : minAlpha;
 
-    tbb::parallel_for(BlockedIndexRange(size_t(0), size_t(eqVec.size())),
-        [&eqVec, &effLens]( const BlockedIndexRange& range) -> void {
-            // For each index in the equivalence class vector
-            for (auto eqID : boost::irange(range.begin(), range.end())) {
-                // The vector entry
-                auto& kv = eqVec[eqID];
-                // The label of the equivalence class
-                const TranscriptGroup& k = kv.first;
-                // The size of the label
-                size_t classSize = k.txps.size();
-                // The weights of the label
-                TGValue& v = kv.second;
-
-                // Iterate over each weight and set it equal to
-                // 1 / effLen of the corresponding transcript
-                for (size_t i = 0; i < classSize; ++i) {
-                    double el = effLens(k.txps[i]);
-                    v.weights[i] = (el <= 0.0) ? 1.0 : (1.0 / el);
-                }
-            }
-    });
-
     bool converged{false};
     double maxRelDiff = -std::numeric_limits<double>::max();
     while (itNum < maxIter and !converged) {
@@ -775,8 +787,8 @@ bool CollapsedEMOptimizer::optimize(ReadExperiment& readExp,
             converged = true;
             maxRelDiff = -std::numeric_limits<double>::max();
             for (size_t i = 0; i < transcripts.size(); ++i) {
-                if (alphas[i] > cutoff) {
-                    double relDiff = std::abs(alphas[i] - alphasPrime[i]) / alphasPrime[i];
+                if (alphasPrime[i] > alphaCheckCutoff) {
+                    double relDiff = std::fabs(alphas[i] - alphasPrime[i]) / alphasPrime[i];
                     maxRelDiff = (relDiff > maxRelDiff) ? relDiff : maxRelDiff;
                     if (relDiff > relDiffTolerance) {
                         converged = false;
diff --git a/src/SailfishIndexer.cpp b/src/SailfishIndexer.cpp
index 3a08db5..9d12e9b 100644
--- a/src/SailfishIndexer.cpp
+++ b/src/SailfishIndexer.cpp
@@ -128,7 +128,7 @@ Builds a Sailfish index
     if (!bfs::exists(outputStem)) {
         mustRecompute = true;
         try {
-            bool success = bfs::create_directory(outputStem);
+            bool success = bfs::create_directories(outputStem);
             if (!success) { throw std::runtime_error("unspecified error creating file."); }
         } catch ( std::exception& e ) {
             std::cerr << "Creation of " << outputStem << " failed [" << e.what() << "]\n.";
@@ -141,7 +141,7 @@ Builds a Sailfish index
 
     // create the directory for log files
     bfs::path logDir = outputPath / "logs";
-    boost::filesystem::create_directory(logDir);
+    boost::filesystem::create_directories(logDir);
 
     bfs::path logPath = logDir / "sailfish_index.log";
     size_t max_q_size = 2097152;
@@ -184,9 +184,9 @@ Builds a Sailfish index
     argVec.push_back("-k");
 
     if (merLen % 2 == 0) {
-        std::cerr << "k-mer length should be odd to avoid a k-mer being it's own reverse complement\n";
-        std::cerr << "please specify an odd value of k\n";
-        jointLog->flush();
+        std::cerr << "k-mer length should be odd to avoid a k-mer being its own reverse complement\n";
+        std::cerr << "please specify an odd value of k\n";
+        jointLog->flush();
         std::exit(1);
     }
diff --git a/src/SailfishQuantify.cpp b/src/SailfishQuantify.cpp
index 144c8eb..142feec 100644
--- a/src/SailfishQuantify.cpp
+++ b/src/SailfishQuantify.cpp
@@ -17,6 +17,7 @@
 #include
 
 // TBB include
+#include "tbb/atomic.h"
 #include "tbb/blocked_range.h"
 #include "tbb/parallel_for.h"
 
@@ -56,31 +57,51 @@ using single_parser = jellyfish::whole_sequence_parser;
 /****** Parser aliases ***/
 
-using FragLengthCountMap = std::unordered_map<uint32_t, uint32_t>;
+// using FragLengthCountMap = std::unordered_map<uint32_t, uint32_t>;
+using FragLengthCountMap = std::vector<tbb::atomic<uint32_t>>;
 
 using std::string;
 
 constexpr uint32_t readGroupSize{1000};
 
 int performBiasCorrectionSalmon(boost::filesystem::path featPath,
-        boost::filesystem::path expPath,
-        boost::filesystem::path outPath,
-        size_t numThreads);
+                                boost::filesystem::path expPath,
+                                boost::filesystem::path outPath,
+                                size_t numThreads);
+
+int32_t getMeanFragLen(FragLengthCountMap& flMap) {
+    double totalCount{0.0};
+    double totalLength{0.0};
+    for (size_t i = 0; i < flMap.size(); ++i) {
+        totalLength += i * flMap[i];
+        totalCount += flMap[i];
+    }
+    double ret{200.0};
+    if (totalCount <= 0.0) {
+        std::cerr << "Saw no fragments; can't get mean fragment length\n";
+        std::cerr << "This appears to be a bug. Please report it on GitHub\n";
+        return ret;
+    }
+    if (totalLength > totalCount) {
+        ret = (totalLength / totalCount);
+    }
+    return static_cast<int32_t>(ret);
+}
 
 void processReadsQuasi(paired_parser* parser,
         ReadExperiment& readExp,
         ReadLibrary& rl,
         const SailfishOpts& sfOpts,
         FragLengthCountMap& flMap,
-        std::atomic<int32_t>& remainingFLOps,
+        std::atomic<int32_t>& remainingFLOps,
         std::mutex& iomutex) {
 
     uint32_t maxFragLen = sfOpts.maxFragLen;
     uint64_t prevObservedFrags{1};
     uint64_t leftHitCount{0};
     uint64_t hitListCount{0};
+    int32_t meanFragLen{-1};
     size_t locRead{0};
     uint64_t localUpperBoundHits{0};
@@ -91,9 +112,10 @@ void processReadsQuasi(paired_parser* parser,
 
     auto& numObservedFragments = readExp.numObservedFragmentsAtomic();
     auto& validHits = readExp.numMappedFragmentsAtomic();
-    auto&totalHits = readExp.numFragHitsAtomic();
+    auto& totalHits = readExp.numFragHitsAtomic();
     auto& upperBoundHits = readExp.upperBoundHitsAtomic();
     auto& eqBuilder = readExp.equivalenceClassBuilder();
+    auto& transcripts = readExp.transcripts();
 
     auto sidx = readExp.getIndex();
     SACollector hitCollector(sidx->quasiIndex());
@@ -107,6 +129,9 @@ void processReadsQuasi(paired_parser* parser,
     std::vector<uint32_t> txpIDs;
     std::vector<double> auxProbs;
     size_t txpIDsHash{0};
+    double priorFragProb{0.0};
+
+    std::unique_ptr<EmpiricalDistribution> empDist{nullptr};
 
     while(true) {
         typename paired_parser::job j(*parser); // Get a job from the parser: a bunch of read (at most max_read_group)
@@ -120,19 +145,22 @@
             rightHits.clear();
             txpIDs.clear();
             auxProbs.clear();
-            txpIDsHash = 0;
 
             bool lh = hitCollector(j->data[i].first.seq,
                     leftHits, saSearcher,
-                    MateStatus::PAIRED_END_LEFT);
+                    MateStatus::PAIRED_END_LEFT,
+                    true // strict check
+                    );
 
             bool rh = hitCollector(j->data[i].second.seq,
                     rightHits, saSearcher,
-                    MateStatus::PAIRED_END_RIGHT);
+                    MateStatus::PAIRED_END_RIGHT,
+                    true // strict check
+                    );
 
             rapmap::utils::mergeLeftRightHits(
-                    leftHits, rightHits, jointHits,
-                    readLen, maxNumHits, tooManyHits, hctr);
+                    leftHits, rightHits, jointHits,
+                    readLen, maxNumHits, tooManyHits, hctr);
 
             upperBoundHits += (jointHits.size() > 0);
@@ -151,12 +179,15 @@
                 // This is a unique hit
                 if (jointHits.size() == 1 and isPaired and remainingFLOps > 0) {
                     auto& h = jointHits.front();
+
                     if (h.fwd != h.mateIsFwd and h.fragLen < maxFragLen) {
                         flMap[h.fragLen]++;
                         remainingFLOps--;
                     }
+
                 }
+
                 // If these aren't paired-end reads --- so that
                 // we have orphans --- make sure we sort the
                 // mappings so that they are in transcript order
@@ -175,15 +206,59 @@
                     });
                 }
 
-                auto auxProb = 1.0 / jointHits.size();
+
+                double auxSum = 0.0;
                 for (auto& h : jointHits) {
                     auto transcriptID = h.transcriptID();
-                    txpIDs.push_back(transcriptID);
-                    auxProbs.push_back(auxProb);
-                    //boost::hash_combine(txpIDsHash, transcriptID);
+
+                    if (!isPaired and remainingFLOps <= 0) {
+                        if (meanFragLen < 0) {
+                            meanFragLen = getMeanFragLen(flMap);
+                        }
+                        int32_t pos = static_cast<int32_t>(h.pos);
+                        if (h.fwd and (pos + meanFragLen) <= transcripts[transcriptID].RefLength) {
+                            txpIDs.push_back(transcriptID);
+                            auxProbs.push_back(1.0);
+                            auxSum += 1.0;
+                        } else if (!h.fwd and (pos - meanFragLen) >= 0 ) {
+                            txpIDs.push_back(transcriptID);
+                            auxProbs.push_back(1.0);
+                            auxSum += 1.0;
+                        }
+                    } else {
+                        double distProb = 1.0;
+                        /* -- don't use this info for the time being
+                        if (remainingFLOps <= 0) {
+                            if (!empDist) {
+                                size_t totObs{0};
+                                std::vector<uint32_t> vals(flMap.size());
+                                std::vector<uint32_t> freq(flMap.size());
+                                for (size_t i = 0; i < flMap.size(); ++i) {
+                                    vals[i] = i;
+                                    freq[i] = flMap[i];
+                                    totObs += freq[i];
+                                }
+                                empDist.reset(new EmpiricalDistribution(vals, freq));
+                                priorFragProb = 0.1 / totObs;
+                            }
+                            distProb = empDist->pdf(h.fragLen) + priorFragProb;
+                        }
+                        */
+                        txpIDs.push_back(transcriptID);
+                        auxProbs.push_back(distProb);
+                        auxSum += distProb;
+                    }
+                }
+                if (txpIDs.size() > 0) {
+                    /*
+                    for (auto& p : auxProbs) {
+                        p /= auxSum;
+                    }
+                    */
+                    TranscriptGroup tg(txpIDs);
+                    eqBuilder.addGroup(std::move(tg), auxProbs);
                 }
-                TranscriptGroup tg(txpIDs);
-                eqBuilder.addGroup(std::move(tg), auxProbs);
             }
 
             validHits += (jointHits.size() > 0);
@@ -193,7 +268,9 @@
             if (numObservedFragments % 500000 == 0) {
                 iomutex.lock();
                 fmt::print(stderr, "\033[A\r\rprocessed {} fragments\n", numObservedFragments);
-                fmt::print(stderr, "hits per frag: {}", totalHits / static_cast<double>(prevObservedFrags));
+                fmt::print(stderr, "hits: {}; hits per frag: {}",
+                        totalHits,
+                        totalHits / static_cast<double>(prevObservedFrags));
                 iomutex.unlock();
             }
@@ -220,7 +297,7 @@ void processReadsQuasi(single_parser* parser,
 
     auto& numObservedFragments = readExp.numObservedFragmentsAtomic();
     auto& validHits = readExp.numMappedFragmentsAtomic();
-    auto&totalHits = readExp.numFragHitsAtomic();
+    auto& totalHits = readExp.numFragHitsAtomic();
     auto& upperBoundHits = readExp.upperBoundHitsAtomic();
     auto& eqBuilder = readExp.equivalenceClassBuilder();
@@ -275,7 +352,9 @@ void processReadsQuasi(single_parser* parser,
             if (numObservedFragments % 500000 == 0) {
                 iomutex.lock();
                 fmt::print(stderr, "\033[A\r\rprocessed {} fragments\n", numObservedFragments);
-                fmt::print(stderr, "hits per frag: {}", totalHits / static_cast<double>(prevObservedFrags));
+                fmt::print(stderr, "hits: {}; hits per frag: {}",
+                        totalHits,
+                        totalHits / static_cast<double>(prevObservedFrags));
                 iomutex.unlock();
             }
@@ -285,19 +364,40 @@
         }
     }
 
-void setEffectiveLengthsTrivial(ReadExperiment& readExp,
+std::vector<double> getNormalFragLengthDist(
+        const SailfishOpts& sfOpts) {
+
+    std::vector<double> correctionFactors(sfOpts.maxFragLen, 0.0);
+    auto maxLen = sfOpts.maxFragLen;
+    auto mean = sfOpts.fragLenDistPriorMean;
+    auto sd = sfOpts.fragLenDistPriorSD;
+
+    auto kernel = [mean, sd](double p) -> double {
+        double invStd = 1.0 / sd;
+        double x = invStd * (p - mean);
+        return std::exp(-0.5 * x * x) * invStd;
+    };
+
+    double cumulativeMass{0.0};
+    double cumulativeDensity{0.0};
+    for (size_t i = 0; i < sfOpts.maxFragLen; ++i) {
+        auto d = kernel(static_cast<double>(i));
+        cumulativeMass += i * d;
+        cumulativeDensity += d;
+        if (cumulativeDensity > 0) {
+            correctionFactors[i] = cumulativeMass / cumulativeDensity;
+        }
+    }
+    return correctionFactors;
+}
+
+void setEffectiveLengthsDirect(ReadExperiment& readExp,
         const SailfishOpts& sfOpts) {
     auto& transcripts = readExp.transcripts();
     for(size_t txpID = 0; txpID < transcripts.size(); ++txpID) {
         auto& txp = transcripts[txpID];
         double refLen = txp.RefLength;
-        if (txp.RefLength <= sfOpts.fragLenDistPriorMean) {
-            txp.EffectiveLength = refLen;
-        } else {
-            // Maybe convolve this with the normal given the variance
-            // provided by the user.
-            txp.EffectiveLength = refLen - sfOpts.fragLenDistPriorMean;
-        }
+        txp.EffectiveLength = txp.RefLength;
     }
 }
@@ -353,54 +453,52 @@ void computeEmpiricalEffectiveLengths(
     });
 }
 
-
-void computeSmoothedEffectiveLengths(
+std::vector<double> correctionFactorsFromCounts(
         const SailfishOpts& sfOpts,
-        std::vector<Transcript>& transcripts,
         std::map<uint32_t, uint32_t>& jointMap) {
+    auto maxLen = sfOpts.maxFragLen;
 
-    auto maxLen = sfOpts.maxFragLen;
-
-    std::vector<double> correctionFactors(maxLen, 0.0);
-    std::vector<double> vals(maxLen, 0.0);
-    std::vector<uint32_t> multiplicities(maxLen, 0);
+    std::vector<double> correctionFactors(maxLen, 0.0);
+    std::vector<double> vals(maxLen, 0.0);
+    std::vector<uint32_t> multiplicities(maxLen, 0);
 
-    auto valIt = jointMap.find(0);
-    if (valIt != jointMap.end()) {
-        multiplicities[0] = valIt->second;
-    } else {
-        multiplicities[0] = 0;
-    }
+    auto valIt = jointMap.find(0);
+    if (valIt != jointMap.end()) {
+        multiplicities[0] = valIt->second;
+    } else {
+        multiplicities[0] = 0;
+    }
 
-    sfOpts.jointLog->info(
-            "Computing effective length factors --- max length = {}", maxLen);
-    uint32_t v{0};
-    for (size_t i = 1; i < maxLen; ++i) {
-        valIt = jointMap.find(i);
-        if (valIt == jointMap.end()) {
-            v = 0;
-        } else {
-            v = valIt->second;
-        }
-        vals[i] = static_cast<double>(v * i) + vals[i-1];
-        multiplicities[i] = v + multiplicities[i-1];
-        if (multiplicities[i] > 0) {
-            correctionFactors[i] = vals[i] / static_cast<double>(multiplicities[i]);
-        }
-    }
-    sfOpts.jointLog->info("finished computing effective length factors");
-    sfOpts.jointLog->info("mean fragment length = {}", correctionFactors[maxLen-1]);
+    sfOpts.jointLog->info(
+            "Computing effective length factors --- max length = {}",
+            maxLen);
+    uint32_t v{0};
+    for (size_t i = 1; i < maxLen; ++i) {
+        valIt = jointMap.find(i);
+        if (valIt == jointMap.end()) {
+            v = 0;
+        } else {
+            v = valIt->second;
+        }
+        vals[i] = static_cast<double>(v * i) + vals[i-1];
+        multiplicities[i] = v + multiplicities[i-1];
+        if (multiplicities[i] > 0) {
+            correctionFactors[i] = vals[i] / static_cast<double>(multiplicities[i]);
+        }
+    }
+    sfOpts.jointLog->info("finished computing effective length factors");
+    sfOpts.jointLog->info("mean fragment length = {}", correctionFactors[maxLen-1]);
 
-    /*
-    std::ofstream fldFile("fld.txt");
-    for (size_t i = 1; i < maxLen; ++i) {
-        fldFile << i << '\t' << correctionFactors[i] << '\n';
-    }
-    fldFile.close();
-    */
+    return correctionFactors;
+}
 
+void computeSmoothedEffectiveLengths(
+        const SailfishOpts& sfOpts,
+        std::vector<Transcript>& transcripts,
+        std::vector<double>& correctionFactors) {
+
+    auto maxLen = sfOpts.maxFragLen;
     using BlockedIndexRange = tbb::blocked_range<size_t>;
     tbb::task_scheduler_init tbbScheduler(sfOpts.numThreads);
     sfOpts.jointLog->info("Estimating effective lengths");
@@ -426,6 +524,15 @@ void computeSmoothedEffectiveLengths(
     });
 }
 
+std::vector<std::string> split(const std::string &s, char delim) {
+    std::stringstream ss(s);
+    std::string item;
+    std::vector<std::string> elems;
+    while (std::getline(ss, item, delim)) {
+        elems.push_back(item);
+    }
+    return elems;
+}
 
 void quasiMapReads(
@@ -443,8 +550,8 @@ void quasiMapReads(
     std::unique_ptr<single_parser> singleParserPtr{nullptr};
 
     // Remember the fragment lengths that we see in each thread
-    std::vector<FragLengthCountMap> flMaps(numThreads);
-
+    //std::vector<FragLengthCountMap> flMaps(numThreads);
+    FragLengthCountMap flMap(sfOpts.maxFragLen, 0);
 
     // If the read library is paired-end
     // ------ Paired-end --------
@@ -469,7 +576,8 @@ void quasiMapReads(
                     paired_parser(4 * numThreads, maxReadGroup, concurrentFile,
                     pairFileList, pairFileList+numFiles));
 
-        std::atomic<int32_t> remainingFLOps{10000};
+        int32_t numRequiredFLDObs{10000};
+        std::atomic<int32_t> remainingFLOps{numRequiredFLDObs};
 
         for(int i = 0; i < numThreads; ++i) {
            // NOTE: we *must* capture i by value here, b/c it can (sometimes, does)
@@ -480,7 +588,7 @@ void quasiMapReads(
                         readExp,
                         rl,
                         sfOpts,
-                        flMaps[i],
+                        flMap,
                         remainingFLOps,
                         iomutex);
             };
@@ -490,13 +598,11 @@ void quasiMapReads(
 
         size_t totalObs{0};
         std::map<uint32_t, uint32_t> jointMap;
-        for(int i = 0; i < numThreads; ++i) {
-            threads[i].join();
-            auto& flMap = flMaps[i];
-            for (auto& kv : flMap) {
-                jointMap[kv.first] += kv.second;
-                totalObs += kv.second;
-            }
+        for(int i = 0; i < numThreads; ++i) { threads[i].join(); }
+
+        for (size_t i = 0; i < flMap.size(); ++i) {
+            jointMap[i] = flMap[i];
+            totalObs += flMap[i];
         }
 
         sfOpts.jointLog->info("Gathered fragment lengths from all threads");
@@ -508,21 +614,28 @@ void quasiMapReads(
         // Note: if "noEffectiveLengthCorrection" is set, so that these values
         // won't matter anyway, then don't bother computing this "expensive"
         // version.
-        const size_t numRequiredFLDObs{10000};
-        if (totalObs >= numRequiredFLDObs and !sfOpts.noEffectiveLengthCorrection) {
-
-            if (sfOpts.useUnsmoothedFLD) {
-                computeEmpiricalEffectiveLengths(sfOpts, readExp.transcripts(), jointMap);
+        if (sfOpts.noEffectiveLengthCorrection) {
+            setEffectiveLengthsDirect(readExp, sfOpts);
+        } else {
+            // We didn't have sufficient observations; use the provided
+            // values
+            if (remainingFLOps > 0) {
+                sfOpts.jointLog->warn("Sailfish saw fewer than {} uniquely mapped reads "
+                        "so {} will be used as the mean fragment length and {} as "
+                        "the standard deviation for effective length correction",
+                        numRequiredFLDObs,
+                        sfOpts.fragLenDistPriorMean,
+                        sfOpts.fragLenDistPriorSD);
+                auto correctionFactors = getNormalFragLengthDist(sfOpts);
+                computeSmoothedEffectiveLengths(sfOpts, readExp.transcripts(), correctionFactors);
             } else {
-                computeSmoothedEffectiveLengths(sfOpts, readExp.transcripts(), jointMap);
+                if (sfOpts.useUnsmoothedFLD) {
+                    computeEmpiricalEffectiveLengths(sfOpts, readExp.transcripts(), jointMap);
+                } else {
+                    auto correctionFactors = correctionFactorsFromCounts(sfOpts, jointMap);
+                    computeSmoothedEffectiveLengths(sfOpts, readExp.transcripts(), correctionFactors);
+                }
             }
-
-
-        } else {
-            sfOpts.jointLog->warn("Sailfish saw fewer then {} uniquely mapped reads "
-                    "so {} will be used as the mean fragment length for "
-                    "effective length correction", numRequiredFLDObs,
-                    sfOpts.fragLenDistPriorMean);
-            setEffectiveLengthsTrivial(readExp, sfOpts);
         }
     } // ------ Single-end --------
     else if (rl.format().type == ReadType::SINGLE_END) {
@@ -552,7 +665,12 @@ void quasiMapReads(
             threads.emplace_back(threadFun);
         }
         for(int i = 0; i < numThreads; ++i) { threads[i].join(); }
-        setEffectiveLengthsTrivial(readExp, sfOpts);
+        if (sfOpts.noEffectiveLengthCorrection) {
+            setEffectiveLengthsDirect(readExp, sfOpts);
+        } else {
+            auto correctionFactors = getNormalFragLengthDist(sfOpts);
+            computeSmoothedEffectiveLengths(sfOpts, readExp.transcripts(), correctionFactors);
+        }
     } // ------ END Single-end --------
 }
@@ -566,6 +684,7 @@ int mainQuantify(int argc, char* argv[]) {
     bool biasCorrect{false};
     SailfishOpts sopt;
     sopt.numThreads = std::thread::hardware_concurrency();
+    sopt.allowOrphans = true;
 
     vector<string> unmatedReadFiles;
     vector<string> mate1ReadFiles;
@@ -616,6 +735,7 @@ int mainQuantify(int argc, char* argv[]) {
     "characteristic function over each transcript")
     ("maxFragLen", po::value(&(sopt.maxFragLen))->default_value(1000), "The maximum length of a fragment to consider when "
     "building the empirical fragment length distribution")
+    //("readEqClasses", po::value(&eqClassFile), "Read equivalence classes in directly")
     ("txpAggregationKey", po::value(&txpAggregationKey)->default_value("gene_id"), "When generating the gene-level estimates, "
     "use the provided key for aggregating transcripts. The default is the \"gene_id\" field, but other fields (e.g. \"gene_name\") might "
     "be useful depending on the specifics of the annotation being used. Note: this option only affects aggregation when using a "
@@ -763,18 +883,15 @@ int mainQuantify(int argc, char* argv[]) {
 
         experiment.equivalenceClassBuilder().start();
 
         std::mutex ioMutex;
-        fmt::print(stderr, "\n\n");
-        quasiMapReads(experiment, sopt, ioMutex);
-        fmt::print(stderr, "Done Quasi-Mapping \n\n");
-
-        experiment.equivalenceClassBuilder().finish();
-        // Now that the streaming pass is complete, we have
-        // our initial estimates, and our rich equivalence
-        // classes. Perform further optimization until
-        // convergence.
+        fmt::print(stderr, "\n\n");
+        quasiMapReads(experiment, sopt, ioMutex);
+        fmt::print(stderr, "Done Quasi-Mapping \n\n");
+        experiment.equivalenceClassBuilder().finish();
+
+        // Now that we have our reads mapped and our equivalence
+        // classes, iterate the abundance estimates to convergence.
         CollapsedEMOptimizer optimizer;
         jointLog->info("Starting optimizer");
-        //sailfish::utils::normalizeAlphas(sopt, experiment);
         optimizer.optimize(experiment, sopt, 0.01, 10000);
         jointLog->info("Finished optimizer");
@@ -867,7 +984,60 @@ int mainQuantify(int argc, char* argv[]) {
         std::cerr << "For usage information, try " << argv[0] << " quant --help\nExiting.\n";
         std::exit(1);
     }
+    return 0;
+}
 
-    return 0;
+/**
+void loadEquivClasses(const std::string& eqClassFile,
+        ReadExperiment& readExp) {
+
+    auto& numObservedFragments = readExp.numObservedFragmentsAtomic();
+    auto& validHits = readExp.numMappedFragmentsAtomic();
+    auto& totalHits = readExp.numFragHitsAtomic();
+    auto& upperBoundHits = readExp.upperBoundHitsAtomic();
+
+    auto& eqClassBuilder = readExp.equivalenceClassBuilder();
+
+    auto& transcripts = readExp.transcripts();
+
+    std::unordered_map<std::string, uint32_t> nameMap;
+    size_t i{0};
+    for (auto& t : transcripts) {
+        nameMap[t.RefName] = i;
+        i += 1;
+    }
+
+    std::ifstream ifile(eqClassFile);
+    std::string line;
+    std::cerr << "reading equivalence classes\n";
+    i = 0;
+    while (std::getline(ifile, line)) {
+        auto toks = split(line, '\t');
+
+        std::vector<uint32_t> txpIDs;
+        for (size_t tn=0; tn < toks.size() - 1; ++tn) {
+            txpIDs.push_back(nameMap[toks[tn]]);
+        }
+        std::sort(txpIDs.begin(), txpIDs.end());
+
+        uint32_t count = static_cast<uint32_t>(std::stoul(toks.back()));
+        numObservedFragments += count;
+        validHits += count;
+        totalHits += count;
+        upperBoundHits += count;
+        TranscriptGroup tg(txpIDs);
+        eqClassBuilder.insertGroup(tg, count);
+        if (i % 1000 == 1) {
+            std::cerr << "read " << i << " equivalence classes\n";
+            std::cerr << "[\t";
+            for (auto txp : txpIDs) {
+                std::cerr << txp << '\t';
+            }
+            std::cerr << "] : " << count << "\n";
+        }
+        ++i;
+    }
 }
+**/