Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Optimize extractMates #149

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,12 @@
## NOTE:
This fork of ExpansionHunter contains I/O optimizations that speed up the tool by 2-3x without changing the output.
These optimizations bring down the cost of running ExpansionHunter on very large variants catalogs and/or large numbers of samples.
These changes are hosted here pending review and incorporation into the main ExpansionHunter repo.

Compiled binaries for MacOSX and Linux are available in the [`bin`](https://github.com/bw2/ExpansionHunter/tree/master/bin) directory.

---

# Expansion Hunter: a tool for estimating repeat sizes

There are a number of regions in the human genome consisting of repetitions of
Expand Down
14 changes: 11 additions & 3 deletions ehunter/core/HtsHelpers.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -110,16 +110,24 @@ class Cache_seq_nt16_str_lc

static Cache_seq_nt16_str_lc seq_nt16_str_lc;

Read decodeRead(bam1_t* htsAlignPtr)
{
ReadId decodeReadId(bam1_t* htsAlignPtr) {
const uint32_t samFlag = htsAlignPtr->core.flag;
const bool isFirstMate = samFlag & BAM_FREAD1;
const bool isReversed = samFlag & BAM_FREVERSE;

const char* qname(bam_get_qname(htsAlignPtr));
MateNumber mateNumber = isFirstMate ? MateNumber::kFirstMate : MateNumber::kSecondMate;
ReadId readId(qname, mateNumber);

return readId;
}

Read decodeRead(bam1_t* htsAlignPtr)
{
ReadId readId = decodeReadId(htsAlignPtr);

const uint32_t samFlag = htsAlignPtr->core.flag;
const bool isReversed = samFlag & BAM_FREVERSE;

string bases;

{
Expand Down
1 change: 1 addition & 0 deletions ehunter/core/HtsHelpers.hh
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ namespace htshelpers

LinearAlignmentStats decodeAlignmentStats(bam1_t* htsAlignPtr);
bool isPrimaryAlignment(bam1_t* htsAlignPtr);
ReadId decodeReadId(bam1_t* htsAlignPtr);
Read decodeRead(bam1_t* htsAlignPtr);
ReferenceContigInfo decodeContigInfo(bam_hdr_t* htsHeaderPtr);

Expand Down
71 changes: 59 additions & 12 deletions ehunter/sample/HtsSeekingSampleAnalysis.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -84,9 +84,35 @@ bool checkIfMatesWereMappedNearby(const LinearAlignmentStats& alignmentStats)
return false;
}

bool addMateToRegionIfNearby(
GenomicRegion& mateGenomicRegion, const ReadId& readId, htshelpers::MateRegionToRecover& mateRegionToRecover)
{
const int maxDistance = 2000;
bool isNearby = mateRegionToRecover.genomicRegion.distance(mateGenomicRegion) < maxDistance;

ReadId mateReadId(readId.fragmentId(),
(readId.mateNumber() == MateNumber::kFirstMate) ? MateNumber::kSecondMate : MateNumber::kFirstMate);

if (isNearby) {
mateRegionToRecover.genomicRegion = GenomicRegion(
mateRegionToRecover.genomicRegion.contigIndex(),
std::min(mateRegionToRecover.genomicRegion.start(), mateGenomicRegion.start()),
std::max(mateRegionToRecover.genomicRegion.end(), mateGenomicRegion.end()));

mateRegionToRecover.mateReadIds.emplace(mateReadId);
}

return isNearby;
}

void recoverMates(
htshelpers::MateExtractor& mateExtractor, AlignmentStatsCatalog& alignmentStatsCatalog, ReadPairs& readPairs)
{

// compute a vector of MateRegionToRecover structs, each of which represents a GenomicRegion + a vector of
// ReadIds that need to be recovered from that region.
vector<htshelpers::MateRegionToRecover> mateRegionsToRecover;

for (auto& fragmentIdAndReadPair : readPairs)
{
ReadPair& readPair = fragmentIdAndReadPair.second;
Expand All @@ -105,20 +131,41 @@ void recoverMates(
}
const LinearAlignmentStats& alignmentStats = alignmentStatsIterator->second;

if (!checkIfMatesWereMappedNearby(alignmentStats))
{
LinearAlignmentStats mateStats;
optional<Read> optionalMate = mateExtractor.extractMate(read, alignmentStats, mateStats);
if (optionalMate)
{
const Read& mate = *optionalMate;
if (checkIfMatesWereMappedNearby(alignmentStats)) {
continue;
}

const int32_t mateContigIndex = alignmentStats.isMateMapped ? alignmentStats.mateChromId : alignmentStats.chromId;
const int32_t matePos = alignmentStats.isMateMapped ? alignmentStats.matePos : alignmentStats.pos;

// check if mate is close to other mates so they can be fetched together
bool foundNearbyRegion = false;
for (auto& mateRegionToRecover : mateRegionsToRecover) {
GenomicRegion mateGenomicRegion = GenomicRegion(mateContigIndex, matePos, matePos + read.sequence().length());

if (addMateToRegionIfNearby(mateGenomicRegion, read.readId(), mateRegionToRecover)) {
foundNearbyRegion = true;
break;
}
}
if (!foundNearbyRegion) {
// create a new entry in mateRegionsToRecover
GenomicRegion mateGenomicRegion = GenomicRegion(mateContigIndex, matePos, matePos + read.sequence().length());
htshelpers::MateRegionToRecover newMateRegionToRecover = { mateGenomicRegion, std::unordered_set<ReadId, boost::hash<ReadId>>() };
addMateToRegionIfNearby(mateGenomicRegion, read.readId(), newMateRegionToRecover);

mateRegionsToRecover.push_back(newMateRegionToRecover);
}
}

// fetch mates for the regions
for (auto& mateRegionToRecover : mateRegionsToRecover) {
for (auto& mateAndAlignmentStats : mateExtractor.extractMates(mateRegionToRecover)) {
auto& mate = mateAndAlignmentStats.first;
auto& alignmentStats = mateAndAlignmentStats.second;
alignmentStatsCatalog.emplace(std::make_pair(mate.readId(), alignmentStats));
readPairs.AddMateToExistingRead(mate);
}
else
{
spdlog::warn("Could not recover the mate of {}", read.readId());
}

}
}
}
Expand Down
44 changes: 27 additions & 17 deletions ehunter/sample/MateExtractor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -97,14 +97,16 @@ void MateExtractor::loadIndex()
}
}

optional<Read> MateExtractor::extractMate(
const Read& read, const LinearAlignmentStats& alignmentStats, LinearAlignmentStats& mateStats)
std::vector<std::pair<Read, LinearAlignmentStats>> MateExtractor::extractMates(
const MateRegionToRecover& mateRegionToRecover)
{
const int32_t searchRegionContigIndex
= alignmentStats.isMateMapped ? alignmentStats.mateChromId : alignmentStats.chromId;

const int32_t searchRegionStart = alignmentStats.isMateMapped ? alignmentStats.matePos : alignmentStats.pos;
const int32_t searchRegionEnd = searchRegionStart + 1;
std::unordered_set<ReadId, boost::hash<ReadId>> mateReadIdsNotFoundYet = mateRegionToRecover.mateReadIds;
std::vector<std::pair<Read, LinearAlignmentStats>> extractedMatesAndAlignmentStats;

const int32_t searchRegionContigIndex = mateRegionToRecover.genomicRegion.contigIndex();
const int32_t searchRegionStart = mateRegionToRecover.genomicRegion.start();
const int32_t searchRegionEnd = mateRegionToRecover.genomicRegion.end();

hts_itr_t* htsRegionPtr_
= sam_itr_queryi(htsIndexPtr_, searchRegionContigIndex, searchRegionStart, searchRegionEnd);
Expand All @@ -128,21 +130,29 @@ optional<Read> MateExtractor::extractMate(
continue;
}

Read putativeMate = htshelpers::decodeRead(htsAlignmentPtr_);

const bool belongToSameFragment = read.fragmentId() == putativeMate.fragmentId();
const bool formProperPair = read.mateNumber() != putativeMate.mateNumber();

if (belongToSameFragment && formProperPair)
{
mateStats = decodeAlignmentStats(htsAlignmentPtr_);
hts_itr_destroy(htsRegionPtr_);
return putativeMate;
ReadId putativeMateReadId = decodeReadId(htsAlignmentPtr_);

// if this is one of the fragment ids we're looking for, check if forms a proper pair with the 1st read with
// this fragmentId. If it does, decode this read and add it as a mate
bool foundRequestedMate = mateReadIdsNotFoundYet.count(putativeMateReadId) > 0;
if (foundRequestedMate) {
LinearAlignmentStats mateStats = decodeAlignmentStats(htsAlignmentPtr_);
Read putativeMate = htshelpers::decodeRead(htsAlignmentPtr_);
auto mateAndAlignmentStats = std::make_pair(putativeMate, mateStats);
extractedMatesAndAlignmentStats.push_back(mateAndAlignmentStats);
mateReadIdsNotFoundYet.erase(putativeMate.readId());
if (mateReadIdsNotFoundYet.size() == 0) {
break; // found all requested mates
}
}
}
hts_itr_destroy(htsRegionPtr_);

return optional<Read>();
if (mateReadIdsNotFoundYet.size() > 0) {
throw std::runtime_error("Failed to recover " + std::to_string(mateReadIdsNotFoundYet.size()) + " mates");
}

return extractedMatesAndAlignmentStats;
}

}
Expand Down
13 changes: 13 additions & 0 deletions ehunter/sample/MateExtractor.hh
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ extern "C"
#include "htslib/sam.h"
}

#include "core/GenomicRegion.hh"
#include "core/Read.hh"
#include "core/ReferenceContigInfo.hh"

Expand All @@ -40,6 +41,17 @@ namespace ehunter

namespace htshelpers
{


// Represents one or more mates that need to be recovered from a specific genomic region.
struct MateRegionToRecover {
GenomicRegion genomicRegion;

//stores the ReadId of each mate that needs to be recovered from the genomicRegion
std::unordered_set<ReadId, boost::hash<ReadId>> mateReadIds;
};


class MateExtractor
{
public:
Expand All @@ -48,6 +60,7 @@ public:

boost::optional<Read>
extractMate(const Read& read, const LinearAlignmentStats& alignmentStats, LinearAlignmentStats& mateStats);
std::vector<std::pair<Read, LinearAlignmentStats>> extractMates(const MateRegionToRecover& mateRegionToRecover);

private:
void openFile();
Expand Down