-
Notifications
You must be signed in to change notification settings - Fork 9
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add a simple BAM parser to dnaio #116
Merged
Merged
Changes from all commits
Commits
Show all changes
34 commits
Select commit
Hold shift + click to select a range
cb199f4
Construct BamIter class
rhpvorderman 66b7194
Allow reading
rhpvorderman da86a00
Fix compile errors
rhpvorderman cc81baa
Fix small errors
rhpvorderman 20d0474
Fix segfault and compile issue
rhpvorderman fcd3f75
Fix small size offset issue
rhpvorderman 7e8be76
Use SSSE3 on linux x86_64
rhpvorderman df726da
Add basic reading test
rhpvorderman f82f8f6
Reformatting
rhpvorderman 99c5f09
Add a header parsing test
rhpvorderman 2a2071d
Start on tests for truncated records
rhpvorderman 0b21fe9
Make sure EOF is thrown for truncated headers
rhpvorderman 24b2b34
Test truncated records
rhpvorderman d931ea1
Test truncation before magic
rhpvorderman 4aa4d56
Add more bam parser tests
rhpvorderman d7e1fb0
Reformat setup.py
rhpvorderman b2955b5
Fix mypy issues, create stub for BamIter
rhpvorderman 7ef5606
Add missing header file
rhpvorderman c4a4805
Prevent \r\n line endings in sam file
rhpvorderman 076ae56
Add missing tests
rhpvorderman 59aff44
Address formatting errors and typo's
rhpvorderman e14b58a
Do not set -mssse3 in setup.py
rhpvorderman e4d2b3d
Build linux wheels with ssse3 support
rhpvorderman fd045a3
Fix typo in test
rhpvorderman 739e59a
Throw an error on mapped files
rhpvorderman dd2b4cb
Test mapped reads throw errors
rhpvorderman 0233247
Revert "Build linux wheels with ssse3 support"
rhpvorderman 63eb2bf
Build with -mssse3 on linux
rhpvorderman 89c7ae3
Reformat setup.py
rhpvorderman fae9f8b
Merge branch 'main' into bamparser
rhpvorderman 4e14cc3
Remove erroneous docstring
rhpvorderman f98d72f
Remove unused ignore comment
rhpvorderman 42c6f20
Make sure -mssse3 flag is tested
rhpvorderman 9afbee7
Apply suggestions from code review
marcelm File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,2 +1,3 @@ | ||
*.fastq -crlf | ||
*.fasta -crlf | ||
*.sam -crlf |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
@@ -0,0 +1,127 @@ | ||||||
#include <stdint.h> | ||||||
#include <stddef.h> | ||||||
#include <string.h> | ||||||
#include <assert.h> | ||||||
|
||||||
#ifdef __SSE2__ | ||||||
#include "emmintrin.h" | ||||||
#endif | ||||||
|
||||||
#ifdef __SSSE3__ | ||||||
#include "tmmintrin.h" | ||||||
#endif | ||||||
|
||||||
/**
 * Decode a 4-bit packed BAM nucleotide sequence into ASCII characters.
 *
 * Every encoded byte carries two nucleotides: the upper nibble is the first
 * base, the lower nibble the second. Nibble values index the table
 * "=ACMGRSVTWYHKDBN".
 *
 * @param dest             Output buffer. Exactly `length` bytes are written;
 *                         no terminating NUL is appended.
 * @param encoded_sequence Packed input. (length + 1) / 2 bytes are read.
 * @param length           Number of nucleotides to decode. May be 0.
 */
static void
decode_bam_sequence(uint8_t *dest, const uint8_t *encoded_sequence, size_t length)
{
    /* Reuse a trick from sam_internal.h in htslib. Have a table to look up
       two characters simultaneously. The table is exactly 512 characters,
       so there is deliberately no room for a terminating NUL. */
    static const char code2base[512] =
        "===A=C=M=G=R=S=V=T=W=Y=H=K=D=B=N"
        "A=AAACAMAGARASAVATAWAYAHAKADABAN"
        "C=CACCCMCGCRCSCVCTCWCYCHCKCDCBCN"
        "M=MAMCMMMGMRMSMVMTMWMYMHMKMDMBMN"
        "G=GAGCGMGGGRGSGVGTGWGYGHGKGDGBGN"
        "R=RARCRMRGRRRSRVRTRWRYRHRKRDRBRN"
        "S=SASCSMSGSRSSSVSTSWSYSHSKSDSBSN"
        "V=VAVCVMVGVRVSVVVTVWVYVHVKVDVBVN"
        "T=TATCTMTGTRTSTVTTTWTYTHTKTDTBTN"
        "W=WAWCWMWGWRWSWVWTWWWYWHWKWDWBWN"
        "Y=YAYCYMYGYRYSYVYTYWYYYHYKYDYBYN"
        "H=HAHCHMHGHRHSHVHTHWHYHHHKHDHBHN"
        "K=KAKCKMKGKRKSKVKTKWKYKHKKKDKBKN"
        "D=DADCDMDGDRDSDVDTDWDYDHDKDDDBDN"
        "B=BABCBMBGBRBSBVBTBWBYBHBKBDBBBN"
        "N=NANCNMNGNRNSNVNTNWNYNHNKNDNBNN";
    /* Single-nibble lookup: 16 bases followed by the literal's NUL. */
    static const uint8_t nuc_lookup[] = "=ACMGRSVTWYHKDBN";
    const uint8_t *dest_end_ptr = dest + length;
    uint8_t *dest_cursor = dest;
    const uint8_t *encoded_cursor = encoded_sequence;
#ifdef __SSSE3__
    /* A shuffle index with the high bit set (-1 == 0xff) yields a zero byte. */
    const __m128i first_upper_shuffle = _mm_setr_epi8(
        0, -1, 1, -1, 2, -1, 3, -1, 4, -1, 5, -1, 6, -1, 7, -1);
    const __m128i first_lower_shuffle = _mm_setr_epi8(
        -1, 0, -1, 1, -1, 2, -1, 3, -1, 4, -1, 5, -1, 6, -1, 7);
    const __m128i second_upper_shuffle = _mm_setr_epi8(
        8, -1, 9, -1, 10, -1, 11, -1, 12, -1, 13, -1, 14, -1, 15, -1);
    const __m128i second_lower_shuffle = _mm_setr_epi8(
        -1, 8, -1, 9, -1, 10, -1, 11, -1, 12, -1, 13, -1, 14, -1, 15);
    const __m128i nuc_lookup_vec = _mm_lddqu_si128((const __m128i *)nuc_lookup);
    const __m128i nibble_mask = _mm_set1_epi8(0x0f);
    /* Work on 16 encoded characters at a time resulting in 32 decoded characters
       Examples are given for 8 encoded characters A until H to keep it readable.
       Encoded stored as |AB|CD|EF|GH|
       Shuffle into |AB|00|CD|00|EF|00|GH|00| and
                    |00|AB|00|CD|00|EF|00|GH|
       Shift upper to the right resulting into
                    |0A|B0|0C|D0|0E|F0|0G|H0| and
                    |00|AB|00|CD|00|EF|00|GH|
       Merge with or resulting into (X stands for garbage)
                    |0A|XB|0C|XD|0E|XF|0G|XH|
       Bitwise and with 0b1111 leads to:
                    |0A|0B|0C|0D|0E|0F|0G|0H|
       We can use the resulting 4-bit integers as indexes for the shuffle of
       the nucleotide lookup. */
    /* Compare the remaining byte count instead of computing
       `dest_end_ptr - 32`, which would form a pointer before the start of
       the buffer (undefined behaviour) for sequences shorter than 32 bases.
       With at least 32 output bytes left there are at least 16 encoded bytes
       left, so the 16-byte load stays within the input. */
    while ((size_t)(dest_end_ptr - dest_cursor) >= 2 * sizeof(__m128i)) {
        __m128i encoded = _mm_lddqu_si128((const __m128i *)encoded_cursor);

        __m128i first_upper = _mm_shuffle_epi8(encoded, first_upper_shuffle);
        __m128i first_lower = _mm_shuffle_epi8(encoded, first_lower_shuffle);
        __m128i shifted_first_upper = _mm_srli_epi64(first_upper, 4);
        __m128i first_merged = _mm_or_si128(shifted_first_upper, first_lower);
        __m128i first_indexes = _mm_and_si128(first_merged, nibble_mask);
        __m128i first_nucleotides = _mm_shuffle_epi8(nuc_lookup_vec, first_indexes);
        _mm_storeu_si128((__m128i *)dest_cursor, first_nucleotides);

        __m128i second_upper = _mm_shuffle_epi8(encoded, second_upper_shuffle);
        __m128i second_lower = _mm_shuffle_epi8(encoded, second_lower_shuffle);
        __m128i shifted_second_upper = _mm_srli_epi64(second_upper, 4);
        __m128i second_merged = _mm_or_si128(shifted_second_upper, second_lower);
        __m128i second_indexes = _mm_and_si128(second_merged, nibble_mask);
        __m128i second_nucleotides = _mm_shuffle_epi8(nuc_lookup_vec, second_indexes);
        _mm_storeu_si128((__m128i *)(dest_cursor + 16), second_nucleotides);

        encoded_cursor += sizeof(__m128i);
        dest_cursor += 2 * sizeof(__m128i);
    }
#endif
    /* Do two at a time until the last even offset is reached. */
    const uint8_t *dest_end_ptr_twoatatime = dest + (length & ~(size_t)1);
    while (dest_cursor < dest_end_ptr_twoatatime) {
        /* According to htslib, size_t cast helps the optimizer.
           Code confirmed to indeed run faster. */
        memcpy(dest_cursor, code2base + ((size_t)*encoded_cursor * 2), 2);
        dest_cursor += 2;
        encoded_cursor += 1;
    }
    assert((dest_end_ptr - dest_cursor) < 2);
    if (dest_cursor != dest_end_ptr) {
        /* Odd length: one nucleotide is left, stored in the upper nibble. */
        uint8_t encoded_nucs = *encoded_cursor;
        uint8_t upper_nuc_index = encoded_nucs >> 4;
        dest_cursor[0] = nuc_lookup[upper_nuc_index];
    }
}
|
||||||
/**
 * Convert raw BAM quality values into ASCII by adding 33 to every byte.
 *
 * The addition wraps modulo 256 in both the vector and the scalar path.
 *
 * @param dest              Output buffer. Exactly `length` bytes are written;
 *                          no terminating NUL is appended.
 * @param encoded_qualities Input buffer of `length` raw quality bytes.
 * @param length            Number of quality values to convert. May be 0.
 */
static void
decode_bam_qualities(uint8_t *dest, const uint8_t *encoded_qualities, size_t length)
{
    const uint8_t *end_ptr = encoded_qualities + length;
    const uint8_t *cursor = encoded_qualities;
    uint8_t *dest_cursor = dest;
#ifdef __SSE2__
    const __m128i phred_offset = _mm_set1_epi8(33);
    /* Compare the remaining byte count instead of computing `end_ptr - 16`,
       which would form a pointer before the start of the buffer (undefined
       behaviour) for inputs shorter than 16 bytes. */
    while ((size_t)(end_ptr - cursor) >= sizeof(__m128i)) {
        __m128i quals = _mm_loadu_si128((const __m128i *)cursor);
        __m128i phreds = _mm_add_epi8(quals, phred_offset);
        _mm_storeu_si128((__m128i *)dest_cursor, phreds);
        cursor += sizeof(__m128i);
        dest_cursor += sizeof(__m128i);
    }
#endif
    /* Scalar tail (or the whole input when SSE2 is unavailable). */
    while (cursor < end_ptr) {
        *dest_cursor = (uint8_t)(*cursor + 33);
        cursor += 1;
        dest_cursor += 1;
    }
}
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Missing newline at end of file