Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add a simple BAM parser to dnaio #116

Merged
merged 34 commits into from
Nov 2, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
cb199f4
Construct BamIter class
rhpvorderman Sep 25, 2023
66b7194
Allow reading
rhpvorderman Sep 25, 2023
da86a00
Fix compile errors
rhpvorderman Sep 25, 2023
cc81baa
Fix small errors
rhpvorderman Sep 25, 2023
20d0474
Fix segfault and compile issue
rhpvorderman Sep 25, 2023
fcd3f75
Fix small size offset issue
rhpvorderman Sep 25, 2023
7e8be76
Use SSSE3 on linux x86_64
rhpvorderman Sep 25, 2023
df726da
Add basic reading test
rhpvorderman Sep 25, 2023
f82f8f6
Reformatting
rhpvorderman Sep 25, 2023
99c5f09
Add a header parsing test
rhpvorderman Sep 25, 2023
2a2071d
Start on tests for truncated records
rhpvorderman Sep 25, 2023
0b21fe9
Make sure EOF is thrown for truncated headers
rhpvorderman Sep 25, 2023
24b2b34
Test truncated records
rhpvorderman Sep 25, 2023
d931ea1
Test truncation before magic
rhpvorderman Sep 26, 2023
4aa4d56
Add more bam parser tests
rhpvorderman Sep 26, 2023
d7e1fb0
Reformat setup.py
rhpvorderman Sep 26, 2023
b2955b5
Fix mypy issues, create stub for BamIter
rhpvorderman Sep 26, 2023
7ef5606
Add missing header file
rhpvorderman Sep 26, 2023
c4a4805
Prevent \r\n line endings in sam file
rhpvorderman Sep 26, 2023
076ae56
Add missing tests
rhpvorderman Sep 26, 2023
59aff44
Address formatting errors and typo's
rhpvorderman Oct 17, 2023
e14b58a
Do not set -mssse3 in setup.py
rhpvorderman Oct 17, 2023
e4d2b3d
Build linux wheels with ssse3 support
rhpvorderman Oct 17, 2023
fd045a3
Fix typo in test
rhpvorderman Oct 17, 2023
739e59a
Throw an error on mapped files
rhpvorderman Oct 17, 2023
dd2b4cb
Test mapped reads throw errors
rhpvorderman Oct 17, 2023
0233247
Revert "Build linux wheels with ssse3 support"
rhpvorderman Oct 17, 2023
63eb2bf
Build with -mssse3 on linux
rhpvorderman Oct 17, 2023
89c7ae3
Reformat setup.py
rhpvorderman Oct 17, 2023
fae9f8b
Merge branch 'main' into bamparser
rhpvorderman Oct 17, 2023
4e14cc3
Remove erroneous docstring
rhpvorderman Oct 17, 2023
f98d72f
Remove unused ignore comment
rhpvorderman Oct 17, 2023
42c6f20
Make sure -mssse3 flag is tested
rhpvorderman Oct 17, 2023
9afbee7
Apply suggestions from code review
marcelm Nov 2, 2023
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitattributes
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
*.fastq -crlf
*.fasta -crlf
*.sam -crlf
6 changes: 6 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -51,11 +51,15 @@ jobs:
matrix:
os: [ubuntu-latest]
python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
compile_flags: [""]
include:
- os: macos-latest
python-version: "3.10"
- os: windows-latest
python-version: "3.10"
- os: ubuntu-latest
python-version: "3.10"
compile_flags: "-mssse3"
steps:
- uses: actions/checkout@v3
- name: Set up Python ${{ matrix.python-version }}
Expand All @@ -66,6 +70,8 @@ jobs:
run: python -m pip install tox
- name: Test
run: tox -e py
env:
CFLAGS: ${{ matrix.compile_flags }}
- name: Upload coverage report
uses: codecov/codecov-action@v3

Expand Down
5 changes: 4 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,10 @@ write_to = "src/dnaio/_version.py"
testpaths = ["tests"]

[tool.cibuildwheel]
environment = "CFLAGS=-g0"
environment_windows = "CFLAGS=-g0 -DNDEBUG"
environment_macos = "CFLAGS=-g0 -DNDEBUG"
environment_linux = "CFLAGS=-g0 -DNDEBUG -mssse3"

test-requires = "pytest"
test-command = ["cd {project}", "pytest tests"]

Expand Down
3 changes: 2 additions & 1 deletion src/dnaio/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

__all__ = [
"open",
"BamReader",
"SequenceRecord",
"SingleEndReader",
"PairedEndReader",
Expand Down Expand Up @@ -41,7 +42,7 @@
)
from ._core import record_names_match # noqa: F401 # deprecated
from ._core import records_are_mates
from .readers import FastaReader, FastqReader
from .readers import BamReader, FastaReader, FastqReader
from .writers import FastaWriter, FastqWriter
from .singleend import _open_single
from .pairedend import (
Expand Down
9 changes: 9 additions & 0 deletions src/dnaio/_core.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,15 @@ class FastqIter(Generic[T]):
@property
def number_of_records(self) -> int: ...

class BamIter:
def __init__(self, file: BinaryIO, buffer_size: int): ...
def __iter__(self) -> Iterator[SequenceRecord]: ...
def __next__(self) -> SequenceRecord: ...
@property
def header(self) -> bytes: ...
@property
def number_of_records(self) -> int: ...

# Deprecated
def record_names_match(header1: str, header2: str) -> bool: ...

Expand Down
168 changes: 168 additions & 0 deletions src/dnaio/_core.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ from cpython.object cimport Py_TYPE, PyTypeObject
from cpython.ref cimport PyObject
from cpython.tuple cimport PyTuple_GET_ITEM
from libc.string cimport memcmp, memcpy, memchr, strcspn, strspn, memmove
from libc.stdint cimport uint8_t, uint16_t, uint32_t, int32_t

cimport cython

cdef extern from "Python.h":
Expand All @@ -21,6 +23,10 @@ cdef extern from "ascii_check.h":
cdef extern from "_conversions.h":
const char NUCLEOTIDE_COMPLEMENTS[256]

cdef extern from "bam.h":
void decode_bam_sequence(void *dest, void *encoded_sequence, size_t length)
void decode_bam_qualities(uint8_t *dest, uint8_t *encoded_qualities, size_t length)

from .exceptions import FastqFormatError
from ._util import shorten

Expand Down Expand Up @@ -667,6 +673,168 @@ cdef class FastqIter:
self.record_start = qualities_end + 1
return ret_val

cdef struct BamRecordHeader:
uint32_t block_size
int32_t reference_id
int32_t pos
uint8_t l_read_name
uint8_t mapq
uint16_t bin
uint16_t n_cigar_op
uint16_t flag
uint32_t l_seq
int32_t next_ref_id
int32_t next_pos
int32_t tlen

cdef class BamIter:
cdef:
uint8_t *record_start
uint8_t *buffer_end
size_t read_in_size
uint8_t *read_in_buffer
size_t read_in_buffer_size
object file
readonly object header
readonly Py_ssize_t number_of_records

def __dealloc__(self):
PyMem_Free(self.read_in_buffer)

def __cinit__(self, fileobj, read_in_size = 48 * 1024):
if read_in_size < 4:
raise ValueError(f"read_in_size must be at least 4 got "
f"{read_in_size}")

# Skip ahead and save the BAM header for later inspection
magic_and_header_size = fileobj.read(8)
if not isinstance(magic_and_header_size, bytes):
raise TypeError(f"fileobj {fileobj} is not a binary IO type, "
f"got {type(fileobj)}")
if len(magic_and_header_size) < 8:
raise EOFError("Truncated BAM file")
if magic_and_header_size[:4] != b"BAM\1":
raise ValueError(
f"fileobj: {fileobj}, is not a BAM file. No BAM magic, instead "
f"found {magic_and_header_size[:4]}")
l_text = int.from_bytes(magic_and_header_size[4:], "little", signed=False)
header = fileobj.read(l_text)
if len(header) < l_text:
raise EOFError("Truncated BAM file")
n_ref_obj = fileobj.read(4)
if len(n_ref_obj) < 4:
raise EOFError("Truncated BAM file")
n_ref = int.from_bytes(n_ref_obj, "little", signed=False)
for i in range(n_ref):
l_name_obj = fileobj.read(4)
if len(l_name_obj) < 4:
raise EOFError("Truncated BAM file")
l_name = int.from_bytes(l_name_obj, "little", signed=False)
reference_chunk_size = l_name + 4 # Include name and uint32_t of size
reference_chunk = fileobj.read(reference_chunk_size)
if len(reference_chunk) < reference_chunk_size:
raise EOFError("Truncated BAM file")
# Fileobj is now skipped ahead and at the start of the BAM records

self.header = header
self.read_in_size = read_in_size
self.file = fileobj
self.read_in_buffer = NULL
self.read_in_buffer_size = 0
self.record_start = self.read_in_buffer
self.buffer_end = self.record_start

def __iter__(self):
return self

cdef _read_into_buffer(self):
cdef size_t read_in_size
cdef size_t leftover_size = self.buffer_end - self.record_start
cdef uint32_t block_size
memmove(self.read_in_buffer, self.record_start, leftover_size)
self.record_start = self.read_in_buffer
self.buffer_end = self.record_start + leftover_size
if leftover_size >= 4:
# Immediately check how much data is at least required
block_size = (<uint32_t *>self.record_start)[0]
read_in_size = max(block_size, self.read_in_size)
else:
read_in_size = self.read_in_size - leftover_size
new_bytes = self.file.read(read_in_size)
cdef size_t new_bytes_size = PyBytes_GET_SIZE(new_bytes)
cdef uint8_t *new_bytes_buf = <uint8_t *>PyBytes_AS_STRING(new_bytes)
cdef size_t new_buffer_size = leftover_size + new_bytes_size
if new_buffer_size == 0:
# File completely read
raise StopIteration()
elif new_bytes_size == 0:
raise EOFError("Incomplete record at the end of file")
cdef uint8_t *tmp
if new_buffer_size > self.read_in_buffer_size:
tmp = <uint8_t *>PyMem_Realloc(self.read_in_buffer, new_buffer_size)
if tmp == NULL:
raise MemoryError()
self.read_in_buffer = tmp
self.read_in_buffer_size = new_buffer_size
memcpy(self.read_in_buffer + leftover_size, new_bytes_buf, new_bytes_size)
self.record_start = self.read_in_buffer
self.buffer_end = self.record_start + new_buffer_size

def __next__(self):
cdef:
SequenceRecord seq_record
uint8_t *record_start
uint8_t *buffer_end
uint32_t record_size
uint8_t *record_end
cdef BamRecordHeader header
cdef uint8_t *bam_name_start
uint32_t name_length
uint8_t *bam_seq_start
uint32_t seq_length
uint8_t *bam_qual_start
uint32_t encoded_seq_length

while True:
record_start = self.record_start
buffer_end = self.buffer_end
if record_start + 4 >= buffer_end:
self._read_into_buffer()
continue
record_size = (<uint32_t *>record_start)[0]
record_end = record_start + 4 + record_size
if record_end > buffer_end:
self._read_into_buffer()
continue
header = (<BamRecordHeader *>record_start)[0]
if header.flag != 4:
raise NotImplementedError(
"The BAM parser has been implemented with unmapped single "
"reads in mind to support ONT sequencing input. Mapped "
"BAM files or files with multiple reads are not supported. "
"Please use samtools fastq to make the appropriate "
"conversion to FASTQ format."
)
bam_name_start = record_start + sizeof(BamRecordHeader)
name_length = header.l_read_name
bam_seq_start = bam_name_start + name_length + header.n_cigar_op * sizeof(uint32_t)
name_length -= 1 # Do not include the null byte
seq_length = header.l_seq
encoded_seq_length = (seq_length + 1) // 2
bam_qual_start = bam_seq_start + encoded_seq_length
name = PyUnicode_New(name_length, 127)
sequence = PyUnicode_New(seq_length, 127)
qualities = PyUnicode_New(seq_length, 127)
memcpy(<uint8_t *>PyUnicode_DATA(name), bam_name_start, name_length)
decode_bam_sequence(<uint8_t *>PyUnicode_DATA(sequence), bam_seq_start, seq_length)
decode_bam_qualities(<uint8_t *>PyUnicode_DATA(qualities), bam_qual_start, seq_length)
seq_record = SequenceRecord.__new__(SequenceRecord)
seq_record._name = name
seq_record._sequence = sequence
seq_record._qualities = qualities
self.number_of_records += 1
self.record_start = record_end
return seq_record

def record_names_match(header1: str, header2: str):
"""
Expand Down
127 changes: 127 additions & 0 deletions src/dnaio/bam.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
#include <stdint.h>
#include <stddef.h>
#include <string.h>
#include <assert.h>

#ifdef __SSE2__
#include "emmintrin.h"
#endif

#ifdef __SSSE3__
#include "tmmintrin.h"
#endif

static void
decode_bam_sequence(uint8_t *dest, const uint8_t *encoded_sequence, size_t length)
{
/* Reuse a trick from sam_internal.h in htslib. Have a table to lookup
two characters simultaneously.*/
static const char code2base[512] =
"===A=C=M=G=R=S=V=T=W=Y=H=K=D=B=N"
"A=AAACAMAGARASAVATAWAYAHAKADABAN"
"C=CACCCMCGCRCSCVCTCWCYCHCKCDCBCN"
"M=MAMCMMMGMRMSMVMTMWMYMHMKMDMBMN"
"G=GAGCGMGGGRGSGVGTGWGYGHGKGDGBGN"
"R=RARCRMRGRRRSRVRTRWRYRHRKRDRBRN"
"S=SASCSMSGSRSSSVSTSWSYSHSKSDSBSN"
"V=VAVCVMVGVRVSVVVTVWVYVHVKVDVBVN"
"T=TATCTMTGTRTSTVTTTWTYTHTKTDTBTN"
"W=WAWCWMWGWRWSWVWTWWWYWHWKWDWBWN"
"Y=YAYCYMYGYRYSYVYTYWYYYHYKYDYBYN"
"H=HAHCHMHGHRHSHVHTHWHYHHHKHDHBHN"
"K=KAKCKMKGKRKSKVKTKWKYKHKKKDKBKN"
"D=DADCDMDGDRDSDVDTDWDYDHDKDDDBDN"
"B=BABCBMBGBRBSBVBTBWBYBHBKBDBBBN"
"N=NANCNMNGNRNSNVNTNWNYNHNKNDNBNN";
static const uint8_t *nuc_lookup = (uint8_t *)"=ACMGRSVTWYHKDBN";
const uint8_t *dest_end_ptr = dest + length;
uint8_t *dest_cursor = dest;
const uint8_t *encoded_cursor = encoded_sequence;
#ifdef __SSSE3__
const uint8_t *dest_vec_end_ptr = dest_end_ptr - (2 * sizeof(__m128i));
__m128i first_upper_shuffle = _mm_setr_epi8(
0, 0xff, 1, 0xff, 2, 0xff, 3, 0xff, 4, 0xff, 5, 0xff, 6, 0xff, 7, 0xff);
__m128i first_lower_shuffle = _mm_setr_epi8(
0xff, 0, 0xff, 1, 0xff, 2, 0xff, 3, 0xff, 4, 0xff, 5, 0xff, 6, 0xff, 7);
__m128i second_upper_shuffle = _mm_setr_epi8(
8, 0xff, 9, 0xff, 10, 0xff, 11, 0xff, 12, 0xff, 13, 0xff, 14, 0xff, 15, 0xff);
__m128i second_lower_shuffle = _mm_setr_epi8(
0xff, 8, 0xff, 9, 0xff, 10, 0xff, 11, 0xff, 12, 0xff, 13, 0xff, 14, 0xff, 15);
__m128i nuc_lookup_vec = _mm_lddqu_si128((__m128i *)nuc_lookup);
/* Work on 16 encoded characters at the time resulting in 32 decoded characters
Examples are given for 8 encoded characters A until H to keep it readable.
Encoded stored as |AB|CD|EF|GH|
Shuffle into |AB|00|CD|00|EF|00|GH|00| and
|00|AB|00|CD|00|EF|00|GH|
Shift upper to the right resulting into
|0A|B0|0C|D0|0E|F0|0G|H0| and
|00|AB|00|CD|00|EF|00|GH|
Merge with or resulting into (X stands for garbage)
|0A|XB|0C|XD|0E|XF|0G|XH|
Bitwise and with 0b1111 leads to:
|0A|0B|0C|0D|0E|0F|0G|0H|
We can use the resulting 4-bit integers as indexes for the shuffle of
the nucleotide lookup. */
while (dest_cursor < dest_vec_end_ptr) {
__m128i encoded = _mm_lddqu_si128((__m128i *)encoded_cursor);

__m128i first_upper = _mm_shuffle_epi8(encoded, first_upper_shuffle);
__m128i first_lower = _mm_shuffle_epi8(encoded, first_lower_shuffle);
__m128i shifted_first_upper = _mm_srli_epi64(first_upper, 4);
__m128i first_merged = _mm_or_si128(shifted_first_upper, first_lower);
__m128i first_indexes = _mm_and_si128(first_merged, _mm_set1_epi8(0b1111));
__m128i first_nucleotides = _mm_shuffle_epi8(nuc_lookup_vec, first_indexes);
_mm_storeu_si128((__m128i *)dest_cursor, first_nucleotides);

__m128i second_upper = _mm_shuffle_epi8(encoded, second_upper_shuffle);
__m128i second_lower = _mm_shuffle_epi8(encoded, second_lower_shuffle);
__m128i shifted_second_upper = _mm_srli_epi64(second_upper, 4);
__m128i second_merged = _mm_or_si128(shifted_second_upper, second_lower);
__m128i second_indexes = _mm_and_si128(second_merged, _mm_set1_epi8(0b1111));
__m128i second_nucleotides = _mm_shuffle_epi8(nuc_lookup_vec, second_indexes);
_mm_storeu_si128((__m128i *)(dest_cursor + 16), second_nucleotides);

encoded_cursor += sizeof(__m128i);
dest_cursor += 2 * sizeof(__m128i);
}
#endif
/* Do two at the time until it gets to the last even address. */
const uint8_t *dest_end_ptr_twoatatime = dest + (length & (~1ULL));
while (dest_cursor < dest_end_ptr_twoatatime) {
/* According to htslib, size_t cast helps the optimizer.
Code confirmed to indeed run faster. */
memcpy(dest_cursor, code2base + ((size_t)*encoded_cursor * 2), 2);
dest_cursor += 2;
encoded_cursor += 1;
}
assert((dest_end_ptr - dest_cursor) < 2);
if (dest_cursor != dest_end_ptr) {
/* There is a single encoded nuc left */
uint8_t encoded_nucs = *encoded_cursor;
uint8_t upper_nuc_index = encoded_nucs >> 4;
dest_cursor[0] = nuc_lookup[upper_nuc_index];
}
}

static void
decode_bam_qualities(uint8_t *dest, const uint8_t *encoded_qualities, size_t length)
{
const uint8_t *end_ptr = encoded_qualities + length;
const uint8_t *cursor = encoded_qualities;
uint8_t *dest_cursor = dest;
#ifdef __SSE2__
const uint8_t *vec_end_ptr = end_ptr - sizeof(__m128i);
while (cursor < vec_end_ptr) {
__m128i quals = _mm_loadu_si128((__m128i *)cursor);
__m128i phreds = _mm_add_epi8(quals, _mm_set1_epi8(33));
_mm_storeu_si128((__m128i *)dest_cursor, phreds);
cursor += sizeof(__m128i);
dest_cursor += sizeof(__m128i);
}
#endif
while (cursor < end_ptr) {
*dest_cursor = *cursor + 33;
cursor += 1;
dest_cursor += 1;
}
}
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Missing newline at end of file

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
}
}

Loading