Skip to content

Commit

Permalink
Merge pull request #106 from marcelm/incomplete-pairs
Browse files: browse the repository at this point in the history
Fix buffer filling up on incomplete paired FASTQ
  • Loading branch information
rhpvorderman authored Dec 13, 2022
2 parents 457eb59 + 92f50bd commit eea6a5d
Show file tree
Hide file tree
Showing 2 changed files with 26 additions and 4 deletions.
17 changes: 14 additions & 3 deletions src/dnaio/chunks.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,7 @@ def read_paired_chunks(
Raises:
ValueError: A FASTQ record was encountered that is larger than *buffer_size*.
"""
if buffer_size < 1:
if buffer_size < 6:
raise ValueError("Buffer size too small")

buf1 = bytearray(buffer_size)
Expand All @@ -167,8 +167,10 @@ def read_paired_chunks(
)

while True:
if start1 == len(buf1) or start2 == len(buf2):
raise ValueError("FASTQ record does not fit into buffer")
if start1 == len(buf1) and start2 == len(buf2):
raise ValueError(
f"FASTQ records do not fit into buffer of size {buffer_size}"
)
bufend1 = f.readinto(memoryview(buf1)[start1:]) + start1 # type: ignore
bufend2 = f2.readinto(memoryview(buf2)[start2:]) + start2 # type: ignore
if start1 == bufend1 and start2 == bufend2:
Expand All @@ -180,6 +182,15 @@ def read_paired_chunks(

if end1 > 0 or end2 > 0:
yield (memoryview(buf1)[0:end1], memoryview(buf2)[0:end2])
else:
assert end1 == 0 and end2 == 0
extra = ""
if bufend1 == 0 or bufend2 == 0:
i = 1 if bufend1 == 0 else 2
extra = f". File {i} ended, but more data found in the other file"
raise FileFormatError(
f"Premature end of paired FASTQ input{extra}.", line=None
)
start1 = bufend1 - end1
assert start1 >= 0
buf1[0:start1] = buf1[end1:bufend1]
Expand Down
13 changes: 12 additions & 1 deletion tests/test_chunks.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from pytest import raises
from io import BytesIO

from dnaio import UnknownFileFormat
from dnaio import UnknownFileFormat, FileFormatError
from dnaio._core import paired_fastq_heads
from dnaio.chunks import _fastq_head, _fasta_head, read_chunks, read_paired_chunks

Expand Down Expand Up @@ -74,6 +74,17 @@ def test_read_paired_chunks():
print(c1, c2)


def test_paired_chunks_different_number_of_records():
    """Check that paired inputs with unequal record counts raise an error.

    The first input holds one FASTQ record and the second holds three.
    The first chunk pair is expected to be one record from each input;
    requesting the next pair must raise ``FileFormatError`` with a message
    stating that the other file still has data.
    """
    single_record = b"@r\nAA\n+\n##\n"
    first_input = BytesIO(single_record)
    second_input = BytesIO(single_record * 3)
    # A 16-byte buffer is too small for two of these 11-byte records.
    chunk_pairs = read_paired_chunks(first_input, second_input, 16)

    assert next(chunk_pairs) == (single_record, single_record)

    with raises(FileFormatError) as excinfo:
        next(chunk_pairs)
    excinfo.match("more data found in the other file")


def test_read_chunks():
for data in [b"@r1\nACG\n+\nHHH\n", b">r1\nACGACGACG\n"]:
assert [m.tobytes() for m in read_chunks(BytesIO(data))] == [data]
Expand Down

0 comments on commit eea6a5d

Please sign in to comment.