From 3f76f17e910611c45971fe1ff2d8ef6b7805f252 Mon Sep 17 00:00:00 2001 From: Jonathan Diamond Date: Wed, 6 Mar 2024 14:20:59 -0800 Subject: [PATCH] Fix indexer error that double counted data in a certain edge case. --- python/fusion_engine_client/parsers/fast_indexer.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/python/fusion_engine_client/parsers/fast_indexer.py b/python/fusion_engine_client/parsers/fast_indexer.py index c27e2318..de80343f 100644 --- a/python/fusion_engine_client/parsers/fast_indexer.py +++ b/python/fusion_engine_client/parsers/fast_indexer.py @@ -49,9 +49,17 @@ def _search_blocks_for_fe(input_path: str, block_starts: List[int]): if len(data) == _READ_SIZE_BYTES + _MAX_FE_MSG_SIZE_BYTES: word_count = int(_READ_SIZE_BYTES / 2) # The last read on the last thread will run out of data, so read - # whatever is left. - else: + # whatever is left. If the amount left is less than the overlap + # space (and this wasn't the first thread), this data will already + # have been processed by another thread with the `elif len(data) >= + # _MAX_FE_MSG_SIZE_BYTES` branch. + elif block_offset == 0 or len(data) >= _MAX_FE_MSG_SIZE_BYTES: word_count = int(len(data) / 2) - 1 + # If the amount left is less than the overlap space, this data will + # already have been processed by another thread with the `elif + # len(data) >= _MAX_FE_MSG_SIZE_BYTES` branch. + else: + break # This is a fairly optimized search for preamble matches. # Allocate space for all the message offsets to check.