-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
17 changed files
with
174 additions
and
54 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,84 @@ | ||
# Copyright (c) Facebook, Inc. and its affiliates. | ||
# | ||
# This source code is licensed under the MIT license found in the | ||
# LICENSE file in the root directory of this source tree. | ||
|
||
import os | ||
import typing as tp | ||
|
||
|
||
def _safe_readline(fd) -> str: | ||
pos = fd.tell() | ||
while True: | ||
try: | ||
return fd.readline() | ||
except UnicodeDecodeError: | ||
pos -= 1 | ||
fd.seek(pos) # search where this character begins | ||
|
||
|
||
def find_offsets(filename: str, num_chunks: int) -> tp.List[int]:
    """
    Compute the offsets that split a file into ``num_chunks`` pieces, each
    piece starting at the beginning of a line.

    Args:
        filename: path of the file, opened as UTF-8 text.
        num_chunks: number of chunks to split the file into.

    Returns:
        a list of ``num_chunks + 1`` offsets; the first is 0 and the last is
        the file size, so chunk ``i`` spans ``[offsets[i], offsets[i + 1])``.
    """
    with open(filename, "r", encoding="utf-8") as handle:
        total_size = os.fstat(handle.fileno()).st_size
        stride = total_size // num_chunks
        boundaries = [0] * (num_chunks + 1)
        for idx in range(1, num_chunks):
            # jump to the nominal boundary, then consume the rest of the line
            # found there so the recorded offset is where the next line starts
            handle.seek(stride * idx)
            _safe_readline(handle)
            boundaries[idx] = handle.tell()
        boundaries[-1] = total_size
        return boundaries
|
||
|
||
class ChunkLineIterator:
    """
    Iterate over the lines of one chunk of a file, where the chunk is
    delimited by a start offset and an end offset.
    """

    def __init__(self, fd, start_offset: int, end_offset: int):
        """
        Args:
            fd: an open, seekable text-mode file object.
            start_offset: offset at which reading begins.
            end_offset: offset past which reading stops; a value that is
                not positive disables the end check.
        """
        self._fd = fd
        self._start_offset = start_offset
        self._end_offset = end_offset

    def __iter__(self) -> tp.Iterable[str]:
        """Yield the lines of the chunk, in file order."""
        self._fd.seek(self._start_offset)
        # readline() is used throughout because next(f) breaks f.tell()
        current = _safe_readline(self._fd)
        while current:
            cursor = self._fd.tell()
            # f.tell() does not always report the byte position in the file;
            # it sometimes jumps to a very large number. A normal read is
            # unlikely to move from the end offset to more than 2**32 bytes
            # (4 GB) beyond it, so only positions inside that window are
            # treated as having passed the end of the chunk.
            past_end = (
                self._end_offset > 0
                and self._end_offset < cursor < self._end_offset + 2**32
            )
            if past_end:
                break
            yield current
            current = self._fd.readline()
|
||
|
||
class Chunker:
    """
    Context manager that opens a file and hands back an iterator over the
    lines of one chunk of it; the file is closed on exit.
    """

    def __init__(self, path: str, start_offset: int, end_offset: int):
        """
        Args:
            path: path of the file to read.
            start_offset: offset at which the chunk starts.
            end_offset: offset at which the chunk ends.
        """
        self.path = path
        self.start_offset = start_offset
        self.end_offset = end_offset

    def __enter__(self) -> ChunkLineIterator:
        """Open the file as UTF-8 text and return a line iterator over the chunk."""
        handle = open(self.path, "r", encoding="utf-8")
        # keep the handle on the instance so __exit__ can close it
        self.fd = handle
        return ChunkLineIterator(handle, self.start_offset, self.end_offset)

    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
        """Close the file opened by ``__enter__``."""
        self.fd.close()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
[flake8] | ||
ignore = H102,H103,W503,H238,E203,H301,H306,E231 | ||
max-line-length = 130 | ||
[pycodestyle] | ||
ignore = H102,H103,W503,H238,E203,H301,H306,E231 | ||
max-line-length = 130 | ||
[isort] | ||
profile = black |