Skip to content

Commit

Permalink
fix for no .remove file
Browse files Browse the repository at this point in the history
  • Loading branch information
guipenedo committed Nov 28, 2024
1 parent f7a0267 commit 42b1e10
Showing 1 changed file with 7 additions and 6 deletions.
13 changes: 7 additions & 6 deletions src/datatrove/pipeline/dedup/minhash.py
Original file line number Diff line number Diff line change
Expand Up @@ -571,6 +571,13 @@ def __init__(
self.lines_to_buffer = lines_to_buffer

def run(self, data: DocumentsPipeline, rank: int = 0, world_size: int = 1):
if not self.data_folder.isfile(f"{rank:06d}.remove"):
logger.warning(f"No .remove file for {rank=}.")
for doc in data:
self.stat_update(StatHints.total, StatHints.forwarded)
yield doc
return

# additional metadata files
# cluster ids
if self.load_cluster_ids and not self.data_folder.exists(f"{rank:06d}.clusters"):
Expand All @@ -581,12 +588,6 @@ def run(self, data: DocumentsPipeline, rank: int = 0, world_size: int = 1):
logger.warning(f"No .sizes file for {rank=}.")
raise FileNotFoundError

if not self.data_folder.isfile(f"{rank:06d}.remove"):
logger.warning(f"No .remove file for {rank=}.")
for doc in data:
self.stat_update(StatHints.total, StatHints.forwarded)
yield doc
return
with self.data_folder.open(f"{rank:06d}.remove", "rb") as f:
with self.exclusion_writer if self.exclusion_writer else contextlib.nullcontext() as exc_writer:

Expand Down

0 comments on commit 42b1e10

Please sign in to comment.