Skip to content

Commit

Permalink
fixes for empty folders
Browse files (browse the repository at this point in the history)
  • Loading branch information
guipenedo committed Nov 29, 2024
1 parent ea3adf9 commit 610560c
Show file tree
Hide file tree
Showing 2 changed files with 9 additions and 5 deletions.
7 changes: 5 additions & 2 deletions src/datatrove/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,7 +162,7 @@ def list_files(
]
)

def get_shard(self, rank: int, world_size: int, **kwargs) -> list[str]:
def get_shard(self, rank: int, world_size: int, **kwargs) -> list[str] | None:
"""Fetch a shard (set of files) for a given rank, assuming there are a total of `world_size` shards.
This should be deterministic to not have any overlap among different ranks.
Will return files [rank, rank+world_size, rank+2*world_size, ...]
Expand All @@ -175,7 +175,10 @@ def get_shard(self, rank: int, world_size: int, **kwargs) -> list[str]:
Returns: a list of file paths, or None if no files were found at all
"""
return self.list_files(**kwargs)[rank::world_size]
all_files = self.list_files(**kwargs)
if len(all_files) == 0:
return None
return all_files[rank::world_size]

def resolve_paths(self, paths) -> list[str] | str:
"""
Expand Down
7 changes: 4 additions & 3 deletions src/datatrove/pipeline/readers/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -222,11 +222,12 @@ def run(self, data: DocumentsPipeline = None, rank: int = 0, world_size: int = 1
if not self.paths_file
else list(get_shard_from_paths_file(self.paths_file, rank, world_size))
)
if len(files_shard) == 0:
if rank == 0:
raise RuntimeError(f"No files found on {self.data_folder.path}!")
if files_shard is None:
raise RuntimeError(f"No files found on {self.data_folder.path}!")
elif len(files_shard) == 0:
# otherwise just a warning
logger.warning(f"No files found on {self.data_folder.path} for {rank=}")

if self.shuffle_files:
random.shuffle(files_shard)
for doc in self.read_files_shard(files_shard):
Expand Down

0 comments on commit 610560c

Please sign in to comment.