diff --git a/src/datatrove/pipeline/readers/base.py b/src/datatrove/pipeline/readers/base.py index dec3e78a..e7f4569b 100644 --- a/src/datatrove/pipeline/readers/base.py +++ b/src/datatrove/pipeline/readers/base.py @@ -190,6 +190,7 @@ def read_files_shard(self, shard: list[str]) -> DocumentsPipeline: self.stat_update("input_files") logger.info(f"Reading input file {filepath}, {i+1}/{len(shard)}") di = 0 + ndocs = 0 for di, document in enumerate(self.read_file(filepath)): if skipped < self.skip: skipped += 1 @@ -199,11 +200,9 @@ def read_files_shard(self, shard: list[str]) -> DocumentsPipeline: yield document doc_pbar.update() li += 1 + ndocs += 1 file_pbar.update() - # document count is non-zero, increment di and store the number - # of documents instead of the index of the last document - di += min(di, 1) - self.stat_update("documents", value=di, unit="input_file") + self.stat_update("documents", value=ndocs, unit="input_file") if self.limit != -1 and li >= self.limit: break