Skip to content

Commit

Permalink
Update document counting based on advice from @guipenedo
Browse files Browse the repository at this point in the history
Now use a separate variable `ndocs` to count the number of docs yielded.
  • Loading branch information
lyuwen committed Sep 11, 2024
1 parent acb7d72 commit 177af1f
Showing 1 changed file with 3 additions and 4 deletions.
7 changes: 3 additions & 4 deletions src/datatrove/pipeline/readers/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -190,6 +190,7 @@ def read_files_shard(self, shard: list[str]) -> DocumentsPipeline:
self.stat_update("input_files")
logger.info(f"Reading input file {filepath}, {i+1}/{len(shard)}")
di = 0
ndocs = 0
for di, document in enumerate(self.read_file(filepath)):
if skipped < self.skip:
skipped += 1
Expand All @@ -199,11 +200,9 @@ def read_files_shard(self, shard: list[str]) -> DocumentsPipeline:
yield document
doc_pbar.update()
li += 1
ndocs += 1
file_pbar.update()
# document count is non-zero, increment di and store the number
# of documents instead of the index of the last document
di += min(di, 1)
self.stat_update("documents", value=di, unit="input_file")
self.stat_update("documents", value=ndocs, unit="input_file")
if self.limit != -1 and li >= self.limit:
break

Expand Down

0 comments on commit 177af1f

Please sign in to comment.