Skip to content

Commit

Permalink
bugfix sentencededup stage 3 #2
Browse files (browse the repository at this point in the history)
  • Loading branch information
guipenedo committed Oct 17, 2023
1 parent 48b7436 commit f53c379
Showing 1 changed file with 2 additions and 2 deletions.
4 changes: 2 additions & 2 deletions src/datatrove/pipeline/dedup/sentence_dedup.py
Original file line number Diff line number Diff line change
Expand Up @@ -222,7 +222,7 @@ def __init__(

def remove_dup_sentences(self, doc: Document, du_lines: set = None) -> (str, str):
if not du_lines:
return doc.content
return doc.content, None
sentence_spans = list(self._tokenizer.span_tokenize(doc.content))
kept_sentences = []
original_formatted = []
Expand Down Expand Up @@ -272,5 +272,5 @@ def __call__(self, data: DocumentsPipeline, rank: int = 0, world_size: int = 1)
doc.content = filtered_content
self.stats.doc_len.update(len(doc.content))
yield doc
elif writer:
elif writer and original_formatted:
writer.write(Document(**dataclasses.asdict(doc), content=original_formatted), rank=rank)

0 comments on commit f53c379

Please sign in to comment.