From f53c3796fff1d17994f2dfa3629bc094b712ee60 Mon Sep 17 00:00:00 2001 From: guipenedo Date: Tue, 17 Oct 2023 13:05:26 +0200 Subject: [PATCH] bugfix sentencededup stage 3 #2 --- src/datatrove/pipeline/dedup/sentence_dedup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/datatrove/pipeline/dedup/sentence_dedup.py b/src/datatrove/pipeline/dedup/sentence_dedup.py index ebb10e1d..973cdab0 100644 --- a/src/datatrove/pipeline/dedup/sentence_dedup.py +++ b/src/datatrove/pipeline/dedup/sentence_dedup.py @@ -222,7 +222,7 @@ def __init__( def remove_dup_sentences(self, doc: Document, du_lines: set = None) -> (str, str): if not du_lines: - return doc.content + return doc.content, None sentence_spans = list(self._tokenizer.span_tokenize(doc.content)) kept_sentences = [] original_formatted = [] @@ -272,5 +272,5 @@ def __call__(self, data: DocumentsPipeline, rank: int = 0, world_size: int = 1) doc.content = filtered_content self.stats.doc_len.update(len(doc.content)) yield doc - elif writer: + elif writer and original_formatted: writer.write(Document(**dataclasses.asdict(doc), content=original_formatted), rank=rank)