small QoL improvements; small bugfix on text normalization for deduplication
huggingface · Oct 30, 2023 · 5c0b035 · 5c0b035
1 parent 444649d
commit 5c0b035
Show file tree

Hide file tree

Showing 2 changed files with 5 additions and 5 deletions.
diff --git a/src/datatrove/pipeline/dedup/utils.py b/src/datatrove/pipeline/dedup/utils.py
@@ -32,13 +32,13 @@ class ExtensionHelperES:
 def simplify_content(text: str):
     # lower case
     text = text.lower()
-    # remove punctuation
-    text = text.translate(str.maketrans("", "", PUNCTUATION))
     # remove consecutive spaces, newlines, tabs in the middle and in the beginning / end
     text = re.sub(r"\s+", " ", text.strip())
+    # remove punctuation
+    text = text.translate(str.maketrans("", "", PUNCTUATION))
     # diacritics/unicode normalization
     text = "".join(c for c in unicodedata.normalize("NFD", text) if unicodedata.category(c) != "Mn")
-    return text
+    return text.strip()
 
 
 def _b2i(b: bytes) -> int:

diff --git a/src/datatrove/pipeline/readers/base.py b/src/datatrove/pipeline/readers/base.py
@@ -34,11 +34,11 @@ def read_files_shard(self, shard):
                     if self.limit != -1 and li >= self.limit:
                         return
                     yield document
-                    if pbar:
+                    if self.progress:
                         pbar.update()
                     li += 1
 
-    def __call__(self, data: DocumentsPipeline, rank: int = 0, world_size: int = 1) -> DocumentsPipeline:
+    def __call__(self, data: DocumentsPipeline = None, rank: int = 0, world_size: int = 1) -> DocumentsPipeline:
         if data:
             yield from data
         yield from self.read_files_shard(self.data_folder.get_files_shard(rank, world_size))