Skip to content

Commit

Permalink
small QoL improvements; small bugfix on text normalization for deduplication
Browse files Browse the repository at this point in the history
  • Loading branch information
guipenedo committed Oct 30, 2023
1 parent 444649d commit 5c0b035
Show file tree
Hide file tree
Showing 2 changed files with 5 additions and 5 deletions.
6 changes: 3 additions & 3 deletions src/datatrove/pipeline/dedup/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,13 +32,13 @@ class ExtensionHelperES:
def simplify_content(text: str) -> str:
    """Return a simplified form of ``text`` for deduplication comparisons.

    The steps run in this order:

    1. lower-case the text;
    2. strip the ends and collapse every run of whitespace (spaces, newlines,
       tabs) into a single space;
    3. delete every character listed in ``PUNCTUATION``;
    4. NFD-normalize and drop combining marks (Unicode category ``Mn``), which
       removes diacritics;
    5. strip the ends once more.

    Args:
        text: the raw text to simplify.

    Returns:
        The simplified text.
    """
    # lower case
    text = text.lower()
    # remove consecutive spaces, newlines, tabs in the middle and in the beginning / end
    # NOTE(review): this runs before punctuation removal, presumably so that
    # whitespace separators are turned into plain spaces before any character
    # in PUNCTUATION (defined elsewhere in this module) is deleted - confirm.
    text = re.sub(r"\s+", " ", text.strip())
    # remove punctuation
    text = text.translate(str.maketrans("", "", PUNCTUATION))
    # diacritics/unicode normalization
    text = "".join(c for c in unicodedata.normalize("NFD", text) if unicodedata.category(c) != "Mn")
    # deleting punctuation at either end can expose a leading/trailing space
    return text.strip()


def _b2i(b: bytes) -> int:
Expand Down
4 changes: 2 additions & 2 deletions src/datatrove/pipeline/readers/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,11 +34,11 @@ def read_files_shard(self, shard):
if self.limit != -1 and li >= self.limit:
return
yield document
if pbar:
if self.progress:
pbar.update()
li += 1

def __call__(self, data: DocumentsPipeline = None, rank: int = 0, world_size: int = 1) -> DocumentsPipeline:
    """Yield any incoming documents, then the documents read for this shard.

    Args:
        data: optional upstream documents; when truthy they are yielded first,
            unchanged. Defaults to ``None`` so this step can be called without
            an upstream pipeline.
        rank: passed to ``self.data_folder.get_files_shard`` to select the shard.
        world_size: passed to ``self.data_folder.get_files_shard`` together
            with ``rank``.

    Yields:
        Every item of ``data`` (if given), followed by everything produced by
        ``self.read_files_shard`` for the selected shard.
    """
    if data:
        yield from data
    yield from self.read_files_shard(self.data_folder.get_files_shard(rank, world_size))

0 comments on commit 5c0b035

Please sign in to comment.