Skip to content

Commit

Permalink
Add skip in decontamination index builder: eval docs whose golds/query cannot be fetched are logged with a warning and skipped
Browse files: browse the repository at this point in the history
  • Loading branch information
guipenedo committed May 4, 2024
1 parent 15c8425 commit d56d3c5
Showing 1 changed file with 10 additions and 2 deletions.
12 changes: 10 additions & 2 deletions src/datatrove/pipeline/decont/n_grams.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -147,8 +147,14 @@ def run(self, data: DocumentsPipeline = None, rank: int = 0, world_size: int = 1

for task_name, task in task_dict.items():
for eval_doc in task.eval_docs():
for gold in eval_doc.get_golds():
hashes[task_name].update(self.compute_hashes(gold, eval_doc.query))
try:
golds = eval_doc.get_golds()
query = eval_doc.query
except Exception as e:
logger.warning(f"Error while fetching doc data: {e}")
continue
for gold in golds:
hashes[task_name].update(self.compute_hashes(gold, query))

for task_name, task_hashes in hashes.items():
hashes_array = np.array(list(task_hashes), dtype="<u8")
Expand Down Expand Up @@ -216,5 +222,7 @@ def filter(self, doc: Document) -> bool | Tuple[bool, str]:
doc.metadata["contaminated_ngram"] = n_gram
doc.metadata["contaminated_task"] = task
self.stat_update(f"contaminated_{task}")
if ":" in task:
self.stat_update(f"contaminated_tg_{task[:task.index(':')]}")
return False, "contaminated"
return True

0 comments on commit d56d3c5

Please sign in to comment.