From 41138daeca9331acdedaf30e7ec768bebc8498fa Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Hynek=20Kydl=C3=AD=C4=8Dek?=
Date: Mon, 25 Mar 2024 15:40:20 +0100
Subject: [PATCH 1/5] nits for url_dedup

---
 src/datatrove/pipeline/dedup/url_dedup.py | 394 ++++++++++++++++++++++
 tests/pipeline/test_url_deduplication.py  | 145 ++++++++
 2 files changed, 539 insertions(+)
 create mode 100644 src/datatrove/pipeline/dedup/url_dedup.py
 create mode 100644 tests/pipeline/test_url_deduplication.py

diff --git a/src/datatrove/pipeline/dedup/url_dedup.py b/src/datatrove/pipeline/dedup/url_dedup.py
new file mode 100644
index 00000000..957a11ad
--- /dev/null
+++ b/src/datatrove/pipeline/dedup/url_dedup.py
@@ -0,0 +1,394 @@
+"""
+URL-based deduplication.
+"""
+
+import contextlib
+import dataclasses
+import heapq
+import struct
+from concurrent.futures import ThreadPoolExecutor
+from dataclasses import dataclass, field
+from typing import Callable, Generator
+
+import numpy as np
+from fsspec.spec import AbstractBufferedFile
+from loguru import logger
+from tqdm import tqdm
+
+from datatrove.data import Document, DocumentsPipeline
+from datatrove.io import DataFolderLike, get_datafolder
+from datatrove.pipeline.base import PipelineStep
+from datatrove.utils.binaryio import read_tuples_from_file
+from datatrove.utils.text import sha1_hash64
+from datatrove.utils.typeshelper import ExtensionHelperSD, StatHints
+
+from ..writers.disk_base import DiskWriter
+
+
+@dataclass
+class UrlDedupConfig:
+    """
+    Args:
+        url_normalizer: Callable[[str], str] Normalizes the url, e.g. removes query parameters
+        document_priority: Callable[[Document], int]
+            Function for determining the priority of a document.
+            Out of a group of duplicates, only the document with the highest priority is preserved.
+            The document priority must be in the range [0, 65535]
+    """
+
+    url_normalizer: Callable[[str], str] = (
+        lambda x: x
+    )  # Normalize the url, e.g. remove query parameters
+    document_priority: Callable[[Document], int] = (
+        lambda x: 0
+    )  # Urls with higher priority will be preserved; saved as an unsigned short!
+
+
+DEFAULT_URL_DEDUP_CONFIG = UrlDedupConfig()
+
+
+@dataclass(order=False)
+class HashSig:
+    hash_value: int
+    priority: int
+    doc_id: int
+    file_id: int
+
+    def is_from_index(self):
+        return self.doc_id == -1 and self.priority == 1
+
+    def __lt__(self, other: "HashSig") -> bool:
+        # Ensure that the highest priority always comes first among equal hashes
+        return (self.hash_value, -self.priority, self.doc_id) < (
+            other.hash_value,
+            -other.priority,
+            other.doc_id,
+        )
+
+
+class UrlDedupSignature(PipelineStep):
+    """UrlDedup: First pipeline step
+
+    Creates a signature for the url of each document. Each HashSig stores the url hash, the priority and the doc id.
+    The hashes are sorted before being saved. We use negative priority in the comparison so that the
+    highest-priority urls come first in the priority queue.
+
+    Args:
+        output_folder: folder where signatures are saved
+    """
+
+    type = "🫂 - DEDUPS"
+    name = "💥 url-deduplication stage 1"
+    _requires_dependencies = ["nltk"]
+
+    def __init__(
+        self,
+        output_folder: DataFolderLike,
+        finder_workers: int = 1,
+        config: UrlDedupConfig = DEFAULT_URL_DEDUP_CONFIG,
+        language: str = "english",
+    ):
+        super().__init__()
+        self.output_folder = get_datafolder(output_folder)
+        if finder_workers <= 0:
+            raise ValueError("finder_workers must be >= 1")
+        elif finder_workers > 1:
+            logger.warning(
+                f"Remember to also set the number of tasks of the finder block to {finder_workers=}!"
+            )
+        self.finder_workers = finder_workers
+        self.config = config
+        self.language = language
+
+    def save_hashes(self, rank: int, signatures):
+        # explicitly define little endianness
+
+        priority_max = np.iinfo(np.dtype("<u2")).max
+        assert all(
+            sig[1] >= 0 and sig[1] < priority_max for sig in signatures
+        ), f"priority must be between 1 and {priority_max}"
+        signatures = np.array(
+            signatures, dtype=[("hash", "<u8"), ("priority", "<u2"), ("doc", "<u4")]
+        )
+        signatures = np.sort(signatures, order=["hash", "priority"])
+
+        hashes_per_worker = np.iinfo(np.uint64).max // self.finder_workers
+        left_idx = 0
+        for hash_i in range(self.finder_workers):
+            with self.output_folder.open(
+                f"{hash_i:04d}/{rank:05d}{ExtensionHelperSD.stage_1_signature}",
+                mode="wb",
+            ) as f:
+                # last bucket needs to have everything
+                right_hash = (
+                    (hash_i + 1) * hashes_per_worker
+                    if hash_i != self.finder_workers - 1
+                    else np.iinfo(np.uint64).max
+                )
+                # find the last hash that goes into this bucket
+                right_idx = left_idx + signatures["hash"][left_idx:].searchsorted(
+                    right_hash, side="right"
+                )
+                # save to file
+                if right_idx > left_idx:
+                    signatures[left_idx:right_idx].tofile(f)
+                left_idx = right_idx
+                # we've reached the end of our data
+                if right_idx >= len(signatures):
+                    break
+
+    def get_hashes(
+        self, doc: Document, doc_idx: int
+    ) -> list[None] | list[tuple[int, int, int]]:
+        normalized_url = self.config.url_normalizer(doc.metadata["url"])
+        priority = self.config.document_priority(doc)
+        hashes = [(sha1_hash64(normalized_url.encode("utf-8")), priority, doc_idx)]
+
+        return hashes
+
+    def run(self, data: DocumentsPipeline, rank: int = 0, world_size: int = 1):
+        """Args:
+            data
+            rank
+            world_size
+
+        Returns:
+
+        UrlDedupSignature creates a signature for each document. Each signature stores the url hash, the priority
+        and the doc id. The hashes are sorted before being saved.
+        """
+        signatures = []
+        for doc_idx, doc in enumerate(data):
+            with self.stats.time_stats:
+                self.stat_update(StatHints.total)
+                signatures.extend(self.get_hashes(doc, doc_idx))
+        self.save_hashes(rank, signatures)
+
+
+def read_sigs(
+    file: AbstractBufferedFile,
+    file_id: int,
+    index_file: bool = False,
+    lines_to_buffer: int = 5,
+) -> Generator[HashSig, None, None]:
+    line_format = "QHI" if not index_file else "Q"
+    last = None
+    with file as f:
+        for data in read_tuples_from_file(
+            f, line_format, lines_to_buffer=lines_to_buffer
+        ):
+            assert (
+                last is None or data[0] >= last
+            ), f"Hash order error. {f.tell()=}, {data[0]=}, {last=}"
+            last = data[0]
+            yield (
+                HashSig(hash_value=data[0], doc_id=-1, file_id=file_id, priority=-1)
+                if index_file
+                else HashSig(
+                    file_id=file_id,
+                    hash_value=data[0],
+                    priority=data[1],
+                    doc_id=data[2],
+                )
+            )
+
+
+class UrlFindDedups(PipelineStep):
+    """UrlDedup: Second pipeline step
+
+    UrlFindDedups runs on a single worker. It reads all the signatures from the previous step and loads them
+    into a priority queue to check for duplicates. If a duplicate is found, its document id is saved.
+    Out of a group of duplicates, the document with the highest priority is the one that is kept.
+ + Args: + data_folder: data folder where signatures are saved + output_folder: folder where duplicates are saved + index_folder: folder where index files are saved + only_dedup_in_index: only dedup in index + """ + + type = "🫂 - DEDUPS" + name = "💥 url-deduplication stage 2" + + def __init__( + self, + data_folder: DataFolderLike, + output_folder: DataFolderLike, + index_folder: DataFolderLike = None, + config: UrlDedupConfig = DEFAULT_URL_DEDUP_CONFIG, + lines_to_buffer: int = 5, + ): + super().__init__() + self.data_folder = get_datafolder(data_folder) + self.output_folder = get_datafolder(output_folder) + self.index_folder = get_datafolder(index_folder) if index_folder else None + + self.config = config + self.lines_to_buffer = lines_to_buffer + + def run(self, data: DocumentsPipeline = None, rank: int = 0, world_size: int = 1): + with self.stats.time_stats: + if world_size == 1: + # check that there was not a mistake in setting this values + sig_files = self.data_folder.list_files( + glob_pattern="*/*" + ExtensionHelperSD.stage_1_signature + ) + if any(not sig_file.startswith("0000/") for sig_file in sig_files): + raise ValueError( + f"{world_size=} but found sig files for different hash buckets. Set tasks=finder_workers" + ) + else: + sig_files = self.data_folder.list_files( + subdirectory=f"{rank:04d}", + glob_pattern=ExtensionHelperSD.stage_1_signature, + ) + sig_readers = [ + read_sigs(file, file_i, lines_to_buffer=self.lines_to_buffer) + for file_i, file in enumerate(self.data_folder.open_files(sig_files)) + ] + index_files = self.index_folder.list_files() if self.index_folder else None + if index_files: + logger.info(f"Found index file(s): {', '.join(index_files)}") + sig_readers.extend( + [ + read_sigs( + file, + len(sig_readers) + file_i, + index_file=True, + lines_to_buffer=self.lines_to_buffer, + ) + for file_i, file in enumerate( + self.data_folder.open_files(index_files) + ) + ] + ) + + logger.info(f"Initializing pq with {len(sig_readers)} files.") + with ThreadPoolExecutor() as executor: + pq = [ + x + for x in tqdm( + executor.map(lambda x: next(x, None), sig_readers), + total=len(sig_readers), + desc="Initializing pq...", + ) + if x + ] + heapq.heapify(pq) + logger.info("PQ initialized.") + + output_mg = self.output_folder.get_output_file_manager(mode="wb") + + packer = struct.Struct(" np.ndarray: + """Helper function to read duplicates from a binary file storing (doc_id) as created by the second stage.""" + with file as f: + return np.fromfile(f, dtype=" DocumentsPipeline: + """step method for Filters. 
+ Drops documents that if .filter() is False + + UrlDedupFilter reads a DocumentPipeline and removes duplicated urls found at stage 2 + """ + folders = self.data_folder.list_files(include_directories=True, recursive=False) + # for performance reasons when having for instance 12k*10k files + files = [ + f + for f in [ + f"{folder}/{rank:05d}{ExtensionHelperSD.stage_2_duplicates}" + for folder in folders + ] + if self.data_folder.exists(f) + ] + + logger.info(f"Loading duplicate indexes from {len(files)} results files.") + + all_dups = np.array([], dtype=" Date: Mon, 22 Apr 2024 17:39:22 +0200 Subject: [PATCH 2/5] add pypi release action --- .github/workflows/ci.yml | 1 + .github/workflows/pypi-release.yml | 67 ++++++++++++++++++++++++++++++ 2 files changed, 68 insertions(+) create mode 100644 .github/workflows/pypi-release.yml diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 675e7897..2149b38c 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -7,6 +7,7 @@ on: push: branches: - main + workflow_call: jobs: check_code_quality: diff --git a/.github/workflows/pypi-release.yml b/.github/workflows/pypi-release.yml new file mode 100644 index 00000000..432f961b --- /dev/null +++ b/.github/workflows/pypi-release.yml @@ -0,0 +1,67 @@ +name: PyPI release +on: + workflow_dispatch: + +jobs: + ci: + uses: ./.github/workflows/ci.yml + release: + needs: ci + runs-on: ubuntu-latest + env: + TWINE_USERNAME: __token__ + GITHUB_TOKEN: ${{ secrets.GH_TOKEN }} + + steps: + - name: Checkout Repo + uses: actions/checkout@v3 + + - name: Setup Python + uses: actions/setup-python@v4 + with: + python-version: "3.10" + + - name: Install build dependencies + run: | + python -m pip install --upgrade pip + pip install -U twine build + + - name: Build the dist files + run: python -m build . 
+ + - name: Publish to the test PyPI + env: + TWINE_PASSWORD: ${{ secrets.TEST_PYPI_TOKEN }} + run: twine upload dist/* --repository=testpypi + + - name: Test installing from test PyPI and running tests + env: + TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }} + run: | + pip install -i https://testpypi.python.org/pypi --extra-index-url https://pypi.org/simple testing-datatrove[terting] + python -m nltk.downloader punkt + + - name: Get tag name + id: get_tag_name + run: | + echo TAG_NAME=$(grep '^version' pyproject.toml | head -1 | cut -d '"' -f 2) >> $GITHUB_OUTPUT + echo ::notice + + + - name: Tag the release + uses: actions/github-script@v7 + with: + github-token: ${{ env.GITHUB_TOKEN }} + script: | + github.rest.git.createRef({ + owner: context.repo.owner, + repo: context.repo.repo, + ref: 'refs/tags/${{ steps.get_tag_name.outputs.TAG_NAME }}', + sha: context.sha + }) + + + - name: Publish to PyPI + env: + TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }} + run: twine upload dist/* --repository=pypi From 4ce37b8adf00fefbda8c2e2a0362558890ece6eb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Hynek=20Kydl=C3=AD=C4=8Dek?= Date: Mon, 22 Apr 2024 17:51:09 +0200 Subject: [PATCH 3/5] Update PyPI release workflow --- .github/workflows/pypi-release.yml | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/.github/workflows/pypi-release.yml b/.github/workflows/pypi-release.yml index 432f961b..d463bd35 100644 --- a/.github/workflows/pypi-release.yml +++ b/.github/workflows/pypi-release.yml @@ -10,7 +10,6 @@ jobs: runs-on: ubuntu-latest env: TWINE_USERNAME: __token__ - GITHUB_TOKEN: ${{ secrets.GH_TOKEN }} steps: - name: Checkout Repo @@ -38,20 +37,17 @@ jobs: env: TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }} run: | - pip install -i https://testpypi.python.org/pypi --extra-index-url https://pypi.org/simple testing-datatrove[terting] + pip install -i https://testpypi.python.org/pypi --extra-index-url https://pypi.org/simple datatrove[testing] python -m nltk.downloader punkt - name: Get tag name id: get_tag_name run: | echo TAG_NAME=$(grep '^version' pyproject.toml | head -1 | cut -d '"' -f 2) >> $GITHUB_OUTPUT - echo ::notice - - name: Tag the release uses: actions/github-script@v7 with: - github-token: ${{ env.GITHUB_TOKEN }} script: | github.rest.git.createRef({ owner: context.repo.owner, From 48b1b2187f931a0fe73e2e7dda0f35d59d21e8a2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Hynek=20Kydl=C3=AD=C4=8Dek?= Date: Mon, 22 Apr 2024 17:55:01 +0200 Subject: [PATCH 4/5] remove teh unwanted files --- src/datatrove/pipeline/dedup/url_dedup.py | 394 ---------------------- tests/pipeline/test_url_deduplication.py | 145 -------- 2 files changed, 539 deletions(-) delete mode 100644 src/datatrove/pipeline/dedup/url_dedup.py delete mode 100644 tests/pipeline/test_url_deduplication.py diff --git a/src/datatrove/pipeline/dedup/url_dedup.py b/src/datatrove/pipeline/dedup/url_dedup.py deleted file mode 100644 index 957a11ad..00000000 --- a/src/datatrove/pipeline/dedup/url_dedup.py +++ /dev/null @@ -1,394 +0,0 @@ -""" -URL based deduplication. 
-""" - -import contextlib -import dataclasses -import heapq -import struct -from concurrent.futures import ThreadPoolExecutor -from dataclasses import dataclass, field -from typing import Callable, Generator - -import numpy as np -from fsspec.spec import AbstractBufferedFile -from loguru import logger -from tqdm import tqdm - -from datatrove.data import Document, DocumentsPipeline -from datatrove.io import DataFolderLike, get_datafolder -from datatrove.pipeline.base import PipelineStep -from datatrove.utils.binaryio import read_tuples_from_file -from datatrove.utils.text import sha1_hash64 -from datatrove.utils.typeshelper import ExtensionHelperSD, StatHints - -from ..writers.disk_base import DiskWriter - - -@dataclass -class UrlDedupConfig: - """ - Args: - url_normalizer: Callable[[str], str] Normalize the url, e.g. remove query parameters - document_priority: Callable[[Document], int] - Function for determining the priority of a document. - Only the document with the highest priority will be preserved, out of duplicates. - The document priority must be in range [0, 65535] - """ - - url_normalizer: Callable[[str], str] = ( - lambda x: x - ) # Normalize the url, e.g. remove query parameters - document_priority: Callable[[Document], int] = ( - lambda x: 0 - ) # Urls with higher will be preserved, will be saved as unsigned short! - - -DEFAULT_URL_DEDUP_CONFIG = UrlDedupConfig() - - -@dataclass(order=False) -class HashSig: - hash_value: int - priority: int - doc_id: int - file_id: int - - def is_from_index(self): - return self.doc_id == -1 and self.priority == 1 - - def __lt__(self, other: "HashSig") -> bool: - # Ensure that highest priority is always first of the hashes - return (self.hash_value, -self.priority, self.doc_id) < ( - other.hash_value, - -other.priority, - other.doc_id, - ) - - -class UrlDedupSignature(PipelineStep): - """UrlDedup: First pipeline step - - Creates a signature for url in each document. Each HashSig has n hash, the -priority the doc id. Before saving - them the hashes are sorted. We use negative priority as we want to the highest priority urls to be first in priority queue. - - Args: - output_folder: folder where signatures are saved - """ - - type = "🫂 - DEDUPS" - name = "💥 url-deduplication stage 1" - _requires_dependencies = ["nltk"] - - def __init__( - self, - output_folder: DataFolderLike, - finder_workers: int = 1, - config: UrlDedupConfig = DEFAULT_URL_DEDUP_CONFIG, - language: str = "english", - ): - super().__init__() - self.output_folder = get_datafolder(output_folder) - if finder_workers <= 0: - raise ValueError("finder_workers must be >= 1") - elif finder_workers > 1: - logger.warning( - f"Remember to also set the name of tasks of the finder block to {finder_workers=}!" 
- ) - self.finder_workers = finder_workers - self.config = config - self.language = language - - def save_hashes(self, rank: int, signatures): - # explicitly define little endiannes - - priority_max = np.iinfo(np.dtype("= 0 and sig[1] < priority_max for sig in signatures - ), f"priority must be between 1 and {priority_max}" - signatures = np.array( - signatures, dtype=[("hash", " left_idx: - signatures[left_idx:right_idx].tofile(f) - left_idx = right_idx - # we've reached the end of our data - if right_idx >= len(signatures): - break - - def get_hashes( - self, doc: Document, doc_idx: int - ) -> list[None] | list[tuple[int, int, int]]: - normalized_url = self.config.url_normalizer(doc.metadata["url"]) - priority = self.config.document_priority(doc) - hashes = [(sha1_hash64(normalized_url.encode("utf-8")), priority, doc_idx)] - - return hashes - - def run(self, data: DocumentsPipeline, rank: int = 0, world_size: int = 1): - """Args: - data - rank - world_size - - Returns: - - UrlDedupSignature creates a signature for each document. Each HashSig has n hash, the priority the doc id. - Before saving them the hashes are sorted. - """ - signatures = [] - for doc_idx, doc in enumerate(data): - with self.stats.time_stats: - self.stat_update(StatHints.total) - signatures.extend(self.get_hashes(doc, doc_idx)) - self.save_hashes(rank, signatures) - - -def read_sigs( - file: AbstractBufferedFile, - file_id: int, - index_file: bool = False, - lines_to_buffer: int = 5, -) -> Generator[HashSig, None, None]: - line_format = "QHI" if not index_file else "Q" - last = None - with file as f: - for data in read_tuples_from_file( - f, line_format, lines_to_buffer=lines_to_buffer - ): - assert ( - last is None or data[0] >= last - ), f"Hash order error. {f.tell()=}, {data[0]=}, {last=}" - last = data[0] - yield ( - HashSig(hash_value=data[0], doc_id=-1, file_id=file_id, priority=-1) - if index_file - else HashSig( - file_id=file_id, - hash_value=data[0], - priority=data[1], - doc_id=data[2], - ) - ) - - -class UrlFindDedups(PipelineStep): - """UrlDedup: Second pipeline step - - UrlFindDedups runs on a single worker. It reads all the signatures from the previous step and loads them - in a priority queue to check for duplicates. If a duplicate is found its document id is saved. - The document with the highest priority is the one that will be saved out of the duplicates . 
- - Args: - data_folder: data folder where signatures are saved - output_folder: folder where duplicates are saved - index_folder: folder where index files are saved - only_dedup_in_index: only dedup in index - """ - - type = "🫂 - DEDUPS" - name = "💥 url-deduplication stage 2" - - def __init__( - self, - data_folder: DataFolderLike, - output_folder: DataFolderLike, - index_folder: DataFolderLike = None, - config: UrlDedupConfig = DEFAULT_URL_DEDUP_CONFIG, - lines_to_buffer: int = 5, - ): - super().__init__() - self.data_folder = get_datafolder(data_folder) - self.output_folder = get_datafolder(output_folder) - self.index_folder = get_datafolder(index_folder) if index_folder else None - - self.config = config - self.lines_to_buffer = lines_to_buffer - - def run(self, data: DocumentsPipeline = None, rank: int = 0, world_size: int = 1): - with self.stats.time_stats: - if world_size == 1: - # check that there was not a mistake in setting this values - sig_files = self.data_folder.list_files( - glob_pattern="*/*" + ExtensionHelperSD.stage_1_signature - ) - if any(not sig_file.startswith("0000/") for sig_file in sig_files): - raise ValueError( - f"{world_size=} but found sig files for different hash buckets. Set tasks=finder_workers" - ) - else: - sig_files = self.data_folder.list_files( - subdirectory=f"{rank:04d}", - glob_pattern=ExtensionHelperSD.stage_1_signature, - ) - sig_readers = [ - read_sigs(file, file_i, lines_to_buffer=self.lines_to_buffer) - for file_i, file in enumerate(self.data_folder.open_files(sig_files)) - ] - index_files = self.index_folder.list_files() if self.index_folder else None - if index_files: - logger.info(f"Found index file(s): {', '.join(index_files)}") - sig_readers.extend( - [ - read_sigs( - file, - len(sig_readers) + file_i, - index_file=True, - lines_to_buffer=self.lines_to_buffer, - ) - for file_i, file in enumerate( - self.data_folder.open_files(index_files) - ) - ] - ) - - logger.info(f"Initializing pq with {len(sig_readers)} files.") - with ThreadPoolExecutor() as executor: - pq = [ - x - for x in tqdm( - executor.map(lambda x: next(x, None), sig_readers), - total=len(sig_readers), - desc="Initializing pq...", - ) - if x - ] - heapq.heapify(pq) - logger.info("PQ initialized.") - - output_mg = self.output_folder.get_output_file_manager(mode="wb") - - packer = struct.Struct(" np.ndarray: - """Helper function to read duplicates from a binary file storing (doc_id) as created by the second stage.""" - with file as f: - return np.fromfile(f, dtype=" DocumentsPipeline: - """step method for Filters. 
- Drops documents that if .filter() is False - - UrlDedupFilter reads a DocumentPipeline and removes duplicated urls found at stage 2 - """ - folders = self.data_folder.list_files(include_directories=True, recursive=False) - # for performance reasons when having for instance 12k*10k files - files = [ - f - for f in [ - f"{folder}/{rank:05d}{ExtensionHelperSD.stage_2_duplicates}" - for folder in folders - ] - if self.data_folder.exists(f) - ] - - logger.info(f"Loading duplicate indexes from {len(files)} results files.") - - all_dups = np.array([], dtype=" Date: Mon, 22 Apr 2024 18:43:31 +0200 Subject: [PATCH 5/5] fixed workflow erros --- .github/workflows/pypi-release.yml | 8 +++----- .github/workflows/{ci.yml => testing.yml} | 2 +- 2 files changed, 4 insertions(+), 6 deletions(-) rename .github/workflows/{ci.yml => testing.yml} (97%) diff --git a/.github/workflows/pypi-release.yml b/.github/workflows/pypi-release.yml index d463bd35..ddf596b7 100644 --- a/.github/workflows/pypi-release.yml +++ b/.github/workflows/pypi-release.yml @@ -3,7 +3,7 @@ on: workflow_dispatch: jobs: - ci: + testing: uses: ./.github/workflows/ci.yml release: needs: ci @@ -34,11 +34,10 @@ jobs: run: twine upload dist/* --repository=testpypi - name: Test installing from test PyPI and running tests - env: - TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }} run: | pip install -i https://testpypi.python.org/pypi --extra-index-url https://pypi.org/simple datatrove[testing] python -m nltk.downloader punkt + make test - name: Get tag name id: get_tag_name @@ -52,11 +51,10 @@ jobs: github.rest.git.createRef({ owner: context.repo.owner, repo: context.repo.repo, - ref: 'refs/tags/${{ steps.get_tag_name.outputs.TAG_NAME }}', + ref: 'refs/tags/v${{ steps.get_tag_name.outputs.TAG_NAME }}', sha: context.sha }) - - name: Publish to PyPI env: TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }} diff --git a/.github/workflows/ci.yml b/.github/workflows/testing.yml similarity index 97% rename from .github/workflows/ci.yml rename to .github/workflows/testing.yml index 2149b38c..a0fb9920 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/testing.yml @@ -1,4 +1,4 @@ -name: CI +name: Test & Check Code Quality on: pull_request:
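
A minimal usage sketch of the three stages added in PATCH 1/5, for reviewers who want to try the block. It is not part of the patch: the folder paths, the normalizer and priority functions, and the UrlDedupFilter arguments are illustrative assumptions (the filter's constructor is not visible in this extract); only UrlDedupConfig, UrlDedupSignature, UrlFindDedups and UrlDedupFilter come from the new module. As the stage 1 warning notes, stage 2 must then be launched with exactly finder_workers tasks.

# Illustrative sketch only -- paths, priority function and UrlDedupFilter arguments are assumptions.
from datatrove.pipeline.dedup.url_dedup import (
    UrlDedupConfig,
    UrlDedupFilter,
    UrlDedupSignature,
    UrlFindDedups,
)

# Keep the longest document among those sharing a normalized url.
# The priority is stored as an unsigned short, so keep it well inside [0, 65535].
url_dedup_config = UrlDedupConfig(
    url_normalizer=lambda url: url.split("?")[0].lower(),
    document_priority=lambda doc: min(len(doc.text) // 4, 60_000),
)

FINDER_WORKERS = 4  # stage 2 has to be run with this many tasks

stage_1 = UrlDedupSignature(
    output_folder="url_dedup/sigs",
    finder_workers=FINDER_WORKERS,
    config=url_dedup_config,
)
stage_2 = UrlFindDedups(
    data_folder="url_dedup/sigs",
    output_folder="url_dedup/dups",
    config=url_dedup_config,
)
# Assumed constructor: the filter reads the duplicate doc ids written by stage 2 and drops those documents.
stage_3 = UrlDedupFilter(data_folder="url_dedup/dups", config=url_dedup_config)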