From a38fdc7b9f731fc397eeec4335a9c2faa8d7a46d Mon Sep 17 00:00:00 2001 From: benoit74 Date: Tue, 25 Jun 2024 11:55:39 +0000 Subject: [PATCH] Move rules to YAML instead of JSON to add support for inline comments --- .github/workflows/Publish.yaml | 7 +++++- .github/workflows/Tests.yaml | 2 +- CHANGELOG.md | 4 ++++ build_js.sh | 3 ++- docs/technical_architecture.md | 2 +- pyproject.toml | 6 +++-- rules/generate_rules.py | 6 ++--- rules/rules.json | 44 ---------------------------------- rules/rules.yaml | 39 ++++++++++++++++++++++++++++++ 9 files changed, 60 insertions(+), 53 deletions(-) delete mode 100644 rules/rules.json create mode 100644 rules/rules.yaml diff --git a/.github/workflows/Publish.yaml b/.github/workflows/Publish.yaml index 2f019212..9565f2bc 100644 --- a/.github/workflows/Publish.yaml +++ b/.github/workflows/Publish.yaml @@ -19,8 +19,13 @@ jobs: python-version-file: pyproject.toml architecture: x64 + - name: Install dependencies (and project) + run: | + pip install -U pip + pip install -e .[scripts] + - name: Generate fuzzy rules - run: pip install jinja2==3.1.3 && python rules/generate_rules.py + run: python rules/generate_rules.py - name: Build Javascript wombatSetup.js uses: addnab/docker-run-action@v3 diff --git a/.github/workflows/Tests.yaml b/.github/workflows/Tests.yaml index c8538159..a8a780e6 100644 --- a/.github/workflows/Tests.yaml +++ b/.github/workflows/Tests.yaml @@ -62,7 +62,7 @@ jobs: - name: Install dependencies (and project) run: | pip install -U pip build - pip install -e . 
+ pip install -e .[scripts] - name: Generate fuzzy rules run: python rules/generate_rules.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 7f8c2648..f086d9d3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Changed + +- Moved rules definition from JSON to YAML and documented update process (#216) + ## [2.0.2] - 2024-06-18 ### Added diff --git a/build_js.sh b/build_js.sh index f424871e..0e305b0c 100755 --- a/build_js.sh +++ b/build_js.sh @@ -14,7 +14,8 @@ python3 -m venv /local /local/bin/python -m pip install --no-cache-dir -U \ pip \ - jinja2==3.1.3 + jinja2==3.1.4 \ + PyYAML==6.0.1 /local/bin/python /src/rules/generate_rules.py diff --git a/docs/technical_architecture.md b/docs/technical_architecture.md index e9e3ab7d..dacaa6b0 100644 --- a/docs/technical_architecture.md +++ b/docs/technical_architecture.md @@ -2,7 +2,7 @@ ## Fuzzy rules -Fuzzy rules are stored in `rules/rules.json`. This configuration file is then used by `rules/generateRules.py` to generate Python and JS code. +Fuzzy rules are stored in `rules/rules.yaml`. This configuration file is then used by `rules/generate_rules.py` to generate Python and JS code. 
Should you update these fuzzy rules, you hence have to: - regenerate Python and JS files by running `python rules/generateRules.py` diff --git a/pyproject.toml b/pyproject.toml index 6e088aac..40320169 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,7 @@ [build-system] # jinja2 is required to generate JS and Python rules at build time -requires = ["hatchling", "hatch-openzim==0.2.1", "jinja2==3.1.4"] +# PyYAML is used to parse fuzzy rules and generate Python/JS code +requires = ["hatchling", "hatch-openzim==0.2.1", "jinja2==3.1.4", "PyYAML==6.0.1"] build-backend = "hatchling.build" [project] @@ -12,7 +13,7 @@ dependencies = [ "warcio==1.7.4", "requests==2.32.3", "zimscraperlib==3.3.2", - "jinja2==3.1.4", + "jinja2==3.1.4", # also update version in build-system above and in build_js.sh # to support possible brotli content in warcs, must be added separately "brotlipy==0.7.0", "cdxj_indexer==1.4.5", @@ -35,6 +36,7 @@ email="info@webrecorder.net" [project.optional-dependencies] scripts = [ "invoke==2.2.0", + "PyYAML==6.0.1", # used to parse fuzzy rules and generate Python/JS code ; also update version in build-system above and in build_js.sh ] lint = [ "black==24.4.2", diff --git a/rules/generate_rules.py b/rules/generate_rules.py index a7725b14..9df34c0f 100644 --- a/rules/generate_rules.py +++ b/rules/generate_rules.py @@ -1,17 +1,17 @@ -import json import re import sys from pathlib import Path +import yaml from jinja2 import Environment -rules_src = Path(__file__).with_name("rules.json") +rules_src = Path(__file__).with_name("rules.yaml") if not rules_src.exists(): # This skip is usefull mostly for CI operations when installing only Python deps print("Skipping rules generation, rule file is missing") sys.exit() -FUZZY_RULES = json.loads(rules_src.read_text())["fuzzyRules"] +FUZZY_RULES = yaml.safe_load(rules_src.read_text())["fuzzyRules"] PY2JS_RULE_RX = re.compile(r"\\(\d)", re.ASCII) diff --git a/rules/rules.json b/rules/rules.json deleted file mode 
100644 index f49cf230..00000000 --- a/rules/rules.json +++ /dev/null @@ -1,44 +0,0 @@ -{ - "fuzzyRules": [ - { - "pattern": ".*googlevideo.com/(videoplayback(?=\\?)).*[?&](id=[^&]+).*", - "replace": "youtube.fuzzy.replayweb.page/\\1?\\2" - }, - { - "pattern": "(?:www\\.)?youtube(?:-nocookie)?\\.com/(get_video_info\\?).*(video_id=[^&]+).*", - "replace": "youtube.fuzzy.replayweb.page/\\1\\2" - }, - { - "pattern": "i\\.ytimg\\.com\\/vi\\/(.*?)\\/.*?\\.(\\w*?)(?:\\?.*|$)", - "replace": "i.ytimg.com.fuzzy.replayweb.page/vi/\\1/thumbnail.\\2" - }, - { - "pattern": "([^?]+)\\?[\\d]+$", - "replace": "\\1" - }, - { - "pattern": "(?:www\\.)?youtube(?:-nocookie)?\\.com\\/(youtubei\\/[^?]+).*(videoId[^&]+).*", - "replace": "youtube.fuzzy.replayweb.page/\\1?\\2" - }, - { - "pattern": "(?:www\\.)?youtube(?:-nocookie)?\\.com/embed/([^?]+).*", - "replace": "youtube.fuzzy.replayweb.page/embed/\\1" - }, - { - "pattern": ".*(?:gcs-vimeo|vod|vod-progressive|vod-adaptive)\\.akamaized\\.net.*/(.+?.mp4)\\?.*range=(.*?)(?:&|$)", - "replace": "vimeo-cdn.fuzzy.replayweb.page/\\1?range=\\2" - }, - { - "pattern": ".*(?:gcs-vimeo|vod|vod-progressive)\\.akamaized\\.net.*?/([\\d/]+.mp4)$", - "replace": "vimeo-cdn.fuzzy.replayweb.page/\\1" - }, - { - "pattern": ".*player.vimeo.com/(video/[\\d]+)\\?.*", - "replace": "vimeo.fuzzy.replayweb.page/\\1" - }, - { - "pattern": ".*i\\.vimeocdn\\.com\\/(.*)\\?.*", - "replace": "i.vimeocdn.fuzzy.replayweb.page/\\1" - } - ] -} diff --git a/rules/rules.yaml b/rules/rules.yaml new file mode 100644 index 00000000..06150a5b --- /dev/null +++ b/rules/rules.yaml @@ -0,0 +1,39 @@ +# This file comes from an adaptation of rules present in +# https://github.com/webrecorder/wabac.js/blame/main/src/fuzzymatcher.js +# +# Syncing rules is done manually, based on expert knowledge, especially because in +# warc2zim we are not really fuzzy matching (searching the best entry among existing +# ones) but just rewriting to proper path. 
+# +# This file is in sync with content at commit 879018d5b96962df82340a9a57570bbc0fc67815 +# from June 9, 2024 +# +# This file should be updated at every release of warc2zim +# +# Some rules are deliberately missing because they have not been tested in warc2zim +# yet: Twitter, Washington Post, WixStatic, Facebook +# +# Generic rules are also omitted on purpose; we don't need them +# +fuzzyRules: + - pattern: .*googlevideo.com/(videoplayback(?=\?)).*[?&](id=[^&]+).* + replace: youtube.fuzzy.replayweb.page/\1?\2 + - pattern: (?:www\.)?youtube(?:-nocookie)?\.com/(get_video_info\?).*(video_id=[^&]+).* + replace : youtube.fuzzy.replayweb.page/\1\2 + - pattern: i\.ytimg\.com\/vi\/(.*?)\/.*?\.(\w*?)(?:\?.*|$) + replace : i.ytimg.com.fuzzy.replayweb.page/vi/\1/thumbnail.\2 + - pattern: ([^?]+)\?[\d]+$ + replace : \1 + - pattern: (?:www\.)?youtube(?:-nocookie)?\.com\/(youtubei\/[^?]+).*(videoId[^&]+).* + replace : youtube.fuzzy.replayweb.page/\1?\2 + - pattern: (?:www\.)?youtube(?:-nocookie)?\.com/embed/([^?]+).* + replace : youtube.fuzzy.replayweb.page/embed/\1 + # next one is a custom warc2zim rule intended to fix Vimeo support + - pattern: .*(?:gcs-vimeo|vod|vod-progressive|vod-adaptive)\.akamaized\.net.*/(.+?.mp4)\?.*range=(.*?)(?:&|$) + replace : vimeo-cdn.fuzzy.replayweb.page/\1?range=\2 + - pattern: .*(?:gcs-vimeo|vod|vod-progressive)\.akamaized\.net.*?/([\d/]+.mp4)$ + replace : vimeo-cdn.fuzzy.replayweb.page/\1 + - pattern: .*player.vimeo.com/(video/[\d]+)\?.* + replace : vimeo.fuzzy.replayweb.page/\1 + - pattern: .*i\.vimeocdn\.com\/(.*)\?.* + replace : i.vimeocdn.fuzzy.replayweb.page/\1