From 47747b06b2f50f3b13708837932e6e19f2c2fe43 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Tue, 25 Jun 2024 12:14:57 +0000 Subject: [PATCH] Add explanation about fuzzy rules configuration --- rules/rules.yaml | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/rules/rules.yaml b/rules/rules.yaml index eb917a6..b4d852f 100644 --- a/rules/rules.yaml +++ b/rules/rules.yaml @@ -1,3 +1,20 @@ +# This file comes from an adaptation of rules present in +# https://github.com/webrecorder/wabac.js/blame/main/src/fuzzymatcher.js +# +# Syncing rules is done manually, based on expert knowledge, especially because in +# warc2zim we are not really fuzzy matching (searching the best entry among existing +# ones) but just rewriting to the proper path. +# +# This file is in sync with content at commit 879018d5b96962df82340a9a57570bbc0fc67815 +# from June 9, 2024 +# +# This file should be updated at every release of warc2zim +# +# Some rules are intentionally missing because they have not been tested in warc2zim yet: Twitter, +# Washington Post, WixStatic, Facebook +# +# Generic rules are also omitted on purpose; we don't need them +# fuzzyRules: - pattern: .*googlevideo.com/(videoplayback(?=\?)).*[?&](id=[^&]+).* replace: youtube.fuzzy.replayweb.page/\1?\2