diff --git a/.github/workflows/Publish.yml b/.github/workflows/Publish.yml index 7061f40..a59ca22 100644 --- a/.github/workflows/Publish.yml +++ b/.github/workflows/Publish.yml @@ -16,16 +16,19 @@ jobs: - name: Set up Python uses: actions/setup-python@v4 with: - python-version-file: pyproject.toml + python-version-file: scraper/pyproject.toml architecture: x64 - name: Build packages + working-directory: scraper run: | pip install -U pip build python -m build --sdist --wheel - name: Upload to PyPI uses: pypa/gh-action-pypi-publish@release/v1.8 + with: + packages-dir: scraper/dist/ - name: Build and push Docker image uses: openzim/docker-publish-action@v10 diff --git a/.github/workflows/QA.yml b/.github/workflows/QA.yml index a9d2172..106548f 100644 --- a/.github/workflows/QA.yml +++ b/.github/workflows/QA.yml @@ -7,7 +7,7 @@ on: - main jobs: - check-qa: + check-scraper-qa: runs-on: ubuntu-22.04 steps: @@ -16,19 +16,49 @@ jobs: - name: Set up Python uses: actions/setup-python@v4 with: - python-version-file: pyproject.toml + python-version-file: scraper/pyproject.toml architecture: x64 - - name: Install dependencies (and project) + - name: Install dependencies + working-directory: scraper run: | pip install -U pip pip install -e .[lint,check,scripts,test] - name: Check black formatting + working-directory: scraper run: inv lint-black - name: Check ruff + working-directory: scraper run: inv lint-ruff - name: Check pyright + working-directory: scraper run: inv check-pyright + + check-zimui-qa: + runs-on: ubuntu-22.04 + + steps: + - uses: actions/checkout@v3 + + - name: Set up Node.js + uses: actions/setup-node@v3 + with: + node-version-file: zimui/.node-version + + - name: Install JS dependencies + working-directory: zimui + run: | + yarn install + + - name: Check prettier + working-directory: zimui + run: | + yarn format + + - name: Check eslint + working-directory: zimui + run: | + yarn lint diff --git a/.github/workflows/Tests.yml b/.github/workflows/Tests.yml index 
838269f..3c6276f 100644 --- a/.github/workflows/Tests.yml +++ b/.github/workflows/Tests.yml @@ -7,7 +7,7 @@ on: - main jobs: - run-tests: + test-scraper: runs-on: ubuntu-22.04 steps: @@ -16,15 +16,17 @@ jobs: - name: Set up Python uses: actions/setup-python@v4 with: - python-version-file: pyproject.toml + python-version-file: scraper/pyproject.toml architecture: x64 - name: Install dependencies (and project) + working-directory: scraper run: | pip install -U pip pip install -e .[test,scripts] - name: Run the tests + working-directory: scraper run: inv coverage --args "-vvv" - name: Upload coverage report to codecov @@ -32,7 +34,7 @@ jobs: with: token: ${{ secrets.CODECOV_TOKEN }} - build_python: + build-scraper: runs-on: ubuntu-22.04 steps: - uses: actions/checkout@v3 @@ -40,15 +42,36 @@ jobs: - name: Set up Python uses: actions/setup-python@v4 with: - python-version-file: pyproject.toml + python-version-file: scraper/pyproject.toml architecture: x64 - name: Ensure we can build Python targets + working-directory: scraper run: | pip install -U pip build python3 -m build --sdist --wheel - build_docker: + build-zimui: + runs-on: ubuntu-22.04 + steps: + - uses: actions/checkout@v3 + + - name: Set up Node.js + uses: actions/setup-node@v3 + with: + node-version-file: zimui/.node-version + + - name: Install dependencies + working-directory: zimui + run: | + yarn install + + - name: Build + working-directory: zimui + run: | + yarn build + + build-docker: runs-on: ubuntu-22.04 steps: - uses: actions/checkout@v3 diff --git a/.gitignore b/.gitignore index db971bd..ece9e81 100644 --- a/.gitignore +++ b/.gitignore @@ -368,16 +368,6 @@ pyrightconfig.json # assets that we download .dockerignore -src/kolibri2zim/templates/assets/bootstrap/ -src/kolibri2zim/templates/assets/pdfjs/ -src/kolibri2zim/templates/assets/videojs/ -src/kolibri2zim/templates/assets/jquery.min.js -src/kolibri2zim/templates/assets/ogvjs/ -src/kolibri2zim/templates/assets/videojs-ogvjs.js 
-src/kolibri2zim/templates/assets/epub.min.js -src/kolibri2zim/templates/assets/bootstrap-icons/ -src/kolibri2zim/templates/assets/jszip.min.js -src/kolibri2zim/templates/assets/perseus/ # output dir output diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 577ac69..57b468a 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -4,8 +4,24 @@ repos: - repo: https://github.com/pre-commit/pre-commit-hooks rev: v4.4.0 hooks: - - id: trailing-whitespace - - id: end-of-file-fixer + - id: trailing-whitespace + - id: end-of-file-fixer +- repo: https://github.com/pre-commit/mirrors-prettier + rev: v3.0.3 + hooks: + - id: prettier + files: zimui\/.*$ # files in zimui folder +- repo: https://github.com/pre-commit/mirrors-eslint + rev: v8.51.0 + hooks: + - id: eslint + types: [file] + files: zimui\/src\/.*(?:\.[jt]sx?|\.vue)$ # *.js, *.jsx, *.ts, *.tsx, *.vue in zimui/src folder + args: + - --ignore-path + - zimui/.eslintignore + - --config + - zimui/.eslintrc.cjs - repo: https://github.com/psf/black rev: "23.3.0" hooks: diff --git a/CHANGELOG.md b/CHANGELOG.md index 59efa03..2c1d0be 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,31 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Small fixes in invoke tasks ### Changed +- Scraper (Python code) has been moved to the scraper subfolder +- Vue.JS is now used as main UI framework ; all its code is in the zimui subfolder ; it is rendered with Vite to produce a static website +- QA and Tests workflows have been adapted + - to the new folder structure + - to also QA and Test the Vue.JS part +- precommit hooks have been configured for the Vue.JS part +- Dockerfile has been adapted to first build the Vue.JS part in a dedicated stage and then embed the generated files into the final Python-based image +- Topics are stored as JSON files in the ZIM + - JSON is generated by pydantic + - these files are consumed by the Vue.JS UI + - content (video, audio, pdf, 
epub, ...) is still rendered by Jinja2 as before +- URLs are meaningful slugs + - generated by Python slugify lib + - from Kolibri node title + - should two distinct nodes have the same title resulting in the same slug, conflicts are handled with a _1, _2, ... suffix +- changes in the ZIM "folder" structure: + - files generated by Vite are placed in / + - thumbnails are placed in /thumbnails + - JSON files generated to render topics are placed in /topics + - most Kolibri content (video, audio, ePub, PDF) are placed in /files (some content is still placed at the root to not break some stuff which was found hard to fix for now, will be tackled in specific issues for each content type) +- legacy MANIFEST.in has been deleted (left-over from migration to hatch) +- is_front property has been adjusted when adding the item to the ZIM +- one new CLI argument --zimui-dist to specify the folder where zimui has been built (by Vite) + + - Dockerfile: split installation of Python dependencies for more efficiency - Github workflow: publish `dev` tag on every push to `main` branch - Github workflow: build Docker image + test its startup diff --git a/Dockerfile b/Dockerfile index 2b96101..b0414e9 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,3 +1,10 @@ +FROM node:20-alpine as zimui + +WORKDIR /src +COPY zimui /src +RUN yarn install --frozen-lockfile +RUN yarn build + FROM python:3.11-bookworm LABEL org.opencontainers.image.source https://github.com/openzim/kolibri @@ -12,22 +19,28 @@ RUN apt-get update \ pip # Copy pyproject.toml and its dependencies -COPY pyproject.toml README.md get_js_deps.sh hatch_build.py /src/ -COPY src/kolibri2zim/__about__.py /src/src/kolibri2zim/__about__.py +COPY README.md /src/ +COPY scraper/pyproject.toml scraper/get_web_deps.sh scraper/hatch_build.py /src/scraper/ +COPY scraper/src/kolibri2zim/__about__.py /src/scraper/src/kolibri2zim/__about__.py # Install Python dependencies -RUN pip install --no-cache-dir /src +RUN pip install --no-cache-dir 
/src/scraper # Copy code + associated artifacts -COPY src /src/src -COPY *.md LICENSE *.py /src/ +COPY scraper/src /src/scraper/src +COPY *.md LICENSE /src/ # Install + cleanup -RUN pip install --no-cache-dir /src \ - && rm -rf /src +RUN pip install --no-cache-dir /src/scraper \ + && rm -rf /src/scraper + +# Copy zimui build output +COPY --from=zimui /src/dist /src/zimui # default output directory RUN mkdir -p /output WORKDIR /output +ENV KOLIBRI_ZIMUI_DIST=/src/zimui + CMD ["kolibri2zim", "--help"] diff --git a/MANIFEST.in b/MANIFEST.in deleted file mode 100644 index 15b313b..0000000 --- a/MANIFEST.in +++ /dev/null @@ -1,4 +0,0 @@ -include *.md -include get_js_deps.sh -include requirements.txt -recursive-include kolibri2zim * diff --git a/README.md b/README.md index 3b3a3e3..405f7d2 100644 --- a/README.md +++ b/README.md @@ -16,30 +16,38 @@ HTML files folder of it before creating a ZIM off of it. Requirements ------------ +* Node 20.x +* Python 3.11 * [`ffmpeg`](https://ffmpeg.org/) for video transcoding (only used with `--use-webm` or `--low-quality`). -* `curl` and `unzip` to install Javascript dependencies. See `get_js_deps.sh` if you want to do it manually. +* `curl` and `unzip` to install Javascript dependencies. See `get_web_deps.sh` if you want to do it manually. Installation ------------ -## Virtualenv +### Virtualenv -`kolibri2zim` is a Python3 software. If you are not using the +`kolibri2zim` is a Python3 software. If you are not using the [Docker](https://docker.com) image, you are advised to use it in a virtual environment to avoid installing software dependencies on your system. ```bash python3 -m venv env # Create virtualenv -source env/bin/Activate # Activate the virtualenv +source env/bin/activate # Activate the virtualenv pip3 install kolibri2zim # Install dependencies kolibri2zim --help # Display kolibri2zim help ``` Call `deactivate` to quit the virtual environment. -See `requirements.txt` for the list of python dependencies. 
+See `pyproject.toml` for the list of python dependencies. -## Docker +To test epubs and pdfs rendering, a potentially useful command is: + +```bash +kolibri2zim --name "Biblioteca Elejandria" --output /output --tmp-dir /tmp --zim-file Biblioteca_Elejandria.zim --channel-id "fed29d60e4d84a1e8dcfc781d920b40e" --node-ids 'd92c07655128458f8248416154b18a68,89fe2f86ee3f4fbaa7fb2bf9bd56d088,75f99e6b97d14b14a4e74762ad77391f,89fe2f86ee3f4fbaa7fb2bf9bd56d088' +``` + +### Docker ```bash docker run -v my_dir:/output ghcr.io/openzim/kolibri2zim kolibri2zim --help @@ -56,8 +64,34 @@ Development Before contributing be sure to check out the [CONTRIBUTING.md](CONTRIBUTING.md) guidelines. -To test epubs and pdfs rendering, a potential usefull command is: -```bash +Some useful test channels: + +- 7f744ce8d28b471eaf663abd60c92267: a very minimal channel with all kinds of content +- 9f15f4e9aeaa48b5ae271e5749d6fe80: a small channel with significantly nested items and all kinds of content + +### Building and running the scraper locally + +You have to: + +- build the `zimui` frontend which will be embedded inside the ZIM (and redo it every time you make modifications to the `zimui`) +- run the `scraper` to retrieve the Kolibri channel and build the ZIM + +Sample commands: + +``` +cd zimui +yarn install +yarn build +cd ../scraper +hatch run kolibri2zim --name "Biblioteca Elejandria" --output output --zim-file Biblioteca_Elejandria.zim --channel-id "fed29d60e4d84a1e8dcfc781d920b40e" --node-ids 'd92c07655128458f8248416154b18a68,89fe2f86ee3f4fbaa7fb2bf9bd56d088,75f99e6b97d14b14a4e74762ad77391f,89fe2f86ee3f4fbaa7fb2bf9bd56d088' +``` + +### Running scraper with Docker + +Run from the official version (published on GHCR.io); the ZIM will be available in the `output` sub-folder of the current working directory.
+ +``` +docker run --rm -it -v $(pwd)/output:/output ghcr.io/openzim/kolibri2zim:latest kolibri2zim --name "Biblioteca Elejandria" --output /output --tmp-dir /tmp --zim-file Biblioteca_Elejandria.zim --channel-id "fed29d60e4d84a1e8dcfc781d920b40e" --node-ids 'd92c07655128458f8248416154b18a68,89fe2f86ee3f4fbaa7fb2bf9bd56d088,75f99e6b97d14b14a4e74762ad77391f,89fe2f86ee3f4fbaa7fb2bf9bd56d088' ``` diff --git a/dump_channel_to_fs.py b/scraper/dump_channel_to_fs.py similarity index 100% rename from dump_channel_to_fs.py rename to scraper/dump_channel_to_fs.py diff --git a/get_js_deps.sh b/scraper/get_web_deps.sh similarity index 89% rename from get_js_deps.sh rename to scraper/get_web_deps.sh index 8954e5b..16761b8 100755 --- a/get_js_deps.sh +++ b/scraper/get_web_deps.sh @@ -25,7 +25,7 @@ fi SCRIPT_PATH="$( cd "$(dirname "$0")" ; pwd -P )" ASSETS_PATH="${SCRIPT_PATH}/src/kolibri2zim/templates/assets" -echo "About to download JS assets to ${ASSETS_PATH}" +echo "About to download web assets to ${ASSETS_PATH}" echo "getting pdf.js" curl -L -O https://github.com/mozilla/pdf.js/releases/download/v2.6.347/pdfjs-2.6.347-es5-dist.zip @@ -100,7 +100,12 @@ mkdir -p $ASSETS_PATH/perseus mv standalone-perseus-1.1.4/* $ASSETS_PATH/perseus rm -rf standalone-perseus-1.1.4/ rm -f v1.1.4.zip -sed -i $SEDEXT '1s/""/"assets\/perseus\/"/' $ASSETS_PATH/perseus/build/frame-perseus.js +sed -i $SEDEXT '1s/""/"..\/assets\/perseus\/"/' $ASSETS_PATH/perseus/build/frame-perseus.js + +echo "getting lato font" +mkdir -p $ASSETS_PATH/fonts +curl -L -o $ASSETS_PATH/fonts/lato-v24-latin-regular.ttf https://dev.kiwix.org/fonts/lato/lato-v24-latin-regular.ttf +curl -L -o $ASSETS_PATH/fonts/lato-v24-latin-regular.woff2 https://dev.kiwix.org/fonts/lato/lato-v24-latin-regular.woff2 if command -v fix_ogvjs_dist > /dev/null; then echo "fixing JS files" diff --git a/hatch_build.py b/scraper/hatch_build.py similarity index 86% rename from hatch_build.py rename to scraper/hatch_build.py index 86dfa52..52f00fc 
100644 --- a/hatch_build.py +++ b/scraper/hatch_build.py @@ -8,7 +8,7 @@ logger = logging.getLogger(__name__) # update list in constants.py as well -JS_DEPS = [ +WEB_DEPS = [ "pdfjs", "videojs", "ogvjs", @@ -19,6 +19,8 @@ "jszip.min.js", "jquery.min.js", "videojs-ogvjs.js", + "lato-v24-latin-regular.woff2", + "lato-v24-latin-regular.ttf", ] @@ -29,13 +31,13 @@ def initialize(self, version, build_data): return Path(self.root).joinpath("src/kolibri2zim/templates/assets") subprocess.run( - str(Path(self.root).joinpath("get_js_deps.sh")), + str(Path(self.root).joinpath("get_web_deps.sh")), check=True, ) return super().initialize(version, build_data) def deps_already_installed(self) -> bool: - for dep in JS_DEPS: + for dep in WEB_DEPS: if ( not Path(self.root) .joinpath("src/kolibri2zim/templates/assets") diff --git a/pyproject.toml b/scraper/pyproject.toml similarity index 97% rename from pyproject.toml rename to scraper/pyproject.toml index a002b7b..262b334 100644 --- a/pyproject.toml +++ b/scraper/pyproject.toml @@ -8,7 +8,7 @@ authors = [{ name = "Kiwix", email = "dev@kiwix.org" }] keywords = ["kiwix", "zim", "offline", "kolibri"] requires-python = ">=3.11" description = "Make ZIM file from Kolibri Channels" -readme = "README.md" +readme = "../README.md" license = { text = "GPL-3.0-or-later" } classifiers = [ "Programming Language :: Python :: 3", @@ -22,6 +22,9 @@ dependencies = [ "pif==0.8.2", "beautifulsoup4==4.9.3", "retrying==1.3.4", + "pydantic==2.4.2", + "python-slugify==8.0.1", + "pyhumps==3.8.0", ] dynamic = ["version"] @@ -105,6 +108,7 @@ all = "inv checkall --args '{args}'" [tool.black] line-length = 88 target-version = ['py311'] +exclude="(src/kolibri2zim/templates/.*|.hatch/.*)" [tool.ruff] target-version = "py311" diff --git a/src/kolibri2zim/__about__.py b/scraper/src/kolibri2zim/__about__.py similarity index 100% rename from src/kolibri2zim/__about__.py rename to scraper/src/kolibri2zim/__about__.py diff --git a/src/kolibri2zim/__init__.py 
b/scraper/src/kolibri2zim/__init__.py similarity index 100% rename from src/kolibri2zim/__init__.py rename to scraper/src/kolibri2zim/__init__.py diff --git a/src/kolibri2zim/__main__.py b/scraper/src/kolibri2zim/__main__.py similarity index 100% rename from src/kolibri2zim/__main__.py rename to scraper/src/kolibri2zim/__main__.py diff --git a/src/kolibri2zim/constants.py b/scraper/src/kolibri2zim/constants.py similarity index 94% rename from src/kolibri2zim/constants.py rename to scraper/src/kolibri2zim/constants.py index 53e0aa5..18e134a 100644 --- a/src/kolibri2zim/constants.py +++ b/scraper/src/kolibri2zim/constants.py @@ -21,7 +21,7 @@ STUDIO_URL = os.getenv("STUDIO_URL", STUDIO_DEFAULT_BASE_URL) # when modifiying this list, update list in hatch_build.py as well -JS_DEPS: list[str] = [ +WEB_DEPS: list[str] = [ "pdfjs", "videojs", "ogvjs", @@ -32,6 +32,8 @@ "jszip.min.js", "jquery.min.js", "videojs-ogvjs.js", + "lato-v24-latin-regular.woff2", + "lato-v24-latin-regular.ttf", ] diff --git a/src/kolibri2zim/database.py b/scraper/src/kolibri2zim/database.py similarity index 98% rename from src/kolibri2zim/database.py rename to scraper/src/kolibri2zim/database.py index 7ecfa6e..bdcc17d 100644 --- a/src/kolibri2zim/database.py +++ b/scraper/src/kolibri2zim/database.py @@ -102,7 +102,7 @@ def get_node_children(self, node_id, left=None, right=None): right = node["right"] for row in self.get_rows( - "SELECT id, title, kind " + "SELECT id, title, description, kind, lft as left, rght as right " "FROM content_contentnode WHERE lft > ? AND rght < ? " "AND parent_id=?" 
"ORDER BY level ASC", diff --git a/src/kolibri2zim/debug.py b/scraper/src/kolibri2zim/debug.py similarity index 100% rename from src/kolibri2zim/debug.py rename to scraper/src/kolibri2zim/debug.py diff --git a/src/kolibri2zim/entrypoint.py b/scraper/src/kolibri2zim/entrypoint.py similarity index 95% rename from src/kolibri2zim/entrypoint.py rename to scraper/src/kolibri2zim/entrypoint.py index 34bc34b..b5f89a3 100755 --- a/src/kolibri2zim/entrypoint.py +++ b/scraper/src/kolibri2zim/entrypoint.py @@ -2,6 +2,7 @@ # vim: ai ts=4 sts=4 et sw=4 nu import argparse +import os import sys from kolibri2zim.constants import NAME, SCRAPER, Global, get_logger, set_debug @@ -127,6 +128,15 @@ def main(): "Receives all data (storage space)", ) + parser.add_argument( + "--zimui-dist", + type=str, + help=( + "Directory containing Vite build output from the Zim UI Vue.JS application" + ), + default=os.getenv("KOLIBRI_ZIMUI_DIST", "../zimui/dist"), + ) + parser.add_argument( "--zim-file", help="ZIM file name (based on --name if not provided)", diff --git a/src/kolibri2zim/nodes.py b/scraper/src/kolibri2zim/nodes.py similarity index 100% rename from src/kolibri2zim/nodes.py rename to scraper/src/kolibri2zim/nodes.py diff --git a/src/kolibri2zim/processing.py b/scraper/src/kolibri2zim/processing.py similarity index 100% rename from src/kolibri2zim/processing.py rename to scraper/src/kolibri2zim/processing.py diff --git a/scraper/src/kolibri2zim/schemas.py b/scraper/src/kolibri2zim/schemas.py new file mode 100644 index 0000000..fae0602 --- /dev/null +++ b/scraper/src/kolibri2zim/schemas.py @@ -0,0 +1,52 @@ +from humps import camelize +from pydantic import BaseModel + + +class CamelModel(BaseModel): + """Model than transform Python snake_case into JSON camelCase""" + + class Config: + alias_generator = camelize + populate_by_name = True + + +class TopicSubSection(CamelModel): + """One subclass to serialize data about one Kolibri topic""" + + slug: str + title: str + description: str + 
kind: str + thumbnail: str | None + + +class TopicSection(CamelModel): + """Another subclass to serialize data about one Kolibri topic""" + + slug: str + title: str + description: str + kind: str + thumbnail: str | None + subsections: list[TopicSubSection] + + +class Topic(CamelModel): + """Class to serialize data about one Kolibri topic + + One topic is composed of parents, sections and subsections. + This is already preprocessed information, closely adapted + to current UI needs + """ + + parents_slugs: list[str] + title: str + description: str + sections: list[TopicSection] + thumbnail: str | None + + +class Channel(CamelModel): + """Class to serialize data about the Kolibri channel""" + + root_slug: str diff --git a/src/kolibri2zim/scraper.py b/scraper/src/kolibri2zim/scraper.py similarity index 84% rename from src/kolibri2zim/scraper.py rename to scraper/src/kolibri2zim/scraper.py index e812305..1b68c5a 100644 --- a/src/kolibri2zim/scraper.py +++ b/scraper/src/kolibri2zim/scraper.py @@ -17,6 +17,7 @@ from bs4 import BeautifulSoup from kiwixstorage import KiwixStorage from pif import get_public_ip +from slugify import slugify from zimscraperlib.constants import ( MAXIMUM_DESCRIPTION_METADATA_LENGTH as MAX_DESC_LENGTH, ) @@ -32,7 +33,7 @@ from zimscraperlib.zim.creator import Creator from zimscraperlib.zim.items import StaticItem -from kolibri2zim.constants import JS_DEPS, ROOT_DIR, STUDIO_URL, Global, get_logger +from kolibri2zim.constants import ROOT_DIR, STUDIO_URL, WEB_DEPS, Global, get_logger from kolibri2zim.database import KolibriDB from kolibri2zim.debug import ( ON_DISK_THRESHOLD, @@ -40,6 +41,7 @@ get_size_and_mime, safer_reencode, ) +from kolibri2zim.schemas import Channel, Topic, TopicSection, TopicSubSection logger = get_logger() options = [ @@ -131,6 +133,7 @@ def go(option): if go("tmp_dir"): Path(go("tmp_dir")).mkdir(parents=True, exist_ok=True) # pyright: ignore self.build_dir = Path(tempfile.mkdtemp(dir=go("tmp_dir"))) + self.zimui_dist = 
Path(go("zimui_dist") or "../zimui/dist") # performances options self.nb_threads = int(go("threads") or 1) @@ -155,19 +158,20 @@ def go(option): loader=jinja2.FileSystemLoader(str(self.templates_dir)), autoescape=True ) + # a dictionnary mapping node_id (keys) to slug (values) + self.nodes_ids_to_slugs: dict[str, str] = {} + @property def templates_dir(self): return ROOT_DIR.joinpath("templates") def add_local_files(self, root_path, folder): """recursively add local files from {folder} starting at {path}""" - non_front = ("viewer.html", "epub_embed.html") for fpath in folder.iterdir(): path = "/".join([root_path, fpath.name]) if fpath.is_file(): - mimetype = "text/html;raw=true" if fpath.name in non_front else None self.creator.add_item_for( - path=path, title="", fpath=fpath, mimetype=mimetype + path=path, title="", fpath=fpath, is_front=False ) logger.debug(f"Adding {path}") else: @@ -188,6 +192,37 @@ def schedule_node(item): if self.node_ids is None or node["id"] in self.node_ids: schedule_node((node["id"], node["kind"])) + def get_or_create_node_slug(self, node) -> str: + """Compute a unique slug to be used as URL for a given node""" + if node["id"] in self.nodes_ids_to_slugs: + return self.nodes_ids_to_slugs[node["id"]] + slug = slugify(str(node.get("title", node["id"]))) + if slug in self.nodes_ids_to_slugs.values(): + suffix = 1 + while True: + if f"{slug}_{suffix}" not in self.nodes_ids_to_slugs.values(): + break + suffix += 1 + slug = f"{slug}_{suffix}" + self.nodes_ids_to_slugs[node["id"]] = slug + return slug + + def add_channel_json(self): + node = self.db.get_node( + node_id=self.root_id, with_parents=True, with_children=True + ) + + with self.creator_lock: + self.creator.add_item_for( + path="channel.json", + title=node["title"], + content=Channel( + root_slug=self.get_or_create_node_slug(node) + ).model_dump_json(by_alias=True, indent=2), + mimetype="application/json", + is_front=False, + ) + def add_node(self, item): """process a content node from 
the tuple in queue""" node_id, kind = item @@ -202,18 +237,18 @@ def add_node(self, item): # add thumbnail to zim if there's one for this node thumbnail = self.db.get_node_thumbnail(node_id) if thumbnail: - self.funnel_file(thumbnail["id"], thumbnail["ext"]) + self.funnel_file(thumbnail["id"], thumbnail["ext"], "thumbnails/") # fire the add_{kind}_node() method which will actually process it handler(node_id) - def funnel_file(self, fid, fext): + def funnel_file(self, fid, fext, path_prefix=""): """directly add a Kolibri file to the ZIM using same name""" url, fname = get_kolibri_url_for(fid, fext) size, mimetype = get_size_and_mime(url) item_kw = { - "path": fname, + "path": path_prefix + fname, "title": "", "mimetype": mimetype, "delete_fpath": True, @@ -307,19 +342,52 @@ def add_topic_node(self, node_id): Topic nodes are used only for hierarchy and solely contains metadata""" # fetch details including parents for breadcrumb and children to link to - node = self.db.get_node(node_id, with_parents=True, with_children=True) + node = self.db.get_node(node_id=node_id, with_parents=True, with_children=True) + node_slug = self.get_or_create_node_slug(node) - html = self.jinja2_env.get_template("topic.html").render( - node_id=node_id, **node - ) with self.creator_lock: self.creator.add_item_for( - path=node_id, + path=f"topics/{node_slug}.json", title=node["title"], - content=html, - mimetype="text/html", + content=Topic( + parents_slugs=[ + self.get_or_create_node_slug(parent) + for parent in node["parents"] + ], + title=node["title"], + description=node["description"], + sections=[ + TopicSection( + slug=self.get_or_create_node_slug(section), + title=section["title"], + description=section["description"], + kind=section["kind"], + thumbnail=self.db.get_thumbnail_name(section["id"]), + subsections=[ + TopicSubSection( + slug=self.get_or_create_node_slug(subsection), + title=subsection["title"], + description=subsection["description"], + kind=subsection["kind"], + 
thumbnail=self.db.get_thumbnail_name( + subsection["id"] + ), + ) + for subsection in self.db.get_node_children( + section["id"], + section["left"], + section["right"], + ) + ], + ) + for section in node["children"] + ], + thumbnail=self.db.get_thumbnail_name(node_id), + ).model_dump_json(by_alias=True, indent=2), + mimetype="application/json", + is_front=False, ) - logger.debug(f"Added topic #{node_id}") + logger.debug(f"Added topic #{node_id} - {node_slug}") def add_video_node(self, node_id): """Add content from this `video` node to zim @@ -424,6 +492,7 @@ def add_video_node(self, node_id): ) node = self.db.get_node(node_id, with_parents=True) + node_slug = self.get_or_create_node_slug(node) html = self.jinja2_env.get_template("video.html").render( node_id=node_id, video_filename=video_filename, @@ -435,12 +504,13 @@ def add_video_node(self, node_id): ) with self.creator_lock: self.creator.add_item_for( - path=node_id, + path=f"files/{node_slug}", title=node["title"], content=html, mimetype="text/html", + is_front=True, ) - logger.debug(f"Added video #{node_id}") + logger.debug(f"Added video #{node_id} - {node_slug}") def add_video_upon_completion(self, future): """adds the converted video inside this future to the zim @@ -528,6 +598,7 @@ def add_audio_node(self, node_id): self.funnel_file(file["id"], file["ext"]) node = self.db.get_node(node_id, with_parents=True) + node_slug = self.get_or_create_node_slug(node) html = self.jinja2_env.get_template("audio.html").render( node_id=node_id, filename=filename_for(file), @@ -538,12 +609,13 @@ def add_audio_node(self, node_id): ) with self.creator_lock: self.creator.add_item_for( - path=node_id, + path=f"files/{node_slug}", title=node["title"], content=html, mimetype="text/html", + is_front=True, ) - logger.debug(f"Added audio #{node_id}") + logger.debug(f"Added audio #{node_id} - {node_slug}") def add_exercise_node(self, node_id): """Add content from this `exercise` node to zim @@ -590,22 +662,25 @@ def 
add_exercise_node(self, node_id): ) assessment_items.append(perseus_content) + node = self.db.get_node(node_id, with_parents=True, with_children=False) + node_slug = self.get_or_create_node_slug(node) + # add all support files to ZIM for ark_member in zip_ark.namelist(): if ark_member == manifest_name: continue - path = f"{node_id}/{ark_member}" + path = f"files/{node_id}/{ark_member}" with self.creator_lock: self.creator.add_item_for( path=path, title="", content=read_from_zip(zip_ark, ark_member), + is_front=False, ) logger.debug(f"Added exercise support file {path}") # prepare and add exercise HTML article - node = self.db.get_node(node_id, with_parents=True, with_children=False) html = self.jinja2_env.get_template("perseus_exercise.html").render( node_id=node_id, perseus_content=f"[{', '.join(assessment_items)}]", @@ -614,12 +689,13 @@ def add_exercise_node(self, node_id): ) with self.creator_lock: self.creator.add_item_for( - path=node_id, + path=f"files/{node_slug}", title=node["title"], content=html, mimetype="text/html", + is_front=True, ) - logger.debug(f"Added exercise node #{node_id}") + logger.debug(f"Added exercise node #{node_id} - {node_slug}") def add_document_node(self, node_id): """Add content from this `document` node to zim @@ -640,9 +716,12 @@ def add_document_node(self, node_id): def target_for(file): filename = filename_for(file) if file["ext"] == "pdf": - return f"./assets/pdfjs/web/viewer.html?file=../../../{filename}" - if file["ext"] == "epub": - return f"./assets/epub_embed.html?url=../{filename}" + return f"../assets/pdfjs/web/viewer.html?file=../../../files/{filename}" + if get_is_epub(file): + return f"../assets/epub_embed.html?url=../files/{filename}" + + def get_is_epub(file): + return file["ext"] == "epub" # record the actual document files = self.db.get_node_files(node_id, thumbnail=False) @@ -662,10 +741,12 @@ def target_for(file): alt_document = None for file in files: - self.funnel_file(file["id"], file["ext"]) + 
self.funnel_file(file["id"], file["ext"], path_prefix="files/") file["target"] = target_for(file) node = self.db.get_node(node_id, with_parents=True) + node_slug = self.get_or_create_node_slug(node) + # convert generator to list as we might read it twice node["parents"] = list(node["parents"]) @@ -678,16 +759,18 @@ def target_for(file): for is_alt in options: html = self.jinja2_env.get_template("document.html").render( node_id=node_id, + node_slug=node_slug, main_document=filename_for(main_document), main_document_ext=main_document["ext"], alt_document=filename_for(alt_document) if alt_document else None, alt_document_ext=alt_document["ext"] if alt_document else None, target=target_for(alt_document if is_alt else main_document), is_alt=is_alt, + is_epub=get_is_epub(alt_document if is_alt else main_document), **node, ) with self.creator_lock: - path = node_id + path = f"files/{node_slug}" if is_alt: path += "_alt" self.creator.add_item_for( @@ -695,8 +778,9 @@ def target_for(file): title=node["title"], content=html, mimetype="text/html", + is_front=is_alt, ) - logger.debug(f"Added document #{node_id}") + logger.debug(f"Added document #{node_id} - {node_slug}") def add_html5_node(self, node_id): """Add content from this `html5` node to zim @@ -713,6 +797,9 @@ def add_html5_node(self, node_id): if not file: return + node = self.db.get_node(node_id) + node_slug = self.get_or_create_node_slug(node) + # download ZIP file to memory ark_url, ark_name = get_kolibri_url_for(file["id"], file["ext"]) ark_data = io.BytesIO() @@ -724,8 +811,11 @@ def add_html5_node(self, node_id): if not self.dedup_html_files: with self.creator_lock: self.creator.add_item_for( - path=f"{node_id}/{ark_member}", + path=f"files/{node_slug}/{ark_member}" + if ark_member != "index.html" + else f"files/{node_slug}", content=zip_ark.open(ark_member).read(), + is_front=(ark_member == "index.html"), ) continue @@ -739,16 +829,20 @@ def add_html5_node(self, node_id): self.creator.add_item_for( 
path=f"html5_files/{content_hash}", content=content, + is_front=False, ) # add redirect to the unique sum-based entry for that file's path with self.creator_lock: self.creator.add_redirect( - path=f"{node_id}/{ark_member}", + path=f"files/{node_slug}/{ark_member}" + if ark_member != "index.html" + else f"files/{node_slug}", target_path=f"html5_files/{content_hash}", + is_front=ark_member == "index.html", ) - logger.debug(f"Added HTML5 node #{node_id}") + logger.debug(f"Added HTML5 node #{node_id} - {node_slug}") def run(self): if self.s3_url_with_credentials and not self.s3_credentials_ok(): @@ -811,7 +905,7 @@ def run(self): return 1 self.creator = Creator( filename=self.output_dir.joinpath(self.clean_fname), - main_path=self.root_id, + main_path="home", ignore_duplicates=True, ) self.creator.config_metadata( @@ -830,9 +924,11 @@ def run(self): succeeded = False try: self.add_favicon() + self.add_zimui() + self.add_custom_about_and_css() - # add static files + # add assets files logger.info("Adding local files (assets)") self.add_local_files("assets", self.templates_dir.joinpath("assets")) @@ -859,6 +955,8 @@ def run(self): # only awaits future completion and doesn't include callbacks self.videos_executor.shutdown() + self.add_channel_json() + succeeded = ( not result.not_done and sum([1 if fs.exception() else 0 for fs in result.done]) == 0 @@ -867,7 +965,8 @@ def run(self): # DEBUG: raise first exception if not succeeded and result.done: logger.info( - f"FAILURE not_done={len(result.not_done)} done={len(result.done)}" + f"FAILURE not_done={len(result.not_done)}" + f"done={len(result.done)}" ) for future in result.done: if future.exception(): @@ -1014,8 +1113,25 @@ def retrieve_favicon(self): def add_favicon(self): self.creator.add_illustration(96, self.favicon_96_fpath.read_bytes()) - self.creator.add_item_for("favicon.png", fpath=self.favicon_96_fpath) - self.creator.add_item_for("favicon.ico", fpath=self.favicon_ico_path) + self.creator.add_item_for( + 
"favicon.png", fpath=self.favicon_96_fpath, is_front=False + ) + self.creator.add_item_for( + "favicon.ico", fpath=self.favicon_ico_path, is_front=False + ) + + def add_zimui(self): + logger.info(f"Adding files in {self.zimui_dist}") + for file in self.zimui_dist.rglob("*"): + if file.is_dir(): + continue + path = str(Path(file).relative_to(self.zimui_dist)) + logger.debug(f"Adding {path} to ZIM") + self.creator.add_item_for( + path if path != "index.html" else "home", + fpath=file, + is_front=path == "index.html", + ) def add_custom_about_and_css(self): channel_meta = self.db.get_channel_metadata(self.channel_id) @@ -1047,10 +1163,11 @@ def add_custom_about_and_css(self): ) with self.creator_lock: self.creator.add_item_for( - path="about", + path="files/about", title=title, content=html, mimetype="text/html", + is_front=True, ) del html @@ -1067,13 +1184,15 @@ def add_custom_about_and_css(self): else: content = "" - self.creator.add_item_for("custom.css", content=content, mimetype="text/css") + self.creator.add_item_for( + "custom.css", content=content, mimetype="text/css", is_front=False + ) logger.debug("Added about page and custom CSS") def ensure_js_deps_are_present(self): - for dep in JS_DEPS: + for dep in WEB_DEPS: if not self.templates_dir.joinpath(f"assets/{dep}").exists(): raise ValueError( - "It looks like JS deps have not been installed," + "It looks like web deps have not been installed," f" {dep} is missing" ) diff --git a/src/kolibri2zim/templates/about.html b/scraper/src/kolibri2zim/templates/about.html similarity index 85% rename from src/kolibri2zim/templates/about.html rename to scraper/src/kolibri2zim/templates/about.html index 046fc84..00d6622 100644 --- a/src/kolibri2zim/templates/about.html +++ b/scraper/src/kolibri2zim/templates/about.html @@ -8,7 +8,7 @@
{{ description }}
{% if author %}Created by {{ author }}
{% endif %} {% if last_updated %}Updated on {{ last_updated }}
{% endif %} - + {% endif %} {% endblock %} diff --git a/scraper/src/kolibri2zim/templates/assets/.gitignore b/scraper/src/kolibri2zim/templates/assets/.gitignore new file mode 100644 index 0000000..c36eb42 --- /dev/null +++ b/scraper/src/kolibri2zim/templates/assets/.gitignore @@ -0,0 +1,11 @@ +bootstrap/ +pdfjs/ +videojs/ +jquery.min.js +ogvjs/ +videojs-ogvjs.js +epub.min.js +bootstrap-icons/ +jszip.min.js +perseus/ +lato* diff --git a/src/kolibri2zim/templates/assets/document.js b/scraper/src/kolibri2zim/templates/assets/document.js similarity index 100% rename from src/kolibri2zim/templates/assets/document.js rename to scraper/src/kolibri2zim/templates/assets/document.js diff --git a/src/kolibri2zim/templates/assets/epub_embed.css b/scraper/src/kolibri2zim/templates/assets/epub_embed.css similarity index 100% rename from src/kolibri2zim/templates/assets/epub_embed.css rename to scraper/src/kolibri2zim/templates/assets/epub_embed.css diff --git a/src/kolibri2zim/templates/assets/epub_embed.html b/scraper/src/kolibri2zim/templates/assets/epub_embed.html similarity index 100% rename from src/kolibri2zim/templates/assets/epub_embed.html rename to scraper/src/kolibri2zim/templates/assets/epub_embed.html diff --git a/src/kolibri2zim/templates/assets/epub_embed.js b/scraper/src/kolibri2zim/templates/assets/epub_embed.js similarity index 100% rename from src/kolibri2zim/templates/assets/epub_embed.js rename to scraper/src/kolibri2zim/templates/assets/epub_embed.js diff --git a/src/kolibri2zim/templates/assets/perseus_exercise.js b/scraper/src/kolibri2zim/templates/assets/perseus_exercise.js similarity index 100% rename from src/kolibri2zim/templates/assets/perseus_exercise.js rename to scraper/src/kolibri2zim/templates/assets/perseus_exercise.js diff --git a/src/kolibri2zim/templates/audio.html b/scraper/src/kolibri2zim/templates/audio.html similarity index 71% rename from src/kolibri2zim/templates/audio.html rename to scraper/src/kolibri2zim/templates/audio.html index 
3153cbe..07e6ff3 100644 --- a/src/kolibri2zim/templates/audio.html +++ b/scraper/src/kolibri2zim/templates/audio.html @@ -1,7 +1,7 @@ {% extends "base.html" %} {% block head %} - + {% block head %}{% endblock %} - ++ {% endif %} + {% block content %}{% endblock %} {% block footer %} @@ -41,7 +52,7 @@ {% endblock %}