Added a devdocs client and tests.

Beyond the included tests, I've validated URLs in a WIP branch that also includes the skeleton generator and CLI. Additionally bumped the requests version to make dependabot happy. Part of #1.
openzim · Jul 18, 2024 · ee1c445 · ee1c445
1 parent 9dfde06
commit ee1c445
Show file tree

Hide file tree

Showing 4 changed files with 589 additions and 3 deletions.
diff --git a/.github/workflows/Tests.yaml b/.github/workflows/Tests.yaml
@@ -31,8 +31,7 @@ jobs:
         uses: codecov/codecov-action@v4
         with:
           slug: openzim/devdocs
-          # TODO(#4): Codecov uploads are failing. Fix them and turn this on.
-          fail_ci_if_error: false
+          fail_ci_if_error: true
           token: ${{ secrets.CODECOV_TOKEN }}
 
   build_python:

diff --git a/pyproject.toml b/pyproject.toml
@@ -8,7 +8,8 @@ requires-python = ">=3.12,<3.13"
 description = "Make ZIM files from DevDocs.io"
 readme = "README.md"
 dependencies = [
-    "requests==2.31.0",
+    "requests==2.32.0",
+    "pydantic==2.8.2",
 ]
 dynamic = ["authors", "classifiers", "keywords", "license", "version", "urls"]
 

diff --git a/src/devdocs2zim/client.py b/src/devdocs2zim/client.py
@@ -0,0 +1,195 @@
+import logging
+
+import requests
+from pydantic import BaseModel, TypeAdapter
+
+HTTP_TIMEOUT_SECONDS = 15
+
+logger = logging.getLogger(__name__)
+
+
+class DevdocsMetadataLinks(BaseModel):
+    """Project links for a specific documentation set."""
+
+    # Home page for the project.
+    home: str = ""
+    # Link to the project's source code.
+    code: str = ""
+
+
+class DevdocsMetadata(BaseModel):
+    """Metadata about a documentation set."""
+
+    # Human readable name for the documentation.
+    name: str
+    # Directory name devdocs puts the docs under. Takes the format:
+    # name[~version] e.g. "python" or "python-3.10".
+    slug: str
+    # Links to project resources.
+    links: DevdocsMetadataLinks | None = None
+    # Shortened version displayed in devdocs, if any. Second part of the slug.
+    version: str = ""
+    # Specific release of the software the documentation is for, if any.
+    release: str = ""
+    # License and attribution information, if any.
+    attribution: str = ""
+
+    @property
+    def slug_without_version(self):
+        return self.slug.split("~")[0]
+
+    def placeholders(self) -> dict[str, str]:
+        """Gets placeholders for filenames."""
+        home_link = ""
+        code_link = ""
+        if self.links is not None:
+            home_link = self.links.home
+            code_link = self.links.code
+
+        full_name = self.name
+        if self.version:
+            full_name += f" {self.version}"
+
+        # properties are inspired by what devdocs uses for their frontend:
+        # https://github.com/freeCodeCamp/devdocs/blob/6caa5eb1b18ab8d34034f319024bd81877035b36/lib/app.rb#L110
+        return {
+            "name": self.name,
+            "full_name": full_name,
+            "slug": self.slug,
+            "version": self.version,
+            "release": self.release,
+            "attribution": self.attribution,
+            "home_link": home_link,
+            "code_link": code_link,
+            "slug_without_version": self.slug_without_version,
+        }
+
+
+class DevdocsIndexEntry(BaseModel):
+    """A link to a document in the sidebar."""
+
+    # Display name for the entry.
+    name: str
+
+    # Path to the entry in the db.json file. This may contain a fragment identifier
+    # linking to an anchor tag e.g. #fragment that would not exist in the db.json file.
+    path: str
+
+    # Name of the type (section) the entry is located under.
+    type: str
+
+    @property
+    def path_without_fragment(self) -> str:
+        """Key in db.json for the file's contents."""
+        return self.path.split("#")[0]
+
+
+class DevdocsIndexType(BaseModel):
+    """A section header for documentation."""
+
+    # Display name for the section.
+    name: str
+
+    # Number of documents in the section.
+    count: int
+
+    # Section slug. This appears to be unused.
+    slug: str
+
+
+class DevdocsIndex(BaseModel):
+    """Represents entries in the /<slug>/index.json file for each resource."""
+
+    # List of entries.
+    entries: list[DevdocsIndexEntry]
+
+    # List of "types" or section headings.
+    # These are displayed mostly in order, except regular expressions are used to sort:
+    # https://github.com/freeCodeCamp/devdocs/blob/e28f81d3218bdbad7eac0540c58c11c7fe1d33d3/assets/javascripts/collections/types.js#L3
+    types: list[DevdocsIndexType]
+
+
+class DevdocsClient:
+    def __init__(self, documents_url: str, frontend_url: str) -> None:
+        self.documents_url = documents_url
+        self.frontend_url = frontend_url
+
+    # TODO: Support per-document images by fetching them from the git repository.
+    # Devdocs deploys these as a single spritesheet that's difficult to split.
+    #
+    # fcc2zim has similar functionality already:
+    # https://github.com/openzim/freecodecamp/blob/a1221073049895609d7712a8834474311535f0c1/scraper/src/fcc2zim/fetch.py#L12
+
+    def read_frontend_file(self, file_path: str) -> str:
+        """Read a file from the devdocs frontend server.
+
+        Parameters:
+          file_path: Path of the file relative to the root.
+        """
+        resp = requests.get(
+            f"{self.frontend_url}/{file_path}",
+            allow_redirects=True,
+            timeout=HTTP_TIMEOUT_SECONDS,
+        )
+        resp.raise_for_status()
+
+        return resp.text
+
+    def read_application_css(self) -> str:
+        """Read the app's CSS which includes classes for normalizing content."""
+
+        return self.read_frontend_file("application.css")
+
+    def list_docs(self) -> list[DevdocsMetadata]:
+        """List the documents devdocs currently has published."""
+
+        # NOTE: There is also a backend file named docs.json, but it
+        # is missing attribution information.
+        file_contents = self.read_frontend_file("docs.json")
+
+        return TypeAdapter(list[DevdocsMetadata]).validate_json(file_contents)
+
+    def read_doc_file(self, doc_slug: str, file_name: str) -> str:
+        """Read a file from the devdocs documents server.
+
+        Parameters:
+          doc_slug: The document's slug e.g. language~v123.
+          file_name: Name of the file under the slug e.g. index.json.
+        """
+
+        # As of 2024-07-17 the largest file is scala~2.12_library/db.json at 144M.
+        # Tested by building the devdocs container image.
+        #
+        # This amount should fit in memory fine, but we need to be careful not to
+        # cache these large vaules in memory.
+        resp = requests.get(
+            url=f"{self.documents_url}/{doc_slug}/{file_name}",
+            allow_redirects=True,
+            timeout=HTTP_TIMEOUT_SECONDS,
+        )
+        resp.raise_for_status()
+
+        return resp.text
+
+    def get_index(self, doc_slug: str) -> DevdocsIndex:
+        """Fetch the set of headings and entries that make up the navigation sidebar."""
+
+        file_contents = self.read_doc_file(doc_slug, "index.json")
+
+        return DevdocsIndex.model_validate_json(file_contents)
+
+    def get_meta(self, doc_slug: str) -> DevdocsMetadata:
+        """Fetch metadata about the given document.
+
+        Prefer using list_docs and filtering if possible because
+        the metadata returned there is more complete.
+        """
+        file_contents = self.read_doc_file(doc_slug, "meta.json")
+
+        return DevdocsMetadata.model_validate_json(file_contents)
+
+    def get_db(self, doc_slug: str) -> dict[str, str]:
+        """Fetch the contents of the pages in the index."""
+        file_contents = self.read_doc_file(doc_slug, "db.json")
+
+        return TypeAdapter(dict[str, str]).validate_json(file_contents)