Skip to content

Commit

Permalink
Added a devdocs client and tests.
Browse files Browse the repository at this point in the history
Beyond the included tests, I've validated URLs in a WIP branch that also includes the skeleton generator and CLI.

Additionally bumped the requests version to make dependabot happy.

Part of #1.
  • Loading branch information
josephlewis42 committed Jul 18, 2024
1 parent 9dfde06 commit ee1c445
Show file tree
Hide file tree
Showing 4 changed files with 589 additions and 3 deletions.
3 changes: 1 addition & 2 deletions .github/workflows/Tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,7 @@ jobs:
uses: codecov/codecov-action@v4
with:
slug: openzim/devdocs
# TODO(#4): Codecov uploads are failing. Fix them and turn this on.
fail_ci_if_error: false
fail_ci_if_error: true
token: ${{ secrets.CODECOV_TOKEN }}

build_python:
Expand Down
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@ requires-python = ">=3.12,<3.13"
description = "Make ZIM files from DevDocs.io"
readme = "README.md"
dependencies = [
"requests==2.31.0",
"requests==2.32.0",
"pydantic==2.8.2",
]
dynamic = ["authors", "classifiers", "keywords", "license", "version", "urls"]

Expand Down
195 changes: 195 additions & 0 deletions src/devdocs2zim/client.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,195 @@
import logging

import requests
from pydantic import BaseModel, TypeAdapter

HTTP_TIMEOUT_SECONDS = 15

logger = logging.getLogger(__name__)


class DevdocsMetadataLinks(BaseModel):
    """Links to upstream project resources for one documentation set.

    Each field falls back to an empty string when it is not supplied.
    """

    # Link to the project's home page.
    home: str = ""
    # Link to where the project's source code lives.
    code: str = ""


class DevdocsMetadata(BaseModel):
    """Describes a single documentation set."""

    # Human readable name of the documentation set.
    name: str
    # Name of the directory devdocs places the docs under, in the form
    # name[~version], e.g. "python" or "python~3.10".
    slug: str
    # Links to project resources; None when no links were supplied.
    links: DevdocsMetadataLinks | None = None
    # Short version label displayed in devdocs, if any. Second part of the slug.
    version: str = ""
    # Specific software release the documentation covers, if any.
    release: str = ""
    # License and attribution text, if any.
    attribution: str = ""

    @property
    def slug_without_version(self) -> str:
        """Return the part of the slug that precedes the first "~"."""
        base, _, _ = self.slug.partition("~")
        return base

    def placeholders(self) -> dict[str, str]:
        """Build the placeholder values that can be substituted into filenames.

        Returns:
            A mapping from placeholder name to its string value. Link
            placeholders are empty strings when ``links`` is None, and
            ``full_name`` is the name followed by the version when a version
            is set.
        """
        links = self.links
        home_link = links.home if links is not None else ""
        code_link = links.code if links is not None else ""

        full_name = f"{self.name} {self.version}" if self.version else self.name

        # The property set is modelled on what the devdocs frontend exposes:
        # https://github.com/freeCodeCamp/devdocs/blob/6caa5eb1b18ab8d34034f319024bd81877035b36/lib/app.rb#L110
        return {
            "name": self.name,
            "full_name": full_name,
            "slug": self.slug,
            "version": self.version,
            "release": self.release,
            "attribution": self.attribution,
            "home_link": home_link,
            "code_link": code_link,
            "slug_without_version": self.slug_without_version,
        }


class DevdocsIndexEntry(BaseModel):
    """One sidebar link pointing at a document."""

    # Name shown for the entry.
    name: str

    # Location of the entry within the db.json file. It may carry a fragment
    # identifier (e.g. #fragment) that targets an anchor tag; the fragment is not
    # part of the db.json key.
    path: str

    # Name of the type (section) this entry is listed under.
    type: str

    @property
    def path_without_fragment(self) -> str:
        """Return the db.json key for this entry: the path before the first "#"."""
        key, _, _ = self.path.partition("#")
        return key


class DevdocsIndexType(BaseModel):
    """A section heading that groups documentation entries."""

    # Name shown for the section.
    name: str

    # How many documents the section contains.
    count: int

    # Slug of the section. This appears to be unused.
    slug: str


class DevdocsIndex(BaseModel):
    """Contents of the /<slug>/index.json file published for each resource."""

    # Every entry listed in the index.
    entries: list[DevdocsIndexEntry]

    # The "types", i.e. section headings.
    # Devdocs shows these mostly in order, but sorts some using regular expressions:
    # https://github.com/freeCodeCamp/devdocs/blob/e28f81d3218bdbad7eac0540c58c11c7fe1d33d3/assets/javascripts/collections/types.js#L3
    types: list[DevdocsIndexType]


class DevdocsClient:
    """Fetches files from the devdocs frontend and documents servers over HTTP."""

    def __init__(self, documents_url: str, frontend_url: str) -> None:
        """Store the base URLs used to build request URLs.

        Parameters:
            documents_url: Base URL of the documents server.
            frontend_url: Base URL of the frontend server.
        """
        self.frontend_url = frontend_url
        self.documents_url = documents_url

    # TODO: Support per-document images by fetching them from the git repository.
    # Devdocs deploys these as a single spritesheet that's difficult to split.
    #
    # fcc2zim has similar functionality already:
    # https://github.com/openzim/freecodecamp/blob/a1221073049895609d7712a8834474311535f0c1/scraper/src/fcc2zim/fetch.py#L12

    def read_frontend_file(self, file_path: str) -> str:
        """Download a file from the devdocs frontend server and return its text.

        Parameters:
            file_path: Path of the file relative to the frontend root.

        Raises an HTTP error (via ``raise_for_status``) on an unsuccessful
        response status.
        """
        target = f"{self.frontend_url}/{file_path}"
        response = requests.get(
            target,
            allow_redirects=True,
            timeout=HTTP_TIMEOUT_SECONDS,
        )
        response.raise_for_status()
        return response.text

    def read_application_css(self) -> str:
        """Return the app's CSS, which includes classes for normalizing content."""
        return self.read_frontend_file("application.css")

    def list_docs(self) -> list[DevdocsMetadata]:
        """Return metadata for every document devdocs currently publishes."""
        # NOTE: The backend also has a file named docs.json, but that one
        # lacks attribution information, so the frontend copy is used.
        raw_listing = self.read_frontend_file("docs.json")
        return TypeAdapter(list[DevdocsMetadata]).validate_json(raw_listing)

    def read_doc_file(self, doc_slug: str, file_name: str) -> str:
        """Download a file from the devdocs documents server and return its text.

        Parameters:
            doc_slug: The document's slug e.g. language~v123.
            file_name: Name of the file under the slug e.g. index.json.

        Raises an HTTP error (via ``raise_for_status``) on an unsuccessful
        response status.
        """
        # As of 2024-07-17 the largest file is scala~2.12_library/db.json at 144M.
        # Tested by building the devdocs container image.
        #
        # This amount should fit in memory fine, but we need to be careful not to
        # cache these large values in memory.
        target = f"{self.documents_url}/{doc_slug}/{file_name}"
        response = requests.get(
            url=target,
            allow_redirects=True,
            timeout=HTTP_TIMEOUT_SECONDS,
        )
        response.raise_for_status()
        return response.text

    def get_index(self, doc_slug: str) -> DevdocsIndex:
        """Fetch the headings and entries that make up the navigation sidebar."""
        raw_index = self.read_doc_file(doc_slug, "index.json")
        return DevdocsIndex.model_validate_json(raw_index)

    def get_meta(self, doc_slug: str) -> DevdocsMetadata:
        """Fetch the metadata file for the given document.

        Where possible, prefer list_docs and filter its result, because the
        metadata returned there is more complete.
        """
        raw_meta = self.read_doc_file(doc_slug, "meta.json")
        return DevdocsMetadata.model_validate_json(raw_meta)

    def get_db(self, doc_slug: str) -> dict[str, str]:
        """Fetch the contents of the pages in the index from db.json."""
        raw_db = self.read_doc_file(doc_slug, "db.json")
        return TypeAdapter(dict[str, str]).validate_json(raw_db)
Loading

0 comments on commit ee1c445

Please sign in to comment.