Skip to content

Commit

Permalink
EmbedAPI: main content based on documentation tool (#11812)
Browse files Browse the repository at this point in the history
We need different CSS selectors to find the main content node depending
on the documentation tool used.

This PR adds the `?selector=` attribute to the EmbedAPI to accept a
specific selector for the main content of the page. This attribute will
be sent by the Addons frontend code using the heuristic code we wrote
there.

If this specific selector is not sent, we fallback to the default
selector's value.

### Using the _default selector_


![Screenshot_2024-11-28_16-04-58](https://github.com/user-attachments/assets/992d2f83-97c8-4860-9950-edcb4e7e2fb2)

### Using the specific selector for this documentation tool



![Screenshot_2024-11-28_16-05-48](https://github.com/user-attachments/assets/db88e3db-d6a1-4943-b0b3-61c1ef1c78fa)
  • Loading branch information
humitos authored Dec 3, 2024
1 parent c20603e commit 7ff6c0e
Show file tree
Hide file tree
Showing 4 changed files with 65 additions and 8 deletions.
3 changes: 2 additions & 1 deletion docs/user/api/v3.rst
Original file line number Diff line number Diff line change
Expand Up @@ -2001,10 +2001,11 @@ Embed
:query string url: full URL of the document (with optional fragment) to fetch content from.
:query string doctool: *optional* documentation tool key name used to generate the target documentation (currently, only ``sphinx`` is accepted)
:query string doctoolversion: *optional* documentation tool version used to generate the target documentation (e.g. ``4.2.0``).
:query string maincontent: *optional* CSS selector for the main content of the page (e.g. ``div#main``).

.. note::

Passing ``?doctool=`` and ``?doctoolversion=`` may improve the response,
Passing ``?doctool=``, ``?doctoolversion=`` and ``?maincontent=`` may improve the response
since the endpoint will know more about the exact structure of the HTML and can make better decisions.

Additional APIs
Expand Down
1 change: 1 addition & 0 deletions docs/user/guides/embedding-content.rst
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ from our own docs and will populate the content of it into the ``#help-container
'url': 'https://docs.readthedocs.io/en/latest/automation-rules.html%23creating-an-automation-rule',
// 'doctool': 'sphinx',
// 'doctoolversion': '4.2.0',
// 'maincontent': 'div#main',
};
var url = 'https://readthedocs.org/api/v3/embed/?' + $.param(params);
$.get(url, function(data) {
Expand Down
26 changes: 26 additions & 0 deletions readthedocs/embed/v3/tests/test_external_pages.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,32 @@ def test_default_main_section(self, app, client, requests_mock):
}
compare_content_without_blank_lines(json_response["content"], content)

@pytest.mark.sphinx("html", srcdir=srcdir, freshenv=True)
def test_specific_main_content_selector(self, app, client, requests_mock):
app.build()
path = app.outdir / "index.html"
assert path.exists() is True
content = open(path).read()
requests_mock.get("https://docs.project.com", text=content)

params = {
"url": "https://docs.project.com",
"maincontent": "#invalid-selector",
}
response = client.get(self.api_url, params)
assert response.status_code == 404

params = {
"url": "https://docs.project.com",
"maincontent": "section",
}
response = client.get(self.api_url, params)
assert response.status_code == 200
# Check the main three sections are returned.
assert "Title" in response.json()["content"]
assert "Sub-title" in response.json()["content"]
assert "Manual Reference Section" in response.json()["content"]

@pytest.mark.sphinx("html", srcdir=srcdir, freshenv=True)
def test_specific_identifier(self, app, client, requests_mock):
app.build()
Expand Down
43 changes: 36 additions & 7 deletions readthedocs/embed/v3/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ class EmbedAPIBase(EmbedAPIMixin, CDNCacheTagsMixin, APIView):
* url (with fragment) (required)
* doctool
* doctoolversion
* maincontent
### Example
Expand Down Expand Up @@ -121,7 +122,14 @@ def _get_page_content_from_storage(self, project, version, filename):

return None

def _get_content_by_fragment(self, url, fragment, doctool, doctoolversion):
def _get_content_by_fragment(
self,
url,
fragment,
doctool,
doctoolversion,
selector,
):
if self.external:
page_content = self._download_page_content(url)
else:
Expand All @@ -133,10 +141,17 @@ def _get_content_by_fragment(self, url, fragment, doctool, doctoolversion):
)

return self._parse_based_on_doctool(
page_content, fragment, doctool, doctoolversion
page_content,
fragment,
doctool,
doctoolversion,
selector,
)

def _find_main_node(self, html):
def _find_main_node(self, html, selector):
if selector:
return html.css_first(selector)

main_node = html.css_first("[role=main]")
if main_node:
log.debug("Main node found. selector=[role=main]")
Expand All @@ -152,7 +167,14 @@ def _find_main_node(self, html):
log.debug("Main node found. selector=h1")
return first_header.parent

def _parse_based_on_doctool(self, page_content, fragment, doctool, doctoolversion):
def _parse_based_on_doctool(
self,
page_content,
fragment,
doctool,
doctoolversion,
selector,
):
# pylint: disable=unused-argument disable=too-many-nested-blocks
if not page_content:
return
Expand All @@ -167,7 +189,7 @@ def _parse_based_on_doctool(self, page_content, fragment, doctool, doctoolversio
node = HTMLParser(page_content).css_first(selector)
else:
html = HTMLParser(page_content)
node = self._find_main_node(html)
node = self._find_main_node(html, selector)

if not node:
return
Expand Down Expand Up @@ -284,6 +306,7 @@ def get(self, request): # noqa
url = request.GET.get("url")
doctool = request.GET.get("doctool")
doctoolversion = request.GET.get("doctoolversion")
selector = request.GET.get("maincontent")

if not url:
return Response(
Expand Down Expand Up @@ -338,6 +361,7 @@ def get(self, request): # noqa
fragment,
doctool,
doctoolversion,
selector,
)
except requests.exceptions.TooManyRedirects:
log.exception("Too many redirects.", url=url)
Expand Down Expand Up @@ -365,12 +389,17 @@ def get(self, request): # noqa
)

if not content_requested:
log.warning("Identifier not found.", url=url, fragment=fragment)
log.warning(
"Identifier not found.",
url=url,
fragment=fragment,
maincontent=selector,
)
return Response(
{
"error": (
"Can't find content for section: "
f"url={url} fragment={fragment}"
f"url={url} fragment={fragment} maincontent={selector}"
)
},
status=status.HTTP_404_NOT_FOUND,
Expand Down

0 comments on commit 7ff6c0e

Please sign in to comment.