EmbedAPI: main content based on documentation tool (#11812)

We need different CSS selectors to find the main content node depending on the documentation tool used. This PR adds the `?selector=` attribute to the EmbedAPI to accept a specific selector for the main content of the page. This attribute will be sent by the Addons frontend code using the heuristic code we wrote there. If this specific selector is not sent, we fallback to the default selector's value. ### Using the _default selector_ ![Screenshot_2024-11-28_16-04-58](https://github.com/user-attachments/assets/992d2f83-97c8-4860-9950-edcb4e7e2fb2) ### Using the specific selector for this documentation tool ![Screenshot_2024-11-28_16-05-48](https://github.com/user-attachments/assets/db88e3db-d6a1-4943-b0b3-61c1ef1c78fa)
readthedocs · Dec 3, 2024 · 7ff6c0e · 7ff6c0e
1 parent c20603e
commit 7ff6c0e
Show file tree

Hide file tree

Showing 4 changed files with 65 additions and 8 deletions.
diff --git a/docs/user/api/v3.rst b/docs/user/api/v3.rst
@@ -2001,10 +2001,11 @@ Embed
     :query string url: full URL of the document (with optional fragment) to fetch content from.
     :query string doctool: *optional* documentation tool key name used to generate the target documentation (currently, only ``sphinx`` is accepted)
     :query string doctoolversion: *optional* documentation tool version used to generate the target documentation (e.g. ``4.2.0``).
+    :query string maincontent: *optional* CSS selector for the main content of the page (e.g. ``div#main``).
 
     .. note::
 
-       Passing ``?doctool=`` and ``?doctoolversion=`` may improve the response,
+       Passing ``?doctool=``, ``?doctoolversion=`` and ``?maincontent=`` may improve the response
        since the endpoint will know more about the exact structure of the HTML and can make better decisions.
 
 Additional APIs

diff --git a/docs/user/guides/embedding-content.rst b/docs/user/guides/embedding-content.rst
@@ -63,6 +63,7 @@ from our own docs and will populate the content of it into the ``#help-container
       'url': 'https://docs.readthedocs.io/en/latest/automation-rules.html%23creating-an-automation-rule',
       // 'doctool': 'sphinx',
       // 'doctoolversion': '4.2.0',
+      // 'maincontent': 'div#main',
     };
     var url = 'https://readthedocs.org/api/v3/embed/?' + $.param(params);
     $.get(url, function(data) {

diff --git a/readthedocs/embed/v3/tests/test_external_pages.py b/readthedocs/embed/v3/tests/test_external_pages.py
@@ -86,6 +86,32 @@ def test_default_main_section(self, app, client, requests_mock):
         }
         compare_content_without_blank_lines(json_response["content"], content)
 
+    @pytest.mark.sphinx("html", srcdir=srcdir, freshenv=True)
+    def test_specific_main_content_selector(self, app, client, requests_mock):
+        app.build()
+        path = app.outdir / "index.html"
+        assert path.exists() is True
+        content = open(path).read()
+        requests_mock.get("https://docs.project.com", text=content)
+
+        params = {
+            "url": "https://docs.project.com",
+            "maincontent": "#invalid-selector",
+        }
+        response = client.get(self.api_url, params)
+        assert response.status_code == 404
+
+        params = {
+            "url": "https://docs.project.com",
+            "maincontent": "section",
+        }
+        response = client.get(self.api_url, params)
+        assert response.status_code == 200
+        # Check the main three sections are returned.
+        assert "Title" in response.json()["content"]
+        assert "Sub-title" in response.json()["content"]
+        assert "Manual Reference Section" in response.json()["content"]
+
     @pytest.mark.sphinx("html", srcdir=srcdir, freshenv=True)
     def test_specific_identifier(self, app, client, requests_mock):
         app.build()

diff --git a/readthedocs/embed/v3/views.py b/readthedocs/embed/v3/views.py
@@ -50,6 +50,7 @@ class EmbedAPIBase(EmbedAPIMixin, CDNCacheTagsMixin, APIView):
     * url (with fragment) (required)
     * doctool
     * doctoolversion
+    * maincontent
 
     ### Example
 
@@ -121,7 +122,14 @@ def _get_page_content_from_storage(self, project, version, filename):
 
         return None
 
-    def _get_content_by_fragment(self, url, fragment, doctool, doctoolversion):
+    def _get_content_by_fragment(
+        self,
+        url,
+        fragment,
+        doctool,
+        doctoolversion,
+        selector,
+    ):
         if self.external:
             page_content = self._download_page_content(url)
         else:
@@ -133,10 +141,17 @@ def _get_content_by_fragment(self, url, fragment, doctool, doctoolversion):
             )
 
         return self._parse_based_on_doctool(
-            page_content, fragment, doctool, doctoolversion
+            page_content,
+            fragment,
+            doctool,
+            doctoolversion,
+            selector,
         )
 
-    def _find_main_node(self, html):
+    def _find_main_node(self, html, selector):
+        if selector:
+            return html.css_first(selector)
+
         main_node = html.css_first("[role=main]")
         if main_node:
             log.debug("Main node found. selector=[role=main]")
@@ -152,7 +167,14 @@ def _find_main_node(self, html):
             log.debug("Main node found. selector=h1")
             return first_header.parent
 
-    def _parse_based_on_doctool(self, page_content, fragment, doctool, doctoolversion):
+    def _parse_based_on_doctool(
+        self,
+        page_content,
+        fragment,
+        doctool,
+        doctoolversion,
+        selector,
+    ):
         # pylint: disable=unused-argument disable=too-many-nested-blocks
         if not page_content:
             return
@@ -167,7 +189,7 @@ def _parse_based_on_doctool(self, page_content, fragment, doctool, doctoolversio
             node = HTMLParser(page_content).css_first(selector)
         else:
             html = HTMLParser(page_content)
-            node = self._find_main_node(html)
+            node = self._find_main_node(html, selector)
 
         if not node:
             return
@@ -284,6 +306,7 @@ def get(self, request):  # noqa
         url = request.GET.get("url")
         doctool = request.GET.get("doctool")
         doctoolversion = request.GET.get("doctoolversion")
+        selector = request.GET.get("maincontent")
 
         if not url:
             return Response(
@@ -338,6 +361,7 @@ def get(self, request):  # noqa
                 fragment,
                 doctool,
                 doctoolversion,
+                selector,
             )
         except requests.exceptions.TooManyRedirects:
             log.exception("Too many redirects.", url=url)
@@ -365,12 +389,17 @@ def get(self, request):  # noqa
             )
 
         if not content_requested:
-            log.warning("Identifier not found.", url=url, fragment=fragment)
+            log.warning(
+                "Identifier not found.",
+                url=url,
+                fragment=fragment,
+                maincontent=selector,
+            )
             return Response(
                 {
                     "error": (
                         "Can't find content for section: "
-                        f"url={url} fragment={fragment}"
+                        f"url={url} fragment={fragment} maincontent={selector}"
                     )
                 },
                 status=status.HTTP_404_NOT_FOUND,