From 0d20c314dd0508ea956482fbdd6ce7854b85fc01 Mon Sep 17 00:00:00 2001 From: Alex Tonkonozhenko Date: Tue, 10 Dec 2024 17:05:23 +0100 Subject: [PATCH] Confluence Loader: Fix CQL loading (#27620) fix #12082 --- .../document_loaders/confluence.py | 46 +++++++++++++------ 1 file changed, 32 insertions(+), 14 deletions(-) diff --git a/libs/community/langchain_community/document_loaders/confluence.py b/libs/community/langchain_community/document_loaders/confluence.py index 263c0c8d31fe2..954f4139cae5c 100644 --- a/libs/community/langchain_community/document_loaders/confluence.py +++ b/libs/community/langchain_community/document_loaders/confluence.py @@ -442,17 +442,25 @@ def lazy_load(self) -> Iterator[Document]: yield from self._lazy_load() def _search_content_by_cql( - self, cql: str, include_archived_spaces: Optional[bool] = None, **kwargs: Any - ) -> List[dict]: - url = "rest/api/content/search" + self, + cql: str, + include_archived_spaces: Optional[bool] = None, + next_url: str = "", + **kwargs: Any, + ) -> tuple[List[dict], str]: + if next_url: + response = self.confluence.get(next_url) + else: + url = "rest/api/content/search" + + params: Dict[str, Any] = {"cql": cql} + params.update(kwargs) + if include_archived_spaces is not None: + params["includeArchivedSpaces"] = include_archived_spaces - params: Dict[str, Any] = {"cql": cql} - params.update(kwargs) - if include_archived_spaces is not None: - params["includeArchivedSpaces"] = include_archived_spaces + response = self.confluence.get(url, params=params) - response = self.confluence.get(url, params=params) - return response.get("results", []) + return response.get("results", []), response.get("_links", {}).get("next", "") def paginate_request(self, retrieval_method: Callable, **kwargs: Any) -> List: """Paginate the various methods to retrieve groups of pages. @@ -477,6 +485,7 @@ def paginate_request(self, retrieval_method: Callable, **kwargs: Any) -> List: max_pages = kwargs.pop("max_pages") docs: List[dict] = [] + next_url: str = "" while len(docs) < max_pages: get_pages = retry( reraise=True, @@ -490,9 +499,15 @@ def paginate_request(self, retrieval_method: Callable, **kwargs: Any) -> List: ), before_sleep=before_sleep_log(logger, logging.WARNING), )(retrieval_method) - batch = get_pages(**kwargs, start=len(docs)) - if not batch: - break + if self.cql: # cursor pagination for CQL + batch, next_url = get_pages(**kwargs, next_url=next_url) + if not next_url: + docs.extend(batch) + break + else: + batch = get_pages(**kwargs, start=len(docs)) + if not batch: + break docs.extend(batch) return docs[:max_pages] @@ -694,8 +709,11 @@ def process_pdf( return text for i, image in enumerate(images): - image_text = pytesseract.image_to_string(image, lang=ocr_languages) - text += f"Page {i + 1}:\n{image_text}\n\n" + try: + image_text = pytesseract.image_to_string(image, lang=ocr_languages) + text += f"Page {i + 1}:\n{image_text}\n\n" + except pytesseract.TesseractError as ex: + logger.warning(f"TesseractError: {ex}") return text