Skip to content

Commit

Permalink
Confluence Loader: Fix CQL loading (#27620)
Browse files (browse the repository at this point in the history)
fix #12082

<!---
If no one reviews your PR within a few days, please @-mention one of
baskaryan, efriis, eyurtsev, ccurme, vbarda, hwchase17.
-->
  • Loading branch information
Tonkonozhenko authored Dec 10, 2024
1 parent aba2711 commit 0d20c31
Showing 1 changed file with 32 additions and 14 deletions.
46 changes: 32 additions & 14 deletions libs/community/langchain_community/document_loaders/confluence.py
Original file line number Diff line number Diff line change
Expand Up @@ -442,17 +442,25 @@ def lazy_load(self) -> Iterator[Document]:
yield from self._lazy_load()

def _search_content_by_cql(
self, cql: str, include_archived_spaces: Optional[bool] = None, **kwargs: Any
) -> List[dict]:
url = "rest/api/content/search"
self,
cql: str,
include_archived_spaces: Optional[bool] = None,
next_url: str = "",
**kwargs: Any,
) -> tuple[List[dict], str]:
if next_url:
response = self.confluence.get(next_url)
else:
url = "rest/api/content/search"

params: Dict[str, Any] = {"cql": cql}
params.update(kwargs)
if include_archived_spaces is not None:
params["includeArchivedSpaces"] = include_archived_spaces

params: Dict[str, Any] = {"cql": cql}
params.update(kwargs)
if include_archived_spaces is not None:
params["includeArchivedSpaces"] = include_archived_spaces
response = self.confluence.get(url, params=params)

response = self.confluence.get(url, params=params)
return response.get("results", [])
return response.get("results", []), response.get("_links", {}).get("next", "")

def paginate_request(self, retrieval_method: Callable, **kwargs: Any) -> List:
"""Paginate the various methods to retrieve groups of pages.
Expand All @@ -477,6 +485,7 @@ def paginate_request(self, retrieval_method: Callable, **kwargs: Any) -> List:

max_pages = kwargs.pop("max_pages")
docs: List[dict] = []
next_url: str = ""
while len(docs) < max_pages:
get_pages = retry(
reraise=True,
Expand All @@ -490,9 +499,15 @@ def paginate_request(self, retrieval_method: Callable, **kwargs: Any) -> List:
),
before_sleep=before_sleep_log(logger, logging.WARNING),
)(retrieval_method)
batch = get_pages(**kwargs, start=len(docs))
if not batch:
break
if self.cql: # cursor pagination for CQL
batch, next_url = get_pages(**kwargs, next_url=next_url)
if not next_url:
docs.extend(batch)
break
else:
batch = get_pages(**kwargs, start=len(docs))
if not batch:
break
docs.extend(batch)
return docs[:max_pages]

Expand Down Expand Up @@ -694,8 +709,11 @@ def process_pdf(
return text

for i, image in enumerate(images):
image_text = pytesseract.image_to_string(image, lang=ocr_languages)
text += f"Page {i + 1}:\n{image_text}\n\n"
try:
image_text = pytesseract.image_to_string(image, lang=ocr_languages)
text += f"Page {i + 1}:\n{image_text}\n\n"
except pytesseract.TesseractError as ex:
logger.warning(f"TesseractError: {ex}")

return text

Expand Down

0 comments on commit 0d20c31

Please sign in to comment.