Skip to content

Commit

Permalink
community: Add @mozilla/readability document transformer
Browse files Browse the repository at this point in the history
  • Loading branch information
CNSeniorious000 committed Oct 24, 2024
1 parent d081a54 commit e86d5d2
Show file tree
Hide file tree
Showing 2 changed files with 52 additions and 0 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,9 @@
from langchain_community.document_transformers.openai_functions import (
OpenAIMetadataTagger,
)
from langchain_community.document_transformers.mozilla_readability import (
ReadabilityTransformer,
)

__all__ = [
"BeautifulSoupTransformer",
Expand All @@ -69,6 +72,7 @@
"NucliaTextTransformer",
"OpenAIMetadataTagger",
"get_stateful_documents",
"ReadabilityTransformer",
]

_module_lookup = {
Expand All @@ -85,6 +89,7 @@
"NucliaTextTransformer": "langchain_community.document_transformers.nuclia_text_transform", # noqa: E501
"OpenAIMetadataTagger": "langchain_community.document_transformers.openai_functions", # noqa: E501
"get_stateful_documents": "langchain_community.document_transformers.embeddings_redundant_filter", # noqa: E501
"ReadabilityTransformer": "langchain_community.document_transformers.mozilla_readability", # noqa: E501
}


Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
from typing import Any, Literal, Sequence

from langchain_core.documents import BaseDocumentTransformer, Document


class ReadabilityTransformer(BaseDocumentTransformer):
    """Extract the main content of a web page with Mozilla Readability.

    Each input document's ``page_content`` is handed to ``readability.parse``
    and replaced by the extracted article, either as plain text or as HTML.

    Arguments:
        target: The target format of the extracted content; ``"text"``
            (the default) or ``"html"``.
        **readability_options: Additional options to pass to the
            readability parser.

    Example:
        .. code-block:: python

            from langchain_community.document_transformers import (
                ReadabilityTransformer,
            )

            readability = ReadabilityTransformer(target="text")
            docs_transformed = readability.transform_documents(docs)
    """

    def __init__(
        self,
        target: Literal["text", "html"] = "text",
        **readability_options: Any,
    ) -> None:
        self.target = target
        self.options = readability_options

    def transform_document(self, document: Document) -> Document:
        """Return a new document holding the extracted article content.

        Arguments:
            document: Document whose ``page_content`` is passed to the
                readability parser.

        Returns:
            A new ``Document`` with the extracted content (an empty string
            when the parser yields nothing) and a copy of the input metadata.

        Raises:
            ImportError: If the ``readability`` module is not installed.
        """
        try:
            from readability import parse
        except ImportError as e:
            raise ImportError(
                "readability module not found, "
                "please install it with `pip install python-readability`"
            ) from e

        article = parse(document.page_content, **self.options)

        # ``content`` is the article markup, ``text_content`` the same
        # article with the markup removed; pick the one matching ``target``.
        if self.target == "html":
            result = article.content
        else:
            result = article.text_content

        # Metadata must go through the ``metadata`` field: splatting it as
        # keyword arguments would not preserve it and could clash with the
        # constructor's own fields. Copy it so the documents stay independent.
        return Document(
            page_content=result or "",
            metadata=dict(document.metadata),
        )

    def transform_documents(
        self, documents: Sequence[Document], **kwargs: Any
    ) -> Sequence[Document]:
        """Apply :meth:`transform_document` to every document, in order.

        Arguments:
            documents: The documents to transform.
            **kwargs: Accepted for interface compatibility; unused.

        Returns:
            A list with one transformed document per input document.
        """
        return [self.transform_document(doc) for doc in documents]

0 comments on commit e86d5d2

Please sign in to comment.