diff --git a/libs/community/langchain_community/document_transformers/__init__.py b/libs/community/langchain_community/document_transformers/__init__.py
index 14aa448841e615..5e584d3db93fe0 100644
--- a/libs/community/langchain_community/document_transformers/__init__.py
+++ b/libs/community/langchain_community/document_transformers/__init__.py
@@ -54,6 +54,9 @@
     from langchain_community.document_transformers.openai_functions import (
         OpenAIMetadataTagger,
     )
+    from langchain_community.document_transformers.mozilla_readability import (
+        ReadabilityTransformer,
+    )
 
 __all__ = [
     "BeautifulSoupTransformer",
@@ -69,6 +72,7 @@
     "NucliaTextTransformer",
     "OpenAIMetadataTagger",
     "get_stateful_documents",
+    "ReadabilityTransformer",
 ]
 
 _module_lookup = {
@@ -85,6 +89,7 @@
     "NucliaTextTransformer": "langchain_community.document_transformers.nuclia_text_transform",  # noqa: E501
     "OpenAIMetadataTagger": "langchain_community.document_transformers.openai_functions",  # noqa: E501
     "get_stateful_documents": "langchain_community.document_transformers.embeddings_redundant_filter",  # noqa: E501
+    "ReadabilityTransformer": "langchain_community.document_transformers.mozilla_readability",  # noqa: E501
 }
 
 
diff --git a/libs/community/langchain_community/document_transformers/mozilla_readability.py b/libs/community/langchain_community/document_transformers/mozilla_readability.py
new file mode 100644
index 00000000000000..5ed22676e6cc89
--- /dev/null
+++ b/libs/community/langchain_community/document_transformers/mozilla_readability.py
@@ -0,0 +1,76 @@
+from typing import Any, Literal, Sequence
+
+from langchain_core.documents import BaseDocumentTransformer, Document
+
+
+class ReadabilityTransformer(BaseDocumentTransformer):
+    """Extract the main content of a web page using Mozilla Readability.
+
+    Arguments:
+        target: The target format of the extracted content, either "text" or
+            "html"; defaults to "text".
+        **readability_options: Additional options to pass to the readability
+            parser.
+
+    Example:
+        .. code-block:: python
+
+            from langchain_community.document_transformers import (
+                ReadabilityTransformer,
+            )
+
+            readability = ReadabilityTransformer(target="text")
+            docs_transformed = readability.transform_documents(docs)
+    """
+
+    def __init__(
+        self,
+        target: Literal["text", "html"] = "text",
+        **readability_options: Any,
+    ) -> None:
+        self.target = target
+        self.options = readability_options
+
+    def transform_document(self, document: Document) -> Document:
+        """Return a new document holding the readable content of ``document``.
+
+        Arguments:
+            document: Document whose ``page_content`` is the HTML to parse.
+
+        Returns:
+            A document with the extracted content (an empty string when
+            nothing is extracted) and a shallow copy of the input metadata.
+
+        Raises:
+            ImportError: If the ``readability`` module is not installed.
+        """
+        try:
+            from readability import parse
+        except ImportError as e:
+            raise ImportError(
+                "readability module not found, "
+                "please install it with `pip install python-readability`"
+            ) from e
+
+        article = parse(document.page_content, **self.options)
+
+        # ``content`` is the cleaned-up HTML; ``text_content`` is its plain text.
+        result = article.content if self.target == "html" else article.text_content
+
+        # Metadata must be passed as the ``metadata`` field: unpacking it into
+        # keyword arguments would not populate ``Document.metadata``.
+        return Document(page_content=result or "", metadata=dict(document.metadata))
+
+    def transform_documents(
+        self, documents: Sequence[Document], **kwargs: Any
+    ) -> Sequence[Document]:
+        """Apply ``transform_document`` to each document, preserving order.
+
+        Arguments:
+            documents: Documents whose ``page_content`` is HTML.
+            **kwargs: Accepted for interface compatibility; not used.
+
+        Returns:
+            A list with one transformed document per input document.
+        """
+        return [self.transform_document(doc) for doc in documents]