diff --git a/libs/community/langchain_community/document_loaders/s3_file.py b/libs/community/langchain_community/document_loaders/s3_file.py index 59b3164993afb..fb0f0c675aba9 100644 --- a/libs/community/langchain_community/document_loaders/s3_file.py +++ b/libs/community/langchain_community/document_loaders/s3_file.py @@ -2,7 +2,7 @@ import os import tempfile -from typing import TYPE_CHECKING, Callable, List, Optional, Union +from typing import TYPE_CHECKING, Any, Callable, List, Optional, Union from langchain_community.document_loaders.unstructured import UnstructuredBaseLoader @@ -29,6 +29,7 @@ def __init__( boto_config: Optional[botocore.client.Config] = None, mode: str = "single", post_processors: Optional[List[Callable]] = None, + **unstructured_kwargs: Any, ): """Initialize with bucket and key name. @@ -85,11 +86,13 @@ def __init__( the client will be the result of calling ``merge()`` on the default config with the config provided to this call. :param mode: Mode in which to read the file. Valid options are: single, - paged and elements + paged and elements. :param post_processors: Post processing functions to be applied to - extracted elements + extracted elements. + :param **unstructured_kwargs: Arbitrary additional kwargs to pass in when + calling `partition` """ - super().__init__(mode, post_processors) + super().__init__(mode, post_processors, **unstructured_kwargs) self.bucket = bucket self.key = key self.region_name = region_name @@ -129,7 +132,7 @@ def _get_elements(self) -> List: file_path = f"{temp_dir}/{self.key}" os.makedirs(os.path.dirname(file_path), exist_ok=True) s3.download_file(self.bucket, self.key, file_path) - return partition(filename=file_path) + return partition(filename=file_path, **self.unstructured_kwargs) def _get_metadata(self) -> dict: return {"source": f"s3://{self.bucket}/{self.key}"}