Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feat: support unstructured_kwargs for s3 loader #15473

Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 8 additions & 5 deletions libs/community/langchain_community/document_loaders/s3_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

import os
import tempfile
from typing import TYPE_CHECKING, Callable, List, Optional, Union
from typing import TYPE_CHECKING, Any, Callable, List, Optional, Union

from langchain_community.document_loaders.unstructured import UnstructuredBaseLoader

Expand All @@ -29,6 +29,7 @@ def __init__(
boto_config: Optional[botocore.client.Config] = None,
mode: str = "single",
post_processors: Optional[List[Callable]] = None,
**unstructured_kwargs: Any,
):
"""Initialize with bucket and key name.

Expand Down Expand Up @@ -85,11 +86,13 @@ def __init__(
the client will be the result of calling ``merge()`` on the
default config with the config provided to this call.
:param mode: Mode in which to read the file. Valid options are: single,
paged and elements
paged and elements.
:param post_processors: Post processing functions to be applied to
extracted elements
extracted elements.
:param **unstructured_kwargs: Arbitrary additional kwargs to pass in when
calling `partition`
"""
super().__init__(mode, post_processors)
super().__init__(mode, post_processors, **unstructured_kwargs)
self.bucket = bucket
self.key = key
self.region_name = region_name
Expand Down Expand Up @@ -129,7 +132,7 @@ def _get_elements(self) -> List:
file_path = f"{temp_dir}/{self.key}"
os.makedirs(os.path.dirname(file_path), exist_ok=True)
s3.download_file(self.bucket, self.key, file_path)
return partition(filename=file_path)
return partition(filename=file_path, **self.unstructured_kwargs)

def _get_metadata(self) -> dict:
return {"source": f"s3://{self.bucket}/{self.key}"}
Loading