Skip to content

Commit

Permalink
community[patch]: support unstructured_kwargs for s3 loader (#15473)
Browse files Browse the repository at this point in the history
fix #15472

Co-authored-by: Bagatur <[email protected]>
  • Loading branch information
2 people authored and hinthornw committed Apr 26, 2024
1 parent 60742cd commit c3deab6
Showing 1 changed file with 8 additions and 5 deletions.
13 changes: 8 additions & 5 deletions libs/community/langchain_community/document_loaders/s3_file.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -2,7 +2,7 @@

import os
import tempfile
from typing import TYPE_CHECKING, Callable, List, Optional, Union
from typing import TYPE_CHECKING, Any, Callable, List, Optional, Union

from langchain_community.document_loaders.unstructured import UnstructuredBaseLoader

Expand All @@ -29,6 +29,7 @@ def __init__(
boto_config: Optional[botocore.client.Config] = None,
mode: str = "single",
post_processors: Optional[List[Callable]] = None,
**unstructured_kwargs: Any,
):
"""Initialize with bucket and key name.
Expand Down Expand Up @@ -85,11 +86,13 @@ def __init__(
the client will be the result of calling ``merge()`` on the
default config with the config provided to this call.
:param mode: Mode in which to read the file. Valid options are: single,
paged and elements
paged and elements.
:param post_processors: Post processing functions to be applied to
extracted elements
extracted elements.
:param **unstructured_kwargs: Arbitrary additional kwargs to pass in when
calling `partition`.
"""
super().__init__(mode, post_processors)
super().__init__(mode, post_processors, **unstructured_kwargs)
self.bucket = bucket
self.key = key
self.region_name = region_name
Expand Down Expand Up @@ -129,7 +132,7 @@ def _get_elements(self) -> List:
file_path = f"{temp_dir}/{self.key}"
os.makedirs(os.path.dirname(file_path), exist_ok=True)
s3.download_file(self.bucket, self.key, file_path)
return partition(filename=file_path)
return partition(filename=file_path, **self.unstructured_kwargs)

def _get_metadata(self) -> dict:
return {"source": f"s3://{self.bucket}/{self.key}"}

0 comments on commit c3deab6

Please sign in to comment.