diff --git a/poetry.lock b/poetry.lock index 9d4d01bf..0517ec93 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.6.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. [[package]] name = "aiohappyeyeballs" @@ -960,6 +960,24 @@ files = [ marshmallow = ">=3.18.0,<4.0.0" typing-inspect = ">=0.4.0,<1" +[[package]] +name = "deepdiff" +version = "8.0.1" +description = "Deep Difference and Search of any Python object/data. Recreate objects by adding adding deltas to each other." +optional = true +python-versions = ">=3.8" +files = [ + {file = "deepdiff-8.0.1-py3-none-any.whl", hash = "sha256:42e99004ce603f9a53934c634a57b04ad5900e0d8ed0abb15e635767489cbc05"}, + {file = "deepdiff-8.0.1.tar.gz", hash = "sha256:245599a4586ab59bb599ca3517a9c42f3318ff600ded5e80a3432693c8ec3c4b"}, +] + +[package.dependencies] +orderly-set = "5.2.2" + +[package.extras] +cli = ["click (==8.1.7)", "pyyaml (==6.0.1)"] +optimize = ["orjson"] + [[package]] name = "deprecated" version = "1.2.14" @@ -1555,6 +1573,17 @@ files = [ [package.dependencies] jsonpointer = ">=1.9" +[[package]] +name = "jsonpath-python" +version = "1.0.6" +description = "A more powerful JSONPath implementation in modern python" +optional = true +python-versions = ">=3.6" +files = [ + {file = "jsonpath-python-1.0.6.tar.gz", hash = "sha256:dd5be4a72d8a2995c3f583cf82bf3cd1a9544cfdabf2d22595b67aff07349666"}, + {file = "jsonpath_python-1.0.6-py3-none-any.whl", hash = "sha256:1e3b78df579f5efc23565293612decee04214609208a2335884b3ee3f786b575"}, +] + [[package]] name = "jsonpointer" version = "3.0.0" @@ -2502,6 +2531,17 @@ files = [ {file = "mypy_extensions-1.0.0.tar.gz", hash = "sha256:75dbf8955dc00442a438fc4d0666508a9a97b6bd41aa2f0ffe9d2f2725af0782"}, ] +[[package]] +name = "nest-asyncio" +version = "1.6.0" +description = "Patch asyncio to allow nested event loops" +optional = true +python-versions = ">=3.5" +files = [ + {file = "nest_asyncio-1.6.0-py3-none-any.whl", hash = "sha256:87af6efd6b5e897c81050477ef65c62e2b2f35d51703cae01aff2905b1852e1c"}, + {file = "nest_asyncio-1.6.0.tar.gz", hash = "sha256:6f172d5449aca15afd6c646851f4e31e02c598d553a667e38cafa997cfec55fe"}, +] + [[package]] name = "nltk" version = "3.8.1" @@ -2617,6 +2657,17 @@ files = [ [package.dependencies] et-xmlfile = "*" +[[package]] +name = "orderly-set" +version = "5.2.2" +description = "Orderly set" +optional = true +python-versions = ">=3.8" +files = [ + {file = "orderly_set-5.2.2-py3-none-any.whl", hash = "sha256:f7a37c95a38c01cdfe41c3ffb62925a318a2286ea0a41790c057fc802aec54da"}, + {file = "orderly_set-5.2.2.tar.gz", hash = "sha256:52a18b86aaf3f5d5a498bbdb27bf3253a4e5c57ab38e5b7a56fa00115cd28448"}, +] + [[package]] name = "orjson" version = "3.10.11" @@ -3421,6 +3472,28 @@ files = [ [package.extras] diagrams = ["jinja2", "railroad-diagrams"] +[[package]] +name = "pypdf" +version = "5.1.0" +description = "A pure-python PDF library capable of splitting, merging, cropping, and transforming PDF files" +optional = true +python-versions = ">=3.8" +files = [ + {file = "pypdf-5.1.0-py3-none-any.whl", hash = "sha256:3bd4f503f4ebc58bae40d81e81a9176c400cbbac2ba2d877367595fb524dfdfc"}, + {file = "pypdf-5.1.0.tar.gz", hash = "sha256:425a129abb1614183fd1aca6982f650b47f8026867c0ce7c4b9f281c443d2740"}, +] + +[package.dependencies] +typing_extensions = {version = ">=4.0", markers = "python_version < \"3.11\""} + +[package.extras] +crypto = ["cryptography"] +cryptodome = ["PyCryptodome"] +dev = ["black", "flit", "pip-tools", "pre-commit (<2.18.0)", "pytest-cov", "pytest-socket", "pytest-timeout", "pytest-xdist", "wheel"] +docs = ["myst_parser", "sphinx", "sphinx_rtd_theme"] +full = ["Pillow (>=8.0.0)", "cryptography"] +image = ["Pillow (>=8.0.0)"] + [[package]] name = "pyproject-flake8" version = "6.1.0" @@ -4241,6 +4314,11 @@ files = [ {file = "scikit_learn-1.5.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f60021ec1574e56632be2a36b946f8143bf4e5e6af4a06d85281adc22938e0dd"}, {file = "scikit_learn-1.5.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:394397841449853c2290a32050382edaec3da89e35b3e03d6cc966aebc6a8ae6"}, {file = "scikit_learn-1.5.2-cp312-cp312-win_amd64.whl", hash = "sha256:57cc1786cfd6bd118220a92ede80270132aa353647684efa385a74244a41e3b1"}, + {file = "scikit_learn-1.5.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:e9a702e2de732bbb20d3bad29ebd77fc05a6b427dc49964300340e4c9328b3f5"}, + {file = "scikit_learn-1.5.2-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:b0768ad641981f5d3a198430a1d31c3e044ed2e8a6f22166b4d546a5116d7908"}, + {file = "scikit_learn-1.5.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:178ddd0a5cb0044464fc1bfc4cca5b1833bfc7bb022d70b05db8530da4bb3dd3"}, + {file = "scikit_learn-1.5.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f7284ade780084d94505632241bf78c44ab3b6f1e8ccab3d2af58e0e950f9c12"}, + {file = "scikit_learn-1.5.2-cp313-cp313-win_amd64.whl", hash = "sha256:b7b0f9a0b1040830d38c39b91b3a44e1b643f4b36e36567b80b7c6bd2202a27f"}, {file = "scikit_learn-1.5.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:757c7d514ddb00ae249832fe87100d9c73c6ea91423802872d9e74970a0e40b9"}, {file = "scikit_learn-1.5.2-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:52788f48b5d8bca5c0736c175fa6bdaab2ef00a8f536cda698db61bd89c551c1"}, {file = "scikit_learn-1.5.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:643964678f4b5fbdc95cbf8aec638acc7aa70f5f79ee2cdad1eec3df4ba6ead8"}, @@ -4867,13 +4945,13 @@ test = ["coverage", "pytest", "pytest-cov"] [[package]] name = "unstructured" -version = "0.10.27" +version = "0.11.8" description = "A library that prepares raw documents for downstream ML tasks." optional = true python-versions = ">=3.7.0" files = [ - {file = "unstructured-0.10.27-py3-none-any.whl", hash = "sha256:3a8a8e44302388ddc39c184059e8b4458f1cdc58032540b9af7d85f6c3eca3be"}, - {file = "unstructured-0.10.27.tar.gz", hash = "sha256:f567b5c4385993a9ab48db5563dd7b413aac4f2002bb22e6250496ea8f440f5e"}, + {file = "unstructured-0.11.8-py3-none-any.whl", hash = "sha256:71e8d135a723d8c692a0a43683e3dca4a9b3e0fd8a1d255f57689591ed3860c0"}, + {file = "unstructured-0.11.8.tar.gz", hash = "sha256:dba8b2d54fdc781acef6c9590510e1f55e7467abaaf00403031331903d415f07"}, ] [package.dependencies] @@ -4887,64 +4965,109 @@ langdetect = "*" lxml = "*" nltk = "*" numpy = "*" -python-docx = {version = ">=1.0.1", optional = true, markers = "extra == \"docx\""} +python-docx = {version = ">=1.1.0", optional = true, markers = "extra == \"docx\""} python-iso639 = "*" python-magic = "*" -python-pptx = {version = "<=0.6.21", optional = true, markers = "extra == \"pptx\""} +python-pptx = {version = "<=0.6.23", optional = true, markers = "extra == \"pptx\""} rapidfuzz = "*" requests = "*" tabulate = "*" typing-extensions = "*" +unstructured-client = "*" +wrapt = "*" [package.extras] airtable = ["pyairtable"] -all-docs = ["markdown", "msg-parser", "networkx", "onnx", "openpyxl", "pandas", "pdf2image", "pdfminer.six", "pypandoc", "python-docx (>=1.0.1)", "python-pptx (<=0.6.21)", "unstructured-inference (==0.7.10)", "unstructured.pytesseract (>=0.3.12)", "xlrd"] +all-docs = ["markdown", "msg-parser", "networkx", "onnx", "openpyxl", "pandas", "pdf2image", "pdfminer.six", "pikepdf", "pypandoc", "pypdf", "python-docx (>=1.1.0)", "python-pptx (<=0.6.23)", "unstructured-inference (==0.7.18)", "unstructured.pytesseract (>=0.3.12)", "xlrd"] azure = ["adlfs", "fsspec (==2023.9.1)"] azure-cognitive-search = ["azure-search-documents"] bedrock = ["boto3", "langchain"] biomed = ["bs4"] box = ["boxfs", "fsspec (==2023.9.1)"] +chroma = ["chromadb"] confluence = ["atlassian-python-api"] csv = ["pandas"] delta-table = ["deltalake", "fsspec (==2023.9.1)"] discord = ["discord-py"] -doc = ["python-docx (>=1.0.1)"] -docx = ["python-docx (>=1.0.1)"] +doc = ["python-docx (>=1.1.0)"] +docx = ["python-docx (>=1.1.0)"] dropbox = ["dropboxdrivefs", "fsspec (==2023.9.1)"] -elasticsearch = ["elasticsearch", "jq"] +elasticsearch = ["elasticsearch"] embed-huggingface = ["huggingface", "langchain", "sentence-transformers"] epub = ["pypandoc"] gcs = ["bs4", "fsspec (==2023.9.1)", "gcsfs"] github = ["pygithub (>1.58.0)"] gitlab = ["python-gitlab"] google-drive = ["google-api-python-client"] +hubspot = ["hubspot-api-client", "urllib3 (>=1.26.17)"] huggingface = ["langdetect", "sacremoses", "sentencepiece", "torch", "transformers"] -image = ["onnx", "pdf2image", "pdfminer.six", "unstructured-inference (==0.7.10)", "unstructured.pytesseract (>=0.3.12)"] +image = ["onnx", "pdf2image", "pdfminer.six", "pikepdf", "pypdf", "unstructured-inference (==0.7.18)", "unstructured.pytesseract (>=0.3.12)"] jira = ["atlassian-python-api"] -local-inference = ["markdown", "msg-parser", "networkx", "onnx", "openpyxl", "pandas", "pdf2image", "pdfminer.six", "pypandoc", "python-docx (>=1.0.1)", "python-pptx (<=0.6.21)", "unstructured-inference (==0.7.10)", "unstructured.pytesseract (>=0.3.12)", "xlrd"] +local-inference = ["markdown", "msg-parser", "networkx", "onnx", "openpyxl", "pandas", "pdf2image", "pdfminer.six", "pikepdf", "pypandoc", "pypdf", "python-docx (>=1.1.0)", "python-pptx (<=0.6.23)", "unstructured-inference (==0.7.18)", "unstructured.pytesseract (>=0.3.12)", "xlrd"] md = ["markdown"] +mongodb = ["pymongo"] msg = ["msg-parser"] notion = ["htmlBuilder", "notion-client"] -odt = ["pypandoc", "python-docx (>=1.0.1)"] +odt = ["pypandoc", "python-docx (>=1.1.0)"] onedrive = ["Office365-REST-Python-Client (<2.4.3)", "bs4", "msal"] openai = ["langchain", "openai", "tiktoken"] org = ["pypandoc"] outlook = ["Office365-REST-Python-Client (<2.4.3)", "msal"] paddleocr = ["unstructured.paddleocr (==2.6.1.3)"] -pdf = ["onnx", "pdf2image", "pdfminer.six", "unstructured-inference (==0.7.10)", "unstructured.pytesseract (>=0.3.12)"] -ppt = ["python-pptx (<=0.6.21)"] -pptx = ["python-pptx (<=0.6.21)"] +pdf = ["onnx", "pdf2image", "pdfminer.six", "pikepdf", "pypdf", "unstructured-inference (==0.7.18)", "unstructured.pytesseract (>=0.3.12)"] +pinecone = ["pinecone-client"] +ppt = ["python-pptx (<=0.6.23)"] +pptx = ["python-pptx (<=0.6.23)"] +qdrant = ["qdrant-client"] reddit = ["praw"] rst = ["pypandoc"] rtf = ["pypandoc"] s3 = ["fsspec (==2023.9.1)", "s3fs"] salesforce = ["simple-salesforce"] +sftp = ["fsspec", "paramiko"] sharepoint = ["Office365-REST-Python-Client (<2.4.3)", "msal"] slack = ["slack-sdk"] tsv = ["pandas"] +weaviate = ["weaviate-client"] wikipedia = ["wikipedia"] xlsx = ["networkx", "openpyxl", "pandas", "xlrd"] +[[package]] +name = "unstructured-client" +version = "0.25.9" +description = "Python Client SDK for Unstructured API" +optional = true +python-versions = ">=3.8" +files = [ + {file = "unstructured-client-0.25.9.tar.gz", hash = "sha256:fcc461623f58fefb0e22508e28bf653a8f6934b9779cb4a90dd68d77a39fb5b2"}, + {file = "unstructured_client-0.25.9-py3-none-any.whl", hash = "sha256:c984c01878c8fc243be7c842467d1113a194d885ab6396ae74258ee42717c5b5"}, +] + +[package.dependencies] +certifi = ">=2023.7.22" +charset-normalizer = ">=3.2.0" +cryptography = ">=3.1" +dataclasses-json = ">=0.6.4" +deepdiff = ">=6.0" +httpx = ">=0.27.0" +idna = ">=3.4" +jsonpath-python = ">=1.0.6" +marshmallow = ">=3.19.0" +mypy-extensions = ">=1.0.0" +nest-asyncio = ">=1.6.0" +packaging = ">=23.1" +pypdf = ">=4.0" +python-dateutil = ">=2.8.2" +requests = ">=2.31.0" +requests-toolbelt = ">=1.0.0" +six = ">=1.16.0" +typing-extensions = ">=4.7.1" +typing-inspect = ">=0.9.0" +urllib3 = ">=1.26.18" + +[package.extras] +dev = ["pylint (==3.1.0)"] + [[package]] name = "unstructured-pytesseract" version = "0.3.13" @@ -5247,4 +5370,4 @@ vector-db-based = ["cohere", "langchain", "openai", "tiktoken"] [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "ea323704b6919c3f47c5f65ff15e14fe3870521a0b71164d29706a71c8debbdf" +content-hash = "97145570b4156d8bc9d532472c4fd4107a1af1c018762b941ed7e26c948bd0c7" diff --git a/pyproject.toml b/pyproject.toml index d1cd818d..fd44d904 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -67,7 +67,7 @@ nltk = { version = "3.8.1", optional = true } # This will ensure that even when you run poetry install or pip install, the compatible version of numpy will always be chosen. # airbyte-ci will try to install latest version when --use-local-cdk is used, resulting in the conflict. numpy = "<2" -unstructured = { version = "0.10.27", extras = ["docx", "pptx"], optional = true } +unstructured = { version = "0.11.8", extras = ["docx", "pptx"], optional = true } "unstructured.pytesseract" = { version = ">=0.3.12", optional = true } pyjwt = "^2.8.0" cryptography = "^42.0.5"