From 23b433f683d1dfd2474c4f5a570b7f01ce81c4da Mon Sep 17 00:00:00 2001 From: ccurme Date: Sat, 14 Dec 2024 10:13:19 -0500 Subject: [PATCH] infra: fix notebook tests (#28722) Bump unstructured to pick up resolution of https://github.com/Unstructured-IO/unstructured/issues/3795 --- poetry.lock | 165 +++++++++++++++++++++---------------------------- pyproject.toml | 3 +- 2 files changed, 72 insertions(+), 96 deletions(-) diff --git a/poetry.lock b/poetry.lock index a651014235914..f53be8c20e771 100644 --- a/poetry.lock +++ b/poetry.lock @@ -161,13 +161,13 @@ files = [ [[package]] name = "anthropic" -version = "0.37.1" +version = "0.40.0" description = "The official Python library for the anthropic API" optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" files = [ - {file = "anthropic-0.37.1-py3-none-any.whl", hash = "sha256:8f550f88906823752e2abf99fbe491fbc8d40bce4cb26b9663abdf7be990d721"}, - {file = "anthropic-0.37.1.tar.gz", hash = "sha256:99f688265795daa7ba9256ee68eaf2f05d53cd99d7417f4a0c2dc292c106d00a"}, + {file = "anthropic-0.40.0-py3-none-any.whl", hash = "sha256:442028ae8790ff9e3b6f8912043918755af1230d193904ae2ef78cc22995280c"}, + {file = "anthropic-0.40.0.tar.gz", hash = "sha256:3efeca6d9e97813f93ed34322c6c7ea2279bf0824cd0aa71b59ce222665e2b87"}, ] [package.dependencies] @@ -177,7 +177,6 @@ httpx = ">=0.23.0,<1" jiter = ">=0.4.0,<1" pydantic = ">=1.9.0,<3" sniffio = "*" -tokenizers = ">=0.13.0" typing-extensions = ">=4.7,<5" [package.extras] @@ -1934,6 +1933,27 @@ files = [ {file = "h11-0.14.0.tar.gz", hash = "sha256:8f19fbbe99e72420ff35c00b27a34cb9937e902a8b810e2c88300c6f0a3b699d"}, ] +[[package]] +name = "html5lib" +version = "1.1" +description = "HTML parser based on the WHATWG HTML specification" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" +files = [ + {file = "html5lib-1.1-py2.py3-none-any.whl", hash = "sha256:0d78f8fde1c230e99fe37986a60526d7049ed4bf8a9fadbad5f00e22e58e041d"}, + {file = "html5lib-1.1.tar.gz", hash = "sha256:b2e5b40261e20f354d198eae92afc10d750afb487ed5e50f9c4eaf07c184146f"}, +] + +[package.dependencies] +six = ">=1.9" +webencodings = "*" + +[package.extras] +all = ["chardet (>=2.2)", "genshi", "lxml"] +chardet = ["chardet (>=2.2)"] +genshi = ["genshi"] +lxml = ["lxml"] + [[package]] name = "httpcore" version = "1.0.6" @@ -2794,7 +2814,7 @@ adal = ["adal (>=1.0.2)"] [[package]] name = "langchain" -version = "0.3.4" +version = "0.3.11" description = "Building applications with LLMs through composability" optional = false python-versions = ">=3.9,<4.0" @@ -2804,12 +2824,12 @@ develop = true [package.dependencies] aiohttp = "^3.8.3" async-timeout = {version = "^4.0.0", markers = "python_version < \"3.11\""} -langchain-core = "^0.3.12" +langchain-core = "^0.3.24" langchain-text-splitters = "^0.3.0" -langsmith = "^0.1.17" +langsmith = ">=0.1.17,<0.3" numpy = [ - {version = ">=1,<2", markers = "python_version < \"3.12\""}, - {version = ">=1.26.0,<2.0.0", markers = "python_version >= \"3.12\""}, + {version = ">=1.22.4,<2", markers = "python_version < \"3.12\""}, + {version = ">=1.26.2,<3", markers = "python_version >= \"3.12\""}, ] pydantic = "^2.7.4" PyYAML = ">=5.3" @@ -2823,7 +2843,7 @@ url = "libs/langchain" [[package]] name = "langchain-anthropic" -version = "0.2.3" +version = "0.3.0" description = "An integration package connecting AnthropicMessages and LangChain" optional = false python-versions = ">=3.9,<4.0" @@ -2831,8 +2851,8 @@ files = [] develop = true [package.dependencies] -anthropic = ">=0.30.0,<1" -langchain-core = "^0.3.9" +anthropic = ">=0.39.0,<1" +langchain-core = "^0.3.17" pydantic = "^2.7.4" [package.source] @@ -2866,19 +2886,19 @@ subdirectory = "libs/aws" [[package]] name = "langchain-chroma" -version = "0.1.5" +version = "0.2.0" description = "An integration package connecting Chroma and LangChain" optional = false -python-versions = ">=3.8.1,<4" +python-versions = ">=3.9,<4" files = [] develop = true [package.dependencies] chromadb = ">=0.4.0,<0.6.0,!=0.5.4,!=0.5.5,!=0.5.7,!=0.5.9,!=0.5.10,!=0.5.11,!=0.5.12" -langchain-core = {version = ">=0.1.40,<0.4", markers = "python_version >= \"3.9\""} +langchain-core = ">=0.2.43,<0.4.0,!=0.3.0,!=0.3.1,!=0.3.2,!=0.3.3,!=0.3.4,!=0.3.5,!=0.3.6,!=0.3.7,!=0.3.8,!=0.3.9,!=0.3.10,!=0.3.11,!=0.3.12,!=0.3.13,!=0.3.14" numpy = [ - {version = ">=1,<2", markers = "python_version < \"3.12\""}, - {version = ">=1.26.0,<2.0.0", markers = "python_version >= \"3.12\""}, + {version = ">=1.22.4,<2.0.0", markers = "python_version < \"3.12\""}, + {version = ">=1.26.2,<2.0.0", markers = "python_version >= \"3.12\""}, ] [package.source] @@ -2887,7 +2907,7 @@ url = "libs/partners/chroma" [[package]] name = "langchain-community" -version = "0.3.3" +version = "0.3.11" description = "Community contributed LangChain integrations." optional = false python-versions = ">=3.9,<4.0" @@ -2898,12 +2918,12 @@ develop = true aiohttp = "^3.8.3" dataclasses-json = ">= 0.5.7, < 0.7" httpx-sse = "^0.4.0" -langchain = "^0.3.4" -langchain-core = "^0.3.12" -langsmith = "^0.1.125" +langchain = "^0.3.11" +langchain-core = "^0.3.24" +langsmith = ">=0.1.125,<0.3" numpy = [ - {version = ">=1,<2", markers = "python_version < \"3.12\""}, - {version = ">=1.26.0,<2.0.0", markers = "python_version >= \"3.12\""}, + {version = ">=1.22.4,<2", markers = "python_version < \"3.12\""}, + {version = ">=1.26.2,<3", markers = "python_version >= \"3.12\""}, ] pydantic-settings = "^2.4.0" PyYAML = ">=5.3" @@ -2917,7 +2937,7 @@ url = "libs/community" [[package]] name = "langchain-core" -version = "0.3.20" +version = "0.3.25" description = "Building applications with LLMs through composability" optional = false python-versions = ">=3.9,<4.0" @@ -2926,7 +2946,7 @@ develop = true [package.dependencies] jsonpatch = "^1.33" -langsmith = "^0.1.125" +langsmith = ">=0.1.125,<0.3" packaging = ">=23.2,<25" pydantic = [ {version = ">=2.5.2,<3.0.0", markers = "python_full_version < \"3.12.4\""}, @@ -2962,7 +2982,7 @@ subdirectory = "libs/experimental" [[package]] name = "langchain-fireworks" -version = "0.2.1" +version = "0.2.5" description = "An integration package connecting Fireworks and LangChain" optional = false python-versions = ">=3.9,<4.0" @@ -2972,7 +2992,7 @@ develop = true [package.dependencies] aiohttp = "^3.9.1" fireworks-ai = ">=0.13.0" -langchain-core = "^0.3.9" +langchain-core = "^0.3.15" openai = "^1.10.0" requests = "^2" @@ -3010,7 +3030,7 @@ subdirectory = "libs/vertexai" [[package]] name = "langchain-groq" -version = "0.2.0" +version = "0.2.1" description = "An integration package connecting Groq and LangChain" optional = false python-versions = ">=3.9,<4.0" @@ -3019,7 +3039,7 @@ develop = true [package.dependencies] groq = ">=0.4.1,<1" -langchain-core = "^0.3" +langchain-core = "^0.3.15" [package.source] type = "directory" @@ -3027,7 +3047,7 @@ url = "libs/partners/groq" [[package]] name = "langchain-mistralai" -version = "0.2.0" +version = "0.2.3" description = "An integration package connecting Mistral and LangChain" optional = false python-versions = ">=3.9,<4.0" @@ -3037,7 +3057,7 @@ develop = true [package.dependencies] httpx = ">=0.25.2,<1" httpx-sse = ">=0.3.1,<1" -langchain-core = "^0.3.0" +langchain-core = "^0.3.21" pydantic = ">=2,<3" tokenizers = ">=0.15.1,<1" @@ -3047,7 +3067,7 @@ url = "libs/partners/mistralai" [[package]] name = "langchain-openai" -version = "0.2.4" +version = "0.2.12" description = "An integration package connecting OpenAI and LangChain" optional = false python-versions = ">=3.9,<4.0" @@ -3055,8 +3075,8 @@ files = [] develop = true [package.dependencies] -langchain-core = "^0.3.13" -openai = "^1.52.0" +langchain-core = "^0.3.21" +openai = "^1.55.3" tiktoken = ">=0.7,<1" [package.source] @@ -3065,7 +3085,7 @@ url = "libs/partners/openai" [[package]] name = "langchain-text-splitters" -version = "0.3.0" +version = "0.3.2" description = "LangChain text splitting utilities" optional = false python-versions = ">=3.9,<4.0" @@ -3073,7 +3093,7 @@ files = [] develop = true [package.dependencies] -langchain-core = "^0.3.0" +langchain-core = "^0.3.15" [package.source] type = "directory" @@ -4154,13 +4174,13 @@ sympy = "*" [[package]] name = "openai" -version = "1.52.2" +version = "1.57.4" description = "The official Python library for the openai API" optional = false -python-versions = ">=3.7.1" +python-versions = ">=3.8" files = [ - {file = "openai-1.52.2-py3-none-any.whl", hash = "sha256:57e9e37bc407f39bb6ec3a27d7e8fb9728b2779936daa1fcf95df17d3edfaccc"}, - {file = "openai-1.52.2.tar.gz", hash = "sha256:87b7d0f69d85f5641678d414b7ee3082363647a5c66a462ed7f3ccb59582da0d"}, + {file = "openai-1.57.4-py3-none-any.whl", hash = "sha256:7def1ab2d52f196357ce31b9cfcf4181529ce00838286426bb35be81c035dafb"}, + {file = "openai-1.57.4.tar.gz", hash = "sha256:a8f071a3e9198e2818f63aade68e759417b9f62c0971bdb83de82504b70b77f7"}, ] [package.dependencies] @@ -6646,13 +6666,13 @@ files = [ [[package]] name = "unstructured" -version = "0.15.14" +version = "0.16.11" description = "A library that prepares raw documents for downstream ML tasks." optional = false python-versions = "<3.13,>=3.9.0" files = [ - {file = "unstructured-0.15.14-py3-none-any.whl", hash = "sha256:502903cbcc60844c82f5351a0bc2e77f00f16a144cb884ac44d2f175470a1df8"}, - {file = "unstructured-0.15.14.tar.gz", hash = "sha256:876546c308c257314865996ce15745139c9fd4f79c7b4f09ad9d719d466b5b55"}, + {file = "unstructured-0.16.11-py3-none-any.whl", hash = "sha256:a92d5bc2c2b7bb23369641fb7a7f0daba1775639199306ce4cd83ca564a03763"}, + {file = "unstructured-0.16.11.tar.gz", hash = "sha256:33ebf68aae11ce33c8a96335296557b5abd8ba96eaba3e5a1554c0b9eee40bb5"}, ] [package.dependencies] @@ -6662,6 +6682,7 @@ chardet = "*" dataclasses-json = "*" emoji = "*" filetype = "*" +html5lib = "*" langdetect = "*" lxml = "*" markdown = {version = "*", optional = true, markers = "extra == \"md\""} @@ -6673,76 +6694,30 @@ python-magic = "*" python-oxmsg = "*" rapidfuzz = "*" requests = "*" -tabulate = "*" tqdm = "*" typing-extensions = "*" unstructured-client = "*" wrapt = "*" [package.extras] -airtable = ["pyairtable"] -all-docs = ["effdet", "google-cloud-vision", "markdown", "networkx", "onnx", "openpyxl", "pandas", "pdf2image", "pdfminer.six", "pi-heif", "pikepdf", "pypandoc", "pypdf", "python-docx (>=1.1.2)", "python-pptx (>=1.0.1)", "unstructured-inference (==0.7.36)", "unstructured.pytesseract (>=0.3.12)", "xlrd"] -astradb = ["astrapy"] -azure = ["adlfs", "fsspec"] -azure-cognitive-search = ["azure-search-documents"] -bedrock = ["boto3", "langchain-community"] -biomed = ["bs4"] -box = ["boxfs", "fsspec"] -chroma = ["chromadb (>0.4.14)", "importlib-metadata (>=8.2.0)", "tenacity (==8.5.0)", "typer (<=0.9.0)"] -clarifai = ["clarifai"] -confluence = ["atlassian-python-api"] +all-docs = ["effdet", "google-cloud-vision", "markdown", "networkx", "onnx", "openpyxl", "pandas", "pdf2image", "pdfminer.six", "pi-heif", "pikepdf", "pypandoc", "pypdf", "python-docx (>=1.1.2)", "python-pptx (>=1.0.1)", "unstructured-inference (==0.8.1)", "unstructured.pytesseract (>=0.3.12)", "xlrd"] csv = ["pandas"] -databricks-volumes = ["databricks-sdk"] -delta-table = ["deltalake (<=0.19.1)", "fsspec"] -discord = ["discord-py"] doc = ["python-docx (>=1.1.2)"] docx = ["python-docx (>=1.1.2)"] -dropbox = ["dropboxdrivefs", "fsspec"] -elasticsearch = ["elasticsearch[async]"] -embed-huggingface = ["langchain-huggingface"] -embed-mixedbreadai = ["mixedbread-ai"] -embed-octoai = ["openai", "tiktoken"] -embed-vertexai = ["langchain", "langchain-community", "langchain-google-vertexai"] -embed-voyageai = ["langchain", "langchain-voyageai"] epub = ["pypandoc"] -gcs = ["bs4", "fsspec", "gcsfs"] -github = ["pygithub (>1.58.0)"] -gitlab = ["python-gitlab"] -google-drive = ["google-api-python-client"] -hubspot = ["hubspot-api-client", "urllib3"] huggingface = ["langdetect", "sacremoses", "sentencepiece", "torch", "transformers"] -image = ["effdet", "google-cloud-vision", "onnx", "pdf2image", "pdfminer.six", "pi-heif", "pikepdf", "pypdf", "unstructured-inference (==0.7.36)", "unstructured.pytesseract (>=0.3.12)"] -jira = ["atlassian-python-api"] -kafka = ["confluent-kafka"] -local-inference = ["effdet", "google-cloud-vision", "markdown", "networkx", "onnx", "openpyxl", "pandas", "pdf2image", "pdfminer.six", "pi-heif", "pikepdf", "pypandoc", "pypdf", "python-docx (>=1.1.2)", "python-pptx (>=1.0.1)", "unstructured-inference (==0.7.36)", "unstructured.pytesseract (>=0.3.12)", "xlrd"] +image = ["effdet", "google-cloud-vision", "onnx", "pdf2image", "pdfminer.six", "pi-heif", "pikepdf", "pypdf", "unstructured-inference (==0.8.1)", "unstructured.pytesseract (>=0.3.12)"] +local-inference = ["effdet", "google-cloud-vision", "markdown", "networkx", "onnx", "openpyxl", "pandas", "pdf2image", "pdfminer.six", "pi-heif", "pikepdf", "pypandoc", "pypdf", "python-docx (>=1.1.2)", "python-pptx (>=1.0.1)", "unstructured-inference (==0.8.1)", "unstructured.pytesseract (>=0.3.12)", "xlrd"] md = ["markdown"] -mongodb = ["pymongo"] -notion = ["htmlBuilder", "notion-client"] odt = ["pypandoc", "python-docx (>=1.1.2)"] -onedrive = ["Office365-REST-Python-Client", "bs4", "msal"] -openai = ["langchain-openai"] -opensearch = ["opensearch-py"] org = ["pypandoc"] -outlook = ["Office365-REST-Python-Client", "msal"] paddleocr = ["paddlepaddle (==3.0.0b1)", "unstructured.paddleocr (==2.8.1.0)"] -pdf = ["effdet", "google-cloud-vision", "onnx", "pdf2image", "pdfminer.six", "pi-heif", "pikepdf", "pypdf", "unstructured-inference (==0.7.36)", "unstructured.pytesseract (>=0.3.12)"] -pinecone = ["pinecone-client (>=3.7.1)"] -postgres = ["psycopg2-binary"] +pdf = ["effdet", "google-cloud-vision", "onnx", "pdf2image", "pdfminer.six", "pi-heif", "pikepdf", "pypdf", "unstructured-inference (==0.8.1)", "unstructured.pytesseract (>=0.3.12)"] ppt = ["python-pptx (>=1.0.1)"] pptx = ["python-pptx (>=1.0.1)"] -qdrant = ["qdrant-client"] -reddit = ["praw"] rst = ["pypandoc"] rtf = ["pypandoc"] -s3 = ["fsspec", "s3fs"] -salesforce = ["simple-salesforce"] -sftp = ["fsspec", "paramiko"] -sharepoint = ["Office365-REST-Python-Client", "msal"] -singlestore = ["singlestoredb"] -slack = ["slack-sdk"] tsv = ["pandas"] -weaviate = ["weaviate-client"] -wikipedia = ["wikipedia"] xlsx = ["networkx", "openpyxl", "pandas", "xlrd"] [[package]] @@ -7407,4 +7382,4 @@ type = ["pytest-mypy"] [metadata] lock-version = "2.0" python-versions = ">=3.9,<4.0" -content-hash = "cb6b45ac7f487c6510a0eeef80c6cfd4b163f671eedb42d9ebf5315f42fa1ab1" +content-hash = "138c279994b75a02c377fd5fde3808770c9ae6259c59728b9986480d93790aa1" diff --git a/pyproject.toml b/pyproject.toml index b4172ec49bae5..0466cad3b8ed8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -47,7 +47,8 @@ grandalf = "^0.8" lark = "^1.1.9" pandas = "^2" rank-bm25 = "^0.2.2" -unstructured = { version = "^0.15.12", extras = ["md"], python = "<3.13" } +tabulate = "^0.9.0" +unstructured = { version = "^0.16.11", extras = ["md"], python = "<3.13" } wikipedia = "^1.4.0" pypdf = "^5.0.0" vcrpy = "^6.0.1"