From 7c512fbdd227982a5c1aa8475e5ac59668058278 Mon Sep 17 00:00:00 2001 From: Fernando Silva Date: Wed, 12 Jun 2024 18:17:18 -0300 Subject: [PATCH] Split HuggingFace embeddings in HuggingFace API and TextGenerationInference packages (#14013) --- .../api_reference/embeddings/huggingface.md | 1 - .../embeddings/huggingface_api.md | 4 + docs/docs/api_reference/llms/huggingface.md | 1 - .../api_reference/llms/huggingface_api.md | 4 + .../llms/text_generation_inference.md | 4 + .../examples/cookbooks/llama3_cookbook.ipynb | 5 +- .../cookbooks/prometheus2_cookbook.ipynb | 4 +- .../examples/embeddings/jina_embeddings.ipynb | 3 + .../evaluation/prometheus_evaluation.ipynb | 4 +- ...llm_judge_single_grading_correctness.ipynb | 4 +- .../pairwise/finetune_llm_judge.ipynb | 4 +- docs/docs/examples/llm/huggingface.ipynb | 21 +- .../examples/node_postprocessor/rankGPT.ipynb | 7 +- docs/mkdocs.yml | 6 + .../llama_index/core/embeddings/loading.py | 2 +- .../core/ingestion/transformations.py | 2 +- .../llama_index/core/llms/loading.py | 2 +- .../.gitignore | 153 ++++++ .../BUILD | 3 + .../Makefile | 17 + .../README.md | 26 + .../embeddings/huggingface_api/BUILD | 1 + .../embeddings/huggingface_api/__init__.py | 7 + .../embeddings/huggingface_api/base.py | 217 +++++++++ .../embeddings/huggingface_api/pooling.py | 74 +++ .../pyproject.toml | 67 +++ .../tests/BUILD | 1 + .../tests/__init__.py | 0 .../tests/test_embeddings_huggingface.py | 11 + .../tests/test_hf_inference.py | 108 +++++ .../embeddings/huggingface/base.py | 5 + .../.gitignore | 153 ++++++ .../llama-index-llms-huggingface-api/BUILD | 3 + .../llama-index-llms-huggingface-api/Makefile | 17 + .../README.md | 26 + .../llama_index/llms/huggingface_api/BUILD | 1 + .../llms/huggingface_api/__init__.py | 5 + .../llama_index/llms/huggingface_api/base.py | 284 +++++++++++ .../pyproject.toml | 63 +++ .../tests/BUILD | 1 + .../tests/__init__.py | 0 .../tests/test_huggingface_api.py | 115 +++++ .../tests/test_llms_huggingface_api.py | 7 + .../llama_index/llms/huggingface/base.py | 9 + .../.gitignore | 153 ++++++ .../BUILD | 3 + .../Makefile | 17 + .../README.md | 24 + .../llms/text_generation_inference/BUILD | 1 + .../text_generation_inference/__init__.py | 5 + .../llms/text_generation_inference/base.py | 445 ++++++++++++++++++ .../llms/text_generation_inference/utils.py | 66 +++ .../pyproject.toml | 63 +++ .../tests/BUILD | 1 + .../tests/__init__.py | 0 .../test_llms_text_generation_inference.py | 7 + .../rag_moderator_llama_guard_pack.ipynb | 4 +- .../llama-index-utils-huggingface/.gitignore | 153 ++++++ .../llama-index-utils-huggingface/BUILD | 3 + .../llama-index-utils-huggingface/Makefile | 17 + .../llama-index-utils-huggingface/README.md | 1 + .../llama_index/utils/huggingface/BUILD | 1 + .../llama_index/utils/huggingface/__init__.py | 31 ++ .../llama_index/utils/huggingface/base.py | 99 ++++ .../pyproject.toml | 64 +++ 65 files changed, 2583 insertions(+), 27 deletions(-) create mode 100644 docs/docs/api_reference/embeddings/huggingface_api.md create mode 100644 docs/docs/api_reference/llms/huggingface_api.md create mode 100644 docs/docs/api_reference/llms/text_generation_inference.md create mode 100644 llama-index-integrations/embeddings/llama-index-embeddings-huggingface-api/.gitignore create mode 100644 llama-index-integrations/embeddings/llama-index-embeddings-huggingface-api/BUILD create mode 100644 llama-index-integrations/embeddings/llama-index-embeddings-huggingface-api/Makefile create mode 100644 
llama-index-integrations/embeddings/llama-index-embeddings-huggingface-api/README.md create mode 100644 llama-index-integrations/embeddings/llama-index-embeddings-huggingface-api/llama_index/embeddings/huggingface_api/BUILD create mode 100644 llama-index-integrations/embeddings/llama-index-embeddings-huggingface-api/llama_index/embeddings/huggingface_api/__init__.py create mode 100644 llama-index-integrations/embeddings/llama-index-embeddings-huggingface-api/llama_index/embeddings/huggingface_api/base.py create mode 100644 llama-index-integrations/embeddings/llama-index-embeddings-huggingface-api/llama_index/embeddings/huggingface_api/pooling.py create mode 100644 llama-index-integrations/embeddings/llama-index-embeddings-huggingface-api/pyproject.toml create mode 100644 llama-index-integrations/embeddings/llama-index-embeddings-huggingface-api/tests/BUILD create mode 100644 llama-index-integrations/embeddings/llama-index-embeddings-huggingface-api/tests/__init__.py create mode 100644 llama-index-integrations/embeddings/llama-index-embeddings-huggingface-api/tests/test_embeddings_huggingface.py create mode 100644 llama-index-integrations/embeddings/llama-index-embeddings-huggingface-api/tests/test_hf_inference.py create mode 100644 llama-index-integrations/llms/llama-index-llms-huggingface-api/.gitignore create mode 100644 llama-index-integrations/llms/llama-index-llms-huggingface-api/BUILD create mode 100644 llama-index-integrations/llms/llama-index-llms-huggingface-api/Makefile create mode 100644 llama-index-integrations/llms/llama-index-llms-huggingface-api/README.md create mode 100644 llama-index-integrations/llms/llama-index-llms-huggingface-api/llama_index/llms/huggingface_api/BUILD create mode 100644 llama-index-integrations/llms/llama-index-llms-huggingface-api/llama_index/llms/huggingface_api/__init__.py create mode 100644 llama-index-integrations/llms/llama-index-llms-huggingface-api/llama_index/llms/huggingface_api/base.py create mode 100644 llama-index-integrations/llms/llama-index-llms-huggingface-api/pyproject.toml create mode 100644 llama-index-integrations/llms/llama-index-llms-huggingface-api/tests/BUILD create mode 100644 llama-index-integrations/llms/llama-index-llms-huggingface-api/tests/__init__.py create mode 100644 llama-index-integrations/llms/llama-index-llms-huggingface-api/tests/test_huggingface_api.py create mode 100644 llama-index-integrations/llms/llama-index-llms-huggingface-api/tests/test_llms_huggingface_api.py create mode 100644 llama-index-integrations/llms/llama-index-llms-text-generation-inference/.gitignore create mode 100644 llama-index-integrations/llms/llama-index-llms-text-generation-inference/BUILD create mode 100644 llama-index-integrations/llms/llama-index-llms-text-generation-inference/Makefile create mode 100644 llama-index-integrations/llms/llama-index-llms-text-generation-inference/README.md create mode 100644 llama-index-integrations/llms/llama-index-llms-text-generation-inference/llama_index/llms/text_generation_inference/BUILD create mode 100644 llama-index-integrations/llms/llama-index-llms-text-generation-inference/llama_index/llms/text_generation_inference/__init__.py create mode 100644 llama-index-integrations/llms/llama-index-llms-text-generation-inference/llama_index/llms/text_generation_inference/base.py create mode 100644 llama-index-integrations/llms/llama-index-llms-text-generation-inference/llama_index/llms/text_generation_inference/utils.py create mode 100644 
llama-index-integrations/llms/llama-index-llms-text-generation-inference/pyproject.toml create mode 100644 llama-index-integrations/llms/llama-index-llms-text-generation-inference/tests/BUILD create mode 100644 llama-index-integrations/llms/llama-index-llms-text-generation-inference/tests/__init__.py create mode 100644 llama-index-integrations/llms/llama-index-llms-text-generation-inference/tests/test_llms_text_generation_inference.py create mode 100644 llama-index-utils/llama-index-utils-huggingface/.gitignore create mode 100644 llama-index-utils/llama-index-utils-huggingface/BUILD create mode 100644 llama-index-utils/llama-index-utils-huggingface/Makefile create mode 100644 llama-index-utils/llama-index-utils-huggingface/README.md create mode 100644 llama-index-utils/llama-index-utils-huggingface/llama_index/utils/huggingface/BUILD create mode 100644 llama-index-utils/llama-index-utils-huggingface/llama_index/utils/huggingface/__init__.py create mode 100644 llama-index-utils/llama-index-utils-huggingface/llama_index/utils/huggingface/base.py create mode 100644 llama-index-utils/llama-index-utils-huggingface/pyproject.toml diff --git a/docs/docs/api_reference/embeddings/huggingface.md b/docs/docs/api_reference/embeddings/huggingface.md index 597bd6798f3a5..0a8842e38b31f 100644 --- a/docs/docs/api_reference/embeddings/huggingface.md +++ b/docs/docs/api_reference/embeddings/huggingface.md @@ -2,4 +2,3 @@ options: members: - HuggingFaceEmbedding - - HuggingFaceInferenceAPIEmbedding diff --git a/docs/docs/api_reference/embeddings/huggingface_api.md b/docs/docs/api_reference/embeddings/huggingface_api.md new file mode 100644 index 0000000000000..26888df65a1c3 --- /dev/null +++ b/docs/docs/api_reference/embeddings/huggingface_api.md @@ -0,0 +1,4 @@ +::: llama_index.embeddings.huggingface_api + options: + members: + - HuggingFaceInferenceAPIEmbedding diff --git a/docs/docs/api_reference/llms/huggingface.md b/docs/docs/api_reference/llms/huggingface.md index a03d1953b7c01..3edfc3aaf212e 100644 --- a/docs/docs/api_reference/llms/huggingface.md +++ b/docs/docs/api_reference/llms/huggingface.md @@ -1,5 +1,4 @@ ::: llama_index.llms.huggingface options: members: - - HuggingFaceInferenceAPI - HuggingFaceLLM diff --git a/docs/docs/api_reference/llms/huggingface_api.md b/docs/docs/api_reference/llms/huggingface_api.md new file mode 100644 index 0000000000000..33ee697f599b2 --- /dev/null +++ b/docs/docs/api_reference/llms/huggingface_api.md @@ -0,0 +1,4 @@ +::: llama_index.llms.huggingface_api + options: + members: + - HuggingFaceInferenceAPI diff --git a/docs/docs/api_reference/llms/text_generation_inference.md b/docs/docs/api_reference/llms/text_generation_inference.md new file mode 100644 index 0000000000000..afeb9ae9d3543 --- /dev/null +++ b/docs/docs/api_reference/llms/text_generation_inference.md @@ -0,0 +1,4 @@ +::: llama_index.llms.text_generation_inference + options: + members: + - TextGenerationInference diff --git a/docs/docs/examples/cookbooks/llama3_cookbook.ipynb b/docs/docs/examples/cookbooks/llama3_cookbook.ipynb index d40468490be1b..610c56e064765 100644 --- a/docs/docs/examples/cookbooks/llama3_cookbook.ipynb +++ b/docs/docs/examples/cookbooks/llama3_cookbook.ipynb @@ -29,7 +29,8 @@ "source": [ "!pip install llama-index\n", "!pip install llama-index-llms-huggingface\n", - "!pip install llama-index-embeddings-huggingface" + "!pip install llama-index-embeddings-huggingface\n", + "!pip install llama-index-embeddings-huggingface-api" ] }, { @@ -166,7 +167,7 @@ "source": [ "## You can deploy 
the model on HF Inference Endpoint and use it\n", "\n", - "# from llama_index.llms.huggingface import HuggingFaceInferenceAPI\n", + "# from llama_index.llms.huggingface_api import HuggingFaceInferenceAPI\n", "\n", "# llm = HuggingFaceInferenceAPI(\n", "# model_name=\"\",\n", diff --git a/docs/docs/examples/cookbooks/prometheus2_cookbook.ipynb b/docs/docs/examples/cookbooks/prometheus2_cookbook.ipynb index fbd28ba0a4c5f..bc8f6aa3ced84 100644 --- a/docs/docs/examples/cookbooks/prometheus2_cookbook.ipynb +++ b/docs/docs/examples/cookbooks/prometheus2_cookbook.ipynb @@ -53,7 +53,7 @@ "outputs": [], "source": [ "!pip install llama-index\n", - "!pip install llama-index-llms-huggingface" + "!pip install llama-index-llms-huggingface-api" ] }, { @@ -145,7 +145,7 @@ "metadata": {}, "outputs": [], "source": [ - "from llama_index.llms.huggingface import HuggingFaceInferenceAPI\n", + "from llama_index.llms.huggingface_api import HuggingFaceInferenceAPI\n", "\n", "HF_TOKEN = \"YOUR HF TOKEN\"\n", "HF_ENDPOINT_URL = \"YOUR HF ENDPOINT URL\"\n", diff --git a/docs/docs/examples/embeddings/jina_embeddings.ipynb b/docs/docs/examples/embeddings/jina_embeddings.ipynb index f79d19ae81b9c..1ca790bfa2326 100644 --- a/docs/docs/examples/embeddings/jina_embeddings.ipynb +++ b/docs/docs/examples/embeddings/jina_embeddings.ipynb @@ -26,6 +26,7 @@ "outputs": [], "source": [ "%pip install llama-index-embeddings-huggingface\n", + "%pip install llama-index-embeddings-huggingface-api\n", "%pip install llama-index-embeddings-openai" ] }, @@ -58,6 +59,8 @@ "source": [ "from llama_index.embeddings.huggingface import (\n", " HuggingFaceEmbedding,\n", + ")\n", + "from llama_index.embeddings.huggingface_api import (\n", " HuggingFaceInferenceAPIEmbedding,\n", ")\n", "from llama_index.embeddings.openai import OpenAIEmbedding\n", diff --git a/docs/docs/examples/evaluation/prometheus_evaluation.ipynb b/docs/docs/examples/evaluation/prometheus_evaluation.ipynb index f6fdd8dcfc0cd..bdf6ecccfc6d0 100644 --- a/docs/docs/examples/evaluation/prometheus_evaluation.ipynb +++ b/docs/docs/examples/evaluation/prometheus_evaluation.ipynb @@ -76,7 +76,7 @@ "outputs": [], "source": [ "%pip install llama-index-llms-openai\n", - "%pip install llama-index-llms-huggingface" + "%pip install llama-index-llms-huggingface-api" ] }, { @@ -146,7 +146,7 @@ } ], "source": [ - "from llama_index.llms.huggingface import HuggingFaceInferenceAPI\n", + "from llama_index.llms.huggingface_api import HuggingFaceInferenceAPI\n", "\n", "HF_TOKEN = \"YOUR HF TOKEN\"\n", "HF_ENDPOINT_URL = (\n", diff --git a/docs/docs/examples/finetuning/llm_judge/correctness/finetune_llm_judge_single_grading_correctness.ipynb b/docs/docs/examples/finetuning/llm_judge/correctness/finetune_llm_judge_single_grading_correctness.ipynb index dffed455848df..6d4a2e0920a84 100644 --- a/docs/docs/examples/finetuning/llm_judge/correctness/finetune_llm_judge_single_grading_correctness.ipynb +++ b/docs/docs/examples/finetuning/llm_judge/correctness/finetune_llm_judge_single_grading_correctness.ipynb @@ -27,7 +27,7 @@ "%pip install llama-index-finetuning\n", "%pip install llama-index-llms-openai\n", "%pip install llama-index-finetuning-callbacks\n", - "%pip install llama-index-llms-huggingface" + "%pip install llama-index-llms-huggingface-api" ] }, { @@ -265,7 +265,7 @@ ], "source": [ "from llama_index.core.query_engine import RetrieverQueryEngine\n", - "from llama_index.llms.huggingface import HuggingFaceInferenceAPI\n", + "from llama_index.llms.huggingface_api import HuggingFaceInferenceAPI\n", 
"\n", "llm = HuggingFaceInferenceAPI(\n", " model_name=\"meta-llama/Llama-2-7b-chat-hf\",\n", diff --git a/docs/docs/examples/finetuning/llm_judge/pairwise/finetune_llm_judge.ipynb b/docs/docs/examples/finetuning/llm_judge/pairwise/finetune_llm_judge.ipynb index 806440204a1ca..1cbbc9447af4d 100644 --- a/docs/docs/examples/finetuning/llm_judge/pairwise/finetune_llm_judge.ipynb +++ b/docs/docs/examples/finetuning/llm_judge/pairwise/finetune_llm_judge.ipynb @@ -28,7 +28,7 @@ "%pip install llama-index-finetuning\n", "%pip install llama-index-llms-openai\n", "%pip install llama-index-finetuning-callbacks\n", - "%pip install llama-index-llms-huggingface" + "%pip install llama-index-llms-huggingface-api" ] }, { @@ -410,7 +410,7 @@ "outputs": [], "source": [ "from llama_index.core.query_engine import RetrieverQueryEngine\n", - "from llama_index.llms.huggingface import HuggingFaceInferenceAPI\n", + "from llama_index.llms.huggingface_api import HuggingFaceInferenceAPI\n", "\n", "\n", "def create_query_engine(\n", diff --git a/docs/docs/examples/llm/huggingface.ipynb b/docs/docs/examples/llm/huggingface.ipynb index 8625117882528..34add77208449 100644 --- a/docs/docs/examples/llm/huggingface.ipynb +++ b/docs/docs/examples/llm/huggingface.ipynb @@ -49,7 +49,8 @@ "metadata": {}, "outputs": [], "source": [ - "%pip install llama-index-llms-huggingface" + "%pip install llama-index-llms-huggingface\n", + "%pip install llama-index-llms-huggingface-api" ] }, { @@ -99,10 +100,8 @@ "import os\n", "from typing import List, Optional\n", "\n", - "from llama_index.llms.huggingface import (\n", - " HuggingFaceInferenceAPI,\n", - " HuggingFaceLLM,\n", - ")\n", + "from llama_index.llms.huggingface import HuggingFaceLLM\n", + "from llama_index.llms.huggingface_api import HuggingFaceInferenceAPI\n", "\n", "# SEE: https://huggingface.co/docs/hub/security-tokens\n", "# We just need a token with read permissions for this demo\n", @@ -227,6 +226,16 @@ "The new `TextGenerationInference` class allows to interface with endpoints running [`text-generation-inference`, TGI](https://huggingface.co/docs/text-generation-inference/index). In addition to blazingly fast inference, it supports `tool` usage starting from version `2.0.1`. 
" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "46c5c06d", + "metadata": {}, + "outputs": [], + "source": [ + "%pip install llama-index-llms-text-generation-inference" + ] + }, { "cell_type": "markdown", "id": "055ddcb1", @@ -253,7 +262,7 @@ "import os\n", "from typing import List, Optional\n", "\n", - "from llama_index.llms.huggingface import (\n", + "from llama_index.llms.text_generation_inference import (\n", " TextGenerationInference,\n", ")\n", "\n", diff --git a/docs/docs/examples/node_postprocessor/rankGPT.ipynb b/docs/docs/examples/node_postprocessor/rankGPT.ipynb index 909b08c0d5ce1..aefc3c4ffc71d 100644 --- a/docs/docs/examples/node_postprocessor/rankGPT.ipynb +++ b/docs/docs/examples/node_postprocessor/rankGPT.ipynb @@ -32,6 +32,7 @@ "source": [ "%pip install llama-index-postprocessor-rankgpt-rerank\n", "%pip install llama-index-llms-huggingface\n", + "%pip install llama-index-llms-huggingface-api\n", "%pip install llama-index-llms-openai\n", "%pip install llama-index-llms-ollama" ] @@ -484,10 +485,8 @@ "from llama_index.core import QueryBundle\n", "import pandas as pd\n", "from IPython.display import display, HTML\n", - "from llama_index.llms.huggingface import (\n", - " HuggingFaceInferenceAPI,\n", - " HuggingFaceLLM,\n", - ")\n", + "from llama_index.llms.huggingface_api import HuggingFaceInferenceAPI\n", + "from llama_index.llms.huggingface import HuggingFaceLLM\n", "\n", "from llama_index.postprocessor.rankgpt_rerank import RankGPTRerank\n", "\n", diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml index 7e80c50490938..eb624ecf60eef 100644 --- a/docs/mkdocs.yml +++ b/docs/mkdocs.yml @@ -754,6 +754,7 @@ nav: - ./api_reference/embeddings/google.md - ./api_reference/embeddings/gradient.md - ./api_reference/embeddings/huggingface.md + - ./api_reference/embeddings/huggingface_api.md - ./api_reference/embeddings/huggingface_itrex.md - ./api_reference/embeddings/huggingface_openvino.md - ./api_reference/embeddings/huggingface_optimum.md @@ -839,6 +840,7 @@ nav: - ./api_reference/llms/gradient.md - ./api_reference/llms/groq.md - ./api_reference/llms/huggingface.md + - ./api_reference/llms/huggingface_api.md - ./api_reference/llms/index.md - ./api_reference/llms/ipex_llm.md - ./api_reference/llms/konko.md @@ -877,6 +879,7 @@ nav: - ./api_reference/llms/rungpt.md - ./api_reference/llms/sagemaker_endpoint.md - ./api_reference/llms/solar.md + - ./api_reference/llms/text_generation_inference.md - ./api_reference/llms/together.md - ./api_reference/llms/unify.md - ./api_reference/llms/upstage.md @@ -1964,6 +1967,9 @@ plugins: - ../llama-index-integrations/readers/llama-index-readers-azure-devops - ../llama-index-integrations/retrievers/llama-index-retrievers-duckdb-retriever - ../llama-index-packs/llama-index-packs-zenguard + - ../llama-index-integrations/embeddings/llama-index-embeddings-huggingface-api + - ../llama-index-integrations/llms/llama-index-llms-text-generation-inference + - ../llama-index-integrations/llms/llama-index-llms-huggingface-api - redirects: redirect_maps: ./api/llama_index.vector_stores.MongoDBAtlasVectorSearch.html: api_reference/storage/vector_store/mongodb.md diff --git a/llama-index-core/llama_index/core/embeddings/loading.py b/llama-index-core/llama_index/core/embeddings/loading.py index fd84ee64ed7f1..ac1449b970c26 100644 --- a/llama-index-core/llama_index/core/embeddings/loading.py +++ b/llama-index-core/llama_index/core/embeddings/loading.py @@ -25,7 +25,7 @@ pass try: - from llama_index.embeddings.huggingface import ( + from 
llama_index.embeddings.huggingface_api import ( HuggingFaceInferenceAPIEmbedding, ) # pants: no-infer-dep diff --git a/llama-index-core/llama_index/core/ingestion/transformations.py b/llama-index-core/llama_index/core/ingestion/transformations.py index 1cd488ebc135f..49e62df3715d1 100644 --- a/llama-index-core/llama_index/core/ingestion/transformations.py +++ b/llama-index-core/llama_index/core/ingestion/transformations.py @@ -285,7 +285,7 @@ def build_configured_transformation( pass try: - from llama_index.embeddings.huggingface import ( + from llama_index.embeddings.huggingface_api import ( HuggingFaceInferenceAPIEmbedding, ) # pants: no-infer-dep diff --git a/llama-index-core/llama_index/core/llms/loading.py b/llama-index-core/llama_index/core/llms/loading.py index 005a69c89a7f0..20ce3f0dcfea3 100644 --- a/llama-index-core/llama_index/core/llms/loading.py +++ b/llama-index-core/llama_index/core/llms/loading.py @@ -25,7 +25,7 @@ pass try: - from llama_index.llms.huggingface import ( + from llama_index.llms.huggingface_api import ( HuggingFaceInferenceAPI, ) # pants: no-infer-dep diff --git a/llama-index-integrations/embeddings/llama-index-embeddings-huggingface-api/.gitignore b/llama-index-integrations/embeddings/llama-index-embeddings-huggingface-api/.gitignore new file mode 100644 index 0000000000000..990c18de22908 --- /dev/null +++ b/llama-index-integrations/embeddings/llama-index-embeddings-huggingface-api/.gitignore @@ -0,0 +1,153 @@ +llama_index/_static +.DS_Store +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +bin/ +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +etc/ +include/ +lib/ +lib64/ +parts/ +sdist/ +share/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +.ruff_cache + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints +notebooks/ + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ +pyvenv.cfg + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# Jetbrains +.idea +modules/ +*.swp + +# VsCode +.vscode + +# pipenv +Pipfile +Pipfile.lock + +# pyright +pyrightconfig.json diff --git a/llama-index-integrations/embeddings/llama-index-embeddings-huggingface-api/BUILD b/llama-index-integrations/embeddings/llama-index-embeddings-huggingface-api/BUILD new file mode 100644 index 0000000000000..0896ca890d8bf --- /dev/null +++ b/llama-index-integrations/embeddings/llama-index-embeddings-huggingface-api/BUILD @@ -0,0 +1,3 @@ +poetry_requirements( + name="poetry", +) diff --git a/llama-index-integrations/embeddings/llama-index-embeddings-huggingface-api/Makefile b/llama-index-integrations/embeddings/llama-index-embeddings-huggingface-api/Makefile new file mode 100644 index 0000000000000..b9eab05aa3706 --- /dev/null +++ b/llama-index-integrations/embeddings/llama-index-embeddings-huggingface-api/Makefile @@ -0,0 +1,17 @@ +GIT_ROOT ?= $(shell git rev-parse --show-toplevel) + +help: ## Show all Makefile targets. + @grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[33m%-30s\033[0m %s\n", $$1, $$2}' + +format: ## Run code autoformatters (black). + pre-commit install + git ls-files | xargs pre-commit run black --files + +lint: ## Run linters: pre-commit (black, ruff, codespell) and mypy + pre-commit install && git ls-files | xargs pre-commit run --show-diff-on-failure --files + +test: ## Run tests via pytest. + pytest tests + +watch-docs: ## Build and watch documentation. + sphinx-autobuild docs/ docs/_build/html --open-browser --watch $(GIT_ROOT)/llama_index/ diff --git a/llama-index-integrations/embeddings/llama-index-embeddings-huggingface-api/README.md b/llama-index-integrations/embeddings/llama-index-embeddings-huggingface-api/README.md new file mode 100644 index 0000000000000..4b3371966a15d --- /dev/null +++ b/llama-index-integrations/embeddings/llama-index-embeddings-huggingface-api/README.md @@ -0,0 +1,26 @@ +# LlamaIndex Embeddings Integration: Huggingface API + +Integration with Hugging Face's Inference API for embeddings. + +For more information on Hugging Face's Inference API, visit [Hugging Face's Inference API documentation](https://huggingface.co/docs/api-inference/quicktour). 
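For existing code, the practical effect of this split is an import-path change; the before/after below is a sketch assembled from the notebook and loader diffs earlier in this patch:

```python
# Import-path change implied by this patch (see the jina_embeddings.ipynb and
# core loading.py diffs above).
# Before the split, both classes lived in llama-index-embeddings-huggingface:
#   from llama_index.embeddings.huggingface import (
#       HuggingFaceEmbedding,
#       HuggingFaceInferenceAPIEmbedding,
#   )
# After the split, the Inference API wrapper comes from the new package:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.embeddings.huggingface_api import HuggingFaceInferenceAPIEmbedding
```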
+ +## Installation + +```shell +pip install llama-index-embeddings-huggingface-api +``` + +## Usage + +```python +from llama_index.embeddings.huggingface_api import ( + HuggingFaceInferenceAPIEmbedding, +) + +my_embed = HuggingFaceInferenceAPIEmbedding( + model_name="BAAI/bge-small-en-v1.5", + token="", # Optional +) + +embeddings = my_embed.get_text_embedding("Why sky is blue") +``` diff --git a/llama-index-integrations/embeddings/llama-index-embeddings-huggingface-api/llama_index/embeddings/huggingface_api/BUILD b/llama-index-integrations/embeddings/llama-index-embeddings-huggingface-api/llama_index/embeddings/huggingface_api/BUILD new file mode 100644 index 0000000000000..db46e8d6c978c --- /dev/null +++ b/llama-index-integrations/embeddings/llama-index-embeddings-huggingface-api/llama_index/embeddings/huggingface_api/BUILD @@ -0,0 +1 @@ +python_sources() diff --git a/llama-index-integrations/embeddings/llama-index-embeddings-huggingface-api/llama_index/embeddings/huggingface_api/__init__.py b/llama-index-integrations/embeddings/llama-index-embeddings-huggingface-api/llama_index/embeddings/huggingface_api/__init__.py new file mode 100644 index 0000000000000..8359f3684f8fd --- /dev/null +++ b/llama-index-integrations/embeddings/llama-index-embeddings-huggingface-api/llama_index/embeddings/huggingface_api/__init__.py @@ -0,0 +1,7 @@ +from llama_index.embeddings.huggingface_api.base import ( + HuggingFaceInferenceAPIEmbedding, +) + +__all__ = [ + "HuggingFaceInferenceAPIEmbedding", +] diff --git a/llama-index-integrations/embeddings/llama-index-embeddings-huggingface-api/llama_index/embeddings/huggingface_api/base.py b/llama-index-integrations/embeddings/llama-index-embeddings-huggingface-api/llama_index/embeddings/huggingface_api/base.py new file mode 100644 index 0000000000000..012abd0a74f9f --- /dev/null +++ b/llama-index-integrations/embeddings/llama-index-embeddings-huggingface-api/llama_index/embeddings/huggingface_api/base.py @@ -0,0 +1,217 @@ +import asyncio +import logging +from typing import Any, Callable, Dict, List, Optional, Sequence, Union + +from huggingface_hub import ( + AsyncInferenceClient, + InferenceClient, + model_info, +) +from huggingface_hub.hf_api import ModelInfo +from llama_index.core.base.embeddings.base import ( + BaseEmbedding, + Embedding, +) +from llama_index.core.bridge.pydantic import Field, PrivateAttr +from llama_index.embeddings.huggingface_api.pooling import Pooling +from llama_index.utils.huggingface import ( + format_query, + format_text, +) + +logger = logging.getLogger(__name__) + + +class HuggingFaceInferenceAPIEmbedding(BaseEmbedding): # type: ignore[misc] + """ + Wrapper on the Hugging Face's Inference API for embeddings. + + Overview of the design: + - Uses the feature extraction task: https://huggingface.co/tasks/feature-extraction + """ + + pooling: Optional[Pooling] = Field( + default=Pooling.CLS, + description="Pooling strategy. If None, the model's default pooling is used.", + ) + query_instruction: Optional[str] = Field( + default=None, description="Instruction to prepend during query embedding." + ) + text_instruction: Optional[str] = Field( + default=None, description="Instruction to prepend during text embedding." + ) + + # Corresponds with huggingface_hub.InferenceClient + model_name: Optional[str] = Field( + default=None, + description="Hugging Face model name. If None, the task will be used.", + ) + token: Union[str, bool, None] = Field( + default=None, + description=( + "Hugging Face token. Will default to the locally saved token. 
Pass " + "token=False if you don’t want to send your token to the server." + ), + ) + timeout: Optional[float] = Field( + default=None, + description=( + "The maximum number of seconds to wait for a response from the server." + " Loading a new model in Inference API can take up to several minutes." + " Defaults to None, meaning it will loop until the server is available." + ), + ) + headers: Dict[str, str] = Field( + default=None, + description=( + "Additional headers to send to the server. By default only the" + " authorization and user-agent headers are sent. Values in this dictionary" + " will override the default values." + ), + ) + cookies: Dict[str, str] = Field( + default=None, description="Additional cookies to send to the server." + ) + task: Optional[str] = Field( + default=None, + description=( + "Optional task to pick Hugging Face's recommended model, used when" + " model_name is left as default of None." + ), + ) + _sync_client: "InferenceClient" = PrivateAttr() + _async_client: "AsyncInferenceClient" = PrivateAttr() + _get_model_info: "Callable[..., ModelInfo]" = PrivateAttr() + + def _get_inference_client_kwargs(self) -> Dict[str, Any]: + """Extract the Hugging Face InferenceClient construction parameters.""" + return { + "model": self.model_name, + "token": self.token, + "timeout": self.timeout, + "headers": self.headers, + "cookies": self.cookies, + } + + def __init__(self, **kwargs: Any) -> None: + """Initialize. + + Args: + kwargs: See the class-level Fields. + """ + if kwargs.get("model_name") is None: + task = kwargs.get("task", "") + # NOTE: task being None or empty string leads to ValueError, + # which ensures model is present + kwargs["model_name"] = InferenceClient.get_recommended_model(task=task) + logger.debug( + f"Using Hugging Face's recommended model {kwargs['model_name']}" + f" given task {task}." + ) + print(kwargs["model_name"], flush=True) + super().__init__(**kwargs) # Populate pydantic Fields + self._sync_client = InferenceClient(**self._get_inference_client_kwargs()) + self._async_client = AsyncInferenceClient(**self._get_inference_client_kwargs()) + self._get_model_info = model_info + + def validate_supported(self, task: str) -> None: + """ + Confirm the contained model_name is deployed on the Inference API service. + + Args: + task: Hugging Face task to check within. A list of all tasks can be + found here: https://huggingface.co/tasks + """ + all_models = self._sync_client.list_deployed_models(frameworks="all") + try: + if self.model_name not in all_models[task]: + raise ValueError( + "The Inference API service doesn't have the model" + f" {self.model_name!r} deployed." + ) + except KeyError as exc: + raise KeyError( + f"Input task {task!r} not in possible tasks {list(all_models.keys())}." 
+ ) from exc + + def get_model_info(self, **kwargs: Any) -> "ModelInfo": + """Get metadata on the current model from Hugging Face.""" + return self._get_model_info(self.model_name, **kwargs) + + @classmethod + def class_name(cls) -> str: + return "HuggingFaceInferenceAPIEmbedding" + + async def _async_embed_single(self, text: str) -> Embedding: + embedding = await self._async_client.feature_extraction(text) + if len(embedding.shape) == 1: + return embedding.tolist() + embedding = embedding.squeeze(axis=0) + if len(embedding.shape) == 1: # Some models pool internally + return embedding.tolist() + try: + return self.pooling(embedding).tolist() # type: ignore[misc] + except TypeError as exc: + raise ValueError( + f"Pooling is required for {self.model_name} because it returned" + " a > 1-D value, please specify pooling as not None." + ) from exc + + async def _async_embed_bulk(self, texts: Sequence[str]) -> List[Embedding]: + """ + Embed a sequence of text, in parallel and asynchronously. + + NOTE: this uses an externally created asyncio event loop. + """ + tasks = [self._async_embed_single(text) for text in texts] + return await asyncio.gather(*tasks) + + def _get_query_embedding(self, query: str) -> Embedding: + """ + Embed the input query synchronously. + + NOTE: a new asyncio event loop is created internally for this. + """ + return asyncio.run(self._aget_query_embedding(query)) + + def _get_text_embedding(self, text: str) -> Embedding: + """ + Embed the text query synchronously. + + NOTE: a new asyncio event loop is created internally for this. + """ + return asyncio.run(self._aget_text_embedding(text)) + + def _get_text_embeddings(self, texts: List[str]) -> List[Embedding]: + """ + Embed the input sequence of text synchronously and in parallel. + + NOTE: a new asyncio event loop is created internally for this. 
+ """ + loop = asyncio.new_event_loop() + try: + tasks = [ + loop.create_task(self._aget_text_embedding(text)) for text in texts + ] + loop.run_until_complete(asyncio.wait(tasks)) + finally: + loop.close() + return [task.result() for task in tasks] + + async def _aget_query_embedding(self, query: str) -> Embedding: + return await self._async_embed_single( + text=format_query(query, self.model_name, self.query_instruction) + ) + + async def _aget_text_embedding(self, text: str) -> Embedding: + return await self._async_embed_single( + text=format_text(text, self.model_name, self.text_instruction) + ) + + async def _aget_text_embeddings(self, texts: List[str]) -> List[Embedding]: + return await self._async_embed_bulk( + texts=[ + format_text(text, self.model_name, self.text_instruction) + for text in texts + ] + ) diff --git a/llama-index-integrations/embeddings/llama-index-embeddings-huggingface-api/llama_index/embeddings/huggingface_api/pooling.py b/llama-index-integrations/embeddings/llama-index-embeddings-huggingface-api/llama_index/embeddings/huggingface_api/pooling.py new file mode 100644 index 0000000000000..2a4035cd55656 --- /dev/null +++ b/llama-index-integrations/embeddings/llama-index-embeddings-huggingface-api/llama_index/embeddings/huggingface_api/pooling.py @@ -0,0 +1,74 @@ +from enum import Enum +from typing import TYPE_CHECKING, Union, overload + +import numpy as np + +if TYPE_CHECKING: + import torch + + +class Pooling(str, Enum): + """Enum of possible pooling choices with pooling behaviors.""" + + CLS = "cls" + MEAN = "mean" + LAST = "last" # last token pooling + + def __call__(self, array: np.ndarray) -> np.ndarray: + if self == self.CLS: + return self.cls_pooling(array) + elif self == self.LAST: + return self.last_pooling(array) + return self.mean_pooling(array) + + @classmethod + @overload + def cls_pooling(cls, array: np.ndarray) -> np.ndarray: + ... + + @classmethod + @overload + # TODO: Remove this `type: ignore` after the false positive problem + # is addressed in mypy: https://github.com/python/mypy/issues/15683 . + def cls_pooling(cls, array: "torch.Tensor") -> "torch.Tensor": # type: ignore + ... + + @classmethod + def cls_pooling( + cls, array: "Union[np.ndarray, torch.Tensor]" + ) -> "Union[np.ndarray, torch.Tensor]": + if len(array.shape) == 3: + return array[:, 0] + if len(array.shape) == 2: + return array[0] + raise NotImplementedError(f"Unhandled shape {array.shape}.") + + @classmethod + def mean_pooling(cls, array: np.ndarray) -> np.ndarray: + if len(array.shape) == 3: + return array.mean(axis=1) + if len(array.shape) == 2: + return array.mean(axis=0) + raise NotImplementedError(f"Unhandled shape {array.shape}.") + + @classmethod + @overload + def last_pooling(cls, array: np.ndarray) -> np.ndarray: + ... + + @classmethod + @overload + # TODO: Remove this `type: ignore` after the false positive problem + # is addressed in mypy: https://github.com/python/mypy/issues/15683 . + def last_pooling(cls, array: "torch.Tensor") -> "torch.Tensor": # type: ignore + ... 
+ + @classmethod + def last_pooling( + cls, array: "Union[np.ndarray, torch.Tensor]" + ) -> "Union[np.ndarray, torch.Tensor]": + if len(array.shape) == 3: + return array[:, -1] + if len(array.shape) == 2: + return array[-1] + raise NotImplementedError(f"Unhandled shape {array.shape}.") diff --git a/llama-index-integrations/embeddings/llama-index-embeddings-huggingface-api/pyproject.toml b/llama-index-integrations/embeddings/llama-index-embeddings-huggingface-api/pyproject.toml new file mode 100644 index 0000000000000..6100f0bc90cd1 --- /dev/null +++ b/llama-index-integrations/embeddings/llama-index-embeddings-huggingface-api/pyproject.toml @@ -0,0 +1,67 @@ +[build-system] +build-backend = "poetry.core.masonry.api" +requires = ["poetry-core"] + +[tool.codespell] +check-filenames = true +check-hidden = true +skip = "*.csv,*.html,*.json,*.jsonl,*.pdf,*.txt,*.ipynb" + +[tool.llamahub] +contains_example = false +import_path = "llama_index.embeddings.huggingface_api" + +[tool.llamahub.class_authors] +HuggingFaceInferenceAPIEmbedding = "llama-index" + +[tool.mypy] +disallow_untyped_defs = true +exclude = ["_static", "build", "examples", "notebooks", "venv"] +ignore_missing_imports = true +python_version = "3.8" + +[tool.poetry] +authors = ["Your Name "] +description = "llama-index embeddings huggingface api integration" +exclude = ["**/BUILD"] +license = "MIT" +name = "llama-index-embeddings-huggingface-api" +readme = "README.md" +version = "0.1.0" + +[tool.poetry.dependencies] +python = ">=3.8.1,<4.0" +llama-index-core = "^0.10.1" +llama-index-utils-huggingface = "^0.1.0" + +[tool.poetry.dependencies.huggingface-hub] +extras = ["inference"] +version = ">=0.19.0" + +[tool.poetry.group.dev.dependencies] +ipython = "8.10.0" +jupyter = "^1.0.0" +mypy = "0.991" +pre-commit = "3.2.0" +pylint = "2.15.10" +pytest = "7.2.1" +pytest-mock = "3.11.1" +ruff = "0.0.292" +tree-sitter-languages = "^1.8.0" +types-Deprecated = ">=0.1.0" +types-PyYAML = "^6.0.12.12" +types-protobuf = "^4.24.0.4" +types-redis = "4.5.5.0" +types-requests = "2.28.11.8" +types-setuptools = "67.1.0.0" + +[tool.poetry.group.dev.dependencies.black] +extras = ["jupyter"] +version = "<=23.9.1,>=23.7.0" + +[tool.poetry.group.dev.dependencies.codespell] +extras = ["toml"] +version = ">=v2.2.6" + +[[tool.poetry.packages]] +include = "llama_index/" diff --git a/llama-index-integrations/embeddings/llama-index-embeddings-huggingface-api/tests/BUILD b/llama-index-integrations/embeddings/llama-index-embeddings-huggingface-api/tests/BUILD new file mode 100644 index 0000000000000..dabf212d7e716 --- /dev/null +++ b/llama-index-integrations/embeddings/llama-index-embeddings-huggingface-api/tests/BUILD @@ -0,0 +1 @@ +python_tests() diff --git a/llama-index-integrations/embeddings/llama-index-embeddings-huggingface-api/tests/__init__.py b/llama-index-integrations/embeddings/llama-index-embeddings-huggingface-api/tests/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/llama-index-integrations/embeddings/llama-index-embeddings-huggingface-api/tests/test_embeddings_huggingface.py b/llama-index-integrations/embeddings/llama-index-embeddings-huggingface-api/tests/test_embeddings_huggingface.py new file mode 100644 index 0000000000000..bb33981498d98 --- /dev/null +++ b/llama-index-integrations/embeddings/llama-index-embeddings-huggingface-api/tests/test_embeddings_huggingface.py @@ -0,0 +1,11 @@ +from llama_index.core.base.embeddings.base import BaseEmbedding +from llama_index.embeddings.huggingface_api import ( + 
HuggingFaceInferenceAPIEmbedding, +) + + +def test_huggingfaceapiembedding_class(): + names_of_base_classes = [ + b.__name__ for b in HuggingFaceInferenceAPIEmbedding.__mro__ + ] + assert BaseEmbedding.__name__ in names_of_base_classes diff --git a/llama-index-integrations/embeddings/llama-index-embeddings-huggingface-api/tests/test_hf_inference.py b/llama-index-integrations/embeddings/llama-index-embeddings-huggingface-api/tests/test_hf_inference.py new file mode 100644 index 0000000000000..a78c2389bb4d4 --- /dev/null +++ b/llama-index-integrations/embeddings/llama-index-embeddings-huggingface-api/tests/test_hf_inference.py @@ -0,0 +1,108 @@ +from unittest.mock import AsyncMock, MagicMock, patch + +import numpy as np +import pytest +from llama_index.embeddings.huggingface_api.base import HuggingFaceInferenceAPIEmbedding +from llama_index.embeddings.huggingface_api.pooling import Pooling + +STUB_MODEL_NAME = "placeholder_model" + + +@pytest.fixture(name="hf_inference_api_embedding") +def fixture_hf_inference_api_embedding() -> HuggingFaceInferenceAPIEmbedding: + with patch.dict("sys.modules", huggingface_hub=MagicMock()): + return HuggingFaceInferenceAPIEmbedding(model_name=STUB_MODEL_NAME) + + +class TestHuggingFaceInferenceAPIEmbeddings: + def test_class_name( + self, hf_inference_api_embedding: HuggingFaceInferenceAPIEmbedding + ) -> None: + assert ( + HuggingFaceInferenceAPIEmbedding.class_name() + == HuggingFaceInferenceAPIEmbedding.__name__ + ) + assert ( + hf_inference_api_embedding.class_name() + == HuggingFaceInferenceAPIEmbedding.__name__ + ) + + # def test_using_recommended_model(self) -> None: + # mock_hub = MagicMock() + # mock_hub.InferenceClient.get_recommended_model.return_value = ( + # "facebook/bart-base" + # ) + # with patch.dict("sys.modules", huggingface_hub=mock_hub): + # embedding = HuggingFaceInferenceAPIEmbedding(task="feature-extraction") + # assert embedding.model_name == "facebook/bart-base" + # # mock_hub.InferenceClient.get_recommended_model.assert_called_once_with( + # # task="feature-extraction" + # # ) + + def test_embed_query( + self, hf_inference_api_embedding: HuggingFaceInferenceAPIEmbedding + ) -> None: + raw_single_embedding = np.random.default_rng().random( + (1, 3, 1024), dtype=np.float32 + ) + + hf_inference_api_embedding.pooling = Pooling.CLS + with patch.object( + hf_inference_api_embedding._async_client, + "feature_extraction", + AsyncMock(return_value=raw_single_embedding), + ) as mock_feature_extraction: + embedding = hf_inference_api_embedding.get_query_embedding("test") + assert isinstance(embedding, list) + assert len(embedding) == 1024 + assert isinstance(embedding[0], float) + assert np.all( + np.array(embedding, dtype=raw_single_embedding.dtype) + == raw_single_embedding[0, 0] + ) + mock_feature_extraction.assert_awaited_once_with("test") + + hf_inference_api_embedding.pooling = Pooling.MEAN + with patch.object( + hf_inference_api_embedding._async_client, + "feature_extraction", + AsyncMock(return_value=raw_single_embedding), + ) as mock_feature_extraction: + embedding = hf_inference_api_embedding.get_query_embedding("test") + assert isinstance(embedding, list) + assert len(embedding) == 1024 + assert isinstance(embedding[0], float) + assert np.all( + np.array(embedding, dtype=raw_single_embedding.dtype) + == raw_single_embedding[0].mean(axis=0) + ) + mock_feature_extraction.assert_awaited_once_with("test") + + def test_embed_query_one_dimension( + self, hf_inference_api_embedding: HuggingFaceInferenceAPIEmbedding + ) -> None: + 
raw_single_embedding = np.random.default_rng().random(1024, dtype=np.float32) + + with patch.object( + hf_inference_api_embedding._async_client, + "feature_extraction", + AsyncMock(return_value=raw_single_embedding), + ) as mock_feature_extraction: + embedding = hf_inference_api_embedding.get_query_embedding("test") + assert isinstance(embedding, list) + assert len(embedding) == 1024 + assert isinstance(embedding[0], float) + assert np.all( + np.array(embedding, dtype=raw_single_embedding.dtype) + == raw_single_embedding + ) + mock_feature_extraction.assert_awaited_once_with("test") + + def test_serialization( + self, hf_inference_api_embedding: HuggingFaceInferenceAPIEmbedding + ) -> None: + serialized = hf_inference_api_embedding.to_dict() + # Check Hugging Face Inference API base class specifics + assert serialized["model_name"] == STUB_MODEL_NAME + # Check Hugging Face Inference API Embeddings derived class specifics + assert serialized["pooling"] == Pooling.CLS diff --git a/llama-index-integrations/embeddings/llama-index-embeddings-huggingface/llama_index/embeddings/huggingface/base.py b/llama-index-integrations/embeddings/llama-index-embeddings-huggingface/llama_index/embeddings/huggingface/base.py index c400a2f694c55..1e520273bd42c 100644 --- a/llama-index-integrations/embeddings/llama-index-embeddings-huggingface/llama_index/embeddings/huggingface/base.py +++ b/llama-index-integrations/embeddings/llama-index-embeddings-huggingface/llama_index/embeddings/huggingface/base.py @@ -2,6 +2,7 @@ import logging from typing import Any, Callable, Dict, List, Optional, Sequence, Union +from deprecated import deprecated from huggingface_hub import ( AsyncInferenceClient, InferenceClient, @@ -149,6 +150,10 @@ def _get_text_embeddings(self, texts: List[str]) -> List[List[float]]: return self._embed(texts, prompt_name="text") +@deprecated( + "Deprecated in favor of `HuggingFaceInferenceAPIEmbedding` from `llama-index-embeddings-huggingface-api` which should be used instead.", + action="always", +) class HuggingFaceInferenceAPIEmbedding(BaseEmbedding): # type: ignore[misc] """ Wrapper on the Hugging Face's Inference API for embeddings. diff --git a/llama-index-integrations/llms/llama-index-llms-huggingface-api/.gitignore b/llama-index-integrations/llms/llama-index-llms-huggingface-api/.gitignore new file mode 100644 index 0000000000000..990c18de22908 --- /dev/null +++ b/llama-index-integrations/llms/llama-index-llms-huggingface-api/.gitignore @@ -0,0 +1,153 @@ +llama_index/_static +.DS_Store +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +bin/ +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +etc/ +include/ +lib/ +lib64/ +parts/ +sdist/ +share/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +.ruff_cache + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints +notebooks/ + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ +pyvenv.cfg + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# Jetbrains +.idea +modules/ +*.swp + +# VsCode +.vscode + +# pipenv +Pipfile +Pipfile.lock + +# pyright +pyrightconfig.json diff --git a/llama-index-integrations/llms/llama-index-llms-huggingface-api/BUILD b/llama-index-integrations/llms/llama-index-llms-huggingface-api/BUILD new file mode 100644 index 0000000000000..0896ca890d8bf --- /dev/null +++ b/llama-index-integrations/llms/llama-index-llms-huggingface-api/BUILD @@ -0,0 +1,3 @@ +poetry_requirements( + name="poetry", +) diff --git a/llama-index-integrations/llms/llama-index-llms-huggingface-api/Makefile b/llama-index-integrations/llms/llama-index-llms-huggingface-api/Makefile new file mode 100644 index 0000000000000..b9eab05aa3706 --- /dev/null +++ b/llama-index-integrations/llms/llama-index-llms-huggingface-api/Makefile @@ -0,0 +1,17 @@ +GIT_ROOT ?= $(shell git rev-parse --show-toplevel) + +help: ## Show all Makefile targets. + @grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[33m%-30s\033[0m %s\n", $$1, $$2}' + +format: ## Run code autoformatters (black). + pre-commit install + git ls-files | xargs pre-commit run black --files + +lint: ## Run linters: pre-commit (black, ruff, codespell) and mypy + pre-commit install && git ls-files | xargs pre-commit run --show-diff-on-failure --files + +test: ## Run tests via pytest. + pytest tests + +watch-docs: ## Build and watch documentation. + sphinx-autobuild docs/ docs/_build/html --open-browser --watch $(GIT_ROOT)/llama_index/ diff --git a/llama-index-integrations/llms/llama-index-llms-huggingface-api/README.md b/llama-index-integrations/llms/llama-index-llms-huggingface-api/README.md new file mode 100644 index 0000000000000..ee70d26be2ade --- /dev/null +++ b/llama-index-integrations/llms/llama-index-llms-huggingface-api/README.md @@ -0,0 +1,26 @@ +# LlamaIndex Llms Integration: Huggingface API + +Integration with Hugging Face's Inference API for generating text. 
+ +For more information on Hugging Face's Inference API, visit [Hugging Face's Inference API documentation](https://huggingface.co/docs/api-inference/quicktour). + +## Installation + +```shell +pip install llama-index-llms-huggingface-api +``` + +## Usage + +```python +from llama_index.llms.huggingface_api import HuggingFaceInferenceAPI + +llm = HuggingFaceInferenceAPI( + model_name="openai-community/gpt2", + temperature=0.7, + max_tokens=100, + token="", # Optional +) + +response = llm.complete("Hello, how are you?") +``` diff --git a/llama-index-integrations/llms/llama-index-llms-huggingface-api/llama_index/llms/huggingface_api/BUILD b/llama-index-integrations/llms/llama-index-llms-huggingface-api/llama_index/llms/huggingface_api/BUILD new file mode 100644 index 0000000000000..db46e8d6c978c --- /dev/null +++ b/llama-index-integrations/llms/llama-index-llms-huggingface-api/llama_index/llms/huggingface_api/BUILD @@ -0,0 +1 @@ +python_sources() diff --git a/llama-index-integrations/llms/llama-index-llms-huggingface-api/llama_index/llms/huggingface_api/__init__.py b/llama-index-integrations/llms/llama-index-llms-huggingface-api/llama_index/llms/huggingface_api/__init__.py new file mode 100644 index 0000000000000..7c76b9db76064 --- /dev/null +++ b/llama-index-integrations/llms/llama-index-llms-huggingface-api/llama_index/llms/huggingface_api/__init__.py @@ -0,0 +1,5 @@ +from llama_index.llms.huggingface_api.base import ( + HuggingFaceInferenceAPI, +) + +__all__ = ["HuggingFaceInferenceAPI"] diff --git a/llama-index-integrations/llms/llama-index-llms-huggingface-api/llama_index/llms/huggingface_api/base.py b/llama-index-integrations/llms/llama-index-llms-huggingface-api/llama_index/llms/huggingface_api/base.py new file mode 100644 index 0000000000000..e059fadb50414 --- /dev/null +++ b/llama-index-integrations/llms/llama-index-llms-huggingface-api/llama_index/llms/huggingface_api/base.py @@ -0,0 +1,284 @@ +import logging +from typing import Any, Callable, Dict, Optional, Sequence, Union + +from huggingface_hub import AsyncInferenceClient, InferenceClient, model_info +from huggingface_hub.hf_api import ModelInfo +from huggingface_hub.inference._types import ConversationalOutput +from llama_index.core.base.llms.types import ( + ChatMessage, + ChatResponse, + ChatResponseAsyncGen, + ChatResponseGen, + CompletionResponse, + CompletionResponseAsyncGen, + CompletionResponseGen, + LLMMetadata, + MessageRole, +) +from llama_index.core.bridge.pydantic import Field, PrivateAttr +from llama_index.core.constants import ( + DEFAULT_CONTEXT_WINDOW, + DEFAULT_NUM_OUTPUTS, +) +from llama_index.core.llms.custom import CustomLLM + + +logger = logging.getLogger(__name__) + + +def chat_messages_to_conversational_kwargs( + messages: Sequence[ChatMessage], +) -> Dict[str, Any]: + """Convert ChatMessages to keyword arguments for Inference API conversational.""" + if len(messages) % 2 != 1: + raise NotImplementedError("Messages passed in must be of odd length.") + last_message = messages[-1] + kwargs: Dict[str, Any] = { + "text": last_message.content, + **last_message.additional_kwargs, + } + if len(messages) != 1: + kwargs["past_user_inputs"] = [] + kwargs["generated_responses"] = [] + for user_msg, assistant_msg in zip(messages[::2], messages[1::2]): + if ( + user_msg.role != MessageRole.USER + or assistant_msg.role != MessageRole.ASSISTANT + ): + raise NotImplementedError( + "Didn't handle when messages aren't ordered in alternating" + f" pairs of {(MessageRole.USER, MessageRole.ASSISTANT)}." 
+ ) + kwargs["past_user_inputs"].append(user_msg.content) + kwargs["generated_responses"].append(assistant_msg.content) + return kwargs + + +class HuggingFaceInferenceAPI(CustomLLM): + """ + Wrapper on the Hugging Face's Inference API. + + Overview of the design: + - Synchronous uses InferenceClient, asynchronous uses AsyncInferenceClient + - chat uses the conversational task: https://huggingface.co/tasks/conversational + - complete uses the text generation task: https://huggingface.co/tasks/text-generation + + Note: some models that support the text generation task can leverage Hugging + Face's optimized deployment toolkit called text-generation-inference (TGI). + Use InferenceClient.get_model_status to check if TGI is being used. + + Relevant links: + - General Docs: https://huggingface.co/docs/api-inference/index + - API Docs: https://huggingface.co/docs/huggingface_hub/main/en/package_reference/inference_client + - Source: https://github.com/huggingface/huggingface_hub/tree/main/src/huggingface_hub/inference + """ + + @classmethod + def class_name(cls) -> str: + return "HuggingFaceInferenceAPI" + + # Corresponds with huggingface_hub.InferenceClient + model_name: Optional[str] = Field( + default=None, + description=( + "The model to run inference with. Can be a model id hosted on the Hugging" + " Face Hub, e.g. bigcode/starcoder or a URL to a deployed Inference" + " Endpoint. Defaults to None, in which case a recommended model is" + " automatically selected for the task (see Field below)." + ), + ) + token: Union[str, bool, None] = Field( + default=None, + description=( + "Hugging Face token. Will default to the locally saved token. Pass " + "token=False if you don’t want to send your token to the server." + ), + ) + timeout: Optional[float] = Field( + default=None, + description=( + "The maximum number of seconds to wait for a response from the server." + " Loading a new model in Inference API can take up to several minutes." + " Defaults to None, meaning it will loop until the server is available." + ), + ) + headers: Dict[str, str] = Field( + default=None, + description=( + "Additional headers to send to the server. By default only the" + " authorization and user-agent headers are sent. Values in this dictionary" + " will override the default values." + ), + ) + cookies: Dict[str, str] = Field( + default=None, description="Additional cookies to send to the server." + ) + task: Optional[str] = Field( + default=None, + description=( + "Optional task to pick Hugging Face's recommended model, used when" + " model_name is left as default of None." + ), + ) + + _sync_client: "InferenceClient" = PrivateAttr() + _async_client: "AsyncInferenceClient" = PrivateAttr() + _get_model_info: "Callable[..., ModelInfo]" = PrivateAttr() + + context_window: int = Field( + default=DEFAULT_CONTEXT_WINDOW, + description=( + LLMMetadata.__fields__["context_window"].field_info.description + + " This may be looked up in a model's `config.json`." + ), + ) + num_output: int = Field( + default=DEFAULT_NUM_OUTPUTS, + description=LLMMetadata.__fields__["num_output"].field_info.description, + ) + is_chat_model: bool = Field( + default=False, + description=( + LLMMetadata.__fields__["is_chat_model"].field_info.description + + " Unless chat templating is intentionally applied, Hugging Face models" + " are not chat models." 
+ ), + ) + is_function_calling_model: bool = Field( + default=False, + description=( + LLMMetadata.__fields__["is_function_calling_model"].field_info.description + + " As of 10/17/2023, Hugging Face doesn't support function calling" + " messages." + ), + ) + + def _get_inference_client_kwargs(self) -> Dict[str, Any]: + """Extract the Hugging Face InferenceClient construction parameters.""" + return { + "model": self.model_name, + "token": self.token, + "timeout": self.timeout, + "headers": self.headers, + "cookies": self.cookies, + } + + def __init__(self, **kwargs: Any) -> None: + """Initialize. + + Args: + kwargs: See the class-level Fields. + """ + if kwargs.get("model_name") is None: + task = kwargs.get("task", "") + # NOTE: task being None or empty string leads to ValueError, + # which ensures model is present + kwargs["model_name"] = InferenceClient.get_recommended_model(task=task) + logger.debug( + f"Using Hugging Face's recommended model {kwargs['model_name']}" + f" given task {task}." + ) + if kwargs.get("task") is None: + task = "conversational" + else: + task = kwargs["task"].lower() + + super().__init__(**kwargs) # Populate pydantic Fields + self._sync_client = InferenceClient(**self._get_inference_client_kwargs()) + self._async_client = AsyncInferenceClient(**self._get_inference_client_kwargs()) + self._get_model_info = model_info + + def validate_supported(self, task: str) -> None: + """ + Confirm the contained model_name is deployed on the Inference API service. + + Args: + task: Hugging Face task to check within. A list of all tasks can be + found here: https://huggingface.co/tasks + """ + all_models = self._sync_client.list_deployed_models(frameworks="all") + try: + if self.model_name not in all_models[task]: + raise ValueError( + "The Inference API service doesn't have the model" + f" {self.model_name!r} deployed." + ) + except KeyError as exc: + raise KeyError( + f"Input task {task!r} not in possible tasks {list(all_models.keys())}." 
+ ) from exc + + def get_model_info(self, **kwargs: Any) -> "ModelInfo": + """Get metadata on the current model from Hugging Face.""" + return self._get_model_info(self.model_name, **kwargs) + + @property + def metadata(self) -> LLMMetadata: + return LLMMetadata( + context_window=self.context_window, + num_output=self.num_output, + is_chat_model=self.is_chat_model, + is_function_calling_model=self.is_function_calling_model, + model_name=self.model_name, + ) + + def chat(self, messages: Sequence[ChatMessage], **kwargs: Any) -> ChatResponse: + # default to conversational task as that was the previous functionality + if self.task == "conversational" or self.task is None: + output: "ConversationalOutput" = self._sync_client.conversational( + **{**chat_messages_to_conversational_kwargs(messages), **kwargs} + ) + return ChatResponse( + message=ChatMessage( + role=MessageRole.ASSISTANT, content=output["generated_text"] + ) + ) + else: + # try and use text generation + prompt = self.messages_to_prompt(messages) + completion = self.complete(prompt) + return ChatResponse( + message=ChatMessage(role=MessageRole.ASSISTANT, content=completion.text) + ) + + def complete( + self, prompt: str, formatted: bool = False, **kwargs: Any + ) -> CompletionResponse: + return CompletionResponse( + text=self._sync_client.text_generation( + prompt, **{**{"max_new_tokens": self.num_output}, **kwargs} + ) + ) + + def stream_chat( + self, messages: Sequence[ChatMessage], **kwargs: Any + ) -> ChatResponseGen: + raise NotImplementedError + + def stream_complete( + self, prompt: str, formatted: bool = False, **kwargs: Any + ) -> CompletionResponseGen: + raise NotImplementedError + + async def achat( + self, messages: Sequence[ChatMessage], **kwargs: Any + ) -> ChatResponse: + raise NotImplementedError + + async def acomplete( + self, prompt: str, formatted: bool = False, **kwargs: Any + ) -> CompletionResponse: + response = await self._async_client.text_generation( + prompt, **{**{"max_new_tokens": self.num_output}, **kwargs} + ) + return CompletionResponse(text=response) + + async def astream_chat( + self, messages: Sequence[ChatMessage], **kwargs: Any + ) -> ChatResponseAsyncGen: + raise NotImplementedError + + async def astream_complete( + self, prompt: str, formatted: bool = False, **kwargs: Any + ) -> CompletionResponseAsyncGen: + raise NotImplementedError diff --git a/llama-index-integrations/llms/llama-index-llms-huggingface-api/pyproject.toml b/llama-index-integrations/llms/llama-index-llms-huggingface-api/pyproject.toml new file mode 100644 index 0000000000000..6e0a4b9d4c608 --- /dev/null +++ b/llama-index-integrations/llms/llama-index-llms-huggingface-api/pyproject.toml @@ -0,0 +1,63 @@ +[build-system] +build-backend = "poetry.core.masonry.api" +requires = ["poetry-core"] + +[tool.codespell] +check-filenames = true +check-hidden = true +skip = "*.csv,*.html,*.json,*.jsonl,*.pdf,*.txt,*.ipynb" + +[tool.llamahub] +contains_example = false +import_path = "llama_index.llms.huggingface_api" + +[tool.llamahub.class_authors] +HuggingFaceInferenceAPI = "llama-index" + +[tool.mypy] +disallow_untyped_defs = true +exclude = ["_static", "build", "examples", "notebooks", "venv"] +ignore_missing_imports = true +python_version = "3.8" + +[tool.poetry] +authors = ["Your Name "] +description = "llama-index llms huggingface api integration" +exclude = ["**/BUILD"] +license = "MIT" +name = "llama-index-llms-huggingface-api" +readme = "README.md" +version = "0.1.0" + +[tool.poetry.dependencies] +python = ">=3.8.1,<4.0" 
+llama-index-core = "^0.10.41" +huggingface-hub = "^0.23.0" + +[tool.poetry.group.dev.dependencies] +ipython = "8.10.0" +jupyter = "^1.0.0" +mypy = "0.991" +pre-commit = "3.2.0" +pylint = "2.15.10" +pytest = "7.2.1" +pytest-mock = "3.11.1" +ruff = "0.0.292" +tree-sitter-languages = "^1.8.0" +types-Deprecated = ">=0.1.0" +types-PyYAML = "^6.0.12.12" +types-protobuf = "^4.24.0.4" +types-redis = "4.5.5.0" +types-requests = "2.28.11.8" +types-setuptools = "67.1.0.0" + +[tool.poetry.group.dev.dependencies.black] +extras = ["jupyter"] +version = "<=23.9.1,>=23.7.0" + +[tool.poetry.group.dev.dependencies.codespell] +extras = ["toml"] +version = ">=v2.2.6" + +[[tool.poetry.packages]] +include = "llama_index/" diff --git a/llama-index-integrations/llms/llama-index-llms-huggingface-api/tests/BUILD b/llama-index-integrations/llms/llama-index-llms-huggingface-api/tests/BUILD new file mode 100644 index 0000000000000..dabf212d7e716 --- /dev/null +++ b/llama-index-integrations/llms/llama-index-llms-huggingface-api/tests/BUILD @@ -0,0 +1 @@ +python_tests() diff --git a/llama-index-integrations/llms/llama-index-llms-huggingface-api/tests/__init__.py b/llama-index-integrations/llms/llama-index-llms-huggingface-api/tests/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/llama-index-integrations/llms/llama-index-llms-huggingface-api/tests/test_huggingface_api.py b/llama-index-integrations/llms/llama-index-llms-huggingface-api/tests/test_huggingface_api.py new file mode 100644 index 0000000000000..d5a331c86f9bf --- /dev/null +++ b/llama-index-integrations/llms/llama-index-llms-huggingface-api/tests/test_huggingface_api.py @@ -0,0 +1,115 @@ +from unittest.mock import MagicMock, patch + +import pytest +from llama_index.core.llms import ChatMessage, MessageRole +from llama_index.llms.huggingface_api import HuggingFaceInferenceAPI + +STUB_MODEL_NAME = "placeholder_model" + + +@pytest.fixture(name="hf_inference_api") +def fixture_hf_inference_api() -> HuggingFaceInferenceAPI: + with patch.dict("sys.modules", huggingface_hub=MagicMock()): + return HuggingFaceInferenceAPI(model_name=STUB_MODEL_NAME) + + +class TestHuggingFaceInferenceAPI: + def test_class_name(self, hf_inference_api: HuggingFaceInferenceAPI) -> None: + assert HuggingFaceInferenceAPI.class_name() == HuggingFaceInferenceAPI.__name__ + assert hf_inference_api.class_name() == HuggingFaceInferenceAPI.__name__ + + def test_instantiation(self) -> None: + mock_hub = MagicMock() + with patch.dict("sys.modules", huggingface_hub=mock_hub): + llm = HuggingFaceInferenceAPI(model_name=STUB_MODEL_NAME) + + assert llm.model_name == STUB_MODEL_NAME + + # Check can be both a large language model and an embedding model + assert isinstance(llm, HuggingFaceInferenceAPI) + + # Confirm Clients are instantiated correctly + # mock_hub.InferenceClient.assert_called_once_with( + # model=STUB_MODEL_NAME, token=None, timeout=None, headers=None, cookies=None + # ) + # mock_hub.AsyncInferenceClient.assert_called_once_with( + # model=STUB_MODEL_NAME, token=None, timeout=None, headers=None, cookies=None + # ) + + def test_chat(self, hf_inference_api: HuggingFaceInferenceAPI) -> None: + messages = [ + ChatMessage(content="Which movie is the best?"), + ChatMessage(content="It's Die Hard for sure.", role=MessageRole.ASSISTANT), + ChatMessage(content="Can you explain why?"), + ] + generated_response = ( + " It's based on the book of the same name by James Fenimore Cooper." 
+ ) + conversational_return = { + "generated_text": generated_response, + "conversation": { + "generated_responses": ["It's Die Hard for sure.", generated_response], + "past_user_inputs": [ + "Which movie is the best?", + "Can you explain why?", + ], + }, + } + + with patch.object( + hf_inference_api._sync_client, + "conversational", + return_value=conversational_return, + ) as mock_conversational: + response = hf_inference_api.chat(messages=messages) + + assert response.message.role == MessageRole.ASSISTANT + assert response.message.content == generated_response + mock_conversational.assert_called_once_with( + text="Can you explain why?", + past_user_inputs=["Which movie is the best?"], + generated_responses=["It's Die Hard for sure."], + ) + + def test_chat_text_generation( + self, hf_inference_api: HuggingFaceInferenceAPI + ) -> None: + mock_message_to_prompt = MagicMock( + return_value="System: You are an expert movie reviewer\nUser: Which movie is the best?\nAssistant:" + ) + hf_inference_api.task = "text-generation" + hf_inference_api.messages_to_prompt = mock_message_to_prompt + messages = [ + ChatMessage( + role=MessageRole.SYSTEM, content="You are an expert movie reviewer" + ), + ChatMessage(role=MessageRole.USER, content="Which movie is the best?"), + ] + conversational_return = "It's Die Hard for sure." + + with patch.object( + hf_inference_api._sync_client, + "text_generation", + return_value=conversational_return, + ) as mock_complete: + response = hf_inference_api.chat(messages=messages) + + hf_inference_api.messages_to_prompt.assert_called_once_with(messages) + assert response.message.role == MessageRole.ASSISTANT + assert response.message.content == conversational_return + mock_complete.assert_called_once_with( + "System: You are an expert movie reviewer\nUser: Which movie is the best?\nAssistant:", + max_new_tokens=256, + ) + + def test_complete(self, hf_inference_api: HuggingFaceInferenceAPI) -> None: + prompt = "My favorite color is " + generated_text = '"green" and I love to paint. 
I have been painting for 30 years and have been' + with patch.object( + hf_inference_api._sync_client, + "text_generation", + return_value=generated_text, + ) as mock_text_generation: + response = hf_inference_api.complete(prompt) + mock_text_generation.assert_called_once_with(prompt, max_new_tokens=256) + assert response.text == generated_text diff --git a/llama-index-integrations/llms/llama-index-llms-huggingface-api/tests/test_llms_huggingface_api.py b/llama-index-integrations/llms/llama-index-llms-huggingface-api/tests/test_llms_huggingface_api.py new file mode 100644 index 0000000000000..14d71c5cc7461 --- /dev/null +++ b/llama-index-integrations/llms/llama-index-llms-huggingface-api/tests/test_llms_huggingface_api.py @@ -0,0 +1,7 @@ +from llama_index.core.base.llms.base import BaseLLM +from llama_index.llms.huggingface_api import HuggingFaceInferenceAPI + + +def test_embedding_class(): + names_of_base_classes = [b.__name__ for b in HuggingFaceInferenceAPI.__mro__] + assert BaseLLM.__name__ in names_of_base_classes diff --git a/llama-index-integrations/llms/llama-index-llms-huggingface/llama_index/llms/huggingface/base.py b/llama-index-integrations/llms/llama-index-llms-huggingface/llama_index/llms/huggingface/base.py index 55be8fdc0dfe1..f206f5a260025 100644 --- a/llama-index-integrations/llms/llama-index-llms-huggingface/llama_index/llms/huggingface/base.py +++ b/llama-index-integrations/llms/llama-index-llms-huggingface/llama_index/llms/huggingface/base.py @@ -1,5 +1,6 @@ import logging from typing import Any, Callable, Dict, List, Optional, Sequence, Union +from deprecated import deprecated import torch from huggingface_hub import AsyncInferenceClient, InferenceClient, model_info @@ -455,6 +456,10 @@ def chat_messages_to_conversational_kwargs( return kwargs +@deprecated( + "Deprecated in favor of `HuggingFaceInferenceAPI` from `llama-index-llms-huggingface-api` which should be used instead.", + action="always", +) class HuggingFaceInferenceAPI(CustomLLM): """ Wrapper on the Hugging Face's Inference API. @@ -685,6 +690,10 @@ async def astream_complete( raise NotImplementedError +@deprecated( + "Deprecated in favor of `TextGenerationInference` from `llama-index-llms-text-generation-inference` which should be used instead.", + action="always", +) class TextGenerationInference(FunctionCallingLLM): model_name: Optional[str] = Field( default=None, diff --git a/llama-index-integrations/llms/llama-index-llms-text-generation-inference/.gitignore b/llama-index-integrations/llms/llama-index-llms-text-generation-inference/.gitignore new file mode 100644 index 0000000000000..990c18de22908 --- /dev/null +++ b/llama-index-integrations/llms/llama-index-llms-text-generation-inference/.gitignore @@ -0,0 +1,153 @@ +llama_index/_static +.DS_Store +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +bin/ +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +etc/ +include/ +lib/ +lib64/ +parts/ +sdist/ +share/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +.ruff_cache + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints +notebooks/ + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ +pyvenv.cfg + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# Jetbrains +.idea +modules/ +*.swp + +# VsCode +.vscode + +# pipenv +Pipfile +Pipfile.lock + +# pyright +pyrightconfig.json diff --git a/llama-index-integrations/llms/llama-index-llms-text-generation-inference/BUILD b/llama-index-integrations/llms/llama-index-llms-text-generation-inference/BUILD new file mode 100644 index 0000000000000..0896ca890d8bf --- /dev/null +++ b/llama-index-integrations/llms/llama-index-llms-text-generation-inference/BUILD @@ -0,0 +1,3 @@ +poetry_requirements( + name="poetry", +) diff --git a/llama-index-integrations/llms/llama-index-llms-text-generation-inference/Makefile b/llama-index-integrations/llms/llama-index-llms-text-generation-inference/Makefile new file mode 100644 index 0000000000000..b9eab05aa3706 --- /dev/null +++ b/llama-index-integrations/llms/llama-index-llms-text-generation-inference/Makefile @@ -0,0 +1,17 @@ +GIT_ROOT ?= $(shell git rev-parse --show-toplevel) + +help: ## Show all Makefile targets. + @grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[33m%-30s\033[0m %s\n", $$1, $$2}' + +format: ## Run code autoformatters (black). + pre-commit install + git ls-files | xargs pre-commit run black --files + +lint: ## Run linters: pre-commit (black, ruff, codespell) and mypy + pre-commit install && git ls-files | xargs pre-commit run --show-diff-on-failure --files + +test: ## Run tests via pytest. + pytest tests + +watch-docs: ## Build and watch documentation. 
+	sphinx-autobuild docs/ docs/_build/html --open-browser --watch $(GIT_ROOT)/llama_index/
diff --git a/llama-index-integrations/llms/llama-index-llms-text-generation-inference/README.md b/llama-index-integrations/llms/llama-index-llms-text-generation-inference/README.md
new file mode 100644
index 0000000000000..3e7238b7ebe98
--- /dev/null
+++ b/llama-index-integrations/llms/llama-index-llms-text-generation-inference/README.md
@@ -0,0 +1,25 @@
+# LlamaIndex Llms Integration: Text Generation Inference
+
+Integration with [Text Generation Inference](https://huggingface.co/docs/text-generation-inference) from Hugging Face to generate text.
+
+## Installation
+
+```shell
+pip install llama-index-llms-text-generation-inference
+```
+
+## Usage
+
+```python
+from llama_index.llms.text_generation_inference import TextGenerationInference
+
+llm = TextGenerationInference(
+    model_url="http://localhost:8080", # URL of your running TGI server
+    model_name="openai-community/gpt2",
+    temperature=0.7,
+    max_tokens=100,
+    token="", # Optional
+)
+
+response = llm.complete("Hello, how are you?")
+```
diff --git a/llama-index-integrations/llms/llama-index-llms-text-generation-inference/llama_index/llms/text_generation_inference/BUILD b/llama-index-integrations/llms/llama-index-llms-text-generation-inference/llama_index/llms/text_generation_inference/BUILD
new file mode 100644
index 0000000000000..db46e8d6c978c
--- /dev/null
+++ b/llama-index-integrations/llms/llama-index-llms-text-generation-inference/llama_index/llms/text_generation_inference/BUILD
@@ -0,0 +1 @@
+python_sources()
diff --git a/llama-index-integrations/llms/llama-index-llms-text-generation-inference/llama_index/llms/text_generation_inference/__init__.py b/llama-index-integrations/llms/llama-index-llms-text-generation-inference/llama_index/llms/text_generation_inference/__init__.py
new file mode 100644
index 0000000000000..156730ba0dcb4
--- /dev/null
+++ b/llama-index-integrations/llms/llama-index-llms-text-generation-inference/llama_index/llms/text_generation_inference/__init__.py
@@ -0,0 +1,5 @@
+from llama_index.llms.text_generation_inference.base import (
+    TextGenerationInference,
+)
+
+__all__ = ["TextGenerationInference"]
diff --git a/llama-index-integrations/llms/llama-index-llms-text-generation-inference/llama_index/llms/text_generation_inference/base.py b/llama-index-integrations/llms/llama-index-llms-text-generation-inference/llama_index/llms/text_generation_inference/base.py
new file mode 100644
index 0000000000000..99d7dba8e0382
--- /dev/null
+++ b/llama-index-integrations/llms/llama-index-llms-text-generation-inference/llama_index/llms/text_generation_inference/base.py
@@ -0,0 +1,445 @@
+import logging
+from typing import Any, Callable, Dict, List, Optional, Sequence, Union
+
+from llama_index.core.base.llms.types import (
+    ChatMessage,
+    ChatResponse,
+    ChatResponseAsyncGen,
+    ChatResponseGen,
+    CompletionResponse,
+    CompletionResponseAsyncGen,
+    CompletionResponseGen,
+    LLMMetadata,
+    MessageRole,
+)
+from llama_index.core.bridge.pydantic import Field, PrivateAttr
+from llama_index.core.callbacks import CallbackManager
+from llama_index.core.constants import (
+    DEFAULT_TEMPERATURE,
+    DEFAULT_CONTEXT_WINDOW,
+    DEFAULT_NUM_OUTPUTS,
+)
+from llama_index.core.llms.callbacks import (
+    llm_chat_callback,
+    llm_completion_callback,
+)
+from llama_index.core.llms.llm import ToolSelection
+from llama_index.core.llms.function_calling import FunctionCallingLLM
+from llama_index.core.base.llms.generic_utils import (
+    chat_to_completion_decorator,
+    achat_to_completion_decorator,
+    
stream_chat_to_completion_decorator, + astream_chat_to_completion_decorator, + get_from_param_or_env, +) +from llama_index.core.types import BaseOutputParser, PydanticProgramMode +from llama_index.core.chat_engine.types import AgentChatResponse +from llama_index.core.tools.types import BaseTool +from llama_index.llms.text_generation_inference.utils import ( + to_tgi_messages, + force_single_tool_call, + resolve_tgi_function_call, + get_max_input_length, + resolve_tool_choice, +) +from text_generation import ( + Client as TGIClient, + AsyncClient as TGIAsyncClient, +) + +logger = logging.getLogger(__name__) + + +class TextGenerationInference(FunctionCallingLLM): + model_name: Optional[str] = Field( + default=None, + description=("The name of the model served at the TGI endpoint"), + ) + temperature: float = Field( + default=DEFAULT_TEMPERATURE, + description=("The temperature to use for sampling."), + gte=0.0, + lte=1.0, + ) + max_tokens: int = Field( + default=DEFAULT_NUM_OUTPUTS, + description=("The maximum number of tokens to generate."), + gt=0, + ) + token: Union[str, bool, None] = Field( + default=None, + description=( + "Hugging Face token. Will default to the locally saved token. Pass " + "token=False if you don’t want to send your token to the server." + ), + ) + timeout: float = Field( + default=120, description=("The timeout to use in seconds."), gte=0 + ) + max_retries: int = Field( + default=5, description=("The maximum number of API retries."), gte=0 + ) + headers: Optional[Dict[str, str]] = Field( + default=None, + description=( + "Additional headers to send to the server. By default only the" + " authorization headers are sent. Values in this dictionary" + " will override the default values." + ), + ) + cookies: Optional[Dict[str, str]] = Field( + default=None, description=("Additional cookies to send to the server.") + ) + seed: Optional[str] = Field( + default=None, description=("The random seed to use for sampling.") + ) + additional_kwargs: Dict[str, Any] = Field( + default_factory=dict, description=("Additional kwargs for the TGI API.") + ) + + _sync_client: "TGIClient" = PrivateAttr() + _async_client: "TGIAsyncClient" = PrivateAttr() + + context_window: int = Field( + default=DEFAULT_CONTEXT_WINDOW, + description=("Maximum input length in tokens returned from TGI endpoint"), + ) + is_chat_model: bool = Field( + default=True, + description=( + LLMMetadata.__fields__["is_chat_model"].field_info.description + + " TGI makes use of chat templating," + " function call is available only for '/v1/chat/completions' route" + " of TGI endpoint" + ), + ) + is_function_calling_model: bool = Field( + default=False, + description=( + LLMMetadata.__fields__["is_function_calling_model"].field_info.description + + " 'text-generation-inference' supports function call" + " starting from v1.4.3" + ), + ) + + def __init__( + self, + model_url, + model_name: Optional[str] = None, + cookies: Optional[dict] = None, + temperature: float = DEFAULT_TEMPERATURE, + max_tokens: int = DEFAULT_NUM_OUTPUTS, + timeout: int = 120, + max_retries: int = 5, + seed: Optional[int] = None, + token: Optional[str] = None, + additional_kwargs: Optional[Dict[str, Any]] = None, + callback_manager: Optional[CallbackManager] = None, + system_prompt: Optional[str] = None, + messages_to_prompt: Optional[Callable[[Sequence[ChatMessage]], str]] = None, + completion_to_prompt: Optional[Callable[[str], str]] = None, + pydantic_program_mode: PydanticProgramMode = PydanticProgramMode.DEFAULT, + output_parser: 
Optional[BaseOutputParser] = None,
+    ) -> None:
+        additional_kwargs = additional_kwargs or {}
+        callback_manager = callback_manager or CallbackManager([])
+
+        token = get_from_param_or_env("token", token, "HF_TOKEN", "")
+
+        headers = {}
+        if token:
+            headers.update({"Authorization": f"Bearer {token}"})
+
+        self._sync_client = TGIClient(
+            base_url=model_url,
+            headers=headers,
+            cookies=cookies,
+            timeout=timeout,
+        )
+        self._async_client = TGIAsyncClient(
+            base_url=model_url,
+            headers=headers,
+            cookies=cookies,
+            timeout=timeout,
+        )
+
+        try:
+            is_function_calling_model = resolve_tgi_function_call(model_url)
+        except Exception as e:
+            logger.warning(f"TGI client has no function call support: {e}")
+            is_function_calling_model = False
+
+        context_window = get_max_input_length(model_url) or DEFAULT_CONTEXT_WINDOW
+
+        super().__init__(
+            context_window=context_window,
+            temperature=temperature,
+            max_tokens=max_tokens,
+            additional_kwargs=additional_kwargs,
+            timeout=timeout,
+            max_retries=max_retries,
+            seed=seed,
+            model_name=model_name,
+            is_function_calling_model=is_function_calling_model,
+            callback_manager=callback_manager,
+            system_prompt=system_prompt,
+            messages_to_prompt=messages_to_prompt,
+            completion_to_prompt=completion_to_prompt,
+            pydantic_program_mode=pydantic_program_mode,
+            output_parser=output_parser,
+        )
+
+    @classmethod
+    def class_name(cls) -> str:
+        return "TextGenerationInference"
+
+    @property
+    def metadata(self) -> LLMMetadata:
+        return LLMMetadata(
+            context_window=self.context_window,
+            num_output=self.max_tokens,
+            is_chat_model=True,
+            model_name=self.model_name,
+            random_seed=self.seed,
+            is_function_calling_model=self.is_function_calling_model,
+        )
+
+    @property
+    def _model_kwargs(self) -> Dict[str, Any]:
+        base_kwargs = {
+            "temperature": self.temperature,
+            "max_tokens": self.max_tokens,
+            "seed": self.seed,
+        }
+        return {
+            **base_kwargs,
+            **self.additional_kwargs,
+        }
+
+    def _get_all_kwargs(self, **kwargs: Any) -> Dict[str, Any]:
+        return {
+            **self._model_kwargs,
+            **kwargs,
+        }
+
+    @llm_chat_callback()
+    def chat(self, messages: Sequence[ChatMessage], **kwargs: Any) -> ChatResponse:
+        # convert to TGI Message
+        messages = to_tgi_messages(messages)
+        all_kwargs = self._get_all_kwargs(**kwargs)
+        response = self._sync_client.chat(messages=messages, **all_kwargs)
+        tool_calls = response.choices[0].message.tool_calls
+
+        return ChatResponse(
+            message=ChatMessage(
+                role=MessageRole.ASSISTANT,
+                content=response.choices[0].message.content,
+                additional_kwargs=(
+                    {"tool_calls": tool_calls} if tool_calls is not None else {}
+                ),
+            ),
+            raw=dict(response),
+        )
+
+    @llm_completion_callback()
+    def complete(
+        self, prompt: str, formatted: bool = False, **kwargs: Any
+    ) -> CompletionResponse:
+        complete_fn = chat_to_completion_decorator(self.chat)
+        return complete_fn(prompt, **kwargs)
+
+    @llm_chat_callback()
+    def stream_chat(
+        self, messages: Sequence[ChatMessage], **kwargs: Any
+    ) -> ChatResponseGen:
+        # convert to TGI Message
+        messages = to_tgi_messages(messages)
+        all_kwargs = self._get_all_kwargs(**kwargs)
+        response = self._sync_client.chat(messages=messages, stream=True, **all_kwargs)
+
+        def generator() -> ChatResponseGen:
+            content = ""
+            role = MessageRole.ASSISTANT
+            for chunk in response:
+                content_delta = chunk.choices[0].delta.content
+                if content_delta is None:
+                    continue
+                content += content_delta
+                yield ChatResponse(
+                    message=ChatMessage(role=role, content=content),
+                    delta=content_delta,
+                    raw=chunk,
+                )
+
+        return generator()
+
+    
@llm_completion_callback() + def stream_complete( + self, prompt: str, formatted: bool = False, **kwargs: Any + ) -> CompletionResponseGen: + stream_complete_fn = stream_chat_to_completion_decorator(self.stream_chat) + return stream_complete_fn(prompt, **kwargs) + + @llm_chat_callback() + async def achat( + self, messages: Sequence[ChatMessage], **kwargs: Any + ) -> ChatResponse: + # convert to TGI Message + messages = to_tgi_messages(messages) + all_kwargs = self._get_all_kwargs(**kwargs) + response = await self._async_client.chat(messages=messages, **all_kwargs) + tool_calls = response.choices[0].message.tool_calls + + return ChatResponse( + message=ChatMessage( + role=MessageRole.ASSISTANT, + content=response.choices[0].message.content, + additional_kwargs=( + {"tool_calls": tool_calls} if tool_calls is not None else {} + ), + ), + raw=dict(response), + ) + + @llm_completion_callback() + async def acomplete( + self, prompt: str, formatted: bool = False, **kwargs: Any + ) -> CompletionResponse: + acomplete_fn = achat_to_completion_decorator(self.achat) + return await acomplete_fn(prompt, **kwargs) + + @llm_chat_callback() + async def astream_chat( + self, messages: Sequence[ChatMessage], **kwargs: Any + ) -> ChatResponseAsyncGen: + # convert to TGI Message + messages = to_tgi_messages(messages) + all_kwargs = self._get_all_kwargs(**kwargs) + response = await self._async_client.chat( + messages=messages, stream=True, **all_kwargs + ) + + async def generator() -> ChatResponseAsyncGen: + content = "" + role = MessageRole.ASSISTANT + async for chunk in response: + content_delta = chunk.choices[0].delta.content + if content_delta is None: + continue + content += content_delta + yield ChatResponse( + message=ChatMessage(role=role, content=content), + delta=content_delta, + raw=chunk, + ) + + return generator() + + @llm_completion_callback() + async def astream_complete( + self, prompt: str, formatted: bool = False, **kwargs: Any + ) -> CompletionResponseAsyncGen: + astream_complete_fn = astream_chat_to_completion_decorator(self.astream_chat) + return await astream_complete_fn(prompt, **kwargs) + + def chat_with_tools( + self, + tools: List["BaseTool"], + user_msg: Optional[Union[str, ChatMessage]] = None, + chat_history: Optional[List[ChatMessage]] = None, + verbose: bool = False, + allow_parallel_tool_calls: bool = False, + tool_choice: str = "auto", + **kwargs: Any, + ) -> ChatResponse: + """Predict and call the tool.""" + # use openai tool format + tool_specs = [ + tool.metadata.to_openai_tool(skip_length_check=True) for tool in tools + ] + + if isinstance(user_msg, str): + user_msg = ChatMessage(role=MessageRole.USER, content=user_msg) + + messages = chat_history or [] + if user_msg: + messages.append(user_msg) + + response = self.chat( + messages=messages, + tools=tool_specs, + tool_choice=resolve_tool_choice(tool_specs, tool_choice), + **kwargs, + ) + if not allow_parallel_tool_calls: + force_single_tool_call(response) + return response + + async def achat_with_tools( + self, + tools: List["BaseTool"], + user_msg: Optional[Union[str, ChatMessage]] = None, + chat_history: Optional[List[ChatMessage]] = None, + verbose: bool = False, + allow_parallel_tool_calls: bool = False, + tool_choice: str = "auto", + **kwargs: Any, + ) -> ChatResponse: + # use openai tool format + tool_specs = [ + tool.metadata.to_openai_tool(skip_length_check=True) for tool in tools + ] + + if isinstance(user_msg, str): + user_msg = ChatMessage(role=MessageRole.USER, content=user_msg) + + messages = chat_history or 
[]
+        if user_msg:
+            messages.append(user_msg)
+
+        response = await self.achat(
+            messages=messages,
+            tools=tool_specs,
+            tool_choice=resolve_tool_choice(tool_specs, tool_choice),
+            **kwargs,
+        )
+        if not allow_parallel_tool_calls:
+            force_single_tool_call(response)
+        return response
+
+    def get_tool_calls_from_response(
+        self,
+        response: "AgentChatResponse",
+        error_on_no_tool_call: bool = True,
+    ) -> List[ToolSelection]:
+        """Extract tool calls from the chat response."""
+        tool_calls = response.message.additional_kwargs.get("tool_calls", [])
+
+        if len(tool_calls) < 1:
+            if error_on_no_tool_call:
+                raise ValueError(
+                    f"Expected at least one tool call, but got {len(tool_calls)} tool calls."
+                )
+            else:
+                return []
+
+        tool_selections = []
+        for tool_call in tool_calls:
+            # TODO Add typecheck with ToolCall from TGI once the client is updated
+            if tool_call and (tc_type := tool_call["type"]) != "function":
+                raise ValueError(
+                    f"Invalid tool type: got {tc_type}, expect 'function'."
+                )
+            argument_dict = tool_call["function"]["parameters"]
+
+            tool_selections.append(
+                ToolSelection(
+                    tool_id=tool_call["id"],
+                    tool_name=tool_call["function"][
+                        "name"
+                    ], # NOTE for now the tool_name is hardcoded 'tools' in TGI
+                    tool_kwargs=argument_dict,
+                )
+            )
+
+        return tool_selections
diff --git a/llama-index-integrations/llms/llama-index-llms-text-generation-inference/llama_index/llms/text_generation_inference/utils.py b/llama-index-integrations/llms/llama-index-llms-text-generation-inference/llama_index/llms/text_generation_inference/utils.py
new file mode 100644
index 0000000000000..71843873d1e09
--- /dev/null
+++ b/llama-index-integrations/llms/llama-index-llms-text-generation-inference/llama_index/llms/text_generation_inference/utils.py
@@ -0,0 +1,66 @@
+import requests
+from packaging import version
+from typing import Sequence, Union, List, Optional
+from llama_index.core.base.llms.types import (
+    ChatMessage,
+    ChatResponse,
+)
+from text_generation.types import (
+    Message,
+)
+
+
+def resolve_tgi_function_call(url: str) -> bool:
+    url = f"{url}/info"
+    model_info = dict(requests.get(url).json())
+    tgi_version = model_info.get("version", None)
+    if version.parse(tgi_version) >= version.parse("2.0.1"):
+        return True
+    else:
+        raise ValueError(
+            "'text-generation-inference' version "
+            f"incompatible with function call: {tgi_version}. "
+            "Function call support was added in v2.0.1"
+        )
+
+
+def get_max_input_length(url: str) -> Union[int, None]:
+    url = f"{url}/info"
+    model_info = dict(requests.get(url).json())
+    return model_info.get("max_input_length", None)
+
+
+def to_tgi_messages(messages: Sequence[ChatMessage]) -> Sequence[Message]:
+    out_messages = []
+    for m in messages:
+        tool_calls = m.additional_kwargs.get("tool_calls")
+        out_messages.append(
+            Message(role=m.role.value, content=m.content, tool_calls=tool_calls)
+        )
+
+    return out_messages
+
+
+def force_single_tool_call(response: ChatResponse) -> None:
+    tool_calls = response.message.additional_kwargs.get("tool_calls", [])
+    if len(tool_calls) > 1:
+        response.message.additional_kwargs["tool_calls"] = [tool_calls[0]]
+
+
+def resolve_tool_choice(
+    tools: Optional[List[dict]] = None, tool_choice: str = "none"
+) -> Union[str, dict]:
+    """Resolve tool choice.
+
+    Check if tool_name exists in tools.
+    Note that unlike in OpenAI specification, 'auto' will ALWAYS choose the tool for you.
+    Set to 'none' explicitly if you do not wish to use the tool.
+ """ + valid_tool_choices = ["none", "auto"] + [t["function"]["name"] for t in tools or []] + + if tool_choice not in valid_tool_choices: + raise ValueError( + f"{tool_choice} is not a valid tool_choice. Must be one of {valid_tool_choices}" + ) + + return tool_choice diff --git a/llama-index-integrations/llms/llama-index-llms-text-generation-inference/pyproject.toml b/llama-index-integrations/llms/llama-index-llms-text-generation-inference/pyproject.toml new file mode 100644 index 0000000000000..d096a3f6982c2 --- /dev/null +++ b/llama-index-integrations/llms/llama-index-llms-text-generation-inference/pyproject.toml @@ -0,0 +1,63 @@ +[build-system] +build-backend = "poetry.core.masonry.api" +requires = ["poetry-core"] + +[tool.codespell] +check-filenames = true +check-hidden = true +skip = "*.csv,*.html,*.json,*.jsonl,*.pdf,*.txt,*.ipynb" + +[tool.llamahub] +contains_example = false +import_path = "llama_index.llms.text_generation_inference" + +[tool.llamahub.class_authors] +TextGenerationInference = "llama-index" + +[tool.mypy] +disallow_untyped_defs = true +exclude = ["_static", "build", "examples", "notebooks", "venv"] +ignore_missing_imports = true +python_version = "3.8" + +[tool.poetry] +authors = ["Your Name "] +description = "llama-index llms huggingface text generation inference integration" +exclude = ["**/BUILD"] +license = "MIT" +name = "llama-index-llms-text-generation-inference" +readme = "README.md" +version = "0.1.0" + +[tool.poetry.dependencies] +python = ">=3.8.1,<4.0" +llama-index-core = "^0.10.41" +text-generation = "^0.7.0" + +[tool.poetry.group.dev.dependencies] +ipython = "8.10.0" +jupyter = "^1.0.0" +mypy = "0.991" +pre-commit = "3.2.0" +pylint = "2.15.10" +pytest = "7.2.1" +pytest-mock = "3.11.1" +ruff = "0.0.292" +tree-sitter-languages = "^1.8.0" +types-Deprecated = ">=0.1.0" +types-PyYAML = "^6.0.12.12" +types-protobuf = "^4.24.0.4" +types-redis = "4.5.5.0" +types-requests = "2.28.11.8" +types-setuptools = "67.1.0.0" + +[tool.poetry.group.dev.dependencies.black] +extras = ["jupyter"] +version = "<=23.9.1,>=23.7.0" + +[tool.poetry.group.dev.dependencies.codespell] +extras = ["toml"] +version = ">=v2.2.6" + +[[tool.poetry.packages]] +include = "llama_index/" diff --git a/llama-index-integrations/llms/llama-index-llms-text-generation-inference/tests/BUILD b/llama-index-integrations/llms/llama-index-llms-text-generation-inference/tests/BUILD new file mode 100644 index 0000000000000..dabf212d7e716 --- /dev/null +++ b/llama-index-integrations/llms/llama-index-llms-text-generation-inference/tests/BUILD @@ -0,0 +1 @@ +python_tests() diff --git a/llama-index-integrations/llms/llama-index-llms-text-generation-inference/tests/__init__.py b/llama-index-integrations/llms/llama-index-llms-text-generation-inference/tests/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/llama-index-integrations/llms/llama-index-llms-text-generation-inference/tests/test_llms_text_generation_inference.py b/llama-index-integrations/llms/llama-index-llms-text-generation-inference/tests/test_llms_text_generation_inference.py new file mode 100644 index 0000000000000..8cf27c42a97a2 --- /dev/null +++ b/llama-index-integrations/llms/llama-index-llms-text-generation-inference/tests/test_llms_text_generation_inference.py @@ -0,0 +1,7 @@ +from llama_index.core.base.llms.base import BaseLLM +from llama_index.llms.text_generation_inference import TextGenerationInference + + +def test_embedding_class(): + names_of_base_classes = [b.__name__ for b in 
TextGenerationInference.__mro__] + assert BaseLLM.__name__ in names_of_base_classes diff --git a/llama-index-packs/llama-index-packs-llama-guard-moderator/examples/rag_moderator_llama_guard_pack.ipynb b/llama-index-packs/llama-index-packs-llama-guard-moderator/examples/rag_moderator_llama_guard_pack.ipynb index f16ac997251eb..27526e043a6d9 100644 --- a/llama-index-packs/llama-index-packs-llama-guard-moderator/examples/rag_moderator_llama_guard_pack.ipynb +++ b/llama-index-packs/llama-index-packs-llama-guard-moderator/examples/rag_moderator_llama_guard_pack.ipynb @@ -40,7 +40,7 @@ "%pip install llama-index-vector-stores-qdrant\n", "%pip install llama-index-readers-wikipedia\n", "%pip install llama-index-packs-llama-guard-moderator\n", - "%pip install llama-index-llms-huggingface" + "%pip install llama-index-llms-huggingface-api" ] }, { @@ -297,7 +297,7 @@ "simple_node_parser = SimpleNodeParser.from_defaults()\n", "\n", "# Step 3: Define ServiceContext with llm and embed_model\n", - "from llama_index.llms.huggingface import HuggingFaceInferenceAPI\n", + "from llama_index.llms.huggingface_api import HuggingFaceInferenceAPI\n", "import os\n", "\n", "os.environ[\"HUGGINGFACE_ACCESS_TOKEN\"] = \"hf_##################\"\n", diff --git a/llama-index-utils/llama-index-utils-huggingface/.gitignore b/llama-index-utils/llama-index-utils-huggingface/.gitignore new file mode 100644 index 0000000000000..990c18de22908 --- /dev/null +++ b/llama-index-utils/llama-index-utils-huggingface/.gitignore @@ -0,0 +1,153 @@ +llama_index/_static +.DS_Store +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +bin/ +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +etc/ +include/ +lib/ +lib64/ +parts/ +sdist/ +share/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +.ruff_cache + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints +notebooks/ + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ +pyvenv.cfg + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# Jetbrains +.idea +modules/ +*.swp + +# VsCode +.vscode + +# pipenv +Pipfile +Pipfile.lock + +# pyright +pyrightconfig.json diff --git a/llama-index-utils/llama-index-utils-huggingface/BUILD b/llama-index-utils/llama-index-utils-huggingface/BUILD new file mode 100644 index 0000000000000..0896ca890d8bf --- /dev/null +++ b/llama-index-utils/llama-index-utils-huggingface/BUILD @@ -0,0 +1,3 @@ +poetry_requirements( + name="poetry", +) diff --git a/llama-index-utils/llama-index-utils-huggingface/Makefile b/llama-index-utils/llama-index-utils-huggingface/Makefile new file mode 100644 index 0000000000000..b9eab05aa3706 --- /dev/null +++ b/llama-index-utils/llama-index-utils-huggingface/Makefile @@ -0,0 +1,17 @@ +GIT_ROOT ?= $(shell git rev-parse --show-toplevel) + +help: ## Show all Makefile targets. + @grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[33m%-30s\033[0m %s\n", $$1, $$2}' + +format: ## Run code autoformatters (black). + pre-commit install + git ls-files | xargs pre-commit run black --files + +lint: ## Run linters: pre-commit (black, ruff, codespell) and mypy + pre-commit install && git ls-files | xargs pre-commit run --show-diff-on-failure --files + +test: ## Run tests via pytest. + pytest tests + +watch-docs: ## Build and watch documentation. + sphinx-autobuild docs/ docs/_build/html --open-browser --watch $(GIT_ROOT)/llama_index/ diff --git a/llama-index-utils/llama-index-utils-huggingface/README.md b/llama-index-utils/llama-index-utils-huggingface/README.md new file mode 100644 index 0000000000000..5f1dbbb7fad90 --- /dev/null +++ b/llama-index-utils/llama-index-utils-huggingface/README.md @@ -0,0 +1 @@ +# LlamaIndex Utils: Huggingface diff --git a/llama-index-utils/llama-index-utils-huggingface/llama_index/utils/huggingface/BUILD b/llama-index-utils/llama-index-utils-huggingface/llama_index/utils/huggingface/BUILD new file mode 100644 index 0000000000000..db46e8d6c978c --- /dev/null +++ b/llama-index-utils/llama-index-utils-huggingface/llama_index/utils/huggingface/BUILD @@ -0,0 +1 @@ +python_sources() diff --git a/llama-index-utils/llama-index-utils-huggingface/llama_index/utils/huggingface/__init__.py b/llama-index-utils/llama-index-utils-huggingface/llama_index/utils/huggingface/__init__.py new file mode 100644 index 0000000000000..e9b04c0750c52 --- /dev/null +++ b/llama-index-utils/llama-index-utils-huggingface/llama_index/utils/huggingface/__init__.py @@ -0,0 +1,31 @@ +from llama_index.utils.huggingface.base import ( + DEFAULT_QUERY_BGE_INSTRUCTION_EN, + BGE_MODELS, + DEFAULT_EMBED_INSTRUCTION, + DEFAULT_HUGGINGFACE_EMBEDDING_MODEL, + DEFAULT_INSTRUCT_MODEL, + DEFAULT_QUERY_BGE_INSTRUCTION_ZH, + DEFAULT_QUERY_INSTRUCTION, + INSTRUCTOR_MODELS, + format_query, + format_text, + get_pooling_mode, + get_query_instruct_for_model_name, + get_text_instruct_for_model_name, +) + +__all__ = [ + "DEFAULT_QUERY_BGE_INSTRUCTION_EN", + "BGE_MODELS", + "DEFAULT_EMBED_INSTRUCTION", + "DEFAULT_HUGGINGFACE_EMBEDDING_MODEL", + "DEFAULT_INSTRUCT_MODEL", + "DEFAULT_QUERY_BGE_INSTRUCTION_ZH", + 
"DEFAULT_QUERY_INSTRUCTION", + "INSTRUCTOR_MODELS", + "format_query", + "format_text", + "get_pooling_mode", + "get_query_instruct_for_model_name", + "get_text_instruct_for_model_name", +] diff --git a/llama-index-utils/llama-index-utils-huggingface/llama_index/utils/huggingface/base.py b/llama-index-utils/llama-index-utils-huggingface/llama_index/utils/huggingface/base.py new file mode 100644 index 0000000000000..009aaab7649ba --- /dev/null +++ b/llama-index-utils/llama-index-utils-huggingface/llama_index/utils/huggingface/base.py @@ -0,0 +1,99 @@ +from typing import Optional + +import requests + +DEFAULT_HUGGINGFACE_EMBEDDING_MODEL = "BAAI/bge-small-en" +DEFAULT_INSTRUCT_MODEL = "hkunlp/instructor-base" + +# Originally pulled from: +# https://github.com/langchain-ai/langchain/blob/v0.0.257/libs/langchain/langchain/embeddings/huggingface.py#L10 +DEFAULT_EMBED_INSTRUCTION = "Represent the document for retrieval: " +DEFAULT_QUERY_INSTRUCTION = ( + "Represent the question for retrieving supporting documents: " +) +DEFAULT_QUERY_BGE_INSTRUCTION_EN = ( + "Represent this question for searching relevant passages: " +) +DEFAULT_QUERY_BGE_INSTRUCTION_ZH = "为这个句子生成表示以用于检索相关文章:" + +BGE_MODELS = ( + "BAAI/bge-small-en", + "BAAI/bge-small-en-v1.5", + "BAAI/bge-base-en", + "BAAI/bge-base-en-v1.5", + "BAAI/bge-large-en", + "BAAI/bge-large-en-v1.5", + "BAAI/bge-small-zh", + "BAAI/bge-small-zh-v1.5", + "BAAI/bge-base-zh", + "BAAI/bge-base-zh-v1.5", + "BAAI/bge-large-zh", + "BAAI/bge-large-zh-v1.5", +) +INSTRUCTOR_MODELS = ( + "hku-nlp/instructor-base", + "hku-nlp/instructor-large", + "hku-nlp/instructor-xl", + "hkunlp/instructor-base", + "hkunlp/instructor-large", + "hkunlp/instructor-xl", +) + + +def get_query_instruct_for_model_name(model_name: Optional[str]) -> str: + """Get query text instruction for a given model name.""" + if model_name in INSTRUCTOR_MODELS: + return DEFAULT_QUERY_INSTRUCTION + if model_name in BGE_MODELS: + if "zh" in model_name: + return DEFAULT_QUERY_BGE_INSTRUCTION_ZH + return DEFAULT_QUERY_BGE_INSTRUCTION_EN + return "" + + +def format_query( + query: str, model_name: Optional[str], instruction: Optional[str] = None +) -> str: + if instruction is None: + instruction = get_query_instruct_for_model_name(model_name) + # NOTE: strip() enables backdoor for defeating instruction prepend by + # passing empty string + return f"{instruction} {query}".strip() + + +def get_text_instruct_for_model_name(model_name: Optional[str]) -> str: + """Get text instruction for a given model name.""" + return DEFAULT_EMBED_INSTRUCTION if model_name in INSTRUCTOR_MODELS else "" + + +def format_text( + text: str, model_name: Optional[str], instruction: Optional[str] = None +) -> str: + if instruction is None: + instruction = get_text_instruct_for_model_name(model_name) + # NOTE: strip() enables backdoor for defeating instruction prepend by + # passing empty string + return f"{instruction} {text}".strip() + + +def get_pooling_mode(model_name: Optional[str]) -> str: + pooling_config_url = ( + f"https://huggingface.co/{model_name}/raw/main/1_Pooling/config.json" + ) + + try: + response = requests.get(pooling_config_url) + config_data = response.json() + + cls_token = config_data.get("pooling_mode_cls_token", False) + mean_tokens = config_data.get("pooling_mode_mean_tokens", False) + + if mean_tokens: + return "mean" + elif cls_token: + return "cls" + except requests.exceptions.RequestException: + print( + "Warning: Pooling config file not found; pooling mode is defaulted to 'cls'." 
+ ) + return "cls" diff --git a/llama-index-utils/llama-index-utils-huggingface/pyproject.toml b/llama-index-utils/llama-index-utils-huggingface/pyproject.toml new file mode 100644 index 0000000000000..199c5d7be01c6 --- /dev/null +++ b/llama-index-utils/llama-index-utils-huggingface/pyproject.toml @@ -0,0 +1,64 @@ +[build-system] +build-backend = "poetry.core.masonry.api" +requires = ["poetry-core"] + +[tool.codespell] +check-filenames = true +check-hidden = true +skip = "*.csv,*.html,*.json,*.jsonl,*.pdf,*.txt,*.ipynb" + +[tool.llamahub] +contains_example = false +import_path = "llama_index.utils.huggingface" + +[tool.mypy] +disallow_untyped_defs = true +exclude = ["_static", "build", "examples", "notebooks", "venv"] +ignore_missing_imports = true +python_version = "3.8" + +[tool.poetry] +authors = ["Your Name "] +description = "llama-index utils for huggingface integration" +exclude = ["**/BUILD"] +license = "MIT" +name = "llama-index-utils-huggingface" +readme = "README.md" +version = "0.1.0" + +[tool.poetry.dependencies] +python = ">=3.8.1,<4.0" +llama-index-core = "^0.10.1" +sentence-transformers = "^2.6.1" + +[tool.poetry.dependencies.huggingface-hub] +extras = ["inference"] +version = ">=0.19.0" + +[tool.poetry.group.dev.dependencies] +ipython = "8.10.0" +jupyter = "^1.0.0" +mypy = "0.991" +pre-commit = "3.2.0" +pylint = "2.15.10" +pytest = "7.2.1" +pytest-mock = "3.11.1" +ruff = "0.0.292" +tree-sitter-languages = "^1.8.0" +types-Deprecated = ">=0.1.0" +types-PyYAML = "^6.0.12.12" +types-protobuf = "^4.24.0.4" +types-redis = "4.5.5.0" +types-requests = "2.28.11.8" +types-setuptools = "67.1.0.0" + +[tool.poetry.group.dev.dependencies.black] +extras = ["jupyter"] +version = "<=23.9.1,>=23.7.0" + +[tool.poetry.group.dev.dependencies.codespell] +extras = ["toml"] +version = ">=v2.2.6" + +[[tool.poetry.packages]] +include = "llama_index/"
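
For reference, a minimal usage sketch of the two LLM integrations introduced by this patch. The endpoint URL, model id, and prompts below are illustrative placeholders, not values taken from the patch:

```python
# Sketch only; model id, TGI URL, and prompts are assumed placeholders.
from llama_index.llms.huggingface_api import HuggingFaceInferenceAPI
from llama_index.llms.text_generation_inference import TextGenerationInference

# Hosted Inference API: pass a Hub model id; the token is optional and falls
# back to the locally saved Hugging Face token when omitted.
remote_llm = HuggingFaceInferenceAPI(model_name="HuggingFaceH4/zephyr-7b-beta")
print(remote_llm.complete("Summarize what TGI is in one sentence.").text)

# Self-hosted TGI endpoint: `model_url` must point at a running server;
# `model_name` is only used to fill in the reported metadata.
tgi_llm = TextGenerationInference(
    model_url="http://localhost:8080",
    model_name="HuggingFaceH4/zephyr-7b-beta",
)
print(tgi_llm.complete("Summarize what TGI is in one sentence.").text)
```

`HuggingFaceInferenceAPI` talks to the hosted Inference API through `huggingface_hub`, while `TextGenerationInference` targets a self-hosted TGI server and additionally exposes `chat_with_tools` when the server reports function-calling support.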