From 7c512fbdd227982a5c1aa8475e5ac59668058278 Mon Sep 17 00:00:00 2001 From: Fernando Silva Date: Wed, 12 Jun 2024 18:17:18 -0300 Subject: [PATCH] Split HuggingFace embeddings in HuggingFace API and TextGenerationInference packages (#14013) --- .../api_reference/embeddings/huggingface.md | 1 - .../embeddings/huggingface_api.md | 4 + docs/docs/api_reference/llms/huggingface.md | 1 - .../api_reference/llms/huggingface_api.md | 4 + .../llms/text_generation_inference.md | 4 + .../examples/cookbooks/llama3_cookbook.ipynb | 5 +- .../cookbooks/prometheus2_cookbook.ipynb | 4 +- .../examples/embeddings/jina_embeddings.ipynb | 3 + .../evaluation/prometheus_evaluation.ipynb | 4 +- ...llm_judge_single_grading_correctness.ipynb | 4 +- .../pairwise/finetune_llm_judge.ipynb | 4 +- docs/docs/examples/llm/huggingface.ipynb | 21 +- .../examples/node_postprocessor/rankGPT.ipynb | 7 +- docs/mkdocs.yml | 6 + .../llama_index/core/embeddings/loading.py | 2 +- .../core/ingestion/transformations.py | 2 +- .../llama_index/core/llms/loading.py | 2 +- .../.gitignore | 153 ++++++ .../BUILD | 3 + .../Makefile | 17 + .../README.md | 26 + .../embeddings/huggingface_api/BUILD | 1 + .../embeddings/huggingface_api/__init__.py | 7 + .../embeddings/huggingface_api/base.py | 217 +++++++++ .../embeddings/huggingface_api/pooling.py | 74 +++ .../pyproject.toml | 67 +++ .../tests/BUILD | 1 + .../tests/__init__.py | 0 .../tests/test_embeddings_huggingface.py | 11 + .../tests/test_hf_inference.py | 108 +++++ .../embeddings/huggingface/base.py | 5 + .../.gitignore | 153 ++++++ .../llama-index-llms-huggingface-api/BUILD | 3 + .../llama-index-llms-huggingface-api/Makefile | 17 + .../README.md | 26 + .../llama_index/llms/huggingface_api/BUILD | 1 + .../llms/huggingface_api/__init__.py | 5 + .../llama_index/llms/huggingface_api/base.py | 284 +++++++++++ .../pyproject.toml | 63 +++ .../tests/BUILD | 1 + .../tests/__init__.py | 0 .../tests/test_huggingface_api.py | 115 +++++ .../tests/test_llms_huggingface_api.py | 7 + .../llama_index/llms/huggingface/base.py | 9 + .../.gitignore | 153 ++++++ .../BUILD | 3 + .../Makefile | 17 + .../README.md | 24 + .../llms/text_generation_inference/BUILD | 1 + .../text_generation_inference/__init__.py | 5 + .../llms/text_generation_inference/base.py | 445 ++++++++++++++++++ .../llms/text_generation_inference/utils.py | 66 +++ .../pyproject.toml | 63 +++ .../tests/BUILD | 1 + .../tests/__init__.py | 0 .../test_llms_text_generation_inference.py | 7 + .../rag_moderator_llama_guard_pack.ipynb | 4 +- .../llama-index-utils-huggingface/.gitignore | 153 ++++++ .../llama-index-utils-huggingface/BUILD | 3 + .../llama-index-utils-huggingface/Makefile | 17 + .../llama-index-utils-huggingface/README.md | 1 + .../llama_index/utils/huggingface/BUILD | 1 + .../llama_index/utils/huggingface/__init__.py | 31 ++ .../llama_index/utils/huggingface/base.py | 99 ++++ .../pyproject.toml | 64 +++ 65 files changed, 2583 insertions(+), 27 deletions(-) create mode 100644 docs/docs/api_reference/embeddings/huggingface_api.md create mode 100644 docs/docs/api_reference/llms/huggingface_api.md create mode 100644 docs/docs/api_reference/llms/text_generation_inference.md create mode 100644 llama-index-integrations/embeddings/llama-index-embeddings-huggingface-api/.gitignore create mode 100644 llama-index-integrations/embeddings/llama-index-embeddings-huggingface-api/BUILD create mode 100644 llama-index-integrations/embeddings/llama-index-embeddings-huggingface-api/Makefile create mode 100644 
llama-index-integrations/embeddings/llama-index-embeddings-huggingface-api/README.md create mode 100644 llama-index-integrations/embeddings/llama-index-embeddings-huggingface-api/llama_index/embeddings/huggingface_api/BUILD create mode 100644 llama-index-integrations/embeddings/llama-index-embeddings-huggingface-api/llama_index/embeddings/huggingface_api/__init__.py create mode 100644 llama-index-integrations/embeddings/llama-index-embeddings-huggingface-api/llama_index/embeddings/huggingface_api/base.py create mode 100644 llama-index-integrations/embeddings/llama-index-embeddings-huggingface-api/llama_index/embeddings/huggingface_api/pooling.py create mode 100644 llama-index-integrations/embeddings/llama-index-embeddings-huggingface-api/pyproject.toml create mode 100644 llama-index-integrations/embeddings/llama-index-embeddings-huggingface-api/tests/BUILD create mode 100644 llama-index-integrations/embeddings/llama-index-embeddings-huggingface-api/tests/__init__.py create mode 100644 llama-index-integrations/embeddings/llama-index-embeddings-huggingface-api/tests/test_embeddings_huggingface.py create mode 100644 llama-index-integrations/embeddings/llama-index-embeddings-huggingface-api/tests/test_hf_inference.py create mode 100644 llama-index-integrations/llms/llama-index-llms-huggingface-api/.gitignore create mode 100644 llama-index-integrations/llms/llama-index-llms-huggingface-api/BUILD create mode 100644 llama-index-integrations/llms/llama-index-llms-huggingface-api/Makefile create mode 100644 llama-index-integrations/llms/llama-index-llms-huggingface-api/README.md create mode 100644 llama-index-integrations/llms/llama-index-llms-huggingface-api/llama_index/llms/huggingface_api/BUILD create mode 100644 llama-index-integrations/llms/llama-index-llms-huggingface-api/llama_index/llms/huggingface_api/__init__.py create mode 100644 llama-index-integrations/llms/llama-index-llms-huggingface-api/llama_index/llms/huggingface_api/base.py create mode 100644 llama-index-integrations/llms/llama-index-llms-huggingface-api/pyproject.toml create mode 100644 llama-index-integrations/llms/llama-index-llms-huggingface-api/tests/BUILD create mode 100644 llama-index-integrations/llms/llama-index-llms-huggingface-api/tests/__init__.py create mode 100644 llama-index-integrations/llms/llama-index-llms-huggingface-api/tests/test_huggingface_api.py create mode 100644 llama-index-integrations/llms/llama-index-llms-huggingface-api/tests/test_llms_huggingface_api.py create mode 100644 llama-index-integrations/llms/llama-index-llms-text-generation-inference/.gitignore create mode 100644 llama-index-integrations/llms/llama-index-llms-text-generation-inference/BUILD create mode 100644 llama-index-integrations/llms/llama-index-llms-text-generation-inference/Makefile create mode 100644 llama-index-integrations/llms/llama-index-llms-text-generation-inference/README.md create mode 100644 llama-index-integrations/llms/llama-index-llms-text-generation-inference/llama_index/llms/text_generation_inference/BUILD create mode 100644 llama-index-integrations/llms/llama-index-llms-text-generation-inference/llama_index/llms/text_generation_inference/__init__.py create mode 100644 llama-index-integrations/llms/llama-index-llms-text-generation-inference/llama_index/llms/text_generation_inference/base.py create mode 100644 llama-index-integrations/llms/llama-index-llms-text-generation-inference/llama_index/llms/text_generation_inference/utils.py create mode 100644 
llama-index-integrations/llms/llama-index-llms-text-generation-inference/pyproject.toml create mode 100644 llama-index-integrations/llms/llama-index-llms-text-generation-inference/tests/BUILD create mode 100644 llama-index-integrations/llms/llama-index-llms-text-generation-inference/tests/__init__.py create mode 100644 llama-index-integrations/llms/llama-index-llms-text-generation-inference/tests/test_llms_text_generation_inference.py create mode 100644 llama-index-utils/llama-index-utils-huggingface/.gitignore create mode 100644 llama-index-utils/llama-index-utils-huggingface/BUILD create mode 100644 llama-index-utils/llama-index-utils-huggingface/Makefile create mode 100644 llama-index-utils/llama-index-utils-huggingface/README.md create mode 100644 llama-index-utils/llama-index-utils-huggingface/llama_index/utils/huggingface/BUILD create mode 100644 llama-index-utils/llama-index-utils-huggingface/llama_index/utils/huggingface/__init__.py create mode 100644 llama-index-utils/llama-index-utils-huggingface/llama_index/utils/huggingface/base.py create mode 100644 llama-index-utils/llama-index-utils-huggingface/pyproject.toml diff --git a/docs/docs/api_reference/embeddings/huggingface.md b/docs/docs/api_reference/embeddings/huggingface.md index 597bd6798f3a5..0a8842e38b31f 100644 --- a/docs/docs/api_reference/embeddings/huggingface.md +++ b/docs/docs/api_reference/embeddings/huggingface.md @@ -2,4 +2,3 @@ options: members: - HuggingFaceEmbedding - - HuggingFaceInferenceAPIEmbedding diff --git a/docs/docs/api_reference/embeddings/huggingface_api.md b/docs/docs/api_reference/embeddings/huggingface_api.md new file mode 100644 index 0000000000000..26888df65a1c3 --- /dev/null +++ b/docs/docs/api_reference/embeddings/huggingface_api.md @@ -0,0 +1,4 @@ +::: llama_index.embeddings.huggingface_api + options: + members: + - HuggingFaceInferenceAPIEmbedding diff --git a/docs/docs/api_reference/llms/huggingface.md b/docs/docs/api_reference/llms/huggingface.md index a03d1953b7c01..3edfc3aaf212e 100644 --- a/docs/docs/api_reference/llms/huggingface.md +++ b/docs/docs/api_reference/llms/huggingface.md @@ -1,5 +1,4 @@ ::: llama_index.llms.huggingface options: members: - - HuggingFaceInferenceAPI - HuggingFaceLLM diff --git a/docs/docs/api_reference/llms/huggingface_api.md b/docs/docs/api_reference/llms/huggingface_api.md new file mode 100644 index 0000000000000..33ee697f599b2 --- /dev/null +++ b/docs/docs/api_reference/llms/huggingface_api.md @@ -0,0 +1,4 @@ +::: llama_index.llms.huggingface_api + options: + members: + - HuggingFaceInferenceAPI diff --git a/docs/docs/api_reference/llms/text_generation_inference.md b/docs/docs/api_reference/llms/text_generation_inference.md new file mode 100644 index 0000000000000..afeb9ae9d3543 --- /dev/null +++ b/docs/docs/api_reference/llms/text_generation_inference.md @@ -0,0 +1,4 @@ +::: llama_index.llms.text_generation_inference + options: + members: + - TextGenerationInference diff --git a/docs/docs/examples/cookbooks/llama3_cookbook.ipynb b/docs/docs/examples/cookbooks/llama3_cookbook.ipynb index d40468490be1b..610c56e064765 100644 --- a/docs/docs/examples/cookbooks/llama3_cookbook.ipynb +++ b/docs/docs/examples/cookbooks/llama3_cookbook.ipynb @@ -29,7 +29,8 @@ "source": [ "!pip install llama-index\n", "!pip install llama-index-llms-huggingface\n", - "!pip install llama-index-embeddings-huggingface" + "!pip install llama-index-embeddings-huggingface\n", + "!pip install llama-index-embeddings-huggingface-api" ] }, { @@ -166,7 +167,7 @@ "source": [ "## You can deploy 
the model on HF Inference Endpoint and use it\n", "\n", - "# from llama_index.llms.huggingface import HuggingFaceInferenceAPI\n", + "# from llama_index.llms.huggingface_api import HuggingFaceInferenceAPI\n", "\n", "# llm = HuggingFaceInferenceAPI(\n", "# model_name=\"\",\n", diff --git a/docs/docs/examples/cookbooks/prometheus2_cookbook.ipynb b/docs/docs/examples/cookbooks/prometheus2_cookbook.ipynb index fbd28ba0a4c5f..bc8f6aa3ced84 100644 --- a/docs/docs/examples/cookbooks/prometheus2_cookbook.ipynb +++ b/docs/docs/examples/cookbooks/prometheus2_cookbook.ipynb @@ -53,7 +53,7 @@ "outputs": [], "source": [ "!pip install llama-index\n", - "!pip install llama-index-llms-huggingface" + "!pip install llama-index-llms-huggingface-api" ] }, { @@ -145,7 +145,7 @@ "metadata": {}, "outputs": [], "source": [ - "from llama_index.llms.huggingface import HuggingFaceInferenceAPI\n", + "from llama_index.llms.huggingface_api import HuggingFaceInferenceAPI\n", "\n", "HF_TOKEN = \"YOUR HF TOKEN\"\n", "HF_ENDPOINT_URL = \"YOUR HF ENDPOINT URL\"\n", diff --git a/docs/docs/examples/embeddings/jina_embeddings.ipynb b/docs/docs/examples/embeddings/jina_embeddings.ipynb index f79d19ae81b9c..1ca790bfa2326 100644 --- a/docs/docs/examples/embeddings/jina_embeddings.ipynb +++ b/docs/docs/examples/embeddings/jina_embeddings.ipynb @@ -26,6 +26,7 @@ "outputs": [], "source": [ "%pip install llama-index-embeddings-huggingface\n", + "%pip install llama-index-embeddings-huggingface-api\n", "%pip install llama-index-embeddings-openai" ] }, @@ -58,6 +59,8 @@ "source": [ "from llama_index.embeddings.huggingface import (\n", " HuggingFaceEmbedding,\n", + ")\n", + "from llama_index.embeddings.huggingface_api import (\n", " HuggingFaceInferenceAPIEmbedding,\n", ")\n", "from llama_index.embeddings.openai import OpenAIEmbedding\n", diff --git a/docs/docs/examples/evaluation/prometheus_evaluation.ipynb b/docs/docs/examples/evaluation/prometheus_evaluation.ipynb index f6fdd8dcfc0cd..bdf6ecccfc6d0 100644 --- a/docs/docs/examples/evaluation/prometheus_evaluation.ipynb +++ b/docs/docs/examples/evaluation/prometheus_evaluation.ipynb @@ -76,7 +76,7 @@ "outputs": [], "source": [ "%pip install llama-index-llms-openai\n", - "%pip install llama-index-llms-huggingface" + "%pip install llama-index-llms-huggingface-api" ] }, { @@ -146,7 +146,7 @@ } ], "source": [ - "from llama_index.llms.huggingface import HuggingFaceInferenceAPI\n", + "from llama_index.llms.huggingface_api import HuggingFaceInferenceAPI\n", "\n", "HF_TOKEN = \"YOUR HF TOKEN\"\n", "HF_ENDPOINT_URL = (\n", diff --git a/docs/docs/examples/finetuning/llm_judge/correctness/finetune_llm_judge_single_grading_correctness.ipynb b/docs/docs/examples/finetuning/llm_judge/correctness/finetune_llm_judge_single_grading_correctness.ipynb index dffed455848df..6d4a2e0920a84 100644 --- a/docs/docs/examples/finetuning/llm_judge/correctness/finetune_llm_judge_single_grading_correctness.ipynb +++ b/docs/docs/examples/finetuning/llm_judge/correctness/finetune_llm_judge_single_grading_correctness.ipynb @@ -27,7 +27,7 @@ "%pip install llama-index-finetuning\n", "%pip install llama-index-llms-openai\n", "%pip install llama-index-finetuning-callbacks\n", - "%pip install llama-index-llms-huggingface" + "%pip install llama-index-llms-huggingface-api" ] }, { @@ -265,7 +265,7 @@ ], "source": [ "from llama_index.core.query_engine import RetrieverQueryEngine\n", - "from llama_index.llms.huggingface import HuggingFaceInferenceAPI\n", + "from llama_index.llms.huggingface_api import HuggingFaceInferenceAPI\n", 
"\n", "llm = HuggingFaceInferenceAPI(\n", " model_name=\"meta-llama/Llama-2-7b-chat-hf\",\n", diff --git a/docs/docs/examples/finetuning/llm_judge/pairwise/finetune_llm_judge.ipynb b/docs/docs/examples/finetuning/llm_judge/pairwise/finetune_llm_judge.ipynb index 806440204a1ca..1cbbc9447af4d 100644 --- a/docs/docs/examples/finetuning/llm_judge/pairwise/finetune_llm_judge.ipynb +++ b/docs/docs/examples/finetuning/llm_judge/pairwise/finetune_llm_judge.ipynb @@ -28,7 +28,7 @@ "%pip install llama-index-finetuning\n", "%pip install llama-index-llms-openai\n", "%pip install llama-index-finetuning-callbacks\n", - "%pip install llama-index-llms-huggingface" + "%pip install llama-index-llms-huggingface-api" ] }, { @@ -410,7 +410,7 @@ "outputs": [], "source": [ "from llama_index.core.query_engine import RetrieverQueryEngine\n", - "from llama_index.llms.huggingface import HuggingFaceInferenceAPI\n", + "from llama_index.llms.huggingface_api import HuggingFaceInferenceAPI\n", "\n", "\n", "def create_query_engine(\n", diff --git a/docs/docs/examples/llm/huggingface.ipynb b/docs/docs/examples/llm/huggingface.ipynb index 8625117882528..34add77208449 100644 --- a/docs/docs/examples/llm/huggingface.ipynb +++ b/docs/docs/examples/llm/huggingface.ipynb @@ -49,7 +49,8 @@ "metadata": {}, "outputs": [], "source": [ - "%pip install llama-index-llms-huggingface" + "%pip install llama-index-llms-huggingface\n", + "%pip install llama-index-llms-huggingface-api" ] }, { @@ -99,10 +100,8 @@ "import os\n", "from typing import List, Optional\n", "\n", - "from llama_index.llms.huggingface import (\n", - " HuggingFaceInferenceAPI,\n", - " HuggingFaceLLM,\n", - ")\n", + "from llama_index.llms.huggingface import HuggingFaceLLM\n", + "from llama_index.llms.huggingface_api import HuggingFaceInferenceAPI\n", "\n", "# SEE: https://huggingface.co/docs/hub/security-tokens\n", "# We just need a token with read permissions for this demo\n", @@ -227,6 +226,16 @@ "The new `TextGenerationInference` class allows to interface with endpoints running [`text-generation-inference`, TGI](https://huggingface.co/docs/text-generation-inference/index). In addition to blazingly fast inference, it supports `tool` usage starting from version `2.0.1`. 
" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "46c5c06d", + "metadata": {}, + "outputs": [], + "source": [ + "%pip install llama-index-llms-text-generation-inference" + ] + }, { "cell_type": "markdown", "id": "055ddcb1", @@ -253,7 +262,7 @@ "import os\n", "from typing import List, Optional\n", "\n", - "from llama_index.llms.huggingface import (\n", + "from llama_index.llms.text_generation_inference import (\n", " TextGenerationInference,\n", ")\n", "\n", diff --git a/docs/docs/examples/node_postprocessor/rankGPT.ipynb b/docs/docs/examples/node_postprocessor/rankGPT.ipynb index 909b08c0d5ce1..aefc3c4ffc71d 100644 --- a/docs/docs/examples/node_postprocessor/rankGPT.ipynb +++ b/docs/docs/examples/node_postprocessor/rankGPT.ipynb @@ -32,6 +32,7 @@ "source": [ "%pip install llama-index-postprocessor-rankgpt-rerank\n", "%pip install llama-index-llms-huggingface\n", + "%pip install llama-index-llms-huggingface-api\n", "%pip install llama-index-llms-openai\n", "%pip install llama-index-llms-ollama" ] @@ -484,10 +485,8 @@ "from llama_index.core import QueryBundle\n", "import pandas as pd\n", "from IPython.display import display, HTML\n", - "from llama_index.llms.huggingface import (\n", - " HuggingFaceInferenceAPI,\n", - " HuggingFaceLLM,\n", - ")\n", + "from llama_index.llms.huggingface_api import HuggingFaceInferenceAPI\n", + "from llama_index.llms.huggingface import HuggingFaceLLM\n", "\n", "from llama_index.postprocessor.rankgpt_rerank import RankGPTRerank\n", "\n", diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml index 7e80c50490938..eb624ecf60eef 100644 --- a/docs/mkdocs.yml +++ b/docs/mkdocs.yml @@ -754,6 +754,7 @@ nav: - ./api_reference/embeddings/google.md - ./api_reference/embeddings/gradient.md - ./api_reference/embeddings/huggingface.md + - ./api_reference/embeddings/huggingface_api.md - ./api_reference/embeddings/huggingface_itrex.md - ./api_reference/embeddings/huggingface_openvino.md - ./api_reference/embeddings/huggingface_optimum.md @@ -839,6 +840,7 @@ nav: - ./api_reference/llms/gradient.md - ./api_reference/llms/groq.md - ./api_reference/llms/huggingface.md + - ./api_reference/llms/huggingface_api.md - ./api_reference/llms/index.md - ./api_reference/llms/ipex_llm.md - ./api_reference/llms/konko.md @@ -877,6 +879,7 @@ nav: - ./api_reference/llms/rungpt.md - ./api_reference/llms/sagemaker_endpoint.md - ./api_reference/llms/solar.md + - ./api_reference/llms/text_generation_inference.md - ./api_reference/llms/together.md - ./api_reference/llms/unify.md - ./api_reference/llms/upstage.md @@ -1964,6 +1967,9 @@ plugins: - ../llama-index-integrations/readers/llama-index-readers-azure-devops - ../llama-index-integrations/retrievers/llama-index-retrievers-duckdb-retriever - ../llama-index-packs/llama-index-packs-zenguard + - ../llama-index-integrations/embeddings/llama-index-embeddings-huggingface-api + - ../llama-index-integrations/llms/llama-index-llms-text-generation-inference + - ../llama-index-integrations/llms/llama-index-llms-huggingface-api - redirects: redirect_maps: ./api/llama_index.vector_stores.MongoDBAtlasVectorSearch.html: api_reference/storage/vector_store/mongodb.md diff --git a/llama-index-core/llama_index/core/embeddings/loading.py b/llama-index-core/llama_index/core/embeddings/loading.py index fd84ee64ed7f1..ac1449b970c26 100644 --- a/llama-index-core/llama_index/core/embeddings/loading.py +++ b/llama-index-core/llama_index/core/embeddings/loading.py @@ -25,7 +25,7 @@ pass try: - from llama_index.embeddings.huggingface import ( + from 
llama_index.embeddings.huggingface_api import ( HuggingFaceInferenceAPIEmbedding, ) # pants: no-infer-dep diff --git a/llama-index-core/llama_index/core/ingestion/transformations.py b/llama-index-core/llama_index/core/ingestion/transformations.py index 1cd488ebc135f..49e62df3715d1 100644 --- a/llama-index-core/llama_index/core/ingestion/transformations.py +++ b/llama-index-core/llama_index/core/ingestion/transformations.py @@ -285,7 +285,7 @@ def build_configured_transformation( pass try: - from llama_index.embeddings.huggingface import ( + from llama_index.embeddings.huggingface_api import ( HuggingFaceInferenceAPIEmbedding, ) # pants: no-infer-dep diff --git a/llama-index-core/llama_index/core/llms/loading.py b/llama-index-core/llama_index/core/llms/loading.py index 005a69c89a7f0..20ce3f0dcfea3 100644 --- a/llama-index-core/llama_index/core/llms/loading.py +++ b/llama-index-core/llama_index/core/llms/loading.py @@ -25,7 +25,7 @@ pass try: - from llama_index.llms.huggingface import ( + from llama_index.llms.huggingface_api import ( HuggingFaceInferenceAPI, ) # pants: no-infer-dep diff --git a/llama-index-integrations/embeddings/llama-index-embeddings-huggingface-api/.gitignore b/llama-index-integrations/embeddings/llama-index-embeddings-huggingface-api/.gitignore new file mode 100644 index 0000000000000..990c18de22908 --- /dev/null +++ b/llama-index-integrations/embeddings/llama-index-embeddings-huggingface-api/.gitignore @@ -0,0 +1,153 @@ +llama_index/_static +.DS_Store +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +bin/ +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +etc/ +include/ +lib/ +lib64/ +parts/ +sdist/ +share/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +.ruff_cache + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints +notebooks/ + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ +pyvenv.cfg + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# Jetbrains +.idea +modules/ +*.swp + +# VsCode +.vscode + +# pipenv +Pipfile +Pipfile.lock + +# pyright +pyrightconfig.json diff --git a/llama-index-integrations/embeddings/llama-index-embeddings-huggingface-api/BUILD b/llama-index-integrations/embeddings/llama-index-embeddings-huggingface-api/BUILD new file mode 100644 index 0000000000000..0896ca890d8bf --- /dev/null +++ b/llama-index-integrations/embeddings/llama-index-embeddings-huggingface-api/BUILD @@ -0,0 +1,3 @@ +poetry_requirements( + name="poetry", +) diff --git a/llama-index-integrations/embeddings/llama-index-embeddings-huggingface-api/Makefile b/llama-index-integrations/embeddings/llama-index-embeddings-huggingface-api/Makefile new file mode 100644 index 0000000000000..b9eab05aa3706 --- /dev/null +++ b/llama-index-integrations/embeddings/llama-index-embeddings-huggingface-api/Makefile @@ -0,0 +1,17 @@ +GIT_ROOT ?= $(shell git rev-parse --show-toplevel) + +help: ## Show all Makefile targets. + @grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[33m%-30s\033[0m %s\n", $$1, $$2}' + +format: ## Run code autoformatters (black). + pre-commit install + git ls-files | xargs pre-commit run black --files + +lint: ## Run linters: pre-commit (black, ruff, codespell) and mypy + pre-commit install && git ls-files | xargs pre-commit run --show-diff-on-failure --files + +test: ## Run tests via pytest. + pytest tests + +watch-docs: ## Build and watch documentation. + sphinx-autobuild docs/ docs/_build/html --open-browser --watch $(GIT_ROOT)/llama_index/ diff --git a/llama-index-integrations/embeddings/llama-index-embeddings-huggingface-api/README.md b/llama-index-integrations/embeddings/llama-index-embeddings-huggingface-api/README.md new file mode 100644 index 0000000000000..4b3371966a15d --- /dev/null +++ b/llama-index-integrations/embeddings/llama-index-embeddings-huggingface-api/README.md @@ -0,0 +1,26 @@ +# LlamaIndex Embeddings Integration: Huggingface API + +Integration with Hugging Face's Inference API for embeddings. + +For more information on Hugging Face's Inference API, visit [Hugging Face's Inference API documentation](https://huggingface.co/docs/api-inference/quicktour). 
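For existing code, the practical effect of this split is an import-path change; the before/after below is a sketch assembled from the notebook and loader diffs earlier in this patch:

```python
# Import-path change implied by this patch (see the jina_embeddings.ipynb and
# core loading.py diffs above).
# Before the split, both classes lived in llama-index-embeddings-huggingface:
#   from llama_index.embeddings.huggingface import (
#       HuggingFaceEmbedding,
#       HuggingFaceInferenceAPIEmbedding,
#   )
# After the split, the Inference API wrapper comes from the new package:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.embeddings.huggingface_api import HuggingFaceInferenceAPIEmbedding
```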
+ +## Installation + +```shell +pip install llama-index-embeddings-huggingface-api +``` + +## Usage + +```python +from llama_index.embeddings.huggingface_api import ( + HuggingFaceInferenceAPIEmbedding, +) + +my_embed = HuggingFaceInferenceAPIEmbedding( + model_name="BAAI/bge-small-en-v1.5", + token="", # Optional +) + +embeddings = my_embed.get_text_embedding("Why sky is blue") +``` diff --git a/llama-index-integrations/embeddings/llama-index-embeddings-huggingface-api/llama_index/embeddings/huggingface_api/BUILD b/llama-index-integrations/embeddings/llama-index-embeddings-huggingface-api/llama_index/embeddings/huggingface_api/BUILD new file mode 100644 index 0000000000000..db46e8d6c978c --- /dev/null +++ b/llama-index-integrations/embeddings/llama-index-embeddings-huggingface-api/llama_index/embeddings/huggingface_api/BUILD @@ -0,0 +1 @@ +python_sources() diff --git a/llama-index-integrations/embeddings/llama-index-embeddings-huggingface-api/llama_index/embeddings/huggingface_api/__init__.py b/llama-index-integrations/embeddings/llama-index-embeddings-huggingface-api/llama_index/embeddings/huggingface_api/__init__.py new file mode 100644 index 0000000000000..8359f3684f8fd --- /dev/null +++ b/llama-index-integrations/embeddings/llama-index-embeddings-huggingface-api/llama_index/embeddings/huggingface_api/__init__.py @@ -0,0 +1,7 @@ +from llama_index.embeddings.huggingface_api.base import ( + HuggingFaceInferenceAPIEmbedding, +) + +__all__ = [ + "HuggingFaceInferenceAPIEmbedding", +] diff --git a/llama-index-integrations/embeddings/llama-index-embeddings-huggingface-api/llama_index/embeddings/huggingface_api/base.py b/llama-index-integrations/embeddings/llama-index-embeddings-huggingface-api/llama_index/embeddings/huggingface_api/base.py new file mode 100644 index 0000000000000..012abd0a74f9f --- /dev/null +++ b/llama-index-integrations/embeddings/llama-index-embeddings-huggingface-api/llama_index/embeddings/huggingface_api/base.py @@ -0,0 +1,217 @@ +import asyncio +import logging +from typing import Any, Callable, Dict, List, Optional, Sequence, Union + +from huggingface_hub import ( + AsyncInferenceClient, + InferenceClient, + model_info, +) +from huggingface_hub.hf_api import ModelInfo +from llama_index.core.base.embeddings.base import ( + BaseEmbedding, + Embedding, +) +from llama_index.core.bridge.pydantic import Field, PrivateAttr +from llama_index.embeddings.huggingface_api.pooling import Pooling +from llama_index.utils.huggingface import ( + format_query, + format_text, +) + +logger = logging.getLogger(__name__) + + +class HuggingFaceInferenceAPIEmbedding(BaseEmbedding): # type: ignore[misc] + """ + Wrapper on the Hugging Face's Inference API for embeddings. + + Overview of the design: + - Uses the feature extraction task: https://huggingface.co/tasks/feature-extraction + """ + + pooling: Optional[Pooling] = Field( + default=Pooling.CLS, + description="Pooling strategy. If None, the model's default pooling is used.", + ) + query_instruction: Optional[str] = Field( + default=None, description="Instruction to prepend during query embedding." + ) + text_instruction: Optional[str] = Field( + default=None, description="Instruction to prepend during text embedding." + ) + + # Corresponds with huggingface_hub.InferenceClient + model_name: Optional[str] = Field( + default=None, + description="Hugging Face model name. If None, the task will be used.", + ) + token: Union[str, bool, None] = Field( + default=None, + description=( + "Hugging Face token. Will default to the locally saved token. 
Pass " + "token=False if you don’t want to send your token to the server." + ), + ) + timeout: Optional[float] = Field( + default=None, + description=( + "The maximum number of seconds to wait for a response from the server." + " Loading a new model in Inference API can take up to several minutes." + " Defaults to None, meaning it will loop until the server is available." + ), + ) + headers: Dict[str, str] = Field( + default=None, + description=( + "Additional headers to send to the server. By default only the" + " authorization and user-agent headers are sent. Values in this dictionary" + " will override the default values." + ), + ) + cookies: Dict[str, str] = Field( + default=None, description="Additional cookies to send to the server." + ) + task: Optional[str] = Field( + default=None, + description=( + "Optional task to pick Hugging Face's recommended model, used when" + " model_name is left as default of None." + ), + ) + _sync_client: "InferenceClient" = PrivateAttr() + _async_client: "AsyncInferenceClient" = PrivateAttr() + _get_model_info: "Callable[..., ModelInfo]" = PrivateAttr() + + def _get_inference_client_kwargs(self) -> Dict[str, Any]: + """Extract the Hugging Face InferenceClient construction parameters.""" + return { + "model": self.model_name, + "token": self.token, + "timeout": self.timeout, + "headers": self.headers, + "cookies": self.cookies, + } + + def __init__(self, **kwargs: Any) -> None: + """Initialize. + + Args: + kwargs: See the class-level Fields. + """ + if kwargs.get("model_name") is None: + task = kwargs.get("task", "") + # NOTE: task being None or empty string leads to ValueError, + # which ensures model is present + kwargs["model_name"] = InferenceClient.get_recommended_model(task=task) + logger.debug( + f"Using Hugging Face's recommended model {kwargs['model_name']}" + f" given task {task}." + ) + print(kwargs["model_name"], flush=True) + super().__init__(**kwargs) # Populate pydantic Fields + self._sync_client = InferenceClient(**self._get_inference_client_kwargs()) + self._async_client = AsyncInferenceClient(**self._get_inference_client_kwargs()) + self._get_model_info = model_info + + def validate_supported(self, task: str) -> None: + """ + Confirm the contained model_name is deployed on the Inference API service. + + Args: + task: Hugging Face task to check within. A list of all tasks can be + found here: https://huggingface.co/tasks + """ + all_models = self._sync_client.list_deployed_models(frameworks="all") + try: + if self.model_name not in all_models[task]: + raise ValueError( + "The Inference API service doesn't have the model" + f" {self.model_name!r} deployed." + ) + except KeyError as exc: + raise KeyError( + f"Input task {task!r} not in possible tasks {list(all_models.keys())}." 
+ ) from exc + + def get_model_info(self, **kwargs: Any) -> "ModelInfo": + """Get metadata on the current model from Hugging Face.""" + return self._get_model_info(self.model_name, **kwargs) + + @classmethod + def class_name(cls) -> str: + return "HuggingFaceInferenceAPIEmbedding" + + async def _async_embed_single(self, text: str) -> Embedding: + embedding = await self._async_client.feature_extraction(text) + if len(embedding.shape) == 1: + return embedding.tolist() + embedding = embedding.squeeze(axis=0) + if len(embedding.shape) == 1: # Some models pool internally + return embedding.tolist() + try: + return self.pooling(embedding).tolist() # type: ignore[misc] + except TypeError as exc: + raise ValueError( + f"Pooling is required for {self.model_name} because it returned" + " a > 1-D value, please specify pooling as not None." + ) from exc + + async def _async_embed_bulk(self, texts: Sequence[str]) -> List[Embedding]: + """ + Embed a sequence of text, in parallel and asynchronously. + + NOTE: this uses an externally created asyncio event loop. + """ + tasks = [self._async_embed_single(text) for text in texts] + return await asyncio.gather(*tasks) + + def _get_query_embedding(self, query: str) -> Embedding: + """ + Embed the input query synchronously. + + NOTE: a new asyncio event loop is created internally for this. + """ + return asyncio.run(self._aget_query_embedding(query)) + + def _get_text_embedding(self, text: str) -> Embedding: + """ + Embed the text query synchronously. + + NOTE: a new asyncio event loop is created internally for this. + """ + return asyncio.run(self._aget_text_embedding(text)) + + def _get_text_embeddings(self, texts: List[str]) -> List[Embedding]: + """ + Embed the input sequence of text synchronously and in parallel. + + NOTE: a new asyncio event loop is created internally for this. 
+ """ + loop = asyncio.new_event_loop() + try: + tasks = [ + loop.create_task(self._aget_text_embedding(text)) for text in texts + ] + loop.run_until_complete(asyncio.wait(tasks)) + finally: + loop.close() + return [task.result() for task in tasks] + + async def _aget_query_embedding(self, query: str) -> Embedding: + return await self._async_embed_single( + text=format_query(query, self.model_name, self.query_instruction) + ) + + async def _aget_text_embedding(self, text: str) -> Embedding: + return await self._async_embed_single( + text=format_text(text, self.model_name, self.text_instruction) + ) + + async def _aget_text_embeddings(self, texts: List[str]) -> List[Embedding]: + return await self._async_embed_bulk( + texts=[ + format_text(text, self.model_name, self.text_instruction) + for text in texts + ] + ) diff --git a/llama-index-integrations/embeddings/llama-index-embeddings-huggingface-api/llama_index/embeddings/huggingface_api/pooling.py b/llama-index-integrations/embeddings/llama-index-embeddings-huggingface-api/llama_index/embeddings/huggingface_api/pooling.py new file mode 100644 index 0000000000000..2a4035cd55656 --- /dev/null +++ b/llama-index-integrations/embeddings/llama-index-embeddings-huggingface-api/llama_index/embeddings/huggingface_api/pooling.py @@ -0,0 +1,74 @@ +from enum import Enum +from typing import TYPE_CHECKING, Union, overload + +import numpy as np + +if TYPE_CHECKING: + import torch + + +class Pooling(str, Enum): + """Enum of possible pooling choices with pooling behaviors.""" + + CLS = "cls" + MEAN = "mean" + LAST = "last" # last token pooling + + def __call__(self, array: np.ndarray) -> np.ndarray: + if self == self.CLS: + return self.cls_pooling(array) + elif self == self.LAST: + return self.last_pooling(array) + return self.mean_pooling(array) + + @classmethod + @overload + def cls_pooling(cls, array: np.ndarray) -> np.ndarray: + ... + + @classmethod + @overload + # TODO: Remove this `type: ignore` after the false positive problem + # is addressed in mypy: https://github.com/python/mypy/issues/15683 . + def cls_pooling(cls, array: "torch.Tensor") -> "torch.Tensor": # type: ignore + ... + + @classmethod + def cls_pooling( + cls, array: "Union[np.ndarray, torch.Tensor]" + ) -> "Union[np.ndarray, torch.Tensor]": + if len(array.shape) == 3: + return array[:, 0] + if len(array.shape) == 2: + return array[0] + raise NotImplementedError(f"Unhandled shape {array.shape}.") + + @classmethod + def mean_pooling(cls, array: np.ndarray) -> np.ndarray: + if len(array.shape) == 3: + return array.mean(axis=1) + if len(array.shape) == 2: + return array.mean(axis=0) + raise NotImplementedError(f"Unhandled shape {array.shape}.") + + @classmethod + @overload + def last_pooling(cls, array: np.ndarray) -> np.ndarray: + ... + + @classmethod + @overload + # TODO: Remove this `type: ignore` after the false positive problem + # is addressed in mypy: https://github.com/python/mypy/issues/15683 . + def last_pooling(cls, array: "torch.Tensor") -> "torch.Tensor": # type: ignore + ... 
+ + @classmethod + def last_pooling( + cls, array: "Union[np.ndarray, torch.Tensor]" + ) -> "Union[np.ndarray, torch.Tensor]": + if len(array.shape) == 3: + return array[:, -1] + if len(array.shape) == 2: + return array[-1] + raise NotImplementedError(f"Unhandled shape {array.shape}.") diff --git a/llama-index-integrations/embeddings/llama-index-embeddings-huggingface-api/pyproject.toml b/llama-index-integrations/embeddings/llama-index-embeddings-huggingface-api/pyproject.toml new file mode 100644 index 0000000000000..6100f0bc90cd1 --- /dev/null +++ b/llama-index-integrations/embeddings/llama-index-embeddings-huggingface-api/pyproject.toml @@ -0,0 +1,67 @@ +[build-system] +build-backend = "poetry.core.masonry.api" +requires = ["poetry-core"] + +[tool.codespell] +check-filenames = true +check-hidden = true +skip = "*.csv,*.html,*.json,*.jsonl,*.pdf,*.txt,*.ipynb" + +[tool.llamahub] +contains_example = false +import_path = "llama_index.embeddings.huggingface_api" + +[tool.llamahub.class_authors] +HuggingFaceInferenceAPIEmbedding = "llama-index" + +[tool.mypy] +disallow_untyped_defs = true +exclude = ["_static", "build", "examples", "notebooks", "venv"] +ignore_missing_imports = true +python_version = "3.8" + +[tool.poetry] +authors = ["Your Name "] +description = "llama-index embeddings huggingface api integration" +exclude = ["**/BUILD"] +license = "MIT" +name = "llama-index-embeddings-huggingface-api" +readme = "README.md" +version = "0.1.0" + +[tool.poetry.dependencies] +python = ">=3.8.1,<4.0" +llama-index-core = "^0.10.1" +llama-index-utils-huggingface = "^0.1.0" + +[tool.poetry.dependencies.huggingface-hub] +extras = ["inference"] +version = ">=0.19.0" + +[tool.poetry.group.dev.dependencies] +ipython = "8.10.0" +jupyter = "^1.0.0" +mypy = "0.991" +pre-commit = "3.2.0" +pylint = "2.15.10" +pytest = "7.2.1" +pytest-mock = "3.11.1" +ruff = "0.0.292" +tree-sitter-languages = "^1.8.0" +types-Deprecated = ">=0.1.0" +types-PyYAML = "^6.0.12.12" +types-protobuf = "^4.24.0.4" +types-redis = "4.5.5.0" +types-requests = "2.28.11.8" +types-setuptools = "67.1.0.0" + +[tool.poetry.group.dev.dependencies.black] +extras = ["jupyter"] +version = "<=23.9.1,>=23.7.0" + +[tool.poetry.group.dev.dependencies.codespell] +extras = ["toml"] +version = ">=v2.2.6" + +[[tool.poetry.packages]] +include = "llama_index/" diff --git a/llama-index-integrations/embeddings/llama-index-embeddings-huggingface-api/tests/BUILD b/llama-index-integrations/embeddings/llama-index-embeddings-huggingface-api/tests/BUILD new file mode 100644 index 0000000000000..dabf212d7e716 --- /dev/null +++ b/llama-index-integrations/embeddings/llama-index-embeddings-huggingface-api/tests/BUILD @@ -0,0 +1 @@ +python_tests() diff --git a/llama-index-integrations/embeddings/llama-index-embeddings-huggingface-api/tests/__init__.py b/llama-index-integrations/embeddings/llama-index-embeddings-huggingface-api/tests/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/llama-index-integrations/embeddings/llama-index-embeddings-huggingface-api/tests/test_embeddings_huggingface.py b/llama-index-integrations/embeddings/llama-index-embeddings-huggingface-api/tests/test_embeddings_huggingface.py new file mode 100644 index 0000000000000..bb33981498d98 --- /dev/null +++ b/llama-index-integrations/embeddings/llama-index-embeddings-huggingface-api/tests/test_embeddings_huggingface.py @@ -0,0 +1,11 @@ +from llama_index.core.base.embeddings.base import BaseEmbedding +from llama_index.embeddings.huggingface_api import ( + 
HuggingFaceInferenceAPIEmbedding, +) + + +def test_huggingfaceapiembedding_class(): + names_of_base_classes = [ + b.__name__ for b in HuggingFaceInferenceAPIEmbedding.__mro__ + ] + assert BaseEmbedding.__name__ in names_of_base_classes diff --git a/llama-index-integrations/embeddings/llama-index-embeddings-huggingface-api/tests/test_hf_inference.py b/llama-index-integrations/embeddings/llama-index-embeddings-huggingface-api/tests/test_hf_inference.py new file mode 100644 index 0000000000000..a78c2389bb4d4 --- /dev/null +++ b/llama-index-integrations/embeddings/llama-index-embeddings-huggingface-api/tests/test_hf_inference.py @@ -0,0 +1,108 @@ +from unittest.mock import AsyncMock, MagicMock, patch + +import numpy as np +import pytest +from llama_index.embeddings.huggingface_api.base import HuggingFaceInferenceAPIEmbedding +from llama_index.embeddings.huggingface_api.pooling import Pooling + +STUB_MODEL_NAME = "placeholder_model" + + +@pytest.fixture(name="hf_inference_api_embedding") +def fixture_hf_inference_api_embedding() -> HuggingFaceInferenceAPIEmbedding: + with patch.dict("sys.modules", huggingface_hub=MagicMock()): + return HuggingFaceInferenceAPIEmbedding(model_name=STUB_MODEL_NAME) + + +class TestHuggingFaceInferenceAPIEmbeddings: + def test_class_name( + self, hf_inference_api_embedding: HuggingFaceInferenceAPIEmbedding + ) -> None: + assert ( + HuggingFaceInferenceAPIEmbedding.class_name() + == HuggingFaceInferenceAPIEmbedding.__name__ + ) + assert ( + hf_inference_api_embedding.class_name() + == HuggingFaceInferenceAPIEmbedding.__name__ + ) + + # def test_using_recommended_model(self) -> None: + # mock_hub = MagicMock() + # mock_hub.InferenceClient.get_recommended_model.return_value = ( + # "facebook/bart-base" + # ) + # with patch.dict("sys.modules", huggingface_hub=mock_hub): + # embedding = HuggingFaceInferenceAPIEmbedding(task="feature-extraction") + # assert embedding.model_name == "facebook/bart-base" + # # mock_hub.InferenceClient.get_recommended_model.assert_called_once_with( + # # task="feature-extraction" + # # ) + + def test_embed_query( + self, hf_inference_api_embedding: HuggingFaceInferenceAPIEmbedding + ) -> None: + raw_single_embedding = np.random.default_rng().random( + (1, 3, 1024), dtype=np.float32 + ) + + hf_inference_api_embedding.pooling = Pooling.CLS + with patch.object( + hf_inference_api_embedding._async_client, + "feature_extraction", + AsyncMock(return_value=raw_single_embedding), + ) as mock_feature_extraction: + embedding = hf_inference_api_embedding.get_query_embedding("test") + assert isinstance(embedding, list) + assert len(embedding) == 1024 + assert isinstance(embedding[0], float) + assert np.all( + np.array(embedding, dtype=raw_single_embedding.dtype) + == raw_single_embedding[0, 0] + ) + mock_feature_extraction.assert_awaited_once_with("test") + + hf_inference_api_embedding.pooling = Pooling.MEAN + with patch.object( + hf_inference_api_embedding._async_client, + "feature_extraction", + AsyncMock(return_value=raw_single_embedding), + ) as mock_feature_extraction: + embedding = hf_inference_api_embedding.get_query_embedding("test") + assert isinstance(embedding, list) + assert len(embedding) == 1024 + assert isinstance(embedding[0], float) + assert np.all( + np.array(embedding, dtype=raw_single_embedding.dtype) + == raw_single_embedding[0].mean(axis=0) + ) + mock_feature_extraction.assert_awaited_once_with("test") + + def test_embed_query_one_dimension( + self, hf_inference_api_embedding: HuggingFaceInferenceAPIEmbedding + ) -> None: + 
raw_single_embedding = np.random.default_rng().random(1024, dtype=np.float32) + + with patch.object( + hf_inference_api_embedding._async_client, + "feature_extraction", + AsyncMock(return_value=raw_single_embedding), + ) as mock_feature_extraction: + embedding = hf_inference_api_embedding.get_query_embedding("test") + assert isinstance(embedding, list) + assert len(embedding) == 1024 + assert isinstance(embedding[0], float) + assert np.all( + np.array(embedding, dtype=raw_single_embedding.dtype) + == raw_single_embedding + ) + mock_feature_extraction.assert_awaited_once_with("test") + + def test_serialization( + self, hf_inference_api_embedding: HuggingFaceInferenceAPIEmbedding + ) -> None: + serialized = hf_inference_api_embedding.to_dict() + # Check Hugging Face Inference API base class specifics + assert serialized["model_name"] == STUB_MODEL_NAME + # Check Hugging Face Inference API Embeddings derived class specifics + assert serialized["pooling"] == Pooling.CLS diff --git a/llama-index-integrations/embeddings/llama-index-embeddings-huggingface/llama_index/embeddings/huggingface/base.py b/llama-index-integrations/embeddings/llama-index-embeddings-huggingface/llama_index/embeddings/huggingface/base.py index c400a2f694c55..1e520273bd42c 100644 --- a/llama-index-integrations/embeddings/llama-index-embeddings-huggingface/llama_index/embeddings/huggingface/base.py +++ b/llama-index-integrations/embeddings/llama-index-embeddings-huggingface/llama_index/embeddings/huggingface/base.py @@ -2,6 +2,7 @@ import logging from typing import Any, Callable, Dict, List, Optional, Sequence, Union +from deprecated import deprecated from huggingface_hub import ( AsyncInferenceClient, InferenceClient, @@ -149,6 +150,10 @@ def _get_text_embeddings(self, texts: List[str]) -> List[List[float]]: return self._embed(texts, prompt_name="text") +@deprecated( + "Deprecated in favor of `HuggingFaceInferenceAPIEmbedding` from `llama-index-embeddings-huggingface-api` which should be used instead.", + action="always", +) class HuggingFaceInferenceAPIEmbedding(BaseEmbedding): # type: ignore[misc] """ Wrapper on the Hugging Face's Inference API for embeddings. diff --git a/llama-index-integrations/llms/llama-index-llms-huggingface-api/.gitignore b/llama-index-integrations/llms/llama-index-llms-huggingface-api/.gitignore new file mode 100644 index 0000000000000..990c18de22908 --- /dev/null +++ b/llama-index-integrations/llms/llama-index-llms-huggingface-api/.gitignore @@ -0,0 +1,153 @@ +llama_index/_static +.DS_Store +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +bin/ +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +etc/ +include/ +lib/ +lib64/ +parts/ +sdist/ +share/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +.ruff_cache + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints +notebooks/ + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ +pyvenv.cfg + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# Jetbrains +.idea +modules/ +*.swp + +# VsCode +.vscode + +# pipenv +Pipfile +Pipfile.lock + +# pyright +pyrightconfig.json diff --git a/llama-index-integrations/llms/llama-index-llms-huggingface-api/BUILD b/llama-index-integrations/llms/llama-index-llms-huggingface-api/BUILD new file mode 100644 index 0000000000000..0896ca890d8bf --- /dev/null +++ b/llama-index-integrations/llms/llama-index-llms-huggingface-api/BUILD @@ -0,0 +1,3 @@ +poetry_requirements( + name="poetry", +) diff --git a/llama-index-integrations/llms/llama-index-llms-huggingface-api/Makefile b/llama-index-integrations/llms/llama-index-llms-huggingface-api/Makefile new file mode 100644 index 0000000000000..b9eab05aa3706 --- /dev/null +++ b/llama-index-integrations/llms/llama-index-llms-huggingface-api/Makefile @@ -0,0 +1,17 @@ +GIT_ROOT ?= $(shell git rev-parse --show-toplevel) + +help: ## Show all Makefile targets. + @grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[33m%-30s\033[0m %s\n", $$1, $$2}' + +format: ## Run code autoformatters (black). + pre-commit install + git ls-files | xargs pre-commit run black --files + +lint: ## Run linters: pre-commit (black, ruff, codespell) and mypy + pre-commit install && git ls-files | xargs pre-commit run --show-diff-on-failure --files + +test: ## Run tests via pytest. + pytest tests + +watch-docs: ## Build and watch documentation. + sphinx-autobuild docs/ docs/_build/html --open-browser --watch $(GIT_ROOT)/llama_index/ diff --git a/llama-index-integrations/llms/llama-index-llms-huggingface-api/README.md b/llama-index-integrations/llms/llama-index-llms-huggingface-api/README.md new file mode 100644 index 0000000000000..ee70d26be2ade --- /dev/null +++ b/llama-index-integrations/llms/llama-index-llms-huggingface-api/README.md @@ -0,0 +1,26 @@ +# LlamaIndex Llms Integration: Huggingface API + +Integration with Hugging Face's Inference API for generating text. 
+ +For more information on Hugging Face's Inference API, visit [Hugging Face's Inference API documentation](https://huggingface.co/docs/api-inference/quicktour). + +## Installation + +```shell +pip install llama-index-llms-huggingface-api +``` + +## Usage + +```python +from llama_index.llms.huggingface_api import HuggingFaceInferenceAPI + +llm = HuggingFaceInferenceAPI( + model_name="openai-community/gpt2", + temperature=0.7, + max_tokens=100, + token="", # Optional +) + +response = llm.complete("Hello, how are you?") +``` diff --git a/llama-index-integrations/llms/llama-index-llms-huggingface-api/llama_index/llms/huggingface_api/BUILD b/llama-index-integrations/llms/llama-index-llms-huggingface-api/llama_index/llms/huggingface_api/BUILD new file mode 100644 index 0000000000000..db46e8d6c978c --- /dev/null +++ b/llama-index-integrations/llms/llama-index-llms-huggingface-api/llama_index/llms/huggingface_api/BUILD @@ -0,0 +1 @@ +python_sources() diff --git a/llama-index-integrations/llms/llama-index-llms-huggingface-api/llama_index/llms/huggingface_api/__init__.py b/llama-index-integrations/llms/llama-index-llms-huggingface-api/llama_index/llms/huggingface_api/__init__.py new file mode 100644 index 0000000000000..7c76b9db76064 --- /dev/null +++ b/llama-index-integrations/llms/llama-index-llms-huggingface-api/llama_index/llms/huggingface_api/__init__.py @@ -0,0 +1,5 @@ +from llama_index.llms.huggingface_api.base import ( + HuggingFaceInferenceAPI, +) + +__all__ = ["HuggingFaceInferenceAPI"] diff --git a/llama-index-integrations/llms/llama-index-llms-huggingface-api/llama_index/llms/huggingface_api/base.py b/llama-index-integrations/llms/llama-index-llms-huggingface-api/llama_index/llms/huggingface_api/base.py new file mode 100644 index 0000000000000..e059fadb50414 --- /dev/null +++ b/llama-index-integrations/llms/llama-index-llms-huggingface-api/llama_index/llms/huggingface_api/base.py @@ -0,0 +1,284 @@ +import logging +from typing import Any, Callable, Dict, Optional, Sequence, Union + +from huggingface_hub import AsyncInferenceClient, InferenceClient, model_info +from huggingface_hub.hf_api import ModelInfo +from huggingface_hub.inference._types import ConversationalOutput +from llama_index.core.base.llms.types import ( + ChatMessage, + ChatResponse, + ChatResponseAsyncGen, + ChatResponseGen, + CompletionResponse, + CompletionResponseAsyncGen, + CompletionResponseGen, + LLMMetadata, + MessageRole, +) +from llama_index.core.bridge.pydantic import Field, PrivateAttr +from llama_index.core.constants import ( + DEFAULT_CONTEXT_WINDOW, + DEFAULT_NUM_OUTPUTS, +) +from llama_index.core.llms.custom import CustomLLM + + +logger = logging.getLogger(__name__) + + +def chat_messages_to_conversational_kwargs( + messages: Sequence[ChatMessage], +) -> Dict[str, Any]: + """Convert ChatMessages to keyword arguments for Inference API conversational.""" + if len(messages) % 2 != 1: + raise NotImplementedError("Messages passed in must be of odd length.") + last_message = messages[-1] + kwargs: Dict[str, Any] = { + "text": last_message.content, + **last_message.additional_kwargs, + } + if len(messages) != 1: + kwargs["past_user_inputs"] = [] + kwargs["generated_responses"] = [] + for user_msg, assistant_msg in zip(messages[::2], messages[1::2]): + if ( + user_msg.role != MessageRole.USER + or assistant_msg.role != MessageRole.ASSISTANT + ): + raise NotImplementedError( + "Didn't handle when messages aren't ordered in alternating" + f" pairs of {(MessageRole.USER, MessageRole.ASSISTANT)}." 
+ ) + kwargs["past_user_inputs"].append(user_msg.content) + kwargs["generated_responses"].append(assistant_msg.content) + return kwargs + + +class HuggingFaceInferenceAPI(CustomLLM): + """ + Wrapper on the Hugging Face's Inference API. + + Overview of the design: + - Synchronous uses InferenceClient, asynchronous uses AsyncInferenceClient + - chat uses the conversational task: https://huggingface.co/tasks/conversational + - complete uses the text generation task: https://huggingface.co/tasks/text-generation + + Note: some models that support the text generation task can leverage Hugging + Face's optimized deployment toolkit called text-generation-inference (TGI). + Use InferenceClient.get_model_status to check if TGI is being used. + + Relevant links: + - General Docs: https://huggingface.co/docs/api-inference/index + - API Docs: https://huggingface.co/docs/huggingface_hub/main/en/package_reference/inference_client + - Source: https://github.com/huggingface/huggingface_hub/tree/main/src/huggingface_hub/inference + """ + + @classmethod + def class_name(cls) -> str: + return "HuggingFaceInferenceAPI" + + # Corresponds with huggingface_hub.InferenceClient + model_name: Optional[str] = Field( + default=None, + description=( + "The model to run inference with. Can be a model id hosted on the Hugging" + " Face Hub, e.g. bigcode/starcoder or a URL to a deployed Inference" + " Endpoint. Defaults to None, in which case a recommended model is" + " automatically selected for the task (see Field below)." + ), + ) + token: Union[str, bool, None] = Field( + default=None, + description=( + "Hugging Face token. Will default to the locally saved token. Pass " + "token=False if you don’t want to send your token to the server." + ), + ) + timeout: Optional[float] = Field( + default=None, + description=( + "The maximum number of seconds to wait for a response from the server." + " Loading a new model in Inference API can take up to several minutes." + " Defaults to None, meaning it will loop until the server is available." + ), + ) + headers: Dict[str, str] = Field( + default=None, + description=( + "Additional headers to send to the server. By default only the" + " authorization and user-agent headers are sent. Values in this dictionary" + " will override the default values." + ), + ) + cookies: Dict[str, str] = Field( + default=None, description="Additional cookies to send to the server." + ) + task: Optional[str] = Field( + default=None, + description=( + "Optional task to pick Hugging Face's recommended model, used when" + " model_name is left as default of None." + ), + ) + + _sync_client: "InferenceClient" = PrivateAttr() + _async_client: "AsyncInferenceClient" = PrivateAttr() + _get_model_info: "Callable[..., ModelInfo]" = PrivateAttr() + + context_window: int = Field( + default=DEFAULT_CONTEXT_WINDOW, + description=( + LLMMetadata.__fields__["context_window"].field_info.description + + " This may be looked up in a model's `config.json`." + ), + ) + num_output: int = Field( + default=DEFAULT_NUM_OUTPUTS, + description=LLMMetadata.__fields__["num_output"].field_info.description, + ) + is_chat_model: bool = Field( + default=False, + description=( + LLMMetadata.__fields__["is_chat_model"].field_info.description + + " Unless chat templating is intentionally applied, Hugging Face models" + " are not chat models." 
+ ), + ) + is_function_calling_model: bool = Field( + default=False, + description=( + LLMMetadata.__fields__["is_function_calling_model"].field_info.description + + " As of 10/17/2023, Hugging Face doesn't support function calling" + " messages." + ), + ) + + def _get_inference_client_kwargs(self) -> Dict[str, Any]: + """Extract the Hugging Face InferenceClient construction parameters.""" + return { + "model": self.model_name, + "token": self.token, + "timeout": self.timeout, + "headers": self.headers, + "cookies": self.cookies, + } + + def __init__(self, **kwargs: Any) -> None: + """Initialize. + + Args: + kwargs: See the class-level Fields. + """ + if kwargs.get("model_name") is None: + task = kwargs.get("task", "") + # NOTE: task being None or empty string leads to ValueError, + # which ensures model is present + kwargs["model_name"] = InferenceClient.get_recommended_model(task=task) + logger.debug( + f"Using Hugging Face's recommended model {kwargs['model_name']}" + f" given task {task}." + ) + if kwargs.get("task") is None: + task = "conversational" + else: + task = kwargs["task"].lower() + + super().__init__(**kwargs) # Populate pydantic Fields + self._sync_client = InferenceClient(**self._get_inference_client_kwargs()) + self._async_client = AsyncInferenceClient(**self._get_inference_client_kwargs()) + self._get_model_info = model_info + + def validate_supported(self, task: str) -> None: + """ + Confirm the contained model_name is deployed on the Inference API service. + + Args: + task: Hugging Face task to check within. A list of all tasks can be + found here: https://huggingface.co/tasks + """ + all_models = self._sync_client.list_deployed_models(frameworks="all") + try: + if self.model_name not in all_models[task]: + raise ValueError( + "The Inference API service doesn't have the model" + f" {self.model_name!r} deployed." + ) + except KeyError as exc: + raise KeyError( + f"Input task {task!r} not in possible tasks {list(all_models.keys())}." 
+ ) from exc + + def get_model_info(self, **kwargs: Any) -> "ModelInfo": + """Get metadata on the current model from Hugging Face.""" + return self._get_model_info(self.model_name, **kwargs) + + @property + def metadata(self) -> LLMMetadata: + return LLMMetadata( + context_window=self.context_window, + num_output=self.num_output, + is_chat_model=self.is_chat_model, + is_function_calling_model=self.is_function_calling_model, + model_name=self.model_name, + ) + + def chat(self, messages: Sequence[ChatMessage], **kwargs: Any) -> ChatResponse: + # default to conversational task as that was the previous functionality + if self.task == "conversational" or self.task is None: + output: "ConversationalOutput" = self._sync_client.conversational( + **{**chat_messages_to_conversational_kwargs(messages), **kwargs} + ) + return ChatResponse( + message=ChatMessage( + role=MessageRole.ASSISTANT, content=output["generated_text"] + ) + ) + else: + # try and use text generation + prompt = self.messages_to_prompt(messages) + completion = self.complete(prompt) + return ChatResponse( + message=ChatMessage(role=MessageRole.ASSISTANT, content=completion.text) + ) + + def complete( + self, prompt: str, formatted: bool = False, **kwargs: Any + ) -> CompletionResponse: + return CompletionResponse( + text=self._sync_client.text_generation( + prompt, **{**{"max_new_tokens": self.num_output}, **kwargs} + ) + ) + + def stream_chat( + self, messages: Sequence[ChatMessage], **kwargs: Any + ) -> ChatResponseGen: + raise NotImplementedError + + def stream_complete( + self, prompt: str, formatted: bool = False, **kwargs: Any + ) -> CompletionResponseGen: + raise NotImplementedError + + async def achat( + self, messages: Sequence[ChatMessage], **kwargs: Any + ) -> ChatResponse: + raise NotImplementedError + + async def acomplete( + self, prompt: str, formatted: bool = False, **kwargs: Any + ) -> CompletionResponse: + response = await self._async_client.text_generation( + prompt, **{**{"max_new_tokens": self.num_output}, **kwargs} + ) + return CompletionResponse(text=response) + + async def astream_chat( + self, messages: Sequence[ChatMessage], **kwargs: Any + ) -> ChatResponseAsyncGen: + raise NotImplementedError + + async def astream_complete( + self, prompt: str, formatted: bool = False, **kwargs: Any + ) -> CompletionResponseAsyncGen: + raise NotImplementedError diff --git a/llama-index-integrations/llms/llama-index-llms-huggingface-api/pyproject.toml b/llama-index-integrations/llms/llama-index-llms-huggingface-api/pyproject.toml new file mode 100644 index 0000000000000..6e0a4b9d4c608 --- /dev/null +++ b/llama-index-integrations/llms/llama-index-llms-huggingface-api/pyproject.toml @@ -0,0 +1,63 @@ +[build-system] +build-backend = "poetry.core.masonry.api" +requires = ["poetry-core"] + +[tool.codespell] +check-filenames = true +check-hidden = true +skip = "*.csv,*.html,*.json,*.jsonl,*.pdf,*.txt,*.ipynb" + +[tool.llamahub] +contains_example = false +import_path = "llama_index.llms.huggingface_api" + +[tool.llamahub.class_authors] +HuggingFaceInferenceAPI = "llama-index" + +[tool.mypy] +disallow_untyped_defs = true +exclude = ["_static", "build", "examples", "notebooks", "venv"] +ignore_missing_imports = true +python_version = "3.8" + +[tool.poetry] +authors = ["Your Name "] +description = "llama-index llms huggingface api integration" +exclude = ["**/BUILD"] +license = "MIT" +name = "llama-index-llms-huggingface-api" +readme = "README.md" +version = "0.1.0" + +[tool.poetry.dependencies] +python = ">=3.8.1,<4.0" 
+llama-index-core = "^0.10.41" +huggingface-hub = "^0.23.0" + +[tool.poetry.group.dev.dependencies] +ipython = "8.10.0" +jupyter = "^1.0.0" +mypy = "0.991" +pre-commit = "3.2.0" +pylint = "2.15.10" +pytest = "7.2.1" +pytest-mock = "3.11.1" +ruff = "0.0.292" +tree-sitter-languages = "^1.8.0" +types-Deprecated = ">=0.1.0" +types-PyYAML = "^6.0.12.12" +types-protobuf = "^4.24.0.4" +types-redis = "4.5.5.0" +types-requests = "2.28.11.8" +types-setuptools = "67.1.0.0" + +[tool.poetry.group.dev.dependencies.black] +extras = ["jupyter"] +version = "<=23.9.1,>=23.7.0" + +[tool.poetry.group.dev.dependencies.codespell] +extras = ["toml"] +version = ">=v2.2.6" + +[[tool.poetry.packages]] +include = "llama_index/" diff --git a/llama-index-integrations/llms/llama-index-llms-huggingface-api/tests/BUILD b/llama-index-integrations/llms/llama-index-llms-huggingface-api/tests/BUILD new file mode 100644 index 0000000000000..dabf212d7e716 --- /dev/null +++ b/llama-index-integrations/llms/llama-index-llms-huggingface-api/tests/BUILD @@ -0,0 +1 @@ +python_tests() diff --git a/llama-index-integrations/llms/llama-index-llms-huggingface-api/tests/__init__.py b/llama-index-integrations/llms/llama-index-llms-huggingface-api/tests/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/llama-index-integrations/llms/llama-index-llms-huggingface-api/tests/test_huggingface_api.py b/llama-index-integrations/llms/llama-index-llms-huggingface-api/tests/test_huggingface_api.py new file mode 100644 index 0000000000000..d5a331c86f9bf --- /dev/null +++ b/llama-index-integrations/llms/llama-index-llms-huggingface-api/tests/test_huggingface_api.py @@ -0,0 +1,115 @@ +from unittest.mock import MagicMock, patch + +import pytest +from llama_index.core.llms import ChatMessage, MessageRole +from llama_index.llms.huggingface_api import HuggingFaceInferenceAPI + +STUB_MODEL_NAME = "placeholder_model" + + +@pytest.fixture(name="hf_inference_api") +def fixture_hf_inference_api() -> HuggingFaceInferenceAPI: + with patch.dict("sys.modules", huggingface_hub=MagicMock()): + return HuggingFaceInferenceAPI(model_name=STUB_MODEL_NAME) + + +class TestHuggingFaceInferenceAPI: + def test_class_name(self, hf_inference_api: HuggingFaceInferenceAPI) -> None: + assert HuggingFaceInferenceAPI.class_name() == HuggingFaceInferenceAPI.__name__ + assert hf_inference_api.class_name() == HuggingFaceInferenceAPI.__name__ + + def test_instantiation(self) -> None: + mock_hub = MagicMock() + with patch.dict("sys.modules", huggingface_hub=mock_hub): + llm = HuggingFaceInferenceAPI(model_name=STUB_MODEL_NAME) + + assert llm.model_name == STUB_MODEL_NAME + + # Check can be both a large language model and an embedding model + assert isinstance(llm, HuggingFaceInferenceAPI) + + # Confirm Clients are instantiated correctly + # mock_hub.InferenceClient.assert_called_once_with( + # model=STUB_MODEL_NAME, token=None, timeout=None, headers=None, cookies=None + # ) + # mock_hub.AsyncInferenceClient.assert_called_once_with( + # model=STUB_MODEL_NAME, token=None, timeout=None, headers=None, cookies=None + # ) + + def test_chat(self, hf_inference_api: HuggingFaceInferenceAPI) -> None: + messages = [ + ChatMessage(content="Which movie is the best?"), + ChatMessage(content="It's Die Hard for sure.", role=MessageRole.ASSISTANT), + ChatMessage(content="Can you explain why?"), + ] + generated_response = ( + " It's based on the book of the same name by James Fenimore Cooper." 
+ ) + conversational_return = { + "generated_text": generated_response, + "conversation": { + "generated_responses": ["It's Die Hard for sure.", generated_response], + "past_user_inputs": [ + "Which movie is the best?", + "Can you explain why?", + ], + }, + } + + with patch.object( + hf_inference_api._sync_client, + "conversational", + return_value=conversational_return, + ) as mock_conversational: + response = hf_inference_api.chat(messages=messages) + + assert response.message.role == MessageRole.ASSISTANT + assert response.message.content == generated_response + mock_conversational.assert_called_once_with( + text="Can you explain why?", + past_user_inputs=["Which movie is the best?"], + generated_responses=["It's Die Hard for sure."], + ) + + def test_chat_text_generation( + self, hf_inference_api: HuggingFaceInferenceAPI + ) -> None: + mock_message_to_prompt = MagicMock( + return_value="System: You are an expert movie reviewer\nUser: Which movie is the best?\nAssistant:" + ) + hf_inference_api.task = "text-generation" + hf_inference_api.messages_to_prompt = mock_message_to_prompt + messages = [ + ChatMessage( + role=MessageRole.SYSTEM, content="You are an expert movie reviewer" + ), + ChatMessage(role=MessageRole.USER, content="Which movie is the best?"), + ] + conversational_return = "It's Die Hard for sure." + + with patch.object( + hf_inference_api._sync_client, + "text_generation", + return_value=conversational_return, + ) as mock_complete: + response = hf_inference_api.chat(messages=messages) + + hf_inference_api.messages_to_prompt.assert_called_once_with(messages) + assert response.message.role == MessageRole.ASSISTANT + assert response.message.content == conversational_return + mock_complete.assert_called_once_with( + "System: You are an expert movie reviewer\nUser: Which movie is the best?\nAssistant:", + max_new_tokens=256, + ) + + def test_complete(self, hf_inference_api: HuggingFaceInferenceAPI) -> None: + prompt = "My favorite color is " + generated_text = '"green" and I love to paint. 
I have been painting for 30 years and have been' + with patch.object( + hf_inference_api._sync_client, + "text_generation", + return_value=generated_text, + ) as mock_text_generation: + response = hf_inference_api.complete(prompt) + mock_text_generation.assert_called_once_with(prompt, max_new_tokens=256) + assert response.text == generated_text diff --git a/llama-index-integrations/llms/llama-index-llms-huggingface-api/tests/test_llms_huggingface_api.py b/llama-index-integrations/llms/llama-index-llms-huggingface-api/tests/test_llms_huggingface_api.py new file mode 100644 index 0000000000000..14d71c5cc7461 --- /dev/null +++ b/llama-index-integrations/llms/llama-index-llms-huggingface-api/tests/test_llms_huggingface_api.py @@ -0,0 +1,7 @@ +from llama_index.core.base.llms.base import BaseLLM +from llama_index.llms.huggingface_api import HuggingFaceInferenceAPI + + +def test_embedding_class(): + names_of_base_classes = [b.__name__ for b in HuggingFaceInferenceAPI.__mro__] + assert BaseLLM.__name__ in names_of_base_classes diff --git a/llama-index-integrations/llms/llama-index-llms-huggingface/llama_index/llms/huggingface/base.py b/llama-index-integrations/llms/llama-index-llms-huggingface/llama_index/llms/huggingface/base.py index 55be8fdc0dfe1..f206f5a260025 100644 --- a/llama-index-integrations/llms/llama-index-llms-huggingface/llama_index/llms/huggingface/base.py +++ b/llama-index-integrations/llms/llama-index-llms-huggingface/llama_index/llms/huggingface/base.py @@ -1,5 +1,6 @@ import logging from typing import Any, Callable, Dict, List, Optional, Sequence, Union +from deprecated import deprecated import torch from huggingface_hub import AsyncInferenceClient, InferenceClient, model_info @@ -455,6 +456,10 @@ def chat_messages_to_conversational_kwargs( return kwargs +@deprecated( + "Deprecated in favor of `HuggingFaceInferenceAPI` from `llama-index-llms-huggingface-api` which should be used instead.", + action="always", +) class HuggingFaceInferenceAPI(CustomLLM): """ Wrapper on the Hugging Face's Inference API. @@ -685,6 +690,10 @@ async def astream_complete( raise NotImplementedError +@deprecated( + "Deprecated in favor of `TextGenerationInference` from `llama-index-llms-text-generation-inference` which should be used instead.", + action="always", +) class TextGenerationInference(FunctionCallingLLM): model_name: Optional[str] = Field( default=None, diff --git a/llama-index-integrations/llms/llama-index-llms-text-generation-inference/.gitignore b/llama-index-integrations/llms/llama-index-llms-text-generation-inference/.gitignore new file mode 100644 index 0000000000000..990c18de22908 --- /dev/null +++ b/llama-index-integrations/llms/llama-index-llms-text-generation-inference/.gitignore @@ -0,0 +1,153 @@ +llama_index/_static +.DS_Store +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +bin/ +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +etc/ +include/ +lib/ +lib64/ +parts/ +sdist/ +share/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +.ruff_cache + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints +notebooks/ + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ +pyvenv.cfg + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# Jetbrains +.idea +modules/ +*.swp + +# VsCode +.vscode + +# pipenv +Pipfile +Pipfile.lock + +# pyright +pyrightconfig.json diff --git a/llama-index-integrations/llms/llama-index-llms-text-generation-inference/BUILD b/llama-index-integrations/llms/llama-index-llms-text-generation-inference/BUILD new file mode 100644 index 0000000000000..0896ca890d8bf --- /dev/null +++ b/llama-index-integrations/llms/llama-index-llms-text-generation-inference/BUILD @@ -0,0 +1,3 @@ +poetry_requirements( + name="poetry", +) diff --git a/llama-index-integrations/llms/llama-index-llms-text-generation-inference/Makefile b/llama-index-integrations/llms/llama-index-llms-text-generation-inference/Makefile new file mode 100644 index 0000000000000..b9eab05aa3706 --- /dev/null +++ b/llama-index-integrations/llms/llama-index-llms-text-generation-inference/Makefile @@ -0,0 +1,17 @@ +GIT_ROOT ?= $(shell git rev-parse --show-toplevel) + +help: ## Show all Makefile targets. + @grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[33m%-30s\033[0m %s\n", $$1, $$2}' + +format: ## Run code autoformatters (black). + pre-commit install + git ls-files | xargs pre-commit run black --files + +lint: ## Run linters: pre-commit (black, ruff, codespell) and mypy + pre-commit install && git ls-files | xargs pre-commit run --show-diff-on-failure --files + +test: ## Run tests via pytest. + pytest tests + +watch-docs: ## Build and watch documentation. 
+	sphinx-autobuild docs/ docs/_build/html --open-browser --watch $(GIT_ROOT)/llama_index/
diff --git a/llama-index-integrations/llms/llama-index-llms-text-generation-inference/README.md b/llama-index-integrations/llms/llama-index-llms-text-generation-inference/README.md
new file mode 100644
index 0000000000000..3e7238b7ebe98
--- /dev/null
+++ b/llama-index-integrations/llms/llama-index-llms-text-generation-inference/README.md
@@ -0,0 +1,25 @@
+# LlamaIndex Llms Integration: Text Generation Inference
+
+Integration with [Text Generation Inference](https://huggingface.co/docs/text-generation-inference) from Hugging Face to generate text.
+
+## Installation
+
+```shell
+pip install llama-index-llms-text-generation-inference
+```
+
+## Usage
+
+```python
+from llama_index.llms.text_generation_inference import TextGenerationInference
+
+llm = TextGenerationInference(
+    model_url="http://localhost:8080", # URL of your running TGI server
+    model_name="openai-community/gpt2",
+    temperature=0.7,
+    max_tokens=100,
+    token="", # Optional
+)
+
+response = llm.complete("Hello, how are you?")
+```
diff --git a/llama-index-integrations/llms/llama-index-llms-text-generation-inference/llama_index/llms/text_generation_inference/BUILD b/llama-index-integrations/llms/llama-index-llms-text-generation-inference/llama_index/llms/text_generation_inference/BUILD
new file mode 100644
index 0000000000000..db46e8d6c978c
--- /dev/null
+++ b/llama-index-integrations/llms/llama-index-llms-text-generation-inference/llama_index/llms/text_generation_inference/BUILD
@@ -0,0 +1 @@
+python_sources()
diff --git a/llama-index-integrations/llms/llama-index-llms-text-generation-inference/llama_index/llms/text_generation_inference/__init__.py b/llama-index-integrations/llms/llama-index-llms-text-generation-inference/llama_index/llms/text_generation_inference/__init__.py
new file mode 100644
index 0000000000000..156730ba0dcb4
--- /dev/null
+++ b/llama-index-integrations/llms/llama-index-llms-text-generation-inference/llama_index/llms/text_generation_inference/__init__.py
@@ -0,0 +1,5 @@
+from llama_index.llms.text_generation_inference.base import (
+    TextGenerationInference,
+)
+
+__all__ = ["TextGenerationInference"]
diff --git a/llama-index-integrations/llms/llama-index-llms-text-generation-inference/llama_index/llms/text_generation_inference/base.py b/llama-index-integrations/llms/llama-index-llms-text-generation-inference/llama_index/llms/text_generation_inference/base.py
new file mode 100644
index 0000000000000..99d7dba8e0382
--- /dev/null
+++ b/llama-index-integrations/llms/llama-index-llms-text-generation-inference/llama_index/llms/text_generation_inference/base.py
@@ -0,0 +1,445 @@
+import logging
+from typing import Any, Callable, Dict, List, Optional, Sequence, Union
+
+from llama_index.core.base.llms.types import (
+    ChatMessage,
+    ChatResponse,
+    ChatResponseAsyncGen,
+    ChatResponseGen,
+    CompletionResponse,
+    CompletionResponseAsyncGen,
+    CompletionResponseGen,
+    LLMMetadata,
+    MessageRole,
+)
+from llama_index.core.bridge.pydantic import Field, PrivateAttr
+from llama_index.core.callbacks import CallbackManager
+from llama_index.core.constants import (
+    DEFAULT_TEMPERATURE,
+    DEFAULT_CONTEXT_WINDOW,
+    DEFAULT_NUM_OUTPUTS,
+)
+from llama_index.core.llms.callbacks import (
+    llm_chat_callback,
+    llm_completion_callback,
+)
+from llama_index.core.llms.llm import ToolSelection
+from llama_index.core.llms.function_calling import FunctionCallingLLM
+from llama_index.core.base.llms.generic_utils import (
+    chat_to_completion_decorator,
+    achat_to_completion_decorator,
+    
stream_chat_to_completion_decorator, + astream_chat_to_completion_decorator, + get_from_param_or_env, +) +from llama_index.core.types import BaseOutputParser, PydanticProgramMode +from llama_index.core.chat_engine.types import AgentChatResponse +from llama_index.core.tools.types import BaseTool +from llama_index.llms.text_generation_inference.utils import ( + to_tgi_messages, + force_single_tool_call, + resolve_tgi_function_call, + get_max_input_length, + resolve_tool_choice, +) +from text_generation import ( + Client as TGIClient, + AsyncClient as TGIAsyncClient, +) + +logger = logging.getLogger(__name__) + + +class TextGenerationInference(FunctionCallingLLM): + model_name: Optional[str] = Field( + default=None, + description=("The name of the model served at the TGI endpoint"), + ) + temperature: float = Field( + default=DEFAULT_TEMPERATURE, + description=("The temperature to use for sampling."), + gte=0.0, + lte=1.0, + ) + max_tokens: int = Field( + default=DEFAULT_NUM_OUTPUTS, + description=("The maximum number of tokens to generate."), + gt=0, + ) + token: Union[str, bool, None] = Field( + default=None, + description=( + "Hugging Face token. Will default to the locally saved token. Pass " + "token=False if you don’t want to send your token to the server." + ), + ) + timeout: float = Field( + default=120, description=("The timeout to use in seconds."), gte=0 + ) + max_retries: int = Field( + default=5, description=("The maximum number of API retries."), gte=0 + ) + headers: Optional[Dict[str, str]] = Field( + default=None, + description=( + "Additional headers to send to the server. By default only the" + " authorization headers are sent. Values in this dictionary" + " will override the default values." + ), + ) + cookies: Optional[Dict[str, str]] = Field( + default=None, description=("Additional cookies to send to the server.") + ) + seed: Optional[str] = Field( + default=None, description=("The random seed to use for sampling.") + ) + additional_kwargs: Dict[str, Any] = Field( + default_factory=dict, description=("Additional kwargs for the TGI API.") + ) + + _sync_client: "TGIClient" = PrivateAttr() + _async_client: "TGIAsyncClient" = PrivateAttr() + + context_window: int = Field( + default=DEFAULT_CONTEXT_WINDOW, + description=("Maximum input length in tokens returned from TGI endpoint"), + ) + is_chat_model: bool = Field( + default=True, + description=( + LLMMetadata.__fields__["is_chat_model"].field_info.description + + " TGI makes use of chat templating," + " function call is available only for '/v1/chat/completions' route" + " of TGI endpoint" + ), + ) + is_function_calling_model: bool = Field( + default=False, + description=( + LLMMetadata.__fields__["is_function_calling_model"].field_info.description + + " 'text-generation-inference' supports function call" + " starting from v1.4.3" + ), + ) + + def __init__( + self, + model_url, + model_name: Optional[str] = None, + cookies: Optional[dict] = None, + temperature: float = DEFAULT_TEMPERATURE, + max_tokens: int = DEFAULT_NUM_OUTPUTS, + timeout: int = 120, + max_retries: int = 5, + seed: Optional[int] = None, + token: Optional[str] = None, + additional_kwargs: Optional[Dict[str, Any]] = None, + callback_manager: Optional[CallbackManager] = None, + system_prompt: Optional[str] = None, + messages_to_prompt: Optional[Callable[[Sequence[ChatMessage]], str]] = None, + completion_to_prompt: Optional[Callable[[str], str]] = None, + pydantic_program_mode: PydanticProgramMode = PydanticProgramMode.DEFAULT, + output_parser: 
Optional[BaseOutputParser] = None,
+    ) -> None:
+        additional_kwargs = additional_kwargs or {}
+        callback_manager = callback_manager or CallbackManager([])
+
+        token = get_from_param_or_env("token", token, "HF_TOKEN", "")
+
+        headers = {}
+        if token:
+            headers.update({"Authorization": f"Bearer {token}"})
+
+        self._sync_client = TGIClient(
+            base_url=model_url,
+            headers=headers,
+            cookies=cookies,
+            timeout=timeout,
+        )
+        self._async_client = TGIAsyncClient(
+            base_url=model_url,
+            headers=headers,
+            cookies=cookies,
+            timeout=timeout,
+        )
+
+        try:
+            is_function_calling_model = resolve_tgi_function_call(model_url)
+        except Exception as e:
+            logger.warning(f"TGI client has no function call support: {e}")
+            is_function_calling_model = False
+
+        context_window = get_max_input_length(model_url) or DEFAULT_CONTEXT_WINDOW
+
+        super().__init__(
+            context_window=context_window,
+            temperature=temperature,
+            max_tokens=max_tokens,
+            additional_kwargs=additional_kwargs,
+            timeout=timeout,
+            max_retries=max_retries,
+            seed=seed,
+            model_name=model_name,
+            is_function_calling_model=is_function_calling_model,
+            callback_manager=callback_manager,
+            system_prompt=system_prompt,
+            messages_to_prompt=messages_to_prompt,
+            completion_to_prompt=completion_to_prompt,
+            pydantic_program_mode=pydantic_program_mode,
+            output_parser=output_parser,
+        )
+
+    @classmethod
+    def class_name(cls) -> str:
+        return "TextGenerationInference"
+
+    @property
+    def metadata(self) -> LLMMetadata:
+        return LLMMetadata(
+            context_window=self.context_window,
+            num_output=self.max_tokens,
+            is_chat_model=True,
+            model_name=self.model_name,
+            random_seed=self.seed,
+            is_function_calling_model=self.is_function_calling_model,
+        )
+
+    @property
+    def _model_kwargs(self) -> Dict[str, Any]:
+        base_kwargs = {
+            "temperature": self.temperature,
+            "max_tokens": self.max_tokens,
+            "seed": self.seed,
+        }
+        return {
+            **base_kwargs,
+            **self.additional_kwargs,
+        }
+
+    def _get_all_kwargs(self, **kwargs: Any) -> Dict[str, Any]:
+        return {
+            **self._model_kwargs,
+            **kwargs,
+        }
+
+    @llm_chat_callback()
+    def chat(self, messages: Sequence[ChatMessage], **kwargs: Any) -> ChatResponse:
+        # convert to TGI Message
+        messages = to_tgi_messages(messages)
+        all_kwargs = self._get_all_kwargs(**kwargs)
+        response = self._sync_client.chat(messages=messages, **all_kwargs)
+        tool_calls = response.choices[0].message.tool_calls
+
+        return ChatResponse(
+            message=ChatMessage(
+                role=MessageRole.ASSISTANT,
+                content=response.choices[0].message.content,
+                additional_kwargs=(
+                    {"tool_calls": tool_calls} if tool_calls is not None else {}
+                ),
+            ),
+            raw=dict(response),
+        )
+
+    @llm_completion_callback()
+    def complete(
+        self, prompt: str, formatted: bool = False, **kwargs: Any
+    ) -> CompletionResponse:
+        complete_fn = chat_to_completion_decorator(self.chat)
+        return complete_fn(prompt, **kwargs)
+
+    @llm_chat_callback()
+    def stream_chat(
+        self, messages: Sequence[ChatMessage], **kwargs: Any
+    ) -> ChatResponseGen:
+        # convert to TGI Message
+        messages = to_tgi_messages(messages)
+        all_kwargs = self._get_all_kwargs(**kwargs)
+        response = self._sync_client.chat(messages=messages, stream=True, **all_kwargs)
+
+        def generator() -> ChatResponseGen:
+            content = ""
+            role = MessageRole.ASSISTANT
+            for chunk in response:
+                content_delta = chunk.choices[0].delta.content
+                if content_delta is None:
+                    continue
+                content += content_delta
+                yield ChatResponse(
+                    message=ChatMessage(role=role, content=content),
+                    delta=content_delta,
+                    raw=chunk,
+                )
+
+        return generator()
+
+    
@llm_completion_callback() + def stream_complete( + self, prompt: str, formatted: bool = False, **kwargs: Any + ) -> CompletionResponseGen: + stream_complete_fn = stream_chat_to_completion_decorator(self.stream_chat) + return stream_complete_fn(prompt, **kwargs) + + @llm_chat_callback() + async def achat( + self, messages: Sequence[ChatMessage], **kwargs: Any + ) -> ChatResponse: + # convert to TGI Message + messages = to_tgi_messages(messages) + all_kwargs = self._get_all_kwargs(**kwargs) + response = await self._async_client.chat(messages=messages, **all_kwargs) + tool_calls = response.choices[0].message.tool_calls + + return ChatResponse( + message=ChatMessage( + role=MessageRole.ASSISTANT, + content=response.choices[0].message.content, + additional_kwargs=( + {"tool_calls": tool_calls} if tool_calls is not None else {} + ), + ), + raw=dict(response), + ) + + @llm_completion_callback() + async def acomplete( + self, prompt: str, formatted: bool = False, **kwargs: Any + ) -> CompletionResponse: + acomplete_fn = achat_to_completion_decorator(self.achat) + return await acomplete_fn(prompt, **kwargs) + + @llm_chat_callback() + async def astream_chat( + self, messages: Sequence[ChatMessage], **kwargs: Any + ) -> ChatResponseAsyncGen: + # convert to TGI Message + messages = to_tgi_messages(messages) + all_kwargs = self._get_all_kwargs(**kwargs) + response = await self._async_client.chat( + messages=messages, stream=True, **all_kwargs + ) + + async def generator() -> ChatResponseAsyncGen: + content = "" + role = MessageRole.ASSISTANT + async for chunk in response: + content_delta = chunk.choices[0].delta.content + if content_delta is None: + continue + content += content_delta + yield ChatResponse( + message=ChatMessage(role=role, content=content), + delta=content_delta, + raw=chunk, + ) + + return generator() + + @llm_completion_callback() + async def astream_complete( + self, prompt: str, formatted: bool = False, **kwargs: Any + ) -> CompletionResponseAsyncGen: + astream_complete_fn = astream_chat_to_completion_decorator(self.astream_chat) + return await astream_complete_fn(prompt, **kwargs) + + def chat_with_tools( + self, + tools: List["BaseTool"], + user_msg: Optional[Union[str, ChatMessage]] = None, + chat_history: Optional[List[ChatMessage]] = None, + verbose: bool = False, + allow_parallel_tool_calls: bool = False, + tool_choice: str = "auto", + **kwargs: Any, + ) -> ChatResponse: + """Predict and call the tool.""" + # use openai tool format + tool_specs = [ + tool.metadata.to_openai_tool(skip_length_check=True) for tool in tools + ] + + if isinstance(user_msg, str): + user_msg = ChatMessage(role=MessageRole.USER, content=user_msg) + + messages = chat_history or [] + if user_msg: + messages.append(user_msg) + + response = self.chat( + messages=messages, + tools=tool_specs, + tool_choice=resolve_tool_choice(tool_specs, tool_choice), + **kwargs, + ) + if not allow_parallel_tool_calls: + force_single_tool_call(response) + return response + + async def achat_with_tools( + self, + tools: List["BaseTool"], + user_msg: Optional[Union[str, ChatMessage]] = None, + chat_history: Optional[List[ChatMessage]] = None, + verbose: bool = False, + allow_parallel_tool_calls: bool = False, + tool_choice: str = "auto", + **kwargs: Any, + ) -> ChatResponse: + # use openai tool format + tool_specs = [ + tool.metadata.to_openai_tool(skip_length_check=True) for tool in tools + ] + + if isinstance(user_msg, str): + user_msg = ChatMessage(role=MessageRole.USER, content=user_msg) + + messages = chat_history or 
[]
+        if user_msg:
+            messages.append(user_msg)
+
+        response = await self.achat(
+            messages=messages,
+            tools=tool_specs,
+            tool_choice=resolve_tool_choice(tool_specs, tool_choice),
+            **kwargs,
+        )
+        if not allow_parallel_tool_calls:
+            force_single_tool_call(response)
+        return response
+
+    def get_tool_calls_from_response(
+        self,
+        response: "AgentChatResponse",
+        error_on_no_tool_call: bool = True,
+    ) -> List[ToolSelection]:
+        """Extract tool calls from the chat response."""
+        tool_calls = response.message.additional_kwargs.get("tool_calls", [])
+
+        if len(tool_calls) < 1:
+            if error_on_no_tool_call:
+                raise ValueError(
+                    f"Expected at least one tool call, but got {len(tool_calls)} tool calls."
+                )
+            else:
+                return []
+
+        tool_selections = []
+        for tool_call in tool_calls:
+            # TODO Add typecheck with ToolCall from TGI once the client is updated
+            if tool_call and (tc_type := tool_call["type"]) != "function":
+                raise ValueError(
+                    f"Invalid tool type: got {tc_type}, expect 'function'."
+                )
+            argument_dict = tool_call["function"]["parameters"]
+
+            tool_selections.append(
+                ToolSelection(
+                    tool_id=tool_call["id"],
+                    tool_name=tool_call["function"][
+                        "name"
+                    ], # NOTE for now the tool_name is hardcoded 'tools' in TGI
+                    tool_kwargs=argument_dict,
+                )
+            )
+
+        return tool_selections
diff --git a/llama-index-integrations/llms/llama-index-llms-text-generation-inference/llama_index/llms/text_generation_inference/utils.py b/llama-index-integrations/llms/llama-index-llms-text-generation-inference/llama_index/llms/text_generation_inference/utils.py
new file mode 100644
index 0000000000000..71843873d1e09
--- /dev/null
+++ b/llama-index-integrations/llms/llama-index-llms-text-generation-inference/llama_index/llms/text_generation_inference/utils.py
@@ -0,0 +1,66 @@
+import requests
+from packaging import version
+from typing import Sequence, Union, List, Optional
+from llama_index.core.base.llms.types import (
+    ChatMessage,
+    ChatResponse,
+)
+from text_generation.types import (
+    Message,
+)
+
+
+def resolve_tgi_function_call(url: str) -> bool:
+    url = f"{url}/info"
+    model_info = dict(requests.get(url).json())
+    tgi_version = model_info.get("version", None)
+    if version.parse(tgi_version) >= version.parse("2.0.1"):
+        return True
+    else:
+        raise ValueError(
+            "'text-generation-inference' version "
+            f"incompatible with function call: {tgi_version}. "
+            "Function call support was added in v2.0.1"
+        )
+
+
+def get_max_input_length(url: str) -> Union[int, None]:
+    url = f"{url}/info"
+    model_info = dict(requests.get(url).json())
+    return model_info.get("max_input_length", None)
+
+
+def to_tgi_messages(messages: Sequence[ChatMessage]) -> Sequence[Message]:
+    out_messages = []
+    for m in messages:
+        tool_calls = m.additional_kwargs.get("tool_calls")
+        out_messages.append(
+            Message(role=m.role.value, content=m.content, tool_calls=tool_calls)
+        )
+
+    return out_messages
+
+
+def force_single_tool_call(response: ChatResponse) -> None:
+    tool_calls = response.message.additional_kwargs.get("tool_calls", [])
+    if len(tool_calls) > 1:
+        response.message.additional_kwargs["tool_calls"] = [tool_calls[0]]
+
+
+def resolve_tool_choice(
+    tools: Optional[List[dict]] = None, tool_choice: str = "none"
+) -> Union[str, dict]:
+    """Resolve tool choice.
+
+    Check if tool_name exists in tools.
+    Note that unlike in OpenAI specification, 'auto' will ALWAYS choose the tool for you.
+    Set to 'none' explicitly if you do not wish to use the tool.
+ """ + valid_tool_choices = ["none", "auto"] + [t["function"]["name"] for t in tools or []] + + if tool_choice not in valid_tool_choices: + raise ValueError( + f"{tool_choice} is not a valid tool_choice. Must be one of {valid_tool_choices}" + ) + + return tool_choice diff --git a/llama-index-integrations/llms/llama-index-llms-text-generation-inference/pyproject.toml b/llama-index-integrations/llms/llama-index-llms-text-generation-inference/pyproject.toml new file mode 100644 index 0000000000000..d096a3f6982c2 --- /dev/null +++ b/llama-index-integrations/llms/llama-index-llms-text-generation-inference/pyproject.toml @@ -0,0 +1,63 @@ +[build-system] +build-backend = "poetry.core.masonry.api" +requires = ["poetry-core"] + +[tool.codespell] +check-filenames = true +check-hidden = true +skip = "*.csv,*.html,*.json,*.jsonl,*.pdf,*.txt,*.ipynb" + +[tool.llamahub] +contains_example = false +import_path = "llama_index.llms.text_generation_inference" + +[tool.llamahub.class_authors] +TextGenerationInference = "llama-index" + +[tool.mypy] +disallow_untyped_defs = true +exclude = ["_static", "build", "examples", "notebooks", "venv"] +ignore_missing_imports = true +python_version = "3.8" + +[tool.poetry] +authors = ["Your Name "] +description = "llama-index llms huggingface text generation inference integration" +exclude = ["**/BUILD"] +license = "MIT" +name = "llama-index-llms-text-generation-inference" +readme = "README.md" +version = "0.1.0" + +[tool.poetry.dependencies] +python = ">=3.8.1,<4.0" +llama-index-core = "^0.10.41" +text-generation = "^0.7.0" + +[tool.poetry.group.dev.dependencies] +ipython = "8.10.0" +jupyter = "^1.0.0" +mypy = "0.991" +pre-commit = "3.2.0" +pylint = "2.15.10" +pytest = "7.2.1" +pytest-mock = "3.11.1" +ruff = "0.0.292" +tree-sitter-languages = "^1.8.0" +types-Deprecated = ">=0.1.0" +types-PyYAML = "^6.0.12.12" +types-protobuf = "^4.24.0.4" +types-redis = "4.5.5.0" +types-requests = "2.28.11.8" +types-setuptools = "67.1.0.0" + +[tool.poetry.group.dev.dependencies.black] +extras = ["jupyter"] +version = "<=23.9.1,>=23.7.0" + +[tool.poetry.group.dev.dependencies.codespell] +extras = ["toml"] +version = ">=v2.2.6" + +[[tool.poetry.packages]] +include = "llama_index/" diff --git a/llama-index-integrations/llms/llama-index-llms-text-generation-inference/tests/BUILD b/llama-index-integrations/llms/llama-index-llms-text-generation-inference/tests/BUILD new file mode 100644 index 0000000000000..dabf212d7e716 --- /dev/null +++ b/llama-index-integrations/llms/llama-index-llms-text-generation-inference/tests/BUILD @@ -0,0 +1 @@ +python_tests() diff --git a/llama-index-integrations/llms/llama-index-llms-text-generation-inference/tests/__init__.py b/llama-index-integrations/llms/llama-index-llms-text-generation-inference/tests/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/llama-index-integrations/llms/llama-index-llms-text-generation-inference/tests/test_llms_text_generation_inference.py b/llama-index-integrations/llms/llama-index-llms-text-generation-inference/tests/test_llms_text_generation_inference.py new file mode 100644 index 0000000000000..8cf27c42a97a2 --- /dev/null +++ b/llama-index-integrations/llms/llama-index-llms-text-generation-inference/tests/test_llms_text_generation_inference.py @@ -0,0 +1,7 @@ +from llama_index.core.base.llms.base import BaseLLM +from llama_index.llms.text_generation_inference import TextGenerationInference + + +def test_embedding_class(): + names_of_base_classes = [b.__name__ for b in 
TextGenerationInference.__mro__] + assert BaseLLM.__name__ in names_of_base_classes diff --git a/llama-index-packs/llama-index-packs-llama-guard-moderator/examples/rag_moderator_llama_guard_pack.ipynb b/llama-index-packs/llama-index-packs-llama-guard-moderator/examples/rag_moderator_llama_guard_pack.ipynb index f16ac997251eb..27526e043a6d9 100644 --- a/llama-index-packs/llama-index-packs-llama-guard-moderator/examples/rag_moderator_llama_guard_pack.ipynb +++ b/llama-index-packs/llama-index-packs-llama-guard-moderator/examples/rag_moderator_llama_guard_pack.ipynb @@ -40,7 +40,7 @@ "%pip install llama-index-vector-stores-qdrant\n", "%pip install llama-index-readers-wikipedia\n", "%pip install llama-index-packs-llama-guard-moderator\n", - "%pip install llama-index-llms-huggingface" + "%pip install llama-index-llms-huggingface-api" ] }, { @@ -297,7 +297,7 @@ "simple_node_parser = SimpleNodeParser.from_defaults()\n", "\n", "# Step 3: Define ServiceContext with llm and embed_model\n", - "from llama_index.llms.huggingface import HuggingFaceInferenceAPI\n", + "from llama_index.llms.huggingface_api import HuggingFaceInferenceAPI\n", "import os\n", "\n", "os.environ[\"HUGGINGFACE_ACCESS_TOKEN\"] = \"hf_##################\"\n", diff --git a/llama-index-utils/llama-index-utils-huggingface/.gitignore b/llama-index-utils/llama-index-utils-huggingface/.gitignore new file mode 100644 index 0000000000000..990c18de22908 --- /dev/null +++ b/llama-index-utils/llama-index-utils-huggingface/.gitignore @@ -0,0 +1,153 @@ +llama_index/_static +.DS_Store +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +bin/ +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +etc/ +include/ +lib/ +lib64/ +parts/ +sdist/ +share/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +.ruff_cache + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints +notebooks/ + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ +pyvenv.cfg + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# Jetbrains +.idea +modules/ +*.swp + +# VsCode +.vscode + +# pipenv +Pipfile +Pipfile.lock + +# pyright +pyrightconfig.json diff --git a/llama-index-utils/llama-index-utils-huggingface/BUILD b/llama-index-utils/llama-index-utils-huggingface/BUILD new file mode 100644 index 0000000000000..0896ca890d8bf --- /dev/null +++ b/llama-index-utils/llama-index-utils-huggingface/BUILD @@ -0,0 +1,3 @@ +poetry_requirements( + name="poetry", +) diff --git a/llama-index-utils/llama-index-utils-huggingface/Makefile b/llama-index-utils/llama-index-utils-huggingface/Makefile new file mode 100644 index 0000000000000..b9eab05aa3706 --- /dev/null +++ b/llama-index-utils/llama-index-utils-huggingface/Makefile @@ -0,0 +1,17 @@ +GIT_ROOT ?= $(shell git rev-parse --show-toplevel) + +help: ## Show all Makefile targets. + @grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[33m%-30s\033[0m %s\n", $$1, $$2}' + +format: ## Run code autoformatters (black). + pre-commit install + git ls-files | xargs pre-commit run black --files + +lint: ## Run linters: pre-commit (black, ruff, codespell) and mypy + pre-commit install && git ls-files | xargs pre-commit run --show-diff-on-failure --files + +test: ## Run tests via pytest. + pytest tests + +watch-docs: ## Build and watch documentation. + sphinx-autobuild docs/ docs/_build/html --open-browser --watch $(GIT_ROOT)/llama_index/ diff --git a/llama-index-utils/llama-index-utils-huggingface/README.md b/llama-index-utils/llama-index-utils-huggingface/README.md new file mode 100644 index 0000000000000..5f1dbbb7fad90 --- /dev/null +++ b/llama-index-utils/llama-index-utils-huggingface/README.md @@ -0,0 +1 @@ +# LlamaIndex Utils: Huggingface diff --git a/llama-index-utils/llama-index-utils-huggingface/llama_index/utils/huggingface/BUILD b/llama-index-utils/llama-index-utils-huggingface/llama_index/utils/huggingface/BUILD new file mode 100644 index 0000000000000..db46e8d6c978c --- /dev/null +++ b/llama-index-utils/llama-index-utils-huggingface/llama_index/utils/huggingface/BUILD @@ -0,0 +1 @@ +python_sources() diff --git a/llama-index-utils/llama-index-utils-huggingface/llama_index/utils/huggingface/__init__.py b/llama-index-utils/llama-index-utils-huggingface/llama_index/utils/huggingface/__init__.py new file mode 100644 index 0000000000000..e9b04c0750c52 --- /dev/null +++ b/llama-index-utils/llama-index-utils-huggingface/llama_index/utils/huggingface/__init__.py @@ -0,0 +1,31 @@ +from llama_index.utils.huggingface.base import ( + DEFAULT_QUERY_BGE_INSTRUCTION_EN, + BGE_MODELS, + DEFAULT_EMBED_INSTRUCTION, + DEFAULT_HUGGINGFACE_EMBEDDING_MODEL, + DEFAULT_INSTRUCT_MODEL, + DEFAULT_QUERY_BGE_INSTRUCTION_ZH, + DEFAULT_QUERY_INSTRUCTION, + INSTRUCTOR_MODELS, + format_query, + format_text, + get_pooling_mode, + get_query_instruct_for_model_name, + get_text_instruct_for_model_name, +) + +__all__ = [ + "DEFAULT_QUERY_BGE_INSTRUCTION_EN", + "BGE_MODELS", + "DEFAULT_EMBED_INSTRUCTION", + "DEFAULT_HUGGINGFACE_EMBEDDING_MODEL", + "DEFAULT_INSTRUCT_MODEL", + "DEFAULT_QUERY_BGE_INSTRUCTION_ZH", + 
"DEFAULT_QUERY_INSTRUCTION", + "INSTRUCTOR_MODELS", + "format_query", + "format_text", + "get_pooling_mode", + "get_query_instruct_for_model_name", + "get_text_instruct_for_model_name", +] diff --git a/llama-index-utils/llama-index-utils-huggingface/llama_index/utils/huggingface/base.py b/llama-index-utils/llama-index-utils-huggingface/llama_index/utils/huggingface/base.py new file mode 100644 index 0000000000000..009aaab7649ba --- /dev/null +++ b/llama-index-utils/llama-index-utils-huggingface/llama_index/utils/huggingface/base.py @@ -0,0 +1,99 @@ +from typing import Optional + +import requests + +DEFAULT_HUGGINGFACE_EMBEDDING_MODEL = "BAAI/bge-small-en" +DEFAULT_INSTRUCT_MODEL = "hkunlp/instructor-base" + +# Originally pulled from: +# https://github.com/langchain-ai/langchain/blob/v0.0.257/libs/langchain/langchain/embeddings/huggingface.py#L10 +DEFAULT_EMBED_INSTRUCTION = "Represent the document for retrieval: " +DEFAULT_QUERY_INSTRUCTION = ( + "Represent the question for retrieving supporting documents: " +) +DEFAULT_QUERY_BGE_INSTRUCTION_EN = ( + "Represent this question for searching relevant passages: " +) +DEFAULT_QUERY_BGE_INSTRUCTION_ZH = "为这个句子生成表示以用于检索相关文章:" + +BGE_MODELS = ( + "BAAI/bge-small-en", + "BAAI/bge-small-en-v1.5", + "BAAI/bge-base-en", + "BAAI/bge-base-en-v1.5", + "BAAI/bge-large-en", + "BAAI/bge-large-en-v1.5", + "BAAI/bge-small-zh", + "BAAI/bge-small-zh-v1.5", + "BAAI/bge-base-zh", + "BAAI/bge-base-zh-v1.5", + "BAAI/bge-large-zh", + "BAAI/bge-large-zh-v1.5", +) +INSTRUCTOR_MODELS = ( + "hku-nlp/instructor-base", + "hku-nlp/instructor-large", + "hku-nlp/instructor-xl", + "hkunlp/instructor-base", + "hkunlp/instructor-large", + "hkunlp/instructor-xl", +) + + +def get_query_instruct_for_model_name(model_name: Optional[str]) -> str: + """Get query text instruction for a given model name.""" + if model_name in INSTRUCTOR_MODELS: + return DEFAULT_QUERY_INSTRUCTION + if model_name in BGE_MODELS: + if "zh" in model_name: + return DEFAULT_QUERY_BGE_INSTRUCTION_ZH + return DEFAULT_QUERY_BGE_INSTRUCTION_EN + return "" + + +def format_query( + query: str, model_name: Optional[str], instruction: Optional[str] = None +) -> str: + if instruction is None: + instruction = get_query_instruct_for_model_name(model_name) + # NOTE: strip() enables backdoor for defeating instruction prepend by + # passing empty string + return f"{instruction} {query}".strip() + + +def get_text_instruct_for_model_name(model_name: Optional[str]) -> str: + """Get text instruction for a given model name.""" + return DEFAULT_EMBED_INSTRUCTION if model_name in INSTRUCTOR_MODELS else "" + + +def format_text( + text: str, model_name: Optional[str], instruction: Optional[str] = None +) -> str: + if instruction is None: + instruction = get_text_instruct_for_model_name(model_name) + # NOTE: strip() enables backdoor for defeating instruction prepend by + # passing empty string + return f"{instruction} {text}".strip() + + +def get_pooling_mode(model_name: Optional[str]) -> str: + pooling_config_url = ( + f"https://huggingface.co/{model_name}/raw/main/1_Pooling/config.json" + ) + + try: + response = requests.get(pooling_config_url) + config_data = response.json() + + cls_token = config_data.get("pooling_mode_cls_token", False) + mean_tokens = config_data.get("pooling_mode_mean_tokens", False) + + if mean_tokens: + return "mean" + elif cls_token: + return "cls" + except requests.exceptions.RequestException: + print( + "Warning: Pooling config file not found; pooling mode is defaulted to 'cls'." 
+ ) + return "cls" diff --git a/llama-index-utils/llama-index-utils-huggingface/pyproject.toml b/llama-index-utils/llama-index-utils-huggingface/pyproject.toml new file mode 100644 index 0000000000000..199c5d7be01c6 --- /dev/null +++ b/llama-index-utils/llama-index-utils-huggingface/pyproject.toml @@ -0,0 +1,64 @@ +[build-system] +build-backend = "poetry.core.masonry.api" +requires = ["poetry-core"] + +[tool.codespell] +check-filenames = true +check-hidden = true +skip = "*.csv,*.html,*.json,*.jsonl,*.pdf,*.txt,*.ipynb" + +[tool.llamahub] +contains_example = false +import_path = "llama_index.utils.huggingface" + +[tool.mypy] +disallow_untyped_defs = true +exclude = ["_static", "build", "examples", "notebooks", "venv"] +ignore_missing_imports = true +python_version = "3.8" + +[tool.poetry] +authors = ["Your Name "] +description = "llama-index utils for huggingface integration" +exclude = ["**/BUILD"] +license = "MIT" +name = "llama-index-utils-huggingface" +readme = "README.md" +version = "0.1.0" + +[tool.poetry.dependencies] +python = ">=3.8.1,<4.0" +llama-index-core = "^0.10.1" +sentence-transformers = "^2.6.1" + +[tool.poetry.dependencies.huggingface-hub] +extras = ["inference"] +version = ">=0.19.0" + +[tool.poetry.group.dev.dependencies] +ipython = "8.10.0" +jupyter = "^1.0.0" +mypy = "0.991" +pre-commit = "3.2.0" +pylint = "2.15.10" +pytest = "7.2.1" +pytest-mock = "3.11.1" +ruff = "0.0.292" +tree-sitter-languages = "^1.8.0" +types-Deprecated = ">=0.1.0" +types-PyYAML = "^6.0.12.12" +types-protobuf = "^4.24.0.4" +types-redis = "4.5.5.0" +types-requests = "2.28.11.8" +types-setuptools = "67.1.0.0" + +[tool.poetry.group.dev.dependencies.black] +extras = ["jupyter"] +version = "<=23.9.1,>=23.7.0" + +[tool.poetry.group.dev.dependencies.codespell] +extras = ["toml"] +version = ">=v2.2.6" + +[[tool.poetry.packages]] +include = "llama_index/"
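
For reference, a minimal usage sketch of the two LLM integrations introduced by this patch. The endpoint URL, model id, and prompts below are illustrative placeholders, not values taken from the patch:

```python
# Sketch only; model id, TGI URL, and prompts are assumed placeholders.
from llama_index.llms.huggingface_api import HuggingFaceInferenceAPI
from llama_index.llms.text_generation_inference import TextGenerationInference

# Hosted Inference API: pass a Hub model id; the token is optional and falls
# back to the locally saved Hugging Face token when omitted.
remote_llm = HuggingFaceInferenceAPI(model_name="HuggingFaceH4/zephyr-7b-beta")
print(remote_llm.complete("Summarize what TGI is in one sentence.").text)

# Self-hosted TGI endpoint: `model_url` must point at a running server;
# `model_name` is only used to fill in the reported metadata.
tgi_llm = TextGenerationInference(
    model_url="http://localhost:8080",
    model_name="HuggingFaceH4/zephyr-7b-beta",
)
print(tgi_llm.complete("Summarize what TGI is in one sentence.").text)
```

`HuggingFaceInferenceAPI` talks to the hosted Inference API through `huggingface_hub`, while `TextGenerationInference` targets a self-hosted TGI server and additionally exposes `chat_with_tools` when the server reports function-calling support.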