From d563342f2c37b82352d32374770412e1a5dffcde Mon Sep 17 00:00:00 2001 From: Michael Clifford Date: Wed, 6 Mar 2024 11:39:07 -0500 Subject: [PATCH 1/4] update chatbot recipe --- chatbot-langchain/README.md | 18 -- chatbot-langchain/ai-studio.yaml | 25 --- chatbot-langchain/quadlet/README.md | 10 -- chatbot-langchain/quadlet/chatbot.image | 7 - .../quadlet/chatbot.kube.example | 16 -- chatbot-langchain/quadlet/chatbot.yaml | 45 ----- chatbot/README.md | 155 +----------------- chatbot/ai-studio.yaml | 34 ++-- chatbot/ai_applications/ask.py | 14 -- chatbot/ai_applications/builds/Containerfile | 9 - .../ai_applications/builds/requirements.txt | 2 - chatbot/ai_applications/chat_ui.py | 40 ----- .../builds/Containerfile | 0 .../builds/requirements.txt | 0 {chatbot-langchain => chatbot}/chatbot_ui.py | 0 chatbot/model_services/base/Containerfile | 8 - chatbot/model_services/cuda/Containerfile | 9 - chatbot/model_services/src/chat_service.py | 39 ----- chatbot/model_services/src/llamacpp_utils.py | 31 ---- chatbot/model_services/src/requirements.txt | 2 - chatbot/quadlet/README.md | 7 +- chatbot/quadlet/chatbot.image | 6 +- chatbot/quadlet/chatbot.yaml | 31 ++-- 23 files changed, 49 insertions(+), 459 deletions(-) delete mode 100644 chatbot-langchain/README.md delete mode 100644 chatbot-langchain/ai-studio.yaml delete mode 100644 chatbot-langchain/quadlet/README.md delete mode 100644 chatbot-langchain/quadlet/chatbot.image delete mode 100644 chatbot-langchain/quadlet/chatbot.kube.example delete mode 100644 chatbot-langchain/quadlet/chatbot.yaml delete mode 100644 chatbot/ai_applications/ask.py delete mode 100644 chatbot/ai_applications/builds/Containerfile delete mode 100644 chatbot/ai_applications/builds/requirements.txt delete mode 100644 chatbot/ai_applications/chat_ui.py rename {chatbot-langchain => chatbot}/builds/Containerfile (100%) rename {chatbot-langchain => chatbot}/builds/requirements.txt (100%) rename {chatbot-langchain => chatbot}/chatbot_ui.py (100%) delete mode 100644 chatbot/model_services/base/Containerfile delete mode 100644 chatbot/model_services/cuda/Containerfile delete mode 100644 chatbot/model_services/src/chat_service.py delete mode 100644 chatbot/model_services/src/llamacpp_utils.py delete mode 100644 chatbot/model_services/src/requirements.txt diff --git a/chatbot-langchain/README.md b/chatbot-langchain/README.md deleted file mode 100644 index 4a597ff3..00000000 --- a/chatbot-langchain/README.md +++ /dev/null @@ -1,18 +0,0 @@ -# Streamlit + Langchain ChatBot Demo - -### Build image -```bash -cd chatbot-langchain -podman build -t stchat . -f builds/Containerfile -``` -### Run image locally - -Make sure the playground model service is up and running before starting this container image. -To start the model service, refer to [the playground document](../playground/README.md) - - -```bash -podman run --rm -it -p 8501:8501 -e MODEL_SERVICE_ENDPOINT=http://10.88.0.1:8001/v1 stchat -``` - -Interact with the application from your local browser at `localhost:8501` diff --git a/chatbot-langchain/ai-studio.yaml b/chatbot-langchain/ai-studio.yaml deleted file mode 100644 index 4e0466b5..00000000 --- a/chatbot-langchain/ai-studio.yaml +++ /dev/null @@ -1,25 +0,0 @@ -version: v1.0 -application: - type: language - name: ChatBot_Streamlit - description: This is a Streamlit chat demo application. 
- containers: - - name: llamacpp-server - contextdir: ../playground - containerfile: Containerfile - model-service: true - backend: - - llama - arch: - - arm64 - - amd64 - ports: - - 8001 - - name: streamlit-chat-app - contextdir: . - containerfile: builds/Containerfile - arch: - - arm64 - - amd64 - ports: - - 8501 \ No newline at end of file diff --git a/chatbot-langchain/quadlet/README.md b/chatbot-langchain/quadlet/README.md deleted file mode 100644 index 3edb0990..00000000 --- a/chatbot-langchain/quadlet/README.md +++ /dev/null @@ -1,10 +0,0 @@ -### Run chatbot-langchain as a systemd service - -```bash -cp chatbot.yaml /etc/containers/systemd/chatbot.yaml -cp chatbot.kube.example /etc/containers/chatbot.kube -cp chatbot.image /etc/containers/chatbot.image -/usr/libexec/podman/quadlet --dryrun (optional) -systemctl daemon-reload -systemctl start chatbot -``` diff --git a/chatbot-langchain/quadlet/chatbot.image b/chatbot-langchain/quadlet/chatbot.image deleted file mode 100644 index 4ca5eaa3..00000000 --- a/chatbot-langchain/quadlet/chatbot.image +++ /dev/null @@ -1,7 +0,0 @@ -[Install] -WantedBy=chatbot.service - -[Image] -Image=quay.io/redhat-et/locallm-models:mistral-7b-instruct-v0.1.Q4_K_S.gguf -Image=quay.io/redhat-et/locallm-model-service:latest -Image=quay.io/redhat-et/locallm-chatbot:latest diff --git a/chatbot-langchain/quadlet/chatbot.kube.example b/chatbot-langchain/quadlet/chatbot.kube.example deleted file mode 100644 index 66919229..00000000 --- a/chatbot-langchain/quadlet/chatbot.kube.example +++ /dev/null @@ -1,16 +0,0 @@ -[Unit] -Description=Python script to run against downloaded LLM -Documentation=man:podman-generate-systemd(1) -Wants=network-online.target -After=network-online.target -RequiresMountsFor=%t/containers - -[Kube] -# Point to the yaml file in the same directory -Yaml=chatbot.yaml - -[Service] -Restart=always - -[Install] -WantedBy=default.target diff --git a/chatbot-langchain/quadlet/chatbot.yaml b/chatbot-langchain/quadlet/chatbot.yaml deleted file mode 100644 index 9ec6dbb8..00000000 --- a/chatbot-langchain/quadlet/chatbot.yaml +++ /dev/null @@ -1,45 +0,0 @@ -apiVersion: v1 -kind: Pod -metadata: - labels: - app: chatbot-langchain - name: chatbot-langchain -spec: - initContainers: - - name: model-file - image: quay.io/redhat-et/locallm-models:mistral-7b-instruct-v0.1.Q4_K_S.gguf - command: ['/usr/bin/install', "/model/mistral-7b-instruct-v0.1.Q4_K_S.gguf", "/shared/"] - volumeMounts: - - name: model-file - mountPath: /shared - containers: - - env: - - name: MODEL_SERVICE_ENDPOINT - value: http://10.88.0.1:8001/v1 - image: quay.io/redhat-et/locallm-chatbot:latest - name: chatbot-inference - ports: - - containerPort: 8501 - hostPort: 8501 - securityContext: - runAsNonRoot: true - - env: - - name: HOST - value: 0.0.0.0 - - name: PORT - value: 8001 - - name: MODEL_PATH - value: /model/mistral-7b-instruct-v0.1.Q4_K_S.gguf - image: quay.io/redhat-et/locallm-model-service:latest - name: chatbot-model-service - ports: - - containerPort: 8001 - hostPort: 8001 - securityContext: - runAsNonRoot: true - volumeMounts: - - name: model-file - mountPath: /model - volumes: - - name: model-file - emptyDir: {} diff --git a/chatbot/README.md b/chatbot/README.md index 2efffef5..4a597ff3 100644 --- a/chatbot/README.md +++ b/chatbot/README.md @@ -1,155 +1,18 @@ -# Chat Application - -This model service is intended be used as the basis for a chat application. 
It is capable of having arbitrarily long conversations -with users and retains a history of the conversation until it reaches the maximum context length of the model. -At that point, the service will remove the earliest portions of the conversation from its memory. - -To use this model service, please follow the steps below: - -* [Download Model](#download-models) -* [Build Image](#build-the-image) -* [Run Image](#run-the-image) -* [Interact with Service](#interact-with-the-app) -* [Deploy on Openshift](#deploy-on-openshift) - -## Build and Deploy Locally - -### Download model(s) - -The two models that we have tested and recommend for this example are Llama2 and Mistral. The locations of the GGUF variants -are listed below: - -* Llama2 - https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF/tree/main -* Mistral - https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.1-GGUF/tree/main - -_For a full list of supported model variants, please see the "Supported models" section of the -[llama.cpp repository](https://github.com/ggerganov/llama.cpp?tab=readme-ov-file#description)._ - -This example assumes that the developer already has a copy of the model that they would like to use downloaded onto their host machine and located in the `/models` directory of this repo. - -This can be accomplished with: - -```bash -cd models -wget https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF/resolve/main/llama-2-7b-chat.Q5_K_S.gguf -cd ../ -``` - -## Deploy from Local Container - -### Build the image - -Build the `model-service` image. +# Streamlit + Langchain ChatBot Demo +### Build image ```bash -cd chatbot/model_services -podman build -t chatbot:service -f base/Containerfile . +cd chatbot-langchain +podman build -t stchat . -f builds/Containerfile ``` +### Run image locally -After the image is created it should be run with the model mounted as volume, as shown below. -This prevents large model files from being loaded into the container image which can cause a significant slowdown -when transporting the images. If it is required that a model-service image contains the model, -the Containerfiles can be modified to copy the model into the image. - -With the model-service image, in addition to a volume mounted model file, an environment variable, $MODEL_PATH, -should be set at runtime. If not set, the default location where the service expects a model is at -`/locallm/models/llama-2-7b-chat.Q5_K_S.gguf` inside the running container. This file can be downloaded from the URL -`https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF/resolve/main/llama-2-7b-chat.Q5_K_S.gguf`. +Make sure the playground model service is up and running before starting this container image. +To start the model service, refer to [the playground document](../playground/README.md) -### Run the image - -Once the model service image is built, it can be run with the following: -By assuming that we want to mount the model `llama-2-7b-chat.Q5_K_S.gguf` ```bash -export MODEL_FILE=llama-2-7b-chat.Q5_K_S.gguf -podman run --rm -d -it \ - -v /local/path/to/$MODEL_FILE:/locallm/models/$MODEL_FILE:Z \ - --env MODEL_PATH=/locallm/models/$MODEL_FILE \ - -p 7860:7860 \ - chatbot:service +podman run --rm -it -p 8501:8501 -e MODEL_SERVICE_ENDPOINT=http://10.88.0.1:8001/v1 stchat ``` -### Interact with the app - -Now the service can be interacted with by going to `0.0.0.0:7860` in your browser. - -![](/assets/app.png) - - -You can also use the example [chatbot/ai_applications/ask.py](ask.py) to interact with the model-service in a terminal. 
-If the `--prompt` argument is left blank, it will default to "Hello". - -```bash -cd chatbot/ai_applications - -python ask.py --prompt -``` - -Or, you can build the `ask.py` into a container image and run it alongside the model-service container, like so: - -```bash -cd chatbot/ai_applications -podman build -t chatbot -f builds/Containerfile . -podman run --rm -d -it -p 8080:8080 chatbot # then interact with the application at 0.0.0.0:8080 in your browser -``` - -## Deploy on Openshift - -Now that we've developed an application locally that leverages an LLM, we'll want to share it with a wider audience. -Let's get it off our machine and run it on OpenShift. - -### Rebuild for x86 - -If you are on a Mac, you'll need to rebuild the model-service image for the x86 architecture for most use case outside of Mac. -Since this is an AI workload, you may also want to take advantage of Nvidia GPU's available outside our local machine. -If so, build the model-service with a base image that contains CUDA and builds llama.cpp specifically for a CUDA environment. - -```bash -cd chatbot/model_services/cuda -podman build --platform linux/amd64 -t chatbot:service-cuda -f cuda/Containerfile . -``` - -The CUDA environment significantly increases the size of the container image. -If you are not utilizing a GPU to run this application, you can create an image -without the CUDA layers for an x86 architecture machine with the following: - -```bash -cd chatbot/model_services -podman build --platform linux/amd64 -t chatbot:service-amd64 -f base/Containerfile . -``` - -### Push to Quay - -Once you login to [quay.io](quay.io) you can push your own newly built version of this LLM application to your repository -for use by others. - -```bash -podman login quay.io -``` - -```bash -podman push localhost/chatbot:service-amd64 quay.io// -``` - -### Deploy - -Now that your model lives in a remote repository we can deploy it. -Go to your OpenShift developer dashboard and select "+Add" to use the Openshift UI to deploy the application. - -![](/assets/add_image.png) - -Select "Container images" - -![](/assets/container_images.png) - -Then fill out the form on the Deploy page with your [quay.io](quay.io) image name and make sure to set the "Target port" to 7860. - -![](/assets/deploy.png) - -Hit "Create" at the bottom and watch your application start. - -Once the pods are up and the application is working, navigate to the "Routes" section and click on the link created for you -to interact with your app. - -![](/assets/app.png) +Interact with the application from your local browser at `localhost:8501` diff --git a/chatbot/ai-studio.yaml b/chatbot/ai-studio.yaml index d3bb6cb7..4e0466b5 100644 --- a/chatbot/ai-studio.yaml +++ b/chatbot/ai-studio.yaml @@ -1,29 +1,25 @@ - version: v1.0 application: type: language - name: chatbot - description: This is a LLM chatbot application that can interact with a llamacpp model-service + name: ChatBot_Streamlit + description: This is a Streamlit chat demo application. 
containers: - - name: chatbot-inference-app - contextdir: ai_applications - containerfile: builds/Containerfile - - name: chatbot-model-service - contextdir: model_services - containerfile: base/Containerfile + - name: llamacpp-server + contextdir: ../playground + containerfile: Containerfile model-service: true backend: - llama arch: - arm64 - amd64 - - name: chatbot-model-servicecuda - contextdir: model_services - containerfile: cuda/Containerfile - model-service: true - backend: - - llama - gpu-env: - - cuda - arch: - - amd64 \ No newline at end of file + ports: + - 8001 + - name: streamlit-chat-app + contextdir: . + containerfile: builds/Containerfile + arch: + - arm64 + - amd64 + ports: + - 8501 \ No newline at end of file diff --git a/chatbot/ai_applications/ask.py b/chatbot/ai_applications/ask.py deleted file mode 100644 index 2e5edfbf..00000000 --- a/chatbot/ai_applications/ask.py +++ /dev/null @@ -1,14 +0,0 @@ -import argparse -from gradio_client import Client -import time - -parser = argparse.ArgumentParser() -parser.add_argument("-p", "--prompt", default="Hello") -parser.add_argument("-m", "--model_endpoint",default="http://0.0.0.0:7860/") -args = parser.parse_args() - -start = time.time() -client = Client(args.model_endpoint) -result = client.predict(args.prompt , api_name="/chat") -print(result) -print(time.time() - start) diff --git a/chatbot/ai_applications/builds/Containerfile b/chatbot/ai_applications/builds/Containerfile deleted file mode 100644 index b93e2b50..00000000 --- a/chatbot/ai_applications/builds/Containerfile +++ /dev/null @@ -1,9 +0,0 @@ -FROM registry.access.redhat.com/ubi9/python-39:latest -WORKDIR /chatbot -COPY builds/requirements.txt . -RUN pip install --upgrade pip -RUN pip install --no-cache-dir --upgrade -r /chatbot/requirements.txt -ENV MODEL_ENDPOINT=http://10.88.0.1:7860 -EXPOSE 8080 -COPY chat_ui.py . 
-ENTRYPOINT [ "python", "chat_ui.py" ] diff --git a/chatbot/ai_applications/builds/requirements.txt b/chatbot/ai_applications/builds/requirements.txt deleted file mode 100644 index 4b684d73..00000000 --- a/chatbot/ai_applications/builds/requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ -gradio -gradio_client \ No newline at end of file diff --git a/chatbot/ai_applications/chat_ui.py b/chatbot/ai_applications/chat_ui.py deleted file mode 100644 index d4d6e02a..00000000 --- a/chatbot/ai_applications/chat_ui.py +++ /dev/null @@ -1,40 +0,0 @@ -import gradio as gr -from gradio_client import Client -import requests -import time -import os - - -class Chat: - def __init__(self, endpoint) -> None: - self.endpoint = endpoint - self.client = Client(self.endpoint) - - def ask(self, prompt, history): - - job = self.client.submit(prompt, api_name="/chat") - while not job.done(): - if len(job.outputs())>=1: - r = str(job.outputs()[-1]) - yield r - yield str(job.outputs()[-1]) - -def checking_model_service(model_service): - print("Waiting for Model Service Availability...") - ready = False - while not ready: - try: - request = requests.get(f'{model_service}') - if request.status_code == 200: - ready = True - except: - pass - time.sleep(1) - print("Model Service Available") - -if __name__ == "__main__": - model_endpoint = os.getenv('MODEL_ENDPOINT', "http://0.0.0.0:7860") - checking_model_service(model_endpoint) - chat = Chat(model_endpoint) - demo = gr.ChatInterface(chat.ask) - demo.launch(server_name="0.0.0.0", server_port=8080) \ No newline at end of file diff --git a/chatbot-langchain/builds/Containerfile b/chatbot/builds/Containerfile similarity index 100% rename from chatbot-langchain/builds/Containerfile rename to chatbot/builds/Containerfile diff --git a/chatbot-langchain/builds/requirements.txt b/chatbot/builds/requirements.txt similarity index 100% rename from chatbot-langchain/builds/requirements.txt rename to chatbot/builds/requirements.txt diff --git a/chatbot-langchain/chatbot_ui.py b/chatbot/chatbot_ui.py similarity index 100% rename from chatbot-langchain/chatbot_ui.py rename to chatbot/chatbot_ui.py diff --git a/chatbot/model_services/base/Containerfile b/chatbot/model_services/base/Containerfile deleted file mode 100644 index 191b4540..00000000 --- a/chatbot/model_services/base/Containerfile +++ /dev/null @@ -1,8 +0,0 @@ -# build from chatbot/model-services directory -FROM registry.access.redhat.com/ubi9/python-39:latest -WORKDIR /locallm -COPY src . -RUN pip install --upgrade pip -RUN pip install --no-cache-dir --upgrade -r /locallm/requirements.txt -EXPOSE 7860 -ENTRYPOINT [ "python", "chat_service.py" ] diff --git a/chatbot/model_services/cuda/Containerfile b/chatbot/model_services/cuda/Containerfile deleted file mode 100644 index 17017817..00000000 --- a/chatbot/model_services/cuda/Containerfile +++ /dev/null @@ -1,9 +0,0 @@ -FROM quay.io/opendatahub/workbench-images:cuda-ubi9-python-3.9-20231206 -WORKDIR /locallm -COPY src . 
-RUN pip install --upgrade pip -ENV CMAKE_ARGS="-DLLAMA_CUBLAS=on" -ENV FORCE_CMAKE=1 -RUN pip install --upgrade --force-reinstall --no-cache-dir -r /locallm/requirements.txt -EXPOSE 7860 -ENTRYPOINT [ "python", "chat_service.py" ] diff --git a/chatbot/model_services/src/chat_service.py b/chatbot/model_services/src/chat_service.py deleted file mode 100644 index 5dd0f377..00000000 --- a/chatbot/model_services/src/chat_service.py +++ /dev/null @@ -1,39 +0,0 @@ -import sys -sys.path.append("src") -import gradio as gr -from llama_cpp import Llama -from llamacpp_utils import clip_history -import os - - -llm = Llama(os.getenv('MODEL_PATH', - "models/llama-2-7b-chat.Q5_K_S.gguf"), - n_gpu_layers=-1, - n_ctx=2048, - max_tokens=512, - f16_kv = True, - stream=True) - -system_prompt = [ - {"role": "system", "content": """You are a helpful assistant that is comfortable speaking - with C level executives in a professional setting."""}, - ] - -def ask(prompt, history): - global system_prompt - global llm - system_prompt.append({"role":"user","content":prompt}) - system_prompt = clip_history(llm, prompt, system_prompt, 2048, 512) - chat_response = llm.create_chat_completion(system_prompt, stream=True) - reply = "" - for r in chat_response: - response = r["choices"][0]["delta"] - if "content" in response.keys(): - reply += response["content"] - yield reply - system_prompt.append({"role":"assistant","content":reply}) - -if __name__=="__main__": - - demo = gr.ChatInterface(ask) - demo.launch(server_name="0.0.0.0") \ No newline at end of file diff --git a/chatbot/model_services/src/llamacpp_utils.py b/chatbot/model_services/src/llamacpp_utils.py deleted file mode 100644 index 2026a9dd..00000000 --- a/chatbot/model_services/src/llamacpp_utils.py +++ /dev/null @@ -1,31 +0,0 @@ -from llama_cpp import Llama - - -def tokenize(llama, prompt): - return llama.tokenize(bytes(prompt, "utf-8")) - -def count_tokens(llama,prompt): - return len(tokenize(llama,prompt)) + 5 - -def clip_history(llama, prompt, history, n_ctx, max_tokens): - prompt_len = count_tokens(llama, prompt) - history_len = sum([count_tokens(llama, x["content"]) for x in history]) - input_len = prompt_len + history_len - print(input_len) - while input_len >= n_ctx-max_tokens: - print("Clipping") - history.pop(1) - history_len = sum([count_tokens(llama, x["content"]) for x in history]) - input_len = history_len + prompt_len - print(input_len) - return history - -def chunk_tokens(llm, prompt, chunk_size): - tokens = tokenize(llm, prompt) - num_tokens = count_tokens(llm, prompt) - chunks = [] - for i in range((num_tokens//chunk_size)+1): - chunk = str(llm.detokenize(tokens[:chunk_size]),"utf-8") - chunks.append(chunk) - tokens = tokens[chunk_size:] - return chunks diff --git a/chatbot/model_services/src/requirements.txt b/chatbot/model_services/src/requirements.txt deleted file mode 100644 index 4c1269c4..00000000 --- a/chatbot/model_services/src/requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ -llama-cpp-python -gradio \ No newline at end of file diff --git a/chatbot/quadlet/README.md b/chatbot/quadlet/README.md index 97f7c000..3edb0990 100644 --- a/chatbot/quadlet/README.md +++ b/chatbot/quadlet/README.md @@ -1,9 +1,10 @@ -### Run chatbot as a systemd service +### Run chatbot-langchain as a systemd service ```bash -cp chatbot.yaml /etc/containers/systemd/. 
+cp chatbot.yaml /etc/containers/systemd/chatbot.yaml cp chatbot.kube.example /etc/containers/chatbot.kube +cp chatbot.image /etc/containers/chatbot.image /usr/libexec/podman/quadlet --dryrun (optional) systemctl daemon-reload systemctl start chatbot -``` +``` diff --git a/chatbot/quadlet/chatbot.image b/chatbot/quadlet/chatbot.image index feac1ce5..4ca5eaa3 100644 --- a/chatbot/quadlet/chatbot.image +++ b/chatbot/quadlet/chatbot.image @@ -2,6 +2,6 @@ WantedBy=chatbot.service [Image] -Image=quay.io/sallyom/models:llama2-7b-gguf -Image=quay.io/sallyom/chatbot:model-service -Image=quay.io/sallyom/chatbot:inference +Image=quay.io/redhat-et/locallm-models:mistral-7b-instruct-v0.1.Q4_K_S.gguf +Image=quay.io/redhat-et/locallm-model-service:latest +Image=quay.io/redhat-et/locallm-chatbot:latest diff --git a/chatbot/quadlet/chatbot.yaml b/chatbot/quadlet/chatbot.yaml index 8942481b..9ec6dbb8 100644 --- a/chatbot/quadlet/chatbot.yaml +++ b/chatbot/quadlet/chatbot.yaml @@ -2,34 +2,39 @@ apiVersion: v1 kind: Pod metadata: labels: - app: chatbot-inference-app - name: chatbot-inference-app + app: chatbot-langchain + name: chatbot-langchain spec: initContainers: - name: model-file - image: quay.io/sallyom/models:llama2-7b-gguf - command: ['/usr/bin/bash', '-c', "cp /model/llama-2-7b-chat.Q5_K_S.gguf /shared/ && chmod 777 /shared/llama-2-7b-chat.Q5_K_S.gguf"] + image: quay.io/redhat-et/locallm-models:mistral-7b-instruct-v0.1.Q4_K_S.gguf + command: ['/usr/bin/install', "/model/mistral-7b-instruct-v0.1.Q4_K_S.gguf", "/shared/"] volumeMounts: - name: model-file mountPath: /shared containers: - env: - - name: MODEL_ENDPOINT - value: http://localhost:7860 - image: quay.io/sallyom/chatbot:inference + - name: MODEL_SERVICE_ENDPOINT + value: http://10.88.0.1:8001/v1 + image: quay.io/redhat-et/locallm-chatbot:latest name: chatbot-inference ports: - - containerPort: 7860 - hostPort: 7860 - - containerPort: 8080 - hostPort: 8080 + - containerPort: 8501 + hostPort: 8501 securityContext: runAsNonRoot: true - env: + - name: HOST + value: 0.0.0.0 + - name: PORT + value: 8001 - name: MODEL_PATH - value: /model/llama-2-7b-chat.Q5_K_S.gguf - image: quay.io/sallyom/chatbot:model-service + value: /model/mistral-7b-instruct-v0.1.Q4_K_S.gguf + image: quay.io/redhat-et/locallm-model-service:latest name: chatbot-model-service + ports: + - containerPort: 8001 + hostPort: 8001 securityContext: runAsNonRoot: true volumeMounts: From e9cf22e58ab4e89ae27b52faa4ed8527ddc4de76 Mon Sep 17 00:00:00 2001 From: Michael Clifford Date: Wed, 6 Mar 2024 11:52:10 -0500 Subject: [PATCH 2/4] update rag recipe --- rag-langchain/README.md | 34 ------ rag-langchain/ai-studio.yaml | 34 ------ rag/README.md | 108 ++++-------------- rag/ai-studio.yaml | 27 +++-- rag/ai_applications/base/Containerfile | 9 -- rag/ai_applications/base/requirements.txt | 1 - rag/ai_applications/rag_chat.py | 17 --- {rag-langchain => rag}/builds/Containerfile | 0 .../builds/chromadb/Containerfile | 0 .../builds/requirements.txt | 0 rag/model_services/base/Containerfile | 20 ---- rag/model_services/src/llamacpp_utils.py | 31 ----- rag/model_services/src/rag_service.py | 78 ------------- rag/model_services/src/requirements.txt | 5 - rag/populate_vectordb.py | 36 ++++++ {rag-langchain => rag}/rag_app.py | 0 16 files changed, 77 insertions(+), 323 deletions(-) delete mode 100644 rag-langchain/README.md delete mode 100644 rag-langchain/ai-studio.yaml delete mode 100644 rag/ai_applications/base/Containerfile delete mode 100644 rag/ai_applications/base/requirements.txt delete mode 
100644 rag/ai_applications/rag_chat.py rename {rag-langchain => rag}/builds/Containerfile (100%) rename {rag-langchain => rag}/builds/chromadb/Containerfile (100%) rename {rag-langchain => rag}/builds/requirements.txt (100%) delete mode 100644 rag/model_services/base/Containerfile delete mode 100644 rag/model_services/src/llamacpp_utils.py delete mode 100644 rag/model_services/src/rag_service.py delete mode 100644 rag/model_services/src/requirements.txt create mode 100644 rag/populate_vectordb.py rename {rag-langchain => rag}/rag_app.py (100%) diff --git a/rag-langchain/README.md b/rag-langchain/README.md deleted file mode 100644 index ec3acecb..00000000 --- a/rag-langchain/README.md +++ /dev/null @@ -1,34 +0,0 @@ -# RAG + Langchain - -This example will deploy a local RAG application using a chromadb server, a llama.cpp model server and a python app built with langchain. - -# - -### Deploy ChromaDB Vector Database -Use the existing ChromaDB image to deploy a vector store service. - -* `podman pull chromadb/chroma` -* `podman run --rm -it -p 8000:8000 chroma` - -### Deploy Model Service - -To start the model service, refer to [the playground model-service document](../playground/README.md) - -### Build and Deploy RAG app -Deploy a small application that can populate the data base from the vectorDB and generate a response with the LLM. - -We will want to have an embedding model that we can volume mount into our running application container. You can use the code snippet below to pull a copy of the `BAAI/bge-base-en-v1.5` embedding model. - - -```python -from huggingface_hub import snapshot_download -snapshot_download(repo_id="BAAI/bge-base-en-v1.5", - cache_dir="../models/", - local_files_only=False) -``` - -Follow the instructions below to build you container image and run it locally. - -* `podman build -t ragapp rag-langchain -f rag-langchain/builds/Containerfile` -* `podman run --rm -it -p 8501:8501 -v Local/path/to/locallm/models/:/rag/models:Z -v Local/path/to/locallm/data:/rag/data:Z -e MODEL_SERVICE_ENDPOINT=http://10.88.0.1:8001/v1 ragapp -- -H 10.88.0.1 ` - diff --git a/rag-langchain/ai-studio.yaml b/rag-langchain/ai-studio.yaml deleted file mode 100644 index 3c7ec7ac..00000000 --- a/rag-langchain/ai-studio.yaml +++ /dev/null @@ -1,34 +0,0 @@ -version: v1.0 -application: - type: language - name: rag-demo - description: This is a RAG demo application. - containers: - - name: llamacpp-server - contextdir: ../playground - containerfile: Containerfile - model-service: true - backend: - - llama - arch: - - arm64 - - amd64 - ports: - - 8001 - - name: chromadb-server - contextdir: builds/chromadb - containerfile: Containerfile - vectordb: true - arch: - - arm64 - - amd64 - ports: - - 8000 - - name: rag-inference-app - contextdir: . - containerfile: builds/Containerfile - arch: - - arm64 - - amd64 - ports: - - 8501 \ No newline at end of file diff --git a/rag/README.md b/rag/README.md index 5303b7ad..ec3acecb 100644 --- a/rag/README.md +++ b/rag/README.md @@ -1,100 +1,34 @@ -# Retrieval Augmented Generation (RAG) +# RAG + Langchain -This demo provides an example of using Retrieval Augmented Generation (RAG) to add additional context to an LLM chatbot. +This example will deploy a local RAG application using a chromadb server, a llama.cpp model server and a python app built with langchain. + +# -## Build and Deploy Locally +### Deploy ChromaDB Vector Database +Use the existing ChromaDB image to deploy a vector store service. 
-### Download LLM model(s) +* `podman pull chromadb/chroma` +* `podman run --rm -it -p 8000:8000 chroma` -The two models that we have tested and recommend for this example are Llama2 and Mistral. The locations of the GGUF variants -are listed below: +### Deploy Model Service -* Llama2 - https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF/tree/main -* Mistral - https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.1-GGUF/tree/main +To start the model service, refer to [the playground model-service document](../playground/README.md) -_For a full list of supported model variants, please see the "Supported models" section of the -[llama.cpp repository](https://github.com/ggerganov/llama.cpp?tab=readme-ov-file#description)._ +### Build and Deploy RAG app +Deploy a small application that can populate the data base from the vectorDB and generate a response with the LLM. -This example assumes that the developer already has a copy of the model that they would like to use downloaded onto their host machine and located in the `/models` directory of this repo. +We will want to have an embedding model that we can volume mount into our running application container. You can use the code snippet below to pull a copy of the `BAAI/bge-base-en-v1.5` embedding model. -This can be accomplished with: -```bash -cd models -wget https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF/resolve/main/llama-2-7b-chat.Q5_K_S.gguf -cd ../ +```python +from huggingface_hub import snapshot_download +snapshot_download(repo_id="BAAI/bge-base-en-v1.5", + cache_dir="../models/", + local_files_only=False) ``` -### Download the embedding model -To encode our additional data and populate our vector database, we need an embedding model (a second language model) for this workflow. Here we will use `BAAI/bge-large-en-v1.5` all the necessary model files can be found and downloaded from https://huggingface.co/BAAI/bge-large-en-v1.5. - - -Alternatively, you can run the below python code to download the model files directly into the `models/` directory. - -```python -from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings -SentenceTransformerEmbeddings(model_name="BAAI/bge-base-en-v1.5",cache_folder="models/") -``` - -### Prepare the RAG dataset - -Once you have the embedding model in place, you will want to create a vector database with your custom data that can be used to augment our chatbot. The python code below will create a persistent vector database on our local machine that we can query at runtime. The code below simply uses the `fake_meeting.txt` demo file already included in this repository. Feel free to replace this with your own data. - -```python -from langchain_community.document_loaders import TextLoader -from langchain.text_splitter import CharacterTextSplitter -from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings -from langchain.vectorstores import Chroma - -raw_documents = TextLoader("data/fake_meeting.txt").load() -text_splitter = CharacterTextSplitter(separator = ".", chunk_size=150, chunk_overlap=0) -docs = text_splitter.split_documents(raw_documents) -e = SentenceTransformerEmbeddings(model_name="BAAI/bge-base-en-v1.5",cache_folder="models/") -db = Chroma.from_documents(docs,e,persist_directory="./data/chromaDB") -``` - -Great, we now have our LLM model, our text Embedding model, and our Vector Database loaded with custom data. We are ready to build our RAG container image! 
- -## Deploy from Local Container - -### Build the image - -Build the `model-service` image. - -```bash -cd rag/model_services -podman build -t rag:service -f base/Containerfile . -``` - -After the image is created it should be run with the models mounted as volumes, as shown below. -This prevents large model files from being loaded into the container image which can cause a significant slowdown -when transporting the images. If it is required that a model-service image contains the model, -the Containerfiles can be modified to copy the models into the image. - -With the model-service image, in addition to a volume mounted model file, an environment variable, $MODEL_PATH, -should be set at runtime. If not set, the default location where the service expects a model is at -`/locallm/models/llama-2-7b-chat.Q5_K_S.gguf` inside the running container. This file can be downloaded from the URL -`https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF/resolve/main/llama-2-7b-chat.Q5_K_S.gguf`. - -### Run the image - -Once the model service image is built, it can be run with the following: -By assuming that we want to mount the models `llama-2-7b-chat.Q5_K_S.gguf` and `BAAI/bge-base-en-v1.5` as well as use the vector database we just created. - -```bash -export MODEL_FILE=llama-2-7b-chat.Q5_K_S.gguf -podman run --rm -d -it \ - -v /local/path/to/$MODEL_FILE:/locallm/models/$MODEL_FILE:Z \ - -v /local/path/to/locallm/data/chromaDB:/locallm/data/:Z \ - --env MODEL_PATH=/locallm/models/$MODEL_FILE \ - -p 7860:7860 \ - rag:service -``` - -### Interact with the app - -Now the service can be interacted with by going to `0.0.0.0:7860` in your browser. - -![](/assets/rag_ui.png) +Follow the instructions below to build you container image and run it locally. +* `podman build -t ragapp rag-langchain -f rag-langchain/builds/Containerfile` +* `podman run --rm -it -p 8501:8501 -v Local/path/to/locallm/models/:/rag/models:Z -v Local/path/to/locallm/data:/rag/data:Z -e MODEL_SERVICE_ENDPOINT=http://10.88.0.1:8001/v1 ragapp -- -H 10.88.0.1 ` diff --git a/rag/ai-studio.yaml b/rag/ai-studio.yaml index d3a41394..3c7ec7ac 100644 --- a/rag/ai-studio.yaml +++ b/rag/ai-studio.yaml @@ -4,18 +4,31 @@ application: name: rag-demo description: This is a RAG demo application. containers: - - name: rag-demo-service - contextdir: model_services - containerfile: base/Containerfile + - name: llamacpp-server + contextdir: ../playground + containerfile: Containerfile model-service: true backend: - llama arch: - arm64 - amd64 - - name: chatbot-inference-app - contextdir: ai_applications - containerfile: base/Containerfile + ports: + - 8001 + - name: chromadb-server + contextdir: builds/chromadb + containerfile: Containerfile + vectordb: true arch: - arm64 - - amd64 \ No newline at end of file + - amd64 + ports: + - 8000 + - name: rag-inference-app + contextdir: . + containerfile: builds/Containerfile + arch: + - arm64 + - amd64 + ports: + - 8501 \ No newline at end of file diff --git a/rag/ai_applications/base/Containerfile b/rag/ai_applications/base/Containerfile deleted file mode 100644 index 0fab597c..00000000 --- a/rag/ai_applications/base/Containerfile +++ /dev/null @@ -1,9 +0,0 @@ -FROM registry.access.redhat.com/ubi9/python-39:latest -WORKDIR /rag -COPY rag/ai_applications/base/requirements.txt . -RUN pip install --upgrade pip -RUN pip install --no-cache-dir --upgrade -r /rag/requirements.txt -ENV MODEL_ENDPOINT=http://10.88.0.1:7860 -EXPOSE 8080 -COPY rag/ai_applications/rag_chat.py . 
-ENTRYPOINT [ "python", "rag_chat.py" ] \ No newline at end of file diff --git a/rag/ai_applications/base/requirements.txt b/rag/ai_applications/base/requirements.txt deleted file mode 100644 index dae4cee4..00000000 --- a/rag/ai_applications/base/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -gradio_client \ No newline at end of file diff --git a/rag/ai_applications/rag_chat.py b/rag/ai_applications/rag_chat.py deleted file mode 100644 index f83f97fc..00000000 --- a/rag/ai_applications/rag_chat.py +++ /dev/null @@ -1,17 +0,0 @@ - -import argparse -from gradio_client import Client -import time - -parser = argparse.ArgumentParser() -parser.add_argument("-p", "--prompt", default="What is ROSA?") -parser.add_argument("-m", "--model_endpoint",default="http://0.0.0.0:7860/") -parser.add_argument("-r", "--retrieve", default=True) -args = parser.parse_args() - -start = time.time() -client = Client(args.model_endpoint) -result = client.predict(args.prompt,args.retrieve, - api_name="/chat") -print(result) -print(time.time() - start) \ No newline at end of file diff --git a/rag-langchain/builds/Containerfile b/rag/builds/Containerfile similarity index 100% rename from rag-langchain/builds/Containerfile rename to rag/builds/Containerfile diff --git a/rag-langchain/builds/chromadb/Containerfile b/rag/builds/chromadb/Containerfile similarity index 100% rename from rag-langchain/builds/chromadb/Containerfile rename to rag/builds/chromadb/Containerfile diff --git a/rag-langchain/builds/requirements.txt b/rag/builds/requirements.txt similarity index 100% rename from rag-langchain/builds/requirements.txt rename to rag/builds/requirements.txt diff --git a/rag/model_services/base/Containerfile b/rag/model_services/base/Containerfile deleted file mode 100644 index 05b18385..00000000 --- a/rag/model_services/base/Containerfile +++ /dev/null @@ -1,20 +0,0 @@ -FROM registry.access.redhat.com/ubi9/python-39:1-158 -### Update sqlite for chroma -USER root -RUN dnf remove sqlite3 -y -RUN wget https://www.sqlite.org/2023/sqlite-autoconf-3410200.tar.gz -RUN tar -xvzf sqlite-autoconf-3410200.tar.gz -WORKDIR sqlite-autoconf-3410200 -RUN ./configure -RUN make -RUN make install -RUN mv /usr/local/bin/sqlite3 /usr/bin/sqlite3 -ENV LD_LIBRARY_PATH="/usr/local/lib" -### Permssions for chroma -WORKDIR /locallm -RUN mkdir /data -### -COPY src . 
-RUN pip install --upgrade pip -RUN pip install --no-cache-dir --upgrade -r /locallm/requirements.txt -ENTRYPOINT [ "python", "rag_service.py" ] diff --git a/rag/model_services/src/llamacpp_utils.py b/rag/model_services/src/llamacpp_utils.py deleted file mode 100644 index 2026a9dd..00000000 --- a/rag/model_services/src/llamacpp_utils.py +++ /dev/null @@ -1,31 +0,0 @@ -from llama_cpp import Llama - - -def tokenize(llama, prompt): - return llama.tokenize(bytes(prompt, "utf-8")) - -def count_tokens(llama,prompt): - return len(tokenize(llama,prompt)) + 5 - -def clip_history(llama, prompt, history, n_ctx, max_tokens): - prompt_len = count_tokens(llama, prompt) - history_len = sum([count_tokens(llama, x["content"]) for x in history]) - input_len = prompt_len + history_len - print(input_len) - while input_len >= n_ctx-max_tokens: - print("Clipping") - history.pop(1) - history_len = sum([count_tokens(llama, x["content"]) for x in history]) - input_len = history_len + prompt_len - print(input_len) - return history - -def chunk_tokens(llm, prompt, chunk_size): - tokens = tokenize(llm, prompt) - num_tokens = count_tokens(llm, prompt) - chunks = [] - for i in range((num_tokens//chunk_size)+1): - chunk = str(llm.detokenize(tokens[:chunk_size]),"utf-8") - chunks.append(chunk) - tokens = tokens[chunk_size:] - return chunks diff --git a/rag/model_services/src/rag_service.py b/rag/model_services/src/rag_service.py deleted file mode 100644 index 17e2097d..00000000 --- a/rag/model_services/src/rag_service.py +++ /dev/null @@ -1,78 +0,0 @@ -import gradio as gr -from llama_cpp import Llama -from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings -from langchain.vectorstores import Chroma -from llamacpp_utils import clip_history -import os - - -llm = Llama(os.getenv("MODEL_PATH", - "models/llama-2-7b-chat.Q5_K_S.gguf"), - n_gpu_layers=-1, - n_ctx=2048, - max_tokens=512, - f16_kv = True, - stream=True) - -system_prompt = [ - {"role": "system", "content": """You are a helpful assistant that is comfortable speaking - with C level executives in a professional setting."""}, - ] - -embeddings = SentenceTransformerEmbeddings(model_name="BAAI/bge-base-en-v1.5", - cache_folder="models/") - - -def ask(prompt, history, rag): - global system_prompt - global llm - base_prompt = {"role":"user", "content":prompt} - user_prompt = None - if rag: - docs = retriever(prompt) - if docs: - print("Docs Found") - user_prompt = prompt + """\n Answer the query using a concise summary of the - following context: \n""" + docs - user_prompt = {"role":"user","content":user_prompt} - if not user_prompt: - user_prompt = base_prompt - ### start: shared with chat app asks ### - system_prompt.append(user_prompt) - system_prompt = clip_history(llm, prompt, system_prompt, 2048, 512) - chat_response = llm.create_chat_completion(system_prompt,stream=True) - reply = "" - for i in chat_response: - token = i["choices"][0]["delta"] - if "content" in token.keys(): - reply += token["content"] - yield reply - #### end: shared with chat app ask ### - if rag: - del system_prompt[-1] - system_prompt.append(base_prompt) - - -def retriever(prompt, top_k=2,threshold=0.75): - global embeddings - db = Chroma(persist_directory="data", - embedding_function=embeddings) - docs = db.similarity_search_with_score(prompt) - retrieved_list = [] - for doc in docs[:top_k]: - if doc[1] < threshold: - retrieved_list.append(doc[0].page_content) - if retrieved_list: - print("Retrieved documents to augment generated response") - return 
'\n'.join(retrieved_list) - else: - return None - - -if __name__ == "__main__": - - with gr.Blocks() as demo: - box = gr.Checkbox(label="RAG", info="Do you want to turn on RAG?") - chat_app = gr.ChatInterface(ask, additional_inputs=[box]) - - demo.launch(server_name="0.0.0.0") \ No newline at end of file diff --git a/rag/model_services/src/requirements.txt b/rag/model_services/src/requirements.txt deleted file mode 100644 index 10bec24d..00000000 --- a/rag/model_services/src/requirements.txt +++ /dev/null @@ -1,5 +0,0 @@ -llama-cpp-python -gradio -langchain -chromadb -sentence-transformers \ No newline at end of file diff --git a/rag/populate_vectordb.py b/rag/populate_vectordb.py new file mode 100644 index 00000000..2bbb6efc --- /dev/null +++ b/rag/populate_vectordb.py @@ -0,0 +1,36 @@ +from langchain_community.document_loaders import TextLoader +from langchain.text_splitter import CharacterTextSplitter +import chromadb.utils.embedding_functions as embedding_functions +import chromadb +from chromadb.config import Settings +import uuid +import os +import argparse +import time + +parser = argparse.ArgumentParser() +parser.add_argument("-d", "--docs", default="data/fake_meeting.txt") +parser.add_argument("-c", "--chunk_size", default=150) +parser.add_argument("-e", "--embedding_model", default="BAAI/bge-base-en-v1.5") +parser.add_argument("-H", "--vdb_host", default="0.0.0.0") +parser.add_argument("-p", "--vdb_port", default="8000") +parser.add_argument("-n", "--name", default="test_collection") +args = parser.parse_args() + +raw_documents = TextLoader(args.docs).load() +text_splitter = CharacterTextSplitter(separator = ".", chunk_size=int(args.chunk_size), chunk_overlap=0) +docs = text_splitter.split_documents(raw_documents) +os.environ["TORCH_HOME"] = "./models/" + +embedding_func = embedding_functions.SentenceTransformerEmbeddingFunction(model_name=args.embedding_model) +client = chromadb.HttpClient(host=args.vdb_host, + port=args.vdb_port, + settings=Settings(allow_reset=True,)) +collection = client.get_or_create_collection(args.name, + embedding_function=embedding_func) +for doc in docs: + collection.add( + ids=[str(uuid.uuid1())], + metadatas=doc.metadata, + documents=doc.page_content + ) \ No newline at end of file diff --git a/rag-langchain/rag_app.py b/rag/rag_app.py similarity index 100% rename from rag-langchain/rag_app.py rename to rag/rag_app.py From e122c5757f925e66cecb17ee5bd60fb8c322f8e0 Mon Sep 17 00:00:00 2001 From: Michael Clifford Date: Wed, 6 Mar 2024 11:57:01 -0500 Subject: [PATCH 3/4] update summarizer recipe --- summarizer-langchain/README.md | 16 ---- summarizer-langchain/ai-studio.yaml | 25 ----- summarizer/README.md | 91 ++----------------- summarizer/ai-studio.yaml | 36 +++----- summarizer/ai_applications/base/Containerfile | 8 -- .../ai_applications/base/requirements.txt | 2 - summarizer/ai_applications/summarize.py | 17 ---- summarizer/ai_applications/upload_file_ui.py | 25 ----- .../builds/Containerfile | 0 .../builds/requirements.txt | 0 summarizer/model_services/base/Containerfile | 7 -- summarizer/model_services/cuda/Containerfile | 9 -- .../model_services/src/llamacpp_utils.py | 31 ------- .../model_services/src/requirements.txt | 2 - .../model_services/src/summary_service.py | 52 ----------- .../quadlet/README.md | 0 .../quadlet/summarizer.image | 0 .../quadlet/summarizer.kube.example | 0 .../quadlet/summarizer.yaml | 0 .../summarizer.py | 0 20 files changed, 24 insertions(+), 297 deletions(-) delete mode 100644 summarizer-langchain/README.md delete mode 
100644 summarizer-langchain/ai-studio.yaml delete mode 100644 summarizer/ai_applications/base/Containerfile delete mode 100644 summarizer/ai_applications/base/requirements.txt delete mode 100644 summarizer/ai_applications/summarize.py delete mode 100644 summarizer/ai_applications/upload_file_ui.py rename {summarizer-langchain => summarizer}/builds/Containerfile (100%) rename {summarizer-langchain => summarizer}/builds/requirements.txt (100%) delete mode 100644 summarizer/model_services/base/Containerfile delete mode 100644 summarizer/model_services/cuda/Containerfile delete mode 100644 summarizer/model_services/src/llamacpp_utils.py delete mode 100644 summarizer/model_services/src/requirements.txt delete mode 100644 summarizer/model_services/src/summary_service.py rename {summarizer-langchain => summarizer}/quadlet/README.md (100%) rename {summarizer-langchain => summarizer}/quadlet/summarizer.image (100%) rename {summarizer-langchain => summarizer}/quadlet/summarizer.kube.example (100%) rename {summarizer-langchain => summarizer}/quadlet/summarizer.yaml (100%) rename {summarizer-langchain => summarizer}/summarizer.py (100%) diff --git a/summarizer-langchain/README.md b/summarizer-langchain/README.md deleted file mode 100644 index aad4a0ee..00000000 --- a/summarizer-langchain/README.md +++ /dev/null @@ -1,16 +0,0 @@ -# Summarizer Application - -This example will deploy a local summarization application. - - -### Deploy Model Service - -To start the model service, refer to [the playground model-service document](../playground/README.md) - -### Build and Deploy Summarizer app - - -Follow the instructions below to build you container image and run it locally. - -* `podman build -t summarizer summarizer-langchain -f summarizer-langchain/builds/Containerfile` -* `podman run --rm -it -p 8501:8501 -e MODEL_SERVICE_ENDPOINT=http://10.88.0.1:8001/v1 summarizer` \ No newline at end of file diff --git a/summarizer-langchain/ai-studio.yaml b/summarizer-langchain/ai-studio.yaml deleted file mode 100644 index 3787b65f..00000000 --- a/summarizer-langchain/ai-studio.yaml +++ /dev/null @@ -1,25 +0,0 @@ -version: v1.0 -application: - type: language - name: Summarizer_App - description: This is a Streamlit demo application for summarizing text. - containers: - - name: llamacpp-server - contextdir: ../playground - containerfile: Containerfile - model-service: true - backend: - - llama - arch: - - arm64 - - amd64 - ports: - - 8001 - - name: streamlit-summary-app - contextdir: . - containerfile: builds/Containerfile - arch: - - arm64 - - amd64 - ports: - - 8501 \ No newline at end of file diff --git a/summarizer/README.md b/summarizer/README.md index b759aa01..aad4a0ee 100644 --- a/summarizer/README.md +++ b/summarizer/README.md @@ -1,89 +1,16 @@ -# Text Summarizer Application +# Summarizer Application -This model service is intended be be used for text summarization tasks. This service can ingest an arbitrarily long text input. If the input length is less than the models maximum context window it will summarize the input directly. If the input is longer than the maximum context window, the input will be divided into appropriately sized chunks. Each chunk will be summarized and a final "summary of summaries" will be the services final output. +This example will deploy a local summarization application. 
+ -To use this model service, please follow the steps below: +### Deploy Model Service -* [Download Model](#download-models) -* [Build Image](#build-the-image) -* [Run Image](#run-the-image) -* [Interact with Service](#interact-with-the-app) -### Download model(s) +To start the model service, refer to [the playground model-service document](../playground/README.md) -This example assumes that the developer already has a copy of the model that they would like to use downloaded onto their host machine and located in the `/models` directory of this repo. +### Build and Deploy Summarizer app -The two models that we have tested and recommend for this example are Llama2 and Mistral. Please download any of the GGUF variants you'd like to use. -* Llama2 - https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF/tree/main -* Mistral - https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.1-GGUF/tree/main - -_For a full list of supported model variants, please see the "Supported models" section of the [llama.cpp repository](https://github.com/ggerganov/llama.cpp?tab=readme-ov-file#description)._ - -```bash -cd models - -wget https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF/resolve/main/llama-2-7b-chat.Q5_K_S.gguf -``` - -### Build the image - -To build the image we will use a `build.sh` script that will simply copy the desired model and shared code into the build directory temporarily. This prevents any large unused model files in the repo from being loaded into the podman environment during build which can cause a significant slowdown. - -```bash -cd summarizer/model_services/builds - -sh build.sh llama-2-7b-chat.Q5_K_S.gguf arm summarizer -``` -The user should provide the model name, the architecture and image name they want to use for the build. - -### Run the image -Once the model service image is built, it can be run with the following: - -```bash -podman run --rm -it -p 7860:7860 summarizer -``` -### Interact with the app - -#### Python Code -Now the service can be used with the python code below. - -```python -from gradio_client import Client -client = Client("http://0.0.0.0:7860") -result = client.predict(""" -It's Hackathon day. -All the developers are excited to work on interesting problems. -There are six teams total, but only one can take home the grand prize. -The first team to solve Artificial General Intelligence wins!""", -api_name="/chat") -print(result) -``` - -```bash - Sure, here is a summary of the input in bullet points: -• Hackathon day -• Developers excited to work on interesting problems -• Six teams participating -• Grand prize for the first team to solve Artificial General Intelligence -• Excitement and competition among the teams -``` - -#### Python Script -You can also use the `summarize.py` script under `/ai_applications` to run the summary application against a local file. If the `--file` argument is left blank, it will run against the demo file `data/fake_meeting.text` - -```bash -python summarizer/ai_applications/summarize --file -``` - -#### Web App -You can also use `upload_file_ui.py` under `/ai_applications` to deploy a small webapp that provides a simple file upload UI to get summaries of the uploaded files. - -```bash -python summarizer/ai_applications/upload_file_ui.py -``` - -You should now have an instance running at http://0.0.0.0:8080. - - -![](/assets/summary__upload_ui.png) +Follow the instructions below to build you container image and run it locally. 
+* `podman build -t summarizer summarizer-langchain -f summarizer-langchain/builds/Containerfile` +* `podman run --rm -it -p 8501:8501 -e MODEL_SERVICE_ENDPOINT=http://10.88.0.1:8001/v1 summarizer` \ No newline at end of file diff --git a/summarizer/ai-studio.yaml b/summarizer/ai-studio.yaml index 37e2fee2..3787b65f 100644 --- a/summarizer/ai-studio.yaml +++ b/summarizer/ai-studio.yaml @@ -1,31 +1,25 @@ version: v1.0 application: type: language - name: summarizer - description: This is a LLM summarizer application + name: Summarizer_App + description: This is a Streamlit demo application for summarizing text. containers: - - name: summarizer-model-service - contextdir: model_services - containerfile: base/Containerfile - model-service: true - backend: + - name: llamacpp-server + contextdir: ../playground + containerfile: Containerfile + model-service: true + backend: - llama arch: - arm64 - amd64 - - name: summarizer-model-service-cuda - contextdir: model_services - containerfile: cuda/Containerfile - model-service: true - backend: - - llama - gpu-env: - - cuda - arch: - - amd64 - - name: summarizer-example-app - contextdir: ai_applications - containerfile: base/Containerfile + ports: + - 8001 + - name: streamlit-summary-app + contextdir: . + containerfile: builds/Containerfile arch: + - arm64 - amd64 - - arm64 \ No newline at end of file + ports: + - 8501 \ No newline at end of file diff --git a/summarizer/ai_applications/base/Containerfile b/summarizer/ai_applications/base/Containerfile deleted file mode 100644 index c3737927..00000000 --- a/summarizer/ai_applications/base/Containerfile +++ /dev/null @@ -1,8 +0,0 @@ -FROM registry.access.redhat.com/ubi9/python-39:1-158 -WORKDIR /summarizer -COPY base/requirements.txt . -RUN pip install --upgrade pip -RUN pip install --no-cache-dir --upgrade -r /summarizer/requirements.txt -COPY upload_file_ui.py . 
-EXPOSE 8080 -ENTRYPOINT [ "python", "upload_file_ui.py"] \ No newline at end of file diff --git a/summarizer/ai_applications/base/requirements.txt b/summarizer/ai_applications/base/requirements.txt deleted file mode 100644 index 4b684d73..00000000 --- a/summarizer/ai_applications/base/requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ -gradio -gradio_client \ No newline at end of file diff --git a/summarizer/ai_applications/summarize.py b/summarizer/ai_applications/summarize.py deleted file mode 100644 index 91fb394d..00000000 --- a/summarizer/ai_applications/summarize.py +++ /dev/null @@ -1,17 +0,0 @@ -import argparse -from gradio_client import Client -import time - -parser = argparse.ArgumentParser() -parser.add_argument("-f", "--file", default="data/fake_meeting.txt") -parser.add_argument("-m", "--model_endpoint",default="http://0.0.0.0:7860/") -args = parser.parse_args() - - -client = Client(args.model_endpoint) -with open(args.file) as f: - prompt = f.read() -start = time.time() -result = client.predict(prompt, api_name="/chat") -print(result) -print(time.time() - start) diff --git a/summarizer/ai_applications/upload_file_ui.py b/summarizer/ai_applications/upload_file_ui.py deleted file mode 100644 index 4b169826..00000000 --- a/summarizer/ai_applications/upload_file_ui.py +++ /dev/null @@ -1,25 +0,0 @@ -import gradio as gr -from gradio_client import Client -import os - -def get_summary(file_text): - global client - job = client.submit(file_text) - while not job.done(): - pass - return str(job.outputs()[-1]) - -def read_file(file): - with open(file) as f: - file_text = f.read() - response = get_summary(file_text) - return response - - -if __name__ == "__main__": - - model_endpoint = os.getenv('MODEL_ENDPOINT', "http://0.0.0.0:7860") - client = Client(model_endpoint) - demo = gr.Interface(fn=read_file, inputs="file", outputs="textbox", - allow_flagging="never") - demo.launch(server_name="0.0.0.0", server_port=8080) diff --git a/summarizer-langchain/builds/Containerfile b/summarizer/builds/Containerfile similarity index 100% rename from summarizer-langchain/builds/Containerfile rename to summarizer/builds/Containerfile diff --git a/summarizer-langchain/builds/requirements.txt b/summarizer/builds/requirements.txt similarity index 100% rename from summarizer-langchain/builds/requirements.txt rename to summarizer/builds/requirements.txt diff --git a/summarizer/model_services/base/Containerfile b/summarizer/model_services/base/Containerfile deleted file mode 100644 index ab1bf165..00000000 --- a/summarizer/model_services/base/Containerfile +++ /dev/null @@ -1,7 +0,0 @@ -FROM registry.access.redhat.com/ubi9/python-39:1-158 -WORKDIR /locallm -COPY src . -RUN pip install --upgrade pip -RUN pip install --no-cache-dir --upgrade -r /locallm/requirements.txt -EXPOSE 7860 -ENTRYPOINT [ "python", "summary_service.py" ] diff --git a/summarizer/model_services/cuda/Containerfile b/summarizer/model_services/cuda/Containerfile deleted file mode 100644 index baed20f6..00000000 --- a/summarizer/model_services/cuda/Containerfile +++ /dev/null @@ -1,9 +0,0 @@ -FROM quay.io/opendatahub/workbench-images:cuda-ubi9-python-3.9-20231206 -WORKDIR /locallm -COPY src . 
-RUN pip install --upgrade pip -ENV CMAKE_ARGS="-DLLAMA_CUBLAS=on" -ENV FORCE_CMAKE=1 -RUN pip install --upgrade --force-reinstall --no-cache-dir -r /locallm/requirements.txt -EXPOSE 7860 -ENTRYPOINT [ "python", "summary_service.py" ] \ No newline at end of file diff --git a/summarizer/model_services/src/llamacpp_utils.py b/summarizer/model_services/src/llamacpp_utils.py deleted file mode 100644 index 2026a9dd..00000000 --- a/summarizer/model_services/src/llamacpp_utils.py +++ /dev/null @@ -1,31 +0,0 @@ -from llama_cpp import Llama - - -def tokenize(llama, prompt): - return llama.tokenize(bytes(prompt, "utf-8")) - -def count_tokens(llama,prompt): - return len(tokenize(llama,prompt)) + 5 - -def clip_history(llama, prompt, history, n_ctx, max_tokens): - prompt_len = count_tokens(llama, prompt) - history_len = sum([count_tokens(llama, x["content"]) for x in history]) - input_len = prompt_len + history_len - print(input_len) - while input_len >= n_ctx-max_tokens: - print("Clipping") - history.pop(1) - history_len = sum([count_tokens(llama, x["content"]) for x in history]) - input_len = history_len + prompt_len - print(input_len) - return history - -def chunk_tokens(llm, prompt, chunk_size): - tokens = tokenize(llm, prompt) - num_tokens = count_tokens(llm, prompt) - chunks = [] - for i in range((num_tokens//chunk_size)+1): - chunk = str(llm.detokenize(tokens[:chunk_size]),"utf-8") - chunks.append(chunk) - tokens = tokens[chunk_size:] - return chunks diff --git a/summarizer/model_services/src/requirements.txt b/summarizer/model_services/src/requirements.txt deleted file mode 100644 index 4c1269c4..00000000 --- a/summarizer/model_services/src/requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ -llama-cpp-python -gradio \ No newline at end of file diff --git a/summarizer/model_services/src/summary_service.py b/summarizer/model_services/src/summary_service.py deleted file mode 100644 index fc0eba42..00000000 --- a/summarizer/model_services/src/summary_service.py +++ /dev/null @@ -1,52 +0,0 @@ -import sys -sys.path.append("src") -import gradio as gr -from llama_cpp import Llama -from llamacpp_utils import clip_history, chunk_tokens -import os - -llm = Llama(os.getenv('MODEL_PATH', - "models/llama-2-7b-chat.Q5_K_S.gguf"), - n_gpu_layers=0, - n_ctx=4096, - max_tokens=512, - n_batch=32, - f16_kv = True, - stream=False) - -system_prompt = [ - {"role": "system", "content": """You are a summarizing agent. - You only respond in bullet points. - Your only job is to summarize your inputs and provide the most concise possible output. - Do not add any information that does not come directly from the user prompt. - Limit your response to a maximum of 5 bullet points. 
- It's fine to have less than 5 bullet points"""}, - ] - -def summary(prompt, history): - global llm - global system_prompt - chunk_size = 4096 - prompt_chunks = chunk_tokens(llm,prompt,chunk_size-512) - partial_summaries = [] - print(f"processing {len(prompt_chunks)} chunks") - for i,chunk in enumerate(prompt_chunks): - print(f"{i+1}/{len(prompt_chunks)}") - prompt = {"role":"user", "content": chunk} - system_prompt.append(prompt) - chat_response = llm.create_chat_completion(system_prompt) - partial_summary = chat_response["choices"][0]["message"]["content"] - partial_summaries.append(partial_summary) - system_prompt = [system_prompt[0]] - if len(prompt_chunks) == 1: - return partial_summaries[0] - prompt = {"role":"user","content":" ".join(partial_summaries)} - system_prompt.append(prompt) - chat_response = llm.create_chat_completion(system_prompt) - return chat_response["choices"][0]["message"]["content"] - - -if __name__=="__main__": - - demo = gr.ChatInterface(summary) - demo.launch(server_name="0.0.0.0") diff --git a/summarizer-langchain/quadlet/README.md b/summarizer/quadlet/README.md similarity index 100% rename from summarizer-langchain/quadlet/README.md rename to summarizer/quadlet/README.md diff --git a/summarizer-langchain/quadlet/summarizer.image b/summarizer/quadlet/summarizer.image similarity index 100% rename from summarizer-langchain/quadlet/summarizer.image rename to summarizer/quadlet/summarizer.image diff --git a/summarizer-langchain/quadlet/summarizer.kube.example b/summarizer/quadlet/summarizer.kube.example similarity index 100% rename from summarizer-langchain/quadlet/summarizer.kube.example rename to summarizer/quadlet/summarizer.kube.example diff --git a/summarizer-langchain/quadlet/summarizer.yaml b/summarizer/quadlet/summarizer.yaml similarity index 100% rename from summarizer-langchain/quadlet/summarizer.yaml rename to summarizer/quadlet/summarizer.yaml diff --git a/summarizer-langchain/summarizer.py b/summarizer/summarizer.py similarity index 100% rename from summarizer-langchain/summarizer.py rename to summarizer/summarizer.py From c1f96e9f0693703389be849db129fba982289cd0 Mon Sep 17 00:00:00 2001 From: Michael Clifford Date: Thu, 7 Mar 2024 15:40:14 -0500 Subject: [PATCH 4/4] revert recipies to original names --- {chatbot => chatbot-langchain}/README.md | 0 {chatbot => chatbot-langchain}/ai-studio.yaml | 0 {chatbot => chatbot-langchain}/builds/Containerfile | 0 {chatbot => chatbot-langchain}/builds/requirements.txt | 0 {chatbot => chatbot-langchain}/chatbot_ui.py | 0 {chatbot => chatbot-langchain}/quadlet/README.md | 0 {chatbot => chatbot-langchain}/quadlet/chatbot.image | 0 {chatbot => chatbot-langchain}/quadlet/chatbot.kube.example | 0 {chatbot => chatbot-langchain}/quadlet/chatbot.yaml | 0 {rag => rag-langchain}/README.md | 0 {rag => rag-langchain}/ai-studio.yaml | 0 {rag => rag-langchain}/builds/Containerfile | 0 {rag => rag-langchain}/builds/chromadb/Containerfile | 0 {rag => rag-langchain}/builds/requirements.txt | 0 {rag => rag-langchain}/populate_vectordb.py | 0 {rag => rag-langchain}/rag_app.py | 0 {summarizer => summarizer-langchain}/README.md | 0 {summarizer => summarizer-langchain}/ai-studio.yaml | 0 {summarizer => summarizer-langchain}/builds/Containerfile | 0 {summarizer => summarizer-langchain}/builds/requirements.txt | 0 {summarizer => summarizer-langchain}/quadlet/README.md | 0 {summarizer => summarizer-langchain}/quadlet/summarizer.image | 0 .../quadlet/summarizer.kube.example | 0 {summarizer => summarizer-langchain}/quadlet/summarizer.yaml | 
0 {summarizer => summarizer-langchain}/summarizer.py | 0 25 files changed, 0 insertions(+), 0 deletions(-) rename {chatbot => chatbot-langchain}/README.md (100%) rename {chatbot => chatbot-langchain}/ai-studio.yaml (100%) rename {chatbot => chatbot-langchain}/builds/Containerfile (100%) rename {chatbot => chatbot-langchain}/builds/requirements.txt (100%) rename {chatbot => chatbot-langchain}/chatbot_ui.py (100%) rename {chatbot => chatbot-langchain}/quadlet/README.md (100%) rename {chatbot => chatbot-langchain}/quadlet/chatbot.image (100%) rename {chatbot => chatbot-langchain}/quadlet/chatbot.kube.example (100%) rename {chatbot => chatbot-langchain}/quadlet/chatbot.yaml (100%) rename {rag => rag-langchain}/README.md (100%) rename {rag => rag-langchain}/ai-studio.yaml (100%) rename {rag => rag-langchain}/builds/Containerfile (100%) rename {rag => rag-langchain}/builds/chromadb/Containerfile (100%) rename {rag => rag-langchain}/builds/requirements.txt (100%) rename {rag => rag-langchain}/populate_vectordb.py (100%) rename {rag => rag-langchain}/rag_app.py (100%) rename {summarizer => summarizer-langchain}/README.md (100%) rename {summarizer => summarizer-langchain}/ai-studio.yaml (100%) rename {summarizer => summarizer-langchain}/builds/Containerfile (100%) rename {summarizer => summarizer-langchain}/builds/requirements.txt (100%) rename {summarizer => summarizer-langchain}/quadlet/README.md (100%) rename {summarizer => summarizer-langchain}/quadlet/summarizer.image (100%) rename {summarizer => summarizer-langchain}/quadlet/summarizer.kube.example (100%) rename {summarizer => summarizer-langchain}/quadlet/summarizer.yaml (100%) rename {summarizer => summarizer-langchain}/summarizer.py (100%) diff --git a/chatbot/README.md b/chatbot-langchain/README.md similarity index 100% rename from chatbot/README.md rename to chatbot-langchain/README.md diff --git a/chatbot/ai-studio.yaml b/chatbot-langchain/ai-studio.yaml similarity index 100% rename from chatbot/ai-studio.yaml rename to chatbot-langchain/ai-studio.yaml diff --git a/chatbot/builds/Containerfile b/chatbot-langchain/builds/Containerfile similarity index 100% rename from chatbot/builds/Containerfile rename to chatbot-langchain/builds/Containerfile diff --git a/chatbot/builds/requirements.txt b/chatbot-langchain/builds/requirements.txt similarity index 100% rename from chatbot/builds/requirements.txt rename to chatbot-langchain/builds/requirements.txt diff --git a/chatbot/chatbot_ui.py b/chatbot-langchain/chatbot_ui.py similarity index 100% rename from chatbot/chatbot_ui.py rename to chatbot-langchain/chatbot_ui.py diff --git a/chatbot/quadlet/README.md b/chatbot-langchain/quadlet/README.md similarity index 100% rename from chatbot/quadlet/README.md rename to chatbot-langchain/quadlet/README.md diff --git a/chatbot/quadlet/chatbot.image b/chatbot-langchain/quadlet/chatbot.image similarity index 100% rename from chatbot/quadlet/chatbot.image rename to chatbot-langchain/quadlet/chatbot.image diff --git a/chatbot/quadlet/chatbot.kube.example b/chatbot-langchain/quadlet/chatbot.kube.example similarity index 100% rename from chatbot/quadlet/chatbot.kube.example rename to chatbot-langchain/quadlet/chatbot.kube.example diff --git a/chatbot/quadlet/chatbot.yaml b/chatbot-langchain/quadlet/chatbot.yaml similarity index 100% rename from chatbot/quadlet/chatbot.yaml rename to chatbot-langchain/quadlet/chatbot.yaml diff --git a/rag/README.md b/rag-langchain/README.md similarity index 100% rename from rag/README.md rename to rag-langchain/README.md diff 
--git a/rag/ai-studio.yaml b/rag-langchain/ai-studio.yaml similarity index 100% rename from rag/ai-studio.yaml rename to rag-langchain/ai-studio.yaml diff --git a/rag/builds/Containerfile b/rag-langchain/builds/Containerfile similarity index 100% rename from rag/builds/Containerfile rename to rag-langchain/builds/Containerfile diff --git a/rag/builds/chromadb/Containerfile b/rag-langchain/builds/chromadb/Containerfile similarity index 100% rename from rag/builds/chromadb/Containerfile rename to rag-langchain/builds/chromadb/Containerfile diff --git a/rag/builds/requirements.txt b/rag-langchain/builds/requirements.txt similarity index 100% rename from rag/builds/requirements.txt rename to rag-langchain/builds/requirements.txt diff --git a/rag/populate_vectordb.py b/rag-langchain/populate_vectordb.py similarity index 100% rename from rag/populate_vectordb.py rename to rag-langchain/populate_vectordb.py diff --git a/rag/rag_app.py b/rag-langchain/rag_app.py similarity index 100% rename from rag/rag_app.py rename to rag-langchain/rag_app.py diff --git a/summarizer/README.md b/summarizer-langchain/README.md similarity index 100% rename from summarizer/README.md rename to summarizer-langchain/README.md diff --git a/summarizer/ai-studio.yaml b/summarizer-langchain/ai-studio.yaml similarity index 100% rename from summarizer/ai-studio.yaml rename to summarizer-langchain/ai-studio.yaml diff --git a/summarizer/builds/Containerfile b/summarizer-langchain/builds/Containerfile similarity index 100% rename from summarizer/builds/Containerfile rename to summarizer-langchain/builds/Containerfile diff --git a/summarizer/builds/requirements.txt b/summarizer-langchain/builds/requirements.txt similarity index 100% rename from summarizer/builds/requirements.txt rename to summarizer-langchain/builds/requirements.txt diff --git a/summarizer/quadlet/README.md b/summarizer-langchain/quadlet/README.md similarity index 100% rename from summarizer/quadlet/README.md rename to summarizer-langchain/quadlet/README.md diff --git a/summarizer/quadlet/summarizer.image b/summarizer-langchain/quadlet/summarizer.image similarity index 100% rename from summarizer/quadlet/summarizer.image rename to summarizer-langchain/quadlet/summarizer.image diff --git a/summarizer/quadlet/summarizer.kube.example b/summarizer-langchain/quadlet/summarizer.kube.example similarity index 100% rename from summarizer/quadlet/summarizer.kube.example rename to summarizer-langchain/quadlet/summarizer.kube.example diff --git a/summarizer/quadlet/summarizer.yaml b/summarizer-langchain/quadlet/summarizer.yaml similarity index 100% rename from summarizer/quadlet/summarizer.yaml rename to summarizer-langchain/quadlet/summarizer.yaml diff --git a/summarizer/summarizer.py b/summarizer-langchain/summarizer.py similarity index 100% rename from summarizer/summarizer.py rename to summarizer-langchain/summarizer.py
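
After this consolidation, each recipe pairs the shared `llamacpp-server` playground image (serving on port 8001 under `/v1`) with a single Streamlit front end (port 8501) that reads `MODEL_SERVICE_ENDPOINT`, as the updated README commands and `ai-studio.yaml` above show. The repository's actual `summarizer.py` is not reproduced in this patch; the sketch below is only a minimal illustration of that endpoint contract, assuming the model service exposes an OpenAI-compatible `/v1/chat/completions` route and using plain `requests` for brevity where the real app presumably goes through LangChain (per the `-langchain` naming).

```python
# Hypothetical minimal summarizer UI -- not the repo's summarizer.py.
# Illustrates the MODEL_SERVICE_ENDPOINT contract used by the recipes.
import os

import requests
import streamlit as st

# Same default endpoint the podman run example passes to the container.
endpoint = os.getenv("MODEL_SERVICE_ENDPOINT", "http://10.88.0.1:8001/v1")

st.title("Text summarizer")
text = st.text_area("Paste text to summarize")

if st.button("Summarize") and text:
    payload = {
        "messages": [
            # Mirrors the bullet-point summarization prompt used by the
            # removed model service.
            {"role": "system",
             "content": "Summarize the user's text in at most 5 bullet points."},
            {"role": "user", "content": text},
        ],
        "max_tokens": 512,
    }
    with st.spinner("Summarizing..."):
        # Assumes the llamacpp-server container exposes an OpenAI-compatible
        # chat completions route under /v1.
        resp = requests.post(f"{endpoint}/chat/completions", json=payload, timeout=300)
        resp.raise_for_status()
    st.write(resp.json()["choices"][0]["message"]["content"])
```

When run as a container the Streamlit server listens on 8501, which matches the `-p 8501:8501` mapping in the README run command and the `ports` entry for the app container in `ai-studio.yaml`.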