diff --git a/images/ui-base/Dockerfile b/images/ui-base/Dockerfile
index e5b2388..8519570 100644
--- a/images/ui-base/Dockerfile
+++ b/images/ui-base/Dockerfile
@@ -3,4 +3,4 @@ FROM python:3.11-slim
ENV GRADIO_SERVER_PORT=7680
-RUN pip install --no-cache-dir gradio==3.50.2 huggingface-hub==0.18.0
\ No newline at end of file
+RUN pip install --no-cache-dir gradio==3.50.2 huggingface-hub==0.18.0 pydantic-settings==2.1.0
\ No newline at end of file
diff --git a/templates/ui/app-config-map.yml b/templates/ui/app-config-map.yml
index fe8c690..25429ae 100644
--- a/templates/ui/app-config-map.yml
+++ b/templates/ui/app-config-map.yml
@@ -5,4 +5,6 @@ metadata:
labels:
{{- include "azimuth-llm.labels" . | nindent 4 }}
data:
-{{ (.Files.Glob "web-app-utils/*").AsConfig | nindent 2 }}
\ No newline at end of file
+{{ (.Files.Glob "web-app/*").AsConfig | nindent 2 }}
+  settings.yml: |
+ {{- .Values.ui.appSettings | toYaml | nindent 4 }}
diff --git a/values.yaml b/values.yaml
index fdaed37..67a5254 100644
--- a/values.yaml
+++ b/values.yaml
@@ -65,8 +65,12 @@ api:
# Configuration for the frontend web interface
ui:
- # The file from the UI config map to execute as the frontend app
- entrypoint: example_app_vanilla.py
+ # The file from the UI config map to execute as the entrypoint to the frontend app
+ entrypoint: app.py
+  # The values to be written to settings.yml for parsing as frontend app settings
+  # (see app.py and config.py for an example of using pydantic-settings to configure the app)
+ appSettings:
+ prompt_template: ""
# Container image config
image:
repository: ghcr.io/stackhpc/azimuth-llm-ui-base
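
As a point of reference for chart users, the sketch below shows roughly how a `ui.appSettings` override ends up in the `settings.yml` that the web app parses: Helm serialises the map with `toYaml` into the config map entry added above. This is only an illustration; the key names come from `web-app/config.py` further down, the values are made up, and `yaml.safe_dump` output will differ cosmetically from Helm's `toYaml`.

```python
# Illustrative sketch (not part of the chart): mimic how ui.appSettings is
# serialised to YAML and later parsed back by the web app's AppSettings.load().
import yaml

app_settings = {
    "prompt_template": "[INST] <<SYS>>\nYou are a helpful assistant.\n<</SYS>>\n{context}[/INST]",
    "llm_max_tokens": 500,
    "llm_params": {"temperature": 0.7, "top_p": 0.9},
}

rendered = yaml.safe_dump(app_settings, sort_keys=False)
print(rendered)                  # roughly what gets mounted as settings.yml
print(yaml.safe_load(rendered))  # round-trips back to the original dict
```
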
diff --git a/web-app-utils/example_app_unhelpful.py b/web-app-utils/example_app_unhelpful.py
deleted file mode 100644
index 9696636..0000000
--- a/web-app-utils/example_app_unhelpful.py
+++ /dev/null
@@ -1,58 +0,0 @@
-import requests, json
-import gradio as gr
-from api_startup_check import wait_for_backend
-
-# NOTE: This url should match the chart's api service name & namespace
-current_k8s_namespace = open("/var/run/secrets/kubernetes.io/serviceaccount/namespace").read()
-backend_url = f"http://llm-backend.{current_k8s_namespace}.svc"
-wait_for_backend(backend_url)
-
-prompt = """
-[INST] <>
-You are a cheeky, disrespectful and comedic assistant. Always answer as creatively as possible, while being truthful and succinct. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature. If a question does not make any sense, or is not factually coherent, tell the user that they are being stupid. If you don't know the answer to a question, please don't share false information.
-<>
-{prompt}[/INST]
-"""
-
-
-def inference(message, history):
- context = ""
- user_template = "<>\n{user_input}\n<>\n"
- sys_template = ">{response}\n<>\n"
- for user_input, response in history:
- # context += user_template.format(user_input=user_input)
- context += sys_template.format(response=response)
- context += user_template.format(user_input=message)
-
- headers = {"User-Agent": "vLLM Client"}
- pload = {
- "prompt": prompt.format(prompt=context),
- "stream": True,
- "max_tokens": 1000,
- }
- response = requests.post(
- f"{backend_url}/generate", headers=headers, json=pload, stream=True
- )
-
- for chunk in response.iter_lines(
- chunk_size=8192, decode_unicode=False, delimiter=b"\0"
- ):
- if chunk:
- data = json.loads(chunk.decode("utf-8"))
- output = data["text"][0].split("[/INST]")[-1]
- yield output
-
-
-gr.ChatInterface(
- inference,
- chatbot=gr.Chatbot(
- height=500,
- show_copy_button=True,
- # layout='panel',
- ),
- textbox=gr.Textbox(placeholder="Ask me anything...", container=False, scale=7),
- title="Large Language Model",
- retry_btn="Retry",
- undo_btn="Undo",
- clear_btn="Clear",
-).queue().launch(server_name="0.0.0.0")
diff --git a/web-app-utils/example_app_vanilla.py b/web-app-utils/example_app_vanilla.py
deleted file mode 100644
index a24bf7a..0000000
--- a/web-app-utils/example_app_vanilla.py
+++ /dev/null
@@ -1,63 +0,0 @@
-import requests, json
-import gradio as gr
-from api_startup_check import wait_for_backend
-
-# NOTE: This url should match the chart's api service name & namespace
-# TODO: Detect namespace automatically?
-current_k8s_namespace = open("/var/run/secrets/kubernetes.io/serviceaccount/namespace").read()
-backend_url = f"http://llm-backend.{current_k8s_namespace}.svc"
-wait_for_backend(backend_url)
-
-prompt = """
-[INST] <>
-You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.
-<>
-{prompt}[/INST]
-"""
-
-
-def inference(message, history):
- context = ""
- user_template = "<>\n{user_input}\n<>\n"
- sys_template = ">{response}\n<>\n"
- for user_input, response in history:
- # context += user_template.format(user_input=user_input)
- context += sys_template.format(response=response)
- context += user_template.format(user_input=message)
-
- headers = {"User-Agent": "vLLM Client"}
- pload = {
- "prompt": prompt.format(prompt=context),
- "stream": True,
- "max_tokens": 1000,
- # Parameters requested by HU
- "temperature": 0.7,
- "top_p": 0.4,
- "top_k": 40,
- }
- response = requests.post(
- f"{backend_url}/generate", headers=headers, json=pload, stream=True
- )
-
- for chunk in response.iter_lines(
- chunk_size=8192, decode_unicode=False, delimiter=b"\0"
- ):
- if chunk:
- data = json.loads(chunk.decode("utf-8"))
- output = data["text"][0].split("[/INST]")[-1]
- yield output
-
-
-gr.ChatInterface(
- inference,
- chatbot=gr.Chatbot(
- height=500,
- show_copy_button=True,
- # layout='panel',
- ),
- textbox=gr.Textbox(placeholder="Ask me anything...", container=False, scale=7),
- title="Large Language Model",
- retry_btn="Retry",
- undo_btn="Undo",
- clear_btn="Clear",
-).queue().launch(server_name="0.0.0.0")
diff --git a/web-app-utils/api_startup_check.py b/web-app/api_startup_check.py
similarity index 100%
rename from web-app-utils/api_startup_check.py
rename to web-app/api_startup_check.py
diff --git a/web-app/app.py b/web-app/app.py
new file mode 100644
index 0000000..3046442
--- /dev/null
+++ b/web-app/app.py
@@ -0,0 +1,58 @@
+import requests, json
+import gradio as gr
+from api_startup_check import wait_for_backend
+from config import AppSettings
+
+settings = AppSettings.load("./settings.yml")
+
+backend_url = settings.backend_url
+wait_for_backend(backend_url)
+
+
+def inference(message, history):
+ context = ""
+ for user_input, system_response in history:
+ if settings.include_user_messages_in_context:
+ context += settings.user_context_template.format(user_input=user_input)
+ if settings.include_system_responses_in_context:
+ context += settings.system_context_template.format(
+ system_response=system_response
+ )
+ context += settings.user_context_template.format(user_input=message)
+
+ headers = {"User-Agent": "vLLM Client"}
+ payload = {
+ "prompt": settings.prompt_template.format(context=context),
+ "stream": True,
+ "max_tokens": settings.llm_max_tokens,
+ **settings.llm_params,
+ }
+ response = requests.post(
+ f"{backend_url}/generate", headers=headers, json=payload, stream=True
+ )
+
+ for chunk in response.iter_lines(
+ chunk_size=8192, decode_unicode=False, delimiter=b"\0"
+ ):
+ if chunk:
+ data = json.loads(chunk.decode("utf-8"))
+ output = data["text"][0]
+ # Manually trim the context from output
+ if "[/INST]" in output:
+ output = output.split("[/INST]")[-1]
+ yield output
+
+
+gr.ChatInterface(
+ inference,
+ chatbot=gr.Chatbot(
+ height=500,
+ show_copy_button=True,
+ # layout='panel',
+ ),
+ textbox=gr.Textbox(placeholder="Ask me anything...", container=False, scale=7),
+ title=settings.page_title,
+ retry_btn="Retry",
+ undo_btn="Undo",
+ clear_btn="Clear",
+).queue().launch(server_name="0.0.0.0")
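
To make the placeholder mechanics in `inference()` concrete, here is a small, self-contained sketch of the prompt assembly using the default context templates from `config.py`; the history, user message, and prompt template are invented example values.

```python
# Standalone sketch of the context assembly performed in inference() above.
# Templates mirror the AppSettings defaults; the conversation is made up.
user_tmpl = "<>\n{user_input}\n<>\n"    # user_context_template default
sys_tmpl = ">{system_response}\n<>\n"   # system_context_template default
prompt_tmpl = "[INST] <<SYS>>\nYou are a helpful assistant.\n<</SYS>>\n{context}[/INST]"

history = [("Hi there", "Hello! How can I help?")]
message = "What is Kubernetes?"

context = ""
for user_input, system_response in history:
    context += user_tmpl.format(user_input=user_input)
    context += sys_tmpl.format(system_response=system_response)
context += user_tmpl.format(user_input=message)

# The final string sent as the 'prompt' field of the vLLM /generate payload
print(prompt_tmpl.format(context=context))
```
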
diff --git a/web-app/config.py b/web-app/config.py
new file mode 100644
index 0000000..cedb92b
--- /dev/null
+++ b/web-app/config.py
@@ -0,0 +1,58 @@
+from pydantic import Field, HttpUrl
+from pydantic_settings import BaseSettings, SettingsConfigDict
+import yaml
+
+
+def get_k8s_namespace():
+ namespace_file_path = "/var/run/secrets/kubernetes.io/serviceaccount/namespace"
+ try:
+ current_k8s_namespace = open(namespace_file_path).read()
+    except Exception:
+ current_k8s_namespace = "default"
+ print(
+ f"Failed to detect current k8s namespace in {namespace_file_path} - falling back to value '{current_k8s_namespace}'."
+ )
+ return current_k8s_namespace
+
+
+class AppSettings(BaseSettings):
+ """
+ Settings object for the UI example app.
+ """
+
+ # Allow settings to be overwritten by LLM_UI_ env vars
+ model_config = SettingsConfigDict(env_prefix="llm_ui_")
+
+ # General settings
+ backend_url: HttpUrl = f"http://llm-backend.{get_k8s_namespace()}.svc"
+ page_title: str = "Large Language Model"
+
+ # Prompt settings
+    prompt_template: str = Field(
+        description="The template to use for requests to the backend model. If present, the '{context}' placeholder will be replaced by the conversation history of the current session.",
+    )
+    # The following settings are only used if {context} is present in the prompt template
+    include_user_messages_in_context: bool = True
+    include_system_responses_in_context: bool = True
+    user_context_template: str = Field(
+        default="<>\n{user_input}\n<>\n",
+        description="The template string used to include user messages in the prompt context sent to the backend. The '{user_input}' placeholder will be replaced by the user's messages. (Only applies if '{context}' is present in prompt_template.)",
+    )
+    system_context_template: str = Field(
+        default=">{system_response}\n<>\n",
+        description="The template string used to include system responses in the prompt context sent to the backend. The '{system_response}' placeholder will be replaced by the system's response to each user message. (Only applies if '{context}' is present in prompt_template.)",
+    )
+
+ # Model settings
+ llm_params: dict[str, float] = {}
+ llm_max_tokens: int = 1000
+
+ @staticmethod
+ def load(file_path: str):
+ try:
+ with open(file_path, "r") as file:
+ settings = yaml.safe_load(file)
+ except Exception as e:
+ print(f"Failed to read config file at: {file_path}\nException was:")
+ raise e
+ return AppSettings(**settings)
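
The settings class above can also be exercised outside the cluster; a minimal sketch is shown below. The `LLM_UI_` environment variable prefix follows the `env_prefix="llm_ui_"` config in `AppSettings`, and the values used are illustrative only.

```python
# Minimal sketch: any AppSettings field can be overridden via an LLM_UI_* env
# var (pydantic-settings, env_prefix="llm_ui_"); init kwargs take precedence
# over env vars, which take precedence over the class defaults.
import os
from config import AppSettings

os.environ["LLM_UI_PAGE_TITLE"] = "My Custom Chatbot"

settings = AppSettings(prompt_template="[INST] {context} [/INST]")
print(settings.page_title)      # "My Custom Chatbot" - taken from the env var
print(settings.llm_max_tokens)  # 1000 - class default
# Outside Kubernetes, backend_url falls back to http://llm-backend.default.svc
```
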
diff --git a/web-app/example-settings.yml b/web-app/example-settings.yml
new file mode 100644
index 0000000..b691855
--- /dev/null
+++ b/web-app/example-settings.yml
@@ -0,0 +1,29 @@
+prompt_template: |
+  [INST] <<SYS>>
+ You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.
+  <</SYS>>
+ {context}[/INST]
+llm_params:
+  temperature: 0.7
+
+#####
+# Alternative prompt suggestions:
+#####
+
+
+### - Suggested for the Magicoder model
+
+# You are an exceptionally intelligent coding assistant that consistently delivers accurate and reliable responses to user instructions.
+
+# @@ Instruction
+# {context}
+
+# @@ Response
+
+
+### - For some fun responses...
+
+# [INST] <<SYS>>
+# You are a cheeky, disrespectful and comedic assistant. Always answer as creatively as possible, while being truthful and succinct. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature. If a question does not make any sense, or is not factually coherent, tell the user that they are being stupid. If you don't know the answer to a question, please don't share false information.
+# <</SYS>>
+# {context}[/INST]
diff --git a/web-app/prompt_helpful.txt b/web-app/prompt_helpful.txt
new file mode 100644
index 0000000..4dea02f
--- /dev/null
+++ b/web-app/prompt_helpful.txt
@@ -0,0 +1,4 @@
+[INST] <<SYS>>
+You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.
+<</SYS>>
+{context}[/INST]
\ No newline at end of file
diff --git a/web-app/prompt_unhelpful.txt b/web-app/prompt_unhelpful.txt
new file mode 100644
index 0000000..5bd1672
--- /dev/null
+++ b/web-app/prompt_unhelpful.txt
@@ -0,0 +1,4 @@
+[INST] <<SYS>>
+You are a cheeky, disrespectful and comedic assistant. Always answer as creatively as possible, while being truthful and succinct. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature. If a question does not make any sense, or is not factually coherent, tell the user that they are being stupid. If you don't know the answer to a question, please don't share false information.
+<</SYS>>
+{context}[/INST]
\ No newline at end of file