-
Notifications
You must be signed in to change notification settings - Fork 115
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
fixing llamacpp python docs and makefile
Signed-off-by: greg pereira <[email protected]>
- Loading branch information
1 parent
9d63c26
commit 7346200
Showing
3 changed files
with
75 additions
and
36 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
# Python bytecode caches.
# NOTE(review): the conventional pattern is "__pycache__/"; "*_pycache_*" also
# matches it but is broader — confirm the extra breadth is intended.
*_pycache_*
port_check.lock
*build
# Ignore downloaded model blobs, but keep the checked-in helper files
# (negation patterns must follow the directory wildcard they re-include from).
models/*
!models/convert_models/*
!models/Containerfile
!models/README.md
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
APP := llamacpp_python
PORT := 8001

IMAGE := quay.io/ai-lab/model_servers/$(APP):latest
CUDA_IMAGE := quay.io/ai-lab/model_servers/$(APP)_cuda:latest
VULKAN_IMAGE := quay.io/ai-lab/model_servers/$(APP)_vulkan:latest

# ----- MODEL OPTIONS -----

LLAMA_MODEL_NAME := llama-2-7b-chat.Q5_K_S.gguf
LLAMA_MODEL_URL := https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF/resolve/main/llama-2-7b-chat.Q5_K_S.gguf

MISTRAL_MODEL_NAME := mistral-7b-instruct-v0.1.Q4_K_M.gguf
MISTRAL_MODEL_URL := https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.1-GGUF/resolve/main/mistral-7b-instruct-v0.1.Q4_K_M.gguf

# --- END MODEL OPTIONS ---

# CHOOSE MODEL HERE BY NAME.
# The comment must stay on its own line: an inline "#" after the value would
# leave trailing whitespace inside MODEL_PATH_FROM_ROOT and corrupt the
# "-v" mount spec and MODEL_PATH below.
MODEL_PATH_FROM_ROOT := locallm/models/$(MISTRAL_MODEL_NAME)
RELATIVE_MODEL_PATH := ../../models

# SELinux-enabled (Linux) hosts need the ":Z" relabel flag on bind mounts.
BIND_MOUNT_OPTIONS := ro
OS := $(shell uname -s)
ifeq ($(OS),Linux)
BIND_MOUNT_OPTIONS := ro,Z
endif

# Was ".Phony" — the special target name is case-sensitive, so "all" was
# never actually marked phony.
.PHONY: all
# Was "download-mistral-7b-instruct-v0.1.Q4_K_M.gguf": that target no longer
# exists in this Makefile; the download rule is named download-model-mistral.
all: build download-model-mistral run

.PHONY: build
build:
	podman build -t $(IMAGE) . -f base/Containerfile

.PHONY: build-cuda
build-cuda:
	podman build -t $(CUDA_IMAGE) . -f cuda/Containerfile

# Was building from cuda/Containerfile — copy/paste error.
.PHONY: build-vulkan
build-vulkan:
	podman build -t $(VULKAN_IMAGE) . -f vulkan/Containerfile

# In a phony target "$@" expands to the target name (download-model-llama),
# so curl would have written a file literally named after the target.
# Use the model-name variable instead; "-z" keeps the download incremental
# and the .tmp + mv -f dance keeps the write atomic.
.PHONY: download-model-llama
download-model-llama:
	cd $(RELATIVE_MODEL_PATH) && \
	curl -s -S -L -f $(LLAMA_MODEL_URL) -z $(LLAMA_MODEL_NAME) -o $(LLAMA_MODEL_NAME).tmp && mv -f $(LLAMA_MODEL_NAME).tmp $(LLAMA_MODEL_NAME) 2>/dev/null || rm -f $(LLAMA_MODEL_NAME).tmp $(LLAMA_MODEL_NAME)

.PHONY: download-model-mistral
download-model-mistral:
	cd $(RELATIVE_MODEL_PATH) && \
	curl -s -S -L -f $(MISTRAL_MODEL_URL) -z $(MISTRAL_MODEL_NAME) -o $(MISTRAL_MODEL_NAME).tmp && mv -f $(MISTRAL_MODEL_NAME).tmp $(MISTRAL_MODEL_NAME) 2>/dev/null || rm -f $(MISTRAL_MODEL_NAME).tmp $(MISTRAL_MODEL_NAME)

# Install the Python test dependencies (used by the "test" target).
.PHONY: install
install:
	pip install -r tests/requirements.txt

# Run the server container from the repo root so "./models" resolves to the
# shared models directory.
# NOTE(review): the -v destination ($(MODEL_PATH_FROM_ROOT)) is a relative
# path and names the model *file*, while the host side is the models
# *directory*; podman requires an absolute container destination — confirm
# the intended mount is ./models:/locallm/models.
# NOTE(review): -p is redundant with --net=host — confirm which is intended.
.PHONY: run
run:
	cd ../.. && \
	podman run -it -d -p $(PORT):$(PORT) -v ./models:$(MODEL_PATH_FROM_ROOT):$(BIND_MOUNT_OPTIONS) -e MODEL_PATH=$(MODEL_PATH_FROM_ROOT) -e HOST=0.0.0.0 -e PORT=$(PORT) --net=host $(IMAGE)

# Build, download the model, start the server, then run the pytest suite.
# $(MAKE) (not bare "make") propagates flags and the jobserver.
.PHONY: test
test:
	$(MAKE) all
	pytest --log-cli-level NOTSET
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters