Enable LLM inference with llama.cpp and llama-cpp-python (#33)
* Add models tab UI for model conversion

* Add changes to models tab UI

* Add changes to models tab UI

* Feature/llamacpp (#22)

* Initial download snapshot & convert to GGUF using llama.cpp 🐑

* Chat 💬 completion with llama.cpp

* added llama.cpp requirement

* model conversion

* HF snapshot download fix

* Implement CMake build support and enhance text generation using the GGUF model.

* Implement dynamic model path and make quantized_model directory

* Add py_cmd to configs using Makefile

* Add py_cmd to configs with cmake, dynamic python command for conversion

---------

Co-authored-by: parveen kumar <[email protected]>
Co-authored-by: Subhanshu0027 <[email protected]>

* Feat: download and convert, select model from models tab (#24)

* Feat: download and convert, select model from models tab

* Refactor: remove unused line

* Remove Converted gguf Models & Enhance UI in Models Tab (#26)

* Add feature to remove converted gguf models & UI changes in models tab

* Add remove model functionality to core.py

* Optimize code formatting

* Update README.md with new features

* Select model from chat tab (#25)

* feat: Add support for selecting execution provider, CPU or GPU with CUDA (#29)

* refactor: Remove all remnants of transformers inference logic and associated code, fix removeModel (#30)

* docs: Add installation and development tips, update (#32)

* fix: dropdown for initially empty saved_gguf_models_list (#34)

* fix: Model list not updating after download snapshot, remove model (#35)

* fix: Model list not updating after download snapshot

* fix: remove model

---------

Co-authored-by: Subhanshu0027 <[email protected]>
Co-authored-by: Subhanshu0027 <[email protected]>
Co-authored-by: Juggernaut <[email protected]>
4 people authored May 8, 2024
1 parent b1b7053 commit 01417f3
Showing 11 changed files with 399 additions and 161 deletions.
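
As context for the diff below, here is a minimal, hedged sketch of the kind of inference this commit enables via llama-cpp-python. It assumes a model already converted to GGUF under `src/quantized_model/` (the path follows the default repo id used by the app) and a reasonably recent `llama-cpp-python`; it is illustrative only, not the actual `webui.py` wiring.

```python
# Illustrative only: load a converted GGUF model and run a plain completion.
from llama_cpp import Llama

llm = Llama(
    model_path="src/quantized_model/stabilityai__stable-code-instruct-3b.gguf",
    n_ctx=2048,  # context window size
)

output = llm("Write a Python function that reverses a string.", max_tokens=128)
print(output["choices"][0]["text"])
```
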
8 changes: 8 additions & 0 deletions .gitignore
@@ -4,5 +4,13 @@ models/.locks
models/tmp*
configs/config.ini

src/original_model/*
src/quantized_model/*

src/llama_cpp

#ignore build folder(for cmake)
build

#compiled files
*.pyc
42 changes: 42 additions & 0 deletions CMakeLists.txt
@@ -0,0 +1,42 @@
# Minimum required CMake version
cmake_minimum_required(VERSION 3.15)

# Project name
project(llama_cpp)

# Git repository location
set(REPO_URL "https://github.com/ggerganov/llama.cpp")

# Requirements file
set(REQUIREMENTS_FILE "requirements.txt")

# Llama directory
set(LLAMA_DIR "${PROJECT_SOURCE_DIR}/src/llama_cpp")

# Check for Python and Git using CMake's FIND_PACKAGE
find_package(PythonLibs REQUIRED)
find_package(Git REQUIRED)

# Download and clone the llama.cpp repository
execute_process(
    COMMAND git clone ${REPO_URL} ${LLAMA_DIR}
    RESULT_VARIABLE git_result
)

# Error handling for Git clone
if(NOT ${git_result} EQUAL 0)
    message(FATAL_ERROR "Failed to clone llama.cpp repository")
endif()

# Install Python requirements
execute_process(
    COMMAND pip install -r "${LLAMA_DIR}/${REQUIREMENTS_FILE}"
)

file(MAKE_DIRECTORY "${PROJECT_SOURCE_DIR}/src/quantized_model")

find_program(PYTHON NAMES python python3)

if(PYTHON)
    file(APPEND "${PROJECT_SOURCE_DIR}/configs/config.ini" "py_cmd = ${PYTHON}")
endif()
43 changes: 43 additions & 0 deletions Makefile
@@ -0,0 +1,43 @@
# Makefile to clone llama.cpp repository and install requirements

# Variables
REPO_URL := https://github.com/ggerganov/llama.cpp
REQUIREMENTS_FILE := requirements.txt
LLAMA_DIR := src/llama_cpp

# Determine pip command
PIP := $(shell command -v pip3 2>/dev/null || command -v pip)

# Check if python and git are installed
PYTHON := $(shell command -v python 2>/dev/null || command -v python3 2>/dev/null)
GIT := $(shell command -v git)

ifeq ($(PYTHON),)
$(error Python is not installed. Please install Python before running this Makefile.)
endif

ifeq ($(GIT),)
$(error Git is not installed. Please install Git before running this Makefile.)
endif

# Targets
.PHONY: all clone install clean quantized_model_dir append_to_configs

all: clone install quantized_model_dir append_to_configs

clone:
	mkdir -p $(LLAMA_DIR)
	git clone $(REPO_URL) $(LLAMA_DIR)

install:
	cd $(LLAMA_DIR) && \
	$(PIP) install -r $(REQUIREMENTS_FILE)

quantized_model_dir:
	mkdir -p src/quantized_model

append_to_configs:
	echo "py_cmd = $(PYTHON)" >> configs/config.ini

clean:
	rm -rf $(LLAMA_DIR)
81 changes: 67 additions & 14 deletions README.md
@@ -1,18 +1,19 @@
## LLMinator: Run & Test LLMs locally
#### Gradio based tool with integrated chatbot to locally run & test LLMs directly from HuggingFace.

An easy-to-use tool made with Gradio, LangChain, and Torch.

![image](https://github.com/Aesthisia/LLMinator/assets/91900622/54cc0b3f-c5a8-4470-bcc5-a22e5fd24707)
#### Gradio based tool with integrated chatbot to locally run & test LLMs directly from HuggingFace.

An easy-to-use tool made with Gradio, LangChain, and Torch.

![LLMinator chat tab](https://github.com/Aesthisia/LLMinator/assets/89995648/0c7fd00f-610b-4ad1-8736-1f0cb7d212de)
![LLMinator models tab](https://github.com/Aesthisia/LLMinator/assets/89995648/44c03281-fb76-40c6-b1d3-2e395562ae16)

### ⚡ Features

- Context-aware Chatbot.
- Inbuilt code syntax highlighting.
- Context-aware Chatbot.
- Inbuilt code syntax highlighting.
- Load any LLM repo directly from HuggingFace.
- Supports both CPU & Cuda modes.
- Supports both CPU & CUDA modes.
- Enable LLM inference with [llama.cpp](https://github.com/ggerganov/llama.cpp) using [llama-cpp-python](https://github.com/abetlen/llama-cpp-python)

## 🚀 How to use

@@ -21,17 +22,69 @@ To use LLMinator, follow these simple steps:
- Clone the LLMinator repository from GitHub.
- Navigate to the directory containing the cloned repository.
- Install the required dependencies by running `pip install -r requirements.txt`.
- Build LLMinator with llama.cpp:

  - Using `make`:

    - On Linux or MacOS:

      ```bash
      make
      ```

    - On Windows:

      1. Download the latest Fortran version of [w64devkit](https://github.com/skeeto/w64devkit/releases).
      2. Extract `w64devkit` on your PC.
      3. Run `w64devkit.exe`.
      4. Use the `cd` command to reach the `LLMinator` folder.
      5. From here you can run:
         ```bash
         make
         ```

  - Using `CMake`:
    ```bash
    mkdir build
    cd build
    cmake ..
    ```

- Run the LLMinator tool using the command `python webui.py`.
- Access the web interface by opening the provided URL in your browser.
- Start interacting with the chatbot and experimenting with LLMs!

### Command line arguments
### Command line arguments

| Argument Command | Default | Description |
| ---------------- | --------- | --------------------------------------------------------------------------- |
| --host | 127.0.0.1 | Host or IP address on which the server will listen for incoming connections |
| --port | 7860 | Launch gradio with given server port |
| --share | False | This generates a public shareable link that you can send to anybody |
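
For illustration, a hedged sketch of how these flags are typically parsed and handed to Gradio's `launch()`; the argument names and defaults match the table above, but the actual `webui.py` implementation may differ.

```python
# Hedged sketch of the CLI flags above; webui.py's real parsing may differ.
import argparse

parser = argparse.ArgumentParser(description="LLMinator")
parser.add_argument("--host", default="127.0.0.1", help="Host or IP address to listen on")
parser.add_argument("--port", type=int, default=7860, help="Gradio server port")
parser.add_argument("--share", action="store_true", help="Generate a public shareable link")
args = parser.parse_args()

# demo.launch(server_name=args.host, server_port=args.port, share=args.share)
```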

## Installation and Development Tips

**Python Version:**

- **Compatible Versions:** This project is compatible with Python versions 3.8+ to 3.11. Ensure you have one of these versions installed on your system. You can check your Python version by running `python --version` or `python3 --version` in your terminal.

**Cmake and C Compiler:**

- **Cmake Dependency:** If you plan to build the project using Cmake, make sure you have Cmake installed.
- **C Compiler:** Additionally, you'll need a C compiler such as GCC. These are typically included with most Linux distributions. You can check this by running `gcc --version` in your terminal. Installation instructions for your specific operating system can be found online.

**Visual Studio Code:**

- **Visual Studio Installer:** If you're using Visual Studio Code for development, you'll need the C++ development workload installed. You can achieve this through the [Visual Studio Installer](https://visualstudio.microsoft.com/vs/features/cplusplus/)

**GPU Acceleration (CUDA):**

- **CUDA Installation:** To leverage GPU acceleration, you'll need CUDA installed on your system. Download instructions are available on the [NVIDIA website](https://developer.nvidia.com/cuda-toolkit).
- **Torch Compatibility:** After installing CUDA, confirm CUDA availability with `torch.cuda.is_available()`. When using a GPU, ensure you follow the project's specific `llama-cpp-python` installation configuration for CUDA support.
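
A small sketch of the CUDA check mentioned above. `torch` is already a project dependency; the `"cuda"`/`"cpu"` strings are illustrative and not necessarily the exact values the app stores in `execution_provider`.

```python
# Confirm CUDA availability before selecting the GPU execution provider.
import torch

if torch.cuda.is_available():
    print("CUDA available:", torch.cuda.get_device_name(0))
    execution_provider = "cuda"  # illustrative value
else:
    print("CUDA not available, falling back to CPU.")
    execution_provider = "cpu"  # illustrative value
```
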
## Reporting Issues:
| Argument Command | Default | Description |
| ---------- | ---------- | ---------- |
| --host | 127.0.0.1 | Host or IP address on which the server will listen for incoming connections |
| --port | 7860 | Launch gradio with given server port |
| --share | False | This generates a public shareable link that you can send to anybody |
If you encounter any errors or issues, feel free to file a detailed report in the project's repository. We're always happy to help! When reporting an issue, please provide as much information as possible, including the error message, logs, the steps you took, and your system configuration. This makes it easier for us to diagnose and fix the problem quickly.
## 🤝 Contributions
@@ -42,4 +95,4 @@ We welcome contributions from the community to enhance LLMinator further. If you
- Test your changes thoroughly.
- Submit a pull request, providing a clear description of the changes you've made.
Reach out to us: [email protected]
Reach out to us: [email protected]
2 changes: 2 additions & 0 deletions __init__.py
@@ -0,0 +1,2 @@


1 change: 0 additions & 1 deletion configs/config.ini
@@ -1,4 +1,3 @@
[Settings]
execution_provider =
repo_id =

44 changes: 25 additions & 19 deletions core.py
@@ -1,30 +1,36 @@
import os, shutil
from configparser import ConfigParser
import gradio as gr

default_repo_id = "stabilityai/stable-code-instruct-3b"
config_path = "configs/config.ini"
default_repo_id_parts = default_repo_id.split("/")
default_model_folder = f"models--{'--'.join(default_repo_id_parts)}"
cache_gguf_dir = os.path.join(os.getcwd(), "src/quantized_model")
cache_original_dir = os.path.join(os.getcwd(), "src/original_model")

def format_model_name(directory_name):
    parts = directory_name.split("--")
    return "/".join(parts[1:])
def format_gguf_model_name(file_name):
    parts = file_name.replace('.gguf', '').split("__")
    return "/".join(parts)

def list_download_models(cache_dir):
    contents = os.listdir(cache_dir)
    directories = [format_model_name(item) for item in contents if os.path.isdir(os.path.join(cache_dir, item)) and item.startswith("models")]
    return directories
def list_converted_gguf_models(cache_gguf_dir):
    contents = os.listdir(cache_gguf_dir)
    model_files = [format_gguf_model_name(item) for item in contents]
    return model_files

def remove_dir(path):
    try:
        for model in os.listdir(path):
            if model != default_model_folder:
                model_path = os.path.join(path, model)
                if os.path.isdir(model_path):
                    shutil.rmtree(model_path)
        print("successfully removed cached models!")
    except OSError as e:
        print(f"Error: {e.strerror}")
def removeModelFromCache(model_name):
    config = ConfigParser()
    config.read(config_path)
    repo_id = config.get('Settings', 'repo_id')
    if model_name == repo_id:
        raise gr.Error("Can not delete default model")
    else:
        gguf_model_name = model_name.replace("/", "__") + ".gguf"
        original_model_parts = model_name.split("/")
        original_model_name = f"model--{'--'.join(original_model_parts)}"
        try:
            os.remove(os.path.join(cache_gguf_dir, gguf_model_name))
            shutil.rmtree(os.path.join(cache_original_dir, original_model_name))
        except FileNotFoundError:
            raise gr.Error("Model not found in cache.")

def read_config():
    config = ConfigParser()
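
For reference, a hedged usage sketch of the new helpers in `core.py`; it assumes the script is run from the repository root, and the model name passed to `removeModelFromCache` is purely illustrative.

```python
# Hedged usage sketch of the new core.py helpers (run from the repo root).
from core import cache_gguf_dir, list_converted_gguf_models, removeModelFromCache

# List converted GGUF models, e.g. ["stabilityai/stable-code-instruct-3b"].
print(list_converted_gguf_models(cache_gguf_dir))

# Remove a cached model; raises gr.Error if it is the configured default
# model or if it is not found in the cache.
removeModelFromCache("TinyLlama/TinyLlama-1.1B-Chat-v1.0")  # illustrative repo id
```
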
1 change: 0 additions & 1 deletion models/models.txt

This file was deleted.

7 changes: 4 additions & 3 deletions requirements.txt
@@ -1,4 +1,5 @@
gradio==4.24.0
gradio==4.27.0
huggingface_hub==0.21.1
langchain==0.1.14
torch==2.2.1
transformers==4.39.1
torch==2.1.2
llama-cpp-python==0.1.9
41 changes: 41 additions & 0 deletions src/quantize.py
@@ -0,0 +1,41 @@
import subprocess, os
from huggingface_hub import snapshot_download
from configparser import ConfigParser

config_path = "./configs/config.ini"

def get_py_cmd():
    config = ConfigParser()
    config.read(config_path)
    py_cmd = config.get('Settings', 'py_cmd')
    if "python3" in py_cmd:
        return 'python3'
    else:
        return 'python'

def quantize_model(repo_id):
    original_models_path = "./src/original_model/"
    quantized_path = "./src/quantized_model/"

    repo_id_parts = repo_id.split("/")
    model_folder = f"model--{'--'.join(repo_id_parts)}"
    model_path = original_models_path + model_folder

    outfile = quantized_path + repo_id.replace("/", "__") + ".gguf"

    if os.path.isfile(outfile):
        return outfile

    snapshot_download(repo_id=repo_id, local_dir=model_path, local_dir_use_symlinks=True)

    command = [
        get_py_cmd(),
        './src/llama_cpp/convert-hf-to-gguf.py',
        model_path,
        '--outtype', 'f16',
        '--outfile', outfile
    ]

    subprocess.run(command, check=True)

    return outfile
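
A hedged usage sketch of `quantize_model`: it assumes llama.cpp has been cloned into `src/llama_cpp` by the Makefile/CMake step, `py_cmd` is present in `configs/config.ini`, and `src` is importable as a package (the app may wire this up differently).

```python
# Hedged sketch: download a HF snapshot and convert it to GGUF (f16) via
# llama.cpp's convert-hf-to-gguf.py; returns the cached path if it already exists.
from src.quantize import quantize_model

gguf_path = quantize_model("stabilityai/stable-code-instruct-3b")
print("GGUF written to:", gguf_path)
# e.g. ./src/quantized_model/stabilityai__stable-code-instruct-3b.gguf
```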