diff --git a/.github/ISSUE_TEMPLATE/QA_checklist.md b/.github/ISSUE_TEMPLATE/QA_checklist.md new file mode 100644 index 000000000..a0c68eb38 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/QA_checklist.md @@ -0,0 +1,173 @@ +--- +name: QA Checklist +about: QA Checklist +title: 'QA: [VERSION]' +labels: 'type: QA checklist' +assignees: '' +--- +**QA details:** + +Version: `v1.0.x-xxx` + +OS (select one) +- [ ] Windows 11 (online & offline) +- [ ] Ubuntu 24, 22 (online & offline) +- [ ] Mac Silicon OS 14/15 (online & offline) +- [ ] Mac Intel (online & offline) + +-------- + +# 1. Manual QA (CLI) +## Installation +- [ ] it should install with local installer (default; no internet required during installation, all dependencies bundled) +- [ ] it should install with network installer +- [ ] it should install 2 binaries (cortex and cortex-server) [mac: binaries in `/usr/local/bin`] +- [ ] it should install with correct folder permissions +- [ ] it should install with folders: /engines /logs (no /models folder until model pull) +- [ ] It should install with Docker image https://cortex.so/docs/installation/docker/ + +## Data/Folder structures +- [ ] cortex.so models are stored in `cortex.so/model_name/variants/`, with .gguf and model.yml file +- [ ] huggingface models are stored in `huggingface.co/author/model_name` with .gguf and model.yml file +- [ ] downloaded models are saved in cortex.db with the right fields: `model`, `author_repo_id`, `branch_name`, `path_to_model_yaml` (view via SQL) + +## Cortex Update +- [ ] cortex -v should output the current version and check for updates +- [ ] cortex update replaces the app, installer, uninstaller and binary file (without installing cortex.llamacpp) +- [ ] `cortex update` should update from ~3-5 versions ago to latest (+3 to 5 bump) +- [ ] `cortex update` should update from the previous version to latest (+1 bump) +- [ ] `cortex update -v 1.x.x-xxx` should update from the previous version to specified version +- [ ] `cortex update` should update from previous stable version to latest +- [ ] it should gracefully update when server is actively running + +## Overall / App Shell +- [ ] cortex returns helpful text in a timely* way (< 5s) +- [ ] `cortex` or `cortex -h` displays help commands +- [ ] CLI commands should start the API server, if not running [except +- [ ] it should correctly log to cortex-cli.log and cortex.log +- [ ] There should be no stdout from an inactive shell session + +## Engines +- [ ] llama.cpp should be installed by default +- [ ] it should run gguf models on llamacpp +- [ ] it should list engines +- [ ] it should get engines +- [ ] it should install engines (latest version if not specified) +- [ ] it should install engines (with specified variant and version) +- [ ] it should get default engine +- [ ] it should set default engine (with specified variant/version) +- [ ] it should load engine +- [ ] it should unload engine +- [ ] it should update engine (to latest version) +- [ ] it should update engine (to specified version) +- [ ] it should uninstall engines +- [ ] it should gracefully continue engine installation if interrupted halfway (partial download) +- [ ] it should gracefully handle when users try to CRUD incompatible engines (No variant found for xxx) +- [ ] it should run trtllm models on trt-llm [WIP, not tested] +- [ ] it should handle engine variants [WIP, not tested] +- [ ] it should update engine versions [WIP, not tested] + +## Server +- [ ] `cortex start` should start server and output localhost URL & port number +- [ ] users
can access API Swagger documentation page at localhost URL & port number +- [ ] `cortex start` can be configured with parameters (port, [logLevel [WIP]](https://github.com/janhq/cortex.cpp/pull/1636)) https://cortex.so/docs/cli/start/ +- [ ] it should correctly log to cortex logs (logs/cortex.log, logs/cortex-cli.log) +- [ ] `cortex ps` should return server status and running models (or no model loaded) +- [ ] `cortex stop` should stop server + +## Model Pulling +- [ ] Pulling a model should pull .gguf and model.yml file +- [ ] Model download progress should appear as download bars for each file +- [ ] Model download progress should be accurate (%, total time, download size, speed) +### cortex.so +- [ ] it should pull by built-in model_ID +- [ ] pull by model_ID should recommend the default variant at the top (set in HF model.yml) +- [ ] it should pull by built-in model_id:variant +### huggingface.co +- [ ] it should pull by HF repo/model ID +- [ ] it should pull by full HF URL (ending in .gguf) +### Interrupted Download +- [ ] it should allow the user to interrupt / stop the download +- [ ] pulling again after interruption should accurately calculate the remainder of the model file size needed to be downloaded (`Found unfinished download! Additional XGB needs to be downloaded`) +- [ ] it should allow the user to continue downloading the remainder after interruption + +## Model Management +- [ ] it should list downloaded models +- [ ] it should get a local model +- [ ] it should update model parameters in model.yaml +- [ ] it should delete a model +- [ ] it should import models with model_id and model_path + +## Model Running +- [ ] `cortex run <model_id>` - if no local models detected, shows `pull` model menu +- [ ] `cortex run` - if local model detected, runs the local model +- [ ] `cortex run` - if multiple local models detected, shows list of local models (from multiple model sources, e.g. cortexso, HF authors) for users to select (via regex search) +- [ ] `cortex run <model_id>` should gracefully return `Model not found!` +- [ ] `cortex run` should autostart the server +- [ ] `cortex run <model_id>` starts interactive chat (by default) +- [ ] `cortex run -d` runs in detached mode +- [ ] `cortex models start <model_id>` +- [ ] terminating stdin or `exit()` should exit interactive chat + +## Hardware Detection / Acceleration [WIP, no need to QA] +- [ ] it should auto offload max ngl +- [ ] it should correctly detect available GPUs +- [ ] it should gracefully detect missing dependencies/drivers +CPU Extension (e.g. AVX-2, noAVX, AVX-512) +GPU Acceleration (e.g. CUDA11, CUDA12, Vulkan, sycl, etc.) + +## Uninstallation / Reinstallation +- [ ] it should uninstall 2 binaries (cortex and cortex-server) +- [ ] it should uninstall with 2 options: delete or keep the data folder +- [ ] it should gracefully uninstall when server is still running +- [ ] uninstalling should not leave any dangling files +- [ ] uninstalling should not leave any dangling processes +- [ ] it should reinstall without having conflict issues with existing cortex data folders + +-------- +# 2.
API QA + +## Checklist for each endpoint +- [ ] Upon `cortex start`, the API page is displayed at the localhost:port endpoint +- [ ] Endpoints should support the parameters stated in the API reference (towards OpenAI compatibility) +- [ ] https://cortex.so/api-reference is updated + +## Endpoints +### Chat Completions +- [ ] POST `v1/chat/completions` (see the curl sketch below) +- [ ] Cortex supports Function Calling #295 + +### Engines +- [ ] List engines: GET `/v1/engines` +- [ ] Get engine: GET `/v1/engines/{name}` +- [ ] Install engine: POST `/v1/engines/install/{name}` +- [ ] Get default engine variant/version: GET `v1/engines/{name}/default` +- [ ] Set default engine variant/version: POST `v1/engines/{name}/default` +- [ ] Load engine: POST `v1/engines/{name}/load` +- [ ] Unload engine: DELETE `v1/engines/{name}/load` +- [ ] Update engine: POST `v1/engines/{name}/update` +- [ ] Uninstall engine: DELETE `/v1/engines/install/{name}` + +### Pulling Models +- [ ] Pull model: POST `/v1/models/pull` starts download (websockets) +- [ ] Pull model: `websockets /events` emitted +- [ ] Stop model download: DELETE `/v1/models/pull` (websockets) +- [ ] Stop model download: `websockets /events` stopped +- [ ] Import model: POST `v1/models/import` + +### Running Models +- [ ] List models: GET `v1/models` +- [ ] Start model: POST `/v1/models/start` +- [ ] Stop model: POST `/v1/models/stop` +- [ ] Get model: GET `/v1/models/{id}` +- [ ] Delete model: DELETE `/v1/models/{id}` +- [ ] Update model: PATCH `/v1/models/{model}` updates model.yaml params + +## Server +- [ ] CORS [WIP] +- [ ] Health check: GET `/healthz` +- [ ] Terminate server: DELETE `/processManager/destroy` +-------- +Test list for reference: +- #1357 e2e tests for APIs in CI +- #1147, #1225 for starting QA list \ No newline at end of file diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md deleted file mode 100644 index 837bdb720..000000000 --- a/.github/ISSUE_TEMPLATE/bug_report.md +++ /dev/null @@ -1,38 +0,0 @@ ---- -name: Bug report -about: Create a report to help us improve -title: 'bug: [DESCRIPTION]' -labels: 'type: bug' -assignees: '' - ---- - -**Describe the bug** -A clear and concise description of what the bug is. - -**To Reproduce** -Steps to reproduce the behavior: -1. Go to '...' -2. Click on '....' -3. Scroll down to '....' -4. See error - -**Expected behavior** -A clear and concise description of what you expected to happen. - -**Screenshots** -If applicable, add screenshots to help explain your problem. - -**Desktop (please complete the following information):** - - OS: [e.g. iOS] - - Browser [e.g. chrome, safari] - - Version [e.g. 22] - -**Smartphone (please complete the following information):** - - Device: [e.g. iPhone6] - - OS: [e.g. iOS8.1] - - Browser [e.g. stock browser, safari] - - Version [e.g. 22] - -**Additional context** -Add any other context about the problem here.
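For the `v1/chat/completions` item in the API QA checklist above, a minimal curl smoke test in the style of the e2e scripts removed elsewhere in this diff may help. This is only a sketch, not part of the change itself: the port and model id below are assumptions, to be replaced by the port printed by `cortex start` and the id of a model that has already been pulled and started.

```bash
#!/bin/bash
# Smoke test for POST /v1/chat/completions (QA checklist item above).
# PORT and MODEL are placeholders, not values defined by this PR.
PORT="${PORT:-39281}"
MODEL="${MODEL:-tinyllama}"

status=$(curl --connect-timeout 60 -s -o /tmp/chat-completion-res.log -w "%{http_code}" \
  --location "http://127.0.0.1:${PORT}/v1/chat/completions" \
  --header 'Content-Type: application/json' \
  --data '{
    "model": "'"${MODEL}"'",
    "messages": [{"role": "user", "content": "Hello"}],
    "max_tokens": 50,
    "stream": false
  }')

# Fail loudly and dump the response body if the endpoint did not return 200.
if [[ "$status" -ne 200 ]]; then
  echo "chat completions check failed with status code: $status"
  cat /tmp/chat-completion-res.log
  exit 1
fi

echo "chat completions check passed"
cat /tmp/chat-completion-res.log
```

The same pattern (capture the HTTP status with `-w "%{http_code}"`, log the body, and print it on failure) extends to the engines and models endpoints listed above.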
diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml new file mode 100644 index 000000000..6684aa985 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.yml @@ -0,0 +1,58 @@ +name: "\U0001F41B Bug Report" +description: "If something isn't working as expected \U0001F914" +labels: [ "type: bug" ] +title: 'bug: [DESCRIPTION]' + +body: + - type: input + validations: + required: true + attributes: + label: "Cortex version" + description: "**Tip:** `cortex -v` outputs the version number" + + - type: textarea + validations: + required: true + attributes: + label: "Describe the issue and expected behaviour" + description: "A clear & concise description of the issue encountered" + + - type: textarea + attributes: + label: "Steps to Reproduce" + description: | + Please list out steps to reproduce the issue + placeholder: | + 1. Go to '...' + 2. Click on '...' + + - type: textarea + attributes: + label: "Screenshots / Logs" + description: | + Please include cortex-cli.log and cortex.log files in: ~/cortex/logs/ + + - type: checkboxes + attributes: + label: "What is your OS?" + options: + - label: Windows + - label: Mac Silicon + - label: Mac Intel + - label: Linux / Ubuntu + + - type: checkboxes + attributes: + label: "What engine are you running?" + options: + - label: cortex.llamacpp (default) + - label: cortex.tensorrt-llm (Nvidia GPUs) + - label: cortex.onnx (NPUs, DirectML) + + - type: input + attributes: + label: "Hardware Specs (e.g. OS version, GPU)" + description: + + \ No newline at end of file diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml new file mode 100644 index 000000000..56e11b10a --- /dev/null +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -0,0 +1,7 @@ +## To encourage contributors to use issue templates, we don't allow blank issues +blank_issues_enabled: false + +contact_links: + - name: "\U0001F4AC Cortex Discussions" + url: "https://github.com/orgs/janhq/discussions/categories/q-a" + about: "Get help, discuss features & roadmap, and share your projects" \ No newline at end of file diff --git a/.github/ISSUE_TEMPLATE/discussion-thread.md b/.github/ISSUE_TEMPLATE/discussion-thread.md deleted file mode 100644 index 09e52ae68..000000000 --- a/.github/ISSUE_TEMPLATE/discussion-thread.md +++ /dev/null @@ -1,14 +0,0 @@ ---- -name: Discussion thread -about: Start an open ended discussion -title: 'Discussion: [TOPIC HERE]' -labels: '' -assignees: '' - ---- - -**Motivation** - -**Discussion** - -**Resources** diff --git a/.github/ISSUE_TEMPLATE/epic-request.md b/.github/ISSUE_TEMPLATE/epic-request.md index bfad8e5f8..9056ec3dc 100644 --- a/.github/ISSUE_TEMPLATE/epic-request.md +++ b/.github/ISSUE_TEMPLATE/epic-request.md @@ -6,15 +6,56 @@ labels: 'type: epic' assignees: '' --- +## Goal -**Problem** -A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] -**Success Criteria** -A clear and concise description of what you want to happen. +## Success Criteria -**Sub Issues** -- -**Additional context** -Add any other context or screenshots about the epic request here. +## Tasklist +- [ ] + +## API / CLI Documentation +### API +#### 1. Feature +``` +GET /v1/endpoint +``` + +Body: +```json +{ + "key": "value" +} +``` +**Response** +```json +200 +{ +} +Error +{ +} +``` + +### CLI +#### 1. Feature +``` +GET /v1/endpoint +``` +Response: +``` +``` +#### Help Command +``` +❯ cortex ... +Usage: +cortex [options] [subcommand] +Options: + -h,--help Print this help message and exit + ... ...
+ +Subcommands: + start Start a model by ID + ... ... +``` \ No newline at end of file diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md deleted file mode 100644 index 26f586bd0..000000000 --- a/.github/ISSUE_TEMPLATE/feature_request.md +++ /dev/null @@ -1,17 +0,0 @@ ---- -name: Feature request -about: Suggest an idea for this project -title: 'feat: [DESCRIPTION]' -labels: 'type: feature request' -assignees: '' - ---- - -**Problem** -A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] - -**Success Criteria** -A clear and concise description of what you want to happen. - -**Additional context** -Add any other context or screenshots about the feature request here. diff --git a/.github/ISSUE_TEMPLATE/feature_request.yml b/.github/ISSUE_TEMPLATE/feature_request.yml new file mode 100644 index 000000000..1d267e500 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature_request.yml @@ -0,0 +1,19 @@ +name: "\U0001F680 Feature Request" +description: "Suggest an idea for this project \U0001F63B!" +title: 'idea: [DESCRIPTION]' +body: + - type: textarea + validations: + required: true + attributes: + label: "Problem Statement" + description: "Describe the problem you're facing" + placeholder: | + I'm always frustrated when ... + + - type: textarea + validations: + required: true + attributes: + label: "Feature Idea" + description: "Describe what you want instead. Examples are welcome!" \ No newline at end of file diff --git a/.github/ISSUE_TEMPLATE/model_request.yml b/.github/ISSUE_TEMPLATE/model_request.yml new file mode 100644 index 000000000..c424de8fc --- /dev/null +++ b/.github/ISSUE_TEMPLATE/model_request.yml @@ -0,0 +1,21 @@ +name: "\U0001F929 Model Request" +description: "Request a new model to be compiled" +title: 'feat: [DESCRIPTION]' +labels: 'type: model request' +body: + - type: markdown + attributes: + value: "**Tip:** Download any model with `cortex pull HUGGINGFACE_MODEL_ID`. Use this form for unsupported models only." + - type: textarea + validations: + required: true + attributes: + label: "Model Requests" + description: "If applicable, include the source URL, licenses, and any other relevant information" + - type: checkboxes + attributes: + label: "Which formats?" 
+ options: + - label: GGUF (llama.cpp) + - label: TensorRT (TensorRT-LLM) + - label: ONNX (Onnx Runtime) diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md new file mode 100644 index 000000000..320545b9a --- /dev/null +++ b/.github/pull_request_template.md @@ -0,0 +1,14 @@ +## Describe Your Changes + +- + +## Fixes Issues + +- Closes # +- Closes # + +## Self Checklist + +- [ ] Added relevant comments, esp in complex areas +- [ ] Updated docs (for bug fixes / features) +- [ ] Created issues for follow-up changes or refactoring needed \ No newline at end of file diff --git a/.github/scripts/e2e-test-llama-linux-and-mac.sh b/.github/scripts/e2e-test-llama-linux-and-mac.sh deleted file mode 100644 index 5b7b9771d..000000000 --- a/.github/scripts/e2e-test-llama-linux-and-mac.sh +++ /dev/null @@ -1,178 +0,0 @@ -#!/bin/bash - -## Example run command -# ./linux-and-mac.sh './jan/plugins/@janhq/inference-plugin/dist/nitro/nitro_mac_arm64' https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v0.3-GGUF/resolve/main/tinyllama-1.1b-chat-v0.3.Q2_K.gguf - -# Check for required arguments -if [[ $# -ne 3 ]]; then - echo "Usage: $0 " - exit 1 -fi - -rm /tmp/load-llm-model-res.log /tmp/completion-res.log /tmp/unload-model-res.log /tmp/load-embedding-model-res.log /tmp/embedding-res.log /tmp/nitro.log - -BINARY_PATH=$1 -DOWNLOAD_LLM_URL=$2 -DOWNLOAD_EMBEDDING_URL=$3 - -# Random port to ensure it's not used -min=10000 -max=11000 -range=$((max - min + 1)) -PORT=$((RANDOM % range + min)) - -# Start the binary file -"$BINARY_PATH" 1 127.0.0.1 $PORT >/tmp/nitro.log & - -# Get the process id of the binary file -pid=$! - -if ! ps -p $pid >/dev/null; then - echo "nitro failed to start. Logs:" - cat /tmp/nitro.log - exit 1 -fi - -# Wait for a few seconds to let the server start -sleep 5 - -# Check if /tmp/testllm exists, if not, download it -if [[ ! -f "/tmp/testllm" ]]; then - curl --connect-timeout 300 $DOWNLOAD_LLM_URL --output /tmp/testllm -fi - -# Check if /tmp/test-embedding exists, if not, download it -if [[ ! -f "/tmp/test-embedding" ]]; then - curl --connect-timeout 300 $DOWNLOAD_EMBEDDING_URL --output /tmp/test-embedding -fi - -# Run the curl commands -response1=$(curl --connect-timeout 60 -o /tmp/load-llm-model-res.log -s -w "%{http_code}" --location "http://127.0.0.1:$PORT/inferences/llamacpp/loadModel" \ - --header 'Content-Type: application/json' \ - --data '{ - "llama_model_path": "/tmp/testllm", - "ctx_len": 50, - "ngl": 32, - "embedding": false -}') - -if ! ps -p $pid >/dev/null; then - echo "nitro failed to load model. 
Logs:" - cat /tmp/nitro.log - exit 1 -fi - -response2=$( - curl --connect-timeout 60 -o /tmp/completion-res.log -s -w "%{http_code}" --location "http://127.0.0.1:$PORT/v1/chat/completions" \ - --header 'Content-Type: application/json' \ - --header 'Accept: text/event-stream' \ - --header 'Access-Control-Allow-Origin: *' \ - --data '{ - "messages": [ - {"content": "Hello there", "role": "assistant"}, - {"content": "Write a long and sad story for me", "role": "user"} - ], - "stream": true, - "model": "gpt-3.5-turbo", - "max_tokens": 50, - "stop": ["hello"], - "frequency_penalty": 0, - "presence_penalty": 0, - "temperature": 0.1 - }' -) - -# unload model -response3=$(curl --connect-timeout 60 -o /tmp/unload-model-res.log --request GET -s -w "%{http_code}" --location "http://127.0.0.1:$PORT/inferences/llamacpp/unloadModel" \ - --header 'Content-Type: application/json' \ - --data '{ - "llama_model_path": "/tmp/testllm" -}') - -# load embedding model -response4=$(curl --connect-timeout 60 -o /tmp/load-embedding-model-res.log -s -w "%{http_code}" --location "http://127.0.0.1:$PORT/inferences/llamacpp/loadModel" \ - --header 'Content-Type: application/json' \ - --data '{ - "llama_model_path": "/tmp/test-embedding", - "ctx_len": 50, - "ngl": 32, - "embedding": true, - "model_type": "embedding" -}') - -# request embedding -response5=$( - curl --connect-timeout 60 -o /tmp/embedding-res.log -s -w "%{http_code}" --location "http://127.0.0.1:$PORT/v1/embeddings" \ - --header 'Content-Type: application/json' \ - --header 'Accept: text/event-stream' \ - --header 'Access-Control-Allow-Origin: *' \ - --data '{ - "input": "Hello", - "model": "test-embedding", - "encoding_format": "float" - }' -) - -error_occurred=0 -if [[ "$response1" -ne 200 ]]; then - echo "The load llm model curl command failed with status code: $response1" - cat /tmp/load-llm-model-res.log - error_occurred=1 -fi - -if [[ "$response2" -ne 200 ]]; then - echo "The completion curl command failed with status code: $response2" - cat /tmp/completion-res.log - error_occurred=1 -fi - -if [[ "$response3" -ne 200 ]]; then - echo "The unload model curl command failed with status code: $response3" - cat /tmp/unload-model-res.log - error_occurred=1 -fi - -if [[ "$response4" -ne 200 ]]; then - echo "The load embedding model curl command failed with status code: $response4" - cat /tmp/load-embedding-model-res.log - error_occurred=1 -fi - -if [[ "$response5" -ne 200 ]]; then - echo "The embedding curl command failed with status code: $response5" - cat /tmp/embedding-res.log - error_occurred=1 -fi - -if [[ "$error_occurred" -eq 1 ]]; then - echo "Nitro test run failed!!!!!!!!!!!!!!!!!!!!!!" - echo "Nitro Error Logs:" - cat /tmp/nitro.log - kill $pid - exit 1 -fi - -echo "----------------------" -echo "Log load model:" -cat /tmp/load-llm-model-res.log - -echo "----------------------" -echo "Log run test:" -cat /tmp/completion-res.log - -echo "----------------------" -echo "Log run test:" -cat /tmp/unload-model-res.log - -echo "----------------------" -echo "Log run test:" -cat /tmp/load-embedding-model-res.log - -echo "----------------------" -echo "Log run test:" -cat /tmp/embedding-res.log - -echo "Nitro test run successfully!" 
- -# Kill the server process -kill $pid diff --git a/.github/scripts/e2e-test-llama-windows.bat b/.github/scripts/e2e-test-llama-windows.bat deleted file mode 100644 index cddca1e0b..000000000 --- a/.github/scripts/e2e-test-llama-windows.bat +++ /dev/null @@ -1,165 +0,0 @@ -@echo off - -set "TEMP=C:\Users\%UserName%\AppData\Local\Temp" -set "MODEL_LLM_PATH=%TEMP%\testllm" -set "MODEL_EMBEDDING_PATH=%TEMP%\test-embedding" - -rem Check for required arguments -if "%~3"=="" ( - echo Usage: %~0 ^ ^ ^ - exit /b 1 -) - -set "BINARY_PATH=%~1" -set "DOWNLOAD_LLM_URL=%~2" -set "DOWNLOAD_EMBEDDING_URL=%~3" - -for %%i in ("%BINARY_PATH%") do set "BINARY_NAME=%%~nxi" - -echo BINARY_NAME=%BINARY_NAME% - -del %TEMP%\response1.log 2>nul -del %TEMP%\response2.log 2>nul -del %TEMP%\response3.log 2>nul -del %TEMP%\response4.log 2>nul -del %TEMP%\response5.log 2>nul -del %TEMP%\nitro.log 2>nul - -set /a min=9999 -set /a max=11000 -set /a range=max-min+1 -set /a PORT=%min% + %RANDOM% %% %range% - -rem Start the binary file -start /B "" "%BINARY_PATH%" 1 "127.0.0.1" %PORT% > %TEMP%\nitro.log 2>&1 - -ping -n 6 127.0.0.1 %PORT% > nul - -rem Capture the PID of the started process with "nitro" in its name -for /f "tokens=2" %%a in ('tasklist /fi "imagename eq %BINARY_NAME%" /fo list ^| findstr /B "PID:"') do ( - set "pid=%%a" -) - -echo pid=%pid% - -if not defined pid ( - echo nitro failed to start. Logs: - type %TEMP%\nitro.log - exit /b 1 -) - -rem Wait for a few seconds to let the server start - -rem Check if %TEMP%\testmodel exists, if not, download it -if not exist "%MODEL_LLM_PATH%" ( - curl.exe --connect-timeout 300 %DOWNLOAD_LLM_URL% --output "%MODEL_LLM_PATH%" -) - -if not exist "%MODEL_EMBEDDING_PATH%" ( - curl.exe --connect-timeout 300 %DOWNLOAD_EMBEDDING_URL% --output "%MODEL_EMBEDDING_PATH%" -) - -rem Define JSON strings for curl data -call set "MODEL_LLM_PATH_STRING=%%MODEL_LLM_PATH:\=\\%%" -call set "MODEL_EMBEDDING_PATH_STRING=%%MODEL_EMBEDDING_PATH:\=\\%%" -set "curl_data1={\"llama_model_path\":\"%MODEL_LLM_PATH_STRING%\"}" -set "curl_data2={\"messages\":[{\"content\":\"Hello there\",\"role\":\"assistant\"},{\"content\":\"Write a long and sad story for me\",\"role\":\"user\"}],\"stream\":false,\"model\":\"gpt-3.5-turbo\",\"max_tokens\":50,\"stop\":[\"hello\"],\"frequency_penalty\":0,\"presence_penalty\":0,\"temperature\":0.1}" -set "curl_data3={\"llama_model_path\":\"%MODEL_LLM_PATH_STRING%\"}" -set "curl_data4={\"llama_model_path\":\"%MODEL_EMBEDDING_PATH_STRING%\", \"embedding\": true, \"model_type\": \"embedding\"}" -set "curl_data5={\"input\": \"Hello\", \"model\": \"test-embedding\", \"encoding_format\": \"float\"}" - -rem Print the values of curl_data for debugging -echo curl_data1=%curl_data1% -echo curl_data2=%curl_data2% -echo curl_data3=%curl_data3% -echo curl_data4=%curl_data4% -echo curl_data5=%curl_data5% - -rem Run the curl commands and capture the status code -curl.exe --connect-timeout 60 -o "%TEMP%\response1.log" -s -w "%%{http_code}" --location "http://127.0.0.1:%PORT%/inferences/llamacpp/loadModel" --header "Content-Type: application/json" --data "%curl_data1%" > %TEMP%\response1.log 2>&1 - -curl.exe --connect-timeout 60 -o "%TEMP%\response2.log" -s -w "%%{http_code}" --location "http://127.0.0.1:%PORT%/inferences/llamacpp/chat_completion" ^ ---header "Content-Type: application/json" ^ ---data "%curl_data2%" > %TEMP%\response2.log 2>&1 - -curl.exe --connect-timeout 60 -o "%TEMP%\response3.log" --request GET -s -w "%%{http_code}" --location 
"http://127.0.0.1:%PORT%/inferences/llamacpp/unloadModel" --header "Content-Type: application/json" --data "%curl_data3%" > %TEMP%\response3.log 2>&1 - -curl.exe --connect-timeout 60 -o "%TEMP%\response4.log" --request POST -s -w "%%{http_code}" --location "http://127.0.0.1:%PORT%/inferences/llamacpp/loadModel" --header "Content-Type: application/json" --data "%curl_data4%" > %TEMP%\response4.log 2>&1 - -curl.exe --connect-timeout 60 -o "%TEMP%\response5.log" -s -w "%%{http_code}" --location "http://127.0.0.1:%PORT%/v1/embeddings" ^ ---header "Content-Type: application/json" ^ ---data "%curl_data5%" > %TEMP%\response5.log 2>&1 - -set "error_occurred=0" - -rem Read the status codes from the log files -for /f %%a in (%TEMP%\response1.log) do set "response1=%%a" -for /f %%a in (%TEMP%\response2.log) do set "response2=%%a" -for /f %%a in (%TEMP%\response3.log) do set "response3=%%a" -for /f %%a in (%TEMP%\response4.log) do set "response4=%%a" -for /f %%a in (%TEMP%\response5.log) do set "response5=%%a" - -if "%response1%" neq "200" ( - echo The first curl command failed with status code: %response1% - type %TEMP%\response1.log - set "error_occurred=1" -) - -if "%response2%" neq "200" ( - echo The second curl command failed with status code: %response2% - type %TEMP%\response2.log - set "error_occurred=1" -) - -if "%response3%" neq "200" ( - echo The third curl command failed with status code: %response3% - type %TEMP%\response3.log - set "error_occurred=1" -) - -if "%response4%" neq "200" ( - echo The fourth curl command failed with status code: %response4% - type %TEMP%\response4.log - set "error_occurred=1" -) - -if "%response5%" neq "200" ( - echo The fifth curl command failed with status code: %response5% - type %TEMP%\response5.log - set "error_occurred=1" -) - -if "%error_occurred%"=="1" ( - echo Nitro test run failed!!!!!!!!!!!!!!!!!!!!!! - echo Nitro Error Logs: - type %TEMP%\nitro.log - taskkill /f /pid %pid% - exit /b 1 -) - - -echo ---------------------- -echo Log load llm model: -type %TEMP%\response1.log - -echo ---------------------- -echo Log run test: -type %TEMP%\response2.log - -echo ---------------------- -echo Log unload model: -type %TEMP%\response3.log - -echo ---------------------- -echo Log load embedding model: -type %TEMP%\response3.log - -echo ---------------------- -echo Log run embedding test: -type %TEMP%\response5.log - -echo Nitro test run successfully! - -rem Kill the server process -@REM taskkill /f /pid %pid% -taskkill /f /im nitro.exe 2>nul || exit /B 0 \ No newline at end of file diff --git a/.github/scripts/e2e-test-whisper-linux-and-mac.sh b/.github/scripts/e2e-test-whisper-linux-and-mac.sh deleted file mode 100755 index 4c8a1e9eb..000000000 --- a/.github/scripts/e2e-test-whisper-linux-and-mac.sh +++ /dev/null @@ -1,93 +0,0 @@ -#!/bin/bash - -## Example run command -# ./linux-and-mac.sh './jan/plugins/@janhq/inference-plugin/dist/nitro/nitro_mac_arm64' https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v0.3-GGUF/resolve/main/tinyllama-1.1b-chat-v0.3.Q2_K.gguf - -# Check for required arguments -if [[ $# -ne 2 ]]; then - echo "Usage: $0 " - exit 1 -fi - -rm /tmp/response1.log /tmp/response2.log /tmp/nitro.log - -BINARY_PATH=$1 -DOWNLOAD_URL=$2 - -# Random port to ensure it's not used -min=10000 -max=11000 -range=$((max - min + 1)) -PORT=$((RANDOM % range + min)) - -# Start the binary file -"$BINARY_PATH" 1 127.0.0.1 $PORT >/tmp/nitro.log & - -# Get the process id of the binary file -pid=$! - -if ! ps -p $pid >/dev/null; then - echo "nitro failed to start. 
Logs:" - cat /tmp/nitro.log - exit 1 -fi - -# Wait for a few seconds to let the server start -sleep 5 - -# Check if /tmp/testwhisper exists, if not, download it -if [[ ! -f "/tmp/testwhisper" ]]; then - curl --connect-timeout 300 $DOWNLOAD_URL --output /tmp/testwhisper -fi - -# Run the curl commands -response1=$(curl --connect-timeout 60 -o /tmp/response1.log -s -w "%{http_code}" --location "http://127.0.0.1:$PORT/v1/audio/load_model" \ - --header 'Content-Type: application/json' \ - --data '{ - "model_path": "/tmp/testwhisper", - "model_id": "whisper.cpp" -}') - -response2=$( - curl --connect-timeout 60 -o /tmp/response2.log -s -w "%{http_code}" --location "http://127.0.0.1:$PORT/v1/audio/transcriptions" \ - --header 'Access-Control-Allow-Origin: *' \ - --form 'file=@"../whisper.cpp/samples/jfk.wav"' \ - --form 'model_id="whisper.cpp"' \ - --form 'temperature="0.0"' \ - --form 'prompt="The transcript is about OpenAI which makes technology like DALL·E, GPT-3, and ChatGPT with the hope of one day building an AGI system that benefits all of humanity. The president is trying to raly people to support the cause."' \ - -) - -error_occurred=0 -if [[ "$response1" -ne 200 ]]; then - echo "The first curl command failed with status code: $response1" - cat /tmp/response1.log - error_occurred=1 -fi - -if [[ "$response2" -ne 200 ]]; then - echo "The second curl command failed with status code: $response2" - cat /tmp/response2.log - error_occurred=1 -fi - -if [[ "$error_occurred" -eq 1 ]]; then - echo "Nitro test run failed!!!!!!!!!!!!!!!!!!!!!!" - echo "Nitro Error Logs:" - cat /tmp/nitro.log - kill $pid - exit 1 -fi - -echo "----------------------" -echo "Log load model:" -cat /tmp/response1.log - -echo "----------------------" -echo "Log run test:" -cat /tmp/response2.log - -echo "Nitro test run successfully!" - -# Kill the server process -kill $pid diff --git a/.github/scripts/e2e-test-whisper-windows.bat b/.github/scripts/e2e-test-whisper-windows.bat deleted file mode 100644 index 6eb2037ea..000000000 --- a/.github/scripts/e2e-test-whisper-windows.bat +++ /dev/null @@ -1,102 +0,0 @@ -@echo off - -set "TEMP=C:\Users\%UserName%\AppData\Local\Temp" -set "MODEL_PATH=%TEMP%\testwhisper" - -rem Check for required arguments -if "%~2"=="" ( - echo Usage: %~0 ^ ^ - exit /b 1 -) - -set "BINARY_PATH=%~1" -set "DOWNLOAD_URL=%~2" - -for %%i in ("%BINARY_PATH%") do set "BINARY_NAME=%%~nxi" - -echo BINARY_NAME=%BINARY_NAME% - -del %TEMP%\response1.log 2>nul -del %TEMP%\response2.log 2>nul -del %TEMP%\nitro.log 2>nul - -set /a min=9999 -set /a max=11000 -set /a range=max-min+1 -set /a PORT=%min% + %RANDOM% %% %range% - -rem Start the binary file -start /B "" "%BINARY_PATH%" 1 "127.0.0.1" %PORT% > %TEMP%\nitro.log 2>&1 - -ping -n 6 127.0.0.1 %PORT% > nul - -rem Capture the PID of the started process with "nitro" in its name -for /f "tokens=2" %%a in ('tasklist /fi "imagename eq %BINARY_NAME%" /fo list ^| findstr /B "PID:"') do ( - set "pid=%%a" -) - -echo pid=%pid% - -if not defined pid ( - echo nitro failed to start. 
Logs: - type %TEMP%\nitro.log - exit /b 1 -) - -rem Wait for a few seconds to let the server start - -rem Check if %TEMP%\testwhisper exists, if not, download it -if not exist "%MODEL_PATH%" ( - curl.exe --connect-timeout 300 %DOWNLOAD_URL% --output "%MODEL_PATH%" -) - -rem Define JSON strings for curl data -call set "MODEL_PATH_STRING=%%MODEL_PATH:\=\\%%" -set "curl_data1={\"model_path\":\"%MODEL_PATH_STRING%\",\"model_id\":\"whisper\"}" - -rem Run the curl commands and capture the status code -curl.exe --connect-timeout 60 -o %TEMP%\response1.log -s -w "%%{http_code}" --location "http://127.0.0.1:%PORT%/v1/audio/load_model" --header "Content-Type: application/json" --data "%curl_data1%" > %TEMP%\response1_code.log 2>&1 - -curl --connect-timeout 60 -o %TEMP%\response2.log -s -w "%%{http_code}" --location "http://localhost:%PORT%/v1/audio/transcriptions" ^ ---form "file=@../..//whisper.cpp/samples/jfk.wav" ^ ---form "model_id=whisper" > %TEMP%\response2_code.log 2>&1 - -set "error_occurred=0" - -rem Read the status codes from the log files -for /f %%a in (%TEMP%\response1_code.log) do set "response1=%%a" -for /f %%a in (%TEMP%\response2_code.log) do set "response2=%%a" - -if "%response1%" neq "200" ( - echo The first curl command failed with status code: %response1% - type %TEMP%\response1.log - set "error_occurred=1" -) - -if "%response2%" neq "200" ( - echo The second curl command failed with status code: %response2% - type %TEMP%\response2.log - set "error_occurred=1" -) - -if "%error_occurred%"=="1" ( - echo Nitro test run failed!!!!!!!!!!!!!!!!!!!!!! - echo Nitro Error Logs: - type %TEMP%\nitro.log - taskkill /f /pid %pid% - exit /b 1 -) - - -echo ---------------------- -echo Log load model: -type %TEMP%\response1.log - -echo ---------------------- -echo "Log run test:" -type %TEMP%\response2.log - -echo Nitro test run successfully! 
- -rem Kill the server process -taskkill /f /im nitro.exe 2>nul || exit /B 0 diff --git a/.github/workflows/beta-build.yml b/.github/workflows/beta-build.yml new file mode 100644 index 000000000..c5c09dcb5 --- /dev/null +++ b/.github/workflows/beta-build.yml @@ -0,0 +1,144 @@ +name: CI Cortex CPP Beta Build + +on: + push: + tags: ["v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+"] + +jobs: + # Job create Update app version based on latest release tag with build number and save to output + get-update-version: + uses: ./.github/workflows/template-get-update-version.yml + + get-cortex-llamacpp-latest-version: + uses: ./.github/workflows/template-cortex-llamacpp-latest-version.yml + + create-draft-release: + runs-on: ubuntu-latest + if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/') + outputs: + upload_url: ${{ steps.create_release.outputs.upload_url }} + version: ${{ steps.get_version.outputs.version }} + permissions: + contents: write + steps: + - name: Extract tag name without v prefix + id: get_version + run: echo "VERSION=${GITHUB_REF#refs/tags/v}" >> $GITHUB_ENV && echo "::set-output name=version::${GITHUB_REF#refs/tags/v}" + env: + GITHUB_REF: ${{ github.ref }} + - name: Create Draft Release + id: create_release + uses: softprops/action-gh-release@v2 + with: + tag_name: ${{ github.ref_name }} + token: ${{ secrets.GITHUB_TOKEN }} + name: "${{ env.VERSION }}" + draft: true + prerelease: false + generate_release_notes: true + + build-macos: + uses: ./.github/workflows/template-build-macos.yml + needs: [get-update-version, create-draft-release, get-cortex-llamacpp-latest-version] + secrets: inherit + with: + ref: ${{ github.ref }} + public_provider: github + new_version: ${{ needs.get-update-version.outputs.new_version }} + cmake-flags: "-DCORTEX_VARIANT=beta -DCORTEX_CPP_VERSION='v${{ needs.get-update-version.outputs.new_version }}' -DCMAKE_TOOLCHAIN_FILE=vcpkg/scripts/buildsystems/vcpkg.cmake" + channel: beta + upload_url: ${{ needs.create-draft-release.outputs.upload_url }} + cortex-llamacpp-version: ${{ needs.get-cortex-llamacpp-latest-version.outputs.cortex_llamacpp_latest_version }} + + build-windows-x64: + uses: ./.github/workflows/template-build-windows-x64.yml + secrets: inherit + needs: [get-update-version, create-draft-release, get-cortex-llamacpp-latest-version] + with: + ref: ${{ github.ref }} + public_provider: github + new_version: ${{ needs.get-update-version.outputs.new_version }} + runs-on: windows-cuda-11-7 + cmake-flags: "-DCORTEX_VARIANT=beta -DCORTEX_CPP_VERSION='v${{ needs.get-update-version.outputs.new_version }}' -DBUILD_SHARED_LIBS=OFF -DCMAKE_TOOLCHAIN_FILE=C:/w/cortex.cpp/cortex.cpp/engine/vcpkg/scripts/buildsystems/vcpkg.cmake -DVCPKG_TARGET_TRIPLET=x64-windows-static -DCMAKE_BUILD_TYPE=RELEASE -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja" + build-deps-cmake-flags: "-DCMAKE_BUILD_TYPE=RELEASE -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja" + ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache' + channel: beta + upload_url: ${{ needs.create-draft-release.outputs.upload_url }} + cortex-llamacpp-version: ${{ needs.get-cortex-llamacpp-latest-version.outputs.cortex_llamacpp_latest_version }} + + build-linux-x64: + uses: ./.github/workflows/template-build-linux-x64.yml + secrets: inherit + needs: [get-update-version, create-draft-release, get-cortex-llamacpp-latest-version] + with: + ref: ${{ github.ref }} + 
public_provider: github + new_version: ${{ needs.get-update-version.outputs.new_version }} + runs-on: ubuntu-20-04 + cmake-flags: "-DCORTEX_VARIANT=beta -DCORTEX_CPP_VERSION='v${{ needs.get-update-version.outputs.new_version }}' -DCMAKE_TOOLCHAIN_FILE=/home/runner/actions-runner/_work/cortex.cpp/cortex.cpp/engine/vcpkg/scripts/buildsystems/vcpkg.cmake" + channel: beta + upload_url: ${{ needs.create-draft-release.outputs.upload_url }} + cortex-llamacpp-version: ${{ needs.get-cortex-llamacpp-latest-version.outputs.cortex_llamacpp_latest_version }} + + build-docker-x64: + uses: ./.github/workflows/template-build-docker-x64.yml + secrets: inherit + needs: [get-update-version, get-cortex-llamacpp-latest-version] + with: + ref: ${{ github.ref }} + new_version: ${{ needs.get-update-version.outputs.new_version }} + runs-on: ubuntu-latest + cmake-extra-flags: "-DCORTEX_VARIANT=prod" + tags: "menloltd/cortex:beta-${{ needs.get-update-version.outputs.new_version }}" + + update_release: + needs: [get-update-version, create-draft-release, build-macos, build-windows-x64, build-linux-x64] + permissions: + # write permission is required to create a github release + contents: write + # write permission is required for autolabeler + # otherwise, read permission is required at least + pull-requests: write + runs-on: ubuntu-latest + steps: + - name: Getting the repo + uses: actions/checkout@v4 + - name: set release to prerelease + run: | + gh release edit v${{ needs.get-update-version.outputs.new_version }} --draft=false --prerelease + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + + noti-discord: + needs: [get-update-version, create-draft-release, build-macos, build-windows-x64, build-linux-x64, update_release] + runs-on: ubuntu-latest + permissions: + contents: write + steps: + - name: Set version to environment variable + run: | + echo "VERSION=${{ needs.get-update-version.outputs.new_version }}" >> $GITHUB_ENV + echo "RUNNER_ID=$GITHUB_RUN_ID" >> $GITHUB_ENV + + - name: Notify Discord + uses: appleboy/discord-action@v1.0.0 + with: + webhook_id: ${{ secrets.WEBHOOK_ID_BETA }} + webhook_token: ${{ secrets.WEBHOOK_TOKEN_BETA }} + message: | + Cortex.cpp beta build artifact version ${{ env.VERSION }}: + - Windows: + - Network Installer: https://github.com/janhq/cortex.cpp/releases/download/v${{ env.VERSION }}/cortex-${{ env.VERSION }}-windows-amd64-network-installer.exe + - Local Installer: https://github.com/janhq/cortex.cpp/releases/download/v${{ env.VERSION }}/cortex-${{ env.VERSION }}-windows-amd64-local-installer.exe + - Binary: https://github.com/janhq/cortex.cpp/releases/download/v${{ env.VERSION }}/cortex-${{ env.VERSION }}-windows-amd64.tar.gz + - macOS Universal: + - Network Installer: https://github.com/janhq/cortex.cpp/releases/download/v${{ env.VERSION }}/cortex-${{ env.VERSION }}-mac-universal-network-installer.pkg + - Local Installer: https://github.com/janhq/cortex.cpp/releases/download/v${{ env.VERSION }}/cortex-${{ env.VERSION }}-mac-universal-local-installer.pkg + - Binary: https://github.com/janhq/cortex.cpp/releases/download/v${{ env.VERSION }}/cortex-${{ env.VERSION }}-mac-universal.tar.gz + - Linux Deb: + - Network Installer: https://github.com/janhq/cortex.cpp/releases/download/v${{ env.VERSION }}/cortex-${{ env.VERSION }}-linux-amd64-network-installer.deb + - Local Installer: https://github.com/janhq/cortex.cpp/releases/download/v${{ env.VERSION }}/cortex-${{ env.VERSION }}-linux-amd64-local-installer.deb + - Binary: 
https://github.com/janhq/cortex.cpp/releases/download/v${{ env.VERSION }}/cortex-${{ env.VERSION }}-linux-amd64.tar.gz + - Docker: menloltd/cortex:beta-${{ env.VERSION }} + - Github Release: https://github.com/janhq/cortex.cpp/releases/tag/v${{ env.VERSION }} \ No newline at end of file diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml deleted file mode 100644 index 90e509d55..000000000 --- a/.github/workflows/build.yml +++ /dev/null @@ -1,758 +0,0 @@ -name: CI - -on: - schedule: - - cron: "0 20 * * *" # At 8 PM UTC, which is 3 AM UTC+7 - push: - tags: ["v[0-9]+.[0-9]+.[0-9]+"] - paths: - [ - ".github/scripts/**", - ".github/workflows/build.yml", - "**/CMakeLists.txt", - "**/Makefile", - "**/*.h", - "**/*.hpp", - "**/*.c", - "**/*.cpp", - "**/*.cu", - "**/*.cc", - "**/*.cxx", - "llama.cpp", - "!docs/**", - "!.gitignore", - "!README.md", - ] - pull_request: - types: [opened, synchronize, reopened] - paths: - [ - ".github/scripts/**", - ".github/workflows/build.yml", - "**/CMakeLists.txt", - "**/Makefile", - "**/*.h", - "**/*.hpp", - "**/*.c", - "**/*.cpp", - "**/*.cu", - "**/*.cc", - "**/*.cxx", - "llama.cpp", - "!docs/**", - "!.gitignore", - "!README.md", - ] - workflow_dispatch: - -env: - BRANCH_NAME: ${{ github.head_ref || github.ref_name }} - LLM_MODEL_URL: https://delta.jan.ai/tinyllama-1.1b-chat-v0.3.Q2_K.gguf - WHISPER_MODEL_URL: https://delta.jan.ai/ggml-tiny-q5_1.bin - EMBEDDING_MODEL_URL: https://catalog.jan.ai/dist/models/embeds/nomic-embed-text-v1.5.f16.gguf - -jobs: - create-draft-release: - runs-on: ubuntu-latest - if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/') - outputs: - upload_url: ${{ steps.create_release.outputs.upload_url }} - version: ${{ steps.get_version.outputs.version }} - permissions: - contents: write - steps: - - name: Extract tag name without v prefix - id: get_version - run: echo "VERSION=${GITHUB_REF#refs/tags/v}" >> $GITHUB_ENV && echo "::set-output name=version::${GITHUB_REF#refs/tags/v}" - env: - GITHUB_REF: ${{ github.ref }} - - name: Create Draft Release - id: create_release - uses: actions/create-release@v1 - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - with: - tag_name: ${{ github.ref_name }} - release_name: "${{ env.VERSION }}" - draft: true - prerelease: false - - # Get the latest version of the release - set-nitro-version: - runs-on: ubuntu-latest - outputs: - version: ${{ steps.version_update.outputs.new_version }} - steps: - - name: Get latest release - id: version_update - run: | - ldd --version - if [[ ${{ github.event_name }} == push && ${{ github.ref }} == refs/tags/* ]]; then - echo "VERSION=${GITHUB_REF#refs/tags/}" - NEW_VERSION="${VERSION#v}" - echo "::set-output name=new_version::$NEW_VERSION" - else - # Function to get the latest release tag - get_latest_tag() { - local retries=0 - local max_retries=3 - local tag - while [ $retries -lt $max_retries ]; do - tag=$(curl -s https://api.github.com/repos/janhq/nitro/releases/latest | jq -r .tag_name) - if [ -n "$tag" ] && [ "$tag" != "null" ]; then - echo $tag - return - else - let retries++ - sleep 2 - fi - done - echo "Failed to fetch latest tag after $max_retries attempts." 
- exit 1 - } - # Get the latest release tag from GitHub API - LATEST_TAG=$(get_latest_tag) - - # Remove the 'v' and append the build number to the version - NEW_VERSION="${LATEST_TAG#v}-${GITHUB_RUN_NUMBER}" - echo "New version: $NEW_VERSION" - echo "::set-output name=new_version::$NEW_VERSION" - fi - echo "Version: $NEW_VERSION" - - ubuntu-amd64-build: - runs-on: ubuntu-18-04-cuda-11-7 - needs: [create-draft-release, set-nitro-version] - if: always() && (needs.create-draft-release.result == 'success' || needs.create-draft-release.result == 'skipped') && needs.set-nitro-version.result == 'success' - timeout-minutes: 40 - permissions: - contents: write - - strategy: - matrix: - include: - - build: "amd64-avx2" - defines: "-DLLAMA_NATIVE=OFF" - - build: "amd64-avx" - defines: "-DLLAMA_AVX2=OFF -DLLAMA_NATIVE=OFF" - - build: "amd64-avx512" - defines: "-DLLAMA_AVX512=ON -DLLAMA_NATIVE=OFF" - - build: "amd64-vulkan" - defines: "-DLLAMA_VULKAN=ON -DLLAMA_NATIVE=OFF" - # - build: "arm64" - # defines: "-A ARM64 -DLLAMA_NATIVE=OFF" - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v3 - with: - submodules: recursive - - - name: Prepare Vulkan SDK - if: ${{ matrix.build == 'amd64-vulkan' }} - uses: humbletim/setup-vulkan-sdk@v1.2.0 - with: - vulkan-query-version: 1.3.275.0 - vulkan-components: Vulkan-Headers, Vulkan-Loader - vulkan-use-cache: true - - - name: Build - id: make_build - run: | - ldd --version - ./install_deps.sh - mkdir build && cd build - cmake ${{ matrix.defines }} -DNITRO_VERSION=${{ needs.set-nitro-version.outputs.version }} .. - make -j $(nproc) - ls -la - - - name: Package - shell: bash - run: | - mkdir -p nitro - cp build/nitro nitro/ - tar -czvf nitro.tar.gz nitro - - - name: Upload Artifact - uses: actions/upload-artifact@v2 - if: github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' || github.event_name == 'pull_request' - with: - name: nitro-linux-${{ matrix.build }} - path: ./nitro - - - name: Run e2e testing - LLama.CPP - shell: bash - if: ${{ matrix.build != 'arm64' && matrix.build != 'amd64-vulkan' && matrix.build != 'amd64-avx512' }} - run: | - # run e2e testing - cd nitro - chmod +x ../.github/scripts/e2e-test-llama-linux-and-mac.sh && ../.github/scripts/e2e-test-llama-linux-and-mac.sh ./nitro ${{ env.LLM_MODEL_URL }} ${{ env.EMBEDDING_MODEL_URL }} - rm -rf uploads/ - - - name: Run e2e testing - Whisper.CPP - shell: bash - if: ${{ matrix.build != 'arm64' && matrix.build != 'amd64-vulkan' && matrix.build != 'amd64-avx512' }} - run: | - # run e2e testing - cd nitro - chmod +x ../.github/scripts/e2e-test-whisper-linux-and-mac.sh && ../.github/scripts/e2e-test-whisper-linux-and-mac.sh ./nitro ${{ env.WHISPER_MODEL_URL }} - rm -rf uploads/ - - - uses: actions/upload-release-asset@v1.0.1 - if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/') - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - with: - upload_url: ${{ needs.create-draft-release.outputs.upload_url }} - asset_path: ./nitro.tar.gz - asset_name: nitro-${{ needs.create-draft-release.outputs.version }}-linux-${{ matrix.build }}.tar.gz - asset_content_type: application/gzip - - ubuntu-amd64-cuda-build: - runs-on: ubuntu-18-04-cuda-${{ matrix.cuda }} - needs: [create-draft-release, set-nitro-version] - if: always() && (needs.create-draft-release.result == 'success' || needs.create-draft-release.result == 'skipped') && needs.set-nitro-version.result == 'success' - timeout-minutes: 40 - permissions: - contents: write - strategy: - matrix: - cuda: ["12-0", "11-7"] 
- - steps: - - name: Clone - id: checkout - uses: actions/checkout@v3 - with: - submodules: recursive - - - name: Build - id: make_build - run: | - ./install_deps.sh - mkdir build && cd build - cmake -DLLAMA_NATIVE=OFF -DLLAMA_CUDA=ON -DWHISPER_CUDA=ON -DNITRO_VERSION=${{ needs.set-nitro-version.outputs.version }} .. - make -j $(nproc) - ls -la - - - name: Package - shell: bash - run: | - mkdir -p nitro - cp build/nitro nitro/ - tar -czvf nitro.tar.gz nitro - - - name: Upload Artifact - uses: actions/upload-artifact@v2 - if: github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' || github.event_name == 'pull_request' - with: - name: nitro-linux-amd64-cuda-${{ matrix.cuda }} - path: ./nitro - - - uses: actions/upload-release-asset@v1.0.1 - if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/') - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - with: - upload_url: ${{ needs.create-draft-release.outputs.upload_url }} - asset_path: ./nitro.tar.gz - asset_name: nitro-${{ needs.create-draft-release.outputs.version }}-linux-amd64-cuda-${{ matrix.cuda }}.tar.gz - asset_content_type: application/gzip - - macOS-silicon-build: - runs-on: mac-silicon - needs: [create-draft-release, set-nitro-version] - if: always() && (needs.create-draft-release.result == 'success' || needs.create-draft-release.result == 'skipped') && needs.set-nitro-version.result == 'success' - timeout-minutes: 40 - permissions: - contents: write - steps: - - name: Clone - id: checkout - uses: actions/checkout@v3 - with: - submodules: recursive - - - name: Dependencies - id: depends - continue-on-error: true - run: | - brew update - brew install cmake sdl2 - - - name: Build - id: cmake_build - run: | - ./install_deps.sh - mkdir build && cd build - cmake -DNITRO_VERSION=${{ needs.set-nitro-version.outputs.version }} .. - CC=gcc-8 make -j $(sysctl -n hw.ncpu) - ls -la - - - name: Package - shell: bash - run: | - mkdir -p nitro - cp llama.cpp/ggml-metal.metal nitro/ - cp build/nitro nitro/ - - - name: Upload Artifact - uses: actions/upload-artifact@v2 - with: - name: nitro-mac-arm64 - path: ./nitro - - - name: Run e2e testing - LLama.CPP - shell: bash - run: | - # run e2e testing - cd nitro/ - chmod +x ../.github/scripts/e2e-test-llama-linux-and-mac.sh && ../.github/scripts/e2e-test-llama-linux-and-mac.sh ./nitro ${{ env.LLM_MODEL_URL }} ${{ env.EMBEDDING_MODEL_URL }} - rm -rf uploads/ - - - name: Run e2e testing - Whisper.CPP - shell: bash - run: | - # To test with CoreML - if [[ ! 
-f "/tmp/testwhisper-encoder.mlmodelc" ]]; then - wget https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-tiny-encoder.mlmodelc.zip - unzip ggml-tiny-encoder.mlmodelc.zip - rm ggml-tiny-encoder.mlmodelc.zip - rm -rf /tmp/testwhisper-encoder.mlmodelc - mv ggml-tiny-encoder.mlmodelc /tmp/testwhisper-encoder.mlmodelc - fi - # run e2e testing - cd nitro - chmod +x ../.github/scripts/e2e-test-whisper-linux-and-mac.sh && ../.github/scripts/e2e-test-whisper-linux-and-mac.sh ./nitro ${{ env.WHISPER_MODEL_URL }} - rm -rf uploads/ - - macOS-amd64-build: - runs-on: macos-latest - needs: [create-draft-release, set-nitro-version] - if: always() && (needs.create-draft-release.result == 'success' || needs.create-draft-release.result == 'skipped') && needs.set-nitro-version.result == 'success' - timeout-minutes: 40 - permissions: - contents: write - steps: - - name: Clone - id: checkout - uses: actions/checkout@v3 - with: - submodules: recursive - - - name: Dependencies - id: depends - continue-on-error: true - run: | - brew update - brew install sdl2 - - - name: Build - id: cmake_build - run: | - ./install_deps.sh - mkdir build && cd build - cmake -DNITRO_VERSION=${{ needs.set-nitro-version.outputs.version }} .. - CC=gcc-8 make -j $(sysctl -n hw.ncp) - ls -la - - - name: Package - shell: bash - run: | - mkdir -p nitro - cp build/nitro nitro/ - - - name: Upload Artifact - uses: actions/upload-artifact@v2 - with: - name: nitro-mac-amd64 - path: ./nitro - - - name: Run e2e testing - LLama.CPP - shell: bash - run: | - # run e2e testing - cd nitro - chmod +x ../.github/scripts/e2e-test-llama-linux-and-mac.sh && ../.github/scripts/e2e-test-llama-linux-and-mac.sh ./nitro ${{ env.LLM_MODEL_URL }} ${{ env.EMBEDDING_MODEL_URL }} - rm -rf uploads/ - - - name: Run e2e testing - Whisper.CPP - shell: bash - run: | - # run e2e testing - cd nitro - chmod +x ../.github/scripts/e2e-test-whisper-linux-and-mac.sh && ../.github/scripts/e2e-test-whisper-linux-and-mac.sh ./nitro ${{ env.WHISPER_MODEL_URL }} - rm -rf uploads/ - - universal-nitro-artifact-macos: - runs-on: macos-latest - needs: [create-draft-release, set-nitro-version, macOS-silicon-build, macOS-amd64-build] - if: always() && (needs.create-draft-release.result == 'success' || needs.create-draft-release.result == 'skipped') && needs.set-nitro-version.result == 'success' - timeout-minutes: 40 - permissions: - contents: write - steps: - - name: download artifact amd64 - uses: actions/download-artifact@v2 - with: - name: nitro-mac-amd64 - path: ./nitro-mac-amd64 - - - name: download artifact arm64 - uses: actions/download-artifact@v2 - with: - name: nitro-mac-arm64 - path: ./nitro-mac-arm64 - - - name: bundle universal binary - run: | - mkdir -p nitro - ls ./nitro-mac-amd64 - lipo -create ./nitro-mac-amd64/nitro ./nitro-mac-arm64/nitro -output ./nitro/nitro - cp ./nitro-mac-arm64/ggml-metal.metal ./nitro/ggml-metal.metal - tar -czvf nitro.tar.gz nitro - - - name: Upload Artifact - uses: actions/upload-artifact@v2 - with: - name: nitro-mac-universal - path: ./nitro - - - uses: actions/upload-release-asset@v1.0.1 - if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/') - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - with: - upload_url: ${{ needs.create-draft-release.outputs.upload_url }} - asset_path: ./nitro.tar.gz - asset_name: nitro-${{ needs.create-draft-release.outputs.version }}-mac-universal.tar.gz - asset_content_type: application/gzip - - windows-amd64-build: - runs-on: windows-latest - needs: [create-draft-release, 
set-nitro-version] - if: always() && (needs.create-draft-release.result == 'success' || needs.create-draft-release.result == 'skipped') && needs.set-nitro-version.result == 'success' - timeout-minutes: 40 - - strategy: - matrix: - include: - - build: "amd64-avx2" - defines: "-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE -DWHISPER_SDL2=ON" - - build: "amd64-avx" - defines: "-DLLAMA_AVX2=OFF -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE -DWHISPER_SDL2=ON" - - build: "amd64-avx512" - defines: "-DLLAMA_AVX512=ON -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE -DWHISPER_SDL2=ON" - - build: "amd64-vulkan" - defines: "-DLLAMA_VULKAN=ON -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE -DWHISPER_SDL2=ON" - # - build: "arm64" - # defines: "-A ARM64 -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON" - - permissions: - contents: write - - steps: - - name: Clone - - id: checkout - uses: actions/checkout@v3 - with: - submodules: recursive - - - name: Setup VSWhere.exe - uses: warrenbuckley/Setup-VSWhere@v1 - with: - version: latest - silent: true - env: - ACTIONS_ALLOW_UNSECURE_COMMANDS: true - - - name: Add msbuild to PATH - uses: microsoft/setup-msbuild@v1 - - - name: Fetch SDL2 and set SDL2_DIR version 2.28.5 - run: | - C:/msys64/usr/bin/wget.exe -qO sdl2.zip https://github.com/libsdl-org/SDL/releases/download/release-2.28.5/SDL2-devel-2.28.5-VC.zip - 7z x sdl2.zip -aoa - echo "SDL2_DIR=$env:GITHUB_WORKSPACE/SDL2-2.28.5/cmake" >> $env:GITHUB_ENV - - - name: actions-setup-cmake - uses: jwlawson/actions-setup-cmake@v1.14.1 - - - name: Prepare Vulkan SDK - uses: humbletim/setup-vulkan-sdk@v1.2.0 - if: ${{ matrix.build == 'amd64-vulkan' }} - with: - vulkan-query-version: 1.3.275.0 - vulkan-components: Vulkan-Headers, Vulkan-Loader - vulkan-use-cache: true - - - name: Build - id: cmake_build - shell: cmd - run: | - cmake -S ./nitro_deps -B ./build_deps/nitro_deps - cmake --build ./build_deps/nitro_deps --config Release - mkdir -p build - cd build - cmake .. ${{ matrix.defines }} -DNITRO_VERSION=${{ needs.set-nitro-version.outputs.version }} - cmake --build . 
--config Release -j "%NUMBER_OF_PROCESSORS%" - - - name: Pack artifacts - id: pack_artifacts - shell: cmd - run: | - robocopy build_deps\_install\bin\ .\build\Release\ zlib.dll - robocopy build\bin\Release\ .\build\Release\ llama.dll - robocopy build\bin\Release\ .\build\Release\ whisper.dll - robocopy .github\patches\windows\ .\build\Release\ msvcp140.dll - robocopy .github\patches\windows\ .\build\Release\ vcruntime140_1.dll - robocopy .github\patches\windows\ .\build\Release\ vcruntime140.dll - robocopy "$env:SDL2_DIR\..\lib\2.28.5\" .\build\Release\ SDL2.dll - dotnet tool install --global AzureSignTool - azuresigntool.exe sign -kvu "${{ secrets.AZURE_KEY_VAULT_URI }}" -kvi "${{ secrets.AZURE_CLIENT_ID }}" -kvt "${{ secrets.AZURE_TENANT_ID }}" -kvs "${{ secrets.AZURE_CLIENT_SECRET }}" -kvc ${{ secrets.AZURE_CERT_NAME }} -tr http://timestamp.globalsign.com/tsa/r6advanced1 -v ".\build\Release\nitro.exe" - 7z a -ttar temp.tar .\build\Release\* - 7z a -tgzip nitro.tar.gz temp.tar - - - name: Run e2e testing - Llama.cpp - shell: cmd - if: ${{ matrix.build != 'arm64' && matrix.build != 'amd64-vulkan' && matrix.build != 'amd64-avx512' }} - run: | - cd build\Release - ..\..\.github\scripts\e2e-test-llama-windows.bat nitro.exe ${{ env.LLM_MODEL_URL }} ${{ env.EMBEDDING_MODEL_URL }} - rmdir /S /Q .\build\Release\uploads - - - name: Run e2e testing - Whisper.cpp - shell: cmd - if: ${{ matrix.build != 'arm64' && matrix.build != 'amd64-vulkan' && matrix.build != 'amd64-avx512' }} - run: | - cd build\Release - ..\..\.github\scripts\e2e-test-whisper-windows.bat nitro.exe ${{ env.WHISPER_MODEL_URL }} - rmdir /S /Q .\build\Release\uploads - - - name: Upload Artifact - uses: actions/upload-artifact@v2 - if: github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' || github.event_name == 'pull_request' - with: - name: nitro-win-${{ matrix.build }} - path: ./build/Release - - - uses: actions/upload-release-asset@v1.0.1 - if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/') - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - with: - upload_url: ${{ needs.create-draft-release.outputs.upload_url }} - asset_path: ./nitro.tar.gz - asset_name: nitro-${{ needs.create-draft-release.outputs.version }}-win-${{ matrix.build }}.tar.gz - asset_content_type: application/gzip - - windows-amd64-cuda-build: - runs-on: windows-cuda-${{ matrix.cuda }} - needs: [create-draft-release, set-nitro-version] - if: always() && (needs.create-draft-release.result == 'success' || needs.create-draft-release.result == 'skipped') && needs.set-nitro-version.result == 'success' - timeout-minutes: 40 - permissions: - contents: write - - strategy: - matrix: - cuda: ["12-0", "11-7"] - instructions: ["amd64-avx2", "amd64-avx", "amd64-avx512"] - - steps: - - name: Setup VSWhere.exe - uses: warrenbuckley/Setup-VSWhere@v1 - with: - version: latest - silent: true - env: - ACTIONS_ALLOW_UNSECURE_COMMANDS: true - - - name: Add msbuild to PATH - uses: microsoft/setup-msbuild@v1 - - - name: Fetch SDL2 and set SDL2_DIR version 2.28.5 - run: | - curl -L -o sdl2.zip https://github.com/libsdl-org/SDL/releases/download/release-2.28.5/SDL2-devel-2.28.5-VC.zip - 7z x sdl2.zip -aoa - echo "SDL2_DIR=$env:GITHUB_WORKSPACE/SDL2-2.28.5/cmake" >> $env:GITHUB_ENV - - - name: actions-setup-cmake - uses: jwlawson/actions-setup-cmake@v1.14.1 - - - name: Clone - id: checkout - uses: actions/checkout@v3 - with: - submodules: recursive - - - name: Setup VSWhere.exe - uses: warrenbuckley/Setup-VSWhere@v1 - with: - version: latest - 
silent: true - env: - ACTIONS_ALLOW_UNSECURE_COMMANDS: true - - - uses: actions/setup-dotnet@v3 - with: - dotnet-version: "6.0.x" - - # Conditional instruction check and set environment variable - - name: Set INSTRUCTION Based on Instructions ${{ matrix.instructions }} - shell: cmd - run: | - IF "${{ matrix.instructions }}" == "amd64-avx2" ( - echo "INSTRUCTION=-DLLAMA_NATIVE=OFF" >> $env:GITHUB_ENV - echo "INSTRUCTION=-DLLAMA_NATIVE=OFF" - ) ELSE IF "${{ matrix.instructions }}" == "amd64-avx" ( - echo "INSTRUCTION=-DLLAMA_AVX2=OFF -DLLAMA_NATIVE=OFF" >> $env:GITHUB_ENV - echo "INSTRUCTION=-DLLAMA_AVX2=OFF -DLLAMA_NATIVE=OFF" - ) ELSE IF "${{ matrix.instructions }}" == "amd64-avx512" ( - echo "INSTRUCTION=-DLLAMA_AVX512=ON -DLLAMA_NATIVE=OFF" >> $env:GITHUB_ENV - echo "INSTRUCTION=-DLLAMA_AVX512=ON -DLLAMA_NATIVE=OFF" - ) - - - name: Build - id: cmake_build - shell: cmd - run: | - cmake -S ./nitro_deps -B ./build_deps/nitro_deps - cmake --build ./build_deps/nitro_deps --config Release - mkdir -p build - cd build - cmake .. %INSTRUCTION% -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE -DWHISPER_SDL2=ON -DNITRO_VERSION=${{ needs.set-nitro-version.outputs.version }} - cmake --build . --config Release -j "%NUMBER_OF_PROCESSORS%" - - - name: Pack artifacts - id: pack_artifacts - shell: cmd - run: | - set PATH=%PATH%;C:\Program Files\7-Zip\ - robocopy build_deps\_install\bin\ .\build\Release\ zlib.dll - robocopy build\bin\Release\ .\build\Release\ llama.dll - robocopy build\bin\Release\ .\build\Release\ whisper.dll - robocopy .github\patches\windows\ .\build\Release\ msvcp140.dll - robocopy .github\patches\windows\ .\build\Release\ vcruntime140_1.dll - robocopy .github\patches\windows\ .\build\Release\ vcruntime140.dll - robocopy "$env:SDL2_DIR\..\lib\2.28.5\" .\build\Release\ SDL2.dll - dotnet tool install --global AzureSignTool - %USERPROFILE%\.dotnet\tools\azuresigntool.exe sign -kvu "${{ secrets.AZURE_KEY_VAULT_URI }}" -kvi "${{ secrets.AZURE_CLIENT_ID }}" -kvt "${{ secrets.AZURE_TENANT_ID }}" -kvs "${{ secrets.AZURE_CLIENT_SECRET }}" -kvc ${{ secrets.AZURE_CERT_NAME }} -tr http://timestamp.globalsign.com/tsa/r6advanced1 -v ".\build\Release\nitro.exe" - 7z a -ttar temp.tar .\build\Release\* - 7z a -tgzip nitro.tar.gz temp.tar - - - name: Upload Artifact - uses: actions/upload-artifact@v2 - if: github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' || github.event_name == 'pull_request' - with: - name: nitro-win-${{ matrix.instructions }}-cuda-${{ matrix.cuda }} - path: ./build/Release - - - uses: actions/upload-release-asset@v1.0.1 - if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/') - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - with: - upload_url: ${{ needs.create-draft-release.outputs.upload_url }} - asset_path: ./nitro.tar.gz - asset_name: nitro-${{ needs.create-draft-release.outputs.version }}-win-${{ matrix.instructions }}-cuda-${{ matrix.cuda }}.tar.gz - asset_content_type: application/gzip - - update_release_draft: - if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/') - timeout-minutes: 40 - needs: - [ - ubuntu-amd64-build, - ubuntu-amd64-cuda-build, - macOS-silicon-build, - macOS-amd64-build, - windows-amd64-build, - windows-amd64-cuda-build, - ] - permissions: - contents: write - pull-requests: write - runs-on: ubuntu-latest - steps: - - uses: release-drafter/release-drafter@v5 - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - noti-discord-nightly: - timeout-minutes: 
40 - if: github.event_name == 'schedule' && (needs.create-draft-release.result == 'success' || needs.create-draft-release.result == 'skipped') && needs.ubuntu-amd64-build.result == 'success' && needs.ubuntu-amd64-cuda-build.result == 'success' && needs.macOS-silicon-build.result == 'success' && needs.macOS-amd64-build.result == 'success' && needs.windows-amd64-build.result == 'success' && needs.windows-amd64-cuda-build.result == 'success' - needs: - [ - create-draft-release, - ubuntu-amd64-build, - ubuntu-amd64-cuda-build, - macOS-silicon-build, - macOS-amd64-build, - windows-amd64-build, - windows-amd64-cuda-build, - ] - runs-on: ubuntu-latest - steps: - - name: Checkout code - uses: actions/checkout@v3 - with: - fetch-depth: "0" - token: ${{ secrets.PAT_SERVICE_ACCOUNT }} - - name: Notify Discord - uses: Ilshidur/action-discord@master - with: - args: "Nightly build artifact: https://github.com/janhq/nitro/actions/runs/{{ GITHUB_RUN_ID }}" - env: - DISCORD_WEBHOOK: ${{ secrets.DISCORD_WEBHOOK }} - - name: Update README.md with artifact URL - run: | - sed -i "s|||" README.md - git config --global user.email "service@jan.ai" - git config --global user.name "Service Account" - git add README.md - git commit -m "${GITHUB_REPOSITORY}: Update README.md with nightly build artifact URL" - git -c http.extraheader="AUTHORIZATION: bearer ${{ secrets.PAT_SERVICE_ACCOUNT }}" push origin HEAD:main - env: - GITHUB_RUN_ID: ${{ github.run_id }} - - noti-discord-manual: - timeout-minutes: 40 - if: github.event_name == 'workflow_dispatch' && (needs.create-draft-release.result == 'success' || needs.create-draft-release.result == 'skipped') && needs.ubuntu-amd64-build.result == 'success' && needs.ubuntu-amd64-cuda-build.result == 'success' && needs.macOS-silicon-build.result == 'success' && needs.macOS-amd64-build.result == 'success' && needs.windows-amd64-build.result == 'success' && needs.windows-amd64-cuda-build.result == 'success' - needs: - [ - create-draft-release, - ubuntu-amd64-build, - ubuntu-amd64-cuda-build, - macOS-silicon-build, - macOS-amd64-build, - windows-amd64-build, - windows-amd64-cuda-build, - ] - runs-on: ubuntu-latest - steps: - - name: Checkout code - uses: actions/checkout@v3 - with: - fetch-depth: "0" - token: ${{ secrets.PAT_SERVICE_ACCOUNT }} - - name: Notify Discord - uses: Ilshidur/action-discord@master - with: - args: "Manual build artifact: https://github.com/janhq/nitro/actions/runs/{{ GITHUB_RUN_ID }}" - env: - DISCORD_WEBHOOK: ${{ secrets.DISCORD_WEBHOOK }} - # Update README.md with artifact URL if manual build from main branch - - name: Update README.md with artifact URL - if: github.ref == 'refs/heads/main' - run: | - sed -i "s|||" README.md - git config --global user.email "service@jan.ai" - git config --global user.name "Service Account" - git add README.md - git commit -m "${GITHUB_REPOSITORY}: Update README.md with nightly build artifact URL" - git -c http.extraheader="AUTHORIZATION: bearer ${{ secrets.PAT_SERVICE_ACCOUNT }}" push origin HEAD:main - env: - GITHUB_RUN_ID: ${{ github.run_id }} diff --git a/.github/workflows/clean-cloudflare-page-preview-url-and-r2.yml b/.github/workflows/clean-cloudflare-page-preview-url-and-r2.yml new file mode 100644 index 000000000..b19860576 --- /dev/null +++ b/.github/workflows/clean-cloudflare-page-preview-url-and-r2.yml @@ -0,0 +1,52 @@ +name: "Clean old cloudflare pages preview urls and nightly build" +on: + schedule: + - cron: "0 0 * * *" # every day at 00:00 + workflow_dispatch: + +jobs: + clean-cloudflare-pages-preview-urls: + 
strategy: + matrix: + project: ["cortex-docs"] + runs-on: ubuntu-latest + steps: + - uses: actions/setup-python@v4 + with: + python-version: '3.x' + - name: install requests + run: | + python3 -m pip install requests pytz tqdm + - name: Python Inline script + uses: jannekem/run-python-script-action@v1 + with: + script: | + import requests + from datetime import datetime, UTC + from pytz import timezone + from tqdm import tqdm + + # Configuration + endpoint = "https://api.cloudflare.com/client/v4/accounts/${{ secrets.CLOUDFLARE_ACCOUNT_ID }}/pages/projects/${{ matrix.project }}/deployments" + expiration_days = 3 + headers = { + "Content-Type": "application/json;charset=UTF-8", + "Authorization": "Bearer ${{ secrets.CLOUDFLARE_API_TOKEN }}" + } + utc_tz = timezone('UTC') + + # Fetch the list of deployments + response = requests.get(endpoint, headers=headers) + deployments = response.json() + + for deployment in tqdm(deployments['result']): + # Calculate the age of the deployment + created_on = datetime.strptime(deployment['created_on'], "%Y-%m-%dT%H:%M:%S.%fZ").replace(tzinfo=utc_tz) + if (datetime.now(UTC) - created_on).days > expiration_days: + # Delete the deployment + delete_response = requests.delete(f"{endpoint}/{deployment['id']}", headers=headers) + if delete_response.status_code == 200: + print(f"Deleted deployment: {deployment['id']}") + else: + print(f"Failed to delete deployment: {deployment['id']}") + diff --git a/.github/workflows/cortex-cpp-quality-gate.yml b/.github/workflows/cortex-cpp-quality-gate.yml new file mode 100644 index 000000000..3c9eea724 --- /dev/null +++ b/.github/workflows/cortex-cpp-quality-gate.yml @@ -0,0 +1,227 @@ +name: CI Quality Gate Cortex CPP + +on: + pull_request: + types: [opened, synchronize, reopened, ready_for_review] + paths: ["engine/**", ".github/workflows/cortex-cpp-quality-gate.yml"] + workflow_dispatch: + +env: + LLM_MODEL_URL: https://delta.jan.ai/tinyllama-1.1b-chat-v0.3.Q2_K.gguf + EMBEDDING_MODEL_URL: https://catalog.jan.ai/dist/models/embeds/nomic-embed-text-v1.5.f16.gguf + +jobs: + build-and-test: + runs-on: ${{ matrix.runs-on }} + timeout-minutes: 60 + strategy: + fail-fast: false + matrix: + include: + - os: "linux" + name: "amd64" + runs-on: "ubuntu-20-04-cuda-12-0" + cmake-flags: "-DCORTEX_CPP_VERSION=${{github.event.pull_request.head.sha}} -DCMAKE_BUILD_TEST=ON -DCMAKE_TOOLCHAIN_FILE=vcpkg/scripts/buildsystems/vcpkg.cmake" + build-deps-cmake-flags: "" + ccache-dir: "" + - os: "mac" + name: "amd64" + runs-on: "macos-selfhosted-12" + cmake-flags: "-DCORTEX_CPP_VERSION=${{github.event.pull_request.head.sha}} -DCMAKE_BUILD_TEST=ON -DCMAKE_TOOLCHAIN_FILE=vcpkg/scripts/buildsystems/vcpkg.cmake" + build-deps-cmake-flags: "" + ccache-dir: "" + - os: "mac" + name: "arm64" + runs-on: "macos-silicon" + cmake-flags: "-DCORTEX_CPP_VERSION=${{github.event.pull_request.head.sha}} -DCMAKE_BUILD_TEST=ON -DMAC_ARM64=ON -DCMAKE_TOOLCHAIN_FILE=vcpkg/scripts/buildsystems/vcpkg.cmake" + build-deps-cmake-flags: "" + ccache-dir: "" + - os: "windows" + name: "amd64" + runs-on: "windows-cuda-12-0" + cmake-flags: "-DCORTEX_CPP_VERSION=${{github.event.pull_request.head.sha}} -DCMAKE_BUILD_TEST=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_TOOLCHAIN_FILE=C:/w/cortex.cpp/cortex.cpp/engine/vcpkg/scripts/buildsystems/vcpkg.cmake -DVCPKG_TARGET_TRIPLET=x64-windows-static -DCMAKE_BUILD_TYPE=RELEASE -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja" + build-deps-cmake-flags: 
"-DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja" + ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache' + steps: + - name: Clone + id: checkout + uses: actions/checkout@v3 + with: + submodules: recursive + + - name: use python + uses: actions/setup-python@v5 + with: + python-version: "3.10" + + - name: Install tools on Linux + if: runner.os == 'Linux' + run: | + python3 -m pip install awscli + + - name: Install choco on Windows + if: runner.os == 'Windows' + run: | + choco install make pkgconfiglite ccache awscli 7zip ninja -y + + - name: Download ccache from s3 + if: runner.os == 'Windows' + continue-on-error: true + run: | + Import-Module "$env:ChocolateyInstall\helpers\chocolateyProfile.psm1" + refreshenv + aws s3 sync s3://${{ secrets.MINIO_BUCKET_NAME }}/cortex-cpp-${{ matrix.os }}-${{ matrix.name }} ${{ matrix.ccache-dir }} --endpoint ${{ secrets.MINIO_ENDPOINT }} + aws s3 sync s3://${{ secrets.MINIO_BUCKET_NAME }}/cortex-cpp-vcpkg-windows C:\Users\ContainerAdministrator\AppData\Local\vcpkg --endpoint ${{ secrets.MINIO_ENDPOINT }} + env: + AWS_ACCESS_KEY_ID: "${{ secrets.MINIO_ACCESS_KEY_ID }}" + AWS_SECRET_ACCESS_KEY: "${{ secrets.MINIO_SECRET_ACCESS_KEY }}" + AWS_DEFAULT_REGION: "${{ secrets.MINIO_REGION }}" + + - name: Download vcpkg cache from s3 + if: runner.os == 'Linux' + continue-on-error: true + run: | + aws s3 sync s3://${{ secrets.MINIO_BUCKET_NAME }}/cortex-cpp-vcpkg-linux /home/runner/.cache/vcpkg --endpoint ${{ secrets.MINIO_ENDPOINT }} --cli-read-timeout 0 + env: + AWS_ACCESS_KEY_ID: "${{ secrets.MINIO_ACCESS_KEY_ID }}" + AWS_SECRET_ACCESS_KEY: "${{ secrets.MINIO_SECRET_ACCESS_KEY }}" + AWS_DEFAULT_REGION: "${{ secrets.MINIO_REGION }}" + + - name: Configure vcpkg + run: | + cd engine + make configure-vcpkg + + - name: Build + run: | + cd engine + make build CMAKE_EXTRA_FLAGS="${{ matrix.cmake-flags }}" BUILD_DEPS_CMAKE_EXTRA_FLAGS="${{ matrix.build-deps-cmake-flags }}" + + - name: Run setup config + run: | + cd engine + echo "huggingFaceToken: ${{ secrets.HUGGINGFACE_TOKEN_READ }}" > ~/.cortexrc + echo "gitHubToken: ${{ secrets.PAT_SERVICE_ACCOUNT }}" >> ~/.cortexrc + ./build/cortex + cat ~/.cortexrc + + - name: Run unit tests + run: | + cd engine + make run-unit-tests + env: + GITHUB_TOKEN: ${{ secrets.PAT_SERVICE_ACCOUNT }} + + - name: Run setup config + run: | + cd engine + echo "huggingFaceToken: ${{ secrets.HUGGINGFACE_TOKEN_READ }}" > ~/.cortexrc + echo "gitHubToken: ${{ secrets.PAT_SERVICE_ACCOUNT }}" >> ~/.cortexrc + echo "apiServerPort: 3928" >> ~/.cortexrc + ./build/cortex + cat ~/.cortexrc + + - name: Run e2e tests + if: runner.os != 'Windows' && github.event.pull_request.draft == false + run: | + cd engine + cp build/cortex build/cortex-nightly + cp build/cortex build/cortex-beta + python -m pip install --upgrade pip + python -m pip install -r e2e-test/requirements.txt + python e2e-test/main.py + rm build/cortex-nightly + rm build/cortex-beta + env: + GITHUB_TOKEN: ${{ secrets.PAT_SERVICE_ACCOUNT }} + + - name: Run e2e tests + if: runner.os == 'Windows' && github.event.pull_request.draft == false + run: | + cd engine + cp build/cortex.exe build/cortex-nightly.exe + cp build/cortex.exe build/cortex-beta.exe + python -m pip install --upgrade pip + python -m pip install -r e2e-test/requirements.txt + python e2e-test/main.py + rm build/cortex-nightly.exe + rm build/cortex-beta.exe + env: + GITHUB_TOKEN: ${{ secrets.PAT_SERVICE_ACCOUNT }} + + - name: Pre-package + run: | + cd 
engine + make pre-package DESTINATION_BINARY_NAME="cortex" + + - name: Package + run: | + cd engine + make package + + - name: Upload Artifact + uses: actions/upload-artifact@v4 + with: + name: cortex-${{ matrix.os }}-${{ matrix.name }} + path: ./engine/cortex + + - name: Upload windows ccache to s3 + continue-on-error: true + if: always() && runner.os == 'Windows' + run: | + Import-Module "$env:ChocolateyInstall\helpers\chocolateyProfile.psm1" + refreshenv + aws s3 sync ${{ matrix.ccache-dir }} s3://${{ secrets.MINIO_BUCKET_NAME }}/cortex-${{ matrix.os }}-${{ matrix.name }} --endpoint ${{ secrets.MINIO_ENDPOINT }} + aws s3 sync C:\Users\ContainerAdministrator\AppData\Local\vcpkg s3://${{ secrets.MINIO_BUCKET_NAME }}/cortex-cpp-vcpkg-windows --endpoint ${{ secrets.MINIO_ENDPOINT }} + env: + AWS_ACCESS_KEY_ID: "${{ secrets.MINIO_ACCESS_KEY_ID }}" + AWS_SECRET_ACCESS_KEY: "${{ secrets.MINIO_SECRET_ACCESS_KEY }}" + AWS_DEFAULT_REGION: "${{ secrets.MINIO_REGION }}" + + - name: Upload linux vcpkg cache to s3 + continue-on-error: true + if: always() && runner.os == 'Linux' + run: | + aws s3 sync /home/runner/.cache/vcpkg s3://${{ secrets.MINIO_BUCKET_NAME }}/cortex-cpp-vcpkg-linux --endpoint ${{ secrets.MINIO_ENDPOINT }} + env: + AWS_ACCESS_KEY_ID: "${{ secrets.MINIO_ACCESS_KEY_ID }}" + AWS_SECRET_ACCESS_KEY: "${{ secrets.MINIO_SECRET_ACCESS_KEY }}" + AWS_DEFAULT_REGION: "${{ secrets.MINIO_REGION }}" + + build-docker-and-test: + runs-on: ubuntu-latest + steps: + - name: Getting the repo + uses: actions/checkout@v3 + with: + submodules: 'recursive' + + - name: Set up QEMU + uses: docker/setup-qemu-action@v3 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Run Docker + run: | + docker build -t menloltd/cortex:test -f docker/Dockerfile . 
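The build-docker-and-test job above builds the test image, and the `docker run` step that follows publishes the container's API port 39281 on host port 3928 before `e2e-test/test_api_docker.py` is run against it. Below is a minimal readiness-probe sketch in Python for that mapped port; the `/healthz` route, the timeout values, and the availability of `requests` are illustrative assumptions, not taken from the workflow itself:

import time
import requests  # assumed available, e.g. via e2e-test/requirements.txt

BASE_URL = "http://127.0.0.1:3928"  # host side of the -p 3928:39281 mapping

def wait_for_server(timeout_s: int = 60) -> bool:
    """Poll an assumed /healthz endpoint until the container answers or we time out."""
    deadline = time.time() + timeout_s
    while time.time() < deadline:
        try:
            if requests.get(f"{BASE_URL}/healthz", timeout=2).status_code == 200:
                return True
        except requests.RequestException:
            pass  # server not accepting connections yet
        time.sleep(1)
    return False

if __name__ == "__main__":
    print("server ready" if wait_for_server() else "server did not come up in time")

A check like this would typically run between the `docker run` step and the pytest invocation, so test failures reflect API behaviour rather than container start-up time.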
+ docker run -it -d -p 3928:39281 --name cortex menloltd/cortex:test + + - name: use python + uses: actions/setup-python@v5 + with: + python-version: "3.10" + + - name: Run e2e tests + run: | + cd engine + python -m pip install --upgrade pip + python -m pip install -r e2e-test/requirements.txt + pytest e2e-test/test_api_docker.py + + - name: Run Docker + continue-on-error: true + if: always() + run: | + docker stop cortex + docker rm cortex diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index 75d46cb03..5e7ffec5d 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -1,38 +1,43 @@ -name: Nitro Docs +name: Cortex Docs on: push: branches: - - main + - dev paths: - 'docs/**' - '.github/workflows/docs.yml' pull_request: - branches: - - main paths: - 'docs/**' - '.github/workflows/docs.yml' # Review gh actions docs if you want to further define triggers, paths, etc # https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#on + schedule: + - cron: "0 22 * * 1,2,3,4,5,6" jobs: deploy: - name: Deploy to GitHub Pages + name: Deploy to Cloudflare Pages env: - CLOUDFLARE_ACCOUNT_ID: 9707100ef42a1a25bd70e3ee2137bd0e - CLOUDFLARE_PROJECT_NAME: nitro + CLOUDFLARE_PROJECT_NAME: cortex-docs runs-on: ubuntu-latest + permissions: + contents: write + deployments: write + pull-requests: write steps: - uses: actions/checkout@v3 - uses: actions/setup-node@v3 with: node-version: 18 - - name: Install jq + - name: Install jq uses: dcarbone/install-jq-action@v2.0.1 - name: Fill env vars + working-directory: docs + continue-on-error: true run: | env_example_file=".env.example" touch .env @@ -44,23 +49,27 @@ jobs: echo "$var_name=$var_value" >> .env fi done < "$env_example_file" - working-directory: docs env: - SECRETS: '${{ toJson(secrets) }}' + SECRETS: "${{ toJson(secrets) }}" - name: Install dependencies - run: yarn install working-directory: docs + run: yarn install - name: Build website - run: sed -i '/process.env.DEBUG = namespaces;/c\// process.env.DEBUG = namespaces;' ./node_modules/debug/src/node.js && yarn build working-directory: docs + run: export NODE_ENV=production && yarn build + + - name: Copy redirect file + working-directory: docs + continue-on-error: true + run: cp _redirects build/_redirects - name: Publish to Cloudflare Pages PR Preview and Staging - if: (github.event_name == 'push' && github.ref == 'refs/heads/main') || (github.event_name == 'pull_request' && github.event.pull_request.base.ref == 'main') + if: github.event_name == 'pull_request' uses: cloudflare/pages-action@v1 with: apiToken: ${{ secrets.CLOUDFLARE_API_TOKEN }} - accountId: ${{ env.CLOUDFLARE_ACCOUNT_ID }} + accountId: ${{ secrets.CLOUDFLARE_ACCOUNT_ID }} projectName: ${{ env.CLOUDFLARE_PROJECT_NAME }} directory: ./docs/build # Optional: Enable this if you want to have GitHub Deployments triggered @@ -68,28 +77,19 @@ jobs: id: deployCloudflarePages - uses: mshick/add-pr-comment@v2 - if: github.event_name == 'pull_request' && github.event.pull_request.base.ref == 'main' + if: github.event_name == 'pull_request' with: message: | - Preview URL: ${{ steps.deployCloudflarePages.outputs.url }} + Preview URL: ${{ steps.deployCloudflarePages.outputs.url }} - - name: Add Custome Domain file - if: github.event_name == 'push' && github.event.pull_request.head.repo.full_name != github.repository - run: echo "${{ vars.DOCUSAURUS_DOMAIN }}" > ./docs/build/CNAME - - # Popular action to deploy to GitHub Pages: - # Docs: 
https://github.com/peaceiris/actions-gh-pages#%EF%B8%8F-docusaurus - - name: Deploy to GitHub Pages - if: github.event_name == 'push' && github.event.pull_request.head.repo.full_name != github.repository - uses: peaceiris/actions-gh-pages@v3 + - name: Publish to Cloudflare Pages Production + if: (github.event_name == 'push' || github.event_name == 'schedule') && github.ref == 'refs/heads/dev' && github.event.pull_request.head.repo.full_name != github.repository + uses: cloudflare/pages-action@v1 with: - github_token: ${{ secrets.GITHUB_TOKEN }} - # Build output to publish to the `gh-pages` branch: - publish_dir: ./docs/build - # The following lines assign commit authorship to the official - # GH-Actions bot for deploys to `gh-pages` branch: - # https://github.com/actions/checkout/issues/13#issuecomment-724415212 - # The GH actions bot is used by default if you didn't specify the two fields. - # You can swap them out with your own user credentials. - user_name: github-actions[bot] - user_email: 41898282+github-actions[bot]@users.noreply.github.com \ No newline at end of file + apiToken: ${{ secrets.CLOUDFLARE_API_TOKEN }} + accountId: ${{ secrets.CLOUDFLARE_ACCOUNT_ID }} + projectName: ${{ env.CLOUDFLARE_PROJECT_NAME }} + directory: ./docs/build + branch: main + # Optional: Enable this if you want to have GitHub Deployments triggered + gitHubToken: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/nightly-build.yml b/.github/workflows/nightly-build.yml new file mode 100644 index 000000000..9a31ef5ff --- /dev/null +++ b/.github/workflows/nightly-build.yml @@ -0,0 +1,142 @@ +name: CI Cortex CPP Nightly Build + +on: + schedule: + - cron: '0 20 * * *' # At 8 PM UTC everyday + workflow_dispatch: + inputs: + public_provider: + type: choice + description: 'Public Provider' + options: + - none + - aws-s3 + default: none + +jobs: + set-public-provider: + runs-on: ubuntu-latest + outputs: + public_provider: ${{ steps.set-public-provider.outputs.public_provider }} + ref: ${{ steps.set-public-provider.outputs.ref }} + steps: + - name: Set public provider + id: set-public-provider + run: | + if [ "${{ github.event_name }}" == "workflow_dispatch" ]; then + echo "::set-output name=public_provider::${{ github.event.inputs.public_provider }}" + echo "::set-output name=ref::${{ github.ref }}" + else + if [ "${{ github.event_name }}" == "schedule" ]; then + echo "::set-output name=public_provider::aws-s3" + echo "::set-output name=ref::refs/heads/dev" + elif [ "${{ github.event_name }}" == "push" ]; then + echo "::set-output name=public_provider::aws-s3" + echo "::set-output name=ref::${{ github.ref }}" + else + echo "::set-output name=public_provider::none" + echo "::set-output name=ref::${{ github.ref }}" + fi + fi + + # Job create Update app version based on latest release tag with build number and save to output + get-update-version: + uses: ./.github/workflows/template-get-update-version.yml + + get-cortex-llamacpp-latest-version: + uses: ./.github/workflows/template-cortex-llamacpp-latest-version.yml + + build-macos: + uses: ./.github/workflows/template-build-macos.yml + needs: [get-update-version, set-public-provider, get-cortex-llamacpp-latest-version] + secrets: inherit + with: + ref: ${{ needs.set-public-provider.outputs.ref }} + public_provider: ${{ needs.set-public-provider.outputs.public_provider }} + new_version: ${{ needs.get-update-version.outputs.new_version }} + cmake-flags: "-DCORTEX_VARIANT=nightly -DCORTEX_CPP_VERSION='v${{ needs.get-update-version.outputs.new_version }}' 
-DCMAKE_TOOLCHAIN_FILE=vcpkg/scripts/buildsystems/vcpkg.cmake" + channel: nightly + cortex-llamacpp-version: ${{ needs.get-cortex-llamacpp-latest-version.outputs.cortex_llamacpp_latest_version }} + + build-windows-x64: + uses: ./.github/workflows/template-build-windows-x64.yml + secrets: inherit + needs: [get-update-version, set-public-provider, get-cortex-llamacpp-latest-version] + with: + ref: ${{ needs.set-public-provider.outputs.ref }} + public_provider: ${{ needs.set-public-provider.outputs.public_provider }} + new_version: ${{ needs.get-update-version.outputs.new_version }} + runs-on: windows-cuda-11-7 + cmake-flags: "-DCORTEX_VARIANT=nightly -DCORTEX_CPP_VERSION='v${{ needs.get-update-version.outputs.new_version }}' -DBUILD_SHARED_LIBS=OFF -DCMAKE_TOOLCHAIN_FILE=C:/w/cortex.cpp/cortex.cpp/engine/vcpkg/scripts/buildsystems/vcpkg.cmake -DVCPKG_TARGET_TRIPLET=x64-windows-static -DCMAKE_BUILD_TYPE=RELEASE -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja" + build-deps-cmake-flags: "-DCMAKE_BUILD_TYPE=RELEASE -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja" + ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache' + channel: nightly + cortex-llamacpp-version: ${{ needs.get-cortex-llamacpp-latest-version.outputs.cortex_llamacpp_latest_version }} + + build-linux-x64: + uses: ./.github/workflows/template-build-linux-x64.yml + secrets: inherit + needs: [get-update-version, set-public-provider, get-cortex-llamacpp-latest-version] + with: + ref: ${{ needs.set-public-provider.outputs.ref }} + public_provider: ${{ needs.set-public-provider.outputs.public_provider }} + new_version: ${{ needs.get-update-version.outputs.new_version }} + runs-on: ubuntu-20-04 + cmake-flags: "-DCORTEX_VARIANT=nightly -DCORTEX_CPP_VERSION='v${{ needs.get-update-version.outputs.new_version }}' -DCMAKE_TOOLCHAIN_FILE=/home/runner/actions-runner/_work/cortex.cpp/cortex.cpp/engine/vcpkg/scripts/buildsystems/vcpkg.cmake" + channel: nightly + cortex-llamacpp-version: ${{ needs.get-cortex-llamacpp-latest-version.outputs.cortex_llamacpp_latest_version }} + + update-latest-version: + runs-on: ubuntu-latest + if: needs.set-public-provider.outputs.public_provider == 'aws-s3' + needs: [get-update-version, set-public-provider, build-linux-x64, build-macos, build-windows-x64, get-cortex-llamacpp-latest-version] + steps: + - name: Update latest version + id: update-latest-version + run: | + echo "{\"tag_name\": \"v${{ needs.get-update-version.outputs.new_version }}\"}" > version.json + aws s3 cp version.json s3://${{ secrets.DELTA_AWS_S3_BUCKET_NAME }}/cortex/latest/version.json + aws s3 cp s3://${{ secrets.DELTA_AWS_S3_BUCKET_NAME }}/cortex/temp-latest/mac-universal-cortex-nightly.tar.gz s3://${{ secrets.DELTA_AWS_S3_BUCKET_NAME }}/cortex/latest/mac-amd64/cortex-nightly.tar.gz + aws s3 cp s3://${{ secrets.DELTA_AWS_S3_BUCKET_NAME }}/cortex/temp-latest/mac-universal-cortex-nightly.tar.gz s3://${{ secrets.DELTA_AWS_S3_BUCKET_NAME }}/cortex/latest/mac-arm64/cortex-nightly.tar.gz + aws s3 cp s3://${{ secrets.DELTA_AWS_S3_BUCKET_NAME }}/cortex/temp-latest/mac-universal-cortex-nightly.tar.gz s3://${{ secrets.DELTA_AWS_S3_BUCKET_NAME }}/cortex/latest/mac-universal/cortex-nightly.tar.gz + aws s3 cp s3://${{ secrets.DELTA_AWS_S3_BUCKET_NAME }}/cortex/temp-latest/linux-amd64-cortex-nightly.tar.gz s3://${{ secrets.DELTA_AWS_S3_BUCKET_NAME }}/cortex/latest/linux-amd64/cortex-nightly.tar.gz + aws 
s3 cp s3://${{ secrets.DELTA_AWS_S3_BUCKET_NAME }}/cortex/temp-latest/windows-amd64-cortex-nightly.tar.gz s3://${{ secrets.DELTA_AWS_S3_BUCKET_NAME }}/cortex/latest/windows-amd64/cortex-nightly.tar.gz + aws s3 cp s3://${{ secrets.DELTA_AWS_S3_BUCKET_NAME }}/cortex/temp-latest/cortex-mac-universal-network-installer.pkg s3://${{ secrets.DELTA_AWS_S3_BUCKET_NAME }}/cortex/latest/mac-universal/cortex-mac-universal-network-installer.pkg + aws s3 cp s3://${{ secrets.DELTA_AWS_S3_BUCKET_NAME }}/cortex/temp-latest/cortex-linux-amd64-network-installer.deb s3://${{ secrets.DELTA_AWS_S3_BUCKET_NAME }}/cortex/latest/linux-amd64/cortex-linux-amd64-network-installer.deb + aws s3 cp s3://${{ secrets.DELTA_AWS_S3_BUCKET_NAME }}/cortex/temp-latest/cortex-windows-amd64-network-installer.exe s3://${{ secrets.DELTA_AWS_S3_BUCKET_NAME }}/cortex/latest/windows-amd64/cortex-windows-amd64-network-installer.exe + + env: + AWS_ACCESS_KEY_ID: ${{ secrets.DELTA_AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.DELTA_AWS_SECRET_ACCESS_KEY }} + AWS_DEFAULT_REGION: ${{ secrets.DELTA_AWS_REGION }} + AWS_EC2_METADATA_DISABLED: "true" + + build-docker-x64: + if: needs.set-public-provider.outputs.public_provider == 'aws-s3' + uses: ./.github/workflows/template-build-docker-x64.yml + secrets: inherit + needs: [get-update-version, set-public-provider, get-cortex-llamacpp-latest-version, update-latest-version] + with: + ref: ${{ needs.set-public-provider.outputs.ref }} + new_version: nightly-${{ needs.get-update-version.outputs.new_version }} + runs-on: ubuntu-latest + cmake-extra-flags: "-DCORTEX_VARIANT=prod" + tags: menloltd/cortex:nightly-${{ needs.get-update-version.outputs.new_version }} + + noti-discord-nightly-and-update-url-readme: + needs: [build-macos, build-windows-x64, build-linux-x64, get-update-version, set-public-provider, get-cortex-llamacpp-latest-version, update-latest-version, build-docker-x64] + secrets: inherit + if: github.event_name == 'schedule' + uses: ./.github/workflows/template-noti-discord.yaml + with: + build_reason: Nightly + new_version: ${{ needs.get-update-version.outputs.new_version }} + + noti-discord-manual: + needs: [build-macos, build-windows-x64, build-linux-x64, get-update-version, set-public-provider, get-cortex-llamacpp-latest-version, build-docker-x64] + secrets: inherit + if: github.event_name == 'workflow_dispatch' && github.event.inputs.public_provider == 'aws-s3' + uses: ./.github/workflows/template-noti-discord.yaml + with: + build_reason: Manual + new_version: ${{ needs.get-update-version.outputs.new_version }} \ No newline at end of file diff --git a/.github/workflows/stable-build.yml b/.github/workflows/stable-build.yml new file mode 100644 index 000000000..2b0523771 --- /dev/null +++ b/.github/workflows/stable-build.yml @@ -0,0 +1,92 @@ +name: CI Cortex CPP Stable Build + +on: + push: + tags: ["v[0-9]+.[0-9]+.[0-9]+"] + +jobs: + # Job create Update app version based on latest release tag with build number and save to output + get-update-version: + uses: ./.github/workflows/template-get-update-version.yml + + get-cortex-llamacpp-latest-version: + uses: ./.github/workflows/template-cortex-llamacpp-latest-version.yml + + create-draft-release: + runs-on: ubuntu-latest + if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/') + outputs: + upload_url: ${{ steps.create_release.outputs.upload_url }} + version: ${{ steps.get_version.outputs.version }} + permissions: + contents: write + steps: + - name: Extract tag name without v prefix + id: get_version + 
run: echo "VERSION=${GITHUB_REF#refs/tags/v}" >> $GITHUB_ENV && echo "::set-output name=version::${GITHUB_REF#refs/tags/v}" + env: + GITHUB_REF: ${{ github.ref }} + - name: Create Draft Release + id: create_release + uses: softprops/action-gh-release@v2 + with: + tag_name: ${{ github.ref_name }} + token: ${{ secrets.GITHUB_TOKEN }} + name: "${{ env.VERSION }}" + draft: true + prerelease: false + generate_release_notes: true + + build-macos: + uses: ./.github/workflows/template-build-macos.yml + needs: [get-update-version, create-draft-release, get-cortex-llamacpp-latest-version] + secrets: inherit + with: + ref: ${{ github.ref }} + public_provider: github + new_version: ${{ needs.get-update-version.outputs.new_version }} + cmake-flags: "-DCORTEX_VARIANT=prod -DCORTEX_CPP_VERSION='v${{ needs.get-update-version.outputs.new_version }}' -DCMAKE_TOOLCHAIN_FILE=vcpkg/scripts/buildsystems/vcpkg.cmake" + channel: stable + upload_url: ${{ needs.create-draft-release.outputs.upload_url }} + cortex-llamacpp-version: ${{ needs.get-cortex-llamacpp-latest-version.outputs.cortex_llamacpp_latest_version }} + + build-windows-x64: + uses: ./.github/workflows/template-build-windows-x64.yml + secrets: inherit + needs: [get-update-version, create-draft-release, get-cortex-llamacpp-latest-version] + with: + ref: ${{ github.ref }} + public_provider: github + new_version: ${{ needs.get-update-version.outputs.new_version }} + runs-on: windows-cuda-11-7 + cmake-flags: "-DCORTEX_VARIANT=prod -DCORTEX_CPP_VERSION='v${{ needs.get-update-version.outputs.new_version }}' -DBUILD_SHARED_LIBS=OFF -DCMAKE_TOOLCHAIN_FILE=C:/w/cortex.cpp/cortex.cpp/engine/vcpkg/scripts/buildsystems/vcpkg.cmake -DVCPKG_TARGET_TRIPLET=x64-windows-static -DCMAKE_BUILD_TYPE=RELEASE -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja" + build-deps-cmake-flags: "-DCMAKE_BUILD_TYPE=RELEASE -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja" + ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache' + channel: stable + upload_url: ${{ needs.create-draft-release.outputs.upload_url }} + cortex-llamacpp-version: ${{ needs.get-cortex-llamacpp-latest-version.outputs.cortex_llamacpp_latest_version }} + + build-linux-x64: + uses: ./.github/workflows/template-build-linux-x64.yml + secrets: inherit + needs: [get-update-version, create-draft-release, get-cortex-llamacpp-latest-version] + with: + ref: ${{ github.ref }} + public_provider: github + new_version: ${{ needs.get-update-version.outputs.new_version }} + runs-on: ubuntu-20-04 + cmake-flags: "-DCORTEX_VARIANT=prod -DCORTEX_CPP_VERSION='v${{ needs.get-update-version.outputs.new_version }}' -DCMAKE_TOOLCHAIN_FILE=/home/runner/actions-runner/_work/cortex.cpp/cortex.cpp/engine/vcpkg/scripts/buildsystems/vcpkg.cmake" + channel: stable + upload_url: ${{ needs.create-draft-release.outputs.upload_url }} + cortex-llamacpp-version: ${{ needs.get-cortex-llamacpp-latest-version.outputs.cortex_llamacpp_latest_version }} + + build-docker-x64: + uses: ./.github/workflows/template-build-docker-x64.yml + secrets: inherit + needs: [get-update-version, get-cortex-llamacpp-latest-version] + with: + ref: ${{ github.ref }} + new_version: ${{ needs.get-update-version.outputs.new_version }} + runs-on: ubuntu-latest + cmake-extra-flags: "-DCORTEX_VARIANT=prod" + tags: "menloltd/cortex:latest,menloltd/cortex:${{ needs.get-update-version.outputs.new_version }}" diff --git 
a/.github/workflows/template-build-docker-x64.yml b/.github/workflows/template-build-docker-x64.yml new file mode 100644 index 000000000..429397ed8 --- /dev/null +++ b/.github/workflows/template-build-docker-x64.yml @@ -0,0 +1,76 @@ +name: build-docker-x64 +on: + workflow_call: + inputs: + ref: + required: true + type: string + default: 'refs/heads/dev' + new_version: + required: true + type: string + default: '' + runs-on: + required: false + type: string + default: 'ubuntu-latest' + description: 'The runner to use for this job' + cmake-extra-flags: + required: false + type: string + default: '' + description: 'The cmake flags to use for this job' + tags: + required: false + type: string + default: 'menloltd/cortex:latest' + description: 'The tags to use for docker build and push' + +jobs: + build-docker-x64: + runs-on: ${{ inputs.runs-on }} + permissions: + contents: write + steps: + - name: Getting the repo + uses: actions/checkout@v3 + with: + ref: ${{ inputs.ref }} + submodules: 'recursive' + + - name: Set up QEMU + uses: docker/setup-qemu-action@v3 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Login to Docker Hub + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + + - name: Build and push + uses: docker/build-push-action@v6 + with: + context: . + file: ./docker/Dockerfile + push: true + tags: ${{ inputs.tags }} + build-args: | + CORTEX_CPP_VERSION=${{ inputs.new_version }} + CMAKE_EXTRA_FLAGS=${{ inputs.cmake-extra-flags }} + + - name: Update Docker Hub overview + env: + DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_USERNAME }} + DOCKERHUB_TOKEN: ${{ secrets.DOCKERHUB_TOKEN }} + run: | + README_CONTENT=$(cat docker/README.md | tr -d '\r' | sed ':a;N;$!ba;s/\n/\\n/g') + JSON_PAYLOAD=$(printf '{"full_description": "%s"}' "$README_CONTENT") + + curl -X PATCH \ + -H "Content-Type: application/json" \ + -H "Authorization: JWT $(curl -s -H "Content-Type: application/json" -X POST -d '{"username": "'"$DOCKERHUB_USERNAME"'", "password": "'"$DOCKERHUB_TOKEN"'"}' https://hub.docker.com/v2/users/login/ | jq -r .token)" \ + -d "{\"full_description\": \"$JSON_PAYLOAD\"}" \ + https://hub.docker.com/v2/repositories/menloltd/cortex/ \ No newline at end of file diff --git a/.github/workflows/template-build-linux-x64.yml b/.github/workflows/template-build-linux-x64.yml new file mode 100644 index 000000000..d1ca73844 --- /dev/null +++ b/.github/workflows/template-build-linux-x64.yml @@ -0,0 +1,249 @@ +name: build-linux-x64 +on: + workflow_call: + inputs: + ref: + required: true + type: string + default: 'refs/heads/main' + public_provider: + required: true + type: string + default: none + description: 'none: build only, github: build and publish to github, aws s3: build and publish to aws s3' + new_version: + required: true + type: string + default: '' + upload_url: + required: false + type: string + default: '' + runs-on: + required: false + type: string + default: 'ubuntu-20-04-cuda-12-0' + description: 'The runner to use for this job' + cmake-flags: + required: false + type: string + default: '' + description: 'The cmake flags to use for this job' + build-deps-cmake-flags: + required: false + type: string + default: '' + description: 'The cmake flags to use for this job' + ccache-dir: + required: false + type: string + default: '' + description: 'The ccache directory to use for this job' + channel: + required: true + type: string + default: 'nightly' + description: 'The channel to use for 
this job' + cortex-llamacpp-version: + required: true + type: string + default: '0.0.0' + description: 'The version of cortex-llamacpp to use for this job' + secrets: + DELTA_AWS_S3_BUCKET_NAME: + required: false + DELTA_AWS_ACCESS_KEY_ID: + required: false + DELTA_AWS_SECRET_ACCESS_KEY: + required: false + DELTA_AWS_REGION: + required: false + +jobs: + build-linux-x64: + runs-on: ${{ inputs.runs-on }} + permissions: + contents: write + steps: + - name: Getting the repo + uses: actions/checkout@v3 + with: + ref: ${{ inputs.ref }} + submodules: 'recursive' + + - name: use python 3.9 + uses: actions/setup-python@v4 + with: + python-version: '3.9' + + - name: Set output params for each channel + id : set-output-params + shell: bash + run: | + # Set output for stable channel + if [ "${{ inputs.channel }}" == "stable" ]; then + echo "::set-output name=package_name::cortexcpp" + echo "::set-output name=destination_binary_name::cortex" + echo "::set-output name=destination_binary_server_name::cortex-server" + echo "::set-output name=data_folder_name::cortexcpp" + echo "::set-output name=configuration_file_name::.cortexrc" + echo "::set-output name=uninstaller_file_name::cortex-uninstall.sh" + echo "::set-output name=iss_file_name::installer.iss" + fi + + # Set output for beta channel + if [ "${{ inputs.channel }}" == "beta" ]; then + echo "::set-output name=package_name::cortexcpp-beta" + echo "::set-output name=destination_binary_name::cortex-beta" + echo "::set-output name=destination_binary_server_name::cortex-server-beta" + echo "::set-output name=data_folder_name::cortexcpp-beta" + echo "::set-output name=configuration_file_name::.cortexrc-beta" + echo "::set-output name=uninstaller_file_name::cortex-beta-uninstall.sh" + echo "::set-output name=iss_file_name::installer-beta.iss" + fi + + # Set output for nightly channel + if [ "${{ inputs.channel }}" == "nightly" ]; then + echo "::set-output name=package_name::cortexcpp-nightly" + echo "::set-output name=destination_binary_name::cortex-nightly" + echo "::set-output name=destination_binary_server_name::cortex-server-nightly" + echo "::set-output name=data_folder_name::cortexcpp-nightly" + echo "::set-output name=configuration_file_name::.cortexrc-nightly" + echo "::set-output name=uninstaller_file_name::cortex-nightly-uninstall.sh" + echo "::set-output name=iss_file_name::installer-nightly.iss" + fi + + - name: Install jq + uses: dcarbone/install-jq-action@v2.0.1 + + - name: Install dependencies linux + run: | + sudo apt update && sudo apt install gettext-base -y + python3 -m pip install awscli + + - name: Configure vcpkg + run: | + cd engine + make configure-vcpkg + + - name: Build + run: | + cd engine + make build CMAKE_EXTRA_FLAGS="${{ inputs.cmake-flags }}" BUILD_DEPS_CMAKE_EXTRA_FLAGS="${{ inputs.build-deps-cmake-flags }}" + + - name: Install Python + uses: actions/setup-python@v4 + with: + python-version: '3.10' + + - name: Pre-package + run: | + cd engine + make pre-package DESTINATION_BINARY_NAME="${{ steps.set-output-params.outputs.destination_binary_name }}" DESTINATION_BINARY_SERVER_NAME="${{ steps.set-output-params.outputs.destination_binary_server_name }}" + + - name: Build network Installers + shell: bash + run: | + cd engine + make build-installer PACKAGE_NAME="${{ steps.set-output-params.outputs.package_name }}" SOURCE_BINARY_PATH="../../cortex/${{ steps.set-output-params.outputs.destination_binary_name }}" SOURCE_BINARY_SERVER_PATH="../../cortex/${{ steps.set-output-params.outputs.destination_binary_server_name }}" 
VERSION=${{ inputs.new_version }} DESTINATION_BINARY_NAME="${{ steps.set-output-params.outputs.destination_binary_name }}" DESTINATION_BINARY_SERVER_NAME="${{ steps.set-output-params.outputs.destination_binary_server_name }}" DATA_FOLDER_NAME="${{ steps.set-output-params.outputs.data_folder_name }}" CONFIGURATION_FILE_NAME="${{ steps.set-output-params.outputs.configuration_file_name }}" UNINSTALLER_FILE_NAME="${{ steps.set-output-params.outputs.uninstaller_file_name }}" + mv ${{ steps.set-output-params.outputs.package_name }}.deb ${{ steps.set-output-params.outputs.package_name }}-network.deb + + - name: Build local Installers + run: | + mkdir -p engine/templates/linux/dependencies + cd engine/templates/linux/dependencies + wget https://github.com/janhq/cortex.llamacpp/releases/download/v${{ inputs.cortex-llamacpp-version }}/cortex.llamacpp-${{ inputs.cortex-llamacpp-version }}-linux-amd64-avx-cuda-11-7.tar.gz + wget https://github.com/janhq/cortex.llamacpp/releases/download/v${{ inputs.cortex-llamacpp-version }}/cortex.llamacpp-${{ inputs.cortex-llamacpp-version }}-linux-amd64-avx-cuda-12-0.tar.gz + wget https://github.com/janhq/cortex.llamacpp/releases/download/v${{ inputs.cortex-llamacpp-version }}/cortex.llamacpp-${{ inputs.cortex-llamacpp-version }}-linux-amd64-avx.tar.gz + wget https://github.com/janhq/cortex.llamacpp/releases/download/v${{ inputs.cortex-llamacpp-version }}/cortex.llamacpp-${{ inputs.cortex-llamacpp-version }}-linux-amd64-avx2-cuda-11-7.tar.gz + wget https://github.com/janhq/cortex.llamacpp/releases/download/v${{ inputs.cortex-llamacpp-version }}/cortex.llamacpp-${{ inputs.cortex-llamacpp-version }}-linux-amd64-avx2-cuda-12-0.tar.gz + wget https://github.com/janhq/cortex.llamacpp/releases/download/v${{ inputs.cortex-llamacpp-version }}/cortex.llamacpp-${{ inputs.cortex-llamacpp-version }}-linux-amd64-avx2.tar.gz + wget https://github.com/janhq/cortex.llamacpp/releases/download/v${{ inputs.cortex-llamacpp-version }}/cortex.llamacpp-${{ inputs.cortex-llamacpp-version }}-linux-amd64-avx512-cuda-11-7.tar.gz + wget https://github.com/janhq/cortex.llamacpp/releases/download/v${{ inputs.cortex-llamacpp-version }}/cortex.llamacpp-${{ inputs.cortex-llamacpp-version }}-linux-amd64-avx512-cuda-12-0.tar.gz + wget https://github.com/janhq/cortex.llamacpp/releases/download/v${{ inputs.cortex-llamacpp-version }}/cortex.llamacpp-${{ inputs.cortex-llamacpp-version }}-linux-amd64-avx512.tar.gz + wget https://github.com/janhq/cortex.llamacpp/releases/download/v${{ inputs.cortex-llamacpp-version }}/cortex.llamacpp-${{ inputs.cortex-llamacpp-version }}-linux-amd64-noavx-cuda-11-7.tar.gz + wget https://github.com/janhq/cortex.llamacpp/releases/download/v${{ inputs.cortex-llamacpp-version }}/cortex.llamacpp-${{ inputs.cortex-llamacpp-version }}-linux-amd64-noavx-cuda-12-0.tar.gz + wget https://github.com/janhq/cortex.llamacpp/releases/download/v${{ inputs.cortex-llamacpp-version }}/cortex.llamacpp-${{ inputs.cortex-llamacpp-version }}-linux-amd64-noavx.tar.gz + wget https://github.com/janhq/cortex.llamacpp/releases/download/v${{ inputs.cortex-llamacpp-version }}/cortex.llamacpp-${{ inputs.cortex-llamacpp-version }}-linux-amd64-vulkan.tar.gz + wget https://github.com/janhq/cortex.llamacpp/releases/download/v${{ inputs.cortex-llamacpp-version }}/cuda-11-7-linux-amd64.tar.gz + wget https://github.com/janhq/cortex.llamacpp/releases/download/v${{ inputs.cortex-llamacpp-version }}/cuda-12-0-linux-amd64.tar.gz + cd .. 
+ + # Remove network package + ls -al + rm -rf ${{ steps.set-output-params.outputs.package_name }} + rm ${{ steps.set-output-params.outputs.package_name }}.deb + chmod +x create_deb_local.sh + ./create_deb_local.sh ${{ steps.set-output-params.outputs.package_name }} ${{ inputs.new_version }} ../../cortex/${{ steps.set-output-params.outputs.destination_binary_name }} ../../cortex/${{ steps.set-output-params.outputs.destination_binary_server_name }} ${{ steps.set-output-params.outputs.destination_binary_name }} ${{ steps.set-output-params.outputs.destination_binary_server_name }} ${{ steps.set-output-params.outputs.data_folder_name }} ${{ steps.set-output-params.outputs.configuration_file_name }}; + cp ${{ steps.set-output-params.outputs.package_name }}.deb ../../${{ steps.set-output-params.outputs.package_name }}-local.deb + + - name: Package + run: | + cd engine + make package + + - name: Upload Artifact + uses: actions/upload-artifact@v4 + with: + name: cortex-${{ inputs.new_version }}-linux-amd64 + path: ./engine/cortex + + - name: Upload Artifact + uses: actions/upload-artifact@v4 + with: + name: cortex-${{ inputs.new_version }}-linux-amd64-network-installer + path: ./engine/${{ steps.set-output-params.outputs.package_name }}-network.deb + + - name: Upload Artifact + uses: actions/upload-artifact@v4 + with: + name: cortex-${{ inputs.new_version }}-linux-amd64-local-installer + path: ./engine/${{ steps.set-output-params.outputs.package_name }}-local.deb + + - name: upload to aws s3 if public provider is aws + if: inputs.public_provider == 'aws-s3' + run: | + aws s3 cp ./engine/cortex.tar.gz s3://${{ secrets.DELTA_AWS_S3_BUCKET_NAME }}/cortex/temp-latest/linux-amd64-cortex-nightly.tar.gz + aws s3 cp ./engine/${{ steps.set-output-params.outputs.package_name }}-network.deb s3://${{ secrets.DELTA_AWS_S3_BUCKET_NAME }}/cortex/temp-latest/cortex-linux-amd64-network-installer.deb + + aws s3 cp ./engine/cortex.tar.gz s3://${{ secrets.DELTA_AWS_S3_BUCKET_NAME }}/cortex/v${{ inputs.new_version }}/linux-amd64/cortex-nightly.tar.gz + aws s3 cp ./engine/${{ steps.set-output-params.outputs.package_name }}-network.deb s3://${{ secrets.DELTA_AWS_S3_BUCKET_NAME }}/cortex/v${{ inputs.new_version }}/linux-amd64/cortex-${{ inputs.new_version }}-linux-amd64-network-installer.deb + aws s3 cp ./engine/${{ steps.set-output-params.outputs.package_name }}-local.deb s3://${{ secrets.DELTA_AWS_S3_BUCKET_NAME }}/cortex/v${{ inputs.new_version }}/linux-amd64/cortex-${{ inputs.new_version }}-linux-amd64-local-installer.deb + env: + AWS_ACCESS_KEY_ID: ${{ secrets.DELTA_AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.DELTA_AWS_SECRET_ACCESS_KEY }} + AWS_DEFAULT_REGION: ${{ secrets.DELTA_AWS_REGION }} + AWS_EC2_METADATA_DISABLED: "true" + + - name: Upload release assert if public provider is github + if: inputs.public_provider == 'github' + uses: actions/upload-release-asset@v1.0.1 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + with: + upload_url: ${{ inputs.upload_url }} + asset_path: ./engine/cortex.tar.gz + asset_name: cortex-${{ inputs.new_version }}-linux-amd64.tar.gz + asset_content_type: application/zip + + - name: Upload release assert if public provider is github + if: inputs.public_provider == 'github' + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + uses: actions/upload-release-asset@v1.0.1 + with: + upload_url: ${{ inputs.upload_url }} + asset_path: ./engine/${{ steps.set-output-params.outputs.package_name }}-network.deb + asset_name: cortex-${{ inputs.new_version 
}}-linux-amd64-network-installer.deb + asset_content_type: application/octet-stream + + - name: Upload release assert if public provider is github + if: inputs.public_provider == 'github' + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + uses: actions/upload-release-asset@v1.0.1 + with: + upload_url: ${{ inputs.upload_url }} + asset_path: ./engine/${{ steps.set-output-params.outputs.package_name }}-local.deb + asset_name: cortex-${{ inputs.new_version }}-linux-amd64-local-installer.deb + asset_content_type: application/octet-stream \ No newline at end of file diff --git a/.github/workflows/template-build-macos.yml b/.github/workflows/template-build-macos.yml new file mode 100644 index 000000000..371468dfb --- /dev/null +++ b/.github/workflows/template-build-macos.yml @@ -0,0 +1,381 @@ +name: build-mac +on: + workflow_call: + inputs: + ref: + required: true + type: string + default: 'refs/heads/main' + public_provider: + required: true + type: string + default: none + description: 'none: build only, github: build and publish to github, aws s3: build and publish to aws s3' + new_version: + required: true + type: string + default: '' + upload_url: + required: false + type: string + default: '' + cmake-flags: + required: false + type: string + default: '' + description: 'The cmake flags to use for this job' + build-deps-cmake-flags: + required: false + type: string + default: '' + description: 'The cmake flags to use for this job' + ccache-dir: + required: false + type: string + default: '' + description: 'The ccache directory to use for this job' + channel: + required: true + type: string + default: 'nightly' + description: 'The channel to use for this job' + cortex-llamacpp-version: + required: true + type: string + default: '0.0.0' + description: 'The version of cortex-llamacpp to use for this job' + secrets: + DELTA_AWS_S3_BUCKET_NAME: + required: false + DELTA_AWS_ACCESS_KEY_ID: + required: false + DELTA_AWS_SECRET_ACCESS_KEY: + required: false + DELTA_AWS_REGION: + required: false + NOTARIZE_P8_BASE64: + required: false + CODE_SIGN_P12_BASE64: + required: false + CODE_SIGN_P12_PASSWORD: + required: false + DEVELOPER_ID: + required: false + NOTARY_KEY_ID: + required: false + NOTARY_ISSUER: + required: false + APPLE_ID: + required: false + APPLE_APP_SPECIFIC_PASSWORD: + required: false + APPLE_TEAM_ID: + required: false + +jobs: + build-mac: + runs-on: ${{ matrix.runs-on }} + permissions: + contents: write + strategy: + fail-fast: false + matrix: + include: + - arch: 'arm64' + runs-on: 'macos-silicon' + extra-cmake-flags: "-DMAC_ARM64=ON" + + - arch: 'amd64' + runs-on: 'macos-selfhosted-12' + extra-cmake-flags: '' + steps: + - name: Getting the repo + uses: actions/checkout@v3 + with: + ref: ${{ inputs.ref }} + submodules: 'recursive' + + - name: use python + uses: actions/setup-python@v5 + with: + python-version: '3.10' + + - name: Set output params for each channel + id : set-output-params + shell: bash + run: | + # Set output for stable channel + if [ "${{ inputs.channel }}" == "stable" ]; then + echo "::set-output name=package_name::cortexcpp" + echo "::set-output name=destination_binary_name::cortex" + echo "::set-output name=destination_binary_server_name::cortex-server" + echo "::set-output name=data_folder_name::cortexcpp" + echo "::set-output name=configuration_file_name::.cortexrc" + echo "::set-output name=uninstaller_file_name::cortex-uninstall.sh" + echo "::set-output name=iss_file_name::installer.iss" + fi + + # Set output for beta channel + if [ "${{ inputs.channel }}" == 
"beta" ]; then + echo "::set-output name=package_name::cortexcpp-beta" + echo "::set-output name=destination_binary_name::cortex-beta" + echo "::set-output name=destination_binary_server_name::cortex-server-beta" + echo "::set-output name=data_folder_name::cortexcpp-beta" + echo "::set-output name=configuration_file_name::.cortexrc-beta" + echo "::set-output name=uninstaller_file_name::cortex-beta-uninstall.sh" + echo "::set-output name=iss_file_name::installer-beta.iss" + fi + + # Set output for nightly channel + if [ "${{ inputs.channel }}" == "nightly" ]; then + echo "::set-output name=package_name::cortexcpp-nightly" + echo "::set-output name=destination_binary_name::cortex-nightly" + echo "::set-output name=destination_binary_server_name::cortex-server-nightly" + echo "::set-output name=data_folder_name::cortexcpp-nightly" + echo "::set-output name=configuration_file_name::.cortexrc-nightly" + echo "::set-output name=uninstaller_file_name::cortex-nightly-uninstall.sh" + echo "::set-output name=iss_file_name::installer-nightly.iss" + fi + + - name: Install jq + uses: dcarbone/install-jq-action@v2.0.1 + + - name: Configure vcpkg + run: | + cd engine + make configure-vcpkg + + - name: Build + run: | + cd engine + make build CMAKE_EXTRA_FLAGS="${{ inputs.cmake-flags }} ${{ matrix.extra-cmake-flags }}" BUILD_DEPS_CMAKE_EXTRA_FLAGS="${{ inputs.build-deps-cmake-flags }}" + + - name: Pre-package + run: | + cd engine + make pre-package DESTINATION_BINARY_NAME="${{ steps.set-output-params.outputs.destination_binary_name }}" DESTINATION_BINARY_SERVER_NAME="${{ steps.set-output-params.outputs.destination_binary_server_name }}" + + - name: Upload Artifact + uses: actions/upload-artifact@v4 + with: + name: cortex-${{ inputs.new_version }}-mac-${{ matrix.arch}} + path: ./engine/cortex + + build-universal: + runs-on: macos-latest + needs: build-mac + permissions: + contents: write + steps: + - name: Getting the repo + uses: actions/checkout@v4 + with: + ref: ${{ inputs.ref }} + submodules: 'recursive' + + - name: Set output params for each channel + id : set-output-params + shell: bash + run: | + # Set output for stable channel + if [ "${{ inputs.channel }}" == "stable" ]; then + echo "::set-output name=package_name::cortexcpp" + echo "::set-output name=destination_binary_name::cortex" + echo "::set-output name=destination_binary_server_name::cortex-server" + echo "::set-output name=data_folder_name::cortexcpp" + echo "::set-output name=configuration_file_name::.cortexrc" + echo "::set-output name=uninstaller_file_name::cortex-uninstall.sh" + echo "::set-output name=iss_file_name::installer.iss" + fi + + # Set output for beta channel + if [ "${{ inputs.channel }}" == "beta" ]; then + echo "::set-output name=package_name::cortexcpp-beta" + echo "::set-output name=destination_binary_name::cortex-beta" + echo "::set-output name=destination_binary_server_name::cortex-server-beta" + echo "::set-output name=data_folder_name::cortexcpp-beta" + echo "::set-output name=configuration_file_name::.cortexrc-beta" + echo "::set-output name=uninstaller_file_name::cortex-beta-uninstall.sh" + echo "::set-output name=iss_file_name::installer-beta.iss" + fi + + # Set output for nightly channel + if [ "${{ inputs.channel }}" == "nightly" ]; then + echo "::set-output name=package_name::cortexcpp-nightly" + echo "::set-output name=destination_binary_name::cortex-nightly" + echo "::set-output name=destination_binary_server_name::cortex-server-nightly" + echo "::set-output name=data_folder_name::cortexcpp-nightly" + echo 
"::set-output name=configuration_file_name::.cortexrc-nightly" + echo "::set-output name=uninstaller_file_name::cortex-nightly-uninstall.sh" + echo "::set-output name=iss_file_name::installer-nightly.iss" + fi + + - name: Install jq + uses: dcarbone/install-jq-action@v2.0.1 + + - name: Get Cer for code signing + run: base64 -d <<< "$NOTARIZE_P8_BASE64" > /tmp/notary-key.p8 + shell: bash + env: + NOTARIZE_P8_BASE64: ${{ secrets.NOTARIZE_P8_BASE64 }} + + - uses: apple-actions/import-codesign-certs@v2 + continue-on-error: true + with: + p12-file-base64: ${{ secrets.CODE_SIGN_P12_BASE64 }} + p12-password: ${{ secrets.CODE_SIGN_P12_PASSWORD }} + + - name: download artifacts mac arm64 + uses: actions/download-artifact@v4 + with: + name: cortex-${{ inputs.new_version }}-mac-arm64 + path: ./cortex-${{ inputs.new_version }}-mac-arm64 + + - name: download artifacts mac amd64 + uses: actions/download-artifact@v4 + with: + name: cortex-${{ inputs.new_version }}-mac-amd64 + path: ./cortex-${{ inputs.new_version }}-mac-amd64 + + - name: create universal binary + run: | + mkdir -p engine/cortex + ls -al + find . | grep ${{ steps.set-output-params.outputs.destination_binary_name }} + find . | grep ${{ steps.set-output-params.outputs.destination_binary_server_name }} + lipo -create cortex-${{ inputs.new_version }}-mac-arm64/${{ steps.set-output-params.outputs.destination_binary_name }} cortex-${{ inputs.new_version }}-mac-amd64/${{ steps.set-output-params.outputs.destination_binary_name }} -output engine/cortex/${{ steps.set-output-params.outputs.destination_binary_name }} + lipo -create cortex-${{ inputs.new_version }}-mac-arm64/${{ steps.set-output-params.outputs.destination_binary_server_name }} cortex-${{ inputs.new_version }}-mac-amd64/${{ steps.set-output-params.outputs.destination_binary_server_name }} -output engine/cortex/${{ steps.set-output-params.outputs.destination_binary_server_name }} + chmod +x engine/cortex/${{ steps.set-output-params.outputs.destination_binary_name }} + chmod +x engine/cortex/${{ steps.set-output-params.outputs.destination_binary_server_name }} + + - name: Code Signing binaries + run: | + cd engine + make codesign-binary CODE_SIGN=true DEVELOPER_ID="${{ secrets.DEVELOPER_ID }}" DESTINATION_BINARY_NAME="${{ steps.set-output-params.outputs.destination_binary_name }}" DESTINATION_BINARY_SERVER_NAME="${{ steps.set-output-params.outputs.destination_binary_server_name }}" + + - name: Notary macOS Binary + run: | + curl -sSfL https://raw.githubusercontent.com/anchore/quill/main/install.sh | sh -s -- -b /usr/local/bin + cd engine/cortex + # Notarize the binary + quill notarize ./${{ steps.set-output-params.outputs.destination_binary_name }} + quill notarize ./${{ steps.set-output-params.outputs.destination_binary_server_name }} + env: + QUILL_NOTARY_KEY_ID: ${{ secrets.NOTARY_KEY_ID }} + QUILL_NOTARY_ISSUER: ${{ secrets.NOTARY_ISSUER }} + QUILL_NOTARY_KEY: "/tmp/notary-key.p8" + + - name: Build network Installers + shell: bash + run: | + cd engine + make build-installer PACKAGE_NAME="${{ steps.set-output-params.outputs.package_name }}" VERSION=${{ inputs.new_version }} SOURCE_BINARY_PATH="../../cortex/${{ steps.set-output-params.outputs.destination_binary_name }}" SOURCE_BINARY_SERVER_PATH="../../cortex/${{ steps.set-output-params.outputs.destination_binary_server_name }}" DESTINATION_BINARY_NAME="${{ steps.set-output-params.outputs.destination_binary_name }}" DESTINATION_BINARY_SERVER_NAME="${{ steps.set-output-params.outputs.destination_binary_server_name }}" 
DATA_FOLDER_NAME="${{ steps.set-output-params.outputs.data_folder_name }}" CONFIGURATION_FILE_NAME="${{ steps.set-output-params.outputs.configuration_file_name }}" UNINSTALLER_FILE_NAME="${{ steps.set-output-params.outputs.uninstaller_file_name }}" + cat templates/macos/Scripts/postinstall + + - name: Codesign and notary for macos installer + run: | + cd engine + productsign --sign "Developer ID Installer: ${{ secrets.DEVELOPER_ID }}" ${{ steps.set-output-params.outputs.package_name }}.pkg ${{ steps.set-output-params.outputs.package_name }}$-signed.pkg + rm ${{ steps.set-output-params.outputs.package_name }}.pkg + mv ${{ steps.set-output-params.outputs.package_name }}$-signed.pkg Distribution.pkg + productbuild --synthesize --package Distribution.pkg Distribution.xml + sed -i '' 's/require-scripts="false"/require-scripts="true"/' Distribution.xml + cat Distribution.xml + productbuild --distribution Distribution.xml --sign "Developer ID Installer: ${{ secrets.DEVELOPER_ID }}" --package-path . ${{ steps.set-output-params.outputs.package_name }}-network.pkg + xcrun notarytool submit ${{ steps.set-output-params.outputs.package_name }}-network.pkg --apple-id ${{ secrets.APPLE_ID }} --password ${{ secrets.APPLE_APP_SPECIFIC_PASSWORD }} --team-id ${{ secrets.APPLE_TEAM_ID }} --wait + + - name: Build local Installers + shell: bash + run: | + mkdir -p engine/templates/macos/Scripts/dependencies + cd engine/templates/macos/Scripts/dependencies + wget https://github.com/janhq/cortex.llamacpp/releases/download/v${{ inputs.cortex-llamacpp-version }}/cortex.llamacpp-${{ inputs.cortex-llamacpp-version }}-mac-arm64.tar.gz + wget https://github.com/janhq/cortex.llamacpp/releases/download/v${{ inputs.cortex-llamacpp-version }}/cortex.llamacpp-${{ inputs.cortex-llamacpp-version }}-mac-amd64.tar.gz + + cd ../../ + chmod +x create_pkg_local.sh + ./create_pkg_local.sh ${{ steps.set-output-params.outputs.package_name }} ${{ inputs.new_version }} ../../cortex/${{ steps.set-output-params.outputs.destination_binary_name }} ../../cortex/${{ steps.set-output-params.outputs.destination_binary_server_name }} ${{ steps.set-output-params.outputs.destination_binary_name }} ${{ steps.set-output-params.outputs.destination_binary_server_name }} ${{ steps.set-output-params.outputs.data_folder_name }} ${{ steps.set-output-params.outputs.configuration_file_name }} ${{ steps.set-output-params.outputs.uninstaller_file_name }} + cp ${{ steps.set-output-params.outputs.package_name }}.pkg ../../ + + - name: Codesign and notary for macos installer + run: | + cd engine + productsign --sign "Developer ID Installer: ${{ secrets.DEVELOPER_ID }}" ${{ steps.set-output-params.outputs.package_name }}.pkg ${{ steps.set-output-params.outputs.package_name }}$-signed.pkg + rm ${{ steps.set-output-params.outputs.package_name }}.pkg + mv ${{ steps.set-output-params.outputs.package_name }}$-signed.pkg Distribution.pkg + productbuild --synthesize --package Distribution.pkg Distribution.xml + sed -i '' 's/require-scripts="false"/require-scripts="true"/' Distribution.xml + cat Distribution.xml + productbuild --distribution Distribution.xml --sign "Developer ID Installer: ${{ secrets.DEVELOPER_ID }}" --package-path . 
${{ steps.set-output-params.outputs.package_name }}-local.pkg + xcrun notarytool submit ${{ steps.set-output-params.outputs.package_name }}-local.pkg --apple-id ${{ secrets.APPLE_ID }} --password ${{ secrets.APPLE_APP_SPECIFIC_PASSWORD }} --team-id ${{ secrets.APPLE_TEAM_ID }} --wait + + - name: Package + run: | + cd engine + make package + + - name: Upload Artifact + uses: actions/upload-artifact@v4 + with: + name: cortex-${{ inputs.new_version }}-mac-universal + path: ./engine/cortex + + - name: Upload Artifact + uses: actions/upload-artifact@v4 + with: + name: cortex-${{ inputs.new_version }}-mac-universal-network-installer + path: ./engine/${{ steps.set-output-params.outputs.package_name }}-network.pkg + + - name: Upload Artifact + uses: actions/upload-artifact@v4 + with: + name: cortex-${{ inputs.new_version }}-mac-universal-local-installer + path: ./engine/${{ steps.set-output-params.outputs.package_name }}-local.pkg + + - name: upload to aws s3 if public provider is aws + if: inputs.public_provider == 'aws-s3' + run: | + aws s3 cp ./engine/cortex.tar.gz s3://${{ secrets.DELTA_AWS_S3_BUCKET_NAME }}/cortex/temp-latest/mac-universal-cortex-nightly.tar.gz + aws s3 cp ./engine/${{ steps.set-output-params.outputs.package_name }}-network.pkg s3://${{ secrets.DELTA_AWS_S3_BUCKET_NAME }}/cortex/temp-latest/cortex-mac-universal-network-installer.pkg + + aws s3 cp ./engine/cortex.tar.gz s3://${{ secrets.DELTA_AWS_S3_BUCKET_NAME }}/cortex/v${{ inputs.new_version }}/mac-universal/cortex-nightly.tar.gz + aws s3 cp ./engine/${{ steps.set-output-params.outputs.package_name }}-network.pkg s3://${{ secrets.DELTA_AWS_S3_BUCKET_NAME }}/cortex/v${{ inputs.new_version }}/mac-universal/cortex-${{ inputs.new_version }}-mac-universal-network-installer.pkg + aws s3 cp ./engine/${{ steps.set-output-params.outputs.package_name }}-local.pkg s3://${{ secrets.DELTA_AWS_S3_BUCKET_NAME }}/cortex/v${{ inputs.new_version }}/mac-universal/cortex-${{ inputs.new_version }}-mac-universal-local-installer.pkg + env: + AWS_ACCESS_KEY_ID: ${{ secrets.DELTA_AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.DELTA_AWS_SECRET_ACCESS_KEY }} + AWS_DEFAULT_REGION: ${{ secrets.DELTA_AWS_REGION }} + AWS_EC2_METADATA_DISABLED: "true" + + - name: Upload release asset if public provider is github + if: inputs.public_provider == 'github' + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + uses: actions/upload-release-asset@v1.0.1 + with: + upload_url: ${{ inputs.upload_url }} + asset_path: ./engine/cortex.tar.gz + asset_name: cortex-${{ inputs.new_version }}-mac-universal.tar.gz + asset_content_type: application/zip + + - name: Upload release asset if public provider is github + if: inputs.public_provider == 'github' + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + uses: actions/upload-release-asset@v1.0.1 + with: + upload_url: ${{ inputs.upload_url }} + asset_path: ./engine/${{ steps.set-output-params.outputs.package_name }}-network.pkg + asset_name: cortex-${{ inputs.new_version }}-mac-universal-network-installer.pkg + asset_content_type: application/octet-stream + + - name: Upload release asset if public provider is github + if: inputs.public_provider == 'github' + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + uses: actions/upload-release-asset@v1.0.1 + with: + upload_url: ${{ inputs.upload_url }} + asset_path: ./engine/${{ steps.set-output-params.outputs.package_name }}-local.pkg + asset_name: cortex-${{ inputs.new_version }}-mac-universal-local-installer.pkg + asset_content_type: application/octet-stream \ No 
newline at end of file diff --git a/.github/workflows/template-build-windows-x64.yml b/.github/workflows/template-build-windows-x64.yml new file mode 100644 index 000000000..d1f6f1333 --- /dev/null +++ b/.github/workflows/template-build-windows-x64.yml @@ -0,0 +1,321 @@ +name: build-windows-x64 +on: + workflow_call: + inputs: + ref: + required: true + type: string + default: 'refs/heads/main' + public_provider: + required: true + type: string + default: none + description: 'none: build only, github: build and publish to github, aws s3: build and publish to aws s3' + new_version: + required: true + type: string + default: '' + upload_url: + required: false + type: string + default: '' + runs-on: + required: false + type: string + default: 'windows-cuda-12-0' + description: 'The runner to use for this job' + cmake-flags: + required: false + type: string + default: '' + description: 'The cmake flags to use for this job' + build-deps-cmake-flags: + required: false + type: string + default: '' + description: 'The cmake flags to use for this job' + ccache-dir: + required: false + type: string + default: 'C:\Users\ContainerAdministrator\AppData\Local\ccache' + description: 'The ccache directory to use for this job' + channel: + required: true + type: string + default: 'nightly' + description: 'The channel to use for this job' + cortex-llamacpp-version: + required: true + type: string + default: '0.0.0' + description: 'The version of cortex-llamacpp to use for this job' + secrets: + MINIO_BUCKET_NAME: + required: false + MINIO_ENDPOINT: + required: false + MINIO_ACCESS_KEY_ID: + required: false + MINIO_SECRET_ACCESS_KEY: + required: false + MINIO_REGION: + required: false + DEVELOPER_ID: + required: false + AZURE_KEY_VAULT_URI: + required: false + AZURE_CLIENT_ID: + required: false + AZURE_TENANT_ID: + required: false + AZURE_CLIENT_SECRET: + required: false + AZURE_CERT_NAME: + required: false + DELTA_AWS_S3_BUCKET_NAME: + required: false + DELTA_AWS_ACCESS_KEY_ID: + required: false + DELTA_AWS_SECRET_ACCESS_KEY: + required: false + DELTA_AWS_REGION: + required: false + +jobs: + build-windows-x64: + runs-on: ${{ inputs.runs-on }} + permissions: + contents: write + steps: + - name: Getting the repo + uses: actions/checkout@v3 + with: + ref: ${{ inputs.ref }} + submodules: 'recursive' + + - uses: actions/setup-dotnet@v3 + with: + dotnet-version: "8.0.x" + + - name: Set output params for each channel + id : set-output-params + shell: bash + run: | + # Set output for stable channel + if [ "${{ inputs.channel }}" == "stable" ]; then + echo "::set-output name=package_name::cortexcpp" + echo "::set-output name=destination_binary_name::cortex" + echo "::set-output name=destination_binary_server_name::cortex-server" + echo "::set-output name=data_folder_name::cortexcpp" + echo "::set-output name=configuration_file_name::.cortexrc" + echo "::set-output name=uninstaller_file_name::cortex-uninstall.sh" + echo "::set-output name=iss_file_name::installer.iss" + echo "::set-output name=local_iss_file_name::local-installer.iss" + fi + + # Set output for beta channel + if [ "${{ inputs.channel }}" == "beta" ]; then + echo "::set-output name=package_name::cortexcpp-beta" + echo "::set-output name=destination_binary_name::cortex-beta" + echo "::set-output name=destination_binary_server_name::cortex-server-beta" + echo "::set-output name=data_folder_name::cortexcpp-beta" + echo "::set-output name=configuration_file_name::.cortexrc-beta" + echo "::set-output name=uninstaller_file_name::cortex-beta-uninstall.sh" + 
echo "::set-output name=iss_file_name::installer-beta.iss" + echo "::set-output name=local_iss_file_name::local-installer-beta.iss" + fi + + # Set output for nightly channel + if [ "${{ inputs.channel }}" == "nightly" ]; then + echo "::set-output name=package_name::cortexcpp-nightly" + echo "::set-output name=destination_binary_name::cortex-nightly" + echo "::set-output name=destination_binary_server_name::cortex-server-nightly" + echo "::set-output name=data_folder_name::cortexcpp-nightly" + echo "::set-output name=configuration_file_name::.cortexrc-nightly" + echo "::set-output name=uninstaller_file_name::cortex-nightly-uninstall.sh" + echo "::set-output name=iss_file_name::installer-nightly.iss" + echo "::set-output name=local_iss_file_name::local-installer-nightly.iss" + fi + + - name: Install jq + uses: dcarbone/install-jq-action@v2.0.1 + + - name: Install dependencies on Windows + run: | + choco install make pkgconfiglite ccache awscli 7zip ninja wget -y + dotnet tool install --global AzureSignTool + + - name: Download ccache from s3 + continue-on-error: true + run: | + Import-Module "$env:ChocolateyInstall\helpers\chocolateyProfile.psm1" + refreshenv + aws s3 sync s3://${{ secrets.MINIO_BUCKET_NAME }}/cortex-cpp-windows-amd64 ${{ inputs.ccache-dir }} --endpoint ${{ secrets.MINIO_ENDPOINT }} + env: + AWS_ACCESS_KEY_ID: "${{ secrets.MINIO_ACCESS_KEY_ID }}" + AWS_SECRET_ACCESS_KEY: "${{ secrets.MINIO_SECRET_ACCESS_KEY }}" + AWS_DEFAULT_REGION: "${{ secrets.MINIO_REGION }}" + + - name: Configure vcpkg + shell: cmd + run: | + cd engine + make configure-vcpkg + + - name: Build + run: | + cd engine + make build CMAKE_EXTRA_FLAGS="${{ inputs.cmake-flags }}" BUILD_DEPS_CMAKE_EXTRA_FLAGS="${{ inputs.build-deps-cmake-flags }}" + + - name: Install Python + uses: actions/setup-python@v4 + with: + python-version: '3.10' + + - name: Pre-package + run: | + cd engine + make pre-package DESTINATION_BINARY_NAME="${{ steps.set-output-params.outputs.destination_binary_name }}" DESTINATION_BINARY_SERVER_NAME="${{ steps.set-output-params.outputs.destination_binary_server_name }}" + + - name: Code Signing binaries + run: | + cd engine + make codesign-binary DESTINATION_BINARY_NAME="${{ steps.set-output-params.outputs.destination_binary_name }}" DESTINATION_BINARY_SERVER_NAME="${{ steps.set-output-params.outputs.destination_binary_server_name }}" AZURE_KEY_VAULT_URI="${{ secrets.AZURE_KEY_VAULT_URI }}" AZURE_CLIENT_ID="${{ secrets.AZURE_CLIENT_ID }}" AZURE_TENANT_ID="${{ secrets.AZURE_TENANT_ID }}" AZURE_CLIENT_SECRET="${{ secrets.AZURE_CLIENT_SECRET }}" AZURE_CERT_NAME="${{ secrets.AZURE_CERT_NAME }}" + + - name: Update version in installer.iss using sed + shell: bash + run: | + cd engine/templates/windows + sed -i "s/AppVersion=1.0/AppVersion=${{ inputs.new_version }}/g" ${{ steps.set-output-params.outputs.iss_file_name }} + sed -i "s/AppVersion=1.0/AppVersion=${{ inputs.new_version }}/g" ${{ steps.set-output-params.outputs.local_iss_file_name }} + cat ${{ steps.set-output-params.outputs.iss_file_name }} + cp ${{ steps.set-output-params.outputs.iss_file_name }} ../../../ + cp ${{ steps.set-output-params.outputs.local_iss_file_name }} ../../../ + ls ../../../ + + - name: Build network Installers + shell: bash + run: | + cd engine + make build-installer PACKAGE_NAME=${{ steps.set-output-params.outputs.package_name }} VERSION=${{ inputs.new_version }} DESTINATION_BINARY_NAME="${{ steps.set-output-params.outputs.destination_binary_name }}" DESTINATION_BINARY_SERVER_NAME="${{ 
steps.set-output-params.outputs.destination_binary_server_name }}" + ls ../ + + - name: Build local Installers + shell: powershell + run: | + mkdir dependencies + cd dependencies + wget.exe https://github.com/janhq/cortex.llamacpp/releases/download/v${{ inputs.cortex-llamacpp-version }}/cortex.llamacpp-${{ inputs.cortex-llamacpp-version }}-windows-amd64-avx-cuda-11-7.tar.gz + wget.exe https://github.com/janhq/cortex.llamacpp/releases/download/v${{ inputs.cortex-llamacpp-version }}/cortex.llamacpp-${{ inputs.cortex-llamacpp-version }}-windows-amd64-avx-cuda-12-0.tar.gz + wget.exe https://github.com/janhq/cortex.llamacpp/releases/download/v${{ inputs.cortex-llamacpp-version }}/cortex.llamacpp-${{ inputs.cortex-llamacpp-version }}-windows-amd64-avx.tar.gz + wget.exe https://github.com/janhq/cortex.llamacpp/releases/download/v${{ inputs.cortex-llamacpp-version }}/cortex.llamacpp-${{ inputs.cortex-llamacpp-version }}-windows-amd64-avx2-cuda-11-7.tar.gz + wget.exe https://github.com/janhq/cortex.llamacpp/releases/download/v${{ inputs.cortex-llamacpp-version }}/cortex.llamacpp-${{ inputs.cortex-llamacpp-version }}-windows-amd64-avx2-cuda-12-0.tar.gz + wget.exe https://github.com/janhq/cortex.llamacpp/releases/download/v${{ inputs.cortex-llamacpp-version }}/cortex.llamacpp-${{ inputs.cortex-llamacpp-version }}-windows-amd64-avx2.tar.gz + wget.exe https://github.com/janhq/cortex.llamacpp/releases/download/v${{ inputs.cortex-llamacpp-version }}/cortex.llamacpp-${{ inputs.cortex-llamacpp-version }}-windows-amd64-avx512-cuda-11-7.tar.gz + wget.exe https://github.com/janhq/cortex.llamacpp/releases/download/v${{ inputs.cortex-llamacpp-version }}/cortex.llamacpp-${{ inputs.cortex-llamacpp-version }}-windows-amd64-avx512-cuda-12-0.tar.gz + wget.exe https://github.com/janhq/cortex.llamacpp/releases/download/v${{ inputs.cortex-llamacpp-version }}/cortex.llamacpp-${{ inputs.cortex-llamacpp-version }}-windows-amd64-avx512.tar.gz + wget.exe https://github.com/janhq/cortex.llamacpp/releases/download/v${{ inputs.cortex-llamacpp-version }}/cortex.llamacpp-${{ inputs.cortex-llamacpp-version }}-windows-amd64-noavx-cuda-11-7.tar.gz + wget.exe https://github.com/janhq/cortex.llamacpp/releases/download/v${{ inputs.cortex-llamacpp-version }}/cortex.llamacpp-${{ inputs.cortex-llamacpp-version }}-windows-amd64-noavx-cuda-12-0.tar.gz + wget.exe https://github.com/janhq/cortex.llamacpp/releases/download/v${{ inputs.cortex-llamacpp-version }}/cortex.llamacpp-${{ inputs.cortex-llamacpp-version }}-windows-amd64-noavx.tar.gz + wget.exe https://github.com/janhq/cortex.llamacpp/releases/download/v${{ inputs.cortex-llamacpp-version }}/cortex.llamacpp-${{ inputs.cortex-llamacpp-version }}-windows-amd64-vulkan.tar.gz + wget.exe https://github.com/janhq/cortex.llamacpp/releases/download/v${{ inputs.cortex-llamacpp-version }}/cuda-11-7-windows-amd64.tar.gz + wget.exe https://github.com/janhq/cortex.llamacpp/releases/download/v${{ inputs.cortex-llamacpp-version }}/cuda-12-0-windows-amd64.tar.gz + + - name: Enable long paths + run: | + reg add "HKEY_LOCAL_MACHINE\SYSTEM\CurrentControlSet\Control\FileSystem" /v LongPathsEnabled /t REG_DWORD /d 1 /f + + - name: Compile .ISS to .EXE network Installer + uses: nadeemjazmawe/inno-setup-action-cli@v6.0.5 + with: + filepath: ./${{ steps.set-output-params.outputs.iss_file_name }} + + - name: Codesign for windows network installer + shell: pwsh + run: | + ~\.dotnet\tools\azuresigntool.exe sign -kvu ${{ secrets.AZURE_KEY_VAULT_URI }} -kvi ${{ secrets.AZURE_CLIENT_ID }} -kvt ${{ 
secrets.AZURE_TENANT_ID }} -kvs ${{ secrets.AZURE_CLIENT_SECRET }} -kvc ${{ secrets.AZURE_CERT_NAME }} -tr http://timestamp.globalsign.com/tsa/r6advanced1 -v ".\setup.exe" + mv .\setup.exe .\network-setup.exe + + - name: Compile .ISS to .EXE local Installer + uses: nadeemjazmawe/inno-setup-action-cli@v6.0.5 + with: + filepath: ./${{ steps.set-output-params.outputs.local_iss_file_name }} + + - name: Codesign for windows local installer + shell: pwsh + run: | + ~\.dotnet\tools\azuresigntool.exe sign -kvu ${{ secrets.AZURE_KEY_VAULT_URI }} -kvi ${{ secrets.AZURE_CLIENT_ID }} -kvt ${{ secrets.AZURE_TENANT_ID }} -kvs ${{ secrets.AZURE_CLIENT_SECRET }} -kvc ${{ secrets.AZURE_CERT_NAME }} -tr http://timestamp.globalsign.com/tsa/r6advanced1 -v ".\setup.exe" + mv .\setup.exe .\local-setup.exe + + - name: Package + run: | + cd engine + make package + + - name: Upload Artifact + uses: actions/upload-artifact@v4 + with: + name: cortex-${{ inputs.new_version }}-windows-amd64 + path: ./engine/cortex + + - name: Upload Artifact network installer + uses: actions/upload-artifact@v4 + with: + name: cortex-${{ inputs.new_version }}-windows-amd64-network-installer + path: ./network-setup.exe + + - name: Upload Artifact local installer + uses: actions/upload-artifact@v4 + with: + name: cortex-${{ inputs.new_version }}-windows-amd64-local-installer + path: ./local-setup.exe + + - name: upload to aws s3 if public provider is aws + if: inputs.public_provider == 'aws-s3' + run: | + Import-Module "$env:ChocolateyInstall\helpers\chocolateyProfile.psm1" + refreshenv + aws s3 cp ./engine/cortex.tar.gz s3://${{ secrets.DELTA_AWS_S3_BUCKET_NAME }}/cortex/temp-latest/windows-amd64-cortex-nightly.tar.gz + aws s3 cp ./network-setup.exe s3://${{ secrets.DELTA_AWS_S3_BUCKET_NAME }}/cortex/temp-latest/cortex-windows-amd64-network-installer.exe + + aws s3 cp ./engine/cortex.tar.gz s3://${{ secrets.DELTA_AWS_S3_BUCKET_NAME }}/cortex/v${{ inputs.new_version }}/windows-amd64/cortex-nightly.tar.gz + aws s3 cp ./network-setup.exe s3://${{ secrets.DELTA_AWS_S3_BUCKET_NAME }}/cortex/v${{ inputs.new_version }}/windows-amd64/cortex-${{ inputs.new_version }}-windows-amd64-network-installer.exe + aws s3 cp ./local-setup.exe s3://${{ secrets.DELTA_AWS_S3_BUCKET_NAME }}/cortex/v${{ inputs.new_version }}/windows-amd64/cortex-${{ inputs.new_version }}-windows-amd64-local-installer.exe + env: + AWS_ACCESS_KEY_ID: ${{ secrets.DELTA_AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.DELTA_AWS_SECRET_ACCESS_KEY }} + AWS_DEFAULT_REGION: ${{ secrets.DELTA_AWS_REGION }} + AWS_EC2_METADATA_DISABLED: "true" + + - name: Upload release asset if public provider is github + if: inputs.public_provider == 'github' + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + uses: actions/upload-release-asset@v1.0.1 + with: + upload_url: ${{ inputs.upload_url }} + asset_path: ./engine/cortex.tar.gz + asset_name: cortex-${{ inputs.new_version }}-windows-amd64.tar.gz + asset_content_type: application/zip + + - name: Upload release asset if public provider is github + if: inputs.public_provider == 'github' + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + uses: actions/upload-release-asset@v1.0.1 + with: + upload_url: ${{ inputs.upload_url }} + asset_path: ./network-setup.exe + asset_name: cortex-${{ inputs.new_version }}-windows-amd64-network-installer.exe + asset_content_type: application/octet-stream + + - name: Upload release asset if public provider is github + if: inputs.public_provider == 'github' + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN 
}} + uses: actions/upload-release-asset@v1.0.1 + with: + upload_url: ${{ inputs.upload_url }} + asset_path: ./local-setup.exe + asset_name: cortex-${{ inputs.new_version }}-windows-amd64-local-installer.exe + asset_content_type: application/octet-stream \ No newline at end of file diff --git a/.github/workflows/template-cortex-llamacpp-latest-version.yml b/.github/workflows/template-cortex-llamacpp-latest-version.yml new file mode 100644 index 000000000..5135c55ab --- /dev/null +++ b/.github/workflows/template-cortex-llamacpp-latest-version.yml @@ -0,0 +1,47 @@ +name: get-cortex-llamacpp-latest-version +on: + workflow_call: + outputs: + cortex_llamacpp_latest_version: + description: 'The latest version of cortex.llamacpp engines' + value: ${{ jobs.get-cortex-llamacpp-latest-version.outputs.new_version }} + +jobs: + get-cortex-llamacpp-latest-version: + runs-on: ubuntu-latest + outputs: + new_version: ${{ steps.version_update.outputs.new_version }} + steps: + - name: Install jq + uses: dcarbone/install-jq-action@v2.0.1 + + - name: Update app version based on latest release tag with build number + id: version_update + run: | + # Function to get the latest release tag + get_latest_tag() { + local retries=0 + local max_retries=3 + local tag + while [ $retries -lt $max_retries ]; do + tag=$(curl -s https://api.github.com/repos/janhq/cortex.llamacpp/releases/latest | jq -r .tag_name) + if [ -n "$tag" ] && [ "$tag" != "null" ]; then + echo $tag + return + else + let retries++ + echo "Retrying... ($retries/$max_retries)" + sleep 2 + fi + done + echo "Failed to fetch latest tag after $max_retries attempts." + exit 1 + } + + # Get the latest release tag from GitHub API + LATEST_TAG=$(get_latest_tag) + + # Remove the 'v' and append the build number to the version + new_version="${LATEST_TAG#v}" + echo "New version: $new_version" + echo "::set-output name=new_version::$new_version" \ No newline at end of file diff --git a/.github/workflows/template-get-update-version.yml b/.github/workflows/template-get-update-version.yml new file mode 100644 index 000000000..7b715a6e0 --- /dev/null +++ b/.github/workflows/template-get-update-version.yml @@ -0,0 +1,59 @@ +name: get-update-version +on: + workflow_call: + outputs: + new_version: + description: 'The new version of the app' + value: ${{ jobs.get-update-version.outputs.new_version }} + +jobs: + get-update-version: + runs-on: ubuntu-latest + outputs: + new_version: ${{ steps.version_update.outputs.new_version }} + steps: + - name: Install jq + uses: dcarbone/install-jq-action@v2.0.1 + + - name: Get tag + if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/') + id: tag + uses: dawidd6/action-get-tag@v1 + with: + strip_v: true + + - name: Update app version based on latest release tag with build number + id: version_update + run: | + # Function to get the latest release tag + get_latest_tag() { + local retries=0 + local max_retries=3 + local tag + while [ $retries -lt $max_retries ]; do + tag=$(curl -s https://api.github.com/repos/janhq/cortex.cpp/releases/latest | jq -r .tag_name) + if [ -n "$tag" ] && [ "$tag" != "null" ]; then + echo $tag + return + else + let retries++ + echo "Retrying... ($retries/$max_retries)" + sleep 2 + fi + done + echo "Failed to fetch latest tag after $max_retries attempts." 
+ exit 1 + } + + if ${{ github.event_name == 'push' && startsWith(github.ref, 'refs/tags/') }}; then + echo "Tag detected, set output follow tag" + echo "::set-output name=new_version::${{ steps.tag.outputs.tag }}" + else + # Get the latest release tag from GitHub API + LATEST_TAG=$(get_latest_tag) + + # Remove the 'v' and append the build number to the version + new_version="${LATEST_TAG#v}-${GITHUB_RUN_NUMBER}" + echo "New version: $new_version" + echo "::set-output name=new_version::$new_version" + fi \ No newline at end of file diff --git a/.github/workflows/template-noti-discord.yaml b/.github/workflows/template-noti-discord.yaml new file mode 100644 index 000000000..97a539e33 --- /dev/null +++ b/.github/workflows/template-noti-discord.yaml @@ -0,0 +1,46 @@ +name: noti-discord +on: + workflow_call: + inputs: + build_reason: + required: true + type: string + default: 'Nightly' + new_version: + required: true + type: string + default: '' + +jobs: + noti-discord: + runs-on: ubuntu-latest + permissions: + contents: write + steps: + - name: Set version to environment variable + run: | + echo "VERSION=${{ inputs.new_version }}" >> $GITHUB_ENV + echo "RUNNER_ID=$GITHUB_RUN_ID" >> $GITHUB_ENV + echo "BUILD_REASON=${{ inputs.build_reason }}" >> $GITHUB_ENV + + - name: Notify Discord + uses: appleboy/discord-action@v1.0.0 + with: + webhook_id: ${{ secrets.WEBHOOK_ID }} + webhook_token: ${{ secrets.WEBHOOK_TOKEN }} + message: | + Cortex.cpp ${{ env.BUILD_REASON }} build artifact version ${{ env.VERSION }}: + - Windows: + - Network Installer: https://delta.jan.ai/cortex/v${{ env.VERSION }}/windows-amd64/cortex-${{ env.VERSION }}-windows-amd64-network-installer.exe + - Local Installer: https://delta.jan.ai/cortex/v${{ env.VERSION }}/windows-amd64/cortex-${{ env.VERSION }}-windows-amd64-local-installer.exe + - Binary: https://delta.jan.ai/cortex/v${{ env.VERSION }}/windows-amd64/cortex-nightly.tar.gz + - macOS Universal: + - Network Installer: https://delta.jan.ai/cortex/v${{ env.VERSION }}/mac-universal/cortex-${{ env.VERSION }}-mac-universal-network-installer.pkg + - Local Installer: https://delta.jan.ai/cortex/v${{ env.VERSION }}/mac-universal/cortex-${{ env.VERSION }}-mac-universal-local-installer.pkg + - Binary: https://delta.jan.ai/cortex/v${{ env.VERSION }}/mac-universal/cortex-nightly.tar.gz + - Linux Deb: + - Network Installer: https://delta.jan.ai/cortex/v${{ env.VERSION }}/linux-amd64/cortex-${{ env.VERSION }}-linux-amd64-network-installer.deb + - Local Installer: https://delta.jan.ai/cortex/v${{ env.VERSION }}/linux-amd64/cortex-${{ env.VERSION }}-linux-amd64-local-installer.deb + - Binary: https://delta.jan.ai/cortex/v${{ env.VERSION }}/linux-amd64/cortex-nightly.tar.gz + - Docker: menloltd/cortex:nightly-${{ env.VERSION }} + - Github action run: https://github.com/janhq/cortex.cpp/actions/runs/${{ env.RUNNER_ID }} diff --git a/.github/workflows/test-cortexso-model-hub.yml b/.github/workflows/test-cortexso-model-hub.yml new file mode 100644 index 000000000..6e1539420 --- /dev/null +++ b/.github/workflows/test-cortexso-model-hub.yml @@ -0,0 +1,111 @@ +name: Test cortexso Model Hub + +on: + schedule: + - cron: "0 16 * * 5" # every Friday at 23:00 UTC+7 + workflow_dispatch: + +jobs: + build-and-test: + runs-on: ${{ matrix.runs-on }} + timeout-minutes: 1440 + strategy: + fail-fast: false + matrix: + include: + - os: "linux" + name: "amd64" + runs-on: "ubuntu-20-04-e2e-cortexcpp-model-hub" + cmake-flags: "-DCORTEX_CPP_VERSION=${{github.head_ref}} -DCMAKE_BUILD_TEST=ON 
-DCMAKE_TOOLCHAIN_FILE=vcpkg/scripts/buildsystems/vcpkg.cmake" + build-deps-cmake-flags: "" + ccache-dir: "" + steps: + - name: Clone + id: checkout + uses: actions/checkout@v3 + with: + submodules: recursive + + - name: use python + uses: actions/setup-python@v5 + with: + python-version: "3.10" + + - name: Install tools on Linux + run: | + sudo chown -R runner:runner /home/runner/cortexcpp + python3 -m pip install awscli + + - name: Download vcpkg cache from s3 + continue-on-error: true + run: | + aws s3 sync s3://${{ secrets.MINIO_BUCKET_NAME }}/cortex-cpp-vcpkg-linux /home/runner/.cache/vcpkg --endpoint ${{ secrets.MINIO_ENDPOINT }} --cli-read-timeout 0 + env: + AWS_ACCESS_KEY_ID: "${{ secrets.MINIO_ACCESS_KEY_ID }}" + AWS_SECRET_ACCESS_KEY: "${{ secrets.MINIO_SECRET_ACCESS_KEY }}" + AWS_DEFAULT_REGION: "${{ secrets.MINIO_REGION }}" + + - name: Configure vcpkg + run: | + cd engine + make configure-vcpkg + + - name: Build + run: | + cd engine + make build CMAKE_EXTRA_FLAGS="${{ matrix.cmake-flags }}" BUILD_DEPS_CMAKE_EXTRA_FLAGS="${{ matrix.build-deps-cmake-flags }}" + + - name: Run unit tests + run: | + cd engine + make run-unit-tests + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + + - name: Run setup config for linux + shell: bash + run: | + cd engine + ./build/cortex --version + sed -i 's/huggingFaceToken: ""/huggingFaceToken: "${{ secrets.HUGGINGFACE_TOKEN_READ }}"/' ~/.cortexrc + + - name: Run e2e tests + run: | + cd engine + cp build/cortex build/cortex-nightly + cp build/cortex build/cortex-beta + python -m pip install --upgrade pip + python -m pip install -r e2e-test/requirements.txt + pytest e2e-test/test_api_cortexso_hub_llamacpp_engine.py + rm build/cortex-nightly + rm build/cortex-beta + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + HF_TOKEN: ${{ secrets.HUGGINGFACE_TOKEN_E2E }} + + - name: Pre-package + run: | + cd engine + make pre-package DESTINATION_BINARY_NAME="cortex" + + - name: Package + run: | + cd engine + make package + + - name: Upload Artifact + uses: actions/upload-artifact@v4 + with: + name: cortex-${{ matrix.os }}-${{ matrix.name }} + path: ./engine/cortex + + + - name: Upload linux vcpkg cache to s3 + continue-on-error: true + if: always() + run: | + aws s3 sync /home/runner/.cache/vcpkg s3://${{ secrets.MINIO_BUCKET_NAME }}/cortex-cpp-vcpkg-linux --endpoint ${{ secrets.MINIO_ENDPOINT }} + env: + AWS_ACCESS_KEY_ID: "${{ secrets.MINIO_ACCESS_KEY_ID }}" + AWS_SECRET_ACCESS_KEY: "${{ secrets.MINIO_SECRET_ACCESS_KEY }}" + AWS_DEFAULT_REGION: "${{ secrets.MINIO_REGION }}" diff --git a/.github/workflows/update-release-url.yml b/.github/workflows/update-release-url.yml deleted file mode 100644 index 710d23ba7..000000000 --- a/.github/workflows/update-release-url.yml +++ /dev/null @@ -1,50 +0,0 @@ -name: Update Download URLs - -on: - release: - types: - - published - - workflow_dispatch: - -jobs: - update-readme: - runs-on: ubuntu-latest - environment: production - steps: - - name: Checkout code - uses: actions/checkout@v4 - with: - fetch-depth: "0" - token: ${{ secrets.PAT_SERVICE_ACCOUNT }} - ref: main - - - name: Get Latest Release - uses: pozetroninc/github-action-get-latest-release@v0.7.0 - id: get-latest-release - with: - repository: ${{ github.repository }} - - - name: Update Download URLs in README.md - run: | - echo "Latest Release: ${{ steps.get-latest-release.outputs.release }}" - tag=$(/bin/echo -n "${{ steps.get-latest-release.outputs.release }}") - echo "Tag: $tag" - # Remove the v prefix - release=${tag:1} - echo "Release: $release" - sed -i 
"s|||" README.md - sed -i "s|||" README.md - sed -i "s|||" README.md - sed -i "s|||" README.md - sed -i "s|||" README.md - sed -i "s|||" README.md - - - name: Commit and Push changes - if: github.event_name == 'release' - run: | - git config --global user.email "service@jan.ai" - git config --global user.name "Service Account" - git add README.md - git commit -m "Update README.md with Stable Download URLs" - git -c http.extraheader="AUTHORIZATION: bearer ${{ secrets.PAT_SERVICE_ACCOUNT }}" push origin HEAD:main \ No newline at end of file diff --git a/.gitignore b/.gitignore index be1237faa..ad579aed8 100644 --- a/.gitignore +++ b/.gitignore @@ -1,568 +1,24 @@ - -# Created by https://www.toptal.com/developers/gitignore/api/intellij+all,visualstudio,visualstudiocode,cmake,c,c++ -# Edit at https://www.toptal.com/developers/gitignore?templates=intellij+all,visualstudio,visualstudiocode,cmake,c,c++ - -### C ### -# Prerequisites -*.d - -# Object files -*.o -*.ko -*.obj -*.elf - -# Linker output -*.ilk -*.map -*.exp - -# Precompiled Headers -*.gch -*.pch - -# Libraries -*.lib -*.a -*.la -*.lo - -# Shared objects (inc. Windows DLLs) -# *.dll -*.so -*.so.* -*.dylib - -# Executables -*.exe -*.out -*.app -*.i*86 -*.x86_64 -*.hex - -# Debug files -*.dSYM/ -*.su -*.idb -*.pdb - -# Kernel Module Compile Results -*.mod -*.cmd -.tmp_versions/ -modules.order -Module.symvers -Mkfile.old -dkms.conf - -### C++ ### -# Prerequisites - -# Compiled Object files -*.slo - -# Precompiled Headers - -# Linker files - -# Debugger Files - -# Compiled Dynamic libraries - -# Fortran module files -*.mod -*.smod - -# Compiled Static libraries -*.lai - -# Executables - -### CMake ### -CMakeLists.txt.user -CMakeCache.txt -CMakeFiles -CMakeScripts -Testing -Makefile -!nitro-node/Makefile -cmake_install.cmake -install_manifest.txt -compile_commands.json -CTestTestfile.cmake -_deps -CMakeUserPresets.json - -### CMake Patch ### -# External projects -*-prefix/ - -### Intellij+all ### -# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider -# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 - -# User-specific stuff -.idea/**/workspace.xml -.idea/**/tasks.xml -.idea/**/usage.statistics.xml -.idea/**/dictionaries -.idea/**/shelf - -# Generated files -.idea/**/contentModel.xml - -# Sensitive or high-churn files -.idea/**/dataSources/ -.idea/**/dataSources.ids -.idea/**/dataSources.local.xml -.idea/**/sqlDataSources.xml -.idea/**/dynamic.xml -.idea/**/uiDesigner.xml -.idea/**/dbnavigator.xml - -# Gradle -.idea/**/gradle.xml -.idea/**/libraries - -# Gradle and Maven with auto-import -# When using Gradle or Maven with auto-import, you should exclude module files, -# since they will be recreated, and may cause churn. Uncomment if using -# auto-import. 
-# .idea/artifacts -# .idea/compiler.xml -# .idea/jarRepositories.xml -# .idea/modules.xml -# .idea/*.iml -# .idea/modules -# *.iml -# *.ipr - -# CMake -cmake-build-*/ - -# Mongo Explorer plugin -.idea/**/mongoSettings.xml - -# File-based project format -*.iws - -# IntelliJ -out/ - -# mpeltonen/sbt-idea plugin -.idea_modules/ - -# JIRA plugin -atlassian-ide-plugin.xml - -# Cursive Clojure plugin -.idea/replstate.xml - -# Crashlytics plugin (for Android Studio and IntelliJ) -com_crashlytics_export_strings.xml -crashlytics.properties -crashlytics-build.properties -fabric.properties - -# Editor-based Rest Client -.idea/httpRequests - -# Android studio 3.1+ serialized cache file -.idea/caches/build_file_checksums.ser - -### Intellij+all Patch ### -# Ignores the whole .idea folder and all .iml files -# See https://github.com/joeblau/gitignore.io/issues/186 and https://github.com/joeblau/gitignore.io/issues/360 - -.idea/ - -# Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-249601023 - -*.iml -modules.xml -.idea/misc.xml -*.ipr - -# Sonarlint plugin -.idea/sonarlint - -### VisualStudioCode ### -.vscode/* -!.vscode/tasks.json -!.vscode/launch.json -*.code-workspace - -### VisualStudioCode Patch ### -# Ignore all local history of files -.history -.ionide - -### VisualStudio ### -## Ignore Visual Studio temporary files, build results, and -## files generated by popular Visual Studio add-ons. -## -## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore - -# User-specific files -*.rsuser -*.suo -*.user -*.userosscache -*.sln.docstates - -# User-specific files (MonoDevelop/Xamarin Studio) -*.userprefs - -# Mono auto generated files -mono_crash.* - -# Build results -[Dd]ebug/ -[Dd]ebugPublic/ -[Rr]elease/ -[Rr]eleases/ -x64/ -x86/ -[Ww][Ii][Nn]32/ -[Aa][Rr][Mm]/ -[Aa][Rr][Mm]64/ -bld/ -[Bb]in/ -[Oo]bj/ -[Ll]og/ -[Ll]ogs/ - -# Visual Studio 2015/2017 cache/options directory -.vs/ -# Uncomment if you have tasks that create the project's static files in wwwroot -#wwwroot/ - -# Visual Studio 2017 auto generated files -Generated\ Files/ - -# MSTest test Results -[Tt]est[Rr]esult*/ -[Bb]uild[Ll]og.* - -# NUnit -*.VisualState.xml -TestResult.xml -nunit-*.xml - -# Build Results of an ATL Project -[Dd]ebugPS/ -[Rr]eleasePS/ -dlldata.c - -# Benchmark Results -BenchmarkDotNet.Artifacts/ - -# .NET Core -project.lock.json -project.fragment.lock.json -artifacts/ - -# ASP.NET Scaffolding -ScaffoldingReadMe.txt - -# StyleCop -StyleCopReport.xml - -# Files built by Visual Studio -*_i.c -*_p.c -*_h.h -*.meta -*.iobj -*.ipdb -*.pgc -*.pgd -*.rsp -*.sbr -*.tlb -*.tli -*.tlh -*.tmp -*.tmp_proj -*_wpftmp.csproj -*.log -*.vspscc -*.vssscc -.builds -*.pidb -*.svclog -*.scc - -# Chutzpah Test files -_Chutzpah* - -# Visual C++ cache files -ipch/ -*.aps -*.ncb -*.opendb -*.opensdf -*.sdf -*.cachefile -*.VC.db -*.VC.VC.opendb - -# Visual Studio profiler -*.psess -*.vsp -*.vspx -*.sap - -# Visual Studio Trace Files -*.e2e - -# TFS 2012 Local Workspace -$tf/ - -# Guidance Automation Toolkit -*.gpState - -# ReSharper is a .NET coding add-in -_ReSharper*/ -*.[Rr]e[Ss]harper -*.DotSettings.user - -# TeamCity is a build add-in -_TeamCity* - -# DotCover is a Code Coverage Tool -*.dotCover - -# AxoCover is a Code Coverage Tool -.axoCover/* -!.axoCover/settings.json - -# Coverlet is a free, cross platform Code Coverage Tool -coverage*[.json, .xml, .info] - -# Visual Studio code coverage results -*.coverage -*.coveragexml - -# NCrunch -_NCrunch_* -.*crunch*.local.xml -nCrunchTemp_* - 
-# MightyMoose -*.mm.* -AutoTest.Net/ - -# Web workbench (sass) -.sass-cache/ - -# Installshield output folder -[Ee]xpress/ - -# DocProject is a documentation generator add-in -DocProject/buildhelp/ -DocProject/Help/*.HxT -DocProject/Help/*.HxC -DocProject/Help/*.hhc -DocProject/Help/*.hhk -DocProject/Help/*.hhp -DocProject/Help/Html2 -DocProject/Help/html - -# Click-Once directory -publish/ - -# Publish Web Output -*.[Pp]ublish.xml -*.azurePubxml -# Note: Comment the next line if you want to checkin your web deploy settings, -# but database connection strings (with potential passwords) will be unencrypted -*.pubxml -*.publishproj - -# Microsoft Azure Web App publish settings. Comment the next line if you want to -# checkin your Azure Web App publish settings, but sensitive information contained -# in these scripts will be unencrypted -PublishScripts/ - -# NuGet Packages -*.nupkg -# NuGet Symbol Packages -*.snupkg -# The packages folder can be ignored because of Package Restore -**/[Pp]ackages/* -# except build/, which is used as an MSBuild target. -!**/[Pp]ackages/build/ -# Uncomment if necessary however generally it will be regenerated when needed -#!**/[Pp]ackages/repositories.config -# NuGet v3's project.json files produces more ignorable files -*.nuget.props -*.nuget.targets - -# Microsoft Azure Build Output -csx/ -*.build.csdef - -# Microsoft Azure Emulator -ecf/ -rcf/ - -# Windows Store app package directories and files -AppPackages/ -BundleArtifacts/ -Package.StoreAssociation.xml -_pkginfo.txt -*.appx -*.appxbundle -*.appxupload - -# Visual Studio cache files -# files ending in .cache can be ignored -*.[Cc]ache -# but keep track of directories ending in .cache -!?*.[Cc]ache/ - -# Others -ClientBin/ -~$* -*~ -*.dbmdl -*.dbproj.schemaview -*.jfm -*.pfx -*.publishsettings -orleans.codegen.cs - -# Including strong name files can present a security risk -# (https://github.com/github/gitignore/pull/2483#issue-259490424) -#*.snk - -# Since there are multiple workflows, uncomment next line to ignore bower_components -# (https://github.com/github/gitignore/pull/1529#issuecomment-104372622) -#bower_components/ - -# RIA/Silverlight projects -Generated_Code/ - -# Backup & report files from converting an old project file -# to a newer Visual Studio version. Backup files are not needed, -# because we have git ;-) -_UpgradeReport_Files/ -Backup*/ -UpgradeLog*.XML -UpgradeLog*.htm -ServiceFabricBackup/ -*.rptproj.bak - -# SQL Server files -*.mdf -*.ldf -*.ndf - -# Business Intelligence projects -*.rdl.data -*.bim.layout -*.bim_*.settings -*.rptproj.rsuser -*- [Bb]ackup.rdl -*- [Bb]ackup ([0-9]).rdl -*- [Bb]ackup ([0-9][0-9]).rdl - -# Microsoft Fakes -FakesAssemblies/ - -# GhostDoc plugin setting file -*.GhostDoc.xml - -# Node.js Tools for Visual Studio -.ntvs_analysis.dat -node_modules/ - -# Visual Studio 6 build log -*.plg - -# Visual Studio 6 workspace options file -*.opt - -# Visual Studio 6 auto-generated workspace file (contains which files were open etc.) 
-*.vbw - -# Visual Studio LightSwitch build output -**/*.HTMLClient/GeneratedArtifacts -**/*.DesktopClient/GeneratedArtifacts -**/*.DesktopClient/ModelManifest.xml -**/*.Server/GeneratedArtifacts -**/*.Server/ModelManifest.xml -_Pvt_Extensions - -# Paket dependency manager -.paket/paket.exe -paket-files/ - -# FAKE - F# Make -.fake/ - -# CodeRush personal settings -.cr/personal - -# Python Tools for Visual Studio (PTVS) -__pycache__/ -*.pyc - -# Cake - Uncomment if you are using it -# tools/** -# !tools/packages.config - -# Tabs Studio -*.tss - -# Telerik's JustMock configuration file -*.jmconfig - -# BizTalk build output -*.btp.cs -*.btm.cs -*.odx.cs -*.xsd.cs - -# OpenCover UI analysis results -OpenCover/ - -# Azure Stream Analytics local run output -ASALocalRun/ - -# MSBuild Binary and Structured Log -*.binlog - -# NVidia Nsight GPU debugger configuration file -*.nvuser - -# MFractors (Xamarin productivity tool) working folder -.mfractor/ - -# Local History for Visual Studio -.localhistory/ - -# BeatPulse healthcheck temp database -healthchecksdb - -# Backup folder for Package Reference Convert tool in Visual Studio 2017 -MigrationBackup/ - -# Ionide (cross platform F# VS Code tools) working folder -.ionide/ - -# Fody - auto-generated XML schema -FodyWeavers.xsd - -### VisualStudio Patch ### -# Additional files built by Visual Studio -*.tlog - -# End of https://www.toptal.com/developers/gitignore/api/intellij+all,visualstudio,visualstudiocode,cmake,c,c++ -build -build_deps +# platform .DS_Store - -uploads/** \ No newline at end of file +platform/cortex.db +dist +*.lock +node_modules +.turbo +package-lock.json + +# CI - Test - Coverage +cortex.log +api.log +prism.log +api.json +openai-python/* +build +platform/cortex.exe +platform/package-lock.json +.vscode +platform/command +platform/src/infrastructure/commanders/test/test_data +**/vcpkg_installed +engine/test.db +!docs/yarn.lock \ No newline at end of file diff --git a/.gitmodules b/.gitmodules index e2f71d456..da05bcdd8 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,7 +1,3 @@ -[submodule "llama.cpp"] - path = llama.cpp - url = https://github.com/ggerganov/llama.cpp - branch = master -[submodule "whisper.cpp"] - path = whisper.cpp - url = https://github.com/ggerganov/whisper.cpp.git +[submodule "engine/vcpkg"] + path = engine/vcpkg + url = https://github.com/microsoft/vcpkg.git diff --git a/CMakeLists.txt b/CMakeLists.txt deleted file mode 100644 index eba4fee0c..000000000 --- a/CMakeLists.txt +++ /dev/null @@ -1,106 +0,0 @@ -cmake_minimum_required(VERSION 3.5) -project(nitro C CXX) - -include(CheckIncludeFileCXX) - -check_include_file_cxx(any HAS_ANY) -check_include_file_cxx(string_view HAS_STRING_VIEW) -check_include_file_cxx(coroutine HAS_COROUTINE) -if(HAS_ANY - AND HAS_STRING_VIEW - AND HAS_COROUTINE) - set(CMAKE_CXX_STANDARD 20) -elseif(HAS_ANY AND HAS_STRING_VIEW) - set(CMAKE_CXX_STANDARD 17) -else() - set(CMAKE_CXX_STANDARD 14) -endif() - -set(CMAKE_CXX_STANDARD_REQUIRED ON) -set(CMAKE_CXX_EXTENSIONS OFF) -set(OPENSSL_USE_STATIC_LIBS TRUE) -set(CMAKE_EXPORT_COMPILE_COMMANDS ON) -set(CMAKE_PREFIX_PATH ${CMAKE_CURRENT_SOURCE_DIR}/build_deps/_install) -# This is the critical line for installing another package - -if(LLAMA_CUDA) - cmake_minimum_required(VERSION 3.17) - - find_package(CUDAToolkit) - if(CUDAToolkit_FOUND) - message(STATUS "cuBLAS found") - add_compile_definitions(GGML_USE_CUDA) - endif() -endif() - -if(DEBUG) - message(STATUS "NITRO DEBUG IS ON") - add_compile_definitions(ALLOW_ALL_CORS) -endif() - -if(NOT DEFINED 
NITRO_VERSION) - set(NITRO_VERSION "default_version") -endif() - -if(APPLE) - if(CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64|arm.*|ARM64)$") - # MacOS silicon - set(LLAMA_METAL_EMBED_LIBRARY ON) - set(WHISPER_COREML 1) - else() - # MacOS amd64 - set(LLAMA_METAL OFF) - endif() -endif() - -add_compile_definitions(NITRO_VERSION="${NITRO_VERSION}") - -add_subdirectory(llama.cpp/examples/llava) -add_subdirectory(llama.cpp) -add_subdirectory(whisper.cpp) -add_subdirectory(test) - -add_executable(${PROJECT_NAME} main.cc) - -# ############################################################################## -# If you include the drogon source code locally in your project, use this method -# to add drogon add_subdirectory(nitro_deps) -# target_link_libraries(${PROJECT_NAME} PRIVATE nitro_deps) -# -# and comment out the following lines -find_package(Drogon CONFIG REQUIRED) -target_link_libraries(${PROJECT_NAME} PRIVATE Drogon::Drogon common llama whisper llava - ${CMAKE_THREAD_LIBS_INIT}) - -# ############################################################################## - -if(CMAKE_CXX_STANDARD LESS 17) - # With C++14, use boost to support any and std::string_view - message(STATUS "use c++14") - find_package(Boost 1.61.0 REQUIRED) - target_include_directories(${PROJECT_NAME} PRIVATE ${Boost_INCLUDE_DIRS}) -elseif(CMAKE_CXX_STANDARD LESS 20) - message(STATUS "use c++17") -else() - message(STATUS "use c++20") -endif() - -aux_source_directory(controllers CTL_SRC) -aux_source_directory(common COMMON_SRC) -aux_source_directory(context CONTEXT_SRC) -aux_source_directory(models MODEL_SRC) -# aux_source_directory(filters FILTER_SRC) aux_source_directory(plugins -# PLUGIN_SRC) - -# drogon_create_views(${PROJECT_NAME} ${CMAKE_CURRENT_SOURCE_DIR}/views -# ${CMAKE_CURRENT_BINARY_DIR}) use the following line to create views with -# namespaces. drogon_create_views(${PROJECT_NAME} -# ${CMAKE_CURRENT_SOURCE_DIR}/views ${CMAKE_CURRENT_BINARY_DIR} TRUE) - -target_include_directories(${PROJECT_NAME} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}) -# ${CMAKE_CURRENT_SOURCE_DIR}/models) -target_sources(${PROJECT_NAME} PRIVATE ${CTL_SRC} ${COMMON_SRC} ${CONTEXT_SRC}) -# ${FILTER_SRC} ${PLUGIN_SRC} ${MODEL_SRC}) -# ############################################################################## -# uncomment the following line for dynamically loading views set_property(TARGET -# ${PROJECT_NAME} PROPERTY ENABLE_EXPORTS ON) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 000000000..607c3db7f --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,42 @@ +### Repo Structure +``` +# Entity Definitions +domain/ # This is the core directory where the domains are defined. + abstracts/ # Abstract base classes for common attributes and methods. + models/ # Domain interface definitions, e.g. model, assistant. + repositories/ # Extensions abstract and interface + +# Business Rules +usecases/ # Application logic + assistants/ # CRUD logic (invokes dtos, entities). + chat/ # Logic for chat functionalities. + models/ # Logic for model operations. 
+ +# Adapters & Implementations +infrastructure/ # Implementations for Cortex interactions + commanders/ # CLI handlers + models/ + questions/ # CLI installation UX + shortcuts/ # CLI chained syntax + types/ + usecases/ # Invokes UseCases + + controllers/ # Nest controllers and HTTP routes + assistants/ # Invokes UseCases + chat/ # Invokes UseCases + models/ # Invokes UseCases + + database/ # Database providers (mysql, sqlite) + +# Framework specific object definitions + dtos/ # DTO definitions (data transfer & validation) + entities/ # TypeORM entity definitions (db schema) + +# Providers + providers/cortex # Cortex [server] provider (a core extension) + repositories/extensions # Extension provider (core & external extensions) + +extensions/ # External extensions +command.module.ts # CLI Commands List +main.ts # Entrypoint +``` diff --git a/LICENSE b/LICENSE index 5469257f1..b64fc2445 100644 --- a/LICENSE +++ b/LICENSE @@ -1,660 +1,201 @@ -# GNU AFFERO GENERAL PUBLIC LICENSE - -Version 3, 19 November 2007 - -Copyright (C) 2007 Free Software Foundation, Inc. - - -Everyone is permitted to copy and distribute verbatim copies of this -license document, but changing it is not allowed. - -## Preamble - -The GNU Affero General Public License is a free, copyleft license for -software and other kinds of works, specifically designed to ensure -cooperation with the community in the case of network server software. - -The licenses for most software and other practical works are designed -to take away your freedom to share and change the works. By contrast, -our General Public Licenses are intended to guarantee your freedom to -share and change all versions of a program--to make sure it remains -free software for all its users. - -When we speak of free software, we are referring to freedom, not -price. Our General Public Licenses are designed to make sure that you -have the freedom to distribute copies of free software (and charge for -them if you wish), that you receive source code or can get it if you -want it, that you can change the software or use pieces of it in new -free programs, and that you know you can do these things. - -Developers that use our General Public Licenses protect your rights -with two steps: (1) assert copyright on the software, and (2) offer -you this License which gives you legal permission to copy, distribute -and/or modify the software. - -A secondary benefit of defending all users' freedom is that -improvements made in alternate versions of the program, if they -receive widespread use, become available for other developers to -incorporate. Many developers of free software are heartened and -encouraged by the resulting cooperation. However, in the case of -software used on network servers, this result may fail to come about. -The GNU General Public License permits making a modified version and -letting the public access it on a server without ever releasing its -source code to the public. - -The GNU Affero General Public License is designed specifically to -ensure that, in such cases, the modified source code becomes available -to the community. It requires the operator of a network server to -provide the source code of the modified version running there to the -users of that server. Therefore, public use of a modified version, on -a publicly accessible server, gives the public access to the source -code of the modified version. - -An older license, called the Affero General Public License and -published by Affero, was designed to accomplish similar goals. 
This is -a different license, not a version of the Affero GPL, but Affero has -released a new version of the Affero GPL which permits relicensing -under this license. - -The precise terms and conditions for copying, distribution and -modification follow. - -## TERMS AND CONDITIONS - -### 0. Definitions. - -"This License" refers to version 3 of the GNU Affero General Public -License. - -"Copyright" also means copyright-like laws that apply to other kinds -of works, such as semiconductor masks. - -"The Program" refers to any copyrightable work licensed under this -License. Each licensee is addressed as "you". "Licensees" and -"recipients" may be individuals or organizations. - -To "modify" a work means to copy from or adapt all or part of the work -in a fashion requiring copyright permission, other than the making of -an exact copy. The resulting work is called a "modified version" of -the earlier work or a work "based on" the earlier work. - -A "covered work" means either the unmodified Program or a work based -on the Program. - -To "propagate" a work means to do anything with it that, without -permission, would make you directly or secondarily liable for -infringement under applicable copyright law, except executing it on a -computer or modifying a private copy. Propagation includes copying, -distribution (with or without modification), making available to the -public, and in some countries other activities as well. - -To "convey" a work means any kind of propagation that enables other -parties to make or receive copies. Mere interaction with a user -through a computer network, with no transfer of a copy, is not -conveying. - -An interactive user interface displays "Appropriate Legal Notices" to -the extent that it includes a convenient and prominently visible -feature that (1) displays an appropriate copyright notice, and (2) -tells the user that there is no warranty for the work (except to the -extent that warranties are provided), that licensees may convey the -work under this License, and how to view a copy of this License. If -the interface presents a list of user commands or options, such as a -menu, a prominent item in the list meets this criterion. - -### 1. Source Code. - -The "source code" for a work means the preferred form of the work for -making modifications to it. "Object code" means any non-source form of -a work. - -A "Standard Interface" means an interface that either is an official -standard defined by a recognized standards body, or, in the case of -interfaces specified for a particular programming language, one that -is widely used among developers working in that language. - -The "System Libraries" of an executable work include anything, other -than the work as a whole, that (a) is included in the normal form of -packaging a Major Component, but which is not part of that Major -Component, and (b) serves only to enable use of the work with that -Major Component, or to implement a Standard Interface for which an -implementation is available to the public in source code form. A -"Major Component", in this context, means a major essential component -(kernel, window system, and so on) of the specific operating system -(if any) on which the executable work runs, or a compiler used to -produce the work, or an object code interpreter used to run it. 
- -The "Corresponding Source" for a work in object code form means all -the source code needed to generate, install, and (for an executable -work) run the object code and to modify the work, including scripts to -control those activities. However, it does not include the work's -System Libraries, or general-purpose tools or generally available free -programs which are used unmodified in performing those activities but -which are not part of the work. For example, Corresponding Source -includes interface definition files associated with source files for -the work, and the source code for shared libraries and dynamically -linked subprograms that the work is specifically designed to require, -such as by intimate data communication or control flow between those -subprograms and other parts of the work. - -The Corresponding Source need not include anything that users can -regenerate automatically from other parts of the Corresponding Source. - -The Corresponding Source for a work in source code form is that same -work. - -### 2. Basic Permissions. - -All rights granted under this License are granted for the term of -copyright on the Program, and are irrevocable provided the stated -conditions are met. This License explicitly affirms your unlimited -permission to run the unmodified Program. The output from running a -covered work is covered by this License only if the output, given its -content, constitutes a covered work. This License acknowledges your -rights of fair use or other equivalent, as provided by copyright law. - -You may make, run and propagate covered works that you do not convey, -without conditions so long as your license otherwise remains in force. -You may convey covered works to others for the sole purpose of having -them make modifications exclusively for you, or provide you with -facilities for running those works, provided that you comply with the -terms of this License in conveying all material for which you do not -control copyright. Those thus making or running the covered works for -you must do so exclusively on your behalf, under your direction and -control, on terms that prohibit them from making any copies of your -copyrighted material outside their relationship with you. - -Conveying under any other circumstances is permitted solely under the -conditions stated below. Sublicensing is not allowed; section 10 makes -it unnecessary. - -### 3. Protecting Users' Legal Rights From Anti-Circumvention Law. - -No covered work shall be deemed part of an effective technological -measure under any applicable law fulfilling obligations under article -11 of the WIPO copyright treaty adopted on 20 December 1996, or -similar laws prohibiting or restricting circumvention of such -measures. - -When you convey a covered work, you waive any legal power to forbid -circumvention of technological measures to the extent such -circumvention is effected by exercising rights under this License with -respect to the covered work, and you disclaim any intention to limit -operation or modification of the work as a means of enforcing, against -the work's users, your or third parties' legal rights to forbid -circumvention of technological measures. - -### 4. Conveying Verbatim Copies. 
- -You may convey verbatim copies of the Program's source code as you -receive it, in any medium, provided that you conspicuously and -appropriately publish on each copy an appropriate copyright notice; -keep intact all notices stating that this License and any -non-permissive terms added in accord with section 7 apply to the code; -keep intact all notices of the absence of any warranty; and give all -recipients a copy of this License along with the Program. - -You may charge any price or no price for each copy that you convey, -and you may offer support or warranty protection for a fee. - -### 5. Conveying Modified Source Versions. - -You may convey a work based on the Program, or the modifications to -produce it from the Program, in the form of source code under the -terms of section 4, provided that you also meet all of these -conditions: - -- a) The work must carry prominent notices stating that you modified - it, and giving a relevant date. -- b) The work must carry prominent notices stating that it is - released under this License and any conditions added under - section 7. This requirement modifies the requirement in section 4 - to "keep intact all notices". -- c) You must license the entire work, as a whole, under this - License to anyone who comes into possession of a copy. This - License will therefore apply, along with any applicable section 7 - additional terms, to the whole of the work, and all its parts, - regardless of how they are packaged. This License gives no - permission to license the work in any other way, but it does not - invalidate such permission if you have separately received it. -- d) If the work has interactive user interfaces, each must display - Appropriate Legal Notices; however, if the Program has interactive - interfaces that do not display Appropriate Legal Notices, your - work need not make them do so. - -A compilation of a covered work with other separate and independent -works, which are not by their nature extensions of the covered work, -and which are not combined with it such as to form a larger program, -in or on a volume of a storage or distribution medium, is called an -"aggregate" if the compilation and its resulting copyright are not -used to limit the access or legal rights of the compilation's users -beyond what the individual works permit. Inclusion of a covered work -in an aggregate does not cause this License to apply to the other -parts of the aggregate. - -### 6. Conveying Non-Source Forms. - -You may convey a covered work in object code form under the terms of -sections 4 and 5, provided that you also convey the machine-readable -Corresponding Source under the terms of this License, in one of these -ways: - -- a) Convey the object code in, or embodied in, a physical product - (including a physical distribution medium), accompanied by the - Corresponding Source fixed on a durable physical medium - customarily used for software interchange. 
-- b) Convey the object code in, or embodied in, a physical product - (including a physical distribution medium), accompanied by a - written offer, valid for at least three years and valid for as - long as you offer spare parts or customer support for that product - model, to give anyone who possesses the object code either (1) a - copy of the Corresponding Source for all the software in the - product that is covered by this License, on a durable physical - medium customarily used for software interchange, for a price no - more than your reasonable cost of physically performing this - conveying of source, or (2) access to copy the Corresponding - Source from a network server at no charge. -- c) Convey individual copies of the object code with a copy of the - written offer to provide the Corresponding Source. This - alternative is allowed only occasionally and noncommercially, and - only if you received the object code with such an offer, in accord - with subsection 6b. -- d) Convey the object code by offering access from a designated - place (gratis or for a charge), and offer equivalent access to the - Corresponding Source in the same way through the same place at no - further charge. You need not require recipients to copy the - Corresponding Source along with the object code. If the place to - copy the object code is a network server, the Corresponding Source - may be on a different server (operated by you or a third party) - that supports equivalent copying facilities, provided you maintain - clear directions next to the object code saying where to find the - Corresponding Source. Regardless of what server hosts the - Corresponding Source, you remain obligated to ensure that it is - available for as long as needed to satisfy these requirements. -- e) Convey the object code using peer-to-peer transmission, - provided you inform other peers where the object code and - Corresponding Source of the work are being offered to the general - public at no charge under subsection 6d. - -A separable portion of the object code, whose source code is excluded -from the Corresponding Source as a System Library, need not be -included in conveying the object code work. - -A "User Product" is either (1) a "consumer product", which means any -tangible personal property which is normally used for personal, -family, or household purposes, or (2) anything designed or sold for -incorporation into a dwelling. In determining whether a product is a -consumer product, doubtful cases shall be resolved in favor of -coverage. For a particular product received by a particular user, -"normally used" refers to a typical or common use of that class of -product, regardless of the status of the particular user or of the way -in which the particular user actually uses, or expects or is expected -to use, the product. A product is a consumer product regardless of -whether the product has substantial commercial, industrial or -non-consumer uses, unless such uses represent the only significant -mode of use of the product. - -"Installation Information" for a User Product means any methods, -procedures, authorization keys, or other information required to -install and execute modified versions of a covered work in that User -Product from a modified version of its Corresponding Source. The -information must suffice to ensure that the continued functioning of -the modified object code is in no case prevented or interfered with -solely because modification has been made. 
- -If you convey an object code work under this section in, or with, or -specifically for use in, a User Product, and the conveying occurs as -part of a transaction in which the right of possession and use of the -User Product is transferred to the recipient in perpetuity or for a -fixed term (regardless of how the transaction is characterized), the -Corresponding Source conveyed under this section must be accompanied -by the Installation Information. But this requirement does not apply -if neither you nor any third party retains the ability to install -modified object code on the User Product (for example, the work has -been installed in ROM). - -The requirement to provide Installation Information does not include a -requirement to continue to provide support service, warranty, or -updates for a work that has been modified or installed by the -recipient, or for the User Product in which it has been modified or -installed. Access to a network may be denied when the modification -itself materially and adversely affects the operation of the network -or violates the rules and protocols for communication across the -network. - -Corresponding Source conveyed, and Installation Information provided, -in accord with this section must be in a format that is publicly -documented (and with an implementation available to the public in -source code form), and must require no special password or key for -unpacking, reading or copying. - -### 7. Additional Terms. - -"Additional permissions" are terms that supplement the terms of this -License by making exceptions from one or more of its conditions. -Additional permissions that are applicable to the entire Program shall -be treated as though they were included in this License, to the extent -that they are valid under applicable law. If additional permissions -apply only to part of the Program, that part may be used separately -under those permissions, but the entire Program remains governed by -this License without regard to the additional permissions. - -When you convey a copy of a covered work, you may at your option -remove any additional permissions from that copy, or from any part of -it. (Additional permissions may be written to require their own -removal in certain cases when you modify the work.) You may place -additional permissions on material, added by you to a covered work, -for which you have or can give appropriate copyright permission. 
- -Notwithstanding any other provision of this License, for material you -add to a covered work, you may (if authorized by the copyright holders -of that material) supplement the terms of this License with terms: - -- a) Disclaiming warranty or limiting liability differently from the - terms of sections 15 and 16 of this License; or -- b) Requiring preservation of specified reasonable legal notices or - author attributions in that material or in the Appropriate Legal - Notices displayed by works containing it; or -- c) Prohibiting misrepresentation of the origin of that material, - or requiring that modified versions of such material be marked in - reasonable ways as different from the original version; or -- d) Limiting the use for publicity purposes of names of licensors - or authors of the material; or -- e) Declining to grant rights under trademark law for use of some - trade names, trademarks, or service marks; or -- f) Requiring indemnification of licensors and authors of that - material by anyone who conveys the material (or modified versions - of it) with contractual assumptions of liability to the recipient, - for any liability that these contractual assumptions directly - impose on those licensors and authors. - -All other non-permissive additional terms are considered "further -restrictions" within the meaning of section 10. If the Program as you -received it, or any part of it, contains a notice stating that it is -governed by this License along with a term that is a further -restriction, you may remove that term. If a license document contains -a further restriction but permits relicensing or conveying under this -License, you may add to a covered work material governed by the terms -of that license document, provided that the further restriction does -not survive such relicensing or conveying. - -If you add terms to a covered work in accord with this section, you -must place, in the relevant source files, a statement of the -additional terms that apply to those files, or a notice indicating -where to find the applicable terms. - -Additional terms, permissive or non-permissive, may be stated in the -form of a separately written license, or stated as exceptions; the -above requirements apply either way. - -### 8. Termination. - -You may not propagate or modify a covered work except as expressly -provided under this License. Any attempt otherwise to propagate or -modify it is void, and will automatically terminate your rights under -this License (including any patent licenses granted under the third -paragraph of section 11). - -However, if you cease all violation of this License, then your license -from a particular copyright holder is reinstated (a) provisionally, -unless and until the copyright holder explicitly and finally -terminates your license, and (b) permanently, if the copyright holder -fails to notify you of the violation by some reasonable means prior to -60 days after the cessation. - -Moreover, your license from a particular copyright holder is -reinstated permanently if the copyright holder notifies you of the -violation by some reasonable means, this is the first time you have -received notice of violation of this License (for any work) from that -copyright holder, and you cure the violation prior to 30 days after -your receipt of the notice. - -Termination of your rights under this section does not terminate the -licenses of parties who have received copies or rights from you under -this License. 
If your rights have been terminated and not permanently -reinstated, you do not qualify to receive new licenses for the same -material under section 10. - -### 9. Acceptance Not Required for Having Copies. - -You are not required to accept this License in order to receive or run -a copy of the Program. Ancillary propagation of a covered work -occurring solely as a consequence of using peer-to-peer transmission -to receive a copy likewise does not require acceptance. However, -nothing other than this License grants you permission to propagate or -modify any covered work. These actions infringe copyright if you do -not accept this License. Therefore, by modifying or propagating a -covered work, you indicate your acceptance of this License to do so. - -### 10. Automatic Licensing of Downstream Recipients. - -Each time you convey a covered work, the recipient automatically -receives a license from the original licensors, to run, modify and -propagate that work, subject to this License. You are not responsible -for enforcing compliance by third parties with this License. - -An "entity transaction" is a transaction transferring control of an -organization, or substantially all assets of one, or subdividing an -organization, or merging organizations. If propagation of a covered -work results from an entity transaction, each party to that -transaction who receives a copy of the work also receives whatever -licenses to the work the party's predecessor in interest had or could -give under the previous paragraph, plus a right to possession of the -Corresponding Source of the work from the predecessor in interest, if -the predecessor has it or can get it with reasonable efforts. - -You may not impose any further restrictions on the exercise of the -rights granted or affirmed under this License. For example, you may -not impose a license fee, royalty, or other charge for exercise of -rights granted under this License, and you may not initiate litigation -(including a cross-claim or counterclaim in a lawsuit) alleging that -any patent claim is infringed by making, using, selling, offering for -sale, or importing the Program or any portion of it. - -### 11. Patents. - -A "contributor" is a copyright holder who authorizes use under this -License of the Program or a work on which the Program is based. The -work thus licensed is called the contributor's "contributor version". - -A contributor's "essential patent claims" are all patent claims owned -or controlled by the contributor, whether already acquired or -hereafter acquired, that would be infringed by some manner, permitted -by this License, of making, using, or selling its contributor version, -but do not include claims that would be infringed only as a -consequence of further modification of the contributor version. For -purposes of this definition, "control" includes the right to grant -patent sublicenses in a manner consistent with the requirements of -this License. - -Each contributor grants you a non-exclusive, worldwide, royalty-free -patent license under the contributor's essential patent claims, to -make, use, sell, offer for sale, import and otherwise run, modify and -propagate the contents of its contributor version. - -In the following three paragraphs, a "patent license" is any express -agreement or commitment, however denominated, not to enforce a patent -(such as an express permission to practice a patent or covenant not to -sue for patent infringement). 
To "grant" such a patent license to a -party means to make such an agreement or commitment not to enforce a -patent against the party. - -If you convey a covered work, knowingly relying on a patent license, -and the Corresponding Source of the work is not available for anyone -to copy, free of charge and under the terms of this License, through a -publicly available network server or other readily accessible means, -then you must either (1) cause the Corresponding Source to be so -available, or (2) arrange to deprive yourself of the benefit of the -patent license for this particular work, or (3) arrange, in a manner -consistent with the requirements of this License, to extend the patent -license to downstream recipients. "Knowingly relying" means you have -actual knowledge that, but for the patent license, your conveying the -covered work in a country, or your recipient's use of the covered work -in a country, would infringe one or more identifiable patents in that -country that you have reason to believe are valid. - -If, pursuant to or in connection with a single transaction or -arrangement, you convey, or propagate by procuring conveyance of, a -covered work, and grant a patent license to some of the parties -receiving the covered work authorizing them to use, propagate, modify -or convey a specific copy of the covered work, then the patent license -you grant is automatically extended to all recipients of the covered -work and works based on it. - -A patent license is "discriminatory" if it does not include within the -scope of its coverage, prohibits the exercise of, or is conditioned on -the non-exercise of one or more of the rights that are specifically -granted under this License. You may not convey a covered work if you -are a party to an arrangement with a third party that is in the -business of distributing software, under which you make payment to the -third party based on the extent of your activity of conveying the -work, and under which the third party grants, to any of the parties -who would receive the covered work from you, a discriminatory patent -license (a) in connection with copies of the covered work conveyed by -you (or copies made from those copies), or (b) primarily for and in -connection with specific products or compilations that contain the -covered work, unless you entered into that arrangement, or that patent -license was granted, prior to 28 March 2007. - -Nothing in this License shall be construed as excluding or limiting -any implied license or other defenses to infringement that may -otherwise be available to you under applicable patent law. - -### 12. No Surrender of Others' Freedom. - -If conditions are imposed on you (whether by court order, agreement or -otherwise) that contradict the conditions of this License, they do not -excuse you from the conditions of this License. If you cannot convey a -covered work so as to satisfy simultaneously your obligations under -this License and any other pertinent obligations, then as a -consequence you may not convey it at all. For example, if you agree to -terms that obligate you to collect a royalty for further conveying -from those to whom you convey the Program, the only way you could -satisfy both those terms and this License would be to refrain entirely -from conveying the Program. - -### 13. Remote Network Interaction; Use with the GNU General Public License. 
- -Notwithstanding any other provision of this License, if you modify the -Program, your modified version must prominently offer all users -interacting with it remotely through a computer network (if your -version supports such interaction) an opportunity to receive the -Corresponding Source of your version by providing access to the -Corresponding Source from a network server at no charge, through some -standard or customary means of facilitating copying of software. This -Corresponding Source shall include the Corresponding Source for any -work covered by version 3 of the GNU General Public License that is -incorporated pursuant to the following paragraph. - -Notwithstanding any other provision of this License, you have -permission to link or combine any covered work with a work licensed -under version 3 of the GNU General Public License into a single -combined work, and to convey the resulting work. The terms of this -License will continue to apply to the part which is the covered work, -but the work with which it is combined will remain governed by version -3 of the GNU General Public License. - -### 14. Revised Versions of this License. - -The Free Software Foundation may publish revised and/or new versions -of the GNU Affero General Public License from time to time. Such new -versions will be similar in spirit to the present version, but may -differ in detail to address new problems or concerns. - -Each version is given a distinguishing version number. If the Program -specifies that a certain numbered version of the GNU Affero General -Public License "or any later version" applies to it, you have the -option of following the terms and conditions either of that numbered -version or of any later version published by the Free Software -Foundation. If the Program does not specify a version number of the -GNU Affero General Public License, you may choose any version ever -published by the Free Software Foundation. - -If the Program specifies that a proxy can decide which future versions -of the GNU Affero General Public License can be used, that proxy's -public statement of acceptance of a version permanently authorizes you -to choose that version for the Program. - -Later license versions may give you additional or different -permissions. However, no additional obligations are imposed on any -author or copyright holder as a result of your choosing to follow a -later version. - -### 15. Disclaimer of Warranty. - -THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY -APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT -HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT -WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND -PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE -DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR -CORRECTION. - -### 16. Limitation of Liability. 
- -IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING -WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR -CONVEYS THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, -INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES -ARISING OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT -NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR -LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM -TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF SUCH HOLDER OR OTHER -PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. - -### 17. Interpretation of Sections 15 and 16. - -If the disclaimer of warranty and limitation of liability provided -above cannot be given local legal effect according to their terms, -reviewing courts shall apply local law that most closely approximates -an absolute waiver of all civil liability in connection with the -Program, unless a warranty or assumption of liability accompanies a -copy of the Program in return for a fee. - -END OF TERMS AND CONDITIONS - -## How to Apply These Terms to Your New Programs - -If you develop a new program, and you want it to be of the greatest -possible use to the public, the best way to achieve this is to make it -free software which everyone can redistribute and change under these -terms. - -To do so, attach the following notices to the program. It is safest to -attach them to the start of each source file to most effectively state -the exclusion of warranty; and each file should have at least the -"copyright" line and a pointer to where the full notice is found. - - - Copyright (C) - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as - published by the Free Software Foundation, either version 3 of the - License, or (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with this program. If not, see . - -Also add information on how to contact you by electronic and paper -mail. - -If your software can interact with users remotely through a computer -network, you should also make sure that it provides a way for users to -get its source. For example, if your program is a web application, its -interface could display a "Source" link that leads users to an archive -of the code. There are many ways you could offer source, and different -solutions will be better for different programs; see section 13 for -the specific requirements. - -You should also get your employer (if you work as a programmer) or -school, if any, to sign a "copyright disclaimer" for the program, if -necessary. For more information on this, and how to apply and follow -the GNU AGPL, see . \ No newline at end of file + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. 
+ + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. 
This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright 2024 Homebrew Computer Company + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
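As a concrete illustration of the appendix above, here is how that boilerplate notice might look when enclosed in comment syntax at the top of a C++ source file (the file name `example.cc` is hypothetical and used only for illustration; the copyright line mirrors the example given above):

```cpp
// example.cc -- hypothetical file name, for illustration only.
//
// Copyright 2024 Homebrew Computer Company
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
```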
diff --git a/README.md b/README.md index f10301363..2c944815a 100644 --- a/README.md +++ b/README.md @@ -1,217 +1,399 @@ -# Nitro - Embeddable AI +# Cortex.cpp +

- nitrologo +Cortex cpp's Readme Banner +

- Documentation - API Reference - - Changelog - Bug reports - Discord + + GitHub commit activity + Github Last Commit + Github Contributors + GitHub closed issues + Discord

-> ⚠️ **Nitro is currently in Development**: Expect breaking changes and bugs! +

+ Documentation - API Reference + - Changelog - Bug reports - Discord +

+
+> **Cortex.cpp is currently in active development.**
+
+## Overview
+
+Cortex is a Local AI API Platform for running and customizing LLMs.
+
+Key Features:
+- Pull from Hugging Face or Cortex Built-in Models
+- Models stored in universal file formats (vs blobs)
+- Swappable Engines (default: [`llamacpp`](https://github.com/janhq/cortex.llamacpp), future: [`ONNXRuntime`](https://github.com/janhq/cortex.onnx), [`TensorRT-LLM`](https://github.com/janhq/cortex.tensorrt-llm))
+- Cortex can be deployed as a standalone API server or integrated into apps like [Jan.ai](https://jan.ai/)
+
+Coming soon; now available on [cortex-nightly](#beta--nightly-versions):
+- Engines Management (install specific llama-cpp versions and variants)
+- Hardware detection & activation (current: Nvidia; future: AMD, Intel, Qualcomm)
+- Cortex's roadmap is to implement the full OpenAI API, including Tools, Runs, Multi-modal and Realtime APIs.
+
+## Local Installation
+
+Cortex has a Local Installer that packages all required dependencies, so no internet connection is required during installation.
+
+Cortex also has a [Network Installer](#network-installer), which downloads the necessary dependencies from the internet during installation.
+
+

+ + Windows: + cortex.exe +

-## Features -- Fast Inference: Built on top of the cutting-edge inference library llama.cpp, modified to be production ready. -- Lightweight: Only 3MB, ideal for resource-sensitive environments. -- Easily Embeddable: Simple integration into existing applications, offering flexibility. -- Quick Setup: Approximately 10-second initialization for swift deployment. -- Enhanced Web Framework: Incorporates drogon cpp to boost web service efficiency. +

+ + MacOS (Silicon/Intel): + cortex.pkg +

-## About Nitro +

+ + Linux: + cortex.deb (Coming soon: Linux installation script) +

-Nitro is a high-efficiency C++ inference engine for edge computing, powering [Jan](https://jan.ai/). It is lightweight and embeddable, ideal for product integration. +- For Linux: Download the installer and run the following command in terminal: -The binary of nitro after zipped is only ~3mb in size with none to minimal dependencies (if you use a GPU need CUDA for example) make it desirable for any edge/server deployment 👍. +```bash + sudo apt install ./cortex-local-installer.deb +``` + +- The binary will be installed in the `/usr/bin/` directory. + +## Usage -> Read more about Nitro at https://nitro.jan.ai/ +### CLI -### Repo Structure +After installation, you can run Cortex.cpp from the command line by typing `cortex --help`. ``` -. -├── controllers -├── docs -├── llama.cpp -> Upstream llama C++ -├── nitro_deps -> Dependencies of the Nitro project as a sub-project -└── utils +# Run a Model +cortex pull llama3.2 +cortex pull bartowski/Meta-Llama-3.1-8B-Instruct-GGUF +cortex run llama3.2 + +# Resource Management +cortex ps (view active models & RAM/VRAM used) +cortex models stop llama3.2 + +# Available on cortex-nightly: +cortex engines install llama-cpp -m (lists versions and variants) +cortex hardware list (hardware detection) +cortex hardware activate + +cortex stop ``` -## Quickstart +Refer to our [Quickstart](https://cortex.so/docs/quickstart/) and +[CLI documentation](https://cortex.so/docs/cli) for more details. -**Step 1: Install Nitro** +### API: +Cortex.cpp includes a REST API accessible at `localhost:39281`. -- For Linux and MacOS +Refer to our [API documentation](https://cortex.so/api-reference) for more details. - ```bash - curl -sfL https://raw.githubusercontent.com/janhq/nitro/main/install.sh | sudo /bin/bash - - ``` +## Models -- For Windows +Cortex.cpp allows users to pull models from multiple Model Hubs, offering flexibility and extensive model access: +- [Hugging Face](https://huggingface.co): GGUF models eg `author/Model-GGUF` +- Cortex Built-in Models - ```bash - powershell -Command "& { Invoke-WebRequest -Uri 'https://raw.githubusercontent.com/janhq/nitro/main/install.bat' -OutFile 'install.bat'; .\install.bat; Remove-Item -Path 'install.bat' }" - ``` +Once downloaded, the model `.gguf` and `model.yml` files are stored in `~\cortexcpp\models`. -**Step 2: Downloading a Model** +> **Note**: +> You should have at least 8 GB of RAM available to run the 7B models, 16 GB to run the 14B models, and 32 GB to run the 32B models. -```bash -mkdir model && cd model -wget -O llama-2-7b-model.gguf https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF/resolve/main/llama-2-7b-chat.Q5_K_M.gguf?download=true -``` - -**Step 3: Run Nitro server** - -```bash title="Run Nitro server" -nitro -``` - -**Step 4: Load model** - -```bash title="Load model" -curl http://localhost:3928/inferences/llamacpp/loadmodel \ - -H 'Content-Type: application/json' \ - -d '{ - "llama_model_path": "/model/llama-2-7b-model.gguf", - "ctx_len": 512, - "ngl": 100, - }' -``` - -**Step 5: Making an Inference** - -```bash title="Nitro Inference" -curl http://localhost:3928/v1/chat/completions \ - -H "Content-Type: application/json" \ - -d '{ - "messages": [ - { - "role": "user", - "content": "Who won the world series in 2020?" - }, - ] - }' -``` - -Table of parameters - -| Parameter | Type | Description | -|------------------|---------|--------------------------------------------------------------| -| `llama_model_path` | String | The file path to the LLaMA model. 
| -| `ngl` | Integer | The number of GPU layers to use. | -| `ctx_len` | Integer | The context length for the model operations. | -| `embedding` | Boolean | Whether to use embedding in the model. | -| `n_parallel` | Integer | The number of parallel operations. | -| `cont_batching` | Boolean | Whether to use continuous batching. | -| `user_prompt` | String | The prompt to use for the user. | -| `ai_prompt` | String | The prompt to use for the AI assistant. | -| `system_prompt` | String | The prompt to use for system rules. | -| `pre_prompt` | String | The prompt to use for internal configuration. | -| `cpu_threads` | Integer | The number of threads to use for inferencing (CPU MODE ONLY) | -| `n_batch` | Integer | The batch size for prompt eval step | -| `caching_enabled` | Boolean | To enable prompt caching or not | -| `clean_cache_threshold` | Integer | Number of chats that will trigger clean cache action| -|`grp_attn_n`|Integer|Group attention factor in self-extend| -|`grp_attn_w`|Integer|Group attention width in self-extend| -|`mlock`|Boolean|Prevent system swapping of the model to disk in macOS| -|`grammar_file`| String |You can constrain the sampling using GBNF grammars by providing path to a grammar file| -|`model_type` | String | Model type we want to use: llm or embedding, default value is llm| - -***OPTIONAL***: You can run Nitro on a different port like 5000 instead of 3928 by running it manually in terminal -```zsh -./nitro 1 127.0.0.1 5000 ([thread_num] [host] [port] [uploads_folder_path]) -``` -- thread_num : the number of thread that nitro webserver needs to have -- host : host value normally 127.0.0.1 or 0.0.0.0 -- port : the port that nitro got deployed onto -- uploads_folder_path: custom path for file uploads in Drogon. - -Nitro server is compatible with the OpenAI format, so you can expect the same output as the OpenAI ChatGPT API. - -## Compile from source -To compile nitro please visit [Compile from source](docs/docs/new/build-source.md) - -## Download +### Cortex Built-in Models & Quantizations + +| Model /Engine | llama.cpp | Command | +| -------------- | --------------------- | ----------------------------- | +| phi-3.5 | ✅ | cortex run phi3.5 | +| llama3.2 | ✅ | cortex run llama3.2 | +| llama3.1 | ✅ | cortex run llama3.1 | +| codestral | ✅ | cortex run codestral | +| gemma2 | ✅ | cortex run gemma2 | +| mistral | ✅ | cortex run mistral | +| ministral | ✅ | cortex run ministral | +| qwen2 | ✅ | cortex run qwen2.5 | +| openhermes-2.5 | ✅ | cortex run openhermes-2.5 | +| tinyllama | ✅ | cortex run tinyllama | + +View all [Cortex Built-in Models](https://cortex.so/models). + +Cortex supports multiple quantizations for each model. +``` +❯ cortex-nightly pull llama3.2 +Downloaded models: + llama3.2:3b-gguf-q2-k + +Available to download: + 1. llama3.2:3b-gguf-q3-kl + 2. llama3.2:3b-gguf-q3-km + 3. llama3.2:3b-gguf-q3-ks + 4. llama3.2:3b-gguf-q4-km (default) + 5. llama3.2:3b-gguf-q4-ks + 6. llama3.2:3b-gguf-q5-km + 7. llama3.2:3b-gguf-q5-ks + 8. llama3.2:3b-gguf-q6-k + 9. llama3.2:3b-gguf-q8-0 + +Select a model (1-9): +``` + +## Advanced Installation + +### Beta & Nightly Versions (Local Installer) + +Cortex releases Beta and Nightly versions for advanced users to try new features (we appreciate your feedback!) +- Beta (early preview): CLI command: `cortex-beta` +- Nightly (released every night): CLI Command: `cortex-nightly` + - Nightly automatically pulls the latest changes from upstream [llama.cpp](https://github.com/ggerganov/llama.cpp/) repo, creates a PR and runs tests. 
+ - If all test pass, the PR is automatically merged into our repo, with the latest llama.cpp version. - + + + + + + + + + + + + + + + + + + +
VersionWindowsMacOSLinux
Beta (Preview) + + + cortex.exe + + + + + cortex.pkg + + + + + cortex.deb + +
Nightly (Experimental) + + + cortex.exe + + + + + cortex.pkg + + + + + cortex.deb + +
+ +### Network Installer + +Cortex.cpp is available with a Network Installer, which is a smaller installer but requires internet connection during installation to download the necessary dependencies. + + + - - - + + + - + + + + - - - + + + +
Version TypeWindowsMacOSLinuxWindowsMacOSLinux
Stable (Recommended) - - - CPU + + + cortex.exe - - - CUDA + + + cortex.pkg - - - Intel + + + cortex.deb
Beta (Preview) - - - M1/M2 + + + cortex.exe - - - CPU + + + cortex.pkg - - - CUDA + + + cortex.deb
Experimental (Nighlty Build) - - GitHub action artifactory +
Nightly (Experimental) + + + cortex.exe + + + + + cortex.pkg + + + + + cortex.deb
-Download the latest version of Nitro at https://nitro.jan.ai/ or visit the **[GitHub Releases](https://github.com/janhq/nitro/releases)** to download any previous release.
+### Build from Source
+
+#### Windows
+
+1. Clone the Cortex.cpp repository [here](https://github.com/janhq/cortex.cpp).
+2. Navigate to the `engine` folder.
+3. Configure vcpkg:
+
+```bash
+cd vcpkg
+./bootstrap-vcpkg.bat
+vcpkg install
+```
-## Nightly Build
+4. Build Cortex.cpp inside the `engine/build` folder:
-Nightly build is a process where the software is built automatically every night. This helps in detecting and fixing bugs early in the development cycle. The process for this project is defined in [`.github/workflows/build.yml`](.github/workflows/build.yml)
+```bash
+mkdir build
+cd build
+cmake .. -DBUILD_SHARED_LIBS=OFF -DCMAKE_TOOLCHAIN_FILE=path_to_vcpkg_folder_in_cortex_repo/vcpkg/scripts/buildsystems/vcpkg.cmake -DVCPKG_TARGET_TRIPLET=x64-windows-static
+cmake --build . --config Release
+```
-You can join our Discord server [here](https://discord.gg/FTk2MvZwJH) and go to channel [github-nitro](https://discordapp.com/channels/1107178041848909847/1151022176019939328) to monitor the build process.
+5. Verify that Cortex.cpp is installed correctly by getting help information.
-The nightly build is triggered at 2:00 AM UTC every day.
+```sh
+cortex -h
+```
-The nightly build can be downloaded from the url notified in the Discord channel. Please access the url from the browser and download the build artifacts from there.
+#### MacOS
-## Manual Build
+1. Clone the Cortex.cpp repository [here](https://github.com/janhq/cortex.cpp).
+2. Navigate to the `engine` folder.
+3. Configure vcpkg:
-Manual build is a process where the software is built manually by the developers. This is usually done when a new feature is implemented or a bug is fixed. The process for this project is defined in [`.github/workflows/build.yml`](.github/workflows/build.yml)
+```bash
+cd vcpkg
+./bootstrap-vcpkg.sh
+vcpkg install
+```
-It is similar to the nightly build process, except that it is triggered manually by the developers.
+4. Build Cortex.cpp inside the `engine/build` folder:
-### Contact
+```bash
+mkdir build
+cd build
+cmake .. -DCMAKE_TOOLCHAIN_FILE=path_to_vcpkg_folder_in_cortex_repo/vcpkg/scripts/buildsystems/vcpkg.cmake
+make -j4
+```
-- For support, please file a GitHub ticket.
-- For questions, join our Discord [here](https://discord.gg/FTk2MvZwJH).
-- For long-form inquiries, please email hello@jan.ai.
+5. Verify that Cortex.cpp is installed correctly by getting help information.
+
+```sh
+cortex -h
+```
+
+#### Linux
-## Star History
+1. Clone the Cortex.cpp repository [here](https://github.com/janhq/cortex.cpp).
+2. Navigate to the `engine` folder.
+3. Configure vcpkg:
-[![Star History Chart](https://api.star-history.com/svg?repos=janhq/nitro&type=Date)](https://star-history.com/#janhq/nitro&Date)
+```bash
+cd vcpkg
+./bootstrap-vcpkg.sh
+vcpkg install
+```
+
+4. Build Cortex.cpp inside the `engine/build` folder:
+
+```bash
+mkdir build
+cd build
+cmake .. -DCMAKE_TOOLCHAIN_FILE=path_to_vcpkg_folder_in_cortex_repo/vcpkg/scripts/buildsystems/vcpkg.cmake
+make -j4
+```
+
+5. Verify that Cortex.cpp is installed correctly by getting help information.
+
+```sh
+cortex -h
+```
+
+## Uninstallation
+
+### Windows
+
+1. Open the Windows Control Panel.
+2. Navigate to `Add or Remove Programs`.
+3. Search for `cortexcpp` and double click to uninstall. 
(for beta and nightly builds, search for `cortexcpp-beta` and `cortexcpp-nightly` respectively)
+
+### MacOS
+
+Run the uninstaller script:
+
+```bash
+sudo sh cortex-uninstall.sh
+```
+
+For MacOS, an uninstaller script comes with the binary and is added to the `/usr/local/bin/` directory. The script is named `cortex-uninstall.sh` for stable builds, `cortex-beta-uninstall.sh` for beta builds and `cortex-nightly-uninstall.sh` for nightly builds.
+
+### Linux
+
+```bash
+sudo apt remove cortexcpp
+```
+
+## Contact Support
+
+- For support, please file a [GitHub ticket](https://github.com/janhq/cortex.cpp/issues/new/choose).
+- For questions, join our Discord [here](https://discord.gg/FTk2MvZwJH).
+- For long-form inquiries, please email [hello@jan.ai](mailto:hello@jan.ai).
diff --git a/assets/Nitro README banner.png b/assets/Nitro README banner.png
deleted file mode 100644
index 5582c7688..000000000
Binary files a/assets/Nitro README banner.png and /dev/null differ
diff --git a/assets/cortex-banner.png b/assets/cortex-banner.png
new file mode 100644
index 000000000..95a7262a9
Binary files /dev/null and b/assets/cortex-banner.png differ
diff --git a/assets/placeholder b/assets/placeholder
deleted file mode 100644
index 8b1378917..000000000
--- a/assets/placeholder
+++ /dev/null
@@ -1 +0,0 @@
-
diff --git a/common/base.cc b/common/base.cc
deleted file mode 100644
index e69de29bb..000000000
diff --git a/context/llama_server_context.h b/context/llama_server_context.h
deleted file mode 100644
index 21792f11b..000000000
--- a/context/llama_server_context.h
+++ /dev/null
@@ -1,2260 +0,0 @@
-#include
-#include
-#include
-#include
-
-// External
-#include "clip.h"
-#include "common.h"
-#include "llama.h"
-#include "llava.h"
-#include "stb_image.h"
-#include "utils/json.hpp"
-
-#if defined(_WIN32)
-#define NOMINMAX
-#endif
-
-using json = nlohmann::json;
-
-#define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo-0613"
-
-struct server_params {
-  std::string hostname = "127.0.0.1";
-  std::string api_key;
-  std::string public_path = "examples/server/public";
-  int32_t port = 8080;
-  int32_t read_timeout = 600;
-  int32_t write_timeout = 600;
-};
-
-static bool server_verbose = false;
-
-#if SERVER_VERBOSE != 1
-#define LOG_VERBOSE(MSG, ...)
-#else
-#define LOG_VERBOSE(MSG, ...) \
-  do { \
-    if (server_verbose) { \
-      server_log("VERBOSE", __func__, __LINE__, MSG, __VA_ARGS__); \
-    } \
-  } while (0)
-#endif
-
-#define LOG_ERROR_LLAMA(MSG, ...) \
-  server_log("ERROR", __func__, __LINE__, MSG, __VA_ARGS__)
-#define LOG_WARNING_LLAMA(MSG, ...) \
-  server_log("WARNING", __func__, __LINE__, MSG, __VA_ARGS__)
-#define LOG_INFO_LLAMA(MSG, ...) 
\ - server_log("INFO", __func__, __LINE__, MSG, __VA_ARGS__) - -// -// base64 utils (TODO: move to common in the future) -// - -static const std::string base64_chars = - "ABCDEFGHIJKLMNOPQRSTUVWXYZ" - "abcdefghijklmnopqrstuvwxyz" - "0123456789+/"; - -static inline bool is_base64(uint8_t c) { - return (isalnum(c) || (c == '+') || (c == '/')); -} - -static std::vector base64_decode(const std::string& encoded_string) { - int i = 0; - int j = 0; - int in_ = 0; - - int in_len = encoded_string.size(); - - uint8_t char_array_4[4]; - uint8_t char_array_3[3]; - - std::vector ret; - - while (in_len-- && (encoded_string[in_] != '=') && - is_base64(encoded_string[in_])) { - char_array_4[i++] = encoded_string[in_]; - in_++; - if (i == 4) { - for (i = 0; i < 4; i++) { - char_array_4[i] = base64_chars.find(char_array_4[i]); - } - - char_array_3[0] = - ((char_array_4[0]) << 2) + ((char_array_4[1] & 0x30) >> 4); - char_array_3[1] = - ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2); - char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3]; - - for (i = 0; (i < 3); i++) { - ret.push_back(char_array_3[i]); - } - i = 0; - } - } - - if (i) { - for (j = i; j < 4; j++) { - char_array_4[j] = 0; - } - - for (j = 0; j < 4; j++) { - char_array_4[j] = base64_chars.find(char_array_4[j]); - } - - char_array_3[0] = - ((char_array_4[0]) << 2) + ((char_array_4[1] & 0x30) >> 4); - char_array_3[1] = - ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2); - char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3]; - - for (j = 0; (j < i - 1); j++) { - ret.push_back(char_array_3[j]); - } - } - - return ret; -} - -// -// parallel -// - -enum task_type { COMPLETION_TASK, CANCEL_TASK }; - -struct task_server { - int id; - int target_id; - task_type type; - json data; - bool infill_mode = false; - bool embedding_mode = false; - int multitask_id = -1; -}; - -struct task_result { - int id; - int multitask_id = -1; - bool stop; - bool error; - json result_json; -}; - -struct task_multi { - int id; - std::set subtasks_remaining{}; - std::vector results{}; -}; - -// TODO: can become bool if we can't find use of more states -enum slot_state { - IDLE, - PROCESSING, -}; - -enum slot_command { - NONE, - LOAD_PROMPT, - RELEASE, -}; - -struct slot_params { - bool stream = true; - bool cache_prompt = - false; // remember the prompt to avoid reprocessing all prompt - - uint32_t seed = -1; // RNG seed - int32_t n_keep = 0; // number of tokens to keep from initial prompt - int32_t n_predict = -1; // new tokens to predict - - std::vector antiprompt; - - json input_prefix; - json input_suffix; -}; - -struct slot_image { - int32_t id; - - bool request_encode_image = false; - float* image_embedding = nullptr; - int32_t image_tokens = 0; - - clip_image_u8* img_data; - - std::string prefix_prompt; // before of this image -}; - -// completion token output with probabilities -struct completion_token_output { - struct token_prob { - llama_token tok; - float prob; - }; - - std::vector probs; - llama_token tok; - std::string text_to_send; -}; - -static size_t common_part(const std::vector& a, - const std::vector& b) { - size_t i; - for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) {} - return i; -} - -enum stop_type { - STOP_FULL, - STOP_PARTIAL, -}; - -enum class ModelType { LLM = 0, EMBEDDING }; - -static bool ends_with(const std::string& str, const std::string& suffix) { - return str.size() >= suffix.size() && - 0 == str.compare(str.size() - suffix.size(), suffix.size(), suffix); -} - 
-static size_t find_partial_stop_string(const std::string& stop, - const std::string& text) { - if (!text.empty() && !stop.empty()) { - const char text_last_char = text.back(); - for (int64_t char_index = stop.size() - 1; char_index >= 0; char_index--) { - if (stop[char_index] == text_last_char) { - const std::string current_partial = stop.substr(0, char_index + 1); - if (ends_with(text, current_partial)) { - return text.size() - char_index - 1; - } - } - } - } - return std::string::npos; -} - -// TODO: reuse llama_detokenize -template -static std::string tokens_to_str(llama_context* ctx, Iter begin, Iter end) { - std::string ret; - for (; begin != end; ++begin) { - ret += llama_token_to_piece(ctx, *begin); - } - return ret; -} - -static void server_log(const char* level, const char* function, int line, - const char* message, - const nlohmann::ordered_json& extra) { - nlohmann::ordered_json log{ - {"timestamp", time(nullptr)}, {"level", level}, - {"function", function}, {"line", line}, - {"message", message}, - }; - - if (!extra.empty()) { - log.merge_patch(extra); - } - - const std::string str = - log.dump(-1, ' ', false, json::error_handler_t::replace); - printf("%.*s\n", (int)str.size(), str.data()); - fflush(stdout); -} - -// format incomplete utf-8 multibyte character for output -static std::string tokens_to_output_formatted_string(const llama_context* ctx, - const llama_token token) { - std::string out = token == -1 ? "" : llama_token_to_piece(ctx, token); - // if the size is 1 and first bit is 1, meaning it's a partial character - // (size > 1 meaning it's already a known token) - if (out.size() == 1 && (out[0] & 0x80) == 0x80) { - std::stringstream ss; - ss << std::hex << (out[0] & 0xff); - std::string res(ss.str()); - out = "byte: \\x" + res; - } - return out; -} - -// convert a vector of completion_token_output to json -static json probs_vector_to_json( - const llama_context* ctx, - const std::vector& probs) { - json out = json::array(); - for (const auto& prob : probs) { - json probs_for_token = json::array(); - for (const auto& p : prob.probs) { - std::string tok_str = tokens_to_output_formatted_string(ctx, p.tok); - probs_for_token.push_back(json{ - {"tok_str", tok_str}, - {"prob", p.prob}, - }); - } - std::string tok_str = tokens_to_output_formatted_string(ctx, prob.tok); - out.push_back(json{ - {"content", tok_str}, - {"probs", probs_for_token}, - }); - } - return out; -} - -template -static T json_value(const json& body, const std::string& key, - const T& default_value) { - // Fallback null to default value - return body.contains(key) && !body.at(key).is_null() - ? 
body.value(key, default_value) - : default_value; -} - -struct llama_client_slot { - int id; - int task_id = -1; - - struct slot_params params; - - slot_state state = IDLE; - slot_command command = NONE; - - // used to determine the slot that has been used the longest - int64_t t_last_used = -1; - - // generation props - int32_t n_ctx = 0; // context size per slot - int32_t n_past = 0; - int32_t n_decoded = 0; - int32_t n_remaining = -1; - int32_t i_batch = -1; - - int32_t num_prompt_tokens = 0; - int32_t num_prompt_tokens_processed = 0; - - json prompt; - std::string generated_text; - llama_token sampled; - std::vector cache_tokens; - std::vector generated_token_probs; - - bool infill = false; - bool embedding = false; - bool has_next_token = true; - bool truncated = false; - bool stopped_eos = false; - bool stopped_word = false; - bool stopped_limit = false; - - bool oaicompat = false; - std::string oaicompat_model; - - std::string stopping_word; - - // sampling - struct llama_sampling_params sparams; - llama_sampling_context* ctx_sampling = nullptr; - - // multimodal - std::vector images; - - // stats - size_t sent_count = 0; - size_t sent_token_probs_index = 0; - - int64_t t_start_process_prompt; - int64_t t_start_genereration; - - double t_prompt_processing; // ms - double t_token_generation; // ms - - // multitasks - int multitask_id = -1; - - void reset() { - num_prompt_tokens = 0; - generated_text = ""; - truncated = false; - stopped_eos = false; - stopped_word = false; - stopped_limit = false; - stopping_word = ""; - n_past = 0; - sent_count = 0; - sent_token_probs_index = 0; - infill = false; - - generated_token_probs.clear(); - - for (slot_image& img : images) { - free(img.image_embedding); - if (img.img_data) { - clip_image_u8_free(img.img_data); - } - img.prefix_prompt = ""; - } - - images.clear(); - } - - bool has_budget(gpt_params& global_params) { - n_remaining = -1; - if (params.n_predict != -1) { - n_remaining = params.n_predict - n_decoded; - } else if (global_params.n_predict != -1) { - n_remaining = global_params.n_predict - n_decoded; - } - return n_remaining > 0 || n_remaining == -1; // no budget || limitless - } - - bool available() const { return state == IDLE && command == NONE; } - - bool is_processing() const { - return (state == IDLE && command == LOAD_PROMPT) || state == PROCESSING; - } - - void add_token_string(const completion_token_output& token) { - if (command == RELEASE) { - return; - } - cache_tokens.push_back(token.tok); - generated_token_probs.push_back(token); - } - - void release() { - if (state == IDLE || state == PROCESSING) { - t_token_generation = (ggml_time_us() - t_start_genereration) / 1e3; - command = RELEASE; - } - } - - json get_formated_timings() { - return json{ - {"prompt_n", num_prompt_tokens_processed}, - {"prompt_ms", t_prompt_processing}, - {"prompt_per_token_ms", - t_prompt_processing / num_prompt_tokens_processed}, - {"prompt_per_second", - 1e3 / t_prompt_processing * num_prompt_tokens_processed}, - - {"predicted_n", n_decoded}, - {"predicted_ms", t_token_generation}, - {"predicted_per_token_ms", t_token_generation / n_decoded}, - {"predicted_per_second", 1e3 / t_token_generation * n_decoded}, - }; - } - - void print_timings() const { - LOG_DEBUG << __func__ << ": prompt eval time = " << t_prompt_processing - << "ms / " << num_prompt_tokens_processed << " tokens (" - << t_prompt_processing / num_prompt_tokens_processed - << " ms per " - "token, " - << 1e3 / t_prompt_processing * num_prompt_tokens_processed - << " tokens per 
second)"; - LOG_DEBUG << __func__ << ": eval time = " << t_token_generation - << " ms / " << n_decoded << " runs (" - << t_token_generation / n_decoded - << " ms per " - "token, " - << 1e3 / t_token_generation * n_decoded - << " tokens per second)\n"; - LOG_DEBUG << __func__ << ": total time = " - << t_prompt_processing + t_token_generation << " ms"; - } -}; - -struct llama_server_context { - llama_model* model = nullptr; - llama_context* ctx = nullptr; - - clip_ctx* clp_ctx = nullptr; - - gpt_params params; - - llama_batch batch; - - bool multimodal = false; - bool clean_kv_cache = true; - bool all_slots_are_idle = false; - bool add_bos_token = true; - - int32_t id_gen; - int32_t n_ctx; // total context for all clients / slots - - // Internal - std::atomic model_loaded_external = false; - - // system prompt - bool system_need_update = false; - - std::string system_prompt; - std::vector system_tokens; - - std::string name_user; // this should be the antiprompt - std::string name_assistant; - - // slots / clients - std::vector slots; - - std::vector queue_tasks; - std::vector queue_results; - std::vector queue_multitasks; - std::mutex mutex_tasks; // also guards id_gen, and queue_multitasks - std::condition_variable condition_tasks; - std::mutex mutex_results; - std::condition_variable condition_results; - ModelType model_type = ModelType::LLM; - - ~llama_server_context() { - if (ctx) { - llama_free(ctx); - ctx = nullptr; - } - if (model) { - llama_free_model(model); - model = nullptr; - } - } - - bool load_model(const gpt_params& params_) { - params = params_; - if (!params.mmproj.empty()) { - multimodal = true; - LOG_DEBUG << "Multi Modal Mode Enabled"; - clp_ctx = clip_model_load(params.mmproj.c_str(), /*verbosity=*/1); - if (clp_ctx == nullptr) { - LOG_ERROR_LLAMA("unable to load clip model", - {{"model", params.mmproj}}); - return false; - } - - if (params.n_ctx < - 2048) { // request larger context for the image embedding - params.n_ctx = 2048; - } - } - - std::tie(model, ctx) = llama_init_from_gpt_params(params); - if (model == nullptr) { - LOG_ERROR_LLAMA("llama.cpp unable to load model", - {{"model", params.model}}); - return false; - } - - if (multimodal) { - const int n_embd_clip = clip_n_mmproj_embd(clp_ctx); - const int n_embd_llm = llama_n_embd(model); - if (n_embd_clip != n_embd_llm) { - LOG_DEBUG << __func__ << ": embedding dim of the multimodal projector (" - << n_embd_clip - << ") is not " - "equal to that of LLaMA (" - << n_embd_llm - << "). 
Make sure that you use the " - "correct mmproj file."; - llama_free(ctx); - llama_free_model(model); - return false; - } - } - - if (ctx == nullptr) { - LOG_ERROR_LLAMA("Unable to get llama.cpp context", {}); - return false; - } - n_ctx = llama_n_ctx(ctx); - - add_bos_token = llama_should_add_bos_token(model); - - return true; - } - - void initialize() { - id_gen = 0; - - // create slots - all_slots_are_idle = true; - - const int32_t n_ctx_slot = n_ctx / params.n_parallel; - - LOG_DEBUG << "Available slots: "; - for (int i = 0; i < params.n_parallel; i++) { - llama_client_slot slot; - - slot.id = i; - slot.n_ctx = n_ctx_slot; - slot.reset(); - - LOG_DEBUG << " -> Slot " << slot.id << " - max context: " << n_ctx_slot; - slots.push_back(slot); - } - - try { - batch = llama_batch_init(n_ctx, 0, params.n_parallel); - } catch (const std::exception& e) { - LOG_ERROR_LLAMA("Failed to allocate llama.cpp batch metadata", - {{"exception", e.what()}, - {"n_tokens_alloc", n_ctx}, - {"embd", 0}, - {"n_seq_max", params.n_parallel}}); - } - - // empty system prompt - system_prompt = ""; - system_tokens.clear(); - } - - std::vector tokenize(const json& json_prompt, - bool add_bos) const { - // TODO: currently, we tokenize using special tokens by default - // this is not always correct (see - // https://github.com/ggerganov/llama.cpp/pull/4160#issuecomment-1824826216) - // but it's better compared to completely ignoring ChatML and other - // chat templates - const bool TMP_FORCE_SPECIAL = true; - - // If `add_bos` is true, we only add BOS, when json_prompt is a string, - // or the first element of the json_prompt array is a string. - std::vector prompt_tokens; - - if (json_prompt.is_array()) { - bool first = true; - for (const auto& p : json_prompt) { - if (p.is_string()) { - auto s = p.template get(); - std::vector p; - if (first) { - p = ::llama_tokenize(ctx, s, add_bos, TMP_FORCE_SPECIAL); - first = false; - } else { - p = ::llama_tokenize(ctx, s, false, TMP_FORCE_SPECIAL); - } - prompt_tokens.insert(prompt_tokens.end(), p.begin(), p.end()); - } else { - if (first) { - first = false; - } - prompt_tokens.push_back(p.template get()); - } - } - } else { - auto s = json_prompt.template get(); - prompt_tokens = ::llama_tokenize(ctx, s, add_bos, TMP_FORCE_SPECIAL); - } - - return prompt_tokens; - } - - llama_client_slot* get_slot(int id) { - int64_t t_last = ggml_time_us(); - llama_client_slot* last_used = nullptr; - - for (llama_client_slot& slot : slots) { - if (slot.id == id && slot.available()) { - return &slot; - } - - if (slot.available() && slot.t_last_used < t_last) { - last_used = &slot; - t_last = slot.t_last_used; - } - } - - return last_used; - } - - bool launch_slot_with_data(llama_client_slot*& slot, json data) { - slot_params default_params; - llama_sampling_params default_sparams; - - if (data.count("__oaicompat") != 0) { - slot->oaicompat = true; - slot->oaicompat_model = - json_value(data, "model", std::string(DEFAULT_OAICOMPAT_MODEL)); - } else { - slot->oaicompat = false; - slot->oaicompat_model = ""; - } - - slot->params.stream = json_value(data, "stream", false); - slot->params.cache_prompt = json_value(data, "cache_prompt", false); - slot->params.n_predict = - json_value(data, "n_predict", default_params.n_predict); - slot->sparams.top_k = json_value(data, "top_k", default_sparams.top_k); - slot->sparams.top_p = json_value(data, "top_p", default_sparams.top_p); - slot->sparams.min_p = json_value(data, "min_p", default_sparams.min_p); - slot->sparams.tfs_z = json_value(data, "tfs_z", 
default_sparams.tfs_z); - slot->sparams.typical_p = - json_value(data, "typical_p", default_sparams.typical_p); - slot->sparams.temp = json_value(data, "temperature", default_sparams.temp); - slot->sparams.penalty_last_n = - json_value(data, "repeat_last_n", default_sparams.penalty_last_n); - slot->sparams.penalty_repeat = - json_value(data, "repeat_penalty", default_sparams.penalty_repeat); - slot->sparams.penalty_freq = - json_value(data, "frequency_penalty", default_sparams.penalty_freq); - slot->sparams.penalty_present = - json_value(data, "presence_penalty", default_sparams.penalty_present); - slot->sparams.mirostat = - json_value(data, "mirostat", default_sparams.mirostat); - slot->sparams.mirostat_tau = - json_value(data, "mirostat_tau", default_sparams.mirostat_tau); - slot->sparams.mirostat_eta = - json_value(data, "mirostat_eta", default_sparams.mirostat_eta); - slot->sparams.penalize_nl = - json_value(data, "penalize_nl", default_sparams.penalize_nl); - slot->params.n_keep = json_value(data, "n_keep", slot->params.n_keep); - slot->params.seed = json_value(data, "seed", default_params.seed); - slot->sparams.grammar = - json_value(data, "grammar", default_sparams.grammar); - slot->sparams.n_probs = - json_value(data, "n_probs", default_sparams.n_probs); - - // infill - if (data.count("input_prefix") != 0) { - slot->params.input_prefix = data["input_prefix"]; - } else { - slot->params.input_prefix = ""; - } - - if (data.count("input_suffix") != 0) { - slot->params.input_suffix = data["input_suffix"]; - } else { - slot->params.input_suffix = ""; - } - - if (data.count("prompt") != 0) { - slot->prompt = data["prompt"]; - } else { - slot->prompt = ""; - } - - slot->sparams.penalty_prompt_tokens.clear(); - slot->sparams.use_penalty_prompt_tokens = false; - const auto& penalty_prompt = data.find("penalty_prompt"); - if (penalty_prompt != data.end()) { - if (penalty_prompt->is_string()) { - const auto penalty_prompt_string = penalty_prompt->get(); - auto penalty_tokens = - llama_tokenize(model, penalty_prompt_string, false); - slot->sparams.penalty_prompt_tokens.swap(penalty_tokens); - if (slot->params.n_predict > 0) { - slot->sparams.penalty_prompt_tokens.reserve( - slot->sparams.penalty_prompt_tokens.size() + - slot->params.n_predict); - } - slot->sparams.use_penalty_prompt_tokens = true; - } else if (penalty_prompt->is_array()) { - const auto n_tokens = penalty_prompt->size(); - slot->sparams.penalty_prompt_tokens.reserve( - n_tokens + std::max(0, slot->params.n_predict)); - const int n_vocab = llama_n_vocab(model); - for (const auto& penalty_token : *penalty_prompt) { - if (penalty_token.is_number_integer()) { - const auto tok = penalty_token.get(); - if (tok >= 0 && tok < n_vocab) { - slot->sparams.penalty_prompt_tokens.push_back(tok); - } - } - } - slot->sparams.use_penalty_prompt_tokens = true; - } - } - - slot->sparams.logit_bias.clear(); - - if (json_value(data, "ignore_eos", false)) { - slot->sparams.logit_bias[llama_token_eos(model)] = -INFINITY; - } - - const auto& logit_bias = data.find("logit_bias"); - if (logit_bias != data.end() && logit_bias->is_array()) { - const int n_vocab = llama_n_vocab(model); - for (const auto& el : *logit_bias) { - if (el.is_array() && el.size() == 2 && el[0].is_number_integer()) { - llama_token tok = el[0].get(); - if (tok >= 0 && tok < n_vocab) { - if (el[1].is_number()) { - slot->sparams.logit_bias[tok] = el[1].get(); - } else if (el[1].is_boolean() && !el[1].get()) { - slot->sparams.logit_bias[tok] = -INFINITY; - } - } - } - } - } - - 
slot->params.antiprompt.clear(); - - const auto& stop = data.find("stop"); - if (stop != data.end() && stop->is_array()) { - for (const auto& word : *stop) { - if (!word.empty()) { - slot->params.antiprompt.push_back(word); - } - } - } - - if (multimodal) { - const auto& images_data = data.find("image_data"); - if (images_data != data.end() && images_data->is_array()) { - for (const auto& img : *images_data) { - const std::vector image_buffer = - base64_decode(img["data"].get()); - - slot_image img_sl; - img_sl.id = - img.count("id") != 0 ? img["id"].get() : slot->images.size(); - img_sl.img_data = clip_image_u8_init(); - if (!clip_image_load_from_bytes( - image_buffer.data(), image_buffer.size(), img_sl.img_data)) { - LOG_DEBUG << "slot " << slot->id - << " - failed to load image [id: " << img_sl.id << "]"; - return false; - } - LOG_DEBUG << "slot " << slot->id << " - loaded image"; - img_sl.request_encode_image = true; - slot->images.push_back(img_sl); - } - // process prompt - // example: system prompt [img-102] user [img-103] describe [img-134] -> - // [{id: 102, prefix: 'system prompt '}, {id: 103, prefix: ' user '}, - // {id: 134, prefix: ' describe '}]} - if (slot->images.size() > 0 && !slot->prompt.is_array()) { - std::string prompt = slot->prompt.get(); - size_t pos = 0, begin_prefix = 0; - std::string pattern = "[img-"; - while ((pos = prompt.find(pattern, pos)) != std::string::npos) { - size_t end_prefix = pos; - pos += pattern.length(); - size_t end_pos = prompt.find("]", pos); - if (end_pos != std::string::npos) { - std::string image_id = prompt.substr(pos, end_pos - pos); - try { - int img_id = std::stoi(image_id); - bool found = false; - for (slot_image& img : slot->images) { - if (img.id == img_id) { - found = true; - img.prefix_prompt = - prompt.substr(begin_prefix, end_prefix - begin_prefix); - begin_prefix = end_pos + 1; - break; - } - } - if (!found) { - LOG_DEBUG << "ERROR: Image with id: " << img_id - << ", not found.\n"; - slot->images.clear(); - return false; - } - } catch (const std::invalid_argument& e) { - LOG_DEBUG << "Invalid image number id in prompt"; - slot->images.clear(); - return false; - } - } - } - slot->prompt = ""; - slot->params.input_suffix = prompt.substr(begin_prefix); - slot->params.cache_prompt = - false; // multimodal doesn't support cache prompt - } - } - } - - if (slot->ctx_sampling != nullptr) { - llama_sampling_free(slot->ctx_sampling); - } - slot->ctx_sampling = llama_sampling_init(slot->sparams); - llama_set_rng_seed(ctx, slot->params.seed); - slot->command = LOAD_PROMPT; - - all_slots_are_idle = false; - - LOG_DEBUG << "slot " << slot->id - << " is processing [task id: " << slot->task_id << "]"; - - return true; - } - - void kv_cache_clear() { - // clear the entire KV cache - llama_kv_cache_clear(ctx); - } - - void update_system_prompt() { - system_tokens = ::llama_tokenize(ctx, system_prompt, add_bos_token); - - llama_batch_clear(batch); - - kv_cache_clear(); - - for (int i = 0; i < (int)system_tokens.size(); ++i) { - llama_batch_add(batch, system_tokens[i], i, {0}, false); - } - - if (llama_decode(ctx, batch) != 0) { - LOG_WARN << __func__ << ": llama_decode() failed"; - return; - } - - // assign the system KV cache to all parallel sequences - for (int32_t i = 1; i < params.n_parallel; ++i) { - llama_kv_cache_seq_cp(ctx, 0, i, 0, system_tokens.size()); - } - - LOG_DEBUG << "system prompt updated"; - system_need_update = false; - } - - void notify_system_prompt_changed() { - // release all slots - for (llama_client_slot& slot : slots) { 
- slot.release(); - } - - system_need_update = true; - } - - void process_system_prompt_data(const json& sys_props) { - system_prompt = sys_props.value("prompt", ""); - name_user = sys_props.value("anti_prompt", ""); - name_assistant = sys_props.value("assistant_name", ""); - - if (slots.size() > 0) { - notify_system_prompt_changed(); - } - } - - static size_t find_stopping_strings(const std::string& text, - const size_t last_token_size, - const stop_type type, - llama_client_slot& slot) { - size_t stop_pos = std::string::npos; - - for (const std::string& word : slot.params.antiprompt) { - size_t pos; - if (type == STOP_FULL) { - const size_t tmp = word.size() + last_token_size; - const size_t from_pos = text.size() > tmp ? text.size() - tmp : 0; - pos = text.find(word, from_pos); - } else { - pos = find_partial_stop_string(word, text); - } - if (pos != std::string::npos && - (stop_pos == std::string::npos || pos < stop_pos)) { - if (type == STOP_FULL) { - slot.stopped_word = true; - slot.stopping_word = word; - slot.has_next_token = false; - } - stop_pos = pos; - } - } - - return stop_pos; - } - - bool process_token(completion_token_output& result, llama_client_slot& slot) { - // remember which tokens were sampled - used for repetition penalties during - // sampling - const std::string token_str = llama_token_to_piece(ctx, result.tok); - slot.sampled = result.tok; - - // search stop word and delete it - slot.generated_text += token_str; - slot.has_next_token = true; - - if (slot.ctx_sampling->params.use_penalty_prompt_tokens && - result.tok != -1) { - // we can change penalty_prompt_tokens because it is always created from - // scratch each request - slot.ctx_sampling->params.penalty_prompt_tokens.push_back(result.tok); - } - - // check if there is incomplete UTF-8 character at the end - bool incomplete = false; - for (unsigned i = 1; i < 5 && i <= slot.generated_text.size(); ++i) { - unsigned char c = slot.generated_text[slot.generated_text.size() - i]; - if ((c & 0xC0) == 0x80) { - // continuation byte: 10xxxxxx - continue; - } - if ((c & 0xE0) == 0xC0) { - // 2-byte character: 110xxxxx ... - incomplete = i < 2; - } else if ((c & 0xF0) == 0xE0) { - // 3-byte character: 1110xxxx ... - incomplete = i < 3; - } else if ((c & 0xF8) == 0xF0) { - // 4-byte character: 11110xxx ... 
- incomplete = i < 4; - } - // else 1-byte character or invalid byte - break; - } - - if (!incomplete) { - size_t pos = std::min(slot.sent_count, slot.generated_text.size()); - const std::string str_test = slot.generated_text.substr(pos); - bool is_stop_full = false; - size_t stop_pos = - find_stopping_strings(str_test, token_str.size(), STOP_FULL, slot); - if (stop_pos != std::string::npos) { - is_stop_full = true; - slot.generated_text.erase(slot.generated_text.begin() + pos + stop_pos, - slot.generated_text.end()); - pos = std::min(slot.sent_count, slot.generated_text.size()); - } else { - is_stop_full = false; - stop_pos = find_stopping_strings(str_test, token_str.size(), - STOP_PARTIAL, slot); - } - - // check if there is any token to predict - if (stop_pos == std::string::npos || - (!slot.has_next_token && !is_stop_full && stop_pos > 0)) { - // no send the stop word in the response - result.text_to_send = - slot.generated_text.substr(pos, std::string::npos); - slot.sent_count += result.text_to_send.size(); - // add the token to slot queue and cache - } - slot.add_token_string(result); - if (slot.params.stream) { - send_partial_response(slot, result); - } - } - - if (incomplete) { - slot.has_next_token = true; - } - - // check the limits - if (slot.n_decoded > 2 && slot.has_next_token && !slot.has_budget(params)) { - slot.stopped_limit = true; - slot.has_next_token = false; - } - - if (!slot.cache_tokens.empty() && result.tok == llama_token_eos(model)) { - slot.stopped_eos = true; - slot.has_next_token = false; - LOG_VERBOSE("eos token found", {}); - } - - LOG_VERBOSE( - "next token", - { - {"token", result.tok}, - {"token_text", tokens_to_output_formatted_string(ctx, result.tok)}, - {"has_next_token", slot.has_next_token}, - {"n_remain", slot.n_remaining}, - {"num_tokens_predicted", slot.n_decoded}, - {"stopped_eos", slot.stopped_eos}, - {"stopped_word", slot.stopped_word}, - {"stopped_limit", slot.stopped_limit}, - {"stopping_word", slot.stopping_word}, - }); - - return slot.has_next_token; // continue - } - bool process_images(llama_client_slot& slot) const { - for (slot_image& img : slot.images) { - if (!img.request_encode_image) { - continue; - } - - if (!llava_image_embed_make_with_clip_img( - clp_ctx, params.n_threads, img.img_data, &img.image_embedding, - &img.image_tokens)) { - LOG_DEBUG << "Error processing the given image"; - return false; - } - - img.request_encode_image = false; - } - - return slot.images.size() > 0; - } - void send_error(task_server& task, std::string error) { - std::unique_lock lock(mutex_results); - task_result res; - res.id = task.id; - res.multitask_id = task.multitask_id; - res.stop = false; - res.error = true; - res.result_json = {{"content", error}}; - queue_results.push_back(res); - condition_results.notify_all(); - } - - void add_multi_task(int id, std::vector& sub_ids) { - std::lock_guard lock(mutex_tasks); - task_multi multi; - multi.id = id; - std::copy(sub_ids.begin(), sub_ids.end(), - std::inserter(multi.subtasks_remaining, - multi.subtasks_remaining.end())); - queue_multitasks.push_back(multi); - condition_tasks.notify_one(); - } - - void update_multi_task(int multitask_id, int subtask_id, - task_result& result) { - std::lock_guard lock(mutex_tasks); - for (auto& multitask : queue_multitasks) { - if (multitask.id == multitask_id) { - multitask.subtasks_remaining.erase(subtask_id); - multitask.results.push_back(result); - condition_tasks.notify_one(); - } - } - } - - json get_model_props() { return get_formated_generation(slots[0]); } - - 
json get_formated_generation(llama_client_slot& slot) { - const auto eos_bias = slot.sparams.logit_bias.find(llama_token_eos(model)); - const bool ignore_eos = eos_bias != slot.sparams.logit_bias.end() && - eos_bias->second < 0.0f && - std::isinf(eos_bias->second); - return json{ - {"n_ctx", slot.n_ctx}, - {"model", params.model_alias}, - {"seed", slot.params.seed}, - {"temperature", slot.sparams.temp}, - {"top_k", slot.sparams.top_k}, - {"top_p", slot.sparams.top_p}, - {"min_p", slot.sparams.min_p}, - {"tfs_z", slot.sparams.tfs_z}, - {"typical_p", slot.sparams.typical_p}, - {"repeat_last_n", slot.sparams.penalty_last_n}, - {"repeat_penalty", slot.sparams.penalty_repeat}, - {"presence_penalty", slot.sparams.penalty_present}, - {"frequency_penalty", slot.sparams.penalty_freq}, - {"penalty_prompt_tokens", slot.sparams.penalty_prompt_tokens}, - {"use_penalty_prompt_tokens", slot.sparams.use_penalty_prompt_tokens}, - {"mirostat", slot.sparams.mirostat}, - {"mirostat_tau", slot.sparams.mirostat_tau}, - {"mirostat_eta", slot.sparams.mirostat_eta}, - {"penalize_nl", slot.sparams.penalize_nl}, - {"stop", slot.params.antiprompt}, - {"n_predict", slot.params.n_predict}, - {"n_keep", params.n_keep}, - {"ignore_eos", ignore_eos}, - {"stream", slot.params.stream}, - {"logit_bias", slot.sparams.logit_bias}, - {"n_probs", slot.sparams.n_probs}, - {"grammar", slot.sparams.grammar}, - }; - } - - void send_partial_response(llama_client_slot& slot, - completion_token_output tkn) { - std::unique_lock lock(mutex_results); - task_result res; - res.id = slot.task_id; - res.multitask_id = slot.multitask_id; - res.error = false; - res.stop = false; - - res.result_json = json{{"content", tkn.text_to_send}, - {"stop", false}, - {"slot_id", slot.id}, - {"multimodal", multimodal}}; - - if (slot.sparams.n_probs > 0) { - std::vector probs_output = {}; - const std::vector to_send_toks = - llama_tokenize(ctx, tkn.text_to_send, false); - size_t probs_pos = std::min(slot.sent_token_probs_index, - slot.generated_token_probs.size()); - size_t probs_stop_pos = - std::min(slot.sent_token_probs_index + to_send_toks.size(), - slot.generated_token_probs.size()); - if (probs_pos < probs_stop_pos) { - probs_output = std::vector( - slot.generated_token_probs.begin() + probs_pos, - slot.generated_token_probs.begin() + probs_stop_pos); - } - slot.sent_token_probs_index = probs_stop_pos; - res.result_json["completion_probabilities"] = - probs_vector_to_json(ctx, probs_output); - } - - if (slot.oaicompat) { - res.result_json["oaicompat_token_ctr"] = slot.n_decoded; - res.result_json["model"] = slot.oaicompat_model; - } - - queue_results.push_back(res); - condition_results.notify_all(); - } - - void send_final_response(llama_client_slot& slot) { - std::unique_lock lock(mutex_results); - task_result res; - res.id = slot.task_id; - res.multitask_id = slot.multitask_id; - res.error = false; - res.stop = true; - - res.result_json = - json{{"content", !slot.params.stream ? 
slot.generated_text : ""}, - {"slot_id", slot.id}, - {"stop", true}, - {"model", params.model_alias}, - {"tokens_predicted", slot.n_decoded}, - {"tokens_evaluated", slot.num_prompt_tokens}, - {"generation_settings", get_formated_generation(slot)}, - {"prompt", slot.prompt}, - {"truncated", slot.truncated}, - {"stopped_eos", slot.stopped_eos}, - {"stopped_word", slot.stopped_word}, - {"stopped_limit", slot.stopped_limit}, - {"stopping_word", slot.stopping_word}, - {"tokens_cached", slot.n_past}, - {"timings", slot.get_formated_timings()}}; - - if (slot.sparams.n_probs > 0) { - std::vector probs = {}; - if (!slot.params.stream && slot.stopped_word) { - const std::vector stop_word_toks = - llama_tokenize(ctx, slot.stopping_word, false); - probs = std::vector( - slot.generated_token_probs.begin(), - slot.generated_token_probs.end() - stop_word_toks.size()); - } else { - probs = std::vector( - slot.generated_token_probs.begin(), - slot.generated_token_probs.begin() + slot.sent_token_probs_index); - } - res.result_json["completion_probabilities"] = - probs_vector_to_json(ctx, probs); - } - - if (slot.oaicompat) { - res.result_json["oaicompat_token_ctr"] = slot.n_decoded; - res.result_json["model"] = slot.oaicompat_model; - } - - // parent multitask, if any, needs to be updated - if (slot.multitask_id != -1) { - update_multi_task(slot.multitask_id, slot.task_id, res); - } - - queue_results.push_back(res); - condition_results.notify_all(); - } - - void send_embedding(llama_client_slot& slot) { - std::unique_lock lock(mutex_results); - task_result res; - res.id = slot.task_id; - res.multitask_id = slot.multitask_id; - res.error = false; - res.stop = true; - - const int n_embd = llama_n_embd(model); - - std::vector embd_res(n_embd, 0.0f); - - for (int i = 0; i < batch.n_tokens; ++i) { - if (!batch.logits[i] || batch.seq_id[i][0] != slot.id) { - continue; - } - - const float* embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]); - if (embd == NULL) { - embd = llama_get_embeddings_ith(ctx, i); - } - - if (embd == NULL) { - LOG_ERROR << "failed to get embeddings" - << " token " << batch.token[i] << ", seq_id " - << batch.seq_id[i][0]; - - res.result_json = json{ - {"embedding", std::vector(n_embd, 0.0f)}, - }; - - continue; - } - - llama_embd_normalize(embd, embd_res.data(), n_embd); - } - res.result_json = json{ - {"embedding", embd_res}, - }; - - queue_results.push_back(res); - condition_results.notify_all(); - } - - int request_completion(json data, bool infill, bool embedding, - int multitask_id) { - std::unique_lock lock(mutex_tasks); - task_server task; - task.id = id_gen++; - task.target_id = 0; - task.data = std::move(data); - task.infill_mode = infill; - task.embedding_mode = embedding; - task.type = COMPLETION_TASK; - task.multitask_id = multitask_id; - - // when a completion task's prompt array is not a singleton, we split it - // into multiple requests - if (task.data.at("prompt").size() > 1) { - lock.unlock(); // entering new func scope - return split_multiprompt_task(task); - } - - // otherwise, it's a single-prompt task, we actually queue it - queue_tasks.push_back(task); - condition_tasks.notify_one(); - return task.id; - } - - task_result next_result(int task_id) { - while (true) { - std::unique_lock lock(mutex_results); - condition_results.wait(lock, [&] { return !queue_results.empty(); }); - - for (int i = 0; i < (int)queue_results.size(); i++) { - // for now, tasks that have associated parent multitasks just get erased - // once multitask picks up the result - if 
(queue_results[i].multitask_id == task_id) { - update_multi_task(task_id, queue_results[i].id, queue_results[i]); - queue_results.erase(queue_results.begin() + i); - continue; - } - - if (queue_results[i].id == task_id) { - if (queue_results[i].multitask_id != -1) { - LOG_ERROR_LLAMA("Incorrect multitask ID", {{"task_id", task_id}}); - } - task_result res = queue_results[i]; - queue_results.erase(queue_results.begin() + i); - return res; - } - } - } - - // never reached - // return task_result{-1, false, false, {}}; - } - - // for multiple images processing - bool ingest_images(llama_client_slot& slot, int n_batch) { - int image_idx = 0; - - while (image_idx < (int)slot.images.size()) { - slot_image& img = slot.images[image_idx]; - - // process prefix prompt - for (int32_t i = 0; i < (int32_t)batch.n_tokens; i += n_batch) { - const int32_t n_tokens = - std::min(n_batch, (int32_t)(batch.n_tokens - i)); - llama_batch batch_view = { - n_tokens, - batch.token + i, - nullptr, - batch.pos + i, - batch.n_seq_id + i, - batch.seq_id + i, - batch.logits + i, - 0, - 0, - 0, // unused - }; - if (llama_decode(ctx, batch_view)) { - LOG_DEBUG << __func__ << " : failed to eval\n"; - return false; - } - } - - // process image with llm - for (int i = 0; i < img.image_tokens; i += n_batch) { - int n_eval = img.image_tokens - i; - if (n_eval > n_batch) { - n_eval = n_batch; - } - - const int n_embd = llama_n_embd(model); - llama_batch batch_img = { - n_eval, nullptr, (img.image_embedding + i * n_embd), - nullptr, nullptr, nullptr, - nullptr, slot.n_past, 1, - 0, - }; - if (llama_decode(ctx, batch_img)) { - LOG_DEBUG << __func__ << " : failed to eval image"; - return false; - } - slot.n_past += n_eval; - } - image_idx++; - - llama_batch_clear(batch); - - // append prefix of next image - const auto json_prompt = - (image_idx >= (int)slot.images.size()) - ? slot.params.input_suffix - : // no more images, then process suffix prompt - (json)(slot.images[image_idx].prefix_prompt); - - std::vector append_tokens = - tokenize(json_prompt, false); // has next image - for (int i = 0; i < (int)append_tokens.size(); ++i) { - llama_batch_add(batch, append_tokens[i], slot.n_past, {slot.id}, true); - slot.n_past += 1; - } - } - - return true; - } - - void request_cancel(int task_id) { - std::unique_lock lock(mutex_tasks); - task_server task; - task.id = id_gen++; - task.type = CANCEL_TASK; - task.target_id = task_id; - queue_tasks.push_back(task); - condition_tasks.notify_one(); - } - - int split_multiprompt_task(task_server& multiprompt_task) { - int prompt_count = multiprompt_task.data.at("prompt").size(); - assert(prompt_count > 1); - - int multitask_id = id_gen++; - std::vector subtask_ids(prompt_count); - for (int i = 0; i < prompt_count; i++) { - json subtask_data = multiprompt_task.data; - subtask_data["prompt"] = subtask_data["prompt"][i]; - - // subtasks inherit everything else (infill mode, embedding mode, etc.) 
- subtask_ids[i] = - request_completion(subtask_data, multiprompt_task.infill_mode, - multiprompt_task.embedding_mode, multitask_id); - } - - // queue up the multitask so we can track its subtask progression - add_multi_task(multitask_id, subtask_ids); - return multitask_id; - } - - void process_tasks() { - std::unique_lock lock(mutex_tasks); - while (!queue_tasks.empty()) { - task_server task = queue_tasks.front(); - queue_tasks.erase(queue_tasks.begin()); - switch (task.type) { - case COMPLETION_TASK: { - llama_client_slot* slot = - get_slot(json_value(task.data, "slot_id", -1)); - if (slot == nullptr) { - LOG_DEBUG << "slot unavailable"; - // send error result - send_error(task, "slot unavailable"); - return; - } - - if (task.data.contains("system_prompt")) { - process_system_prompt_data(task.data["system_prompt"]); - } - - slot->reset(); - - slot->infill = task.infill_mode; - slot->embedding = task.embedding_mode; - slot->task_id = task.id; - slot->multitask_id = task.multitask_id; - - if (!launch_slot_with_data(slot, task.data)) { - // send error result - send_error(task, "internal_error"); - break; - } - } break; - case CANCEL_TASK: { // release slot linked with the task id - for (auto& slot : slots) { - if (slot.task_id == task.target_id) { - slot.release(); - break; - } - } - } break; - } - } - - // remove finished multitasks from the queue of multitasks, and add the - // corresponding result to the result queue - auto queue_iterator = queue_multitasks.begin(); - while (queue_iterator != queue_multitasks.end()) { - if (queue_iterator->subtasks_remaining.empty()) { - // all subtasks done == multitask is done - task_result aggregate_result; - aggregate_result.id = queue_iterator->id; - aggregate_result.stop = true; - aggregate_result.error = false; - - // collect json results into one json result - std::vector result_jsons; - for (auto& subres : queue_iterator->results) { - result_jsons.push_back(subres.result_json); - aggregate_result.error = aggregate_result.error && subres.error; - } - aggregate_result.result_json = json{"results", result_jsons}; - - std::lock_guard lock(mutex_results); - queue_results.push_back(aggregate_result); - condition_results.notify_all(); - - queue_iterator = queue_multitasks.erase(queue_iterator); - } else { - ++queue_iterator; - } - } - } - - bool update_slots() { - // attend tasks - process_tasks(); - - // update the system prompt wait until all slots are idle state - if (system_need_update && all_slots_are_idle) { - LOG_DEBUG << "updating system prompt"; - update_system_prompt(); - } - - llama_batch_clear(batch); - - if (all_slots_are_idle) { - if (system_prompt.empty() && clean_kv_cache) { - LOG_DEBUG - << "all slots are idle and system prompt is empty, clear the KV " - "cache"; - kv_cache_clear(); - } - // std::this_thread::sleep_for(std::chrono::milliseconds(5)); - // TODO: Need to implement queueing using CV for better performance - std::unique_lock lock(mutex_tasks); - condition_tasks.wait(lock, [&] { - return (!queue_tasks.empty() && model_loaded_external) || - (queue_tasks.empty() && !model_loaded_external); - }); - } - - for (llama_client_slot& slot : slots) { - if (slot.is_processing() && - slot.cache_tokens.size() >= (size_t)slot.n_ctx) { - // Shift context - const int n_left = slot.n_past - slot.params.n_keep - 1; - const int n_discard = n_left / 2; - - LOG_DEBUG << "slot " << slot.id - << " context shift - n_keep = " << slot.params.n_keep - << ", n_left = " << n_left << ", n_discard: " << n_discard; - llama_kv_cache_seq_rm(ctx, slot.id, 
slot.params.n_keep + 1, - slot.params.n_keep + n_discard + 1); - llama_kv_cache_seq_add(ctx, slot.id, slot.params.n_keep + 1 + n_discard, - slot.n_past, -n_discard); - - for (size_t i = slot.params.n_keep + 1 + n_discard; - i < slot.cache_tokens.size(); i++) { - slot.cache_tokens[i - n_discard] = slot.cache_tokens[i]; - } - - slot.cache_tokens.resize(slot.cache_tokens.size() - n_discard); - - slot.n_past -= n_discard; - - slot.truncated = true; - - LOG_VERBOSE("context shift", { - {"n_ctx", n_ctx}, - {"n_keep", params.n_keep}, - {"n_left", n_left}, - }); - } - } - - // decode any currently ongoing sequences - for (auto& slot : slots) { - // release the slot - if (slot.command == RELEASE) { - slot.state = IDLE; - slot.command = NONE; - slot.t_last_used = ggml_time_us(); - - LOG_DEBUG << "slot " << slot.id << " released (" - << (int)slot.cache_tokens.size() << " tokens in cache)"; - - continue; - } - - if (slot.state == IDLE) { - continue; - } - - slot.i_batch = batch.n_tokens; - - llama_batch_add(batch, slot.sampled, system_tokens.size() + slot.n_past, - {slot.id}, true); - - slot.n_decoded += 1; - slot.n_past += 1; - } - - // process in chunks of params.n_batch - int32_t n_batch = params.n_batch; - - // assign workload to the slots - if (params.cont_batching || batch.n_tokens == 0) { - for (auto& slot : slots) { - const bool has_prompt = slot.prompt.is_array() || - (slot.prompt.is_string() && - !slot.prompt.get().empty()) || - !slot.images.empty(); - - // empty prompt passed -> release the slot and send empty response - if (slot.state == IDLE && slot.command == LOAD_PROMPT && !has_prompt) { - slot.release(); - slot.print_timings(); - send_final_response(slot); - continue; - } - - // need process the prompt - if (slot.state == IDLE && slot.command == LOAD_PROMPT) { - slot.state = PROCESSING; - slot.command = NONE; - std::vector prompt_tokens; - slot.t_start_process_prompt = ggml_time_us(); - slot.t_start_genereration = 0; - - if (slot.infill) { - bool suff_rm_leading_spc = true; - if (params.input_suffix.find_first_of(' ') == 0 && - params.input_suffix.size() > 1) { - params.input_suffix.erase(0, 1); - suff_rm_leading_spc = false; - } - auto prefix_tokens = tokenize(slot.params.input_prefix, false); - auto suffix_tokens = tokenize(slot.params.input_suffix, false); - - const int space_token = - 29871; // TODO: this should not be hardcoded - if (suff_rm_leading_spc && !suffix_tokens.empty() && - suffix_tokens[0] == space_token) { - suffix_tokens.erase(suffix_tokens.begin()); - } - - prefix_tokens.insert(prefix_tokens.begin(), - llama_token_prefix(model)); - prefix_tokens.insert(prefix_tokens.begin(), - llama_token_bos(model)); // always add BOS - prefix_tokens.insert(prefix_tokens.end(), - llama_token_suffix(model)); - prefix_tokens.insert(prefix_tokens.end(), suffix_tokens.begin(), - suffix_tokens.end()); - prefix_tokens.push_back(llama_token_middle(model)); - prompt_tokens = prefix_tokens; - } else { - prompt_tokens = tokenize( - slot.prompt, - system_prompt.empty() && - add_bos_token); // add BOS if there isn't system prompt - } - - slot.num_prompt_tokens = prompt_tokens.size(); - - if (slot.params.n_keep < 0) { - slot.params.n_keep = slot.num_prompt_tokens; - } - slot.params.n_keep = std::min(slot.n_ctx - 4, slot.params.n_keep); - - // if input prompt is too big, truncate it - if (slot.num_prompt_tokens >= slot.n_ctx) { - const int n_left = slot.n_ctx - slot.params.n_keep; - const int n_block_size = n_left / 2; - const int erased_blocks = - (slot.num_prompt_tokens - slot.params.n_keep - 
n_block_size) / - n_block_size; - - std::vector new_tokens( - prompt_tokens.begin(), - prompt_tokens.begin() + slot.params.n_keep); - new_tokens.insert(new_tokens.end(), - prompt_tokens.begin() + slot.params.n_keep + - erased_blocks * n_block_size, - prompt_tokens.end()); - - LOG_VERBOSE( - "input truncated", - { - {"n_ctx", slot.n_ctx}, - {"n_keep", slot.params.n_keep}, - {"n_left", n_left}, - {"new_tokens", tokens_to_str(ctx, new_tokens.cbegin(), - new_tokens.cend())}, - }); - slot.truncated = true; - prompt_tokens = new_tokens; - - slot.num_prompt_tokens = prompt_tokens.size(); - GGML_ASSERT(slot.num_prompt_tokens < slot.n_ctx); - } - - if (!slot.params.cache_prompt) { - llama_sampling_reset(slot.ctx_sampling); - - slot.n_past = 0; - slot.num_prompt_tokens_processed = slot.num_prompt_tokens; - } else { - // push the prompt into the sampling context (do not apply grammar) - for (auto& token : prompt_tokens) { - llama_sampling_accept(slot.ctx_sampling, ctx, token, false); - } - - slot.n_past = common_part(slot.cache_tokens, prompt_tokens); - slot.num_prompt_tokens_processed = - slot.num_prompt_tokens - slot.n_past; - - LOG_DEBUG << "slot " << slot.id << " : in cache: " << slot.n_past - << " tokens | to process: " - << slot.num_prompt_tokens_processed << " tokens"; - } - - LOG_DEBUG << "slot " << slot.id << " : kv cache rm - [" - << (int)system_tokens.size() + slot.n_past << ", end)"; - - llama_kv_cache_seq_rm(ctx, slot.id, - system_tokens.size() + slot.n_past, -1); - - slot.cache_tokens = prompt_tokens; - - if (slot.n_past == slot.num_prompt_tokens) { - // we have to evaluate at least 1 token to generate logits. - LOG_DEBUG << "slot " << slot.id - << " : we have to evaluate at least 1 token to " - "generate logits"; - slot.n_past--; - } - - LOG_VERBOSE( - "prompt ingested", - { - {"n_past", slot.n_past}, - {"cached", - tokens_to_str(ctx, slot.cache_tokens.cbegin(), - slot.cache_tokens.cbegin() + slot.n_past)}, - {"to_eval", - tokens_to_str(ctx, slot.cache_tokens.cbegin() + slot.n_past, - slot.cache_tokens.cend())}, - }); - - const bool has_images = process_images(slot); - - // process the prefix of first image - std::vector prefix_tokens = - has_images ? 
tokenize(slot.images[0].prefix_prompt, add_bos_token) - : prompt_tokens; - for (; slot.n_past < (int)prefix_tokens.size(); ++slot.n_past) { - llama_batch_add(batch, prefix_tokens[slot.n_past], - system_tokens.size() + slot.n_past, {slot.id}, - false); - } - - if (has_images && !ingest_images(slot, n_batch)) { - LOG_DEBUG << "failed processing images"; - return false; - } - - // extract the logits only for the last token - if (batch.n_tokens > 0) { - batch.logits[batch.n_tokens - 1] = true; - } - - slot.n_decoded = 0; - slot.i_batch = batch.n_tokens - 1; - } - } - } - - if (batch.n_tokens == 0) { - all_slots_are_idle = true; - return true; - } - - for (int32_t i = 0; i < (int32_t)batch.n_tokens; i += n_batch) { - const int32_t n_tokens = std::min(n_batch, (int32_t)(batch.n_tokens - i)); - llama_batch batch_view = { - n_tokens, - batch.token + i, - nullptr, - batch.pos + i, - batch.n_seq_id + i, - batch.seq_id + i, - batch.logits + i, - 0, - 0, - 0, // unused - }; - - const int ret = llama_decode(ctx, batch_view); - if (ret != 0) { - if (n_batch == 1 || ret < 0) { - // if you get here, it means the KV cache is full - try increasing it - // via the context size - LOG_DEBUG << __func__ - << " : failed to decode the batch, n_batch = " << n_batch - << ", ret = " << ret; - return false; - } - - LOG_DEBUG - << __func__ - << " : failed to find free space in the KV cache, retrying with " - "smaller n_batch = " - << n_batch / 2; - - // retry with half the batch size to try to find a free slot in the KV - // cache - n_batch /= 2; - i -= n_batch; - continue; - } - - for (auto& slot : slots) { - if (slot.i_batch < (int)i || slot.i_batch >= (int)(i + n_tokens)) { - continue; - } - - // prompt evaluated for embedding - if (slot.embedding) { - send_embedding(slot); - slot.release(); - slot.i_batch = -1; - return true; - } - - completion_token_output result; - const llama_token id = llama_sampling_sample(slot.ctx_sampling, ctx, - NULL, slot.i_batch - i); - - llama_sampling_accept(slot.ctx_sampling, ctx, id, true); - - if (slot.n_decoded == 1) { - slot.t_start_genereration = ggml_time_us(); - slot.t_prompt_processing = - (slot.t_start_genereration - slot.t_start_process_prompt) / 1e3; - } - - llama_token_data_array cur_p = {slot.ctx_sampling->cur.data(), - slot.ctx_sampling->cur.size(), false}; - result.tok = id; - - const int32_t n_probs = slot.sparams.n_probs; - if (slot.sparams.temp <= 0 && n_probs > 0) { - // for llama_sample_token_greedy we need to sort candidates - llama_sample_softmax(ctx, &cur_p); - } - - for (size_t i = 0; i < std::min(cur_p.size, (size_t)n_probs); ++i) { - result.probs.push_back({cur_p.data[i].id, cur_p.data[i].p}); - } - - if (!process_token(result, slot)) { - slot.release(); - slot.print_timings(); - send_final_response(slot); - } - - slot.i_batch = -1; - } - } - return true; - } -}; - -static void server_print_usage(const char* argv0, const gpt_params& params, - const server_params& sparams) { - printf("usage: %s [options]\n", argv0); - printf("\n"); - printf("options:\n"); - printf(" -h, --help show this help message and exit\n"); - printf(" -v, --verbose verbose output (default: %s)\n", - server_verbose ? 
"enabled" : "disabled"); - printf( - " -t N, --threads N number of threads to use during " - "computation (default: %d)\n", - params.n_threads); - printf( - " -tb N, --threads-batch N number of threads to use during batch " - "and prompt processing (default: same as --threads)\n"); - printf( - " -c N, --ctx-size N size of the prompt context (default: " - "%d)\n", - params.n_ctx); - printf(" --rope-scaling {none,linear,yarn}\n"); - printf( - " RoPE frequency scaling method, defaults " - "to linear unless specified by the model\n"); - printf( - " --rope-freq-base N RoPE base frequency (default: loaded " - "from model)\n"); - printf( - " --rope-freq-scale N RoPE frequency scaling factor, expands " - "context by a factor of 1/N\n"); - printf( - " --yarn-ext-factor N YaRN: extrapolation mix factor (default: " - "1.0, 0.0 = full interpolation)\n"); - printf( - " --yarn-attn-factor N YaRN: scale sqrt(t) or attention " - "magnitude (default: 1.0)\n"); - printf( - " --yarn-beta-slow N YaRN: high correction dim or alpha " - "(default: %.1f)\n", - params.yarn_beta_slow); - printf( - " --yarn-beta-fast N YaRN: low correction dim or beta " - "(default: %.1f)\n", - params.yarn_beta_fast); - printf( - " -b N, --batch-size N batch size for prompt processing " - "(default: %d)\n", - params.n_batch); - printf( - " --memory-f32 use f32 instead of f16 for memory " - "key+value (default: disabled)\n"); - printf( - " not recommended: doubles context memory " - "required and no measurable increase in quality\n"); - if (llama_supports_mlock()) { - printf( - " --mlock force system to keep model in RAM " - "rather than swapping or compressing\n"); - } - if (llama_supports_mmap()) { - printf( - " --no-mmap do not memory-map model (slower load " - "but may reduce pageouts if not using mlock)\n"); - } - printf( - " --numa attempt optimizations that help on some " - "NUMA systems\n"); - if (llama_supports_gpu_offload()) { - printf(" -ngl N, --n-gpu-layers N\n"); - printf(" number of layers to store in VRAM\n"); - printf(" -sm SPLIT_MODE, --split-mode SPLIT_MODE\n"); - printf( - " how to split the model across multiple " - "GPUs, one of:\n"); - printf(" - none: use one GPU only\n"); - printf( - " - layer (default): split layers and " - "KV across GPUs\n"); - printf(" - row: split rows across GPUs\n"); - printf(" -ts SPLIT --tensor-split SPLIT\n"); - printf( - " fraction of the model to offload to " - "each GPU, comma-separated list of proportions, e.g. 3,1\n"); - printf( - " -mg i, --main-gpu i the GPU to use for the model (with " - "split-mode = none),\n"); - printf( - " or for intermediate results and KV " - "(with split-mode = row)\n"); - } - printf(" -m FNAME, --model FNAME\n"); - printf(" model path (default: %s)\n", - params.model.c_str()); - printf(" -a ALIAS, --alias ALIAS\n"); - printf( - " set an alias for the model, will be " - "added as `model` field in completion response\n"); - printf( - " --lora FNAME apply LoRA adapter (implies --no-mmap)\n"); - printf( - " --lora-base FNAME optional model to use as a base for the " - "layers modified by the LoRA adapter\n"); - printf( - " --host ip address to listen (default (default: " - "%s)\n", - sparams.hostname.c_str()); - printf(" --port PORT port to listen (default (default: %d)\n", - sparams.port); - printf( - " --path PUBLIC_PATH path from which to serve static files " - "(default %s)\n", - sparams.public_path.c_str()); - printf( - " --api-key API_KEY optional api key to enhance server " - "security. 
If set, requests must include this key for access.\n"); - printf( - " --api-key-file FNAME path to file containing api keys " - "delimited by new lines. If set, requests must include one of the " - "keys for access.\n"); - printf( - " -to N, --timeout N server read/write timeout in seconds " - "(default: %d)\n", - sparams.read_timeout); - printf( - " --embedding enable embedding vector output (default: " - "%s)\n", - params.embedding ? "enabled" : "disabled"); - printf( - " -np N, --parallel N number of slots for process requests " - "(default: %d)\n", - params.n_parallel); - printf( - " -cb, --cont-batching enable continuous batching (a.k.a " - "dynamic batching) (default: disabled)\n"); - printf(" -spf FNAME, --system-prompt-file FNAME\n"); - printf( - " set a file to load a system prompt " - "(initial " - "prompt of all slots), this is useful for chat applications.\n"); - printf( - " --mmproj MMPROJ_FILE path to a multimodal projector file for " - "LLaVA.\n"); - printf(" --log-disable disables logging to a file.\n"); - printf("\n"); - printf(" --override-kv KEY=TYPE:VALUE\n"); - printf( - " advanced option to override model " - "metadata by key. may be specified multiple times.\n"); - printf( - " types: int, float, bool. example: " - "--override-kv tokenizer.ggml.add_bos_token=bool:false\n"); - printf( - " -gan N, --grp-attn-n N set the group attention factor to extend " - "context size through self-extend(default: 1=disabled), used together " - "with group attention width `--grp-attn-w`"); - printf( - " -gaw N, --grp-attn-w N set the group attention width to extend " - "context size through self-extend(default: 512), used together with " - "group attention factor `--grp-attn-n`"); - printf("\n"); -} -static std::string random_string() { - static const std::string str( - "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"); - - std::random_device rd; - std::mt19937 generator(rd()); - - std::string result(32, ' '); - - for (int i = 0; i < 32; ++i) { - result[i] = str[generator() % str.size()]; - } - - return result; -} - -static std::string gen_chatcmplid() { - std::stringstream chatcmplid; - chatcmplid << "chatcmpl-" << random_string(); - return chatcmplid.str(); -} -static json format_final_response_oaicompat(const json& request, - const task_result& response, - bool streaming = false) { - json result = response.result_json; - - bool stopped_word = result.count("stopped_word") != 0; - bool stopped_eos = json_value(result, "stopped_eos", false); - int num_tokens_predicted = json_value(result, "tokens_predicted", 0); - int num_prompt_tokens = json_value(result, "tokens_evaluated", 0); - std::string content = json_value(result, "content", std::string("")); - - std::string finish_reason = "length"; - if (stopped_word || stopped_eos) { - finish_reason = "stop"; - } - - json choices = - streaming ? json::array({json{{"finish_reason", finish_reason}, - {"index", 0}, - {"delta", json::object()}}}) - : json::array({json{{"finish_reason", finish_reason}, - {"index", 0}, - {"message", json{{"content", content}, - {"role", "assistant"}}}}}); - - std::time_t t = std::time(0); - - json res = - json{{"choices", choices}, - {"created", t}, - {"model", - json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))}, - {"object", streaming ? 
"chat.completion.chunk" : "chat.completion"}, - {"usage", - json{{"completion_tokens", num_tokens_predicted}, - {"prompt_tokens", num_prompt_tokens}, - {"total_tokens", num_tokens_predicted + num_prompt_tokens}}}, - {"id", gen_chatcmplid()}}; - - if (server_verbose) { - res["__verbose"] = result; - } - - if (result.contains("completion_probabilities")) { - res["completion_probabilities"] = - json_value(result, "completion_probabilities", json::array()); - } - - return res; -} - -// return value is vector as there is one case where we might need to generate -// two responses -static std::vector format_partial_response_oaicompat( - const task_result& response) { - json result = response.result_json; - - if (!result.contains("model") || !result.contains("oaicompat_token_ctr")) { - return std::vector({response.result_json}); - } - - bool first = json_value(result, "oaicompat_token_ctr", 0) == 0; - std::string modelname = - json_value(result, "model", std::string(DEFAULT_OAICOMPAT_MODEL)); - - bool stopped_word = json_value(result, "stopped_word", false); - bool stopped_eos = json_value(result, "stopped_eos", false); - bool stopped_limit = json_value(result, "stopped_limit", false); - std::string content = json_value(result, "content", std::string("")); - - std::string finish_reason; - if (stopped_word || stopped_eos) { - finish_reason = "stop"; - } - if (stopped_limit) { - finish_reason = "length"; - } - - std::time_t t = std::time(0); - - json choices; - - if (!finish_reason.empty()) { - choices = json::array({json{{"finish_reason", finish_reason}, - {"index", 0}, - {"delta", json::object()}}}); - } else { - if (first) { - if (content.empty()) { - choices = json::array({json{{"finish_reason", nullptr}, - {"index", 0}, - {"delta", json{{"role", "assistant"}}}}}); - } else { - // We have to send this as two updates to conform to openai behavior - json initial_ret = - json{{"choices", - json::array({json{{"finish_reason", nullptr}, - {"index", 0}, - {"delta", json{{"role", "assistant"}}}}})}, - {"created", t}, - {"id", gen_chatcmplid()}, - {"model", modelname}, - {"object", "chat.completion.chunk"}}; - - json second_ret = - json{{"choices", - json::array({json{{"finish_reason", nullptr}, - {"index", 0}, - {"delta", json{{"content", content}}}}})}, - {"created", t}, - {"id", gen_chatcmplid()}, - {"model", modelname}, - {"object", "chat.completion.chunk"}}; - - return std::vector({initial_ret, second_ret}); - } - } else { - // Some idiosyncrasy in task processing logic makes several trailing calls - // with empty content, we ignore these at the calee site. 
- if (content.empty()) { - return std::vector({json::object()}); - } - - choices = json::array({json{ - {"finish_reason", nullptr}, - {"index", 0}, - {"delta", - json{ - {"content", content}, - }}, - }}); - } - } - - json ret = json{{"choices", choices}, - {"created", t}, - {"id", gen_chatcmplid()}, - {"model", modelname}, - {"object", "chat.completion.chunk"}}; - - return std::vector({ret}); -} - -static json format_partial_response( - llama_server_context& llama, llama_client_slot* slot, - const std::string& content, - const std::vector& probs) { - json res = json{{"content", content}, - {"stop", false}, - {"slot_id", slot->id}, - {"multimodal", llama.multimodal}}; - - if (slot->sparams.n_probs > 0) { - res["completion_probabilities"] = probs_vector_to_json(llama.ctx, probs); - } - - return res; -} - -static json format_tokenizer_response(const std::vector& tokens) { - return json{{"tokens", tokens}}; -} - -static json format_detokenized_response(std::string content) { - return json{{"content", content}}; -} - -struct token_translator { - llama_context* ctx; - std::string operator()(llama_token tok) const { - return llama_token_to_piece(ctx, tok); - } - std::string operator()(const completion_token_output& cto) const { - return (*this)(cto.tok); - } -}; - -static void append_to_generated_text_from_generated_token_probs( - llama_server_context& llama, llama_client_slot* slot) { - auto& gtps = slot->generated_token_probs; - auto translator = token_translator{llama.ctx}; - auto add_strlen = [=](size_t sum, const completion_token_output& cto) { - return sum + translator(cto).size(); - }; - const size_t len = - std::accumulate(gtps.begin(), gtps.end(), size_t(0), add_strlen); - if (slot->generated_text.capacity() < slot->generated_text.size() + len) { - slot->generated_text.reserve(slot->generated_text.size() + len); - } - for (const completion_token_output& cto : gtps) { - slot->generated_text += translator(cto); - } -} diff --git a/context/whisper_server_context.cc b/context/whisper_server_context.cc deleted file mode 100644 index a4ccbe710..000000000 --- a/context/whisper_server_context.cc +++ /dev/null @@ -1,796 +0,0 @@ -#include "whisper_server_context.h" -#include "utils/dr_wav.h" -#include -#include -#include -#include "utils/json.hpp" - -using json = nlohmann::json; - -bool read_wav(const std::string& fname, std::vector& pcmf32, - std::vector>& pcmf32s, bool stereo) { - drwav wav; - std::vector wav_data; // used for pipe input from stdin - - if (fname == "-") { - { - uint8_t buf[1024]; - while (true) { - const size_t n = fread(buf, 1, sizeof(buf), stdin); - if (n == 0) { - break; - } - wav_data.insert(wav_data.end(), buf, buf + n); - } - } - - if (drwav_init_memory(&wav, wav_data.data(), wav_data.size(), nullptr) == - false) { - fprintf(stderr, "error: failed to open WAV file from stdin\n"); - return false; - } - - fprintf(stderr, "%s: read %zu bytes from stdin\n", __func__, - wav_data.size()); - } else if (drwav_init_file(&wav, fname.c_str(), nullptr) == false) { - fprintf(stderr, "error: failed to open '%s' as WAV file\n", fname.c_str()); - return false; - } - - if (wav.channels != 1 && wav.channels != 2) { - fprintf(stderr, "%s: WAV file '%s' must be mono or stereo\n", __func__, - fname.c_str()); - return false; - } - - if (stereo && wav.channels != 2) { - fprintf(stderr, "%s: WAV file '%s' must be stereo for diarization\n", - __func__, fname.c_str()); - return false; - } - - if (wav.sampleRate != COMMON_SAMPLE_RATE) { - fprintf(stderr, "%s: WAV file '%s' must be %i kHz\n", 
__func__, - fname.c_str(), COMMON_SAMPLE_RATE / 1000); - return false; - } - - if (wav.bitsPerSample != 16) { - fprintf(stderr, "%s: WAV file '%s' must be 16-bit\n", __func__, - fname.c_str()); - return false; - } - - const uint64_t n = - wav_data.empty() - ? wav.totalPCMFrameCount - : wav_data.size() / (wav.channels * wav.bitsPerSample / 8); - - std::vector pcm16; - pcm16.resize(n * wav.channels); - drwav_read_pcm_frames_s16(&wav, n, pcm16.data()); - drwav_uninit(&wav); - - // convert to mono, float - pcmf32.resize(n); - if (wav.channels == 1) { - for (uint64_t i = 0; i < n; i++) { - pcmf32[i] = float(pcm16[i]) / 32768.0f; - } - } else { - for (uint64_t i = 0; i < n; i++) { - pcmf32[i] = float(pcm16[2 * i] + pcm16[2 * i + 1]) / 65536.0f; - } - } - - if (stereo) { - // convert to stereo, float - pcmf32s.resize(2); - - pcmf32s[0].resize(n); - pcmf32s[1].resize(n); - for (uint64_t i = 0; i < n; i++) { - pcmf32s[0][i] = float(pcm16[2 * i]) / 32768.0f; - pcmf32s[1][i] = float(pcm16[2 * i + 1]) / 32768.0f; - } - } - - return true; -} - -std::string output_str(struct whisper_context* ctx, - const whisper_params& params, - std::vector> pcmf32s) { - std::stringstream result; - const int n_segments = whisper_full_n_segments(ctx); - for (int i = 0; i < n_segments; ++i) { - const char* text = whisper_full_get_segment_text(ctx, i); - std::string speaker = ""; - - if (params.diarize && pcmf32s.size() == 2) { - const int64_t t0 = whisper_full_get_segment_t0(ctx, i); - const int64_t t1 = whisper_full_get_segment_t1(ctx, i); - speaker = estimate_diarization_speaker(pcmf32s, t0, t1); - } - - result << speaker << text << "\n"; - } - return result.str(); -} - -std::string estimate_diarization_speaker( - std::vector> pcmf32s, int64_t t0, int64_t t1, - bool id_only) { - std::string speaker = ""; - const int64_t n_samples = pcmf32s[0].size(); - - const int64_t is0 = timestamp_to_sample(t0, n_samples); - const int64_t is1 = timestamp_to_sample(t1, n_samples); - - double energy0 = 0.0f; - double energy1 = 0.0f; - - for (int64_t j = is0; j < is1; j++) { - energy0 += fabs(pcmf32s[0][j]); - energy1 += fabs(pcmf32s[1][j]); - } - - if (energy0 > 1.1 * energy1) { - speaker = "0"; - } else if (energy1 > 1.1 * energy0) { - speaker = "1"; - } else { - speaker = "?"; - } - - // printf("is0 = %lld, is1 = %lld, energy0 = %f, energy1 = %f, speaker = - // %s\n", is0, is1, energy0, energy1, speaker.c_str()); - - if (!id_only) { - speaker.insert(0, "(speaker "); - speaker.append(")"); - } - - return speaker; -} - -// 500 -> 00:05.000 -// 6000 -> 01:00.000 -std::string to_timestamp(int64_t t, bool comma) { - int64_t msec = t * 10; - int64_t hr = msec / (1000 * 60 * 60); - msec = msec - hr * (1000 * 60 * 60); - int64_t min = msec / (1000 * 60); - msec = msec - min * (1000 * 60); - int64_t sec = msec / 1000; - msec = msec - sec * 1000; - - char buf[32]; - snprintf(buf, sizeof(buf), "%02d:%02d:%02d%s%03d", (int)hr, (int)min, - (int)sec, comma ? 
"," : ".", (int)msec); - - return std::string(buf); -} - -int timestamp_to_sample(int64_t t, int n_samples) { - return (std::max)(0, (std::min)((int)n_samples - 1, - (int)((t * WHISPER_SAMPLE_RATE) / 100))); -} - -bool is_file_exist(const char* fileName) { - std::ifstream infile(fileName); - return infile.good(); -} - -void whisper_print_usage(int /*argc*/, char** argv, - const whisper_params& params) { - fprintf(stderr, "\n"); - fprintf(stderr, "usage: %s [options] \n", argv[0]); - fprintf(stderr, "\n"); - fprintf(stderr, "options:\n"); - fprintf(stderr, - " -h, --help [default] show this help " - "message and exit\n"); - fprintf(stderr, - " -t N, --threads N [%-7d] number of threads to use " - "during computation\n", - params.n_threads); - fprintf(stderr, - " -p N, --processors N [%-7d] number of processors to use " - "during computation\n", - params.n_processors); - fprintf( - stderr, - " -ot N, --offset-t N [%-7d] time offset in milliseconds\n", - params.offset_t_ms); - fprintf(stderr, - " -on N, --offset-n N [%-7d] segment index offset\n", - params.offset_n); - fprintf(stderr, - " -d N, --duration N [%-7d] duration of audio to " - "process in milliseconds\n", - params.duration_ms); - fprintf(stderr, - " -mc N, --max-context N [%-7d] maximum number of text " - "context tokens to store\n", - params.max_context); - fprintf(stderr, - " -ml N, --max-len N [%-7d] maximum segment length in " - "characters\n", - params.max_len); - fprintf(stderr, - " -sow, --split-on-word [%-7s] split on word rather than " - "on token\n", - params.split_on_word ? "true" : "false"); - fprintf(stderr, - " -bo N, --best-of N [%-7d] number of best candidates " - "to keep\n", - params.best_of); - fprintf(stderr, - " -bs N, --beam-size N [%-7d] beam size for beam search\n", - params.beam_size); - fprintf(stderr, - " -wt N, --word-thold N [%-7.2f] word timestamp " - "probability threshold\n", - params.word_thold); - fprintf(stderr, - " -et N, --entropy-thold N [%-7.2f] entropy threshold for " - "decoder fail\n", - params.entropy_thold); - fprintf(stderr, - " -lpt N, --logprob-thold N [%-7.2f] log probability threshold " - "for decoder fail\n", - params.logprob_thold); - // fprintf(stderr, " -su, --speed-up [%-7s] speed up audio by - // x2 (reduced accuracy)\n", params.speed_up ? "true" : "false"); - fprintf(stderr, - " -debug, --debug-mode [%-7s] enable debug mode (eg. dump " - "log_mel)\n", - params.debug_mode ? "true" : "false"); - fprintf(stderr, - " -tr, --translate [%-7s] translate from source " - "language to english\n", - params.translate ? "true" : "false"); - fprintf(stderr, - " -di, --diarize [%-7s] stereo audio diarization\n", - params.diarize ? "true" : "false"); - fprintf(stderr, - " -tdrz, --tinydiarize [%-7s] enable tinydiarize " - "(requires a tdrz model)\n", - params.tinydiarize ? "true" : "false"); - fprintf(stderr, - " -nf, --no-fallback [%-7s] do not use temperature " - "fallback while decoding\n", - params.no_fallback ? "true" : "false"); - fprintf(stderr, - " -ps, --print-special [%-7s] print special tokens\n", - params.print_special ? "true" : "false"); - fprintf(stderr, " -pc, --print-colors [%-7s] print colors\n", - params.print_colors ? "true" : "false"); - fprintf(stderr, - " -pr, --print-realtime [%-7s] print output in realtime\n", - params.print_realtime ? "true" : "false"); - fprintf(stderr, " -pp, --print-progress [%-7s] print progress\n", - params.print_progress ? "true" : "false"); - fprintf(stderr, - " -nt, --no-timestamps [%-7s] do not print timestamps\n", - params.no_timestamps ? 
"true" : "false"); - fprintf(stderr, - " -l LANG, --language LANG [%-7s] spoken language ('auto' for " - "auto-detect)\n", - params.language.c_str()); - fprintf(stderr, - " -dl, --detect-language [%-7s] exit after automatically " - "detecting language\n", - params.detect_language ? "true" : "false"); - fprintf(stderr, " --prompt PROMPT [%-7s] initial prompt\n", - params.prompt.c_str()); - fprintf(stderr, " -m FNAME, --model FNAME [%-7s] model path\n", - params.model.c_str()); - fprintf(stderr, - " -oved D, --ov-e-device DNAME [%-7s] the OpenVINO device used " - "for encode inference\n", - params.openvino_encode_device.c_str()); - fprintf(stderr, - " --convert, [%-7s] Convert audio to WAV, " - "requires ffmpeg on the server", - params.ffmpeg_converter ? "true" : "false"); - fprintf(stderr, "\n"); -} - -bool whisper_params_parse(int argc, char** argv, whisper_params& params) { - for (int i = 1; i < argc; i++) { - std::string arg = argv[i]; - - if (arg == "-h" || arg == "--help") { - whisper_print_usage(argc, argv, params); - exit(0); - } else if (arg == "-t" || arg == "--threads") { - params.n_threads = std::stoi(argv[++i]); - } else if (arg == "-p" || arg == "--processors") { - params.n_processors = std::stoi(argv[++i]); - } else if (arg == "-ot" || arg == "--offset-t") { - params.offset_t_ms = std::stoi(argv[++i]); - } else if (arg == "-on" || arg == "--offset-n") { - params.offset_n = std::stoi(argv[++i]); - } else if (arg == "-d" || arg == "--duration") { - params.duration_ms = std::stoi(argv[++i]); - } else if (arg == "-mc" || arg == "--max-context") { - params.max_context = std::stoi(argv[++i]); - } else if (arg == "-ml" || arg == "--max-len") { - params.max_len = std::stoi(argv[++i]); - } else if (arg == "-bo" || arg == "--best-of") { - params.best_of = std::stoi(argv[++i]); - } else if (arg == "-bs" || arg == "--beam-size") { - params.beam_size = std::stoi(argv[++i]); - } else if (arg == "-wt" || arg == "--word-thold") { - params.word_thold = std::stof(argv[++i]); - } else if (arg == "-et" || arg == "--entropy-thold") { - params.entropy_thold = std::stof(argv[++i]); - } else if (arg == "-lpt" || arg == "--logprob-thold") { - params.logprob_thold = std::stof(argv[++i]); - } - // else if (arg == "-su" || arg == "--speed-up") { params.speed_up - // = true; } - else if (arg == "-debug" || arg == "--debug-mode") { - params.debug_mode = true; - } else if (arg == "-tr" || arg == "--translate") { - params.translate = true; - } else if (arg == "-di" || arg == "--diarize") { - params.diarize = true; - } else if (arg == "-tdrz" || arg == "--tinydiarize") { - params.tinydiarize = true; - } else if (arg == "-sow" || arg == "--split-on-word") { - params.split_on_word = true; - } else if (arg == "-nf" || arg == "--no-fallback") { - params.no_fallback = true; - } else if (arg == "-fp" || arg == "--font-path") { - params.font_path = argv[++i]; - } else if (arg == "-ps" || arg == "--print-special") { - params.print_special = true; - } else if (arg == "-pc" || arg == "--print-colors") { - params.print_colors = true; - } else if (arg == "-pr" || arg == "--print-realtime") { - params.print_realtime = true; - } else if (arg == "-pp" || arg == "--print-progress") { - params.print_progress = true; - } else if (arg == "-nt" || arg == "--no-timestamps") { - params.no_timestamps = true; - } else if (arg == "-l" || arg == "--language") { - params.language = argv[++i]; - } else if (arg == "-dl" || arg == "--detect-language") { - params.detect_language = true; - } else if (arg == "--prompt") { - params.prompt = 
argv[++i]; - } else if (arg == "-m" || arg == "--model") { - params.model = argv[++i]; - } else if (arg == "-oved" || arg == "--ov-e-device") { - params.openvino_encode_device = argv[++i]; - } else if (arg == "-ng" || arg == "--no-gpu") { - params.use_gpu = false; - } else if (arg == "--convert") { - params.ffmpeg_converter = true; - } else { - fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); - whisper_print_usage(argc, argv, params); - exit(0); - } - } - - return true; -} - -void check_ffmpeg_availibility() { - int result = system("ffmpeg -version"); - - if (result == 0) { - std::cout << "ffmpeg is available." << std::endl; - } else { - // ffmpeg is not available - std::cout << "ffmpeg is not found. Please ensure that ffmpeg is installed "; - std::cout << "and that its executable is included in your system's PATH. "; - exit(0); - } -} - -bool convert_to_wav(const std::string& temp_filename, std::string& error_resp) { - std::ostringstream cmd_stream; - std::string converted_filename_temp = temp_filename + "_temp.wav"; - cmd_stream << "ffmpeg -i \"" << temp_filename - << "\" -ar 16000 -ac 1 -c:a pcm_s16le \"" - << converted_filename_temp << "\" 2>&1"; - std::string cmd = cmd_stream.str(); - - int status = std::system(cmd.c_str()); - if (status != 0) { - error_resp = "{\"error\":\"FFmpeg conversion failed.\"}"; - return false; - } - - // Remove the original file - if (remove(temp_filename.c_str()) != 0) { - error_resp = "{\"error\":\"Failed to remove the original file.\"}"; - return false; - } - - // Rename the temporary file to match the original filename - if (rename(converted_filename_temp.c_str(), temp_filename.c_str()) != 0) { - error_resp = "{\"error\":\"Failed to rename the temporary file.\"}"; - return false; - } - return true; -} - -void whisper_print_progress_callback(struct whisper_context* /*ctx*/, - struct whisper_state* /*state*/, - int progress, void* user_data) { - int progress_step = - ((whisper_print_user_data*)user_data)->params->progress_step; - int* progress_prev = &(((whisper_print_user_data*)user_data)->progress_prev); - if (progress >= *progress_prev + progress_step) { - *progress_prev += progress_step; - fprintf(stderr, "%s: progress = %3d%%\n", __func__, progress); - } -} - -void whisper_print_segment_callback(struct whisper_context* ctx, - struct whisper_state* /*state*/, int n_new, - void* user_data) { - const auto& params = *((whisper_print_user_data*)user_data)->params; - const auto& pcmf32s = *((whisper_print_user_data*)user_data)->pcmf32s; - - const int n_segments = whisper_full_n_segments(ctx); - - std::string speaker = ""; - - int64_t t0 = 0; - int64_t t1 = 0; - - // print the last n_new segments - const int s0 = n_segments - n_new; - - if (s0 == 0) { - printf("\n"); - } - - for (int i = s0; i < n_segments; i++) { - if (!params.no_timestamps || params.diarize) { - t0 = whisper_full_get_segment_t0(ctx, i); - t1 = whisper_full_get_segment_t1(ctx, i); - } - - if (!params.no_timestamps) { - printf("[%s --> %s] ", to_timestamp(t0).c_str(), - to_timestamp(t1).c_str()); - } - - if (params.diarize && pcmf32s.size() == 2) { - speaker = estimate_diarization_speaker(pcmf32s, t0, t1); - } - - if (params.print_colors) { - for (int j = 0; j < whisper_full_n_tokens(ctx, i); ++j) { - if (params.print_special == false) { - const whisper_token id = whisper_full_get_token_id(ctx, i, j); - if (id >= whisper_token_eot(ctx)) { - continue; - } - } - - const char* text = whisper_full_get_token_text(ctx, i, j); - const float p = whisper_full_get_token_p(ctx, i, j); - - 
const int col = (std::max)( - 0, (std::min)((int)k_colors.size() - 1, - (int)((std::pow)(p, 3) * float(k_colors.size())))); - - printf("%s%s%s%s", speaker.c_str(), k_colors[col].c_str(), text, - "\033[0m"); - } - } else { - const char* text = whisper_full_get_segment_text(ctx, i); - - printf("%s%s", speaker.c_str(), text); - } - - if (params.tinydiarize) { - if (whisper_full_get_segment_speaker_turn_next(ctx, i)) { - printf("%s", params.tdrz_speaker_turn.c_str()); - } - } - - // with timestamps or speakers: each segment on new line - if (!params.no_timestamps || params.diarize) { - printf("\n"); - } - fflush(stdout); - } -} - -whisper_server_context::~whisper_server_context() { - if (ctx) { - whisper_print_timings(ctx); - whisper_free(ctx); - ctx = nullptr; - } -} - -bool whisper_server_context::load_model(std::string& model_path) { - whisper_mutex.lock(); - - // clean up - whisper_free(ctx); - - // whisper init - ctx = whisper_init_from_file_with_params(model_path.c_str(), cparams); - - // TODO perhaps load prior model here instead of exit - if (ctx == nullptr) { - whisper_mutex.unlock(); - return false; - } - - // initialize openvino encoder. this has no effect on whisper.cpp builds that - // don't have OpenVINO configured - whisper_ctx_init_openvino_encoder( - ctx, nullptr, params.openvino_encode_device.c_str(), nullptr); - - // check if the model is in the file system - whisper_mutex.unlock(); - return true; -} - -std::string whisper_server_context::inference( - std::string& input_file_path, std::string language, std::string prompt, - std::string response_format, float temperature, bool translate) { - // acquire whisper model mutex lock - whisper_mutex.lock(); - - // audio arrays - std::vector pcmf32; // mono-channel F32 PCM - std::vector> pcmf32s; // stereo-channel F32 PCM - - // if file is not wav, convert to wav - if (params.ffmpeg_converter) { - std::string error_resp = "Failed to execute ffmpeg command converting " + - input_file_path + " to wav"; - const bool is_converted = convert_to_wav(input_file_path, error_resp); - if (!is_converted) { - whisper_mutex.unlock(); - LOG_ERROR << error_resp; - throw std::runtime_error(error_resp); - } - } - - // read wav content into pcmf32 - if (!read_wav(input_file_path, pcmf32, pcmf32s, params.diarize)) { - std::string error_resp = "Failed to read WAV file " + input_file_path; - LOG_ERROR << error_resp; - whisper_mutex.unlock(); - throw std::runtime_error(error_resp); - } - - printf("Successfully loaded %s\n", input_file_path.c_str()); - - params.translate = translate; - params.language = language; - params.response_format = response_format; - if (!whisper_is_multilingual(ctx)) { - if (params.language != "en" || params.translate) { - params.language = "en"; - params.translate = false; - LOG_WARN - << "Model " << model_id - << " is not multilingual, ignoring language and translation options"; - } - } - if (params.detect_language) { - params.language = "auto"; - } - - // print some processing info - std::string processing_info = - "Model " + model_id + " processing " + input_file_path + " (" + - std::to_string(pcmf32.size()) + " samples, " + - std::to_string(float(pcmf32.size()) / WHISPER_SAMPLE_RATE) + " sec), " + - std::to_string(params.n_threads) + " threads, " + - std::to_string(params.n_processors) + - " processors, lang = " + params.language + - ", task = " + (params.translate ? "translate" : "transcribe") + ", " + - (params.tinydiarize ? "tdrz = 1, " : "") + - (params.no_timestamps ? 
"timestamps = 0" : "timestamps = 1"); - LOG_INFO << processing_info; - - // run the inference - { - std::string msg = "Running whisper.cpp inference of model " + model_id + - " on " + input_file_path; - LOG_INFO << msg; - whisper_full_params wparams = - whisper_full_default_params(WHISPER_SAMPLING_GREEDY); - - wparams.strategy = params.beam_size > 1 ? WHISPER_SAMPLING_BEAM_SEARCH - : WHISPER_SAMPLING_GREEDY; - - wparams.print_realtime = false; - wparams.print_progress = params.print_progress; - wparams.print_timestamps = !params.no_timestamps; - wparams.print_special = params.print_special; - wparams.translate = params.translate; - wparams.language = params.language.c_str(); - wparams.detect_language = params.detect_language; - wparams.n_threads = params.n_threads; - wparams.n_max_text_ctx = - params.max_context >= 0 ? params.max_context : wparams.n_max_text_ctx; - wparams.offset_ms = params.offset_t_ms; - wparams.duration_ms = params.duration_ms; - - wparams.thold_pt = params.word_thold; - wparams.max_len = params.max_len == 0 ? 60 : params.max_len; - wparams.split_on_word = params.split_on_word; - - wparams.speed_up = params.speed_up; - wparams.debug_mode = params.debug_mode; - - wparams.tdrz_enable = params.tinydiarize; // [TDRZ] - - wparams.initial_prompt = prompt.c_str(); - - wparams.greedy.best_of = params.best_of; - wparams.beam_search.beam_size = params.beam_size; - - wparams.temperature = temperature; - wparams.temperature_inc = params.temperature_inc; - wparams.entropy_thold = params.entropy_thold; - wparams.logprob_thold = params.logprob_thold; - - wparams.no_timestamps = params.no_timestamps; - - whisper_print_user_data user_data = {¶ms, &pcmf32s, 0}; - - // this callback is called on each new segment - if (params.print_realtime) { - wparams.new_segment_callback = whisper_print_segment_callback; - wparams.new_segment_callback_user_data = &user_data; - } - - if (wparams.print_progress) { - wparams.progress_callback = whisper_print_progress_callback; - wparams.progress_callback_user_data = &user_data; - } - - // examples for abort mechanism - // in examples below, we do not abort the processing, but we could if the - // flag is set to true - - // the callback is called before every encoder run - if it returns false, - // the processing is aborted - { - static bool is_aborted = - false; // NOTE: this should be atomic to avoid data race - - wparams.encoder_begin_callback = [](struct whisper_context* /*ctx*/, - struct whisper_state* /*state*/, - void* user_data) { - bool is_aborted = *(bool*)user_data; - return !is_aborted; - }; - wparams.encoder_begin_callback_user_data = &is_aborted; - } - - // the callback is called before every computation - if it returns true, the - // computation is aborted - { - static bool is_aborted = - false; // NOTE: this should be atomic to avoid data race - - wparams.abort_callback = [](void* user_data) { - bool is_aborted = *(bool*)user_data; - return is_aborted; - }; - wparams.abort_callback_user_data = &is_aborted; - } - - if (whisper_full_parallel(ctx, wparams, pcmf32.data(), pcmf32.size(), - params.n_processors) != 0) { - std::string error_resp = "Failed to process audio"; - LOG_ERROR << error_resp; - whisper_mutex.unlock(); - throw std::runtime_error(error_resp); - } - } - - // return results to user - std::string result; - if (params.response_format == text_format) { - result = output_str(ctx, params, pcmf32s); - } else if (params.response_format == srt_format) { - std::stringstream ss; - const int n_segments = whisper_full_n_segments(ctx); - for 
(int i = 0; i < n_segments; ++i) { - const char* text = whisper_full_get_segment_text(ctx, i); - const int64_t t0 = whisper_full_get_segment_t0(ctx, i); - const int64_t t1 = whisper_full_get_segment_t1(ctx, i); - std::string speaker = ""; - - if (params.diarize && pcmf32s.size() == 2) { - speaker = estimate_diarization_speaker(pcmf32s, t0, t1); - } - - ss << i + 1 + params.offset_n << "\n"; - ss << to_timestamp(t0, true) << " --> " << to_timestamp(t1, true) << "\n"; - ss << speaker << text << "\n\n"; - } - result = ss.str(); - } else if (params.response_format == vtt_format) { - std::stringstream ss; - - ss << "WEBVTT\n\n"; - - const int n_segments = whisper_full_n_segments(ctx); - for (int i = 0; i < n_segments; ++i) { - const char* text = whisper_full_get_segment_text(ctx, i); - const int64_t t0 = whisper_full_get_segment_t0(ctx, i); - const int64_t t1 = whisper_full_get_segment_t1(ctx, i); - std::string speaker = ""; - - if (params.diarize && pcmf32s.size() == 2) { - speaker = estimate_diarization_speaker(pcmf32s, t0, t1, true); - speaker.insert(0, ""); - } - - ss << to_timestamp(t0) << " --> " << to_timestamp(t1) << "\n"; - ss << speaker << text << "\n\n"; - } - result = ss.str(); - } else if (params.response_format == vjson_format) { - /* try to match openai/whisper's Python format */ - std::string results = output_str(ctx, params, pcmf32s); - json jres = json{{"text", results}}; - const int n_segments = whisper_full_n_segments(ctx); - for (int i = 0; i < n_segments; ++i) { - json segment = json{ - {"id", i}, - {"text", whisper_full_get_segment_text(ctx, i)}, - }; - - if (!params.no_timestamps) { - segment["start"] = whisper_full_get_segment_t0(ctx, i) * 0.01; - segment["end"] = whisper_full_get_segment_t1(ctx, i) * 0.01; - } - - const int n_tokens = whisper_full_n_tokens(ctx, i); - for (int j = 0; j < n_tokens; ++j) { - whisper_token_data token = whisper_full_get_token_data(ctx, i, j); - if (token.id >= whisper_token_eot(ctx)) { - continue; - } - - segment["tokens"].push_back(token.id); - json word = json{{"word", whisper_full_get_token_text(ctx, i, j)}}; - if (!params.no_timestamps) { - word["start"] = token.t0 * 0.01; - word["end"] = token.t1 * 0.01; - } - word["probability"] = token.p; - segment["words"].push_back(word); - } - jres["segments"].push_back(segment); - } - result = jres.dump(-1, ' ', false, json::error_handler_t::replace); - } else { - std::string results = output_str(ctx, params, pcmf32s); - json jres = json{{"text", results}}; - result = jres.dump(-1, ' ', false, json::error_handler_t::replace); - } - - // reset params to thier defaults - params = default_params; - - // return whisper model mutex lock - whisper_mutex.unlock(); - LOG_INFO << "Successfully processed " << input_file_path << ": " << result; - - return result; -} diff --git a/context/whisper_server_context.h b/context/whisper_server_context.h deleted file mode 100644 index da29e4d9f..000000000 --- a/context/whisper_server_context.h +++ /dev/null @@ -1,165 +0,0 @@ -#pragma once -#include -#include -#include -#include -#include - -#include "whisper.h" - -// Terminal color map. 10 colors grouped in ranges [0.0, 0.1, ..., 0.9] -// Lowest is red, middle is yellow, highest is green. 
-const std::vector k_colors = { - "\033[38;5;196m", "\033[38;5;202m", "\033[38;5;208m", "\033[38;5;214m", - "\033[38;5;220m", "\033[38;5;226m", "\033[38;5;190m", "\033[38;5;154m", - "\033[38;5;118m", "\033[38;5;82m", -}; - -// output formats -const std::string json_format = "json"; -const std::string text_format = "text"; -const std::string srt_format = "srt"; -const std::string vjson_format = "verbose_json"; -const std::string vtt_format = "vtt"; - -#define COMMON_SAMPLE_RATE 16000 - -struct whisper_params { - int32_t n_threads = - (std::min)(4, (int32_t)std::thread::hardware_concurrency()); - int32_t n_processors = 1; - int32_t offset_t_ms = 0; - int32_t offset_n = 0; - int32_t duration_ms = 0; - int32_t progress_step = 5; - int32_t max_context = -1; - int32_t max_len = 0; - int32_t best_of = 2; - int32_t beam_size = -1; - - float word_thold = 0.01f; - float entropy_thold = 2.40f; - float logprob_thold = -1.00f; - float temperature = 0.00f; - float temperature_inc = 0.20f; - - bool speed_up = false; - bool debug_mode = false; - bool translate = false; - bool detect_language = false; - bool diarize = false; - bool tinydiarize = false; - bool split_on_word = false; - bool no_fallback = false; - bool print_special = false; - bool print_colors = false; - bool print_realtime = false; - bool print_progress = false; - bool no_timestamps = false; - bool use_gpu = true; - bool ffmpeg_converter = false; - - std::string language = "en"; - std::string prompt = ""; - std::string font_path = - "/System/Library/Fonts/Supplemental/Courier New Bold.ttf"; - std::string model = "models/ggml-base.en.bin"; - - std::string response_format = json_format; - - // [TDRZ] speaker turn string - std::string tdrz_speaker_turn = - " [SPEAKER_TURN]"; // TODO: set from command line - - std::string openvino_encode_device = "CPU"; -}; - -// Read WAV audio file and store the PCM data into pcmf32 -// The sample rate of the audio must be equal to COMMON_SAMPLE_RATE -// If stereo flag is set and the audio has 2 channels, the pcmf32s will contain -// 2 channel PCM -bool read_wav(const std::string& fname, std::vector& pcmf32, - std::vector>& pcmf32s, bool stereo); - -std::string output_str(struct whisper_context* ctx, - const whisper_params& params, - std::vector> pcmf32s); - -std::string estimate_diarization_speaker( - std::vector> pcmf32s, int64_t t0, int64_t t1, - bool id_only = false); - -// 500 -> 00:05.000 -// 6000 -> 01:00.000 -std::string to_timestamp(int64_t t, bool comma = false); - -int timestamp_to_sample(int64_t t, int n_samples); - -bool is_file_exist(const char* fileName); - -void whisper_print_usage(int /*argc*/, char** argv, - const whisper_params& params); - -bool whisper_params_parse(int argc, char** argv, whisper_params& params); - -void check_ffmpeg_availibility(); - -bool convert_to_wav(const std::string& temp_filename, std::string& error_resp); - -void whisper_print_progress_callback(struct whisper_context* /*ctx*/, - struct whisper_state* /*state*/, - int progress, void* user_data); - -void whisper_print_segment_callback(struct whisper_context* ctx, - struct whisper_state* /*state*/, int n_new, - void* user_data); - -struct whisper_print_user_data { - const whisper_params* params; - - const std::vector>* pcmf32s; - int progress_prev; -}; - -struct whisper_server_context { - whisper_params params; - whisper_params default_params; - std::mutex whisper_mutex; - std::string model_id; - - struct whisper_context_params cparams; - struct whisper_context* ctx = nullptr; - - whisper_server_context() = default; // 
add this line - - // Constructor - whisper_server_context(const std::string& model_id) { - this->model_id = model_id; - this->cparams = whisper_context_params(); - this->ctx = nullptr; - // store default params so we can reset after each inference request - this->default_params = whisper_params(); - this->params = whisper_params(); - } - - // Move constructor - whisper_server_context(whisper_server_context&& other) noexcept - : params(std::move(other.params)), - default_params(std::move(other.default_params)), - whisper_mutex() // std::mutex is not movable, so we initialize a new one - , - model_id(std::move(other.model_id)), - cparams(std::move(other.cparams)), - ctx(std::exchange( - other.ctx, - nullptr)) // ctx is a raw pointer, so we use std::exchange - {} - - bool load_model(std::string& model_path); - - std::string inference(std::string& input_file_path, std::string languague, - std::string prompt, std::string response_format, - float temperature, bool translate); - - ~whisper_server_context(); -}; \ No newline at end of file diff --git a/controllers/audio.cc b/controllers/audio.cc deleted file mode 100644 index 91fd76d5b..000000000 --- a/controllers/audio.cc +++ /dev/null @@ -1,300 +0,0 @@ -#include "audio.h" - -#include "utils/nitro_utils.h" -#include "whisper.h" - -using namespace v1; - -audio::audio() { - whisper_print_system_info(); -}; - -audio::~audio() {} - -std::optional audio::ParseModelId( - const std::shared_ptr& jsonBody, - const std::function& callback) { - if (!jsonBody->isMember("model_id")) { - LOG_INFO << "No model_id found in request body"; - Json::Value jsonResp; - jsonResp["message"] = "No model_id found in request body"; - auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp); - resp->setStatusCode(k400BadRequest); - callback(resp); - return std::nullopt; // Signal that an error occurred - } - - return (*jsonBody)["model_id"].asString(); -} - -void audio::LoadModel(const HttpRequestPtr& req, - std::function&& callback) { - const auto jsonBody = req->getJsonObject(); - auto optional_model_id = ParseModelId(jsonBody, callback); - if (!optional_model_id) { - return; - } - std::string model_id = *optional_model_id; - - // Check if model is already loaded - if (whispers.find(model_id) != whispers.end()) { - std::string error_msg = "Model " + model_id + " already loaded"; - LOG_INFO << error_msg; - Json::Value jsonResp; - jsonResp["message"] = error_msg; - auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp); - resp->setStatusCode(k409Conflict); - callback(resp); - return; - } - - // Model not loaded, load it - // Parse model path from request - std::string model_path = (*jsonBody)["model_path"].asString(); - if (!is_file_exist(model_path.c_str())) { - std::string error_msg = "Model " + model_path + " not found"; - LOG_INFO << error_msg; - Json::Value jsonResp; - jsonResp["message"] = error_msg; - auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp); - resp->setStatusCode(k404NotFound); - callback(resp); - return; - } - - whisper_server_context whisper = whisper_server_context(model_id); - bool model_loaded = whisper.load_model(model_path); - // If model failed to load, return a 500 error - if (!model_loaded) { - whisper.~whisper_server_context(); - std::string error_msg = "Failed to load model " + model_path; - LOG_INFO << error_msg; - Json::Value jsonResp; - jsonResp["message"] = error_msg; - auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp); - resp->setStatusCode(k500InternalServerError); - callback(resp); - return; - } - - // Warm up the model - 
// Parse warm up audio path from request - if (jsonBody->isMember("warm_up_audio_path")) { - std::string warm_up_msg = "Warming up model " + model_id; - LOG_INFO << warm_up_msg; - std::string warm_up_audio_path = - (*jsonBody)["warm_up_audio_path"].asString(); - // Return 400 error if warm up audio path is not found - if (!is_file_exist(warm_up_audio_path.c_str())) { - std::string error_msg = - "Warm up audio " + warm_up_audio_path + - " not found, please provide a valid path or don't specify it at all"; - LOG_INFO << error_msg; - Json::Value jsonResp; - jsonResp["message"] = error_msg; - auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp); - resp->setStatusCode(k400BadRequest); - callback(resp); - return; - } else { - LOG_INFO << "Warming up model " << model_id << " with audio " - << warm_up_audio_path << " ..."; - std::string warm_up_result = whisper.inference(warm_up_audio_path, "en", - "", text_format, 0, false); - LOG_INFO << "Warm up model " << model_id << " completed"; - } - } else { - LOG_INFO << "No warm up audio provided, skipping warm up"; - } - - // Model loaded successfully, add it to the map of loaded models - // and return a 200 response - whispers.emplace(model_id, std::move(whisper)); - Json::Value jsonResp; - std::string success_msg = "Model " + model_id + " loaded successfully"; - jsonResp["message"] = success_msg; - auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp); - resp->setStatusCode(k200OK); - callback(resp); - return; -} - -void audio::UnloadModel( - const HttpRequestPtr& req, - std::function&& callback) { - const auto& jsonBody = req->getJsonObject(); - auto optional_model_id = ParseModelId(jsonBody, callback); - if (!optional_model_id) { - return; - } - std::string model_id = *optional_model_id; - - // If model is not loaded, return a 404 error - if (whispers.find(model_id) == whispers.end()) { - std::string error_msg = - "Model " + model_id + - " has not been loaded, please load that model into nitro"; - LOG_INFO << error_msg; - Json::Value jsonResp; - jsonResp["message"] = error_msg; - auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp); - resp->setStatusCode(k404NotFound); - callback(resp); - return; - } - - // Model loaded, unload it - whispers[model_id].~whisper_server_context(); - whispers.erase(model_id); - - // Return a 200 response - Json::Value jsonResp; - std::string success_msg = "Model " + model_id + " unloaded successfully"; - LOG_INFO << success_msg; - jsonResp["message"] = success_msg; - auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp); - resp->setStatusCode(k200OK); - callback(resp); - return; -} - -void audio::ListModels(const HttpRequestPtr& req, - std::function&& callback) { - // Return a list of all loaded models - Json::Value jsonResp; - Json::Value models; - for (auto const& model : whispers) { - models.append(model.first); - } - jsonResp["models"] = models; - auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp); - resp->setStatusCode(k200OK); - callback(resp); - return; -} - -void audio::TranscriptionImpl( - const HttpRequestPtr& req, - std::function&& callback, bool translate) { - MultiPartParser partParser; - Json::Value jsonResp; - if (partParser.parse(req) != 0 || partParser.getFiles().size() != 1) { - auto resp = HttpResponse::newHttpResponse(); - resp->setBody("Must have exactly one file"); - resp->setStatusCode(k403Forbidden); - callback(resp); - return; - } - auto& file = partParser.getFiles()[0]; - const auto& formFields = partParser.getParameters(); - - // Check if model_id are present in the 
request. If not, return a 400 error - if (formFields.find("model_id") == formFields.end()) { - LOG_INFO << "No model_id found in request body"; - Json::Value jsonResp; - jsonResp["message"] = "No model_id found in request body"; - auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp); - resp->setStatusCode(k400BadRequest); - callback(resp); - return; - } - - std::string model_id = formFields.at("model_id"); - - // Parse all other optional parameters from the request - std::string language = formFields.find("language") != formFields.end() - ? formFields.at("language") - : "en"; - std::string prompt = formFields.find("prompt") != formFields.end() - ? formFields.at("prompt") - : ""; - std::string response_format = - formFields.find("response_format") != formFields.end() - ? formFields.at("response_format") - : json_format; - float temperature = formFields.find("temperature") != formFields.end() - ? std::stof(formFields.at("temperature")) - : 0; - - // Check if model is loaded. If not, return a 404 error - if (whispers.find(model_id) == whispers.end()) { - std::string error_msg = - "Model " + model_id + - " has not been loaded, please load that model into nitro"; - LOG_INFO << error_msg; - Json::Value jsonResp; - jsonResp["message"] = error_msg; - auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp); - resp->setStatusCode(k404NotFound); - callback(resp); - return; - } - - // Save input file to temp location - std::string temp_dir = - std::filesystem::temp_directory_path().string() + "/" + - std::to_string(std::chrono::duration_cast( - std::chrono::system_clock::now().time_since_epoch()) - .count()); - // Create the directory - std::filesystem::create_directory(temp_dir); - // Save the file to the directory, with its original name - std::string temp_file_path = temp_dir + "/" + file.getFileName(); - file.saveAs(temp_file_path); - - // Run inference - std::string result; - try { - result = - whispers[model_id].inference(temp_file_path, language, prompt, - response_format, temperature, translate); - } catch (const std::exception& e) { - std::remove(temp_file_path.c_str()); - Json::Value jsonResp; - jsonResp["message"] = e.what(); - auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp); - resp->setStatusCode(k500InternalServerError); - callback(resp); - return; - } - // TODO: Need to remove the entire temp directory, not just the file - std::remove(temp_file_path.c_str()); - - auto resp = nitro_utils::nitroHttpResponse(); - resp->setBody(result); - resp->setStatusCode(k200OK); - // Set content type based on response format - if (response_format == json_format || response_format == vjson_format) { - resp->addHeader("Content-Type", "application/json"); - } else if (response_format == text_format) { - resp->addHeader("Content-Type", "text/html"); - } else if (response_format == srt_format) { - resp->addHeader("Content-Type", "application/x-subrip"); - } else if (response_format == vtt_format) { - resp->addHeader("Content-Type", "text/vtt"); - } - callback(resp); - return; -} - -void audio::ModelStatus( - const HttpRequestPtr& req, - std::function&& callback) { - auto resp = nitro_utils::nitroHttpResponse(); - resp->setStatusCode(k200OK); - resp->setContentTypeCode(drogon::CT_APPLICATION_JSON); - resp->setBody("Unimplemented"); - callback(resp); -} - -void audio::CreateTranscription( - const HttpRequestPtr& req, - std::function&& callback) { - return TranscriptionImpl(req, std::move(callback), false); -} - -void audio::CreateTranslation( - const HttpRequestPtr& req, - std::function&& 
callback) { - return TranscriptionImpl(req, std::move(callback), true); -} \ No newline at end of file diff --git a/controllers/audio.h b/controllers/audio.h deleted file mode 100644 index 19b1efb6d..000000000 --- a/controllers/audio.h +++ /dev/null @@ -1,74 +0,0 @@ -#pragma once - -#include -#include -#include -#include "common/base.h" - -#define DR_WAV_IMPLEMENTATION -#include "utils/dr_wav.h" - -#include "utils/json.hpp" - -// Whisper Context -#include "context/whisper_server_context.h" - -using json = nlohmann::ordered_json; - -using namespace drogon; - -namespace v1 { - -class audio : public drogon::HttpController<audio>