From e80bb44e2b3cc7b80a7d000ff97b422914171779 Mon Sep 17 00:00:00 2001
From: Hien To <tominhhien97@gmail.com>
Date: Wed, 8 May 2024 14:39:29 +0700
Subject: [PATCH] Refactor CICD

---
 .github/workflows/build.yml        | 770 +++++------------------------
 .github/workflows/docs.yml         |  95 ----
 .github/workflows/quality-gate.yml | 163 ++++++
 cortex-cpp/.gitignore              |   1 -
 cortex-cpp/Makefile                |  67 +++
 5 files changed, 356 insertions(+), 740 deletions(-)
 delete mode 100644 .github/workflows/docs.yml
 create mode 100644 .github/workflows/quality-gate.yml
 create mode 100644 cortex-cpp/Makefile

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index dc0719f95..aad8abbfb 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -1,52 +1,15 @@
 name: CI
 
 on:
-  schedule:
-    - cron: "0 20 * * *" # At 8 PM UTC, which is 3 AM UTC+7
   push:
     tags: ["v[0-9]+.[0-9]+.[0-9]+"]
     paths:
       [
-        ".github/scripts/**",
-        ".github/workflows/build.yml",
-        "**/CMakeLists.txt",
-        "**/Makefile",
-        "**/*.h",
-        "**/*.hpp",
-        "**/*.c",
-        "**/*.cpp",
-        "**/*.cu",
-        "**/*.cc",
-        "**/*.cxx",
-        "llama.cpp",
-        "!docs/**",
-        "!.gitignore",
-        "!README.md",
-      ]
-  pull_request:
-    types: [opened, synchronize, reopened]
-    paths:
-      [
-        ".github/scripts/**",
-        ".github/workflows/build.yml",
-        "**/CMakeLists.txt",
-        "**/Makefile",
-        "**/*.h",
-        "**/*.hpp",
-        "**/*.c",
-        "**/*.cpp",
-        "**/*.cu",
-        "**/*.cc",
-        "**/*.cxx",
-        "llama.cpp",
-        "!docs/**",
-        "!.gitignore",
-        "!README.md",
+        "cortex-cpp/**",
       ]
   workflow_dispatch:
 
 env:
-  BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
   LLM_MODEL_URL: https://delta.jan.ai/tinyllama-1.1b-chat-v0.3.Q2_K.gguf
   EMBEDDING_MODEL_URL: https://catalog.jan.ai/dist/models/embeds/nomic-embed-text-v1.5.f16.gguf
 
@@ -76,253 +39,121 @@ jobs:
           draft: true
           prerelease: false
 
-  # Get the latest version of the release
-  set-nitro-version:
-    runs-on: ubuntu-latest
-    outputs:
-      version: ${{ steps.version_update.outputs.new_version }}
-    steps:
-      - name: Get latest release
-        id: version_update
-        run: |
-          ldd --version
-          if [[ ${{ github.event_name }} == push && ${{ github.ref }} == refs/tags/* ]]; then
-            echo "VERSION=${GITHUB_REF#refs/tags/}"
-            NEW_VERSION="${VERSION#v}"
-            echo "::set-output name=new_version::$NEW_VERSION"
-          else
-            # Function to get the latest release tag
-            get_latest_tag() {
-              local retries=0
-              local max_retries=3
-              local tag
-              while [ $retries -lt $max_retries ]; do
-                tag=$(curl -s https://api.github.com/repos/janhq/cortex/releases/latest | jq -r .tag_name)
-                if [ -n "$tag" ] && [ "$tag" != "null" ]; then
-                  echo $tag
-                  return
-                else
-                  let retries++
-                  sleep 2
-                fi
-              done
-              echo "Failed to fetch latest tag after $max_retries attempts."
-              exit 1
-            }
-            # Get the latest release tag from GitHub API
-            LATEST_TAG=$(get_latest_tag)
-            
-            # Remove the 'v' and append the build number to the version
-            NEW_VERSION="${LATEST_TAG#v}-${GITHUB_RUN_NUMBER}"
-            echo "New version: $NEW_VERSION"
-            echo "::set-output name=new_version::$NEW_VERSION"
-          fi
-          echo "Version: $NEW_VERSION"
-
-  ubuntu-amd64-build:
-    runs-on: ubuntu-18-04-cuda-11-7
-    needs: [create-draft-release, set-nitro-version]
-    if: always() && (needs.create-draft-release.result == 'success' || needs.create-draft-release.result == 'skipped') && needs.set-nitro-version.result == 'success'
-    timeout-minutes: 40
-    permissions:
-      contents: write
-
-    strategy:
-      matrix:
-        include:
-          - build: "amd64-avx2"
-            defines: "-DLLAMA_AVX2=ON -DLLAMA_NATIVE=OFF"
-          - build: "amd64-avx"
-            defines: "-DLLAMA_AVX2=OFF -DLLAMA_NATIVE=OFF"
-          - build: "amd64-avx512"
-            defines: "-DLLAMA_AVX512=ON -DLLAMA_NATIVE=OFF"
-          - build: "amd64-vulkan"
-            defines: "-DLLAMA_VULKAN=ON -DLLAMA_NATIVE=OFF"
-          # - build: "arm64"
-          #   defines: "-A ARM64 -DLLAMA_NATIVE=OFF"
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v3
-        with:
-          submodules: recursive
-
-      - name: Prepare Vulkan SDK
-        if: ${{ matrix.build == 'amd64-vulkan' }}
-        uses: humbletim/setup-vulkan-sdk@v1.2.0
-        with:
-          vulkan-query-version: 1.3.275.0
-          vulkan-components: Vulkan-Headers, Vulkan-Loader
-          vulkan-use-cache: true
-
-      - name: Build
-        id: make_build
-        run: |
-          ldd --version
-          cd cortex-cpp
-          ./install_deps.sh
-          mkdir build && cd build
-          cmake ${{ matrix.defines }} -DNITRO_VERSION=${{ needs.set-nitro-version.outputs.version }} ..
-          make -j $(nproc)
-          ls -la
-
-      - name: Package
-        shell: bash
-        run: |
-          mkdir -p cortex-cpp/nitro
-          mkdir -p cortex-cpp/nitro/engines/cortex.llamacpp
-          cp cortex-cpp/build/nitro cortex-cpp/nitro/
-          cp cortex-cpp/build/engines/cortex.llamacpp/libengine.so cortex-cpp/nitro/engines/cortex.llamacpp/
-          tar -czvf cortex-cpp/nitro.tar.gz cortex-cpp/nitro
-
-      - name: Upload Artifact
-        uses: actions/upload-artifact@v2
-        if: github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' || github.event_name == 'pull_request'
-        with:
-          name: nitro-linux-${{ matrix.build }}
-          path: ./cortex-cpp/nitro
-
-      - name: Run e2e testing - LLama.CPP
-        shell: bash
-        if: ${{ matrix.build != 'arm64' && matrix.build != 'amd64-vulkan' && matrix.build != 'amd64-avx512' && matrix.build != 'amd64-avx' }}
-        run: |
-          # run e2e testing
-          cd cortex-cpp/nitro
-          chmod +x ../../.github/scripts/e2e-test-llama-linux-and-mac.sh && ../../.github/scripts/e2e-test-llama-linux-and-mac.sh ./nitro ${{ env.LLM_MODEL_URL }} ${{ env.EMBEDDING_MODEL_URL }}
-          rm -rf uploads/
-
-      - uses: actions/upload-release-asset@v1.0.1
-        if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/')
-        env:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-        with:
-          upload_url: ${{ needs.create-draft-release.outputs.upload_url }}
-          asset_path: ./cortex-cpp/nitro.tar.gz
-          asset_name: nitro-${{ needs.create-draft-release.outputs.version }}-linux-${{ matrix.build }}.tar.gz
-          asset_content_type: application/gzip
-
-  ubuntu-amd64-cuda-build:
-    runs-on: ubuntu-18-04-cuda-${{ matrix.cuda }}
-    needs: [create-draft-release, set-nitro-version]
-    if: always() && (needs.create-draft-release.result == 'success' || needs.create-draft-release.result == 'skipped') && needs.set-nitro-version.result == 'success'
+  build-and-test:
+    runs-on: ${{ matrix.runs-on }}  # runner label is supplied by each matrix entry below
+    needs: [create-draft-release]  # NOTE(review): the jobs this replaces also had `if: always() && (needs.create-draft-release.result == 'success' || needs.create-draft-release.result == 'skipped')`; without it this job is skipped whenever create-draft-release is skipped - confirm that is intended for workflow_dispatch runs
     timeout-minutes: 40
-    permissions:
-      contents: write
     strategy:
+      fail-fast: false
       matrix:
         include:
-          - cuda: "12-0"
-            flags: "-DCUDA_12_0=ON"
-          - cuda: "11-7"
-            flags: "-DCUDA_11_7=ON"
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v3
-        with:
-          submodules: recursive
-
-      - name: Build
-        id: make_build
-        run: |
-          cd cortex-cpp
-          ./install_deps.sh
-          mkdir build && cd build
-          cmake  ${{matrix.flags}} -DLLAMA_NATIVE=OFF -DLLAMA_CUDA=ON -DNITRO_VERSION=${{ needs.set-nitro-version.outputs.version }} ..
-          make -j $(nproc)
-          ls -la
+          - os: "linux"
+            name: "amd64-avx2"
+            runs-on: "ubuntu-18-04"
+            cmake-flags: "-DLLAMA_AVX2=ON -DLLAMA_NATIVE=OFF"
+            run-e2e: true
+
+          - os: "linux"
+            name: "amd64-avx"
+            runs-on: "ubuntu-18-04"
+            cmake-flags: "-DLLAMA_AVX2=OFF -DLLAMA_NATIVE=OFF"
+            run-e2e: false
+
+          - os: "linux"
+            name: "amd64-avx512"
+            runs-on: "ubuntu-18-04"
+            cmake-flags: "-DLLAMA_AVX512=ON -DLLAMA_NATIVE=OFF"
+            run-e2e: false
+
+          - os: "linux"
+            name: "amd64-vulkan"
+            runs-on: "ubuntu-18-04-cuda-11-7"
+            cmake-flags: "-DLLAMA_VULKAN=ON -DLLAMA_NATIVE=OFF"
+            run-e2e: false
+
+          - os: "linux"
+            name: "amd64-cuda-11-7"
+            runs-on: "ubuntu-18-04-cuda-11-7"
+            cmake-flags: "-DCUDA_11_7=ON -DLLAMA_NATIVE=OFF -DLLAMA_CUDA=ON"
+            run-e2e: false
+
+          - os: "linux"
+            name: "amd64-cuda-12-0"
+            runs-on: "ubuntu-18-04-cuda-12-0"
+            cmake-flags: "-DCUDA_12_0=ON -DLLAMA_NATIVE=OFF -DLLAMA_CUDA=ON"
+            run-e2e: false
+
+          - os: "mac"
+            name: "amd64"
+            runs-on: "macos-13"
+            cmake-flags: ""
+            run-e2e: true
+
+          - os: "mac"
+            name: "arm64"
+            runs-on: "mac-silicon"
+            cmake-flags: "-DMAC_ARM64=ON"
+            run-e2e: true
+
+          - os: "windows"
+            name: "amd64-avx2"
+            runs-on: "windows-latest"
+            cmake-flags: "-DLLAMA_AVX2=ON -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE"
+            run-e2e: true
+
+          - os: "windows"
+            name: "amd64-avx"
+            runs-on: "windows-latest"
+            cmake-flags: "-DLLAMA_AVX2=OFF -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE"
+            run-e2e: false
+
+          - os: "windows"
+            name: "amd64-avx512"
+            runs-on: "windows-latest"
+            cmake-flags: "-DLLAMA_AVX512=ON -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE"
+            run-e2e: false
+
+          - os: "windows"
+            name: "amd64-vulkan"
+            runs-on: "windows-latest"
+            cmake-flags: "-DLLAMA_VULKAN=ON -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE"
+            run-e2e: false
+
+          - os: "windows"
+            name: "amd64-avx2-cuda-12-0"
+            runs-on: "windows-cuda-12-0"
+            cmake-flags: "-DLLAMA_AVX2=ON -DLLAMA_NATIVE=OFF -DCUDA_12_0=ON -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE"
+            run-e2e: false
+
+          - os: "windows"
+            name: "amd64-avx-cuda-12-0"
+            runs-on: "windows-cuda-12-0"
+            cmake-flags: "-DLLAMA_AVX2=OFF -DLLAMA_NATIVE=OFF -DCUDA_12_0=ON -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE"
+            run-e2e: false
+
+          - os: "windows"
+            name: "amd64-avx512-cuda-12-0"
+            runs-on: "windows-cuda-12-0"
+            cmake-flags: "-DLLAMA_AVX512=ON -DLLAMA_NATIVE=OFF -DCUDA_12_0=ON -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE"
+            run-e2e: false
+
+          - os: "windows"
+            name: "amd64-avx2-cuda-11-7"
+            runs-on: "windows-cuda-11-7"
+            cmake-flags: "-DLLAMA_AVX2=ON -DLLAMA_NATIVE=OFF -DCUDA_11_7=ON -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE"
+            run-e2e: false
+
+          - os: "windows"
+            name: "amd64-avx-cuda-11-7"
+            runs-on: "windows-cuda-11-7"
+            cmake-flags: "-DLLAMA_AVX2=OFF -DLLAMA_NATIVE=OFF -DCUDA_11_7=ON -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE"
+            run-e2e: false
+          - os: "windows"
+            name: "amd64-avx512-cuda-11-7"
+            runs-on: "windows-cuda-11-7"
+            cmake-flags: "-DLLAMA_AVX512=ON -DLLAMA_NATIVE=OFF -DCUDA_11_7=ON -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE"
+            run-e2e: false
 
-      - name: Package
-        shell: bash
-        run: |
-          cd cortex-cpp
-          mkdir -p nitro
-          mkdir -p nitro/engines/cortex.llamacpp
-          cp build/nitro nitro/
-          cp build/engines/cortex.llamacpp/libengine.so nitro/engines/cortex.llamacpp/
-          tar -czvf nitro.tar.gz nitro
-
-      - name: Upload Artifact
-        uses: actions/upload-artifact@v2
-        if: github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' || github.event_name == 'pull_request'
-        with:
-          name: nitro-linux-amd64-cuda-${{ matrix.cuda }}
-          path: ./cortex-cpp/nitro
-      
-      - name: Run e2e testing - LLama.CPP
-        shell: bash
-        if: ${{ matrix.cuda != '12-0'}}
-        run: |
-          # run e2e testing
-          cd cortex-cpp/nitro
-          chmod +x ../../.github/scripts/e2e-test-llama-linux-and-mac.sh && ../../.github/scripts/e2e-test-llama-linux-and-mac.sh ./nitro ${{ env.LLM_MODEL_URL }} ${{ env.EMBEDDING_MODEL_URL }}
-          rm -rf uploads/
-
-      - uses: actions/upload-release-asset@v1.0.1
-        if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/')
-        env:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-        with:
-          upload_url: ${{ needs.create-draft-release.outputs.upload_url }}
-          asset_path: ./cortex-cpp/nitro.tar.gz
-          asset_name: nitro-${{ needs.create-draft-release.outputs.version }}-linux-amd64-cuda-${{ matrix.cuda }}.tar.gz
-          asset_content_type: application/gzip
-
-  macOS-silicon-build:
-    runs-on: mac-silicon
-    needs: [create-draft-release, set-nitro-version]
-    if: always() && (needs.create-draft-release.result == 'success' || needs.create-draft-release.result == 'skipped') && needs.set-nitro-version.result == 'success'
-    timeout-minutes: 40
-    permissions:
-      contents: write
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v3
-        with:
-          submodules: recursive
-
-      - name: Build
-        id: cmake_build
-        run: |
-          cd cortex-cpp
-          ./install_deps.sh
-          mkdir build && cd build
-          cmake -DMAC_ARM64=ON -DNITRO_VERSION=${{ needs.set-nitro-version.outputs.version }} .. 
-          CC=gcc-8 make -j $(sysctl -n hw.ncpu)
-          ls -la
-
-      - name: Package
-        shell: bash
-        run: |
-          cd cortex-cpp
-          mkdir -p nitro
-          mkdir -p nitro/engines/cortex.llamacpp
-          cp build/nitro nitro/
-          cp build/engines/cortex.llamacpp/libengine.dylib nitro/engines/cortex.llamacpp/
-
-      - name: Upload Artifact
-        uses: actions/upload-artifact@v2
-        with:
-          name: nitro-mac-arm64
-          path: ./cortex-cpp/nitro
-
-      - name: Run e2e testing - LLama.CPP
-        run: |
-          # run e2e testing
-          cd cortex-cpp/nitro/
-          chmod +x ../../.github/scripts/e2e-test-llama-linux-and-mac.sh && ../../.github/scripts/e2e-test-llama-linux-and-mac.sh ./nitro ${{ env.LLM_MODEL_URL }} ${{ env.EMBEDDING_MODEL_URL }}
-          rm -rf uploads/
-
-  macOS-amd64-build:
-    runs-on: macos-13
-    needs: [create-draft-release, set-nitro-version]
-    if: always() && (needs.create-draft-release.result == 'success' || needs.create-draft-release.result == 'skipped') && needs.set-nitro-version.result == 'success'
-    timeout-minutes: 40
-    permissions:
-      contents: write
     steps:
       - name: Clone
         id: checkout
@@ -330,284 +161,32 @@ jobs:
         with:
           submodules: recursive
 
-      - name: Dependencies
-        id: depends
-        continue-on-error: true
+      - name: Install make on Windows
+        if: runner.os == 'Windows'
         run: |
-          brew update
-          brew install sdl2
+          choco install make -y
 
       - name: Build
-        id: cmake_build
         run: |
           cd cortex-cpp
-          ./install_deps.sh
-          mkdir build && cd build
-          cmake -DNITRO_VERSION=${{ needs.set-nitro-version.outputs.version }} .. 
-          CC=gcc-8 make -j $(sysctl -n hw.ncp)
-          ls -la
+          make build CMAKE_EXTRA_FLAGS="${{ matrix.cmake-flags }}"
 
       - name: Package
-        shell: bash
-        run: |
-          cd cortex-cpp
-          mkdir -p nitro
-          mkdir -p nitro/engines/cortex.llamacpp
-          cp build/nitro nitro/
-          cp build/engines/cortex.llamacpp/libengine.dylib nitro/engines/cortex.llamacpp/
-
-      - name: Upload Artifact
-        uses: actions/upload-artifact@v2
-        with:
-          name: nitro-mac-amd64
-          path: ./cortex-cpp/nitro
-
-      - name: Run e2e testing - LLama.CPP
-        shell: bash
-        run: |
-          # run e2e testing
-          cd cortex-cpp
-          cd nitro
-          chmod +x ../../.github/scripts/e2e-test-llama-linux-and-mac.sh && ../../.github/scripts/e2e-test-llama-linux-and-mac.sh ./nitro ${{ env.LLM_MODEL_URL }} ${{ env.EMBEDDING_MODEL_URL }}
-          rm -rf uploads/
-
-  universal-nitro-artifact-macos:
-    runs-on: macos-latest
-    needs: [create-draft-release, set-nitro-version, macOS-silicon-build, macOS-amd64-build]
-    if: always() && (needs.create-draft-release.result == 'success' || needs.create-draft-release.result == 'skipped') && needs.set-nitro-version.result == 'success' 
-    timeout-minutes: 40
-    permissions:
-      contents: write
-    steps:
-      - name: download artifact amd64
-        uses: actions/download-artifact@v2
-        with:
-          name: nitro-mac-amd64
-          path: ./cortex-cpp/nitro-mac-amd64
-
-      - name: download artifact arm64
-        uses: actions/download-artifact@v2
-        with:
-          name: nitro-mac-arm64
-          path: ./cortex-cpp/nitro-mac-arm64
-      
-      - name: bundle universal binary
-        run: |
-          cd cortex-cpp
-          mkdir -p nitro
-          mkdir -p nitro/engines/cortex.llamacpp/
-          ls ./nitro-mac-amd64
-          lipo -create ./nitro-mac-amd64/nitro ./nitro-mac-arm64/nitro -output ./nitro/nitro
-          lipo -create ./nitro-mac-amd64/engines/cortex.llamacpp/libengine.dylib ./nitro-mac-arm64/engines/cortex.llamacpp/libengine.dylib -output ./nitro/engines/cortex.llamacpp/libengine.dylib
-          tar -czvf nitro.tar.gz nitro
-
-      - name: Upload Artifact
-        uses: actions/upload-artifact@v2
-        with:
-          name: nitro-mac-universal
-          path: ./cortex-cpp/nitro
-
-      - uses: actions/upload-release-asset@v1.0.1
-        if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/')
-        env:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-        with:
-          upload_url: ${{ needs.create-draft-release.outputs.upload_url }}
-          asset_path: ./cortex-cpp/nitro.tar.gz
-          asset_name: nitro-${{ needs.create-draft-release.outputs.version }}-mac-universal.tar.gz
-          asset_content_type: application/gzip
-
-  windows-amd64-build:
-    runs-on: windows-latest
-    needs: [create-draft-release, set-nitro-version]
-    if: always() && (needs.create-draft-release.result == 'success' || needs.create-draft-release.result == 'skipped') && needs.set-nitro-version.result == 'success'
-    timeout-minutes: 40
-
-    strategy:
-      matrix:
-        include:
-          - build: "amd64-avx2"
-            defines: "-DLLAMA_AVX2=ON -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE"
-          - build: "amd64-avx"
-            defines: "-DLLAMA_AVX2=OFF -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE"
-          - build: "amd64-avx512"
-            defines: "-DLLAMA_AVX512=ON -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE"
-          - build: "amd64-vulkan"
-            defines: "-DLLAMA_VULKAN=ON -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE"
-          # - build: "arm64"
-          #   defines: "-A ARM64 -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON"
-
-    permissions:
-      contents: write
-
-    steps:
-      - name: Clone
-
-        id: checkout
-        uses: actions/checkout@v3
-        with:
-          submodules: recursive
-
-      - name: Setup VSWhere.exe
-        uses: warrenbuckley/Setup-VSWhere@v1
-        with:
-          version: latest
-          silent: true
-        env:
-          ACTIONS_ALLOW_UNSECURE_COMMANDS: true
-
-      - name: Add msbuild to PATH
-        uses: microsoft/setup-msbuild@v1
-
-      - name: actions-setup-cmake
-        uses: jwlawson/actions-setup-cmake@v1.14.1
-
-      - name: Prepare Vulkan SDK
-        uses: humbletim/setup-vulkan-sdk@v1.2.0
-        if: ${{ matrix.build == 'amd64-vulkan' }}
-        with:
-          vulkan-query-version: 1.3.275.0
-          vulkan-components: Vulkan-Headers, Vulkan-Loader
-          vulkan-use-cache: true
-
-      - name: Build
-        id: cmake_build
-        shell: cmd
-        run: |
-          cd cortex-cpp
-          cmake -S ./nitro_deps -B ./build_deps/nitro_deps
-          cmake --build ./build_deps/nitro_deps --config Release
-          mkdir -p build
-          cd build
-          cmake .. ${{ matrix.defines }} -DNITRO_VERSION=${{ needs.set-nitro-version.outputs.version }}
-          cmake --build . --config Release -j "%NUMBER_OF_PROCESSORS%"
-
-      - name: Pack artifacts
-        id: pack_artifacts
-        shell: cmd
-        run: |
-          cd cortex-cpp
-          mkdir .\build\Release\engines\cortex.llamacpp\
-          robocopy .\build\engines\cortex.llamacpp\ .\build\Release\engines\cortex.llamacpp\ engine.dll
-          robocopy build_deps\_install\bin\ .\build\Release\ zlib.dll
-          robocopy build\bin\Release\ .\build\Release\ llama.dll
-          robocopy ..\.github\patches\windows\ .\build\Release\ msvcp140.dll
-          robocopy ..\.github\patches\windows\ .\build\Release\ vcruntime140_1.dll
-          robocopy ..\.github\patches\windows\ .\build\Release\ vcruntime140.dll
-          dotnet tool install --global AzureSignTool
-          azuresigntool.exe sign -kvu "${{ secrets.AZURE_KEY_VAULT_URI }}" -kvi "${{ secrets.AZURE_CLIENT_ID }}" -kvt "${{ secrets.AZURE_TENANT_ID }}" -kvs "${{ secrets.AZURE_CLIENT_SECRET }}" -kvc ${{ secrets.AZURE_CERT_NAME }} -tr http://timestamp.globalsign.com/tsa/r6advanced1 -v ".\build\Release\nitro.exe"
-          7z a -ttar temp.tar .\build\Release\*
-          7z a -tgzip nitro.tar.gz temp.tar
-
-      - name: Run e2e testing - Llama.cpp
-        shell: cmd
-        if: ${{ matrix.build != 'arm64' && matrix.build != 'amd64-vulkan' && matrix.build != 'amd64-avx512' }}
-        run: |
-          cd cortex-cpp
-          cd build\Release
-          ..\..\..\.github\scripts\e2e-test-llama-windows.bat nitro.exe ${{ env.LLM_MODEL_URL }} ${{ env.EMBEDDING_MODEL_URL }}
-          rmdir /S /Q .\build\Release\uploads
-
-      - name: Upload Artifact
-        uses: actions/upload-artifact@v2
-        if: github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' || github.event_name == 'pull_request'
-        with:
-          name: nitro-win-${{ matrix.build }}
-          path: ./cortex-cpp/build/Release
-
-      - uses: actions/upload-release-asset@v1.0.1
-        if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/')
-        env:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-        with:
-          upload_url: ${{ needs.create-draft-release.outputs.upload_url }}
-          asset_path: ./cortex-cpp/nitro.tar.gz
-          asset_name: nitro-${{ needs.create-draft-release.outputs.version }}-win-${{ matrix.build }}.tar.gz
-          asset_content_type: application/gzip
-
-  windows-amd64-cuda-build:
-    runs-on: windows-cuda-${{ matrix.cuda }}
-    needs: [create-draft-release, set-nitro-version]
-    if: always() && (needs.create-draft-release.result == 'success' || needs.create-draft-release.result == 'skipped') && needs.set-nitro-version.result == 'success'
-    timeout-minutes: 40
-    permissions:
-      contents: write
-
-    strategy:
-      matrix:
-        include:
-          - cuda: "12-0"
-            instructions: "amd64-avx2"
-            inst-flags: "-DLLAMA_AVX2=ON -DLLAMA_NATIVE=OFF"
-            cmake-flags: "-DCUDA_12_0=ON -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE"
-          - cuda: "12-0"
-            instructions: "amd64-avx"
-            inst-flags: "-DLLAMA_AVX2=OFF -DLLAMA_NATIVE=OFF"
-            cmake-flags: "-DCUDA_12_0=ON -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE"
-          - cuda: "12-0"
-            instructions: "amd64-avx512"
-            inst-flags: "-DLLAMA_AVX512=ON -DLLAMA_NATIVE=OFF"
-            cmake-flags: "-DCUDA_12_0=ON -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE"
-          - cuda: "11-7"
-            instructions: "amd64-avx2"
-            inst-flags: "-DLLAMA_AVX2=ON -DLLAMA_NATIVE=OFF"
-            cmake-flags: "-DCUDA_11_7=ON -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE"
-          - cuda: "11-7"
-            instructions: "amd64-avx"
-            inst-flags: "-DLLAMA_AVX2=OFF -DLLAMA_NATIVE=OFF"
-            cmake-flags: "-DCUDA_11_7=ON -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE"
-          - cuda: "11-7"
-            instructions: "amd64-avx512"
-            inst-flags: "-DLLAMA_AVX512=ON -DLLAMA_NATIVE=OFF"
-            cmake-flags: "-DCUDA_11_7=ON -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE"
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v3
-        with:
-          submodules: recursive
-
-      - uses: actions/setup-dotnet@v3
-        with:
-          dotnet-version: "6.0.x"
-
-      - name: Build
-        id: cmake_build
-        shell: cmd
         run: |
-          cd cortex-cpp
-          cmake -S ./nitro_deps -B ./build_deps/nitro_deps
-          cmake --build ./build_deps/nitro_deps --config Release
-          mkdir -p build
-          cd build
-          cmake .. ${{ matrix.inst-flags }} ${{ matrix.cmake-flags }} -DNITRO_VERSION=${{ needs.set-nitro-version.outputs.version }}
-          cmake --build . --config Release -j "%NUMBER_OF_PROCESSORS%"
+          cd cortex-cpp
+          make package
 
-      - name: Pack artifacts
-        id: pack_artifacts
-        shell: cmd
+      - name: Run e2e testing
+        if: ${{ matrix.run-e2e }}
         run: |
-          set PATH=%PATH%;C:\Program Files\7-Zip\
-          cd cortex-cpp
-          mkdir .\build\Release\engines\cortex.llamacpp\
-          robocopy .\build\engines\cortex.llamacpp\ .\build\Release\engines\cortex.llamacpp\ engine.dll
-          robocopy build_deps\_install\bin\ .\build\Release\ zlib.dll
-          robocopy build\bin\Release\ .\build\Release\ llama.dll
-          robocopy ..\.github\patches\windows\ .\build\Release\ msvcp140.dll
-          robocopy ..\.github\patches\windows\ .\build\Release\ vcruntime140_1.dll
-          robocopy ..\.github\patches\windows\ .\build\Release\ vcruntime140.dll
-          dotnet tool install --global AzureSignTool
-          %USERPROFILE%\.dotnet\tools\azuresigntool.exe sign -kvu "${{ secrets.AZURE_KEY_VAULT_URI }}" -kvi "${{ secrets.AZURE_CLIENT_ID }}" -kvt "${{ secrets.AZURE_TENANT_ID }}" -kvs "${{ secrets.AZURE_CLIENT_SECRET }}" -kvc ${{ secrets.AZURE_CERT_NAME }} -tr http://timestamp.globalsign.com/tsa/r6advanced1 -v ".\build\Release\nitro.exe"
-          7z a -ttar temp.tar .\build\Release\*
-          7z a -tgzip nitro.tar.gz temp.tar
+          cd cortex-cpp
+          make run-e2e-test LLM_MODEL_URL="${{ env.LLM_MODEL_URL }}" EMBEDDING_MODEL_URL="${{ env.EMBEDDING_MODEL_URL }}"
 
       - name: Upload Artifact
         uses: actions/upload-artifact@v2
-        if: github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' || github.event_name == 'pull_request'
         with:
-          name: nitro-win-${{ matrix.instructions }}-cuda-${{ matrix.cuda }}
-          path: ./cortex-cpp/build/Release
+          name: cortex-llamacpp-${{ matrix.os }}-${{ matrix.name }}
+          path: ./cortex-cpp/cortex.llamacpp
 
       - uses: actions/upload-release-asset@v1.0.1
         if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/')
@@ -615,103 +194,6 @@ jobs:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
         with:
           upload_url: ${{ needs.create-draft-release.outputs.upload_url }}
-          asset_path: ./cortex-cpp/nitro.tar.gz
-          asset_name: nitro-${{ needs.create-draft-release.outputs.version }}-win-${{ matrix.instructions }}-cuda-${{ matrix.cuda }}.tar.gz
-          asset_content_type: application/gzip
-
-  update_release_draft:
-    if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/')
-    timeout-minutes: 40
-    needs:
-      [
-        ubuntu-amd64-build,
-        ubuntu-amd64-cuda-build,
-        macOS-silicon-build,
-        macOS-amd64-build,
-        windows-amd64-build,
-        windows-amd64-cuda-build,
-      ]
-    permissions:
-      contents: write
-      pull-requests: write
-    runs-on: ubuntu-latest
-    steps:
-      - uses: release-drafter/release-drafter@v5
-        env:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-
-  noti-discord-nightly:
-    timeout-minutes: 40
-    if: github.event_name == 'schedule' && (needs.create-draft-release.result == 'success' || needs.create-draft-release.result == 'skipped') && needs.ubuntu-amd64-build.result == 'success' && needs.ubuntu-amd64-cuda-build.result == 'success' && needs.macOS-silicon-build.result == 'success' && needs.macOS-amd64-build.result == 'success' && needs.windows-amd64-build.result == 'success' && needs.windows-amd64-cuda-build.result == 'success'
-    needs:
-      [
-        create-draft-release,
-        ubuntu-amd64-build,
-        ubuntu-amd64-cuda-build,
-        macOS-silicon-build,
-        macOS-amd64-build,
-        windows-amd64-build,
-        windows-amd64-cuda-build,
-      ]
-    runs-on: ubuntu-latest
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v3
-        with:
-          fetch-depth: "0"
-          token: ${{ secrets.PAT_SERVICE_ACCOUNT }}
-      - name: Notify Discord
-        uses: Ilshidur/action-discord@master
-        with:
-          args: "Nightly build artifact: https://github.com/janhq/nitro/actions/runs/{{ GITHUB_RUN_ID }}"
-        env:
-          DISCORD_WEBHOOK: ${{ secrets.DISCORD_WEBHOOK }}
-      - name: Update README.md with artifact URL
-        run: |
-          sed -i "s|<a href='https://github.com/janhq/nitro/actions/runs/.*'>|<a href='https://github.com/janhq/nitro/actions/runs/${GITHUB_RUN_ID}'>|" README.md
-          git config --global user.email "service@jan.ai"
-          git config --global user.name "Service Account"
-          git add README.md
-          git commit -m "${GITHUB_REPOSITORY}: Update README.md with nightly build artifact URL"
-          git -c http.extraheader="AUTHORIZATION: bearer ${{ secrets.PAT_SERVICE_ACCOUNT }}" push origin HEAD:main
-        env:
-          GITHUB_RUN_ID: ${{ github.run_id }}
-
-  noti-discord-manual:
-    timeout-minutes: 40
-    if: github.event_name == 'workflow_dispatch' && (needs.create-draft-release.result == 'success' || needs.create-draft-release.result == 'skipped') && needs.ubuntu-amd64-build.result == 'success' && needs.ubuntu-amd64-cuda-build.result == 'success' && needs.macOS-silicon-build.result == 'success' && needs.macOS-amd64-build.result == 'success' && needs.windows-amd64-build.result == 'success' && needs.windows-amd64-cuda-build.result == 'success'
-    needs:
-      [
-        create-draft-release,
-        ubuntu-amd64-build,
-        ubuntu-amd64-cuda-build,
-        macOS-silicon-build,
-        macOS-amd64-build,
-        windows-amd64-build,
-        windows-amd64-cuda-build,
-      ]
-    runs-on: ubuntu-latest
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v3
-        with:
-          fetch-depth: "0"
-          token: ${{ secrets.PAT_SERVICE_ACCOUNT }}
-      - name: Notify Discord
-        uses: Ilshidur/action-discord@master
-        with:
-          args: "Manual build artifact: https://github.com/janhq/nitro/actions/runs/{{ GITHUB_RUN_ID }}"
-        env:
-          DISCORD_WEBHOOK: ${{ secrets.DISCORD_WEBHOOK }}
-      # Update README.md with artifact URL if manual build from main branch
-      - name: Update README.md with artifact URL
-        if: github.ref == 'refs/heads/main'
-        run: |
-          sed -i "s|<a href='https://github.com/janhq/nitro/actions/runs/.*'>|<a href='https://github.com/janhq/nitro/actions/runs/${GITHUB_RUN_ID}'>|" README.md
-          git config --global user.email "service@jan.ai"
-          git config --global user.name "Service Account"
-          git add README.md
-          git commit -m "${GITHUB_REPOSITORY}: Update README.md with nightly build artifact URL"
-          git -c http.extraheader="AUTHORIZATION: bearer ${{ secrets.PAT_SERVICE_ACCOUNT }}" push origin HEAD:main
-        env:
-          GITHUB_RUN_ID: ${{ github.run_id }}
+          asset_path: ./cortex-cpp/cortex.tar.gz
+          asset_name: cortex-llamacpp-engine-${{ needs.create-draft-release.outputs.version }}-${{ matrix.os }}-${{ matrix.name }}.tar.gz
+          asset_content_type: application/gzip
diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml
deleted file mode 100644
index 75d46cb03..000000000
--- a/.github/workflows/docs.yml
+++ /dev/null
@@ -1,95 +0,0 @@
-name: Nitro Docs
-
-on:
-  push:
-    branches:
-      - main
-    paths:
-      - 'docs/**'
-      - '.github/workflows/docs.yml'
-  pull_request:
-    branches:
-      - main
-    paths:
-      - 'docs/**'
-      - '.github/workflows/docs.yml'
-    # Review gh actions docs if you want to further define triggers, paths, etc
-    # https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#on
-
-jobs:
-  deploy:
-    name: Deploy to GitHub Pages
-    env:
-      CLOUDFLARE_ACCOUNT_ID: 9707100ef42a1a25bd70e3ee2137bd0e
-      CLOUDFLARE_PROJECT_NAME: nitro
-    runs-on: ubuntu-latest
-    steps:
-      - uses: actions/checkout@v3
-      - uses: actions/setup-node@v3
-        with:
-          node-version: 18
-
-      - name: Install jq      
-        uses: dcarbone/install-jq-action@v2.0.1
-
-      - name: Fill env vars
-        run: |
-          env_example_file=".env.example"
-          touch .env
-          while IFS= read -r line || [[ -n "$line" ]]; do
-            if [[ "$line" == *"="* ]]; then
-              var_name=$(echo $line | cut -d '=' -f 1)
-              echo $var_name
-              var_value="$(jq -r --arg key "$var_name" '.[$key]' <<< "$SECRETS")"
-              echo "$var_name=$var_value" >> .env
-            fi
-          done < "$env_example_file"
-        working-directory: docs
-        env:
-          SECRETS: '${{ toJson(secrets) }}'
-
-      - name: Install dependencies
-        run: yarn install
-        working-directory: docs
-      - name: Build website
-        run: sed -i '/process.env.DEBUG = namespaces;/c\// process.env.DEBUG = namespaces;' ./node_modules/debug/src/node.js && yarn build
-        working-directory: docs
-
-      - name: Publish to Cloudflare Pages PR Preview and Staging
-        if: (github.event_name == 'push' && github.ref == 'refs/heads/main') || (github.event_name == 'pull_request' && github.event.pull_request.base.ref == 'main')
-        uses: cloudflare/pages-action@v1
-        with:
-          apiToken: ${{ secrets.CLOUDFLARE_API_TOKEN }}
-          accountId: ${{ env.CLOUDFLARE_ACCOUNT_ID }}
-          projectName: ${{ env.CLOUDFLARE_PROJECT_NAME }}
-          directory: ./docs/build
-          # Optional: Enable this if you want to have GitHub Deployments triggered
-          gitHubToken: ${{ secrets.GITHUB_TOKEN }}
-        id: deployCloudflarePages
-
-      - uses: mshick/add-pr-comment@v2
-        if: github.event_name == 'pull_request' && github.event.pull_request.base.ref == 'main'
-        with:
-          message: |
-              Preview URL: ${{ steps.deployCloudflarePages.outputs.url }}
-
-      - name: Add Custome Domain file
-        if: github.event_name == 'push' && github.event.pull_request.head.repo.full_name != github.repository
-        run: echo "${{ vars.DOCUSAURUS_DOMAIN }}" > ./docs/build/CNAME
-
-      # Popular action to deploy to GitHub Pages:
-      # Docs: https://github.com/peaceiris/actions-gh-pages#%EF%B8%8F-docusaurus
-      - name: Deploy to GitHub Pages
-        if: github.event_name == 'push' && github.event.pull_request.head.repo.full_name != github.repository
-        uses: peaceiris/actions-gh-pages@v3
-        with:
-          github_token: ${{ secrets.GITHUB_TOKEN }}
-          # Build output to publish to the `gh-pages` branch:
-          publish_dir: ./docs/build
-          # The following lines assign commit authorship to the official
-          # GH-Actions bot for deploys to `gh-pages` branch:
-          # https://github.com/actions/checkout/issues/13#issuecomment-724415212
-          # The GH actions bot is used by default if you didn't specify the two fields.
-          # You can swap them out with your own user credentials.
-          user_name: github-actions[bot]
-          user_email: 41898282+github-actions[bot]@users.noreply.github.com
\ No newline at end of file
diff --git a/.github/workflows/quality-gate.yml b/.github/workflows/quality-gate.yml
new file mode 100644
index 000000000..82a98f3e0
--- /dev/null
+++ b/.github/workflows/quality-gate.yml
@@ -0,0 +1,163 @@
+name: CI Quality Gate
+
+on:
+  pull_request:  # runs for pull requests that touch files under cortex-cpp/
+    types: [opened, synchronize, reopened]
+    paths:
+      [
+        "cortex-cpp/**",
+      ]
+  workflow_dispatch:  # also allows manually triggered runs
+
+env:  # model download URLs forwarded to `make run-e2e-test` by the e2e step
+  LLM_MODEL_URL: https://delta.jan.ai/tinyllama-1.1b-chat-v0.3.Q2_K.gguf
+  EMBEDDING_MODEL_URL: https://catalog.jan.ai/dist/models/embeds/nomic-embed-text-v1.5.f16.gguf
+
+jobs:
+  build-and-test:  # one job, expanded once per matrix entry below
+    runs-on: ${{ matrix.runs-on }}
+    timeout-minutes: 40
+    strategy:
+      fail-fast: false  # a failing entry does not cancel the remaining entries
+      matrix:
+        include:  # per entry: os/name label the artifact, runs-on selects the runner
+          - os: "linux"
+            name: "amd64-avx2"
+            runs-on: "ubuntu-18-04"
+            cmake-flags: "-DLLAMA_AVX2=ON -DLLAMA_NATIVE=OFF"  # passed to `make build` as CMAKE_EXTRA_FLAGS
+            run-e2e: true  # gates the "Run e2e testing" step
+
+          - os: "linux"
+            name: "amd64-avx"
+            runs-on: "ubuntu-18-04"
+            cmake-flags: "-DLLAMA_AVX2=OFF -DLLAMA_NATIVE=OFF"
+            run-e2e: false
+
+          - os: "linux"
+            name: "amd64-avx512"
+            runs-on: "ubuntu-18-04"
+            cmake-flags: "-DLLAMA_AVX512=ON -DLLAMA_NATIVE=OFF"
+            run-e2e: false
+
+          - os: "linux"
+            name: "amd64-vulkan"
+            runs-on: "ubuntu-18-04-cuda-11-7"
+            cmake-flags: "-DLLAMA_VULKAN=ON -DLLAMA_NATIVE=OFF"
+            run-e2e: false
+
+          - os: "linux"
+            name: "amd64-cuda-11-7"
+            runs-on: "ubuntu-18-04-cuda-11-7"
+            cmake-flags: "-DCUDA_11_7=ON -DLLAMA_NATIVE=OFF -DLLAMA_CUDA=ON"
+            run-e2e: false
+
+          - os: "linux"
+            name: "amd64-cuda-12-0"
+            runs-on: "ubuntu-18-04-cuda-12-0"
+            cmake-flags: "-DCUDA_12_0=ON -DLLAMA_NATIVE=OFF -DLLAMA_CUDA=ON"
+            run-e2e: false
+
+          - os: "mac"
+            name: "amd64"
+            runs-on: "macos-13"
+            cmake-flags: ""
+            run-e2e: true
+
+          - os: "mac"
+            name: "arm64"
+            runs-on: "mac-silicon"
+            cmake-flags: "-DMAC_ARM64=ON"
+            run-e2e: true
+
+          - os: "windows"
+            name: "amd64-avx2"
+            runs-on: "windows-latest"
+            cmake-flags: "-DLLAMA_AVX2=ON -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE"
+            run-e2e: true
+
+          - os: "windows"
+            name: "amd64-avx"
+            runs-on: "windows-latest"
+            cmake-flags: "-DLLAMA_AVX2=OFF -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE"
+            run-e2e: false
+
+          - os: "windows"
+            name: "amd64-avx512"
+            runs-on: "windows-latest"
+            cmake-flags: "-DLLAMA_AVX512=ON -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE"
+            run-e2e: false
+
+          - os: "windows"
+            name: "amd64-vulkan"
+            runs-on: "windows-latest"
+            cmake-flags: "-DLLAMA_VULKAN=ON -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE"
+            run-e2e: false
+
+          - os: "windows"
+            name: "amd64-avx2-cuda-12-0"
+            runs-on: "windows-cuda-12-0"
+            cmake-flags: "-DLLAMA_AVX2=ON -DLLAMA_NATIVE=OFF -DCUDA_12_0=ON -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE"
+            run-e2e: false
+
+          - os: "windows"
+            name: "amd64-avx-cuda-12-0"
+            runs-on: "windows-cuda-12-0"
+            cmake-flags: "-DLLAMA_AVX2=OFF -DLLAMA_NATIVE=OFF -DCUDA_12_0=ON -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE"
+            run-e2e: false
+
+          - os: "windows"
+            name: "amd64-avx512-cuda-12-0"
+            runs-on: "windows-cuda-12-0"
+            cmake-flags: "-DLLAMA_AVX512=ON -DLLAMA_NATIVE=OFF -DCUDA_12_0=ON -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE"
+            run-e2e: false
+
+          - os: "windows"
+            name: "amd64-avx2-cuda-11-7"
+            runs-on: "windows-cuda-11-7"
+            cmake-flags: "-DLLAMA_AVX2=ON -DLLAMA_NATIVE=OFF -DCUDA_11_7=ON -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE"
+            run-e2e: false
+
+          - os: "windows"
+            name: "amd64-avx-cuda-11-7"
+            runs-on: "windows-cuda-11-7"
+            cmake-flags: "-DLLAMA_AVX2=OFF -DLLAMA_NATIVE=OFF -DCUDA_11_7=ON -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE"
+            run-e2e: false
+          - os: "windows"
+            name: "amd64-avx512-cuda-11-7"
+            runs-on: "windows-cuda-11-7"
+            cmake-flags: "-DLLAMA_AVX512=ON -DLLAMA_NATIVE=OFF -DCUDA_11_7=ON -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE"
+            run-e2e: false
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v3
+        with:
+          submodules: recursive  # also fetch nested git submodules
+
+      - name: Install choco on Windows  # installs `make` through Chocolatey
+        if: runner.os == 'Windows'
+        run: |
+          choco install make -y
+
+      - name: Build
+        run: |
+          cd cortex-cpp
+          make build CMAKE_EXTRA_FLAGS="${{ matrix.cmake-flags }}"
+
+      - name: Package
+        run: |
+          cd cortex-cpp
+          make package
+
+      - name: Run e2e testing
+        if: ${{ matrix.run-e2e }}  # only matrix entries with run-e2e: true
+        run: |
+          cd cortex-cpp
+          make run-e2e-test RUN_TESTS=true LLM_MODEL_URL=${{ env.LLM_MODEL_URL }} EMBEDDING_MODEL_URL=${{ env.EMBEDDING_MODEL_URL }}
+
+      - name: Upload Artifact
+        uses: actions/upload-artifact@v2
+        with:
+          name: cortex-llamacpp-engine-${{ matrix.os }}-${{ matrix.name }}
+          path: ./cortex-cpp/cortex  # directory staged by `make package`
\ No newline at end of file
diff --git a/cortex-cpp/.gitignore b/cortex-cpp/.gitignore
index be1237faa..69c167305 100644
--- a/cortex-cpp/.gitignore
+++ b/cortex-cpp/.gitignore
@@ -85,7 +85,6 @@ CMakeCache.txt
 CMakeFiles
 CMakeScripts
 Testing
-Makefile
 !nitro-node/Makefile
 cmake_install.cmake
 install_manifest.txt
diff --git a/cortex-cpp/Makefile b/cortex-cpp/Makefile
new file mode 100644
index 000000000..998011128
--- /dev/null
+++ b/cortex-cpp/Makefile
@@ -0,0 +1,67 @@
+# Makefile for Cortex llamacpp engine - Build, Package, and E2E Test
+.PHONY: all build package run-e2e-test
+
+# User-tunable knobs; override on the command line, e.g. `make build CMAKE_EXTRA_FLAGS=-DLLAMA_AVX2=ON`.
+CMAKE_EXTRA_FLAGS ?=
+RUN_TESTS ?= false
+LLM_MODEL_URL ?= "https://delta.jan.ai/tinyllama-1.1b-chat-v0.3.Q2_K.gguf"
+EMBEDDING_MODEL_URL ?= "https://catalog.jan.ai/dist/models/embeds/nomic-embed-text-v1.5.f16.gguf"
+
+# Default target: builds nothing, only prints a hint to pick an explicit target.
+all:
+	@echo "Specify a target to run"
+
+# Build the Cortex engine.
+# CMAKE_EXTRA_FLAGS is appended verbatim to the `cmake ..` configure command.
+# Windows: builds ./nitro_deps with CMake, then configures and builds ./build (Release).
+# Elsewhere: runs ./install_deps.sh, then configures and builds in ./build.
+# The non-Windows steps are chained with `&&` so a failed configure stops the build
+# instead of falling through to `make` on a stale or missing build tree.
+build:
+ifeq ($(OS),Windows_NT)
+	@powershell -Command "cmake -S ./nitro_deps -B ./build_deps/nitro_deps;"
+	@powershell -Command "cmake --build ./build_deps/nitro_deps --config Release -j4;"
+	@powershell -Command "mkdir -p build; cd build; cmake .. $(CMAKE_EXTRA_FLAGS); cmake --build . --config Release -j4;"
+else
+	@./install_deps.sh
+	@mkdir -p build && cd build && \
+	cmake .. $(CMAKE_EXTRA_FLAGS) && \
+	make -j4
+endif
+
+# Stage the built binary and engine library under ./cortex and archive it as cortex.tar.gz.
+# Every platform writes the same archive name; non-Windows steps are chained so a failed copy fails the target.
+package:
+ifeq ($(OS),Windows_NT)
+	@powershell -Command "mkdir -p cortex\engines\cortex.llamacpp\; cp build\engines\cortex.llamacpp\engine.dll cortex\engines\cortex.llamacpp\;"
+	@powershell -Command "cp build\Release\nitro.exe .\cortex\;"
+	@powershell -Command "cp ..\.github\patches\windows\msvcp140.dll .\cortex\;"
+	@powershell -Command "cp ..\.github\patches\windows\vcruntime140_1.dll .\cortex\;"
+	@powershell -Command "cp ..\.github\patches\windows\vcruntime140.dll .\cortex\;"
+	@powershell -Command "7z a -ttar temp.tar cortex\\*; 7z a -tgzip cortex.tar.gz temp.tar;"
+else ifeq ($(shell uname -s),Linux)
+	@mkdir -p cortex/engines/cortex.llamacpp && \
+	cp build/engines/cortex.llamacpp/libengine.so cortex/engines/cortex.llamacpp/ && \
+	cp build/nitro cortex/ && tar -czvf cortex.tar.gz cortex
+else
+	@mkdir -p cortex/engines/cortex.llamacpp && \
+	cp build/engines/cortex.llamacpp/libengine.dylib cortex/engines/cortex.llamacpp/ && \
+	cp build/nitro cortex/ && tar -czvf cortex.tar.gz cortex
+endif
+
+# Run the end-to-end test script against the packaged binary in ./cortex.
+# The script's exit status is kept across the uploads/ cleanup so a failing test fails the target.
+# NOTE(review): the RUN_TESTS guard only prints a message; `exit 0` ends its own recipe shell,
+# so the platform branch below still runs. Confirm the intended default before tightening it.
+run-e2e-test:
+ifeq ($(RUN_TESTS),false)
+	@echo "Skipping tests"
+	@exit 0
+endif
+ifeq ($(OS),Windows_NT)
+	@powershell -Command "cd cortex; ..\..\.github\scripts\e2e-test-llama-windows.bat nitro.exe $(LLM_MODEL_URL) $(EMBEDDING_MODEL_URL);"
+else
+	@cd cortex || exit 1; \
+	chmod +x ../../.github/scripts/e2e-test-llama-linux-and-mac.sh && ../../.github/scripts/e2e-test-llama-linux-and-mac.sh ./nitro $(LLM_MODEL_URL) $(EMBEDDING_MODEL_URL); \
+	status=$$?; rm -rf uploads/; exit $$status
+endif
\ No newline at end of file