diff --git a/.github/workflows/build.yml b/.github/workflows/cortex-cpp-build.yml
similarity index 81%
rename from .github/workflows/build.yml
rename to .github/workflows/cortex-cpp-build.yml
index 716dfd679..69915a034 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/cortex-cpp-build.yml
@@ -1,4 +1,4 @@
-name: CI
+name: CI Cortex CPP
 
 on:
   push:
@@ -25,7 +25,8 @@ jobs:
     steps:
       - name: Extract tag name without v prefix
         id: get_version
-        run: echo "VERSION=${GITHUB_REF#refs/tags/v}" >> $GITHUB_ENV && echo "::set-output name=version::${GITHUB_REF#refs/tags/v}"
+        run: |
+          echo "VERSION=${GITHUB_REF#refs/tags/v}" >> "$GITHUB_ENV" && echo "version=${GITHUB_REF#refs/tags/v}" >> "$GITHUB_OUTPUT"
         env:
           GITHUB_REF: ${{ github.ref }}
       - name: Create Draft Release
@@ -91,7 +92,7 @@ jobs:
 
           - os: "mac"
             name: "arm64"
-            runs-on: "mac-silicon"
+            runs-on: "macos-latest"
             cmake-flags: "-DMAC_ARM64=ON"
             run-e2e: true
 
@@ -161,16 +162,54 @@ jobs:
         with:
           submodules: recursive
 
+      - uses: actions/setup-dotnet@v3
+        if: runner.os == 'Windows'
+        with:
+          dotnet-version: "8.0.x"
+
       - name: Install choco on Windows
         if: runner.os == 'Windows'
         run: |
           choco install make -y
 
+      - name: Get Cer for code signing
+        if: runner.os == 'macOS'
+        run: base64 -d <<< "$CODE_SIGN_P12_BASE64" > /tmp/codesign.p12
+        shell: bash
+        env:
+          CODE_SIGN_P12_BASE64: ${{ secrets.CODE_SIGN_P12_BASE64 }}
+  
+      - uses: apple-actions/import-codesign-certs@v2
+        if: runner.os == 'macOS'
+        with:
+          p12-file-base64: ${{ secrets.CODE_SIGN_P12_BASE64 }}
+          p12-password: ${{ secrets.CODE_SIGN_P12_PASSWORD }}
+
       - name: Build
         run: |
           cd cortex-cpp
           make build CMAKE_EXTRA_FLAGS="${{ matrix.cmake-flags }}"
 
+      - name: Pre-package
+        run: |
+          cd cortex-cpp
+          make pre-package
+
+      - name: Code Signing macOS
+        if: runner.os == 'macOS'
+        run: |
+          cd cortex-cpp
+          make codesign CODE_SIGN=true DEVELOPER_ID="${{ secrets.DEVELOPER_ID }}"
+
+      - name: Code Signing Windows
+        if: runner.os == 'Windows'
+        shell: cmd
+        run: |
+          cd cortex-cpp
+          set PATH=%PATH%;%USERPROFILE%\.dotnet\tools
+          make codesign CODE_SIGN=true AZURE_KEY_VAULT_URI="${{ secrets.AZURE_KEY_VAULT_URI }}" AZURE_CLIENT_ID="${{ secrets.AZURE_CLIENT_ID }}" AZURE_TENANT_ID="${{ secrets.AZURE_TENANT_ID }}" AZURE_CLIENT_SECRET="${{ secrets.AZURE_CLIENT_SECRET }}" AZURE_CERT_NAME="${{ secrets.AZURE_CERT_NAME }}"
+  
+
       - name: Package
         run: |
           cd cortex-cpp
diff --git a/.github/workflows/quality-gate.yml b/.github/workflows/cortex-cpp-quality-gate.yml
similarity index 97%
rename from .github/workflows/quality-gate.yml
rename to .github/workflows/cortex-cpp-quality-gate.yml
index 909ab7e77..33c8a4533 100644
--- a/.github/workflows/quality-gate.yml
+++ b/.github/workflows/cortex-cpp-quality-gate.yml
@@ -1,4 +1,4 @@
-name: CI Quality Gate
+name: CI Quality Gate Cortex CPP
 
 on:
   pull_request:
@@ -145,6 +145,11 @@ jobs:
           cd cortex-cpp
           make build CMAKE_EXTRA_FLAGS="${{ matrix.cmake-flags }}"
 
+      - name: Pre-package
+        run: |
+          cd cortex-cpp
+          make pre-package
+ 
       - name: Package
         run: |
           cd cortex-cpp
diff --git a/.github/workflows/cortex-js.yml b/.github/workflows/cortex-js.yml
new file mode 100644
index 000000000..31175b1be
--- /dev/null
+++ b/.github/workflows/cortex-js.yml
@@ -0,0 +1,44 @@
+name: Publish cortex js Package to npmjs
+on:
+  push:
+    tags: ["v[0-9]+.[0-9]+.[0-9]+-cortex-js"]
+    paths:
+      [
+        "cortex-js/**",
+      ]
+jobs:
+  build-and-publish-plugins:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          fetch-depth: "0"
+
+      - name: Install jq
+        uses: dcarbone/install-jq-action@v2.0.1
+
+      - name: "Update version by tag"
+        run: |
+          cd cortex-js
+          # Remove the v prefix
+          tag_version=${GITHUB_REF#refs/tags/v}
+          # Remove the -cortex-js suffix
+          new_version=${tag_version%-cortex-js}
+
+          # Replace the old version with the new version in package.json
+          jq --arg version "$new_version" '.version = $version' ./package.json > /tmp/package.json && mv /tmp/package.json ./package.json
+
+          # Print the new version
+          echo "Updated package.json version to: $new_version"
+
+      # Setup .npmrc file to publish to npm
+      - uses: actions/setup-node@v3
+        with:
+          node-version: "20.x"
+          registry-url: "https://registry.npmjs.org"
+      - run: yarn install && yarn build
+        working-directory: ./cortex-js
+      - run: npm publish --access public
+        env:
+          NODE_AUTH_TOKEN: ${{ secrets.NPM_TOKEN }}
+        working-directory: ./cortex-js
diff --git a/cortex-cpp/Makefile b/cortex-cpp/Makefile
index 9f4c98d1b..98486f023 100644
--- a/cortex-cpp/Makefile
+++ b/cortex-cpp/Makefile
@@ -6,6 +6,13 @@ CMAKE_EXTRA_FLAGS ?= ""
 RUN_TESTS ?= false
 LLM_MODEL_URL ?= "https://delta.jan.ai/tinyllama-1.1b-chat-v0.3.Q2_K.gguf"
 EMBEDDING_MODEL_URL ?= "https://catalog.jan.ai/dist/models/embeds/nomic-embed-text-v1.5.f16.gguf"
+CODE_SIGN ?= false
+AZURE_KEY_VAULT_URI ?= xxxx
+AZURE_CLIENT_ID ?= xxxx
+AZURE_TENANT_ID ?= xxxx
+AZURE_CLIENT_SECRET ?= xxxx
+AZURE_CERT_NAME ?= xxxx
+DEVELOPER_ID ?= xxxx
 
 # Default target, does nothing
 all:
@@ -29,24 +36,47 @@ else
 	make -j4;
 endif
 
-package:
+pre-package:
 ifeq ($(OS),Windows_NT)
-	@powershell -Command "mkdir -p cortex-cpp\engines\cortex.llamacpp\; cp build\engines\cortex.llamacpp\engine.dll cortex-cpp\engines\cortex.llamacpp\;"
-	@powershell -Command "cp build\Release\cortex-cpp.exe .\cortex-cpp\;"
-	@powershell -Command "cp build-deps\_install\bin\zlib.dll .\cortex-cpp\;"
-	@powershell -Command "cp ..\.github\patches\windows\msvcp140.dll .\cortex-cpp\;"
-	@powershell -Command "cp ..\.github\patches\windows\vcruntime140_1.dll .\cortex-cpp\;"
-	@powershell -Command "cp ..\.github\patches\windows\vcruntime140.dll .\cortex-cpp\;"
-	@powershell -Command "7z a -ttar temp.tar cortex-cpp\\*; 7z a -tgzip cortex-cpp.tar.gz temp.tar;"
+	@powershell -Command "mkdir -p cortex-cpp\engines\cortex.llamacpp\; cp -r build\engines\cortex.llamacpp\engine.dll cortex-cpp\engines\cortex.llamacpp\;"
+	@powershell -Command "cp -r build\Release\cortex-cpp.exe .\cortex-cpp\;"
+	@powershell -Command "cp -r build-deps\_install\bin\zlib.dll .\cortex-cpp\;"
+	@powershell -Command "cp -r ..\.github\patches\windows\msvcp140.dll .\cortex-cpp\;"
+	@powershell -Command "cp -r ..\.github\patches\windows\vcruntime140_1.dll .\cortex-cpp\;"
+	@powershell -Command "cp -r ..\.github\patches\windows\vcruntime140.dll .\cortex-cpp\;"
 else ifeq ($(shell uname -s),Linux)
 	@mkdir -p cortex-cpp/engines/cortex.llamacpp; \
 	cp build/engines/cortex.llamacpp/libengine.so cortex-cpp/engines/cortex.llamacpp/; \
-	cp build/cortex-cpp cortex-cpp/; \
-	tar -czvf cortex-cpp.tar.gz cortex-cpp;
+	cp build/cortex-cpp cortex-cpp/;
 else
 	@mkdir -p cortex-cpp/engines/cortex.llamacpp; \
 	cp build/engines/cortex.llamacpp/libengine.dylib cortex-cpp/engines/cortex.llamacpp/; \
-	cp build/cortex-cpp cortex-cpp/; \
+	cp build/cortex-cpp cortex-cpp/;
+endif
+
+# Sign the staged binaries. Each recipe line runs in its own shell, so an
+# `exit 0` line cannot stop make from running the lines after it; the skip
+# case is therefore one branch of a single conditional chain rather than a
+# separate block that would fall through into the signing commands.
+codesign:
+ifeq ($(CODE_SIGN),false)
+	@echo "Skipping Code Sign"
+else ifeq ($(OS),Windows_NT)
+	@powershell -Command "dotnet tool install --global AzureSignTool;"
+	@powershell -Command 'azuresigntool.exe sign -kvu "$(AZURE_KEY_VAULT_URI)" -kvi "$(AZURE_CLIENT_ID)" -kvt "$(AZURE_TENANT_ID)" -kvs "$(AZURE_CLIENT_SECRET)" -kvc "$(AZURE_CERT_NAME)" -tr http://timestamp.globalsign.com/tsa/r6advanced1 -v ".\cortex-cpp\cortex-cpp.exe";'
+	@powershell -Command 'azuresigntool.exe sign -kvu "$(AZURE_KEY_VAULT_URI)" -kvi "$(AZURE_CLIENT_ID)" -kvt "$(AZURE_TENANT_ID)" -kvs "$(AZURE_CLIENT_SECRET)" -kvc "$(AZURE_CERT_NAME)" -tr http://timestamp.globalsign.com/tsa/r6advanced1 -v ".\cortex-cpp\engines\cortex.llamacpp\engine.dll";'
+else ifeq ($(shell uname -s),Linux)
+	@echo "Skipping Code Sign for linux"
+else
+	find "cortex-cpp" -type f -exec codesign --force -s "$(DEVELOPER_ID)" --options=runtime {} \;
+endif
+
+package:
+ifeq ($(OS),Windows_NT)
+	@powershell -Command "7z a -ttar temp.tar cortex-cpp\*; 7z a -tgzip cortex-cpp.tar.gz temp.tar;"
+else ifeq ($(shell uname -s),Linux)
+	tar -czvf cortex-cpp.tar.gz cortex-cpp;
+else
 	tar -czvf cortex-cpp.tar.gz cortex-cpp;
 endif
 
@@ -65,4 +95,13 @@ else
 	@cd cortex-cpp; \
 	chmod +x ../../.github/scripts/e2e-test-llama-linux-and-mac.sh && ../../.github/scripts/e2e-test-llama-linux-and-mac.sh ./cortex-cpp $(LLM_MODEL_URL) $(EMBEDDING_MODEL_URL); \
 	rm -rf uploads/;
+endif
+
+# Delete build outputs. PowerShell's rm is Remove-Item, which has no -rf switch;
+# `exit 0` keeps already-missing paths from failing the target, like rm -rf.
+clean:
+ifeq ($(OS),Windows_NT)
+	@powershell -Command "Remove-Item -Recurse -Force -ErrorAction SilentlyContinue build, build-deps, cortex-cpp, cortex-cpp.tar.gz; exit 0;"
+else
+	@rm -rf build build-deps cortex-cpp cortex-cpp.tar.gz
 endif
\ No newline at end of file
diff --git a/cortex-cpp/engines/cortex.llamacpp/engine.cmake b/cortex-cpp/engines/cortex.llamacpp/engine.cmake
index c273d7e38..635fb9945 100644
--- a/cortex-cpp/engines/cortex.llamacpp/engine.cmake
+++ b/cortex-cpp/engines/cortex.llamacpp/engine.cmake
@@ -1,6 +1,7 @@
 # cortex.llamacpp release version
-set(VERSION 0.1.2)
+set(VERSION 0.1.4)
 set(ENGINE_VERSION v${VERSION})
+add_compile_definitions(CORTEX_LLAMACPP_VERSION="${VERSION}")
 
 # MESSAGE("ENGINE_VERSION=" ${ENGINE_VERSION})
 
diff --git a/cortex-cpp/main.cc b/cortex-cpp/main.cc
index 53c65cd37..12cabeb0c 100644
--- a/cortex-cpp/main.cc
+++ b/cortex-cpp/main.cc
@@ -51,6 +51,10 @@ int main(int argc, char* argv[]) {
 #else
   LOG_INFO << "cortex-cpp version: undefined";
 #endif
+#ifdef CORTEX_LLAMACPP_VERSION
+  LOG_INFO << "cortex.llamacpp version: " << CORTEX_LLAMACPP_VERSION;
+#endif
+
   LOG_INFO << "Server started, listening at: " << host << ":" << port;
   LOG_INFO << "Please load your model";
   drogon::app().addListener(host, port);
diff --git a/cortex-js/.env.development b/cortex-js/.env.development
new file mode 100644
index 000000000..e69de29bb
diff --git a/cortex-js/.env.example b/cortex-js/.env.example
index 51944b2db..d0666607c 100644
--- a/cortex-js/.env.example
+++ b/cortex-js/.env.example
@@ -1,3 +1,2 @@
 EXTENSIONS_PATH=<EXTENSIONS_PATH>
 CORTEX_MODELS_DIR=<CORTEX_MODELS_DIR>
-CORTEX_BINARY_PATH=<CORTEX_BINARY_PATH>
\ No newline at end of file
diff --git a/cortex-js/constant.ts b/cortex-js/constant.ts
index 8f1ba5008..b9a983657 100644
--- a/cortex-js/constant.ts
+++ b/cortex-js/constant.ts
@@ -4,3 +4,6 @@ export const databaseFile = `${databaseName}.db`;
 
 export const defaultCortexJsHost = 'localhost';
 export const defaultCortexJsPort = 7331;
+
+export const defaultCortexCppHost = '127.0.0.1';
+export const defaultCortexCppPort = 3928;
diff --git a/cortex-js/package.json b/cortex-js/package.json
index f5519d13a..a66b5b6da 100644
--- a/cortex-js/package.json
+++ b/cortex-js/package.json
@@ -1,5 +1,5 @@
 {
-  "name": "cortex-js",
+  "name": "@janhq/cortex",
   "version": "0.0.1",
   "description": "",
   "author": "",
@@ -25,6 +25,7 @@
     "typeorm": "typeorm-ts-node-esm"
   },
   "dependencies": {
+    "@huggingface/gguf": "^0.1.5",
     "@nestjs/axios": "^3.0.2",
     "@nestjs/common": "^10.0.0",
     "@nestjs/config": "^3.2.2",
@@ -33,10 +34,12 @@
     "@nestjs/mapped-types": "*",
     "@nestjs/platform-express": "^10.0.0",
     "@nestjs/swagger": "^7.3.1",
+    "@terascope/fetch-github-release": "^0.8.8",
     "axios": "^1.6.8",
     "class-transformer": "^0.5.1",
     "class-validator": "^0.14.1",
     "cli-progress": "^3.12.0",
+    "decompress": "^4.2.1",
     "nest-commander": "^3.13.0",
     "readline": "^1.3.0",
     "reflect-metadata": "^0.2.0",
@@ -52,6 +55,7 @@
     "@nestjs/testing": "^10.0.0",
     "@nestjs/typeorm": "^10.0.2",
     "@types/cli-progress": "^3.11.5",
+    "@types/decompress": "^4.2.7",
     "@types/express": "^4.17.17",
     "@types/jest": "^29.5.2",
     "@types/node": "^20.12.9",
diff --git a/cortex-js/src/app.module.ts b/cortex-js/src/app.module.ts
index 748ca9bb9..3120e90c1 100644
--- a/cortex-js/src/app.module.ts
+++ b/cortex-js/src/app.module.ts
@@ -6,20 +6,19 @@ import { DevtoolsModule } from '@nestjs/devtools-integration';
 import { DatabaseModule } from './infrastructure/database/database.module';
 import { ChatModule } from './usecases/chat/chat.module';
 import { AssistantsModule } from './usecases/assistants/assistants.module';
-import { InferenceSettingsModule } from './usecases/inference-settings/inference-settings.module';
 import { ExtensionModule } from './infrastructure/repositories/extensions/extension.module';
 import { CortexModule } from './usecases/cortex/cortex.module';
 import { ConfigModule } from '@nestjs/config';
+import { env } from 'node:process';
 
 @Module({
   imports: [
     DevtoolsModule.register({
-      http: process.env.NODE_ENV !== 'production',
+      http: env.NODE_ENV !== 'production',
     }),
     ConfigModule.forRoot({
       isGlobal: true,
-      envFilePath:
-        process.env.NODE_ENV === 'production' ? '.env' : '.env.development',
+      envFilePath: env.NODE_ENV !== 'production' ? '.env.development' : '.env',
     }),
     DatabaseModule,
     MessagesModule,
@@ -27,7 +26,6 @@ import { ConfigModule } from '@nestjs/config';
     ModelsModule,
     ChatModule,
     AssistantsModule,
-    InferenceSettingsModule,
     CortexModule,
     ExtensionModule,
   ],
diff --git a/cortex-js/src/command.module.ts b/cortex-js/src/command.module.ts
index fe23ca16c..d15758746 100644
--- a/cortex-js/src/command.module.ts
+++ b/cortex-js/src/command.module.ts
@@ -1,14 +1,26 @@
 import { Module } from '@nestjs/common';
-import { BasicCommand } from './infrastructure/commanders/basic-command.commander';
 import { ModelsModule } from './usecases/models/models.module';
 import { DatabaseModule } from './infrastructure/database/database.module';
 import { ConfigModule } from '@nestjs/config';
 import { CortexModule } from './usecases/cortex/cortex.module';
 import { ServeCommand } from './infrastructure/commanders/serve.command';
-import { PullCommand } from './infrastructure/commanders/pull.command';
-import { InferenceCommand } from './infrastructure/commanders/inference.command';
 import { ModelsCommand } from './infrastructure/commanders/models.command';
-import { StartCommand } from './infrastructure/commanders/start.command';
+import { ExtensionModule } from './infrastructure/repositories/extensions/extension.module';
+import { ChatModule } from './usecases/chat/chat.module';
+import { InitCommand } from './infrastructure/commanders/init.command';
+import { HttpModule } from '@nestjs/axios';
+import { InitRunModeQuestions } from './infrastructure/commanders/questions/init.questions';
+import { ModelListCommand } from './infrastructure/commanders/models/model-list.command';
+import { ModelPullCommand } from './infrastructure/commanders/models/model-pull.command';
+import { CortexCommand } from './infrastructure/commanders/cortex-command.commander';
+import { ChatCommand } from './infrastructure/commanders/chat.command';
+import { ModelStartCommand } from './infrastructure/commanders/models/model-start.command';
+import { ModelStopCommand } from './infrastructure/commanders/models/model-stop.command';
+import { ModelGetCommand } from './infrastructure/commanders/models/model-get.command';
+import { ModelRemoveCommand } from './infrastructure/commanders/models/model-remove.command';
+import { RunCommand } from './infrastructure/commanders/shortcuts/run.command';
+import { InitCudaQuestions } from './infrastructure/commanders/questions/cuda.questions';
+import { CliUsecasesModule } from './infrastructure/commanders/usecases/cli.usecases.module';
 
 @Module({
   imports: [
@@ -20,14 +32,32 @@ import { StartCommand } from './infrastructure/commanders/start.command';
     DatabaseModule,
     ModelsModule,
     CortexModule,
+    ChatModule,
+    ExtensionModule,
+    HttpModule,
+    CliUsecasesModule,
   ],
   providers: [
-    BasicCommand,
+    CortexCommand,
     ModelsCommand,
-    PullCommand,
     ServeCommand,
-    InferenceCommand,
-    StartCommand,
+    ChatCommand,
+    InitCommand,
+
+    // Questions
+    InitRunModeQuestions,
+    InitCudaQuestions,
+
+    // Model commands
+    ModelStartCommand,
+    ModelStopCommand,
+    ModelListCommand,
+    ModelGetCommand,
+    ModelRemoveCommand,
+    ModelPullCommand,
+
+    // Shortcuts
+    RunCommand,
   ],
 })
 export class CommandModule {}
diff --git a/cortex-js/src/domain/abstracts/engine.abstract.ts b/cortex-js/src/domain/abstracts/engine.abstract.ts
index 564faa2a1..f21f6664b 100644
--- a/cortex-js/src/domain/abstracts/engine.abstract.ts
+++ b/cortex-js/src/domain/abstracts/engine.abstract.ts
@@ -1,8 +1,16 @@
+/* eslint-disable no-unused-vars, @typescript-eslint/no-unused-vars */
+import { Model, ModelSettingParams } from '../models/model.interface';
 import { Extension } from './extension.abstract';
 
 export abstract class EngineExtension extends Extension {
   abstract provider: string;
-  abstract inference(completion: any, req: any, res: any): void;
-  abstract loadModel(loadModel: any): Promise<void>;
-  abstract unloadModel(modelId: string): Promise<void>;
+
+  abstract inference(completion: any, req: any, stream: any, res?: any): void;
+
+  async loadModel(
+    model: Model,
+    settingParams?: ModelSettingParams,
+  ): Promise<void> {}
+
+  async unloadModel(modelId: string): Promise<void> {}
 }
diff --git a/cortex-js/src/domain/abstracts/oai.abstract.ts b/cortex-js/src/domain/abstracts/oai.abstract.ts
index 96748449d..2923c4277 100644
--- a/cortex-js/src/domain/abstracts/oai.abstract.ts
+++ b/cortex-js/src/domain/abstracts/oai.abstract.ts
@@ -1,6 +1,12 @@
-/* eslint-disable @typescript-eslint/no-unused-vars */
 import { HttpService } from '@nestjs/axios';
 import { EngineExtension } from './engine.abstract';
+import { stdout } from 'process';
+
+export type ChatStreamEvent = {
+  type: 'data' | 'error' | 'end';
+  data?: any;
+  error?: any;
+};
 
 export abstract class OAIEngineExtension extends EngineExtension {
   abstract apiUrl: string;
@@ -9,44 +15,120 @@ export abstract class OAIEngineExtension extends EngineExtension {
     super();
   }
 
-  async inference(
+  inference(
     createChatDto: any,
     headers: Record<string, string>,
-    res: any,
+    writableStream: WritableStream<ChatStreamEvent>,
+    res?: any,
   ) {
     if (createChatDto.stream === true) {
-      const response = await this.httpService
-        .post(this.apiUrl, createChatDto, {
-          headers: {
-            'Content-Type': headers['content-type'] ?? 'application/json',
-            Authorization: headers['authorization'],
-          },
-          responseType: 'stream',
-        })
-        .toPromise();
-
-      res.writeHead(200, {
-        'Content-Type': 'text/event-stream',
-        'Cache-Control': 'no-cache',
-        Connection: 'keep-alive',
-        'Access-Control-Allow-Origin': '*',
-      });
+      if (res) {
+        res.writeHead(200, {
+          'Content-Type': 'text/event-stream',
+          'Cache-Control': 'no-cache',
+          Connection: 'keep-alive',
+          'Access-Control-Allow-Origin': '*',
+        });
+        this.httpService
+          .post(this.apiUrl, createChatDto, {
+            headers: {
+              'Content-Type': headers['content-type'] ?? 'application/json',
+              Authorization: headers['authorization'],
+            },
+            responseType: 'stream',
+          })
+          .toPromise()
+          .then((response) => {
+            response?.data.pipe(res);
+          });
+      } else {
+        const decoder = new TextDecoder('utf-8');
+        const defaultWriter = writableStream.getWriter();
+        defaultWriter.ready.then(() => {
+          this.httpService
+            .post(this.apiUrl, createChatDto, {
+              headers: {
+                'Content-Type': headers['content-type'] ?? 'application/json',
+                Authorization: headers['authorization'],
+              },
+              responseType: 'stream',
+            })
+            .subscribe({
+              next: (response) => {
+                // Unfinished trailing line carried over to the next chunk.
+                let tail = '';
+                response.data.on('data', (chunk: any) => {
+                  const text = tail + decoder.decode(chunk, { stream: true });
+                  const lines = text.split('\n');
+                  tail = '';
+                  lines.forEach((line, index) => {
+                    const payload = line.trim().replace(/^data:\s*/, '');
+                    if (payload === '' || payload === '[DONE]') return;
+                    try {
+                      const data = JSON.parse(payload);
+                      // Emit only this event's delta so tokens never repeat.
+                      let content = data.choices[0]?.delta?.content ?? '';
+                      if (content.startsWith('assistant: ')) {
+                        content = content.replace('assistant: ', '');
+                      }
+                      if (content !== '') {
+                        defaultWriter.write({
+                          type: 'data',
+                          data: content,
+                        });
+                      }
+                    } catch {
+                      // Only an unfinished final line can be completed later.
+                      if (index === lines.length - 1) tail = line;
+                    }
+                  });
+                });
 
-      response?.data.pipe(res);
+                response.data.on('error', (error: any) => {
+                  defaultWriter.write({
+                    type: 'error',
+                    error,
+                  });
+                });
+
+                response.data.on('end', () => {
+                  // stdout.write('Stream end');
+                  defaultWriter.write({
+                    type: 'end',
+                  });
+                });
+              },
+
+              error: (error) => {
+                stdout.write('Stream error: ' + error);
+              },
+            });
+        });
+      }
     } else {
-      const response = await this.httpService
-        .post(this.apiUrl, createChatDto, {
-          headers: {
-            'Content-Type': headers['content-type'] ?? 'application/json',
-            Authorization: headers['authorization'],
-          },
-        })
-        .toPromise();
-
-      res.json(response?.data);
+      const defaultWriter = writableStream.getWriter();
+      defaultWriter.ready.then(() => {
+        this.httpService
+          .post(this.apiUrl, createChatDto, {
+            headers: {
+              'Content-Type': headers['content-type'] ?? 'application/json',
+              Authorization: headers['authorization'],
+            },
+          })
+          .toPromise()
+          .then((response) => {
+            defaultWriter.write({
+              type: 'data',
+              data: response?.data,
+            });
+          })
+          .catch((error: any) => {
+            defaultWriter.write({
+              type: 'error',
+              error,
+            });
+          });
+      });
     }
   }
-
-  async loadModel(_loadModel: any): Promise<void> {}
-  async unloadModel(_modelId: string): Promise<void> {}
 }
diff --git a/cortex-js/src/domain/models/huggingface.interface.ts b/cortex-js/src/domain/models/huggingface.interface.ts
new file mode 100644
index 000000000..cfb1dc5a3
--- /dev/null
+++ b/cortex-js/src/domain/models/huggingface.interface.ts
@@ -0,0 +1,65 @@
+export interface HuggingFaceRepoData {
+  id: string;
+  modelId: string;
+  modelUrl?: string;
+  author: string;
+  sha: string;
+  downloads: number;
+  lastModified: string;
+  private: boolean;
+  disabled: boolean;
+  gated: boolean;
+  pipeline_tag: 'text-generation';
+  tags: Array<'transformers' | 'pytorch' | 'safetensors' | string>;
+  cardData: Record<CardDataKeys | string, unknown>;
+  siblings: {
+    rfilename: string;
+    downloadUrl?: string;
+    fileSize?: number;
+    quantization?: Quantization;
+  }[];
+  createdAt: string;
+}
+
+const CardDataKeys = [
+  'base_model',
+  'datasets',
+  'inference',
+  'language',
+  'library_name',
+  'license',
+  'model_creator',
+  'model_name',
+  'model_type',
+  'pipeline_tag',
+  'prompt_template',
+  'quantized_by',
+  'tags',
+] as const;
+export type CardDataKeysTuple = typeof CardDataKeys;
+export type CardDataKeys = CardDataKeysTuple[number];
+
+export const AllQuantizations = [
+  'Q3_K_S',
+  'Q3_K_M',
+  'Q3_K_L',
+  'Q4_K_S',
+  'Q4_K_M',
+  'Q5_K_S',
+  'Q5_K_M',
+  'Q4_0',
+  'Q4_1',
+  'Q5_0',
+  'Q5_1',
+  'IQ2_XXS',
+  'IQ2_XS',
+  'Q2_K',
+  'Q2_K_S',
+  'Q6_K',
+  'Q8_0',
+  'F16',
+  'F32',
+  'COPY',
+];
+export type QuantizationsTuple = typeof AllQuantizations;
+export type Quantization = QuantizationsTuple[number];
diff --git a/cortex-js/src/infrastructure/commanders/basic-command.commander.ts b/cortex-js/src/infrastructure/commanders/basic-command.commander.ts
deleted file mode 100644
index ea0c71237..000000000
--- a/cortex-js/src/infrastructure/commanders/basic-command.commander.ts
+++ /dev/null
@@ -1,52 +0,0 @@
-import { RootCommand, CommandRunner, Option } from 'nest-commander';
-import { PullCommand } from './pull.command';
-import { ServeCommand } from './serve.command';
-import { InferenceCommand } from './inference.command';
-import { ModelsCommand } from './models.command';
-import { CortexUsecases } from '@/usecases/cortex/cortex.usecases';
-import { defaultCortexJsHost, defaultCortexJsPort } from 'constant';
-
-@RootCommand({
-  subCommands: [ModelsCommand, PullCommand, ServeCommand, InferenceCommand],
-})
-export class BasicCommand extends CommandRunner {
-  constructor(private readonly cortexUsecases: CortexUsecases) {
-    super();
-  }
-
-  async run(input: string[], options?: any): Promise<void> {
-    const command = input[0];
-
-    switch (command) {
-      case 'start':
-        const host = options?.host || defaultCortexJsHost;
-        const port = options?.port || defaultCortexJsPort;
-        return this.cortexUsecases
-          .startCortex(host, port)
-          .then((e) => console.log(e));
-      case 'stop':
-        return this.cortexUsecases
-          .stopCortex(defaultCortexJsHost, defaultCortexJsPort)
-          .then((e) => console.log(e));
-      default:
-        console.error(`Command ${command} is not supported`);
-        return;
-    }
-  }
-
-  @Option({
-    flags: '--host <host>',
-    description: 'Host to serve the application',
-  })
-  parseHost(value: string) {
-    return value;
-  }
-
-  @Option({
-    flags: '--port <port>',
-    description: 'Port to serve the application',
-  })
-  parsePort(value: string) {
-    return parseInt(value, 10);
-  }
-}
diff --git a/cortex-js/src/infrastructure/commanders/chat.command.ts b/cortex-js/src/infrastructure/commanders/chat.command.ts
new file mode 100644
index 000000000..8efacb093
--- /dev/null
+++ b/cortex-js/src/infrastructure/commanders/chat.command.ts
@@ -0,0 +1,41 @@
+import { ChatUsecases } from '@/usecases/chat/chat.usecases';
+import { CommandRunner, SubCommand, Option } from 'nest-commander';
+import { ChatCliUsecases } from './usecases/chat.cli.usecases';
+import { CortexUsecases } from '@/usecases/cortex/cortex.usecases';
+import { exit } from 'node:process';
+
+type ChatOptions = {
+  model?: string;
+};
+
+@SubCommand({ name: 'chat', description: 'Start a chat with a model' })
+export class ChatCommand extends CommandRunner {
+  constructor(
+    private readonly chatUsecases: ChatUsecases,
+    private readonly cortexUsecases: CortexUsecases,
+  ) {
+    super();
+  }
+
+  async run(_input: string[], option: ChatOptions): Promise<void> {
+    const modelId = option.model;
+    if (!modelId) {
+      console.error('Model ID is required');
+      exit(1);
+    }
+
+    const chatCliUsecases = new ChatCliUsecases(
+      this.chatUsecases,
+      this.cortexUsecases,
+    );
+    return chatCliUsecases.chat(modelId);
+  }
+
+  @Option({
+    flags: '--model <model_id>',
+    description: 'Model Id to start chat with',
+  })
+  parseModelId(value: string) {
+    return value;
+  }
+}
diff --git a/cortex-js/src/infrastructure/commanders/cortex-command.commander.ts b/cortex-js/src/infrastructure/commanders/cortex-command.commander.ts
new file mode 100644
index 000000000..05aa30271
--- /dev/null
+++ b/cortex-js/src/infrastructure/commanders/cortex-command.commander.ts
@@ -0,0 +1,20 @@
+import { RootCommand, CommandRunner } from 'nest-commander';
+import { ServeCommand } from './serve.command';
+import { ChatCommand } from './chat.command';
+import { ModelsCommand } from './models.command';
+import { InitCommand } from './init.command';
+import { RunCommand } from './shortcuts/run.command';
+
+@RootCommand({
+  subCommands: [
+    ModelsCommand,
+    ServeCommand,
+    ChatCommand,
+    InitCommand,
+    RunCommand,
+  ],
+  description: 'Cortex CLI',
+})
+export class CortexCommand extends CommandRunner {
+  async run(): Promise<void> {}
+}
diff --git a/cortex-js/src/infrastructure/commanders/inference.command.ts b/cortex-js/src/infrastructure/commanders/inference.command.ts
deleted file mode 100644
index b5eba3988..000000000
--- a/cortex-js/src/infrastructure/commanders/inference.command.ts
+++ /dev/null
@@ -1,25 +0,0 @@
-import { CommandRunner, SubCommand } from 'nest-commander';
-
-@SubCommand({ name: 'chat' })
-export class InferenceCommand extends CommandRunner {
-  constructor() {
-    super();
-  }
-
-  async run(_input: string[]): Promise<void> {
-    const lineByLine = require('readline');
-    const lbl = lineByLine.createInterface({
-      input: process.stdin,
-      output: process.stdout,
-    });
-    lbl.on('line', (userInput: string) => {
-      if (userInput.trim() === 'exit()') {
-        lbl.close();
-        return;
-      }
-
-      console.log('Result:', userInput);
-      console.log('Enter another equation or type "exit()" to quit.');
-    });
-  }
-}
diff --git a/cortex-js/src/infrastructure/commanders/init.command.ts b/cortex-js/src/infrastructure/commanders/init.command.ts
new file mode 100644
index 000000000..adf8eba4b
--- /dev/null
+++ b/cortex-js/src/infrastructure/commanders/init.command.ts
@@ -0,0 +1,37 @@
+import { CommandRunner, InquirerService, SubCommand } from 'nest-commander';
+import { InitCliUsecases } from './usecases/init.cli.usecases';
+import { InitOptions } from './types/init-options.interface';
+
+@SubCommand({
+  name: 'init',
+  aliases: ['setup'],
+  description: "Init settings and download cortex's dependencies",
+})
+export class InitCommand extends CommandRunner {
+  constructor(
+    private readonly inquirerService: InquirerService,
+    private readonly initUsecases: InitCliUsecases,
+  ) {
+    super();
+  }
+
+  async run(input: string[], options?: InitOptions): Promise<void> {
+    options = await this.inquirerService.ask(
+      'init-run-mode-questions',
+      options,
+    );
+
+    if (options.runMode === 'GPU' && !(await this.initUsecases.cudaVersion())) {
+      options = await this.inquirerService.ask('init-cuda-questions', options);
+    }
+
+    const version = input[0] ?? 'latest';
+
+    const engineFileName = this.initUsecases.parseEngineFileName(options);
+    await this.initUsecases.installEngine(engineFileName, version);
+
+    if (options.installCuda === 'Yes') {
+      await this.initUsecases.installCudaToolkitDependency(options);
+    }
+  }
+}
diff --git a/cortex-js/src/infrastructure/commanders/models.command.ts b/cortex-js/src/infrastructure/commanders/models.command.ts
index e20d27970..631c55774 100644
--- a/cortex-js/src/infrastructure/commanders/models.command.ts
+++ b/cortex-js/src/infrastructure/commanders/models.command.ts
@@ -1,52 +1,23 @@
-import { ModelsUsecases } from '@/usecases/models/models.usecases';
 import { CommandRunner, SubCommand } from 'nest-commander';
-import { PullCommand } from './pull.command';
-import { StartCommand } from './start.command';
+import { ModelStartCommand } from './models/model-start.command';
+import { ModelGetCommand } from './models/model-get.command';
+import { ModelListCommand } from './models/model-list.command';
+import { ModelStopCommand } from './models/model-stop.command';
+import { ModelPullCommand } from './models/model-pull.command';
+import { ModelRemoveCommand } from './models/model-remove.command';
 
-@SubCommand({ name: 'models', subCommands: [PullCommand, StartCommand] })
+@SubCommand({
+  name: 'models',
+  subCommands: [
+    ModelPullCommand,
+    ModelStartCommand,
+    ModelStopCommand,
+    ModelListCommand,
+    ModelGetCommand,
+    ModelRemoveCommand,
+  ],
+  description: 'Subcommands for managing models',
+})
 export class ModelsCommand extends CommandRunner {
-  constructor(private readonly modelsUsecases: ModelsUsecases) {
-    super();
-  }
-
-  async run(input: string[]): Promise<void> {
-    const command = input[0];
-    const modelId = input[1];
-
-    if (command !== 'list') {
-      if (!modelId) {
-        console.log('Model ID is required');
-        return;
-      }
-    }
-
-    switch (command) {
-      case 'list':
-        this.modelsUsecases.findAll().then(console.log);
-        return;
-      case 'get':
-        this.modelsUsecases.findOne(modelId).then(console.log);
-        return;
-      case 'remove':
-        this.modelsUsecases.remove(modelId).then(console.log);
-        return;
-
-      case 'stop':
-        return this.modelsUsecases
-          .stopModel(modelId)
-          .then(console.log)
-          .catch(console.error);
-
-      case 'stats':
-      case 'fetch':
-      case 'build': {
-        console.log('Command is not supported yet');
-        return;
-      }
-
-      default:
-        console.error(`Command ${command} is not supported`);
-        return;
-    }
-  }
+  async run(): Promise<void> {} // intentionally empty: the work lives in the subCommands listed in the decorator
 }
diff --git a/cortex-js/src/infrastructure/commanders/models/model-get.command.ts b/cortex-js/src/infrastructure/commanders/models/model-get.command.ts
new file mode 100644
index 000000000..15136adc6
--- /dev/null
+++ b/cortex-js/src/infrastructure/commanders/models/model-get.command.ts
@@ -0,0 +1,22 @@
+import { CommandRunner, SubCommand } from 'nest-commander';
+import { ModelsCliUsecases } from '../usecases/models.cli.usecases';
+import { exit } from 'node:process';
+
+/** `get` subcommand: prints the model whose ID is the first positional argument. */
+@SubCommand({ name: 'get', description: 'Get a model by ID.' })
+export class ModelGetCommand extends CommandRunner {
+  constructor(private readonly modelsCliUsecases: ModelsCliUsecases) {
+    super();
+  }
+
+  /** Exits with status 1 when no model ID was passed. */
+  async run(input: string[]): Promise<void> {
+    const [modelId] = input;
+    if (!input.length) {
+      console.error('Model ID is required');
+      exit(1);
+    }
+
+    console.log(await this.modelsCliUsecases.getModel(modelId));
+  }
+}
diff --git a/cortex-js/src/infrastructure/commanders/models/model-list.command.ts b/cortex-js/src/infrastructure/commanders/models/model-list.command.ts
new file mode 100644
index 000000000..6e491fc8d
--- /dev/null
+++ b/cortex-js/src/infrastructure/commanders/models/model-list.command.ts
@@ -0,0 +1,15 @@
+import { CommandRunner, SubCommand } from 'nest-commander';
+import { ModelsCliUsecases } from '../usecases/models.cli.usecases';
+
+/** `list` subcommand: prints whatever the models CLI use cases list. */
+@SubCommand({ name: 'list', description: 'List all models locally.' })
+export class ModelListCommand extends CommandRunner {
+  constructor(private readonly modelsCliUsecases: ModelsCliUsecases) {
+    super();
+  }
+
+  /** Fetches the model list and logs it. */
+  async run(): Promise<void> {
+    console.log(await this.modelsCliUsecases.listAllModels());
+  }
+}
diff --git a/cortex-js/src/infrastructure/commanders/models/model-pull.command.ts b/cortex-js/src/infrastructure/commanders/models/model-pull.command.ts
new file mode 100644
index 000000000..c1a1af7ac
--- /dev/null
+++ b/cortex-js/src/infrastructure/commanders/models/model-pull.command.ts
@@ -0,0 +1,28 @@
+import { CommandRunner, SubCommand } from 'nest-commander';
+import { exit } from 'node:process';
+import { ModelsCliUsecases } from '../usecases/models.cli.usecases';
+
+/** `pull` (alias `download`) subcommand: downloads the model named by the first argument. */
+@SubCommand({
+  name: 'pull',
+  aliases: ['download'],
+  description: 'Download a model. Working with HuggingFace model id.',
+})
+export class ModelPullCommand extends CommandRunner {
+  constructor(private readonly modelsCliUsecases: ModelsCliUsecases) {
+    super();
+  }
+
+  /** Exits with status 1 without a model ID, and with status 0 once the pull has finished. */
+  async run(input: string[]) {
+    const [modelId] = input;
+    if (input.length === 0) {
+      console.error('Model ID is required');
+      exit(1);
+    }
+
+    await this.modelsCliUsecases.pullModel(modelId);
+    console.log('\nDownload complete!');
+    exit(0);
+  }
+}
diff --git a/cortex-js/src/infrastructure/commanders/models/model-remove.command.ts b/cortex-js/src/infrastructure/commanders/models/model-remove.command.ts
new file mode 100644
index 000000000..531f0f893
--- /dev/null
+++ b/cortex-js/src/infrastructure/commanders/models/model-remove.command.ts
@@ -0,0 +1,22 @@
+import { CommandRunner, SubCommand } from 'nest-commander';
+import { ModelsCliUsecases } from '../usecases/models.cli.usecases';
+import { exit } from 'node:process';
+
+/** `remove` subcommand: removes the model named by the first argument and prints the result. */
+@SubCommand({ name: 'remove', description: 'Remove a model by ID locally.' })
+export class ModelRemoveCommand extends CommandRunner {
+  constructor(private readonly modelsCliUsecases: ModelsCliUsecases) {
+    super();
+  }
+
+  /** Exits with status 1 when no model ID was passed. */
+  async run(input: string[]): Promise<void> {
+    const [modelId] = input;
+    if (!input.length) {
+      console.error('Model ID is required');
+      exit(1);
+    }
+
+    console.log(await this.modelsCliUsecases.removeModel(modelId));
+  }
+}
diff --git a/cortex-js/src/infrastructure/commanders/models/model-start.command.ts b/cortex-js/src/infrastructure/commanders/models/model-start.command.ts
new file mode 100644
index 000000000..b3108ff3e
--- /dev/null
+++ b/cortex-js/src/infrastructure/commanders/models/model-start.command.ts
@@ -0,0 +1,27 @@
+import { CommandRunner, SubCommand } from 'nest-commander';
+import { exit } from 'node:process';
+import { ModelsCliUsecases } from '../usecases/models.cli.usecases';
+import { CortexUsecases } from '@/usecases/cortex/cortex.usecases';
+
+/** `start` subcommand: starts cortex first, then the model named by the first argument. */
+@SubCommand({ name: 'start', description: 'Start a model by ID.' })
+export class ModelStartCommand extends CommandRunner {
+  constructor(
+    private readonly cortexUsecases: CortexUsecases,
+    private readonly modelsCliUsecases: ModelsCliUsecases,
+  ) {
+    super();
+  }
+
+  /** Exits with status 1 when no model ID was passed. */
+  async run(input: string[]): Promise<void> {
+    const [modelId] = input;
+    if (!input.length) {
+      console.error('Model ID is required');
+      exit(1);
+    }
+
+    await this.cortexUsecases.startCortex();
+    await this.modelsCliUsecases.startModel(modelId);
+  }
+}
diff --git a/cortex-js/src/infrastructure/commanders/models/model-stop.command.ts b/cortex-js/src/infrastructure/commanders/models/model-stop.command.ts
new file mode 100644
index 000000000..b9a4b112b
--- /dev/null
+++ b/cortex-js/src/infrastructure/commanders/models/model-stop.command.ts
@@ -0,0 +1,27 @@
+import { CommandRunner, SubCommand } from 'nest-commander';
+import { exit } from 'node:process';
+import { ModelsCliUsecases } from '../usecases/models.cli.usecases';
+import { CortexUsecases } from '@/usecases/cortex/cortex.usecases';
+
+/** `stop` subcommand: stops the model named by the first argument, then cortex itself. */
+@SubCommand({ name: 'stop', description: 'Stop a model by ID.' })
+export class ModelStopCommand extends CommandRunner {
+  constructor(
+    private readonly cortexUsecases: CortexUsecases,
+    private readonly modelsCliUsecases: ModelsCliUsecases,
+  ) {
+    super();
+  }
+
+  /** Exits with status 1 when no model ID was passed. */
+  async run(input: string[]): Promise<void> {
+    const [modelId] = input;
+    if (!input.length) {
+      console.error('Model ID is required');
+      exit(1);
+    }
+
+    await this.modelsCliUsecases.stopModel(modelId);
+    await this.cortexUsecases.stopCortex();
+  }
+}
diff --git a/cortex-js/src/infrastructure/commanders/pull.command.ts b/cortex-js/src/infrastructure/commanders/pull.command.ts
deleted file mode 100644
index 825b859b2..000000000
--- a/cortex-js/src/infrastructure/commanders/pull.command.ts
+++ /dev/null
@@ -1,143 +0,0 @@
-import { ModelsUsecases } from '@/usecases/models/models.usecases';
-import { CommandRunner, SubCommand } from 'nest-commander';
-import { CreateModelDto } from '../dtos/models/create-model.dto';
-import { ModelFormat } from '@/domain/models/model.interface';
-import { Presets, SingleBar } from 'cli-progress';
-
-const AllQuantizations = [
-  'Q3_K_S',
-  'Q3_K_M',
-  'Q3_K_L',
-  'Q4_K_S',
-  'Q4_K_M',
-  'Q5_K_S',
-  'Q5_K_M',
-  'Q4_0',
-  'Q4_1',
-  'Q5_0',
-  'Q5_1',
-  'IQ2_XXS',
-  'IQ2_XS',
-  'Q2_K',
-  'Q2_K_S',
-  'Q6_K',
-  'Q8_0',
-  'F16',
-  'F32',
-  'COPY',
-];
-
-@SubCommand({ name: 'pull', aliases: ['download'] })
-export class PullCommand extends CommandRunner {
-  constructor(private readonly modelsUsecases: ModelsUsecases) {
-    super();
-  }
-
-  async run(input: string[]): Promise<void> {
-    if (input.length < 1) {
-      return Promise.reject('Model ID is required');
-    }
-
-    const modelId = input[0];
-    if (modelId.includes('/')) {
-      await this.pullHuggingFaceModel(modelId);
-    }
-
-    const bar = new SingleBar({}, Presets.shades_classic);
-    bar.start(100, 0);
-    await this.modelsUsecases.downloadModel({ modelId }, (progress) => {
-      bar.update(progress);
-    });
-    console.log('\nDownload complete!');
-    process.exit(0);
-  }
-
-  async pullHuggingFaceModel(modelId: string) {
-    const data = await this.fetchHuggingFaceRepoData(modelId);
-
-    // TODO: add select options
-    const sibling = data.siblings.filter(
-      (e: any) => e.quantization == 'Q5_K_M',
-    )[0];
-
-    if (!sibling) throw 'No expected quantization found';
-
-    const model: CreateModelDto = {
-      sources: [
-        {
-          url: sibling.downloadUrl,
-        },
-      ],
-      id: modelId,
-      name: modelId,
-      version: '',
-      format: ModelFormat.GGUF,
-      description: '',
-      settings: {},
-      parameters: {},
-      metadata: {
-        author: data.author,
-        size: sibling.fileSize,
-        tags: [],
-      },
-      engine: 'cortex',
-    };
-    if (!(await this.modelsUsecases.findOne(modelId)))
-      await this.modelsUsecases.create(model);
-  }
-
-  private async fetchHuggingFaceRepoData(repoId: string) {
-    const sanitizedUrl = this.toHuggingFaceUrl(repoId);
-
-    const res = await fetch(sanitizedUrl);
-    const data = await res.json();
-    if (data['error'] != null) {
-      throw new Error(data['error']);
-    }
-
-    if (data.tags.indexOf('gguf') === -1) {
-      throw `${repoId} is not supported. Only GGUF models are supported.`;
-    }
-
-    // fetching file sizes
-    const url = new URL(sanitizedUrl);
-    const paths = url.pathname.split('/').filter((e) => e.trim().length > 0);
-
-    for (let i = 0; i < data.siblings.length; i++) {
-      const downloadUrl = `https://huggingface.co/${paths[2]}/${paths[3]}/resolve/main/${data.siblings[i].rfilename}`;
-      data.siblings[i].downloadUrl = downloadUrl;
-    }
-
-    AllQuantizations.forEach((quantization) => {
-      data.siblings.forEach((sibling: any) => {
-        if (!sibling.quantization && sibling.rfilename.includes(quantization)) {
-          sibling.quantization = quantization;
-        }
-      });
-    });
-
-    data.modelUrl = `https://huggingface.co/${paths[2]}/${paths[3]}`;
-    return data;
-  }
-
-  private toHuggingFaceUrl(repoId: string): string {
-    try {
-      const url = new URL(`https://huggingface.co/${repoId}`);
-      if (url.host !== 'huggingface.co') {
-        throw `Invalid Hugging Face repo URL: ${repoId}`;
-      }
-
-      const paths = url.pathname.split('/').filter((e) => e.trim().length > 0);
-      if (paths.length < 2) {
-        throw `Invalid Hugging Face repo URL: ${repoId}`;
-      }
-
-      return `${url.origin}/api/models/${paths[0]}/${paths[1]}`;
-    } catch (err) {
-      if (repoId.startsWith('https')) {
-        throw new Error(`Cannot parse url: ${repoId}`);
-      }
-      throw err;
-    }
-  }
-}
diff --git a/cortex-js/src/infrastructure/commanders/questions/cuda.questions.ts b/cortex-js/src/infrastructure/commanders/questions/cuda.questions.ts
new file mode 100644
index 000000000..2309c3d00
--- /dev/null
+++ b/cortex-js/src/infrastructure/commanders/questions/cuda.questions.ts
@@ -0,0 +1,22 @@
+import { Question, QuestionSet } from 'nest-commander';
+import { platform } from 'node:process';
+
+// `when` guard: the CUDA prompt is skipped on macOS.
+const notOnMac = () => platform !== 'darwin';
+
+/** Question set `init-cuda-questions`: whether to install the CUDA Toolkit dependencies. */
+@QuestionSet({ name: 'init-cuda-questions' })
+export class InitCudaQuestions {
+  @Question({
+    name: 'installCuda',
+    type: 'list',
+    message: 'Do you want to install additional dependencies for CUDA Toolkit?',
+    choices: ['Yes', 'No, I want to use my own CUDA Toolkit'],
+    default: 'Yes',
+    when: notOnMac,
+  })
+  // NOTE(review): named parseRunMode although it handles the `installCuda` answer.
+  parseRunMode(val: string) {
+    return val;
+  }
+}
diff --git a/cortex-js/src/infrastructure/commanders/questions/init.questions.ts b/cortex-js/src/infrastructure/commanders/questions/init.questions.ts
new file mode 100644
index 000000000..ee4675320
--- /dev/null
+++ b/cortex-js/src/infrastructure/commanders/questions/init.questions.ts
@@ -0,0 +1,46 @@
+import { Question, QuestionSet } from 'nest-commander';
+import { platform } from 'node:process';
+
+// Shared `when` guard: these prompts are skipped on macOS.
+const notOnMac = () => platform !== 'darwin';
+
+/** Question set `init-run-mode-questions`: run mode, GPU type and CPU instruction set. */
+@QuestionSet({ name: 'init-run-mode-questions' })
+export class InitRunModeQuestions {
+  @Question({
+    name: 'runMode',
+    type: 'list',
+    message: 'Select run mode',
+    choices: ['CPU', 'GPU'],
+    default: 'CPU',
+    when: notOnMac,
+  })
+  parseRunMode(val: string) {
+    return val;
+  }
+
+  // Asked only after 'GPU' was picked as the run mode.
+  @Question({
+    name: 'gpuType',
+    type: 'list',
+    message: 'Select GPU type',
+    choices: ['Nvidia', 'Others (Vulkan)'],
+    default: 'Nvidia',
+    when: (answers: any) => answers.runMode === 'GPU',
+  })
+  parseGPUType(val: string) {
+    return val;
+  }
+
+  // No default here, unlike the two questions above.
+  @Question({
+    name: 'instructions',
+    type: 'list',
+    message: 'Select CPU instructions set',
+    choices: ['AVX2', 'AVX', 'AVX512'],
+    when: notOnMac,
+  })
+  parseContent(val: string) {
+    return val;
+  }
+}
diff --git a/cortex-js/src/infrastructure/commanders/serve.command.ts b/cortex-js/src/infrastructure/commanders/serve.command.ts
index d0f63a33e..6af783c76 100644
--- a/cortex-js/src/infrastructure/commanders/serve.command.ts
+++ b/cortex-js/src/infrastructure/commanders/serve.command.ts
@@ -8,7 +8,10 @@ type ServeOptions = {
   port?: number;
 };
 
-@SubCommand({ name: 'serve' })
+@SubCommand({
+  name: 'serve',
+  description: 'Providing API endpoint for Cortex backend',
+})
 export class ServeCommand extends CommandRunner {
   constructor() {
     super();
diff --git a/cortex-js/src/infrastructure/commanders/shortcuts/run.command.ts b/cortex-js/src/infrastructure/commanders/shortcuts/run.command.ts
new file mode 100644
index 000000000..d12786519
--- /dev/null
+++ b/cortex-js/src/infrastructure/commanders/shortcuts/run.command.ts
@@ -0,0 +1,52 @@
+import { CortexUsecases } from '@/usecases/cortex/cortex.usecases';
+import { ModelsUsecases } from '@/usecases/models/models.usecases';
+import { CommandRunner, SubCommand, Option } from 'nest-commander';
+import { exit } from 'node:process';
+import { ChatUsecases } from '@/usecases/chat/chat.usecases';
+import { ChatCliUsecases } from '../usecases/chat.cli.usecases';
+
+type RunOptions = {
+  model?: string;
+};
+
+/** `run` shortcut: starts cortex and the `--model` model, then opens the chat loop. */
+@SubCommand({
+  name: 'run',
+  description: 'EXPERIMENTAL: Shortcut to start a model and chat',
+})
+export class RunCommand extends CommandRunner {
+  constructor(
+    private readonly modelsUsecases: ModelsUsecases,
+    private readonly cortexUsecases: CortexUsecases,
+    private readonly chatUsecases: ChatUsecases,
+  ) {
+    super();
+  }
+
+  /** Exits with status 1 when `--model` is missing. */
+  async run(_input: string[], option: RunOptions): Promise<void> {
+    const { model: modelId } = option;
+    if (!modelId) {
+      console.error('Model ID is required');
+      exit(1);
+    }
+
+    await this.cortexUsecases.startCortex();
+    await this.modelsUsecases.startModel(modelId);
+
+    // The chat use cases are constructed by hand here rather than injected.
+    const chatSession = new ChatCliUsecases(
+      this.chatUsecases,
+      this.cortexUsecases,
+    );
+    await chatSession.chat(modelId);
+  }
+
+  @Option({
+    flags: '--model <model_id>',
+    description: 'Model Id to start chat with',
+  })
+  parseModelId(value: string) {
+    return value;
+  }
+}
diff --git a/cortex-js/src/infrastructure/commanders/start.command.ts b/cortex-js/src/infrastructure/commanders/start.command.ts
deleted file mode 100644
index a6b3a3429..000000000
--- a/cortex-js/src/infrastructure/commanders/start.command.ts
+++ /dev/null
@@ -1,48 +0,0 @@
-import { CortexUsecases } from '@/usecases/cortex/cortex.usecases';
-import { ModelsUsecases } from '@/usecases/models/models.usecases';
-import { CommandRunner, SubCommand } from 'nest-commander';
-import { LoadModelDto } from '../dtos/models/load-model.dto';
-
-@SubCommand({ name: 'start', aliases: ['run'] })
-export class StartCommand extends CommandRunner {
-  constructor(
-    private readonly modelsUsecases: ModelsUsecases,
-    private readonly cortexUsecases: CortexUsecases,
-  ) {
-    super();
-  }
-
-  async run(input: string[]): Promise<void> {
-    const modelId = input[0];
-
-    if (!modelId) {
-      console.log('Model ID is required');
-      return;
-    }
-    return this.startCortex()
-      .then(() => this.startModel(modelId))
-      .then(console.log)
-      .catch(console.error);
-  }
-
-  private async startCortex() {
-    const host = '127.0.0.1';
-    const port = '3928';
-    return this.cortexUsecases.startCortex(host, port);
-  }
-  private async startModel(modelId: string) {
-    const settings = {
-      cpu_threads: 10,
-      ctx_len: 2048,
-      embedding: false,
-      prompt_template:
-        '{system_message}\n### Instruction: {prompt}\n### Response:',
-      system_prompt: '',
-      user_prompt: '\n### Instruction: ',
-      ai_prompt: '\n### Response:',
-      ngl: 100,
-    };
-    const loadModelDto: LoadModelDto = { modelId, settings };
-    return this.modelsUsecases.startModel(loadModelDto);
-  }
-}
diff --git a/cortex-js/src/infrastructure/commanders/types/init-options.interface.ts b/cortex-js/src/infrastructure/commanders/types/init-options.interface.ts
new file mode 100644
index 000000000..24d460bbb
--- /dev/null
+++ b/cortex-js/src/infrastructure/commanders/types/init-options.interface.ts
@@ -0,0 +1,7 @@
+export interface InitOptions {
+  runMode?: 'CPU' | 'GPU'; // answer to 'Select run mode'
+  gpuType?: 'Nvidia' | 'Others (Vulkan)'; // answer to 'Select GPU type'
+  instructions?: 'AVX' | 'AVX2' | 'AVX512' | undefined; // answer to 'Select CPU instructions set'
+  cudaVersion?: '11' | '12'; // '11' selects the CUDA 11.7 artifacts; anything else falls back to 12.0
+  installCuda?: 'Yes' | string; // answer to the CUDA prompt; only 'Yes' triggers the dependency download
+}
diff --git a/cortex-js/src/infrastructure/commanders/usecases/chat.cli.usecases.ts b/cortex-js/src/infrastructure/commanders/usecases/chat.cli.usecases.ts
new file mode 100644
index 000000000..9f7409cca
--- /dev/null
+++ b/cortex-js/src/infrastructure/commanders/usecases/chat.cli.usecases.ts
@@ -0,0 +1,108 @@
+import { ChatUsecases } from '@/usecases/chat/chat.usecases';
+import { ChatCompletionRole } from '@/domain/models/message.interface';
+import { exit, stdin, stdout } from 'node:process';
+import * as readline from 'node:readline/promises';
+import { ChatStreamEvent } from '@/domain/abstracts/oai.abstract';
+import { ChatCompletionMessage } from '@/infrastructure/dtos/chat/chat-completion-message.dto';
+import { CreateChatCompletionDto } from '@/infrastructure/dtos/chat/create-chat-completion.dto';
+import { CortexUsecases } from '@/usecases/cortex/cortex.usecases';
+
+// TODO: make this class injectable
+/** Interactive terminal chat: reads lines from stdin and streams the replies to stdout. */
+export class ChatCliUsecases {
+  private exitClause = 'exit()';
+  private userIndicator = '>> ';
+  private exitMessage = 'Bye!';
+
+  constructor(
+    private readonly chatUsecases: ChatUsecases,
+    private readonly cortexUsecases: CortexUsecases,
+  ) {}
+
+  /**
+   * Starts the chat loop for `modelId`. The returned promise resolves as soon
+   * as the readline handlers are registered; once the interface closes (for
+   * example after the exit clause) cortex is stopped and the process exits.
+   */
+  async chat(modelId: string): Promise<void> {
+    console.log(`Inorder to exit, type '${this.exitClause}'.`);
+    const messages: ChatCompletionMessage[] = [];
+
+    const rl = readline.createInterface({
+      input: stdin,
+      output: stdout,
+      prompt: this.userIndicator,
+    });
+    rl.prompt();
+
+    // Never prompt again once the interface has been closed.
+    let closed = false;
+    const promptUser = () => {
+      if (!closed) rl.prompt();
+    };
+
+    rl.on('close', () => {
+      closed = true;
+      this.cortexUsecases.stopCortex().then(() => {
+        console.log(this.exitMessage);
+        exit(0);
+      });
+    });
+
+    rl.on('line', (userInput: string) => {
+      if (userInput.trim() === this.exitClause) {
+        rl.close();
+        return;
+      }
+
+      messages.push({
+        content: userInput,
+        role: ChatCompletionRole.User,
+      });
+
+      const chatDto: CreateChatCompletionDto = {
+        messages,
+        model: modelId,
+        stream: true,
+        max_tokens: 2048,
+        stop: [],
+        frequency_penalty: 0.7,
+        presence_penalty: 0.7,
+        temperature: 0.7,
+        top_p: 0.7,
+      };
+
+      let llmFullResponse = '';
+      const writableStream = new WritableStream<ChatStreamEvent>({
+        write(chunk) {
+          if (chunk.type === 'data') {
+            stdout.write(chunk.data ?? '');
+            llmFullResponse += chunk.data ?? '';
+          } else if (chunk.type === 'error') {
+            console.log('Error!!');
+          } else {
+            // Any other event ends the reply: keep it in the history so the
+            // next request carries the whole conversation.
+            messages.push({
+              content: llmFullResponse,
+              role: ChatCompletionRole.Assistant,
+            });
+            llmFullResponse = '';
+            console.log('\n');
+            // Show the indicator for the next turn; it was printed only once before.
+            promptUser();
+          }
+        },
+      });
+
+      // The call is not awaited inside this handler, so a rejection would go
+      // unhandled; report it and let the user try again.
+      Promise.resolve(
+        this.chatUsecases.createChatCompletions(chatDto, {}, writableStream),
+      ).catch((error: unknown) => {
+        console.error('Error!!', error);
+        promptUser();
+      });
+    });
+  }
+}
diff --git a/cortex-js/src/infrastructure/commanders/usecases/cli.usecases.module.ts b/cortex-js/src/infrastructure/commanders/usecases/cli.usecases.module.ts
new file mode 100644
index 000000000..a82b60dd0
--- /dev/null
+++ b/cortex-js/src/infrastructure/commanders/usecases/cli.usecases.module.ts
@@ -0,0 +1,17 @@
+import { Module } from '@nestjs/common';
+import { InitCliUsecases } from './init.cli.usecases';
+import { HttpModule } from '@nestjs/axios';
+import { ModelsCliUsecases } from './models.cli.usecases';
+import { ModelsModule } from '@/usecases/models/models.module';
+
+// Use cases this module both provides and exports.
+const cliUsecases = [InitCliUsecases, ModelsCliUsecases];
+
+/** Bundles the CLI use cases with the HTTP and models modules it imports. */
+@Module({
+  imports: [HttpModule, ModelsModule],
+  controllers: [],
+  providers: [...cliUsecases],
+  exports: [...cliUsecases],
+})
+export class CliUsecasesModule {}
diff --git a/cortex-js/src/infrastructure/commanders/usecases/init.cli.usecases.ts b/cortex-js/src/infrastructure/commanders/usecases/init.cli.usecases.ts
new file mode 100644
index 000000000..47456e4f9
--- /dev/null
+++ b/cortex-js/src/infrastructure/commanders/usecases/init.cli.usecases.ts
@@ -0,0 +1,237 @@
+import { createWriteStream, existsSync, rmSync } from 'fs';
+import { resolve, delimiter, join } from 'path';
+import { HttpService } from '@nestjs/axios';
+import { Presets, SingleBar } from 'cli-progress';
+import decompress from 'decompress';
+import { exit } from 'node:process';
+import { InitOptions } from '../types/init-options.interface';
+import { Injectable } from '@nestjs/common';
+
+/**
+ * Use cases behind the `init` command: naming, downloading and unpacking the
+ * engine build, detecting CUDA, and fetching the CUDA dependencies.
+ */
+@Injectable()
+export class InitCliUsecases {
+  CORTEX_RELEASES_URL = 'https://api.github.com/repos/janhq/cortex/releases';
+  CUDA_DOWNLOAD_URL =
+    'https://catalog.jan.ai/dist/cuda-dependencies/<version>/<platform>/cuda.tar.gz';
+
+  constructor(private readonly httpService: HttpService) {}
+
+  /**
+   * Downloads the release asset whose name contains `engineFileName`, removes
+   * any existing `cortex-cpp` directory and unpacks the archive into rootDir().
+   * Exits with status 1 when the release, the asset or the download is missing.
+   *
+   * @param engineFileName Asset name fragment, see parseEngineFileName().
+   * @param version Release name (a leading `v` is ignored) or `'latest'`.
+   */
+  installEngine = async (
+    engineFileName: string,
+    version: string = 'latest',
+  ): Promise<any> => {
+    const res = await this.httpService
+      .get(
+        this.CORTEX_RELEASES_URL + `${version === 'latest' ? '/latest' : ''}`,
+        {
+          headers: {
+            'X-GitHub-Api-Version': '2022-11-28',
+            Accept: 'application/vnd.github+json',
+          },
+        },
+      )
+      .toPromise();
+
+    if (!res?.data) {
+      console.log('Failed to fetch releases');
+      exit(1);
+    }
+
+    // The response is either a list of releases or a single release object.
+    const releaseName = version.replace(/^v/, '');
+    const release = Array.isArray(res?.data)
+      ? res?.data.find((e: any) => e.name === releaseName)
+      : res?.data;
+    if (!release) {
+      console.log(`Could not find release ${version}`);
+      exit(1);
+    }
+
+    const toDownloadAsset = release.assets.find((s: any) =>
+      s.name.includes(engineFileName),
+    );
+
+    if (!toDownloadAsset) {
+      console.log(`Could not find engine file ${engineFileName}`);
+      exit(1);
+    }
+
+    console.log(`Downloading engine file ${engineFileName}`);
+    const engineDir = resolve(this.rootDir(), 'cortex-cpp');
+    if (existsSync(engineDir)) rmSync(engineDir, { recursive: true });
+
+    const destination = resolve(this.rootDir(), toDownloadAsset.name);
+    await this.downloadFile(
+      toDownloadAsset.browser_download_url,
+      destination,
+      'Failed to download engine file',
+    );
+
+    try {
+      // `destination` is already absolute, so it is not resolved again.
+      await decompress(destination, resolve(this.rootDir()));
+    } catch (e) {
+      console.error('Error decompressing file', e);
+      exit(1);
+    }
+  };
+
+  /**
+   * Builds the asset name
+   * `<platform>-<arch>[-<instructions>][-cuda-<ver>|-vulkan].tar.gz`
+   * from the current process and the chosen options.
+   */
+  parseEngineFileName = (options: InitOptions) => {
+    const platform =
+      process.platform === 'win32'
+        ? 'windows'
+        : process.platform === 'darwin'
+          ? 'mac'
+          : process.platform;
+    const arch = process.arch === 'arm64' ? process.arch : 'amd64';
+    const cudaVersion =
+      options.runMode === 'GPU'
+        ? options.gpuType === 'Nvidia'
+          ? '-cuda-' + (options.cudaVersion === '11' ? '11-7' : '12-0')
+          : '-vulkan'
+        : '';
+    const instructions = options.instructions ? `-${options.instructions}` : '';
+    const engineName = `${platform}-${arch}${instructions.toLowerCase()}${cudaVersion}`;
+    return `${engineName}.tar.gz`;
+  };
+
+  /** Root directory: four levels above the directory of this module. */
+  rootDir = () => resolve(__dirname, `../../../../`);
+
+  /**
+   * Looks for the CUDA 12, then the CUDA 11 runtime libraries, both relative
+   * to the working directory and in the library search path.
+   *
+   * @returns `'12'`, `'11'`, or `undefined` when neither set is complete.
+   */
+  cudaVersion = async () => {
+    let filesCuda12: string[];
+    let filesCuda11: string[];
+    let paths: string[];
+
+    if (process.platform === 'win32') {
+      filesCuda12 = ['cublas64_12.dll', 'cudart64_12.dll', 'cublasLt64_12.dll'];
+      filesCuda11 = [
+        'cublas64_11.dll',
+        'cudart64_110.dll',
+        'cublasLt64_11.dll',
+      ];
+      paths = process.env.PATH ? process.env.PATH.split(delimiter) : [];
+    } else {
+      filesCuda12 = ['libcudart.so.12', 'libcublas.so.12', 'libcublasLt.so.12'];
+      filesCuda11 = [
+        'libcudart.so.11.0',
+        'libcublas.so.11',
+        'libcublasLt.so.11',
+      ];
+      paths = process.env.LD_LIBRARY_PATH
+        ? process.env.LD_LIBRARY_PATH.split(delimiter)
+        : [];
+      paths.push('/usr/lib/x86_64-linux-gnu/');
+    }
+
+    const allPresent = (files: string[]) =>
+      files.every(
+        (file) =>
+          existsSync(file) || this.checkFileExistenceInPaths(file, paths),
+      );
+
+    if (allPresent(filesCuda12)) return '12';
+    if (allPresent(filesCuda11)) return '11';
+
+    return undefined; // No CUDA Toolkit found
+  };
+
+  /** True when `file` exists in at least one of the `paths` directories. */
+  checkFileExistenceInPaths = (file: string, paths: string[]): boolean => {
+    return paths.some((p) => existsSync(join(p, file)));
+  };
+
+  /**
+   * Downloads the CUDA dependency archive for `options.cudaVersion` (11.7 when
+   * it is `'11'`, 12.0 otherwise) and unpacks it into `cortex-cpp`.
+   */
+  installCudaToolkitDependency = async (options: InitOptions) => {
+    const platform = process.platform === 'win32' ? 'windows' : 'linux';
+
+    const url = this.CUDA_DOWNLOAD_URL.replace(
+      '<version>',
+      options.cudaVersion === '11' ? '11.7' : '12.0',
+    ).replace('<platform>', platform);
+    const destination = resolve(this.rootDir(), 'cuda-toolkit.tar.gz');
+
+    await this.downloadFile(url, destination, 'Failed to download dependency');
+
+    try {
+      await decompress(destination, resolve(this.rootDir(), 'cortex-cpp'));
+    } catch (e) {
+      console.log(e);
+      exit(1);
+    }
+  };
+
+  /**
+   * Streams `url` into the file at `destination` while drawing a progress bar.
+   * Logs `failureMessage` and exits with status 1 when no response arrives.
+   */
+  private async downloadFile(
+    url: string,
+    destination: string,
+    failureMessage: string,
+  ): Promise<void> {
+    const download = await this.httpService
+      .get(url, { responseType: 'stream' })
+      .toPromise();
+    if (!download) {
+      console.log(failureMessage);
+      process.exit(1);
+    }
+
+    await new Promise<void>((done, fail) => {
+      const writer = createWriteStream(destination);
+      const bar = new SingleBar({}, Presets.shades_classic);
+      // A missing header yields NaN, which fails the `> 0` test below.
+      const totalBytes = Number(download.headers['content-length']);
+      let receivedBytes = 0;
+
+      const abort = (error: unknown) => {
+        bar.stop();
+        fail(error);
+      };
+
+      writer.on('finish', () => {
+        bar.stop();
+        done();
+      });
+      writer.on('error', abort);
+      // pipe() does not forward source errors to the writer, so a broken
+      // response stream has to reject the promise itself.
+      download.data.on('error', abort);
+
+      bar.start(100, 0);
+      download.data.on('data', (chunk: any) => {
+        receivedBytes += chunk.length;
+        if (totalBytes > 0) {
+          bar.update(Math.floor((receivedBytes / totalBytes) * 100));
+        }
+      });
+
+      download.data.pipe(writer);
+    });
+  }
+}
diff --git a/cortex-js/src/infrastructure/commanders/usecases/models.cli.usecases.ts b/cortex-js/src/infrastructure/commanders/usecases/models.cli.usecases.ts
new file mode 100644
index 000000000..509abd565
--- /dev/null
+++ b/cortex-js/src/infrastructure/commanders/usecases/models.cli.usecases.ts
@@ -0,0 +1,203 @@
+import { exit } from 'node:process';
+import { ModelsUsecases } from '@/usecases/models/models.usecases';
+import { Model, ModelFormat } from '@/domain/models/model.interface';
+import { CreateModelDto } from '@/infrastructure/dtos/models/create-model.dto';
+import { HuggingFaceRepoData } from '@/domain/models/huggingface.interface';
+import { gguf } from '@huggingface/gguf';
+import { InquirerService } from 'nest-commander';
+import { Inject, Injectable } from '@nestjs/common';
+import { Presets, SingleBar } from 'cli-progress';
+
+const AllQuantizations = [
+  'Q3_K_S',
+  'Q3_K_M',
+  'Q3_K_L',
+  'Q4_K_S',
+  'Q4_K_M',
+  'Q5_K_S',
+  'Q5_K_M',
+  'Q4_0',
+  'Q4_1',
+  'Q5_0',
+  'Q5_1',
+  'IQ2_XXS',
+  'IQ2_XS',
+  'Q2_K_S', // before 'Q2_K': the first entry contained in a file name wins
+  'Q2_K',
+  'Q6_K',
+  'Q8_0',
+  'F16',
+  'F32',
+  'COPY',
+];
+
+/** Model operations backing the CLI: start, stop, list, get, remove, pull. */
+@Injectable()
+export class ModelsCliUsecases {
+  constructor(
+    private readonly modelsUsecases: ModelsUsecases,
+    @Inject(InquirerService)
+    private readonly inquirerService: InquirerService,
+  ) {}
+
+  /** Starts the model; the process exits first when `modelId` is unknown. */
+  async startModel(modelId: string): Promise<void> {
+    await this.getModelOrStop(modelId);
+    await this.modelsUsecases.startModel(modelId);
+  }
+
+  /** Stops the model; the process exits first when `modelId` is unknown. */
+  async stopModel(modelId: string): Promise<void> {
+    await this.getModelOrStop(modelId);
+    await this.modelsUsecases.stopModel(modelId);
+  }
+
+  /** Looks the model up; logs and exits with code 1 when it is not found. */
+  private async getModelOrStop(modelId: string): Promise<Model> {
+    const model = await this.modelsUsecases.findOne(modelId);
+    if (!model) {
+      console.debug('Model not found');
+      exit(1);
+    }
+    return model;
+  }
+
+  /** Returns every model reported by the models use cases. */
+  async listAllModels(): Promise<Model[]> {
+    return this.modelsUsecases.findAll();
+  }
+
+  /** Returns the model; the process exits when `modelId` is unknown. */
+  async getModel(modelId: string): Promise<Model> {
+    return this.getModelOrStop(modelId);
+  }
+
+  /** Removes the model; the process exits first when `modelId` is unknown. */
+  async removeModel(modelId: string) {
+    await this.getModelOrStop(modelId);
+    return this.modelsUsecases.remove(modelId);
+  }
+
+  /**
+   * Downloads a model while rendering a progress bar. Ids containing '/' are
+   * treated as Hugging Face repos and registered before the download starts.
+   */
+  async pullModel(modelId: string) {
+    if (modelId.includes('/')) {
+      await this.pullHuggingFaceModel(modelId);
+    }
+    const bar = new SingleBar({}, Presets.shades_classic);
+    bar.start(100, 0);
+    try {
+      await this.modelsUsecases.downloadModel(modelId, (progress: number) => {
+        bar.update(progress);
+      });
+    } finally {
+      // Stop on success and on failure so the bar never outlives the download.
+      bar.stop();
+    }
+  }
+
+  /**
+   * Prompts for one of the repo's quantizations and creates a model record for
+   * the chosen file, unless a model with this id already exists.
+   */
+  private async pullHuggingFaceModel(modelId: string) {
+    const data = await this.fetchHuggingFaceRepoData(modelId);
+    const { quantization } = await this.inquirerService.inquirer.prompt({
+      type: 'list',
+      name: 'quantization',
+      message: 'Select quantization',
+      choices: data.siblings
+        .map((e) => e.quantization)
+        .filter((e) => e != null),
+    });
+
+    const sibling = data.siblings.find(
+      (e: any) => !!e.quantization && e.quantization === quantization,
+    );
+    if (!sibling) throw 'No expected quantization found';
+
+    // Use the GGUF end-of-sequence token as the stop word; best effort only.
+    let stopWord = '';
+    try {
+      const { metadata } = await gguf(sibling.downloadUrl!);
+      // @ts-expect-error "tokenizer.ggml.eos_token_id"
+      const index = metadata['tokenizer.ggml.eos_token_id'];
+      // @ts-expect-error "tokenizer.ggml.tokens"
+      stopWord = metadata['tokenizer.ggml.tokens'][index] ?? '';
+    } catch (err) {
+      console.log('Failed to get stop word: ', err);
+    }
+    const model: CreateModelDto = {
+      sources: [{ url: sibling.downloadUrl ?? '' }],
+      id: modelId,
+      name: modelId,
+      version: '',
+      format: ModelFormat.GGUF,
+      description: '',
+      settings: {},
+      parameters: { stop: stopWord.length > 0 ? [stopWord] : [] },
+      metadata: { author: data.author, size: sibling.fileSize ?? 0, tags: [] },
+      engine: 'cortex',
+    };
+    const existing = await this.modelsUsecases.findOne(modelId);
+    if (!existing) await this.modelsUsecases.create(model);
+  }
+
+  /**
+   * Fetches repo metadata from the Hugging Face API, rejects non-GGUF repos and
+   * fills in each sibling's `downloadUrl` and, when recognised, `quantization`.
+   */
+  private async fetchHuggingFaceRepoData(repoId: string) {
+    const sanitizedUrl = this.toHuggingFaceUrl(repoId);
+    const res = await fetch(sanitizedUrl);
+    const response = await res.json();
+    if (response['error'] != null) {
+      throw new Error(response['error']);
+    }
+
+    const data = response as HuggingFaceRepoData;
+    if (data.tags.indexOf('gguf') < 0) {
+      throw `${repoId} is not supported. Only GGUF models are supported.`;
+    }
+
+    // The API URL path is /api/models/<a>/<b>; segments 2 and 3 name the repo.
+    const paths = new URL(sanitizedUrl).pathname
+      .split('/')
+      .filter((e) => e.trim().length > 0);
+    const repoUrl = `https://huggingface.co/${paths[2]}/${paths[3]}`;
+
+    data.siblings.forEach((sibling: any) => {
+      sibling.downloadUrl = `${repoUrl}/resolve/main/${sibling.rfilename}`;
+      if (sibling.quantization) return;
+      // The first AllQuantizations entry contained in the file name wins.
+      const match = AllQuantizations.find((q) => sibling.rfilename.includes(q));
+      if (match) sibling.quantization = match;
+    });
+
+    data.modelUrl = repoUrl;
+    return data;
+  }
+
+  /**
+   * Maps an `<owner>/<repo>` style id to its Hugging Face API URL; throws when
+   * the id has fewer than two path segments.
+   */
+  private toHuggingFaceUrl(repoId: string): string {
+    try {
+      const url = new URL(`https://huggingface.co/${repoId}`);
+      const paths = url.pathname.split('/').filter((e) => e.trim().length > 0);
+      if (url.host !== 'huggingface.co' || paths.length < 2) {
+        throw `Invalid Hugging Face repo URL: ${repoId}`;
+      }
+      return `${url.origin}/api/models/${paths[0]}/${paths[1]}`;
+    } catch (err) {
+      // Ids that look like full URLs get a parse-specific error instead.
+      if (repoId.startsWith('https')) {
+        throw new Error(`Cannot parse url: ${repoId}`);
+      }
+      throw err;
+    }
+  }
+}
diff --git a/cortex-js/src/infrastructure/controllers/chat.controller.ts b/cortex-js/src/infrastructure/controllers/chat.controller.ts
index dc9f7abda..e9c50591e 100644
--- a/cortex-js/src/infrastructure/controllers/chat.controller.ts
+++ b/cortex-js/src/infrastructure/controllers/chat.controller.ts
@@ -3,6 +3,7 @@ import { CreateChatCompletionDto } from '@/infrastructure/dtos/chat/create-chat-
 import { ChatUsecases } from '@/usecases/chat/chat.usecases';
 import { Response } from 'express';
 import { ApiTags } from '@nestjs/swagger';
+import { ChatStreamEvent } from '@/domain/abstracts/oai.abstract';
 
 @ApiTags('Inference')
 @Controller('chat')
@@ -15,6 +16,23 @@ export class ChatController {
     @Body() createChatDto: CreateChatCompletionDto,
     @Res() res: Response,
   ) {
-    this.chatService.createChatCompletions(createChatDto, headers, res);
+    const writableStream = new WritableStream<ChatStreamEvent>({
+      write(chunk) {
+        if (chunk.type === 'data') {
+          res.json(chunk.data ?? {});
+        } else if (chunk.type === 'error') {
+          res.json(chunk.error ?? {});
+        } else {
+          console.log('\n');
+        }
+      },
+    });
+
+    this.chatService.createChatCompletions(
+      createChatDto,
+      headers,
+      writableStream,
+      res,
+    );
   }
 }
diff --git a/cortex-js/src/infrastructure/controllers/inference-settings.controller.spec.ts b/cortex-js/src/infrastructure/controllers/inference-settings.controller.spec.ts
deleted file mode 100644
index 05097ddae..000000000
--- a/cortex-js/src/infrastructure/controllers/inference-settings.controller.spec.ts
+++ /dev/null
@@ -1,22 +0,0 @@
-import { Test, TestingModule } from '@nestjs/testing';
-import { InferenceSettingsController } from './inference-settings.controller';
-import { InferenceSettingsUsecases } from '@/usecases/inference-settings/inference-settings.usecases';
-
-describe('InferenceSettingsController', () => {
-  let controller: InferenceSettingsController;
-
-  beforeEach(async () => {
-    const module: TestingModule = await Test.createTestingModule({
-      controllers: [InferenceSettingsController],
-      providers: [InferenceSettingsUsecases],
-    }).compile();
-
-    controller = module.get<InferenceSettingsController>(
-      InferenceSettingsController,
-    );
-  });
-
-  it('should be defined', () => {
-    expect(controller).toBeDefined();
-  });
-});
diff --git a/cortex-js/src/infrastructure/controllers/inference-settings.controller.ts b/cortex-js/src/infrastructure/controllers/inference-settings.controller.ts
deleted file mode 100644
index 805a4c1b3..000000000
--- a/cortex-js/src/infrastructure/controllers/inference-settings.controller.ts
+++ /dev/null
@@ -1,49 +0,0 @@
-import {
-  Controller,
-  Get,
-  Post,
-  Body,
-  Patch,
-  Param,
-  Delete,
-} from '@nestjs/common';
-import { InferenceSettingsUsecases } from '@/usecases/inference-settings/inference-settings.usecases';
-import { CreateInferenceSettingDto } from '@/infrastructure/dtos/inference-settings/create-inference-setting.dto';
-import { UpdateInferenceSettingDto } from '@/infrastructure/dtos/inference-settings/update-inference-setting.dto';
-import { ApiTags } from '@nestjs/swagger';
-
-@ApiTags('Inference Settings')
-@Controller('inference-settings')
-export class InferenceSettingsController {
-  constructor(
-    private readonly inferenceSettingsService: InferenceSettingsUsecases,
-  ) {}
-
-  @Post()
-  create(@Body() createInferenceSettingDto: CreateInferenceSettingDto) {
-    return this.inferenceSettingsService.create(createInferenceSettingDto);
-  }
-
-  @Get()
-  findAll() {
-    return this.inferenceSettingsService.findAll();
-  }
-
-  @Get(':id')
-  findOne(@Param('id') id: string) {
-    return this.inferenceSettingsService.findOne(id);
-  }
-
-  @Patch(':id')
-  update(
-    @Param('id') id: string,
-    @Body() updateInferenceSettingDto: UpdateInferenceSettingDto,
-  ) {
-    return this.inferenceSettingsService.update(id, updateInferenceSettingDto);
-  }
-
-  @Delete(':id')
-  remove(@Param('id') id: string) {
-    return this.inferenceSettingsService.remove(id);
-  }
-}
diff --git a/cortex-js/src/infrastructure/controllers/models.controller.ts b/cortex-js/src/infrastructure/controllers/models.controller.ts
index 907349fc3..2983d1c31 100644
--- a/cortex-js/src/infrastructure/controllers/models.controller.ts
+++ b/cortex-js/src/infrastructure/controllers/models.controller.ts
@@ -12,53 +12,66 @@ import { ModelsUsecases } from '@/usecases/models/models.usecases';
 import { CreateModelDto } from '@/infrastructure/dtos/models/create-model.dto';
 import { UpdateModelDto } from '@/infrastructure/dtos/models/update-model.dto';
 import { ApiResponse, ApiTags } from '@nestjs/swagger';
-import { LoadModelSuccessDto } from '@/infrastructure/dtos/models/load-model-success.dto';
-import { LoadModelDto } from '@/infrastructure/dtos/models/load-model.dto';
-import { DownloadModelDto } from '@/infrastructure/dtos/models/download-model.dto';
+import { StartModelSuccessDto } from '@/infrastructure/dtos/models/start-model-success.dto';
+import { ModelSettingParamsDto } from '../dtos/models/model-setting-params.dto';
 
 @ApiTags('Models')
 @Controller('models')
 export class ModelsController {
-  constructor(private readonly modelsService: ModelsUsecases) {}
+  constructor(private readonly modelsUsecases: ModelsUsecases) {}
 
   @Post()
   create(@Body() createModelDto: CreateModelDto) {
-    return this.modelsService.create(createModelDto);
+    return this.modelsUsecases.create(createModelDto);
   }
 
   @HttpCode(200)
   @ApiResponse({
     status: 200,
-    description: 'The model has been loaded successfully.',
-    type: LoadModelSuccessDto,
+    description: 'The model has been started successfully.',
+    type: StartModelSuccessDto,
   })
-  @Post('load')
-  load(@Body() loadModelDto: LoadModelDto) {
-    return this.modelsService.startModel(loadModelDto);
+  @Post(':modelId/start')
+  startModel(
+    @Param('modelId') modelId: string,
+    @Body() settings: ModelSettingParamsDto,
+  ) {
+    return this.modelsUsecases.startModel(modelId, settings);
   }
 
-  @Post('download')
-  downloadModel(@Body() downloadModelDto: DownloadModelDto) {
-    return this.modelsService.downloadModel(downloadModelDto);
+  @HttpCode(200)
+  @ApiResponse({
+    status: 200,
+    description: 'The model has been stopped successfully.',
+    type: StartModelSuccessDto,
+  })
+  @Post(':modelId/stop')
+  stopModel(@Param('modelId') modelId: string) {
+    return this.modelsUsecases.stopModel(modelId);
+  }
+
+  @Get('download/:modelId')
+  downloadModel(@Param('modelId') modelId: string) {
+    return this.modelsUsecases.downloadModel(modelId);
   }
 
   @Get()
   findAll() {
-    return this.modelsService.findAll();
+    return this.modelsUsecases.findAll();
   }
 
   @Get(':id')
   findOne(@Param('id') id: string) {
-    return this.modelsService.findOne(id);
+    return this.modelsUsecases.findOne(id);
   }
 
   @Patch(':id')
   update(@Param('id') id: string, @Body() updateModelDto: UpdateModelDto) {
-    return this.modelsService.update(id, updateModelDto);
+    return this.modelsUsecases.update(id, updateModelDto);
   }
 
   @Delete(':id')
   remove(@Param('id') id: string) {
-    return this.modelsService.remove(id);
+    return this.modelsUsecases.remove(id);
   }
 }
diff --git a/cortex-js/src/infrastructure/database/database.module.ts b/cortex-js/src/infrastructure/database/database.module.ts
index ace7da1c9..016066219 100644
--- a/cortex-js/src/infrastructure/database/database.module.ts
+++ b/cortex-js/src/infrastructure/database/database.module.ts
@@ -4,7 +4,6 @@ import { sqliteDatabaseProviders } from './sqlite-database.providers';
 import { modelProviders } from './providers/model.providers';
 import { assistantProviders } from './providers/assistant.providers';
 import { messageProviders } from './providers/message.providers';
-import { inferenceSettingProviders } from './providers/inference-setting.providers';
 
 @Module({
   providers: [
@@ -13,14 +12,12 @@ import { inferenceSettingProviders } from './providers/inference-setting.provide
     ...modelProviders,
     ...assistantProviders,
     ...messageProviders,
-    ...inferenceSettingProviders,
   ],
   exports: [
     ...threadProviders,
     ...modelProviders,
     ...assistantProviders,
     ...messageProviders,
-    ...inferenceSettingProviders,
   ],
 })
 export class DatabaseModule {}
diff --git a/cortex-js/src/infrastructure/database/providers/inference-setting.providers.ts b/cortex-js/src/infrastructure/database/providers/inference-setting.providers.ts
deleted file mode 100644
index 621d25fd8..000000000
--- a/cortex-js/src/infrastructure/database/providers/inference-setting.providers.ts
+++ /dev/null
@@ -1,11 +0,0 @@
-import { InferenceSettingEntity } from '@/infrastructure/entities/inference-setting.entity';
-import { DataSource } from 'typeorm';
-
-export const inferenceSettingProviders = [
-  {
-    provide: 'INFERENCE_SETTING_REPOSITORY',
-    useFactory: (dataSource: DataSource) =>
-      dataSource.getRepository(InferenceSettingEntity),
-    inject: ['DATA_SOURCE'],
-  },
-];
diff --git a/cortex-js/src/infrastructure/database/sqlite-database.providers.ts b/cortex-js/src/infrastructure/database/sqlite-database.providers.ts
index 84700ff49..9c14ee965 100644
--- a/cortex-js/src/infrastructure/database/sqlite-database.providers.ts
+++ b/cortex-js/src/infrastructure/database/sqlite-database.providers.ts
@@ -1,13 +1,15 @@
 import { databaseFile } from 'constant';
+import { resolve } from 'path';
 import { DataSource } from 'typeorm';
 
 export const sqliteDatabaseProviders = [
   {
     provide: 'DATA_SOURCE',
     useFactory: async () => {
+      const sqlitePath = resolve(__dirname, `../../../${databaseFile}`);
       const dataSource = new DataSource({
         type: 'sqlite',
-        database: databaseFile,
+        database: sqlitePath,
         synchronize: process.env.NODE_ENV !== 'production',
         entities: [__dirname + '/../**/*.entity{.ts,.js}'],
       });
diff --git a/cortex-js/src/infrastructure/dtos/cortex/start-cortex.dto.ts b/cortex-js/src/infrastructure/dtos/cortex/start-cortex.dto.ts
index 12b9e6d2f..4745a3b3d 100644
--- a/cortex-js/src/infrastructure/dtos/cortex/start-cortex.dto.ts
+++ b/cortex-js/src/infrastructure/dtos/cortex/start-cortex.dto.ts
@@ -1,10 +1,24 @@
-import { IsIP, IsString } from 'class-validator';
+import { ApiProperty } from '@nestjs/swagger';
+import { IsIP, IsNumber, IsString, Max, Min } from 'class-validator';
+import { defaultCortexCppHost, defaultCortexCppPort } from 'constant';
 
 export class StartCortexDto {
+  @ApiProperty({
+    name: 'host',
+    description: 'Cortexcpp host',
+    default: defaultCortexCppHost,
+  })
   @IsString()
   @IsIP()
   host: string;
 
-  @IsString()
-  port: string;
+  @ApiProperty({
+    name: 'port',
+    description: 'Cortexcpp port',
+    default: defaultCortexCppPort,
+  })
+  @IsNumber()
+  @Min(0)
+  @Max(65535)
+  port: number;
 }
diff --git a/cortex-js/src/infrastructure/dtos/inference-settings/controller-props.dto.ts b/cortex-js/src/infrastructure/dtos/inference-settings/controller-props.dto.ts
deleted file mode 100644
index 15c4601f3..000000000
--- a/cortex-js/src/infrastructure/dtos/inference-settings/controller-props.dto.ts
+++ /dev/null
@@ -1,14 +0,0 @@
-import { IsOptional, IsString } from 'class-validator';
-import { ControllerProps } from '@/domain/models/inference-setting.interface';
-
-export class ControllerPropsDto implements ControllerProps {
-  @IsString()
-  placeholder: string;
-
-  @IsString()
-  value: string;
-
-  @IsOptional()
-  @IsString()
-  type?: string;
-}
diff --git a/cortex-js/src/infrastructure/dtos/inference-settings/create-inference-setting.dto.ts b/cortex-js/src/infrastructure/dtos/inference-settings/create-inference-setting.dto.ts
deleted file mode 100644
index ba34ec81c..000000000
--- a/cortex-js/src/infrastructure/dtos/inference-settings/create-inference-setting.dto.ts
+++ /dev/null
@@ -1,14 +0,0 @@
-import { Type } from 'class-transformer';
-import { IsArray, IsString, ValidateNested } from 'class-validator';
-import { InferenceSetting } from '@/domain/models/inference-setting.interface';
-import { InferenceSettingDocumentDto } from './inference-setting-document.dto';
-
-export class CreateInferenceSettingDto implements Partial<InferenceSetting> {
-  @IsString()
-  inferenceId: string;
-
-  @IsArray()
-  @ValidateNested({ each: true })
-  @Type(() => InferenceSettingDocumentDto)
-  settings: InferenceSettingDocumentDto[];
-}
diff --git a/cortex-js/src/infrastructure/dtos/inference-settings/inference-setting-document.dto.ts b/cortex-js/src/infrastructure/dtos/inference-settings/inference-setting-document.dto.ts
deleted file mode 100644
index bca718c6a..000000000
--- a/cortex-js/src/infrastructure/dtos/inference-settings/inference-setting-document.dto.ts
+++ /dev/null
@@ -1,23 +0,0 @@
-import { IsString, ValidateNested } from 'class-validator';
-import { InferenceSettingDocument } from '@/domain/models/inference-setting.interface';
-import { ControllerPropsDto } from './controller-props.dto';
-
-export class InferenceSettingDocumentDto implements InferenceSettingDocument {
-  @IsString()
-  key: string;
-
-  @IsString()
-  extensionName: string;
-
-  @IsString()
-  title: string;
-
-  @IsString()
-  description: string;
-
-  @IsString()
-  controllerType: string;
-
-  @ValidateNested()
-  controllerProps: ControllerPropsDto;
-}
diff --git a/cortex-js/src/infrastructure/dtos/inference-settings/update-inference-setting.dto.ts b/cortex-js/src/infrastructure/dtos/inference-settings/update-inference-setting.dto.ts
deleted file mode 100644
index 026dffded..000000000
--- a/cortex-js/src/infrastructure/dtos/inference-settings/update-inference-setting.dto.ts
+++ /dev/null
@@ -1,4 +0,0 @@
-import { PartialType } from '@nestjs/swagger';
-import { CreateInferenceSettingDto } from './create-inference-setting.dto';
-
-export class UpdateInferenceSettingDto extends PartialType(CreateInferenceSettingDto) {}
diff --git a/cortex-js/src/infrastructure/dtos/messages/thread-content.dto.ts b/cortex-js/src/infrastructure/dtos/messages/thread-content.dto.ts
index 8e062893d..a8d6b3337 100644
--- a/cortex-js/src/infrastructure/dtos/messages/thread-content.dto.ts
+++ b/cortex-js/src/infrastructure/dtos/messages/thread-content.dto.ts
@@ -1,8 +1,5 @@
 import { IsEnum, ValidateNested } from 'class-validator';
-import {
-  ContentType,
-  ThreadContent,
-} from '@/domain/models/message.interface';
+import { ContentType, ThreadContent } from '@/domain/models/message.interface';
 import { ContentValueDto } from './content-value.dto';
 
 export class ThreadContentDto implements ThreadContent {
diff --git a/cortex-js/src/infrastructure/dtos/models/download-model.dto.ts b/cortex-js/src/infrastructure/dtos/models/download-model.dto.ts
deleted file mode 100644
index 46ea3c12d..000000000
--- a/cortex-js/src/infrastructure/dtos/models/download-model.dto.ts
+++ /dev/null
@@ -1,6 +0,0 @@
-import { IsString } from 'class-validator';
-
-export class DownloadModelDto {
-  @IsString()
-  modelId: string;
-}
diff --git a/cortex-js/src/infrastructure/dtos/models/load-model.dto.ts b/cortex-js/src/infrastructure/dtos/models/load-model.dto.ts
deleted file mode 100644
index 5aaa07194..000000000
--- a/cortex-js/src/infrastructure/dtos/models/load-model.dto.ts
+++ /dev/null
@@ -1,11 +0,0 @@
-import { IsOptional, IsString, ValidateNested } from 'class-validator';
-import { ModelSettingParamsDto } from './model-setting-params.dto';
-
-export class LoadModelDto {
-  @IsString()
-  modelId: string;
-
-  @IsOptional()
-  @ValidateNested()
-  settings?: ModelSettingParamsDto;
-}
diff --git a/cortex-js/src/infrastructure/dtos/models/load-model-success.dto.ts b/cortex-js/src/infrastructure/dtos/models/start-model-success.dto.ts
similarity index 76%
rename from cortex-js/src/infrastructure/dtos/models/load-model-success.dto.ts
rename to cortex-js/src/infrastructure/dtos/models/start-model-success.dto.ts
index e604e80b9..01c0bba05 100644
--- a/cortex-js/src/infrastructure/dtos/models/load-model-success.dto.ts
+++ b/cortex-js/src/infrastructure/dtos/models/start-model-success.dto.ts
@@ -1,6 +1,6 @@
 import { IsString } from 'class-validator';
 
-export class LoadModelSuccessDto {
+export class StartModelSuccessDto {
   @IsString()
   message: string;
 
diff --git a/cortex-js/src/infrastructure/entities/inference-setting.entity.ts b/cortex-js/src/infrastructure/entities/inference-setting.entity.ts
deleted file mode 100644
index 9b6d0cb41..000000000
--- a/cortex-js/src/infrastructure/entities/inference-setting.entity.ts
+++ /dev/null
@@ -1,14 +0,0 @@
-import {
-  InferenceSetting,
-  InferenceSettingDocument,
-} from '@/domain/models/inference-setting.interface';
-import { Column, Entity, PrimaryColumn } from 'typeorm';
-
-@Entity('inference_setting')
-export class InferenceSettingEntity implements InferenceSetting {
-  @PrimaryColumn()
-  inferenceId: string;
-
-  @Column({ type: 'simple-json' })
-  settings: InferenceSettingDocument[];
-}
diff --git a/cortex-js/src/infrastructure/providers/cortex/cortex.provider.ts b/cortex-js/src/infrastructure/providers/cortex/cortex.provider.ts
index f1256f072..941f1b860 100644
--- a/cortex-js/src/infrastructure/providers/cortex/cortex.provider.ts
+++ b/cortex-js/src/infrastructure/providers/cortex/cortex.provider.ts
@@ -1,53 +1,55 @@
 import { Injectable } from '@nestjs/common';
 import { OAIEngineExtension } from '@/domain/abstracts/oai.abstract';
 import { PromptTemplate } from '@/domain/models/prompt-template.interface';
-import { basename, join, resolve } from 'path';
-import { Model } from '@/domain/models/model.interface';
-import { ConfigService } from '@nestjs/config';
+import { join, resolve } from 'path';
+import { Model, ModelSettingParams } from '@/domain/models/model.interface';
 import { HttpService } from '@nestjs/axios';
+import { defaultCortexCppHost, defaultCortexCppPort } from 'constant';
+import { readdirSync } from 'node:fs';
 
 /**
  * A class that implements the InferenceExtension interface from the @janhq/core package.
  * The class provides methods for initializing and stopping a model, and for making inference requests.
  * It also subscribes to events emitted by the @janhq/core package and handles new message requests.
  */
-const LOCAL_HOST = '127.0.0.1';
-const NITRO_DEFAULT_PORT = 3928;
-const NITRO_HTTP_SERVER_URL = `http://${LOCAL_HOST}:${NITRO_DEFAULT_PORT}`;
-const LOAD_MODEL_URL = `${NITRO_HTTP_SERVER_URL}/inferences/server/loadmodel`;
-const UNLOAD_MODEL_URL = `${NITRO_HTTP_SERVER_URL}/inferences/server/unloadmodel`;
-
 @Injectable()
 export default class CortexProvider extends OAIEngineExtension {
   provider: string = 'cortex';
-  apiUrl = 'http://127.0.0.1:3928/inferences/server/chat_completion';
+  apiUrl = `http://${defaultCortexCppHost}:${defaultCortexCppPort}/inferences/server/chat_completion`;
+
+  private loadModelUrl = `http://${defaultCortexCppHost}:${defaultCortexCppPort}/inferences/server/loadmodel`;
+  private unloadModelUrl = `http://${defaultCortexCppHost}:${defaultCortexCppPort}/inferences/server/unloadmodel`;
 
-  constructor(
-    private readonly configService: ConfigService,
-    protected readonly httpService: HttpService,
-  ) {
+  constructor(protected readonly httpService: HttpService) {
     super(httpService);
   }
 
-  override async loadModel(model: Model): Promise<void> {
-    const modelsContainerDir =
-      this.configService.get<string>('CORTEX_MODELS_DIR') ??
-      resolve('./models');
+  modelDir = () => resolve(__dirname, `../../../../models`);
+
+  override async loadModel(
+    model: Model,
+    settings?: ModelSettingParams,
+  ): Promise<void> {
+    const modelsContainerDir = this.modelDir();
 
     const modelFolderFullPath = join(modelsContainerDir, model.id);
-    //TODO: recheck this
-    const modelBinaryLocalPath = join(
-      modelFolderFullPath,
-      basename(model.sources[0].url),
-    );
+    const ggufFiles = readdirSync(modelFolderFullPath).filter((file) => {
+      return file.endsWith('.gguf');
+    });
+
+    if (ggufFiles.length === 0) {
+      throw new Error('Model binary not found');
+    }
 
-    // TODO: NamH check if the binary is there
+    const modelBinaryLocalPath = join(modelFolderFullPath, ggufFiles[0]);
 
-    const cpuThreadCount = 1; // TODO: NamH Math.max(1, nitroResourceProbe.numCpuPhysicalCore);
+    const cpuThreadCount = 1; // TODO: Math.max(1, nitroResourceProbe.numCpuPhysicalCore);
     const modelSettings = {
       // This is critical and requires real CPU physical core count (or performance core)
+      model: model.id,
       cpu_threads: cpuThreadCount,
       ...model.settings,
+      ...settings,
       llama_model_path: modelBinaryLocalPath,
       ...(model.settings.mmproj && {
         mmproj: join(modelFolderFullPath, model.settings.mmproj),
@@ -66,12 +68,12 @@ export default class CortexProvider extends OAIEngineExtension {
       modelSettings.ai_prompt = prompt.ai_prompt;
     }
 
-    await this.httpService.post(LOAD_MODEL_URL, modelSettings).toPromise();
+    await this.httpService.post(this.loadModelUrl, modelSettings).toPromise();
   }
 
   override async unloadModel(modelId: string): Promise<void> {
     await this.httpService
-      .post(UNLOAD_MODEL_URL, { model: modelId })
+      .post(this.unloadModelUrl, { model: modelId })
       .toPromise();
   }
 
diff --git a/cortex-js/src/usecases/chat/chat.module.ts b/cortex-js/src/usecases/chat/chat.module.ts
index 1f7c70090..e69b10b73 100644
--- a/cortex-js/src/usecases/chat/chat.module.ts
+++ b/cortex-js/src/usecases/chat/chat.module.ts
@@ -8,5 +8,6 @@ import { ExtensionModule } from '@/infrastructure/repositories/extensions/extens
   imports: [DatabaseModule, ExtensionModule],
   controllers: [ChatController],
   providers: [ChatUsecases],
+  exports: [ChatUsecases],
 })
 export class ChatModule {}
diff --git a/cortex-js/src/usecases/chat/chat.usecases.ts b/cortex-js/src/usecases/chat/chat.usecases.ts
index f4c338b0a..6386e57d8 100644
--- a/cortex-js/src/usecases/chat/chat.usecases.ts
+++ b/cortex-js/src/usecases/chat/chat.usecases.ts
@@ -1,10 +1,10 @@
 import { Inject, Injectable } from '@nestjs/common';
 import { CreateChatCompletionDto } from '@/infrastructure/dtos/chat/create-chat-completion.dto';
-import { Response } from 'express';
 import { ExtensionRepository } from '@/domain/repositories/extension.interface';
 import { Repository } from 'typeorm';
 import { ModelEntity } from '@/infrastructure/entities/model.entity';
 import { EngineExtension } from '@/domain/abstracts/engine.abstract';
+import { ChatStreamEvent } from '@/domain/abstracts/oai.abstract';
 
 @Injectable()
 export class ChatUsecases {
@@ -17,7 +17,8 @@ export class ChatUsecases {
   async createChatCompletions(
     createChatDto: CreateChatCompletionDto,
     headers: Record<string, string>,
-    res: Response,
+    stream: WritableStream<ChatStreamEvent>,
+    res?: any,
   ) {
     const extensions = (await this.extensionRepository.findAll()) ?? [];
     const model = await this.modelRepository.findOne({
@@ -26,6 +27,6 @@ export class ChatUsecases {
     const engine = extensions.find((e: any) => e.provider === model?.engine) as
       | EngineExtension
       | undefined;
-    await engine?.inference(createChatDto, headers, res);
+    engine?.inference(createChatDto, headers, stream, res);
   }
 }
diff --git a/cortex-js/src/usecases/cortex/cortex.usecases.ts b/cortex-js/src/usecases/cortex/cortex.usecases.ts
index 846af2ada..1decf6b97 100644
--- a/cortex-js/src/usecases/cortex/cortex.usecases.ts
+++ b/cortex-js/src/usecases/cortex/cortex.usecases.ts
@@ -1,23 +1,20 @@
-import { Injectable, InternalServerErrorException } from '@nestjs/common';
-import { ConfigService } from '@nestjs/config';
+import { Injectable } from '@nestjs/common';
 import { ChildProcess, spawn } from 'child_process';
 import { join } from 'path';
-import { existsSync } from 'fs';
 import { CortexOperationSuccessfullyDto } from '@/infrastructure/dtos/cortex/cortex-operation-successfully.dto';
 import { HttpService } from '@nestjs/axios';
+import { defaultCortexCppHost, defaultCortexCppPort } from 'constant';
+import { existsSync } from 'node:fs';
 
 @Injectable()
 export class CortexUsecases {
   private cortexProcess: ChildProcess | undefined;
 
-  constructor(
-    private readonly configService: ConfigService,
-    private readonly httpService: HttpService,
-  ) {}
+  constructor(private readonly httpService: HttpService) {}
 
   async startCortex(
-    host: string,
-    port: string,
+    host: string = defaultCortexCppHost,
+    port: number = defaultCortexCppPort,
   ): Promise<CortexOperationSuccessfullyDto> {
     if (this.cortexProcess) {
       return {
@@ -26,23 +23,25 @@ export class CortexUsecases {
       };
     }
 
-    const binaryPath = this.configService.get<string>('CORTEX_BINARY_PATH');
-    if (!binaryPath || !existsSync(binaryPath)) {
-      throw new InternalServerErrorException('Cortex binary not found');
+    const args: string[] = ['1', host, `${port}`];
+    const cortexCppPath = join(
+      __dirname,
+      '../../../cortex-cpp/cortex-cpp' +
+        `${process.platform === 'win32' ? '.exe' : ''}`,
+    );
+
+    if (!existsSync(cortexCppPath)) {
+      throw new Error('Cortex binary not found');
     }
 
-    const args: string[] = ['1', host, port];
     // go up one level to get the binary folder, have to also work on windows
-    const binaryFolder = join(binaryPath, '..');
-
-    this.cortexProcess = spawn(binaryPath, args, {
+    this.cortexProcess = spawn(cortexCppPath, args, {
       detached: false,
-      cwd: binaryFolder,
+      cwd: join(__dirname, '../../../cortex-cpp'),
       stdio: 'inherit',
       env: {
         ...process.env,
-        // TODO: NamH need to get below information
-        // CUDA_VISIBLE_DEVICES: executableOptions.cudaVisibleDevices,
+        CUDA_VISIBLE_DEVICES: '0',
         // // Vulkan - Support 1 device at a time for now
         // ...(executableOptions.vkVisibleDevices?.length > 0 && {
         //   GGML_VULKAN_DEVICE: executableOptions.vkVisibleDevices[0],
@@ -79,7 +78,7 @@ export class CortexUsecases {
         .delete(`http://${host}:${port}/processmanager/destroy`)
         .toPromise();
     } catch (err) {
-      console.error(err);
+      console.error(err?.response?.data ?? err); // errors without an HTTP response (e.g. connection refused) have no `response`
     } finally {
       this.cortexProcess?.kill();
       return {
diff --git a/cortex-js/src/usecases/inference-settings/inference-settings.module.ts b/cortex-js/src/usecases/inference-settings/inference-settings.module.ts
deleted file mode 100644
index d7ca6d05e..000000000
--- a/cortex-js/src/usecases/inference-settings/inference-settings.module.ts
+++ /dev/null
@@ -1,12 +0,0 @@
-import { Module } from '@nestjs/common';
-import { InferenceSettingsUsecases } from './inference-settings.usecases';
-import { InferenceSettingsController } from '@/infrastructure/controllers/inference-settings.controller';
-import { DatabaseModule } from '@/infrastructure/database/database.module';
-
-@Module({
-  imports: [DatabaseModule],
-  controllers: [InferenceSettingsController],
-  providers: [InferenceSettingsUsecases],
-  exports: [InferenceSettingsUsecases],
-})
-export class InferenceSettingsModule {}
diff --git a/cortex-js/src/usecases/inference-settings/inference-settings.usecases.spec.ts b/cortex-js/src/usecases/inference-settings/inference-settings.usecases.spec.ts
deleted file mode 100644
index a47dd23b2..000000000
--- a/cortex-js/src/usecases/inference-settings/inference-settings.usecases.spec.ts
+++ /dev/null
@@ -1,18 +0,0 @@
-import { Test, TestingModule } from '@nestjs/testing';
-import { InferenceSettingsUsecases } from './inference-settings.usecases';
-
-describe('InferenceSettingsService', () => {
-  let service: InferenceSettingsUsecases;
-
-  beforeEach(async () => {
-    const module: TestingModule = await Test.createTestingModule({
-      providers: [InferenceSettingsUsecases],
-    }).compile();
-
-    service = module.get<InferenceSettingsUsecases>(InferenceSettingsUsecases);
-  });
-
-  it('should be defined', () => {
-    expect(service).toBeDefined();
-  });
-});
diff --git a/cortex-js/src/usecases/inference-settings/inference-settings.usecases.ts b/cortex-js/src/usecases/inference-settings/inference-settings.usecases.ts
deleted file mode 100644
index 8c512aa72..000000000
--- a/cortex-js/src/usecases/inference-settings/inference-settings.usecases.ts
+++ /dev/null
@@ -1,38 +0,0 @@
-import { Inject, Injectable } from '@nestjs/common';
-import { CreateInferenceSettingDto } from '@/infrastructure/dtos/inference-settings/create-inference-setting.dto';
-import { UpdateInferenceSettingDto } from '@/infrastructure/dtos/inference-settings/update-inference-setting.dto';
-import { Repository } from 'typeorm';
-import { InferenceSettingEntity } from '@/infrastructure/entities/inference-setting.entity';
-
-@Injectable()
-export class InferenceSettingsUsecases {
-  constructor(
-    @Inject('INFERENCE_SETTING_REPOSITORY')
-    private inferenceSettingRepository: Repository<InferenceSettingEntity>,
-  ) {}
-
-  create(createInferenceSettingDto: CreateInferenceSettingDto) {
-    return this.inferenceSettingRepository.insert(createInferenceSettingDto);
-  }
-
-  findAll() {
-    return this.inferenceSettingRepository.find();
-  }
-
-  findOne(id: string) {
-    return this.inferenceSettingRepository.findOne({
-      where: { inferenceId: id },
-    });
-  }
-
-  update(id: string, updateInferenceSettingDto: UpdateInferenceSettingDto) {
-    return this.inferenceSettingRepository.update(
-      id,
-      updateInferenceSettingDto,
-    );
-  }
-
-  remove(id: string) {
-    return this.inferenceSettingRepository.delete(id);
-  }
-}
diff --git a/cortex-js/src/usecases/models/models.usecases.ts b/cortex-js/src/usecases/models/models.usecases.ts
index c76beeb3a..d7843d7d8 100644
--- a/cortex-js/src/usecases/models/models.usecases.ts
+++ b/cortex-js/src/usecases/models/models.usecases.ts
@@ -5,7 +5,7 @@ import { BadRequestException, Inject, Injectable } from '@nestjs/common';
 import { Repository } from 'typeorm';
 import { Model, ModelFormat } from '@/domain/models/model.interface';
 import { ModelNotFoundException } from '@/infrastructure/exception/model-not-found.exception';
-import { join, basename } from 'path';
+import { join, basename, resolve } from 'path';
 import {
   promises,
   createWriteStream,
@@ -13,13 +13,11 @@ import {
   mkdirSync,
   rmdirSync,
 } from 'fs';
-import { LoadModelSuccessDto } from '@/infrastructure/dtos/models/load-model-success.dto';
-import { LoadModelDto } from '@/infrastructure/dtos/models/load-model.dto';
-import { DownloadModelDto } from '@/infrastructure/dtos/models/download-model.dto';
-import { ConfigService } from '@nestjs/config';
+import { StartModelSuccessDto } from '@/infrastructure/dtos/models/start-model-success.dto';
 import { ExtensionRepository } from '@/domain/repositories/extension.interface';
 import { EngineExtension } from '@/domain/abstracts/engine.abstract';
 import { HttpService } from '@nestjs/axios';
+import { ModelSettingParamsDto } from '@/infrastructure/dtos/models/model-setting-params.dto';
 
 @Injectable()
 export class ModelsUsecases {
@@ -27,7 +25,6 @@ export class ModelsUsecases {
     @Inject('MODEL_REPOSITORY')
     private readonly modelRepository: Repository<ModelEntity>,
     private readonly extensionRepository: ExtensionRepository,
-    private readonly configService: ConfigService,
     private readonly httpService: HttpService,
   ) {}
 
@@ -66,8 +63,7 @@ export class ModelsUsecases {
   }
 
   async remove(id: string) {
-    const modelsContainerDir =
-      this.configService.get<string>('CORTEX_MODELS_DIR') ?? './models';
+    const modelsContainerDir = this.modelDir();
 
     if (!existsSync(modelsContainerDir)) {
       return;
@@ -86,8 +82,11 @@ export class ModelsUsecases {
       });
   }
 
-  async startModel(loadModelDto: LoadModelDto): Promise<LoadModelSuccessDto> {
-    const model = await this.getModelOrThrow(loadModelDto.modelId);
+  async startModel(
+    modelId: string,
+    settings?: ModelSettingParamsDto,
+  ): Promise<StartModelSuccessDto> {
+    const model = await this.getModelOrThrow(modelId);
     const extensions = (await this.extensionRepository.findAll()) ?? [];
     const engine = extensions.find((e: any) => e.provider === model?.engine) as
       | EngineExtension
@@ -96,27 +95,28 @@ export class ModelsUsecases {
     if (!engine) {
       return {
         message: 'No extension handler found for model',
-        modelId: loadModelDto.modelId,
+        modelId: modelId,
       };
     }
 
     return engine
-      .loadModel(model)
+      .loadModel(model, settings)
       .then(() => {
         return {
           message: 'Model loaded successfully',
-          modelId: loadModelDto.modelId,
+          modelId: modelId,
         };
       })
       .catch((err) => {
         console.error(err);
         return {
           message: 'Model failed to load',
-          modelId: loadModelDto.modelId,
+          modelId: modelId,
         };
       });
   }
-  async stopModel(modelId: string): Promise<LoadModelSuccessDto> {
+
+  async stopModel(modelId: string): Promise<StartModelSuccessDto> {
     const model = await this.getModelOrThrow(modelId);
     const extensions = (await this.extensionRepository.findAll()) ?? [];
     const engine = extensions.find((e: any) => e.provider === model?.engine) as
@@ -147,11 +147,10 @@ export class ModelsUsecases {
       });
   }
 
-  async downloadModel(
-    downloadModelDto: DownloadModelDto,
-    callback?: (progress: number) => void,
-  ) {
-    const model = await this.getModelOrThrow(downloadModelDto.modelId);
+  modelDir = () => resolve(__dirname, `../../../models`); // absolute path of the `models` folder three levels above this module's directory
+
+  async downloadModel(modelId: string, callback?: (progress: number) => void) {
+    const model = await this.getModelOrThrow(modelId);
 
     if (model.format === ModelFormat.API) {
       throw new BadRequestException('Cannot download remote model');
@@ -165,11 +164,10 @@ export class ModelsUsecases {
     }
 
     const fileName = basename(downloadUrl);
-    const modelsContainerDir =
-      this.configService.get<string>('CORTEX_MODELS_DIR') ?? './models';
+    const modelsContainerDir = this.modelDir();
 
     if (!existsSync(modelsContainerDir)) {
-      await mkdirSync(modelsContainerDir, { recursive: true });
+      mkdirSync(modelsContainerDir, { recursive: true });
     }
 
     const modelFolder = join(modelsContainerDir, model.id);
diff --git a/cortex-js/tsconfig.json b/cortex-js/tsconfig.json
index b7b0011be..f52125fca 100644
--- a/cortex-js/tsconfig.json
+++ b/cortex-js/tsconfig.json
@@ -18,6 +18,7 @@
     "strictBindCallApply": true,
     "forceConsistentCasingInFileNames": true,
     "noFallthroughCasesInSwitch": true,
+    "esModuleInterop": true,
     "paths": {
       "@/*": ["src/*"]
     }
diff --git a/package.json b/package.json
deleted file mode 100644
index a782f0925..000000000
--- a/package.json
+++ /dev/null
@@ -1,21 +0,0 @@
-{
-  "name": "@janhq/cortex",
-  "version": "1.0.0",
-  "license": "AGPL-3.0",
-  "scripts": {
-    "preinstall": "npm pre-install script; platform specific (MacOS / Windows / Linux)",
-    "dev": "cd cortex-js && yarn start",
-    "build": "cd cortex-js && yarn build"
-  },
-  "dependencies": {
-    "express": "^4.17.1",
-    "typeorm": "^0.2.37",
-    "pg": "^8.7.1",
-    "dotenv": "^10.0.0"
-  },
-  "devDependencies": {
-    "@types/express": "^4.17.13",
-    "@types/node": "^14.14.33",
-    "typescript": "^4F3.5"
-  }
-}