feat: add engine pull support for tensorrt-llm #765

Merged 5 commits on Jun 25, 2024
@@ -13,6 +13,7 @@ import { InitCliUsecases } from '../usecases/init.cli.usecases';
import { existsSync } from 'node:fs';
import { FileManagerService } from '@/infrastructure/services/file-manager/file-manager.service';
import { join } from 'node:path';
+ import { Engines } from '../types/engine.interface';

type ModelStartOptions = {
attach: boolean;
@@ -71,7 +72,7 @@ export class ModelStartCommand extends CommandRunner {
engine,
);
}
- if (engine === 'cortex.onnx' && process.platform !== 'win32') {
+ if (engine === Engines.onnx && process.platform !== 'win32') {
console.error('The ONNX engine does not support this OS yet.');
process.exit(1);
}
@@ -0,0 +1,5 @@
export enum Engines {
llamaCPP = 'cortex.llamacpp',
onnx = 'cortex.onnx',
tensorrtLLM = 'cortex.tensorrt-llm',
}
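The new Engines enum centralizes the engine identifiers that the rest of this PR previously handled as string literals ('cortex.llamacpp', 'cortex.onnx', 'cortex.tensorrt-llm'). As a rough illustration of how the enum can replace those literal checks, a hypothetical helper (not part of this PR) might look like:

import { Engines } from './engine.interface';

// Hypothetical helper: map a model id to an engine, mirroring the
// modelId.includes('onnx') / modelId.includes('tensorrt') checks used later in this diff.
const resolveEngine = (modelId: string): Engines => {
  if (modelId.includes('onnx')) return Engines.onnx;
  if (modelId.includes('tensorrt')) return Engines.tensorrtLLM;
  return Engines.llamaCPP;
};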
@@ -63,6 +63,8 @@ export class ChatCliUsecases {
rl.on('line', sendCompletionMessage.bind(this));

async function sendCompletionMessage(userInput: string) {
+ if (!userInput || userInput.trim() === '') return;

if (userInput.trim() === this.exitClause) {
rl.close();
return;
@@ -98,12 +100,7 @@ export class ChatCliUsecases {
model: modelId,
stream: true,
max_tokens: 4098,
- stop: [],
- frequency_penalty: 0.7,
- presence_penalty: 0.7,
- temperature: 0.7,
- top_p: 0.7,

// Override with model settings
...parser.parseModelInferenceParams(model),
};
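With the hardcoded sampling defaults gone, the chat payload leans on object-spread ordering: parser.parseModelInferenceParams(model) is spread last, so per-model inference settings override the base fields above it. A minimal sketch of that override pattern, with hypothetical values:

// Later spreads win, so the model's own settings take precedence.
const base = { model: 'tinyllama', stream: true, max_tokens: 4098 };
const modelSettings = { max_tokens: 2048, temperature: 0.6 }; // hypothetical parsed params
const payload = { ...base, ...modelSettings };
// payload.max_tokens === 2048 and payload.temperature === 0.6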
@@ -12,11 +12,12 @@ import { rm } from 'fs/promises';
import { exec } from 'child_process';
import { appPath } from '@/utils/app-path';
import {
- CORTEX_ONNX_ENGINE_RELEASES_URL,
+ CORTEX_ENGINE_RELEASES_URL,
CORTEX_RELEASES_URL,
CUDA_DOWNLOAD_URL,
} from '@/infrastructure/constants/cortex';
import { checkNvidiaGPUExist, cudaVersion } from '@/utils/cuda';
+ import { Engines } from '../types/engine.interface';

@Injectable()
export class InitCliUsecases {
@@ -70,13 +71,14 @@ export class InitCliUsecases {
)
await this.installLlamaCppEngine(options, version);

- if (engine === 'cortex.onnx' && process.platform === 'win32')
-   await this.installONNXEngine();
- else if (engine === 'cortex.onnx' && process.platform !== 'win32') {
+ if (engine === Engines.onnx && process.platform !== 'win32') {
console.error('The ONNX engine does not support this OS yet.');
process.exit(1);
}

+ if (engine !== 'cortex.llamacpp')
+   await this.installAcceleratedEngine('latest', engine);

configs.initialized = true;
await this.fileManagerService.writeConfigFile(configs);
};
@@ -305,17 +307,17 @@ export class InitCliUsecases {
};

/**
- * Download and install ONNX engine
+ * Download and install accelerated engine
* @param version
* @param engineFileName
*/
- private async installONNXEngine(
+ private async installAcceleratedEngine(
version: string = 'latest',
- engineFileName: string = 'windows-amd64',
+ engine: string = Engines.onnx,
) {
const res = await firstValueFrom(
this.httpService.get(
- CORTEX_ONNX_ENGINE_RELEASES_URL +
+ CORTEX_ENGINE_RELEASES_URL(engine) +
`${version === 'latest' ? '/latest' : ''}`,
{
headers: {
@@ -338,15 +340,17 @@
);
}
const toDownloadAsset = release.assets.find((s: any) =>
- s.name.includes(engineFileName),
+ s.name.includes(process.platform === 'win32' ? 'windows' : 'linux'),
);

if (!toDownloadAsset) {
- console.log(`Could not find engine file ${engineFileName}`);
+ console.log(
+   `Could not find engine file for platform ${process.platform}`,
+ );
exit(1);
}

- console.log(`Downloading ONNX engine file ${engineFileName}`);
+ console.log(`Downloading engine file ${toDownloadAsset.name}`);
const dataFolderPath = await this.fileManagerService.getDataFolderPath();
const engineDir = join(dataFolderPath, 'cortex-cpp');

@@ -397,10 +401,10 @@
await rm(destination, { force: true });

// Copy the additional files to the cortex-cpp directory
- for (const file of readdirSync(join(engineDir, 'engines', 'cortex.onnx'))) {
+ for (const file of readdirSync(join(engineDir, 'engines', engine))) {
if (file !== 'engine.dll') {
await cpSync(
- join(engineDir, 'engines', 'cortex.onnx', file),
+ join(engineDir, 'engines', engine, file),
join(engineDir, file),
);
}
@@ -16,9 +16,10 @@ import { join, basename } from 'path';
import { load } from 'js-yaml';
import { existsSync, readdirSync, readFileSync } from 'fs';
import { isLocalModel, normalizeModelId } from '@/utils/normalize-model-id';
- import { getHFModelMetadata } from '@/utils/huggingface';
+ import { fetchJanRepoData, getHFModelMetadata } from '@/utils/huggingface';
import { createWriteStream, mkdirSync, promises } from 'node:fs';
import { firstValueFrom } from 'rxjs';
+ import { Engines } from '../types/engine.interface';

@Injectable()
export class ModelsCliUsecases {
@@ -120,8 +121,8 @@
process.exit(1);
}

- if (modelId.includes('onnx')) {
-   await this.pullOnnxModel(modelId);
+ if (modelId.includes('onnx') || modelId.includes('tensorrt')) {
+   await this.pullEngineModelFiles(modelId);
} else {
await this.pullGGUFModel(modelId);
const bar = new SingleBar({}, Presets.shades_classic);
@@ -151,10 +152,10 @@
}

/**
- * It's to pull ONNX model from HuggingFace repository
+ * It's to pull engine model files from HuggingFace repository
* @param modelId
*/
- private async pullOnnxModel(modelId: string) {
+ private async pullEngineModelFiles(modelId: string) {
const modelsContainerDir = await this.fileService.getModelsPath();

if (!existsSync(modelsContainerDir)) {
@@ -164,35 +165,22 @@
const modelFolder = join(modelsContainerDir, normalizeModelId(modelId));
await promises.mkdir(modelFolder, { recursive: true }).catch(() => {});

- const files = [
-   'genai_config.json',
-   'model.onnx',
-   'model.onnx.data',
-   'model.yml',
-   'special_tokens_map.json',
-   'tokenizer.json',
-   'tokenizer_config.json',
- ];
- const repo = modelId.split(':')[0];
- const branch = modelId.split(':')[1] || 'default';
+ const files = (await fetchJanRepoData(modelId)).siblings;
for (const file of files) {
- console.log(`Downloading ${file}`);
+ console.log(`Downloading ${file.rfilename}`);
const bar = new SingleBar({}, Presets.shades_classic);
bar.start(100, 0);
const response = await firstValueFrom(
- this.httpService.get(
-   `https://huggingface.co/cortexhub/${repo}/resolve/${branch}/${file}?download=true`,
-   {
-     responseType: 'stream',
-   },
- ),
+ this.httpService.get(file.downloadUrl ?? '', {
+   responseType: 'stream',
+ }),
);
if (!response) {
throw new Error('Failed to download model');
}

await new Promise((resolve, reject) => {
- const writer = createWriteStream(join(modelFolder, file));
+ const writer = createWriteStream(join(modelFolder, file.rfilename));
let receivedBytes = 0;
const totalBytes = response.headers['content-length'];

@@ -281,7 +269,7 @@
// Default Model Settings
ctx_len: 4096,
ngl: 100,
- engine: modelId.includes('onnx') ? 'cortex.onnx' : 'cortex.llamacpp',
+ engine: Engines.llamaCPP,
};
if (!(await this.modelsUsecases.findOne(modelId)))
await this.modelsUsecases.create(model);
1 change: 0 additions & 1 deletion cortex-js/src/infrastructure/constants/benchmark.ts
@@ -18,7 +18,6 @@ export const defaultBenchmarkConfiguration: BenchmarkConfig = {
model: 'tinyllama',
stream: true,
max_tokens: 2048,
- stop: [],
frequency_penalty: 0,
presence_penalty: 0,
temperature: 0.7,
4 changes: 2 additions & 2 deletions cortex-js/src/infrastructure/constants/cortex.ts
@@ -42,8 +42,8 @@ export const CORTEX_JS_STOP_API_SERVER_URL = (
export const CORTEX_RELEASES_URL =
'https://api.github.com/repos/janhq/cortex/releases';

- export const CORTEX_ONNX_ENGINE_RELEASES_URL =
-   'https://api.github.com/repos/janhq/cortex.onnx/releases';
+ export const CORTEX_ENGINE_RELEASES_URL = (engine: string) =>
+   `https://api.github.com/repos/janhq/${engine}/releases`;

export const CUDA_DOWNLOAD_URL =
'https://catalog.jan.ai/dist/cuda-dependencies/<version>/<platform>/cuda.tar.gz';
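The single-purpose ONNX releases constant above becomes a function of the engine name, so one helper now serves cortex.onnx, cortex.tensorrt-llm, and any future engine repo under janhq. A quick sketch of the URLs it yields (illustrative usage, not code from this PR):

import { CORTEX_ENGINE_RELEASES_URL } from '@/infrastructure/constants/cortex';
import { Engines } from '@/infrastructure/commanders/types/engine.interface';

// e.g. https://api.github.com/repos/janhq/cortex.tensorrt-llm/releases
const tensorrtReleases = CORTEX_ENGINE_RELEASES_URL(Engines.tensorrtLLM);
// installAcceleratedEngine appends '/latest' for the default 'latest' version.
const latestOnnx = `${CORTEX_ENGINE_RELEASES_URL(Engines.onnx)}/latest`;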
@@ -7,6 +7,7 @@ import { EngineExtension } from '@/domain/abstracts/engine.abstract';
import { appPath } from '@/utils/app-path';
import { FileManagerService } from '@/infrastructure/services/file-manager/file-manager.service';
import { existsSync } from 'fs';
+ import { Engines } from '@/infrastructure/commanders/types/engine.interface';

@Injectable()
export class ExtensionRepositoryImpl implements ExtensionRepository {
@@ -18,9 +19,9 @@ export class ExtensionRepositoryImpl implements ExtensionRepository {
private readonly cortexProvider: EngineExtension,
private readonly fileService: FileManagerService,
) {
- this.extensions.set('cortex.llamacpp', this.cortexProvider);
- this.extensions.set('cortex.onnx', this.cortexProvider);
- this.extensions.set('cortex.tensorrt-llm', this.cortexProvider);
+ this.extensions.set(Engines.llamaCPP, this.cortexProvider);
+ this.extensions.set(Engines.onnx, this.cortexProvider);
+ this.extensions.set(Engines.tensorrtLLM, this.cortexProvider);
this.loadCoreExtensions();
this.loadExternalExtensions();
}
3 changes: 2 additions & 1 deletion cortex-js/src/usecases/models/models.usecases.ts
@@ -40,6 +40,7 @@ import { EventEmitter2 } from '@nestjs/event-emitter';
import { ModelEvent, ModelId, ModelStatus } from '@/domain/models/model.event';
import { DownloadManagerService } from '@/infrastructure/services/download-manager/download-manager.service';
import { ContextService } from '@/infrastructure/services/context/context.service';
+ import { Engines } from '@/infrastructure/commanders/types/engine.interface';

@Injectable()
export class ModelsUsecases {
@@ -466,7 +467,7 @@
// Default Model Settings
ctx_len: 4096,
ngl: 100,
- engine: modelId.includes('onnx') ? 'cortex.onnx' : 'cortex.llamacpp',
+ engine: Engines.llamaCPP,
};
if (!(await this.findOne(modelId))) await this.create(model);
}
50 changes: 50 additions & 0 deletions cortex-js/src/utils/cuda.ts
@@ -3,6 +3,13 @@ import { existsSync } from 'fs';
import { delimiter } from 'path';
import { checkFileExistenceInPaths } from './app-path';

export type GpuSettingInfo = {
id: string;
vram: string;
name: string;
arch?: string;
};

/**
* Return the CUDA version installed on the system
* @returns CUDA Version 11 | 12
@@ -63,3 +70,46 @@ export const checkNvidiaGPUExist = (): Promise<boolean> => {
});
});
};

/**
* Get GPU information from the system
* @returns GPU information
*/
export const getGpuInfo = async (): Promise<GpuSettingInfo[]> =>
new Promise((resolve) => {
exec(
'nvidia-smi --query-gpu=index,memory.total,name --format=csv,noheader,nounits',
async (error, stdout) => {
if (!error) {
// Get GPU info and gpu has higher memory first
let highestVram = 0;
let highestVramId = '0';
const gpus: GpuSettingInfo[] = stdout
.trim()
.split('\n')
.map((line) => {
let [id, vram, name] = line.split(', ');
const arch = getGpuArch(name);
vram = vram.replace(/\r/g, '');
if (parseFloat(vram) > highestVram) {
highestVram = parseFloat(vram);
highestVramId = id;
}
return { id, vram, name, arch };
});

resolve(gpus);
} else {
resolve([]);
}
},
);
});

const getGpuArch = (gpuName: string): string => {
if (!gpuName.toLowerCase().includes('nvidia')) return 'unknown';

if (gpuName.includes('30')) return 'ampere';
else if (gpuName.includes('40')) return 'ada';
else return 'unknown';
};
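getGpuInfo shells out to nvidia-smi, parses one CSV row per GPU, and resolves to an empty array when the command fails, so callers never have to handle a rejection. A minimal usage sketch (the logging is illustrative, not part of this PR):

import { getGpuInfo } from '@/utils/cuda';

(async () => {
  const gpus = await getGpuInfo();
  if (gpus.length === 0) {
    console.log('No NVIDIA GPU detected (or nvidia-smi is not available).');
    return;
  }
  for (const gpu of gpus) {
    // e.g. "0: NVIDIA GeForce RTX 4090 - 24564 MiB (ada)"
    console.log(`${gpu.id}: ${gpu.name} - ${gpu.vram} MiB (${gpu.arch})`);
  }
})();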
6 changes: 2 additions & 4 deletions cortex-js/src/utils/huggingface.ts
@@ -20,6 +20,7 @@ import {
} from '@/infrastructure/constants/prompt-constants';
import { gguf } from '@huggingface/gguf';
import axios from 'axios';
+ import { parseModelHubEngineBranch } from './normalize-model-id';

// TODO: move this to somewhere else, should be reused by API as well. Maybe in a separate service / provider?
export function guessPromptTemplateFromHuggingFace(jinjaCode?: string): string {
@@ -64,7 +65,6 @@ export function guessPromptTemplateFromHuggingFace(jinjaCode?: string): string {
export async function fetchHuggingFaceRepoData(
repoId: string,
): Promise<HuggingFaceRepoData> {

const sanitizedUrl = getRepoModelsUrl(repoId);

const { data: response } = await axios.get(sanitizedUrl);
@@ -113,7 +113,7 @@ export async function fetchJanRepoData(
modelId: string,
): Promise<HuggingFaceRepoData> {
const repo = modelId.split(':')[0];
- const tree = modelId.split(':')[1] ?? 'default';
+ const tree = await parseModelHubEngineBranch(modelId.split(':')[1] ?? 'default');
const url = getRepoModelsUrl(`cortexhub/${repo}`, tree);

const res = await fetch(url);
@@ -164,8 +164,6 @@

data.modelUrl = url;



return data;
}
