This repository has been archived by the owner on Aug 30, 2024. It is now read-only.

Commit

rename scale
Zhenzhong1 committed Mar 7, 2024
1 parent 4b71287 commit d37ffe0
Showing 33 changed files with 67 additions and 67 deletions.
6 changes: 3 additions & 3 deletions developer_document.md
@@ -8,7 +8,7 @@ For simplicity, we take [polyglot](https://huggingface.co/EleutherAI/polyglot-ko

Firstly, we need to add its temp buffer in its [related model-arch header file](neural_speed/models/gptneox/gptneox.h) and [re-compile](README.md#Install).
```diff
-static const model_scratch gptneox_mem_req(int n_layers, float enlarge_scale = 1.0f) {
+static const model_scratch gptneox_mem_req(int n_layers, float scratch_size_ratio = 1.0f) {
switch (n_layers) {
case 44:
return {2048ull * MB, 2048ull * MB, 4096ull * MB};
@@ -167,7 +167,7 @@ and update [model_name_to_arch()](neural_speed/models/model_utils/model_types.h#
+ NEW_MODEL_13B,
+};

-+static const model_scratch new_model_mem_req(int n_layers, float enlarge_scale = 1.0f) {
++static const model_scratch new_model_mem_req(int n_layers, float scratch_size_ratio = 1.0f) {
+ switch (n_layers) {
+ case N:
+ return {8192ull * MB, 8192ull * MB, 8192ull * MB};
@@ -390,7 +390,7 @@ We recommend to use continuous batching way since it has no padding effect and c
+ ne_view_2d(ctx0, KQV_merged_contiguous, head_size * n_head, attn_sl * attn_bs, head_size * n_head * ne_element_size(KQV_merged_contiguous), ne_element_size(KQV_merged_contiguous) * off_sl)));
+ off_sl += head_size * n_head * attn_sl * attn_bs;
```
->Note: You can set larger [`NE_MAX_NODES`](neural_speed/core/ne.h#43) and [`model_scratch_enlarge_scale`](neural_speed/models/llama/llama.h#29) values if out of memory when the inputs' batch size becomes larger.
+>Note: You can set larger [`NE_MAX_NODES`](neural_speed/core/ne.h#43) and [`model_scratch_size_ratio`](neural_speed/models/llama/llama.h#29) values if out of memory when the inputs' batch size becomes larger.
## 2.3. Application
- Q4_0 quant: We can quantize the model generated by convert into an int4 low-bit file by adding a quant layer class, so as to obtain better inference performance. Register the quant layer class in your new_model_utils.cpp, just like [gptneox_utils.cpp](neural_speed/models/gptneox/gptneox_utils.cpp#L163), replacing `gptneox_quant_layer` with your `new_model_quant_layer`.
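For context on the renamed parameter, the sketch below shows how a `*_mem_req` helper can apply `scratch_size_ratio` to its scratch sizes, following the pattern this commit uses in `gptj.h` and `llama.h`. It is only an illustration: `NEW_MODEL`'s layer count `N` and the 8192 MB figures are the placeholders from developer_document.md, not real values.

```cpp
// Illustration only: scale each scratch buffer by scratch_size_ratio, mirroring
// gptj_mem_req / llama_mem_req elsewhere in this commit. N and 8192 MB are the
// placeholder values used in developer_document.md.
static const model_scratch new_model_mem_req(int n_layers, float scratch_size_ratio = 1.0f) {
  switch (n_layers) {
    case N:
      return {
          static_cast<unsigned long long>(scratch_size_ratio * 8192) * MB,
          static_cast<unsigned long long>(scratch_size_ratio * 8192) * MB,
          static_cast<unsigned long long>(scratch_size_ratio * 8192) * MB,
      };
    default:
      MODEL_ASSERT(false);
  }
}
```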
22 changes: 11 additions & 11 deletions neural_speed/application/main_pybind.cpp
@@ -84,7 +84,7 @@ void init_gpt_params(gpt_params* params, const std::string& model_path, int max_
bool early_stopping = false, int n_keep = 0, int n_discard = -1, bool shift_roped_k = false,
int batch_size = 1, model_vocab::id pad_token = -1, const std::string& memory_dtype = "auto",
bool continuous_batching = true, const int& max_request_num = MODEL_MAX_REQUEST_NUM,
-const float& model_scratch_enlarge_scale = 1.0f) {
+const float& model_scratch_size_ratio = 1.0f) {
MODEL_ASSERT(params != nullptr);
#ifdef MODEL_NAME
params->model_name = MODEL_NAME;
@@ -126,7 +126,7 @@ void init_gpt_params(gpt_params* params, const std::string& model_path, int max_
params->min_new_tokens = min_new_tokens;
params->length_penalty = length_penalty;
params->do_early_stopping = early_stopping;
-params->model_scratch_enlarge_scale = model_scratch_enlarge_scale;
+params->model_scratch_size_ratio = model_scratch_size_ratio;

printf(
"beam_size: %d, do_sample: %d, top_k: %d, top_p: %f, continuous_batching: %d, max_request_num: %d, "
@@ -142,7 +142,7 @@ class ModelServer {
int top_k, float top_p, float temperature, int min_new_tokens, float length_penalty, bool early_stopping,
int n_keep, int n_discard, bool shift_roped_k, int batch_size, model_vocab::id pad_token,
const std::string& memory_dtype, bool continuous_batching, const int& max_request_num,
-const float& model_scratch_enlarge_scale, const std::string& policy, bool print_log,
+const float& model_scratch_size_ratio, const std::string& policy, bool print_log,
const std::function<void()>& init_cb)
: response(response),
waiting(),
@@ -161,7 +161,7 @@ class ModelServer {
this->InitServerParams(model_path, max_new_tokens, n_batch, ctx_size, seed, threads, repetition_penalty,
num_beams, do_sample, top_k, top_p, temperature, min_new_tokens, length_penalty,
early_stopping, n_keep, n_discard, shift_roped_k, batch_size, pad_token, memory_dtype,
-true, max_request_num, model_scratch_enlarge_scale);
+true, max_request_num, model_scratch_size_ratio);
Cont_batch_gen_scheduler scheduler(this->params, policy, print_log ? 0 : 1);
std::vector<sequence> added_seqs;
while (running) {
@@ -263,11 +263,11 @@ class ModelServer {
float temperature, int min_new_tokens, float length_penalty, bool early_stopping, int n_keep,
int n_discard, bool shift_roped_k, int batch_size, model_vocab::id pad_token,
const std::string& memory_dtype, bool continuous_batching, const int& max_request_num,
-const float& model_scratch_enlarge_scale) {
+const float& model_scratch_size_ratio) {
init_gpt_params(&params, model_path, max_new_tokens, n_batch, ctx_size, seed, threads, repetition_penalty,
num_beams, do_sample, top_k, top_p, temperature, min_new_tokens, length_penalty, early_stopping,
n_keep, n_discard, shift_roped_k, batch_size, pad_token, memory_dtype, continuous_batching,
-max_request_num, model_scratch_enlarge_scale);
+max_request_num, model_scratch_size_ratio);
if (cont_batching_model_archs.count(params.model_arch) == 0) {
fprintf(stderr, "\nERROR: ModelServer only supports gpt-j, llama!\n");
running = false;
@@ -325,7 +325,7 @@ class Model {
float repetition_penalty, int num_beams, bool do_sample, int top_k, float top_p, float temperature,
int min_new_tokens, float length_penalty, bool early_stopping, int n_keep, int n_discard,
bool shift_roped_k, int batch_size, model_vocab::id pad_token, const std::string& memory_dtype,
-bool continuous_batching, const int& max_request_num, const float& model_scratch_enlarge_scale);
+bool continuous_batching, const int& max_request_num, const float& model_scratch_size_ratio);
void reinit();
std::vector<std::vector<model_token>> generate(const std::vector<std::vector<model_token>>& input_ids);
// deprecated API
@@ -419,11 +419,11 @@ void Model::init_model(const std::string& model_path, int max_new_tokens, int n_
float temperature, int min_new_tokens, float length_penalty, bool early_stopping, int n_keep,
int n_discard, bool shift_roped_k, int batch_size, model_vocab::id pad_token,
const std::string& memory_dtype, bool continuous_batching, const int& max_request_num,
-const float& model_scratch_enlarge_scale) {
+const float& model_scratch_size_ratio) {
init_gpt_params(&params, model_path, max_new_tokens, n_batch, ctx_size, seed, threads, repetition_penalty, num_beams,
do_sample, top_k, top_p, temperature, min_new_tokens, length_penalty, early_stopping, n_keep,
n_discard, shift_roped_k, batch_size, pad_token, memory_dtype, continuous_batching, max_request_num,
-model_scratch_enlarge_scale);
+model_scratch_size_ratio);

n_past = 0;
n_total = 0;
@@ -922,7 +922,7 @@ PYBIND11_MODULE(mixtral_cpp, m)
py::arg("n_keep") = 0, py::arg("n_discard") = -1, py::arg("shift_roped_k") = false,
py::arg("batch_size") = 1, py::arg("pad_token") = -1, py::arg("memory_dtype") = "auto",
py::arg("continuous_batching") = true, py::arg("max_request_num") = MODEL_MAX_REQUEST_NUM,
py::arg("model_scratch_enlarge_scale") = 1.0f)
py::arg("model_scratch_size_ratio") = 1.0f)
.def("generate", &Model::generate, "Generate token with input ids", py::arg("input_ids"))
.def("evaluate", &Model::evaluate, "Evaluate token with input ids and output logits",
py::arg("input_ids") = std::vector<std::vector<model_token>>{}, py::arg("logits_all") = false)
@@ -962,7 +962,7 @@ PYBIND11_MODULE(mixtral_cpp, m)
py::arg("length_penalty") = 1.0, py::arg("early_stopping") = false, py::arg("n_keep") = 0,
py::arg("n_discard") = -1, py::arg("shift_roped_k") = false, py::arg("batch_size") = 1,
py::arg("pad_token") = -1, py::arg("memory_dtype") = "auto", py::arg("continuous_batching") = true,
py::arg("max_request_num") = MODEL_MAX_REQUEST_NUM, py::arg("model_scratch_enlarge_scale") = 1.0f,
py::arg("max_request_num") = MODEL_MAX_REQUEST_NUM, py::arg("model_scratch_size_ratio") = 1.0f,
py::arg("policy") = "fcfs", py::arg("print_log") = false,
py::arg("init_cb") = std::function<void()>{[]() {}})
.def("issueQuery", &ModelServer::issueQuery, "desc placeholder", py::arg("qs"))
2 changes: 1 addition & 1 deletion neural_speed/models/baichuan/baichuan.h
@@ -23,7 +23,7 @@ enum baichuan_model {
BAICHUAN_13B,
};

-static const model_scratch baichuan_mem_req(int n_layers, float enlarge_scale = 1.0f) {
+static const model_scratch baichuan_mem_req(int n_layers, float scratch_size_ratio = 1.0f) {
switch (n_layers) {
case 40:
return {8192ull * MB, 8192ull * MB, 8192ull * MB};
2 changes: 1 addition & 1 deletion neural_speed/models/baichuan/baichuan_utils.cpp
@@ -75,7 +75,7 @@ void BAICHUAN::init(const char* path_model, model_context* ctx, int n_gpu_layer_
n_embd = hparams.n_embd;
n_vocab = hparams.n_vocab;
n_layer = hparams.n_layer;
-scratch = baichuan_mem_req(n_layer, lctx.model_scratch_enlarge_scale);
+scratch = baichuan_mem_req(n_layer, lctx.model_scratch_size_ratio);
model.scratchs = scratch;
}

2 changes: 1 addition & 1 deletion neural_speed/models/bloom/bloom.h
@@ -23,7 +23,7 @@ enum bloom_model {
BLOOM_7B,
};

-static const model_scratch bloom_mem_req(int n_layers, float enlarge_scale = 1.0f) {
+static const model_scratch bloom_mem_req(int n_layers, float scratch_size_ratio = 1.0f) {
switch (n_layers) {
case 30:
return {4 * 2048ull * MB, 4 * 2048ull * MB, 4 * 4096ull * MB};
2 changes: 1 addition & 1 deletion neural_speed/models/bloom/bloom_utils.cpp
@@ -73,7 +73,7 @@ void BLOOM::init(const char* path_model, model_context* ctx, int n_gpu_layer_, b
n_embd = hparams.n_embd;
n_vocab = hparams.n_vocab;
n_layer = hparams.n_layer;
-scratch = bloom_mem_req(n_layer, lctx.model_scratch_enlarge_scale);
+scratch = bloom_mem_req(n_layer, lctx.model_scratch_size_ratio);
model.scratchs = scratch;
}

2 changes: 1 addition & 1 deletion neural_speed/models/chatglm/chatglm.h
@@ -23,7 +23,7 @@ enum chatglm_model {
CHATGLM_6B,
};

-static const model_scratch chatglm_mem_req(int n_layers, float enlarge_scale = 1.0f) {
+static const model_scratch chatglm_mem_req(int n_layers, float scratch_size_ratio = 1.0f) {
switch (n_layers) {
case 28:
return {2048ull * MB, 2048ull * MB, 4096ull * MB};
2 changes: 1 addition & 1 deletion neural_speed/models/chatglm/chatglm2.h
@@ -23,7 +23,7 @@ enum chatglm2_model {
CHATGLM2_6B,
};

-static const model_scratch chatglm_mem_req(int n_layers, float enlarge_scale = 1.0f) {
+static const model_scratch chatglm_mem_req(int n_layers, float scratch_size_ratio = 1.0f) {
switch (n_layers) {
case 28:
return {4096ull * MB, 4096ull * MB, 8192ull * MB};
2 changes: 1 addition & 1 deletion neural_speed/models/chatglm/chatglm2_utils.cpp
@@ -78,7 +78,7 @@ void CHATGLM2::init(const char* path_model, model_context* ctx, int n_gpu_layer_
n_embd = hparams.n_embd;
n_vocab = hparams.n_vocab;
n_layer = hparams.n_layer;
-scratch = chatglm_mem_req(n_layer, lctx.model_scratch_enlarge_scale);
+scratch = chatglm_mem_req(n_layer, lctx.model_scratch_size_ratio);
model.scratchs = scratch;
}

2 changes: 1 addition & 1 deletion neural_speed/models/chatglm/chatglm_utils.cpp
@@ -72,7 +72,7 @@ void CHATGLM::init(const char* path_model, model_context* ctx, int n_gpu_layer_,
n_embd = hparams.n_embd;
n_vocab = hparams.n_vocab;
n_layer = hparams.n_layer;
-scratch = chatglm_mem_req(n_layer, lctx.model_scratch_enlarge_scale);
+scratch = chatglm_mem_req(n_layer, lctx.model_scratch_size_ratio);
model.scratchs = scratch;
}

2 changes: 1 addition & 1 deletion neural_speed/models/falcon/falcon.h
@@ -23,7 +23,7 @@ enum falcon_model {
FALCON_7B,
};

-static const model_scratch falcon_mem_req(int n_layers, float enlarge_scale = 1.0f) {
+static const model_scratch falcon_mem_req(int n_layers, float scratch_size_ratio = 1.0f) {
switch (n_layers) {
case 32:
return {2048ull * MB, 2048ull * MB, 4096ull * MB};
2 changes: 1 addition & 1 deletion neural_speed/models/falcon/falcon_utils.cpp
@@ -77,7 +77,7 @@ void FALCON::init(const char* path_model, model_context* ctx, int n_gpu_layer_,
n_vocab = hparams.n_vocab;
n_layer = hparams.n_layer;
n_head_kv = hparams.n_head_kv;
-scratch = falcon_mem_req(n_layer, lctx.model_scratch_enlarge_scale);
+scratch = falcon_mem_req(n_layer, lctx.model_scratch_size_ratio);
model.scratchs = scratch;
}

8 changes: 4 additions & 4 deletions neural_speed/models/gptj/gptj.h
@@ -26,14 +26,14 @@ enum gptj_model {
GPTJ_65B,
};

-static const model_scratch gptj_mem_req(int n_layers, float enlarge_scale = 1.0f) {
+static const model_scratch gptj_mem_req(int n_layers, float scratch_size_ratio = 1.0f) {
switch (n_layers) {
case 28:
// should be enough for batch=8 * beam=4
return {
-static_cast<unsigned long long>(enlarge_scale * 3072) * MB,
-static_cast<unsigned long long>(enlarge_scale * 2048) * MB,
-static_cast<unsigned long long>(enlarge_scale * 3072) * MB,
+static_cast<unsigned long long>(scratch_size_ratio * 3072) * MB,
+static_cast<unsigned long long>(scratch_size_ratio * 2048) * MB,
+static_cast<unsigned long long>(scratch_size_ratio * 3072) * MB,
};
default:
MODEL_ASSERT(false);
2 changes: 1 addition & 1 deletion neural_speed/models/gptj/gptj_utils.cpp
@@ -75,7 +75,7 @@ void GPTJ::init(const char* path_model, model_context* ctx, int n_gpu_layer_, bo
n_embd = hparams.n_embd;
n_vocab = hparams.n_vocab;
n_layer = hparams.n_layer;
-scratch = gptj_mem_req(n_layer, lctx.model_scratch_enlarge_scale);
+scratch = gptj_mem_req(n_layer, lctx.model_scratch_size_ratio);
model.scratchs = scratch;
}

2 changes: 1 addition & 1 deletion neural_speed/models/gptneox/gptneox.h
@@ -23,7 +23,7 @@ enum gptneox_model {
GPTNEOX_7B,
};

-static const model_scratch gptneox_mem_req(int n_layers, float enlarge_scale = 1.0f) {
+static const model_scratch gptneox_mem_req(int n_layers, float scratch_size_ratio = 1.0f) {
switch (n_layers) {
case 44:
return {2048ull * MB, 2048ull * MB, 4096ull * MB};
2 changes: 1 addition & 1 deletion neural_speed/models/gptneox/gptneox_utils.cpp
@@ -74,7 +74,7 @@ void GPTNEOX::init(const char* path_model, model_context* ctx, int n_gpu_layer_,
n_embd = hparams.n_embd;
n_vocab = hparams.n_vocab;
n_layer = hparams.n_layer;
-scratch = gptneox_mem_req(n_layer, lctx.model_scratch_enlarge_scale);
+scratch = gptneox_mem_req(n_layer, lctx.model_scratch_size_ratio);
model.scratchs = scratch;
}

32 changes: 16 additions & 16 deletions neural_speed/models/llama/llama.h
@@ -26,37 +26,37 @@ enum llama_model {
LLAMA_65B,
};

-static const model_scratch llama_mem_req(int n_layers, float enlarge_scale = 1.0f) {
+static const model_scratch llama_mem_req(int n_layers, float scratch_size_ratio = 1.0f) {
switch (n_layers) {
case 32:
return {
-static_cast<unsigned long long>(enlarge_scale * 1024) * MB,
-static_cast<unsigned long long>(enlarge_scale * 1024) * MB,
-static_cast<unsigned long long>(enlarge_scale * 1608) * MB,
+static_cast<unsigned long long>(scratch_size_ratio * 1024) * MB,
+static_cast<unsigned long long>(scratch_size_ratio * 1024) * MB,
+static_cast<unsigned long long>(scratch_size_ratio * 1608) * MB,
};
case 40:
return {
-static_cast<unsigned long long>(enlarge_scale * 512) * MB,
-static_cast<unsigned long long>(enlarge_scale * 512) * MB,
-static_cast<unsigned long long>(enlarge_scale * 1608) * MB,
+static_cast<unsigned long long>(scratch_size_ratio * 512) * MB,
+static_cast<unsigned long long>(scratch_size_ratio * 512) * MB,
+static_cast<unsigned long long>(scratch_size_ratio * 1608) * MB,
};
case 48:
return {
-static_cast<unsigned long long>(enlarge_scale * 512) * MB,
-static_cast<unsigned long long>(enlarge_scale * 512) * MB,
-static_cast<unsigned long long>(enlarge_scale * 2366) * MB,
+static_cast<unsigned long long>(scratch_size_ratio * 512) * MB,
+static_cast<unsigned long long>(scratch_size_ratio * 512) * MB,
+static_cast<unsigned long long>(scratch_size_ratio * 2366) * MB,
};
case 60:
return {
-static_cast<unsigned long long>(enlarge_scale * 512) * MB,
-static_cast<unsigned long long>(enlarge_scale * 512) * MB,
-static_cast<unsigned long long>(enlarge_scale * 3124) * MB,
+static_cast<unsigned long long>(scratch_size_ratio * 512) * MB,
+static_cast<unsigned long long>(scratch_size_ratio * 512) * MB,
+static_cast<unsigned long long>(scratch_size_ratio * 3124) * MB,
};
case 80:
return {
-static_cast<unsigned long long>(enlarge_scale * 2048) * MB,
-static_cast<unsigned long long>(enlarge_scale * 2048) * MB,
-static_cast<unsigned long long>(enlarge_scale * 10240) * MB,
+static_cast<unsigned long long>(scratch_size_ratio * 2048) * MB,
+static_cast<unsigned long long>(scratch_size_ratio * 2048) * MB,
+static_cast<unsigned long long>(scratch_size_ratio * 10240) * MB,
};
default:
MODEL_ASSERT(false);
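A quick worked example of the renamed parameter (not part of the commit, and assuming it is compiled alongside `llama.h`): for the 32-layer case above, a ratio of 2.0 doubles each request, so the 1024/1024/1608 MB buffers become 2048/2048/3216 MB.

```cpp
// Illustration only: the 32-layer LLaMA entry above, scaled by 2.0f.
const model_scratch s = llama_mem_req(/*n_layers=*/32, /*scratch_size_ratio=*/2.0f);
// s holds {2048ull * MB, 2048ull * MB, 3216ull * MB}
```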
2 changes: 1 addition & 1 deletion neural_speed/models/llama/llama_utils.cpp
@@ -81,7 +81,7 @@ void Llama::init(const char* path_model, model_context* ctx, int n_gpu_layer_, b
n_head = hparams.n_head;
n_expert = hparams.n_experts;
n_expert_used = hparams.n_experts_used;
-scratch = llama_mem_req(n_layer, lctx.model_scratch_enlarge_scale);
+scratch = llama_mem_req(n_layer, lctx.model_scratch_size_ratio);
model.scratchs = scratch;
}

2 changes: 1 addition & 1 deletion neural_speed/models/model_utils/model_config.h
@@ -103,7 +103,7 @@ struct gpt_params {
float length_penalty = 1.0f; // exponential penalty to the length in beam search generation
bool do_early_stopping = false; // early stopping in beam search generation

-float model_scratch_enlarge_scale = 1.0f; // model memory scratch enlarge scale
+float model_scratch_size_ratio = 1.0f; // model memory scratch enlarge scale
};

bool gpt_params_parse(int argc, char** argv, gpt_params& params);
4 changes: 2 additions & 2 deletions neural_speed/models/model_utils/model_types.h
@@ -320,7 +320,7 @@ struct model_context {

size_t mem_per_token = 0;

-float model_scratch_enlarge_scale = 1.0f; // model memory scratch enlarge scale
+float model_scratch_size_ratio = 1.0f; // model memory scratch enlarge scale

// decode output (3-dimensional array: [batch_size] [n_tokens] [n_vocab])
std::vector<float> logits;
@@ -441,7 +441,7 @@ struct model_context_params {
// global generation config
generation_config gen_conf;
// model memory scratch enlarge scale
-float model_scratch_enlarge_scale;
+float model_scratch_size_ratio;

// called with a progress value between 0 and 1, pass nullptr to disable
model_progress_callback progress_callback;
6 changes: 3 additions & 3 deletions neural_speed/models/model_utils/model_utils.cpp
@@ -188,7 +188,7 @@ struct model_context_params model_context_default_params() {
/*cont_batching =*/true,
/*.max_request_num =*/1,
/*.gen_conf =*/generation_config(),
-/*model_scratch_enlarge_scale =*/1.0f,
+/*model_scratch_size_ratio =*/1.0f,
/*.progress_callback =*/nullptr,
/*.progress_callback_user_data =*/nullptr,
};
@@ -911,7 +911,7 @@ struct model_context* model_init_from_file(const char* path_model, struct model_
}
ctx->cont_batching = params.cont_batching;
ctx->generation_conf = params.gen_conf;
-ctx->model_scratch_enlarge_scale = params.model_scratch_enlarge_scale;
+ctx->model_scratch_size_ratio = params.model_scratch_size_ratio;
const model_archs arch = params.arch;

// the type so that kv-cache allocated according to this type must be large enough
@@ -1284,7 +1284,7 @@ struct model_context* model_init_from_gpt_params(const gpt_params& params) {
lparams.gen_conf.min_new_tokens = params.min_new_tokens;
lparams.gen_conf.length_penalty = params.length_penalty;
lparams.gen_conf.do_early_stopping = params.do_early_stopping;
-lparams.model_scratch_enlarge_scale = params.model_scratch_enlarge_scale;
+lparams.model_scratch_size_ratio = params.model_scratch_size_ratio;

NE_ASSERT(("Start size cannot be greater than the maximum context size!", lparams.n_keep < lparams.n_ctx));

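End to end, the renamed field travels from the context parameters into the per-architecture `*_mem_req` helpers shown above. A minimal sketch, not code from the repository: `"model.bin"` is a placeholder path, everything else keeps the defaults from `model_context_default_params()`, and it assumes `model_init_from_file` takes the params struct as its second argument (the hunk above truncates that signature).

```cpp
// Sketch under assumptions: "model.bin" is a hypothetical model file.
model_context_params cparams = model_context_default_params();
cparams.model_scratch_size_ratio = 2.0f;  // request 2x larger scratch buffers

// model_init_from_file copies the field onto the context
// (ctx->model_scratch_size_ratio = params.model_scratch_size_ratio), and each
// architecture's init() then calls its *_mem_req(n_layer, lctx.model_scratch_size_ratio).
model_context* ctx = model_init_from_file("model.bin", cparams);
```

The same route exists for `gpt_params`: `model_init_from_gpt_params` copies `params.model_scratch_size_ratio` into `lparams.model_scratch_size_ratio`, as the last hunk above shows.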
