using abort_callback from ggml to stop llama computation
Xarbirus committed Feb 8, 2024
1 parent aa7ab99 commit 0867742
Showing 5 changed files with 52 additions and 12 deletions.
ggml-backend.c: 23 additions & 5 deletions
@@ -653,6 +653,9 @@ struct ggml_backend_cpu_context {
int n_threads;
void * work_data;
size_t work_size;

ggml_abort_callback abort_callback;
void * abort_callback_data;
};

GGML_CALL static const char * ggml_backend_cpu_name(ggml_backend_t backend) {
@@ -691,6 +694,9 @@ GGML_CALL static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(gg
cpu_plan->cplan.work_data = malloc(cpu_plan->cplan.work_size);
}

cpu_plan->cplan.abort_callback = cpu_ctx->abort_callback;
cpu_plan->cplan.abort_callback_data = cpu_ctx->abort_callback_data;

return cpu_plan;
}

@@ -721,9 +727,11 @@ GGML_CALL static bool ggml_backend_cpu_graph_compute(ggml_backend_t backend, str
cpu_ctx->work_data = realloc(cpu_ctx->work_data, cplan.work_size);
cpu_ctx->work_size = cplan.work_size;
}

cplan.work_data = cpu_ctx->work_data;

cplan.abort_callback = cpu_ctx->abort_callback;
cplan.abort_callback_data = cpu_ctx->abort_callback_data;

ggml_graph_compute(cgraph, &cplan);
return true;
}
@@ -759,9 +767,11 @@ static struct ggml_backend_i cpu_backend_i = {
ggml_backend_t ggml_backend_cpu_init(void) {
struct ggml_backend_cpu_context * ctx = malloc(sizeof(struct ggml_backend_cpu_context));

ctx->n_threads = GGML_DEFAULT_N_THREADS;
ctx->work_data = NULL;
ctx->work_size = 0;
ctx->n_threads = GGML_DEFAULT_N_THREADS;
ctx->work_data = NULL;
ctx->work_size = 0;
ctx->abort_callback = NULL;
ctx->abort_callback_data = NULL;

ggml_backend_t cpu_backend = malloc(sizeof(struct ggml_backend));

@@ -776,13 +786,21 @@ GGML_CALL bool ggml_backend_is_cpu(ggml_backend_t backend) {
return backend && backend->iface.get_name == ggml_backend_cpu_name;
}

void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) {
GGML_CALL void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) {
GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));

struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context;
ctx->n_threads = n_threads;
}

GGML_CALL void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data) {
GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));

struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context;
ctx->abort_callback = abort_callback;
ctx->abort_callback_data = abort_callback_data;
}

GGML_CALL ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) {
return ggml_backend_buffer_init(ggml_backend_cpu_buffer_type(), cpu_backend_buffer_i_from_ptr, ptr, size);
}
ggml-backend.h: 3 additions & 3 deletions
@@ -80,11 +80,11 @@ extern "C" {
//
// CPU backend
//

GGML_API ggml_backend_t ggml_backend_cpu_init(void);

GGML_API GGML_CALL bool ggml_backend_is_cpu (ggml_backend_t backend);
GGML_API void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads);
GGML_API GGML_CALL bool ggml_backend_is_cpu (ggml_backend_t backend);
GGML_API void ggml_backend_cpu_set_n_threads (ggml_backend_t backend_cpu, int n_threads);
GGML_API void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data);

// Create a backend buffer from an existing pointer
GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size);
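
For context, here is a minimal sketch (not part of this commit) of how the two new CPU-backend calls above might be wired together; the stop flag and its handling are illustrative assumptions:

#include <stdbool.h>
#include "ggml-backend.h"

// returning true tells the CPU backend to abort the graph it is currently computing
static bool check_stop(void * data) {
    return *(bool *) data;
}

int main(void) {
    static bool stop = false; // illustrative; a real program would use an atomic or sig_atomic_t

    ggml_backend_t cpu = ggml_backend_cpu_init();
    ggml_backend_cpu_set_n_threads(cpu, 4);
    ggml_backend_cpu_set_abort_callback(cpu, check_stop, &stop);

    // ... graphs computed with this backend now consult check_stop() while running,
    //     e.g. via ggml_backend_graph_compute(cpu, graph); flip `stop` to abort ...

    ggml_backend_free(cpu);
    return 0;
}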
ggml.h: 4 additions & 2 deletions
@@ -567,6 +567,8 @@ extern "C" {

static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);

typedef bool (*ggml_abort_callback)(void * data);

// the compute plan that needs to be prepared for ggml_graph_compute()
// since https://github.com/ggerganov/ggml/issues/287
struct ggml_cplan {
@@ -576,8 +578,8 @@ extern "C" {
int n_threads;

// abort ggml_graph_compute when true
bool (*abort_callback)(void * data);
void * abort_callback_data;
ggml_abort_callback abort_callback;
void * abort_callback_data;
};

enum ggml_cgraph_eval_order {
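
For context, a minimal sketch (not part of this commit) of using the new ggml_abort_callback typedef with a cplan directly; the deadline logic, the graph `gf`, and the work-buffer handling are illustrative assumptions:

#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>
#include "ggml.h"

// true => abort ggml_graph_compute; data points at the deadline in microseconds
static bool deadline_reached(void * data) {
    const int64_t * t_end_us = (const int64_t *) data;
    return ggml_time_us() > *t_end_us;
}

// assumes `gf` was built elsewhere; work-buffer handling is kept to the bare minimum
void compute_with_deadline(struct ggml_cgraph * gf, int n_threads, int64_t t_end_us) {
    struct ggml_cplan cplan = ggml_graph_plan(gf, n_threads);

    uint8_t * work = NULL;
    if (cplan.work_size > 0) {
        work = malloc(cplan.work_size);
        cplan.work_data = work;
    }

    cplan.abort_callback      = deadline_reached;
    cplan.abort_callback_data = &t_end_us;

    ggml_graph_compute(gf, &cplan);
    free(work);
}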
llama.cpp: 16 additions & 2 deletions
@@ -1842,6 +1842,9 @@ struct llama_context {
// allocator for the input tensors
ggml_tallocr * alloc = nullptr;

ggml_abort_callback abort_callback = nullptr;
void * abort_callback_data = nullptr;

// input tensors
ggml_backend_buffer_t buf_input = nullptr;
ggml_context * ctx_input = nullptr;
@@ -7300,6 +7303,7 @@ static int llama_decode_internal(

if (lctx.backend_cpu != nullptr) {
ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads);
ggml_backend_cpu_set_abort_callback(lctx.backend_cpu, lctx.abort_callback, lctx.abort_callback_data);
}
ggml_backend_sched_graph_compute(lctx.sched, gf);

@@ -10482,6 +10486,8 @@ struct llama_context_params llama_context_default_params() {
/*.logits_all =*/ false,
/*.embedding =*/ false,
/*.offload_kqv =*/ true,
/*.abort_callback =*/ nullptr,
/*.abort_callback_data =*/ nullptr,
};

return result;
@@ -10670,8 +10676,11 @@ struct llama_context * llama_new_context_with_model(
LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, cparams.rope_freq_base);
LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, cparams.rope_freq_scale);

ctx->rng = std::mt19937(params.seed);
ctx->logits_all = params.logits_all;
ctx->abort_callback = params.abort_callback;
ctx->abort_callback_data = params.abort_callback_data;

ctx->rng = std::mt19937(params.seed);
ctx->logits_all = params.logits_all;

const ggml_type type_k = params.type_k;
const ggml_type type_v = params.type_v;
@@ -11575,6 +11584,11 @@ void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_
ctx->cparams.n_threads_batch = n_threads_batch;
}

void llama_set_abort_callback(struct llama_context * ctx, bool (*abort_callback)(void * data), void * abort_callback_data) {
ctx->abort_callback = abort_callback;
ctx->abort_callback_data = abort_callback_data;
}

struct llama_batch llama_batch_get_one(
llama_token * tokens,
int32_t n_tokens,
llama.h: 6 additions & 0 deletions
@@ -235,6 +235,9 @@ extern "C" {
bool logits_all; // the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
bool embedding; // embedding mode only
bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU

ggml_abort_callback abort_callback;
void * abort_callback_data;
};

// model quantization parameters
@@ -612,6 +615,9 @@ extern "C" {
// n_threads_batch is the number of threads used for prompt and batch processing (multiple tokens)
LLAMA_API void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch);

// Set abort callback
LLAMA_API void llama_set_abort_callback(struct llama_context * ctx, ggml_abort_callback abort_callback, void * abort_callback_data);

// Token logits obtained from the last call to llama_eval()
// The logits for the last token are stored in the last row
// Logits for which llama_batch.logits[i] == 0 are undefined
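
For context, a minimal sketch (not part of this commit) of stopping a decode from application code with the new llama_set_abort_callback; the signal handling and batch setup are illustrative assumptions:

#include <signal.h>
#include <stdbool.h>
#include "llama.h"

static volatile sig_atomic_t g_interrupted = 0;

static void on_sigint(int sig) { (void) sig; g_interrupted = 1; }

// true => stop the computation currently running inside llama_decode
static bool should_abort(void * data) {
    (void) data; // abort_callback_data, unused here
    return g_interrupted != 0;
}

void decode_interruptible(struct llama_context * ctx, llama_token * tokens, int32_t n_tokens) {
    signal(SIGINT, on_sigint);

    // the callback can also be supplied up front via
    // llama_context_params.abort_callback / .abort_callback_data
    llama_set_abort_callback(ctx, should_abort, NULL);

    struct llama_batch batch = llama_batch_get_one(tokens, n_tokens, 0, 0);
    llama_decode(ctx, batch); // stops early if should_abort() reports an interrupt
}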
