Skip to content

Commit

Permalink
bench : add memcpy and ggml_mul_mat benchmarks
Browse files Browse the repository at this point in the history
  • Loading branch information
ggerganov committed Jan 18, 2023
1 parent 49b529b commit 1290fc6
Show file tree
Hide file tree
Showing 4 changed files with 182 additions and 14 deletions.
4 changes: 2 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -133,8 +133,8 @@ ifdef WHISPER_OPENBLAS
LDFLAGS += -lopenblas
endif
ifdef WHISPER_GPROF
CFLAGS += -pg
CXXFLAGS += -pg
CFLAGS += -pg
CXXFLAGS += -pg
endif
ifneq ($(filter aarch64%,$(UNAME_M)),)
endif
Expand Down
164 changes: 157 additions & 7 deletions examples/bench/bench.cpp
Original file line number Diff line number Diff line change
@@ -1,12 +1,16 @@
#include "ggml.h"
#include "whisper.h"

#include <algorithm>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <string>
#include <thread>
#include <vector>

// command-line parameters
struct whisper_params {
    // number of worker threads: the hardware concurrency capped at 4.
    // std::thread::hardware_concurrency() may return 0 when the value cannot be
    // determined, so clamp to at least 1 to never default to zero threads.
    int32_t n_threads = std::max(1, std::min(4, (int32_t) std::thread::hardware_concurrency()));
    int32_t what      = 0; // what to benchmark: 0 - whisper encoder, 1 - memcpy, 2 - ggml_mul_mat

    // path to the model file (default value used when -m is not given)
    std::string model = "models/ggml-base.en.bin";
};
Expand All @@ -23,6 +27,7 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
}
else if (arg == "-t" || arg == "--threads") { params.n_threads = std::stoi(argv[++i]); }
else if (arg == "-m" || arg == "--model") { params.model = argv[++i]; }
else if (arg == "-w" || arg == "--what") { params.what = atoi(argv[++i]); }
else {
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
whisper_print_usage(argc, argv, params);
Expand All @@ -41,16 +46,14 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
fprintf(stderr, " -h, --help [default] show this help message and exit\n");
fprintf(stderr, " -t N, --threads N [%-7d] number of threads to use during computation\n", params.n_threads);
fprintf(stderr, " -m FNAME, --model FNAME [%-7s] model path\n", params.model.c_str());
fprintf(stderr, " -w N, --what N [%-7d] what to benchmark:\n", params.what);
fprintf(stderr, " %-7s 0 - whisper encoder\n", "");
fprintf(stderr, " %-7s 1 - memcpy\n", "");
fprintf(stderr, " %-7s 2 - ggml_mul_mat\n", "");
fprintf(stderr, "\n");
}

int main(int argc, char ** argv) {
whisper_params params;

if (whisper_params_parse(argc, argv, params) == false) {
return 1;
}

int bench_whisper_encoder(const whisper_params & params) {
// whisper init

struct whisper_context * ctx = whisper_init_from_file(params.model.c_str());
Expand Down Expand Up @@ -92,3 +95,150 @@ int main(int argc, char ** argv) {

return 0;
}

// Benchmark memcpy throughput by repeatedly copying a 1 GB buffer.
//
// Prints the measured bandwidth in GB/s and a verification line ("sum: ok" or
// "sum: error") to stderr.
//
// Returns 0 when the benchmark ran, 1 when the buffers could not be allocated.
int bench_memcpy(const whisper_params & params) {
    const size_t n = 50; // number of timed copies

    size_t arr = params.what > 0 ? 1024 : params.what; // trick to avoid compiler optimizations

    // 1 GB array
    const size_t size = arr*1024llu*1024llu;

    char * src = (char *) malloc(size);
    char * dst = (char *) malloc(size);

    // two 1 GB allocations can easily fail - do not dereference a null pointer
    if (src == nullptr || dst == nullptr) {
        fprintf(stderr, "error: failed to allocate 2 x %zu bytes for the memcpy benchmark\n", size);

        free(src); // free(NULL) is a no-op
        free(dst);

        return 1;
    }

    for (size_t i = 0; i < size; i++) src[i] = (char) i;

    memcpy(dst, src, size); // heat-up

    double tsum = 0.0;

    // value of src[0] at the time of the most recent copy
    char copied_first = src[0];

    for (size_t i = 0; i < n; i++) {
        copied_first = src[0];

        const int64_t t0 = ggml_time_us();

        memcpy(dst, src, size);

        const int64_t t1 = ggml_time_us();

        tsum += (t1 - t0)*1e-6;

        // perturb the source so that consecutive copies are not identical
        src[0] = rand();
    }

    fprintf(stderr, "memcpy: %.2f GB/s\n", (double) (n*size)/(tsum*1024llu*1024llu*1024llu));

    // needed to prevent the compiler from optimizing the memcpy away
    //
    // the expected checksum is derived from the source buffer instead of a
    // hard-coded constant, so the check does not depend on the signedness of
    // `char` or on the platform's rand() sequence
    {
        double sum_dst = 0.0;
        double sum_src = 0.0;

        for (size_t i = 0; i < size; i++) {
            sum_dst += dst[i];
            sum_src += src[i];
        }

        // src[0] was overwritten after the last copy - account for the value that was actually copied
        sum_src += (double) copied_first - (double) src[0];

        // both sums are small integers accumulated in a double, so they are exact and == is safe here
        fprintf(stderr, "sum: %s\n", sum_dst == sum_src ? "ok" : "error");
    }

    free(src);
    free(dst);

    return 0;
}

int bench_ggml_mul_mat(const whisper_params & params) {
const int n_max = 128;

const std::vector<size_t> sizes = {
64, 128, 256, 512, 1024, 2048, 4096,
};

const size_t N_max = sizes.back();

// a: N*N*sizeof(float)
// b: N*N*sizeof(float)
// c: N*N*sizeof(float)
// when F16 is used, there is an extra work buffer of size N*N*sizeof(float)
std::vector<char> buf(4llu*N_max*N_max*sizeof(float) + 4*256);

for (size_t i = 0; i < buf.size(); i++) buf[i] = i;

for (int j = 0; j < (int) sizes.size(); j++) {
int n_fp16 = 0;
int n_fp32 = 0;

// GFLOPS/s
double s_fp16 = 0.0;
double s_fp32 = 0.0;

const size_t N = sizes[j];

for (int k = 0; k < 2; ++k) {
const ggml_type wtype = k == 0 ? GGML_TYPE_F16 : GGML_TYPE_F32;

double & s = k == 0 ? s_fp16 : s_fp32;
int & n = k == 0 ? n_fp16 : n_fp32;

struct ggml_init_params gparams = {
/*.mem_size =*/ buf.size(),
/*.mem_buffer =*/ buf.data(),
};

struct ggml_context * ctx0 = ggml_init(gparams);

struct ggml_tensor * a = ggml_new_tensor_2d(ctx0, wtype, N, N);
struct ggml_tensor * b = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, N, N);

struct ggml_tensor * c = ggml_mul_mat(ctx0, a, b);

struct ggml_cgraph gf = ggml_build_forward(c);

gf.n_threads = params.n_threads;

double tsum = 0.0;

// heat-up
ggml_graph_compute(ctx0, &gf);

for (int i = 0; i < n_max; ++i) {
const int64_t t0 = ggml_time_us();

ggml_graph_compute(ctx0, &gf);

const int64_t t1 = ggml_time_us();

tsum += (t1 - t0)*1e-6;
n++;

if (tsum > 1.0 && n >= 3) {
break;
}
}

ggml_free(ctx0);

s = ((2.0*N*N*N*n)/tsum)*1e-9;
}

fprintf(stderr, "ggml_mul_mat: %5zu x %5zu: F16 %8.1f GFLOPS (%3d runs) / F32 %8.1f GFLOPS (%3d runs)\n",
N, N, s_fp16, n_fp16, s_fp32, n_fp32);
}

return 0;
}

int main(int argc, char ** argv) {
whisper_params params;

if (whisper_params_parse(argc, argv, params) == false) {
return 1;
}

ggml_time_init();

int ret = -1;

switch (params.what) {
case 0: ret = bench_whisper_encoder(params); break;
case 1: ret = bench_memcpy(params); break;
case 2: ret = bench_ggml_mul_mat(params); break;
default: fprintf(stderr, "error: unknown benchmark: %d\n", params.what); break;
}

return ret;
}
13 changes: 12 additions & 1 deletion extra/bench-all.sh
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,18 @@ fi

models=( "tiny" "base" "small" "medium" "large" )

# memcpy bandwidth benchmark (single-threaded)
printf "\n"
printf "Running memcpy benchmark with 1 thread\n"
printf "\n"

./bench -w 1 -t 1 2>&1

# ggml_mul_mat benchmark using the configured number of threads
printf "\n"
# the thread count must be passed as a printf argument: only the first word is
# the format string, so additional words were previously dropped from the output
printf "Running ggml_mul_mat benchmark with %s threads\n" "$n_threads"
printf "\n"

./bench -w 2 -t $n_threads 2>&1

printf "\n"
printf "Running benchmark for all models\n"
printf "This can take a while!\n"
Expand Down Expand Up @@ -56,4 +68,3 @@ for model in "${models[@]}"; do

printf "| <todo> | <todo> | $config | $model | $n_threads | $load_time | $encode_time | $commit |\n"
done

15 changes: 11 additions & 4 deletions ggml.c
Original file line number Diff line number Diff line change
Expand Up @@ -4373,7 +4373,9 @@ static void ggml_compute_forward_mul_mat_f32(
if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
GGML_ASSERT(nb10 == sizeof(float));

if (params->ith != 0) return;
if (params->ith != 0) {
return;
}

if (params->type == GGML_TASK_INIT) {
return;
Expand Down Expand Up @@ -4616,7 +4618,9 @@ static void ggml_compute_forward_mul_mat_f16_f32(
if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
GGML_ASSERT(nb10 == sizeof(float));

if (params->ith != 0) return;
if (params->ith != 0) {
return;
}

if (params->type == GGML_TASK_INIT) {
return;
Expand Down Expand Up @@ -7054,7 +7058,7 @@ struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cg
#ifdef __APPLE__

//#include <os/lock.h>

//
//typedef os_unfair_lock ggml_lock_t;
//
//#define ggml_lock_init(x) UNUSED(x)
Expand Down Expand Up @@ -7161,6 +7165,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
if (state->params.ith < state->params.nth) {
ggml_compute_forward(&state->params, state->node);
}

state->node = NULL;
} else {
break;
Expand Down Expand Up @@ -7205,6 +7210,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
.node = NULL,
.shared = &state_shared,
};

int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]);
assert(rc == 0);
UNUSED(rc);
Expand Down Expand Up @@ -7273,7 +7279,8 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
node->src1->type == GGML_TYPE_F32) {
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
node->n_tasks = 1;
node->n_tasks = 1; // TODO: this actually is doing nothing
// the threads are still spinning
cur = sizeof(float)*(node->src0->ne[0]*node->src0->ne[1]);
} else {
cur = sizeof(ggml_fp16_t)*ggml_nelements(node->src1);
Expand Down

0 comments on commit 1290fc6

Please sign in to comment.