diff --git a/implementation/matmul-c/Makefile b/implementation/matmul-c/Makefile index 968e612..4f0a759 100644 --- a/implementation/matmul-c/Makefile +++ b/implementation/matmul-c/Makefile @@ -27,6 +27,8 @@ matmul-c: \ src/operations.c \ src/parse_header.c \ src/threadpool.c \ + src/utils.c \ + src/benchmarks.c \ src/tests.c $(CC) $(CFLAGS) -o$@ $^ $(CLIBS) diff --git a/implementation/matmul-c/include/benchmarks.h b/implementation/matmul-c/include/benchmarks.h new file mode 100644 index 0000000..9d1ed87 --- /dev/null +++ b/implementation/matmul-c/include/benchmarks.h @@ -0,0 +1,24 @@ +/* vim: noet:ts=2:sts=2:sw=2 */ + +/* SPDX-License-Identifier: MIT */ +/* Copyright © 2024 David Llewellyn-Jones */ + +#include +#include + +#ifndef __MATRIX_BENCHMARKS_H +#define __MATRIX_BENCHMARKS_H (1) + +typedef struct _Benchmark Benchmark; + +Benchmark * new_benchmark(); +Benchmark * delete_benchmark(Benchmark *benchmark); + +void benchmarks_start(Benchmark *benchmark, uint32_t operations); +void benchmarks_end(Benchmark *benchmark); +void benchmark_set_quiet(Benchmark *benchmark, bool quiet); +void benchmarks_multiply_big(ThreadPool *pool); +void benchmarks_multiply_small(Matrices *a, Matrices *b, Matrices *d); + +#endif /* __MATRIX_BENCHMARKS_H */ + diff --git a/implementation/matmul-c/include/load.h b/implementation/matmul-c/include/load.h index f492b25..0af72a4 100644 --- a/implementation/matmul-c/include/load.h +++ b/implementation/matmul-c/include/load.h @@ -23,4 +23,4 @@ void matrix_npz_load(char *filename, Matrices *matrices); Matrices *new_matrices(uint32_t count); Matrices *delete_matrices(Matrices *matrices); -#endif /* __MATRIX_OPERATIONS_H */ +#endif /* __MATRIX_LOAD_H */ diff --git a/implementation/matmul-c/include/tests.h b/implementation/matmul-c/include/tests.h index 30d723b..6c3c38d 100644 --- a/implementation/matmul-c/include/tests.h +++ b/implementation/matmul-c/include/tests.h @@ -6,6 +6,10 @@ #include #include +#include "matrix.h" +#include "load.h" +#include "threadpool.h" + #include "load.h" #ifndef __MATRIX_TESTS_H @@ -14,5 +18,7 @@ uint32_t tests_load_matrices(Matrices *a, Matrices *b, Matrices *c); bool tests_allocate_results(Matrices *c, Matrices *d); +void tests_compare(Matrices *a, Matrices *b, Matrices *c, Matrices *d, ThreadPool *pool); + #endif /* __MATRIX_TESTS_H */ diff --git a/implementation/matmul-c/include/utils.h b/implementation/matmul-c/include/utils.h new file mode 100644 index 0000000..8517d70 --- /dev/null +++ b/implementation/matmul-c/include/utils.h @@ -0,0 +1,19 @@ +/* vim: noet:ts=2:sts=2:sw=2 */ + +/* SPDX-License-Identifier: MIT */ +/* Copyright © 2024 David Llewellyn-Jones */ + +#ifndef __MATRIX_UTILS_H +#define __MATRIX_UTILS_H (1) + +#include + +typedef struct _Rand Rand; + +Rand * new_rand(); +Rand * delete_rand(Rand *rand); +void rand_seed(Rand *rand, uint32_t seed); +double rand_next(Rand *rand); +double rand_digit(Rand *rand); + +#endif /* __MATRIX_UTILS_H */ diff --git a/implementation/matmul-c/main.c b/implementation/matmul-c/main.c index 44c7ca9..17953cf 100644 --- a/implementation/matmul-c/main.c +++ b/implementation/matmul-c/main.c @@ -10,12 +10,7 @@ #include "load.h" #include "tests.h" #include "threadpool.h" - -#define BENCHMARK_REPEAT_SMALL (32768) -#define BENCHMARK_REPEAT_LARGE (1) - -#define HEIGHT (2048) -#define WIDTH (2048) +#include "benchmarks.h" int main(int argc, char *argv[]) { Matrix *A; @@ -58,93 +53,13 @@ int main(int argc, char *argv[]) { result = tests_allocate_results(c, d); // Perform 512 multiplications and compare against the results from NumPy - printf("Performing unit tests...\n"); - uint32_t passed = 0; - for (uint32_t index = 0; index < total; ++index) { - result = multiply(d->matrices[index].matrix, a->matrices[index].matrix, b->matrices[index].matrix); - result = result && equals(c->matrices[index].matrix, d->matrices[index].matrix); - if (result) { - passed += 1; - } - else { - printf("Incorrect result\n"); - matrix_print(c->matrices[index].matrix); - matrix_print(d->matrices[index].matrix); - } - } - for (uint32_t index = 0; index < total; ++index) { - result = multiply_parallel(pool, d->matrices[index].matrix, a->matrices[index].matrix, b->matrices[index].matrix); - result = result && equals(c->matrices[index].matrix, d->matrices[index].matrix); - if (result) { - passed += 1; - } - else { - printf("Incorrect result\n"); - matrix_print(c->matrices[index].matrix); - matrix_print(d->matrices[index].matrix); - } - } - printf("Multiplication tests passed: %u out of %u\n", passed, total * 2); - - // Measure time taken to perform 16777216 multiplications - struct timespec start_time, end_time; - uint32_t operations; - double elapsed; - double ops_per_sec; - - // Create a pair of big random matrices - A = new_matrix(HEIGHT, WIDTH); - B = new_matrix(WIDTH, HEIGHT); - D = new_matrix(HEIGHT, HEIGHT); - matrix_fill(A, 8); - matrix_fill(B, 16); - operations = BENCHMARK_REPEAT_LARGE; - - printf("Benchmarking...\n"); - - clock_gettime(CLOCK_MONOTONIC, &start_time); - for (uint32_t count = 0; count < operations; ++count) { - multiply(D, A, B); - } - clock_gettime(CLOCK_MONOTONIC, &end_time); - elapsed = (end_time.tv_sec - start_time.tv_sec); - elapsed += (end_time.tv_nsec - start_time.tv_nsec) / 1000000000.0; - - printf("Time taken to perform %u standard large multiply operations: %.02f seconds\n", operations, elapsed); - ops_per_sec = operations / elapsed; - printf("Equivalent to %.02f operations per second\n", ops_per_sec); - - clock_gettime(CLOCK_MONOTONIC, &start_time); - for (uint32_t count = 0; count < operations; ++count) { - multiply_parallel(pool, D, A, B); - } - clock_gettime(CLOCK_MONOTONIC, &end_time); - elapsed = (end_time.tv_sec - start_time.tv_sec); - elapsed += (end_time.tv_nsec - start_time.tv_nsec) / 1000000000.0; - - printf("Time taken to perform %u standard large parallel multiply operations: %.02f seconds\n", operations, elapsed); - ops_per_sec = operations / elapsed; - printf("Equivalent to %.02f operations per second\n", ops_per_sec); + tests_compare(a, b, c, d, pool); - A = delete_matrix(A); - B = delete_matrix(B); - D = delete_matrix(D); + // Benchmark large matrix multiplications + benchmarks_multiply_big(pool); - clock_gettime(CLOCK_MONOTONIC, &start_time); - for (uint32_t count = 0; count < BENCHMARK_REPEAT_SMALL; ++count) { - for (uint32_t index = 0; index < total; ++index) { - multiply(d->matrices[index].matrix, a->matrices[index].matrix, b->matrices[index].matrix); - } - } - clock_gettime(CLOCK_MONOTONIC, &end_time); - - elapsed = (end_time.tv_sec - start_time.tv_sec); - elapsed += (end_time.tv_nsec - start_time.tv_nsec) / 1000000000.0; - - operations = total * BENCHMARK_REPEAT_SMALL; - printf("Time taken to perform %u multiply operations: %.02f seconds\n", operations, elapsed); - ops_per_sec = operations / elapsed; - printf("Equivalent to %.02f operations per second\n", ops_per_sec); + // Measure time taken to perform 16777216 multiplications + benchmarks_multiply_small(a, b, d); a = delete_matrices(a); b = delete_matrices(b); diff --git a/implementation/matmul-c/src/benchmarks.c b/implementation/matmul-c/src/benchmarks.c new file mode 100644 index 0000000..0f5ffc3 --- /dev/null +++ b/implementation/matmul-c/src/benchmarks.c @@ -0,0 +1,162 @@ +/* vim: noet:ts=2:sts=2:sw=2 */ + +/* SPDX-License-Identifier: MIT */ +/* Copyright © 2024 David Llewellyn-Jones */ + +#include + +#include "matrix.h" +#include "load.h" +#include "operations.h" +#include "threadpool.h" + +#include "benchmarks.h" + +#define BENCHMARK_REPEAT_SMALL (32768) +#define BENCHMARK_REPEAT_LARGE (1) + +#define HEIGHT (2048) +#define WIDTH (2048) + +struct _Benchmark { + struct timespec start_time; + struct timespec end_time; + uint32_t operations; + double elapsed; + double ops_per_sec; + bool quiet; +}; + +Benchmark * new_benchmark() { + Benchmark *benchmark = calloc(sizeof(Benchmark), sizeof(char)); + + return benchmark; +} + +Benchmark * delete_benchmark(Benchmark *benchmark) { + if (benchmark) { + free(benchmark); + } + return NULL; +} + +void benchmark_set_quiet(Benchmark *benchmark, bool quiet) { + benchmark->quiet = quiet; +} + +void benchmarks_start(Benchmark *benchmark, uint32_t operations) { + if (benchmark) { + if (!benchmark->quiet) { + printf("Benchmarking...\n"); + } + benchmark->operations = operations; + clock_gettime(CLOCK_MONOTONIC, &benchmark->start_time); + } +} + +void benchmarks_end(Benchmark *benchmark) { + if (benchmark) { + clock_gettime(CLOCK_MONOTONIC, &benchmark->end_time); + benchmark->elapsed = (benchmark->end_time.tv_sec - benchmark->start_time.tv_sec); + benchmark->elapsed += (benchmark->end_time.tv_nsec - benchmark->start_time.tv_nsec) / 1000000000.0; + benchmark->ops_per_sec = benchmark->operations / benchmark->elapsed; + + if (!benchmark->quiet) { + printf("Time taken to perform %u operations: %.02f seconds\n", benchmark->operations, benchmark->elapsed); + printf("Equivalent to %.02f operations per second\n", benchmark->ops_per_sec); + } + } +} + +void benchmarks_multiply_big(ThreadPool *pool) { + Benchmark *benchmark; + Matrix *A; + Matrix *B; + Matrix *D; + + printf("\n"); + printf("## Large matrix multiplication\n"); + + benchmark = new_benchmark(); + benchmark_set_quiet(benchmark, true); + + for (uint32_t width = 128; width <= 2048; width += 128) { + uint32_t diag = width; + uint32_t height = width; + uint32_t const repeat = BENCHMARK_REPEAT_LARGE * (width < 512 ? 1024 : width < 1024 ? 16 : 1); + + A = new_matrix(width, diag); + B = new_matrix(diag, height); + D = new_matrix(width, height); + matrix_fill(A, 8); + matrix_fill(B, 16); + + benchmarks_start(benchmark, repeat); + for (uint32_t count = 0; count < repeat; ++count) { + multiply(D, A, B); + } + benchmarks_end(benchmark); + printf("Size: (%d, %d, %d), time per operation: %.02f seconds\n", width, diag, height, benchmark->elapsed / benchmark->operations); + uint64_t order = (uint64_t)width * (uint64_t)diag * (uint64_t)height; + double speed = (double)order / (benchmark->elapsed / benchmark->operations); + printf("Order: %llu, elements per second: %.02f\n", order, speed); + + A = delete_matrix(A); + B = delete_matrix(B); + D = delete_matrix(D); + } + + printf("\n"); + printf("## Large parallel matrix multiplication\n"); + + for (uint32_t width = 128; width <= 2048; width += 128) { + uint32_t diag = width; + uint32_t height = width; + uint32_t const repeat = BENCHMARK_REPEAT_LARGE * (width < 512 ? 1024 : width < 1024 ? 16 : 1); + + A = new_matrix(width, diag); + B = new_matrix(diag, height); + D = new_matrix(width, height); + matrix_fill(A, 8); + matrix_fill(B, 16); + + benchmarks_start(benchmark, repeat); + for (uint32_t count = 0; count < repeat; ++count) { + multiply_parallel(pool, D, A, B); + } + benchmarks_end(benchmark); + printf("Size: (%d, %d, %d), time per operation: %.02f seconds\n", width, diag, height, benchmark->elapsed / benchmark->operations); + uint64_t order = (uint64_t)width * (uint64_t)diag * (uint64_t)height; + double speed = (double)order / (benchmark->elapsed / benchmark->operations); + printf("Order: %llu, elements per second: %.02f\n", order, speed); + + A = delete_matrix(A); + B = delete_matrix(B); + D = delete_matrix(D); + } + + benchmark = delete_benchmark(benchmark); +} + +void benchmarks_multiply_small(Matrices *a, Matrices *b, Matrices *d) { + Benchmark *benchmark; + uint32_t total; + + benchmark = new_benchmark(); + total = d->count; + + printf("\n"); + printf("## Small matrix multiplication\n"); + benchmarks_start(benchmark, total * BENCHMARK_REPEAT_SMALL); + + for (uint32_t count = 0; count < BENCHMARK_REPEAT_SMALL; ++count) { + for (uint32_t index = 0; index < total; ++index) { + multiply(d->matrices[index].matrix, a->matrices[index].matrix, b->matrices[index].matrix); + } + } + + benchmarks_end(benchmark); + + benchmark = delete_benchmark(benchmark); +} + diff --git a/implementation/matmul-c/src/matrix.c b/implementation/matmul-c/src/matrix.c index a21099c..b73907c 100644 --- a/implementation/matmul-c/src/matrix.c +++ b/implementation/matmul-c/src/matrix.c @@ -6,6 +6,8 @@ #include #include +#include "utils.h" + #include "matrix.h" Matrix * new_matrix(uint16_t height, uint16_t width) { @@ -57,12 +59,16 @@ void matrix_print(Matrix *A) { } void matrix_fill(Matrix *A, uint32_t seed) { - srand(seed); + Rand * rand = new_rand(); + rand_seed(rand, seed); + if (A) { uint32_t size = A->height * A->width; for (uint32_t index = 0; index < size; ++index) { - A->elements[index] = ((uint32_t)(rand() / (RAND_MAX / 1000))) / 10; + A->elements[index] = rand_digit(rand); } } + + rand = delete_rand(rand); } diff --git a/implementation/matmul-c/src/tests.c b/implementation/matmul-c/src/tests.c index ba97caa..0046f07 100644 --- a/implementation/matmul-c/src/tests.c +++ b/implementation/matmul-c/src/tests.c @@ -5,8 +5,7 @@ #include -#include "matrix.h" -#include "load.h" +#include "operations.h" #include "tests.h" @@ -76,4 +75,38 @@ bool tests_allocate_results(Matrices *c, Matrices *d) { return true; } +// Perform 512 multiplications and compare against the results from NumPy +void tests_compare(Matrices *a, Matrices *b, Matrices *c, Matrices *d, ThreadPool *pool) { + printf("Performing unit tests...\n"); + uint32_t passed = 0; + uint32_t total = d->count; + bool result; + + for (uint32_t index = 0; index < total; ++index) { + result = multiply(d->matrices[index].matrix, a->matrices[index].matrix, b->matrices[index].matrix); + result = result && equals(c->matrices[index].matrix, d->matrices[index].matrix); + if (result) { + passed += 1; + } + else { + printf("Incorrect result\n"); + matrix_print(c->matrices[index].matrix); + matrix_print(d->matrices[index].matrix); + } + } + for (uint32_t index = 0; index < total; ++index) { + result = multiply_parallel(pool, d->matrices[index].matrix, a->matrices[index].matrix, b->matrices[index].matrix); + result = result && equals(c->matrices[index].matrix, d->matrices[index].matrix); + if (result) { + passed += 1; + } + else { + printf("Incorrect result\n"); + matrix_print(c->matrices[index].matrix); + matrix_print(d->matrices[index].matrix); + } + } + printf("Multiplication tests passed: %u out of %u\n", passed, total * 2); +} + diff --git a/implementation/matmul-c/src/utils.c b/implementation/matmul-c/src/utils.c new file mode 100644 index 0000000..b157583 --- /dev/null +++ b/implementation/matmul-c/src/utils.c @@ -0,0 +1,51 @@ +/* vim: noet:ts=2:sts=2:sw=2 */ + +/* SPDX-License-Identifier: MIT */ +/* Copyright © 2024 David Llewellyn-Jones */ + +#include + +#include "utils.h" + +#define A (16807) +#define C (0) +#define M (2147483647) + +struct _Rand { + uint32_t state; +}; + +Rand * new_rand() { + Rand *rand = calloc(sizeof(Rand), sizeof(char)); + + return rand; +} + +Rand * delete_rand(Rand *rand) { + if (rand) { + free(rand); + } + return NULL; +} + +// Seed the random number generator +void rand_seed(Rand *rand, uint32_t seed) { + rand->state = seed; +} + +// A decimal between 0.0 and 1.0 +// See https://www.math.arizona.edu/~tgk/mc/book_chap3.pdf +double rand_next(Rand *rand) { + rand->state = ((A * rand->state) + C) % M; + return (double)rand->state / (double)M; +} + +// A digit between 0.0 and 100.0 +double rand_digit(Rand *rand) { + double result; + result = rand_next(rand); + result = (double)rand->state / (double)M; + result = ((int)(result * 1000.0)) / 10.0; + return result; +} +