From 6359d38c842e8d519f0d59bbca9c5b419f9c148c Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Thu, 30 Mar 2023 15:35:28 +0200 Subject: [PATCH 01/93] Add test for extract_directive_code. --- test/test_util_functions.py | 42 +++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/test/test_util_functions.py b/test/test_util_functions.py index b3896a1ad..df7f6c416 100644 --- a/test/test_util_functions.py +++ b/test/test_util_functions.py @@ -695,3 +695,45 @@ def test_parse_restrictions(): assert expected in parsed +def test_extract_directive_code(): + code = """ + #include + + #define VECTOR_SIZE 65536 + + int main(void) { + int size = VECTOR_SIZE; + __restrict float * a = (float *) malloc(VECTOR_SIZE * sizeof(float)); + __restrict float * b = (float *) malloc(VECTOR_SIZE * sizeof(float)); + __restrict float * c = (float *) malloc(VECTOR_SIZE * sizeof(float)); + + #pragma tuner start + #pragma acc parallel + #pragma acc loop + for ( int i = 0; i < size; i++ ) { + a[i] = i; + b[i] = i + 1; + } + #pragma tuner stop + + #pragma tuner start + #pragma acc parallel + #pragma acc loop + for ( int i = 0; i < size; i++ ) { + c[i] = a[i] + b[i]; + } + #pragma tuner stop + + free(a); + free(b); + free(c); + } + """ + + expected_one = "#pragma acc parallel\n#pragma acc loop\nfor ( int i = 0; i < size; i++ ) {\na[i] = i;\nb[i] = i + 1;\n}" + expected_two = "#pragma acc parallel\n#pragma acc loop\nfor ( int i = 0; i < size; i++ ) {\nc[i] = a[i] + b[i];\n}" + + returns = extract_directive_code(code) + + assert expected_one in returns + assert expected_two in returns From 2527bd6cb1ac6141e85d20be0716b0753da1586c Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Sat, 1 Apr 2023 18:02:06 +0200 Subject: [PATCH 02/93] Fix test. --- test/test_util_functions.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/test_util_functions.py b/test/test_util_functions.py index df7f6c416..718a07337 100644 --- a/test/test_util_functions.py +++ b/test/test_util_functions.py @@ -730,8 +730,8 @@ def test_extract_directive_code(): } """ - expected_one = "#pragma acc parallel\n#pragma acc loop\nfor ( int i = 0; i < size; i++ ) {\na[i] = i;\nb[i] = i + 1;\n}" - expected_two = "#pragma acc parallel\n#pragma acc loop\nfor ( int i = 0; i < size; i++ ) {\nc[i] = a[i] + b[i];\n}" + expected_one = " #pragma acc parallel\n #pragma acc loop\n for ( int i = 0; i < size; i++ ) {\n a[i] = i;\n b[i] = i + 1;\n }" + expected_two = " #pragma acc parallel\n #pragma acc loop\n for ( int i = 0; i < size; i++ ) {\n c[i] = a[i] + b[i];\n }" returns = extract_directive_code(code) From 484c43702cc59489fe8861a225ca5594627ae789 Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Sat, 1 Apr 2023 18:02:30 +0200 Subject: [PATCH 03/93] Add function to exctract sections of code. --- kernel_tuner/util.py | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/kernel_tuner/util.py b/kernel_tuner/util.py index e10f788ea..92dc0eed1 100644 --- a/kernel_tuner/util.py +++ b/kernel_tuner/util.py @@ -960,4 +960,27 @@ def cuda_error_check(error): elif isinstance(error, nvrtc.nvrtcResult): if error != nvrtc.nvrtcResult.NVRTC_SUCCESS: _, desc = nvrtc.nvrtcGetErrorString(error) - raise RuntimeError(f"NVRTC error: {desc.decode()}") \ No newline at end of file + raise RuntimeError(f"NVRTC error: {desc.decode()}") + +def extract_directive_code(code: str) -> list: + """ Extract explicitly marked directive sections from code """ + + start_string = "#pragma tuner start" + end_string = "#pragma tuner stop" + found_section = False + sections = list() + tmp_string = list() + + for line in code.split("\n"): + if found_section: + if end_string in line: + found_section = False + sections.append("\n".join(tmp_string)) + tmp_string = list() + else: + tmp_string.append(line) + else: + if start_string in line: + found_section = True + + return sections From e20c39e6a670b1a6c428e647bc918e55f028f845 Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Mon, 3 Apr 2023 16:04:46 +0200 Subject: [PATCH 04/93] Modify test. --- test/test_util_functions.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/test/test_util_functions.py b/test/test_util_functions.py index 718a07337..4aa33c16f 100644 --- a/test/test_util_functions.py +++ b/test/test_util_functions.py @@ -730,10 +730,20 @@ def test_extract_directive_code(): } """ - expected_one = " #pragma acc parallel\n #pragma acc loop\n for ( int i = 0; i < size; i++ ) {\n a[i] = i;\n b[i] = i + 1;\n }" - expected_two = " #pragma acc parallel\n #pragma acc loop\n for ( int i = 0; i < size; i++ ) {\n c[i] = a[i] + b[i];\n }" + expected_one = """ #pragma acc parallel + #pragma acc loop + for ( int i = 0; i < size; i++ ) { + a[i] = i; + b[i] = i + 1; + }""" + expected_two = """ #pragma acc parallel + #pragma acc loop + for ( int i = 0; i < size; i++ ) { + c[i] = a[i] + b[i]; + }""" returns = extract_directive_code(code) + assert len(returns) == 2 assert expected_one in returns assert expected_two in returns From 5a9ccf4d592d249c09af2cd106bd196025d675bc Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Mon, 3 Apr 2023 16:06:31 +0200 Subject: [PATCH 05/93] Formatted file with black. --- test/test_util_functions.py | 241 ++++++++++++++++++++---------------- 1 file changed, 132 insertions(+), 109 deletions(-) diff --git a/test/test_util_functions.py b/test/test_util_functions.py index 4aa33c16f..3ec9dbe44 100644 --- a/test/test_util_functions.py +++ b/test/test_util_functions.py @@ -22,10 +22,7 @@ def test_get_grid_dimensions1(): problem_size = (1024, 1024, 1) - params = { - "block_x": 41, - "block_y": 37 - } + params = {"block_x": 41, "block_y": 37} grid_div = (["block_x"], ["block_y"], None) @@ -39,19 +36,28 @@ def test_get_grid_dimensions1(): assert grid[1] == 28 assert grid[2] == 1 - grid = get_grid_dimensions(problem_size, params, (grid_div[0], None, None), block_size_names) + grid = get_grid_dimensions( + problem_size, params, (grid_div[0], None, None), block_size_names + ) assert grid[0] == 25 assert grid[1] == 1024 assert grid[2] == 1 - grid = get_grid_dimensions(problem_size, params, (None, grid_div[1], None), block_size_names) + grid = get_grid_dimensions( + problem_size, params, (None, grid_div[1], None), block_size_names + ) assert grid[0] == 1024 assert grid[1] == 28 assert grid[2] == 1 - grid = get_grid_dimensions(problem_size, params, (None, lambda p: p["block_x"], lambda p: p["block_y"] * p["block_x"]), block_size_names) + grid = get_grid_dimensions( + problem_size, + params, + (None, lambda p: p["block_x"], lambda p: p["block_y"] * p["block_x"]), + block_size_names, + ) assert grid[0] == 1024 assert grid[1] == 25 @@ -60,15 +66,14 @@ def test_get_grid_dimensions1(): def test_get_grid_dimensions2(): problem_size = (1024, 1024, 1) - params = { - "block_x": 41, - "block_y": 37 - } + params = {"block_x": 41, "block_y": 37} grid_div_x = ["block_x*8"] grid_div_y = ["(block_y+2)/8"] - grid = get_grid_dimensions(problem_size, params, (grid_div_x, grid_div_y, None), block_size_names) + grid = get_grid_dimensions( + problem_size, params, (grid_div_x, grid_div_y, None), block_size_names + ) assert grid[0] == 4 assert grid[1] == 256 @@ -76,16 +81,15 @@ def test_get_grid_dimensions2(): def test_get_grid_dimensions3(): problem_size = (1024, 1024, 1) - params = { - "block_x": 41, - "block_y": 37 - } + params = {"block_x": 41, "block_y": 37} grid_div_x = ["block_x", "block_y"] grid_div_y = ["(block_y+2)/8"] def assert_grid_dimensions(problem_size): - grid = get_grid_dimensions(problem_size, params, (grid_div_x, grid_div_y, None), block_size_names) + grid = get_grid_dimensions( + problem_size, params, (grid_div_x, grid_div_y, None), block_size_names + ) assert grid[0] == 1 assert grid[1] == 256 assert grid[2] == 1 @@ -98,10 +102,7 @@ def assert_grid_dimensions(problem_size): def test_get_problem_size1(): problem_size = ("num_blocks_x", "num_blocks_y*3") - params = { - "num_blocks_x": 71, - "num_blocks_y": 57 - } + params = {"num_blocks_x": 71, "num_blocks_y": 57} answer = get_problem_size(problem_size, params) assert answer[0] == 71 @@ -111,9 +112,7 @@ def test_get_problem_size1(): def test_get_problem_size2(): problem_size = "num_blocks_x" - params = { - "num_blocks_x": 71 - } + params = {"num_blocks_x": 71} answer = get_problem_size(problem_size, params) assert answer[0] == 71 @@ -124,16 +123,12 @@ def test_get_problem_size2(): def test_get_problem_size3(): with pytest.raises(TypeError): problem_size = (3.8, "num_blocks_y*3") - params = { - "num_blocks_y": 57 - } + params = {"num_blocks_y": 57} get_problem_size(problem_size, params) def test_get_problem_size4(): - params = { - "num_blocks_x": 71 - } + params = {"num_blocks_x": 71} answer = get_problem_size(lambda p: (p["num_blocks_x"], 1, 13), params) assert answer[0] == 71 @@ -142,11 +137,7 @@ def test_get_problem_size4(): def test_get_thread_block_dimensions(): - - params = { - "block_size_x": 123, - "block_size_y": 257 - } + params = {"block_size_x": 123, "block_size_y": 257} threads = get_thread_block_dimensions(params) assert len(threads) == 3 @@ -166,40 +157,46 @@ def test_prepare_kernel_string(): params = dict() params["is"] = 8 - _, output = prepare_kernel_string("this", kernel, params, grid, threads, block_size_names, "", None) - expected = "#define grid_size_x 3\n" \ - "#define grid_size_y 7\n" \ - "#define block_size_x 1\n" \ - "#define block_size_y 2\n" \ - "#define block_size_z 3\n" \ - "#define is 8\n" \ - "#define kernel_tuner 1\n" \ - "#line 1\n" \ - "this is a weird kernel" + _, output = prepare_kernel_string( + "this", kernel, params, grid, threads, block_size_names, "", None + ) + expected = ( + "#define grid_size_x 3\n" + "#define grid_size_y 7\n" + "#define block_size_x 1\n" + "#define block_size_y 2\n" + "#define block_size_z 3\n" + "#define is 8\n" + "#define kernel_tuner 1\n" + "#line 1\n" + "this is a weird kernel" + ) assert output == expected # Check custom defines - defines = OrderedDict( - foo=1, - bar="custom", - baz=lambda config: config["is"] * 5) - - _, output = prepare_kernel_string("this", kernel, params, grid, threads, block_size_names, "", defines) - expected = "#define foo 1\n" \ - "#define bar custom\n" \ - "#define baz 40\n" \ - "#line 1\n" \ - "this is a weird kernel" + defines = OrderedDict(foo=1, bar="custom", baz=lambda config: config["is"] * 5) + + _, output = prepare_kernel_string( + "this", kernel, params, grid, threads, block_size_names, "", defines + ) + expected = ( + "#define foo 1\n" + "#define bar custom\n" + "#define baz 40\n" + "#line 1\n" + "this is a weird kernel" + ) assert output == expected # Throw exception on invalid name (for instance, a space in the name) invalid_defines = {"invalid name": "1"} with pytest.raises(ValueError): - prepare_kernel_string("this", kernel, params, grid, threads, block_size_names, "", invalid_defines) + prepare_kernel_string( + "this", kernel, params, grid, threads, block_size_names, "", invalid_defines + ) def test_prepare_kernel_string_partial_loop_unrolling(): - kernel = """this is a weird kernel(what * language, is this, anyway* C) { #pragma unroll loop_unroll_factor_monkey for monkey in the forest { @@ -211,16 +208,19 @@ def test_prepare_kernel_string_partial_loop_unrolling(): params = dict() params["loop_unroll_factor_monkey"] = 8 - _, output = prepare_kernel_string("this", kernel, params, grid, threads, block_size_names, "CUDA", None) + _, output = prepare_kernel_string( + "this", kernel, params, grid, threads, block_size_names, "CUDA", None + ) assert "constexpr int loop_unroll_factor_monkey = 8;" in output params["loop_unroll_factor_monkey"] = 0 - _, output = prepare_kernel_string("this", kernel, params, grid, threads, block_size_names, "CUDA", None) + _, output = prepare_kernel_string( + "this", kernel, params, grid, threads, block_size_names, "CUDA", None + ) assert not "constexpr int loop_unroll_factor_monkey" in output assert not "#pragma unroll loop_unroll_factor_monkey" in output - def test_replace_param_occurrences(): kernel = "this is a weird kernel" params = dict() @@ -228,7 +228,9 @@ def test_replace_param_occurrences(): params["weird"] = 14 new_kernel = replace_param_occurrences(kernel, params) - assert new_kernel == "this 8 a 14 kernel" # Note: The "is" in "this" should not be replaced + assert ( + new_kernel == "this 8 a 14 kernel" + ) # Note: The "is" in "this" should not be replaced new_kernel = replace_param_occurrences(kernel, dict()) assert kernel == new_kernel @@ -240,14 +242,15 @@ def test_replace_param_occurrences(): def test_check_restrictions(): - params = { - "a": 7, - "b": 4, - "c": 3 - } + params = {"a": 7, "b": 4, "c": 3} print(params.values()) print(params.keys()) - restrictions = [["a==b+c"], ["a==b+c", "b==b", "a-b==c"], ["a==b+c", "b!=b", "a-b==c"], lambda p: p["a"] == p["b"] + p["c"]] + restrictions = [ + ["a==b+c"], + ["a==b+c", "b==b", "a-b==c"], + ["a==b+c", "b!=b", "a-b==c"], + lambda p: p["a"] == p["b"] + p["c"], + ] expected = [True, True, False, True] # test the call returns expected for r, e in zip(restrictions, expected): @@ -327,7 +330,7 @@ def test_check_argument_list1(): numbers[get_global_id(0)] = numbers[get_global_id(0)] * number; } """ - args = [np.int32(5), 'blah', np.array([1, 2, 3])] + args = [np.int32(5), "blah", np.array([1, 2, 3])] try: check_argument_list(kernel_name, kernel_string, args) print("Expected a TypeError to be raised") @@ -358,7 +361,9 @@ def test_check_argument_list3(): } """ args = [np.uint16(42), np.float16([3, 4, 6]), np.int32([300])] - assert_user_warning(check_argument_list, [kernel_name, kernel_string, args], "at position 2") + assert_user_warning( + check_argument_list, [kernel_name, kernel_string, args], "at position 2" + ) def test_check_argument_list4(): @@ -368,7 +373,9 @@ def test_check_argument_list4(): } """ args = [np.uint16(42), np.float16([3, 4, 6]), np.int64([300]), np.ubyte(32)] - assert_user_warning(check_argument_list, [kernel_name, kernel_string, args], "do not match in size") + assert_user_warning( + check_argument_list, [kernel_name, kernel_string, args], "do not match in size" + ) def test_check_argument_list5(): @@ -386,7 +393,12 @@ def test_check_argument_list5(): a[threadIdx.x] = b[blockIdx.x]*c*d; } """ - args = [np.array([1, 2, 3]).astype(np.float64), np.array([1, 2, 3]).astype(np.float32), np.int32(6), np.int32(7)] + args = [ + np.array([1, 2, 3]).astype(np.float64), + np.array([1, 2, 3]).astype(np.float32), + np.int32(6), + np.int32(7), + ] assert_no_user_warning(check_argument_list, [kernel_name, kernel_string, args]) @@ -422,7 +434,12 @@ def test_check_argument_list7(): def test_check_tune_params_list(): - tune_params = dict(zip(["one_thing", "led_to_another", "and_before_you_know_it", "grid_size_y"], [1, 2, 3, 4])) + tune_params = dict( + zip( + ["one_thing", "led_to_another", "and_before_you_know_it", "grid_size_y"], + [1, 2, 3, 4], + ) + ) try: check_tune_params_list(tune_params, None) print("Expected a ValueError to be raised") @@ -445,7 +462,7 @@ def test_check_tune_params_list2(): def test_check_tune_params_list3(): # test that exception is raised when tunable parameter is passed that needs an NVMLObserver for param in ["nvml_pwr_limit", "nvml_gr_clock", "nvml_mem_clock"]: - tune_params = {param:[]} + tune_params = {param: []} with pytest.raises(ValueError, match=r".*NVMLObserver.*"): check_tune_params_list(tune_params, None) with pytest.raises(ValueError, match=r".*NVMLObserver.*"): @@ -453,7 +470,6 @@ def test_check_tune_params_list3(): def test_check_block_size_params_names_list(): - def test_warnings(function, args, number, warning_type): with warnings.catch_warnings(record=True) as w: # Cause all warnings to always be triggered. @@ -468,22 +484,42 @@ def test_warnings(function, args, number, warning_type): # check warning triggers for both unused blocksize names block_size_names = ["block_size_a", "block_size_b"] tune_params = dict(zip(["hyper", "ultra", "mega", "turbo"], [1, 2, 3, 4])) - test_warnings(check_block_size_params_names_list, [block_size_names, tune_params], 2, UserWarning) + test_warnings( + check_block_size_params_names_list, + [block_size_names, tune_params], + 2, + UserWarning, + ) # check warning does not triger when nondefault block size names are used correctly block_size_names = ["block_size_a", "block_size_b"] - tune_params = dict(zip(["block_size_a", "block_size_b", "many_other_things"], [1, 2, 3])) - test_warnings(check_block_size_params_names_list, [block_size_names, tune_params], 0, None) + tune_params = dict( + zip(["block_size_a", "block_size_b", "many_other_things"], [1, 2, 3]) + ) + test_warnings( + check_block_size_params_names_list, [block_size_names, tune_params], 0, None + ) # check that a warning is issued when none of the default names are used and no alternative names are specified block_size_names = None - tune_params = dict(zip(["block_size_a", "block_size_b", "many_other_things"], [1, 2, 3])) - test_warnings(check_block_size_params_names_list, [block_size_names, tune_params], 1, UserWarning) + tune_params = dict( + zip(["block_size_a", "block_size_b", "many_other_things"], [1, 2, 3]) + ) + test_warnings( + check_block_size_params_names_list, + [block_size_names, tune_params], + 1, + UserWarning, + ) # check that no error is raised when any of the default block size names is being used block_size_names = None - tune_params = dict(zip(["block_size_x", "several_other_things"], [[1, 2, 3, 4], [2, 4]])) - test_warnings(check_block_size_params_names_list, [block_size_names, tune_params], 0, None) + tune_params = dict( + zip(["block_size_x", "several_other_things"], [[1, 2, 3, 4], [2, 4]]) + ) + test_warnings( + check_block_size_params_names_list, [block_size_names, tune_params], 0, None + ) def test_get_kernel_string_func(): @@ -491,9 +527,7 @@ def test_get_kernel_string_func(): def gen_kernel(params): return "__global__ void kernel_name() { %s }" % params["block_size_x"] - params = { - "block_size_x": "//do that kernel thing!" - } + params = {"block_size_x": "//do that kernel thing!"} expected = "__global__ void kernel_name() { //do that kernel thing! }" answer = get_kernel_string(gen_kernel, params) assert answer == expected @@ -523,7 +557,7 @@ def test_read_write_file(): my_string = "this is the test string" try: write_file(filename, my_string) - with open(filename, 'r') as f: + with open(filename, "r") as f: answer = f.read() assert my_string == answer answer2 = read_file(filename) @@ -556,7 +590,6 @@ def verify2(answer, result_host, atol): def test_process_cache(): - def assert_open_cachefile_is_correctly_parsed(cache): with open(cache, "r") as cachefile: filestr = cachefile.read() @@ -572,7 +605,12 @@ def assert_open_cachefile_is_correctly_parsed(cache): delete_temp_file(cache) kernel_options = Options(kernel_name="test_kernel", problem_size=(1, 2)) - tuning_options = Options(cache=cache, tune_params=Options(x=[1, 2, 3, 4]), simulation_mode=False, objective="time") + tuning_options = Options( + cache=cache, + tune_params=Options(x=[1, 2, 3, 4]), + simulation_mode=False, + objective="time", + ) runner = Options(dev=Options(name="test_device"), simulation_mode=False) try: @@ -587,10 +625,7 @@ def assert_open_cachefile_is_correctly_parsed(cache): assert len(tuning_options.cache) == 0 # store one entry in the cache - params = { - "x": 4, - "time": np.float32(0.1234) - } + params = {"x": 4, "time": np.float32(0.1234)} store_cache("4", params, tuning_options) assert len(tuning_options.cache) == 1 @@ -632,10 +667,7 @@ def assert_open_cachefile_is_correctly_parsed(cache): def test_process_metrics(): - params = { - "x": 15, - "b": 12 - } + params = {"x": 15, "b": 12} metrics = OrderedDict() metrics["y"] = lambda p: p["x"] @@ -644,19 +676,13 @@ def test_process_metrics(): assert params["y"] == params["x"] # test if we can do the same with a string - params = { - "x": 15, - "b": 12 - } + params = {"x": 15, "b": 12} metrics["y"] = "x" params = process_metrics(params, metrics) assert params["y"] == params["x"] # test if composability works correctly - params = { - "x": 15, - "b": 12 - } + params = {"x": 15, "b": 12} metrics = OrderedDict() metrics["y"] = "x" metrics["z"] = "y" @@ -668,10 +694,7 @@ def test_process_metrics(): params = process_metrics(params, {}) # test ValueError is raised when b already exists in params - params = { - "x": 15, - "b": 12 - } + params = {"x": 15, "b": 12} metrics = OrderedDict() metrics["b"] = "x" with pytest.raises(ValueError): @@ -679,7 +702,6 @@ def test_process_metrics(): def test_parse_restrictions(): - tune_params = {"block_size_x": [50, 100], "use_padding": [0, 1]} restrict = ["block_size_x != 320"] @@ -695,6 +717,7 @@ def test_parse_restrictions(): assert expected in parsed + def test_extract_directive_code(): code = """ #include From d9c5775b578a3ebdc9ca4bd17fb245ae98af53d4 Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Mon, 3 Apr 2023 16:46:38 +0200 Subject: [PATCH 06/93] OpenACC skeleton. --- kernel_tuner/backends/openacc.py | 68 ++++++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) create mode 100644 kernel_tuner/backends/openacc.py diff --git a/kernel_tuner/backends/openacc.py b/kernel_tuner/backends/openacc.py new file mode 100644 index 000000000..353265c33 --- /dev/null +++ b/kernel_tuner/backends/openacc.py @@ -0,0 +1,68 @@ +""" This module contains the functionality for running and compiling OpenACC sections """ + +import subprocess +import platform + +from kernel_tuner.backends.backend import CompilerBackend +from kernel_tuner.observers.c import CRuntimeObserver + + +class OpenACCFunctions(CompilerBackend): + """Class that groups the code for running and compiling OpenaCC functions in C++.""" + + def __init__(self, iterations=7, compiler_options=None, compiler=None): + self.iterations = iterations + # if no compiler is specified, use nvc++ by default + self.compiler = compiler or "nvc++" + self.observers = [CRuntimeObserver(self)] + + cc_version = str(subprocess.check_output([self.compiler, "--version"])) + cc_version = cc_version.splitlines()[0].split(" ")[-1] + + # environment info + env = dict() + env["CC Version"] = cc_version + env["iterations"] = self.iterations + env["compiler_options"] = compiler_options + self.env = env + self.name = platform.processor() + + def ready_argument_list(self, arguments): + """This method must implement the allocation of the arguments on device memory.""" + raise NotImplementedError("OpenACC backend does not support this feature") + + def compile(self, kernel_instance): + """This method must implement the compilation of a kernel into a callable function.""" + raise NotImplementedError("OpenACC backend does not support this feature") + + def start_event(self): + """This method must implement the recording of the start of a measurement.""" + raise NotImplementedError("OpenACC backend does not support this feature") + + def stop_event(self): + """This method must implement the recording of the end of a measurement.""" + raise NotImplementedError("OpenACC backend does not support this feature") + + def kernel_finished(self): + """This method must implement a check that returns True if the kernel has finished, False otherwise.""" + raise NotImplementedError("OpenACC backend does not support this feature") + + def synchronize(self): + """This method must implement a barrier that halts execution until device has finished its tasks.""" + raise NotImplementedError("OpenACC backend does not support this feature") + + def run_kernel(self, func, gpu_args, threads, grid, stream): + """This method must implement the execution of the kernel on the device.""" + raise NotImplementedError("OpenACC backend does not support this feature") + + def memset(self, allocation, value, size): + """This method must implement setting the memory to a value on the device.""" + raise NotImplementedError("OpenACC backend does not support this feature") + + def memcpy_dtoh(self, dest, src): + """This method must implement a device to host copy.""" + raise NotImplementedError("OpenACC backend does not support this feature") + + def memcpy_htod(self, dest, src): + """This method must implement a host to device copy.""" + raise NotImplementedError("OpenACC backend does not support this feature") From f0e9bbc5c949a620cd3a82c86e3cd2ef858b2b2c Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Mon, 3 Apr 2023 16:46:51 +0200 Subject: [PATCH 07/93] Added main test for OpenACC backend. --- test/context.py | 34 +++++++++++++++++++++++++++------- test/test_backend.py | 11 +++++++---- 2 files changed, 34 insertions(+), 11 deletions(-) diff --git a/test/context.py b/test/context.py index 71c0616a5..1ee3bb3cf 100644 --- a/test/context.py +++ b/test/context.py @@ -6,6 +6,7 @@ try: import pycuda.driver as drv + drv.init() pycuda_present = True except Exception: @@ -13,8 +14,9 @@ try: import pyopencl + opencl_present = True - if 'namespace' in str(sys.modules['pyopencl']): + if "namespace" in str(sys.modules["pyopencl"]): opencl_present = False if len(pyopencl.get_platforms()) == 0: opencl_present = False @@ -24,27 +26,43 @@ gcc_present = shutil.which("gcc") is not None gfortran_present = shutil.which("gfortran") is not None openmp_present = "libgomp" in subprocess.getoutput(["ldconfig -p | grep libgomp"]) +openacc_present = shutil.which("nvc++") is not None try: import cupy - cupy.cuda.Device(0).attributes #triggers exception if there are no CUDA-capable devices + + cupy.cuda.Device( + 0 + ).attributes # triggers exception if there are no CUDA-capable devices cupy_present = True except Exception: cupy_present = False try: import cuda + cuda_present = True except Exception: cuda_present = False -skip_if_no_pycuda = pytest.mark.skipif(not pycuda_present, reason="PyCuda not installed or no CUDA device detected") -skip_if_no_cupy = pytest.mark.skipif(not cupy_present, reason="CuPy not installed or no CUDA device detected") -skip_if_no_cuda = pytest.mark.skipif(not cuda_present, reason="NVIDIA CUDA not installed") -skip_if_no_opencl = pytest.mark.skipif(not opencl_present, reason="PyOpenCL not installed or no OpenCL device detected") +skip_if_no_pycuda = pytest.mark.skipif( + not pycuda_present, reason="PyCuda not installed or no CUDA device detected" +) +skip_if_no_cupy = pytest.mark.skipif( + not cupy_present, reason="CuPy not installed or no CUDA device detected" +) +skip_if_no_cuda = pytest.mark.skipif( + not cuda_present, reason="NVIDIA CUDA not installed" +) +skip_if_no_opencl = pytest.mark.skipif( + not opencl_present, reason="PyOpenCL not installed or no OpenCL device detected" +) skip_if_no_gcc = pytest.mark.skipif(not gcc_present, reason="No gcc on PATH") -skip_if_no_gfortran = pytest.mark.skipif(not gfortran_present, reason="No gfortran on PATH") +skip_if_no_gfortran = pytest.mark.skipif( + not gfortran_present, reason="No gfortran on PATH" +) skip_if_no_openmp = pytest.mark.skipif(not openmp_present, reason="No OpenMP found") +skip_if_no_openacc = pytest.mark.skipif(not openacc_present, reason="No nvc++ on PATH") def skip_backend(backend: str): @@ -60,3 +78,5 @@ def skip_backend(backend: str): pytest.skip("No gcc on PATH") elif backend.upper() == "FORTRAN" and not gfortran_present: pytest.skip("No gfortran on PATH") + elif backend.upper() == "OPENACC" and not openacc_present: + pytest.skip("No nvc++ on PATH") diff --git a/test/test_backend.py b/test/test_backend.py index 87613de58..397f76eda 100644 --- a/test/test_backend.py +++ b/test/test_backend.py @@ -1,14 +1,12 @@ -import pytest - -import kernel_tuner from .context import ( skip_if_no_gcc, skip_if_no_cupy, skip_if_no_cuda, skip_if_no_opencl, skip_if_no_pycuda, + skip_if_no_openacc, ) -from kernel_tuner.backends import backend, c, cupy, nvcuda, opencl, pycuda +from kernel_tuner.backends import backend, c, cupy, nvcuda, opencl, pycuda, openacc class WrongBackend(backend.Backend): @@ -48,3 +46,8 @@ def test_opencl_backend(): @skip_if_no_pycuda def test_pycuda_backend(): dev = pycuda.PyCudaFunctions() + + +@skip_if_no_openacc +def test_openacc_backend(): + dev = openacc.OpenACCFunctions() From a3cb9f7c5e20a12afd5bc4e924612a3f408720f5 Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Tue, 4 Apr 2023 13:47:29 +0200 Subject: [PATCH 08/93] Fixed nvc++ version. --- kernel_tuner/backends/openacc.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel_tuner/backends/openacc.py b/kernel_tuner/backends/openacc.py index 353265c33..140dab8f8 100644 --- a/kernel_tuner/backends/openacc.py +++ b/kernel_tuner/backends/openacc.py @@ -10,14 +10,14 @@ class OpenACCFunctions(CompilerBackend): """Class that groups the code for running and compiling OpenaCC functions in C++.""" - def __init__(self, iterations=7, compiler_options=None, compiler=None): + def __init__(self, iterations=7, compiler_options=None, compiler="nvc++"): self.iterations = iterations # if no compiler is specified, use nvc++ by default - self.compiler = compiler or "nvc++" + self.compiler = compiler self.observers = [CRuntimeObserver(self)] cc_version = str(subprocess.check_output([self.compiler, "--version"])) - cc_version = cc_version.splitlines()[0].split(" ")[-1] + cc_version = cc_version.splitlines()[0].split(" ")[1] # environment info env = dict() From c43aa4aa4247592f6b86cf49d63432e0ca9c9c7f Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Tue, 4 Apr 2023 15:27:45 +0200 Subject: [PATCH 09/93] Add extract_preprocessor and relative test. --- kernel_tuner/util.py | 381 +++++++++++++++++++++++++----------- test/test_util_functions.py | 42 ++++ 2 files changed, 308 insertions(+), 115 deletions(-) diff --git a/kernel_tuner/util.py b/kernel_tuner/util.py index 92dc0eed1..2ff506554 100644 --- a/kernel_tuner/util.py +++ b/kernel_tuner/util.py @@ -13,7 +13,20 @@ from types import FunctionType import numpy as np -from constraint import Constraint, AllDifferentConstraint, AllEqualConstraint, MaxSumConstraint, ExactSumConstraint, MinSumConstraint, InSetConstraint, NotInSetConstraint, SomeInSetConstraint, SomeNotInSetConstraint, FunctionConstraint +from constraint import ( + Constraint, + AllDifferentConstraint, + AllEqualConstraint, + MaxSumConstraint, + ExactSumConstraint, + MinSumConstraint, + InSetConstraint, + NotInSetConstraint, + SomeInSetConstraint, + SomeNotInSetConstraint, + FunctionConstraint, +) + try: import cupy as cp except ImportError: @@ -29,7 +42,6 @@ class ErrorConfig(str): - def __str__(self): return self.__class__.__name__ @@ -49,10 +61,9 @@ class RuntimeFailedConfig(ErrorConfig): pass -class TorchPlaceHolder(): - +class TorchPlaceHolder: def __init__(self): - self.Tensor = Exception #using Exception here as a type that will never be among kernel arguments + self.Tensor = Exception # using Exception here as a type that will never be among kernel arguments class SkippableFailure(Exception): @@ -80,22 +91,25 @@ def check_argument_type(dtype, kernel_argument): "uint16": ["ushort", "unsigned short", "uint16_t"], "int16": ["short", "int16_t"], "uint32": ["uint", "unsigned int", "uint32_t"], - "int32": ["int", "int32_t"], # discrepancy between OpenCL and C here, long may be 32bits in C + "int32": [ + "int", + "int32_t", + ], # discrepancy between OpenCL and C here, long may be 32bits in C "uint64": ["ulong", "unsigned long", "uint64_t"], "int64": ["long", "int64_t"], "float16": ["half"], "float32": ["float"], "float64": ["double"], "complex64": ["float2"], - "complex128": ["double2"] + "complex128": ["double2"], } if dtype in types_map: return any([substr in kernel_argument for substr in types_map[dtype]]) - return False # unknown dtype. do not throw exception to still allow kernel to run. + return False # unknown dtype. do not throw exception to still allow kernel to run. def check_argument_list(kernel_name, kernel_string, args): - """ raise an exception if a kernel arguments do not match host arguments """ + """raise an exception if a kernel arguments do not match host arguments""" kernel_arguments = list() collected_errors = list() for iterator in re.finditer(kernel_name + "[ \n\t]*" + r"\(", kernel_string): @@ -106,23 +120,38 @@ def check_argument_list(kernel_name, kernel_string, args): for arguments_set, arguments in enumerate(kernel_arguments): collected_errors.append(list()) if len(arguments) != len(args): - collected_errors[arguments_set].append("Kernel and host argument lists do not match in size.") + collected_errors[arguments_set].append( + "Kernel and host argument lists do not match in size." + ) continue - for (i, arg) in enumerate(args): + for i, arg in enumerate(args): kernel_argument = arguments[i] if not isinstance(arg, (np.ndarray, np.generic, cp.ndarray, torch.Tensor)): - raise TypeError("Argument at position " + str(i) + " of type: " + str(type(arg)) + " should be of type np.ndarray or numpy scalar") + raise TypeError( + "Argument at position " + + str(i) + + " of type: " + + str(type(arg)) + + " should be of type np.ndarray or numpy scalar" + ) correct = True if isinstance(arg, np.ndarray) and not "*" in kernel_argument: - correct = False # array is passed to non-pointer kernel argument + correct = False # array is passed to non-pointer kernel argument if correct and check_argument_type(str(arg.dtype), kernel_argument): continue - collected_errors[arguments_set].append("Argument at position " + str(i) + " of dtype: " + str(arg.dtype) + " does not match " + kernel_argument + - ".") + collected_errors[arguments_set].append( + "Argument at position " + + str(i) + + " of dtype: " + + str(arg.dtype) + + " does not match " + + kernel_argument + + "." + ) if not collected_errors[arguments_set]: # We assume that if there is a possible list of arguments that matches with the provided one # it is the right one @@ -132,22 +161,33 @@ def check_argument_list(kernel_name, kernel_string, args): def check_stop_criterion(to): - """ checks if max_fevals is reached or time limit is exceeded """ + """checks if max_fevals is reached or time limit is exceeded""" if "max_fevals" in to and len(to.unique_results) >= to.max_fevals: raise StopCriterionReached("max_fevals reached") - if "time_limit" in to and (((time.perf_counter() - to.start_time) + (to.simulated_time * 1e-3)) > to.time_limit): + if "time_limit" in to and ( + ((time.perf_counter() - to.start_time) + (to.simulated_time * 1e-3)) + > to.time_limit + ): raise StopCriterionReached("time limit exceeded") def check_tune_params_list(tune_params, observers): - """ raise an exception if a tune parameter has a forbidden name """ + """raise an exception if a tune parameter has a forbidden name""" forbidden_names = ("grid_size_x", "grid_size_y", "grid_size_z", "time") for name, param in tune_params.items(): if name in forbidden_names: - raise ValueError("Tune parameter " + name + " with value " + str(param) + " has a forbidden name!") + raise ValueError( + "Tune parameter " + + name + + " with value " + + str(param) + + " has a forbidden name!" + ) if any("nvml_" in param for param in tune_params): if not observers or not any(isinstance(obs, NVMLObserver) for obs in observers): - raise ValueError("Tune parameters starting with nvml_ require an NVMLObserver!") + raise ValueError( + "Tune parameters starting with nvml_ require an NVMLObserver!" + ) def check_block_size_names(block_size_names): @@ -173,14 +213,22 @@ def check_block_size_params_names_list(block_size_names, tune_params): if block_size_names is not None: for name in block_size_names: if name not in tune_params.keys(): - warnings.warn("Block size name " + name + " is not specified in the tunable parameters list!", UserWarning) - else: # if default block size names are used + warnings.warn( + "Block size name " + + name + + " is not specified in the tunable parameters list!", + UserWarning, + ) + else: # if default block size names are used if not any([k in default_block_size_names for k in tune_params.keys()]): - warnings.warn("None of the tunable parameters specify thread block dimensions!", UserWarning) + warnings.warn( + "None of the tunable parameters specify thread block dimensions!", + UserWarning, + ) def check_restrictions(restrictions, params: dict, verbose: bool): - """ check whether a specific instance meets the search space restrictions """ + """check whether a specific instance meets the search space restrictions""" valid = True if callable(restrictions): valid = restrictions(params) @@ -200,12 +248,16 @@ def check_restrictions(restrictions, params: dict, verbose: bool): except ZeroDivisionError: pass if not valid and verbose: - print("skipping config", get_instance_string(params), "reason: config fails restriction") + print( + "skipping config", + get_instance_string(params), + "reason: config fails restriction", + ) return valid def convert_constraint_restriction(restrict: Constraint): - """ Convert the python-constraint to a function for backwards compatibility """ + """Convert the python-constraint to a function for backwards compatibility""" if isinstance(restrict, FunctionConstraint): f_restrict = lambda p: restrict._func(*p) elif isinstance(restrict, AllDifferentConstraint): @@ -220,7 +272,15 @@ def convert_constraint_restriction(restrict: Constraint): f_restrict = lambda p: sum(p) == restrict._exactsum elif isinstance(restrict, MinSumConstraint): f_restrict = lambda p: sum(p) >= restrict._exactsum - elif isinstance(restrict, (InSetConstraint, NotInSetConstraint, SomeInSetConstraint, SomeNotInSetConstraint)): + elif isinstance( + restrict, + ( + InSetConstraint, + NotInSetConstraint, + SomeInSetConstraint, + SomeNotInSetConstraint, + ), + ): raise NotImplementedError( f"Restriction of the type {type(restrict)} is explicitely not supported in backwards compatibility mode, because the behaviour is too complex. Please rewrite this constraint to a function to use it with this algorithm." ) @@ -230,13 +290,13 @@ def convert_constraint_restriction(restrict: Constraint): def check_thread_block_dimensions(params, max_threads, block_size_names=None): - """ check on maximum thread block dimensions """ + """check on maximum thread block dimensions""" dims = get_thread_block_dimensions(params, block_size_names) return np.prod(dims) <= max_threads def config_valid(config, tuning_options, max_threads): - """ combines restrictions and a check on the max thread block dimension to check config validity """ + """combines restrictions and a check on the max thread block dimension to check config validity""" legal = True params = OrderedDict(zip(tuning_options.tune_params.keys(), config)) if tuning_options.restrictions: @@ -244,12 +304,14 @@ def config_valid(config, tuning_options, max_threads): if not legal: return False block_size_names = tuning_options.get("block_size_names", None) - valid_thread_block_dimensions = check_thread_block_dimensions(params, max_threads, block_size_names) + valid_thread_block_dimensions = check_thread_block_dimensions( + params, max_threads, block_size_names + ) return valid_thread_block_dimensions def delete_temp_file(filename): - """ delete a temporary file, don't complain if no longer exists """ + """delete a temporary file, don't complain if no longer exists""" try: os.remove(filename) except OSError as e: @@ -269,15 +331,20 @@ def detect_language(kernel_string): def get_best_config(results, objective, objective_higher_is_better=False): - """ Returns the best configuration from a list of results according to some objective """ + """Returns the best configuration from a list of results according to some objective""" func = max if objective_higher_is_better else min - ignore_val = sys.float_info.max if not objective_higher_is_better else -sys.float_info.max - best_config = func(results, key=lambda x: x[objective] if isinstance(x[objective], float) else ignore_val) + ignore_val = ( + sys.float_info.max if not objective_higher_is_better else -sys.float_info.max + ) + best_config = func( + results, + key=lambda x: x[objective] if isinstance(x[objective], float) else ignore_val, + ) return best_config def get_config_string(params, keys=None, units=None): - """ return a compact string representation of a measurement """ + """return a compact string representation of a measurement""" def compact_number(v): if isinstance(v, float): @@ -313,21 +380,29 @@ def get_dimension_divisor(divisor_list, default, params): if callable(divisor_list): return divisor_list(params) else: - return np.prod([int(eval(replace_param_occurrences(s, params))) for s in divisor_list]) + return np.prod( + [int(eval(replace_param_occurrences(s, params))) for s in divisor_list] + ) - divisors = [get_dimension_divisor(d, block_size_names[i], params) for i, d in enumerate(grid_div)] - return tuple(int(np.ceil(float(current_problem_size[i]) / float(d))) for i, d in enumerate(divisors)) + divisors = [ + get_dimension_divisor(d, block_size_names[i], params) + for i, d in enumerate(grid_div) + ] + return tuple( + int(np.ceil(float(current_problem_size[i]) / float(d))) + for i, d in enumerate(divisors) + ) def get_instance_string(params): - """ combine the parameters to a string mostly used for debug output - use of OrderedDict is advised + """combine the parameters to a string mostly used for debug output + use of OrderedDict is advised """ return "_".join([str(i) for i in params.values()]) def get_kernel_string(kernel_source, params=None): - """ retrieve the kernel source and return as a string + """retrieve the kernel source and return as a string This function processes the passed kernel_source argument, which could be a function, a string with a filename, or just a string with code already. @@ -352,7 +427,7 @@ def get_kernel_string(kernel_source, params=None): :rtype: string """ # logging.debug('get_kernel_string called with %s', str(kernel_source)) - logging.debug('get_kernel_string called') + logging.debug("get_kernel_string called") kernel_string = None if callable(kernel_source): @@ -372,7 +447,7 @@ def get_problem_size(problem_size, params): if callable(problem_size): problem_size = problem_size(params) if isinstance(problem_size, (str, int, np.integer)): - problem_size = (problem_size, ) + problem_size = (problem_size,) current_problem_size = [1, 1, 1] for i, s in enumerate(problem_size): if isinstance(s, str): @@ -380,27 +455,31 @@ def get_problem_size(problem_size, params): elif isinstance(s, (int, np.integer)): current_problem_size[i] = s else: - raise TypeError("Error: problem_size should only contain strings or integers") + raise TypeError( + "Error: problem_size should only contain strings or integers" + ) return current_problem_size def get_smem_args(smem_args, params): - """ return a dict with kernel instance specific size """ + """return a dict with kernel instance specific size""" result = smem_args.copy() - if 'size' in result: - size = result['size'] + if "size" in result: + size = result["size"] if callable(size): size = size(params) elif isinstance(size, str): size = replace_param_occurrences(size, params) size = int(eval(size)) - result['size'] = size + result["size"] = size return result def get_temp_filename(suffix=None): - """ return a string in the form of temp_X, where X is a large integer """ - tmp_file = tempfile.mkstemp(suffix=suffix or "", prefix="temp_", dir=os.getcwd()) # or "" for Python 2 compatibility + """return a string in the form of temp_X, where X is a large integer""" + tmp_file = tempfile.mkstemp( + suffix=suffix or "", prefix="temp_", dir=os.getcwd() + ) # or "" for Python 2 compatibility os.close(tmp_file[0]) return tmp_file[1] @@ -417,7 +496,7 @@ def get_thread_block_dimensions(params, block_size_names=None): def get_total_timings(results, env, overhead_time): - """ Sum all timings and put their totals in the env """ + """Sum all timings and put their totals in the env""" total_framework_time = 0 total_strategy_time = 0 total_compile_time = 0 @@ -425,30 +504,47 @@ def get_total_timings(results, env, overhead_time): total_benchmark_time = 0 if results: for result in results: - if 'framework_time' not in result or 'strategy_time' not in result or 'compile_time' not in result or 'verification_time' not in result: - #warnings.warn("No detailed timings in results") + if ( + "framework_time" not in result + or "strategy_time" not in result + or "compile_time" not in result + or "verification_time" not in result + ): + # warnings.warn("No detailed timings in results") return env - total_framework_time += result['framework_time'] - total_strategy_time += result['strategy_time'] - total_compile_time += result['compile_time'] - total_verification_time += result['verification_time'] - total_benchmark_time += result['benchmark_time'] + total_framework_time += result["framework_time"] + total_strategy_time += result["strategy_time"] + total_compile_time += result["compile_time"] + total_verification_time += result["verification_time"] + total_benchmark_time += result["benchmark_time"] # add the seperate times to the environment dict - env['total_framework_time'] = total_framework_time - env['total_strategy_time'] = total_strategy_time - env['total_compile_time'] = total_compile_time - env['total_verification_time'] = total_verification_time - env['total_benchmark_time'] = total_benchmark_time - if 'simulated_time' in env: - overhead_time += env['simulated_time'] - env['overhead_time'] = overhead_time - (total_framework_time + total_strategy_time + total_compile_time + total_verification_time + total_benchmark_time) + env["total_framework_time"] = total_framework_time + env["total_strategy_time"] = total_strategy_time + env["total_compile_time"] = total_compile_time + env["total_verification_time"] = total_verification_time + env["total_benchmark_time"] = total_benchmark_time + if "simulated_time" in env: + overhead_time += env["simulated_time"] + env["overhead_time"] = overhead_time - ( + total_framework_time + + total_strategy_time + + total_compile_time + + total_verification_time + + total_benchmark_time + ) return env def print_config(config, tuning_options, runner): """print the configuration string with tunable parameters and benchmark results""" - print_config_output(tuning_options.tune_params, config, runner.quiet, tuning_options.metrics, runner.units) + print_config_output( + tuning_options.tune_params, + config, + runner.quiet, + tuning_options.metrics, + runner.units, + ) def print_config_output(tune_params, params, quiet, metrics, units): @@ -462,7 +558,7 @@ def print_config_output(tune_params, params, quiet, metrics, units): def process_metrics(params, metrics): - """ process user-defined metrics for derived benchmark results + """process user-defined metrics for derived benchmark results Metrics must be an OrderedDict to support composable metrics. The dictionary keys describe the name given to this user-defined metric and will be used as the key in the results dictionaries @@ -490,7 +586,9 @@ def process_metrics(params, metrics): """ if not isinstance(metrics, OrderedDict): - raise ValueError("metrics should be an OrderedDict to preserve order and support composability") + raise ValueError( + "metrics should be an OrderedDict to preserve order and support composability" + ) for k, v in metrics.items(): if isinstance(v, str): value = eval(replace_param_occurrences(v, params)) @@ -506,8 +604,8 @@ def process_metrics(params, metrics): def looks_like_a_filename(kernel_source): - """ attempt to detect whether source code or a filename was passed """ - logging.debug('looks_like_a_filename called') + """attempt to detect whether source code or a filename was passed""" + logging.debug("looks_like_a_filename called") result = False if isinstance(kernel_source, str): result = True @@ -524,12 +622,14 @@ def looks_like_a_filename(kernel_source): result = False # string must contain substring ".c", ".opencl", or ".F" result = result and any([s in kernel_source for s in (".c", ".opencl", ".F")]) - logging.debug('kernel_source is a filename: %s' % str(result)) + logging.debug("kernel_source is a filename: %s" % str(result)) return result -def prepare_kernel_string(kernel_name, kernel_string, params, grid, threads, block_size_names, lang, defines): - """ prepare kernel string for compilation +def prepare_kernel_string( + kernel_name, kernel_string, params, grid, threads, block_size_names, lang, defines +): + """prepare kernel string for compilation Prepends the kernel with a series of C preprocessor defines specific to this kernel instance: @@ -568,7 +668,7 @@ def prepare_kernel_string(kernel_name, kernel_string, params, grid, threads, blo :rtype: string """ - logging.debug('prepare_kernel_string called for %s', kernel_name) + logging.debug("prepare_kernel_string called for %s", kernel_name) kernel_prefix = "" @@ -611,7 +711,9 @@ def prepare_kernel_string(kernel_name, kernel_string, params, grid, threads, blo # in OpenCL this isn't the case and we can just insert "#define loop_unroll_factor N" # using 0 to disable specifying a loop unrolling factor for this loop if v == "0": - kernel_string = re.sub(r"\n\s*#pragma\s+unroll\s+" + k, "\n", kernel_string) # + r"[^\S]*" + kernel_string = re.sub( + r"\n\s*#pragma\s+unroll\s+" + k, "\n", kernel_string + ) # + r"[^\S]*" else: kernel_prefix += f"constexpr int {k} = {v};\n" else: @@ -629,18 +731,18 @@ def prepare_kernel_string(kernel_name, kernel_string, params, grid, threads, blo def read_file(filename): - """ return the contents of the file named filename or None if file not found """ + """return the contents of the file named filename or None if file not found""" if os.path.isfile(filename): - with open(filename, 'r') as f: + with open(filename, "r") as f: return f.read() def replace_param_occurrences(string, params): """replace occurrences of the tuning params with their current value""" - result = '' + result = "" # Split on tokens and replace a token if it is a key in `params`. - for part in re.split('([a-zA-Z0-9_]+)', string): + for part in re.split("([a-zA-Z0-9_]+)", string): if part in params: result += str(params[part]) else: @@ -661,10 +763,10 @@ def write_file(filename, string): """dump the contents of string to a file called filename""" # ugly fix, hopefully we can find a better one if sys.version_info[0] >= 3: - with open(filename, 'w', encoding="utf-8") as f: + with open(filename, "w", encoding="utf-8") as f: f.write(string) else: - with open(filename, 'w') as f: + with open(filename, "w") as f: f.write(string.encode("utf-8")) @@ -686,13 +788,13 @@ def has_kw_argument(func, name): if v is None: return None - if has_kw_argument(v, 'atol'): + if has_kw_argument(v, "atol"): return v return lambda answer, result_host, atol: v(answer, result_host) def parse_restrictions(restrictions: list, tune_params: dict): - """ parses restrictions from a list of strings into a compilable function """ + """parses restrictions from a list of strings into a compilable function""" # rewrite the restrictions so variables are singled out regex_match_variable = r"([a-zA-Z_$][a-zA-Z_$0-9]*)" @@ -704,7 +806,9 @@ def replace_params(match_object): else: return key - parsed = ") and (".join([re.sub(regex_match_variable, replace_params, res) for res in restrictions]) + parsed = ") and (".join( + [re.sub(regex_match_variable, replace_params, res) for res in restrictions] + ) # tidy up the code by removing the last suffix and unnecessary spaces parsed_restrictions = "(" + parsed.strip() + ")" @@ -716,17 +820,16 @@ def replace_params(match_object): def compile_restrictions(restrictions: list, tune_params: dict): - """ parses restrictions from a list of strings into a callable function """ + """parses restrictions from a list of strings into a callable function""" parsed_restrictions = parse_restrictions(restrictions, tune_params) # actually compile - code_object = compile(parsed_restrictions, '', 'exec') + code_object = compile(parsed_restrictions, "", "exec") func = FunctionType(code_object.co_consts[0], globals()) return func class NpEncoder(json.JSONEncoder): - def default(self, obj): if isinstance(obj, np.integer): return int(obj) @@ -763,23 +866,33 @@ def process_cache(cache, kernel_options, tuning_options, runner): """ # caching only works correctly if tunable_parameters are stored in a OrderedDict if not isinstance(tuning_options.tune_params, OrderedDict): - raise ValueError("Caching only works correctly when tunable parameters are stored in a OrderedDict") + raise ValueError( + "Caching only works correctly when tunable parameters are stored in a OrderedDict" + ) # if file does not exist, create new cache if not os.path.isfile(cache): if tuning_options.simulation_mode: - raise ValueError(f"Simulation mode requires an existing cachefile: file {cache} does not exist") + raise ValueError( + f"Simulation mode requires an existing cachefile: file {cache} does not exist" + ) c = OrderedDict() c["device_name"] = runner.dev.name c["kernel_name"] = kernel_options.kernel_name - c["problem_size"] = kernel_options.problem_size if not callable(kernel_options.problem_size) else "callable" + c["problem_size"] = ( + kernel_options.problem_size + if not callable(kernel_options.problem_size) + else "callable" + ) c["tune_params_keys"] = list(tuning_options.tune_params.keys()) c["tune_params"] = tuning_options.tune_params c["objective"] = tuning_options.objective c["cache"] = {} - contents = json.dumps(c, cls=NpEncoder, indent="")[:-3] # except the last "}\n}" + contents = json.dumps(c, cls=NpEncoder, indent="")[ + :-3 + ] # except the last "}\n}" # write the header to the cachefile with open(cache, "w") as cachefile: @@ -798,32 +911,54 @@ def process_cache(cache, kernel_options, tuning_options, runner): # check if it is safe to continue tuning from this cache if cached_data["device_name"] != runner.dev.name: - raise ValueError("Cannot load cache which contains results for different device") + raise ValueError( + "Cannot load cache which contains results for different device" + ) if cached_data["kernel_name"] != kernel_options.kernel_name: - raise ValueError("Cannot load cache which contains results for different kernel") + raise ValueError( + "Cannot load cache which contains results for different kernel" + ) if "problem_size" in cached_data and not callable(kernel_options.problem_size): # if problem_size is not iterable, compare directly if not hasattr(kernel_options.problem_size, "__iter__"): if cached_data["problem_size"] != kernel_options.problem_size: - raise ValueError("Cannot load cache which contains results for different problem_size") + raise ValueError( + "Cannot load cache which contains results for different problem_size" + ) # else (problem_size is iterable) # cache returns list, problem_size is likely a tuple. Therefore, the next check # checks the equality of all items in the list/tuples individually - elif not all([i == j for i, j in zip(cached_data["problem_size"], kernel_options.problem_size)]): - raise ValueError("Cannot load cache which contains results for different problem_size") + elif not all( + [ + i == j + for i, j in zip( + cached_data["problem_size"], kernel_options.problem_size + ) + ] + ): + raise ValueError( + "Cannot load cache which contains results for different problem_size" + ) if cached_data["tune_params_keys"] != list(tuning_options.tune_params.keys()): - if all(key in tuning_options.tune_params for key in cached_data["tune_params_keys"]): - raise ValueError(f"All tunable parameters are present, but the order is wrong. \ - Cache has order: {cached_data['tune_params_keys']}, tuning_options has: {list(tuning_options.tune_params.keys())}") - raise ValueError(f"Cannot load cache which contains results obtained with different tunable parameters. \ - Cache has: {cached_data['tune_params_keys']}, tuning_options has: {list(tuning_options.tune_params.keys())}") + if all( + key in tuning_options.tune_params + for key in cached_data["tune_params_keys"] + ): + raise ValueError( + f"All tunable parameters are present, but the order is wrong. \ + Cache has order: {cached_data['tune_params_keys']}, tuning_options has: {list(tuning_options.tune_params.keys())}" + ) + raise ValueError( + f"Cannot load cache which contains results obtained with different tunable parameters. \ + Cache has: {cached_data['tune_params_keys']}, tuning_options has: {list(tuning_options.tune_params.keys())}" + ) tuning_options.cachefile = cache tuning_options.cache = cached_data["cache"] def read_cache(cache, open_cache=True): - """ Read the cachefile into a dictionary, if open_cache=True prepare the cachefile for appending """ + """Read the cachefile into a dictionary, if open_cache=True prepare the cachefile for appending""" with open(cache, "r") as cachefile: filestr = cachefile.read().strip() @@ -842,7 +977,7 @@ def read_cache(cache, open_cache=True): error_configs = { "InvalidConfig": InvalidConfig(), "CompilationFailedConfig": CompilationFailedConfig(), - "RuntimeFailedConfig": RuntimeFailedConfig() + "RuntimeFailedConfig": RuntimeFailedConfig(), } # replace strings with ErrorConfig instances @@ -869,7 +1004,7 @@ def close_cache(cache): def store_cache(key, params, tuning_options): - """ stores a new entry (key, params) to the cachefile """ + """stores a new entry (key, params) to the cachefile""" # create converter for dumping numpy objects to JSON def JSONconverter(obj): @@ -881,7 +1016,7 @@ def JSONconverter(obj): return obj.tolist() return obj.__str__() - #logging.debug('store_cache called, cache=%s, cachefile=%s' % (tuning_options.cache, tuning_options.cachefile)) + # logging.debug('store_cache called, cache=%s, cachefile=%s' % (tuning_options.cache, tuning_options.cachefile)) if isinstance(tuning_options.cache, dict): if not key in tuning_options.cache: tuning_options.cache[key] = params @@ -894,21 +1029,25 @@ def JSONconverter(obj): if tuning_options.cachefile: with open(tuning_options.cachefile, "a") as cachefile: - cachefile.write("\n" + json.dumps({ key: output_params }, default=JSONconverter)[1:-1] + ",") + cachefile.write( + "\n" + + json.dumps({key: output_params}, default=JSONconverter)[1:-1] + + "," + ) def dump_cache(obj: str, tuning_options): - """ dumps a string in the cache, this omits the several checks of store_cache() to speed up the process - with great power comes great responsibility! """ + """dumps a string in the cache, this omits the several checks of store_cache() to speed up the process - with great power comes great responsibility!""" if isinstance(tuning_options.cache, dict) and tuning_options.cachefile: with open(tuning_options.cachefile, "a") as cachefile: cachefile.write(obj) class MaxProdConstraint(Constraint): - """ Constraint enforcing that values of given variables create a product up to a given amount """ + """Constraint enforcing that values of given variables create a product up to a given amount""" def __init__(self, maxprod): - """ Instantiate a MaxProdConstraint + """Instantiate a MaxProdConstraint :params maxprod: Value to be considered as the maximum product :type maxprod: number @@ -917,7 +1056,7 @@ def __init__(self, maxprod): self._maxprod = maxprod def preProcess(self, variables, domains, constraints, vconstraints): - """ """ + """ """ Constraint.preProcess(self, variables, domains, constraints, vconstraints) maxprod = self._maxprod for variable in variables: @@ -947,8 +1086,9 @@ def __call__(self, variables, domains, assignments, forwardcheck=False): return False return True + def cuda_error_check(error): - """ Checking the status of CUDA calls using the NVIDIA cuda-python backend """ + """Checking the status of CUDA calls using the NVIDIA cuda-python backend""" if isinstance(error, cuda.CUresult): if error != cuda.CUresult.CUDA_SUCCESS: _, name = cuda.cuGetErrorName(error) @@ -962,15 +1102,16 @@ def cuda_error_check(error): _, desc = nvrtc.nvrtcGetErrorString(error) raise RuntimeError(f"NVRTC error: {desc.decode()}") + def extract_directive_code(code: str) -> list: - """ Extract explicitly marked directive sections from code """ - + """Extract explicitly marked directive sections from code""" + start_string = "#pragma tuner start" end_string = "#pragma tuner stop" found_section = False sections = list() tmp_string = list() - + for line in code.split("\n"): if found_section: if end_string in line: @@ -984,3 +1125,13 @@ def extract_directive_code(code: str) -> list: found_section = True return sections + + +def extract_preprocessor(code: str) -> list: + preprocessor = list() + + for line in code.split("\n"): + if "#define" in line or "#include" in line: + preprocessor.append(line) + + return preprocessor diff --git a/test/test_util_functions.py b/test/test_util_functions.py index 3ec9dbe44..d26321a42 100644 --- a/test/test_util_functions.py +++ b/test/test_util_functions.py @@ -770,3 +770,45 @@ def test_extract_directive_code(): assert len(returns) == 2 assert expected_one in returns assert expected_two in returns + +def test_extract_preprocessor(): + code = """ + #include + + #define VECTOR_SIZE 65536 + + int main(void) { + int size = VECTOR_SIZE; + __restrict float * a = (float *) malloc(VECTOR_SIZE * sizeof(float)); + __restrict float * b = (float *) malloc(VECTOR_SIZE * sizeof(float)); + __restrict float * c = (float *) malloc(VECTOR_SIZE * sizeof(float)); + + #pragma tuner start + #pragma acc parallel + #pragma acc loop + for ( int i = 0; i < size; i++ ) { + a[i] = i; + b[i] = i + 1; + } + #pragma tuner stop + + #pragma tuner start + #pragma acc parallel + #pragma acc loop + for ( int i = 0; i < size; i++ ) { + c[i] = a[i] + b[i]; + } + #pragma tuner stop + + free(a); + free(b); + free(c); + } + """ + + expected = [" #include ", " #define VECTOR_SIZE 65536"] + results = extract_preprocessor(code) + + assert len(results) == 2 + for item in expected: + assert item in results From 0c332eadb454b9794bfcff66629f03cb05d38f70 Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Tue, 4 Apr 2023 16:04:31 +0200 Subject: [PATCH 10/93] Added function and relative test to wrap C++ timing around code. --- kernel_tuner/util.py | 8 ++++++++ test/test_util_functions.py | 10 ++++++++++ 2 files changed, 18 insertions(+) diff --git a/kernel_tuner/util.py b/kernel_tuner/util.py index 2ff506554..5e67c9478 100644 --- a/kernel_tuner/util.py +++ b/kernel_tuner/util.py @@ -1135,3 +1135,11 @@ def extract_preprocessor(code: str) -> list: preprocessor.append(line) return preprocessor + + +def wrap_cpp_timing(code: str) -> str: + start = "auto start = std::chrono::high_resolution_clock::now();" + end = "auto end = std::chrono::high_resolution_clock::now();" + sum = "auto elapsed_time = end - start;" + + return "\n".join([start, code, end, sum]) diff --git a/test/test_util_functions.py b/test/test_util_functions.py index d26321a42..ef0e678a2 100644 --- a/test/test_util_functions.py +++ b/test/test_util_functions.py @@ -771,6 +771,7 @@ def test_extract_directive_code(): assert expected_one in returns assert expected_two in returns + def test_extract_preprocessor(): code = """ #include @@ -812,3 +813,12 @@ def test_extract_preprocessor(): assert len(results) == 2 for item in expected: assert item in results + + +def test_wrap_cpp_timing(): + code = "for ( int i = 0; i < size; i++ ) {\nc[i] = a[i] + b[i];\n}" + wrapped = wrap_cpp_timing(code) + assert ( + wrapped + == "auto start = std::chrono::high_resolution_clock::now();\nfor ( int i = 0; i < size; i++ ) {\nc[i] = a[i] + b[i];\n}\nauto end = std::chrono::high_resolution_clock::now();\nauto elapsed_time = end - start;" + ) From bb9e06ecf86fd5e97973747809713b7319f73d8b Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Wed, 5 Apr 2023 10:53:18 +0200 Subject: [PATCH 11/93] Convert time to milliseconds to be consistent with other backends. --- kernel_tuner/util.py | 19 +++++++++++++++++-- test/test_util_functions.py | 2 +- 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/kernel_tuner/util.py b/kernel_tuner/util.py index 5e67c9478..bb52af616 100644 --- a/kernel_tuner/util.py +++ b/kernel_tuner/util.py @@ -1105,7 +1105,6 @@ def cuda_error_check(error): def extract_directive_code(code: str) -> list: """Extract explicitly marked directive sections from code""" - start_string = "#pragma tuner start" end_string = "#pragma tuner stop" found_section = False @@ -1127,7 +1126,21 @@ def extract_directive_code(code: str) -> list: return sections +def extract_directive_signature(code: str) -> list: + """Extract the user defined signature for directive sections""" + start_string = "#pragma tuner start" + signatures = list() + + for line in code.split("\n"): + if start_string in line: + # TODO: generate C++ signature from line + signatures.append(line) + + return signatures + + def extract_preprocessor(code: str) -> list: + """Extract include and define statements from C/C++ code""" preprocessor = list() for line in code.split("\n"): @@ -1138,8 +1151,10 @@ def extract_preprocessor(code: str) -> list: def wrap_cpp_timing(code: str) -> str: + """Wrap C++ timing code using std::chrono around C++ code""" start = "auto start = std::chrono::high_resolution_clock::now();" end = "auto end = std::chrono::high_resolution_clock::now();" sum = "auto elapsed_time = end - start;" + ret = "return static_cast(elapsed_time.count() * 1e3);" - return "\n".join([start, code, end, sum]) + return "\n".join([start, code, end, sum, ret]) diff --git a/test/test_util_functions.py b/test/test_util_functions.py index ef0e678a2..f481dc973 100644 --- a/test/test_util_functions.py +++ b/test/test_util_functions.py @@ -820,5 +820,5 @@ def test_wrap_cpp_timing(): wrapped = wrap_cpp_timing(code) assert ( wrapped - == "auto start = std::chrono::high_resolution_clock::now();\nfor ( int i = 0; i < size; i++ ) {\nc[i] = a[i] + b[i];\n}\nauto end = std::chrono::high_resolution_clock::now();\nauto elapsed_time = end - start;" + == "auto start = std::chrono::high_resolution_clock::now();\nfor ( int i = 0; i < size; i++ ) {\nc[i] = a[i] + b[i];\n}\nauto end = std::chrono::high_resolution_clock::now();\nauto elapsed_time = end - start;\nreturn static_cast(elapsed_time.count() * 1e3);" ) From f0235f87e6b6e741946b06e9055106a56051441c Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Wed, 5 Apr 2023 11:37:47 +0200 Subject: [PATCH 12/93] Added a function to extract signature of parallel sections, plus a test for it. --- kernel_tuner/util.py | 15 +++++++++++++-- test/test_util_functions.py | 8 ++++++++ 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/kernel_tuner/util.py b/kernel_tuner/util.py index bb52af616..bd73f0f2f 100644 --- a/kernel_tuner/util.py +++ b/kernel_tuner/util.py @@ -1133,8 +1133,19 @@ def extract_directive_signature(code: str) -> list: for line in code.split("\n"): if start_string in line: - # TODO: generate C++ signature from line - signatures.append(line) + tmp_string = line.split(" ") + name = tmp_string[3] + tmp_string = tmp_string[4:] + params = list() + for param in tmp_string: + p_name = param.split("(")[0] + param = param.replace(p_name, "", 1) + p_type = param[1:-1] + p_type = p_type.split(":")[0] + if "*" in p_type: + p_type = p_type.replace("*", " *") + params.append(f"{p_type} {p_name}") + signatures.append(f"float {name}({', '.join(params)})") return signatures diff --git a/test/test_util_functions.py b/test/test_util_functions.py index f481dc973..d52095597 100644 --- a/test/test_util_functions.py +++ b/test/test_util_functions.py @@ -822,3 +822,11 @@ def test_wrap_cpp_timing(): wrapped == "auto start = std::chrono::high_resolution_clock::now();\nfor ( int i = 0; i < size; i++ ) {\nc[i] = a[i] + b[i];\n}\nauto end = std::chrono::high_resolution_clock::now();\nauto elapsed_time = end - start;\nreturn static_cast(elapsed_time.count() * 1e3);" ) + + +def test_extract_directive_signature(): + code = "#pragma tuner start vector_add a(float*:VECTOR_SIZE) b(float*:VECTOR_SIZE) c(float*:VECTOR_SIZE) size(int:VECTOR_SIZE)" + signatures = extract_directive_signature(code) + + assert len(signatures) == 1 + assert "float vector_add(float * a, float * b, float * c, int size)" in signatures From 2f2beb0c49702a2f6e10df1da43de36d3b26099f Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Wed, 5 Apr 2023 13:47:58 +0200 Subject: [PATCH 13/93] Added function to extract the data used by directives, and relative test. --- kernel_tuner/util.py | 21 +++++++++++++++++++++ test/test_util_functions.py | 11 +++++++++++ 2 files changed, 32 insertions(+) diff --git a/kernel_tuner/util.py b/kernel_tuner/util.py index bd73f0f2f..d3223a038 100644 --- a/kernel_tuner/util.py +++ b/kernel_tuner/util.py @@ -1150,6 +1150,27 @@ def extract_directive_signature(code: str) -> list: return signatures +def extract_directive_data(code: str) -> dict: + """Extract the data used in the directive section""" + start_string = "#pragma tuner start" + data = dict() + + for line in code.split("\n"): + if start_string in line: + name = line.split(" ")[3] + data[name] = dict() + tmp_string = line.split(" ")[4:] + for param in tmp_string: + p_name = param.split("(")[0] + param = param.replace(p_name, "", 1) + param = param[1:-1] + p_type = param.split(":")[0] + p_size = param.split(":")[1] + data[name][p_name] = [p_type, p_size] + + return data + + def extract_preprocessor(code: str) -> list: """Extract include and define statements from C/C++ code""" preprocessor = list() diff --git a/test/test_util_functions.py b/test/test_util_functions.py index d52095597..4b0b86841 100644 --- a/test/test_util_functions.py +++ b/test/test_util_functions.py @@ -830,3 +830,14 @@ def test_extract_directive_signature(): assert len(signatures) == 1 assert "float vector_add(float * a, float * b, float * c, int size)" in signatures + + +def test_extract_directive_data(): + code = "#pragma tuner start vector_add a(float*:VECTOR_SIZE) b(float*:VECTOR_SIZE) c(float*:VECTOR_SIZE) size(int:VECTOR_SIZE)" + data = extract_directive_data(code) + + assert len(data) == 1 + assert len(data["vector_add"]) == 4 + assert "float*" in data["vector_add"]["b"] + assert "int" not in data["vector_add"]["c"] + assert "VECTOR_SIZE" in data["vector_add"]["size"] From 6d60d177354f2bbade6d5ef6d0668fc14bd3a5a1 Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Wed, 5 Apr 2023 15:23:40 +0200 Subject: [PATCH 14/93] Directive functions can now be used to only extract information for one directive, not all of them. --- kernel_tuner/util.py | 57 +++++++++++++++++++------------------ test/test_util_functions.py | 16 ++++++----- 2 files changed, 39 insertions(+), 34 deletions(-) diff --git a/kernel_tuner/util.py b/kernel_tuner/util.py index d3223a038..fe77e9087 100644 --- a/kernel_tuner/util.py +++ b/kernel_tuner/util.py @@ -1103,7 +1103,7 @@ def cuda_error_check(error): raise RuntimeError(f"NVRTC error: {desc.decode()}") -def extract_directive_code(code: str) -> list: +def extract_directive_code(code: str, kernel_name: str = None) -> list: """Extract explicitly marked directive sections from code""" start_string = "#pragma tuner start" end_string = "#pragma tuner stop" @@ -1121,52 +1121,55 @@ def extract_directive_code(code: str) -> list: tmp_string.append(line) else: if start_string in line: - found_section = True + if kernel_name is None or f" {kernel_name} " in line: + found_section = True return sections -def extract_directive_signature(code: str) -> list: +def extract_directive_signature(code: str, kernel_name: str = None) -> list: """Extract the user defined signature for directive sections""" start_string = "#pragma tuner start" signatures = list() for line in code.split("\n"): if start_string in line: - tmp_string = line.split(" ") - name = tmp_string[3] - tmp_string = tmp_string[4:] - params = list() - for param in tmp_string: - p_name = param.split("(")[0] - param = param.replace(p_name, "", 1) - p_type = param[1:-1] - p_type = p_type.split(":")[0] - if "*" in p_type: - p_type = p_type.replace("*", " *") - params.append(f"{p_type} {p_name}") - signatures.append(f"float {name}({', '.join(params)})") + if kernel_name is None or f" {kernel_name} " in line: + tmp_string = line.split(" ") + name = tmp_string[3] + tmp_string = tmp_string[4:] + params = list() + for param in tmp_string: + p_name = param.split("(")[0] + param = param.replace(p_name, "", 1) + p_type = param[1:-1] + p_type = p_type.split(":")[0] + if "*" in p_type: + p_type = p_type.replace("*", " *") + params.append(f"{p_type} {p_name}") + signatures.append(f"float {name}({', '.join(params)})") return signatures -def extract_directive_data(code: str) -> dict: +def extract_directive_data(code: str, kernel_name: str = None) -> dict: """Extract the data used in the directive section""" start_string = "#pragma tuner start" data = dict() for line in code.split("\n"): if start_string in line: - name = line.split(" ")[3] - data[name] = dict() - tmp_string = line.split(" ")[4:] - for param in tmp_string: - p_name = param.split("(")[0] - param = param.replace(p_name, "", 1) - param = param[1:-1] - p_type = param.split(":")[0] - p_size = param.split(":")[1] - data[name][p_name] = [p_type, p_size] + if kernel_name is None or f" {kernel_name} " in line: + name = line.split(" ")[3] + data[name] = dict() + tmp_string = line.split(" ")[4:] + for param in tmp_string: + p_name = param.split("(")[0] + param = param.replace(p_name, "", 1) + param = param[1:-1] + p_type = param.split(":")[0] + p_size = param.split(":")[1] + data[name][p_name] = [p_type, p_size] return data diff --git a/test/test_util_functions.py b/test/test_util_functions.py index 4b0b86841..b9035ee5f 100644 --- a/test/test_util_functions.py +++ b/test/test_util_functions.py @@ -752,7 +752,6 @@ def test_extract_directive_code(): free(c); } """ - expected_one = """ #pragma acc parallel #pragma acc loop for ( int i = 0; i < size; i++ ) { @@ -764,12 +763,12 @@ def test_extract_directive_code(): for ( int i = 0; i < size; i++ ) { c[i] = a[i] + b[i]; }""" - returns = extract_directive_code(code) - assert len(returns) == 2 assert expected_one in returns assert expected_two in returns + returns = extract_directive_code(code, "vector_add") + assert len(returns) == 0 def test_extract_preprocessor(): @@ -806,10 +805,8 @@ def test_extract_preprocessor(): free(c); } """ - expected = [" #include ", " #define VECTOR_SIZE 65536"] results = extract_preprocessor(code) - assert len(results) == 2 for item in expected: assert item in results @@ -827,17 +824,22 @@ def test_wrap_cpp_timing(): def test_extract_directive_signature(): code = "#pragma tuner start vector_add a(float*:VECTOR_SIZE) b(float*:VECTOR_SIZE) c(float*:VECTOR_SIZE) size(int:VECTOR_SIZE)" signatures = extract_directive_signature(code) - assert len(signatures) == 1 assert "float vector_add(float * a, float * b, float * c, int size)" in signatures + signatures = extract_directive_signature(code, "vector_add") + assert len(signatures) == 1 + assert "float vector_add(float * a, float * b, float * c, int size)" in signatures + signatures = extract_directive_signature(code, "vector_add_ext") + assert len(signatures) == 0 def test_extract_directive_data(): code = "#pragma tuner start vector_add a(float*:VECTOR_SIZE) b(float*:VECTOR_SIZE) c(float*:VECTOR_SIZE) size(int:VECTOR_SIZE)" data = extract_directive_data(code) - assert len(data) == 1 assert len(data["vector_add"]) == 4 assert "float*" in data["vector_add"]["b"] assert "int" not in data["vector_add"]["c"] assert "VECTOR_SIZE" in data["vector_add"]["size"] + data = extract_directive_data(code, "vector_add_double") + assert len(data) == 0 From dc0ea6cd4d808d6fa99478e6d486890b13100ed4 Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Thu, 6 Apr 2023 11:31:48 +0200 Subject: [PATCH 15/93] All functions now return a dict indexed by kernel name + bug fixing. --- kernel_tuner/util.py | 23 +++++++++++++---------- test/test_util_functions.py | 23 +++++++++++++++-------- 2 files changed, 28 insertions(+), 18 deletions(-) diff --git a/kernel_tuner/util.py b/kernel_tuner/util.py index fe77e9087..6defc0d79 100644 --- a/kernel_tuner/util.py +++ b/kernel_tuner/util.py @@ -1103,39 +1103,42 @@ def cuda_error_check(error): raise RuntimeError(f"NVRTC error: {desc.decode()}") -def extract_directive_code(code: str, kernel_name: str = None) -> list: +def extract_directive_code(code: str, kernel_name: str = None) -> dict: """Extract explicitly marked directive sections from code""" start_string = "#pragma tuner start" end_string = "#pragma tuner stop" found_section = False - sections = list() + sections = dict() tmp_string = list() + name = "" for line in code.split("\n"): if found_section: if end_string in line: found_section = False - sections.append("\n".join(tmp_string)) + sections[name] = "\n".join(tmp_string) tmp_string = list() + name = "" else: tmp_string.append(line) else: if start_string in line: if kernel_name is None or f" {kernel_name} " in line: found_section = True + name = line.strip().split(" ")[3] return sections -def extract_directive_signature(code: str, kernel_name: str = None) -> list: +def extract_directive_signature(code: str, kernel_name: str = None) -> dict: """Extract the user defined signature for directive sections""" start_string = "#pragma tuner start" - signatures = list() + signatures = dict() for line in code.split("\n"): if start_string in line: if kernel_name is None or f" {kernel_name} " in line: - tmp_string = line.split(" ") + tmp_string = line.strip().split(" ") name = tmp_string[3] tmp_string = tmp_string[4:] params = list() @@ -1147,7 +1150,7 @@ def extract_directive_signature(code: str, kernel_name: str = None) -> list: if "*" in p_type: p_type = p_type.replace("*", " *") params.append(f"{p_type} {p_name}") - signatures.append(f"float {name}({', '.join(params)})") + signatures[name] = f"float {name}({', '.join(params)})" return signatures @@ -1160,9 +1163,9 @@ def extract_directive_data(code: str, kernel_name: str = None) -> dict: for line in code.split("\n"): if start_string in line: if kernel_name is None or f" {kernel_name} " in line: - name = line.split(" ")[3] + name = line.strip().split(" ")[3] data[name] = dict() - tmp_string = line.split(" ")[4:] + tmp_string = line.strip().split(" ")[4:] for param in tmp_string: p_name = param.split("(")[0] param = param.replace(p_name, "", 1) @@ -1186,7 +1189,7 @@ def extract_preprocessor(code: str) -> list: def wrap_cpp_timing(code: str) -> str: - """Wrap C++ timing code using std::chrono around C++ code""" + """Wrap C++ timing code (using std::chrono) around the provided code""" start = "auto start = std::chrono::high_resolution_clock::now();" end = "auto end = std::chrono::high_resolution_clock::now();" sum = "auto elapsed_time = end - start;" diff --git a/test/test_util_functions.py b/test/test_util_functions.py index b9035ee5f..ef33f9c03 100644 --- a/test/test_util_functions.py +++ b/test/test_util_functions.py @@ -730,7 +730,7 @@ def test_extract_directive_code(): __restrict float * b = (float *) malloc(VECTOR_SIZE * sizeof(float)); __restrict float * c = (float *) malloc(VECTOR_SIZE * sizeof(float)); - #pragma tuner start + #pragma tuner start initialize #pragma acc parallel #pragma acc loop for ( int i = 0; i < size; i++ ) { @@ -739,7 +739,7 @@ def test_extract_directive_code(): } #pragma tuner stop - #pragma tuner start + #pragma tuner start vector_add #pragma acc parallel #pragma acc loop for ( int i = 0; i < size; i++ ) { @@ -765,9 +765,10 @@ def test_extract_directive_code(): }""" returns = extract_directive_code(code) assert len(returns) == 2 - assert expected_one in returns - assert expected_two in returns - returns = extract_directive_code(code, "vector_add") + assert expected_one in returns["initialize"] + assert expected_two in returns["vector_add"] + assert expected_one not in returns["vector_add"] + returns = extract_directive_code(code, "vector_a") assert len(returns) == 0 @@ -822,13 +823,19 @@ def test_wrap_cpp_timing(): def test_extract_directive_signature(): - code = "#pragma tuner start vector_add a(float*:VECTOR_SIZE) b(float*:VECTOR_SIZE) c(float*:VECTOR_SIZE) size(int:VECTOR_SIZE)" + code = "#pragma tuner start vector_add a(float*:VECTOR_SIZE) b(float*:VECTOR_SIZE) c(float*:VECTOR_SIZE) size(int:VECTOR_SIZE) " signatures = extract_directive_signature(code) assert len(signatures) == 1 - assert "float vector_add(float * a, float * b, float * c, int size)" in signatures + assert ( + "float vector_add(float * a, float * b, float * c, int size)" + in signatures["vector_add"] + ) signatures = extract_directive_signature(code, "vector_add") assert len(signatures) == 1 - assert "float vector_add(float * a, float * b, float * c, int size)" in signatures + assert ( + "float vector_add(float * a, float * b, float * c, int size)" + in signatures["vector_add"] + ) signatures = extract_directive_signature(code, "vector_add_ext") assert len(signatures) == 0 From a9f204ea9df52c39a5039af7ac8432ce1e394e27 Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Thu, 6 Apr 2023 11:42:24 +0200 Subject: [PATCH 16/93] Draft implementation for compile. --- kernel_tuner/backends/openacc.py | 46 +++++++++++++++++++++++++++++++- 1 file changed, 45 insertions(+), 1 deletion(-) diff --git a/kernel_tuner/backends/openacc.py b/kernel_tuner/backends/openacc.py index 140dab8f8..354d1a8bd 100644 --- a/kernel_tuner/backends/openacc.py +++ b/kernel_tuner/backends/openacc.py @@ -2,9 +2,13 @@ import subprocess import platform +import ctypes as C +import _ctypes +import numpy as np from kernel_tuner.backends.backend import CompilerBackend from kernel_tuner.observers.c import CRuntimeObserver +from kernel_tuner.util import get_temp_filename, write_file, delete_temp_file class OpenACCFunctions(CompilerBackend): @@ -14,6 +18,7 @@ def __init__(self, iterations=7, compiler_options=None, compiler="nvc++"): self.iterations = iterations # if no compiler is specified, use nvc++ by default self.compiler = compiler + self.lib = None self.observers = [CRuntimeObserver(self)] cc_version = str(subprocess.check_output([self.compiler, "--version"])) @@ -33,7 +38,42 @@ def ready_argument_list(self, arguments): def compile(self, kernel_instance): """This method must implement the compilation of a kernel into a callable function.""" - raise NotImplementedError("OpenACC backend does not support this feature") + if self.lib is not None: + self.cleanup_lib() + compiler_options = ["-fPIC -fast -acc=gpu"] + if self.compiler_options: + compiler_options += self.compiler_options + source_file = get_temp_filename(suffix=".cpp") + filename = ".".join(source_file.split(".")[:-1]) + try: + write_file(source_file, kernel_instance.kernel_string) + + lib_extension = ".so" + if platform.system() == "Darwin": + lib_extension = ".dylib" + + subprocess.check_call( + [self.compiler, "-c", source_file] + + compiler_options + + ["-o", filename + ".o"] + ) + subprocess.check_call( + [self.compiler, filename + ".o"] + + compiler_options + + ["-shared", "-o", filename + lib_extension] + ) + + self.lib = np.ctypeslib.load_library(filename, ".") + func = getattr(self.lib, kernel_instance.kernel_name) + func.restype = C.c_float + + finally: + delete_temp_file(source_file) + delete_temp_file(filename + ".o") + delete_temp_file(filename + ".so") + delete_temp_file(filename + ".dylib") + + return func def start_event(self): """This method must implement the recording of the start of a measurement.""" @@ -66,3 +106,7 @@ def memcpy_dtoh(self, dest, src): def memcpy_htod(self, dest, src): """This method must implement a host to device copy.""" raise NotImplementedError("OpenACC backend does not support this feature") + + def cleanup_lib(self): + """Unload the previously loaded shared library""" + _ctypes.dlclose(self.lib._handle) From 775bb51179c3f838b0e90b3ed76c1cf5bb5ddc27 Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Thu, 6 Apr 2023 12:53:26 +0200 Subject: [PATCH 17/93] Add draft test with tunable OpenACC code. --- examples/c/vector_add_openacc.py | 58 ++++++++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) create mode 100644 examples/c/vector_add_openacc.py diff --git a/examples/c/vector_add_openacc.py b/examples/c/vector_add_openacc.py new file mode 100644 index 000000000..31b60d7f7 --- /dev/null +++ b/examples/c/vector_add_openacc.py @@ -0,0 +1,58 @@ +#!/usr/bin/env python +"""This is a simple example for tuning C++ OpenACC code with the kernel tuner""" + +import numpy +from kernel_tuner import tune_kernel +from kernel_tuner.util import extract_directive_signature, extract_directive_code, extract_preprocessor, wrap_cpp_timing +from collections import OrderedDict + +code = """ +#include + +#define VECTOR_SIZE 65536 + +int main(void) { + int size = VECTOR_SIZE; + float * a = (float *) malloc(VECTOR_SIZE * sizeof(float)); + float * b = (float *) malloc(VECTOR_SIZE * sizeof(float)); + float * c = (float *) malloc(VECTOR_SIZE * sizeof(float)); + + #pragma tuner start vector_add a(float:VECTOR_SIZE) b(float:VECTOR_SIZE) c(float:VECTOR_SIZE) size(int:VECTOR_SIZE) + #pragma acc parallel num_gangs(ngangs) vector_length(nthreads) + #pragma acc loop + for ( int i = 0; i < size; i++ ) { + c[i] = a[i] + b[i]; + } + #pragma tuner stop + + free(a); + free(b); + free(c); +} +""" + +# Extract tunable directive and generate kernel_string +preprocessor = extract_preprocessor(code) +kernel_string = "\n".join(preprocessor) + "\n" +directive_signatures = extract_directive_signature(code, kernel_name="vector_add") +kernel_string += directive_signatures["vector_add"] + "{\n" +directive_codes = extract_directive_code(code, kernel_name="vector_add") +kernel_string += directive_codes["vector_add"] + "\n}" + +size = 65536 + +a = numpy.random.randn(size).astype(numpy.float32) +b = numpy.random.randn(size).astype(numpy.float32) +c = numpy.zeros_like(b) +n = numpy.int32(size) + +args = [c, a, b, n] + +tune_params = OrderedDict() +tune_params["ngangs"] = [2**i for i in range(0, 11)] +tune_params["nthreads"] = [2**i for i in range(0, 11)] + +answer = [a+b, None, None, None] + +tune_kernel("vector_add", kernel_string, size, args, tune_params, + answer=answer, compiler_options=['-O3']) From e4900ca272bc182e365904aaf51f2a67b46d2663 Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Thu, 6 Apr 2023 12:56:07 +0200 Subject: [PATCH 18/93] Set compiler. --- examples/c/vector_add_openacc.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/c/vector_add_openacc.py b/examples/c/vector_add_openacc.py index 31b60d7f7..5bf294080 100644 --- a/examples/c/vector_add_openacc.py +++ b/examples/c/vector_add_openacc.py @@ -49,10 +49,10 @@ args = [c, a, b, n] tune_params = OrderedDict() -tune_params["ngangs"] = [2**i for i in range(0, 11)] +tune_params["ngangs"] = [2**i for i in range(0, 21)] tune_params["nthreads"] = [2**i for i in range(0, 11)] answer = [a+b, None, None, None] tune_kernel("vector_add", kernel_string, size, args, tune_params, - answer=answer, compiler_options=['-O3']) + answer=answer, compiler_options=["-fPIC -fast -acc=gpu"], compiler="nvc++") From 329ce0acaf132f97c68ae7497becb39465c2aded Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Thu, 6 Apr 2023 14:35:54 +0200 Subject: [PATCH 19/93] Fix timing. --- kernel_tuner/util.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel_tuner/util.py b/kernel_tuner/util.py index 6defc0d79..e08d5d4af 100644 --- a/kernel_tuner/util.py +++ b/kernel_tuner/util.py @@ -1192,7 +1192,7 @@ def wrap_cpp_timing(code: str) -> str: """Wrap C++ timing code (using std::chrono) around the provided code""" start = "auto start = std::chrono::high_resolution_clock::now();" end = "auto end = std::chrono::high_resolution_clock::now();" - sum = "auto elapsed_time = end - start;" - ret = "return static_cast(elapsed_time.count() * 1e3);" + sum = "std::chrono::duration elapsed_time = end - start;" + ret = "return static_cast(elapsed_time.count());" return "\n".join([start, code, end, sum, ret]) From 11a2a737b53b2d4d7fe9ac49749c9a4bbed09349 Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Thu, 6 Apr 2023 16:46:21 +0200 Subject: [PATCH 20/93] Mark pointers as restricted in the signature. --- kernel_tuner/util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel_tuner/util.py b/kernel_tuner/util.py index e08d5d4af..de489edb4 100644 --- a/kernel_tuner/util.py +++ b/kernel_tuner/util.py @@ -1148,7 +1148,7 @@ def extract_directive_signature(code: str, kernel_name: str = None) -> dict: p_type = param[1:-1] p_type = p_type.split(":")[0] if "*" in p_type: - p_type = p_type.replace("*", " *") + p_type = p_type.replace("*", " * restrict") params.append(f"{p_type} {p_name}") signatures[name] = f"float {name}({', '.join(params)})" From 917dbbb6ac6d22ff8cce5b8242caf220c7c16094 Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Thu, 6 Apr 2023 16:51:16 +0200 Subject: [PATCH 21/93] Measure milliseconds with floats. --- kernel_tuner/util.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel_tuner/util.py b/kernel_tuner/util.py index de489edb4..2bf69ad59 100644 --- a/kernel_tuner/util.py +++ b/kernel_tuner/util.py @@ -1192,7 +1192,7 @@ def wrap_cpp_timing(code: str) -> str: """Wrap C++ timing code (using std::chrono) around the provided code""" start = "auto start = std::chrono::high_resolution_clock::now();" end = "auto end = std::chrono::high_resolution_clock::now();" - sum = "std::chrono::duration elapsed_time = end - start;" - ret = "return static_cast(elapsed_time.count());" + sum = "std::chrono::duration elapsed_time = end - start;" + ret = "return elapsed_time.count();" return "\n".join([start, code, end, sum, ret]) From 93414c93e3b1b180137912e78a097440bfcf9be9 Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Fri, 7 Apr 2023 09:24:33 +0200 Subject: [PATCH 22/93] Fix some bugs in the example. --- examples/c/vector_add_openacc.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/c/vector_add_openacc.py b/examples/c/vector_add_openacc.py index 5bf294080..ac645f73c 100644 --- a/examples/c/vector_add_openacc.py +++ b/examples/c/vector_add_openacc.py @@ -17,7 +17,7 @@ float * b = (float *) malloc(VECTOR_SIZE * sizeof(float)); float * c = (float *) malloc(VECTOR_SIZE * sizeof(float)); - #pragma tuner start vector_add a(float:VECTOR_SIZE) b(float:VECTOR_SIZE) c(float:VECTOR_SIZE) size(int:VECTOR_SIZE) + #pragma tuner start vector_add a(float*:VECTOR_SIZE) b(float*:VECTOR_SIZE) c(float*:VECTOR_SIZE) size(int:VECTOR_SIZE) #pragma acc parallel num_gangs(ngangs) vector_length(nthreads) #pragma acc loop for ( int i = 0; i < size; i++ ) { @@ -33,11 +33,11 @@ # Extract tunable directive and generate kernel_string preprocessor = extract_preprocessor(code) -kernel_string = "\n".join(preprocessor) + "\n" +kernel_string = "\n".join(preprocessor) + "\n#include \n#include \n" directive_signatures = extract_directive_signature(code, kernel_name="vector_add") kernel_string += directive_signatures["vector_add"] + "{\n" directive_codes = extract_directive_code(code, kernel_name="vector_add") -kernel_string += directive_codes["vector_add"] + "\n}" +kernel_string += wrap_cpp_timing(directive_codes["vector_add"]) + "\n}" size = 65536 @@ -55,4 +55,4 @@ answer = [a+b, None, None, None] tune_kernel("vector_add", kernel_string, size, args, tune_params, - answer=answer, compiler_options=["-fPIC -fast -acc=gpu"], compiler="nvc++") + answer=answer, compiler_options=["-fast", "-acc=gpu"], compiler="nvc++") From ea0fefd4a1f31fd05f836592b8bbf23c1ecb1bba Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Fri, 7 Apr 2023 13:00:31 +0200 Subject: [PATCH 23/93] OpenACC backend unnecessary, decided to improve the C backend. --- kernel_tuner/backends/openacc.py | 112 ------------------------------- test/test_backend.py | 8 +-- 2 files changed, 1 insertion(+), 119 deletions(-) delete mode 100644 kernel_tuner/backends/openacc.py diff --git a/kernel_tuner/backends/openacc.py b/kernel_tuner/backends/openacc.py deleted file mode 100644 index 354d1a8bd..000000000 --- a/kernel_tuner/backends/openacc.py +++ /dev/null @@ -1,112 +0,0 @@ -""" This module contains the functionality for running and compiling OpenACC sections """ - -import subprocess -import platform -import ctypes as C -import _ctypes -import numpy as np - -from kernel_tuner.backends.backend import CompilerBackend -from kernel_tuner.observers.c import CRuntimeObserver -from kernel_tuner.util import get_temp_filename, write_file, delete_temp_file - - -class OpenACCFunctions(CompilerBackend): - """Class that groups the code for running and compiling OpenaCC functions in C++.""" - - def __init__(self, iterations=7, compiler_options=None, compiler="nvc++"): - self.iterations = iterations - # if no compiler is specified, use nvc++ by default - self.compiler = compiler - self.lib = None - self.observers = [CRuntimeObserver(self)] - - cc_version = str(subprocess.check_output([self.compiler, "--version"])) - cc_version = cc_version.splitlines()[0].split(" ")[1] - - # environment info - env = dict() - env["CC Version"] = cc_version - env["iterations"] = self.iterations - env["compiler_options"] = compiler_options - self.env = env - self.name = platform.processor() - - def ready_argument_list(self, arguments): - """This method must implement the allocation of the arguments on device memory.""" - raise NotImplementedError("OpenACC backend does not support this feature") - - def compile(self, kernel_instance): - """This method must implement the compilation of a kernel into a callable function.""" - if self.lib is not None: - self.cleanup_lib() - compiler_options = ["-fPIC -fast -acc=gpu"] - if self.compiler_options: - compiler_options += self.compiler_options - source_file = get_temp_filename(suffix=".cpp") - filename = ".".join(source_file.split(".")[:-1]) - try: - write_file(source_file, kernel_instance.kernel_string) - - lib_extension = ".so" - if platform.system() == "Darwin": - lib_extension = ".dylib" - - subprocess.check_call( - [self.compiler, "-c", source_file] - + compiler_options - + ["-o", filename + ".o"] - ) - subprocess.check_call( - [self.compiler, filename + ".o"] - + compiler_options - + ["-shared", "-o", filename + lib_extension] - ) - - self.lib = np.ctypeslib.load_library(filename, ".") - func = getattr(self.lib, kernel_instance.kernel_name) - func.restype = C.c_float - - finally: - delete_temp_file(source_file) - delete_temp_file(filename + ".o") - delete_temp_file(filename + ".so") - delete_temp_file(filename + ".dylib") - - return func - - def start_event(self): - """This method must implement the recording of the start of a measurement.""" - raise NotImplementedError("OpenACC backend does not support this feature") - - def stop_event(self): - """This method must implement the recording of the end of a measurement.""" - raise NotImplementedError("OpenACC backend does not support this feature") - - def kernel_finished(self): - """This method must implement a check that returns True if the kernel has finished, False otherwise.""" - raise NotImplementedError("OpenACC backend does not support this feature") - - def synchronize(self): - """This method must implement a barrier that halts execution until device has finished its tasks.""" - raise NotImplementedError("OpenACC backend does not support this feature") - - def run_kernel(self, func, gpu_args, threads, grid, stream): - """This method must implement the execution of the kernel on the device.""" - raise NotImplementedError("OpenACC backend does not support this feature") - - def memset(self, allocation, value, size): - """This method must implement setting the memory to a value on the device.""" - raise NotImplementedError("OpenACC backend does not support this feature") - - def memcpy_dtoh(self, dest, src): - """This method must implement a device to host copy.""" - raise NotImplementedError("OpenACC backend does not support this feature") - - def memcpy_htod(self, dest, src): - """This method must implement a host to device copy.""" - raise NotImplementedError("OpenACC backend does not support this feature") - - def cleanup_lib(self): - """Unload the previously loaded shared library""" - _ctypes.dlclose(self.lib._handle) diff --git a/test/test_backend.py b/test/test_backend.py index 397f76eda..4a0e871c2 100644 --- a/test/test_backend.py +++ b/test/test_backend.py @@ -4,9 +4,8 @@ skip_if_no_cuda, skip_if_no_opencl, skip_if_no_pycuda, - skip_if_no_openacc, ) -from kernel_tuner.backends import backend, c, cupy, nvcuda, opencl, pycuda, openacc +from kernel_tuner.backends import backend, c, cupy, nvcuda, opencl, pycuda class WrongBackend(backend.Backend): @@ -46,8 +45,3 @@ def test_opencl_backend(): @skip_if_no_pycuda def test_pycuda_backend(): dev = pycuda.PyCudaFunctions() - - -@skip_if_no_openacc -def test_openacc_backend(): - dev = openacc.OpenACCFunctions() From 791d789afd639c71d109dd30896c177e87935fda Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Fri, 7 Apr 2023 13:10:04 +0200 Subject: [PATCH 24/93] Fixed tests and improved timing code. --- kernel_tuner/util.py | 4 ++-- test/test_util_functions.py | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/kernel_tuner/util.py b/kernel_tuner/util.py index 2bf69ad59..18e45ced9 100644 --- a/kernel_tuner/util.py +++ b/kernel_tuner/util.py @@ -1190,8 +1190,8 @@ def extract_preprocessor(code: str) -> list: def wrap_cpp_timing(code: str) -> str: """Wrap C++ timing code (using std::chrono) around the provided code""" - start = "auto start = std::chrono::high_resolution_clock::now();" - end = "auto end = std::chrono::high_resolution_clock::now();" + start = "auto start = std::chrono::steady_clock::now();" + end = "auto end = std::chrono::steady_clock::now();" sum = "std::chrono::duration elapsed_time = end - start;" ret = "return elapsed_time.count();" diff --git a/test/test_util_functions.py b/test/test_util_functions.py index ef33f9c03..9b6b02aaa 100644 --- a/test/test_util_functions.py +++ b/test/test_util_functions.py @@ -818,7 +818,7 @@ def test_wrap_cpp_timing(): wrapped = wrap_cpp_timing(code) assert ( wrapped - == "auto start = std::chrono::high_resolution_clock::now();\nfor ( int i = 0; i < size; i++ ) {\nc[i] = a[i] + b[i];\n}\nauto end = std::chrono::high_resolution_clock::now();\nauto elapsed_time = end - start;\nreturn static_cast(elapsed_time.count() * 1e3);" + == "auto start = std::chrono::steady_clock::now();\nfor ( int i = 0; i < size; i++ ) {\nc[i] = a[i] + b[i];\n}\nauto end = std::chrono::steady_clock::now();\nstd::chrono::duration elapsed_time = end - start;\nreturn elapsed_time.count();" ) @@ -827,13 +827,13 @@ def test_extract_directive_signature(): signatures = extract_directive_signature(code) assert len(signatures) == 1 assert ( - "float vector_add(float * a, float * b, float * c, int size)" + "float vector_add(float * restrict a, float * restrict b, float * restrict c, int size)" in signatures["vector_add"] ) signatures = extract_directive_signature(code, "vector_add") assert len(signatures) == 1 assert ( - "float vector_add(float * a, float * b, float * c, int size)" + "float vector_add(float * restrict a, float * restrict b, float * restrict c, int size)" in signatures["vector_add"] ) signatures = extract_directive_signature(code, "vector_add_ext") From 26fd049553a9546bd383da0b6f29f11481cbfdcd Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Fri, 7 Apr 2023 13:15:13 +0200 Subject: [PATCH 25/93] For now it is necessary to have extern "C" in signature, will work on this later. --- examples/c/vector_add_openacc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/c/vector_add_openacc.py b/examples/c/vector_add_openacc.py index ac645f73c..43db840da 100644 --- a/examples/c/vector_add_openacc.py +++ b/examples/c/vector_add_openacc.py @@ -35,7 +35,7 @@ preprocessor = extract_preprocessor(code) kernel_string = "\n".join(preprocessor) + "\n#include \n#include \n" directive_signatures = extract_directive_signature(code, kernel_name="vector_add") -kernel_string += directive_signatures["vector_add"] + "{\n" +kernel_string += 'extern "C"' + directive_signatures["vector_add"] + "{\n" directive_codes = extract_directive_code(code, kernel_name="vector_add") kernel_string += wrap_cpp_timing(directive_codes["vector_add"]) + "\n}" From 63c157e493c3b15070e910e053826b66fc74a68d Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Fri, 7 Apr 2023 13:15:55 +0200 Subject: [PATCH 26/93] Typo. --- examples/c/vector_add_openacc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/c/vector_add_openacc.py b/examples/c/vector_add_openacc.py index 43db840da..c0a1e1267 100644 --- a/examples/c/vector_add_openacc.py +++ b/examples/c/vector_add_openacc.py @@ -35,7 +35,7 @@ preprocessor = extract_preprocessor(code) kernel_string = "\n".join(preprocessor) + "\n#include \n#include \n" directive_signatures = extract_directive_signature(code, kernel_name="vector_add") -kernel_string += 'extern "C"' + directive_signatures["vector_add"] + "{\n" +kernel_string += 'extern "C" ' + directive_signatures["vector_add"] + "{\n" directive_codes = extract_directive_code(code, kernel_name="vector_add") kernel_string += wrap_cpp_timing(directive_codes["vector_add"]) + "\n}" From cfbffeb177924de379e01e0dbbc7145144c390ba Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Fri, 7 Apr 2023 13:30:20 +0200 Subject: [PATCH 27/93] Change the order of the arguments to match generated function. --- examples/c/vector_add_openacc.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/c/vector_add_openacc.py b/examples/c/vector_add_openacc.py index c0a1e1267..428523f2d 100644 --- a/examples/c/vector_add_openacc.py +++ b/examples/c/vector_add_openacc.py @@ -46,13 +46,13 @@ c = numpy.zeros_like(b) n = numpy.int32(size) -args = [c, a, b, n] +args = [a, b, c, n] tune_params = OrderedDict() -tune_params["ngangs"] = [2**i for i in range(0, 21)] +tune_params["ngangs"] = [2**i for i in range(0, 15)] tune_params["nthreads"] = [2**i for i in range(0, 11)] -answer = [a+b, None, None, None] +answer = [None, None, a+b, None] tune_kernel("vector_add", kernel_string, size, args, tune_params, answer=answer, compiler_options=["-fast", "-acc=gpu"], compiler="nvc++") From bf1cf2e09ecb27507c5f97d51b0a51b13e1c1885 Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Fri, 7 Apr 2023 14:04:30 +0200 Subject: [PATCH 28/93] The C backend is now the Compiler backend. Updated list of known compilers. --- kernel_tuner/backends/{c.py => compiler.py} | 43 ++++++++++++-------- kernel_tuner/core.py | 4 +- kernel_tuner/observers/{c.py => compiler.py} | 2 +- test/test_backend.py | 4 +- test/test_c_functions.py | 28 ++++++------- 5 files changed, 46 insertions(+), 35 deletions(-) rename kernel_tuner/backends/{c.py => compiler.py} (90%) rename kernel_tuner/observers/{c.py => compiler.py} (91%) diff --git a/kernel_tuner/backends/c.py b/kernel_tuner/backends/compiler.py similarity index 90% rename from kernel_tuner/backends/c.py rename to kernel_tuner/backends/compiler.py index 486181cc3..30b39b1af 100644 --- a/kernel_tuner/backends/c.py +++ b/kernel_tuner/backends/compiler.py @@ -13,7 +13,7 @@ import numpy.ctypeslib from kernel_tuner.backends.backend import CompilerBackend -from kernel_tuner.observers.c import CRuntimeObserver +from kernel_tuner.observers.compiler import CompilerRuntimeObserver from kernel_tuner.util import ( get_temp_filename, delete_temp_file, @@ -40,7 +40,7 @@ Argument = namedtuple("Argument", ["numpy", "ctypes"]) -class CFunctions(CompilerBackend): +class CompilerFunctions(CompilerBackend): """Class that groups the code for running and compiling C functions""" def __init__(self, iterations=7, compiler_options=None, compiler=None): @@ -56,14 +56,24 @@ def __init__(self, iterations=7, compiler_options=None, compiler=None): self.compiler = compiler or "g++" self.lib = None self.using_openmp = False - self.observers = [CRuntimeObserver(self)] + self.using_openacc = False + self.observers = [CompilerRuntimeObserver(self)] self.last_result = None - try: - cc_version = str(subprocess.check_output([self.compiler, "--version"])) - cc_version = cc_version.splitlines()[0].split(" ")[-1] - except OSError as e: - raise e + if self.compiler == "g++": + try: + cc_version = str(subprocess.check_output([self.compiler, "--version"])) + cc_version = cc_version.split("\\n")[0].split(" ")[2] + except OSError as e: + raise e + elif self.compiler in ["nvc++", "nvfortran"]: + try: + cc_version = str(subprocess.check_output([self.compiler, "--version"])) + cc_version = cc_version.split(" ")[1] + except OSError as e: + raise e + else: + cc_version = None # check if nvcc is available self.nvcc_available = False @@ -143,13 +153,14 @@ def compile(self, kernel_instance): if "#include " in kernel_string or "use omp_lib" in kernel_string: logging.debug("set using_openmp to true") self.using_openmp = True - if self.compiler == "pgfortran": + if self.compiler in ["nvc++", "nvfortran"]: compiler_options.append("-mp") else: - if "#pragma acc" in kernel_string or "!$acc" in kernel_string: - compiler_options.append("-fopenacc") - else: - compiler_options.append("-fopenmp") + compiler_options.append("-fopenmp") + + # detect openacc + if "#pragma acc" in kernel_string or "!$acc" in kernel_string: + self.using_openacc = True # if filename is known, use that one suffix = kernel_instance.kernel_source.get_user_suffix() @@ -175,7 +186,7 @@ def compile(self, kernel_instance): # select right suffix based on compiler suffix = ".cc" - if self.compiler in ["gfortran", "pgfortran", "ftn", "ifort"]: + if self.compiler in ["gfortran", "nvfortran", "ftn", "ifort"]: suffix = ".F90" if self.compiler == "nvcc": @@ -208,7 +219,7 @@ def compile(self, kernel_instance): kernel_name = "__" + match.group(1) + "_MOD_" + kernel_name elif self.compiler in ["ftn", "ifort"]: kernel_name = match.group(1) + "_mp_" + kernel_name + "_" - elif self.compiler == "pgfortran": + elif self.compiler == "nvfortran": kernel_name = match.group(1) + "_" + kernel_name + "_" else: # for functions outside of modules @@ -338,7 +349,7 @@ def memcpy_htod(self, dest, src): def cleanup_lib(self): """unload the previously loaded shared library""" - if not self.using_openmp: + if not self.using_openmp and not self.using_openacc: # this if statement is necessary because shared libraries that use # OpenMP will core dump when unloaded, this is a well-known issue with OpenMP logging.debug("unloading shared library") diff --git a/kernel_tuner/core.py b/kernel_tuner/core.py index accb58cc1..c0c0a3b21 100644 --- a/kernel_tuner/core.py +++ b/kernel_tuner/core.py @@ -17,7 +17,7 @@ from kernel_tuner.backends.pycuda import PyCudaFunctions from kernel_tuner.backends.nvcuda import CudaFunctions from kernel_tuner.backends.opencl import OpenCLFunctions -from kernel_tuner.backends.c import CFunctions +from kernel_tuner.backends.compiler import CompilerFunctions from kernel_tuner.backends.opencl import OpenCLFunctions import kernel_tuner.util as util @@ -238,7 +238,7 @@ def __init__(self, kernel_source, device=0, platform=0, quiet=False, compiler=No elif lang.upper() == "OPENCL": dev = OpenCLFunctions(device, platform, compiler_options=compiler_options, iterations=iterations, observers=observers) elif lang.upper() in ["C", "FORTRAN"]: - dev = CFunctions(compiler=compiler, compiler_options=compiler_options, iterations=iterations) + dev = CompilerFunctions(compiler=compiler, compiler_options=compiler_options, iterations=iterations) else: raise ValueError("Sorry, support for languages other than CUDA, OpenCL, or C is not implemented yet") diff --git a/kernel_tuner/observers/c.py b/kernel_tuner/observers/compiler.py similarity index 91% rename from kernel_tuner/observers/c.py rename to kernel_tuner/observers/compiler.py index 674a0f3ce..c04887233 100644 --- a/kernel_tuner/observers/c.py +++ b/kernel_tuner/observers/compiler.py @@ -3,7 +3,7 @@ from kernel_tuner.observers.observer import BenchmarkObserver -class CRuntimeObserver(BenchmarkObserver): +class CompilerRuntimeObserver(BenchmarkObserver): """Observer that collects results returned by benchmarking function in the C backend""" def __init__(self, dev): diff --git a/test/test_backend.py b/test/test_backend.py index 4a0e871c2..460ce371a 100644 --- a/test/test_backend.py +++ b/test/test_backend.py @@ -5,7 +5,7 @@ skip_if_no_opencl, skip_if_no_pycuda, ) -from kernel_tuner.backends import backend, c, cupy, nvcuda, opencl, pycuda +from kernel_tuner.backends import backend, compiler, cupy, nvcuda, opencl, pycuda class WrongBackend(backend.Backend): @@ -24,7 +24,7 @@ def test_wrong_backend(): @skip_if_no_gcc def test_c_backend(): - dev = c.CFunctions() + dev = c.CompilerFunctions() @skip_if_no_cupy diff --git a/test/test_c_functions.py b/test/test_c_functions.py index 6fb0c2b6d..5094bfd6a 100644 --- a/test/test_c_functions.py +++ b/test/test_c_functions.py @@ -11,7 +11,7 @@ from unittest.mock import patch, Mock import kernel_tuner -from kernel_tuner.backends.c import CFunctions, Argument +from kernel_tuner.backends.compiler import CompilerFunctions, Argument from kernel_tuner.core import KernelSource, KernelInstance from kernel_tuner import util @@ -24,7 +24,7 @@ def test_ready_argument_list1(): arg3 = np.array([7, 8, 9]).astype(np.int32) arguments = [arg1, arg2, arg3] - cfunc = CFunctions() + cfunc = CompilerFunctions() output = cfunc.ready_argument_list(arguments) print(output) @@ -56,7 +56,7 @@ def test_ready_argument_list2(): arg3 = np.float32(6.0) arguments = [arg1, arg2, arg3] - cfunc = CFunctions() + cfunc = CompilerFunctions() output = cfunc.ready_argument_list(arguments) print(output) @@ -75,7 +75,7 @@ def test_ready_argument_list2(): def test_ready_argument_list3(): arg1 = Mock() arguments = [arg1] - cfunc = CFunctions() + cfunc = CompilerFunctions() try: cfunc.ready_argument_list(arguments) assert False @@ -87,7 +87,7 @@ def test_ready_argument_list3(): def test_ready_argument_list4(): with raises(TypeError): arg1 = int(9) - cfunc = CFunctions() + cfunc = CompilerFunctions() cfunc.ready_argument_list([arg1]) @@ -96,7 +96,7 @@ def test_ready_argument_list5(): arg1 = np.array([1, 2, 3]).astype(np.float32) arguments = [arg1] - cfunc = CFunctions() + cfunc = CompilerFunctions() output = cfunc.ready_argument_list(arguments) assert all(output[0].numpy == arg1) @@ -110,7 +110,7 @@ def test_ready_argument_list5(): def test_byte_array_arguments(): arg1 = np.array([1, 2, 3]).astype(np.int8) - cfunc = CFunctions() + cfunc = CompilerFunctions() output = cfunc.ready_argument_list([arg1]) @@ -136,7 +136,7 @@ def test_compile(npct, subprocess): kernel_sources = KernelSource(kernel_name, kernel_string, "C") kernel_instance = KernelInstance(kernel_name, kernel_sources, kernel_string, [], None, None, dict(), []) - cfunc = CFunctions() + cfunc = CompilerFunctions() f = cfunc.compile(kernel_instance) print(subprocess.mock_calls) @@ -166,7 +166,7 @@ def test_compile_detects_device_code(npct, subprocess): kernel_sources = KernelSource(kernel_name, kernel_string, "C") kernel_instance = KernelInstance(kernel_name, kernel_sources, kernel_string, [], None, None, dict(), []) - cfunc = CFunctions() + cfunc = CompilerFunctions() cfunc.compile(kernel_instance) print(subprocess.check_call.call_args_list) @@ -191,7 +191,7 @@ def test_memset(): x_c = x.ctypes.data_as(C.POINTER(C.c_float)) arg = Argument(numpy=x, ctypes=x_c) - cfunc = CFunctions() + cfunc = CompilerFunctions() cfunc.memset(arg, 0, x.nbytes) output = np.ctypeslib.as_array(x_c, shape=(4,)) @@ -209,7 +209,7 @@ def test_memcpy_dtoh(): arg = Argument(numpy=x, ctypes=x_c) output = np.zeros_like(x) - cfunc = CFunctions() + cfunc = CompilerFunctions() cfunc.memcpy_dtoh(output, arg) print(a) @@ -227,7 +227,7 @@ def test_memcpy_htod(): x_c = x.ctypes.data_as(C.POINTER(C.c_float)) arg = Argument(numpy=x, ctypes=x_c) - cfunc = CFunctions() + cfunc = CompilerFunctions() cfunc.memcpy_htod(arg, src) assert all(arg.numpy == a) @@ -247,7 +247,7 @@ def test_complies_fortran_function_no_module(): kernel_sources = KernelSource(kernel_name, kernel_string, "C") kernel_instance = KernelInstance(kernel_name, kernel_sources, kernel_string, [], None, None, dict(), []) - cfunc = CFunctions(compiler="gfortran") + cfunc = CompilerFunctions(compiler="gfortran") func = cfunc.compile(kernel_instance) result = cfunc.run_kernel(func, [], (), ()) @@ -278,7 +278,7 @@ def test_complies_fortran_function_with_module(): try: - cfunc = CFunctions(compiler="gfortran") + cfunc = CompilerFunctions(compiler="gfortran") func = cfunc.compile(kernel_instance) result = cfunc.run_kernel(func, [], (), ()) From a4fe579770154f84b179cc47d224104199954360 Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Fri, 7 Apr 2023 14:12:40 +0200 Subject: [PATCH 29/93] Formatted with black. --- examples/c/vector_add_openacc.py | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/examples/c/vector_add_openacc.py b/examples/c/vector_add_openacc.py index 428523f2d..b84dff6b2 100644 --- a/examples/c/vector_add_openacc.py +++ b/examples/c/vector_add_openacc.py @@ -3,7 +3,12 @@ import numpy from kernel_tuner import tune_kernel -from kernel_tuner.util import extract_directive_signature, extract_directive_code, extract_preprocessor, wrap_cpp_timing +from kernel_tuner.util import ( + extract_directive_signature, + extract_directive_code, + extract_preprocessor, + wrap_cpp_timing, +) from collections import OrderedDict code = """ @@ -52,7 +57,15 @@ tune_params["ngangs"] = [2**i for i in range(0, 15)] tune_params["nthreads"] = [2**i for i in range(0, 11)] -answer = [None, None, a+b, None] +answer = [None, None, a + b, None] -tune_kernel("vector_add", kernel_string, size, args, tune_params, - answer=answer, compiler_options=["-fast", "-acc=gpu"], compiler="nvc++") +tune_kernel( + "vector_add", + kernel_string, + size, + args, + tune_params, + answer=answer, + compiler_options=["-fast", "-acc=gpu"], + compiler="nvc++", +) From abb759d6614cf4817c2356bf62c762feb6410d90 Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Fri, 7 Apr 2023 14:20:05 +0200 Subject: [PATCH 30/93] Fixed some tests. --- test/test_backend.py | 2 +- ...unctions.py => test_compiler_functions.py} | 57 ++++++++++--------- 2 files changed, 32 insertions(+), 27 deletions(-) rename test/{test_c_functions.py => test_compiler_functions.py} (87%) diff --git a/test/test_backend.py b/test/test_backend.py index 460ce371a..e694649c1 100644 --- a/test/test_backend.py +++ b/test/test_backend.py @@ -24,7 +24,7 @@ def test_wrong_backend(): @skip_if_no_gcc def test_c_backend(): - dev = c.CompilerFunctions() + dev = compiler.CompilerFunctions() @skip_if_no_cupy diff --git a/test/test_c_functions.py b/test/test_compiler_functions.py similarity index 87% rename from test/test_c_functions.py rename to test/test_compiler_functions.py index 5094bfd6a..475719fb0 100644 --- a/test/test_c_functions.py +++ b/test/test_compiler_functions.py @@ -17,6 +17,7 @@ from .context import skip_if_no_gfortran, skip_if_no_gcc, skip_if_no_openmp + @skip_if_no_gcc def test_ready_argument_list1(): arg1 = np.array([1, 2, 3]).astype(np.float32) @@ -33,22 +34,23 @@ def test_ready_argument_list1(): output_arg2 = np.ctypeslib.as_array(output[1].ctypes, shape=arg2.shape) output_arg3 = np.ctypeslib.as_array(output[2].ctypes, shape=arg3.shape) - assert output_arg1.dtype == 'float32' - assert output_arg2.dtype == 'float64' - assert output_arg3.dtype == 'int32' + assert output_arg1.dtype == "float32" + assert output_arg2.dtype == "float64" + assert output_arg3.dtype == "int32" assert all(output_arg1 == arg1) assert all(output_arg2 == arg2) assert all(output_arg3 == arg3) - assert output[0].numpy.dtype == 'float32' - assert output[1].numpy.dtype == 'float64' - assert output[2].numpy.dtype == 'int32' + assert output[0].numpy.dtype == "float32" + assert output[1].numpy.dtype == "float64" + assert output[2].numpy.dtype == "int32" assert all(output[0].numpy == arg1) assert all(output[1].numpy == arg2) assert all(output[2].numpy == arg3) + @skip_if_no_gcc def test_ready_argument_list2(): arg1 = np.array([1, 2, 3]).astype(np.float32) @@ -62,7 +64,7 @@ def test_ready_argument_list2(): output_arg1 = np.ctypeslib.as_array(output[0].ctypes, shape=arg1.shape) - assert output_arg1.dtype == 'float32' + assert output_arg1.dtype == "float32" assert isinstance(output[1].ctypes, C.c_int32) assert isinstance(output[2].ctypes, C.c_float) @@ -116,7 +118,7 @@ def test_byte_array_arguments(): output_arg1 = np.ctypeslib.as_array(output[0].ctypes, shape=arg1.shape) - assert output_arg1.dtype == 'int8' + assert output_arg1.dtype == "int8" assert all(output_arg1 == arg1) @@ -127,14 +129,15 @@ def test_byte_array_arguments(): assert all(dest == arg1) -@patch('kernel_tuner.backends.c.subprocess') -@patch('kernel_tuner.backends.c.numpy.ctypeslib') +@patch("kernel_tuner.backends.compiler.subprocess") +@patch("kernel_tuner.backends.compiler.numpy.ctypeslib") def test_compile(npct, subprocess): - kernel_string = "this is a fake C program" kernel_name = "blabla" kernel_sources = KernelSource(kernel_name, kernel_string, "C") - kernel_instance = KernelInstance(kernel_name, kernel_sources, kernel_string, [], None, None, dict(), []) + kernel_instance = KernelInstance( + kernel_name, kernel_sources, kernel_string, [], None, None, dict(), [] + ) cfunc = CompilerFunctions() f = cfunc.compile(kernel_instance) @@ -148,23 +151,25 @@ def test_compile(npct, subprocess): args, _ = npct.load_library.call_args_list[0] filename = args[0] - print('filename=' + filename) + print("filename=" + filename) # check if temporary files are cleaned up correctly import os.path + assert not os.path.isfile(filename + ".cu") assert not os.path.isfile(filename + ".o") assert not os.path.isfile(filename + ".so") -@patch('kernel_tuner.backends.c.subprocess') -@patch('kernel_tuner.backends.c.numpy.ctypeslib') +@patch("kernel_tuner.backends.compiler.subprocess") +@patch("kernel_tuner.backends.compiler.numpy.ctypeslib") def test_compile_detects_device_code(npct, subprocess): - kernel_string = "this code clearly contains device code __global__ kernel(float* arg){ return; }" kernel_name = "blabla" kernel_sources = KernelSource(kernel_name, kernel_string, "C") - kernel_instance = KernelInstance(kernel_name, kernel_sources, kernel_string, [], None, None, dict(), []) + kernel_instance = KernelInstance( + kernel_name, kernel_sources, kernel_string, [], None, None, dict(), [] + ) cfunc = CompilerFunctions() cfunc.compile(kernel_instance) @@ -177,8 +182,8 @@ def test_compile_detects_device_code(npct, subprocess): args, kwargs = call args = args[0] print(args) - if args[0] == 'nvcc' and args[1] == '-c': - assert args[2][-3:] == '.cu' + if args[0] == "nvcc" and args[1] == "-c": + assert args[2][-3:] == ".cu" dot_cu_used = True assert dot_cu_used @@ -245,7 +250,9 @@ def test_complies_fortran_function_no_module(): """ kernel_name = "my_test_function" kernel_sources = KernelSource(kernel_name, kernel_string, "C") - kernel_instance = KernelInstance(kernel_name, kernel_sources, kernel_string, [], None, None, dict(), []) + kernel_instance = KernelInstance( + kernel_name, kernel_sources, kernel_string, [], None, None, dict(), [] + ) cfunc = CompilerFunctions(compiler="gfortran") func = cfunc.compile(kernel_instance) @@ -274,10 +281,11 @@ def test_complies_fortran_function_with_module(): """ kernel_name = "my_test_function" kernel_sources = KernelSource(kernel_name, kernel_string, "C") - kernel_instance = KernelInstance(kernel_name, kernel_sources, kernel_string, [], None, None, dict(), []) + kernel_instance = KernelInstance( + kernel_name, kernel_sources, kernel_string, [], None, None, dict(), [] + ) try: - cfunc = CompilerFunctions(compiler="gfortran") func = cfunc.compile(kernel_instance) @@ -291,7 +299,6 @@ def test_complies_fortran_function_with_module(): @pytest.fixture def env(): - kernel_string = """ #include @@ -319,6 +326,7 @@ def env(): return ["vector_add", kernel_string, size, args, tune_params] + @skip_if_no_openmp @skip_if_no_gcc def test_benchmark(env): @@ -327,6 +335,3 @@ def test_benchmark(env): assert all(["nthreads" in result for result in results]) assert all(["time" in result for result in results]) assert all([result["time"] > 0.0 for result in results]) - - - From 6033751dcbf58110b5144d7ebc735393d8ed7295 Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Fri, 7 Apr 2023 14:31:17 +0200 Subject: [PATCH 31/93] Avoid shadowing a builtin variable. --- kernel_tuner/util.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel_tuner/util.py b/kernel_tuner/util.py index 18e45ced9..dbf564550 100644 --- a/kernel_tuner/util.py +++ b/kernel_tuner/util.py @@ -1192,7 +1192,7 @@ def wrap_cpp_timing(code: str) -> str: """Wrap C++ timing code (using std::chrono) around the provided code""" start = "auto start = std::chrono::steady_clock::now();" end = "auto end = std::chrono::steady_clock::now();" - sum = "std::chrono::duration elapsed_time = end - start;" + timing = "std::chrono::duration elapsed_time = end - start;" ret = "return elapsed_time.count();" - return "\n".join([start, code, end, sum, ret]) + return "\n".join([start, code, end, timing, ret]) From 703fe4acf7896222cf6615dc23c64de9d20708ff Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Tue, 9 May 2023 14:07:03 +0200 Subject: [PATCH 32/93] Add nvc to compilers. --- kernel_tuner/backends/compiler.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel_tuner/backends/compiler.py b/kernel_tuner/backends/compiler.py index 30b39b1af..188c94594 100644 --- a/kernel_tuner/backends/compiler.py +++ b/kernel_tuner/backends/compiler.py @@ -66,7 +66,7 @@ def __init__(self, iterations=7, compiler_options=None, compiler=None): cc_version = cc_version.split("\\n")[0].split(" ")[2] except OSError as e: raise e - elif self.compiler in ["nvc++", "nvfortran"]: + elif self.compiler in ["nvc", "nvc++", "nvfortran"]: try: cc_version = str(subprocess.check_output([self.compiler, "--version"])) cc_version = cc_version.split(" ")[1] @@ -153,7 +153,7 @@ def compile(self, kernel_instance): if "#include " in kernel_string or "use omp_lib" in kernel_string: logging.debug("set using_openmp to true") self.using_openmp = True - if self.compiler in ["nvc++", "nvfortran"]: + if self.compiler in ["nvc", "nvc++", "nvfortran"]: compiler_options.append("-mp") else: compiler_options.append("-fopenmp") From 8e1da2ce8ed750085b3a6b1cddd380116002bf6a Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Tue, 9 May 2023 14:22:16 +0200 Subject: [PATCH 33/93] Extract statements that span over multiple lines. --- kernel_tuner/util.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel_tuner/util.py b/kernel_tuner/util.py index 1a21d5299..73fbf16e7 100644 --- a/kernel_tuner/util.py +++ b/kernel_tuner/util.py @@ -1103,7 +1103,7 @@ def extract_directive_code(code: str, kernel_name: str = None) -> dict: tmp_string = list() name = "" - for line in code.split("\n"): + for line in code.replace("\\\n", "").split("\n"): if found_section: if end_string in line: found_section = False @@ -1126,7 +1126,7 @@ def extract_directive_signature(code: str, kernel_name: str = None) -> dict: start_string = "#pragma tuner start" signatures = dict() - for line in code.split("\n"): + for line in code.replace("\\\n", "").split("\n"): if start_string in line: if kernel_name is None or f" {kernel_name} " in line: tmp_string = line.strip().split(" ") @@ -1151,7 +1151,7 @@ def extract_directive_data(code: str, kernel_name: str = None) -> dict: start_string = "#pragma tuner start" data = dict() - for line in code.split("\n"): + for line in code.replace("\\\n", "").split("\n"): if start_string in line: if kernel_name is None or f" {kernel_name} " in line: name = line.strip().split(" ")[3] @@ -1172,7 +1172,7 @@ def extract_preprocessor(code: str) -> list: """Extract include and define statements from C/C++ code""" preprocessor = list() - for line in code.split("\n"): + for line in code.replace("\\\n", "").split("\n"): if "#define" in line or "#include" in line: preprocessor.append(line) From 75e6e08270ce0925fc7efaf8d3d2837e0e11793d Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Tue, 9 May 2023 15:01:29 +0200 Subject: [PATCH 34/93] Test if parameter is empty. --- kernel_tuner/util.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel_tuner/util.py b/kernel_tuner/util.py index 73fbf16e7..dbb20d8d6 100644 --- a/kernel_tuner/util.py +++ b/kernel_tuner/util.py @@ -1134,6 +1134,8 @@ def extract_directive_signature(code: str, kernel_name: str = None) -> dict: tmp_string = tmp_string[4:] params = list() for param in tmp_string: + if len(param) == 0: + continue p_name = param.split("(")[0] param = param.replace(p_name, "", 1) p_type = param[1:-1] @@ -1158,6 +1160,8 @@ def extract_directive_data(code: str, kernel_name: str = None) -> dict: data[name] = dict() tmp_string = line.strip().split(" ")[4:] for param in tmp_string: + if len(param) == 0: + continue p_name = param.split("(")[0] param = param.replace(p_name, "", 1) param = param[1:-1] From c656743f8806ee9b955acb5720a1c85c996b0234 Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Wed, 10 May 2023 14:13:34 +0200 Subject: [PATCH 35/93] Add function to generate tunable function. --- kernel_tuner/util.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/kernel_tuner/util.py b/kernel_tuner/util.py index dbb20d8d6..2ec73219a 100644 --- a/kernel_tuner/util.py +++ b/kernel_tuner/util.py @@ -1191,3 +1191,12 @@ def wrap_cpp_timing(code: str) -> str: ret = "return elapsed_time.count();" return "\n".join([start, code, end, timing, ret]) + + +def generate_directive_function(preprocessor: str, signature: str, body: str) -> str: + """Generate tunable function for one directive""" + code = "\n".join(preprocessor) + "\n#include \n" + code += 'extern "C" ' + signature + "{\n" + code += wrap_cpp_timing(body) + "\n}" + + return code From 7bac398d03c2e9162d94303f02bf1ed1369306ad Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Thu, 11 May 2023 10:24:38 +0200 Subject: [PATCH 36/93] Updating the directives code. --- kernel_tuner/util.py | 41 +++++++++++++++++++++++++++++++++-------- 1 file changed, 33 insertions(+), 8 deletions(-) diff --git a/kernel_tuner/util.py b/kernel_tuner/util.py index 2ec73219a..85d40f73a 100644 --- a/kernel_tuner/util.py +++ b/kernel_tuner/util.py @@ -1094,10 +1094,8 @@ def cuda_error_check(error): raise RuntimeError(f"NVRTC error: {desc.decode()}") -def extract_directive_code(code: str, kernel_name: str = None) -> dict: - """Extract explicitly marked directive sections from code""" - start_string = "#pragma tuner start" - end_string = "#pragma tuner stop" +def extract_code(start: str, stop: str, code: str, kernel_name: str = None) -> dict: + """Extract an arbitrary section of code""" found_section = False sections = dict() tmp_string = list() @@ -1105,7 +1103,7 @@ def extract_directive_code(code: str, kernel_name: str = None) -> dict: for line in code.replace("\\\n", "").split("\n"): if found_section: - if end_string in line: + if stop in line: found_section = False sections[name] = "\n".join(tmp_string) tmp_string = list() @@ -1113,7 +1111,7 @@ def extract_directive_code(code: str, kernel_name: str = None) -> dict: else: tmp_string.append(line) else: - if start_string in line: + if start in line: if kernel_name is None or f" {kernel_name} " in line: found_section = True name = line.strip().split(" ")[3] @@ -1121,6 +1119,27 @@ def extract_directive_code(code: str, kernel_name: str = None) -> dict: return sections +def extract_directive_code(code: str, kernel_name: str = None) -> dict: + """Extract explicitly marked directive sections from code""" + start_string = "#pragma tuner start" + end_string = "#pragma tuner stop" + + return extract_code(start_string, end_string, code, kernel_name) + + +def extract_initialization_code(code: str) -> str: + """Extract the initialization section from code""" + start_string = "#pragma tuner initialize" + end_string = "#pragma tuner stop" + + function = extract_code(start_string, end_string, code) + if len(function) == 1: + _, value = function.popitem() + return value + else: + return "" + + def extract_directive_signature(code: str, kernel_name: str = None) -> dict: """Extract the user defined signature for directive sections""" start_string = "#pragma tuner start" @@ -1166,7 +1185,10 @@ def extract_directive_data(code: str, kernel_name: str = None) -> dict: param = param.replace(p_name, "", 1) param = param[1:-1] p_type = param.split(":")[0] - p_size = param.split(":")[1] + try: + p_size = param.split(":")[1] + except IndexError: + p_size = 0 data[name][p_name] = [p_type, p_size] return data @@ -1193,9 +1215,12 @@ def wrap_cpp_timing(code: str) -> str: return "\n".join([start, code, end, timing, ret]) -def generate_directive_function(preprocessor: str, signature: str, body: str) -> str: +def generate_directive_function( + preprocessor: str, signature: str, body: str, initialization: str = "" +) -> str: """Generate tunable function for one directive""" code = "\n".join(preprocessor) + "\n#include \n" + code += initialization code += 'extern "C" ' + signature + "{\n" code += wrap_cpp_timing(body) + "\n}" From abe19813559456a4ca4d9e320d934c65c2a00552 Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Thu, 11 May 2023 10:45:34 +0200 Subject: [PATCH 37/93] Move initialization code to the function. --- kernel_tuner/util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel_tuner/util.py b/kernel_tuner/util.py index 85d40f73a..f962ca6db 100644 --- a/kernel_tuner/util.py +++ b/kernel_tuner/util.py @@ -1220,8 +1220,8 @@ def generate_directive_function( ) -> str: """Generate tunable function for one directive""" code = "\n".join(preprocessor) + "\n#include \n" - code += initialization code += 'extern "C" ' + signature + "{\n" + code += initialization code += wrap_cpp_timing(body) + "\n}" return code From 8d0827c4e0a3862f29b31cdc61b33484be5c222f Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Thu, 11 May 2023 14:04:23 +0200 Subject: [PATCH 38/93] If initialization is provided, no need for external memory allocation. --- kernel_tuner/util.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/kernel_tuner/util.py b/kernel_tuner/util.py index f962ca6db..91df7243b 100644 --- a/kernel_tuner/util.py +++ b/kernel_tuner/util.py @@ -1219,9 +1219,14 @@ def generate_directive_function( preprocessor: str, signature: str, body: str, initialization: str = "" ) -> str: """Generate tunable function for one directive""" - code = "\n".join(preprocessor) + "\n#include \n" + code = "\n".join(preprocessor) + if "#include " not in preprocessor: + code += "\n#include \n" + # If initialization is provided use simplified signature + if len(initialization) > 1: + code += initialization + signature = signature.split("(")[0] + "()" code += 'extern "C" ' + signature + "{\n" - code += initialization code += wrap_cpp_timing(body) + "\n}" return code From 4beabc92a401810deaef34a07a904db24109edd1 Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Thu, 11 May 2023 14:06:37 +0200 Subject: [PATCH 39/93] Initialization code moved to wrong spot. --- kernel_tuner/util.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel_tuner/util.py b/kernel_tuner/util.py index 91df7243b..795b46823 100644 --- a/kernel_tuner/util.py +++ b/kernel_tuner/util.py @@ -1224,9 +1224,10 @@ def generate_directive_function( code += "\n#include \n" # If initialization is provided use simplified signature if len(initialization) > 1: - code += initialization signature = signature.split("(")[0] + "()" code += 'extern "C" ' + signature + "{\n" + if len(initialization) > 1: + code += initialization code += wrap_cpp_timing(body) + "\n}" return code From 7776588f233be4b8ad6c4736bba86ff84e2ff449 Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Thu, 11 May 2023 14:12:39 +0200 Subject: [PATCH 40/93] Missing carrier return. --- kernel_tuner/util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel_tuner/util.py b/kernel_tuner/util.py index 795b46823..d852f1670 100644 --- a/kernel_tuner/util.py +++ b/kernel_tuner/util.py @@ -1227,7 +1227,7 @@ def generate_directive_function( signature = signature.split("(")[0] + "()" code += 'extern "C" ' + signature + "{\n" if len(initialization) > 1: - code += initialization + code += initialization + "\n" code += wrap_cpp_timing(body) + "\n}" return code From 09fecf12d24b1fe026f5bf467b493f566e9cc3a2 Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Fri, 12 May 2023 11:45:59 +0200 Subject: [PATCH 41/93] First implementation of automatic memory allocation for directives. --- kernel_tuner/util.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/kernel_tuner/util.py b/kernel_tuner/util.py index d852f1670..cc6e84b08 100644 --- a/kernel_tuner/util.py +++ b/kernel_tuner/util.py @@ -1231,3 +1231,29 @@ def generate_directive_function( code += wrap_cpp_timing(body) + "\n}" return code + + +def allocate_signature_memory(data: dict) -> list: + """Allocates the data needed by a kernel and returns the arguments array""" + args = [] + max_int = 1024 + + for type, size in data.items(): + if "*" in type: + # The parameter is an array + if type == "float*": + args.append(np.random.rand(size).astype(np.float32)) + elif type == "double*": + args.append(np.random.rand(size).astype(np.float64)) + elif type == "int*": + args.append(np.random.randint(max_int, size=size)) + else: + # The parameter is a scalar + if type == "float": + args.append(np.float32(size)) + elif type == "double": + args.append(np.float64(size)) + elif type == "int": + args.append(np.int32(size)) + + return args From ced52f711dff44fc1babbe7de4a0c23c9d8b7df3 Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Fri, 12 May 2023 11:52:20 +0200 Subject: [PATCH 42/93] Updating the example --- examples/c/vector_add_openacc.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/examples/c/vector_add_openacc.py b/examples/c/vector_add_openacc.py index b84dff6b2..6f5878bcd 100644 --- a/examples/c/vector_add_openacc.py +++ b/examples/c/vector_add_openacc.py @@ -7,7 +7,7 @@ extract_directive_signature, extract_directive_code, extract_preprocessor, - wrap_cpp_timing, + generate_directive_function ) from collections import OrderedDict @@ -38,11 +38,9 @@ # Extract tunable directive and generate kernel_string preprocessor = extract_preprocessor(code) -kernel_string = "\n".join(preprocessor) + "\n#include \n#include \n" -directive_signatures = extract_directive_signature(code, kernel_name="vector_add") -kernel_string += 'extern "C" ' + directive_signatures["vector_add"] + "{\n" -directive_codes = extract_directive_code(code, kernel_name="vector_add") -kernel_string += wrap_cpp_timing(directive_codes["vector_add"]) + "\n}" +signature = extract_directive_signature(code, kernel_name="vector_add") +body = extract_directive_code(code, kernel_name="vector_add") +kernel_string = generate_directive_function(preprocessor, signature, body) size = 65536 From cf2cf956d49d401799faa39a2e51b677f5a49927 Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Fri, 12 May 2023 11:54:02 +0200 Subject: [PATCH 43/93] Bug fix in the example. --- examples/c/vector_add_openacc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/c/vector_add_openacc.py b/examples/c/vector_add_openacc.py index 6f5878bcd..ca25b3a4e 100644 --- a/examples/c/vector_add_openacc.py +++ b/examples/c/vector_add_openacc.py @@ -40,7 +40,7 @@ preprocessor = extract_preprocessor(code) signature = extract_directive_signature(code, kernel_name="vector_add") body = extract_directive_code(code, kernel_name="vector_add") -kernel_string = generate_directive_function(preprocessor, signature, body) +kernel_string = generate_directive_function(preprocessor, signature["vector_add"], body["vector_add"]) size = 65536 From 2f469e562d2b9c589ee5e6a69d5549e801eb6182 Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Fri, 12 May 2023 13:53:12 +0200 Subject: [PATCH 44/93] Add some editors. --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index ffb292e58..bd48c938b 100644 --- a/.gitignore +++ b/.gitignore @@ -23,3 +23,6 @@ temp_*.* .DS_Store .AppleDouble .LSOverride + +.vscode +.idea \ No newline at end of file From 89ffafd34777eb68699c054d83fb68234bde6d23 Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Fri, 12 May 2023 13:53:35 +0200 Subject: [PATCH 45/93] Update the example to allocate memory automatically. --- examples/c/vector_add_openacc.py | 27 +++++++++++++------------- kernel_tuner/util.py | 33 +++++++++++++++++++++++--------- 2 files changed, 37 insertions(+), 23 deletions(-) diff --git a/examples/c/vector_add_openacc.py b/examples/c/vector_add_openacc.py index ca25b3a4e..6196b54d5 100644 --- a/examples/c/vector_add_openacc.py +++ b/examples/c/vector_add_openacc.py @@ -7,7 +7,9 @@ extract_directive_signature, extract_directive_code, extract_preprocessor, - generate_directive_function + generate_directive_function, + extract_directive_data, + allocate_signature_memory, ) from collections import OrderedDict @@ -38,29 +40,26 @@ # Extract tunable directive and generate kernel_string preprocessor = extract_preprocessor(code) -signature = extract_directive_signature(code, kernel_name="vector_add") -body = extract_directive_code(code, kernel_name="vector_add") -kernel_string = generate_directive_function(preprocessor, signature["vector_add"], body["vector_add"]) - -size = 65536 - -a = numpy.random.randn(size).astype(numpy.float32) -b = numpy.random.randn(size).astype(numpy.float32) -c = numpy.zeros_like(b) -n = numpy.int32(size) +signature = extract_directive_signature(code) +body = extract_directive_code(code) +kernel_string = generate_directive_function( + preprocessor, signature["vector_add"], body["vector_add"] +) -args = [a, b, c, n] +# Allocate memory on the host +data = extract_directive_data(code) +args = allocate_signature_memory(data["vector_add"], preprocessor) tune_params = OrderedDict() tune_params["ngangs"] = [2**i for i in range(0, 15)] tune_params["nthreads"] = [2**i for i in range(0, 11)] -answer = [None, None, a + b, None] +answer = [None, None, args[0] + args[1], None] tune_kernel( "vector_add", kernel_string, - size, + 0, args, tune_params, answer=answer, diff --git a/kernel_tuner/util.py b/kernel_tuner/util.py index cc6e84b08..2a1d491d7 100644 --- a/kernel_tuner/util.py +++ b/kernel_tuner/util.py @@ -1233,27 +1233,42 @@ def generate_directive_function( return code -def allocate_signature_memory(data: dict) -> list: +def allocate_signature_memory(data: dict, preprocessor: list = None) -> list: """Allocates the data needed by a kernel and returns the arguments array""" args = [] max_int = 1024 - for type, size in data.items(): - if "*" in type: + for parameter in data.keys(): + p_type = data[parameter][0] + size = data[parameter][1] + if type(size) is not int: + try: + # Try to convert the size to an integer + size = int(size) + except ValueError: + # If size cannot be natively converted to string, we try to derive it from the preprocessor + for line in preprocessor: + if f"#define {size}" in line: + try: + size = int(line.split(" ")[2]) + break + except ValueError: + continue + if "*" in p_type: # The parameter is an array - if type == "float*": + if p_type == "float*": args.append(np.random.rand(size).astype(np.float32)) - elif type == "double*": + elif p_type == "double*": args.append(np.random.rand(size).astype(np.float64)) - elif type == "int*": + elif p_type == "int*": args.append(np.random.randint(max_int, size=size)) else: # The parameter is a scalar - if type == "float": + if p_type == "float": args.append(np.float32(size)) - elif type == "double": + elif p_type == "double": args.append(np.float64(size)) - elif type == "int": + elif p_type == "int": args.append(np.int32(size)) return args From da9facfd14ab7ea51aed2fa77fff5c7bb2d0a29c Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Fri, 12 May 2023 13:58:13 +0200 Subject: [PATCH 46/93] Remove unused import. --- examples/c/vector_add_openacc.py | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/c/vector_add_openacc.py b/examples/c/vector_add_openacc.py index 6196b54d5..8364672df 100644 --- a/examples/c/vector_add_openacc.py +++ b/examples/c/vector_add_openacc.py @@ -1,7 +1,6 @@ #!/usr/bin/env python """This is a simple example for tuning C++ OpenACC code with the kernel tuner""" -import numpy from kernel_tuner import tune_kernel from kernel_tuner.util import ( extract_directive_signature, From d8ee4cc526a9da1dece9639c0d075d5322cd204b Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Tue, 16 May 2023 13:52:42 +0200 Subject: [PATCH 47/93] Add default block size names for OpenACC. --- kernel_tuner/util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel_tuner/util.py b/kernel_tuner/util.py index 2a1d491d7..b25a26a19 100644 --- a/kernel_tuner/util.py +++ b/kernel_tuner/util.py @@ -92,7 +92,7 @@ class StopCriterionReached(Exception): except ImportError: torch = TorchPlaceHolder() -default_block_size_names = ["block_size_x", "block_size_y", "block_size_z"] +default_block_size_names = ["block_size_x", "block_size_y", "block_size_z", "ngangs", "nworkers", "vlength"] def check_argument_type(dtype, kernel_argument): From 63754a24a8bfcf64fffb620a21afa9b4215a063a Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Tue, 16 May 2023 16:12:18 +0200 Subject: [PATCH 48/93] Remove simplified signature, it is up to the user to use it or not in combination with initializer. --- kernel_tuner/util.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/kernel_tuner/util.py b/kernel_tuner/util.py index b25a26a19..9f945dd6b 100644 --- a/kernel_tuner/util.py +++ b/kernel_tuner/util.py @@ -1222,9 +1222,6 @@ def generate_directive_function( code = "\n".join(preprocessor) if "#include " not in preprocessor: code += "\n#include \n" - # If initialization is provided use simplified signature - if len(initialization) > 1: - signature = signature.split("(")[0] + "()" code += 'extern "C" ' + signature + "{\n" if len(initialization) > 1: code += initialization + "\n" From 600384825fb2b0b598e93dd74e3cdd84f0c2b813 Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Tue, 16 May 2023 16:12:58 +0200 Subject: [PATCH 49/93] Formatted using black. --- kernel_tuner/util.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/kernel_tuner/util.py b/kernel_tuner/util.py index 9f945dd6b..00aa8b893 100644 --- a/kernel_tuner/util.py +++ b/kernel_tuner/util.py @@ -92,7 +92,14 @@ class StopCriterionReached(Exception): except ImportError: torch = TorchPlaceHolder() -default_block_size_names = ["block_size_x", "block_size_y", "block_size_z", "ngangs", "nworkers", "vlength"] +default_block_size_names = [ + "block_size_x", + "block_size_y", + "block_size_z", + "ngangs", + "nworkers", + "vlength", +] def check_argument_type(dtype, kernel_argument): From fefa13331a6a3201583f63b8de51b1962945733b Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Wed, 24 May 2023 13:30:52 +0200 Subject: [PATCH 50/93] Dummy fortran timing function. --- kernel_tuner/util.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kernel_tuner/util.py b/kernel_tuner/util.py index 00aa8b893..66005499a 100644 --- a/kernel_tuner/util.py +++ b/kernel_tuner/util.py @@ -1221,6 +1221,11 @@ def wrap_cpp_timing(code: str) -> str: return "\n".join([start, code, end, timing, ret]) +def wrap_fortran_timing(code: str) -> str: + """Wrap Fortran timing around the provided code""" + + return "\n" + def generate_directive_function( preprocessor: str, signature: str, body: str, initialization: str = "" From 2094a4b6651ad547922f0e4d321c8c5162e1894d Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Wed, 24 May 2023 15:11:23 +0200 Subject: [PATCH 51/93] First draft of new fortran openacc vector_add. --- examples/fortran/vector_add_openacc.py | 62 ++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) create mode 100644 examples/fortran/vector_add_openacc.py diff --git a/examples/fortran/vector_add_openacc.py b/examples/fortran/vector_add_openacc.py new file mode 100644 index 000000000..2828fde97 --- /dev/null +++ b/examples/fortran/vector_add_openacc.py @@ -0,0 +1,62 @@ +#!/usr/bin/env python +"""This is a simple example for tuning Fortran OpenACC code with the kernel tuner""" + +from kernel_tuner import tune_kernel +from kernel_tuner.util import ( + extract_directive_signature, + extract_directive_code, + extract_preprocessor, + generate_directive_function, + extract_directive_data, + allocate_signature_memory, +) +from collections import OrderedDict + +code = """ +#define VECTOR_SIZE 65536 + +subroutine vector_add(C, A, B, n) + use iso_c_binding + real (c_float), intent(out), dimension(VECTOR_SIZE) :: C + real (c_float), intent(in), dimension(VECTOR_SIZE) :: A, B + integer (c_int), intent(in) :: n + + !$tuner start vector_add A(float*:VECTOR_SIZE) B(float*:VECTOR_SIZE) C(float*:VECTOR_SIZE) n(int:VECTOR_SIZE) + !$acc parallel loop num_gangs(ngangs) vector_length(vlength) + do i = 1, N + C(i) = A(i) + B(i) + end do + !$acc end parallel loop + !$tuner stop + +end subroutine vector_add +""" + +# Extract tunable directive and generate kernel_string +preprocessor = extract_preprocessor(code) +signature = extract_directive_signature(code) +body = extract_directive_code(code) +kernel_string = generate_directive_function( + preprocessor, signature["vector_add"], body["vector_add"] +) + +# Allocate memory on the host +data = extract_directive_data(code) +args = allocate_signature_memory(data["vector_add"], preprocessor) + +tune_params = OrderedDict() +tune_params["ngangs"] = [2**i for i in range(0, 15)] +tune_params["vlength"] = [2**i for i in range(0, 11)] + +answer = [None, None, args[0] + args[1], None] + +tune_kernel( + "vector_add", + kernel_string, + 0, + args, + tune_params, + answer=answer, + compiler_options=["-fast", "-acc=gpu"], + compiler="nvfortran", +) From 50311f5ee8fba2e00ecb6d5c1a8d860490c09701 Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Wed, 24 May 2023 15:26:45 +0200 Subject: [PATCH 52/93] First draft of Fortran support for OpenACC. --- kernel_tuner/util.py | 117 ++++++++++++++++++++++++++++++++++--------- 1 file changed, 94 insertions(+), 23 deletions(-) diff --git a/kernel_tuner/util.py b/kernel_tuner/util.py index 66005499a..e344283ce 100644 --- a/kernel_tuner/util.py +++ b/kernel_tuner/util.py @@ -1128,16 +1128,38 @@ def extract_code(start: str, stop: str, code: str, kernel_name: str = None) -> d def extract_directive_code(code: str, kernel_name: str = None) -> dict: """Extract explicitly marked directive sections from code""" - start_string = "#pragma tuner start" - end_string = "#pragma tuner stop" + cpp = False + f90 = False + if "#pragma acc" in code: + cpp = True + elif "!$acc" in code: + f90 = True + + if cpp: + start_string = "#pragma tuner start" + end_string = "#pragma tuner stop" + elif f90: + start_string = "!$tuner start" + end_string = "!$tuner stop" return extract_code(start_string, end_string, code, kernel_name) def extract_initialization_code(code: str) -> str: """Extract the initialization section from code""" - start_string = "#pragma tuner initialize" - end_string = "#pragma tuner stop" + cpp = False + f90 = False + if "#pragma acc" in code: + cpp = True + elif "!$acc" in code: + f90 = True + + if cpp: + start_string = "#pragma tuner initialize" + end_string = "#pragma tuner stop" + elif f90: + start_string = "!$tuner initialize" + end_string = "!$tuner stop" function = extract_code(start_string, end_string, code) if len(function) == 1: @@ -1149,7 +1171,17 @@ def extract_initialization_code(code: str) -> str: def extract_directive_signature(code: str, kernel_name: str = None) -> dict: """Extract the user defined signature for directive sections""" - start_string = "#pragma tuner start" + cpp = False + f90 = False + if "#pragma acc" in code: + cpp = True + elif "!$acc" in code: + f90 = True + + if cpp: + start_string = "#pragma tuner start" + elif f90: + start_string = "!$tuner start" signatures = dict() for line in code.replace("\\\n", "").split("\n"): @@ -1168,15 +1200,31 @@ def extract_directive_signature(code: str, kernel_name: str = None) -> dict: p_type = p_type.split(":")[0] if "*" in p_type: p_type = p_type.replace("*", " * restrict") - params.append(f"{p_type} {p_name}") - signatures[name] = f"float {name}({', '.join(params)})" + if cpp: + params.append(f"{p_type} {p_name}") + elif f90: + params.append(p_name) + if cpp: + signatures[name] = f"float {name}({', '.join(params)})" + elif f90: + signatures[name] = f"function {name}({', '.join(params)})" return signatures def extract_directive_data(code: str, kernel_name: str = None) -> dict: """Extract the data used in the directive section""" - start_string = "#pragma tuner start" + cpp = False + f90 = False + if "#pragma acc" in code: + cpp = True + elif "!$acc" in code: + f90 = True + + if cpp: + start_string = "#pragma tuner start" + elif f90: + start_string = "!$tuner start" data = dict() for line in code.replace("\\\n", "").split("\n"): @@ -1202,7 +1250,7 @@ def extract_directive_data(code: str, kernel_name: str = None) -> dict: def extract_preprocessor(code: str) -> list: - """Extract include and define statements from C/C++ code""" + """Extract include and define statements from code""" preprocessor = list() for line in code.replace("\\\n", "").split("\n"): @@ -1212,32 +1260,55 @@ def extract_preprocessor(code: str) -> list: return preprocessor -def wrap_cpp_timing(code: str) -> str: - """Wrap C++ timing code (using std::chrono) around the provided code""" - start = "auto start = std::chrono::steady_clock::now();" - end = "auto end = std::chrono::steady_clock::now();" - timing = "std::chrono::duration elapsed_time = end - start;" - ret = "return elapsed_time.count();" +def wrap_timing(code: str) -> str: + """Wrap timing code around the provided code""" + cpp = False + f90 = False + if "#pragma acc" in code: + cpp = True + elif "!$acc" in code: + f90 = True + + if cpp: + start = "auto start = std::chrono::steady_clock::now();" + end = "auto end = std::chrono::steady_clock::now();" + timing = "std::chrono::duration elapsed_time = end - start;" + ret = "return elapsed_time.count();" + elif f90: + start = "integer,intent(out) start\nreal,intent(out) rate\ninteger,intent(out) end\ncall system_clock(start, rate)" + end = "call system_clock(end)" + timing = "timing = (real(end - start) / real(rate)) * 1e3" + ret = "" return "\n".join([start, code, end, timing, ret]) -def wrap_fortran_timing(code: str) -> str: - """Wrap Fortran timing around the provided code""" - - return "\n" - def generate_directive_function( preprocessor: str, signature: str, body: str, initialization: str = "" ) -> str: """Generate tunable function for one directive""" + cpp = False + f90 = False + if "#pragma acc" in code: + cpp = True + elif "!$acc" in code: + f90 = True + code = "\n".join(preprocessor) - if "#include " not in preprocessor: + if cpp and "#include " not in preprocessor: code += "\n#include \n" - code += 'extern "C" ' + signature + "{\n" + if cpp: + code += 'extern "C" ' + signature + "{\n" + elif f90: + code += signature + " result(timing)\n" if len(initialization) > 1: code += initialization + "\n" - code += wrap_cpp_timing(body) + "\n}" + code += wrap_timing(body) + "\n}" + if cpp: + code += "\n}" + elif f90: + name = signature.split(" ")[1] + code += f"\nend function {name}\n" return code From 22af3fd4655b370f216ff2d04ac74b56f71a60ec Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Wed, 24 May 2023 15:27:08 +0200 Subject: [PATCH 53/93] Format with black. --- examples/fortran/test_fortran_vector_add.py | 18 +++++++++++++----- examples/fortran/vector_add.py | 12 ++++++++---- examples/fortran/vector_add_acc.py | 18 +++++++++++++----- 3 files changed, 34 insertions(+), 14 deletions(-) diff --git a/examples/fortran/test_fortran_vector_add.py b/examples/fortran/test_fortran_vector_add.py index 46495b592..415647024 100755 --- a/examples/fortran/test_fortran_vector_add.py +++ b/examples/fortran/test_fortran_vector_add.py @@ -6,10 +6,10 @@ import numpy as np from kernel_tuner import run_kernel -def test(): +def test(): filename = Path(__file__).parent / "vector_add.F90" - with open(filename, 'r') as f: + with open(filename, "r") as f: kernel_string = f.read() size = 10000000 @@ -25,9 +25,17 @@ def test(): tune_params["N"] = size tune_params["NTHREADS"] = 4 - answer = run_kernel("vector_add", kernel_string, size, args, tune_params, lang="fortran", compiler="gfortran") - - assert np.allclose(answer[0], a+b, atol=1e-8) + answer = run_kernel( + "vector_add", + kernel_string, + size, + args, + tune_params, + lang="fortran", + compiler="gfortran", + ) + + assert np.allclose(answer[0], a + b, atol=1e-8) if __name__ == "__main__": diff --git a/examples/fortran/vector_add.py b/examples/fortran/vector_add.py index ca26ffb90..5c1a5476b 100755 --- a/examples/fortran/vector_add.py +++ b/examples/fortran/vector_add.py @@ -7,7 +7,6 @@ def tune(): - size = int(80e6) a = np.random.randn(size).astype(np.float32) @@ -23,9 +22,14 @@ def tune(): print("compile with gfortran") result, _ = tune_kernel( - "time_vector_add", "vector_add.F90", size, - args, tune_params, lang="C", compiler="gfortran" - ) + "time_vector_add", + "vector_add.F90", + size, + args, + tune_params, + lang="C", + compiler="gfortran", + ) return result diff --git a/examples/fortran/vector_add_acc.py b/examples/fortran/vector_add_acc.py index 27c60f0f9..aa09334e8 100755 --- a/examples/fortran/vector_add_acc.py +++ b/examples/fortran/vector_add_acc.py @@ -6,9 +6,9 @@ import numpy as np from kernel_tuner import tune_kernel -def tune(): - size = int(72*1024*1024) +def tune(): + size = int(72 * 1024 * 1024) a = np.random.randn(size).astype(np.float32) b = np.random.randn(size).astype(np.float32) @@ -21,11 +21,19 @@ def tune(): tune_params["N"] = [size] tune_params["block_size_x"] = [32, 64, 128, 256, 512] - result, env = tune_kernel("time_vector_add", "vector_add_acc.F90", size, args, - tune_params, lang="C", compiler="pgfortran", - compiler_options=["-acc=verystrict", "-ta=tesla,lineinfo"]) + result, env = tune_kernel( + "time_vector_add", + "vector_add_acc.F90", + size, + args, + tune_params, + lang="C", + compiler="pgfortran", + compiler_options=["-acc=verystrict", "-ta=tesla,lineinfo"], + ) return result + if __name__ == "__main__": tune() From 6fbfb33dcb5377f6276e33eae9ba896b0347db8d Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Thu, 25 May 2023 11:40:45 +0200 Subject: [PATCH 54/93] Fux bug. --- kernel_tuner/util.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel_tuner/util.py b/kernel_tuner/util.py index e344283ce..332411915 100644 --- a/kernel_tuner/util.py +++ b/kernel_tuner/util.py @@ -1289,9 +1289,9 @@ def generate_directive_function( """Generate tunable function for one directive""" cpp = False f90 = False - if "#pragma acc" in code: + if "#pragma acc" in body: cpp = True - elif "!$acc" in code: + elif "!$acc" in body: f90 = True code = "\n".join(preprocessor) From 8264e8c6257a15cfefaeada98a6101550365bfc5 Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Thu, 25 May 2023 11:56:18 +0200 Subject: [PATCH 55/93] Missing test from import. --- test/context.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/context.py b/test/context.py index ef99cd647..67fc394a7 100644 --- a/test/context.py +++ b/test/context.py @@ -14,6 +14,7 @@ try: import pynvml + pynvml_present = True except ImportError: pynvml_present = False @@ -54,6 +55,7 @@ skip_if_no_pycuda = pytest.mark.skipif( not pycuda_present, reason="PyCuda not installed or no CUDA device detected" ) +skip_if_no_pynvml = pytest.mark.skipif(not pynvml_present, reason="NVML not installed") skip_if_no_cupy = pytest.mark.skipif( not cupy_present, reason="CuPy not installed or no CUDA device detected" ) From 2250316b192c17b480a1e91dd46091ba4a1db5ea Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Thu, 25 May 2023 12:30:06 +0200 Subject: [PATCH 56/93] Fixed test. --- test/test_util_functions.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/test_util_functions.py b/test/test_util_functions.py index 9b6b02aaa..bb3080d7e 100644 --- a/test/test_util_functions.py +++ b/test/test_util_functions.py @@ -813,9 +813,9 @@ def test_extract_preprocessor(): assert item in results -def test_wrap_cpp_timing(): +def test_wrap_timing(): code = "for ( int i = 0; i < size; i++ ) {\nc[i] = a[i] + b[i];\n}" - wrapped = wrap_cpp_timing(code) + wrapped = wrap_timing(code) assert ( wrapped == "auto start = std::chrono::steady_clock::now();\nfor ( int i = 0; i < size; i++ ) {\nc[i] = a[i] + b[i];\n}\nauto end = std::chrono::steady_clock::now();\nstd::chrono::duration elapsed_time = end - start;\nreturn elapsed_time.count();" From 33836eaef9690b9856269432f77966acf13dcae9 Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Thu, 25 May 2023 12:32:18 +0200 Subject: [PATCH 57/93] Fix test to support C++. --- test/test_util_functions.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/test_util_functions.py b/test/test_util_functions.py index bb3080d7e..4fc8606b6 100644 --- a/test/test_util_functions.py +++ b/test/test_util_functions.py @@ -823,7 +823,7 @@ def test_wrap_timing(): def test_extract_directive_signature(): - code = "#pragma tuner start vector_add a(float*:VECTOR_SIZE) b(float*:VECTOR_SIZE) c(float*:VECTOR_SIZE) size(int:VECTOR_SIZE) " + code = "#pragma tuner start vector_add a(float*:VECTOR_SIZE) b(float*:VECTOR_SIZE) c(float*:VECTOR_SIZE) size(int:VECTOR_SIZE) \n#pragma acc" signatures = extract_directive_signature(code) assert len(signatures) == 1 assert ( @@ -841,7 +841,7 @@ def test_extract_directive_signature(): def test_extract_directive_data(): - code = "#pragma tuner start vector_add a(float*:VECTOR_SIZE) b(float*:VECTOR_SIZE) c(float*:VECTOR_SIZE) size(int:VECTOR_SIZE)" + code = "#pragma tuner start vector_add a(float*:VECTOR_SIZE) b(float*:VECTOR_SIZE) c(float*:VECTOR_SIZE) size(int:VECTOR_SIZE)\n#pragma acc" data = extract_directive_data(code) assert len(data) == 1 assert len(data["vector_add"]) == 4 From 5d2487c6d5a6980670e366190a36504a2ee67c49 Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Thu, 25 May 2023 12:47:44 +0200 Subject: [PATCH 58/93] Fix test. --- test/test_util_functions.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/test_util_functions.py b/test/test_util_functions.py index 4fc8606b6..758daadb5 100644 --- a/test/test_util_functions.py +++ b/test/test_util_functions.py @@ -814,11 +814,11 @@ def test_extract_preprocessor(): def test_wrap_timing(): - code = "for ( int i = 0; i < size; i++ ) {\nc[i] = a[i] + b[i];\n}" + code = "#pragma acc\nfor ( int i = 0; i < size; i++ ) {\nc[i] = a[i] + b[i];\n}" wrapped = wrap_timing(code) assert ( wrapped - == "auto start = std::chrono::steady_clock::now();\nfor ( int i = 0; i < size; i++ ) {\nc[i] = a[i] + b[i];\n}\nauto end = std::chrono::steady_clock::now();\nstd::chrono::duration elapsed_time = end - start;\nreturn elapsed_time.count();" + == "auto start = std::chrono::steady_clock::now();\n#pragma acc\nfor ( int i = 0; i < size; i++ ) {\nc[i] = a[i] + b[i];\n}\nauto end = std::chrono::steady_clock::now();\nstd::chrono::duration elapsed_time = end - start;\nreturn elapsed_time.count();" ) From 31f055ee11a59413383e3b4062be441e1204ec5f Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Thu, 22 Jun 2023 10:32:31 +0200 Subject: [PATCH 59/93] Splitting directive utils from the rest. --- kernel_tuner/util.py | 253 ------------------------------- kernel_tuner/utils/directives.py | 253 +++++++++++++++++++++++++++++++ test/test_util_functions.py | 134 ---------------- test/test_utils_directives.py | 134 ++++++++++++++++ 4 files changed, 387 insertions(+), 387 deletions(-) create mode 100644 kernel_tuner/utils/directives.py create mode 100644 test/test_utils_directives.py diff --git a/kernel_tuner/util.py b/kernel_tuner/util.py index 332411915..bec4d4e5e 100644 --- a/kernel_tuner/util.py +++ b/kernel_tuner/util.py @@ -1099,256 +1099,3 @@ def cuda_error_check(error): if error != nvrtc.nvrtcResult.NVRTC_SUCCESS: _, desc = nvrtc.nvrtcGetErrorString(error) raise RuntimeError(f"NVRTC error: {desc.decode()}") - - -def extract_code(start: str, stop: str, code: str, kernel_name: str = None) -> dict: - """Extract an arbitrary section of code""" - found_section = False - sections = dict() - tmp_string = list() - name = "" - - for line in code.replace("\\\n", "").split("\n"): - if found_section: - if stop in line: - found_section = False - sections[name] = "\n".join(tmp_string) - tmp_string = list() - name = "" - else: - tmp_string.append(line) - else: - if start in line: - if kernel_name is None or f" {kernel_name} " in line: - found_section = True - name = line.strip().split(" ")[3] - - return sections - - -def extract_directive_code(code: str, kernel_name: str = None) -> dict: - """Extract explicitly marked directive sections from code""" - cpp = False - f90 = False - if "#pragma acc" in code: - cpp = True - elif "!$acc" in code: - f90 = True - - if cpp: - start_string = "#pragma tuner start" - end_string = "#pragma tuner stop" - elif f90: - start_string = "!$tuner start" - end_string = "!$tuner stop" - - return extract_code(start_string, end_string, code, kernel_name) - - -def extract_initialization_code(code: str) -> str: - """Extract the initialization section from code""" - cpp = False - f90 = False - if "#pragma acc" in code: - cpp = True - elif "!$acc" in code: - f90 = True - - if cpp: - start_string = "#pragma tuner initialize" - end_string = "#pragma tuner stop" - elif f90: - start_string = "!$tuner initialize" - end_string = "!$tuner stop" - - function = extract_code(start_string, end_string, code) - if len(function) == 1: - _, value = function.popitem() - return value - else: - return "" - - -def extract_directive_signature(code: str, kernel_name: str = None) -> dict: - """Extract the user defined signature for directive sections""" - cpp = False - f90 = False - if "#pragma acc" in code: - cpp = True - elif "!$acc" in code: - f90 = True - - if cpp: - start_string = "#pragma tuner start" - elif f90: - start_string = "!$tuner start" - signatures = dict() - - for line in code.replace("\\\n", "").split("\n"): - if start_string in line: - if kernel_name is None or f" {kernel_name} " in line: - tmp_string = line.strip().split(" ") - name = tmp_string[3] - tmp_string = tmp_string[4:] - params = list() - for param in tmp_string: - if len(param) == 0: - continue - p_name = param.split("(")[0] - param = param.replace(p_name, "", 1) - p_type = param[1:-1] - p_type = p_type.split(":")[0] - if "*" in p_type: - p_type = p_type.replace("*", " * restrict") - if cpp: - params.append(f"{p_type} {p_name}") - elif f90: - params.append(p_name) - if cpp: - signatures[name] = f"float {name}({', '.join(params)})" - elif f90: - signatures[name] = f"function {name}({', '.join(params)})" - - return signatures - - -def extract_directive_data(code: str, kernel_name: str = None) -> dict: - """Extract the data used in the directive section""" - cpp = False - f90 = False - if "#pragma acc" in code: - cpp = True - elif "!$acc" in code: - f90 = True - - if cpp: - start_string = "#pragma tuner start" - elif f90: - start_string = "!$tuner start" - data = dict() - - for line in code.replace("\\\n", "").split("\n"): - if start_string in line: - if kernel_name is None or f" {kernel_name} " in line: - name = line.strip().split(" ")[3] - data[name] = dict() - tmp_string = line.strip().split(" ")[4:] - for param in tmp_string: - if len(param) == 0: - continue - p_name = param.split("(")[0] - param = param.replace(p_name, "", 1) - param = param[1:-1] - p_type = param.split(":")[0] - try: - p_size = param.split(":")[1] - except IndexError: - p_size = 0 - data[name][p_name] = [p_type, p_size] - - return data - - -def extract_preprocessor(code: str) -> list: - """Extract include and define statements from code""" - preprocessor = list() - - for line in code.replace("\\\n", "").split("\n"): - if "#define" in line or "#include" in line: - preprocessor.append(line) - - return preprocessor - - -def wrap_timing(code: str) -> str: - """Wrap timing code around the provided code""" - cpp = False - f90 = False - if "#pragma acc" in code: - cpp = True - elif "!$acc" in code: - f90 = True - - if cpp: - start = "auto start = std::chrono::steady_clock::now();" - end = "auto end = std::chrono::steady_clock::now();" - timing = "std::chrono::duration elapsed_time = end - start;" - ret = "return elapsed_time.count();" - elif f90: - start = "integer,intent(out) start\nreal,intent(out) rate\ninteger,intent(out) end\ncall system_clock(start, rate)" - end = "call system_clock(end)" - timing = "timing = (real(end - start) / real(rate)) * 1e3" - ret = "" - - return "\n".join([start, code, end, timing, ret]) - - -def generate_directive_function( - preprocessor: str, signature: str, body: str, initialization: str = "" -) -> str: - """Generate tunable function for one directive""" - cpp = False - f90 = False - if "#pragma acc" in body: - cpp = True - elif "!$acc" in body: - f90 = True - - code = "\n".join(preprocessor) - if cpp and "#include " not in preprocessor: - code += "\n#include \n" - if cpp: - code += 'extern "C" ' + signature + "{\n" - elif f90: - code += signature + " result(timing)\n" - if len(initialization) > 1: - code += initialization + "\n" - code += wrap_timing(body) + "\n}" - if cpp: - code += "\n}" - elif f90: - name = signature.split(" ")[1] - code += f"\nend function {name}\n" - - return code - - -def allocate_signature_memory(data: dict, preprocessor: list = None) -> list: - """Allocates the data needed by a kernel and returns the arguments array""" - args = [] - max_int = 1024 - - for parameter in data.keys(): - p_type = data[parameter][0] - size = data[parameter][1] - if type(size) is not int: - try: - # Try to convert the size to an integer - size = int(size) - except ValueError: - # If size cannot be natively converted to string, we try to derive it from the preprocessor - for line in preprocessor: - if f"#define {size}" in line: - try: - size = int(line.split(" ")[2]) - break - except ValueError: - continue - if "*" in p_type: - # The parameter is an array - if p_type == "float*": - args.append(np.random.rand(size).astype(np.float32)) - elif p_type == "double*": - args.append(np.random.rand(size).astype(np.float64)) - elif p_type == "int*": - args.append(np.random.randint(max_int, size=size)) - else: - # The parameter is a scalar - if p_type == "float": - args.append(np.float32(size)) - elif p_type == "double": - args.append(np.float64(size)) - elif p_type == "int": - args.append(np.int32(size)) - - return args diff --git a/kernel_tuner/utils/directives.py b/kernel_tuner/utils/directives.py new file mode 100644 index 000000000..090d2f1ee --- /dev/null +++ b/kernel_tuner/utils/directives.py @@ -0,0 +1,253 @@ +import numpy as np + +def extract_code(start: str, stop: str, code: str, kernel_name: str = None) -> dict: + """Extract an arbitrary section of code""" + found_section = False + sections = dict() + tmp_string = list() + name = "" + + for line in code.replace("\\\n", "").split("\n"): + if found_section: + if stop in line: + found_section = False + sections[name] = "\n".join(tmp_string) + tmp_string = list() + name = "" + else: + tmp_string.append(line) + else: + if start in line: + if kernel_name is None or f" {kernel_name} " in line: + found_section = True + name = line.strip().split(" ")[3] + + return sections + + +def extract_directive_code(code: str, kernel_name: str = None) -> dict: + """Extract explicitly marked directive sections from code""" + cpp = False + f90 = False + if "#pragma acc" in code: + cpp = True + elif "!$acc" in code: + f90 = True + + if cpp: + start_string = "#pragma tuner start" + end_string = "#pragma tuner stop" + elif f90: + start_string = "!$tuner start" + end_string = "!$tuner stop" + + return extract_code(start_string, end_string, code, kernel_name) + + +def extract_initialization_code(code: str) -> str: + """Extract the initialization section from code""" + cpp = False + f90 = False + if "#pragma acc" in code: + cpp = True + elif "!$acc" in code: + f90 = True + + if cpp: + start_string = "#pragma tuner initialize" + end_string = "#pragma tuner stop" + elif f90: + start_string = "!$tuner initialize" + end_string = "!$tuner stop" + + function = extract_code(start_string, end_string, code) + if len(function) == 1: + _, value = function.popitem() + return value + else: + return "" + + +def extract_directive_signature(code: str, kernel_name: str = None) -> dict: + """Extract the user defined signature for directive sections""" + cpp = False + f90 = False + if "#pragma acc" in code: + cpp = True + elif "!$acc" in code: + f90 = True + + if cpp: + start_string = "#pragma tuner start" + elif f90: + start_string = "!$tuner start" + signatures = dict() + + for line in code.replace("\\\n", "").split("\n"): + if start_string in line: + if kernel_name is None or f" {kernel_name} " in line: + tmp_string = line.strip().split(" ") + name = tmp_string[3] + tmp_string = tmp_string[4:] + params = list() + for param in tmp_string: + if len(param) == 0: + continue + p_name = param.split("(")[0] + param = param.replace(p_name, "", 1) + p_type = param[1:-1] + p_type = p_type.split(":")[0] + if "*" in p_type: + p_type = p_type.replace("*", " * restrict") + if cpp: + params.append(f"{p_type} {p_name}") + elif f90: + params.append(p_name) + if cpp: + signatures[name] = f"float {name}({', '.join(params)})" + elif f90: + signatures[name] = f"function {name}({', '.join(params)})" + + return signatures + + +def extract_directive_data(code: str, kernel_name: str = None) -> dict: + """Extract the data used in the directive section""" + cpp = False + f90 = False + if "#pragma acc" in code: + cpp = True + elif "!$acc" in code: + f90 = True + + if cpp: + start_string = "#pragma tuner start" + elif f90: + start_string = "!$tuner start" + data = dict() + + for line in code.replace("\\\n", "").split("\n"): + if start_string in line: + if kernel_name is None or f" {kernel_name} " in line: + name = line.strip().split(" ")[3] + data[name] = dict() + tmp_string = line.strip().split(" ")[4:] + for param in tmp_string: + if len(param) == 0: + continue + p_name = param.split("(")[0] + param = param.replace(p_name, "", 1) + param = param[1:-1] + p_type = param.split(":")[0] + try: + p_size = param.split(":")[1] + except IndexError: + p_size = 0 + data[name][p_name] = [p_type, p_size] + + return data + + +def extract_preprocessor(code: str) -> list: + """Extract include and define statements from code""" + preprocessor = list() + + for line in code.replace("\\\n", "").split("\n"): + if "#define" in line or "#include" in line: + preprocessor.append(line) + + return preprocessor + + +def wrap_timing(code: str) -> str: + """Wrap timing code around the provided code""" + cpp = False + f90 = False + if "#pragma acc" in code: + cpp = True + elif "!$acc" in code: + f90 = True + + if cpp: + start = "auto start = std::chrono::steady_clock::now();" + end = "auto end = std::chrono::steady_clock::now();" + timing = "std::chrono::duration elapsed_time = end - start;" + ret = "return elapsed_time.count();" + elif f90: + start = "integer,intent(out) start\nreal,intent(out) rate\ninteger,intent(out) end\ncall system_clock(start, rate)" + end = "call system_clock(end)" + timing = "timing = (real(end - start) / real(rate)) * 1e3" + ret = "" + + return "\n".join([start, code, end, timing, ret]) + + +def generate_directive_function( + preprocessor: str, signature: str, body: str, initialization: str = "" +) -> str: + """Generate tunable function for one directive""" + cpp = False + f90 = False + if "#pragma acc" in body: + cpp = True + elif "!$acc" in body: + f90 = True + + code = "\n".join(preprocessor) + if cpp and "#include " not in preprocessor: + code += "\n#include \n" + if cpp: + code += 'extern "C" ' + signature + "{\n" + elif f90: + code += signature + " result(timing)\n" + if len(initialization) > 1: + code += initialization + "\n" + code += wrap_timing(body) + "\n}" + if cpp: + code += "\n}" + elif f90: + name = signature.split(" ")[1] + code += f"\nend function {name}\n" + + return code + + +def allocate_signature_memory(data: dict, preprocessor: list = None) -> list: + """Allocates the data needed by a kernel and returns the arguments array""" + args = [] + max_int = 1024 + + for parameter in data.keys(): + p_type = data[parameter][0] + size = data[parameter][1] + if type(size) is not int: + try: + # Try to convert the size to an integer + size = int(size) + except ValueError: + # If size cannot be natively converted to string, we try to derive it from the preprocessor + for line in preprocessor: + if f"#define {size}" in line: + try: + size = int(line.split(" ")[2]) + break + except ValueError: + continue + if "*" in p_type: + # The parameter is an array + if p_type == "float*": + args.append(np.random.rand(size).astype(np.float32)) + elif p_type == "double*": + args.append(np.random.rand(size).astype(np.float64)) + elif p_type == "int*": + args.append(np.random.randint(max_int, size=size)) + else: + # The parameter is a scalar + if p_type == "float": + args.append(np.float32(size)) + elif p_type == "double": + args.append(np.float64(size)) + elif p_type == "int": + args.append(np.int32(size)) + + return args diff --git a/test/test_util_functions.py b/test/test_util_functions.py index 758daadb5..777600df1 100644 --- a/test/test_util_functions.py +++ b/test/test_util_functions.py @@ -716,137 +716,3 @@ def test_parse_restrictions(): expected = '(params["block_size_x"] != 320) and (params["use_padding"] == 0 or params["block_size_x"] % 32 != 0)' assert expected in parsed - - -def test_extract_directive_code(): - code = """ - #include - - #define VECTOR_SIZE 65536 - - int main(void) { - int size = VECTOR_SIZE; - __restrict float * a = (float *) malloc(VECTOR_SIZE * sizeof(float)); - __restrict float * b = (float *) malloc(VECTOR_SIZE * sizeof(float)); - __restrict float * c = (float *) malloc(VECTOR_SIZE * sizeof(float)); - - #pragma tuner start initialize - #pragma acc parallel - #pragma acc loop - for ( int i = 0; i < size; i++ ) { - a[i] = i; - b[i] = i + 1; - } - #pragma tuner stop - - #pragma tuner start vector_add - #pragma acc parallel - #pragma acc loop - for ( int i = 0; i < size; i++ ) { - c[i] = a[i] + b[i]; - } - #pragma tuner stop - - free(a); - free(b); - free(c); - } - """ - expected_one = """ #pragma acc parallel - #pragma acc loop - for ( int i = 0; i < size; i++ ) { - a[i] = i; - b[i] = i + 1; - }""" - expected_two = """ #pragma acc parallel - #pragma acc loop - for ( int i = 0; i < size; i++ ) { - c[i] = a[i] + b[i]; - }""" - returns = extract_directive_code(code) - assert len(returns) == 2 - assert expected_one in returns["initialize"] - assert expected_two in returns["vector_add"] - assert expected_one not in returns["vector_add"] - returns = extract_directive_code(code, "vector_a") - assert len(returns) == 0 - - -def test_extract_preprocessor(): - code = """ - #include - - #define VECTOR_SIZE 65536 - - int main(void) { - int size = VECTOR_SIZE; - __restrict float * a = (float *) malloc(VECTOR_SIZE * sizeof(float)); - __restrict float * b = (float *) malloc(VECTOR_SIZE * sizeof(float)); - __restrict float * c = (float *) malloc(VECTOR_SIZE * sizeof(float)); - - #pragma tuner start - #pragma acc parallel - #pragma acc loop - for ( int i = 0; i < size; i++ ) { - a[i] = i; - b[i] = i + 1; - } - #pragma tuner stop - - #pragma tuner start - #pragma acc parallel - #pragma acc loop - for ( int i = 0; i < size; i++ ) { - c[i] = a[i] + b[i]; - } - #pragma tuner stop - - free(a); - free(b); - free(c); - } - """ - expected = [" #include ", " #define VECTOR_SIZE 65536"] - results = extract_preprocessor(code) - assert len(results) == 2 - for item in expected: - assert item in results - - -def test_wrap_timing(): - code = "#pragma acc\nfor ( int i = 0; i < size; i++ ) {\nc[i] = a[i] + b[i];\n}" - wrapped = wrap_timing(code) - assert ( - wrapped - == "auto start = std::chrono::steady_clock::now();\n#pragma acc\nfor ( int i = 0; i < size; i++ ) {\nc[i] = a[i] + b[i];\n}\nauto end = std::chrono::steady_clock::now();\nstd::chrono::duration elapsed_time = end - start;\nreturn elapsed_time.count();" - ) - - -def test_extract_directive_signature(): - code = "#pragma tuner start vector_add a(float*:VECTOR_SIZE) b(float*:VECTOR_SIZE) c(float*:VECTOR_SIZE) size(int:VECTOR_SIZE) \n#pragma acc" - signatures = extract_directive_signature(code) - assert len(signatures) == 1 - assert ( - "float vector_add(float * restrict a, float * restrict b, float * restrict c, int size)" - in signatures["vector_add"] - ) - signatures = extract_directive_signature(code, "vector_add") - assert len(signatures) == 1 - assert ( - "float vector_add(float * restrict a, float * restrict b, float * restrict c, int size)" - in signatures["vector_add"] - ) - signatures = extract_directive_signature(code, "vector_add_ext") - assert len(signatures) == 0 - - -def test_extract_directive_data(): - code = "#pragma tuner start vector_add a(float*:VECTOR_SIZE) b(float*:VECTOR_SIZE) c(float*:VECTOR_SIZE) size(int:VECTOR_SIZE)\n#pragma acc" - data = extract_directive_data(code) - assert len(data) == 1 - assert len(data["vector_add"]) == 4 - assert "float*" in data["vector_add"]["b"] - assert "int" not in data["vector_add"]["c"] - assert "VECTOR_SIZE" in data["vector_add"]["size"] - data = extract_directive_data(code, "vector_add_double") - assert len(data) == 0 diff --git a/test/test_utils_directives.py b/test/test_utils_directives.py new file mode 100644 index 000000000..aa52d1ea6 --- /dev/null +++ b/test/test_utils_directives.py @@ -0,0 +1,134 @@ +from kernel_tuner.utils.directives import * + +def test_extract_directive_code(): + code = """ + #include + + #define VECTOR_SIZE 65536 + + int main(void) { + int size = VECTOR_SIZE; + __restrict float * a = (float *) malloc(VECTOR_SIZE * sizeof(float)); + __restrict float * b = (float *) malloc(VECTOR_SIZE * sizeof(float)); + __restrict float * c = (float *) malloc(VECTOR_SIZE * sizeof(float)); + + #pragma tuner start initialize + #pragma acc parallel + #pragma acc loop + for ( int i = 0; i < size; i++ ) { + a[i] = i; + b[i] = i + 1; + } + #pragma tuner stop + + #pragma tuner start vector_add + #pragma acc parallel + #pragma acc loop + for ( int i = 0; i < size; i++ ) { + c[i] = a[i] + b[i]; + } + #pragma tuner stop + + free(a); + free(b); + free(c); + } + """ + expected_one = """ #pragma acc parallel + #pragma acc loop + for ( int i = 0; i < size; i++ ) { + a[i] = i; + b[i] = i + 1; + }""" + expected_two = """ #pragma acc parallel + #pragma acc loop + for ( int i = 0; i < size; i++ ) { + c[i] = a[i] + b[i]; + }""" + returns = extract_directive_code(code) + assert len(returns) == 2 + assert expected_one in returns["initialize"] + assert expected_two in returns["vector_add"] + assert expected_one not in returns["vector_add"] + returns = extract_directive_code(code, "vector_a") + assert len(returns) == 0 + + +def test_extract_preprocessor(): + code = """ + #include + + #define VECTOR_SIZE 65536 + + int main(void) { + int size = VECTOR_SIZE; + __restrict float * a = (float *) malloc(VECTOR_SIZE * sizeof(float)); + __restrict float * b = (float *) malloc(VECTOR_SIZE * sizeof(float)); + __restrict float * c = (float *) malloc(VECTOR_SIZE * sizeof(float)); + + #pragma tuner start + #pragma acc parallel + #pragma acc loop + for ( int i = 0; i < size; i++ ) { + a[i] = i; + b[i] = i + 1; + } + #pragma tuner stop + + #pragma tuner start + #pragma acc parallel + #pragma acc loop + for ( int i = 0; i < size; i++ ) { + c[i] = a[i] + b[i]; + } + #pragma tuner stop + + free(a); + free(b); + free(c); + } + """ + expected = [" #include ", " #define VECTOR_SIZE 65536"] + results = extract_preprocessor(code) + assert len(results) == 2 + for item in expected: + assert item in results + + +def test_wrap_timing(): + code = "#pragma acc\nfor ( int i = 0; i < size; i++ ) {\nc[i] = a[i] + b[i];\n}" + wrapped = wrap_timing(code) + assert ( + wrapped + == "auto start = std::chrono::steady_clock::now();\n#pragma acc\nfor ( int i = 0; i < size; i++ ) {\nc[i] = a[i] + b[i];\n}\nauto end = std::chrono::steady_clock::now();\nstd::chrono::duration elapsed_time = end - start;\nreturn elapsed_time.count();" + ) + + +def test_extract_directive_signature(): + code = "#pragma tuner start vector_add a(float*:VECTOR_SIZE) b(float*:VECTOR_SIZE) c(float*:VECTOR_SIZE) size(int:VECTOR_SIZE) \n#pragma acc" + signatures = extract_directive_signature(code) + assert len(signatures) == 1 + assert ( + "float vector_add(float * restrict a, float * restrict b, float * restrict c, int size)" + in signatures["vector_add"] + ) + signatures = extract_directive_signature(code, "vector_add") + assert len(signatures) == 1 + assert ( + "float vector_add(float * restrict a, float * restrict b, float * restrict c, int size)" + in signatures["vector_add"] + ) + signatures = extract_directive_signature(code, "vector_add_ext") + assert len(signatures) == 0 + + +def test_extract_directive_data(): + code = "#pragma tuner start vector_add a(float*:VECTOR_SIZE) b(float*:VECTOR_SIZE) c(float*:VECTOR_SIZE) size(int:VECTOR_SIZE)\n#pragma acc" + data = extract_directive_data(code) + assert len(data) == 1 + assert len(data["vector_add"]) == 4 + assert "float*" in data["vector_add"]["b"] + assert "int" not in data["vector_add"]["c"] + assert "VECTOR_SIZE" in data["vector_add"]["size"] + data = extract_directive_data(code, "vector_add_double") + assert len(data) == 0 From b1f4e5ebe64f4c8ac8ebeff14450aac3d7d7e1e1 Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Thu, 22 Jun 2023 11:40:29 +0200 Subject: [PATCH 60/93] Update imports. --- examples/c/vector_add_openacc.py | 2 +- examples/fortran/vector_add_openacc.py | 2 +- kernel_tuner/utils/directives.py | 1 + test/test_utils_directives.py | 1 + 4 files changed, 4 insertions(+), 2 deletions(-) diff --git a/examples/c/vector_add_openacc.py b/examples/c/vector_add_openacc.py index 8364672df..8894de1e2 100644 --- a/examples/c/vector_add_openacc.py +++ b/examples/c/vector_add_openacc.py @@ -2,7 +2,7 @@ """This is a simple example for tuning C++ OpenACC code with the kernel tuner""" from kernel_tuner import tune_kernel -from kernel_tuner.util import ( +from kernel_tuner.utils.directives import ( extract_directive_signature, extract_directive_code, extract_preprocessor, diff --git a/examples/fortran/vector_add_openacc.py b/examples/fortran/vector_add_openacc.py index 2828fde97..eee345cc8 100644 --- a/examples/fortran/vector_add_openacc.py +++ b/examples/fortran/vector_add_openacc.py @@ -2,7 +2,7 @@ """This is a simple example for tuning Fortran OpenACC code with the kernel tuner""" from kernel_tuner import tune_kernel -from kernel_tuner.util import ( +from kernel_tuner.utils.directives import ( extract_directive_signature, extract_directive_code, extract_preprocessor, diff --git a/kernel_tuner/utils/directives.py b/kernel_tuner/utils/directives.py index 090d2f1ee..c27fa88e1 100644 --- a/kernel_tuner/utils/directives.py +++ b/kernel_tuner/utils/directives.py @@ -1,5 +1,6 @@ import numpy as np + def extract_code(start: str, stop: str, code: str, kernel_name: str = None) -> dict: """Extract an arbitrary section of code""" found_section = False diff --git a/test/test_utils_directives.py b/test/test_utils_directives.py index aa52d1ea6..cf647ee75 100644 --- a/test/test_utils_directives.py +++ b/test/test_utils_directives.py @@ -1,5 +1,6 @@ from kernel_tuner.utils.directives import * + def test_extract_directive_code(): code = """ #include From 3503544d25beccdc4ba60a48de40f6b45bf96ccc Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Thu, 22 Jun 2023 11:44:11 +0200 Subject: [PATCH 61/93] Add module init. --- kernel_tuner/utils/__init__.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 kernel_tuner/utils/__init__.py diff --git a/kernel_tuner/utils/__init__.py b/kernel_tuner/utils/__init__.py new file mode 100644 index 000000000..e69de29bb From eb7928cd24e0ecad2c802237767981cc60419c4a Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Thu, 22 Jun 2023 11:57:26 +0200 Subject: [PATCH 62/93] Updated setup. --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index 040a1c931..829efd737 100644 --- a/setup.py +++ b/setup.py @@ -39,6 +39,7 @@ def readme(): "kernel_tuner.observers", "kernel_tuner.runners", "kernel_tuner.strategies", + "kernel_tuner.utils" ], long_description=readme(), long_description_content_type="text/x-rst", From 4caa52c91bab42b51eb5a20794466b7dca339516 Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Thu, 22 Jun 2023 12:00:06 +0200 Subject: [PATCH 63/93] Fix bug in generating tuning function. --- kernel_tuner/utils/directives.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel_tuner/utils/directives.py b/kernel_tuner/utils/directives.py index c27fa88e1..7135b5287 100644 --- a/kernel_tuner/utils/directives.py +++ b/kernel_tuner/utils/directives.py @@ -203,7 +203,7 @@ def generate_directive_function( code += signature + " result(timing)\n" if len(initialization) > 1: code += initialization + "\n" - code += wrap_timing(body) + "\n}" + code += wrap_timing(body) if cpp: code += "\n}" elif f90: From 0b780da3605e6174ed3d8831f0c0698c07b6c8b5 Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Thu, 22 Jun 2023 13:12:07 +0200 Subject: [PATCH 64/93] Added test and fixed some bugs. --- kernel_tuner/utils/directives.py | 13 +++++++++++-- test/test_utils_directives.py | 20 +++++++++++++++++++- 2 files changed, 30 insertions(+), 3 deletions(-) diff --git a/kernel_tuner/utils/directives.py b/kernel_tuner/utils/directives.py index 7135b5287..68a641a30 100644 --- a/kernel_tuner/utils/directives.py +++ b/kernel_tuner/utils/directives.py @@ -7,6 +7,12 @@ def extract_code(start: str, stop: str, code: str, kernel_name: str = None) -> d sections = dict() tmp_string = list() name = "" + cpp = False + f90 = False + if "#pragma acc" in code: + cpp = True + elif "!$acc" in code: + f90 = True for line in code.replace("\\\n", "").split("\n"): if found_section: @@ -19,9 +25,12 @@ def extract_code(start: str, stop: str, code: str, kernel_name: str = None) -> d tmp_string.append(line) else: if start in line: - if kernel_name is None or f" {kernel_name} " in line: + if kernel_name is None or f" {kernel_name} " in line or ( kernel_name in line and len(line.partition(kernel_name)[2]) == 0): found_section = True - name = line.strip().split(" ")[3] + if cpp: + name = line.strip().split(" ")[3] + elif f90: + name = line.strip().split(" ")[2] return sections diff --git a/test/test_utils_directives.py b/test/test_utils_directives.py index cf647ee75..ecb51d53a 100644 --- a/test/test_utils_directives.py +++ b/test/test_utils_directives.py @@ -51,9 +51,27 @@ def test_extract_directive_code(): assert expected_one in returns["initialize"] assert expected_two in returns["vector_add"] assert expected_one not in returns["vector_add"] - returns = extract_directive_code(code, "vector_a") + returns = extract_directive_code(code, "vector") assert len(returns) == 0 + code = """ + !$tuner start vector_add + !$acc parallel loop num_gangs(ngangs) vector_length(vlength) + do i = 1, N + C(i) = A(i) + B(i) + end do + !$acc end parallel loop + !$tuner stop + """ + expected = """ !$acc parallel loop num_gangs(ngangs) vector_length(vlength) + do i = 1, N + C(i) = A(i) + B(i) + end do + !$acc end parallel loop""" + returns = extract_directive_code(code, "vector_add") + assert len(returns) == 1 + assert expected in returns["vector_add"] + def test_extract_preprocessor(): code = """ From 67456e6c57468403de4b0c2ee5af717d6f0a392f Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Thu, 22 Jun 2023 14:52:43 +0200 Subject: [PATCH 65/93] Add test and fix bug in signature. --- kernel_tuner/utils/directives.py | 8 ++++++-- test/test_utils_directives.py | 4 ++++ 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/kernel_tuner/utils/directives.py b/kernel_tuner/utils/directives.py index 68a641a30..ca5ce3cd4 100644 --- a/kernel_tuner/utils/directives.py +++ b/kernel_tuner/utils/directives.py @@ -97,8 +97,12 @@ def extract_directive_signature(code: str, kernel_name: str = None) -> dict: if start_string in line: if kernel_name is None or f" {kernel_name} " in line: tmp_string = line.strip().split(" ") - name = tmp_string[3] - tmp_string = tmp_string[4:] + if cpp: + name = tmp_string[3] + tmp_string = tmp_string[4:] + elif f90: + name = tmp_string[2] + tmp_string = tmp_string[3:] params = list() for param in tmp_string: if len(param) == 0: diff --git a/test/test_utils_directives.py b/test/test_utils_directives.py index ecb51d53a..585662c7d 100644 --- a/test/test_utils_directives.py +++ b/test/test_utils_directives.py @@ -139,6 +139,10 @@ def test_extract_directive_signature(): ) signatures = extract_directive_signature(code, "vector_add_ext") assert len(signatures) == 0 + code = "!$tuner start vector_add A(float*:VECTOR_SIZE) B(float*:VECTOR_SIZE) C(float*:VECTOR_SIZE) n(int:VECTOR_SIZE)\n!$acc" + signatures = extract_directive_signature(code) + assert len(signatures) == 1 + assert "function vector_add(A, B, C, n)" in signatures["vector_add"] def test_extract_directive_data(): From 09575927769fa2af31cfd1ba9668a2287572fc7c Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Thu, 22 Jun 2023 15:05:31 +0200 Subject: [PATCH 66/93] Some code refactoring. --- kernel_tuner/utils/directives.py | 65 ++++++++++---------------------- 1 file changed, 20 insertions(+), 45 deletions(-) diff --git a/kernel_tuner/utils/directives.py b/kernel_tuner/utils/directives.py index ca5ce3cd4..0589664da 100644 --- a/kernel_tuner/utils/directives.py +++ b/kernel_tuner/utils/directives.py @@ -1,18 +1,23 @@ import numpy as np +def correct_kernel(kernel_name: str, line: str) -> bool: + return f" {kernel_name} " in line or ( + kernel_name in line and len(line.partition(kernel_name)[2]) == 0 + ) + + +def cpp_or_f90(code: str) -> list: + return "#pragma acc" in code, "!$acc" in code + + def extract_code(start: str, stop: str, code: str, kernel_name: str = None) -> dict: """Extract an arbitrary section of code""" found_section = False sections = dict() tmp_string = list() name = "" - cpp = False - f90 = False - if "#pragma acc" in code: - cpp = True - elif "!$acc" in code: - f90 = True + cpp, f90 = cpp_or_f90(code) for line in code.replace("\\\n", "").split("\n"): if found_section: @@ -25,7 +30,7 @@ def extract_code(start: str, stop: str, code: str, kernel_name: str = None) -> d tmp_string.append(line) else: if start in line: - if kernel_name is None or f" {kernel_name} " in line or ( kernel_name in line and len(line.partition(kernel_name)[2]) == 0): + if kernel_name is None or correct_kernel(kernel_name, line): found_section = True if cpp: name = line.strip().split(" ")[3] @@ -37,12 +42,7 @@ def extract_code(start: str, stop: str, code: str, kernel_name: str = None) -> d def extract_directive_code(code: str, kernel_name: str = None) -> dict: """Extract explicitly marked directive sections from code""" - cpp = False - f90 = False - if "#pragma acc" in code: - cpp = True - elif "!$acc" in code: - f90 = True + cpp, f90 = cpp_or_f90(code) if cpp: start_string = "#pragma tuner start" @@ -56,12 +56,7 @@ def extract_directive_code(code: str, kernel_name: str = None) -> dict: def extract_initialization_code(code: str) -> str: """Extract the initialization section from code""" - cpp = False - f90 = False - if "#pragma acc" in code: - cpp = True - elif "!$acc" in code: - f90 = True + cpp, f90 = cpp_or_f90(code) if cpp: start_string = "#pragma tuner initialize" @@ -80,12 +75,7 @@ def extract_initialization_code(code: str) -> str: def extract_directive_signature(code: str, kernel_name: str = None) -> dict: """Extract the user defined signature for directive sections""" - cpp = False - f90 = False - if "#pragma acc" in code: - cpp = True - elif "!$acc" in code: - f90 = True + cpp, f90 = cpp_or_f90(code) if cpp: start_string = "#pragma tuner start" @@ -95,7 +85,7 @@ def extract_directive_signature(code: str, kernel_name: str = None) -> dict: for line in code.replace("\\\n", "").split("\n"): if start_string in line: - if kernel_name is None or f" {kernel_name} " in line: + if kernel_name is None or correct_kernel(kernel_name, line): tmp_string = line.strip().split(" ") if cpp: name = tmp_string[3] @@ -127,12 +117,7 @@ def extract_directive_signature(code: str, kernel_name: str = None) -> dict: def extract_directive_data(code: str, kernel_name: str = None) -> dict: """Extract the data used in the directive section""" - cpp = False - f90 = False - if "#pragma acc" in code: - cpp = True - elif "!$acc" in code: - f90 = True + cpp, f90 = cpp_or_f90(code) if cpp: start_string = "#pragma tuner start" @@ -142,7 +127,7 @@ def extract_directive_data(code: str, kernel_name: str = None) -> dict: for line in code.replace("\\\n", "").split("\n"): if start_string in line: - if kernel_name is None or f" {kernel_name} " in line: + if kernel_name is None or correct_kernel(kernel_name, line): name = line.strip().split(" ")[3] data[name] = dict() tmp_string = line.strip().split(" ")[4:] @@ -175,12 +160,7 @@ def extract_preprocessor(code: str) -> list: def wrap_timing(code: str) -> str: """Wrap timing code around the provided code""" - cpp = False - f90 = False - if "#pragma acc" in code: - cpp = True - elif "!$acc" in code: - f90 = True + cpp, f90 = cpp_or_f90(code) if cpp: start = "auto start = std::chrono::steady_clock::now();" @@ -200,12 +180,7 @@ def generate_directive_function( preprocessor: str, signature: str, body: str, initialization: str = "" ) -> str: """Generate tunable function for one directive""" - cpp = False - f90 = False - if "#pragma acc" in body: - cpp = True - elif "!$acc" in body: - f90 = True + cpp, f90 = cpp_or_f90(code) code = "\n".join(preprocessor) if cpp and "#include " not in preprocessor: From 96e852d32cb527a5f84d25dffa789b4e275c0ade Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Thu, 22 Jun 2023 15:10:32 +0200 Subject: [PATCH 67/93] Added test and fixed bugs. --- kernel_tuner/utils/directives.py | 8 ++++++-- test/test_utils_directives.py | 7 +++++++ 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/kernel_tuner/utils/directives.py b/kernel_tuner/utils/directives.py index 0589664da..b6144433c 100644 --- a/kernel_tuner/utils/directives.py +++ b/kernel_tuner/utils/directives.py @@ -128,9 +128,13 @@ def extract_directive_data(code: str, kernel_name: str = None) -> dict: for line in code.replace("\\\n", "").split("\n"): if start_string in line: if kernel_name is None or correct_kernel(kernel_name, line): - name = line.strip().split(" ")[3] + if cpp: + name = line.strip().split(" ")[3] + tmp_string = line.strip().split(" ")[4:] + elif f90: + name = line.strip().split(" ")[2] + tmp_string = line.strip().split(" ")[3:] data[name] = dict() - tmp_string = line.strip().split(" ")[4:] for param in tmp_string: if len(param) == 0: continue diff --git a/test/test_utils_directives.py b/test/test_utils_directives.py index 585662c7d..1e8f79d93 100644 --- a/test/test_utils_directives.py +++ b/test/test_utils_directives.py @@ -155,3 +155,10 @@ def test_extract_directive_data(): assert "VECTOR_SIZE" in data["vector_add"]["size"] data = extract_directive_data(code, "vector_add_double") assert len(data) == 0 + code = "!$tuner start vector_add A(float*:VECTOR_SIZE) B(float*:VECTOR_SIZE) C(float*:VECTOR_SIZE) n(int:VECTOR_SIZE)\n!$acc" + data = extract_directive_data(code) + assert len(data) == 1 + assert len(data["vector_add"]) == 4 + assert "float*" in data["vector_add"]["B"] + assert "int" not in data["vector_add"]["C"] + assert "VECTOR_SIZE" in data["vector_add"]["n"] From 89b88aac7e03e42d1db16bbe5088fe25e5bfc727 Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Thu, 22 Jun 2023 15:13:13 +0200 Subject: [PATCH 68/93] Typo. --- kernel_tuner/utils/directives.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel_tuner/utils/directives.py b/kernel_tuner/utils/directives.py index b6144433c..5a44655c9 100644 --- a/kernel_tuner/utils/directives.py +++ b/kernel_tuner/utils/directives.py @@ -184,7 +184,7 @@ def generate_directive_function( preprocessor: str, signature: str, body: str, initialization: str = "" ) -> str: """Generate tunable function for one directive""" - cpp, f90 = cpp_or_f90(code) + cpp, f90 = cpp_or_f90(body) code = "\n".join(preprocessor) if cpp and "#include " not in preprocessor: From c9e713231a55c437e61ac8c5a3232b90d9aacf54 Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Thu, 22 Jun 2023 15:25:29 +0200 Subject: [PATCH 69/93] Fixed a bug in code generation. --- kernel_tuner/utils/directives.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel_tuner/utils/directives.py b/kernel_tuner/utils/directives.py index 5a44655c9..7fc437ce8 100644 --- a/kernel_tuner/utils/directives.py +++ b/kernel_tuner/utils/directives.py @@ -192,7 +192,7 @@ def generate_directive_function( if cpp: code += 'extern "C" ' + signature + "{\n" elif f90: - code += signature + " result(timing)\n" + code += "\n" + signature + " result(timing)\n" if len(initialization) > 1: code += initialization + "\n" code += wrap_timing(body) From d2dab14448c1ddb18785746984ed167f034de465 Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Thu, 22 Jun 2023 15:26:40 +0200 Subject: [PATCH 70/93] Extract correct name. --- kernel_tuner/utils/directives.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel_tuner/utils/directives.py b/kernel_tuner/utils/directives.py index 7fc437ce8..e312b2c56 100644 --- a/kernel_tuner/utils/directives.py +++ b/kernel_tuner/utils/directives.py @@ -199,7 +199,7 @@ def generate_directive_function( if cpp: code += "\n}" elif f90: - name = signature.split(" ")[1] + name = signature.split("(")[0] code += f"\nend function {name}\n" return code From d45f34353d48bb0b740f8a32a9d042104c5990d3 Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Thu, 22 Jun 2023 15:34:57 +0200 Subject: [PATCH 71/93] Bug fix. --- kernel_tuner/utils/directives.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel_tuner/utils/directives.py b/kernel_tuner/utils/directives.py index e312b2c56..c37b4fb5c 100644 --- a/kernel_tuner/utils/directives.py +++ b/kernel_tuner/utils/directives.py @@ -199,7 +199,7 @@ def generate_directive_function( if cpp: code += "\n}" elif f90: - name = signature.split("(")[0] + name = signature.split(" ")[1].split("(")[0] code += f"\nend function {name}\n" return code From 98302d0f3ee12e7d269b9130a464cc8d3fb77d95 Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Thu, 22 Jun 2023 15:40:11 +0200 Subject: [PATCH 72/93] Fix fortran timing code. --- kernel_tuner/utils/directives.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel_tuner/utils/directives.py b/kernel_tuner/utils/directives.py index c37b4fb5c..be576ee91 100644 --- a/kernel_tuner/utils/directives.py +++ b/kernel_tuner/utils/directives.py @@ -172,7 +172,7 @@ def wrap_timing(code: str) -> str: timing = "std::chrono::duration elapsed_time = end - start;" ret = "return elapsed_time.count();" elif f90: - start = "integer,intent(out) start\nreal,intent(out) rate\ninteger,intent(out) end\ncall system_clock(start, rate)" + start = "integer :: start\nreal :: rate\ninteger :: end\ncall system_clock(start, rate)" end = "call system_clock(end)" timing = "timing = (real(end - start) / real(rate)) * 1e3" ret = "" From e203b00f9593e5b408817977fb89f4fbd7b4728a Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Thu, 22 Jun 2023 16:22:48 +0200 Subject: [PATCH 73/93] Working on the right fortran signature. --- kernel_tuner/utils/directives.py | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) diff --git a/kernel_tuner/utils/directives.py b/kernel_tuner/utils/directives.py index be576ee91..e52d0b9fb 100644 --- a/kernel_tuner/utils/directives.py +++ b/kernel_tuner/utils/directives.py @@ -110,7 +110,29 @@ def extract_directive_signature(code: str, kernel_name: str = None) -> dict: if cpp: signatures[name] = f"float {name}({', '.join(params)})" elif f90: - signatures[name] = f"function {name}({', '.join(params)})" + signatures[name] = f"function {name}({', '.join(params)}) result(timing)" + params = list() + for param in tmp_string: + if len(param) == 0: + continue + p_name = param.split("(")[0] + param = param.replace(p_name, "", 1) + p_type = param[1:-1] + p_size = p_type.split(":")[1] + p_type = p_type.split(":")[0] + if "float*" in p_type: + params.append(f"real (c_float), dimension({p_size}) :: {p_name}") + elif "double*" in p_type: + params.append(f"real (c_double), dimension({p_size}) :: {p_name}") + elif "int*" in p_type: + params.append(f"integer (c_int), dimension({p_size}) :: {p_name}") + elif "float" in p_type: + params.append(f"real (c_float) :: {p_name}") + elif "double" in p_type: + params.append(f"real (c_double) :: {p_name}") + elif "int" in p_type: + params.append(f"integer (c_int) :: {p_name}") + signatures[name] += "\n".join(params) return signatures @@ -172,7 +194,7 @@ def wrap_timing(code: str) -> str: timing = "std::chrono::duration elapsed_time = end - start;" ret = "return elapsed_time.count();" elif f90: - start = "integer :: start\nreal :: rate\ninteger :: end\ncall system_clock(start, rate)" + start = "integer (c_int) :: start\nreal (c_float) :: rate\ninteger (c_int) :: end\ncall system_clock(start, rate)" end = "call system_clock(end)" timing = "timing = (real(end - start) / real(rate)) * 1e3" ret = "" @@ -192,7 +214,7 @@ def generate_directive_function( if cpp: code += 'extern "C" ' + signature + "{\n" elif f90: - code += "\n" + signature + " result(timing)\n" + code += "\n" + signature if len(initialization) > 1: code += initialization + "\n" code += wrap_timing(body) From d4927bfd0cfd3032f99cca6f9de1ebf8d11e44d9 Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Fri, 23 Jun 2023 10:07:28 +0200 Subject: [PATCH 74/93] Fix bug. --- kernel_tuner/utils/directives.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel_tuner/utils/directives.py b/kernel_tuner/utils/directives.py index e52d0b9fb..040cb8123 100644 --- a/kernel_tuner/utils/directives.py +++ b/kernel_tuner/utils/directives.py @@ -110,7 +110,7 @@ def extract_directive_signature(code: str, kernel_name: str = None) -> dict: if cpp: signatures[name] = f"float {name}({', '.join(params)})" elif f90: - signatures[name] = f"function {name}({', '.join(params)}) result(timing)" + signatures[name] = f"function {name}({', '.join(params)}) result(timing)\n" params = list() for param in tmp_string: if len(param) == 0: From a66339f9048f12abd830b8d1f58641410d68406a Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Fri, 23 Jun 2023 10:09:38 +0200 Subject: [PATCH 75/93] Fix another line bug. --- kernel_tuner/utils/directives.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel_tuner/utils/directives.py b/kernel_tuner/utils/directives.py index 040cb8123..166775b38 100644 --- a/kernel_tuner/utils/directives.py +++ b/kernel_tuner/utils/directives.py @@ -132,7 +132,7 @@ def extract_directive_signature(code: str, kernel_name: str = None) -> dict: params.append(f"real (c_double) :: {p_name}") elif "int" in p_type: params.append(f"integer (c_int) :: {p_name}") - signatures[name] += "\n".join(params) + signatures[name] += "\n".join(params) + "\n" return signatures From c4b7f84b5904cc86c334e060d316ebd4dbb22082 Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Fri, 23 Jun 2023 10:11:39 +0200 Subject: [PATCH 76/93] Add C bindings. --- kernel_tuner/utils/directives.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel_tuner/utils/directives.py b/kernel_tuner/utils/directives.py index 166775b38..f9f417fe4 100644 --- a/kernel_tuner/utils/directives.py +++ b/kernel_tuner/utils/directives.py @@ -110,7 +110,7 @@ def extract_directive_signature(code: str, kernel_name: str = None) -> dict: if cpp: signatures[name] = f"float {name}({', '.join(params)})" elif f90: - signatures[name] = f"function {name}({', '.join(params)}) result(timing)\n" + signatures[name] = f"function {name}({', '.join(params)}) result(timing)\nuse iso_c_binding\n" params = list() for param in tmp_string: if len(param) == 0: From 15edb95755afe42ac8d1892cd13417d1a519dc0a Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Fri, 23 Jun 2023 10:12:32 +0200 Subject: [PATCH 77/93] Format. --- kernel_tuner/utils/directives.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/kernel_tuner/utils/directives.py b/kernel_tuner/utils/directives.py index f9f417fe4..c9a7008a3 100644 --- a/kernel_tuner/utils/directives.py +++ b/kernel_tuner/utils/directives.py @@ -110,7 +110,9 @@ def extract_directive_signature(code: str, kernel_name: str = None) -> dict: if cpp: signatures[name] = f"float {name}({', '.join(params)})" elif f90: - signatures[name] = f"function {name}({', '.join(params)}) result(timing)\nuse iso_c_binding\n" + signatures[ + name + ] = f"function {name}({', '.join(params)}) result(timing)\nuse iso_c_binding\n" params = list() for param in tmp_string: if len(param) == 0: @@ -121,11 +123,17 @@ def extract_directive_signature(code: str, kernel_name: str = None) -> dict: p_size = p_type.split(":")[1] p_type = p_type.split(":")[0] if "float*" in p_type: - params.append(f"real (c_float), dimension({p_size}) :: {p_name}") + params.append( + f"real (c_float), dimension({p_size}) :: {p_name}" + ) elif "double*" in p_type: - params.append(f"real (c_double), dimension({p_size}) :: {p_name}") + params.append( + f"real (c_double), dimension({p_size}) :: {p_name}" + ) elif "int*" in p_type: - params.append(f"integer (c_int), dimension({p_size}) :: {p_name}") + params.append( + f"integer (c_int), dimension({p_size}) :: {p_name}" + ) elif "float" in p_type: params.append(f"real (c_float) :: {p_name}") elif "double" in p_type: From 2e5f39ed23ce67a012cc0612b53d438e7fba065f Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Fri, 23 Jun 2023 10:49:02 +0200 Subject: [PATCH 78/93] Pack fortran code inside a module. --- kernel_tuner/utils/directives.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel_tuner/utils/directives.py b/kernel_tuner/utils/directives.py index c9a7008a3..c25ee9ad1 100644 --- a/kernel_tuner/utils/directives.py +++ b/kernel_tuner/utils/directives.py @@ -222,6 +222,7 @@ def generate_directive_function( if cpp: code += 'extern "C" ' + signature + "{\n" elif f90: + code += "\nmodule kt\ncontains\n" code += "\n" + signature if len(initialization) > 1: code += initialization + "\n" @@ -230,7 +231,7 @@ def generate_directive_function( code += "\n}" elif f90: name = signature.split(" ")[1].split("(")[0] - code += f"\nend function {name}\n" + code += f"\nend function {name}\nend module kt\n" return code From 2ee2d9f7b8ed38d0644a51fe00a196a5c1530178 Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Fri, 23 Jun 2023 10:58:40 +0200 Subject: [PATCH 79/93] Working on the examples. --- examples/fortran/vector_add_acc.py | 6 ++---- examples/fortran/vector_add_openacc.py | 2 +- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/examples/fortran/vector_add_acc.py b/examples/fortran/vector_add_acc.py index aa09334e8..ff84c8b6e 100755 --- a/examples/fortran/vector_add_acc.py +++ b/examples/fortran/vector_add_acc.py @@ -1,8 +1,6 @@ #!/usr/bin/env python """This is a minimal example for calling Fortran functions""" -import logging -import json import numpy as np from kernel_tuner import tune_kernel @@ -28,8 +26,8 @@ def tune(): args, tune_params, lang="C", - compiler="pgfortran", - compiler_options=["-acc=verystrict", "-ta=tesla,lineinfo"], + compiler="nvfortran", + compiler_options=["-fast", "-acc=gpu"], ) return result diff --git a/examples/fortran/vector_add_openacc.py b/examples/fortran/vector_add_openacc.py index eee345cc8..d550fde2b 100644 --- a/examples/fortran/vector_add_openacc.py +++ b/examples/fortran/vector_add_openacc.py @@ -23,7 +23,7 @@ !$tuner start vector_add A(float*:VECTOR_SIZE) B(float*:VECTOR_SIZE) C(float*:VECTOR_SIZE) n(int:VECTOR_SIZE) !$acc parallel loop num_gangs(ngangs) vector_length(vlength) - do i = 1, N + do i = 1, n C(i) = A(i) + B(i) end do !$acc end parallel loop From 361eab2cad5740e41e9c4a942f0c65f7335c6cc3 Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Fri, 23 Jun 2023 11:15:42 +0200 Subject: [PATCH 80/93] Argument order in the example. --- examples/fortran/vector_add_openacc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/fortran/vector_add_openacc.py b/examples/fortran/vector_add_openacc.py index d550fde2b..60058e9eb 100644 --- a/examples/fortran/vector_add_openacc.py +++ b/examples/fortran/vector_add_openacc.py @@ -15,7 +15,7 @@ code = """ #define VECTOR_SIZE 65536 -subroutine vector_add(C, A, B, n) +subroutine vector_add(A, B, C, n) use iso_c_binding real (c_float), intent(out), dimension(VECTOR_SIZE) :: C real (c_float), intent(in), dimension(VECTOR_SIZE) :: A, B From 9983012e19715524cdb47f47e871c404986a002a Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Fri, 23 Jun 2023 11:30:52 +0200 Subject: [PATCH 81/93] Missing timing variable. --- kernel_tuner/utils/directives.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel_tuner/utils/directives.py b/kernel_tuner/utils/directives.py index c25ee9ad1..24f90a3d5 100644 --- a/kernel_tuner/utils/directives.py +++ b/kernel_tuner/utils/directives.py @@ -202,7 +202,7 @@ def wrap_timing(code: str) -> str: timing = "std::chrono::duration elapsed_time = end - start;" ret = "return elapsed_time.count();" elif f90: - start = "integer (c_int) :: start\nreal (c_float) :: rate\ninteger (c_int) :: end\ncall system_clock(start, rate)" + start = "integer (c_int) :: start\nreal (c_float) :: rate\ninteger (c_int) :: end\nreal (c_float) :: timing\ncall system_clock(start, rate)" end = "call system_clock(end)" timing = "timing = (real(end - start) / real(rate)) * 1e3" ret = "" From c8ae1a568b56234e1a16bf335be4d9c23733b4ae Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Fri, 23 Jun 2023 13:38:01 +0200 Subject: [PATCH 82/93] Old instance of pgfortran. --- kernel_tuner/backends/compiler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel_tuner/backends/compiler.py b/kernel_tuner/backends/compiler.py index 188c94594..36726b718 100644 --- a/kernel_tuner/backends/compiler.py +++ b/kernel_tuner/backends/compiler.py @@ -223,7 +223,7 @@ def compile(self, kernel_instance): kernel_name = match.group(1) + "_" + kernel_name + "_" else: # for functions outside of modules - if self.compiler in ["gfortran", "ftn", "ifort", "pgfortran"]: + if self.compiler in ["gfortran", "ftn", "ifort", "nvfortran"]: kernel_name = kernel_name + "_" try: From 51f14d94f8f7f03aabcd388667304e7e0ed71c91 Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Fri, 23 Jun 2023 13:40:28 +0200 Subject: [PATCH 83/93] Have the bindings module available. --- kernel_tuner/utils/directives.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel_tuner/utils/directives.py b/kernel_tuner/utils/directives.py index 24f90a3d5..07d864c37 100644 --- a/kernel_tuner/utils/directives.py +++ b/kernel_tuner/utils/directives.py @@ -222,7 +222,7 @@ def generate_directive_function( if cpp: code += 'extern "C" ' + signature + "{\n" elif f90: - code += "\nmodule kt\ncontains\n" + code += "\nmodule kt\nuse iso_c_binding\ncontains\n" code += "\n" + signature if len(initialization) > 1: code += initialization + "\n" From 1ee01ce5a2172a3fa0ae4f9fe89aaa3e57beba73 Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Fri, 23 Jun 2023 13:44:01 +0200 Subject: [PATCH 84/93] Use the variable passed, not the constant. --- examples/fortran/vector_add_acc.F90 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/fortran/vector_add_acc.F90 b/examples/fortran/vector_add_acc.F90 index 4c2c63ba9..8e6fdb7f3 100644 --- a/examples/fortran/vector_add_acc.F90 +++ b/examples/fortran/vector_add_acc.F90 @@ -20,7 +20,7 @@ subroutine vector_add(C, A, B, n) !$acc data copyin(A, B) copyout(C) !$acc parallel loop device_type(nvidia) vector_length(block_size_x) - do i = 1, N + do i = 1, n C(i) = A(i) + B(i) end do !$acc end parallel loop From 035a377327defbb473b12497bdf7db300f2354dd Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Fri, 23 Jun 2023 15:12:07 +0200 Subject: [PATCH 85/93] Pass the scalars by value in fortran. --- kernel_tuner/utils/directives.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel_tuner/utils/directives.py b/kernel_tuner/utils/directives.py index 07d864c37..c3307ddf1 100644 --- a/kernel_tuner/utils/directives.py +++ b/kernel_tuner/utils/directives.py @@ -135,11 +135,11 @@ def extract_directive_signature(code: str, kernel_name: str = None) -> dict: f"integer (c_int), dimension({p_size}) :: {p_name}" ) elif "float" in p_type: - params.append(f"real (c_float) :: {p_name}") + params.append(f"real (c_float), value :: {p_name}") elif "double" in p_type: - params.append(f"real (c_double) :: {p_name}") + params.append(f"real (c_double), value :: {p_name}") elif "int" in p_type: - params.append(f"integer (c_int) :: {p_name}") + params.append(f"integer (c_int), value :: {p_name}") signatures[name] += "\n".join(params) + "\n" return signatures From b03cd4fa8d106bdcae994ee3a077cadf613b8f3b Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Fri, 23 Jun 2023 15:15:18 +0200 Subject: [PATCH 86/93] Pass n by value. --- examples/fortran/vector_add_acc.F90 | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/fortran/vector_add_acc.F90 b/examples/fortran/vector_add_acc.F90 index 8e6fdb7f3..9188e74cc 100644 --- a/examples/fortran/vector_add_acc.F90 +++ b/examples/fortran/vector_add_acc.F90 @@ -15,7 +15,7 @@ subroutine vector_add(C, A, B, n) use iso_c_binding real (c_float), intent(out), dimension(N) :: C real (c_float), intent(in), dimension(N) :: A, B - integer (c_int), intent(in) :: n + integer (c_int), value, intent(in) :: n !$acc data copyin(A, B) copyout(C) @@ -35,7 +35,7 @@ function time_vector_add(C, A, B, n) result(time) use iso_c_binding real (c_float), intent(out), dimension(N) :: C real (c_float), intent(in), dimension(N) :: A, B - integer (c_int), intent(in) :: n + integer (c_int), value, intent(in) :: n real (c_float) :: time real (c_double) start_time, end_time From 92a8bd03699904ab5d1c64de43e29b1bd32b6aa0 Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Thu, 6 Jul 2023 12:58:18 +0200 Subject: [PATCH 87/93] Init does not have a name, bugfix for that. --- kernel_tuner/utils/directives.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/kernel_tuner/utils/directives.py b/kernel_tuner/utils/directives.py index c3307ddf1..88bef4c01 100644 --- a/kernel_tuner/utils/directives.py +++ b/kernel_tuner/utils/directives.py @@ -32,10 +32,13 @@ def extract_code(start: str, stop: str, code: str, kernel_name: str = None) -> d if start in line: if kernel_name is None or correct_kernel(kernel_name, line): found_section = True - if cpp: - name = line.strip().split(" ")[3] - elif f90: - name = line.strip().split(" ")[2] + try: + if cpp: + name = line.strip().split(" ")[3] + elif f90: + name = line.strip().split(" ")[2] + except IndexError: + name = "init" return sections From 8bd0c4e906fac350a2f3d1d3a6592f7e1b3cfcf7 Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Mon, 10 Jul 2023 13:20:22 +0200 Subject: [PATCH 88/93] Fix a possible variable overloading in timing code. --- kernel_tuner/utils/directives.py | 12 ++++++------ test/test_utils_directives.py | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/kernel_tuner/utils/directives.py b/kernel_tuner/utils/directives.py index 88bef4c01..51f366064 100644 --- a/kernel_tuner/utils/directives.py +++ b/kernel_tuner/utils/directives.py @@ -200,14 +200,14 @@ def wrap_timing(code: str) -> str: cpp, f90 = cpp_or_f90(code) if cpp: - start = "auto start = std::chrono::steady_clock::now();" - end = "auto end = std::chrono::steady_clock::now();" - timing = "std::chrono::duration elapsed_time = end - start;" + start = "auto kt_timing_start = std::chrono::steady_clock::now();" + end = "auto kt_timing_end = std::chrono::steady_clock::now();" + timing = "std::chrono::duration elapsed_time = kt_timing_end - kt_timing_start;" ret = "return elapsed_time.count();" elif f90: - start = "integer (c_int) :: start\nreal (c_float) :: rate\ninteger (c_int) :: end\nreal (c_float) :: timing\ncall system_clock(start, rate)" - end = "call system_clock(end)" - timing = "timing = (real(end - start) / real(rate)) * 1e3" + start = "integer (c_int) :: kt_timing_start\nreal (c_float) :: kt_rate\ninteger (c_int) :: kt_timing_end\nreal (c_float) :: timing\ncall system_clock(kt_timing_start, kt_rate)" + end = "call system_clock(kt_timing_end)" + timing = "timing = (real(kt_timing_end - kt_timing_start) / real(kt_rate)) * 1e3" ret = "" return "\n".join([start, code, end, timing, ret]) diff --git a/test/test_utils_directives.py b/test/test_utils_directives.py index 1e8f79d93..df8f36e14 100644 --- a/test/test_utils_directives.py +++ b/test/test_utils_directives.py @@ -119,7 +119,7 @@ def test_wrap_timing(): wrapped = wrap_timing(code) assert ( wrapped - == "auto start = std::chrono::steady_clock::now();\n#pragma acc\nfor ( int i = 0; i < size; i++ ) {\nc[i] = a[i] + b[i];\n}\nauto end = std::chrono::steady_clock::now();\nstd::chrono::duration elapsed_time = end - start;\nreturn elapsed_time.count();" + == "auto kt_timing_start = std::chrono::steady_clock::now();\n#pragma acc\nfor ( int i = 0; i < size; i++ ) {\nc[i] = a[i] + b[i];\n}\nauto kt_timing_end = std::chrono::steady_clock::now();\nstd::chrono::duration elapsed_time = kt_timing_end - kt_timing_start;\nreturn elapsed_time.count();" ) From 02be63885f99a4a8afc56f629a95fdb4a44b150f Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Mon, 10 Jul 2023 14:09:34 +0200 Subject: [PATCH 89/93] Allow to have more sections of init code. --- kernel_tuner/utils/directives.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/kernel_tuner/utils/directives.py b/kernel_tuner/utils/directives.py index 51f366064..7f1f07038 100644 --- a/kernel_tuner/utils/directives.py +++ b/kernel_tuner/utils/directives.py @@ -68,10 +68,9 @@ def extract_initialization_code(code: str) -> str: start_string = "!$tuner initialize" end_string = "!$tuner stop" - function = extract_code(start_string, end_string, code) - if len(function) == 1: - _, value = function.popitem() - return value + init_code = extract_code(start_string, end_string, code) + if len(function) >= 1: + return "\n".join(init_code) else: return "" From d05eead24a5abf06c9a884b6951d3cb220f6074c Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Mon, 10 Jul 2023 14:13:43 +0200 Subject: [PATCH 90/93] Typo. --- kernel_tuner/utils/directives.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel_tuner/utils/directives.py b/kernel_tuner/utils/directives.py index 7f1f07038..8cb25271d 100644 --- a/kernel_tuner/utils/directives.py +++ b/kernel_tuner/utils/directives.py @@ -69,7 +69,7 @@ def extract_initialization_code(code: str) -> str: end_string = "!$tuner stop" init_code = extract_code(start_string, end_string, code) - if len(function) >= 1: + if len(init_code) >= 1: return "\n".join(init_code) else: return "" From a6e2b8719db66adb3210287ce0d722001cd75b79 Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Mon, 10 Jul 2023 14:41:07 +0200 Subject: [PATCH 91/93] Still working on having more init sections --- kernel_tuner/utils/directives.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/kernel_tuner/utils/directives.py b/kernel_tuner/utils/directives.py index 8cb25271d..901b4ba1a 100644 --- a/kernel_tuner/utils/directives.py +++ b/kernel_tuner/utils/directives.py @@ -17,6 +17,7 @@ def extract_code(start: str, stop: str, code: str, kernel_name: str = None) -> d sections = dict() tmp_string = list() name = "" + init_found = 0 cpp, f90 = cpp_or_f90(code) for line in code.replace("\\\n", "").split("\n"): @@ -38,7 +39,8 @@ def extract_code(start: str, stop: str, code: str, kernel_name: str = None) -> d elif f90: name = line.strip().split(" ")[2] except IndexError: - name = "init" + name = f"init_{init_found}" + init_found += 1 return sections @@ -70,7 +72,7 @@ def extract_initialization_code(code: str) -> str: init_code = extract_code(start_string, end_string, code) if len(init_code) >= 1: - return "\n".join(init_code) + return "\n".join(init_code.values()) else: return "" From 0eb052d3f89557d7826d5384a2bd2069097f3fc9 Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Tue, 11 Jul 2023 16:15:35 +0200 Subject: [PATCH 92/93] Wrong return type. --- kernel_tuner/utils/directives.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel_tuner/utils/directives.py b/kernel_tuner/utils/directives.py index 901b4ba1a..c60d68be7 100644 --- a/kernel_tuner/utils/directives.py +++ b/kernel_tuner/utils/directives.py @@ -7,7 +7,7 @@ def correct_kernel(kernel_name: str, line: str) -> bool: ) -def cpp_or_f90(code: str) -> list: +def cpp_or_f90(code: str) -> tuple: return "#pragma acc" in code, "!$acc" in code From 4f3dab70180e303c4585acf3e6eeff5fa52b9366 Mon Sep 17 00:00:00 2001 From: Alessio Sclocco Date: Wed, 4 Oct 2023 15:15:19 +0200 Subject: [PATCH 93/93] Fixed a merge bug. --- kernel_tuner/backends/compiler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel_tuner/backends/compiler.py b/kernel_tuner/backends/compiler.py index eb23cbb8d..a7e15c577 100644 --- a/kernel_tuner/backends/compiler.py +++ b/kernel_tuner/backends/compiler.py @@ -50,7 +50,7 @@ def __init__(self, iterations=7, compiler_options=None, compiler=None, observers :type iterations: int """ self.observers = observers or [] - self.observers.append(CRuntimeObserver(self)) + self.observers.append(CompilerRuntimeObserver(self)) self.iterations = iterations self.max_threads = 1024