From 9e69cf4b2dedad95e5f52ac5d7a8fbe1a08a8c80 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Fri, 17 Dec 2021 14:14:10 -0600 Subject: [PATCH 01/27] Test, fix reductions with no inames --- loopy/preprocess.py | 17 +++++++++++------ test/test_reduction.py | 21 +++++++++++++++++++++ 2 files changed, 32 insertions(+), 6 deletions(-) diff --git a/loopy/preprocess.py b/loopy/preprocess.py index fc0e82afb..55b735f4a 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -1751,6 +1751,8 @@ def map_scan_local(expr, rec, callables_table, nresults, arg_dtypes, def map_reduction(expr, rec, callables_table, guarding_predicates, nresults=1): + nonlocal insn_changed + # Only expand one level of reduction at a time, going from outermost to # innermost. Otherwise we get the (iname + insn) dependencies wrong. @@ -1827,6 +1829,10 @@ def _error_if_force_scan_on(cls, msg): ", ".join(str(kernel.iname_tags(iname)) for iname in bad_inames))) + # }}} + + insn_changed = True + if n_local_par == 0 and n_sequential == 0: from loopy.diagnostic import warn_with_kernel warn_with_kernel(kernel, "empty_reduction", @@ -1840,8 +1846,6 @@ def _error_if_force_scan_on(cls, msg): return expr.expr, callables_table - # }}} - if may_be_implemented_as_scan: assert force_scan or automagic_scans_ok @@ -1916,7 +1920,7 @@ def _error_if_force_scan_on(cls, msg): domains = kernel.domains[:] temp_kernel = kernel - changed = False + kernel_changed = False import loopy as lp while insn_queue: @@ -1925,6 +1929,7 @@ def _error_if_force_scan_on(cls, msg): new_insn_add_within_inames = set() generated_insns = [] + insn_changed = False insn = insn_queue.pop(0) @@ -1947,7 +1952,7 @@ def _error_if_force_scan_on(cls, msg): callables_table=cb_mapper.callables_table, guarding_predicates=insn.predicates), - if generated_insns: + if insn_changed: # An expansion happened, so insert the generated stuff plus # ourselves back into the queue. 
@@ -2010,14 +2015,14 @@ def _error_if_force_scan_on(cls, msg): domains=domains) temp_kernel = lp.replace_instruction_ids( temp_kernel, insn_id_replacements) - changed = True + kernel_changed = True else: # nothing happened, we're done with insn assert not new_insn_add_depends_on new_insns.append(insn) - if changed: + if kernel_changed: kernel = kernel.copy( instructions=new_insns, temporary_variables=new_temporary_variables, diff --git a/test/test_reduction.py b/test/test_reduction.py index c623c68c6..931628a04 100644 --- a/test/test_reduction.py +++ b/test/test_reduction.py @@ -460,6 +460,27 @@ def test_any_all(ctx_factory): assert not out_dict["out2"].get() +def test_reduction_without_inames(ctx_factory): + """Ensure that reductions with no inames get rewritten to the element + being reduced over. This was sometimes erroneously eliminated because + reduction realization used the generation of new statements as a criterion + for whether work was done. + """ + ctx = ctx_factory() + cq = cl.CommandQueue(ctx) + + knl = lp.make_kernel( + "{:}", + """ + out = reduce(any, [], 5) + """) + knl = lp.set_options(knl, return_dict=True) + + _, out_dict = knl(cq) + + assert out_dict["out"].get() == 5 + + if __name__ == "__main__": if len(sys.argv) > 1: exec(sys.argv[1]) From 4f3ad69fbdfa39e31532ae034c5ef76523e73e00 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sat, 13 Nov 2021 11:32:25 -0600 Subject: [PATCH 02/27] implements rename_inames --- loopy/__init__.py | 4 +- loopy/transform/iname.py | 109 +++++++++++++++++++++++++++------------ 2 files changed, 78 insertions(+), 35 deletions(-) diff --git a/loopy/__init__.py b/loopy/__init__.py index 9bd01534b..7e6ee5234 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -71,7 +71,7 @@ from loopy.transform.iname import ( set_loop_priority, prioritize_loops, untag_inames, split_iname, chunk_iname, join_inames, tag_inames, duplicate_inames, - rename_iname, remove_unused_inames, + rename_iname, rename_inames, 
remove_unused_inames, split_reduction_inward, split_reduction_outward, affine_map_inames, find_unused_axis_tag, make_reduction_inames_unique, @@ -198,7 +198,7 @@ "set_loop_priority", "prioritize_loops", "untag_inames", "split_iname", "chunk_iname", "join_inames", "tag_inames", "duplicate_inames", - "rename_iname", "remove_unused_inames", + "rename_iname", "rename_inames", "remove_unused_inames", "split_reduction_inward", "split_reduction_outward", "affine_map_inames", "find_unused_axis_tag", "make_reduction_inames_unique", diff --git a/loopy/transform/iname.py b/loopy/transform/iname.py index e55bad50c..3712d678b 100644 --- a/loopy/transform/iname.py +++ b/loopy/transform/iname.py @@ -58,6 +58,8 @@ .. autofunction:: rename_iname +.. autofunction:: rename_inames + .. autofunction:: remove_unused_inames .. autofunction:: split_reduction_inward @@ -1126,26 +1128,64 @@ def has_schedulable_iname_nesting(kernel): # {{{ rename_inames @for_each_kernel -def rename_iname(kernel, old_iname, new_iname, existing_ok=False, within=None): +def rename_inames(kernel, old_inames, new_iname, existing_ok=False, within=None): """ + :arg old_inames: A collection of inames that must be renamed to **new_iname**. :arg within: a stack match as understood by :func:`loopy.match.parse_stack_match`. 
:arg existing_ok: execute even if *new_iname* already exists """ + from collections.abc import Collection + if (isinstance(old_inames, str) + or not isinstance(old_inames, Collection)): + raise LoopyError("'old_inames' must be a collection of strings, " + f"got '{type(old_inames)}'.") + + if new_iname in old_inames: + raise LoopyError("new iname is part of inames being renamed") + + if new_iname in (kernel.all_variable_names() - kernel.all_inames()): + raise LoopyError(f"New iname '{new_iname}' is already a variable in the" + "kernel") + + if any((len(insn.within_inames & frozenset(old_inames)) > 1) + for insn in kernel.instructions): + raise LoopyError("old_inames contains nested inames" + " -- renaming is illegal.") + + # sort to have deterministic implementation. + old_inames = sorted(old_inames) var_name_gen = kernel.get_var_name_generator() # FIXME: Distinguish existing iname vs. existing other variable - does_exist = var_name_gen.is_name_conflicting(new_iname) + does_exist = new_iname in kernel.all_inames() - if old_iname not in kernel.all_inames(): - raise LoopyError("old iname '%s' does not exist" % old_iname) + if not (frozenset(old_inames) <= kernel.all_inames()): + raise LoopyError(f"old inames {frozenset(old_inames) - kernel.all_inames()}" + " do not exist.") if does_exist and not existing_ok: - raise LoopyError("iname '%s' conflicts with an existing identifier" - "--cannot rename" % new_iname) + raise LoopyError(f"iname '{new_iname}' conflicts with an existing identifier" + " --cannot rename") - if does_exist: + if not does_exist: + # {{{ rename old_inames[0] -> new_iname + # so that the code below can focus on "merging" inames that already exist + + kernel = duplicate_inames( + kernel, old_inames[0], within=within, new_inames=[new_iname]) + kernel = remove_unused_inames(kernel, old_inames[0]) + + # old_iname[0] is already renamed to new_iname => do not rename again. 
+ old_inames = old_inames[1:] + + # }}} + + del does_exist + assert new_iname in kernel.all_inames() + + for old_iname in old_inames: # {{{ check that the domains match up dom = kernel.get_inames_domain(frozenset((old_iname, new_iname))) @@ -1177,42 +1217,45 @@ def rename_iname(kernel, old_iname, new_iname, existing_ok=False, within=None): # }}} - from pymbolic import var - subst_dict = {old_iname: var(new_iname)} - - from loopy.match import parse_stack_match - within = parse_stack_match(within) + from pymbolic import var + subst_dict = {old_iname: var(new_iname) for old_iname in old_inames} - from pymbolic.mapper.substitutor import make_subst_func - rule_mapping_context = SubstitutionRuleMappingContext( - kernel.substitutions, var_name_gen) - smap = RuleAwareSubstitutionMapper(rule_mapping_context, - make_subst_func(subst_dict), within) + from loopy.match import parse_stack_match + within = parse_stack_match(within) - kernel = rule_mapping_context.finish_kernel( - smap.map_kernel(kernel)) + from pymbolic.mapper.substitutor import make_subst_func + rule_mapping_context = SubstitutionRuleMappingContext( + kernel.substitutions, var_name_gen) + smap = RuleAwareSubstitutionMapper(rule_mapping_context, + make_subst_func(subst_dict), within) - new_instructions = [] - for insn in kernel.instructions: - if (old_iname in insn.within_inames - and within(kernel, insn, ())): - insn = insn.copy( - within_inames=( - (insn.within_inames - frozenset([old_iname])) - | frozenset([new_iname]))) + from loopy.kernel.instruction import MultiAssignmentBase - new_instructions.append(insn) + def does_insn_involve_iname(kernel, insn, *args): + return (not isinstance(insn, MultiAssignmentBase) + or frozenset(old_inames) & insn.dependency_names() + or frozenset(old_inames) & insn.reduction_inames()) - kernel = kernel.copy(instructions=new_instructions) + kernel = rule_mapping_context.finish_kernel( + smap.map_kernel(kernel, within=does_insn_involve_iname)) - else: - kernel = duplicate_inames( 
- kernel, [old_iname], within=within, new_inames=[new_iname]) + new_instructions = [insn.copy(within_inames=((insn.within_inames + - frozenset(old_inames)) + | frozenset([new_iname]))) + if ((len(frozenset(old_inames) & insn.within_inames) != 0) + and within(kernel, insn, ())) + else insn + for insn in kernel.instructions] - kernel = remove_unused_inames(kernel, [old_iname]) + kernel = kernel.copy(instructions=new_instructions) + kernel = remove_unused_inames(kernel, old_inames) return kernel + +def rename_iname(kernel, old_iname, new_iname, existing_ok=False, within=None): + return rename_inames(kernel, [old_iname], new_iname, existing_ok, within) + # }}} From 14ae87913f60e0eed25e3e9e995e69054c7d33be Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sat, 13 Nov 2021 11:32:35 -0600 Subject: [PATCH 03/27] tests rename inames --- test/test_transform.py | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/test/test_transform.py b/test/test_transform.py index 3915ce161..e42eeb498 100644 --- a/test/test_transform.py +++ b/test/test_transform.py @@ -1337,6 +1337,35 @@ def test_rename_inames_redn(): assert "ifused" in t_unit.default_entrypoint.all_inames() +def test_rename_inames(ctx_factory): + ctx = ctx_factory() + + knl = lp.make_kernel( + "{[i1, i2]: 0<=i1, i2<10}", + """ + y1[i1] = 2 + y2[i2] = 3 + """) + ref_knl = knl + knl = lp.rename_inames(knl, ["i1", "i2"], "ifused") + lp.auto_test_vs_ref(knl, ctx, ref_knl) + + +def test_rename_inames_existing_ok(ctx_factory): + ctx = ctx_factory() + + knl = lp.make_kernel( + "{[i1, i2, i3]: 0<=i1, i2, i3<10}", + """ + y1[i1] = 2 + y2[i2] = 3 + y3[i3] = 4 + """) + ref_knl = knl + knl = lp.rename_inames(knl, ["i1", "i2"], "i3", existing_ok=True) + lp.auto_test_vs_ref(knl, ctx, ref_knl) + + if __name__ == "__main__": if len(sys.argv) > 1: exec(sys.argv[1]) From 468cc9576b806d58f27d8c88cb329a0188237928 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 17 Nov 2021 16:59:11 -0600 Subject: 
[PATCH 04/27] adds a routine to memoize the transformation results to disk --- loopy/__init__.py | 4 +-- loopy/tools.py | 62 ++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 63 insertions(+), 3 deletions(-) diff --git a/loopy/__init__.py b/loopy/__init__.py index 7e6ee5234..1a6c1599e 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -154,7 +154,7 @@ from loopy.target.ispc import ISPCTarget from loopy.target.numba import NumbaTarget, NumbaCudaTarget -from loopy.tools import Optional, t_unit_to_python +from loopy.tools import Optional, t_unit_to_python, memoize_on_disk __all__ = [ @@ -299,7 +299,7 @@ "NumbaTarget", "NumbaCudaTarget", "ASTBuilderBase", - "Optional", + "Optional", "memoize_on_disk", # {{{ from this file diff --git a/loopy/tools.py b/loopy/tools.py index facfe6ee6..9216cbc19 100644 --- a/loopy/tools.py +++ b/loopy/tools.py @@ -23,7 +23,7 @@ import collections.abc as abc import numpy as np -from pytools import memoize_method +from pytools import memoize_method, ProcessLogger from pytools.persistent_dict import KeyBuilder as KeyBuilderBase from loopy.symbolic import (WalkMapper as LoopyWalkMapper, RuleAwareIdentityMapper) @@ -31,6 +31,9 @@ PersistentHashWalkMapper as PersistentHashWalkMapperBase) from sys import intern +import logging +logger = logging.getLogger(__name__) + def is_integer(obj): return isinstance(obj, (int, np.integer)) @@ -862,4 +865,61 @@ def t_unit_to_python(t_unit, var_name="t_unit", else: return python_code + +def memoize_on_disk(func, key_builder_t=LoopyKeyBuilder): + from loopy.version import DATA_MODEL_VERSION + from functools import wraps + from pytools.persistent_dict import WriteOncePersistentDict + from loopy.translation_unit import TranslationUnit + from loopy.kernel import LoopKernel + import pymbolic.primitives as prim + + @wraps(func) + def wrapper(*args, **kwargs): + from loopy import CACHING_ENABLED + + if (not CACHING_ENABLED + or kwargs.pop("_no_memoize_on_disk", False)): + return func(*args, 
**kwargs) + + transform_cache = WriteOncePersistentDict( + ("loopy-memoize-cache-" + f"{key_builder_t.__qualname__}-{key_builder_t.__name__}" + f"-v0-{DATA_MODEL_VERSION}"), + key_builder=key_builder_t()) + + def _get_persistent_hashable_arg(arg): + if isinstance(arg, prim.Expression): + return PymbolicExpressionHashWrapper(arg) + else: + return arg + + cache_key = (tuple(_get_persistent_hashable_arg(arg) + for arg in args), + {kw: _get_persistent_hashable_arg(arg) + for kw, arg in kwargs.items()}) + + try: + result = transform_cache[cache_key] + logger.debug(f"Function {func.__name__} returned from" + " memoized result on disk.") + return result + except KeyError: + logger.debug(f"Function {func.__name__} not present" + " on disk.") + if args and isinstance(args[0], LoopKernel): + proc_log_str = f"{func.__name__} on '{args[0].name}'" + elif args and isinstance(args[0], TranslationUnit): + proc_log_str = f"{func.__name__} on '{args[0].entrypoints}'" + else: + proc_log_str = f"{func.__name__}" + + with ProcessLogger(logger, proc_log_str): + result = func(*args, **kwargs) + + transform_cache.store_if_not_present(cache_key, result) + return result + + return wrapper + # vim: fdm=marker From 18371df332a89880d4501f7adc059a6e4036785d Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 17 Nov 2021 17:03:48 -0600 Subject: [PATCH 05/27] test memoize_on_disk --- test/test_misc.py | 53 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) diff --git a/test/test_misc.py b/test/test_misc.py index 58ba732ac..0e8a528ec 100644 --- a/test/test_misc.py +++ b/test/test_misc.py @@ -21,6 +21,7 @@ """ import pytest +import loopy as lp import sys @@ -279,6 +280,58 @@ def test_Optional(): # noqa # }}} +@lp.memoize_on_disk +def very_costly_transform(knl, iname): + from time import sleep + sleep(5) + return lp.split_iname(knl, iname, 4) + + +def test_memoize_on_disk(): + if not lp.CACHING_ENABLED: + # if caching is disabled => don't test the caching 
behavior + pytest.skip("cannot test memoization if caching disabled") + + knl = lp.make_kernel("{[i]: 0<=i<10}", + """ + y[i] = i + """) + + from time import time + uncached_knl = very_costly_transform(knl, "i") + + start = time() + cached_knl = very_costly_transform(knl, "i") + end = time() + assert (end - start) < 4 + assert cached_knl == uncached_knl + + +@lp.memoize_on_disk +def get_twice_of_pym_expr(expr): + from time import sleep + sleep(2) + return 2 * expr + + +def test_memoize_on_disk_with_pym_expr(): + if not lp.CACHING_ENABLED: + # if caching is disabled => don't test the caching behavior + pytest.skip("cannot test memoization if caching disabled") + + from pymbolic import parse + expr = parse("a[i] + b[i]") + + from time import time + uncached_result = get_twice_of_pym_expr(expr) + + start = time() + cached_result = get_twice_of_pym_expr(expr) + end = time() + assert (end - start) < 1.5 + assert cached_result == uncached_result + + if __name__ == "__main__": if len(sys.argv) > 1: exec(sys.argv[1]) From 55143b21711a534c07bbb14aaa63ff3879a93433 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 8 Dec 2021 12:42:59 +0530 Subject: [PATCH 06/27] use memoize_on_disk rather than hand rolling self memoization implementation --- loopy/preprocess.py | 30 ++---------------------------- loopy/transform/buffer.py | 34 ++-------------------------------- test/test_reduction.py | 3 +-- 3 files changed, 5 insertions(+), 62 deletions(-) diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 55b735f4a..d30e68d80 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -28,10 +28,7 @@ LoopyAdvisory) import islpy as isl -from pytools.persistent_dict import WriteOncePersistentDict - -from loopy.tools import LoopyKeyBuilder -from loopy.version import DATA_MODEL_VERSION +from loopy.tools import memoize_on_disk from loopy.kernel.data import make_assignment, filter_iname_tags_by_type from loopy.kernel.tools import kernel_has_global_barriers # for the benefit 
of loopy.statistics, for now @@ -2360,11 +2357,6 @@ def filter_reachable_callables(t_unit): return t_unit.copy(callables_table=new_callables) -preprocess_cache = WriteOncePersistentDict( - "loopy-preprocess-cache-v2-"+DATA_MODEL_VERSION, - key_builder=LoopyKeyBuilder()) - - def _preprocess_single_kernel(kernel, callables_table, device=None): from loopy.kernel import KernelState @@ -2413,24 +2405,9 @@ def _preprocess_single_kernel(kernel, callables_table, device=None): return kernel +@memoize_on_disk def preprocess_program(program, device=None): - # {{{ cache retrieval - - from loopy import CACHING_ENABLED - if CACHING_ENABLED: - input_program = program - - try: - result = preprocess_cache[program] - logger.debug(f"program with entrypoints: {program.entrypoints}" - " preprocess cache hit") - return result - except KeyError: - pass - - # }}} - from loopy.kernel import KernelState if program.state >= KernelState.PREPROCESSED: return program @@ -2519,9 +2496,6 @@ def preprocess_program(program, device=None): # }}} - if CACHING_ENABLED: - preprocess_cache.store_if_not_present(input_program, program) - return program diff --git a/loopy/transform/buffer.py b/loopy/transform/buffer.py index b23ccf526..a6e25457d 100644 --- a/loopy/transform/buffer.py +++ b/loopy/transform/buffer.py @@ -26,9 +26,7 @@ RuleAwareIdentityMapper, SubstitutionRuleMappingContext, SubstitutionMapper) from pymbolic.mapper.substitutor import make_subst_func -from pytools.persistent_dict import WriteOncePersistentDict -from loopy.tools import LoopyKeyBuilder, PymbolicExpressionHashWrapper -from loopy.version import DATA_MODEL_VERSION +from loopy.tools import memoize_on_disk from loopy.diagnostic import LoopyError from loopy.kernel import LoopKernel from loopy.translation_unit import TranslationUnit @@ -124,12 +122,6 @@ def map_array_access(self, index, expn_state): # }}} -buffer_array_cache = WriteOncePersistentDict( - "loopy-buffer-array-cache-"+DATA_MODEL_VERSION, - key_builder=LoopyKeyBuilder()) - - 
-# Adding an argument? also add something to the cache_key below. def buffer_array_for_single_kernel(kernel, callables_table, var_name, buffer_inames, init_expression=None, store_expression=None, within=None, default_tag="l.auto", temporary_scope=None, @@ -248,26 +240,6 @@ def buffer_array_for_single_kernel(kernel, callables_table, var_name, # }}} - # {{{ caching - - from loopy import CACHING_ENABLED - - cache_key = (kernel, var_name, - tuple(buffer_inames), - PymbolicExpressionHashWrapper(init_expression), - PymbolicExpressionHashWrapper(store_expression), within, - default_tag, temporary_scope, fetch_bounding_box) - - if CACHING_ENABLED: - try: - result = buffer_array_cache[cache_key] - logger.info("%s: buffer_array cache hit" % kernel.name) - return result - except KeyError: - pass - - # }}} - var_name_gen = kernel.get_var_name_generator() within_inames = set() @@ -543,12 +515,10 @@ def none_to_empty_set(s): from loopy.kernel.tools import assign_automatic_axes kernel = assign_automatic_axes(kernel, callables_table) - if CACHING_ENABLED: - buffer_array_cache.store_if_not_present(cache_key, kernel) - return kernel +@memoize_on_disk def buffer_array(program, *args, **kwargs): assert isinstance(program, TranslationUnit) diff --git a/test/test_reduction.py b/test/test_reduction.py index 931628a04..1aa3b52b6 100644 --- a/test/test_reduction.py +++ b/test/test_reduction.py @@ -51,7 +51,6 @@ def test_nonsense_reduction(ctx_factory): - ctx = ctx_factory() knl = lp.make_kernel( "{[i]: 0<=i<100}", @@ -63,7 +62,7 @@ def test_nonsense_reduction(ctx_factory): import pytest with pytest.raises(RuntimeError): - knl = lp.preprocess_kernel(knl, ctx.devices[0]) + knl = lp.preprocess_kernel(knl) def test_empty_reduction(ctx_factory): From 712ede3cf000699c6902d39dd533aa8cc2899459 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Tue, 11 Jan 2022 12:55:41 -0600 Subject: [PATCH 07/27] Fix test for numpy builtin type to avoid numpy#4317 --- loopy/target/pyopencl_execution.py | 8 
++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/loopy/target/pyopencl_execution.py b/loopy/target/pyopencl_execution.py index 87e13faa2..255858c19 100644 --- a/loopy/target/pyopencl_execution.py +++ b/loopy/target/pyopencl_execution.py @@ -21,6 +21,7 @@ """ +import numpy as np from pytools import memoize_method from pytools.py_codegen import Indentation from loopy.target.execution import ( @@ -51,9 +52,12 @@ def __init__(self): def python_dtype_str_inner(self, dtype): import pyopencl.tools as cl_tools - if dtype.isbuiltin: + # Test for types built into numpy. dtype.isbuiltin does not work: + # https://github.com/numpy/numpy/issues/4317 + # Guided by https://numpy.org/doc/stable/reference/arrays.scalars.html + if issubclass(dtype.type, (np.bool_, np.number)): name = dtype.name - if dtype.name == "bool": + if dtype.type == np.bool_: name = "bool8" return f"_lpy_np.dtype(_lpy_np.{name})" else: From d6b8fb9ea28302c433046d569219492001956ec6 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 12 Jan 2022 22:59:17 -0600 Subject: [PATCH 08/27] Implements `LazilyUnpicklingList.__(add|mul)__` --- loopy/tools.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/loopy/tools.py b/loopy/tools.py index 9216cbc19..d12ff750c 100644 --- a/loopy/tools.py +++ b/loopy/tools.py @@ -501,6 +501,12 @@ def insert(self, key, value): def __getstate__(self): return {"_list": [_PickledObject(val) for val in self._list]} + def __add__(self, other): + return self._list + other + + def __mul__(self, other): + return self._list * other + class LazilyUnpicklingListWithEqAndPersistentHashing(LazilyUnpicklingList): """A list which lazily unpickles its values, and supports equality comparison From 23d19bd684146ac35521b3ad49b89c19623e37d3 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Sun, 23 Jan 2022 15:27:06 -0600 Subject: [PATCH 09/27] Fix ISPC dev download link --- .github/workflows/ci.yml | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff 
--git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index b4f8ee7da..744c2c162 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -100,8 +100,11 @@ jobs: . ./ci-support-v0 build_py_project_in_conda_env - curl -L "https://ci.appveyor.com/api/projects/ispc/ispc/artifacts/build%2Fispc-trunk-linux.tar.gz?job=Environment%3A%20APPVEYOR_BUILD_WORKER_IMAGE%3DUbuntu1604%2C%20LLVM_VERSION%3Dlatest" | tar xfz - - export PATH="$(pwd)/ispc-trunk-linux/bin:$PATH" + # https://github.com/ispc/ispc/issues/2240 + # curl -L "https://ci.appveyor.com/api/projects/ispc/ispc/artifacts/build%2Fispc-trunk-linux.tar.gz?job=Environment%3A%20APPVEYOR_BUILD_WORKER_IMAGE%3DUbuntu1604%2C%20LLVM_VERSION%3Dlatest" | tar xfz - + # export PATH="$(pwd)/ispc-trunk-linux/bin:$PATH" + curl -L https://github.com/ispc/ispc/releases/download/v1.17.0/ispc-v1.17.0-linux.tar.gz | tar xfz - + export PATH="$(pwd)/ispc-v1.17.0-linux/bin:$PATH" export PYOPENCL_TEST=portable:pthread @@ -198,7 +201,7 @@ jobs: run: | # helps with tmate debugging sudo chmod a+rwX -R $(whoami) /__w/_temp || true - + # - uses: mxschmitt/action-tmate@v3 # vim: sw=4 From f810d61ca41c5fd772f5ed8c13fe8abdf08fe570 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Sun, 23 Jan 2022 17:46:35 -0600 Subject: [PATCH 10/27] Use (new) install_ispc command from ci-support --- .github/workflows/ci.yml | 6 +----- .gitlab-ci.yml | 8 ++++---- 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 744c2c162..efc8bdde2 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -100,11 +100,7 @@ jobs: . 
./ci-support-v0 build_py_project_in_conda_env - # https://github.com/ispc/ispc/issues/2240 - # curl -L "https://ci.appveyor.com/api/projects/ispc/ispc/artifacts/build%2Fispc-trunk-linux.tar.gz?job=Environment%3A%20APPVEYOR_BUILD_WORKER_IMAGE%3DUbuntu1604%2C%20LLVM_VERSION%3Dlatest" | tar xfz - - # export PATH="$(pwd)/ispc-trunk-linux/bin:$PATH" - curl -L https://github.com/ispc/ispc/releases/download/v1.17.0/ispc-v1.17.0-linux.tar.gz | tar xfz - - export PATH="$(pwd)/ispc-v1.17.0-linux/bin:$PATH" + install_ispc export PYOPENCL_TEST=portable:pthread diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 3945734eb..32d1b886b 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -114,12 +114,12 @@ Pytest POCL Examples: export PYOPENCL_TEST=portable:pthread export EXTRA_INSTALL="pybind11 numpy mako pyvisfile matplotlib ipykernel nbconvert" - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/main/ci-support.sh - . ./ci-support.sh + curl -L -O -k https://tiker.net/ci-support-v0 + . ./ci-support-v0 + build_py_project_in_venv - curl -L "https://ci.appveyor.com/api/projects/ispc/ispc/artifacts/build%2Fispc-trunk-linux.tar.gz?job=Environment%3A%20APPVEYOR_BUILD_WORKER_IMAGE%3DUbuntu1604%2C%20LLVM_VERSION%3Dlatest" | tar xfz - - export PATH="$(pwd)/ispc-trunk-linux/bin:$PATH" + install_ispc . 
./build-py-project-and-run-examples.sh run_py_examples From 6b9d8fa8e0ef62ac641c09767195d88f3ff59b50 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Sun, 23 Jan 2022 17:47:39 -0600 Subject: [PATCH 11/27] Drop curl -k flags in gitlab CI config --- .gitlab-ci.yml | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 32d1b886b..721f90b58 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -8,7 +8,7 @@ Pytest POCL: - export PYOPENCL_TEST=portable:pthread - export EXTRA_INSTALL="pybind11 numpy mako" - export LOOPY_NO_CACHE=1 - - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/main/build-and-test-py-project.sh + - curl -L -O https://gitlab.tiker.net/inducer/ci-support/raw/main/build-and-test-py-project.sh - ". ./build-and-test-py-project.sh" tags: - python3 @@ -26,7 +26,7 @@ Pytest Nvidia Titan V: - export EXTRA_INSTALL="pybind11 numpy mako" - export LOOPY_NO_CACHE=1 - source /opt/enable-intel-cl.sh - - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/main/build-and-test-py-project.sh + - curl -L -O https://gitlab.tiker.net/inducer/ci-support/raw/main/build-and-test-py-project.sh - ". ./build-and-test-py-project.sh" tags: - python3 @@ -44,7 +44,7 @@ Pytest POCL without arg check: - export EXTRA_INSTALL="pybind11 numpy mako" - export LOOPY_NO_CACHE=1 - export _LOOPY_SKIP_ARG_CHECKS=1 - - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/main/build-and-test-py-project.sh + - curl -L -O https://gitlab.tiker.net/inducer/ci-support/raw/main/build-and-test-py-project.sh - ". ./build-and-test-py-project.sh" tags: - python3 @@ -63,7 +63,7 @@ Pytest Intel: - export LOOPY_NO_CACHE=1 - export LOOPY_INTEL_CL_OK_FOR_TEST_REF=1 - source /opt/enable-intel-cl.sh - - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/main/build-and-test-py-project.sh + - curl -L -O https://gitlab.tiker.net/inducer/ci-support/raw/main/build-and-test-py-project.sh - ". 
./build-and-test-py-project.sh" tags: - python3 @@ -80,7 +80,7 @@ Pytest POCL Twice With Cache: script: | export PYOPENCL_TEST=portable:pthread export EXTRA_INSTALL="pybind11 numpy mako" - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/main/ci-support.sh + curl -L -O https://gitlab.tiker.net/inducer/ci-support/raw/main/ci-support.sh . ./ci-support.sh build_py_project_in_venv ( test_py_project ) @@ -100,7 +100,7 @@ Pytest POCL Twice With Cache: # - export PY_EXE=pypy # - export PYOPENCL_TEST=portable:pthread # - export EXTRA_INSTALL="pybind11 numpy mako" -# - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/main/build-and-test-py-project.sh +# - curl -L -O https://gitlab.tiker.net/inducer/ci-support/raw/main/build-and-test-py-project.sh # - ". ./build-and-test-py-project.sh" # tags: # - pypy @@ -114,7 +114,7 @@ Pytest POCL Examples: export PYOPENCL_TEST=portable:pthread export EXTRA_INSTALL="pybind11 numpy mako pyvisfile matplotlib ipykernel nbconvert" - curl -L -O -k https://tiker.net/ci-support-v0 + curl -L -O https://tiker.net/ci-support-v0 . ./ci-support-v0 build_py_project_in_venv @@ -140,7 +140,7 @@ Pylint: # Needed to avoid name shadowing issues when running from source directory. - PROJECT_INSTALL_FLAGS="--editable" - EXTRA_INSTALL="pybind11 numpy mako matplotlib ipykernel ply fparser" - - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/main/prepare-and-run-pylint.sh + - curl -L -O https://gitlab.tiker.net/inducer/ci-support/raw/main/prepare-and-run-pylint.sh - . ./prepare-and-run-pylint.sh "$CI_PROJECT_NAME" test/test_*.py tags: - python3 @@ -152,7 +152,7 @@ Documentation: script: - PROJECT=loopy - EXTRA_INSTALL="pybind11 numpy" - - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/main/build-docs.sh + - curl -L -O https://gitlab.tiker.net/inducer/ci-support/raw/main/build-docs.sh - ". 
./build-docs.sh" tags: - python3 @@ -160,7 +160,7 @@ Documentation: Flake8: stage: test script: - - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/main/prepare-and-run-flake8.sh + - curl -L -O https://gitlab.tiker.net/inducer/ci-support/raw/main/prepare-and-run-flake8.sh - . ./prepare-and-run-flake8.sh "$CI_PROJECT_NAME" test examples tags: - python3 @@ -175,7 +175,7 @@ Benchmarks: - PYOPENCL_TEST=portable:pthread - export LOOPY_NO_CACHE=1 - export ASV_FACTOR=1.5 - - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/main/build-and-benchmark-py-project.sh + - curl -L -O https://gitlab.tiker.net/inducer/ci-support/raw/main/build-and-benchmark-py-project.sh - ". ./build-and-benchmark-py-project.sh" tags: - linux From 7f44a241ad11ab2afcd1dbd14de5cd0191923825 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Thu, 3 Feb 2022 19:00:27 -0600 Subject: [PATCH 12/27] Drop -k flags on curl in Github CI --- .github/workflows/ci.yml | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index efc8bdde2..a2b69d81e 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -20,7 +20,7 @@ jobs: python-version: '3.6' - name: "Main Script" run: | - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/main/prepare-and-run-flake8.sh + curl -L -O https://gitlab.tiker.net/inducer/ci-support/raw/main/prepare-and-run-flake8.sh . ./prepare-and-run-flake8.sh "$(basename $GITHUB_REPOSITORY)" ./test examples pylint: @@ -32,7 +32,7 @@ jobs: run: | sed 's/python=3/python=3.7/' .test-conda-env-py3.yml > .test-conda-env.yml USE_CONDA_BUILD=1 - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/main/prepare-and-run-pylint.sh + curl -L -O https://gitlab.tiker.net/inducer/ci-support/raw/main/prepare-and-run-pylint.sh . 
./prepare-and-run-pylint.sh "$(basename $GITHUB_REPOSITORY)" test/test_*.py pytest: @@ -42,7 +42,7 @@ jobs: - uses: actions/checkout@v2 - name: "Main Script" run: | - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/main/build-and-test-py-project-within-miniconda.sh + curl -L -O https://gitlab.tiker.net/inducer/ci-support/raw/main/build-and-test-py-project-within-miniconda.sh . ./build-and-test-py-project-within-miniconda.sh pytest_intel: @@ -71,7 +71,7 @@ jobs: - uses: actions/checkout@v2 - name: "Main Script" run: | - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/main/build-and-test-py-project-within-miniconda.sh + curl -L -O https://gitlab.tiker.net/inducer/ci-support/raw/main/build-and-test-py-project-within-miniconda.sh export _LOOPY_SKIP_ARG_CHECKS=1 . ./build-and-test-py-project-within-miniconda.sh @@ -82,7 +82,7 @@ jobs: - uses: actions/checkout@v2 - name: "Main Script" run: | - curl -L -O -k https://tiker.net/ci-support-v0 + curl -L -O https://tiker.net/ci-support-v0 . ./ci-support-v0 build_py_project_in_conda_env ( test_py_project ) @@ -96,7 +96,7 @@ jobs: - name: "Main Script" run: | EXTRA_INSTALL="matplotlib ipykernel nbconvert" - curl -L -O -k https://tiker.net/ci-support-v0 + curl -L -O https://tiker.net/ci-support-v0 . ./ci-support-v0 build_py_project_in_conda_env @@ -121,7 +121,7 @@ jobs: - name: "Main Script" run: | PROJECT=loopy - curl -L -O -k https://tiker.net/ci-support-v0 + curl -L -O https://tiker.net/ci-support-v0 . ci-support-v0 build_py_project_in_conda_env build_docs From 375977a4ab9398093df637c6a3f23925f3c3363e Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Thu, 3 Feb 2022 19:01:00 -0600 Subject: [PATCH 13/27] Drop unnecessary .. 
currentmodule in loopy.kernel.data --- loopy/kernel/data.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py index f04f3cbc5..b8194e107 100644 --- a/loopy/kernel/data.py +++ b/loopy/kernel/data.py @@ -48,8 +48,6 @@ from warnings import warn __doc__ = """ -.. currentmodule:: loopy.kernel.data - .. autofunction:: filter_iname_tags_by_type .. autoclass:: InameImplementationTag From 18f78846354a93f7d3b98ade7f1fecf816402f42 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Thu, 3 Feb 2022 19:45:51 -0600 Subject: [PATCH 14/27] Introduce replace_instruction_ids_in_insn --- loopy/transform/instruction.py | 62 ++++++++++++++++++---------------- 1 file changed, 32 insertions(+), 30 deletions(-) diff --git a/loopy/transform/instruction.py b/loopy/transform/instruction.py index 9a7936cd3..bdf74fc56 100644 --- a/loopy/transform/instruction.py +++ b/loopy/transform/instruction.py @@ -250,40 +250,42 @@ def remove_instructions(kernel, insn_ids): # {{{ replace_instruction_ids -def replace_instruction_ids(kernel, replacements): - if not replacements: - return kernel +def replace_instruction_ids_in_insn(insn, replacements): + changed = False + new_depends_on = list(insn.depends_on) + extra_depends_on = [] + new_no_sync_with = [] + + for idep, dep in enumerate(insn.depends_on): + if dep in replacements: + new_deps = list(replacements[dep]) + new_depends_on[idep] = new_deps[0] + extra_depends_on.extend(new_deps[1:]) + changed = True - new_insns = [] + for insn_id, scope in insn.no_sync_with: + if insn_id in replacements: + new_no_sync_with.extend( + (repl, scope) for repl in replacements[insn_id]) + changed = True + else: + new_no_sync_with.append((insn_id, scope)) - for insn in kernel.instructions: - changed = False - new_depends_on = list(insn.depends_on) - extra_depends_on = [] - new_no_sync_with = [] - - for idep, dep in enumerate(insn.depends_on): - if dep in replacements: - new_deps = list(replacements[dep]) - 
new_depends_on[idep] = new_deps[0] - extra_depends_on.extend(new_deps[1:]) - changed = True - - for insn_id, scope in insn.no_sync_with: - if insn_id in replacements: - new_no_sync_with.extend( - (repl, scope) for repl in replacements[insn_id]) - changed = True - else: - new_no_sync_with.append((insn_id, scope)) + if changed: + return insn.copy( + depends_on=frozenset(new_depends_on + extra_depends_on), + no_sync_with=frozenset(new_no_sync_with)) + else: + return insn - new_insns.append( - insn.copy( - depends_on=frozenset(new_depends_on + extra_depends_on), - no_sync_with=frozenset(new_no_sync_with)) - if changed else insn) - return kernel.copy(instructions=new_insns) +def replace_instruction_ids(kernel, replacements): + if not replacements: + return kernel + + return kernel.copy(instructions=[ + replace_instruction_ids_in_insn(insn, replacements) + for insn in kernel.instructions]) # }}} From 0986f8e771ee3ded5d4d93fb7011fa6258735867 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Thu, 3 Feb 2022 18:56:01 -0600 Subject: [PATCH 15/27] Refactor realize_reduction, move to separate file --- loopy/__init__.py | 4 +- loopy/preprocess.py | 1815 +---------------------- loopy/transform/realize_reduction.py | 2053 ++++++++++++++++++++++++++ 3 files changed, 2059 insertions(+), 1813 deletions(-) create mode 100644 loopy/transform/realize_reduction.py diff --git a/loopy/__init__.py b/loopy/__init__.py index 1a6c1599e..0e6e9f87c 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -122,10 +122,12 @@ merge, inline_callable_kernel, rename_callable) from loopy.transform.pack_and_unpack_args import pack_and_unpack_args_for_call +from loopy.transform.realize_reduction import realize_reduction + # }}} from loopy.type_inference import infer_unknown_types -from loopy.preprocess import (preprocess_kernel, realize_reduction, +from loopy.preprocess import (preprocess_kernel, preprocess_program, infer_arg_descr) from loopy.schedule import ( generate_loop_schedules, 
get_one_scheduled_kernel, diff --git a/loopy/preprocess.py b/loopy/preprocess.py index d30e68d80..1b2a01840 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -26,19 +26,16 @@ from loopy.diagnostic import ( LoopyError, WriteRaceConditionWarning, warn_with_kernel, LoopyAdvisory) -import islpy as isl from loopy.tools import memoize_on_disk -from loopy.kernel.data import make_assignment, filter_iname_tags_by_type -from loopy.kernel.tools import kernel_has_global_barriers +from loopy.kernel.data import filter_iname_tags_by_type # for the benefit of loopy.statistics, for now from loopy.type_inference import infer_unknown_types -from loopy.symbolic import RuleAwareIdentityMapper, ReductionCallbackMapper +from loopy.symbolic import RuleAwareIdentityMapper # from loopy.transform.iname import remove_any_newly_unused_inames from loopy.kernel.instruction import (MultiAssignmentBase, CInstruction, CallInstruction, _DataObliviousInstruction) -from loopy.translation_unit import TranslationUnit from loopy.kernel.function_interface import CallableKernel, ScalarCallable from pytools import ProcessLogger @@ -250,1813 +247,6 @@ def find_temporary_address_space(kernel): # }}} -# {{{ rewrite reduction to imperative form - - -# {{{ utils (not stateful) - -from collections import namedtuple - - -_InameClassification = namedtuple("_InameClassifiction", - "sequential, local_parallel, nonlocal_parallel") - - -def _classify_reduction_inames(kernel, inames): - sequential = [] - local_par = [] - nonlocal_par = [] - - from loopy.kernel.data import ( - LocalInameTagBase, UnrolledIlpTag, UnrollTag, - ConcurrentTag, filter_iname_tags_by_type) - - for iname in inames: - iname_tags = kernel.iname_tags(iname) - - if filter_iname_tags_by_type(iname_tags, (UnrollTag, UnrolledIlpTag)): - # These are nominally parallel, but we can live with - # them as sequential. 
- sequential.append(iname) - - elif filter_iname_tags_by_type(iname_tags, LocalInameTagBase): - local_par.append(iname) - - elif filter_iname_tags_by_type(iname_tags, ConcurrentTag): - nonlocal_par.append(iname) - - else: - sequential.append(iname) - - return _InameClassification( - tuple(sequential), tuple(local_par), tuple(nonlocal_par)) - - -def _add_params_to_domain(domain, param_names): - dim_type = isl.dim_type - nparams_orig = domain.dim(dim_type.param) - domain = domain.add_dims(dim_type.param, len(param_names)) - - for param_idx, param_name in enumerate(param_names): - domain = domain.set_dim_name( - dim_type.param, param_idx + nparams_orig, param_name) - - return domain - - -def _move_set_to_param_dims_except(domain, except_dims): - dim_type = isl.dim_type - - iname_idx = 0 - for iname in domain.get_var_names(dim_type.set): - if iname not in except_dims: - domain = domain.move_dims( - dim_type.param, 0, - dim_type.set, iname_idx, 1) - iname_idx -= 1 - iname_idx += 1 - - return domain - - -def _domain_depends_on_given_set_dims(domain, set_dim_names): - set_dim_names = frozenset(set_dim_names) - - return any( - set_dim_names & set(constr.get_coefficients_by_name()) - for constr in domain.get_constraints()) - - -def _check_reduction_is_triangular(kernel, expr, scan_param): - """Check whether the reduction within `expr` with scan parameters described by - the structure `scan_param` is triangular. 
This attempts to verify that the - domain for the scan and sweep inames is as follows: - - [params] -> { - [other inames..., scan_iname, sweep_iname]: - (sweep_min_value - <= sweep_iname - <= sweep_max_value) - and - (scan_min_value - <= scan_iname - <= stride * (sweep_iname - sweep_min_value) + scan_min_value) - and - (irrelevant constraints) - } - """ - - orig_domain = kernel.get_inames_domain( - frozenset((scan_param.sweep_iname, scan_param.scan_iname))) - - sweep_iname = scan_param.sweep_iname - scan_iname = scan_param.scan_iname - affs = isl.affs_from_space(orig_domain.space) - - sweep_lower_bound = isl.align_spaces( - scan_param.sweep_lower_bound, - affs[0]) - - sweep_upper_bound = isl.align_spaces( - scan_param.sweep_upper_bound, - affs[0]) - - scan_lower_bound = isl.align_spaces( - scan_param.scan_lower_bound, - affs[0]) - - from itertools import product - - for (sweep_lb_domain, sweep_lb_aff), \ - (sweep_ub_domain, sweep_ub_aff), \ - (scan_lb_domain, scan_lb_aff) in \ - product(sweep_lower_bound.get_pieces(), - sweep_upper_bound.get_pieces(), - scan_lower_bound.get_pieces()): - - # Assumptions inherited from the domains of the pwaffs - assumptions = sweep_lb_domain & sweep_ub_domain & scan_lb_domain - - # Sweep iname constraints - hyp_domain = affs[sweep_iname].ge_set(sweep_lb_aff) - hyp_domain &= affs[sweep_iname].le_set(sweep_ub_aff) - - # Scan iname constraints - hyp_domain &= affs[scan_iname].ge_set(scan_lb_aff) - hyp_domain &= affs[scan_iname].le_set( - scan_param.stride * (affs[sweep_iname] - sweep_lb_aff) - + scan_lb_aff) - - hyp_domain, = (hyp_domain & assumptions).get_basic_sets() - test_domain, = (orig_domain & assumptions).get_basic_sets() - - hyp_gist_against_test = hyp_domain.gist(test_domain) - if _domain_depends_on_given_set_dims(hyp_gist_against_test, - (sweep_iname, scan_iname)): - return False, ( - "gist of hypothesis against test domain " - "has sweep or scan dependent constraints: '%s'" - % hyp_gist_against_test) - - 
test_gist_against_hyp = test_domain.gist(hyp_domain) - if _domain_depends_on_given_set_dims(test_gist_against_hyp, - (sweep_iname, scan_iname)): - return False, ( - "gist of test against hypothesis domain " - "has sweep or scan dependent constraint: '%s'" - % test_gist_against_hyp) - - return True, "ok" - - -_ScanCandidateParameters = namedtuple( - "_ScanCandidateParameters", - "sweep_iname, scan_iname, sweep_lower_bound, " - "sweep_upper_bound, scan_lower_bound, stride") - - -def _try_infer_scan_candidate_from_expr( - kernel, expr, within_inames, sweep_iname=None): - """Analyze `expr` and determine if it can be implemented as a scan. - """ - from loopy.symbolic import Reduction - assert isinstance(expr, Reduction) - - if len(expr.inames) != 1: - raise ValueError( - "Multiple inames in reduction: '{}'".format(", ".join(expr.inames))) - - scan_iname, = expr.inames - - from loopy.kernel.tools import DomainChanger - dchg = DomainChanger(kernel, (scan_iname,)) - domain = dchg.get_original_domain() - - if sweep_iname is None: - try: - sweep_iname = _try_infer_sweep_iname( - domain, scan_iname, kernel.all_inames()) - except ValueError as v: - raise ValueError( - "Couldn't determine a sweep iname for the scan " - "expression '%s': %s" % (expr, v)) - - try: - sweep_lower_bound, sweep_upper_bound, scan_lower_bound = ( - _try_infer_scan_and_sweep_bounds( - kernel, scan_iname, sweep_iname, within_inames)) - except ValueError as v: - raise ValueError( - "Couldn't determine bounds for the scan with expression '%s' " - "(sweep iname: '%s', scan iname: '%s'): %s" - % (expr, sweep_iname, scan_iname, v)) - - try: - stride = _try_infer_scan_stride( - kernel, scan_iname, sweep_iname, sweep_lower_bound) - except ValueError as v: - raise ValueError( - "Couldn't determine a scan stride for the scan with expression '%s' " - "(sweep iname: '%s', scan iname: '%s'): %s" - % (expr, sweep_iname, scan_iname, v)) - - return _ScanCandidateParameters(sweep_iname, scan_iname, sweep_lower_bound, - 
sweep_upper_bound, scan_lower_bound, stride) - - -def _try_infer_sweep_iname(domain, scan_iname, candidate_inames): - """The sweep iname is the outer iname which guides the scan. - - E.g. for a domain of {[i,j]: 0<=i 1: - raise ValueError( - "More than one sweep iname candidate for scan iname '%s' found " - "(via constraint '%s')" % (scan_iname, constr)) - - next_candidate = candidate_vars.pop() - - if sweep_iname_candidate is None: - sweep_iname_candidate = next_candidate - defining_constraint = constr - else: - # Check next_candidate consistency - if sweep_iname_candidate != next_candidate: - raise ValueError( - "More than one sweep iname candidate for scan iname '%s' " - "found (via constraints '%s', '%s')" % - (scan_iname, defining_constraint, constr)) - - if sweep_iname_candidate is None: - raise ValueError( - "Couldn't find any sweep iname candidates for " - "scan iname '%s'" % scan_iname) - - return sweep_iname_candidate - - -def _try_infer_scan_and_sweep_bounds(kernel, scan_iname, sweep_iname, within_inames): - domain = kernel.get_inames_domain(frozenset((sweep_iname, scan_iname))) - domain = _move_set_to_param_dims_except(domain, (sweep_iname, scan_iname)) - - var_dict = domain.get_var_dict() - sweep_idx = var_dict[sweep_iname][1] - scan_idx = var_dict[scan_iname][1] - - domain = domain.project_out_except( - within_inames | kernel.non_iname_variable_names(), (isl.dim_type.param,)) - - try: - with isl.SuppressedWarnings(domain.get_ctx()): - sweep_lower_bound = domain.dim_min(sweep_idx) - sweep_upper_bound = domain.dim_max(sweep_idx) - scan_lower_bound = domain.dim_min(scan_idx) - except isl.Error as e: - raise ValueError("isl error: %s" % e) - - return (sweep_lower_bound, sweep_upper_bound, scan_lower_bound) - - -def _try_infer_scan_stride(kernel, scan_iname, sweep_iname, sweep_lower_bound): - """The stride is the number of steps the scan iname takes per iteration - of the sweep iname. This is allowed to be an integer constant. - - E.g. 
for a domain of {[i,j]: 0<=i 1: - raise ValueError("range in multiple pieces: %s" % scan_iname_range) - elif len(scan_iname_pieces) == 0: - raise ValueError("empty range found for iname '%s'" % scan_iname) - - scan_iname_constr, scan_iname_aff = scan_iname_pieces[0] - - if not scan_iname_constr.plain_is_universe(): - raise ValueError("found constraints: %s" % scan_iname_constr) - - if scan_iname_aff.dim(dim_type.div): - raise ValueError("aff has div: %s" % scan_iname_aff) - - coeffs = scan_iname_aff.get_coefficients_by_name(dim_type.param) - - if len(coeffs) == 0: - try: - scan_iname_aff.get_constant_val() - except Exception: - raise ValueError("range for aff isn't constant: '%s'" % scan_iname_aff) - - # If this point is reached we're assuming the domain is of the form - # {[i,j]: i=0 and j=0}, so the stride is technically 1 - any value - # this function returns will be verified later by - # _check_reduction_is_triangular(). - return 1 - - if sweep_iname not in coeffs: - raise ValueError("didn't find sweep iname in coeffs: %s" % sweep_iname) - - stride = coeffs[sweep_iname] - - if not stride.is_int(): - raise ValueError("stride not an integer: %s" % stride) - - if not stride.is_pos(): - raise ValueError("stride not positive: %s" % stride) - - return stride.to_python() - - -def _get_domain_with_iname_as_param(domain, iname): - dim_type = isl.dim_type - - if domain.find_dim_by_name(dim_type.param, iname) >= 0: - return domain - - iname_idx = domain.find_dim_by_name(dim_type.set, iname) - - assert iname_idx >= 0, (iname, domain) - - return domain.move_dims( - dim_type.param, domain.dim(dim_type.param), - dim_type.set, iname_idx, 1) - - -def _create_domain_for_sweep_tracking(orig_domain, - tracking_iname, sweep_iname, sweep_min_value, scan_min_value, stride): - dim_type = isl.dim_type - - subd = isl.BasicSet.universe(orig_domain.params().space) - - # Add tracking_iname and sweep iname. 
- - subd = _add_params_to_domain(subd, (sweep_iname, tracking_iname)) - - # Here we realize the domain: - # - # [..., i] -> { - # [j]: 0 <= j - l - # and - # j - l <= k * (i - m) - # and - # k * (i - m - 1) < j - l } - # where - # * i is the sweep iname - # * j is the tracking iname - # * k is the stride for the scan - # * l is the lower bound for the scan - # * m is the lower bound for the sweep iname - # - affs = isl.affs_from_space(subd.space) - - subd &= (affs[tracking_iname] - scan_min_value).ge_set(affs[0]) - subd &= (affs[tracking_iname] - scan_min_value)\ - .le_set(stride * (affs[sweep_iname] - sweep_min_value)) - subd &= (affs[tracking_iname] - scan_min_value)\ - .gt_set(stride * (affs[sweep_iname] - sweep_min_value - 1)) - - # Move tracking_iname into a set dim (NOT sweep iname). - subd = subd.move_dims( - dim_type.set, 0, - dim_type.param, subd.dim(dim_type.param) - 1, 1) - - # Simplify (maybe). - orig_domain_with_sweep_param = ( - _get_domain_with_iname_as_param(orig_domain, sweep_iname)) - subd = subd.gist_params(orig_domain_with_sweep_param.params()) - - subd, = subd.get_basic_sets() - - return subd - - -def _hackily_ensure_multi_assignment_return_values_are_scoped_private(kernel): - """ - Multi assignment function calls are currently lowered into OpenCL so that - the function call:: - - a, b = segmented_sum(x, y, z, w) - - becomes:: - - a = segmented_sum_mangled(x, y, z, w, &b). - - For OpenCL, the scope of "b" is significant, and the preamble generation - currently assumes the scope is always private. This function forces that to - be the case by introducing temporary assignments into the kernel. 
- """ - - insn_id_gen = kernel.get_instruction_id_generator() - var_name_gen = kernel.get_var_name_generator() - - new_or_updated_instructions = {} - new_temporaries = {} - - dep_map = { - insn.id: insn.depends_on for insn in kernel.instructions} - - inverse_dep_map = {insn.id: set() for insn in kernel.instructions} - - for insn_id, deps in dep_map.items(): - for dep in deps: - inverse_dep_map[dep].add(insn_id) - - del dep_map - - # {{{ utils - - def _add_to_no_sync_with(insn_id, new_no_sync_with_params): - insn = kernel.id_to_insn.get(insn_id) - insn = new_or_updated_instructions.get(insn_id, insn) - new_or_updated_instructions[insn_id] = ( - insn.copy( - no_sync_with=( - insn.no_sync_with | frozenset(new_no_sync_with_params)))) - - def _add_to_depends_on(insn_id, new_depends_on_params): - insn = kernel.id_to_insn.get(insn_id) - insn = new_or_updated_instructions.get(insn_id, insn) - new_or_updated_instructions[insn_id] = ( - insn.copy( - depends_on=insn.depends_on | frozenset(new_depends_on_params))) - - # }}} - - from loopy.kernel.instruction import CallInstruction, is_array_call - for insn in kernel.instructions: - if not isinstance(insn, CallInstruction): - continue - - if len(insn.assignees) <= 1: - continue - - if is_array_call(insn.assignees, insn.expression): - continue - - assignees = insn.assignees - assignee_var_names = insn.assignee_var_names() - - new_assignees = [assignees[0]] - newly_added_assignments_ids = set() - needs_replacement = False - - last_added_insn_id = insn.id - - from loopy.kernel.data import AddressSpace, TemporaryVariable - - FIRST_POINTER_ASSIGNEE_IDX = 1 # noqa - - for assignee_nr, assignee_var_name, assignee in zip( - range(FIRST_POINTER_ASSIGNEE_IDX, len(assignees)), - assignee_var_names[FIRST_POINTER_ASSIGNEE_IDX:], - assignees[FIRST_POINTER_ASSIGNEE_IDX:]): - - if ( - assignee_var_name in kernel.temporary_variables - and - (kernel.temporary_variables[assignee_var_name].address_space - == AddressSpace.PRIVATE)): - 
new_assignees.append(assignee) - continue - - needs_replacement = True - - # {{{ generate a new assignent instruction - - new_assignee_name = var_name_gen( - "{insn_id}_retval_{assignee_nr}" - .format(insn_id=insn.id, assignee_nr=assignee_nr)) - - new_assignment_id = insn_id_gen( - "{insn_id}_assign_retval_{assignee_nr}" - .format(insn_id=insn.id, assignee_nr=assignee_nr)) - - newly_added_assignments_ids.add(new_assignment_id) - - new_temporaries[new_assignee_name] = ( - TemporaryVariable( - name=new_assignee_name, - dtype=None, - address_space=AddressSpace.PRIVATE)) - - from pymbolic import var - new_assignee = var(new_assignee_name) - new_assignees.append(new_assignee) - - new_or_updated_instructions[new_assignment_id] = ( - make_assignment( - assignees=(assignee,), - expression=new_assignee, - id=new_assignment_id, - depends_on=frozenset([last_added_insn_id]), - depends_on_is_final=True, - no_sync_with=( - insn.no_sync_with | frozenset([(insn.id, "any")])), - predicates=insn.predicates, - within_inames=insn.within_inames)) - - last_added_insn_id = new_assignment_id - - # }}} - - if not needs_replacement: - continue - - # {{{ update originating instruction - - orig_insn = new_or_updated_instructions.get(insn.id, insn) - - new_or_updated_instructions[insn.id] = ( - orig_insn.copy(assignees=tuple(new_assignees))) - - _add_to_no_sync_with(insn.id, - [(id, "any") for id in newly_added_assignments_ids]) - - # }}} - - # {{{ squash spurious memory dependencies amongst new assignments - - for new_insn_id in newly_added_assignments_ids: - _add_to_no_sync_with(new_insn_id, - [(id, "any") - for id in newly_added_assignments_ids - if id != new_insn_id]) - - # }}} - - # {{{ update instructions that depend on the originating instruction - - for inverse_dep in inverse_dep_map[insn.id]: - _add_to_depends_on(inverse_dep, newly_added_assignments_ids) - - for insn_id, scope in ( - new_or_updated_instructions[inverse_dep].no_sync_with): - if insn_id == insn.id: - 
_add_to_no_sync_with( - inverse_dep, - [(id, scope) for id in newly_added_assignments_ids]) - - # }}} - - if not new_temporaries and not new_or_updated_instructions: - return kernel - - new_temporary_variables = kernel.temporary_variables.copy() - new_temporary_variables.update(new_temporaries) - - new_instructions = ( - list(new_or_updated_instructions.values()) - + list(insn - for insn in kernel.instructions - if insn.id not in new_or_updated_instructions)) - - return kernel.copy(temporary_variables=new_temporary_variables, - instructions=new_instructions) - - -def _insert_subdomain_into_domain_tree(kernel, domains, subdomain): - # Intersect with inames, because we could have captured some kernel params - # in here too... - dependent_inames = ( - frozenset(subdomain.get_var_names(isl.dim_type.param)) - & kernel.all_inames()) - idx, = kernel.get_leaf_domain_indices(dependent_inames) - domains.insert(idx + 1, subdomain) - -# }}} - - -class RealizeReductionCallbackMapper(ReductionCallbackMapper): - def __init__(self, callback, callables_table): - super().__init__(callback) - self.callables_table = callables_table - - def map_reduction(self, expr, **kwargs): - result, self.callables_table = self.callback(expr, self.rec, - **kwargs) - return result - - def map_if(self, expr, callables_table, guarding_predicates, nresults=1): - import pymbolic.primitives as prim - rec_cond = self.rec(expr.condition, callables_table=callables_table, - guarding_predicates=guarding_predicates, - nresults=nresults) - return prim.If(rec_cond, - self.rec(expr.then, callables_table=callables_table, - guarding_predicates=( - guarding_predicates - | frozenset([rec_cond])), - nresults=nresults), - self.rec(expr.else_, callables_table=callables_table, - guarding_predicates=( - guarding_predicates - | frozenset([prim.LogicalNot(rec_cond)])), - nresults=nresults)) - - -# @remove_any_newly_unused_inames -def realize_reduction_for_single_kernel(kernel, callables_table, - insn_id_filter=None, 
unknown_types_ok=True, automagic_scans_ok=False, - force_scan=False, force_outer_iname_for_scan=None): - """Rewrites reductions into their imperative form. With *insn_id_filter* - specified, operate only on the instruction with an instruction id matching - *insn_id_filter*. - - If *insn_id_filter* is given, only the outermost level of reductions will be - expanded, inner reductions will be left alone (because they end up in a new - instruction with a different ID, which doesn't match the filter). - - If *insn_id_filter* is not given, all reductions in all instructions will - be realized. - - If *automagic_scans_ok*, this function will attempt to rewrite triangular - reductions as scans automatically. - - If *force_scan* is *True*, this function will attempt to rewrite *all* - candidate reductions as scans and raise an error if this is not possible - (this is most useful combined with *insn_id_filter*). - - If *force_outer_iname_for_scan* is not *None*, this function will attempt - to realize candidate reductions as scans using the specified iname as the - outer (sweep) iname. - """ - - logger.debug("%s: realize reduction" % kernel.name) - - new_insns = [] - new_iname_tags = {} - - insn_id_gen = kernel.get_instruction_id_generator() - - var_name_gen = kernel.get_var_name_generator() - new_temporary_variables = kernel.temporary_variables.copy() - inames_added_for_scan = set() - inames_to_remove = set() - - # {{{ helpers - - def _strip_if_scalar(reference, val): - if len(reference) == 1: - return val[0] - else: - return val - - def preprocess_scan_arguments( - insn, expr, nresults, scan_iname, track_iname, - newly_generated_insn_id_set): - """Does iname substitution within scan arguments and returns a set of values - suitable to be passed to the binary op. 
Returns a tuple.""" - - if nresults > 1: - inner_expr = expr - - # In the case of a multi-argument scan, we need a name for each of - # the arguments in order to pass them to the binary op - so we expand - # items that are not "plain" tuples here. - if not isinstance(inner_expr, tuple): - get_args_insn_id = insn_id_gen( - "{}_{}_get".format(insn.id, "_".join(expr.inames))) - - inner_expr = expand_inner_reduction( - id=get_args_insn_id, - expr=inner_expr, - nresults=nresults, - depends_on=insn.depends_on, - within_inames=insn.within_inames | expr.inames, - within_inames_is_final=insn.within_inames_is_final, - predicates=insn.predicates, - ) - - newly_generated_insn_id_set.add(get_args_insn_id) - - updated_inner_exprs = tuple( - replace_var_within_expr(sub_expr, scan_iname, track_iname) - for sub_expr in inner_expr) - else: - updated_inner_exprs = ( - replace_var_within_expr(expr, scan_iname, track_iname),) - - return updated_inner_exprs - - def expand_inner_reduction(id, expr, nresults, depends_on, within_inames, - within_inames_is_final, predicates): - # FIXME: use make_temporaries - from pymbolic.primitives import Call - from loopy.symbolic import Reduction - assert isinstance(expr, (Call, Reduction)) - - temp_var_names = [ - var_name_gen(id + "_arg" + str(i)) - for i in range(nresults)] - - for name in temp_var_names: - from loopy.kernel.data import TemporaryVariable, AddressSpace - new_temporary_variables[name] = TemporaryVariable( - name=name, - shape=(), - dtype=None, - address_space=AddressSpace.PRIVATE) - - from pymbolic import var - temp_vars = tuple(var(n) for n in temp_var_names) - - call_insn = make_assignment( - id=id, - assignees=temp_vars, - expression=expr, - depends_on=depends_on, - within_inames=within_inames, - within_inames_is_final=within_inames_is_final, - predicates=predicates) - - generated_insns.append(call_insn) - - return temp_vars - - # }}} - - # {{{ sequential - - def map_reduction_seq(expr, rec, callables_table, nresults, arg_dtypes, - 
reduction_dtypes, guarding_predicates): - outer_insn_inames = insn.within_inames - - from loopy.kernel.data import AddressSpace - acc_var_names = make_temporaries( - name_based_on="acc_"+"_".join(expr.inames), - nvars=nresults, - shape=(), - dtypes=reduction_dtypes, - address_space=AddressSpace.PRIVATE) - - init_insn_depends_on = frozenset() - - # check first that the original kernel had global barriers - # if not, we don't need to check. Since the function - # kernel_has_global_barriers is cached, we don't do - # extra work compared to not checking. - # FIXME: Explain why we care about global barriers her - if kernel_has_global_barriers(kernel): - global_barrier = lp.find_most_recent_global_barrier(temp_kernel, - insn.id) - - if global_barrier is not None: - init_insn_depends_on |= frozenset([global_barrier]) - - from pymbolic import var - acc_vars = tuple(var(n) for n in acc_var_names) - - init_id = insn_id_gen( - "{}_{}_init".format(insn.id, "_".join(expr.inames))) - - expression, callables_table = expr.operation.neutral_element( - *arg_dtypes, callables_table=callables_table, target=kernel.target) - - init_insn = make_assignment( - id=init_id, - assignees=acc_vars, - within_inames=outer_insn_inames - frozenset(expr.inames), - within_inames_is_final=insn.within_inames_is_final, - depends_on=init_insn_depends_on, - expression=expression, - - # Do not inherit predicates: Those might read variables - # that may not yet be set, and we don't have a great way - # of figuring out what the dependencies of the accumulator - # initializer should be. - - # This way, we may initialize a few too many accumulators, - # but that's better than being incorrect. 
- # https://github.com/inducer/loopy/issues/231 - ) - - generated_insns.append(init_insn) - - update_id = insn_id_gen( - based_on="{}_{}_update".format(insn.id, "_".join(expr.inames))) - - update_insn_iname_deps = insn.within_inames | set(expr.inames) - if insn.within_inames_is_final: - update_insn_iname_deps = insn.within_inames | set(expr.inames) - - reduction_insn_depends_on = {init_id} - - # In the case of a multi-argument reduction, we need a name for each of - # the arguments in order to pass them to the binary op - so we expand - # items that are not "plain" tuples here. - if nresults > 1 and not isinstance(expr.expr, tuple): - get_args_insn_id = insn_id_gen( - "{}_{}_get".format(insn.id, "_".join(expr.inames))) - - reduction_expr = expand_inner_reduction( - id=get_args_insn_id, - expr=expr.expr, - nresults=nresults, - depends_on=insn.depends_on, - within_inames=update_insn_iname_deps, - within_inames_is_final=insn.within_inames_is_final, - predicates=guarding_predicates, - ) - - reduction_insn_depends_on.add(get_args_insn_id) - else: - reduction_expr = expr.expr - - expression, callables_table = expr.operation( - arg_dtypes, - _strip_if_scalar(acc_vars, acc_vars), - reduction_expr, - callables_table, - kernel.target) - - reduction_insn = make_assignment( - id=update_id, - assignees=acc_vars, - expression=expression, - depends_on=frozenset(reduction_insn_depends_on) | insn.depends_on, - within_inames=update_insn_iname_deps, - within_inames_is_final=insn.within_inames_is_final, - predicates=guarding_predicates,) - - generated_insns.append(reduction_insn) - - new_insn_add_depends_on.add(reduction_insn.id) - - if nresults == 1: - assert len(acc_vars) == 1 - return acc_vars[0], callables_table - else: - return acc_vars, callables_table - - # }}} - - # {{{ local-parallel - - def _get_int_iname_size(iname): - from loopy.isl_helpers import static_max_of_pw_aff - from loopy.symbolic import pw_aff_to_expr - size = pw_aff_to_expr( - static_max_of_pw_aff( - 
kernel.get_iname_bounds(iname).size, - constants_only=True)) - assert isinstance(size, int) - return size - - def _make_slab_set(iname, size): - v = isl.make_zero_and_vars([iname]) - bs, = ( - v[0].le_set(v[iname]) - & - v[iname].lt_set(v[0] + size)).get_basic_sets() - return bs - - def _make_slab_set_from_range(iname, lbound, ubound): - v = isl.make_zero_and_vars([iname]) - bs, = ( - v[iname].ge_set(v[0] + lbound) - & - v[iname].lt_set(v[0] + ubound)).get_basic_sets() - return bs - - def map_reduction_local(expr, rec, callables_table, nresults, arg_dtypes, - reduction_dtypes, guarding_predicates): - red_iname, = expr.inames - - size = _get_int_iname_size(red_iname) - - outer_insn_inames = insn.within_inames - - from loopy.kernel.data import LocalInameTagBase - outer_local_inames = tuple(oiname for oiname in outer_insn_inames - if kernel.iname_tags_of_type(oiname, LocalInameTagBase)) - - from pymbolic import var - outer_local_iname_vars = tuple( - var(oiname) for oiname in outer_local_inames) - - outer_local_iname_sizes = tuple( - _get_int_iname_size(oiname) - for oiname in outer_local_inames) - - from loopy.kernel.data import AddressSpace - - neutral_var_names = make_temporaries( - name_based_on="neutral_"+red_iname, - nvars=nresults, - shape=(), - dtypes=reduction_dtypes, - address_space=AddressSpace.PRIVATE) - - acc_var_names = make_temporaries( - name_based_on="acc_"+red_iname, - nvars=nresults, - shape=outer_local_iname_sizes + (size,), - dtypes=reduction_dtypes, - address_space=AddressSpace.LOCAL) - - acc_vars = tuple(var(n) for n in acc_var_names) - - # {{{ add separate iname to carry out the reduction - - # Doing this sheds any odd conditionals that may be active - # on our red_iname. 
- - base_exec_iname = var_name_gen("red_"+red_iname) - domains.append(_make_slab_set(base_exec_iname, size)) - new_iname_tags[base_exec_iname] = kernel.iname_tags(red_iname) - - # }}} - - base_iname_deps = outer_insn_inames - frozenset(expr.inames) - - neutral, callables_table = expr.operation.neutral_element(*arg_dtypes, - callables_table=callables_table, target=kernel.target) - init_id = insn_id_gen(f"{insn.id}_{red_iname}_init") - init_insn = make_assignment( - id=init_id, - assignees=tuple( - acc_var[outer_local_iname_vars + (var(base_exec_iname),)] - for acc_var in acc_vars), - expression=neutral, - within_inames=base_iname_deps | frozenset([base_exec_iname]), - within_inames_is_final=insn.within_inames_is_final, - depends_on=frozenset(), - # Do not inherit predicates: Those might read variables - # that may not yet be set, and we don't have a great way - # of figuring out what the dependencies of the accumulator - # initializer should be. - - # This way, we may initialize a few too many accumulators, - # but that's better than being incorrect. - # https://github.com/inducer/loopy/issues/231 - ) - generated_insns.append(init_insn) - - init_neutral_id = insn_id_gen(f"{insn.id}_{red_iname}_init_neutral") - init_neutral_insn = make_assignment( - id=init_neutral_id, - assignees=tuple(var(nvn) for nvn in neutral_var_names), - expression=neutral, - within_inames=base_iname_deps | frozenset([base_exec_iname]), - within_inames_is_final=insn.within_inames_is_final, - depends_on=frozenset(), - predicates=guarding_predicates, - ) - generated_insns.append(init_neutral_insn) - - transfer_depends_on = {init_neutral_id, init_id} - - # In the case of a multi-argument reduction, we need a name for each of - # the arguments in order to pass them to the binary op - so we expand - # items that are not "plain" tuples here. 
- if nresults > 1 and not isinstance(expr.expr, tuple): - get_args_insn_id = insn_id_gen( - f"{insn.id}_{red_iname}_get") - - reduction_expr = expand_inner_reduction( - id=get_args_insn_id, - expr=expr.expr, - nresults=nresults, - depends_on=insn.depends_on, - within_inames=( - (outer_insn_inames - frozenset(expr.inames)) - | frozenset([red_iname])), - within_inames_is_final=insn.within_inames_is_final, - predicates=guarding_predicates, - ) - - transfer_depends_on.add(get_args_insn_id) - else: - reduction_expr = expr.expr - - transfer_id = insn_id_gen(f"{insn.id}_{red_iname}_transfer") - expression, callables_table = expr.operation( - arg_dtypes, - _strip_if_scalar( - neutral_var_names, - tuple(var(nvn) for nvn in neutral_var_names)), - reduction_expr, - callables_table, - kernel.target) - transfer_insn = make_assignment( - id=transfer_id, - assignees=tuple( - acc_var[outer_local_iname_vars + (var(red_iname),)] - for acc_var in acc_vars), - expression=expression, - within_inames=( - (outer_insn_inames - frozenset(expr.inames)) - | frozenset([red_iname])), - within_inames_is_final=insn.within_inames_is_final, - depends_on=frozenset([init_id, init_neutral_id]) | insn.depends_on, - no_sync_with=frozenset([(init_id, "any")]), - predicates=insn.predicates, - ) - generated_insns.append(transfer_insn) - - cur_size = 1 - while cur_size < size: - cur_size *= 2 - - prev_id = transfer_id - bound = size - - stage_exec_iname = None - - istage = 0 - while cur_size > 1: - - new_size = cur_size // 2 - assert new_size * 2 == cur_size - - stage_exec_iname = var_name_gen("red_%s_s%d" % (red_iname, istage)) - domains.append(_make_slab_set(stage_exec_iname, bound-new_size)) - new_iname_tags[stage_exec_iname] = kernel.iname_tags(red_iname) - - stage_id = insn_id_gen("red_%s_stage_%d" % (red_iname, istage)) - expression, callables_table = expr.operation( - arg_dtypes, - _strip_if_scalar(acc_vars, tuple( - acc_var[ - outer_local_iname_vars + (var(stage_exec_iname),)] - for acc_var in 
acc_vars)), - _strip_if_scalar(acc_vars, tuple( - acc_var[ - outer_local_iname_vars + ( - var(stage_exec_iname) + new_size,)] - for acc_var in acc_vars)), - callables_table, - kernel.target) - - stage_insn = make_assignment( - id=stage_id, - assignees=tuple( - acc_var[outer_local_iname_vars + (var(stage_exec_iname),)] - for acc_var in acc_vars), - expression=expression, - within_inames=( - base_iname_deps | frozenset([stage_exec_iname])), - within_inames_is_final=insn.within_inames_is_final, - depends_on=frozenset([prev_id]), - predicates=insn.predicates, - ) - - generated_insns.append(stage_insn) - prev_id = stage_id - - cur_size = new_size - bound = cur_size - istage += 1 - - new_insn_add_depends_on.add(prev_id) - new_insn_add_no_sync_with.add((prev_id, "any")) - new_insn_add_within_inames.add(stage_exec_iname or base_exec_iname) - - if nresults == 1: - assert len(acc_vars) == 1 - return acc_vars[0][outer_local_iname_vars + (0,)], callables_table - else: - return [acc_var[outer_local_iname_vars + (0,)] for acc_var in - acc_vars], callables_table - # }}} - - # {{{ utils (stateful) - - from pytools import memoize - - @memoize - def get_or_add_sweep_tracking_iname_and_domain( - scan_iname, sweep_iname, sweep_min_value, scan_min_value, stride, - tracking_iname): - domain = temp_kernel.get_inames_domain(frozenset((scan_iname, sweep_iname))) - - inames_added_for_scan.add(tracking_iname) - - new_domain = _create_domain_for_sweep_tracking(domain, - tracking_iname, sweep_iname, sweep_min_value, scan_min_value, stride) - - _insert_subdomain_into_domain_tree(temp_kernel, domains, new_domain) - - return tracking_iname - - def replace_var_within_expr(expr, from_var, to_var): - from pymbolic.mapper.substitutor import make_subst_func - - from loopy.symbolic import ( - SubstitutionRuleMappingContext, RuleAwareSubstitutionMapper) - - rule_mapping_context = SubstitutionRuleMappingContext( - temp_kernel.substitutions, var_name_gen) - - from pymbolic import var - mapper = 
RuleAwareSubstitutionMapper( - rule_mapping_context, - make_subst_func({from_var: var(to_var)}), - within=lambda *args: True) - - return mapper(expr, temp_kernel, None) - - def make_temporaries(name_based_on, nvars, shape, dtypes, address_space): - var_names = [ - var_name_gen(name_based_on.format(index=i)) - for i in range(nvars)] - - from loopy.kernel.data import TemporaryVariable - - for name, dtype in zip(var_names, dtypes): - new_temporary_variables[name] = TemporaryVariable( - name=name, - shape=shape, - dtype=dtype, - address_space=address_space) - - return var_names - - # }}} - - # {{{ sequential scan - - def map_scan_seq(expr, rec, callables_table, nresults, arg_dtypes, - reduction_dtypes, sweep_iname, scan_iname, sweep_min_value, - scan_min_value, stride, guarding_predicates): - outer_insn_inames = insn.within_inames - inames_to_remove.add(scan_iname) - - track_iname = var_name_gen( - "{sweep_iname}__seq_scan" - .format(sweep_iname=sweep_iname)) - - get_or_add_sweep_tracking_iname_and_domain( - scan_iname, sweep_iname, sweep_min_value, scan_min_value, - stride, track_iname) - - from loopy.kernel.data import AddressSpace - acc_var_names = make_temporaries( - name_based_on="acc_" + scan_iname, - nvars=nresults, - shape=(), - dtypes=reduction_dtypes, - address_space=AddressSpace.PRIVATE) - - from pymbolic import var - acc_vars = tuple(var(n) for n in acc_var_names) - - init_id = insn_id_gen( - "{}_{}_init".format(insn.id, "_".join(expr.inames))) - - init_insn_depends_on = frozenset() - - # FIXME: Explain why we care about global barriers here - if kernel_has_global_barriers(kernel): - global_barrier = lp.find_most_recent_global_barrier(temp_kernel, insn.id) - - if global_barrier is not None: - init_insn_depends_on |= frozenset([global_barrier]) - - expression, callables_table = expr.operation.neutral_element( - *arg_dtypes, callables_table=callables_table, target=kernel.target) - - init_insn = make_assignment( - id=init_id, - assignees=acc_vars, - 
within_inames=outer_insn_inames - frozenset( - (sweep_iname,) + expr.inames), - within_inames_is_final=insn.within_inames_is_final, - depends_on=init_insn_depends_on, - expression=expression, - # Do not inherit predicates: Those might read variables - # that may not yet be set, and we don't have a great way - # of figuring out what the dependencies of the accumulator - # initializer should be. - - # This way, we may initialize a few too many accumulators, - # but that's better than being incorrect. - # https://github.com/inducer/loopy/issues/231 - ) - - generated_insns.append(init_insn) - - update_insn_depends_on = {init_insn.id} | insn.depends_on - - updated_inner_exprs = ( - preprocess_scan_arguments(insn, expr.expr, nresults, - scan_iname, track_iname, update_insn_depends_on)) - - update_id = insn_id_gen( - based_on="{}_{}_update".format(insn.id, "_".join(expr.inames))) - - update_insn_iname_deps = insn.within_inames | {track_iname} - if insn.within_inames_is_final: - update_insn_iname_deps = insn.within_inames | {track_iname} - - expression, callables_table = expr.operation( - arg_dtypes, - _strip_if_scalar(acc_vars, acc_vars), - _strip_if_scalar(acc_vars, updated_inner_exprs), - callables_table, - kernel.target) - - scan_insn = make_assignment( - id=update_id, - assignees=acc_vars, - expression=expression, - depends_on=frozenset(update_insn_depends_on), - within_inames=update_insn_iname_deps, - no_sync_with=insn.no_sync_with, - within_inames_is_final=insn.within_inames_is_final, - predicates=guarding_predicates, - ) - - generated_insns.append(scan_insn) - - new_insn_add_depends_on.add(scan_insn.id) - - if nresults == 1: - assert len(acc_vars) == 1 - return acc_vars[0], callables_table - else: - return acc_vars, callables_table - - # }}} - - # {{{ local-parallel scan - - def map_scan_local(expr, rec, callables_table, nresults, arg_dtypes, - reduction_dtypes, sweep_iname, scan_iname, sweep_min_value, - scan_min_value, stride, guarding_predicates): - - scan_size 
= _get_int_iname_size(sweep_iname) - - assert scan_size > 0 - - if scan_size == 1: - return map_reduction_seq(expr, rec, callables_table, - nresults, arg_dtypes, reduction_dtypes, - guarding_predicates) - - outer_insn_inames = insn.within_inames - - from loopy.kernel.data import LocalInameTagBase - outer_local_inames = tuple(oiname for oiname in outer_insn_inames - if kernel.iname_tags_of_type(oiname, LocalInameTagBase) - and oiname != sweep_iname) - - from pymbolic import var - outer_local_iname_vars = tuple( - var(oiname) for oiname in outer_local_inames) - - outer_local_iname_sizes = tuple( - _get_int_iname_size(oiname) - for oiname in outer_local_inames) - - track_iname = var_name_gen( - "{sweep_iname}__pre_scan" - .format(sweep_iname=sweep_iname)) - - get_or_add_sweep_tracking_iname_and_domain( - scan_iname, sweep_iname, sweep_min_value, scan_min_value, stride, - track_iname) - - # {{{ add separate iname to carry out the scan - - # Doing this sheds any odd conditionals that may be active - # on our scan_iname. 
- - base_exec_iname = var_name_gen(sweep_iname + "__scan") - domains.append(_make_slab_set(base_exec_iname, scan_size)) - new_iname_tags[base_exec_iname] = kernel.iname_tags(sweep_iname) - - # }}} - - from loopy.kernel.data import AddressSpace - - read_var_names = make_temporaries( - name_based_on="read_"+scan_iname+"_arg_{index}", - nvars=nresults, - shape=(), - dtypes=reduction_dtypes, - address_space=AddressSpace.PRIVATE) - - acc_var_names = make_temporaries( - name_based_on="acc_"+scan_iname, - nvars=nresults, - shape=outer_local_iname_sizes + (scan_size,), - dtypes=reduction_dtypes, - address_space=AddressSpace.LOCAL) - - acc_vars = tuple(var(n) for n in acc_var_names) - read_vars = tuple(var(n) for n in read_var_names) - - base_iname_deps = (outer_insn_inames - - frozenset(expr.inames) - frozenset([sweep_iname])) - - neutral, callables_table = expr.operation.neutral_element( - *arg_dtypes, callables_table=callables_table, target=kernel.target) - - init_insn_depends_on = insn.depends_on - - # FIXME: Explain why we care about global barriers here - if kernel_has_global_barriers(kernel): - global_barrier = lp.find_most_recent_global_barrier(temp_kernel, insn.id) - - if global_barrier is not None: - init_insn_depends_on |= frozenset([global_barrier]) - - init_id = insn_id_gen(f"{insn.id}_{scan_iname}_init") - init_insn = make_assignment( - id=init_id, - assignees=tuple( - acc_var[outer_local_iname_vars + (var(base_exec_iname),)] - for acc_var in acc_vars), - expression=neutral, - within_inames=base_iname_deps | frozenset([base_exec_iname]), - within_inames_is_final=insn.within_inames_is_final, - depends_on=init_insn_depends_on, - # Do not inherit predicates: Those might read variables - # that may not yet be set, and we don't have a great way - # of figuring out what the dependencies of the accumulator - # initializer should be. - - # This way, we may initialize a few too many accumulators, - # but that's better than being incorrect. 
- # https://github.com/inducer/loopy/issues/231 - ) - generated_insns.append(init_insn) - - transfer_insn_depends_on = {init_insn.id} | insn.depends_on - - updated_inner_exprs = ( - preprocess_scan_arguments(insn, expr.expr, nresults, - scan_iname, track_iname, transfer_insn_depends_on)) - - from loopy.symbolic import Reduction - - from loopy.symbolic import pw_aff_to_expr - sweep_min_value_expr = pw_aff_to_expr(sweep_min_value) - - transfer_id = insn_id_gen(f"{insn.id}_{scan_iname}_transfer") - transfer_insn = make_assignment( - id=transfer_id, - assignees=tuple( - acc_var[outer_local_iname_vars - + (var(sweep_iname) - sweep_min_value_expr,)] - for acc_var in acc_vars), - expression=Reduction( - operation=expr.operation, - inames=(track_iname,), - expr=_strip_if_scalar(acc_vars, updated_inner_exprs), - allow_simultaneous=False, - ), - within_inames=outer_insn_inames - frozenset(expr.inames), - within_inames_is_final=insn.within_inames_is_final, - depends_on=frozenset(transfer_insn_depends_on), - no_sync_with=frozenset([(init_id, "any")]) | insn.no_sync_with, - predicates=insn.predicates, - ) - - generated_insns.append(transfer_insn) - - prev_id = transfer_id - - istage = 0 - cur_size = 1 - - while cur_size < scan_size: - stage_exec_iname = var_name_gen("%s__scan_s%d" % (sweep_iname, istage)) - domains.append( - _make_slab_set_from_range(stage_exec_iname, cur_size, scan_size)) - new_iname_tags[stage_exec_iname] = kernel.iname_tags(sweep_iname) - - for read_var, acc_var in zip(read_vars, acc_vars): - read_stage_id = insn_id_gen( - "scan_%s_read_stage_%d" % (scan_iname, istage)) - - read_stage_insn = make_assignment( - id=read_stage_id, - assignees=(read_var,), - expression=( - acc_var[ - outer_local_iname_vars - + (var(stage_exec_iname) - cur_size,)]), - within_inames=( - base_iname_deps | frozenset([stage_exec_iname])), - within_inames_is_final=insn.within_inames_is_final, - depends_on=frozenset([prev_id]), - predicates=insn.predicates, - ) - - if cur_size == 1: - 
# Performance hack: don't add a barrier here with transfer_insn. - # NOTE: This won't work if the way that local inames - # are lowered changes. - read_stage_insn = read_stage_insn.copy( - no_sync_with=( - read_stage_insn.no_sync_with - | frozenset([(transfer_id, "any")]))) - - generated_insns.append(read_stage_insn) - prev_id = read_stage_id - - write_stage_id = insn_id_gen( - "scan_%s_write_stage_%d" % (scan_iname, istage)) - - expression, callables_table = expr.operation( - arg_dtypes, - _strip_if_scalar(acc_vars, read_vars), - _strip_if_scalar(acc_vars, tuple( - acc_var[ - outer_local_iname_vars + (var(stage_exec_iname),)] - for acc_var in acc_vars)), - callables_table, - kernel.target) - - write_stage_insn = make_assignment( - id=write_stage_id, - assignees=tuple( - acc_var[outer_local_iname_vars + (var(stage_exec_iname),)] - for acc_var in acc_vars), - expression=expression, - within_inames=( - base_iname_deps | frozenset([stage_exec_iname])), - within_inames_is_final=insn.within_inames_is_final, - depends_on=frozenset([prev_id]), - predicates=insn.predicates, - ) - - generated_insns.append(write_stage_insn) - prev_id = write_stage_id - - cur_size *= 2 - istage += 1 - - new_insn_add_depends_on.add(prev_id) - new_insn_add_within_inames.add(sweep_iname) - - output_idx = var(sweep_iname) - sweep_min_value_expr - - if nresults == 1: - assert len(acc_vars) == 1 - return (acc_vars[0][outer_local_iname_vars + (output_idx,)], - callables_table) - else: - return [acc_var[outer_local_iname_vars + (output_idx,)] - for acc_var in acc_vars], callables_table - - # }}} - - # {{{ seq/par dispatch - - def map_reduction(expr, rec, callables_table, - guarding_predicates, nresults=1): - nonlocal insn_changed - - # Only expand one level of reduction at a time, going from outermost to - # innermost. Otherwise we get the (iname + insn) dependencies wrong. 
- - from loopy.type_inference import ( - infer_arg_and_reduction_dtypes_for_reduction_expression) - arg_dtypes, reduction_dtypes = ( - infer_arg_and_reduction_dtypes_for_reduction_expression( - temp_kernel, expr, callables_table, unknown_types_ok)) - - outer_insn_inames = insn.within_inames - bad_inames = frozenset(expr.inames) & outer_insn_inames - if bad_inames: - raise LoopyError("reduction used within loop(s) that it was " - "supposed to reduce over: " + ", ".join(bad_inames)) - - iname_classes = _classify_reduction_inames(temp_kernel, expr.inames) - - n_sequential = len(iname_classes.sequential) - n_local_par = len(iname_classes.local_parallel) - n_nonlocal_par = len(iname_classes.nonlocal_parallel) - - really_force_scan = force_scan and ( - len(expr.inames) != 1 or expr.inames[0] not in inames_added_for_scan) - - def _error_if_force_scan_on(cls, msg): - if really_force_scan: - raise cls(msg) - - may_be_implemented_as_scan = False - if force_scan or automagic_scans_ok: - from loopy.diagnostic import ReductionIsNotTriangularError - - try: - # Try to determine scan candidate information (sweep iname, scan - # iname, etc). - scan_param = _try_infer_scan_candidate_from_expr( - temp_kernel, expr, outer_insn_inames, - sweep_iname=force_outer_iname_for_scan) - - except ValueError as v: - error = str(v) - - else: - # Ensures the reduction is triangular (somewhat expensive). - may_be_implemented_as_scan, error = ( - _check_reduction_is_triangular( - temp_kernel, expr, scan_param)) - - if not may_be_implemented_as_scan: - _error_if_force_scan_on(ReductionIsNotTriangularError, error) - - # {{{ sanity checks - - if n_local_par and n_sequential: - raise LoopyError("Reduction over '%s' contains both parallel and " - "sequential inames. It must be split " - "(using split_reduction_{in,out}ward) " - "before code generation." - % ", ".join(expr.inames)) - - if n_local_par > 1: - raise LoopyError("Reduction over '%s' contains more than" - "one parallel iname. 
It must be split " - "(using split_reduction_{in,out}ward) " - "before code generation." - % ", ".join(expr.inames)) - - if n_nonlocal_par: - bad_inames = iname_classes.nonlocal_parallel - raise LoopyError("the only form of parallelism supported " - "by reductions is 'local'--found iname(s) '%s' " - "respectively tagged '%s'" - % (", ".join(bad_inames), - ", ".join(str(kernel.iname_tags(iname)) - for iname in bad_inames))) - - # }}} - - insn_changed = True - - if n_local_par == 0 and n_sequential == 0: - from loopy.diagnostic import warn_with_kernel - warn_with_kernel(kernel, "empty_reduction", - "Empty reduction found (no inames to reduce over). " - "Eliminating.") - - # We're not supposed to reduce/sum at all. (Note how this is distinct - # from an empty reduction--there is an element here, just no inames - # to reduce over. It's rather similar to an array with () shape in - # numpy.) - - return expr.expr, callables_table - - if may_be_implemented_as_scan: - assert force_scan or automagic_scans_ok - - # We require the "scan" iname to be tagged sequential. - if n_sequential: - sweep_iname = scan_param.sweep_iname - sweep_class = _classify_reduction_inames(kernel, (sweep_iname,)) - - sequential = sweep_iname in sweep_class.sequential - parallel = sweep_iname in sweep_class.local_parallel - bad_parallel = sweep_iname in sweep_class.nonlocal_parallel - - if sweep_iname not in outer_insn_inames: - _error_if_force_scan_on(LoopyError, - "Sweep iname '%s' was detected, but is not an iname " - "for the instruction." % sweep_iname) - elif bad_parallel: - _error_if_force_scan_on(LoopyError, - "Sweep iname '%s' has an unsupported parallel tag '%s' " - "- the only parallelism allowed is 'local'." 
% - (sweep_iname, - ", ".join(tag.key - for tag in temp_kernel.iname_tags(sweep_iname)))) - elif parallel: - return map_scan_local( - expr, rec, callables_table, nresults, - arg_dtypes, reduction_dtypes, - sweep_iname, scan_param.scan_iname, - scan_param.sweep_lower_bound, - scan_param.scan_lower_bound, - scan_param.stride, - guarding_predicates) - elif sequential: - return map_scan_seq( - expr, rec, callables_table, nresults, - arg_dtypes, reduction_dtypes, sweep_iname, - scan_param.scan_iname, - scan_param.sweep_lower_bound, - scan_param.scan_lower_bound, - scan_param.stride, - guarding_predicates) - - # fallthrough to reduction implementation - - else: - assert n_local_par > 0 - scan_iname, = expr.inames - _error_if_force_scan_on(LoopyError, - "Scan iname '%s' is parallel tagged: this is not allowed " - "(only the sweep iname should be tagged if parallelism " - "is desired)." % scan_iname) - - # fallthrough to reduction implementation - - if n_sequential: - assert n_local_par == 0 - return map_reduction_seq(expr, rec, callables_table, - nresults, arg_dtypes, reduction_dtypes, - guarding_predicates) - else: - assert n_local_par > 0 - return map_reduction_local( - expr, rec, callables_table, nresults, arg_dtypes, - reduction_dtypes, guarding_predicates) - - # }}} - - cb_mapper = RealizeReductionCallbackMapper(map_reduction, callables_table) - - insn_queue = kernel.instructions[:] - insn_id_replacements = {} - domains = kernel.domains[:] - - temp_kernel = kernel - kernel_changed = False - - import loopy as lp - while insn_queue: - new_insn_add_depends_on = set() - new_insn_add_no_sync_with = set() - new_insn_add_within_inames = set() - - generated_insns = [] - insn_changed = False - - insn = insn_queue.pop(0) - - if insn_id_filter is not None and insn.id != insn_id_filter \ - or not isinstance(insn, lp.MultiAssignmentBase): - new_insns.append(insn) - continue - - nresults = len(insn.assignees) - - # Run reduction expansion. 
- from loopy.symbolic import Reduction - if isinstance(insn.expression, Reduction) and nresults > 1: - new_expressions = cb_mapper(insn.expression, - callables_table=cb_mapper.callables_table, - guarding_predicates=insn.predicates, - nresults=nresults) - else: - new_expressions = cb_mapper(insn.expression, - callables_table=cb_mapper.callables_table, - guarding_predicates=insn.predicates), - - if insn_changed: - # An expansion happened, so insert the generated stuff plus - # ourselves back into the queue. - - result_assignment_dep_on = \ - insn.depends_on | frozenset(new_insn_add_depends_on) - kwargs = insn.get_copy_kwargs( - no_sync_with=insn.no_sync_with - | frozenset(new_insn_add_no_sync_with), - within_inames=( - insn.within_inames - | new_insn_add_within_inames)) - - kwargs.pop("id") - kwargs.pop("depends_on") - kwargs.pop("expression") - kwargs.pop("assignee", None) - kwargs.pop("assignees", None) - kwargs.pop("temp_var_type", None) - kwargs.pop("temp_var_types", None) - - if isinstance(insn.expression, Reduction) and nresults > 1: - result_assignment_ids = [ - insn_id_gen(insn.id) for i in range(nresults)] - replacement_insns = [ - lp.Assignment( - id=result_assignment_ids[i], - depends_on=( - result_assignment_dep_on - | (frozenset([result_assignment_ids[i-1]]) - if i else frozenset())), - assignee=assignee, - expression=new_expr, - **kwargs) - for i, (assignee, new_expr) in enumerate(zip( - insn.assignees, new_expressions))] - - insn_id_replacements[insn.id] = [ - rinsn.id for rinsn in replacement_insns] - else: - new_expr, = new_expressions - # since we are replacing the instruction with - # only one instruction, there's no need to replace id - replacement_insns = [ - make_assignment( - id=insn.id, - depends_on=result_assignment_dep_on, - assignees=insn.assignees, - expression=new_expr, - **kwargs) - ] - - insn_queue = generated_insns + replacement_insns + insn_queue - - # The reduction expander needs an up-to-date kernel - # object to find dependencies. 
Keep temp_kernel up-to-date. - - temp_kernel = kernel.copy( - instructions=new_insns + insn_queue, - temporary_variables=new_temporary_variables, - domains=domains) - temp_kernel = lp.replace_instruction_ids( - temp_kernel, insn_id_replacements) - kernel_changed = True - else: - # nothing happened, we're done with insn - assert not new_insn_add_depends_on - - new_insns.append(insn) - - if kernel_changed: - kernel = kernel.copy( - instructions=new_insns, - temporary_variables=new_temporary_variables, - domains=domains) - - kernel = lp.replace_instruction_ids(kernel, insn_id_replacements) - - from loopy.transform.iname import tag_inames - kernel = tag_inames(kernel, new_iname_tags) - - kernel = ( - _hackily_ensure_multi_assignment_return_values_are_scoped_private( - kernel)) - - return kernel, cb_mapper.callables_table - - -def realize_reduction(program, *args, **kwargs): - assert isinstance(program, TranslationUnit) - - callables_table = dict(program.callables_table) - kernels_to_scan = [in_knl_callable.subkernel - for in_knl_callable in program.callables_table.values() - if isinstance(in_knl_callable, CallableKernel)] - - for knl in kernels_to_scan: - new_knl, callables_table = realize_reduction_for_single_kernel( - knl, callables_table, *args, **kwargs) - in_knl_callable = callables_table[knl.name].copy( - subkernel=new_knl) - callables_table[knl.name] = in_knl_callable - - return program.copy(callables_table=callables_table) - -# }}} - - # {{{ realize_ilp def realize_ilp(kernel): @@ -2449,6 +639,7 @@ def preprocess_program(program, device=None): # because it manipulates the depends_on field, which could prevent # defaults from being applied. 
+ from loopy.transform.realize_reduction import realize_reduction program = realize_reduction(program, unknown_types_ok=False) # {{{ preprocess callable kernels diff --git a/loopy/transform/realize_reduction.py b/loopy/transform/realize_reduction.py new file mode 100644 index 000000000..c02c05fdf --- /dev/null +++ b/loopy/transform/realize_reduction.py @@ -0,0 +1,2053 @@ +__copyright__ = """ +Copyright (C) 2012 Andreas Kloeckner +Copyright (C) 2022 University of Illinois Board of Trustees +""" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+""" + + +from dataclasses import dataclass +from typing import Tuple, Dict, Callable, List, Optional, Set, Sequence + +import logging +logger = logging.getLogger(__name__) + +from pytools import memoize_on_first_arg +from pytools.tag import Tag +import islpy as isl + +from loopy.kernel.data import make_assignment +from loopy.kernel.tools import ( + kernel_has_global_barriers, find_most_recent_global_barrier) +from loopy.symbolic import ReductionCallbackMapper +from loopy.translation_unit import TranslationUnit +from loopy.kernel.function_interface import CallableKernel +from loopy.kernel.data import TemporaryVariable, AddressSpace +from loopy.kernel.instruction import ( + InstructionBase, MultiAssignmentBase, Assignment) +from loopy.kernel import LoopKernel +from loopy.diagnostic import ( + LoopyError, warn_with_kernel, ReductionIsNotTriangularError) +from loopy.transform.instruction import replace_instruction_ids_in_insn + + +# {{{ reduction realization context + +@dataclass(frozen=True) +class _ReductionRealizationContext: + # {{{ read-only + + force_scan: bool + automagic_scans_ok: bool + unknown_types_ok: bool + + # FIXME: This feels like a broken-by-design concept + force_outer_iname_for_scan: Optional[str] + + # We use the original kernel for a number of lookups whose value + # we do not change and which might be already cached on it. + orig_kernel: LoopKernel + + kernel: LoopKernel + + # FIXME: This shouldn't be here. We might generate multiple instructions + # in a nested manner. Why should the 'top-level' instruction be special? + insn: InstructionBase + + # }}} + + # {{{ internally mutable + + insn_id_gen: Callable[[str], str] + var_name_gen: Callable[[str], str] + + additional_temporary_variables: Dict[str, TemporaryVariable] + additional_insns: List[InstructionBase] + domains: List[isl.BasicSet] + additional_iname_tags: Dict[str, Sequence[Tag]] + + # FIXME: This is a broken-by-design concept. Local-parallel scans emit a + # reduction internally. 
This serves to avoid force_scan acting on that + # reduction. + inames_added_for_scan: Set[str] + + # FIXME: Clarify how these relate to recursively generated instructions. + new_insn_add_depends_on: Set[str] + new_insn_add_no_sync_with: Set[Tuple[str, str]] + new_insn_add_within_inames: Set[str] + + # }}} + + # {{{ change tracking + + were_changes_made: bool + + def changes_made(self): + object.__setattr__(self, "were_changes_made", True) + + # }}} + +# }}} + + +# {{{ iname/domain wrangling + +@dataclass(frozen=True) +class _InameClassification: + sequential: Tuple[str, ...] + local_parallel: Tuple[str, ...] + nonlocal_parallel: Tuple[str, ...] + + +def _classify_reduction_inames(kernel, inames): + sequential = [] + local_par = [] + nonlocal_par = [] + + from loopy.kernel.data import ( + LocalInameTagBase, UnrolledIlpTag, UnrollTag, + ConcurrentTag, filter_iname_tags_by_type) + + for iname in inames: + iname_tags = kernel.iname_tags(iname) + + if filter_iname_tags_by_type(iname_tags, (UnrollTag, UnrolledIlpTag)): + # These are nominally parallel, but we can live with + # them as sequential. 
+ sequential.append(iname) + + elif filter_iname_tags_by_type(iname_tags, LocalInameTagBase): + local_par.append(iname) + + elif filter_iname_tags_by_type(iname_tags, ConcurrentTag): + nonlocal_par.append(iname) + + else: + sequential.append(iname) + + return _InameClassification( + tuple(sequential), tuple(local_par), tuple(nonlocal_par)) + + +def _add_params_to_domain(domain, param_names): + dim_type = isl.dim_type + nparams_orig = domain.dim(dim_type.param) + domain = domain.add_dims(dim_type.param, len(param_names)) + + for param_idx, param_name in enumerate(param_names): + domain = domain.set_dim_name( + dim_type.param, param_idx + nparams_orig, param_name) + + return domain + + +def _move_set_to_param_dims_except(domain, except_dims): + dim_type = isl.dim_type + + iname_idx = 0 + for iname in domain.get_var_names(dim_type.set): + if iname not in except_dims: + domain = domain.move_dims( + dim_type.param, 0, + dim_type.set, iname_idx, 1) + iname_idx -= 1 + iname_idx += 1 + + return domain + + +def _domain_depends_on_given_set_dims(domain, set_dim_names): + set_dim_names = frozenset(set_dim_names) + + return any( + set_dim_names & set(constr.get_coefficients_by_name()) + for constr in domain.get_constraints()) + + +def _insert_subdomain_into_domain_tree(kernel, domains, subdomain): + # Intersect with inames, because we could have captured some kernel params + # in here too... + dependent_inames = ( + frozenset(subdomain.get_var_names(isl.dim_type.param)) + & kernel.all_inames()) + idx, = kernel.get_leaf_domain_indices(dependent_inames) + domains.insert(idx + 1, subdomain) + +# }}} + + +# {{{ scan inference + +def _check_reduction_is_triangular(kernel, expr, scan_param): + """Check whether the reduction within `expr` with scan parameters described by + the structure `scan_param` is triangular. 
This attempts to verify that the + domain for the scan and sweep inames is as follows: + + [params] -> { + [other inames..., scan_iname, sweep_iname]: + (sweep_min_value + <= sweep_iname + <= sweep_max_value) + and + (scan_min_value + <= scan_iname + <= stride * (sweep_iname - sweep_min_value) + scan_min_value) + and + (irrelevant constraints) + } + """ + + orig_domain = kernel.get_inames_domain( + frozenset((scan_param.sweep_iname, scan_param.scan_iname))) + + sweep_iname = scan_param.sweep_iname + scan_iname = scan_param.scan_iname + affs = isl.affs_from_space(orig_domain.space) + + sweep_lower_bound = isl.align_spaces( + scan_param.sweep_lower_bound, + affs[0]) + + sweep_upper_bound = isl.align_spaces( + scan_param.sweep_upper_bound, + affs[0]) + + scan_lower_bound = isl.align_spaces( + scan_param.scan_lower_bound, + affs[0]) + + from itertools import product + + for (sweep_lb_domain, sweep_lb_aff), \ + (sweep_ub_domain, sweep_ub_aff), \ + (scan_lb_domain, scan_lb_aff) in \ + product(sweep_lower_bound.get_pieces(), + sweep_upper_bound.get_pieces(), + scan_lower_bound.get_pieces()): + + # Assumptions inherited from the domains of the pwaffs + assumptions = sweep_lb_domain & sweep_ub_domain & scan_lb_domain + + # Sweep iname constraints + hyp_domain = affs[sweep_iname].ge_set(sweep_lb_aff) + hyp_domain &= affs[sweep_iname].le_set(sweep_ub_aff) + + # Scan iname constraints + hyp_domain &= affs[scan_iname].ge_set(scan_lb_aff) + hyp_domain &= affs[scan_iname].le_set( + scan_param.stride * (affs[sweep_iname] - sweep_lb_aff) + + scan_lb_aff) + + hyp_domain, = (hyp_domain & assumptions).get_basic_sets() + test_domain, = (orig_domain & assumptions).get_basic_sets() + + hyp_gist_against_test = hyp_domain.gist(test_domain) + if _domain_depends_on_given_set_dims(hyp_gist_against_test, + (sweep_iname, scan_iname)): + return False, ( + "gist of hypothesis against test domain " + "has sweep or scan dependent constraints: '%s'" + % hyp_gist_against_test) + + 
test_gist_against_hyp = test_domain.gist(hyp_domain) + if _domain_depends_on_given_set_dims(test_gist_against_hyp, + (sweep_iname, scan_iname)): + return False, ( + "gist of test against hypothesis domain " + "has sweep or scan dependent constraint: '%s'" + % test_gist_against_hyp) + + return True, "ok" + + +@dataclass(frozen=True) +class _ScanCandidateParameters: + sweep_iname: str + scan_iname: str + sweep_lower_bound: isl.PwAff + sweep_upper_bound: isl.PwAff + scan_lower_bound: isl.PwAff + stride: int + + +def _try_infer_scan_candidate_from_expr( + kernel, expr, within_inames, sweep_iname=None): + """Analyze `expr` and determine if it can be implemented as a scan. + """ + from loopy.symbolic import Reduction + assert isinstance(expr, Reduction) + + if len(expr.inames) != 1: + raise ValueError( + "Multiple inames in reduction: '{}'".format(", ".join(expr.inames))) + + scan_iname, = expr.inames + + from loopy.kernel.tools import DomainChanger + dchg = DomainChanger(kernel, (scan_iname,)) + domain = dchg.get_original_domain() + + if sweep_iname is None: + try: + sweep_iname = _try_infer_sweep_iname( + domain, scan_iname, kernel.all_inames()) + except ValueError as v: + raise ValueError( + "Couldn't determine a sweep iname for the scan " + "expression '%s': %s" % (expr, v)) + + try: + sweep_lower_bound, sweep_upper_bound, scan_lower_bound = ( + _try_infer_scan_and_sweep_bounds( + kernel, scan_iname, sweep_iname, within_inames)) + except ValueError as v: + raise ValueError( + "Couldn't determine bounds for the scan with expression '%s' " + "(sweep iname: '%s', scan iname: '%s'): %s" + % (expr, sweep_iname, scan_iname, v)) + + try: + stride = _try_infer_scan_stride( + kernel, scan_iname, sweep_iname, sweep_lower_bound) + except ValueError as v: + raise ValueError( + "Couldn't determine a scan stride for the scan with expression '%s' " + "(sweep iname: '%s', scan iname: '%s'): %s" + % (expr, sweep_iname, scan_iname, v)) + + return _ScanCandidateParameters(sweep_iname, 
scan_iname, sweep_lower_bound, + sweep_upper_bound, scan_lower_bound, stride) + + +def _try_infer_sweep_iname(domain, scan_iname, candidate_inames): + """The sweep iname is the outer iname which guides the scan. + + E.g. for a domain of {[i,j]: 0<=i 1: + raise ValueError( + "More than one sweep iname candidate for scan iname '%s' found " + "(via constraint '%s')" % (scan_iname, constr)) + + next_candidate = candidate_vars.pop() + + if sweep_iname_candidate is None: + sweep_iname_candidate = next_candidate + defining_constraint = constr + else: + # Check next_candidate consistency + if sweep_iname_candidate != next_candidate: + raise ValueError( + "More than one sweep iname candidate for scan iname '%s' " + "found (via constraints '%s', '%s')" % + (scan_iname, defining_constraint, constr)) + + if sweep_iname_candidate is None: + raise ValueError( + "Couldn't find any sweep iname candidates for " + "scan iname '%s'" % scan_iname) + + return sweep_iname_candidate + + +def _try_infer_scan_and_sweep_bounds(kernel, scan_iname, sweep_iname, within_inames): + domain = kernel.get_inames_domain(frozenset((sweep_iname, scan_iname))) + domain = _move_set_to_param_dims_except(domain, (sweep_iname, scan_iname)) + + var_dict = domain.get_var_dict() + sweep_idx = var_dict[sweep_iname][1] + scan_idx = var_dict[scan_iname][1] + + domain = domain.project_out_except( + within_inames | kernel.non_iname_variable_names(), (isl.dim_type.param,)) + + try: + with isl.SuppressedWarnings(domain.get_ctx()): + sweep_lower_bound = domain.dim_min(sweep_idx) + sweep_upper_bound = domain.dim_max(sweep_idx) + scan_lower_bound = domain.dim_min(scan_idx) + except isl.Error as e: + raise ValueError("isl error: %s" % e) + + return (sweep_lower_bound, sweep_upper_bound, scan_lower_bound) + + +def _try_infer_scan_stride(kernel, scan_iname, sweep_iname, sweep_lower_bound): + """The stride is the number of steps the scan iname takes per iteration + of the sweep iname. 
This is allowed to be an integer constant. + + E.g. for a domain of {[i,j]: 0<=i 1: + raise ValueError("range in multiple pieces: %s" % scan_iname_range) + elif len(scan_iname_pieces) == 0: + raise ValueError("empty range found for iname '%s'" % scan_iname) + + scan_iname_constr, scan_iname_aff = scan_iname_pieces[0] + + if not scan_iname_constr.plain_is_universe(): + raise ValueError("found constraints: %s" % scan_iname_constr) + + if scan_iname_aff.dim(dim_type.div): + raise ValueError("aff has div: %s" % scan_iname_aff) + + coeffs = scan_iname_aff.get_coefficients_by_name(dim_type.param) + + if len(coeffs) == 0: + try: + scan_iname_aff.get_constant_val() + except Exception: + raise ValueError("range for aff isn't constant: '%s'" % scan_iname_aff) + + # If this point is reached we're assuming the domain is of the form + # {[i,j]: i=0 and j=0}, so the stride is technically 1 - any value + # this function returns will be verified later by + # _check_reduction_is_triangular(). + return 1 + + if sweep_iname not in coeffs: + raise ValueError("didn't find sweep iname in coeffs: %s" % sweep_iname) + + stride = coeffs[sweep_iname] + + if not stride.is_int(): + raise ValueError("stride not an integer: %s" % stride) + + if not stride.is_pos(): + raise ValueError("stride not positive: %s" % stride) + + return stride.to_python() + +# }}} + + +# {{{ domain creation for scans + +def _get_domain_with_iname_as_param(domain, iname): + dim_type = isl.dim_type + + if domain.find_dim_by_name(dim_type.param, iname) >= 0: + return domain + + iname_idx = domain.find_dim_by_name(dim_type.set, iname) + + assert iname_idx >= 0, (iname, domain) + + return domain.move_dims( + dim_type.param, domain.dim(dim_type.param), + dim_type.set, iname_idx, 1) + + +def _create_domain_for_sweep_tracking(orig_domain, + tracking_iname, sweep_iname, sweep_min_value, scan_min_value, stride): + dim_type = isl.dim_type + + subd = isl.BasicSet.universe(orig_domain.params().space) + + # Add tracking_iname and 
sweep iname. + + subd = _add_params_to_domain(subd, (sweep_iname, tracking_iname)) + + # Here we realize the domain: + # + # [..., i] -> { + # [j]: 0 <= j - l + # and + # j - l <= k * (i - m) + # and + # k * (i - m - 1) < j - l } + # where + # * i is the sweep iname + # * j is the tracking iname + # * k is the stride for the scan + # * l is the lower bound for the scan + # * m is the lower bound for the sweep iname + # + affs = isl.affs_from_space(subd.space) + + subd &= (affs[tracking_iname] - scan_min_value).ge_set(affs[0]) + subd &= (affs[tracking_iname] - scan_min_value)\ + .le_set(stride * (affs[sweep_iname] - sweep_min_value)) + subd &= (affs[tracking_iname] - scan_min_value)\ + .gt_set(stride * (affs[sweep_iname] - sweep_min_value - 1)) + + # Move tracking_iname into a set dim (NOT sweep iname). + subd = subd.move_dims( + dim_type.set, 0, + dim_type.param, subd.dim(dim_type.param) - 1, 1) + + # Simplify (maybe). + orig_domain_with_sweep_param = ( + _get_domain_with_iname_as_param(orig_domain, sweep_iname)) + subd = subd.gist_params(orig_domain_with_sweep_param.params()) + + subd, = subd.get_basic_sets() + + return subd + +# }}} + + +# {{{ _hackily_ensure_multi_assignment_return_values_are_scoped_private + +def _hackily_ensure_multi_assignment_return_values_are_scoped_private(kernel): + """ + Multi assignment function calls are currently lowered into OpenCL so that + the function call:: + + a, b = segmented_sum(x, y, z, w) + + becomes:: + + a = segmented_sum_mangled(x, y, z, w, &b). + + For OpenCL, the scope of "b" is significant, and the preamble generation + currently assumes the scope is always private. This function forces that to + be the case by introducing temporary assignments into the kernel. 
+ """ + + insn_id_gen = kernel.get_instruction_id_generator() + var_name_gen = kernel.get_var_name_generator() + + new_or_updated_instructions = {} + new_temporaries = {} + + dep_map = { + insn.id: insn.depends_on for insn in kernel.instructions} + + inverse_dep_map = {insn.id: set() for insn in kernel.instructions} + + for insn_id, deps in dep_map.items(): + for dep in deps: + inverse_dep_map[dep].add(insn_id) + + del dep_map + + # {{{ utils + + def _add_to_no_sync_with(insn_id, new_no_sync_with_params): + insn = kernel.id_to_insn.get(insn_id) + insn = new_or_updated_instructions.get(insn_id, insn) + new_or_updated_instructions[insn_id] = ( + insn.copy( + no_sync_with=( + insn.no_sync_with | frozenset(new_no_sync_with_params)))) + + def _add_to_depends_on(insn_id, new_depends_on_params): + insn = kernel.id_to_insn.get(insn_id) + insn = new_or_updated_instructions.get(insn_id, insn) + new_or_updated_instructions[insn_id] = ( + insn.copy( + depends_on=insn.depends_on | frozenset(new_depends_on_params))) + + # }}} + + from loopy.kernel.instruction import CallInstruction, is_array_call + for insn in kernel.instructions: + if not isinstance(insn, CallInstruction): + continue + + if len(insn.assignees) <= 1: + continue + + if is_array_call(insn.assignees, insn.expression): + continue + + assignees = insn.assignees + assignee_var_names = insn.assignee_var_names() + + new_assignees = [assignees[0]] + newly_added_assignments_ids = set() + needs_replacement = False + + last_added_insn_id = insn.id + + FIRST_POINTER_ASSIGNEE_IDX = 1 # noqa + + for assignee_nr, assignee_var_name, assignee in zip( + range(FIRST_POINTER_ASSIGNEE_IDX, len(assignees)), + assignee_var_names[FIRST_POINTER_ASSIGNEE_IDX:], + assignees[FIRST_POINTER_ASSIGNEE_IDX:]): + + if ( + assignee_var_name in kernel.temporary_variables + and + (kernel.temporary_variables[assignee_var_name].address_space + == AddressSpace.PRIVATE)): + new_assignees.append(assignee) + continue + + needs_replacement = True + + # {{{ 
generate a new assignent instruction + + new_assignee_name = var_name_gen( + "{insn_id}_retval_{assignee_nr}" + .format(insn_id=insn.id, assignee_nr=assignee_nr)) + + new_assignment_id = insn_id_gen( + "{insn_id}_assign_retval_{assignee_nr}" + .format(insn_id=insn.id, assignee_nr=assignee_nr)) + + newly_added_assignments_ids.add(new_assignment_id) + + new_temporaries[new_assignee_name] = ( + TemporaryVariable( + name=new_assignee_name, + dtype=None, + address_space=AddressSpace.PRIVATE)) + + from pymbolic import var + new_assignee = var(new_assignee_name) + new_assignees.append(new_assignee) + + new_or_updated_instructions[new_assignment_id] = ( + make_assignment( + assignees=(assignee,), + expression=new_assignee, + id=new_assignment_id, + depends_on=frozenset([last_added_insn_id]), + depends_on_is_final=True, + no_sync_with=( + insn.no_sync_with | frozenset([(insn.id, "any")])), + predicates=insn.predicates, + within_inames=insn.within_inames)) + + last_added_insn_id = new_assignment_id + + # }}} + + if not needs_replacement: + continue + + # {{{ update originating instruction + + orig_insn = new_or_updated_instructions.get(insn.id, insn) + + new_or_updated_instructions[insn.id] = ( + orig_insn.copy(assignees=tuple(new_assignees))) + + _add_to_no_sync_with(insn.id, + [(id, "any") for id in newly_added_assignments_ids]) + + # }}} + + # {{{ squash spurious memory dependencies amongst new assignments + + for new_insn_id in newly_added_assignments_ids: + _add_to_no_sync_with(new_insn_id, + [(id, "any") + for id in newly_added_assignments_ids + if id != new_insn_id]) + + # }}} + + # {{{ update instructions that depend on the originating instruction + + for inverse_dep in inverse_dep_map[insn.id]: + _add_to_depends_on(inverse_dep, newly_added_assignments_ids) + + for insn_id, scope in ( + new_or_updated_instructions[inverse_dep].no_sync_with): + if insn_id == insn.id: + _add_to_no_sync_with( + inverse_dep, + [(id, scope) for id in newly_added_assignments_ids]) + + # 
}}} + + if not new_temporaries and not new_or_updated_instructions: + return kernel + + new_temporary_variables = kernel.temporary_variables.copy() + new_temporary_variables.update(new_temporaries) + + new_instructions = ( + list(new_or_updated_instructions.values()) + + list(insn + for insn in kernel.instructions + if insn.id not in new_or_updated_instructions)) + + return kernel.copy(temporary_variables=new_temporary_variables, + instructions=new_instructions) + +# }}} + + +# {{{ RealizeReductionCallbackMapper + +class RealizeReductionCallbackMapper(ReductionCallbackMapper): + def __init__(self, callback, callables_table): + super().__init__(callback) + self.callables_table = callables_table + + def map_reduction(self, expr, **kwargs): + result, self.callables_table = self.callback(expr, rec=self.rec, + **kwargs) + return result + + def map_if(self, expr, *, + callables_table, red_realize_ctx, + guarding_predicates, nresults): + + common_kwargs = dict( + callables_table=callables_table, + red_realize_ctx=red_realize_ctx, + nresults=nresults) + + import pymbolic.primitives as prim + rec_cond = self.rec( + expr.condition, + guarding_predicates=guarding_predicates, + **common_kwargs) + return prim.If(rec_cond, + self.rec(expr.then, + guarding_predicates=( + guarding_predicates + | frozenset([rec_cond])), + **common_kwargs), + self.rec(expr.else_, + guarding_predicates=( + guarding_predicates + | frozenset([prim.LogicalNot(rec_cond)])), + **common_kwargs)) + +# }}} + + +# {{{ helpers + +def _strip_if_scalar(reference, val): + if len(reference) == 1: + return val[0] + else: + return val + + +def _preprocess_scan_arguments( + red_realize_ctx, + expr, nresults, scan_iname, track_iname, + newly_generated_insn_id_set, + insn_id_gen): + """Does iname substitution within scan arguments and returns a set of values + suitable to be passed to the binary op. 
Returns a tuple.""" + + insn = red_realize_ctx.insn + + if nresults > 1: + inner_expr = expr + + # In the case of a multi-argument scan, we need a name for each of + # the arguments in order to pass them to the binary op - so we expand + # items that are not "plain" tuples here. + if not isinstance(inner_expr, tuple): + get_args_insn_id = insn_id_gen( + "{}_{}_get".format(insn.id, "_".join(expr.inames))) + + inner_expr = expand_inner_reduction( + red_realize_ctx=red_realize_ctx, + id=get_args_insn_id, + expr=inner_expr, + nresults=nresults, + depends_on=insn.depends_on, + within_inames=insn.within_inames | expr.inames, + within_inames_is_final=insn.within_inames_is_final, + predicates=insn.predicates, + ) + + newly_generated_insn_id_set.add(get_args_insn_id) + + updated_inner_exprs = tuple( + replace_var_within_expr( + red_realize_ctx.kernel, red_realize_ctx.var_name_gen, + sub_expr, scan_iname, track_iname) + for sub_expr in inner_expr) + else: + updated_inner_exprs = ( + replace_var_within_expr( + red_realize_ctx.kernel, red_realize_ctx.var_name_gen, + expr, scan_iname, track_iname),) + + return updated_inner_exprs + +# }}} + + +def expand_inner_reduction( + red_realize_ctx, id, expr, nresults, depends_on, within_inames, + within_inames_is_final, predicates): + # FIXME: use _make_temporaries + from pymbolic.primitives import Call + from loopy.symbolic import Reduction + assert isinstance(expr, (Call, Reduction)) + + temp_var_names = [ + red_realize_ctx.var_name_gen(id + "_arg" + str(i)) + for i in range(nresults)] + + for name in temp_var_names: + red_realize_ctx.additional_temporary_variables[name] = TemporaryVariable( + name=name, + shape=(), + dtype=None, + address_space=AddressSpace.PRIVATE) + + from pymbolic import var + temp_vars = tuple(var(n) for n in temp_var_names) + + call_insn = make_assignment( + id=id, + assignees=temp_vars, + expression=expr, + depends_on=depends_on, + within_inames=within_inames, + within_inames_is_final=within_inames_is_final, + 
predicates=predicates) + + red_realize_ctx.additional_insns.append(call_insn) + + return temp_vars + + +# {{{ reduction type: sequential + +def map_reduction_seq( + red_realize_ctx, expr, rec, callables_table, nresults, arg_dtypes, + reduction_dtypes, guarding_predicates): + orig_kernel = red_realize_ctx.orig_kernel + insn = red_realize_ctx.insn + + outer_insn_inames = red_realize_ctx.insn.within_inames + + acc_var_names = _make_temporaries( + red_realize_ctx=red_realize_ctx, + name_based_on="acc_"+"_".join(expr.inames), + nvars=nresults, + shape=(), + dtypes=reduction_dtypes, + address_space=AddressSpace.PRIVATE) + + init_insn_depends_on = frozenset() + + # check first that the original kernel had global barriers + # if not, we don't need to check. Since the function + # kernel_has_global_barriers is cached, we don't do + # extra work compared to not checking. + # FIXME: Explain why we care about global barriers here + if kernel_has_global_barriers(orig_kernel): + global_barrier = find_most_recent_global_barrier( + red_realize_ctx.kernel, + insn.id) + + if global_barrier is not None: + init_insn_depends_on |= frozenset([global_barrier]) + + from pymbolic import var + acc_vars = tuple(var(n) for n in acc_var_names) + + init_id = red_realize_ctx.insn_id_gen( + "{}_{}_init".format(insn.id, "_".join(expr.inames))) + + expression, callables_table = expr.operation.neutral_element( + *arg_dtypes, callables_table=callables_table, + target=red_realize_ctx.orig_kernel.target) + + init_insn = make_assignment( + id=init_id, + assignees=acc_vars, + within_inames=outer_insn_inames - frozenset(expr.inames), + within_inames_is_final=insn.within_inames_is_final, + depends_on=init_insn_depends_on, + expression=expression, + + # Do not inherit predicates: Those might read variables + # that may not yet be set, and we don't have a great way + # of figuring out what the dependencies of the accumulator + # initializer should be. 
+ + # This way, we may initialize a few too many accumulators, + # but that's better than being incorrect. + # https://github.com/inducer/loopy/issues/231 + ) + + red_realize_ctx.additional_insns.append(init_insn) + + update_id = red_realize_ctx.insn_id_gen( + based_on="{}_{}_update".format(insn.id, "_".join(expr.inames))) + + update_insn_iname_deps = insn.within_inames | set(expr.inames) + if insn.within_inames_is_final: + update_insn_iname_deps = insn.within_inames | set(expr.inames) + + reduction_insn_depends_on = {init_id} + + # In the case of a multi-argument reduction, we need a name for each of + # the arguments in order to pass them to the binary op - so we expand + # items that are not "plain" tuples here. + if nresults > 1 and not isinstance(expr.expr, tuple): + get_args_insn_id = red_realize_ctx.insn_id_gen( + "{}_{}_get".format(insn.id, "_".join(expr.inames))) + + reduction_expr = expand_inner_reduction( + red_realize_ctx=red_realize_ctx, + id=get_args_insn_id, + expr=expr.expr, + nresults=nresults, + depends_on=insn.depends_on, + within_inames=update_insn_iname_deps, + within_inames_is_final=insn.within_inames_is_final, + predicates=guarding_predicates, + ) + + reduction_insn_depends_on.add(get_args_insn_id) + else: + reduction_expr = expr.expr + + expression, callables_table = expr.operation( + arg_dtypes, + _strip_if_scalar(acc_vars, acc_vars), + reduction_expr, + callables_table, + orig_kernel.target) + + reduction_insn = make_assignment( + id=update_id, + assignees=acc_vars, + expression=expression, + depends_on=frozenset(reduction_insn_depends_on) | insn.depends_on, + within_inames=update_insn_iname_deps, + within_inames_is_final=insn.within_inames_is_final, + predicates=guarding_predicates,) + + red_realize_ctx.additional_insns.append(reduction_insn) + + red_realize_ctx.new_insn_add_depends_on.add(reduction_insn.id) + + if nresults == 1: + assert len(acc_vars) == 1 + return acc_vars[0], callables_table + else: + return acc_vars, callables_table + 
+# }}} + + +# {{{ reduction type: local-parallel + +def _get_int_iname_size(kernel, iname): + from loopy.isl_helpers import static_max_of_pw_aff + from loopy.symbolic import pw_aff_to_expr + size = pw_aff_to_expr( + static_max_of_pw_aff( + kernel.get_iname_bounds(iname).size, + constants_only=True)) + assert isinstance(size, int) + return size + + +def _make_slab_set(iname, size): + v = isl.make_zero_and_vars([iname]) + bs, = ( + v[0].le_set(v[iname]) + & + v[iname].lt_set(v[0] + size)).get_basic_sets() + return bs + + +def _make_slab_set_from_range(iname, lbound, ubound): + v = isl.make_zero_and_vars([iname]) + bs, = ( + v[iname].ge_set(v[0] + lbound) + & + v[iname].lt_set(v[0] + ubound)).get_basic_sets() + return bs + + +def map_reduction_local( + red_realize_ctx, + expr, rec, callables_table, nresults, arg_dtypes, + reduction_dtypes, guarding_predicates): + orig_kernel = red_realize_ctx.orig_kernel + insn = red_realize_ctx.insn + + red_iname, = expr.inames + + size = _get_int_iname_size(orig_kernel, red_iname) + + outer_insn_inames = insn.within_inames + + from loopy.kernel.data import LocalInameTagBase + outer_local_inames = tuple(oiname for oiname in outer_insn_inames + if orig_kernel.iname_tags_of_type(oiname, LocalInameTagBase)) + + from pymbolic import var + outer_local_iname_vars = tuple( + var(oiname) for oiname in outer_local_inames) + + outer_local_iname_sizes = tuple( + _get_int_iname_size(orig_kernel, oiname) + for oiname in outer_local_inames) + + neutral_var_names = _make_temporaries( + red_realize_ctx=red_realize_ctx, + name_based_on="neutral_"+red_iname, + nvars=nresults, + shape=(), + dtypes=reduction_dtypes, + address_space=AddressSpace.PRIVATE) + + acc_var_names = _make_temporaries( + red_realize_ctx=red_realize_ctx, + name_based_on="acc_"+red_iname, + nvars=nresults, + shape=outer_local_iname_sizes + (size,), + dtypes=reduction_dtypes, + address_space=AddressSpace.LOCAL) + + acc_vars = tuple(var(n) for n in acc_var_names) + + # {{{ add 
separate iname to carry out the reduction + + # Doing this sheds any odd conditionals that may be active + # on our red_iname. + + base_exec_iname = red_realize_ctx.var_name_gen("red_"+red_iname) + red_realize_ctx.domains.append(_make_slab_set(base_exec_iname, size)) + red_realize_ctx.additional_iname_tags[base_exec_iname] \ + = orig_kernel.iname_tags(red_iname) + + # }}} + + base_iname_deps = outer_insn_inames - frozenset(expr.inames) + + neutral, callables_table = expr.operation.neutral_element(*arg_dtypes, + callables_table=callables_table, target=orig_kernel.target) + init_id = red_realize_ctx.insn_id_gen(f"{insn.id}_{red_iname}_init") + init_insn = make_assignment( + id=init_id, + assignees=tuple( + acc_var[outer_local_iname_vars + (var(base_exec_iname),)] + for acc_var in acc_vars), + expression=neutral, + within_inames=base_iname_deps | frozenset([base_exec_iname]), + within_inames_is_final=insn.within_inames_is_final, + depends_on=frozenset(), + # Do not inherit predicates: Those might read variables + # that may not yet be set, and we don't have a great way + # of figuring out what the dependencies of the accumulator + # initializer should be. + + # This way, we may initialize a few too many accumulators, + # but that's better than being incorrect. 
+ # https://github.com/inducer/loopy/issues/231 + ) + red_realize_ctx.additional_insns.append(init_insn) + + init_neutral_id = red_realize_ctx.insn_id_gen( + f"{insn.id}_{red_iname}_init_neutral") + init_neutral_insn = make_assignment( + id=init_neutral_id, + assignees=tuple(var(nvn) for nvn in neutral_var_names), + expression=neutral, + within_inames=base_iname_deps | frozenset([base_exec_iname]), + within_inames_is_final=insn.within_inames_is_final, + depends_on=frozenset(), + predicates=guarding_predicates, + ) + red_realize_ctx.additional_insns.append(init_neutral_insn) + + transfer_depends_on = {init_neutral_id, init_id} + + # In the case of a multi-argument reduction, we need a name for each of + # the arguments in order to pass them to the binary op - so we expand + # items that are not "plain" tuples here. + if nresults > 1 and not isinstance(expr.expr, tuple): + get_args_insn_id = red_realize_ctx.insn_id_gen( + f"{insn.id}_{red_iname}_get") + + reduction_expr = expand_inner_reduction( + red_realize_ctx=red_realize_ctx, + id=get_args_insn_id, + expr=expr.expr, + nresults=nresults, + depends_on=insn.depends_on, + within_inames=( + (outer_insn_inames - frozenset(expr.inames)) + | frozenset([red_iname])), + within_inames_is_final=insn.within_inames_is_final, + predicates=guarding_predicates, + ) + + transfer_depends_on.add(get_args_insn_id) + else: + reduction_expr = expr.expr + + transfer_id = red_realize_ctx.insn_id_gen(f"{insn.id}_{red_iname}_transfer") + expression, callables_table = expr.operation( + arg_dtypes, + _strip_if_scalar( + neutral_var_names, + tuple(var(nvn) for nvn in neutral_var_names)), + reduction_expr, + callables_table, + orig_kernel.target) + transfer_insn = make_assignment( + id=transfer_id, + assignees=tuple( + acc_var[outer_local_iname_vars + (var(red_iname),)] + for acc_var in acc_vars), + expression=expression, + within_inames=( + (outer_insn_inames - frozenset(expr.inames)) + | frozenset([red_iname])), + 
within_inames_is_final=insn.within_inames_is_final, + depends_on=frozenset([init_id, init_neutral_id]) | insn.depends_on, + no_sync_with=frozenset([(init_id, "any")]), + predicates=insn.predicates, + ) + red_realize_ctx.additional_insns.append(transfer_insn) + + cur_size = 1 + while cur_size < size: + cur_size *= 2 + + prev_id = transfer_id + bound = size + + stage_exec_iname = None + + istage = 0 + while cur_size > 1: + + new_size = cur_size // 2 + assert new_size * 2 == cur_size + + stage_exec_iname = red_realize_ctx.var_name_gen( + "red_%s_s%d" % (red_iname, istage)) + red_realize_ctx.domains.append( + _make_slab_set(stage_exec_iname, bound-new_size)) + red_realize_ctx.additional_iname_tags[stage_exec_iname] \ + = orig_kernel.iname_tags(red_iname) + + stage_id = red_realize_ctx.insn_id_gen( + "red_%s_stage_%d" % (red_iname, istage)) + + expression, callables_table = expr.operation( + arg_dtypes, + _strip_if_scalar(acc_vars, tuple( + acc_var[ + outer_local_iname_vars + (var(stage_exec_iname),)] + for acc_var in acc_vars)), + _strip_if_scalar(acc_vars, tuple( + acc_var[ + outer_local_iname_vars + ( + var(stage_exec_iname) + new_size,)] + for acc_var in acc_vars)), + callables_table, + orig_kernel.target) + + stage_insn = make_assignment( + id=stage_id, + assignees=tuple( + acc_var[outer_local_iname_vars + (var(stage_exec_iname),)] + for acc_var in acc_vars), + expression=expression, + within_inames=( + base_iname_deps | frozenset([stage_exec_iname])), + within_inames_is_final=insn.within_inames_is_final, + depends_on=frozenset([prev_id]), + predicates=insn.predicates, + ) + + red_realize_ctx.additional_insns.append(stage_insn) + prev_id = stage_id + + cur_size = new_size + bound = cur_size + istage += 1 + + red_realize_ctx.new_insn_add_depends_on.add(prev_id) + red_realize_ctx.new_insn_add_no_sync_with.add((prev_id, "any")) + red_realize_ctx.new_insn_add_within_inames.add( + stage_exec_iname or base_exec_iname) + + if nresults == 1: + assert len(acc_vars) == 1 + 
return acc_vars[0][outer_local_iname_vars + (0,)], callables_table + else: + return [acc_var[outer_local_iname_vars + (0,)] for acc_var in + acc_vars], callables_table +# }}} + + +# {{{ utils (stateful) + +@memoize_on_first_arg +def _get_or_add_sweep_tracking_iname_and_domain( + red_realize_ctx, + scan_iname, sweep_iname, sweep_min_value, scan_min_value, stride, + tracking_iname): + kernel = red_realize_ctx.kernel + + domain = kernel.get_inames_domain(frozenset((scan_iname, sweep_iname))) + + red_realize_ctx.inames_added_for_scan.add(tracking_iname) + + new_domain = _create_domain_for_sweep_tracking(domain, + tracking_iname, sweep_iname, sweep_min_value, scan_min_value, stride) + + _insert_subdomain_into_domain_tree(kernel, red_realize_ctx.domains, new_domain) + + return tracking_iname + + +def replace_var_within_expr(kernel, var_name_gen, expr, from_var, to_var): + from pymbolic.mapper.substitutor import make_subst_func + + from loopy.symbolic import ( + SubstitutionRuleMappingContext, RuleAwareSubstitutionMapper) + + rule_mapping_context = SubstitutionRuleMappingContext( + kernel.substitutions, var_name_gen) + + from pymbolic import var + mapper = RuleAwareSubstitutionMapper( + rule_mapping_context, + make_subst_func({from_var: var(to_var)}), + within=lambda *args: True) + + return mapper(expr, kernel, None) + + +def _make_temporaries( + red_realize_ctx, name_based_on, nvars, shape, dtypes, address_space): + var_names = [ + red_realize_ctx.var_name_gen(name_based_on.format(index=i)) + for i in range(nvars)] + + from loopy.kernel.data import TemporaryVariable + + for name, dtype in zip(var_names, dtypes): + red_realize_ctx.additional_temporary_variables[name] = TemporaryVariable( + name=name, + shape=shape, + dtype=dtype, + address_space=address_space) + + return var_names + +# }}} + + +# {{{ reduction type: sequential scan + +def map_scan_seq( + red_realize_ctx, + expr, rec, callables_table, nresults, arg_dtypes, + reduction_dtypes, sweep_iname, scan_iname, 
sweep_min_value, + scan_min_value, stride, guarding_predicates): + insn = red_realize_ctx.insn + + outer_insn_inames = insn.within_inames + + track_iname = red_realize_ctx.var_name_gen( + "{sweep_iname}__seq_scan" + .format(sweep_iname=sweep_iname)) + + _get_or_add_sweep_tracking_iname_and_domain( + red_realize_ctx, + scan_iname, sweep_iname, sweep_min_value, scan_min_value, + stride, track_iname) + + from loopy.kernel.data import AddressSpace + acc_var_names = _make_temporaries( + red_realize_ctx=red_realize_ctx, + name_based_on="acc_" + scan_iname, + nvars=nresults, + shape=(), + dtypes=reduction_dtypes, + address_space=AddressSpace.PRIVATE) + + from pymbolic import var + acc_vars = tuple(var(n) for n in acc_var_names) + + init_id = red_realize_ctx.insn_id_gen( + "{}_{}_init".format(insn.id, "_".join(expr.inames))) + + init_insn_depends_on = frozenset() + + # FIXME: Explain why we care about global barriers here + if kernel_has_global_barriers(red_realize_ctx.orig_kernel): + global_barrier = find_most_recent_global_barrier( + red_realize_ctx.kernel, insn.id) + + if global_barrier is not None: + init_insn_depends_on |= frozenset([global_barrier]) + + expression, callables_table = expr.operation.neutral_element( + *arg_dtypes, callables_table=callables_table, + target=red_realize_ctx.orig_kernel.target) + + init_insn = make_assignment( + id=init_id, + assignees=acc_vars, + within_inames=outer_insn_inames - frozenset( + (sweep_iname,) + expr.inames), + within_inames_is_final=insn.within_inames_is_final, + depends_on=init_insn_depends_on, + expression=expression, + # Do not inherit predicates: Those might read variables + # that may not yet be set, and we don't have a great way + # of figuring out what the dependencies of the accumulator + # initializer should be. + + # This way, we may initialize a few too many accumulators, + # but that's better than being incorrect. 
+ # https://github.com/inducer/loopy/issues/231 + ) + + red_realize_ctx.additional_insns.append(init_insn) + + update_insn_depends_on = {init_insn.id} | insn.depends_on + + updated_inner_exprs = _preprocess_scan_arguments( + red_realize_ctx, + expr.expr, nresults, + scan_iname, track_iname, update_insn_depends_on, + insn_id_gen=red_realize_ctx.insn_id_gen) + + update_id = red_realize_ctx.insn_id_gen( + based_on="{}_{}_update".format(insn.id, "_".join(expr.inames))) + + update_insn_iname_deps = insn.within_inames | {track_iname} + if insn.within_inames_is_final: + update_insn_iname_deps = insn.within_inames | {track_iname} + + expression, callables_table = expr.operation( + arg_dtypes, + _strip_if_scalar(acc_vars, acc_vars), + _strip_if_scalar(acc_vars, updated_inner_exprs), + callables_table, + red_realize_ctx.orig_kernel.target) + + scan_insn = make_assignment( + id=update_id, + assignees=acc_vars, + expression=expression, + depends_on=frozenset(update_insn_depends_on), + within_inames=update_insn_iname_deps, + no_sync_with=insn.no_sync_with, + within_inames_is_final=insn.within_inames_is_final, + predicates=guarding_predicates, + ) + + red_realize_ctx.additional_insns.append(scan_insn) + red_realize_ctx.new_insn_add_depends_on.add(scan_insn.id) + + if nresults == 1: + assert len(acc_vars) == 1 + return acc_vars[0], callables_table + else: + return acc_vars, callables_table + +# }}} + + +# {{{ reduction type: local-parallel scan + +def map_scan_local( + red_realize_ctx, + expr, rec, callables_table, nresults, arg_dtypes, + reduction_dtypes, sweep_iname, scan_iname, sweep_min_value, + scan_min_value, stride, guarding_predicates): + + orig_kernel = red_realize_ctx.orig_kernel + insn = red_realize_ctx.insn + + scan_size = _get_int_iname_size(orig_kernel, sweep_iname) + + assert scan_size > 0 + + if scan_size == 1: + return map_reduction_seq(red_realize_ctx, + expr, rec, callables_table, + nresults, arg_dtypes, reduction_dtypes, + guarding_predicates) + + 
outer_insn_inames = insn.within_inames + + from loopy.kernel.data import LocalInameTagBase + outer_local_inames = tuple(oiname for oiname in outer_insn_inames + if orig_kernel.iname_tags_of_type(oiname, LocalInameTagBase) + and oiname != sweep_iname) + + from pymbolic import var + outer_local_iname_vars = tuple( + var(oiname) for oiname in outer_local_inames) + + outer_local_iname_sizes = tuple( + _get_int_iname_size(orig_kernel, oiname) + for oiname in outer_local_inames) + + track_iname = red_realize_ctx.var_name_gen( + "{sweep_iname}__pre_scan" + .format(sweep_iname=sweep_iname)) + + _get_or_add_sweep_tracking_iname_and_domain( + red_realize_ctx, + scan_iname, sweep_iname, sweep_min_value, scan_min_value, stride, + track_iname) + + # {{{ add separate iname to carry out the scan + + # Doing this sheds any odd conditionals that may be active + # on our scan_iname. + + base_exec_iname = red_realize_ctx.var_name_gen(sweep_iname + "__scan") + red_realize_ctx.domains.append(_make_slab_set(base_exec_iname, scan_size)) + red_realize_ctx.additional_iname_tags[base_exec_iname] \ + = orig_kernel.iname_tags(sweep_iname) + + # }}} + + read_var_names = _make_temporaries( + red_realize_ctx=red_realize_ctx, + name_based_on="read_"+scan_iname+"_arg_{index}", + nvars=nresults, + shape=(), + dtypes=reduction_dtypes, + address_space=AddressSpace.PRIVATE) + + acc_var_names = _make_temporaries( + red_realize_ctx=red_realize_ctx, + name_based_on="acc_"+scan_iname, + nvars=nresults, + shape=outer_local_iname_sizes + (scan_size,), + dtypes=reduction_dtypes, + address_space=AddressSpace.LOCAL) + + acc_vars = tuple(var(n) for n in acc_var_names) + read_vars = tuple(var(n) for n in read_var_names) + + base_iname_deps = (outer_insn_inames + - frozenset(expr.inames) - frozenset([sweep_iname])) + + neutral, callables_table = expr.operation.neutral_element( + *arg_dtypes, callables_table=callables_table, + target=orig_kernel.target) + + init_insn_depends_on = insn.depends_on + + # FIXME: 
Explain why we care about global barriers here + if kernel_has_global_barriers(orig_kernel): + global_barrier = find_most_recent_global_barrier( + red_realize_ctx.kernel, insn.id) + + if global_barrier is not None: + init_insn_depends_on |= frozenset([global_barrier]) + + init_id = red_realize_ctx.insn_id_gen(f"{insn.id}_{scan_iname}_init") + init_insn = make_assignment( + id=init_id, + assignees=tuple( + acc_var[outer_local_iname_vars + (var(base_exec_iname),)] + for acc_var in acc_vars), + expression=neutral, + within_inames=base_iname_deps | frozenset([base_exec_iname]), + within_inames_is_final=insn.within_inames_is_final, + depends_on=init_insn_depends_on, + # Do not inherit predicates: Those might read variables + # that may not yet be set, and we don't have a great way + # of figuring out what the dependencies of the accumulator + # initializer should be. + + # This way, we may initialize a few too many accumulators, + # but that's better than being incorrect. + # https://github.com/inducer/loopy/issues/231 + ) + red_realize_ctx.additional_insns.append(init_insn) + + transfer_insn_depends_on = {init_insn.id} | insn.depends_on + + updated_inner_exprs = _preprocess_scan_arguments( + red_realize_ctx, + expr.expr, nresults, + scan_iname, track_iname, transfer_insn_depends_on, + insn_id_gen=red_realize_ctx.insn_id_gen) + + from loopy.symbolic import Reduction + + from loopy.symbolic import pw_aff_to_expr + sweep_min_value_expr = pw_aff_to_expr(sweep_min_value) + + transfer_id = red_realize_ctx.insn_id_gen(f"{insn.id}_{scan_iname}_transfer") + transfer_insn = make_assignment( + id=transfer_id, + assignees=tuple( + acc_var[outer_local_iname_vars + + (var(sweep_iname) - sweep_min_value_expr,)] + for acc_var in acc_vars), + expression=Reduction( + operation=expr.operation, + inames=(track_iname,), + expr=_strip_if_scalar(acc_vars, updated_inner_exprs), + allow_simultaneous=False, + ), + within_inames=outer_insn_inames - frozenset(expr.inames), + 
within_inames_is_final=insn.within_inames_is_final, + depends_on=frozenset(transfer_insn_depends_on), + no_sync_with=frozenset([(init_id, "any")]) | insn.no_sync_with, + predicates=insn.predicates, + ) + + red_realize_ctx.additional_insns.append(transfer_insn) + + prev_id = transfer_id + + istage = 0 + cur_size = 1 + + while cur_size < scan_size: + stage_exec_iname = red_realize_ctx.var_name_gen( + "%s__scan_s%d" % (sweep_iname, istage)) + red_realize_ctx.domains.append( + _make_slab_set_from_range(stage_exec_iname, cur_size, scan_size)) + red_realize_ctx.additional_iname_tags[stage_exec_iname] \ + = orig_kernel.iname_tags(sweep_iname) + + for read_var, acc_var in zip(read_vars, acc_vars): + read_stage_id = red_realize_ctx.insn_id_gen( + "scan_%s_read_stage_%d" % (scan_iname, istage)) + + read_stage_insn = make_assignment( + id=read_stage_id, + assignees=(read_var,), + expression=( + acc_var[ + outer_local_iname_vars + + (var(stage_exec_iname) - cur_size,)]), + within_inames=( + base_iname_deps | frozenset([stage_exec_iname])), + within_inames_is_final=insn.within_inames_is_final, + depends_on=frozenset([prev_id]), + predicates=insn.predicates, + ) + + if cur_size == 1: + # Performance hack: don't add a barrier here with transfer_insn. + # NOTE: This won't work if the way that local inames + # are lowered changes. 
+ read_stage_insn = read_stage_insn.copy( + no_sync_with=( + read_stage_insn.no_sync_with + | frozenset([(transfer_id, "any")]))) + + red_realize_ctx.additional_insns.append(read_stage_insn) + prev_id = read_stage_id + + write_stage_id = red_realize_ctx.insn_id_gen( + "scan_%s_write_stage_%d" % (scan_iname, istage)) + + expression, callables_table = expr.operation( + arg_dtypes, + _strip_if_scalar(acc_vars, read_vars), + _strip_if_scalar(acc_vars, tuple( + acc_var[ + outer_local_iname_vars + (var(stage_exec_iname),)] + for acc_var in acc_vars)), + callables_table, + orig_kernel.target) + + write_stage_insn = make_assignment( + id=write_stage_id, + assignees=tuple( + acc_var[outer_local_iname_vars + (var(stage_exec_iname),)] + for acc_var in acc_vars), + expression=expression, + within_inames=( + base_iname_deps | frozenset([stage_exec_iname])), + within_inames_is_final=insn.within_inames_is_final, + depends_on=frozenset([prev_id]), + predicates=insn.predicates, + ) + + red_realize_ctx.additional_insns.append(write_stage_insn) + prev_id = write_stage_id + + cur_size *= 2 + istage += 1 + + red_realize_ctx.new_insn_add_depends_on.add(prev_id) + red_realize_ctx.new_insn_add_within_inames.add(sweep_iname) + + output_idx = var(sweep_iname) - sweep_min_value_expr + + if nresults == 1: + assert len(acc_vars) == 1 + return (acc_vars[0][outer_local_iname_vars + (output_idx,)], + callables_table) + else: + return [acc_var[outer_local_iname_vars + (output_idx,)] + for acc_var in acc_vars], callables_table + +# }}} + + +# {{{ top-level dispatch among reduction types + +def map_reduction( + expr, *, rec, + callables_table, red_realize_ctx, + guarding_predicates, nresults): + insn = red_realize_ctx.insn + + # Only expand one level of reduction at a time, going from outermost to + # innermost. Otherwise we get the (iname + insn) dependencies wrong. 
+ + from loopy.type_inference import ( + infer_arg_and_reduction_dtypes_for_reduction_expression) + arg_dtypes, reduction_dtypes = ( + infer_arg_and_reduction_dtypes_for_reduction_expression( + red_realize_ctx.kernel, expr, callables_table, + red_realize_ctx.unknown_types_ok)) + + outer_insn_inames = insn.within_inames + bad_inames = frozenset(expr.inames) & outer_insn_inames + if bad_inames: + raise LoopyError("reduction used within loop(s) that it was " + "supposed to reduce over: " + ", ".join(bad_inames)) + + iname_classes = _classify_reduction_inames(red_realize_ctx.kernel, expr.inames) + + n_sequential = len(iname_classes.sequential) + n_local_par = len(iname_classes.local_parallel) + n_nonlocal_par = len(iname_classes.nonlocal_parallel) + + really_force_scan = red_realize_ctx.force_scan and ( + len(expr.inames) != 1 + or expr.inames[0] not in red_realize_ctx.inames_added_for_scan) + + def _error_if_force_scan_on(cls, msg): + if really_force_scan: + raise cls(msg) + + may_be_implemented_as_scan = False + if red_realize_ctx.force_scan or red_realize_ctx.automagic_scans_ok: + try: + # Try to determine scan candidate information (sweep iname, scan + # iname, etc). + scan_param = _try_infer_scan_candidate_from_expr( + red_realize_ctx.kernel, expr, outer_insn_inames, + sweep_iname=red_realize_ctx.force_outer_iname_for_scan) + + except ValueError as v: + error = str(v) + + else: + # Ensures the reduction is triangular (somewhat expensive). + may_be_implemented_as_scan, error = _check_reduction_is_triangular( + red_realize_ctx.kernel, expr, scan_param) + + if not may_be_implemented_as_scan: + _error_if_force_scan_on(ReductionIsNotTriangularError, error) + + # {{{ sanity checks + + if n_local_par and n_sequential: + raise LoopyError("Reduction over '%s' contains both parallel and " + "sequential inames. It must be split " + "(using split_reduction_{in,out}ward) " + "before code generation." 
+ % ", ".join(expr.inames)) + + if n_local_par > 1: + raise LoopyError("Reduction over '%s' contains more than" + "one parallel iname. It must be split " + "(using split_reduction_{in,out}ward) " + "before code generation." + % ", ".join(expr.inames)) + + if n_nonlocal_par: + bad_inames = iname_classes.nonlocal_parallel + raise LoopyError("the only form of parallelism supported " + "by reductions is 'local'--found iname(s) '%s' " + "respectively tagged '%s'" + % (", ".join(bad_inames), + ", ".join(str(red_realize_ctx.orig_kernel.iname_tags(iname)) + for iname in bad_inames))) + + # }}} + + red_realize_ctx.changes_made() + + if n_local_par == 0 and n_sequential == 0: + warn_with_kernel(red_realize_ctx.kernel, "empty_reduction", + "Empty reduction found (no inames to reduce over). " + "Eliminating.") + + # We're not supposed to reduce/sum at all. (Note how this is distinct + # from an empty reduction--there is an element here, just no inames + # to reduce over. It's rather similar to an array with () shape in + # numpy.) + + return expr.expr, callables_table + + if may_be_implemented_as_scan: + assert red_realize_ctx.force_scan or red_realize_ctx.automagic_scans_ok + + # We require the "scan" iname to be tagged sequential. + if n_sequential: + sweep_iname = scan_param.sweep_iname + sweep_class = _classify_reduction_inames( + red_realize_ctx.orig_kernel, (sweep_iname,)) + + sequential = sweep_iname in sweep_class.sequential + parallel = sweep_iname in sweep_class.local_parallel + bad_parallel = sweep_iname in sweep_class.nonlocal_parallel + + if sweep_iname not in outer_insn_inames: + _error_if_force_scan_on(LoopyError, + "Sweep iname '%s' was detected, but is not an iname " + "for the instruction." % sweep_iname) + elif bad_parallel: + _error_if_force_scan_on(LoopyError, + "Sweep iname '%s' has an unsupported parallel tag '%s' " + "- the only parallelism allowed is 'local'." 
% + (sweep_iname, + ", ".join(tag.key + for tag in red_realize_ctx.kernel.iname_tags(sweep_iname)))) + elif parallel: + return map_scan_local( + red_realize_ctx, + expr, rec, callables_table, nresults, + arg_dtypes, reduction_dtypes, + sweep_iname, scan_param.scan_iname, + scan_param.sweep_lower_bound, + scan_param.scan_lower_bound, + scan_param.stride, + guarding_predicates) + elif sequential: + return map_scan_seq( + red_realize_ctx, + expr, rec, callables_table, nresults, + arg_dtypes, reduction_dtypes, sweep_iname, + scan_param.scan_iname, + scan_param.sweep_lower_bound, + scan_param.scan_lower_bound, + scan_param.stride, + guarding_predicates) + + # fallthrough to reduction implementation + + else: + assert n_local_par > 0 + scan_iname, = expr.inames + _error_if_force_scan_on(LoopyError, + "Scan iname '%s' is parallel tagged: this is not allowed " + "(only the sweep iname should be tagged if parallelism " + "is desired)." % scan_iname) + + # fallthrough to reduction implementation + + if n_sequential: + assert n_local_par == 0 + return map_reduction_seq( + red_realize_ctx, + expr, rec, callables_table, + nresults, arg_dtypes, reduction_dtypes, + guarding_predicates) + else: + assert n_local_par > 0 + return map_reduction_local( + red_realize_ctx, + expr, rec, callables_table, nresults, arg_dtypes, + reduction_dtypes, guarding_predicates) + +# }}} + + +# {{{ realize_reduction_for_single_kernel + +# @remove_any_newly_unused_inames +def realize_reduction_for_single_kernel(kernel, callables_table, + insn_id_filter=None, unknown_types_ok=True, automagic_scans_ok=False, + force_scan=False, force_outer_iname_for_scan=None): + logger.debug("%s: realize reduction" % kernel.name) + + orig_kernel = kernel + + finished_insns = [] + + insn_id_gen = kernel.get_instruction_id_generator() + var_name_gen = kernel.get_var_name_generator() + + cb_mapper = RealizeReductionCallbackMapper(map_reduction, callables_table) + + insn_queue = kernel.instructions[:] + domains = 
kernel.domains[:] + + inames_added_for_scan = set() + + kernel_changed = False + + while insn_queue: + insn = insn_queue.pop(0) + + red_realize_ctx = _ReductionRealizationContext( + force_scan=force_scan, + automagic_scans_ok=automagic_scans_ok, + unknown_types_ok=unknown_types_ok, + force_outer_iname_for_scan=force_outer_iname_for_scan, + + orig_kernel=orig_kernel, + kernel=kernel, + insn=insn, + + insn_id_gen=insn_id_gen, + var_name_gen=var_name_gen, + + additional_temporary_variables={}, + additional_insns=[], + domains=domains, + additional_iname_tags={}, + + inames_added_for_scan=inames_added_for_scan, + + new_insn_add_depends_on=set(), + new_insn_add_no_sync_with=set(), + new_insn_add_within_inames=set(), + + were_changes_made=False, + ) + + if insn_id_filter is not None and insn.id != insn_id_filter \ + or not isinstance(insn, MultiAssignmentBase): + finished_insns.append(insn) + continue + + nresults = len(insn.assignees) + + # Run reduction expansion. + from loopy.symbolic import Reduction + if isinstance(insn.expression, Reduction) and nresults > 1: + new_expressions = cb_mapper(insn.expression, + callables_table=cb_mapper.callables_table, + red_realize_ctx=red_realize_ctx, + guarding_predicates=insn.predicates, + nresults=nresults) + else: + new_expressions = cb_mapper(insn.expression, + callables_table=cb_mapper.callables_table, + red_realize_ctx=red_realize_ctx, + guarding_predicates=insn.predicates, + nresults=1), + + if red_realize_ctx.were_changes_made: + # An expansion happened, so insert the generated stuff plus + # ourselves back into the queue. 
+ + # {{{ apply changes + + kernel_changed = True + + insn_id_replacements = {} + + result_assignment_dep_on = ( + insn.depends_on + | frozenset(red_realize_ctx.new_insn_add_depends_on)) + kwargs = insn.get_copy_kwargs( + no_sync_with=insn.no_sync_with + | frozenset(red_realize_ctx.new_insn_add_no_sync_with), + within_inames=( + insn.within_inames + | red_realize_ctx.new_insn_add_within_inames)) + + kwargs.pop("id") + kwargs.pop("depends_on") + kwargs.pop("expression") + kwargs.pop("assignee", None) + kwargs.pop("assignees", None) + kwargs.pop("temp_var_type", None) + kwargs.pop("temp_var_types", None) + + if isinstance(insn.expression, Reduction) and nresults > 1: + result_assignment_ids = [ + insn_id_gen(insn.id) for i in range(nresults)] + replacement_insns = [ + Assignment( + id=result_assignment_ids[i], + depends_on=( + result_assignment_dep_on + | (frozenset([result_assignment_ids[i-1]]) + if i else frozenset())), + assignee=assignee, + expression=new_expr, + **kwargs) + for i, (assignee, new_expr) in enumerate(zip( + insn.assignees, new_expressions))] + + insn_id_replacements[insn.id] = [ + rinsn.id for rinsn in replacement_insns] + else: + new_expr, = new_expressions + # since we are replacing the instruction with + # only one instruction, there's no need to replace id + replacement_insns = [ + make_assignment( + id=insn.id, + depends_on=result_assignment_dep_on, + assignees=insn.assignees, + expression=new_expr, + **kwargs) + ] + + insn_queue = ( + red_realize_ctx.additional_insns + + replacement_insns + + insn_queue) + + # The reduction expander needs an up-to-date kernel + # object to find dependencies. Keep kernel up-to-date. 
+ new_temporary_variables = kernel.temporary_variables.copy() + new_temporary_variables.update( + red_realize_ctx.additional_temporary_variables) + + finished_insns = [ + replace_instruction_ids_in_insn(insn, insn_id_replacements) + for insn in finished_insns] + insn_queue = [ + replace_instruction_ids_in_insn(insn, insn_id_replacements) + for insn in insn_queue] + + kernel = kernel.copy( + instructions=finished_insns + insn_queue, + temporary_variables=new_temporary_variables, + domains=domains) + from loopy.transform.iname import tag_inames + kernel = tag_inames(kernel, red_realize_ctx.additional_iname_tags) + + del insn_id_replacements + + # }}} + + else: + # nothing happened, we're done with insn + assert not red_realize_ctx.new_insn_add_depends_on + + finished_insns.append(insn) + + if kernel_changed: + kernel = kernel.copy(instructions=finished_insns) + else: + return orig_kernel, callables_table + + kernel = _hackily_ensure_multi_assignment_return_values_are_scoped_private( + kernel) + + return kernel, cb_mapper.callables_table + +# }}} + + +def realize_reduction(t_unit, *args, **kwargs): + """Rewrites reductions into their imperative form. With *insn_id_filter* + specified, operate only on the instruction with an instruction id matching + *insn_id_filter*. + + If *insn_id_filter* is given, only the outermost level of reductions will be + expanded, inner reductions will be left alone (because they end up in a new + instruction with a different ID, which doesn't match the filter). + + If *insn_id_filter* is not given, all reductions in all instructions will + be realized. + + If *automagic_scans_ok*, this function will attempt to rewrite triangular + reductions as scans automatically. + + If *force_scan* is *True*, this function will attempt to rewrite *all* + candidate reductions as scans and raise an error if this is not possible + (this is most useful combined with *insn_id_filter*). 
+ + If *force_outer_iname_for_scan* is not *None*, this function will attempt + to realize candidate reductions as scans using the specified iname as the + outer (sweep) iname. + """ + + assert isinstance(t_unit, TranslationUnit) + + callables_table = dict(t_unit.callables_table) + kernels_to_scan = [in_knl_callable.subkernel + for in_knl_callable in t_unit.callables_table.values() + if isinstance(in_knl_callable, CallableKernel)] + + for knl in kernels_to_scan: + new_knl, callables_table = realize_reduction_for_single_kernel( + knl, callables_table, *args, **kwargs) + in_knl_callable = callables_table[knl.name].copy( + subkernel=new_knl) + callables_table[knl.name] = in_knl_callable + + return t_unit.copy(callables_table=callables_table) + +# vim: foldmethod=marker From f51acad8aba6ffd04454626c15d76081b0a8ff20 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Tue, 8 Feb 2022 18:32:06 -0600 Subject: [PATCH 16/27] Fix stringification of arg{min,max} --- loopy/library/reduction.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/loopy/library/reduction.py b/loopy/library/reduction.py index 091e4a2c1..9a9b1c6e9 100644 --- a/loopy/library/reduction.py +++ b/loopy/library/reduction.py @@ -444,7 +444,7 @@ def neutral_element(self, scalar_dtype, index_dtype, callables_table, index_dtype.numpy_dtype.type(-1)), callables_table def __str__(self): - return self.which + return "arg" + self.which def __hash__(self): return hash(type(self)) From 39d7fd1187faab89f2ec0c45f585fcf6ef0f2915 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Tue, 8 Feb 2022 18:32:26 -0600 Subject: [PATCH 17/27] Fix an error message in make_assignment --- loopy/kernel/instruction.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/loopy/kernel/instruction.py b/loopy/kernel/instruction.py index 0721eccf0..09a0711a3 100644 --- a/loopy/kernel/instruction.py +++ b/loopy/kernel/instruction.py @@ -1248,7 +1248,8 @@ def make_assignment(assignees, expression, 
temp_var_types=None, **kwargs): from loopy.symbolic import Reduction if not isinstance(expression, (Call, Reduction)): raise LoopyError("right-hand side in multiple assignment must be " - "function call or reduction, got: '%s'" % expression) + "function call or reduction, got: " + f"'{type(expression).__name__}'") if not is_array_call(assignees, expression): return CallInstruction( From 059fef724ffefbed3bbff7976e68a0016b8dd1b8 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Fri, 4 Feb 2022 01:08:03 -0600 Subject: [PATCH 18/27] Make realize_reduction actually recursive (closes gh-533) --- loopy/transform/realize_reduction.py | 849 +++++++++++++++------------ test/test_scan.py | 5 - 2 files changed, 476 insertions(+), 378 deletions(-) diff --git a/loopy/transform/realize_reduction.py b/loopy/transform/realize_reduction.py index c02c05fdf..67aa627f8 100644 --- a/loopy/transform/realize_reduction.py +++ b/loopy/transform/realize_reduction.py @@ -24,8 +24,9 @@ """ -from dataclasses import dataclass -from typing import Tuple, Dict, Callable, List, Optional, Set, Sequence +from dataclasses import dataclass, replace +from typing import (Tuple, Dict, Callable, List, Optional, Set, Sequence, + FrozenSet) import logging logger = logging.getLogger(__name__) @@ -33,10 +34,11 @@ from pytools import memoize_on_first_arg from pytools.tag import Tag import islpy as isl +from pymbolic.primitives import Expression + +from pyrsistent import PMap from loopy.kernel.data import make_assignment -from loopy.kernel.tools import ( - kernel_has_global_barriers, find_most_recent_global_barrier) from loopy.symbolic import ReductionCallbackMapper from loopy.translation_unit import TranslationUnit from loopy.kernel.function_interface import CallableKernel @@ -51,30 +53,34 @@ # {{{ reduction realization context +@dataclass +class _ChangeFlag: + changes_made: bool + + @dataclass(frozen=True) class _ReductionRealizationContext: # {{{ read-only + mapper: "RealizeReductionCallbackMapper" + 
force_scan: bool automagic_scans_ok: bool unknown_types_ok: bool - # FIXME: This feels like a broken-by-design concept + # FIXME: This feels like a broken-by-design concept. force_outer_iname_for_scan: Optional[str] # We use the original kernel for a number of lookups whose value # we do not change and which might be already cached on it. orig_kernel: LoopKernel - kernel: LoopKernel - # FIXME: This shouldn't be here. We might generate multiple instructions - # in a nested manner. Why should the 'top-level' instruction be special? - insn: InstructionBase + id_prefix: str # }}} - # {{{ internally mutable + # {{{ internally mutable, same across entire recursion insn_id_gen: Callable[[str], str] var_name_gen: Callable[[str], str] @@ -83,28 +89,84 @@ class _ReductionRealizationContext: additional_insns: List[InstructionBase] domains: List[isl.BasicSet] additional_iname_tags: Dict[str, Sequence[Tag]] + # list only to facilitate mutation + boxed_callables_table: List[PMap] # FIXME: This is a broken-by-design concept. Local-parallel scans emit a # reduction internally. This serves to avoid force_scan acting on that # reduction. inames_added_for_scan: Set[str] - # FIXME: Clarify how these relate to recursively generated instructions. - new_insn_add_depends_on: Set[str] - new_insn_add_no_sync_with: Set[Tuple[str, str]] - new_insn_add_within_inames: Set[str] + # }}} + + # {{{ surrounding instruction, read-only (different at each recursive level) + + # These are attributes from 'surrounding' instruction, for generated + # instructions to potentially inherit. + surrounding_within_inames: FrozenSet[str] + surrounding_depends_on: FrozenSet[str] + surrounding_no_sync_with: FrozenSet[Tuple[str, str]] + surrounding_predicates: FrozenSet[Expression] # }}} - # {{{ change tracking + # {{{ surrounding instruction, internally mutable + # (different at each recursive level) + + # These are requested additions to attributes of the surrounding instruction. 
+ + # FIXME add_within_inames seems broken by design. + surrounding_insn_add_within_inames: Set[str] + + surrounding_insn_add_depends_on: Set[str] + surrounding_insn_add_no_sync_with: Set[Tuple[str, str]] + + # }}} + + # {{{ change tracking (same across entire recursion) + + _change_flag: _ChangeFlag - were_changes_made: bool + @property + def were_changes_made(self): + return self._change_flag.changes_made def changes_made(self): - object.__setattr__(self, "were_changes_made", True) + self._change_flag.changes_made = True # }}} + def new_subinstruction(self, *, within_inames, depends_on, + no_sync_with=None, predicates=None): + if no_sync_with is None: + no_sync_with = self.surrounding_no_sync_with + if predicates is None: + predicates = self.surrounding_predicates + + return replace(self, + surrounding_within_inames=within_inames, + surrounding_depends_on=depends_on, + surrounding_no_sync_with=no_sync_with, + surrounding_predicates=predicates, + + surrounding_insn_add_within_inames=set(), + surrounding_insn_add_depends_on=set(), + surrounding_insn_add_no_sync_with=set()) + + def get_insn_kwargs(self): + return dict( + within_inames=( + self.surrounding_within_inames + | frozenset(self.surrounding_insn_add_within_inames)), + within_inames_is_final=True, + depends_on=( + self.surrounding_depends_on + | frozenset(self.surrounding_insn_add_depends_on)), + no_sync_with=( + self.surrounding_no_sync_with + | frozenset(self.surrounding_insn_add_no_sync_with)), + predicates=self.surrounding_predicates) + # }}} @@ -117,7 +179,7 @@ class _InameClassification: nonlocal_parallel: Tuple[str, ...] 
-def _classify_reduction_inames(kernel, inames): +def _classify_reduction_inames(red_realize_ctx, inames): sequential = [] local_par = [] nonlocal_par = [] @@ -127,7 +189,10 @@ def _classify_reduction_inames(kernel, inames): ConcurrentTag, filter_iname_tags_by_type) for iname in inames: - iname_tags = kernel.iname_tags(iname) + try: + iname_tags = red_realize_ctx.additional_iname_tags[iname] + except KeyError: + iname_tags = red_realize_ctx.kernel.iname_tags(iname) if filter_iname_tags_by_type(iname_tags, (UnrollTag, UnrolledIlpTag)): # These are nominally parallel, but we can live with @@ -333,8 +398,13 @@ def _try_infer_scan_candidate_from_expr( "(sweep iname: '%s', scan iname: '%s'): %s" % (expr, sweep_iname, scan_iname, v)) - return _ScanCandidateParameters(sweep_iname, scan_iname, sweep_lower_bound, - sweep_upper_bound, scan_lower_bound, stride) + return _ScanCandidateParameters( + sweep_iname=sweep_iname, + scan_iname=scan_iname, + sweep_lower_bound=sweep_lower_bound, + sweep_upper_bound=sweep_upper_bound, + scan_lower_bound=scan_lower_bound, + stride=stride) def _try_infer_sweep_iname(domain, scan_iname, candidate_inames): @@ -499,15 +569,16 @@ def _get_domain_with_iname_as_param(domain, iname): dim_type.set, iname_idx, 1) -def _create_domain_for_sweep_tracking(orig_domain, - tracking_iname, sweep_iname, sweep_min_value, scan_min_value, stride): +def _create_domain_for_sweep_tracking(orig_domain, tracking_iname, scan_param): + sp = scan_param + dim_type = isl.dim_type subd = isl.BasicSet.universe(orig_domain.params().space) # Add tracking_iname and sweep iname. 
- subd = _add_params_to_domain(subd, (sweep_iname, tracking_iname)) + subd = _add_params_to_domain(subd, (sp.sweep_iname, tracking_iname)) # Here we realize the domain: # @@ -526,11 +597,11 @@ def _create_domain_for_sweep_tracking(orig_domain, # affs = isl.affs_from_space(subd.space) - subd &= (affs[tracking_iname] - scan_min_value).ge_set(affs[0]) - subd &= (affs[tracking_iname] - scan_min_value)\ - .le_set(stride * (affs[sweep_iname] - sweep_min_value)) - subd &= (affs[tracking_iname] - scan_min_value)\ - .gt_set(stride * (affs[sweep_iname] - sweep_min_value - 1)) + subd &= (affs[tracking_iname] - sp.scan_lower_bound).ge_set(affs[0]) + subd &= (affs[tracking_iname] - sp.scan_lower_bound)\ + .le_set(sp.stride * (affs[sp.sweep_iname] - sp.sweep_lower_bound)) + subd &= (affs[tracking_iname] - sp.scan_lower_bound)\ + .gt_set(sp.stride * (affs[sp.sweep_iname] - sp.sweep_lower_bound - 1)) # Move tracking_iname into a set dim (NOT sweep iname). subd = subd.move_dims( @@ -539,7 +610,7 @@ def _create_domain_for_sweep_tracking(orig_domain, # Simplify (maybe). 
orig_domain_with_sweep_param = ( - _get_domain_with_iname_as_param(orig_domain, sweep_iname)) + _get_domain_with_iname_as_param(orig_domain, sp.sweep_iname)) subd = subd.gist_params(orig_domain_with_sweep_param.params()) subd, = subd.get_basic_sets() @@ -738,39 +809,56 @@ def _add_to_depends_on(insn_id, new_depends_on_params): # {{{ RealizeReductionCallbackMapper class RealizeReductionCallbackMapper(ReductionCallbackMapper): - def __init__(self, callback, callables_table): + def __init__(self, callback): super().__init__(callback) - self.callables_table = callables_table def map_reduction(self, expr, **kwargs): - result, self.callables_table = self.callback(expr, rec=self.rec, - **kwargs) - return result + return self.callback(expr, **kwargs) - def map_if(self, expr, *, - callables_table, red_realize_ctx, - guarding_predicates, nresults): + def map_if(self, expr, *, red_realize_ctx, nresults): + common_kwargs = dict(nresults=nresults) - common_kwargs = dict( - callables_table=callables_table, - red_realize_ctx=red_realize_ctx, - nresults=nresults) + # {{{ generate code for condition + rrc_cond = replace(red_realize_ctx, + surrounding_insn_add_depends_on=set(), + surrounding_insn_add_no_sync_with=set(), + surrounding_insn_add_within_inames=set()) import pymbolic.primitives as prim rec_cond = self.rec( expr.condition, - guarding_predicates=guarding_predicates, + red_realize_ctx=rrc_cond, **common_kwargs) + assert not rrc_cond.surrounding_insn_add_no_sync_with + assert not rrc_cond.surrounding_insn_add_within_inames + + cond_dep_on = rrc_cond.surrounding_insn_add_depends_on + red_realize_ctx.surrounding_insn_add_depends_on.update(cond_dep_on) + + # }}} + return prim.If(rec_cond, self.rec(expr.then, - guarding_predicates=( - guarding_predicates - | frozenset([rec_cond])), + red_realize_ctx=replace( + red_realize_ctx, + surrounding_depends_on=( + red_realize_ctx.surrounding_depends_on + | cond_dep_on), + surrounding_predicates=( + red_realize_ctx.surrounding_predicates 
+ | frozenset([rec_cond]) + )), **common_kwargs), self.rec(expr.else_, - guarding_predicates=( - guarding_predicates - | frozenset([prim.LogicalNot(rec_cond)])), + red_realize_ctx=replace( + red_realize_ctx, + surrounding_depends_on=( + red_realize_ctx.surrounding_depends_on + | cond_dep_on), + surrounding_predicates=( + red_realize_ctx.surrounding_predicates + | frozenset([prim.LogicalNot(rec_cond)]) + )), **common_kwargs)) # }}} @@ -788,13 +876,10 @@ def _strip_if_scalar(reference, val): def _preprocess_scan_arguments( red_realize_ctx, expr, nresults, scan_iname, track_iname, - newly_generated_insn_id_set, - insn_id_gen): + newly_generated_insn_id_set): """Does iname substitution within scan arguments and returns a set of values suitable to be passed to the binary op. Returns a tuple.""" - insn = red_realize_ctx.insn - if nresults > 1: inner_expr = expr @@ -802,21 +887,21 @@ def _preprocess_scan_arguments( # the arguments in order to pass them to the binary op - so we expand # items that are not "plain" tuples here. 
if not isinstance(inner_expr, tuple): - get_args_insn_id = insn_id_gen( - "{}_{}_get".format(insn.id, "_".join(expr.inames))) + get_args_insn_id = red_realize_ctx.insn_id_gen( + f"{red_realize_ctx.id_prefix}_{'_'.join(expr.inames)}_get") inner_expr = expand_inner_reduction( red_realize_ctx=red_realize_ctx, id=get_args_insn_id, expr=inner_expr, nresults=nresults, - depends_on=insn.depends_on, - within_inames=insn.within_inames | expr.inames, - within_inames_is_final=insn.within_inames_is_final, - predicates=insn.predicates, + depends_on=red_realize_ctx.surrounding_depends_on, + within_inames=red_realize_ctx.surrounding_within_inames, + predicates=red_realize_ctx.surrounding_predicates, ) - newly_generated_insn_id_set.add(get_args_insn_id) + newly_generated_insn_id_set = ( + newly_generated_insn_id_set | frozenset({get_args_insn_id})) updated_inner_exprs = tuple( replace_var_within_expr( @@ -829,14 +914,13 @@ def _preprocess_scan_arguments( red_realize_ctx.kernel, red_realize_ctx.var_name_gen, expr, scan_iname, track_iname),) - return updated_inner_exprs + return updated_inner_exprs, newly_generated_insn_id_set # }}} def expand_inner_reduction( - red_realize_ctx, id, expr, nresults, depends_on, within_inames, - within_inames_is_final, predicates): + red_realize_ctx, id, expr, nresults, depends_on, within_inames, predicates): # FIXME: use _make_temporaries from pymbolic.primitives import Call from loopy.symbolic import Reduction @@ -862,7 +946,7 @@ def expand_inner_reduction( expression=expr, depends_on=depends_on, within_inames=within_inames, - within_inames_is_final=within_inames_is_final, + within_inames_is_final=True, predicates=predicates) red_realize_ctx.additional_insns.append(call_insn) @@ -872,13 +956,8 @@ def expand_inner_reduction( # {{{ reduction type: sequential -def map_reduction_seq( - red_realize_ctx, expr, rec, callables_table, nresults, arg_dtypes, - reduction_dtypes, guarding_predicates): +def map_reduction_seq(red_realize_ctx, expr, nresults, 
arg_dtypes, reduction_dtypes): orig_kernel = red_realize_ctx.orig_kernel - insn = red_realize_ctx.insn - - outer_insn_inames = red_realize_ctx.insn.within_inames acc_var_names = _make_temporaries( red_realize_ctx=red_realize_ctx, @@ -888,37 +967,24 @@ def map_reduction_seq( dtypes=reduction_dtypes, address_space=AddressSpace.PRIVATE) - init_insn_depends_on = frozenset() - - # check first that the original kernel had global barriers - # if not, we don't need to check. Since the function - # kernel_has_global_barriers is cached, we don't do - # extra work compared to not checking. - # FIXME: Explain why we care about global barriers here - if kernel_has_global_barriers(orig_kernel): - global_barrier = find_most_recent_global_barrier( - red_realize_ctx.kernel, - insn.id) - - if global_barrier is not None: - init_insn_depends_on |= frozenset([global_barrier]) - from pymbolic import var acc_vars = tuple(var(n) for n in acc_var_names) init_id = red_realize_ctx.insn_id_gen( - "{}_{}_init".format(insn.id, "_".join(expr.inames))) + f"{red_realize_ctx.id_prefix}_{'_'.join(expr.inames)}_init") - expression, callables_table = expr.operation.neutral_element( - *arg_dtypes, callables_table=callables_table, - target=red_realize_ctx.orig_kernel.target) + expression, red_realize_ctx.boxed_callables_table[0] = \ + expr.operation.neutral_element( + *arg_dtypes, + callables_table=red_realize_ctx.boxed_callables_table[0], + target=red_realize_ctx.orig_kernel.target) init_insn = make_assignment( id=init_id, assignees=acc_vars, - within_inames=outer_insn_inames - frozenset(expr.inames), - within_inames_is_final=insn.within_inames_is_final, - depends_on=init_insn_depends_on, + within_inames=red_realize_ctx.surrounding_within_inames, + within_inames_is_final=True, + depends_on=frozenset(), expression=expression, # Do not inherit predicates: Those might read variables @@ -934,61 +1000,60 @@ def map_reduction_seq( red_realize_ctx.additional_insns.append(init_insn) update_id = 
red_realize_ctx.insn_id_gen( - based_on="{}_{}_update".format(insn.id, "_".join(expr.inames))) + based_on=f"{red_realize_ctx.id_prefix}_{'_'.join(expr.inames)}_update") - update_insn_iname_deps = insn.within_inames | set(expr.inames) - if insn.within_inames_is_final: - update_insn_iname_deps = insn.within_inames | set(expr.inames) + update_red_realize_ctx = red_realize_ctx.new_subinstruction( + within_inames=( + red_realize_ctx.surrounding_within_inames + | frozenset(expr.inames)), + depends_on=( + frozenset({init_id}) + | red_realize_ctx.surrounding_depends_on)) - reduction_insn_depends_on = {init_id} + reduction_expr = red_realize_ctx.mapper( + expr.expr, red_realize_ctx=update_red_realize_ctx, + nresults=1) # In the case of a multi-argument reduction, we need a name for each of # the arguments in order to pass them to the binary op - so we expand # items that are not "plain" tuples here. - if nresults > 1 and not isinstance(expr.expr, tuple): + if nresults > 1 and not isinstance(reduction_expr, tuple): get_args_insn_id = red_realize_ctx.insn_id_gen( - "{}_{}_get".format(insn.id, "_".join(expr.inames))) + f"{red_realize_ctx.id_prefix}_{'_'.join(expr.inames)}_get") reduction_expr = expand_inner_reduction( red_realize_ctx=red_realize_ctx, id=get_args_insn_id, - expr=expr.expr, + expr=reduction_expr, nresults=nresults, - depends_on=insn.depends_on, - within_inames=update_insn_iname_deps, - within_inames_is_final=insn.within_inames_is_final, - predicates=guarding_predicates, + depends_on=red_realize_ctx.surrounding_depends_on, + within_inames=update_red_realize_ctx.surrounding_within_inames, + predicates=red_realize_ctx.surrounding_predicates, ) - reduction_insn_depends_on.add(get_args_insn_id) - else: - reduction_expr = expr.expr + update_red_realize_ctx.surrounding_insn_add_depends_on.add(get_args_insn_id) - expression, callables_table = expr.operation( + expression, red_realize_ctx.boxed_callables_table[0] = expr.operation( arg_dtypes, _strip_if_scalar(acc_vars, 
acc_vars), reduction_expr, - callables_table, + red_realize_ctx.boxed_callables_table[0], orig_kernel.target) reduction_insn = make_assignment( id=update_id, assignees=acc_vars, expression=expression, - depends_on=frozenset(reduction_insn_depends_on) | insn.depends_on, - within_inames=update_insn_iname_deps, - within_inames_is_final=insn.within_inames_is_final, - predicates=guarding_predicates,) + **update_red_realize_ctx.get_insn_kwargs()) red_realize_ctx.additional_insns.append(reduction_insn) - - red_realize_ctx.new_insn_add_depends_on.add(reduction_insn.id) + red_realize_ctx.surrounding_insn_add_depends_on.add(reduction_insn.id) if nresults == 1: assert len(acc_vars) == 1 - return acc_vars[0], callables_table + return acc_vars[0] else: - return acc_vars, callables_table + return acc_vars # }}} @@ -1024,30 +1089,26 @@ def _make_slab_set_from_range(iname, lbound, ubound): return bs -def map_reduction_local( - red_realize_ctx, - expr, rec, callables_table, nresults, arg_dtypes, - reduction_dtypes, guarding_predicates): +def map_reduction_local(red_realize_ctx, expr, nresults, arg_dtypes, + reduction_dtypes): orig_kernel = red_realize_ctx.orig_kernel - insn = red_realize_ctx.insn red_iname, = expr.inames size = _get_int_iname_size(orig_kernel, red_iname) - outer_insn_inames = insn.within_inames - from loopy.kernel.data import LocalInameTagBase - outer_local_inames = tuple(oiname for oiname in outer_insn_inames + surrounding_local_inames = tuple( + oiname for oiname in red_realize_ctx.surrounding_within_inames if orig_kernel.iname_tags_of_type(oiname, LocalInameTagBase)) from pymbolic import var outer_local_iname_vars = tuple( - var(oiname) for oiname in outer_local_inames) + var(oiname) for oiname in surrounding_local_inames) outer_local_iname_sizes = tuple( _get_int_iname_size(orig_kernel, oiname) - for oiname in outer_local_inames) + for oiname in surrounding_local_inames) neutral_var_names = _make_temporaries( red_realize_ctx=red_realize_ctx, @@ -1079,19 
+1140,22 @@ def map_reduction_local( # }}} - base_iname_deps = outer_insn_inames - frozenset(expr.inames) - - neutral, callables_table = expr.operation.neutral_element(*arg_dtypes, - callables_table=callables_table, target=orig_kernel.target) - init_id = red_realize_ctx.insn_id_gen(f"{insn.id}_{red_iname}_init") + neutral, red_realize_ctx.boxed_callables_table[0] = \ + expr.operation.neutral_element(*arg_dtypes, + callables_table=red_realize_ctx.boxed_callables_table[0], + target=orig_kernel.target) + init_id = red_realize_ctx.insn_id_gen( + f"{red_realize_ctx.id_prefix}_{red_iname}_init") init_insn = make_assignment( id=init_id, assignees=tuple( acc_var[outer_local_iname_vars + (var(base_exec_iname),)] for acc_var in acc_vars), expression=neutral, - within_inames=base_iname_deps | frozenset([base_exec_iname]), - within_inames_is_final=insn.within_inames_is_final, + within_inames=( + red_realize_ctx.surrounding_within_inames + | frozenset([base_exec_iname])), + within_inames_is_final=True, depends_on=frozenset(), # Do not inherit predicates: Those might read variables # that may not yet be set, and we don't have a great way @@ -1105,52 +1169,65 @@ def map_reduction_local( red_realize_ctx.additional_insns.append(init_insn) init_neutral_id = red_realize_ctx.insn_id_gen( - f"{insn.id}_{red_iname}_init_neutral") + f"{red_realize_ctx.id_prefix}_{red_iname}_init_neutral") init_neutral_insn = make_assignment( id=init_neutral_id, assignees=tuple(var(nvn) for nvn in neutral_var_names), expression=neutral, - within_inames=base_iname_deps | frozenset([base_exec_iname]), - within_inames_is_final=insn.within_inames_is_final, + within_inames=( + red_realize_ctx.surrounding_within_inames + | frozenset([base_exec_iname])), + within_inames_is_final=True, depends_on=frozenset(), - predicates=guarding_predicates, + predicates=red_realize_ctx.surrounding_predicates, ) red_realize_ctx.additional_insns.append(init_neutral_insn) transfer_depends_on = {init_neutral_id, init_id} + 
transfer_red_realize_ctx = red_realize_ctx.new_subinstruction( + within_inames=( + red_realize_ctx.surrounding_within_inames + | frozenset([red_iname])), + depends_on=( + red_realize_ctx.surrounding_depends_on + | frozenset([init_id, init_neutral_id])), + no_sync_with=( + red_realize_ctx.surrounding_no_sync_with + | frozenset([(init_id, "any")]))) + + reduction_expr = red_realize_ctx.mapper( + expr.expr, red_realize_ctx=transfer_red_realize_ctx, + nresults=1) + # In the case of a multi-argument reduction, we need a name for each of # the arguments in order to pass them to the binary op - so we expand # items that are not "plain" tuples here. - if nresults > 1 and not isinstance(expr.expr, tuple): + if nresults > 1 and not isinstance(reduction_expr, tuple): get_args_insn_id = red_realize_ctx.insn_id_gen( - f"{insn.id}_{red_iname}_get") + f"{red_realize_ctx.id_prefix}_{red_iname}_get") reduction_expr = expand_inner_reduction( red_realize_ctx=red_realize_ctx, id=get_args_insn_id, - expr=expr.expr, + expr=reduction_expr, nresults=nresults, - depends_on=insn.depends_on, - within_inames=( - (outer_insn_inames - frozenset(expr.inames)) - | frozenset([red_iname])), - within_inames_is_final=insn.within_inames_is_final, - predicates=guarding_predicates, + depends_on=red_realize_ctx.surrounding_depends_on, + within_inames=transfer_red_realize_ctx.surrounding_within_inames, + predicates=red_realize_ctx.surrounding_predicates, ) transfer_depends_on.add(get_args_insn_id) - else: - reduction_expr = expr.expr - transfer_id = red_realize_ctx.insn_id_gen(f"{insn.id}_{red_iname}_transfer") - expression, callables_table = expr.operation( + transfer_id = red_realize_ctx.insn_id_gen( + f"{red_realize_ctx.id_prefix}_{red_iname}_transfer") + expression, red_realize_ctx.boxed_callables_table[0] = expr.operation( arg_dtypes, _strip_if_scalar( neutral_var_names, tuple(var(nvn) for nvn in neutral_var_names)), reduction_expr, - callables_table, + red_realize_ctx.boxed_callables_table[0], 
orig_kernel.target) transfer_insn = make_assignment( id=transfer_id, @@ -1158,14 +1235,7 @@ def map_reduction_local( acc_var[outer_local_iname_vars + (var(red_iname),)] for acc_var in acc_vars), expression=expression, - within_inames=( - (outer_insn_inames - frozenset(expr.inames)) - | frozenset([red_iname])), - within_inames_is_final=insn.within_inames_is_final, - depends_on=frozenset([init_id, init_neutral_id]) | insn.depends_on, - no_sync_with=frozenset([(init_id, "any")]), - predicates=insn.predicates, - ) + **transfer_red_realize_ctx.get_insn_kwargs()) red_realize_ctx.additional_insns.append(transfer_insn) cur_size = 1 @@ -1193,7 +1263,7 @@ def map_reduction_local( stage_id = red_realize_ctx.insn_id_gen( "red_%s_stage_%d" % (red_iname, istage)) - expression, callables_table = expr.operation( + expression, red_realize_ctx.boxed_callables_table[0] = expr.operation( arg_dtypes, _strip_if_scalar(acc_vars, tuple( acc_var[ @@ -1204,7 +1274,7 @@ def map_reduction_local( outer_local_iname_vars + ( var(stage_exec_iname) + new_size,)] for acc_var in acc_vars)), - callables_table, + red_realize_ctx.boxed_callables_table[0], orig_kernel.target) stage_insn = make_assignment( @@ -1214,10 +1284,11 @@ def map_reduction_local( for acc_var in acc_vars), expression=expression, within_inames=( - base_iname_deps | frozenset([stage_exec_iname])), - within_inames_is_final=insn.within_inames_is_final, + red_realize_ctx.surrounding_within_inames + | frozenset([stage_exec_iname])), + within_inames_is_final=True, depends_on=frozenset([prev_id]), - predicates=insn.predicates, + predicates=red_realize_ctx.surrounding_predicates, ) red_realize_ctx.additional_insns.append(stage_insn) @@ -1227,17 +1298,17 @@ def map_reduction_local( bound = cur_size istage += 1 - red_realize_ctx.new_insn_add_depends_on.add(prev_id) - red_realize_ctx.new_insn_add_no_sync_with.add((prev_id, "any")) - red_realize_ctx.new_insn_add_within_inames.add( + red_realize_ctx.surrounding_insn_add_depends_on.add(prev_id) 
+ red_realize_ctx.surrounding_insn_add_no_sync_with.add((prev_id, "any")) + red_realize_ctx.surrounding_insn_add_within_inames.add( stage_exec_iname or base_exec_iname) if nresults == 1: assert len(acc_vars) == 1 - return acc_vars[0][outer_local_iname_vars + (0,)], callables_table + return acc_vars[0][outer_local_iname_vars + (0,)] else: return [acc_var[outer_local_iname_vars + (0,)] for acc_var in - acc_vars], callables_table + acc_vars] # }}} @@ -1246,16 +1317,17 @@ def map_reduction_local( @memoize_on_first_arg def _get_or_add_sweep_tracking_iname_and_domain( red_realize_ctx, - scan_iname, sweep_iname, sweep_min_value, scan_min_value, stride, + scan_param, tracking_iname): kernel = red_realize_ctx.kernel - domain = kernel.get_inames_domain(frozenset((scan_iname, sweep_iname))) + domain = kernel.get_inames_domain( + frozenset((scan_param.scan_iname, scan_param.sweep_iname))) red_realize_ctx.inames_added_for_scan.add(tracking_iname) - new_domain = _create_domain_for_sweep_tracking(domain, - tracking_iname, sweep_iname, sweep_min_value, scan_min_value, stride) + new_domain = _create_domain_for_sweep_tracking( + domain, tracking_iname, scan_param) _insert_subdomain_into_domain_tree(kernel, red_realize_ctx.domains, new_domain) @@ -1268,6 +1340,9 @@ def replace_var_within_expr(kernel, var_name_gen, expr, from_var, to_var): from loopy.symbolic import ( SubstitutionRuleMappingContext, RuleAwareSubstitutionMapper) + # FIXME: This is broken. SubstitutionRuleMappingContext produces a new + # kernel (via finish_kernel) with new subst rules. These get dropped on the + # floor here. 
rule_mapping_context = SubstitutionRuleMappingContext( kernel.substitutions, var_name_gen) @@ -1302,28 +1377,21 @@ def _make_temporaries( # {{{ reduction type: sequential scan -def map_scan_seq( - red_realize_ctx, - expr, rec, callables_table, nresults, arg_dtypes, - reduction_dtypes, sweep_iname, scan_iname, sweep_min_value, - scan_min_value, stride, guarding_predicates): - insn = red_realize_ctx.insn - - outer_insn_inames = insn.within_inames +def map_scan_seq(red_realize_ctx, expr, nresults, arg_dtypes, + reduction_dtypes, scan_param): track_iname = red_realize_ctx.var_name_gen( "{sweep_iname}__seq_scan" - .format(sweep_iname=sweep_iname)) + .format(sweep_iname=scan_param.sweep_iname)) _get_or_add_sweep_tracking_iname_and_domain( - red_realize_ctx, - scan_iname, sweep_iname, sweep_min_value, scan_min_value, - stride, track_iname) + red_realize_ctx, scan_param, track_iname) + red_realize_ctx.additional_iname_tags[track_iname] = frozenset() from loopy.kernel.data import AddressSpace acc_var_names = _make_temporaries( red_realize_ctx=red_realize_ctx, - name_based_on="acc_" + scan_iname, + name_based_on="acc_" + scan_param.scan_iname, nvars=nresults, shape=(), dtypes=reduction_dtypes, @@ -1333,28 +1401,22 @@ def map_scan_seq( acc_vars = tuple(var(n) for n in acc_var_names) init_id = red_realize_ctx.insn_id_gen( - "{}_{}_init".format(insn.id, "_".join(expr.inames))) + f"{red_realize_ctx.id_prefix}_{'_'.join(expr.inames)}_init") init_insn_depends_on = frozenset() - # FIXME: Explain why we care about global barriers here - if kernel_has_global_barriers(red_realize_ctx.orig_kernel): - global_barrier = find_most_recent_global_barrier( - red_realize_ctx.kernel, insn.id) - - if global_barrier is not None: - init_insn_depends_on |= frozenset([global_barrier]) - - expression, callables_table = expr.operation.neutral_element( - *arg_dtypes, callables_table=callables_table, - target=red_realize_ctx.orig_kernel.target) + expression, red_realize_ctx.boxed_callables_table[0] = \ 
+ expr.operation.neutral_element(*arg_dtypes, + callables_table=red_realize_ctx.boxed_callables_table[0], + target=red_realize_ctx.orig_kernel.target) init_insn = make_assignment( id=init_id, assignees=acc_vars, - within_inames=outer_insn_inames - frozenset( - (sweep_iname,) + expr.inames), - within_inames_is_final=insn.within_inames_is_final, + within_inames=( + red_realize_ctx.surrounding_within_inames + - frozenset((scan_param.sweep_iname,) + expr.inames)), + within_inames_is_final=True, depends_on=init_insn_depends_on, expression=expression, # Do not inherit predicates: Those might read variables @@ -1369,78 +1431,86 @@ def map_scan_seq( red_realize_ctx.additional_insns.append(init_insn) - update_insn_depends_on = {init_insn.id} | insn.depends_on + scan_insn_depends_on = {init_insn.id} | red_realize_ctx.surrounding_depends_on - updated_inner_exprs = _preprocess_scan_arguments( - red_realize_ctx, - expr.expr, nresults, - scan_iname, track_iname, update_insn_depends_on, - insn_id_gen=red_realize_ctx.insn_id_gen) + scan_red_realize_ctx = red_realize_ctx.new_subinstruction( + within_inames=( + red_realize_ctx.surrounding_within_inames + | frozenset({scan_param.scan_iname})), + depends_on=red_realize_ctx.surrounding_depends_on) - update_id = red_realize_ctx.insn_id_gen( - based_on="{}_{}_update".format(insn.id, "_".join(expr.inames))) + reduction_expr = red_realize_ctx.mapper( + expr.expr, red_realize_ctx=scan_red_realize_ctx, + nresults=1) + + updated_inner_exprs, scan_insn_depends_on = _preprocess_scan_arguments( + scan_red_realize_ctx, + reduction_expr, nresults, + scan_param.scan_iname, track_iname, scan_insn_depends_on) - update_insn_iname_deps = insn.within_inames | {track_iname} - if insn.within_inames_is_final: - update_insn_iname_deps = insn.within_inames | {track_iname} + scan_id = red_realize_ctx.insn_id_gen( + based_on=f"{red_realize_ctx.id_prefix}_{'_'.join(expr.inames)}_scan") - expression, callables_table = expr.operation( + expression, 
red_realize_ctx.boxed_callables_table[0] = expr.operation( arg_dtypes, _strip_if_scalar(acc_vars, acc_vars), _strip_if_scalar(acc_vars, updated_inner_exprs), - callables_table, + red_realize_ctx.boxed_callables_table[0], red_realize_ctx.orig_kernel.target) scan_insn = make_assignment( - id=update_id, + id=scan_id, assignees=acc_vars, expression=expression, - depends_on=frozenset(update_insn_depends_on), - within_inames=update_insn_iname_deps, - no_sync_with=insn.no_sync_with, - within_inames_is_final=insn.within_inames_is_final, - predicates=guarding_predicates, + within_inames=( + red_realize_ctx.surrounding_within_inames + | frozenset( + scan_red_realize_ctx.surrounding_insn_add_within_inames) + | {track_iname}), + depends_on=( + frozenset(scan_insn_depends_on) + | frozenset(scan_red_realize_ctx.surrounding_insn_add_depends_on) + ), + no_sync_with=( + red_realize_ctx.surrounding_no_sync_with + | frozenset(scan_red_realize_ctx.surrounding_insn_add_no_sync_with) + ), + within_inames_is_final=True, + predicates=red_realize_ctx.surrounding_predicates, ) red_realize_ctx.additional_insns.append(scan_insn) - red_realize_ctx.new_insn_add_depends_on.add(scan_insn.id) + red_realize_ctx.surrounding_insn_add_depends_on.add(scan_insn.id) if nresults == 1: assert len(acc_vars) == 1 - return acc_vars[0], callables_table + return acc_vars[0] else: - return acc_vars, callables_table + return acc_vars # }}} # {{{ reduction type: local-parallel scan -def map_scan_local( - red_realize_ctx, - expr, rec, callables_table, nresults, arg_dtypes, - reduction_dtypes, sweep_iname, scan_iname, sweep_min_value, - scan_min_value, stride, guarding_predicates): +def map_scan_local(red_realize_ctx, expr, nresults, arg_dtypes, + reduction_dtypes, scan_param): orig_kernel = red_realize_ctx.orig_kernel - insn = red_realize_ctx.insn - scan_size = _get_int_iname_size(orig_kernel, sweep_iname) + scan_size = _get_int_iname_size(orig_kernel, scan_param.sweep_iname) assert scan_size > 0 if scan_size == 1: 
return map_reduction_seq(red_realize_ctx, - expr, rec, callables_table, - nresults, arg_dtypes, reduction_dtypes, - guarding_predicates) - - outer_insn_inames = insn.within_inames + expr, nresults, arg_dtypes, reduction_dtypes) from loopy.kernel.data import LocalInameTagBase - outer_local_inames = tuple(oiname for oiname in outer_insn_inames + outer_local_inames = tuple( + oiname for oiname in red_realize_ctx.surrounding_within_inames if orig_kernel.iname_tags_of_type(oiname, LocalInameTagBase) - and oiname != sweep_iname) + and oiname != scan_param.sweep_iname) from pymbolic import var outer_local_iname_vars = tuple( @@ -1452,28 +1522,29 @@ def map_scan_local( track_iname = red_realize_ctx.var_name_gen( "{sweep_iname}__pre_scan" - .format(sweep_iname=sweep_iname)) + .format(sweep_iname=scan_param.sweep_iname)) _get_or_add_sweep_tracking_iname_and_domain( red_realize_ctx, - scan_iname, sweep_iname, sweep_min_value, scan_min_value, stride, + scan_param, track_iname) + red_realize_ctx.additional_iname_tags[track_iname] = frozenset() # {{{ add separate iname to carry out the scan # Doing this sheds any odd conditionals that may be active # on our scan_iname. 
- base_exec_iname = red_realize_ctx.var_name_gen(sweep_iname + "__scan") + base_exec_iname = red_realize_ctx.var_name_gen(scan_param.sweep_iname + "__scan") red_realize_ctx.domains.append(_make_slab_set(base_exec_iname, scan_size)) red_realize_ctx.additional_iname_tags[base_exec_iname] \ - = orig_kernel.iname_tags(sweep_iname) + = orig_kernel.iname_tags(scan_param.sweep_iname) # }}} read_var_names = _make_temporaries( red_realize_ctx=red_realize_ctx, - name_based_on="read_"+scan_iname+"_arg_{index}", + name_based_on="read_"+scan_param.scan_iname+"_arg_{index}", nvars=nresults, shape=(), dtypes=reduction_dtypes, @@ -1481,7 +1552,7 @@ def map_scan_local( acc_var_names = _make_temporaries( red_realize_ctx=red_realize_ctx, - name_based_on="acc_"+scan_iname, + name_based_on="acc_"+scan_param.scan_iname, nvars=nresults, shape=outer_local_iname_sizes + (scan_size,), dtypes=reduction_dtypes, @@ -1490,24 +1561,17 @@ def map_scan_local( acc_vars = tuple(var(n) for n in acc_var_names) read_vars = tuple(var(n) for n in read_var_names) - base_iname_deps = (outer_insn_inames - - frozenset(expr.inames) - frozenset([sweep_iname])) - - neutral, callables_table = expr.operation.neutral_element( - *arg_dtypes, callables_table=callables_table, - target=orig_kernel.target) - - init_insn_depends_on = insn.depends_on - - # FIXME: Explain why we care about global barriers here - if kernel_has_global_barriers(orig_kernel): - global_barrier = find_most_recent_global_barrier( - red_realize_ctx.kernel, insn.id) + base_iname_deps = ( + red_realize_ctx.surrounding_within_inames + - frozenset([scan_param.sweep_iname])) - if global_barrier is not None: - init_insn_depends_on |= frozenset([global_barrier]) + neutral, red_realize_ctx.boxed_callables_table[0] = \ + expr.operation.neutral_element(*arg_dtypes, + callables_table=red_realize_ctx.boxed_callables_table[0], + target=orig_kernel.target) - init_id = red_realize_ctx.insn_id_gen(f"{insn.id}_{scan_iname}_init") + init_id = 
red_realize_ctx.insn_id_gen( + f"{red_realize_ctx.id_prefix}_{scan_param.scan_iname}_init") init_insn = make_assignment( id=init_id, assignees=tuple( @@ -1515,8 +1579,8 @@ def map_scan_local( for acc_var in acc_vars), expression=neutral, within_inames=base_iname_deps | frozenset([base_exec_iname]), - within_inames_is_final=insn.within_inames_is_final, - depends_on=init_insn_depends_on, + within_inames_is_final=True, + depends_on=frozenset(), # Do not inherit predicates: Those might read variables # that may not yet be set, and we don't have a great way # of figuring out what the dependencies of the accumulator @@ -1528,57 +1592,88 @@ def map_scan_local( ) red_realize_ctx.additional_insns.append(init_insn) - transfer_insn_depends_on = {init_insn.id} | insn.depends_on + transfer_insn_depends_on = ( + frozenset({init_insn.id}) + | red_realize_ctx.surrounding_depends_on) - updated_inner_exprs = _preprocess_scan_arguments( - red_realize_ctx, - expr.expr, nresults, - scan_iname, track_iname, transfer_insn_depends_on, - insn_id_gen=red_realize_ctx.insn_id_gen) + transfer_red_realize_ctx = red_realize_ctx.new_subinstruction( + within_inames=( + red_realize_ctx.surrounding_within_inames + | frozenset({scan_param.scan_iname})), + depends_on=red_realize_ctx.surrounding_depends_on) - from loopy.symbolic import Reduction + reduction_expr = red_realize_ctx.mapper( + expr.expr, red_realize_ctx=transfer_red_realize_ctx, + nresults=1) - from loopy.symbolic import pw_aff_to_expr - sweep_min_value_expr = pw_aff_to_expr(sweep_min_value) + updated_inner_exprs, transfer_insn_depends_on = _preprocess_scan_arguments( + red_realize_ctx, + reduction_expr, nresults, + scan_param.scan_iname, track_iname, transfer_insn_depends_on) - transfer_id = red_realize_ctx.insn_id_gen(f"{insn.id}_{scan_iname}_transfer") - transfer_insn = make_assignment( - id=transfer_id, - assignees=tuple( - acc_var[outer_local_iname_vars - + (var(sweep_iname) - sweep_min_value_expr,)] - for acc_var in acc_vars), - 
expression=Reduction( + from loopy.symbolic import Reduction + pre_scan_reduction = Reduction( operation=expr.operation, inames=(track_iname,), expr=_strip_if_scalar(acc_vars, updated_inner_exprs), allow_simultaneous=False, - ), - within_inames=outer_insn_inames - frozenset(expr.inames), - within_inames_is_final=insn.within_inames_is_final, - depends_on=frozenset(transfer_insn_depends_on), - no_sync_with=frozenset([(init_id, "any")]) | insn.no_sync_with, - predicates=insn.predicates, - ) + ) - red_realize_ctx.additional_insns.append(transfer_insn) + pre_scan_result = red_realize_ctx.mapper( + pre_scan_reduction, red_realize_ctx=transfer_red_realize_ctx, + nresults=len(acc_vars)) - prev_id = transfer_id + from loopy.symbolic import pw_aff_to_expr + sweep_lower_bound_expr = pw_aff_to_expr(scan_param.sweep_lower_bound) + + if nresults == 1: + assert not isinstance(pre_scan_result, tuple) + pre_scan_result = (pre_scan_result,) + + transfer_ids = frozenset() + for acc_var, pre_scan_result_i in zip(acc_vars, pre_scan_result): + transfer_id = red_realize_ctx.insn_id_gen( + f"{red_realize_ctx.id_prefix}_{scan_param.scan_iname}_transfer") + transfer_insn = make_assignment( + id=transfer_id, + assignees=(acc_var[outer_local_iname_vars + + (var(scan_param.sweep_iname) - sweep_lower_bound_expr,)],), + expression=pre_scan_result_i, + within_inames=( + red_realize_ctx.surrounding_within_inames + | transfer_red_realize_ctx.surrounding_insn_add_within_inames + | frozenset({scan_param.sweep_iname})), + within_inames_is_final=True, + depends_on=( + transfer_insn_depends_on + | transfer_red_realize_ctx.surrounding_insn_add_depends_on), + no_sync_with=( + frozenset([(init_id, "any")]) + | transfer_red_realize_ctx.surrounding_insn_add_no_sync_with), + predicates=red_realize_ctx.surrounding_predicates, + ) + + red_realize_ctx.additional_insns.append(transfer_insn) + transfer_ids = transfer_ids | frozenset({transfer_id}) + + del transfer_id + + prev_ids = transfer_ids istage = 0 cur_size 
= 1 while cur_size < scan_size: stage_exec_iname = red_realize_ctx.var_name_gen( - "%s__scan_s%d" % (sweep_iname, istage)) + f"{scan_param.sweep_iname}__scan_s{istage}") red_realize_ctx.domains.append( _make_slab_set_from_range(stage_exec_iname, cur_size, scan_size)) red_realize_ctx.additional_iname_tags[stage_exec_iname] \ - = orig_kernel.iname_tags(sweep_iname) + = orig_kernel.iname_tags(scan_param.sweep_iname) for read_var, acc_var in zip(read_vars, acc_vars): read_stage_id = red_realize_ctx.insn_id_gen( - "scan_%s_read_stage_%d" % (scan_iname, istage)) + f"scan_{scan_param.scan_iname}_read_stage_{istage}") read_stage_insn = make_assignment( id=read_stage_id, @@ -1589,9 +1684,9 @@ def map_scan_local( + (var(stage_exec_iname) - cur_size,)]), within_inames=( base_iname_deps | frozenset([stage_exec_iname])), - within_inames_is_final=insn.within_inames_is_final, - depends_on=frozenset([prev_id]), - predicates=insn.predicates, + within_inames_is_final=True, + depends_on=prev_ids, + predicates=red_realize_ctx.surrounding_predicates, ) if cur_size == 1: @@ -1601,22 +1696,22 @@ def map_scan_local( read_stage_insn = read_stage_insn.copy( no_sync_with=( read_stage_insn.no_sync_with - | frozenset([(transfer_id, "any")]))) + | frozenset([(tid, "any") for tid in transfer_ids]))) red_realize_ctx.additional_insns.append(read_stage_insn) - prev_id = read_stage_id + prev_ids = frozenset({read_stage_id}) write_stage_id = red_realize_ctx.insn_id_gen( - "scan_%s_write_stage_%d" % (scan_iname, istage)) + f"scan_{scan_param.scan_iname}_write_stage_{istage}") - expression, callables_table = expr.operation( + expression, red_realize_ctx.boxed_callables_table[0] = expr.operation( arg_dtypes, _strip_if_scalar(acc_vars, read_vars), _strip_if_scalar(acc_vars, tuple( acc_var[ outer_local_iname_vars + (var(stage_exec_iname),)] for acc_var in acc_vars)), - callables_table, + red_realize_ctx.boxed_callables_table[0], orig_kernel.target) write_stage_insn = make_assignment( @@ -1627,58 +1722,52 
@@ def map_scan_local( expression=expression, within_inames=( base_iname_deps | frozenset([stage_exec_iname])), - within_inames_is_final=insn.within_inames_is_final, - depends_on=frozenset([prev_id]), - predicates=insn.predicates, + within_inames_is_final=True, + depends_on=prev_ids, + predicates=red_realize_ctx.surrounding_predicates, ) red_realize_ctx.additional_insns.append(write_stage_insn) - prev_id = write_stage_id + prev_ids = frozenset({write_stage_id}) cur_size *= 2 istage += 1 - red_realize_ctx.new_insn_add_depends_on.add(prev_id) - red_realize_ctx.new_insn_add_within_inames.add(sweep_iname) + red_realize_ctx.surrounding_insn_add_depends_on.update(prev_ids) + red_realize_ctx.surrounding_insn_add_within_inames.add(scan_param.sweep_iname) - output_idx = var(sweep_iname) - sweep_min_value_expr + output_idx = var(scan_param.sweep_iname) - sweep_lower_bound_expr if nresults == 1: assert len(acc_vars) == 1 - return (acc_vars[0][outer_local_iname_vars + (output_idx,)], - callables_table) + return acc_vars[0][outer_local_iname_vars + (output_idx,)] else: return [acc_var[outer_local_iname_vars + (output_idx,)] - for acc_var in acc_vars], callables_table + for acc_var in acc_vars] # }}} # {{{ top-level dispatch among reduction types -def map_reduction( - expr, *, rec, - callables_table, red_realize_ctx, - guarding_predicates, nresults): - insn = red_realize_ctx.insn - - # Only expand one level of reduction at a time, going from outermost to - # innermost. Otherwise we get the (iname + insn) dependencies wrong. 
+def map_reduction(expr, *, red_realize_ctx, nresults): + kernel_with_updated_domains = red_realize_ctx.kernel.copy( + domains=red_realize_ctx.domains) from loopy.type_inference import ( infer_arg_and_reduction_dtypes_for_reduction_expression) arg_dtypes, reduction_dtypes = ( infer_arg_and_reduction_dtypes_for_reduction_expression( - red_realize_ctx.kernel, expr, callables_table, + kernel_with_updated_domains, expr, + red_realize_ctx.boxed_callables_table[0], red_realize_ctx.unknown_types_ok)) - outer_insn_inames = insn.within_inames - bad_inames = frozenset(expr.inames) & outer_insn_inames + bad_inames = frozenset(expr.inames) & red_realize_ctx.surrounding_within_inames if bad_inames: raise LoopyError("reduction used within loop(s) that it was " "supposed to reduce over: " + ", ".join(bad_inames)) - iname_classes = _classify_reduction_inames(red_realize_ctx.kernel, expr.inames) + iname_classes = _classify_reduction_inames(red_realize_ctx, expr.inames) n_sequential = len(iname_classes.sequential) n_local_par = len(iname_classes.local_parallel) @@ -1698,7 +1787,8 @@ def _error_if_force_scan_on(cls, msg): # Try to determine scan candidate information (sweep iname, scan # iname, etc). scan_param = _try_infer_scan_candidate_from_expr( - red_realize_ctx.kernel, expr, outer_insn_inames, + kernel_with_updated_domains, expr, + red_realize_ctx.surrounding_within_inames, sweep_iname=red_realize_ctx.force_outer_iname_for_scan) except ValueError as v: @@ -1707,7 +1797,7 @@ def _error_if_force_scan_on(cls, msg): else: # Ensures the reduction is triangular (somewhat expensive). may_be_implemented_as_scan, error = _check_reduction_is_triangular( - red_realize_ctx.kernel, expr, scan_param) + kernel_with_updated_domains, expr, scan_param) if not may_be_implemented_as_scan: _error_if_force_scan_on(ReductionIsNotTriangularError, error) @@ -1751,7 +1841,7 @@ def _error_if_force_scan_on(cls, msg): # to reduce over. It's rather similar to an array with () shape in # numpy.) 
- return expr.expr, callables_table + return expr.expr if may_be_implemented_as_scan: assert red_realize_ctx.force_scan or red_realize_ctx.automagic_scans_ok @@ -1759,14 +1849,13 @@ def _error_if_force_scan_on(cls, msg): # We require the "scan" iname to be tagged sequential. if n_sequential: sweep_iname = scan_param.sweep_iname - sweep_class = _classify_reduction_inames( - red_realize_ctx.orig_kernel, (sweep_iname,)) + sweep_class = _classify_reduction_inames(red_realize_ctx, (sweep_iname,)) sequential = sweep_iname in sweep_class.sequential parallel = sweep_iname in sweep_class.local_parallel bad_parallel = sweep_iname in sweep_class.nonlocal_parallel - if sweep_iname not in outer_insn_inames: + if sweep_iname not in red_realize_ctx.surrounding_within_inames: _error_if_force_scan_on(LoopyError, "Sweep iname '%s' was detected, but is not an iname " "for the instruction." % sweep_iname) @@ -1778,25 +1867,11 @@ def _error_if_force_scan_on(cls, msg): ", ".join(tag.key for tag in red_realize_ctx.kernel.iname_tags(sweep_iname)))) elif parallel: - return map_scan_local( - red_realize_ctx, - expr, rec, callables_table, nresults, - arg_dtypes, reduction_dtypes, - sweep_iname, scan_param.scan_iname, - scan_param.sweep_lower_bound, - scan_param.scan_lower_bound, - scan_param.stride, - guarding_predicates) + return map_scan_local(red_realize_ctx, expr, nresults, + arg_dtypes, reduction_dtypes, scan_param) elif sequential: - return map_scan_seq( - red_realize_ctx, - expr, rec, callables_table, nresults, - arg_dtypes, reduction_dtypes, sweep_iname, - scan_param.scan_iname, - scan_param.sweep_lower_bound, - scan_param.scan_lower_bound, - scan_param.stride, - guarding_predicates) + return map_scan_seq(red_realize_ctx, expr, nresults, + arg_dtypes, reduction_dtypes, scan_param) # fallthrough to reduction implementation @@ -1814,15 +1889,13 @@ def _error_if_force_scan_on(cls, msg): assert n_local_par == 0 return map_reduction_seq( red_realize_ctx, - expr, rec, callables_table, - 
nresults, arg_dtypes, reduction_dtypes, - guarding_predicates) + expr, nresults, arg_dtypes, reduction_dtypes) else: assert n_local_par > 0 return map_reduction_local( red_realize_ctx, - expr, rec, callables_table, nresults, arg_dtypes, - reduction_dtypes, guarding_predicates) + expr, nresults, arg_dtypes, + reduction_dtypes) # }}} @@ -1842,7 +1915,7 @@ def realize_reduction_for_single_kernel(kernel, callables_table, insn_id_gen = kernel.get_instruction_id_generator() var_name_gen = kernel.get_var_name_generator() - cb_mapper = RealizeReductionCallbackMapper(map_reduction, callables_table) + cb_mapper = RealizeReductionCallbackMapper(map_reduction) insn_queue = kernel.instructions[:] domains = kernel.domains[:] @@ -1855,6 +1928,8 @@ def realize_reduction_for_single_kernel(kernel, callables_table, insn = insn_queue.pop(0) red_realize_ctx = _ReductionRealizationContext( + mapper=cb_mapper, + force_scan=force_scan, automagic_scans_ok=automagic_scans_ok, unknown_types_ok=unknown_types_ok, @@ -1862,7 +1937,8 @@ def realize_reduction_for_single_kernel(kernel, callables_table, orig_kernel=orig_kernel, kernel=kernel, - insn=insn, + + id_prefix=insn.id, insn_id_gen=insn_id_gen, var_name_gen=var_name_gen, @@ -1871,14 +1947,20 @@ def realize_reduction_for_single_kernel(kernel, callables_table, additional_insns=[], domains=domains, additional_iname_tags={}, + boxed_callables_table=[callables_table], inames_added_for_scan=inames_added_for_scan, - new_insn_add_depends_on=set(), - new_insn_add_no_sync_with=set(), - new_insn_add_within_inames=set(), + surrounding_within_inames=insn.within_inames, + surrounding_depends_on=insn.depends_on, + surrounding_no_sync_with=insn.no_sync_with, + surrounding_predicates=insn.predicates, - were_changes_made=False, + surrounding_insn_add_within_inames=set(), + surrounding_insn_add_depends_on=set(), + surrounding_insn_add_no_sync_with=set(), + + _change_flag=_ChangeFlag(changes_made=False) ) if insn_id_filter is not None and insn.id != 
insn_id_filter \ @@ -1892,15 +1974,11 @@ def realize_reduction_for_single_kernel(kernel, callables_table, from loopy.symbolic import Reduction if isinstance(insn.expression, Reduction) and nresults > 1: new_expressions = cb_mapper(insn.expression, - callables_table=cb_mapper.callables_table, red_realize_ctx=red_realize_ctx, - guarding_predicates=insn.predicates, nresults=nresults) else: new_expressions = cb_mapper(insn.expression, - callables_table=cb_mapper.callables_table, red_realize_ctx=red_realize_ctx, - guarding_predicates=insn.predicates, nresults=1), if red_realize_ctx.were_changes_made: @@ -1911,17 +1989,17 @@ def realize_reduction_for_single_kernel(kernel, callables_table, kernel_changed = True - insn_id_replacements = {} + callables_table = red_realize_ctx.boxed_callables_table[0] result_assignment_dep_on = ( insn.depends_on - | frozenset(red_realize_ctx.new_insn_add_depends_on)) + | frozenset(red_realize_ctx.surrounding_insn_add_depends_on)) kwargs = insn.get_copy_kwargs( no_sync_with=insn.no_sync_with - | frozenset(red_realize_ctx.new_insn_add_no_sync_with), + | frozenset(red_realize_ctx.surrounding_insn_add_no_sync_with), within_inames=( insn.within_inames - | red_realize_ctx.new_insn_add_within_inames)) + | red_realize_ctx.surrounding_insn_add_within_inames)) kwargs.pop("id") kwargs.pop("depends_on") @@ -1931,6 +2009,8 @@ def realize_reduction_for_single_kernel(kernel, callables_table, kwargs.pop("temp_var_type", None) kwargs.pop("temp_var_types", None) + insn_id_replacements = {} + if isinstance(insn.expression, Reduction) and nresults > 1: result_assignment_ids = [ insn_id_gen(insn.id) for i in range(nresults)] @@ -1962,10 +2042,32 @@ def realize_reduction_for_single_kernel(kernel, callables_table, **kwargs) ] - insn_queue = ( - red_realize_ctx.additional_insns - + replacement_insns - + insn_queue) + additional_insns = red_realize_ctx.additional_insns + + # {{{ make additional insns depend on most recent global barrier + + # FIXME This is weird and 
hokey and ad-hoc and probably broken. + # I *think* the idea is to keep a reduction/scan implementation + # from crossing a global barrier, because that would be costly. + + # check first that the original kernel had global barriers + # if not, we don't need to check. Since the function + # kernel_has_global_barriers is cached, we don't do + # extra work compared to not checking. + + from loopy.kernel.tools import ( + kernel_has_global_barriers, find_most_recent_global_barrier) + + if kernel_has_global_barriers(orig_kernel): + global_barrier = find_most_recent_global_barrier(kernel, insn.id) + + if global_barrier is not None: + gb_dep = frozenset([global_barrier]) + additional_insns = [addl_insn.copy( + depends_on=addl_insn.depends_on | gb_dep) + for addl_insn in additional_insns] + + # }}} # The reduction expander needs an up-to-date kernel # object to find dependencies. Keep kernel up-to-date. @@ -1980,6 +2082,9 @@ def realize_reduction_for_single_kernel(kernel, callables_table, replace_instruction_ids_in_insn(insn, insn_id_replacements) for insn in insn_queue] + finished_insns.extend(additional_insns) + finished_insns.extend(replacement_insns) + kernel = kernel.copy( instructions=finished_insns + insn_queue, temporary_variables=new_temporary_variables, @@ -1993,19 +2098,17 @@ def realize_reduction_for_single_kernel(kernel, callables_table, else: # nothing happened, we're done with insn - assert not red_realize_ctx.new_insn_add_depends_on + assert not red_realize_ctx.surrounding_insn_add_depends_on finished_insns.append(insn) - if kernel_changed: - kernel = kernel.copy(instructions=finished_insns) - else: + if not kernel_changed: return orig_kernel, callables_table kernel = _hackily_ensure_multi_assignment_return_values_are_scoped_private( kernel) - return kernel, cb_mapper.callables_table + return kernel, callables_table # }}} diff --git a/test/test_scan.py b/test/test_scan.py index 94778ef4d..f5aa8a7c2 100644 --- a/test/test_scan.py +++ b/test/test_scan.py @@ 
-221,12 +221,8 @@ def test_local_parallel_scan(ctx_factory, n): knl = lp.tag_inames(knl, dict(i="l.0")) knl = lp.realize_reduction(knl, force_scan=True) - knl = lp.realize_reduction(knl) - knl = lp.add_dtypes(knl, dict(a=int)) - print(knl) - evt, (a,) = knl(queue, a=np.arange(n)) assert (a == np.cumsum(np.arange(n)**2)).all() @@ -246,7 +242,6 @@ def test_local_parallel_scan_with_nonzero_lower_bounds(ctx_factory): knl = lp.fix_parameters(knl, n=16) knl = lp.tag_inames(knl, dict(i="l.0")) knl = lp.realize_reduction(knl, force_scan=True) - knl = lp.realize_reduction(knl) knl = lp.add_dtypes(knl, dict(a=int)) evt, (out,) = knl(queue, a=np.arange(1, 17)) From 6a4a99b39c5d135d65b750f3c54e249f54cd77c6 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Fri, 4 Feb 2022 01:06:52 -0600 Subject: [PATCH 19/27] Add test_reduction_in_conditional (gh-533) --- test/test_reduction.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/test/test_reduction.py b/test/test_reduction.py index 1aa3b52b6..065d3de46 100644 --- a/test/test_reduction.py +++ b/test/test_reduction.py @@ -480,6 +480,26 @@ def test_reduction_without_inames(ctx_factory): assert out_dict["out"].get() == 5 +def test_reduction_in_conditional(ctx_factory): + # https://github.com/inducer/loopy/issues/533#issuecomment-1028472366 + ctx = ctx_factory() + cq = cl.CommandQueue(ctx) + + knl = lp.make_kernel( + "{[i, j, k]: 0<=i,j,k<10}", + """ + y[i] = 1729 if (sum(j, j) == 0) else sum(k, k) + """) + + knl = lp.set_options(knl, write_cl=True) + + knl = lp.preprocess_program(knl) + + evt, (out,) = knl(cq) + + assert (out == 45).all() + + if __name__ == "__main__": if len(sys.argv) > 1: exec(sys.argv[1]) From bf1ceda994b4a8d4354c3226eff531840a1606c5 Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Sun, 13 Feb 2022 15:01:05 -0600 Subject: [PATCH 20/27] Fix path to tasksys.cpp --- examples/python/ispc-stream-harness.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git 
a/examples/python/ispc-stream-harness.py b/examples/python/ispc-stream-harness.py index 722cd917c..f603aabbe 100644 --- a/examples/python/ispc-stream-harness.py +++ b/examples/python/ispc-stream-harness.py @@ -54,7 +54,8 @@ def gen_code(knl): def main(): - with open("tasksys.cpp") as ts_file: + this_dir = os.path.dirname(__file__) + with open(os.path.join(this_dir, "tasksys.cpp")) as ts_file: tasksys_source = ts_file.read() def make_knl(name, insn, vars): From efcb5598d637655d820898f9d9965fb2ea3cb8fd Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Tue, 15 Feb 2022 12:47:33 -0600 Subject: [PATCH 21/27] Remove redundant multiplication by one --- loopy/target/pyopencl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py index 7871eadf2..9a6be115c 100644 --- a/loopy/target/pyopencl.py +++ b/loopy/target/pyopencl.py @@ -247,7 +247,7 @@ def binary_tree_add(start, end): complex_sum = binary_tree_add(0, len(c_applied)) - if real_sum: + if reals: return p.Variable("%s_radd" % tgt_name)(real_sum, complex_sum) else: return complex_sum From b77e416e399ea3079a635d9e7583ec3f56153afa Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 16 Feb 2022 11:33:47 -0600 Subject: [PATCH 22/27] guards passing unused variables in privatize_temporaries_with_inames --- loopy/transform/privatize.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/loopy/transform/privatize.py b/loopy/transform/privatize.py index fb2ce37a1..e9d696481 100644 --- a/loopy/transform/privatize.py +++ b/loopy/transform/privatize.py @@ -122,6 +122,16 @@ def privatize_temporaries_with_inames( s.strip() for s in only_var_names.split(",")) + # {{{ sanity checks + + if (only_var_names is not None + and privatizing_inames <= kernel.all_inames() + and not (frozenset(only_var_names) <= kernel.all_variable_names())): + raise LoopyError(f"Some variables in '{only_var_names}'" + f" not used in kernel '{kernel.name}'.") + + # }}} + wmap = 
kernel.writer_map() var_to_new_priv_axis_iname = {} From 1f9cd4b2cea5ee11efbd8ed2af646ccd11234d1f Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Wed, 16 Feb 2022 13:05:14 -0600 Subject: [PATCH 23/27] Remove redundant multiplication by one --- loopy/target/pyopencl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py index 9a6be115c..06ff41908 100644 --- a/loopy/target/pyopencl.py +++ b/loopy/target/pyopencl.py @@ -293,7 +293,7 @@ def binary_tree_mul(start, end): complex_prd = binary_tree_mul(0, len(complexes)) - if real_prd: + if reals: return p.Variable("%s_rmul" % tgt_name)(real_prd, complex_prd) else: return complex_prd From 7241bd636afe82566aa0e80b7c7b2dbb9e49312a Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 16 Feb 2022 17:21:28 -0600 Subject: [PATCH 24/27] [bugfix]: precompute over insns after a gbarrier --- loopy/transform/precompute.py | 14 ++++++++++++++ test/test_transform.py | 26 ++++++++++++++++++++++++++ 2 files changed, 40 insertions(+) diff --git a/loopy/transform/precompute.py b/loopy/transform/precompute.py index 7c20d7a01..201abd470 100644 --- a/loopy/transform/precompute.py +++ b/loopy/transform/precompute.py @@ -29,6 +29,8 @@ from pymbolic.mapper.substitutor import make_subst_func from loopy.translation_unit import TranslationUnit from loopy.kernel.function_interface import CallableKernel, ScalarCallable +from loopy.kernel.tools import (kernel_has_global_barriers, + find_most_recent_global_barrier) import numpy as np from pymbolic import var @@ -217,6 +219,18 @@ def map_substitution(self, name, tag, arguments, expn_state): self.replaced_something = True + # {{{ add gbarriers that the replaced insn depends-on to compute insn's deps + + if (kernel_has_global_barriers(expn_state.kernel) + and (find_most_recent_global_barrier(expn_state.kernel, + expn_state.instruction.id + ) is not None)): + self.compute_insn_depends_on.add( + 
find_most_recent_global_barrier(expn_state.kernel, + expn_state.instruction.id)) + + # }}} + return new_outer_expr def map_kernel(self, kernel): diff --git a/test/test_transform.py b/test/test_transform.py index e42eeb498..2043b127e 100644 --- a/test/test_transform.py +++ b/test/test_transform.py @@ -1366,6 +1366,32 @@ def test_rename_inames_existing_ok(ctx_factory): lp.auto_test_vs_ref(knl, ctx, ref_knl) +def test_precompute_with_gbarrier(ctx_factory): + # See https://github.com/inducer/loopy/issues/543 + ctx = ctx_factory() + + t_unit = lp.make_kernel( + ["{[i0, j0]: 0<=i0<100 and 0<=j0<10}", + "{[i1, j1]: 0<=i1<100 and 0<=j1<10}"], + """ + out0[i0] = sum(j0, A[i0] * x[j0]) + ... gbarrier {id=gbarrier} + out1[i1] = sum(j1, A[i1] * x[j1]) + """, seq_dependencies=True) + t_unit = lp.add_dtypes(t_unit, {"A": np.float64, + "x": np.float64}) + ref_t_unit = t_unit + + t_unit = lp.add_prefetch(t_unit, + "x", + sweep_inames=["j1"], + within="writes:out1", + prefetch_insn_id="x_fetch") + assert "gbarrier" in t_unit.default_entrypoint.id_to_insn["x_fetch"].depends_on + + lp.auto_test_vs_ref(ref_t_unit, ctx, t_unit) + + if __name__ == "__main__": if len(sys.argv) > 1: exec(sys.argv[1]) From c58d075c07f9cb158a2d9484e6e37c2e5f0588c0 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Wed, 16 Feb 2022 19:19:00 -0600 Subject: [PATCH 25/27] Drop uses of islpy.SuppressedWarnings (deprecated, now a no-op) --- loopy/isl_helpers.py | 3 +-- loopy/kernel/tools.py | 6 ++--- loopy/symbolic.py | 34 ++++++++++++++-------------- loopy/transform/realize_reduction.py | 16 ++++++------- 4 files changed, 27 insertions(+), 32 deletions(-) diff --git a/loopy/isl_helpers.py b/loopy/isl_helpers.py index 57183109b..45f74d70a 100644 --- a/loopy/isl_helpers.py +++ b/loopy/isl_helpers.py @@ -349,8 +349,7 @@ def is_nonnegative(expr, over_set): space = over_set.get_space() from loopy.symbolic import aff_from_expr try: - with isl.SuppressedWarnings(space.get_ctx()): - aff = aff_from_expr(space, 
-expr-1) + aff = aff_from_expr(space, -expr-1) except Exception: return None expr_neg_set = isl.BasicSet.universe(space).add_constraint( diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py index 64e3cd84b..9806fbe8d 100644 --- a/loopy/kernel/tools.py +++ b/loopy/kernel/tools.py @@ -818,8 +818,7 @@ def assign_axis(recursion_axis, iname, axis=None): If *axis* is None, find a suitable axis automatically. """ try: - with isl.SuppressedWarnings(kernel.isl_context): - desired_length = kernel.get_constant_iname_length(iname) + desired_length = kernel.get_constant_iname_length(iname) except isl.Error: # Likely unbounded, automatic assignment is not # going to happen for this iname. @@ -947,8 +946,7 @@ def assign_axis(recursion_axis, iname, axis=None): def get_iname_length(iname): try: - with isl.SuppressedWarnings(kernel.isl_context): - return kernel.get_constant_iname_length(iname) + return kernel.get_constant_iname_length(iname) except isl.Error: return -1 # assign longest auto axis inames first diff --git a/loopy/symbolic.py b/loopy/symbolic.py index b47fe9266..8f702f783 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -1875,23 +1875,23 @@ def with_aff_conversion_guard(f, space, expr, *args): from loopy.diagnostic import ExpressionNotAffineError err = None - with isl.SuppressedWarnings(space.get_ctx()): - try: - return f(space, expr, *args) - except TypeError as e: - err = e - except isl.Error as e: - err = e - except UnknownVariableError as e: - err = e - except ExpressionNotAffineError as e: - err = e - - assert err is not None - from loopy.diagnostic import ExpressionToAffineConversionError - raise ExpressionToAffineConversionError( - "could not convert expression '%s' to affine representation: " - "%s: %s" % (expr, type(err).__name__, str(err))) + + try: + return f(space, expr, *args) + except TypeError as e: + err = e + except isl.Error as e: + err = e + except UnknownVariableError as e: + err = e + except ExpressionNotAffineError as e: + err = e + 
+ assert err is not None + from loopy.diagnostic import ExpressionToAffineConversionError + raise ExpressionToAffineConversionError( + "could not convert expression '%s' to affine representation: " + "%s: %s" % (expr, type(err).__name__, str(err))) def guarded_aff_from_expr(space, expr, vars_to_zero=None): diff --git a/loopy/transform/realize_reduction.py b/loopy/transform/realize_reduction.py index 67aa627f8..2f8e3abe8 100644 --- a/loopy/transform/realize_reduction.py +++ b/loopy/transform/realize_reduction.py @@ -469,10 +469,9 @@ def _try_infer_scan_and_sweep_bounds(kernel, scan_iname, sweep_iname, within_ina within_inames | kernel.non_iname_variable_names(), (isl.dim_type.param,)) try: - with isl.SuppressedWarnings(domain.get_ctx()): - sweep_lower_bound = domain.dim_min(sweep_idx) - sweep_upper_bound = domain.dim_max(sweep_idx) - scan_lower_bound = domain.dim_min(scan_idx) + sweep_lower_bound = domain.dim_min(sweep_idx) + sweep_upper_bound = domain.dim_max(sweep_idx) + scan_lower_bound = domain.dim_min(scan_idx) except isl.Error as e: raise ValueError("isl error: %s" % e) @@ -499,11 +498,10 @@ def _try_infer_scan_stride(kernel, scan_iname, sweep_iname, sweep_lower_bound): # Should be equal to k * sweep_iname, where k is the stride. try: - with isl.SuppressedWarnings(domain_with_sweep_param.get_ctx()): - scan_iname_range = ( - domain_with_sweep_param.dim_max(scan_iname_idx) - - domain_with_sweep_param.dim_min(scan_iname_idx) - ).gist(domain_with_sweep_param.params()) + scan_iname_range = ( + domain_with_sweep_param.dim_max(scan_iname_idx) + - domain_with_sweep_param.dim_min(scan_iname_idx) + ).gist(domain_with_sweep_param.params()) except isl.Error as e: raise ValueError("isl error: '%s'" % e) From d2cd0d89c68b03cde169c9351a17aa376c8ef427 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 16 Feb 2022 19:01:45 -0600 Subject: [PATCH 26/27] preserve rev. 
depends for buffer array's store instructions --- loopy/transform/buffer.py | 13 +++++++++++-- test/test_transform.py | 24 ++++++++++++++++++++++++ 2 files changed, 35 insertions(+), 2 deletions(-) diff --git a/loopy/transform/buffer.py b/loopy/transform/buffer.py index a6e25457d..e3dbeeb51 100644 --- a/loopy/transform/buffer.py +++ b/loopy/transform/buffer.py @@ -499,14 +499,23 @@ def none_to_empty_set(s): new_insns.append(init_instruction) if did_write: - new_insns.append(store_instruction) + # new_insns_with_redirected_deps: if an insn depends on a modified + # insn, then it should also depend on the store insn. + new_insns_with_redirected_deps = [ + insn.copy(depends_on=(insn.depends_on | {store_instruction.id})) + if insn.depends_on & aar.modified_insn_ids + else insn + for insn in new_insns + ] + [store_instruction] else: for iname in store_inames: del new_iname_to_tag[iname] + new_insns_with_redirected_deps = new_insns + kernel = kernel.copy( domains=new_kernel_domains, - instructions=new_insns, + instructions=new_insns_with_redirected_deps, temporary_variables=new_temporary_variables) from loopy import tag_inames diff --git a/test/test_transform.py b/test/test_transform.py index 2043b127e..2aa07dabb 100644 --- a/test/test_transform.py +++ b/test/test_transform.py @@ -1351,6 +1351,30 @@ def test_rename_inames(ctx_factory): lp.auto_test_vs_ref(knl, ctx, ref_knl) +def test_buffer_array_preserves_rev_deps(ctx_factory): + # See https://github.com/inducer/loopy/issues/546 + ctx = ctx_factory() + knl = lp.make_kernel( + ["{[i0, j0]: 0<=i0<100 and 0<=j0<10}", + "{[i1, j1]: 0<=i1<100 and 0<=j1<10}"], + """ + out0[i0] = sum(j0, A[i0] * x[j0]) + ... 
gbarrier {id=gbarrier} + out1[i1] = sum(j1, A[i1] * x[j1]) + """, seq_dependencies=True) + knl = lp.add_dtypes(knl, {"A": np.float64, + "x": np.float64}) + ref_knl = knl + + knl = lp.split_iname(knl, "j0", 2) + knl = lp.split_iname(knl, "i0", 2, outer_tag="g.0") + knl = lp.buffer_array(knl, "out0", + buffer_inames=["i0_inner"], + init_expression="0") + assert "store_out0" in knl.default_entrypoint.id_to_insn["gbarrier"].depends_on + lp.auto_test_vs_ref(ref_knl, ctx, knl) + + def test_rename_inames_existing_ok(ctx_factory): ctx = ctx_factory() From 21e2fb6899285b22e2943a64b34186aea18cbdd3 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 17 Nov 2021 10:01:32 -0600 Subject: [PATCH 27/27] rename_inames should use remove_unused inames --- loopy/transform/iname.py | 268 +++++++++++++++++++-------------------- 1 file changed, 134 insertions(+), 134 deletions(-) diff --git a/loopy/transform/iname.py b/loopy/transform/iname.py index 3712d678b..d82b2b352 100644 --- a/loopy/transform/iname.py +++ b/loopy/transform/iname.py @@ -1125,140 +1125,6 @@ def has_schedulable_iname_nesting(kernel): # }}} -# {{{ rename_inames - -@for_each_kernel -def rename_inames(kernel, old_inames, new_iname, existing_ok=False, within=None): - """ - :arg old_inames: A collection of inames that must be renamed to **new_iname**. - :arg within: a stack match as understood by - :func:`loopy.match.parse_stack_match`. 
- :arg existing_ok: execute even if *new_iname* already exists - """ - from collections.abc import Collection - if (isinstance(old_inames, str) - or not isinstance(old_inames, Collection)): - raise LoopyError("'old_inames' must be a collection of strings, " - f"got '{type(old_inames)}'.") - - if new_iname in old_inames: - raise LoopyError("new iname is part of inames being renamed") - - if new_iname in (kernel.all_variable_names() - kernel.all_inames()): - raise LoopyError(f"New iname '{new_iname}' is already a variable in the" - "kernel") - - if any((len(insn.within_inames & frozenset(old_inames)) > 1) - for insn in kernel.instructions): - raise LoopyError("old_inames contains nested inames" - " -- renaming is illegal.") - - # sort to have deterministic implementation. - old_inames = sorted(old_inames) - - var_name_gen = kernel.get_var_name_generator() - - # FIXME: Distinguish existing iname vs. existing other variable - does_exist = new_iname in kernel.all_inames() - - if not (frozenset(old_inames) <= kernel.all_inames()): - raise LoopyError(f"old inames {frozenset(old_inames) - kernel.all_inames()}" - " do not exist.") - - if does_exist and not existing_ok: - raise LoopyError(f"iname '{new_iname}' conflicts with an existing identifier" - " --cannot rename") - - if not does_exist: - # {{{ rename old_inames[0] -> new_iname - # so that the code below can focus on "merging" inames that already exist - - kernel = duplicate_inames( - kernel, old_inames[0], within=within, new_inames=[new_iname]) - kernel = remove_unused_inames(kernel, old_inames[0]) - - # old_iname[0] is already renamed to new_iname => do not rename again. 
- old_inames = old_inames[1:] - - # }}} - - del does_exist - assert new_iname in kernel.all_inames() - - for old_iname in old_inames: - # {{{ check that the domains match up - - dom = kernel.get_inames_domain(frozenset((old_iname, new_iname))) - - var_dict = dom.get_var_dict() - _, old_idx = var_dict[old_iname] - _, new_idx = var_dict[new_iname] - - par_idx = dom.dim(dim_type.param) - dom_old = dom.move_dims( - dim_type.param, par_idx, dim_type.set, old_idx, 1) - dom_old = dom_old.move_dims( - dim_type.set, dom_old.dim(dim_type.set), dim_type.param, par_idx, 1) - dom_old = dom_old.project_out( - dim_type.set, new_idx if new_idx < old_idx else new_idx - 1, 1) - - par_idx = dom.dim(dim_type.param) - dom_new = dom.move_dims( - dim_type.param, par_idx, dim_type.set, new_idx, 1) - dom_new = dom_new.move_dims( - dim_type.set, dom_new.dim(dim_type.set), dim_type.param, par_idx, 1) - dom_new = dom_new.project_out( - dim_type.set, old_idx if old_idx < new_idx else old_idx - 1, 1) - - if not (dom_old <= dom_new and dom_new <= dom_old): - raise LoopyError( - "inames {old} and {new} do not iterate over the same domain" - .format(old=old_iname, new=new_iname)) - - # }}} - - from pymbolic import var - subst_dict = {old_iname: var(new_iname) for old_iname in old_inames} - - from loopy.match import parse_stack_match - within = parse_stack_match(within) - - from pymbolic.mapper.substitutor import make_subst_func - rule_mapping_context = SubstitutionRuleMappingContext( - kernel.substitutions, var_name_gen) - smap = RuleAwareSubstitutionMapper(rule_mapping_context, - make_subst_func(subst_dict), within) - - from loopy.kernel.instruction import MultiAssignmentBase - - def does_insn_involve_iname(kernel, insn, *args): - return (not isinstance(insn, MultiAssignmentBase) - or frozenset(old_inames) & insn.dependency_names() - or frozenset(old_inames) & insn.reduction_inames()) - - kernel = rule_mapping_context.finish_kernel( - smap.map_kernel(kernel, within=does_insn_involve_iname)) - - 
# {{{ rename_inames

@for_each_kernel
@remove_any_newly_unused_inames
def rename_inames(kernel, old_inames, new_iname, existing_ok=False, within=None):
    """Rename every iname in *old_inames* to *new_iname*.

    If *new_iname* is not yet an iname of *kernel*, the first entry of the
    sorted *old_inames* is passed to :func:`duplicate_inames` to create it.
    Every remaining old iname is then substituted by *new_iname*, which
    requires that it iterates over the same domain as *new_iname*.

    :arg old_inames: A collection of inames that must be renamed to **new_iname**.
        An empty collection leaves the kernel unchanged.
    :arg new_iname: The name that replaces all of *old_inames*.
    :arg within: a stack match as understood by
        :func:`loopy.match.parse_stack_match`.
    :arg existing_ok: execute even if *new_iname* already exists
    :returns: the transformed kernel.
    :raises LoopyError: if *old_inames* is a string or not a collection, if
        *new_iname* is one of *old_inames* or names a non-iname variable, if
        an instruction is nested within more than one of *old_inames*, if an
        old iname does not exist, if *new_iname* exists and *existing_ok* is
        false, or if an old iname and *new_iname* iterate over different
        domains.
    """
    from collections.abc import Collection
    if (isinstance(old_inames, str)
            or not isinstance(old_inames, Collection)):
        raise LoopyError("'old_inames' must be a collection of strings, "
                f"got '{type(old_inames)}'.")

    if new_iname in old_inames:
        raise LoopyError("new iname is part of inames being renamed")

    if new_iname in (kernel.all_variable_names() - kernel.all_inames()):
        raise LoopyError(f"New iname '{new_iname}' is already a variable in the"
                " kernel")

    # Built once instead of once per instruction.
    old_inames_set = frozenset(old_inames)

    if any(len(insn.within_inames & old_inames_set) > 1
            for insn in kernel.instructions):
        raise LoopyError("old_inames contains nested inames"
                " -- renaming is illegal.")

    # sort to have deterministic implementation.
    old_inames = sorted(old_inames)

    var_name_gen = kernel.get_var_name_generator()

    # FIXME: Distinguish existing iname vs. existing other variable
    does_exist = new_iname in kernel.all_inames()

    missing_inames = old_inames_set - kernel.all_inames()
    if missing_inames:
        raise LoopyError(f"old inames {missing_inames}"
                " do not exist.")

    if does_exist and not existing_ok:
        raise LoopyError(f"iname '{new_iname}' conflicts with an existing identifier"
                " --cannot rename")

    if not old_inames:
        # Nothing to rename. Without this guard, 'old_inames[0]' below would
        # raise an unhelpful IndexError when new_iname does not exist yet.
        return kernel

    if not does_exist:
        # {{{ rename old_inames[0] -> new_iname
        # so that the code below can focus on "merging" inames that already exist

        kernel = duplicate_inames(
                kernel, old_inames[0], within=within, new_inames=[new_iname])

        # old_inames[0] is already renamed to new_iname => do not rename again.
        old_inames = old_inames[1:]
        old_inames_set = frozenset(old_inames)

        # }}}

    del does_exist
    assert new_iname in kernel.all_inames()

    for old_iname in old_inames:
        # {{{ check that the domains match up

        dom = kernel.get_inames_domain(frozenset((old_iname, new_iname)))

        var_dict = dom.get_var_dict()
        _, old_idx = var_dict[old_iname]
        _, new_idx = var_dict[new_iname]

        # Move old_iname to the last set dimension (via a temporary parameter
        # slot), then project out new_iname. Its index shifts down by one if
        # it came after the dimension that was moved.
        par_idx = dom.dim(dim_type.param)
        dom_old = dom.move_dims(
                dim_type.param, par_idx, dim_type.set, old_idx, 1)
        dom_old = dom_old.move_dims(
                dim_type.set, dom_old.dim(dim_type.set), dim_type.param, par_idx, 1)
        dom_old = dom_old.project_out(
                dim_type.set, new_idx if new_idx < old_idx else new_idx - 1, 1)

        # Same construction with the roles of old_iname and new_iname swapped.
        par_idx = dom.dim(dim_type.param)
        dom_new = dom.move_dims(
                dim_type.param, par_idx, dim_type.set, new_idx, 1)
        dom_new = dom_new.move_dims(
                dim_type.set, dom_new.dim(dim_type.set), dim_type.param, par_idx, 1)
        dom_new = dom_new.project_out(
                dim_type.set, old_idx if old_idx < new_idx else old_idx - 1, 1)

        # Mutual inclusion, i.e. both projections describe the same set.
        if not (dom_old <= dom_new and dom_new <= dom_old):
            raise LoopyError(
                    f"inames {old_iname} and {new_iname} do not iterate over "
                    "the same domain")

        # }}}

    from pymbolic import var
    subst_dict = {old_iname: var(new_iname) for old_iname in old_inames}

    from loopy.match import parse_stack_match
    within = parse_stack_match(within)

    from pymbolic.mapper.substitutor import make_subst_func
    rule_mapping_context = SubstitutionRuleMappingContext(
            kernel.substitutions, var_name_gen)
    smap = RuleAwareSubstitutionMapper(rule_mapping_context,
                    make_subst_func(subst_dict), within)

    from loopy.kernel.instruction import MultiAssignmentBase

    def does_insn_involve_iname(kernel, insn, *args):
        # Instructions that are not assignments are always visited;
        # assignments only if they reference one of the old inames.
        return (not isinstance(insn, MultiAssignmentBase)
                or old_inames_set & insn.dependency_names()
                or old_inames_set & insn.reduction_inames())

    kernel = rule_mapping_context.finish_kernel(
            smap.map_kernel(kernel, within=does_insn_involve_iname))

    # Replace the old inames by new_iname in the within_inames of every
    # instruction that is nested in one of them and matched by *within*.
    new_instructions = [
            insn.copy(within_inames=((insn.within_inames - old_inames_set)
                                     | frozenset([new_iname])))
            if (old_inames_set & insn.within_inames
                and within(kernel, insn, ()))
            else insn
            for insn in kernel.instructions]

    kernel = kernel.copy(instructions=new_instructions)

    return kernel


def rename_iname(kernel, old_iname, new_iname, existing_ok=False, within=None):
    """Rename the single iname *old_iname* to *new_iname*.

    Equivalent to calling :func:`rename_inames` with ``[old_iname]``; see
    there for the meaning of *existing_ok* and *within*.
    """
    return rename_inames(kernel, [old_iname], new_iname, existing_ok, within)

# }}}


# vim: foldmethod=marker