From 9e69cf4b2dedad95e5f52ac5d7a8fbe1a08a8c80 Mon Sep 17 00:00:00 2001
From: Andreas Kloeckner <inform@tiker.net>
Date: Fri, 17 Dec 2021 14:14:10 -0600
Subject: [PATCH 01/27] Test, fix reductions with no inames

---
 loopy/preprocess.py    | 17 +++++++++++------
 test/test_reduction.py | 21 +++++++++++++++++++++
 2 files changed, 32 insertions(+), 6 deletions(-)

diff --git a/loopy/preprocess.py b/loopy/preprocess.py
index fc0e82afb..55b735f4a 100644
--- a/loopy/preprocess.py
+++ b/loopy/preprocess.py
@@ -1751,6 +1751,8 @@ def map_scan_local(expr, rec, callables_table, nresults, arg_dtypes,
 
     def map_reduction(expr, rec, callables_table,
                       guarding_predicates, nresults=1):
+        nonlocal insn_changed
+
         # Only expand one level of reduction at a time, going from outermost to
         # innermost. Otherwise we get the (iname + insn) dependencies wrong.
 
@@ -1827,6 +1829,10 @@ def _error_if_force_scan_on(cls, msg):
                        ", ".join(str(kernel.iname_tags(iname))
                                  for iname in bad_inames)))
 
+        # }}}
+
+        insn_changed = True
+
         if n_local_par == 0 and n_sequential == 0:
             from loopy.diagnostic import warn_with_kernel
             warn_with_kernel(kernel, "empty_reduction",
@@ -1840,8 +1846,6 @@ def _error_if_force_scan_on(cls, msg):
 
             return expr.expr, callables_table
 
-        # }}}
-
         if may_be_implemented_as_scan:
             assert force_scan or automagic_scans_ok
 
@@ -1916,7 +1920,7 @@ def _error_if_force_scan_on(cls, msg):
     domains = kernel.domains[:]
 
     temp_kernel = kernel
-    changed = False
+    kernel_changed = False
 
     import loopy as lp
     while insn_queue:
@@ -1925,6 +1929,7 @@ def _error_if_force_scan_on(cls, msg):
         new_insn_add_within_inames = set()
 
         generated_insns = []
+        insn_changed = False
 
         insn = insn_queue.pop(0)
 
@@ -1947,7 +1952,7 @@ def _error_if_force_scan_on(cls, msg):
                                         callables_table=cb_mapper.callables_table,
                                         guarding_predicates=insn.predicates),
 
-        if generated_insns:
+        if insn_changed:
             # An expansion happened, so insert the generated stuff plus
             # ourselves back into the queue.
 
@@ -2010,14 +2015,14 @@ def _error_if_force_scan_on(cls, msg):
                     domains=domains)
             temp_kernel = lp.replace_instruction_ids(
                     temp_kernel, insn_id_replacements)
-            changed = True
+            kernel_changed = True
         else:
             # nothing happened, we're done with insn
             assert not new_insn_add_depends_on
 
             new_insns.append(insn)
 
-    if changed:
+    if kernel_changed:
         kernel = kernel.copy(
             instructions=new_insns,
             temporary_variables=new_temporary_variables,
diff --git a/test/test_reduction.py b/test/test_reduction.py
index c623c68c6..931628a04 100644
--- a/test/test_reduction.py
+++ b/test/test_reduction.py
@@ -460,6 +460,27 @@ def test_any_all(ctx_factory):
     assert not out_dict["out2"].get()
 
 
+def test_reduction_without_inames(ctx_factory):
+    """Ensure that reductions with no inames get rewritten to the element
+    being reduced over. This was sometimes erroneously eliminated because
+    reduction realization used the generation of new statements as a criterion
+    for whether work was done.
+    """
+    ctx = ctx_factory()
+    cq = cl.CommandQueue(ctx)
+
+    knl = lp.make_kernel(
+            "{:}",
+            """
+            out = reduce(any, [], 5)
+            """)
+    knl = lp.set_options(knl, return_dict=True)
+
+    _, out_dict = knl(cq)
+
+    assert out_dict["out"].get() == 5
+
+
 if __name__ == "__main__":
     if len(sys.argv) > 1:
         exec(sys.argv[1])

From 4f3ad69fbdfa39e31532ae034c5ef76523e73e00 Mon Sep 17 00:00:00 2001
From: Kaushik Kulkarni <kaushikcfd@gmail.com>
Date: Sat, 13 Nov 2021 11:32:25 -0600
Subject: [PATCH 02/27] implements rename_inames

---
 loopy/__init__.py        |   4 +-
 loopy/transform/iname.py | 109 +++++++++++++++++++++++++++------------
 2 files changed, 78 insertions(+), 35 deletions(-)

diff --git a/loopy/__init__.py b/loopy/__init__.py
index 9bd01534b..7e6ee5234 100644
--- a/loopy/__init__.py
+++ b/loopy/__init__.py
@@ -71,7 +71,7 @@
 from loopy.transform.iname import (
         set_loop_priority, prioritize_loops, untag_inames,
         split_iname, chunk_iname, join_inames, tag_inames, duplicate_inames,
-        rename_iname, remove_unused_inames,
+        rename_iname, rename_inames, remove_unused_inames,
         split_reduction_inward, split_reduction_outward,
         affine_map_inames, find_unused_axis_tag,
         make_reduction_inames_unique,
@@ -198,7 +198,7 @@
         "set_loop_priority", "prioritize_loops", "untag_inames",
         "split_iname", "chunk_iname", "join_inames", "tag_inames",
         "duplicate_inames",
-        "rename_iname", "remove_unused_inames",
+        "rename_iname", "rename_inames", "remove_unused_inames",
         "split_reduction_inward", "split_reduction_outward",
         "affine_map_inames", "find_unused_axis_tag",
         "make_reduction_inames_unique",
diff --git a/loopy/transform/iname.py b/loopy/transform/iname.py
index e55bad50c..3712d678b 100644
--- a/loopy/transform/iname.py
+++ b/loopy/transform/iname.py
@@ -58,6 +58,8 @@
 
 .. autofunction:: rename_iname
 
+.. autofunction:: rename_inames
+
 .. autofunction:: remove_unused_inames
 
 .. autofunction:: split_reduction_inward
@@ -1126,26 +1128,64 @@ def has_schedulable_iname_nesting(kernel):
 # {{{ rename_inames
 
 @for_each_kernel
-def rename_iname(kernel, old_iname, new_iname, existing_ok=False, within=None):
+def rename_inames(kernel, old_inames, new_iname, existing_ok=False, within=None):
     """
+    :arg old_inames: A collection of inames that must be renamed to **new_iname**.
     :arg within: a stack match as understood by
         :func:`loopy.match.parse_stack_match`.
     :arg existing_ok: execute even if *new_iname* already exists
     """
+    from collections.abc import Collection
+    if (isinstance(old_inames, str)
+            or not isinstance(old_inames, Collection)):
+        raise LoopyError("'old_inames' must be a collection of strings, "
+                         f"got '{type(old_inames)}'.")
+
+    if new_iname in old_inames:
+        raise LoopyError("new iname is part of inames being renamed")
+
+    if new_iname in (kernel.all_variable_names() - kernel.all_inames()):
+        raise LoopyError(f"New iname '{new_iname}' is already a variable in the"
+                         " kernel")
+
+    if any((len(insn.within_inames & frozenset(old_inames)) > 1)
+           for insn in kernel.instructions):
+        raise LoopyError("old_inames contains nested inames"
+                         " -- renaming is illegal.")
+
+    # sort to have deterministic implementation.
+    old_inames = sorted(old_inames)
 
     var_name_gen = kernel.get_var_name_generator()
 
     # FIXME: Distinguish existing iname vs. existing other variable
-    does_exist = var_name_gen.is_name_conflicting(new_iname)
+    does_exist = new_iname in kernel.all_inames()
 
-    if old_iname not in kernel.all_inames():
-        raise LoopyError("old iname '%s' does not exist" % old_iname)
+    if not (frozenset(old_inames) <= kernel.all_inames()):
+        raise LoopyError(f"old inames {frozenset(old_inames) - kernel.all_inames()}"
+                         " do not exist.")
 
     if does_exist and not existing_ok:
-        raise LoopyError("iname '%s' conflicts with an existing identifier"
-                "--cannot rename" % new_iname)
+        raise LoopyError(f"iname '{new_iname}' conflicts with an existing identifier"
+                         " --cannot rename")
 
-    if does_exist:
+    if not does_exist:
+        # {{{ rename old_inames[0] -> new_iname
+        # so that the code below can focus on "merging" inames that already exist
+
+        kernel = duplicate_inames(
+                kernel, old_inames[0], within=within, new_inames=[new_iname])
+        kernel = remove_unused_inames(kernel, old_inames[0])
+
+        # old_iname[0] is already renamed to new_iname => do not rename again.
+        old_inames = old_inames[1:]
+
+        # }}}
+
+    del does_exist
+    assert new_iname in kernel.all_inames()
+
+    for old_iname in old_inames:
         # {{{ check that the domains match up
 
         dom = kernel.get_inames_domain(frozenset((old_iname, new_iname)))
@@ -1177,42 +1217,45 @@ def rename_iname(kernel, old_iname, new_iname, existing_ok=False, within=None):
 
         # }}}
 
-        from pymbolic import var
-        subst_dict = {old_iname: var(new_iname)}
-
-        from loopy.match import parse_stack_match
-        within = parse_stack_match(within)
+    from pymbolic import var
+    subst_dict = {old_iname: var(new_iname) for old_iname in old_inames}
 
-        from pymbolic.mapper.substitutor import make_subst_func
-        rule_mapping_context = SubstitutionRuleMappingContext(
-                kernel.substitutions, var_name_gen)
-        smap = RuleAwareSubstitutionMapper(rule_mapping_context,
-                        make_subst_func(subst_dict), within)
+    from loopy.match import parse_stack_match
+    within = parse_stack_match(within)
 
-        kernel = rule_mapping_context.finish_kernel(
-                smap.map_kernel(kernel))
+    from pymbolic.mapper.substitutor import make_subst_func
+    rule_mapping_context = SubstitutionRuleMappingContext(
+            kernel.substitutions, var_name_gen)
+    smap = RuleAwareSubstitutionMapper(rule_mapping_context,
+                    make_subst_func(subst_dict), within)
 
-        new_instructions = []
-        for insn in kernel.instructions:
-            if (old_iname in insn.within_inames
-                    and within(kernel, insn, ())):
-                insn = insn.copy(
-                        within_inames=(
-                            (insn.within_inames - frozenset([old_iname]))
-                            | frozenset([new_iname])))
+    from loopy.kernel.instruction import MultiAssignmentBase
 
-            new_instructions.append(insn)
+    def does_insn_involve_iname(kernel, insn, *args):
+        return (not isinstance(insn, MultiAssignmentBase)
+                or frozenset(old_inames) & insn.dependency_names()
+                or frozenset(old_inames) & insn.reduction_inames())
 
-        kernel = kernel.copy(instructions=new_instructions)
+    kernel = rule_mapping_context.finish_kernel(
+            smap.map_kernel(kernel, within=does_insn_involve_iname))
 
-    else:
-        kernel = duplicate_inames(
-                kernel, [old_iname], within=within, new_inames=[new_iname])
+    new_instructions = [insn.copy(within_inames=((insn.within_inames
+                                                  - frozenset(old_inames))
+                                                 | frozenset([new_iname])))
+                        if ((len(frozenset(old_inames) & insn.within_inames) != 0)
+                            and within(kernel, insn, ()))
+                        else insn
+                        for insn in kernel.instructions]
 
-    kernel = remove_unused_inames(kernel, [old_iname])
+    kernel = kernel.copy(instructions=new_instructions)
+    kernel = remove_unused_inames(kernel, old_inames)
 
     return kernel
 
+
+def rename_iname(kernel, old_iname, new_iname, existing_ok=False, within=None):
+    return rename_inames(kernel, [old_iname], new_iname, existing_ok, within)
+
 # }}}
 
 

From 14ae87913f60e0eed25e3e9e995e69054c7d33be Mon Sep 17 00:00:00 2001
From: Kaushik Kulkarni <kaushikcfd@gmail.com>
Date: Sat, 13 Nov 2021 11:32:35 -0600
Subject: [PATCH 03/27] tests rename inames

---
 test/test_transform.py | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

diff --git a/test/test_transform.py b/test/test_transform.py
index 3915ce161..e42eeb498 100644
--- a/test/test_transform.py
+++ b/test/test_transform.py
@@ -1337,6 +1337,35 @@ def test_rename_inames_redn():
     assert "ifused" in t_unit.default_entrypoint.all_inames()
 
 
+def test_rename_inames(ctx_factory):
+    ctx = ctx_factory()
+
+    knl = lp.make_kernel(
+        "{[i1, i2]: 0<=i1, i2<10}",
+        """
+        y1[i1] = 2
+        y2[i2] = 3
+        """)
+    ref_knl = knl
+    knl = lp.rename_inames(knl, ["i1", "i2"], "ifused")
+    lp.auto_test_vs_ref(knl, ctx, ref_knl)
+
+
+def test_rename_inames_existing_ok(ctx_factory):
+    ctx = ctx_factory()
+
+    knl = lp.make_kernel(
+        "{[i1, i2, i3]: 0<=i1, i2, i3<10}",
+        """
+        y1[i1] = 2
+        y2[i2] = 3
+        y3[i3] = 4
+        """)
+    ref_knl = knl
+    knl = lp.rename_inames(knl, ["i1", "i2"], "i3", existing_ok=True)
+    lp.auto_test_vs_ref(knl, ctx, ref_knl)
+
+
 if __name__ == "__main__":
     if len(sys.argv) > 1:
         exec(sys.argv[1])

From 468cc9576b806d58f27d8c88cb329a0188237928 Mon Sep 17 00:00:00 2001
From: Kaushik Kulkarni <kaushikcfd@gmail.com>
Date: Wed, 17 Nov 2021 16:59:11 -0600
Subject: [PATCH 04/27] adds a routine to memoize the transformation results to
 disk

---
 loopy/__init__.py |  4 +--
 loopy/tools.py    | 62 ++++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 63 insertions(+), 3 deletions(-)

diff --git a/loopy/__init__.py b/loopy/__init__.py
index 7e6ee5234..1a6c1599e 100644
--- a/loopy/__init__.py
+++ b/loopy/__init__.py
@@ -154,7 +154,7 @@
 from loopy.target.ispc import ISPCTarget
 from loopy.target.numba import NumbaTarget, NumbaCudaTarget
 
-from loopy.tools import Optional, t_unit_to_python
+from loopy.tools import Optional, t_unit_to_python, memoize_on_disk
 
 
 __all__ = [
@@ -299,7 +299,7 @@
         "NumbaTarget", "NumbaCudaTarget",
         "ASTBuilderBase",
 
-        "Optional",
+        "Optional", "memoize_on_disk",
 
         # {{{ from this file
 
diff --git a/loopy/tools.py b/loopy/tools.py
index facfe6ee6..9216cbc19 100644
--- a/loopy/tools.py
+++ b/loopy/tools.py
@@ -23,7 +23,7 @@
 import collections.abc as abc
 
 import numpy as np
-from pytools import memoize_method
+from pytools import memoize_method, ProcessLogger
 from pytools.persistent_dict import KeyBuilder as KeyBuilderBase
 from loopy.symbolic import (WalkMapper as LoopyWalkMapper,
                             RuleAwareIdentityMapper)
@@ -31,6 +31,9 @@
         PersistentHashWalkMapper as PersistentHashWalkMapperBase)
 from sys import intern
 
+import logging
+logger = logging.getLogger(__name__)
+
 
 def is_integer(obj):
     return isinstance(obj, (int, np.integer))
@@ -862,4 +865,82 @@ def t_unit_to_python(t_unit, var_name="t_unit",
     else:
         return python_code
 
+
+def memoize_on_disk(func, key_builder_t=LoopyKeyBuilder):
+    """Return a version of *func* whose results are memoized on disk.
+
+    Results are stored in a
+    :class:`pytools.persistent_dict.WriteOncePersistentDict` that is specific
+    to *func* and *key_builder_t*, keyed on the positional and keyword
+    arguments of the call. :mod:`pymbolic` expressions among the arguments
+    are wrapped in :class:`PymbolicExpressionHashWrapper` for hashing.
+
+    The cache is bypassed if ``loopy.CACHING_ENABLED`` is false or if the
+    wrapper is called with ``_no_memoize_on_disk=True``. That keyword
+    argument is consumed by the wrapper and never passed on to *func*.
+
+    :arg key_builder_t: the key builder class, instantiated without
+        arguments, used to hash the cache key.
+    """
+    from loopy.version import DATA_MODEL_VERSION
+    from functools import wraps
+    from pytools.persistent_dict import WriteOncePersistentDict
+    from loopy.translation_unit import TranslationUnit
+    from loopy.kernel import LoopKernel
+    import pymbolic.primitives as prim
+
+    @wraps(func)
+    def wrapper(*args, **kwargs):
+        from loopy import CACHING_ENABLED
+
+        # Always consume the wrapper-only flag: leaving the pop() behind the
+        # short-circuiting 'or' would leak it into func() if caching is off.
+        no_memoize = kwargs.pop("_no_memoize_on_disk", False)
+
+        if not CACHING_ENABLED or no_memoize:
+            return func(*args, **kwargs)
+
+        # The function's name must be part of the identifier: otherwise all
+        # memoized functions share one cache and calls with equal arguments
+        # to different functions would return each other's results.
+        transform_cache = WriteOncePersistentDict(
+            ("loopy-memoize-cache-"
+             f"{func.__module__}.{func.__name__}-{key_builder_t.__qualname__}"
+             f"-v0-{DATA_MODEL_VERSION}"),
+            key_builder=key_builder_t())
+
+        def _get_persistent_hashable_arg(arg):
+            if isinstance(arg, prim.Expression):
+                return PymbolicExpressionHashWrapper(arg)
+            else:
+                return arg
+
+        cache_key = (tuple(_get_persistent_hashable_arg(arg)
+                           for arg in args),
+                     {kw: _get_persistent_hashable_arg(arg)
+                      for kw, arg in kwargs.items()})
+
+        try:
+            result = transform_cache[cache_key]
+            logger.debug(f"Function {func.__name__} returned from"
+                         " memoized result on disk.")
+            return result
+        except KeyError:
+            logger.debug(f"Function {func.__name__} not present"
+                         " on disk.")
+            if args and isinstance(args[0], LoopKernel):
+                proc_log_str = f"{func.__name__} on '{args[0].name}'"
+            elif args and isinstance(args[0], TranslationUnit):
+                proc_log_str = f"{func.__name__} on '{args[0].entrypoints}'"
+            else:
+                proc_log_str = f"{func.__name__}"
+
+            with ProcessLogger(logger, proc_log_str):
+                result = func(*args, **kwargs)
+
+            transform_cache.store_if_not_present(cache_key, result)
+            return result
+
+    return wrapper
+
 # vim: fdm=marker

From 18371df332a89880d4501f7adc059a6e4036785d Mon Sep 17 00:00:00 2001
From: Kaushik Kulkarni <kaushikcfd@gmail.com>
Date: Wed, 17 Nov 2021 17:03:48 -0600
Subject: [PATCH 05/27] test memoize_on_disk

---
 test/test_misc.py | 53 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 53 insertions(+)

diff --git a/test/test_misc.py b/test/test_misc.py
index 58ba732ac..0e8a528ec 100644
--- a/test/test_misc.py
+++ b/test/test_misc.py
@@ -21,6 +21,7 @@
 """
 
 import pytest
+import loopy as lp
 
 import sys
 
@@ -279,6 +280,58 @@ def test_Optional():  # noqa
     # }}}
 
 
+@lp.memoize_on_disk
+def very_costly_transform(knl, iname):
+    from time import sleep
+    sleep(5)
+    return lp.split_iname(knl, iname, 4)
+
+
+def test_memoize_on_disk():
+    if not lp.CACHING_ENABLED:
+        # if caching is disabled => don't test the caching behavior
+        pytest.skip("cannot test memoization if caching disabled")
+
+    knl = lp.make_kernel("{[i]: 0<=i<10}",
+                         """
+                         y[i] = i
+                         """)
+
+    from time import time
+    uncached_knl = very_costly_transform(knl, "i")
+
+    start = time()
+    cached_knl = very_costly_transform(knl, "i")
+    end = time()
+    assert (end - start) < 4
+    assert cached_knl == uncached_knl
+
+
+@lp.memoize_on_disk
+def get_twice_of_pym_expr(expr):
+    from time import sleep
+    sleep(2)
+    return 2 * expr
+
+
+def test_memoize_on_disk_with_pym_expr():
+    if not lp.CACHING_ENABLED:
+        # if caching is disabled => don't test the caching behavior
+        pytest.skip("cannot test memoization if caching disabled")
+
+    from pymbolic import parse
+    expr = parse("a[i] + b[i]")
+
+    from time import time
+    uncached_result = get_twice_of_pym_expr(expr)
+
+    start = time()
+    cached_result = get_twice_of_pym_expr(expr)
+    end = time()
+    assert (end - start) < 1.5
+    assert cached_result == uncached_result
+
+
 if __name__ == "__main__":
     if len(sys.argv) > 1:
         exec(sys.argv[1])

From 55143b21711a534c07bbb14aaa63ff3879a93433 Mon Sep 17 00:00:00 2001
From: Kaushik Kulkarni <kaushikcfd@gmail.com>
Date: Wed, 8 Dec 2021 12:42:59 +0530
Subject: [PATCH 06/27] use memoize_on_disk rather than hand rolling self
 memoization implementation

---
 loopy/preprocess.py       | 30 ++----------------------------
 loopy/transform/buffer.py | 34 ++--------------------------------
 test/test_reduction.py    |  3 +--
 3 files changed, 5 insertions(+), 62 deletions(-)

diff --git a/loopy/preprocess.py b/loopy/preprocess.py
index 55b735f4a..d30e68d80 100644
--- a/loopy/preprocess.py
+++ b/loopy/preprocess.py
@@ -28,10 +28,7 @@
         LoopyAdvisory)
 import islpy as isl
 
-from pytools.persistent_dict import WriteOncePersistentDict
-
-from loopy.tools import LoopyKeyBuilder
-from loopy.version import DATA_MODEL_VERSION
+from loopy.tools import memoize_on_disk
 from loopy.kernel.data import make_assignment, filter_iname_tags_by_type
 from loopy.kernel.tools import kernel_has_global_barriers
 # for the benefit of loopy.statistics, for now
@@ -2360,11 +2357,6 @@ def filter_reachable_callables(t_unit):
     return t_unit.copy(callables_table=new_callables)
 
 
-preprocess_cache = WriteOncePersistentDict(
-        "loopy-preprocess-cache-v2-"+DATA_MODEL_VERSION,
-        key_builder=LoopyKeyBuilder())
-
-
 def _preprocess_single_kernel(kernel, callables_table, device=None):
     from loopy.kernel import KernelState
 
@@ -2413,24 +2405,9 @@ def _preprocess_single_kernel(kernel, callables_table, device=None):
     return kernel
 
 
+@memoize_on_disk
 def preprocess_program(program, device=None):
 
-    # {{{ cache retrieval
-
-    from loopy import CACHING_ENABLED
-    if CACHING_ENABLED:
-        input_program = program
-
-        try:
-            result = preprocess_cache[program]
-            logger.debug(f"program with entrypoints: {program.entrypoints}"
-                    " preprocess cache hit")
-            return result
-        except KeyError:
-            pass
-
-    # }}}
-
     from loopy.kernel import KernelState
     if program.state >= KernelState.PREPROCESSED:
         return program
@@ -2519,9 +2496,6 @@ def preprocess_program(program, device=None):
 
     # }}}
 
-    if CACHING_ENABLED:
-        preprocess_cache.store_if_not_present(input_program, program)
-
     return program
 
 
diff --git a/loopy/transform/buffer.py b/loopy/transform/buffer.py
index b23ccf526..a6e25457d 100644
--- a/loopy/transform/buffer.py
+++ b/loopy/transform/buffer.py
@@ -26,9 +26,7 @@
         RuleAwareIdentityMapper, SubstitutionRuleMappingContext,
         SubstitutionMapper)
 from pymbolic.mapper.substitutor import make_subst_func
-from pytools.persistent_dict import WriteOncePersistentDict
-from loopy.tools import LoopyKeyBuilder, PymbolicExpressionHashWrapper
-from loopy.version import DATA_MODEL_VERSION
+from loopy.tools import memoize_on_disk
 from loopy.diagnostic import LoopyError
 from loopy.kernel import LoopKernel
 from loopy.translation_unit import TranslationUnit
@@ -124,12 +122,6 @@ def map_array_access(self, index, expn_state):
 # }}}
 
 
-buffer_array_cache = WriteOncePersistentDict(
-        "loopy-buffer-array-cache-"+DATA_MODEL_VERSION,
-        key_builder=LoopyKeyBuilder())
-
-
-# Adding an argument? also add something to the cache_key below.
 def buffer_array_for_single_kernel(kernel, callables_table, var_name,
         buffer_inames, init_expression=None, store_expression=None,
         within=None, default_tag="l.auto", temporary_scope=None,
@@ -248,26 +240,6 @@ def buffer_array_for_single_kernel(kernel, callables_table, var_name,
 
     # }}}
 
-    # {{{ caching
-
-    from loopy import CACHING_ENABLED
-
-    cache_key = (kernel, var_name,
-            tuple(buffer_inames),
-            PymbolicExpressionHashWrapper(init_expression),
-            PymbolicExpressionHashWrapper(store_expression), within,
-            default_tag, temporary_scope, fetch_bounding_box)
-
-    if CACHING_ENABLED:
-        try:
-            result = buffer_array_cache[cache_key]
-            logger.info("%s: buffer_array cache hit" % kernel.name)
-            return result
-        except KeyError:
-            pass
-
-    # }}}
-
     var_name_gen = kernel.get_var_name_generator()
     within_inames = set()
 
@@ -543,12 +515,10 @@ def none_to_empty_set(s):
     from loopy.kernel.tools import assign_automatic_axes
     kernel = assign_automatic_axes(kernel, callables_table)
 
-    if CACHING_ENABLED:
-        buffer_array_cache.store_if_not_present(cache_key, kernel)
-
     return kernel
 
 
+@memoize_on_disk
 def buffer_array(program, *args, **kwargs):
     assert isinstance(program, TranslationUnit)
 
diff --git a/test/test_reduction.py b/test/test_reduction.py
index 931628a04..1aa3b52b6 100644
--- a/test/test_reduction.py
+++ b/test/test_reduction.py
@@ -51,7 +51,6 @@
 
 
 def test_nonsense_reduction(ctx_factory):
-    ctx = ctx_factory()
 
     knl = lp.make_kernel(
             "{[i]: 0<=i<100}",
@@ -63,7 +62,7 @@ def test_nonsense_reduction(ctx_factory):
 
     import pytest
     with pytest.raises(RuntimeError):
-        knl = lp.preprocess_kernel(knl, ctx.devices[0])
+        knl = lp.preprocess_kernel(knl)
 
 
 def test_empty_reduction(ctx_factory):

From 712ede3cf000699c6902d39dd533aa8cc2899459 Mon Sep 17 00:00:00 2001
From: Andreas Kloeckner <inform@tiker.net>
Date: Tue, 11 Jan 2022 12:55:41 -0600
Subject: [PATCH 07/27] Fix test for numpy builtin type to avoid numpy#4317

---
 loopy/target/pyopencl_execution.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/loopy/target/pyopencl_execution.py b/loopy/target/pyopencl_execution.py
index 87e13faa2..255858c19 100644
--- a/loopy/target/pyopencl_execution.py
+++ b/loopy/target/pyopencl_execution.py
@@ -21,6 +21,7 @@
 """
 
 
+import numpy as np
 from pytools import memoize_method
 from pytools.py_codegen import Indentation
 from loopy.target.execution import (
@@ -51,9 +52,12 @@ def __init__(self):
 
     def python_dtype_str_inner(self, dtype):
         import pyopencl.tools as cl_tools
-        if dtype.isbuiltin:
+        # Test for types built into numpy. dtype.isbuiltin does not work:
+        # https://github.com/numpy/numpy/issues/4317
+        # Guided by https://numpy.org/doc/stable/reference/arrays.scalars.html
+        if issubclass(dtype.type, (np.bool_, np.number)):
             name = dtype.name
-            if dtype.name == "bool":
+            if dtype.type == np.bool_:
                 name = "bool8"
             return f"_lpy_np.dtype(_lpy_np.{name})"
         else:

From d6b8fb9ea28302c433046d569219492001956ec6 Mon Sep 17 00:00:00 2001
From: Kaushik Kulkarni <kaushikcfd@gmail.com>
Date: Wed, 12 Jan 2022 22:59:17 -0600
Subject: [PATCH 08/27] Implements `LazilyUnpicklingList.__(add|mul)__`

---
 loopy/tools.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/loopy/tools.py b/loopy/tools.py
index 9216cbc19..d12ff750c 100644
--- a/loopy/tools.py
+++ b/loopy/tools.py
@@ -501,6 +501,12 @@ def insert(self, key, value):
     def __getstate__(self):
         return {"_list": [_PickledObject(val) for val in self._list]}
 
+    def __add__(self, other):
+        return self._list + other
+
+    def __mul__(self, other):
+        return self._list * other
+
 
 class LazilyUnpicklingListWithEqAndPersistentHashing(LazilyUnpicklingList):
     """A list which lazily unpickles its values, and supports equality comparison

From 23d19bd684146ac35521b3ad49b89c19623e37d3 Mon Sep 17 00:00:00 2001
From: Andreas Kloeckner <inform@tiker.net>
Date: Sun, 23 Jan 2022 15:27:06 -0600
Subject: [PATCH 09/27] Fix ISPC dev download link

---
 .github/workflows/ci.yml | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index b4f8ee7da..744c2c162 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -100,8 +100,11 @@ jobs:
                 . ./ci-support-v0
                 build_py_project_in_conda_env
 
-                curl -L "https://ci.appveyor.com/api/projects/ispc/ispc/artifacts/build%2Fispc-trunk-linux.tar.gz?job=Environment%3A%20APPVEYOR_BUILD_WORKER_IMAGE%3DUbuntu1604%2C%20LLVM_VERSION%3Dlatest" | tar xfz -
-                export PATH="$(pwd)/ispc-trunk-linux/bin:$PATH"
+                # https://github.com/ispc/ispc/issues/2240
+                # curl -L "https://ci.appveyor.com/api/projects/ispc/ispc/artifacts/build%2Fispc-trunk-linux.tar.gz?job=Environment%3A%20APPVEYOR_BUILD_WORKER_IMAGE%3DUbuntu1604%2C%20LLVM_VERSION%3Dlatest" | tar xfz -
+                # export PATH="$(pwd)/ispc-trunk-linux/bin:$PATH"
+                curl -L https://github.com/ispc/ispc/releases/download/v1.17.0/ispc-v1.17.0-linux.tar.gz  | tar xfz -
+                export PATH="$(pwd)/ispc-v1.17.0-linux/bin:$PATH"
 
                 export PYOPENCL_TEST=portable:pthread
 
@@ -198,7 +201,7 @@ jobs:
             run: |
                 # helps with tmate debugging
                 sudo chmod a+rwX -R $(whoami) /__w/_temp || true
-                
+
         # -   uses: mxschmitt/action-tmate@v3
 
 # vim: sw=4

From f810d61ca41c5fd772f5ed8c13fe8abdf08fe570 Mon Sep 17 00:00:00 2001
From: Andreas Kloeckner <inform@tiker.net>
Date: Sun, 23 Jan 2022 17:46:35 -0600
Subject: [PATCH 10/27] Use (new) install_ispc command from ci-support

---
 .github/workflows/ci.yml | 6 +-----
 .gitlab-ci.yml           | 8 ++++----
 2 files changed, 5 insertions(+), 9 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 744c2c162..efc8bdde2 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -100,11 +100,7 @@ jobs:
                 . ./ci-support-v0
                 build_py_project_in_conda_env
 
-                # https://github.com/ispc/ispc/issues/2240
-                # curl -L "https://ci.appveyor.com/api/projects/ispc/ispc/artifacts/build%2Fispc-trunk-linux.tar.gz?job=Environment%3A%20APPVEYOR_BUILD_WORKER_IMAGE%3DUbuntu1604%2C%20LLVM_VERSION%3Dlatest" | tar xfz -
-                # export PATH="$(pwd)/ispc-trunk-linux/bin:$PATH"
-                curl -L https://github.com/ispc/ispc/releases/download/v1.17.0/ispc-v1.17.0-linux.tar.gz  | tar xfz -
-                export PATH="$(pwd)/ispc-v1.17.0-linux/bin:$PATH"
+                install_ispc
 
                 export PYOPENCL_TEST=portable:pthread
 
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 3945734eb..32d1b886b 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -114,12 +114,12 @@ Pytest POCL Examples:
     export PYOPENCL_TEST=portable:pthread
     export EXTRA_INSTALL="pybind11 numpy mako pyvisfile matplotlib ipykernel nbconvert"
 
-    curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/main/ci-support.sh
-    . ./ci-support.sh
+    curl -L -O -k https://tiker.net/ci-support-v0
+    . ./ci-support-v0
+
     build_py_project_in_venv
 
-    curl -L "https://ci.appveyor.com/api/projects/ispc/ispc/artifacts/build%2Fispc-trunk-linux.tar.gz?job=Environment%3A%20APPVEYOR_BUILD_WORKER_IMAGE%3DUbuntu1604%2C%20LLVM_VERSION%3Dlatest" | tar xfz -
-    export PATH="$(pwd)/ispc-trunk-linux/bin:$PATH"
+    install_ispc
 
     . ./build-py-project-and-run-examples.sh
     run_py_examples

From 6b9d8fa8e0ef62ac641c09767195d88f3ff59b50 Mon Sep 17 00:00:00 2001
From: Andreas Kloeckner <inform@tiker.net>
Date: Sun, 23 Jan 2022 17:47:39 -0600
Subject: [PATCH 11/27] Drop curl -k flags in gitlab CI config

---
 .gitlab-ci.yml | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 32d1b886b..721f90b58 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -8,7 +8,7 @@ Pytest POCL:
   - export PYOPENCL_TEST=portable:pthread
   - export EXTRA_INSTALL="pybind11 numpy mako"
   - export LOOPY_NO_CACHE=1
-  - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/main/build-and-test-py-project.sh
+  - curl -L -O https://gitlab.tiker.net/inducer/ci-support/raw/main/build-and-test-py-project.sh
   - ". ./build-and-test-py-project.sh"
   tags:
   - python3
@@ -26,7 +26,7 @@ Pytest Nvidia Titan V:
   - export EXTRA_INSTALL="pybind11 numpy mako"
   - export LOOPY_NO_CACHE=1
   - source /opt/enable-intel-cl.sh
-  - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/main/build-and-test-py-project.sh
+  - curl -L -O https://gitlab.tiker.net/inducer/ci-support/raw/main/build-and-test-py-project.sh
   - ". ./build-and-test-py-project.sh"
   tags:
   - python3
@@ -44,7 +44,7 @@ Pytest POCL without arg check:
   - export EXTRA_INSTALL="pybind11 numpy mako"
   - export LOOPY_NO_CACHE=1
   - export _LOOPY_SKIP_ARG_CHECKS=1
-  - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/main/build-and-test-py-project.sh
+  - curl -L -O https://gitlab.tiker.net/inducer/ci-support/raw/main/build-and-test-py-project.sh
   - ". ./build-and-test-py-project.sh"
   tags:
   - python3
@@ -63,7 +63,7 @@ Pytest Intel:
   - export LOOPY_NO_CACHE=1
   - export LOOPY_INTEL_CL_OK_FOR_TEST_REF=1
   - source /opt/enable-intel-cl.sh
-  - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/main/build-and-test-py-project.sh
+  - curl -L -O https://gitlab.tiker.net/inducer/ci-support/raw/main/build-and-test-py-project.sh
   - ". ./build-and-test-py-project.sh"
   tags:
   - python3
@@ -80,7 +80,7 @@ Pytest POCL Twice With Cache:
   script: |
     export PYOPENCL_TEST=portable:pthread
     export EXTRA_INSTALL="pybind11 numpy mako"
-    curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/main/ci-support.sh
+    curl -L -O https://gitlab.tiker.net/inducer/ci-support/raw/main/ci-support.sh
     . ./ci-support.sh
     build_py_project_in_venv
     ( test_py_project )
@@ -100,7 +100,7 @@ Pytest POCL Twice With Cache:
 #   - export PY_EXE=pypy
 #   - export PYOPENCL_TEST=portable:pthread
 #   - export EXTRA_INSTALL="pybind11 numpy mako"
-#   - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/main/build-and-test-py-project.sh
+#   - curl -L -O https://gitlab.tiker.net/inducer/ci-support/raw/main/build-and-test-py-project.sh
 #   - ". ./build-and-test-py-project.sh"
 #   tags:
 #   - pypy
@@ -114,7 +114,7 @@ Pytest POCL Examples:
     export PYOPENCL_TEST=portable:pthread
     export EXTRA_INSTALL="pybind11 numpy mako pyvisfile matplotlib ipykernel nbconvert"
 
-    curl -L -O -k https://tiker.net/ci-support-v0
+    curl -L -O https://tiker.net/ci-support-v0
     . ./ci-support-v0
 
     build_py_project_in_venv
@@ -140,7 +140,7 @@ Pylint:
   # Needed to avoid name shadowing issues when running from source directory.
   - PROJECT_INSTALL_FLAGS="--editable"
   - EXTRA_INSTALL="pybind11 numpy mako matplotlib ipykernel ply fparser"
-  - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/main/prepare-and-run-pylint.sh
+  - curl -L -O https://gitlab.tiker.net/inducer/ci-support/raw/main/prepare-and-run-pylint.sh
   - . ./prepare-and-run-pylint.sh "$CI_PROJECT_NAME" test/test_*.py
   tags:
   - python3
@@ -152,7 +152,7 @@ Documentation:
   script:
   - PROJECT=loopy
   - EXTRA_INSTALL="pybind11 numpy"
-  - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/main/build-docs.sh
+  - curl -L -O https://gitlab.tiker.net/inducer/ci-support/raw/main/build-docs.sh
   - ". ./build-docs.sh"
   tags:
   - python3
@@ -160,7 +160,7 @@ Documentation:
 Flake8:
   stage: test
   script:
-  - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/main/prepare-and-run-flake8.sh
+  - curl -L -O https://gitlab.tiker.net/inducer/ci-support/raw/main/prepare-and-run-flake8.sh
   - . ./prepare-and-run-flake8.sh "$CI_PROJECT_NAME" test examples
   tags:
   - python3
@@ -175,7 +175,7 @@ Benchmarks:
   - PYOPENCL_TEST=portable:pthread
   - export LOOPY_NO_CACHE=1
   - export ASV_FACTOR=1.5
-  - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/main/build-and-benchmark-py-project.sh
+  - curl -L -O https://gitlab.tiker.net/inducer/ci-support/raw/main/build-and-benchmark-py-project.sh
   - ". ./build-and-benchmark-py-project.sh"
   tags:
   - linux

From 7f44a241ad11ab2afcd1dbd14de5cd0191923825 Mon Sep 17 00:00:00 2001
From: Andreas Kloeckner <inform@tiker.net>
Date: Thu, 3 Feb 2022 19:00:27 -0600
Subject: [PATCH 12/27] Drop -k flags on curl in GitHub CI

---
 .github/workflows/ci.yml | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index efc8bdde2..a2b69d81e 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -20,7 +20,7 @@ jobs:
                 python-version: '3.6'
         -   name: "Main Script"
             run: |
-                curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/main/prepare-and-run-flake8.sh
+                curl -L -O https://gitlab.tiker.net/inducer/ci-support/raw/main/prepare-and-run-flake8.sh
                 . ./prepare-and-run-flake8.sh "$(basename $GITHUB_REPOSITORY)" ./test examples
 
     pylint:
@@ -32,7 +32,7 @@ jobs:
             run: |
                 sed 's/python=3/python=3.7/' .test-conda-env-py3.yml > .test-conda-env.yml
                 USE_CONDA_BUILD=1
-                curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/main/prepare-and-run-pylint.sh
+                curl -L -O https://gitlab.tiker.net/inducer/ci-support/raw/main/prepare-and-run-pylint.sh
                 . ./prepare-and-run-pylint.sh "$(basename $GITHUB_REPOSITORY)" test/test_*.py
 
     pytest:
@@ -42,7 +42,7 @@ jobs:
         -   uses: actions/checkout@v2
         -   name: "Main Script"
             run: |
-                curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/main/build-and-test-py-project-within-miniconda.sh
+                curl -L -O https://gitlab.tiker.net/inducer/ci-support/raw/main/build-and-test-py-project-within-miniconda.sh
                 . ./build-and-test-py-project-within-miniconda.sh
 
     pytest_intel:
@@ -71,7 +71,7 @@ jobs:
         -   uses: actions/checkout@v2
         -   name: "Main Script"
             run: |
-                curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/main/build-and-test-py-project-within-miniconda.sh
+                curl -L -O https://gitlab.tiker.net/inducer/ci-support/raw/main/build-and-test-py-project-within-miniconda.sh
                 export _LOOPY_SKIP_ARG_CHECKS=1
                 . ./build-and-test-py-project-within-miniconda.sh
 
@@ -82,7 +82,7 @@ jobs:
         -   uses: actions/checkout@v2
         -   name: "Main Script"
             run: |
-                curl -L -O -k https://tiker.net/ci-support-v0
+                curl -L -O https://tiker.net/ci-support-v0
                 . ./ci-support-v0
                 build_py_project_in_conda_env
                 ( test_py_project )
@@ -96,7 +96,7 @@ jobs:
         -   name: "Main Script"
             run: |
                 EXTRA_INSTALL="matplotlib ipykernel nbconvert"
-                curl -L -O -k https://tiker.net/ci-support-v0
+                curl -L -O https://tiker.net/ci-support-v0
                 . ./ci-support-v0
                 build_py_project_in_conda_env
 
@@ -121,7 +121,7 @@ jobs:
         -   name: "Main Script"
             run: |
                 PROJECT=loopy
-                curl -L -O -k https://tiker.net/ci-support-v0
+                curl -L -O https://tiker.net/ci-support-v0
                 . ci-support-v0
                 build_py_project_in_conda_env
                 build_docs

From 375977a4ab9398093df637c6a3f23925f3c3363e Mon Sep 17 00:00:00 2001
From: Andreas Kloeckner <inform@tiker.net>
Date: Thu, 3 Feb 2022 19:01:00 -0600
Subject: [PATCH 13/27] Drop unnecessary .. currentmodule in loopy.kernel.data

---
 loopy/kernel/data.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py
index f04f3cbc5..b8194e107 100644
--- a/loopy/kernel/data.py
+++ b/loopy/kernel/data.py
@@ -48,8 +48,6 @@
 from warnings import warn
 
 __doc__ = """
-.. currentmodule:: loopy.kernel.data
-
 .. autofunction:: filter_iname_tags_by_type
 
 .. autoclass:: InameImplementationTag

From 18f78846354a93f7d3b98ade7f1fecf816402f42 Mon Sep 17 00:00:00 2001
From: Andreas Kloeckner <inform@tiker.net>
Date: Thu, 3 Feb 2022 19:45:51 -0600
Subject: [PATCH 14/27] Introduce replace_instruction_ids_in_insn

---
 loopy/transform/instruction.py | 62 ++++++++++++++++++----------------
 1 file changed, 32 insertions(+), 30 deletions(-)

diff --git a/loopy/transform/instruction.py b/loopy/transform/instruction.py
index 9a7936cd3..bdf74fc56 100644
--- a/loopy/transform/instruction.py
+++ b/loopy/transform/instruction.py
@@ -250,40 +250,42 @@ def remove_instructions(kernel, insn_ids):
 
 # {{{ replace_instruction_ids
 
-def replace_instruction_ids(kernel, replacements):
-    if not replacements:
-        return kernel
+def replace_instruction_ids_in_insn(insn, replacements):
+    changed = False
+    new_depends_on = list(insn.depends_on)
+    extra_depends_on = []
+    new_no_sync_with = []
+
+    for idep, dep in enumerate(insn.depends_on):
+        if dep in replacements:
+            new_deps = list(replacements[dep])
+            new_depends_on[idep] = new_deps[0]
+            extra_depends_on.extend(new_deps[1:])
+            changed = True
 
-    new_insns = []
+    for insn_id, scope in insn.no_sync_with:
+        if insn_id in replacements:
+            new_no_sync_with.extend(
+                    (repl, scope) for repl in replacements[insn_id])
+            changed = True
+        else:
+            new_no_sync_with.append((insn_id, scope))
 
-    for insn in kernel.instructions:
-        changed = False
-        new_depends_on = list(insn.depends_on)
-        extra_depends_on = []
-        new_no_sync_with = []
-
-        for idep, dep in enumerate(insn.depends_on):
-            if dep in replacements:
-                new_deps = list(replacements[dep])
-                new_depends_on[idep] = new_deps[0]
-                extra_depends_on.extend(new_deps[1:])
-                changed = True
-
-        for insn_id, scope in insn.no_sync_with:
-            if insn_id in replacements:
-                new_no_sync_with.extend(
-                        (repl, scope) for repl in replacements[insn_id])
-                changed = True
-            else:
-                new_no_sync_with.append((insn_id, scope))
+    if changed:
+        return insn.copy(
+                depends_on=frozenset(new_depends_on + extra_depends_on),
+                no_sync_with=frozenset(new_no_sync_with))
+    else:
+        return insn
 
-        new_insns.append(
-                insn.copy(
-                    depends_on=frozenset(new_depends_on + extra_depends_on),
-                    no_sync_with=frozenset(new_no_sync_with))
-                if changed else insn)
 
-    return kernel.copy(instructions=new_insns)
+def replace_instruction_ids(kernel, replacements):
+    if not replacements:
+        return kernel
+
+    return kernel.copy(instructions=[
+        replace_instruction_ids_in_insn(insn, replacements)
+        for insn in kernel.instructions])
 
 # }}}
 

From 0986f8e771ee3ded5d4d93fb7011fa6258735867 Mon Sep 17 00:00:00 2001
From: Andreas Kloeckner <inform@tiker.net>
Date: Thu, 3 Feb 2022 18:56:01 -0600
Subject: [PATCH 15/27] Refactor realize_reduction, move to separate file

---
 loopy/__init__.py                    |    4 +-
 loopy/preprocess.py                  | 1815 +----------------------
 loopy/transform/realize_reduction.py | 2053 ++++++++++++++++++++++++++
 3 files changed, 2059 insertions(+), 1813 deletions(-)
 create mode 100644 loopy/transform/realize_reduction.py

diff --git a/loopy/__init__.py b/loopy/__init__.py
index 1a6c1599e..0e6e9f87c 100644
--- a/loopy/__init__.py
+++ b/loopy/__init__.py
@@ -122,10 +122,12 @@
         merge, inline_callable_kernel, rename_callable)
 from loopy.transform.pack_and_unpack_args import pack_and_unpack_args_for_call
 
+from loopy.transform.realize_reduction import realize_reduction
+
 # }}}
 
 from loopy.type_inference import infer_unknown_types
-from loopy.preprocess import (preprocess_kernel, realize_reduction,
+from loopy.preprocess import (preprocess_kernel,
         preprocess_program, infer_arg_descr)
 from loopy.schedule import (
     generate_loop_schedules, get_one_scheduled_kernel,
diff --git a/loopy/preprocess.py b/loopy/preprocess.py
index d30e68d80..1b2a01840 100644
--- a/loopy/preprocess.py
+++ b/loopy/preprocess.py
@@ -26,19 +26,16 @@
 from loopy.diagnostic import (
         LoopyError, WriteRaceConditionWarning, warn_with_kernel,
         LoopyAdvisory)
-import islpy as isl
 
 from loopy.tools import memoize_on_disk
-from loopy.kernel.data import make_assignment, filter_iname_tags_by_type
-from loopy.kernel.tools import kernel_has_global_barriers
+from loopy.kernel.data import filter_iname_tags_by_type
 # for the benefit of loopy.statistics, for now
 from loopy.type_inference import infer_unknown_types
-from loopy.symbolic import RuleAwareIdentityMapper, ReductionCallbackMapper
+from loopy.symbolic import RuleAwareIdentityMapper
 # from loopy.transform.iname import remove_any_newly_unused_inames
 
 from loopy.kernel.instruction import (MultiAssignmentBase, CInstruction,
         CallInstruction,  _DataObliviousInstruction)
-from loopy.translation_unit import TranslationUnit
 from loopy.kernel.function_interface import CallableKernel, ScalarCallable
 
 from pytools import ProcessLogger
@@ -250,1813 +247,6 @@ def find_temporary_address_space(kernel):
 # }}}
 
 
-# {{{ rewrite reduction to imperative form
-
-
-# {{{ utils (not stateful)
-
-from collections import namedtuple
-
-
-_InameClassification = namedtuple("_InameClassifiction",
-                                  "sequential, local_parallel, nonlocal_parallel")
-
-
-def _classify_reduction_inames(kernel, inames):
-    sequential = []
-    local_par = []
-    nonlocal_par = []
-
-    from loopy.kernel.data import (
-            LocalInameTagBase, UnrolledIlpTag, UnrollTag,
-            ConcurrentTag, filter_iname_tags_by_type)
-
-    for iname in inames:
-        iname_tags = kernel.iname_tags(iname)
-
-        if filter_iname_tags_by_type(iname_tags, (UnrollTag, UnrolledIlpTag)):
-            # These are nominally parallel, but we can live with
-            # them as sequential.
-            sequential.append(iname)
-
-        elif filter_iname_tags_by_type(iname_tags, LocalInameTagBase):
-            local_par.append(iname)
-
-        elif filter_iname_tags_by_type(iname_tags, ConcurrentTag):
-            nonlocal_par.append(iname)
-
-        else:
-            sequential.append(iname)
-
-    return _InameClassification(
-            tuple(sequential), tuple(local_par), tuple(nonlocal_par))
-
-
-def _add_params_to_domain(domain, param_names):
-    dim_type = isl.dim_type
-    nparams_orig = domain.dim(dim_type.param)
-    domain = domain.add_dims(dim_type.param, len(param_names))
-
-    for param_idx, param_name in enumerate(param_names):
-        domain = domain.set_dim_name(
-                dim_type.param, param_idx + nparams_orig, param_name)
-
-    return domain
-
-
-def _move_set_to_param_dims_except(domain, except_dims):
-    dim_type = isl.dim_type
-
-    iname_idx = 0
-    for iname in domain.get_var_names(dim_type.set):
-        if iname not in except_dims:
-            domain = domain.move_dims(
-                    dim_type.param, 0,
-                    dim_type.set, iname_idx, 1)
-            iname_idx -= 1
-        iname_idx += 1
-
-    return domain
-
-
-def _domain_depends_on_given_set_dims(domain, set_dim_names):
-    set_dim_names = frozenset(set_dim_names)
-
-    return any(
-            set_dim_names & set(constr.get_coefficients_by_name())
-            for constr in domain.get_constraints())
-
-
-def _check_reduction_is_triangular(kernel, expr, scan_param):
-    """Check whether the reduction within `expr` with scan parameters described by
-    the structure `scan_param` is triangular. This attempts to verify that the
-    domain for the scan and sweep inames is as follows:
-
-    [params] -> {
-        [other inames..., scan_iname, sweep_iname]:
-            (sweep_min_value
-                <= sweep_iname
-                <= sweep_max_value)
-            and
-            (scan_min_value
-                <= scan_iname
-                <= stride * (sweep_iname - sweep_min_value) + scan_min_value)
-            and
-            (irrelevant constraints)
-    }
-    """
-
-    orig_domain = kernel.get_inames_domain(
-            frozenset((scan_param.sweep_iname, scan_param.scan_iname)))
-
-    sweep_iname = scan_param.sweep_iname
-    scan_iname = scan_param.scan_iname
-    affs = isl.affs_from_space(orig_domain.space)
-
-    sweep_lower_bound = isl.align_spaces(
-            scan_param.sweep_lower_bound,
-            affs[0])
-
-    sweep_upper_bound = isl.align_spaces(
-            scan_param.sweep_upper_bound,
-            affs[0])
-
-    scan_lower_bound = isl.align_spaces(
-            scan_param.scan_lower_bound,
-            affs[0])
-
-    from itertools import product
-
-    for (sweep_lb_domain, sweep_lb_aff), \
-        (sweep_ub_domain, sweep_ub_aff), \
-        (scan_lb_domain, scan_lb_aff) in \
-            product(sweep_lower_bound.get_pieces(),
-                    sweep_upper_bound.get_pieces(),
-                    scan_lower_bound.get_pieces()):
-
-        # Assumptions inherited from the domains of the pwaffs
-        assumptions = sweep_lb_domain & sweep_ub_domain & scan_lb_domain
-
-        # Sweep iname constraints
-        hyp_domain = affs[sweep_iname].ge_set(sweep_lb_aff)
-        hyp_domain &= affs[sweep_iname].le_set(sweep_ub_aff)
-
-        # Scan iname constraints
-        hyp_domain &= affs[scan_iname].ge_set(scan_lb_aff)
-        hyp_domain &= affs[scan_iname].le_set(
-                scan_param.stride * (affs[sweep_iname] - sweep_lb_aff)
-                + scan_lb_aff)
-
-        hyp_domain, = (hyp_domain & assumptions).get_basic_sets()
-        test_domain, = (orig_domain & assumptions).get_basic_sets()
-
-        hyp_gist_against_test = hyp_domain.gist(test_domain)
-        if _domain_depends_on_given_set_dims(hyp_gist_against_test,
-                (sweep_iname, scan_iname)):
-            return False, (
-                    "gist of hypothesis against test domain "
-                    "has sweep or scan dependent constraints: '%s'"
-                    % hyp_gist_against_test)
-
-        test_gist_against_hyp = test_domain.gist(hyp_domain)
-        if _domain_depends_on_given_set_dims(test_gist_against_hyp,
-                (sweep_iname, scan_iname)):
-            return False, (
-                   "gist of test against hypothesis domain "
-                   "has sweep or scan dependent constraint: '%s'"
-                   % test_gist_against_hyp)
-
-    return True, "ok"
-
-
-_ScanCandidateParameters = namedtuple(
-        "_ScanCandidateParameters",
-        "sweep_iname, scan_iname, sweep_lower_bound, "
-        "sweep_upper_bound, scan_lower_bound, stride")
-
-
-def _try_infer_scan_candidate_from_expr(
-        kernel, expr, within_inames, sweep_iname=None):
-    """Analyze `expr` and determine if it can be implemented as a scan.
-    """
-    from loopy.symbolic import Reduction
-    assert isinstance(expr, Reduction)
-
-    if len(expr.inames) != 1:
-        raise ValueError(
-                "Multiple inames in reduction: '{}'".format(", ".join(expr.inames)))
-
-    scan_iname, = expr.inames
-
-    from loopy.kernel.tools import DomainChanger
-    dchg = DomainChanger(kernel, (scan_iname,))
-    domain = dchg.get_original_domain()
-
-    if sweep_iname is None:
-        try:
-            sweep_iname = _try_infer_sweep_iname(
-                    domain, scan_iname, kernel.all_inames())
-        except ValueError as v:
-            raise ValueError(
-                    "Couldn't determine a sweep iname for the scan "
-                    "expression '%s': %s" % (expr, v))
-
-    try:
-        sweep_lower_bound, sweep_upper_bound, scan_lower_bound = (
-                _try_infer_scan_and_sweep_bounds(
-                    kernel, scan_iname, sweep_iname, within_inames))
-    except ValueError as v:
-        raise ValueError(
-                "Couldn't determine bounds for the scan with expression '%s' "
-                "(sweep iname: '%s', scan iname: '%s'): %s"
-                % (expr, sweep_iname, scan_iname, v))
-
-    try:
-        stride = _try_infer_scan_stride(
-                kernel, scan_iname, sweep_iname, sweep_lower_bound)
-    except ValueError as v:
-        raise ValueError(
-                "Couldn't determine a scan stride for the scan with expression '%s' "
-                "(sweep iname: '%s', scan iname: '%s'): %s"
-                % (expr, sweep_iname, scan_iname, v))
-
-    return _ScanCandidateParameters(sweep_iname, scan_iname, sweep_lower_bound,
-            sweep_upper_bound, scan_lower_bound, stride)
-
-
-def _try_infer_sweep_iname(domain, scan_iname, candidate_inames):
-    """The sweep iname is the outer iname which guides the scan.
-
-    E.g. for a domain of {[i,j]: 0<=i<n and 0<=j<=i}, i is the sweep iname.
-    """
-    constrs = domain.get_constraints()
-    sweep_iname_candidate = None
-
-    for constr in constrs:
-        candidate_vars = {
-                var for var in constr.get_var_dict()
-                if var in candidate_inames}
-
-        # Irrelevant constraint - skip
-        if scan_iname not in candidate_vars:
-            continue
-
-        # No additional inames - skip
-        if len(candidate_vars) == 1:
-            continue
-
-        candidate_vars.remove(scan_iname)
-
-        # Depends on more than one iname - error
-        if len(candidate_vars) > 1:
-            raise ValueError(
-                    "More than one sweep iname candidate for scan iname '%s' found "
-                    "(via constraint '%s')" % (scan_iname, constr))
-
-        next_candidate = candidate_vars.pop()
-
-        if sweep_iname_candidate is None:
-            sweep_iname_candidate = next_candidate
-            defining_constraint = constr
-        else:
-            # Check next_candidate consistency
-            if sweep_iname_candidate != next_candidate:
-                raise ValueError(
-                        "More than one sweep iname candidate for scan iname '%s' "
-                        "found (via constraints '%s', '%s')" %
-                        (scan_iname, defining_constraint, constr))
-
-    if sweep_iname_candidate is None:
-        raise ValueError(
-                "Couldn't find any sweep iname candidates for "
-                "scan iname '%s'" % scan_iname)
-
-    return sweep_iname_candidate
-
-
-def _try_infer_scan_and_sweep_bounds(kernel, scan_iname, sweep_iname, within_inames):
-    domain = kernel.get_inames_domain(frozenset((sweep_iname, scan_iname)))
-    domain = _move_set_to_param_dims_except(domain, (sweep_iname, scan_iname))
-
-    var_dict = domain.get_var_dict()
-    sweep_idx = var_dict[sweep_iname][1]
-    scan_idx = var_dict[scan_iname][1]
-
-    domain = domain.project_out_except(
-            within_inames | kernel.non_iname_variable_names(), (isl.dim_type.param,))
-
-    try:
-        with isl.SuppressedWarnings(domain.get_ctx()):
-            sweep_lower_bound = domain.dim_min(sweep_idx)
-            sweep_upper_bound = domain.dim_max(sweep_idx)
-            scan_lower_bound = domain.dim_min(scan_idx)
-    except isl.Error as e:
-        raise ValueError("isl error: %s" % e)
-
-    return (sweep_lower_bound, sweep_upper_bound, scan_lower_bound)
-
-
-def _try_infer_scan_stride(kernel, scan_iname, sweep_iname, sweep_lower_bound):
-    """The stride is the number of steps the scan iname takes per iteration
-    of the sweep iname. This is allowed to be an integer constant.
-
-    E.g. for a domain of {[i,j]: 0<=i<n and 0<=j<=6*i}, the stride is 6.
-    """
-    dim_type = isl.dim_type
-
-    domain = kernel.get_inames_domain(frozenset([sweep_iname, scan_iname]))
-    domain_with_sweep_param = _move_set_to_param_dims_except(domain, (scan_iname,))
-
-    domain_with_sweep_param = domain_with_sweep_param.project_out_except(
-            (sweep_iname, scan_iname), (dim_type.set, dim_type.param))
-
-    scan_iname_idx = domain_with_sweep_param.find_dim_by_name(
-            dim_type.set, scan_iname)
-
-    # Should be equal to k * sweep_iname, where k is the stride.
-
-    try:
-        with isl.SuppressedWarnings(domain_with_sweep_param.get_ctx()):
-            scan_iname_range = (
-                    domain_with_sweep_param.dim_max(scan_iname_idx)
-                    - domain_with_sweep_param.dim_min(scan_iname_idx)
-                    ).gist(domain_with_sweep_param.params())
-    except isl.Error as e:
-        raise ValueError("isl error: '%s'" % e)
-
-    scan_iname_pieces = scan_iname_range.get_pieces()
-
-    if len(scan_iname_pieces) > 1:
-        raise ValueError("range in multiple pieces: %s" % scan_iname_range)
-    elif len(scan_iname_pieces) == 0:
-        raise ValueError("empty range found for iname '%s'" % scan_iname)
-
-    scan_iname_constr, scan_iname_aff = scan_iname_pieces[0]
-
-    if not scan_iname_constr.plain_is_universe():
-        raise ValueError("found constraints: %s" % scan_iname_constr)
-
-    if scan_iname_aff.dim(dim_type.div):
-        raise ValueError("aff has div: %s" % scan_iname_aff)
-
-    coeffs = scan_iname_aff.get_coefficients_by_name(dim_type.param)
-
-    if len(coeffs) == 0:
-        try:
-            scan_iname_aff.get_constant_val()
-        except Exception:
-            raise ValueError("range for aff isn't constant: '%s'" % scan_iname_aff)
-
-        # If this point is reached we're assuming the domain is of the form
-        # {[i,j]: i=0 and j=0}, so the stride is technically 1 - any value
-        # this function returns will be verified later by
-        # _check_reduction_is_triangular().
-        return 1
-
-    if sweep_iname not in coeffs:
-        raise ValueError("didn't find sweep iname in coeffs: %s" % sweep_iname)
-
-    stride = coeffs[sweep_iname]
-
-    if not stride.is_int():
-        raise ValueError("stride not an integer: %s" % stride)
-
-    if not stride.is_pos():
-        raise ValueError("stride not positive: %s" % stride)
-
-    return stride.to_python()
-
-
-def _get_domain_with_iname_as_param(domain, iname):
-    dim_type = isl.dim_type
-
-    if domain.find_dim_by_name(dim_type.param, iname) >= 0:
-        return domain
-
-    iname_idx = domain.find_dim_by_name(dim_type.set, iname)
-
-    assert iname_idx >= 0, (iname, domain)
-
-    return domain.move_dims(
-        dim_type.param, domain.dim(dim_type.param),
-        dim_type.set, iname_idx, 1)
-
-
-def _create_domain_for_sweep_tracking(orig_domain,
-        tracking_iname, sweep_iname, sweep_min_value, scan_min_value, stride):
-    dim_type = isl.dim_type
-
-    subd = isl.BasicSet.universe(orig_domain.params().space)
-
-    # Add tracking_iname and sweep iname.
-
-    subd = _add_params_to_domain(subd, (sweep_iname, tracking_iname))
-
-    # Here we realize the domain:
-    #
-    # [..., i] -> {
-    #  [j]: 0 <= j - l
-    #       and
-    #       j - l <= k * (i - m)
-    #       and
-    #       k * (i - m - 1) < j - l }
-    # where
-    #   * i is the sweep iname
-    #   * j is the tracking iname
-    #   * k is the stride for the scan
-    #   * l is the lower bound for the scan
-    #   * m is the lower bound for the sweep iname
-    #
-    affs = isl.affs_from_space(subd.space)
-
-    subd &= (affs[tracking_iname] - scan_min_value).ge_set(affs[0])
-    subd &= (affs[tracking_iname] - scan_min_value)\
-            .le_set(stride * (affs[sweep_iname] - sweep_min_value))
-    subd &= (affs[tracking_iname] - scan_min_value)\
-            .gt_set(stride * (affs[sweep_iname] - sweep_min_value - 1))
-
-    # Move tracking_iname into a set dim (NOT sweep iname).
-    subd = subd.move_dims(
-            dim_type.set, 0,
-            dim_type.param, subd.dim(dim_type.param) - 1, 1)
-
-    # Simplify (maybe).
-    orig_domain_with_sweep_param = (
-            _get_domain_with_iname_as_param(orig_domain, sweep_iname))
-    subd = subd.gist_params(orig_domain_with_sweep_param.params())
-
-    subd, = subd.get_basic_sets()
-
-    return subd
-
-
-def _hackily_ensure_multi_assignment_return_values_are_scoped_private(kernel):
-    """
-    Multi assignment function calls are currently lowered into OpenCL so that
-    the function call::
-
-       a, b = segmented_sum(x, y, z, w)
-
-    becomes::
-
-       a = segmented_sum_mangled(x, y, z, w, &b).
-
-    For OpenCL, the scope of "b" is significant, and the preamble generation
-    currently assumes the scope is always private. This function forces that to
-    be the case by introducing temporary assignments into the kernel.
-    """
-
-    insn_id_gen = kernel.get_instruction_id_generator()
-    var_name_gen = kernel.get_var_name_generator()
-
-    new_or_updated_instructions = {}
-    new_temporaries = {}
-
-    dep_map = {
-            insn.id: insn.depends_on for insn in kernel.instructions}
-
-    inverse_dep_map = {insn.id: set() for insn in kernel.instructions}
-
-    for insn_id, deps in dep_map.items():
-        for dep in deps:
-            inverse_dep_map[dep].add(insn_id)
-
-    del dep_map
-
-    # {{{ utils
-
-    def _add_to_no_sync_with(insn_id, new_no_sync_with_params):
-        insn = kernel.id_to_insn.get(insn_id)
-        insn = new_or_updated_instructions.get(insn_id, insn)
-        new_or_updated_instructions[insn_id] = (
-                insn.copy(
-                    no_sync_with=(
-                        insn.no_sync_with | frozenset(new_no_sync_with_params))))
-
-    def _add_to_depends_on(insn_id, new_depends_on_params):
-        insn = kernel.id_to_insn.get(insn_id)
-        insn = new_or_updated_instructions.get(insn_id, insn)
-        new_or_updated_instructions[insn_id] = (
-                insn.copy(
-                    depends_on=insn.depends_on | frozenset(new_depends_on_params)))
-
-    # }}}
-
-    from loopy.kernel.instruction import CallInstruction, is_array_call
-    for insn in kernel.instructions:
-        if not isinstance(insn, CallInstruction):
-            continue
-
-        if len(insn.assignees) <= 1:
-            continue
-
-        if is_array_call(insn.assignees, insn.expression):
-            continue
-
-        assignees = insn.assignees
-        assignee_var_names = insn.assignee_var_names()
-
-        new_assignees = [assignees[0]]
-        newly_added_assignments_ids = set()
-        needs_replacement = False
-
-        last_added_insn_id = insn.id
-
-        from loopy.kernel.data import AddressSpace, TemporaryVariable
-
-        FIRST_POINTER_ASSIGNEE_IDX = 1  # noqa
-
-        for assignee_nr, assignee_var_name, assignee in zip(
-                range(FIRST_POINTER_ASSIGNEE_IDX, len(assignees)),
-                assignee_var_names[FIRST_POINTER_ASSIGNEE_IDX:],
-                assignees[FIRST_POINTER_ASSIGNEE_IDX:]):
-
-            if (
-                    assignee_var_name in kernel.temporary_variables
-                    and
-                    (kernel.temporary_variables[assignee_var_name].address_space
-                         == AddressSpace.PRIVATE)):
-                new_assignees.append(assignee)
-                continue
-
-            needs_replacement = True
-
-            # {{{ generate a new assignent instruction
-
-            new_assignee_name = var_name_gen(
-                    "{insn_id}_retval_{assignee_nr}"
-                    .format(insn_id=insn.id, assignee_nr=assignee_nr))
-
-            new_assignment_id = insn_id_gen(
-                    "{insn_id}_assign_retval_{assignee_nr}"
-                    .format(insn_id=insn.id, assignee_nr=assignee_nr))
-
-            newly_added_assignments_ids.add(new_assignment_id)
-
-            new_temporaries[new_assignee_name] = (
-                    TemporaryVariable(
-                        name=new_assignee_name,
-                        dtype=None,
-                        address_space=AddressSpace.PRIVATE))
-
-            from pymbolic import var
-            new_assignee = var(new_assignee_name)
-            new_assignees.append(new_assignee)
-
-            new_or_updated_instructions[new_assignment_id] = (
-                    make_assignment(
-                        assignees=(assignee,),
-                        expression=new_assignee,
-                        id=new_assignment_id,
-                        depends_on=frozenset([last_added_insn_id]),
-                        depends_on_is_final=True,
-                        no_sync_with=(
-                            insn.no_sync_with | frozenset([(insn.id, "any")])),
-                        predicates=insn.predicates,
-                        within_inames=insn.within_inames))
-
-            last_added_insn_id = new_assignment_id
-
-            # }}}
-
-        if not needs_replacement:
-            continue
-
-        # {{{ update originating instruction
-
-        orig_insn = new_or_updated_instructions.get(insn.id, insn)
-
-        new_or_updated_instructions[insn.id] = (
-                orig_insn.copy(assignees=tuple(new_assignees)))
-
-        _add_to_no_sync_with(insn.id,
-                [(id, "any") for id in newly_added_assignments_ids])
-
-        # }}}
-
-        # {{{ squash spurious memory dependencies amongst new assignments
-
-        for new_insn_id in newly_added_assignments_ids:
-            _add_to_no_sync_with(new_insn_id,
-                    [(id, "any")
-                     for id in newly_added_assignments_ids
-                     if id != new_insn_id])
-
-        # }}}
-
-        # {{{ update instructions that depend on the originating instruction
-
-        for inverse_dep in inverse_dep_map[insn.id]:
-            _add_to_depends_on(inverse_dep, newly_added_assignments_ids)
-
-            for insn_id, scope in (
-                    new_or_updated_instructions[inverse_dep].no_sync_with):
-                if insn_id == insn.id:
-                    _add_to_no_sync_with(
-                            inverse_dep,
-                            [(id, scope) for id in newly_added_assignments_ids])
-
-        # }}}
-
-    if not new_temporaries and not new_or_updated_instructions:
-        return kernel
-
-    new_temporary_variables = kernel.temporary_variables.copy()
-    new_temporary_variables.update(new_temporaries)
-
-    new_instructions = (
-            list(new_or_updated_instructions.values())
-            + list(insn
-                for insn in kernel.instructions
-                if insn.id not in new_or_updated_instructions))
-
-    return kernel.copy(temporary_variables=new_temporary_variables,
-                       instructions=new_instructions)
-
-
-def _insert_subdomain_into_domain_tree(kernel, domains, subdomain):
-    # Intersect with inames, because we could have captured some kernel params
-    # in here too...
-    dependent_inames = (
-            frozenset(subdomain.get_var_names(isl.dim_type.param))
-            & kernel.all_inames())
-    idx, = kernel.get_leaf_domain_indices(dependent_inames)
-    domains.insert(idx + 1, subdomain)
-
-# }}}
-
-
-class RealizeReductionCallbackMapper(ReductionCallbackMapper):
-    def __init__(self, callback, callables_table):
-        super().__init__(callback)
-        self.callables_table = callables_table
-
-    def map_reduction(self, expr, **kwargs):
-        result, self.callables_table = self.callback(expr, self.rec,
-                **kwargs)
-        return result
-
-    def map_if(self, expr, callables_table, guarding_predicates, nresults=1):
-        import pymbolic.primitives as prim
-        rec_cond = self.rec(expr.condition, callables_table=callables_table,
-                                guarding_predicates=guarding_predicates,
-                                nresults=nresults)
-        return prim.If(rec_cond,
-                       self.rec(expr.then, callables_table=callables_table,
-                                guarding_predicates=(
-                                    guarding_predicates
-                                    | frozenset([rec_cond])),
-                                nresults=nresults),
-                       self.rec(expr.else_, callables_table=callables_table,
-                                guarding_predicates=(
-                                    guarding_predicates
-                                    | frozenset([prim.LogicalNot(rec_cond)])),
-                                nresults=nresults))
-
-
-# @remove_any_newly_unused_inames
-def realize_reduction_for_single_kernel(kernel, callables_table,
-        insn_id_filter=None, unknown_types_ok=True, automagic_scans_ok=False,
-        force_scan=False, force_outer_iname_for_scan=None):
-    """Rewrites reductions into their imperative form. With *insn_id_filter*
-    specified, operate only on the instruction with an instruction id matching
-    *insn_id_filter*.
-
-    If *insn_id_filter* is given, only the outermost level of reductions will be
-    expanded, inner reductions will be left alone (because they end up in a new
-    instruction with a different ID, which doesn't match the filter).
-
-    If *insn_id_filter* is not given, all reductions in all instructions will
-    be realized.
-
-    If *automagic_scans_ok*, this function will attempt to rewrite triangular
-    reductions as scans automatically.
-
-    If *force_scan* is *True*, this function will attempt to rewrite *all*
-    candidate reductions as scans and raise an error if this is not possible
-    (this is most useful combined with *insn_id_filter*).
-
-    If *force_outer_iname_for_scan* is not *None*, this function will attempt
-    to realize candidate reductions as scans using the specified iname as the
-    outer (sweep) iname.
-    """
-
-    logger.debug("%s: realize reduction" % kernel.name)
-
-    new_insns = []
-    new_iname_tags = {}
-
-    insn_id_gen = kernel.get_instruction_id_generator()
-
-    var_name_gen = kernel.get_var_name_generator()
-    new_temporary_variables = kernel.temporary_variables.copy()
-    inames_added_for_scan = set()
-    inames_to_remove = set()
-
-    # {{{ helpers
-
-    def _strip_if_scalar(reference, val):
-        if len(reference) == 1:
-            return val[0]
-        else:
-            return val
-
-    def preprocess_scan_arguments(
-                insn, expr, nresults, scan_iname, track_iname,
-                newly_generated_insn_id_set):
-        """Does iname substitution within scan arguments and returns a set of values
-        suitable to be passed to the binary op. Returns a tuple."""
-
-        if nresults > 1:
-            inner_expr = expr
-
-            # In the case of a multi-argument scan, we need a name for each of
-            # the arguments in order to pass them to the binary op - so we expand
-            # items that are not "plain" tuples here.
-            if not isinstance(inner_expr, tuple):
-                get_args_insn_id = insn_id_gen(
-                        "{}_{}_get".format(insn.id, "_".join(expr.inames)))
-
-                inner_expr = expand_inner_reduction(
-                        id=get_args_insn_id,
-                        expr=inner_expr,
-                        nresults=nresults,
-                        depends_on=insn.depends_on,
-                        within_inames=insn.within_inames | expr.inames,
-                        within_inames_is_final=insn.within_inames_is_final,
-                        predicates=insn.predicates,
-                        )
-
-                newly_generated_insn_id_set.add(get_args_insn_id)
-
-            updated_inner_exprs = tuple(
-                    replace_var_within_expr(sub_expr, scan_iname, track_iname)
-                    for sub_expr in inner_expr)
-        else:
-            updated_inner_exprs = (
-                    replace_var_within_expr(expr, scan_iname, track_iname),)
-
-        return updated_inner_exprs
-
-    def expand_inner_reduction(id, expr, nresults, depends_on, within_inames,
-            within_inames_is_final, predicates):
-        # FIXME: use make_temporaries
-        from pymbolic.primitives import Call
-        from loopy.symbolic import Reduction
-        assert isinstance(expr, (Call, Reduction))
-
-        temp_var_names = [
-                var_name_gen(id + "_arg" + str(i))
-                for i in range(nresults)]
-
-        for name in temp_var_names:
-            from loopy.kernel.data import TemporaryVariable, AddressSpace
-            new_temporary_variables[name] = TemporaryVariable(
-                    name=name,
-                    shape=(),
-                    dtype=None,
-                    address_space=AddressSpace.PRIVATE)
-
-        from pymbolic import var
-        temp_vars = tuple(var(n) for n in temp_var_names)
-
-        call_insn = make_assignment(
-                id=id,
-                assignees=temp_vars,
-                expression=expr,
-                depends_on=depends_on,
-                within_inames=within_inames,
-                within_inames_is_final=within_inames_is_final,
-                predicates=predicates)
-
-        generated_insns.append(call_insn)
-
-        return temp_vars
-
-    # }}}
-
-    # {{{ sequential
-
-    def map_reduction_seq(expr, rec, callables_table, nresults, arg_dtypes,
-                          reduction_dtypes, guarding_predicates):
-        outer_insn_inames = insn.within_inames
-
-        from loopy.kernel.data import AddressSpace
-        acc_var_names = make_temporaries(
-                name_based_on="acc_"+"_".join(expr.inames),
-                nvars=nresults,
-                shape=(),
-                dtypes=reduction_dtypes,
-                address_space=AddressSpace.PRIVATE)
-
-        init_insn_depends_on = frozenset()
-
-        # check first that the original kernel had global barriers
-        # if not, we don't need to check. Since the function
-        # kernel_has_global_barriers is cached, we don't do
-        # extra work compared to not checking.
-        # FIXME: Explain why we care about global barriers her
-        if kernel_has_global_barriers(kernel):
-            global_barrier = lp.find_most_recent_global_barrier(temp_kernel,
-                    insn.id)
-
-            if global_barrier is not None:
-                init_insn_depends_on |= frozenset([global_barrier])
-
-        from pymbolic import var
-        acc_vars = tuple(var(n) for n in acc_var_names)
-
-        init_id = insn_id_gen(
-                "{}_{}_init".format(insn.id, "_".join(expr.inames)))
-
-        expression, callables_table = expr.operation.neutral_element(
-                *arg_dtypes, callables_table=callables_table, target=kernel.target)
-
-        init_insn = make_assignment(
-                id=init_id,
-                assignees=acc_vars,
-                within_inames=outer_insn_inames - frozenset(expr.inames),
-                within_inames_is_final=insn.within_inames_is_final,
-                depends_on=init_insn_depends_on,
-                expression=expression,
-
-                # Do not inherit predicates: Those might read variables
-                # that may not yet be set, and we don't have a great way
-                # of figuring out what the dependencies of the accumulator
-                # initializer should be.
-
-                # This way, we may initialize a few too many accumulators,
-                # but that's better than being incorrect.
-                # https://github.com/inducer/loopy/issues/231
-                )
-
-        generated_insns.append(init_insn)
-
-        update_id = insn_id_gen(
-                based_on="{}_{}_update".format(insn.id, "_".join(expr.inames)))
-
-        update_insn_iname_deps = insn.within_inames | set(expr.inames)
-        if insn.within_inames_is_final:
-            update_insn_iname_deps = insn.within_inames | set(expr.inames)
-
-        reduction_insn_depends_on = {init_id}
-
-        # In the case of a multi-argument reduction, we need a name for each of
-        # the arguments in order to pass them to the binary op - so we expand
-        # items that are not "plain" tuples here.
-        if nresults > 1 and not isinstance(expr.expr, tuple):
-            get_args_insn_id = insn_id_gen(
-                    "{}_{}_get".format(insn.id, "_".join(expr.inames)))
-
-            reduction_expr = expand_inner_reduction(
-                    id=get_args_insn_id,
-                    expr=expr.expr,
-                    nresults=nresults,
-                    depends_on=insn.depends_on,
-                    within_inames=update_insn_iname_deps,
-                    within_inames_is_final=insn.within_inames_is_final,
-                    predicates=guarding_predicates,
-                    )
-
-            reduction_insn_depends_on.add(get_args_insn_id)
-        else:
-            reduction_expr = expr.expr
-
-        expression, callables_table = expr.operation(
-                arg_dtypes,
-                _strip_if_scalar(acc_vars, acc_vars),
-                reduction_expr,
-                callables_table,
-                kernel.target)
-
-        reduction_insn = make_assignment(
-                id=update_id,
-                assignees=acc_vars,
-                expression=expression,
-                depends_on=frozenset(reduction_insn_depends_on) | insn.depends_on,
-                within_inames=update_insn_iname_deps,
-                within_inames_is_final=insn.within_inames_is_final,
-                predicates=guarding_predicates,)
-
-        generated_insns.append(reduction_insn)
-
-        new_insn_add_depends_on.add(reduction_insn.id)
-
-        if nresults == 1:
-            assert len(acc_vars) == 1
-            return acc_vars[0], callables_table
-        else:
-            return acc_vars, callables_table
-
-    # }}}
-
-    # {{{ local-parallel
-
-    def _get_int_iname_size(iname):
-        from loopy.isl_helpers import static_max_of_pw_aff
-        from loopy.symbolic import pw_aff_to_expr
-        size = pw_aff_to_expr(
-                static_max_of_pw_aff(
-                    kernel.get_iname_bounds(iname).size,
-                    constants_only=True))
-        assert isinstance(size, int)
-        return size
-
-    def _make_slab_set(iname, size):
-        v = isl.make_zero_and_vars([iname])
-        bs, = (
-                v[0].le_set(v[iname])
-                &
-                v[iname].lt_set(v[0] + size)).get_basic_sets()
-        return bs
-
-    def _make_slab_set_from_range(iname, lbound, ubound):
-        v = isl.make_zero_and_vars([iname])
-        bs, = (
-                v[iname].ge_set(v[0] + lbound)
-                &
-                v[iname].lt_set(v[0] + ubound)).get_basic_sets()
-        return bs
-
-    def map_reduction_local(expr, rec, callables_table, nresults, arg_dtypes,
-                            reduction_dtypes, guarding_predicates):
-        red_iname, = expr.inames
-
-        size = _get_int_iname_size(red_iname)
-
-        outer_insn_inames = insn.within_inames
-
-        from loopy.kernel.data import LocalInameTagBase
-        outer_local_inames = tuple(oiname for oiname in outer_insn_inames
-                if kernel.iname_tags_of_type(oiname, LocalInameTagBase))
-
-        from pymbolic import var
-        outer_local_iname_vars = tuple(
-                var(oiname) for oiname in outer_local_inames)
-
-        outer_local_iname_sizes = tuple(
-                _get_int_iname_size(oiname)
-                for oiname in outer_local_inames)
-
-        from loopy.kernel.data import AddressSpace
-
-        neutral_var_names = make_temporaries(
-                name_based_on="neutral_"+red_iname,
-                nvars=nresults,
-                shape=(),
-                dtypes=reduction_dtypes,
-                address_space=AddressSpace.PRIVATE)
-
-        acc_var_names = make_temporaries(
-                name_based_on="acc_"+red_iname,
-                nvars=nresults,
-                shape=outer_local_iname_sizes + (size,),
-                dtypes=reduction_dtypes,
-                address_space=AddressSpace.LOCAL)
-
-        acc_vars = tuple(var(n) for n in acc_var_names)
-
-        # {{{ add separate iname to carry out the reduction
-
-        # Doing this sheds any odd conditionals that may be active
-        # on our red_iname.
-
-        base_exec_iname = var_name_gen("red_"+red_iname)
-        domains.append(_make_slab_set(base_exec_iname, size))
-        new_iname_tags[base_exec_iname] = kernel.iname_tags(red_iname)
-
-        # }}}
-
-        base_iname_deps = outer_insn_inames - frozenset(expr.inames)
-
-        neutral, callables_table = expr.operation.neutral_element(*arg_dtypes,
-                callables_table=callables_table, target=kernel.target)
-        init_id = insn_id_gen(f"{insn.id}_{red_iname}_init")
-        init_insn = make_assignment(
-                id=init_id,
-                assignees=tuple(
-                    acc_var[outer_local_iname_vars + (var(base_exec_iname),)]
-                    for acc_var in acc_vars),
-                expression=neutral,
-                within_inames=base_iname_deps | frozenset([base_exec_iname]),
-                within_inames_is_final=insn.within_inames_is_final,
-                depends_on=frozenset(),
-                # Do not inherit predicates: Those might read variables
-                # that may not yet be set, and we don't have a great way
-                # of figuring out what the dependencies of the accumulator
-                # initializer should be.
-
-                # This way, we may initialize a few too many accumulators,
-                # but that's better than being incorrect.
-                # https://github.com/inducer/loopy/issues/231
-                )
-        generated_insns.append(init_insn)
-
-        init_neutral_id = insn_id_gen(f"{insn.id}_{red_iname}_init_neutral")
-        init_neutral_insn = make_assignment(
-                id=init_neutral_id,
-                assignees=tuple(var(nvn) for nvn in neutral_var_names),
-                expression=neutral,
-                within_inames=base_iname_deps | frozenset([base_exec_iname]),
-                within_inames_is_final=insn.within_inames_is_final,
-                depends_on=frozenset(),
-                predicates=guarding_predicates,
-                )
-        generated_insns.append(init_neutral_insn)
-
-        transfer_depends_on = {init_neutral_id, init_id}
-
-        # In the case of a multi-argument reduction, we need a name for each of
-        # the arguments in order to pass them to the binary op - so we expand
-        # items that are not "plain" tuples here.
-        if nresults > 1 and not isinstance(expr.expr, tuple):
-            get_args_insn_id = insn_id_gen(
-                    f"{insn.id}_{red_iname}_get")
-
-            reduction_expr = expand_inner_reduction(
-                    id=get_args_insn_id,
-                    expr=expr.expr,
-                    nresults=nresults,
-                    depends_on=insn.depends_on,
-                    within_inames=(
-                        (outer_insn_inames - frozenset(expr.inames))
-                        | frozenset([red_iname])),
-                    within_inames_is_final=insn.within_inames_is_final,
-                    predicates=guarding_predicates,
-                    )
-
-            transfer_depends_on.add(get_args_insn_id)
-        else:
-            reduction_expr = expr.expr
-
-        transfer_id = insn_id_gen(f"{insn.id}_{red_iname}_transfer")
-        expression, callables_table = expr.operation(
-                arg_dtypes,
-                _strip_if_scalar(
-                    neutral_var_names,
-                    tuple(var(nvn) for nvn in neutral_var_names)),
-                reduction_expr,
-                callables_table,
-                kernel.target)
-        transfer_insn = make_assignment(
-                id=transfer_id,
-                assignees=tuple(
-                    acc_var[outer_local_iname_vars + (var(red_iname),)]
-                    for acc_var in acc_vars),
-                expression=expression,
-                within_inames=(
-                    (outer_insn_inames - frozenset(expr.inames))
-                    | frozenset([red_iname])),
-                within_inames_is_final=insn.within_inames_is_final,
-                depends_on=frozenset([init_id, init_neutral_id]) | insn.depends_on,
-                no_sync_with=frozenset([(init_id, "any")]),
-                predicates=insn.predicates,
-                )
-        generated_insns.append(transfer_insn)
-
-        cur_size = 1
-        while cur_size < size:
-            cur_size *= 2
-
-        prev_id = transfer_id
-        bound = size
-
-        stage_exec_iname = None
-
-        istage = 0
-        while cur_size > 1:
-
-            new_size = cur_size // 2
-            assert new_size * 2 == cur_size
-
-            stage_exec_iname = var_name_gen("red_%s_s%d" % (red_iname, istage))
-            domains.append(_make_slab_set(stage_exec_iname, bound-new_size))
-            new_iname_tags[stage_exec_iname] = kernel.iname_tags(red_iname)
-
-            stage_id = insn_id_gen("red_%s_stage_%d" % (red_iname, istage))
-            expression, callables_table = expr.operation(
-                    arg_dtypes,
-                    _strip_if_scalar(acc_vars, tuple(
-                        acc_var[
-                            outer_local_iname_vars + (var(stage_exec_iname),)]
-                        for acc_var in acc_vars)),
-                    _strip_if_scalar(acc_vars, tuple(
-                        acc_var[
-                            outer_local_iname_vars + (
-                                var(stage_exec_iname) + new_size,)]
-                        for acc_var in acc_vars)),
-                    callables_table,
-                    kernel.target)
-
-            stage_insn = make_assignment(
-                    id=stage_id,
-                    assignees=tuple(
-                        acc_var[outer_local_iname_vars + (var(stage_exec_iname),)]
-                        for acc_var in acc_vars),
-                    expression=expression,
-                    within_inames=(
-                        base_iname_deps | frozenset([stage_exec_iname])),
-                    within_inames_is_final=insn.within_inames_is_final,
-                    depends_on=frozenset([prev_id]),
-                    predicates=insn.predicates,
-                    )
-
-            generated_insns.append(stage_insn)
-            prev_id = stage_id
-
-            cur_size = new_size
-            bound = cur_size
-            istage += 1
-
-        new_insn_add_depends_on.add(prev_id)
-        new_insn_add_no_sync_with.add((prev_id, "any"))
-        new_insn_add_within_inames.add(stage_exec_iname or base_exec_iname)
-
-        if nresults == 1:
-            assert len(acc_vars) == 1
-            return acc_vars[0][outer_local_iname_vars + (0,)], callables_table
-        else:
-            return [acc_var[outer_local_iname_vars + (0,)] for acc_var in
-                    acc_vars], callables_table
-    # }}}
-
-    # {{{ utils (stateful)
-
-    from pytools import memoize
-
-    @memoize
-    def get_or_add_sweep_tracking_iname_and_domain(
-            scan_iname, sweep_iname, sweep_min_value, scan_min_value, stride,
-            tracking_iname):
-        domain = temp_kernel.get_inames_domain(frozenset((scan_iname, sweep_iname)))
-
-        inames_added_for_scan.add(tracking_iname)
-
-        new_domain = _create_domain_for_sweep_tracking(domain,
-                tracking_iname, sweep_iname, sweep_min_value, scan_min_value, stride)
-
-        _insert_subdomain_into_domain_tree(temp_kernel, domains, new_domain)
-
-        return tracking_iname
-
-    def replace_var_within_expr(expr, from_var, to_var):
-        from pymbolic.mapper.substitutor import make_subst_func
-
-        from loopy.symbolic import (
-            SubstitutionRuleMappingContext, RuleAwareSubstitutionMapper)
-
-        rule_mapping_context = SubstitutionRuleMappingContext(
-            temp_kernel.substitutions, var_name_gen)
-
-        from pymbolic import var
-        mapper = RuleAwareSubstitutionMapper(
-            rule_mapping_context,
-            make_subst_func({from_var: var(to_var)}),
-            within=lambda *args: True)
-
-        return mapper(expr, temp_kernel, None)
-
-    def make_temporaries(name_based_on, nvars, shape, dtypes, address_space):
-        var_names = [
-                var_name_gen(name_based_on.format(index=i))
-                for i in range(nvars)]
-
-        from loopy.kernel.data import TemporaryVariable
-
-        for name, dtype in zip(var_names, dtypes):
-            new_temporary_variables[name] = TemporaryVariable(
-                    name=name,
-                    shape=shape,
-                    dtype=dtype,
-                    address_space=address_space)
-
-        return var_names
-
-    # }}}
-
-    # {{{ sequential scan
-
-    def map_scan_seq(expr, rec, callables_table, nresults, arg_dtypes,
-                     reduction_dtypes, sweep_iname, scan_iname, sweep_min_value,
-                     scan_min_value, stride, guarding_predicates):
-        outer_insn_inames = insn.within_inames
-        inames_to_remove.add(scan_iname)
-
-        track_iname = var_name_gen(
-                "{sweep_iname}__seq_scan"
-                .format(sweep_iname=sweep_iname))
-
-        get_or_add_sweep_tracking_iname_and_domain(
-                scan_iname, sweep_iname, sweep_min_value, scan_min_value,
-                stride, track_iname)
-
-        from loopy.kernel.data import AddressSpace
-        acc_var_names = make_temporaries(
-                name_based_on="acc_" + scan_iname,
-                nvars=nresults,
-                shape=(),
-                dtypes=reduction_dtypes,
-                address_space=AddressSpace.PRIVATE)
-
-        from pymbolic import var
-        acc_vars = tuple(var(n) for n in acc_var_names)
-
-        init_id = insn_id_gen(
-                "{}_{}_init".format(insn.id, "_".join(expr.inames)))
-
-        init_insn_depends_on = frozenset()
-
-        # FIXME: Explain why we care about global barriers here
-        if kernel_has_global_barriers(kernel):
-            global_barrier = lp.find_most_recent_global_barrier(temp_kernel, insn.id)
-
-            if global_barrier is not None:
-                init_insn_depends_on |= frozenset([global_barrier])
-
-        expression, callables_table = expr.operation.neutral_element(
-                *arg_dtypes, callables_table=callables_table, target=kernel.target)
-
-        init_insn = make_assignment(
-                id=init_id,
-                assignees=acc_vars,
-                within_inames=outer_insn_inames - frozenset(
-                    (sweep_iname,) + expr.inames),
-                within_inames_is_final=insn.within_inames_is_final,
-                depends_on=init_insn_depends_on,
-                expression=expression,
-                # Do not inherit predicates: Those might read variables
-                # that may not yet be set, and we don't have a great way
-                # of figuring out what the dependencies of the accumulator
-                # initializer should be.
-
-                # This way, we may initialize a few too many accumulators,
-                # but that's better than being incorrect.
-                # https://github.com/inducer/loopy/issues/231
-                )
-
-        generated_insns.append(init_insn)
-
-        update_insn_depends_on = {init_insn.id} | insn.depends_on
-
-        updated_inner_exprs = (
-                preprocess_scan_arguments(insn, expr.expr, nresults,
-                    scan_iname, track_iname, update_insn_depends_on))
-
-        update_id = insn_id_gen(
-                based_on="{}_{}_update".format(insn.id, "_".join(expr.inames)))
-
-        update_insn_iname_deps = insn.within_inames | {track_iname}
-        if insn.within_inames_is_final:
-            update_insn_iname_deps = insn.within_inames | {track_iname}
-
-        expression, callables_table = expr.operation(
-                arg_dtypes,
-                _strip_if_scalar(acc_vars, acc_vars),
-                _strip_if_scalar(acc_vars, updated_inner_exprs),
-                callables_table,
-                kernel.target)
-
-        scan_insn = make_assignment(
-                id=update_id,
-                assignees=acc_vars,
-                expression=expression,
-                depends_on=frozenset(update_insn_depends_on),
-                within_inames=update_insn_iname_deps,
-                no_sync_with=insn.no_sync_with,
-                within_inames_is_final=insn.within_inames_is_final,
-                predicates=guarding_predicates,
-                )
-
-        generated_insns.append(scan_insn)
-
-        new_insn_add_depends_on.add(scan_insn.id)
-
-        if nresults == 1:
-            assert len(acc_vars) == 1
-            return acc_vars[0], callables_table
-        else:
-            return acc_vars, callables_table
-
-    # }}}
-
-    # {{{ local-parallel scan
-
-    def map_scan_local(expr, rec, callables_table, nresults, arg_dtypes,
-            reduction_dtypes, sweep_iname, scan_iname, sweep_min_value,
-            scan_min_value, stride, guarding_predicates):
-
-        scan_size = _get_int_iname_size(sweep_iname)
-
-        assert scan_size > 0
-
-        if scan_size == 1:
-            return map_reduction_seq(expr, rec, callables_table,
-                                     nresults, arg_dtypes, reduction_dtypes,
-                                     guarding_predicates)
-
-        outer_insn_inames = insn.within_inames
-
-        from loopy.kernel.data import LocalInameTagBase
-        outer_local_inames = tuple(oiname for oiname in outer_insn_inames
-                if kernel.iname_tags_of_type(oiname, LocalInameTagBase)
-                and oiname != sweep_iname)
-
-        from pymbolic import var
-        outer_local_iname_vars = tuple(
-                var(oiname) for oiname in outer_local_inames)
-
-        outer_local_iname_sizes = tuple(
-                _get_int_iname_size(oiname)
-                for oiname in outer_local_inames)
-
-        track_iname = var_name_gen(
-                "{sweep_iname}__pre_scan"
-                .format(sweep_iname=sweep_iname))
-
-        get_or_add_sweep_tracking_iname_and_domain(
-                scan_iname, sweep_iname, sweep_min_value, scan_min_value, stride,
-                track_iname)
-
-        # {{{ add separate iname to carry out the scan
-
-        # Doing this sheds any odd conditionals that may be active
-        # on our scan_iname.
-
-        base_exec_iname = var_name_gen(sweep_iname + "__scan")
-        domains.append(_make_slab_set(base_exec_iname, scan_size))
-        new_iname_tags[base_exec_iname] = kernel.iname_tags(sweep_iname)
-
-        # }}}
-
-        from loopy.kernel.data import AddressSpace
-
-        read_var_names = make_temporaries(
-                name_based_on="read_"+scan_iname+"_arg_{index}",
-                nvars=nresults,
-                shape=(),
-                dtypes=reduction_dtypes,
-                address_space=AddressSpace.PRIVATE)
-
-        acc_var_names = make_temporaries(
-                name_based_on="acc_"+scan_iname,
-                nvars=nresults,
-                shape=outer_local_iname_sizes + (scan_size,),
-                dtypes=reduction_dtypes,
-                address_space=AddressSpace.LOCAL)
-
-        acc_vars = tuple(var(n) for n in acc_var_names)
-        read_vars = tuple(var(n) for n in read_var_names)
-
-        base_iname_deps = (outer_insn_inames
-                - frozenset(expr.inames) - frozenset([sweep_iname]))
-
-        neutral, callables_table = expr.operation.neutral_element(
-                *arg_dtypes, callables_table=callables_table, target=kernel.target)
-
-        init_insn_depends_on = insn.depends_on
-
-        # FIXME: Explain why we care about global barriers here
-        if kernel_has_global_barriers(kernel):
-            global_barrier = lp.find_most_recent_global_barrier(temp_kernel, insn.id)
-
-            if global_barrier is not None:
-                init_insn_depends_on |= frozenset([global_barrier])
-
-        init_id = insn_id_gen(f"{insn.id}_{scan_iname}_init")
-        init_insn = make_assignment(
-                id=init_id,
-                assignees=tuple(
-                    acc_var[outer_local_iname_vars + (var(base_exec_iname),)]
-                    for acc_var in acc_vars),
-                expression=neutral,
-                within_inames=base_iname_deps | frozenset([base_exec_iname]),
-                within_inames_is_final=insn.within_inames_is_final,
-                depends_on=init_insn_depends_on,
-                # Do not inherit predicates: Those might read variables
-                # that may not yet be set, and we don't have a great way
-                # of figuring out what the dependencies of the accumulator
-                # initializer should be.
-
-                # This way, we may initialize a few too many accumulators,
-                # but that's better than being incorrect.
-                # https://github.com/inducer/loopy/issues/231
-                )
-        generated_insns.append(init_insn)
-
-        transfer_insn_depends_on = {init_insn.id} | insn.depends_on
-
-        updated_inner_exprs = (
-                preprocess_scan_arguments(insn, expr.expr, nresults,
-                    scan_iname, track_iname, transfer_insn_depends_on))
-
-        from loopy.symbolic import Reduction
-
-        from loopy.symbolic import pw_aff_to_expr
-        sweep_min_value_expr = pw_aff_to_expr(sweep_min_value)
-
-        transfer_id = insn_id_gen(f"{insn.id}_{scan_iname}_transfer")
-        transfer_insn = make_assignment(
-                id=transfer_id,
-                assignees=tuple(
-                    acc_var[outer_local_iname_vars
-                            + (var(sweep_iname) - sweep_min_value_expr,)]
-                    for acc_var in acc_vars),
-                expression=Reduction(
-                    operation=expr.operation,
-                    inames=(track_iname,),
-                    expr=_strip_if_scalar(acc_vars, updated_inner_exprs),
-                    allow_simultaneous=False,
-                    ),
-                within_inames=outer_insn_inames - frozenset(expr.inames),
-                within_inames_is_final=insn.within_inames_is_final,
-                depends_on=frozenset(transfer_insn_depends_on),
-                no_sync_with=frozenset([(init_id, "any")]) | insn.no_sync_with,
-                predicates=insn.predicates,
-                )
-
-        generated_insns.append(transfer_insn)
-
-        prev_id = transfer_id
-
-        istage = 0
-        cur_size = 1
-
-        while cur_size < scan_size:
-            stage_exec_iname = var_name_gen("%s__scan_s%d" % (sweep_iname, istage))
-            domains.append(
-                    _make_slab_set_from_range(stage_exec_iname, cur_size, scan_size))
-            new_iname_tags[stage_exec_iname] = kernel.iname_tags(sweep_iname)
-
-            for read_var, acc_var in zip(read_vars, acc_vars):
-                read_stage_id = insn_id_gen(
-                        "scan_%s_read_stage_%d" % (scan_iname, istage))
-
-                read_stage_insn = make_assignment(
-                        id=read_stage_id,
-                        assignees=(read_var,),
-                        expression=(
-                                acc_var[
-                                    outer_local_iname_vars
-                                    + (var(stage_exec_iname) - cur_size,)]),
-                        within_inames=(
-                            base_iname_deps | frozenset([stage_exec_iname])),
-                        within_inames_is_final=insn.within_inames_is_final,
-                        depends_on=frozenset([prev_id]),
-                        predicates=insn.predicates,
-                        )
-
-                if cur_size == 1:
-                    # Performance hack: don't add a barrier here with transfer_insn.
-                    # NOTE: This won't work if the way that local inames
-                    # are lowered changes.
-                    read_stage_insn = read_stage_insn.copy(
-                            no_sync_with=(
-                                read_stage_insn.no_sync_with
-                                | frozenset([(transfer_id, "any")])))
-
-                generated_insns.append(read_stage_insn)
-                prev_id = read_stage_id
-
-            write_stage_id = insn_id_gen(
-                    "scan_%s_write_stage_%d" % (scan_iname, istage))
-
-            expression, callables_table = expr.operation(
-                arg_dtypes,
-                _strip_if_scalar(acc_vars, read_vars),
-                _strip_if_scalar(acc_vars, tuple(
-                    acc_var[
-                        outer_local_iname_vars + (var(stage_exec_iname),)]
-                    for acc_var in acc_vars)),
-                callables_table,
-                kernel.target)
-
-            write_stage_insn = make_assignment(
-                    id=write_stage_id,
-                    assignees=tuple(
-                        acc_var[outer_local_iname_vars + (var(stage_exec_iname),)]
-                        for acc_var in acc_vars),
-                    expression=expression,
-                    within_inames=(
-                        base_iname_deps | frozenset([stage_exec_iname])),
-                    within_inames_is_final=insn.within_inames_is_final,
-                    depends_on=frozenset([prev_id]),
-                    predicates=insn.predicates,
-                    )
-
-            generated_insns.append(write_stage_insn)
-            prev_id = write_stage_id
-
-            cur_size *= 2
-            istage += 1
-
-        new_insn_add_depends_on.add(prev_id)
-        new_insn_add_within_inames.add(sweep_iname)
-
-        output_idx = var(sweep_iname) - sweep_min_value_expr
-
-        if nresults == 1:
-            assert len(acc_vars) == 1
-            return (acc_vars[0][outer_local_iname_vars + (output_idx,)],
-                    callables_table)
-        else:
-            return [acc_var[outer_local_iname_vars + (output_idx,)]
-                    for acc_var in acc_vars], callables_table
-
-    # }}}
-
-    # {{{ seq/par dispatch
-
-    def map_reduction(expr, rec, callables_table,
-                      guarding_predicates, nresults=1):
-        nonlocal insn_changed
-
-        # Only expand one level of reduction at a time, going from outermost to
-        # innermost. Otherwise we get the (iname + insn) dependencies wrong.
-
-        from loopy.type_inference import (
-                infer_arg_and_reduction_dtypes_for_reduction_expression)
-        arg_dtypes, reduction_dtypes = (
-                infer_arg_and_reduction_dtypes_for_reduction_expression(
-                    temp_kernel, expr, callables_table, unknown_types_ok))
-
-        outer_insn_inames = insn.within_inames
-        bad_inames = frozenset(expr.inames) & outer_insn_inames
-        if bad_inames:
-            raise LoopyError("reduction used within loop(s) that it was "
-                    "supposed to reduce over: " + ", ".join(bad_inames))
-
-        iname_classes = _classify_reduction_inames(temp_kernel, expr.inames)
-
-        n_sequential = len(iname_classes.sequential)
-        n_local_par = len(iname_classes.local_parallel)
-        n_nonlocal_par = len(iname_classes.nonlocal_parallel)
-
-        really_force_scan = force_scan and (
-                len(expr.inames) != 1 or expr.inames[0] not in inames_added_for_scan)
-
-        def _error_if_force_scan_on(cls, msg):
-            if really_force_scan:
-                raise cls(msg)
-
-        may_be_implemented_as_scan = False
-        if force_scan or automagic_scans_ok:
-            from loopy.diagnostic import ReductionIsNotTriangularError
-
-            try:
-                # Try to determine scan candidate information (sweep iname, scan
-                # iname, etc).
-                scan_param = _try_infer_scan_candidate_from_expr(
-                        temp_kernel, expr, outer_insn_inames,
-                        sweep_iname=force_outer_iname_for_scan)
-
-            except ValueError as v:
-                error = str(v)
-
-            else:
-                # Ensures the reduction is triangular (somewhat expensive).
-                may_be_implemented_as_scan, error = (
-                        _check_reduction_is_triangular(
-                            temp_kernel, expr, scan_param))
-
-            if not may_be_implemented_as_scan:
-                _error_if_force_scan_on(ReductionIsNotTriangularError, error)
-
-        # {{{ sanity checks
-
-        if n_local_par and n_sequential:
-            raise LoopyError("Reduction over '%s' contains both parallel and "
-                    "sequential inames. It must be split "
-                    "(using split_reduction_{in,out}ward) "
-                    "before code generation."
-                    % ", ".join(expr.inames))
-
-        if n_local_par > 1:
-            raise LoopyError("Reduction over '%s' contains more than"
-                    "one parallel iname. It must be split "
-                    "(using split_reduction_{in,out}ward) "
-                    "before code generation."
-                    % ", ".join(expr.inames))
-
-        if n_nonlocal_par:
-            bad_inames = iname_classes.nonlocal_parallel
-            raise LoopyError("the only form of parallelism supported "
-                    "by reductions is 'local'--found iname(s) '%s' "
-                    "respectively tagged '%s'"
-                    % (", ".join(bad_inames),
-                       ", ".join(str(kernel.iname_tags(iname))
-                                 for iname in bad_inames)))
-
-        # }}}
-
-        insn_changed = True
-
-        if n_local_par == 0 and n_sequential == 0:
-            from loopy.diagnostic import warn_with_kernel
-            warn_with_kernel(kernel, "empty_reduction",
-                    "Empty reduction found (no inames to reduce over). "
-                    "Eliminating.")
-
-            # We're not supposed to reduce/sum at all. (Note how this is distinct
-            # from an empty reduction--there is an element here, just no inames
-            # to reduce over. It's rather similar to an array with () shape in
-            # numpy.)
-
-            return expr.expr, callables_table
-
-        if may_be_implemented_as_scan:
-            assert force_scan or automagic_scans_ok
-
-            # We require the "scan" iname to be tagged sequential.
-            if n_sequential:
-                sweep_iname = scan_param.sweep_iname
-                sweep_class = _classify_reduction_inames(kernel, (sweep_iname,))
-
-                sequential = sweep_iname in sweep_class.sequential
-                parallel = sweep_iname in sweep_class.local_parallel
-                bad_parallel = sweep_iname in sweep_class.nonlocal_parallel
-
-                if sweep_iname not in outer_insn_inames:
-                    _error_if_force_scan_on(LoopyError,
-                            "Sweep iname '%s' was detected, but is not an iname "
-                            "for the instruction." % sweep_iname)
-                elif bad_parallel:
-                    _error_if_force_scan_on(LoopyError,
-                            "Sweep iname '%s' has an unsupported parallel tag '%s' "
-                            "- the only parallelism allowed is 'local'." %
-                            (sweep_iname,
-                             ", ".join(tag.key
-                            for tag in temp_kernel.iname_tags(sweep_iname))))
-                elif parallel:
-                    return map_scan_local(
-                            expr, rec, callables_table, nresults,
-                            arg_dtypes, reduction_dtypes,
-                            sweep_iname, scan_param.scan_iname,
-                            scan_param.sweep_lower_bound,
-                            scan_param.scan_lower_bound,
-                            scan_param.stride,
-                            guarding_predicates)
-                elif sequential:
-                    return map_scan_seq(
-                            expr, rec, callables_table, nresults,
-                            arg_dtypes, reduction_dtypes, sweep_iname,
-                            scan_param.scan_iname,
-                            scan_param.sweep_lower_bound,
-                            scan_param.scan_lower_bound,
-                            scan_param.stride,
-                            guarding_predicates)
-
-                # fallthrough to reduction implementation
-
-            else:
-                assert n_local_par > 0
-                scan_iname, = expr.inames
-                _error_if_force_scan_on(LoopyError,
-                        "Scan iname '%s' is parallel tagged: this is not allowed "
-                        "(only the sweep iname should be tagged if parallelism "
-                        "is desired)." % scan_iname)
-
-                # fallthrough to reduction implementation
-
-        if n_sequential:
-            assert n_local_par == 0
-            return map_reduction_seq(expr, rec, callables_table,
-                                     nresults, arg_dtypes, reduction_dtypes,
-                                     guarding_predicates)
-        else:
-            assert n_local_par > 0
-            return map_reduction_local(
-                    expr, rec, callables_table, nresults, arg_dtypes,
-                    reduction_dtypes, guarding_predicates)
-
-    # }}}
-
-    cb_mapper = RealizeReductionCallbackMapper(map_reduction, callables_table)
-
-    insn_queue = kernel.instructions[:]
-    insn_id_replacements = {}
-    domains = kernel.domains[:]
-
-    temp_kernel = kernel
-    kernel_changed = False
-
-    import loopy as lp
-    while insn_queue:
-        new_insn_add_depends_on = set()
-        new_insn_add_no_sync_with = set()
-        new_insn_add_within_inames = set()
-
-        generated_insns = []
-        insn_changed = False
-
-        insn = insn_queue.pop(0)
-
-        if insn_id_filter is not None and insn.id != insn_id_filter \
-                or not isinstance(insn, lp.MultiAssignmentBase):
-            new_insns.append(insn)
-            continue
-
-        nresults = len(insn.assignees)
-
-        # Run reduction expansion.
-        from loopy.symbolic import Reduction
-        if isinstance(insn.expression, Reduction) and nresults > 1:
-            new_expressions = cb_mapper(insn.expression,
-                    callables_table=cb_mapper.callables_table,
-                    guarding_predicates=insn.predicates,
-                    nresults=nresults)
-        else:
-            new_expressions = cb_mapper(insn.expression,
-                                        callables_table=cb_mapper.callables_table,
-                                        guarding_predicates=insn.predicates),
-
-        if insn_changed:
-            # An expansion happened, so insert the generated stuff plus
-            # ourselves back into the queue.
-
-            result_assignment_dep_on = \
-                    insn.depends_on | frozenset(new_insn_add_depends_on)
-            kwargs = insn.get_copy_kwargs(
-                    no_sync_with=insn.no_sync_with
-                    | frozenset(new_insn_add_no_sync_with),
-                    within_inames=(
-                        insn.within_inames
-                        | new_insn_add_within_inames))
-
-            kwargs.pop("id")
-            kwargs.pop("depends_on")
-            kwargs.pop("expression")
-            kwargs.pop("assignee", None)
-            kwargs.pop("assignees", None)
-            kwargs.pop("temp_var_type", None)
-            kwargs.pop("temp_var_types", None)
-
-            if isinstance(insn.expression, Reduction) and nresults > 1:
-                result_assignment_ids = [
-                        insn_id_gen(insn.id) for i in range(nresults)]
-                replacement_insns = [
-                        lp.Assignment(
-                            id=result_assignment_ids[i],
-                            depends_on=(
-                                result_assignment_dep_on
-                                | (frozenset([result_assignment_ids[i-1]])
-                                    if i else frozenset())),
-                            assignee=assignee,
-                            expression=new_expr,
-                            **kwargs)
-                        for i, (assignee, new_expr) in enumerate(zip(
-                            insn.assignees, new_expressions))]
-
-                insn_id_replacements[insn.id] = [
-                    rinsn.id for rinsn in replacement_insns]
-            else:
-                new_expr, = new_expressions
-                # since we are replacing the instruction with
-                # only one instruction, there's no need to replace id
-                replacement_insns = [
-                        make_assignment(
-                            id=insn.id,
-                            depends_on=result_assignment_dep_on,
-                            assignees=insn.assignees,
-                            expression=new_expr,
-                            **kwargs)
-                        ]
-
-            insn_queue = generated_insns + replacement_insns + insn_queue
-
-            # The reduction expander needs an up-to-date kernel
-            # object to find dependencies. Keep temp_kernel up-to-date.
-
-            temp_kernel = kernel.copy(
-                    instructions=new_insns + insn_queue,
-                    temporary_variables=new_temporary_variables,
-                    domains=domains)
-            temp_kernel = lp.replace_instruction_ids(
-                    temp_kernel, insn_id_replacements)
-            kernel_changed = True
-        else:
-            # nothing happened, we're done with insn
-            assert not new_insn_add_depends_on
-
-            new_insns.append(insn)
-
-    if kernel_changed:
-        kernel = kernel.copy(
-            instructions=new_insns,
-            temporary_variables=new_temporary_variables,
-            domains=domains)
-
-    kernel = lp.replace_instruction_ids(kernel, insn_id_replacements)
-
-    from loopy.transform.iname import tag_inames
-    kernel = tag_inames(kernel, new_iname_tags)
-
-    kernel = (
-            _hackily_ensure_multi_assignment_return_values_are_scoped_private(
-                kernel))
-
-    return kernel, cb_mapper.callables_table
-
-
-def realize_reduction(program, *args, **kwargs):
-    assert isinstance(program, TranslationUnit)
-
-    callables_table = dict(program.callables_table)
-    kernels_to_scan = [in_knl_callable.subkernel
-            for in_knl_callable in program.callables_table.values()
-            if isinstance(in_knl_callable, CallableKernel)]
-
-    for knl in kernels_to_scan:
-        new_knl, callables_table = realize_reduction_for_single_kernel(
-                knl, callables_table, *args, **kwargs)
-        in_knl_callable = callables_table[knl.name].copy(
-                subkernel=new_knl)
-        callables_table[knl.name] = in_knl_callable
-
-    return program.copy(callables_table=callables_table)
-
-# }}}
-
-
 # {{{ realize_ilp
 
 def realize_ilp(kernel):
@@ -2449,6 +639,7 @@ def preprocess_program(program, device=None):
     #   because it manipulates the depends_on field, which could prevent
     #   defaults from being applied.
 
+    from loopy.transform.realize_reduction import realize_reduction
     program = realize_reduction(program, unknown_types_ok=False)
 
     # {{{ preprocess callable kernels
diff --git a/loopy/transform/realize_reduction.py b/loopy/transform/realize_reduction.py
new file mode 100644
index 000000000..c02c05fdf
--- /dev/null
+++ b/loopy/transform/realize_reduction.py
@@ -0,0 +1,2053 @@
+__copyright__ = """
+Copyright (C) 2012 Andreas Kloeckner
+Copyright (C) 2022 University of Illinois Board of Trustees
+"""
+
+__license__ = """
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+
+
+from dataclasses import dataclass
+from typing import Tuple, Dict, Callable, List, Optional, Set, Sequence
+
+import logging
+logger = logging.getLogger(__name__)
+
+from pytools import memoize_on_first_arg
+from pytools.tag import Tag
+import islpy as isl
+
+from loopy.kernel.data import make_assignment
+from loopy.kernel.tools import (
+        kernel_has_global_barriers, find_most_recent_global_barrier)
+from loopy.symbolic import ReductionCallbackMapper
+from loopy.translation_unit import TranslationUnit
+from loopy.kernel.function_interface import CallableKernel
+from loopy.kernel.data import TemporaryVariable, AddressSpace
+from loopy.kernel.instruction import (
+        InstructionBase, MultiAssignmentBase, Assignment)
+from loopy.kernel import LoopKernel
+from loopy.diagnostic import (
+        LoopyError, warn_with_kernel, ReductionIsNotTriangularError)
+from loopy.transform.instruction import replace_instruction_ids_in_insn
+
+
+# {{{ reduction realization context
+
+@dataclass(frozen=True)
+class _ReductionRealizationContext:
+    # {{{ read-only
+
+    force_scan: bool
+    automagic_scans_ok: bool
+    unknown_types_ok: bool
+
+    # FIXME: This feels like a broken-by-design concept
+    force_outer_iname_for_scan: Optional[str]
+
+    # We use the original kernel for a number of lookups whose value
+    # we do not change and which might be already cached on it.
+    orig_kernel: LoopKernel
+
+    kernel: LoopKernel
+
+    # FIXME: This shouldn't be here. We might generate multiple instructions
+    # in a nested manner. Why should the 'top-level' instruction be special?
+    insn: InstructionBase
+
+    # }}}
+
+    # {{{ internally mutable
+
+    insn_id_gen: Callable[[str], str]
+    var_name_gen: Callable[[str], str]
+
+    additional_temporary_variables: Dict[str, TemporaryVariable]
+    additional_insns: List[InstructionBase]
+    domains: List[isl.BasicSet]
+    additional_iname_tags: Dict[str, Sequence[Tag]]
+
+    # FIXME: This is a broken-by-design concept. Local-parallel scans emit a
+    # reduction internally. This serves to avoid force_scan acting on that
+    # reduction.
+    inames_added_for_scan: Set[str]
+
+    # FIXME: Clarify how these relate to recursively generated instructions.
+    new_insn_add_depends_on: Set[str]
+    new_insn_add_no_sync_with: Set[Tuple[str, str]]
+    new_insn_add_within_inames: Set[str]
+
+    # }}}
+
+    # {{{ change tracking
+
+    were_changes_made: bool
+
+    def changes_made(self):
+        object.__setattr__(self, "were_changes_made", True)
+
+    # }}}
+
+# }}}
+
+
+# {{{ iname/domain wrangling
+
+@dataclass(frozen=True)
+class _InameClassification:
+    sequential: Tuple[str, ...]
+    local_parallel: Tuple[str, ...]
+    nonlocal_parallel: Tuple[str, ...]
+
+
+def _classify_reduction_inames(kernel, inames):
+    sequential = []
+    local_par = []
+    nonlocal_par = []
+
+    from loopy.kernel.data import (
+            LocalInameTagBase, UnrolledIlpTag, UnrollTag,
+            ConcurrentTag, filter_iname_tags_by_type)
+
+    for iname in inames:
+        iname_tags = kernel.iname_tags(iname)
+
+        if filter_iname_tags_by_type(iname_tags, (UnrollTag, UnrolledIlpTag)):
+            # These are nominally parallel, but we can live with
+            # them as sequential.
+            sequential.append(iname)
+
+        elif filter_iname_tags_by_type(iname_tags, LocalInameTagBase):
+            local_par.append(iname)
+
+        elif filter_iname_tags_by_type(iname_tags, ConcurrentTag):
+            nonlocal_par.append(iname)
+
+        else:
+            sequential.append(iname)
+
+    return _InameClassification(
+            tuple(sequential), tuple(local_par), tuple(nonlocal_par))
+
+
+def _add_params_to_domain(domain, param_names):
+    dim_type = isl.dim_type
+    nparams_orig = domain.dim(dim_type.param)
+    domain = domain.add_dims(dim_type.param, len(param_names))
+
+    for param_idx, param_name in enumerate(param_names):
+        domain = domain.set_dim_name(
+                dim_type.param, param_idx + nparams_orig, param_name)
+
+    return domain
+
+
+def _move_set_to_param_dims_except(domain, except_dims):
+    dim_type = isl.dim_type
+
+    iname_idx = 0
+    for iname in domain.get_var_names(dim_type.set):
+        if iname not in except_dims:
+            domain = domain.move_dims(
+                    dim_type.param, 0,
+                    dim_type.set, iname_idx, 1)
+            iname_idx -= 1
+        iname_idx += 1
+
+    return domain
+
+
+def _domain_depends_on_given_set_dims(domain, set_dim_names):
+    set_dim_names = frozenset(set_dim_names)
+
+    return any(
+            set_dim_names & set(constr.get_coefficients_by_name())
+            for constr in domain.get_constraints())
+
+
+def _insert_subdomain_into_domain_tree(kernel, domains, subdomain):
+    # Intersect with inames, because we could have captured some kernel params
+    # in here too...
+    dependent_inames = (
+            frozenset(subdomain.get_var_names(isl.dim_type.param))
+            & kernel.all_inames())
+    idx, = kernel.get_leaf_domain_indices(dependent_inames)
+    domains.insert(idx + 1, subdomain)
+
+# }}}
+
+
+# {{{ scan inference
+
+def _check_reduction_is_triangular(kernel, expr, scan_param):
+    """Check whether the reduction within `expr` with scan parameters described by
+    the structure `scan_param` is triangular. This attempts to verify that the
+    domain for the scan and sweep inames is as follows:
+
+    [params] -> {
+        [other inames..., scan_iname, sweep_iname]:
+            (sweep_min_value
+                <= sweep_iname
+                <= sweep_max_value)
+            and
+            (scan_min_value
+                <= scan_iname
+                <= stride * (sweep_iname - sweep_min_value) + scan_min_value)
+            and
+            (irrelevant constraints)
+    }
+    """
+
+    orig_domain = kernel.get_inames_domain(
+            frozenset((scan_param.sweep_iname, scan_param.scan_iname)))
+
+    sweep_iname = scan_param.sweep_iname
+    scan_iname = scan_param.scan_iname
+    affs = isl.affs_from_space(orig_domain.space)
+
+    sweep_lower_bound = isl.align_spaces(
+            scan_param.sweep_lower_bound,
+            affs[0])
+
+    sweep_upper_bound = isl.align_spaces(
+            scan_param.sweep_upper_bound,
+            affs[0])
+
+    scan_lower_bound = isl.align_spaces(
+            scan_param.scan_lower_bound,
+            affs[0])
+
+    from itertools import product
+
+    for (sweep_lb_domain, sweep_lb_aff), \
+        (sweep_ub_domain, sweep_ub_aff), \
+        (scan_lb_domain, scan_lb_aff) in \
+            product(sweep_lower_bound.get_pieces(),
+                    sweep_upper_bound.get_pieces(),
+                    scan_lower_bound.get_pieces()):
+
+        # Assumptions inherited from the domains of the pwaffs
+        assumptions = sweep_lb_domain & sweep_ub_domain & scan_lb_domain
+
+        # Sweep iname constraints
+        hyp_domain = affs[sweep_iname].ge_set(sweep_lb_aff)
+        hyp_domain &= affs[sweep_iname].le_set(sweep_ub_aff)
+
+        # Scan iname constraints
+        hyp_domain &= affs[scan_iname].ge_set(scan_lb_aff)
+        hyp_domain &= affs[scan_iname].le_set(
+                scan_param.stride * (affs[sweep_iname] - sweep_lb_aff)
+                + scan_lb_aff)
+
+        hyp_domain, = (hyp_domain & assumptions).get_basic_sets()
+        test_domain, = (orig_domain & assumptions).get_basic_sets()
+
+        hyp_gist_against_test = hyp_domain.gist(test_domain)
+        if _domain_depends_on_given_set_dims(hyp_gist_against_test,
+                (sweep_iname, scan_iname)):
+            return False, (
+                    "gist of hypothesis against test domain "
+                    "has sweep or scan dependent constraints: '%s'"
+                    % hyp_gist_against_test)
+
+        test_gist_against_hyp = test_domain.gist(hyp_domain)
+        if _domain_depends_on_given_set_dims(test_gist_against_hyp,
+                (sweep_iname, scan_iname)):
+            return False, (
+                   "gist of test against hypothesis domain "
+                   "has sweep or scan dependent constraint: '%s'"
+                   % test_gist_against_hyp)
+
+    return True, "ok"
+
+
+@dataclass(frozen=True)
+class _ScanCandidateParameters:
+    sweep_iname: str
+    scan_iname: str
+    sweep_lower_bound: isl.PwAff
+    sweep_upper_bound: isl.PwAff
+    scan_lower_bound: isl.PwAff
+    stride: int
+
+
+def _try_infer_scan_candidate_from_expr(
+        kernel, expr, within_inames, sweep_iname=None):
+    """Analyze `expr` and determine if it can be implemented as a scan.
+    """
+    from loopy.symbolic import Reduction
+    assert isinstance(expr, Reduction)
+
+    if len(expr.inames) != 1:
+        raise ValueError(
+                "Multiple inames in reduction: '{}'".format(", ".join(expr.inames)))
+
+    scan_iname, = expr.inames
+
+    from loopy.kernel.tools import DomainChanger
+    dchg = DomainChanger(kernel, (scan_iname,))
+    domain = dchg.get_original_domain()
+    # Inference failures below are re-raised with context (chained via 'from').
+    if sweep_iname is None:
+        try:
+            sweep_iname = _try_infer_sweep_iname(
+                    domain, scan_iname, kernel.all_inames())
+        except ValueError as v:
+            raise ValueError(
+                    "Couldn't determine a sweep iname for the scan "
+                    "expression '%s': %s" % (expr, v)) from v
+
+    try:
+        sweep_lower_bound, sweep_upper_bound, scan_lower_bound = (
+                _try_infer_scan_and_sweep_bounds(
+                    kernel, scan_iname, sweep_iname, within_inames))
+    except ValueError as v:
+        raise ValueError(
+                "Couldn't determine bounds for the scan with expression '%s' "
+                "(sweep iname: '%s', scan iname: '%s'): %s"
+                % (expr, sweep_iname, scan_iname, v)) from v
+
+    try:
+        stride = _try_infer_scan_stride(
+                kernel, scan_iname, sweep_iname, sweep_lower_bound)
+    except ValueError as v:
+        raise ValueError(
+                "Couldn't determine a scan stride for the scan with expression '%s' "
+                "(sweep iname: '%s', scan iname: '%s'): %s"
+                % (expr, sweep_iname, scan_iname, v)) from v
+
+    return _ScanCandidateParameters(sweep_iname, scan_iname, sweep_lower_bound,
+            sweep_upper_bound, scan_lower_bound, stride)
+
+
+def _try_infer_sweep_iname(domain, scan_iname, candidate_inames):
+    """The sweep iname is the outer iname which guides the scan.
+
+    E.g. for a domain of {[i,j]: 0<=i<n and 0<=j<=i}, i is the sweep iname.
+
+    :raises ValueError: if no candidate, or more than one, is found.
+    """
+    sweep_iname = None
+    first_constr = None
+
+    for constr in domain.get_constraints():
+        # Names in this constraint that are also in *candidate_inames*.
+        others = {
+                name for name in constr.get_var_dict()
+                if name in candidate_inames}
+
+        if scan_iname not in others:
+            # Constraint does not involve the scan iname.
+            continue
+
+        others.discard(scan_iname)
+
+        if not others:
+            # Constraint involves no further candidate inames.
+            continue
+
+        if len(others) > 1:
+            raise ValueError(
+                    "More than one sweep iname candidate for scan iname '%s' found "
+                    "(via constraint '%s')" % (scan_iname, constr))
+
+        found, = others
+
+        if sweep_iname is None:
+            sweep_iname, first_constr = found, constr
+        elif sweep_iname != found:
+            # An earlier constraint already pointed at a different iname.
+            raise ValueError(
+                    "More than one sweep iname candidate for scan iname '%s' "
+                    "found (via constraints '%s', '%s')" %
+                    (scan_iname, first_constr, constr))
+
+    if sweep_iname is None:
+        raise ValueError(
+                "Couldn't find any sweep iname candidates for "
+                "scan iname '%s'" % scan_iname)
+
+    return sweep_iname
+
+
+def _try_infer_scan_and_sweep_bounds(kernel, scan_iname, sweep_iname, within_inames):
+    """Return ``(sweep_min, sweep_max, scan_min)``; isl errors become ValueError."""
+    domain = kernel.get_inames_domain(frozenset((sweep_iname, scan_iname)))
+    domain = _move_set_to_param_dims_except(domain, (sweep_iname, scan_iname))
+
+    var_dict = domain.get_var_dict()
+    sweep_idx = var_dict[sweep_iname][1]
+    scan_idx = var_dict[scan_iname][1]
+    domain = domain.project_out_except(
+            within_inames | kernel.non_iname_variable_names(), (isl.dim_type.param,))
+
+    try:
+        with isl.SuppressedWarnings(domain.get_ctx()):
+            sweep_lower_bound = domain.dim_min(sweep_idx)
+            sweep_upper_bound = domain.dim_max(sweep_idx)
+            scan_lower_bound = domain.dim_min(scan_idx)
+    except isl.Error as e:
+        raise ValueError("isl error: %s" % e) from e
+
+    return (sweep_lower_bound, sweep_upper_bound, scan_lower_bound)
+
+
+def _try_infer_scan_stride(kernel, scan_iname, sweep_iname, sweep_lower_bound):
+    """The stride is the number of steps the scan iname takes per iteration
+    of the sweep iname. This is allowed to be an integer constant.
+
+    E.g. for a domain of {[i,j]: 0<=i<n and 0<=j<=6*i}, the stride is 6.
+
+    :raises ValueError: if no positive integer stride can be inferred.
+    """
+    dim_type = isl.dim_type
+
+    domain = kernel.get_inames_domain(frozenset([sweep_iname, scan_iname]))
+    domain_with_sweep_param = _move_set_to_param_dims_except(domain, (scan_iname,))
+    domain_with_sweep_param = domain_with_sweep_param.project_out_except(
+            (sweep_iname, scan_iname), (dim_type.set, dim_type.param))
+    scan_iname_idx = domain_with_sweep_param.find_dim_by_name(
+            dim_type.set, scan_iname)
+
+    # Should be equal to k * sweep_iname, where k is the stride.
+    try:
+        with isl.SuppressedWarnings(domain_with_sweep_param.get_ctx()):
+            scan_iname_range = (
+                    domain_with_sweep_param.dim_max(scan_iname_idx)
+                    - domain_with_sweep_param.dim_min(scan_iname_idx)
+                    ).gist(domain_with_sweep_param.params())
+    except isl.Error as e:
+        raise ValueError("isl error: '%s'" % e) from e
+
+    scan_iname_pieces = scan_iname_range.get_pieces()
+
+    if len(scan_iname_pieces) > 1:
+        raise ValueError("range in multiple pieces: %s" % scan_iname_range)
+    elif len(scan_iname_pieces) == 0:
+        raise ValueError("empty range found for iname '%s'" % scan_iname)
+
+    scan_iname_constr, scan_iname_aff = scan_iname_pieces[0]
+
+    if not scan_iname_constr.plain_is_universe():
+        raise ValueError("found constraints: %s" % scan_iname_constr)
+
+    if scan_iname_aff.dim(dim_type.div):
+        raise ValueError("aff has div: %s" % scan_iname_aff)
+
+    coeffs = scan_iname_aff.get_coefficients_by_name(dim_type.param)
+
+    if len(coeffs) == 0:
+        try:
+            scan_iname_aff.get_constant_val()
+        except Exception as e:
+            raise ValueError(
+                    "range for aff isn't constant: '%s'" % scan_iname_aff) from e
+
+        # If this point is reached we're assuming the domain is of the form
+        # {[i,j]: i=0 and j=0}, so the stride is technically 1 - any value
+        # this function returns will be verified later by
+        # _check_reduction_is_triangular().
+        return 1
+
+    if sweep_iname not in coeffs:
+        raise ValueError("didn't find sweep iname in coeffs: %s" % sweep_iname)
+
+    stride = coeffs[sweep_iname]
+
+    if not stride.is_int():
+        raise ValueError("stride not an integer: %s" % stride)
+
+    if not stride.is_pos():
+        raise ValueError("stride not positive: %s" % stride)
+
+    return stride.to_python()
+
+# }}}
+
+
+# {{{ domain creation for scans
+
+def _get_domain_with_iname_as_param(domain, iname):
+    """Return *domain* with *iname* as a parameter dim, moving it if needed."""
+    param, set_ = isl.dim_type.param, isl.dim_type.set
+
+    if domain.find_dim_by_name(param, iname) >= 0:
+        return domain
+
+    set_pos = domain.find_dim_by_name(set_, iname)
+    assert set_pos >= 0, (iname, domain)
+
+    # Append the iname after the existing parameters.
+    n_params = domain.dim(param)
+    return domain.move_dims(param, n_params, set_, set_pos, 1)
+
+
+def _create_domain_for_sweep_tracking(orig_domain,
+        tracking_iname, sweep_iname, sweep_min_value, scan_min_value, stride):
+    """Return the basic set over *tracking_iname* realizing the domain below."""
+    dim_type = isl.dim_type
+
+    subd = isl.BasicSet.universe(orig_domain.params().space)
+
+    # Add tracking_iname and sweep iname.
+    subd = _add_params_to_domain(subd, (sweep_iname, tracking_iname))
+
+    # Here we realize the domain:
+    #
+    # [..., i] -> {
+    #  [j]: 0 <= j - l
+    #       and
+    #       j - l <= k * (i - m)
+    #       and
+    #       k * (i - m - 1) < j - l }
+    # where
+    #   * i is the sweep iname
+    #   * j is the tracking iname
+    #   * k is the stride for the scan
+    #   * l is the lower bound for the scan
+    #   * m is the lower bound for the sweep iname
+    #
+    affs = isl.affs_from_space(subd.space)
+
+    subd &= (affs[tracking_iname] - scan_min_value).ge_set(affs[0])
+    subd &= (affs[tracking_iname] - scan_min_value)\
+            .le_set(stride * (affs[sweep_iname] - sweep_min_value))
+    subd &= (affs[tracking_iname] - scan_min_value)\
+            .gt_set(stride * (affs[sweep_iname] - sweep_min_value - 1))
+
+    # Move tracking_iname into a set dim (NOT sweep iname).
+    subd = subd.move_dims(
+            dim_type.set, 0,
+            dim_type.param, subd.dim(dim_type.param) - 1, 1)
+
+    # Simplify (maybe).
+    orig_domain_with_sweep_param = (
+            _get_domain_with_iname_as_param(orig_domain, sweep_iname))
+    subd = subd.gist_params(orig_domain_with_sweep_param.params())
+    # Unpacking requires the result to consist of exactly one basic set.
+    subd, = subd.get_basic_sets()
+
+    return subd
+
+# }}}
+
+
+# {{{ _hackily_ensure_multi_assignment_return_values_are_scoped_private
+
+def _hackily_ensure_multi_assignment_return_values_are_scoped_private(kernel):
+    """
+    Multi assignment function calls are currently lowered into OpenCL so that
+    the function call::
+
+       a, b = segmented_sum(x, y, z, w)
+
+    becomes::
+
+       a = segmented_sum_mangled(x, y, z, w, &b).
+
+    For OpenCL, the scope of "b" is significant, and the preamble generation
+    currently assumes the scope is always private. This function forces that to
+    be the case by introducing temporary assignments into the kernel.
+    """
+
+    insn_id_gen = kernel.get_instruction_id_generator()
+    var_name_gen = kernel.get_var_name_generator()
+
+    new_or_updated_instructions = {}
+    new_temporaries = {}
+
+    dep_map = {
+            insn.id: insn.depends_on for insn in kernel.instructions}
+    # Maps each instruction id to the ids of the instructions depending on it.
+    inverse_dep_map = {insn.id: set() for insn in kernel.instructions}
+
+    for insn_id, deps in dep_map.items():
+        for dep in deps:
+            inverse_dep_map[dep].add(insn_id)
+
+    del dep_map
+
+    # {{{ utils
+
+    def _add_to_no_sync_with(insn_id, new_no_sync_with_params):
+        insn = kernel.id_to_insn.get(insn_id)
+        insn = new_or_updated_instructions.get(insn_id, insn)
+        new_or_updated_instructions[insn_id] = (
+                insn.copy(
+                    no_sync_with=(
+                        insn.no_sync_with | frozenset(new_no_sync_with_params))))
+
+    def _add_to_depends_on(insn_id, new_depends_on_params):
+        insn = kernel.id_to_insn.get(insn_id)
+        insn = new_or_updated_instructions.get(insn_id, insn)
+        new_or_updated_instructions[insn_id] = (
+                insn.copy(
+                    depends_on=insn.depends_on | frozenset(new_depends_on_params)))
+
+    # }}}
+
+    from loopy.kernel.instruction import CallInstruction, is_array_call
+    for insn in kernel.instructions:
+        if not isinstance(insn, CallInstruction):
+            continue
+
+        if len(insn.assignees) <= 1:
+            continue
+
+        if is_array_call(insn.assignees, insn.expression):
+            continue
+
+        assignees = insn.assignees
+        assignee_var_names = insn.assignee_var_names()
+
+        new_assignees = [assignees[0]]
+        newly_added_assignments_ids = set()
+        needs_replacement = False
+
+        last_added_insn_id = insn.id
+
+        FIRST_POINTER_ASSIGNEE_IDX = 1  # noqa
+
+        for assignee_nr, assignee_var_name, assignee in zip(
+                range(FIRST_POINTER_ASSIGNEE_IDX, len(assignees)),
+                assignee_var_names[FIRST_POINTER_ASSIGNEE_IDX:],
+                assignees[FIRST_POINTER_ASSIGNEE_IDX:]):
+
+            if (
+                    assignee_var_name in kernel.temporary_variables
+                    and
+                    (kernel.temporary_variables[assignee_var_name].address_space
+                         == AddressSpace.PRIVATE)):
+                new_assignees.append(assignee)
+                continue
+            # Any other assignee is routed through a new private temporary.
+            needs_replacement = True
+
+            # {{{ generate a new assignment instruction
+
+            new_assignee_name = var_name_gen(
+                    "{insn_id}_retval_{assignee_nr}"
+                    .format(insn_id=insn.id, assignee_nr=assignee_nr))
+
+            new_assignment_id = insn_id_gen(
+                    "{insn_id}_assign_retval_{assignee_nr}"
+                    .format(insn_id=insn.id, assignee_nr=assignee_nr))
+
+            newly_added_assignments_ids.add(new_assignment_id)
+
+            new_temporaries[new_assignee_name] = (
+                    TemporaryVariable(
+                        name=new_assignee_name,
+                        dtype=None,
+                        address_space=AddressSpace.PRIVATE))
+
+            from pymbolic import var
+            new_assignee = var(new_assignee_name)
+            new_assignees.append(new_assignee)
+
+            new_or_updated_instructions[new_assignment_id] = (
+                    make_assignment(
+                        assignees=(assignee,),
+                        expression=new_assignee,
+                        id=new_assignment_id,
+                        depends_on=frozenset([last_added_insn_id]),
+                        depends_on_is_final=True,
+                        no_sync_with=(
+                            insn.no_sync_with | frozenset([(insn.id, "any")])),
+                        predicates=insn.predicates,
+                        within_inames=insn.within_inames))
+
+            last_added_insn_id = new_assignment_id
+
+            # }}}
+
+        if not needs_replacement:
+            continue
+
+        # {{{ update originating instruction
+
+        orig_insn = new_or_updated_instructions.get(insn.id, insn)
+
+        new_or_updated_instructions[insn.id] = (
+                orig_insn.copy(assignees=tuple(new_assignees)))
+
+        _add_to_no_sync_with(insn.id,
+                [(new_id, "any") for new_id in newly_added_assignments_ids])
+
+        # }}}
+
+        # {{{ squash spurious memory dependencies amongst new assignments
+
+        for new_insn_id in newly_added_assignments_ids:
+            _add_to_no_sync_with(new_insn_id,
+                    [(new_id, "any")
+                     for new_id in newly_added_assignments_ids
+                     if new_id != new_insn_id])
+
+        # }}}
+
+        # {{{ update instructions that depend on the originating instruction
+
+        for inverse_dep in inverse_dep_map[insn.id]:
+            _add_to_depends_on(inverse_dep, newly_added_assignments_ids)
+
+            for insn_id, scope in (
+                    new_or_updated_instructions[inverse_dep].no_sync_with):
+                if insn_id == insn.id:
+                    _add_to_no_sync_with(inverse_dep, [
+                        (new_id, scope)
+                        for new_id in newly_added_assignments_ids])
+
+        # }}}
+
+    if not new_temporaries and not new_or_updated_instructions:
+        return kernel
+
+    new_temporary_variables = kernel.temporary_variables.copy()
+    new_temporary_variables.update(new_temporaries)
+
+    new_instructions = (
+            list(new_or_updated_instructions.values())
+            + [insn
+                for insn in kernel.instructions
+                if insn.id not in new_or_updated_instructions])
+
+    return kernel.copy(temporary_variables=new_temporary_variables,
+                       instructions=new_instructions)
+
+# }}}
+
+
+# {{{ RealizeReductionCallbackMapper
+
+class RealizeReductionCallbackMapper(ReductionCallbackMapper):
+    """Callback mapper that keeps the callables table returned by the callback
+    and extends *guarding_predicates* while descending into ``If`` branches.
+    """
+
+    def __init__(self, callback, callables_table):
+        super().__init__(callback)
+        self.callables_table = callables_table
+
+    def map_reduction(self, expr, **kwargs):
+        # The callback returns (new expression, updated callables table).
+        new_expr, self.callables_table = self.callback(
+                expr, rec=self.rec, **kwargs)
+        return new_expr
+
+    def map_if(self, expr, *,
+            callables_table, red_realize_ctx,
+            guarding_predicates, nresults):
+        from pymbolic.primitives import If, LogicalNot
+
+        def recurse(subexpr, predicates):
+            return self.rec(
+                    subexpr,
+                    guarding_predicates=predicates,
+                    callables_table=callables_table,
+                    red_realize_ctx=red_realize_ctx,
+                    nresults=nresults)
+
+        cond = recurse(expr.condition, guarding_predicates)
+        # Each branch is additionally guarded by the (negated) condition.
+        then = recurse(expr.then, guarding_predicates | frozenset([cond]))
+        else_ = recurse(
+                expr.else_, guarding_predicates | frozenset([LogicalNot(cond)]))
+
+        return If(cond, then, else_)
+
+# }}}
+
+
+# {{{ helpers
+
+def _strip_if_scalar(reference, val):
+    """Return ``val[0]`` if *reference* has exactly one entry, else *val*."""
+    if len(reference) != 1:
+        return val
+    return val[0]
+
+
+def _preprocess_scan_arguments(
+        red_realize_ctx,
+        expr, nresults, scan_iname, track_iname,
+        newly_generated_insn_id_set,
+        insn_id_gen):
+    """Substitute *track_iname* for *scan_iname* in the scan argument(s).
+
+    Returns a tuple of expressions suitable to be passed to the binary op.
+    If *expr* is a multi-valued non-tuple, an instruction evaluating it into
+    temporaries is generated and its id added to *newly_generated_insn_id_set*.
+    """
+    insn = red_realize_ctx.insn
+
+    # Rename scan_iname to track_iname within sub_expr.
+    def substitute(sub_expr):
+        return replace_var_within_expr(
+                red_realize_ctx.kernel, red_realize_ctx.var_name_gen,
+                sub_expr, scan_iname, track_iname)
+
+    if nresults <= 1:
+        return (substitute(expr),)
+
+    arg_exprs = expr
+
+    # In the case of a multi-argument scan, we need a name for each of
+    # the arguments in order to pass them to the binary op - so we expand
+    # items that are not "plain" tuples here.
+    if not isinstance(arg_exprs, tuple):
+        get_args_insn_id = insn_id_gen(
+                "{}_{}_get".format(insn.id, "_".join(expr.inames)))
+
+        arg_exprs = expand_inner_reduction(
+                red_realize_ctx=red_realize_ctx,
+                id=get_args_insn_id,
+                expr=arg_exprs,
+                nresults=nresults,
+                depends_on=insn.depends_on,
+                within_inames=insn.within_inames | expr.inames,
+                within_inames_is_final=insn.within_inames_is_final,
+                predicates=insn.predicates,
+                )
+
+        newly_generated_insn_id_set.add(get_args_insn_id)
+
+    return tuple(substitute(sub_expr) for sub_expr in arg_exprs)
+
+# }}}
+
+
+def expand_inner_reduction(
+        red_realize_ctx, id, expr, nresults, depends_on, within_inames,
+        within_inames_is_final, predicates):
+    """Emit an instruction *id* assigning multi-valued *expr* to new private
+    temporaries, and return those temporaries as a tuple of variables.
+    """
+    # FIXME: use _make_temporaries
+    from pymbolic import var
+    from pymbolic.primitives import Call
+    from loopy.symbolic import Reduction
+    assert isinstance(expr, (Call, Reduction))
+
+    names = [
+            red_realize_ctx.var_name_gen(id + "_arg" + str(i))
+            for i in range(nresults)]
+
+    for name in names:
+        tv = TemporaryVariable(
+                name=name, shape=(), dtype=None,
+                address_space=AddressSpace.PRIVATE)
+        red_realize_ctx.additional_temporary_variables[name] = tv
+
+    temp_vars = tuple(var(name) for name in names)
+    red_realize_ctx.additional_insns.append(
+            make_assignment(
+                id=id,
+                assignees=temp_vars,
+                expression=expr,
+                depends_on=depends_on,
+                within_inames=within_inames,
+                within_inames_is_final=within_inames_is_final,
+                predicates=predicates))
+
+    return temp_vars
+
+
+# {{{ reduction type: sequential
+
+def map_reduction_seq(
+        red_realize_ctx, expr, rec, callables_table, nresults, arg_dtypes,
+        reduction_dtypes, guarding_predicates):
+    """Realize *expr* with a private accumulator: an init and an update insn."""
+    orig_kernel = red_realize_ctx.orig_kernel
+    insn = red_realize_ctx.insn
+    outer_insn_inames = red_realize_ctx.insn.within_inames
+
+    acc_var_names = _make_temporaries(
+            red_realize_ctx=red_realize_ctx,
+            name_based_on="acc_"+"_".join(expr.inames),
+            nvars=nresults,
+            shape=(),
+            dtypes=reduction_dtypes,
+            address_space=AddressSpace.PRIVATE)
+
+    init_insn_depends_on = frozenset()
+
+    # check first that the original kernel had global barriers
+    # if not, we don't need to check. Since the function
+    # kernel_has_global_barriers is cached, we don't do
+    # extra work compared to not checking.
+    # FIXME: Explain why we care about global barriers here
+    if kernel_has_global_barriers(orig_kernel):
+        global_barrier = find_most_recent_global_barrier(
+                red_realize_ctx.kernel,
+                insn.id)
+
+        if global_barrier is not None:
+            init_insn_depends_on |= frozenset([global_barrier])
+
+    from pymbolic import var
+    acc_vars = tuple(var(n) for n in acc_var_names)
+
+    init_id = red_realize_ctx.insn_id_gen(
+            "{}_{}_init".format(insn.id, "_".join(expr.inames)))
+
+    expression, callables_table = expr.operation.neutral_element(
+            *arg_dtypes, callables_table=callables_table,
+            target=red_realize_ctx.orig_kernel.target)
+
+    init_insn = make_assignment(
+            id=init_id,
+            assignees=acc_vars,
+            within_inames=outer_insn_inames - frozenset(expr.inames),
+            within_inames_is_final=insn.within_inames_is_final,
+            depends_on=init_insn_depends_on,
+            expression=expression,
+
+            # Do not inherit predicates: Those might read variables
+            # that may not yet be set, and we don't have a great way
+            # of figuring out what the dependencies of the accumulator
+            # initializer should be.
+
+            # This way, we may initialize a few too many accumulators,
+            # but that's better than being incorrect.
+            # https://github.com/inducer/loopy/issues/231
+            )
+
+    red_realize_ctx.additional_insns.append(init_insn)
+
+    update_id = red_realize_ctx.insn_id_gen(
+            based_on="{}_{}_update".format(insn.id, "_".join(expr.inames)))
+
+    # The update runs within the instruction's inames plus the reduction
+    # inames, whether or not within_inames_is_final is set.
+    update_insn_iname_deps = insn.within_inames | set(expr.inames)
+
+    reduction_insn_depends_on = {init_id}
+
+    # In the case of a multi-argument reduction, we need a name for each of
+    # the arguments in order to pass them to the binary op - so we expand
+    # items that are not "plain" tuples here.
+    if nresults > 1 and not isinstance(expr.expr, tuple):
+        get_args_insn_id = red_realize_ctx.insn_id_gen(
+                "{}_{}_get".format(insn.id, "_".join(expr.inames)))
+
+        reduction_expr = expand_inner_reduction(
+                red_realize_ctx=red_realize_ctx,
+                id=get_args_insn_id,
+                expr=expr.expr,
+                nresults=nresults,
+                depends_on=insn.depends_on,
+                within_inames=update_insn_iname_deps,
+                within_inames_is_final=insn.within_inames_is_final,
+                predicates=guarding_predicates,
+                )
+
+        reduction_insn_depends_on.add(get_args_insn_id)
+    else:
+        reduction_expr = expr.expr
+
+    expression, callables_table = expr.operation(
+            arg_dtypes,
+            _strip_if_scalar(acc_vars, acc_vars),
+            reduction_expr,
+            callables_table,
+            orig_kernel.target)
+
+    reduction_insn = make_assignment(
+            id=update_id,
+            assignees=acc_vars,
+            expression=expression,
+            depends_on=frozenset(reduction_insn_depends_on) | insn.depends_on,
+            within_inames=update_insn_iname_deps,
+            within_inames_is_final=insn.within_inames_is_final,
+            predicates=guarding_predicates,)
+
+    red_realize_ctx.additional_insns.append(reduction_insn)
+
+    red_realize_ctx.new_insn_add_depends_on.add(reduction_insn.id)
+
+    if nresults == 1:
+        assert len(acc_vars) == 1
+        return acc_vars[0], callables_table
+    else:
+        return acc_vars, callables_table
+
+# }}}
+
+
+# {{{ reduction type: local-parallel
+
+def _get_int_iname_size(kernel, iname):
+    """Return the constant (int) static maximum of the size of *iname*."""
+    from loopy.isl_helpers import static_max_of_pw_aff
+    from loopy.symbolic import pw_aff_to_expr
+
+    iname_size = kernel.get_iname_bounds(iname).size
+    size = pw_aff_to_expr(static_max_of_pw_aff(iname_size, constants_only=True))
+    assert isinstance(size, int)
+    return size
+
+
+def _make_slab_set(iname, size):
+    """Return the basic set ``{[iname]: 0 <= iname < size}``."""
+    v = isl.make_zero_and_vars([iname])
+    lower = v[0].le_set(v[iname])
+    upper = v[iname].lt_set(v[0] + size)
+    slab, = (lower & upper).get_basic_sets()
+    return slab
+
+
+def _make_slab_set_from_range(iname, lbound, ubound):
+    """Return the basic set ``{[iname]: lbound <= iname < ubound}``."""
+    v = isl.make_zero_and_vars([iname])
+    lower = v[iname].ge_set(v[0] + lbound)
+    upper = v[iname].lt_set(v[0] + ubound)
+    slab, = (lower & upper).get_basic_sets()
+    return slab
+
+
+def map_reduction_local(
+        red_realize_ctx,
+        expr, rec, callables_table, nresults, arg_dtypes,
+        reduction_dtypes, guarding_predicates):
+    """Realize *expr* by staged pairwise combination in a LOCAL accumulator."""
+    orig_kernel = red_realize_ctx.orig_kernel
+    insn = red_realize_ctx.insn
+    red_iname, = expr.inames
+
+    size = _get_int_iname_size(orig_kernel, red_iname)
+
+    outer_insn_inames = insn.within_inames
+
+    from loopy.kernel.data import LocalInameTagBase
+    outer_local_inames = tuple(oiname for oiname in outer_insn_inames
+            if orig_kernel.iname_tags_of_type(oiname, LocalInameTagBase))
+
+    from pymbolic import var
+    outer_local_iname_vars = tuple(
+            var(oiname) for oiname in outer_local_inames)
+
+    outer_local_iname_sizes = tuple(
+            _get_int_iname_size(orig_kernel, oiname)
+            for oiname in outer_local_inames)
+
+    neutral_var_names = _make_temporaries(
+            red_realize_ctx=red_realize_ctx,
+            name_based_on="neutral_"+red_iname,
+            nvars=nresults,
+            shape=(),
+            dtypes=reduction_dtypes,
+            address_space=AddressSpace.PRIVATE)
+
+    acc_var_names = _make_temporaries(
+            red_realize_ctx=red_realize_ctx,
+            name_based_on="acc_"+red_iname,
+            nvars=nresults,
+            shape=outer_local_iname_sizes + (size,),
+            dtypes=reduction_dtypes,
+            address_space=AddressSpace.LOCAL)
+
+    acc_vars = tuple(var(n) for n in acc_var_names)
+
+    # {{{ add separate iname to carry out the reduction
+
+    # Doing this sheds any odd conditionals that may be active
+    # on our red_iname.
+
+    base_exec_iname = red_realize_ctx.var_name_gen("red_"+red_iname)
+    red_realize_ctx.domains.append(_make_slab_set(base_exec_iname, size))
+    red_realize_ctx.additional_iname_tags[base_exec_iname] \
+            = orig_kernel.iname_tags(red_iname)
+
+    # }}}
+
+    base_iname_deps = outer_insn_inames - frozenset(expr.inames)
+
+    neutral, callables_table = expr.operation.neutral_element(*arg_dtypes,
+            callables_table=callables_table, target=orig_kernel.target)
+    init_id = red_realize_ctx.insn_id_gen(f"{insn.id}_{red_iname}_init")
+    init_insn = make_assignment(
+            id=init_id,
+            assignees=tuple(
+                acc_var[outer_local_iname_vars + (var(base_exec_iname),)]
+                for acc_var in acc_vars),
+            expression=neutral,
+            within_inames=base_iname_deps | frozenset([base_exec_iname]),
+            within_inames_is_final=insn.within_inames_is_final,
+            depends_on=frozenset(),
+            # Do not inherit predicates: Those might read variables
+            # that may not yet be set, and we don't have a great way
+            # of figuring out what the dependencies of the accumulator
+            # initializer should be.
+
+            # This way, we may initialize a few too many accumulators,
+            # but that's better than being incorrect.
+            # https://github.com/inducer/loopy/issues/231
+            )
+    red_realize_ctx.additional_insns.append(init_insn)
+
+    init_neutral_id = red_realize_ctx.insn_id_gen(
+            f"{insn.id}_{red_iname}_init_neutral")
+    init_neutral_insn = make_assignment(
+            id=init_neutral_id,
+            assignees=tuple(var(nvn) for nvn in neutral_var_names),
+            expression=neutral,
+            within_inames=base_iname_deps | frozenset([base_exec_iname]),
+            within_inames_is_final=insn.within_inames_is_final,
+            depends_on=frozenset(),
+            predicates=guarding_predicates,
+            )
+    red_realize_ctx.additional_insns.append(init_neutral_insn)
+    # Ids of the instructions that the transfer instruction must follow.
+    transfer_depends_on = {init_neutral_id, init_id}
+
+    # In the case of a multi-argument reduction, we need a name for each of
+    # the arguments in order to pass them to the binary op - so we expand
+    # items that are not "plain" tuples here.
+    if nresults > 1 and not isinstance(expr.expr, tuple):
+        get_args_insn_id = red_realize_ctx.insn_id_gen(
+                f"{insn.id}_{red_iname}_get")
+
+        reduction_expr = expand_inner_reduction(
+                red_realize_ctx=red_realize_ctx,
+                id=get_args_insn_id,
+                expr=expr.expr,
+                nresults=nresults,
+                depends_on=insn.depends_on,
+                within_inames=(
+                    (outer_insn_inames - frozenset(expr.inames))
+                    | frozenset([red_iname])),
+                within_inames_is_final=insn.within_inames_is_final,
+                predicates=guarding_predicates,
+                )
+
+        transfer_depends_on.add(get_args_insn_id)
+    else:
+        reduction_expr = expr.expr
+
+    transfer_id = red_realize_ctx.insn_id_gen(f"{insn.id}_{red_iname}_transfer")
+    expression, callables_table = expr.operation(
+            arg_dtypes,
+            _strip_if_scalar(
+                neutral_var_names,
+                tuple(var(nvn) for nvn in neutral_var_names)),
+            reduction_expr,
+            callables_table,
+            orig_kernel.target)
+    transfer_insn = make_assignment(
+            id=transfer_id,
+            assignees=tuple(
+                acc_var[outer_local_iname_vars + (var(red_iname),)]
+                for acc_var in acc_vars),
+            expression=expression,
+            within_inames=(
+                (outer_insn_inames - frozenset(expr.inames))
+                | frozenset([red_iname])),
+            within_inames_is_final=insn.within_inames_is_final,
+            depends_on=frozenset(transfer_depends_on) | insn.depends_on,
+            no_sync_with=frozenset([(init_id, "any")]),
+            predicates=insn.predicates,
+            )
+    red_realize_ctx.additional_insns.append(transfer_insn)
+
+    cur_size = 1
+    while cur_size < size:
+        cur_size *= 2
+
+    prev_id = transfer_id
+    bound = size
+
+    stage_exec_iname = None
+
+    istage = 0
+    while cur_size > 1:
+
+        new_size = cur_size // 2
+        assert new_size * 2 == cur_size
+
+        stage_exec_iname = red_realize_ctx.var_name_gen(
+                "red_%s_s%d" % (red_iname, istage))
+        red_realize_ctx.domains.append(
+                _make_slab_set(stage_exec_iname, bound-new_size))
+        red_realize_ctx.additional_iname_tags[stage_exec_iname] \
+                = orig_kernel.iname_tags(red_iname)
+
+        stage_id = red_realize_ctx.insn_id_gen(
+                "red_%s_stage_%d" % (red_iname, istage))
+
+        expression, callables_table = expr.operation(
+                arg_dtypes,
+                _strip_if_scalar(acc_vars, tuple(
+                    acc_var[
+                        outer_local_iname_vars + (var(stage_exec_iname),)]
+                    for acc_var in acc_vars)),
+                _strip_if_scalar(acc_vars, tuple(
+                    acc_var[
+                        outer_local_iname_vars + (
+                            var(stage_exec_iname) + new_size,)]
+                    for acc_var in acc_vars)),
+                callables_table,
+                orig_kernel.target)
+
+        stage_insn = make_assignment(
+                id=stage_id,
+                assignees=tuple(
+                    acc_var[outer_local_iname_vars + (var(stage_exec_iname),)]
+                    for acc_var in acc_vars),
+                expression=expression,
+                within_inames=(
+                    base_iname_deps | frozenset([stage_exec_iname])),
+                within_inames_is_final=insn.within_inames_is_final,
+                depends_on=frozenset([prev_id]),
+                predicates=insn.predicates,
+                )
+
+        red_realize_ctx.additional_insns.append(stage_insn)
+        prev_id = stage_id
+
+        cur_size = new_size
+        bound = cur_size
+        istage += 1
+
+    red_realize_ctx.new_insn_add_depends_on.add(prev_id)
+    red_realize_ctx.new_insn_add_no_sync_with.add((prev_id, "any"))
+    red_realize_ctx.new_insn_add_within_inames.add(
+            stage_exec_iname or base_exec_iname)
+
+    if nresults == 1:
+        assert len(acc_vars) == 1
+        return acc_vars[0][outer_local_iname_vars + (0,)], callables_table
+    else:
+        return [acc_var[outer_local_iname_vars + (0,)] for acc_var in
+                acc_vars], callables_table
+# }}}
+
+
+# {{{ utils (stateful)
+
+@memoize_on_first_arg
+def _get_or_add_sweep_tracking_iname_and_domain(
+        red_realize_ctx,
+        scan_iname, sweep_iname, sweep_min_value, scan_min_value, stride,
+        tracking_iname):
+    """Add the sweep-tracking domain for *tracking_iname*; return that iname."""
+    kernel = red_realize_ctx.kernel
+    domain = kernel.get_inames_domain(frozenset((scan_iname, sweep_iname)))
+
+    red_realize_ctx.inames_added_for_scan.add(tracking_iname)
+
+    new_domain = _create_domain_for_sweep_tracking(domain,
+            tracking_iname, sweep_iname, sweep_min_value, scan_min_value, stride)
+
+    _insert_subdomain_into_domain_tree(kernel, red_realize_ctx.domains, new_domain)
+
+    return tracking_iname
+
+
+def replace_var_within_expr(kernel, var_name_gen, expr, from_var, to_var):
+    from pymbolic.mapper.substitutor import make_subst_func
+
+    from loopy.symbolic import (
+        SubstitutionRuleMappingContext, RuleAwareSubstitutionMapper)
+
+    rule_mapping_context = SubstitutionRuleMappingContext(
+        kernel.substitutions, var_name_gen)
+
+    from pymbolic import var
+    mapper = RuleAwareSubstitutionMapper(
+        rule_mapping_context,
+        make_subst_func({from_var: var(to_var)}),
+        within=lambda *args: True)
+
+    return mapper(expr, kernel, None)
+
+
+def _make_temporaries(
+        red_realize_ctx, name_based_on, nvars, shape, dtypes, address_space):
+    var_names = [
+            red_realize_ctx.var_name_gen(name_based_on.format(index=i))
+            for i in range(nvars)]
+
+    from loopy.kernel.data import TemporaryVariable
+
+    for name, dtype in zip(var_names, dtypes):
+        red_realize_ctx.additional_temporary_variables[name] = TemporaryVariable(
+                name=name,
+                shape=shape,
+                dtype=dtype,
+                address_space=address_space)
+
+    return var_names
+
+# }}}
+
+
+# {{{ reduction type: sequential scan
+
+def map_scan_seq(
+        red_realize_ctx,
+        expr, rec, callables_table, nresults, arg_dtypes,
+        reduction_dtypes, sweep_iname, scan_iname, sweep_min_value,
+        scan_min_value, stride, guarding_predicates):
+    insn = red_realize_ctx.insn
+
+    outer_insn_inames = insn.within_inames
+
+    track_iname = red_realize_ctx.var_name_gen(
+            "{sweep_iname}__seq_scan"
+            .format(sweep_iname=sweep_iname))
+
+    _get_or_add_sweep_tracking_iname_and_domain(
+            red_realize_ctx,
+            scan_iname, sweep_iname, sweep_min_value, scan_min_value,
+            stride, track_iname)
+
+    from loopy.kernel.data import AddressSpace
+    acc_var_names = _make_temporaries(
+            red_realize_ctx=red_realize_ctx,
+            name_based_on="acc_" + scan_iname,
+            nvars=nresults,
+            shape=(),
+            dtypes=reduction_dtypes,
+            address_space=AddressSpace.PRIVATE)
+
+    from pymbolic import var
+    acc_vars = tuple(var(n) for n in acc_var_names)
+
+    init_id = red_realize_ctx.insn_id_gen(
+            "{}_{}_init".format(insn.id, "_".join(expr.inames)))
+
+    init_insn_depends_on = frozenset()
+
+    # FIXME: Explain why we care about global barriers here
+    if kernel_has_global_barriers(red_realize_ctx.orig_kernel):
+        global_barrier = find_most_recent_global_barrier(
+                red_realize_ctx.kernel, insn.id)
+
+        if global_barrier is not None:
+            init_insn_depends_on |= frozenset([global_barrier])
+
+    expression, callables_table = expr.operation.neutral_element(
+            *arg_dtypes, callables_table=callables_table,
+            target=red_realize_ctx.orig_kernel.target)
+
+    init_insn = make_assignment(
+            id=init_id,
+            assignees=acc_vars,
+            within_inames=outer_insn_inames - frozenset(
+                (sweep_iname,) + expr.inames),
+            within_inames_is_final=insn.within_inames_is_final,
+            depends_on=init_insn_depends_on,
+            expression=expression,
+            # Do not inherit predicates: Those might read variables
+            # that may not yet be set, and we don't have a great way
+            # of figuring out what the dependencies of the accumulator
+            # initializer should be.
+
+            # This way, we may initialize a few too many accumulators,
+            # but that's better than being incorrect.
+            # https://github.com/inducer/loopy/issues/231
+            )
+
+    red_realize_ctx.additional_insns.append(init_insn)
+
+    update_insn_depends_on = {init_insn.id} | insn.depends_on
+
+    updated_inner_exprs = _preprocess_scan_arguments(
+            red_realize_ctx,
+            expr.expr, nresults,
+            scan_iname, track_iname, update_insn_depends_on,
+            insn_id_gen=red_realize_ctx.insn_id_gen)
+
+    update_id = red_realize_ctx.insn_id_gen(
+            based_on="{}_{}_update".format(insn.id, "_".join(expr.inames)))
+
+    update_insn_iname_deps = insn.within_inames | {track_iname}
+    if insn.within_inames_is_final:
+        update_insn_iname_deps = insn.within_inames | {track_iname}
+
+    expression, callables_table = expr.operation(
+            arg_dtypes,
+            _strip_if_scalar(acc_vars, acc_vars),
+            _strip_if_scalar(acc_vars, updated_inner_exprs),
+            callables_table,
+            red_realize_ctx.orig_kernel.target)
+
+    scan_insn = make_assignment(
+            id=update_id,
+            assignees=acc_vars,
+            expression=expression,
+            depends_on=frozenset(update_insn_depends_on),
+            within_inames=update_insn_iname_deps,
+            no_sync_with=insn.no_sync_with,
+            within_inames_is_final=insn.within_inames_is_final,
+            predicates=guarding_predicates,
+            )
+
+    red_realize_ctx.additional_insns.append(scan_insn)
+    red_realize_ctx.new_insn_add_depends_on.add(scan_insn.id)
+
+    if nresults == 1:
+        assert len(acc_vars) == 1
+        return acc_vars[0], callables_table
+    else:
+        return acc_vars, callables_table
+
+# }}}
+
+
+# {{{ reduction type: local-parallel scan
+
+def map_scan_local(
+        red_realize_ctx,
+        expr, rec, callables_table, nresults, arg_dtypes,
+        reduction_dtypes, sweep_iname, scan_iname, sweep_min_value,
+        scan_min_value, stride, guarding_predicates):
+
+    orig_kernel = red_realize_ctx.orig_kernel
+    insn = red_realize_ctx.insn
+
+    scan_size = _get_int_iname_size(orig_kernel, sweep_iname)
+
+    assert scan_size > 0
+
+    if scan_size == 1:
+        return map_reduction_seq(red_realize_ctx,
+                expr, rec, callables_table,
+                nresults, arg_dtypes, reduction_dtypes,
+                guarding_predicates)
+
+    outer_insn_inames = insn.within_inames
+
+    from loopy.kernel.data import LocalInameTagBase
+    outer_local_inames = tuple(oiname for oiname in outer_insn_inames
+            if orig_kernel.iname_tags_of_type(oiname, LocalInameTagBase)
+            and oiname != sweep_iname)
+
+    from pymbolic import var
+    outer_local_iname_vars = tuple(
+            var(oiname) for oiname in outer_local_inames)
+
+    outer_local_iname_sizes = tuple(
+            _get_int_iname_size(orig_kernel, oiname)
+            for oiname in outer_local_inames)
+
+    track_iname = red_realize_ctx.var_name_gen(
+            "{sweep_iname}__pre_scan"
+            .format(sweep_iname=sweep_iname))
+
+    _get_or_add_sweep_tracking_iname_and_domain(
+            red_realize_ctx,
+            scan_iname, sweep_iname, sweep_min_value, scan_min_value, stride,
+            track_iname)
+
+    # {{{ add separate iname to carry out the scan
+
+    # Doing this sheds any odd conditionals that may be active
+    # on our scan_iname.
+
+    base_exec_iname = red_realize_ctx.var_name_gen(sweep_iname + "__scan")
+    red_realize_ctx.domains.append(_make_slab_set(base_exec_iname, scan_size))
+    red_realize_ctx.additional_iname_tags[base_exec_iname] \
+            = orig_kernel.iname_tags(sweep_iname)
+
+    # }}}
+
+    read_var_names = _make_temporaries(
+            red_realize_ctx=red_realize_ctx,
+            name_based_on="read_"+scan_iname+"_arg_{index}",
+            nvars=nresults,
+            shape=(),
+            dtypes=reduction_dtypes,
+            address_space=AddressSpace.PRIVATE)
+
+    acc_var_names = _make_temporaries(
+            red_realize_ctx=red_realize_ctx,
+            name_based_on="acc_"+scan_iname,
+            nvars=nresults,
+            shape=outer_local_iname_sizes + (scan_size,),
+            dtypes=reduction_dtypes,
+            address_space=AddressSpace.LOCAL)
+
+    acc_vars = tuple(var(n) for n in acc_var_names)
+    read_vars = tuple(var(n) for n in read_var_names)
+
+    base_iname_deps = (outer_insn_inames
+            - frozenset(expr.inames) - frozenset([sweep_iname]))
+
+    neutral, callables_table = expr.operation.neutral_element(
+            *arg_dtypes, callables_table=callables_table,
+            target=orig_kernel.target)
+
+    init_insn_depends_on = insn.depends_on
+
+    # FIXME: Explain why we care about global barriers here
+    if kernel_has_global_barriers(orig_kernel):
+        global_barrier = find_most_recent_global_barrier(
+                red_realize_ctx.kernel, insn.id)
+
+        if global_barrier is not None:
+            init_insn_depends_on |= frozenset([global_barrier])
+
+    init_id = red_realize_ctx.insn_id_gen(f"{insn.id}_{scan_iname}_init")
+    init_insn = make_assignment(
+            id=init_id,
+            assignees=tuple(
+                acc_var[outer_local_iname_vars + (var(base_exec_iname),)]
+                for acc_var in acc_vars),
+            expression=neutral,
+            within_inames=base_iname_deps | frozenset([base_exec_iname]),
+            within_inames_is_final=insn.within_inames_is_final,
+            depends_on=init_insn_depends_on,
+            # Do not inherit predicates: Those might read variables
+            # that may not yet be set, and we don't have a great way
+            # of figuring out what the dependencies of the accumulator
+            # initializer should be.
+
+            # This way, we may initialize a few too many accumulators,
+            # but that's better than being incorrect.
+            # https://github.com/inducer/loopy/issues/231
+            )
+    red_realize_ctx.additional_insns.append(init_insn)
+
+    transfer_insn_depends_on = {init_insn.id} | insn.depends_on
+
+    updated_inner_exprs = _preprocess_scan_arguments(
+            red_realize_ctx,
+            expr.expr, nresults,
+            scan_iname, track_iname, transfer_insn_depends_on,
+            insn_id_gen=red_realize_ctx.insn_id_gen)
+
+    from loopy.symbolic import Reduction
+
+    from loopy.symbolic import pw_aff_to_expr
+    sweep_min_value_expr = pw_aff_to_expr(sweep_min_value)
+
+    transfer_id = red_realize_ctx.insn_id_gen(f"{insn.id}_{scan_iname}_transfer")
+    transfer_insn = make_assignment(
+            id=transfer_id,
+            assignees=tuple(
+                acc_var[outer_local_iname_vars
+                        + (var(sweep_iname) - sweep_min_value_expr,)]
+                for acc_var in acc_vars),
+            expression=Reduction(
+                operation=expr.operation,
+                inames=(track_iname,),
+                expr=_strip_if_scalar(acc_vars, updated_inner_exprs),
+                allow_simultaneous=False,
+                ),
+            within_inames=outer_insn_inames - frozenset(expr.inames),
+            within_inames_is_final=insn.within_inames_is_final,
+            depends_on=frozenset(transfer_insn_depends_on),
+            no_sync_with=frozenset([(init_id, "any")]) | insn.no_sync_with,
+            predicates=insn.predicates,
+            )
+
+    red_realize_ctx.additional_insns.append(transfer_insn)
+
+    prev_id = transfer_id
+
+    istage = 0
+    cur_size = 1
+
+    while cur_size < scan_size:
+        stage_exec_iname = red_realize_ctx.var_name_gen(
+                "%s__scan_s%d" % (sweep_iname, istage))
+        red_realize_ctx.domains.append(
+                _make_slab_set_from_range(stage_exec_iname, cur_size, scan_size))
+        red_realize_ctx.additional_iname_tags[stage_exec_iname] \
+                = orig_kernel.iname_tags(sweep_iname)
+
+        for read_var, acc_var in zip(read_vars, acc_vars):
+            read_stage_id = red_realize_ctx.insn_id_gen(
+                    "scan_%s_read_stage_%d" % (scan_iname, istage))
+
+            read_stage_insn = make_assignment(
+                    id=read_stage_id,
+                    assignees=(read_var,),
+                    expression=(
+                            acc_var[
+                                outer_local_iname_vars
+                                + (var(stage_exec_iname) - cur_size,)]),
+                    within_inames=(
+                        base_iname_deps | frozenset([stage_exec_iname])),
+                    within_inames_is_final=insn.within_inames_is_final,
+                    depends_on=frozenset([prev_id]),
+                    predicates=insn.predicates,
+                    )
+
+            if cur_size == 1:
+                # Performance hack: don't add a barrier here with transfer_insn.
+                # NOTE: This won't work if the way that local inames
+                # are lowered changes.
+                read_stage_insn = read_stage_insn.copy(
+                        no_sync_with=(
+                            read_stage_insn.no_sync_with
+                            | frozenset([(transfer_id, "any")])))
+
+            red_realize_ctx.additional_insns.append(read_stage_insn)
+            prev_id = read_stage_id
+
+        write_stage_id = red_realize_ctx.insn_id_gen(
+                "scan_%s_write_stage_%d" % (scan_iname, istage))
+
+        expression, callables_table = expr.operation(
+            arg_dtypes,
+            _strip_if_scalar(acc_vars, read_vars),
+            _strip_if_scalar(acc_vars, tuple(
+                acc_var[
+                    outer_local_iname_vars + (var(stage_exec_iname),)]
+                for acc_var in acc_vars)),
+            callables_table,
+            orig_kernel.target)
+
+        write_stage_insn = make_assignment(
+                id=write_stage_id,
+                assignees=tuple(
+                    acc_var[outer_local_iname_vars + (var(stage_exec_iname),)]
+                    for acc_var in acc_vars),
+                expression=expression,
+                within_inames=(
+                    base_iname_deps | frozenset([stage_exec_iname])),
+                within_inames_is_final=insn.within_inames_is_final,
+                depends_on=frozenset([prev_id]),
+                predicates=insn.predicates,
+                )
+
+        red_realize_ctx.additional_insns.append(write_stage_insn)
+        prev_id = write_stage_id
+
+        cur_size *= 2
+        istage += 1
+
+    red_realize_ctx.new_insn_add_depends_on.add(prev_id)
+    red_realize_ctx.new_insn_add_within_inames.add(sweep_iname)
+
+    output_idx = var(sweep_iname) - sweep_min_value_expr
+
+    if nresults == 1:
+        assert len(acc_vars) == 1
+        return (acc_vars[0][outer_local_iname_vars + (output_idx,)],
+                callables_table)
+    else:
+        return [acc_var[outer_local_iname_vars + (output_idx,)]
+                for acc_var in acc_vars], callables_table
+
+# }}}
+
+
+# {{{ top-level dispatch among reduction types
+
+def map_reduction(
+        expr, *, rec,
+        callables_table, red_realize_ctx,
+        guarding_predicates, nresults):
+    insn = red_realize_ctx.insn
+
+    # Only expand one level of reduction at a time, going from outermost to
+    # innermost. Otherwise we get the (iname + insn) dependencies wrong.
+
+    from loopy.type_inference import (
+            infer_arg_and_reduction_dtypes_for_reduction_expression)
+    arg_dtypes, reduction_dtypes = (
+            infer_arg_and_reduction_dtypes_for_reduction_expression(
+                red_realize_ctx.kernel, expr, callables_table,
+                red_realize_ctx.unknown_types_ok))
+
+    outer_insn_inames = insn.within_inames
+    bad_inames = frozenset(expr.inames) & outer_insn_inames
+    if bad_inames:
+        raise LoopyError("reduction used within loop(s) that it was "
+                "supposed to reduce over: " + ", ".join(bad_inames))
+
+    iname_classes = _classify_reduction_inames(red_realize_ctx.kernel, expr.inames)
+
+    n_sequential = len(iname_classes.sequential)
+    n_local_par = len(iname_classes.local_parallel)
+    n_nonlocal_par = len(iname_classes.nonlocal_parallel)
+
+    really_force_scan = red_realize_ctx.force_scan and (
+            len(expr.inames) != 1
+            or expr.inames[0] not in red_realize_ctx.inames_added_for_scan)
+
+    def _error_if_force_scan_on(cls, msg):
+        if really_force_scan:
+            raise cls(msg)
+
+    may_be_implemented_as_scan = False
+    if red_realize_ctx.force_scan or red_realize_ctx.automagic_scans_ok:
+        try:
+            # Try to determine scan candidate information (sweep iname, scan
+            # iname, etc).
+            scan_param = _try_infer_scan_candidate_from_expr(
+                    red_realize_ctx.kernel, expr, outer_insn_inames,
+                    sweep_iname=red_realize_ctx.force_outer_iname_for_scan)
+
+        except ValueError as v:
+            error = str(v)
+
+        else:
+            # Ensures the reduction is triangular (somewhat expensive).
+            may_be_implemented_as_scan, error = _check_reduction_is_triangular(
+                        red_realize_ctx.kernel, expr, scan_param)
+
+        if not may_be_implemented_as_scan:
+            _error_if_force_scan_on(ReductionIsNotTriangularError, error)
+
+    # {{{ sanity checks
+
+    if n_local_par and n_sequential:
+        raise LoopyError("Reduction over '%s' contains both parallel and "
+                "sequential inames. It must be split "
+                "(using split_reduction_{in,out}ward) "
+                "before code generation."
+                % ", ".join(expr.inames))
+
+    if n_local_par > 1:
+        raise LoopyError("Reduction over '%s' contains more than "
+                "one parallel iname. It must be split "
+                "(using split_reduction_{in,out}ward) "
+                "before code generation."
+                % ", ".join(expr.inames))
+
+    if n_nonlocal_par:
+        bad_inames = iname_classes.nonlocal_parallel
+        raise LoopyError("the only form of parallelism supported "
+                "by reductions is 'local'--found iname(s) '%s' "
+                "respectively tagged '%s'"
+                % (", ".join(bad_inames),
+                   ", ".join(str(red_realize_ctx.orig_kernel.iname_tags(iname))
+                             for iname in bad_inames)))
+
+    # }}}
+
+    red_realize_ctx.changes_made()
+
+    if n_local_par == 0 and n_sequential == 0:
+        warn_with_kernel(red_realize_ctx.kernel, "empty_reduction",
+                "Empty reduction found (no inames to reduce over). "
+                "Eliminating.")
+
+        # We're not supposed to reduce/sum at all. (Note how this is distinct
+        # from an empty reduction--there is an element here, just no inames
+        # to reduce over. It's rather similar to an array with () shape in
+        # numpy.)
+
+        return expr.expr, callables_table
+
+    if may_be_implemented_as_scan:
+        assert red_realize_ctx.force_scan or red_realize_ctx.automagic_scans_ok
+
+        # We require the "scan" iname to be tagged sequential.
+        if n_sequential:
+            sweep_iname = scan_param.sweep_iname
+            sweep_class = _classify_reduction_inames(
+                    red_realize_ctx.orig_kernel, (sweep_iname,))
+
+            sequential = sweep_iname in sweep_class.sequential
+            parallel = sweep_iname in sweep_class.local_parallel
+            bad_parallel = sweep_iname in sweep_class.nonlocal_parallel
+
+            if sweep_iname not in outer_insn_inames:
+                _error_if_force_scan_on(LoopyError,
+                        "Sweep iname '%s' was detected, but is not an iname "
+                        "for the instruction." % sweep_iname)
+            elif bad_parallel:
+                _error_if_force_scan_on(LoopyError,
+                        "Sweep iname '%s' has an unsupported parallel tag '%s' "
+                        "- the only parallelism allowed is 'local'." %
+                        (sweep_iname,
+                         ", ".join(tag.key
+                        for tag in red_realize_ctx.kernel.iname_tags(sweep_iname))))
+            elif parallel:
+                return map_scan_local(
+                        red_realize_ctx,
+                        expr, rec, callables_table, nresults,
+                        arg_dtypes, reduction_dtypes,
+                        sweep_iname, scan_param.scan_iname,
+                        scan_param.sweep_lower_bound,
+                        scan_param.scan_lower_bound,
+                        scan_param.stride,
+                        guarding_predicates)
+            elif sequential:
+                return map_scan_seq(
+                        red_realize_ctx,
+                        expr, rec, callables_table, nresults,
+                        arg_dtypes, reduction_dtypes, sweep_iname,
+                        scan_param.scan_iname,
+                        scan_param.sweep_lower_bound,
+                        scan_param.scan_lower_bound,
+                        scan_param.stride,
+                        guarding_predicates)
+
+            # fallthrough to reduction implementation
+
+        else:
+            assert n_local_par > 0
+            scan_iname, = expr.inames
+            _error_if_force_scan_on(LoopyError,
+                    "Scan iname '%s' is parallel tagged: this is not allowed "
+                    "(only the sweep iname should be tagged if parallelism "
+                    "is desired)." % scan_iname)
+
+            # fallthrough to reduction implementation
+
+    if n_sequential:
+        assert n_local_par == 0
+        return map_reduction_seq(
+                red_realize_ctx,
+                expr, rec, callables_table,
+                nresults, arg_dtypes, reduction_dtypes,
+                guarding_predicates)
+    else:
+        assert n_local_par > 0
+        return map_reduction_local(
+                red_realize_ctx,
+                expr, rec, callables_table, nresults, arg_dtypes,
+                reduction_dtypes, guarding_predicates)
+
+# }}}
+
+
+# {{{ realize_reduction_for_single_kernel
+
+# @remove_any_newly_unused_inames
+def realize_reduction_for_single_kernel(kernel, callables_table,
+        insn_id_filter=None, unknown_types_ok=True, automagic_scans_ok=False,
+        force_scan=False, force_outer_iname_for_scan=None):
+    logger.debug("%s: realize reduction" % kernel.name)
+
+    orig_kernel = kernel
+
+    finished_insns = []
+
+    insn_id_gen = kernel.get_instruction_id_generator()
+    var_name_gen = kernel.get_var_name_generator()
+
+    cb_mapper = RealizeReductionCallbackMapper(map_reduction, callables_table)
+
+    insn_queue = kernel.instructions[:]
+    domains = kernel.domains[:]
+
+    inames_added_for_scan = set()
+
+    kernel_changed = False
+
+    while insn_queue:
+        insn = insn_queue.pop(0)
+
+        red_realize_ctx = _ReductionRealizationContext(
+                force_scan=force_scan,
+                automagic_scans_ok=automagic_scans_ok,
+                unknown_types_ok=unknown_types_ok,
+                force_outer_iname_for_scan=force_outer_iname_for_scan,
+
+                orig_kernel=orig_kernel,
+                kernel=kernel,
+                insn=insn,
+
+                insn_id_gen=insn_id_gen,
+                var_name_gen=var_name_gen,
+
+                additional_temporary_variables={},
+                additional_insns=[],
+                domains=domains,
+                additional_iname_tags={},
+
+                inames_added_for_scan=inames_added_for_scan,
+
+                new_insn_add_depends_on=set(),
+                new_insn_add_no_sync_with=set(),
+                new_insn_add_within_inames=set(),
+
+                were_changes_made=False,
+                )
+
+        if insn_id_filter is not None and insn.id != insn_id_filter \
+                or not isinstance(insn, MultiAssignmentBase):
+            finished_insns.append(insn)
+            continue
+
+        nresults = len(insn.assignees)
+
+        # Run reduction expansion.
+        from loopy.symbolic import Reduction
+        if isinstance(insn.expression, Reduction) and nresults > 1:
+            new_expressions = cb_mapper(insn.expression,
+                    callables_table=cb_mapper.callables_table,
+                    red_realize_ctx=red_realize_ctx,
+                    guarding_predicates=insn.predicates,
+                    nresults=nresults)
+        else:
+            new_expressions = cb_mapper(insn.expression,
+                    callables_table=cb_mapper.callables_table,
+                    red_realize_ctx=red_realize_ctx,
+                    guarding_predicates=insn.predicates,
+                    nresults=1),
+
+        if red_realize_ctx.were_changes_made:
+            # An expansion happened, so insert the generated stuff plus
+            # ourselves back into the queue.
+
+            # {{{ apply changes
+
+            kernel_changed = True
+
+            insn_id_replacements = {}
+
+            result_assignment_dep_on = (
+                    insn.depends_on
+                    | frozenset(red_realize_ctx.new_insn_add_depends_on))
+            kwargs = insn.get_copy_kwargs(
+                    no_sync_with=insn.no_sync_with
+                    | frozenset(red_realize_ctx.new_insn_add_no_sync_with),
+                    within_inames=(
+                        insn.within_inames
+                        | red_realize_ctx.new_insn_add_within_inames))
+
+            kwargs.pop("id")
+            kwargs.pop("depends_on")
+            kwargs.pop("expression")
+            kwargs.pop("assignee", None)
+            kwargs.pop("assignees", None)
+            kwargs.pop("temp_var_type", None)
+            kwargs.pop("temp_var_types", None)
+
+            if isinstance(insn.expression, Reduction) and nresults > 1:
+                result_assignment_ids = [
+                        insn_id_gen(insn.id) for i in range(nresults)]
+                replacement_insns = [
+                        Assignment(
+                            id=result_assignment_ids[i],
+                            depends_on=(
+                                result_assignment_dep_on
+                                | (frozenset([result_assignment_ids[i-1]])
+                                    if i else frozenset())),
+                            assignee=assignee,
+                            expression=new_expr,
+                            **kwargs)
+                        for i, (assignee, new_expr) in enumerate(zip(
+                            insn.assignees, new_expressions))]
+
+                insn_id_replacements[insn.id] = [
+                    rinsn.id for rinsn in replacement_insns]
+            else:
+                new_expr, = new_expressions
+                # since we are replacing the instruction with
+                # only one instruction, there's no need to replace id
+                replacement_insns = [
+                        make_assignment(
+                            id=insn.id,
+                            depends_on=result_assignment_dep_on,
+                            assignees=insn.assignees,
+                            expression=new_expr,
+                            **kwargs)
+                        ]
+
+            insn_queue = (
+                    red_realize_ctx.additional_insns
+                    + replacement_insns
+                    + insn_queue)
+
+            # The reduction expander needs an up-to-date kernel
+            # object to find dependencies. Keep kernel up-to-date.
+            new_temporary_variables = kernel.temporary_variables.copy()
+            new_temporary_variables.update(
+                    red_realize_ctx.additional_temporary_variables)
+
+            finished_insns = [
+                    replace_instruction_ids_in_insn(insn, insn_id_replacements)
+                    for insn in finished_insns]
+            insn_queue = [
+                    replace_instruction_ids_in_insn(insn, insn_id_replacements)
+                    for insn in insn_queue]
+
+            kernel = kernel.copy(
+                    instructions=finished_insns + insn_queue,
+                    temporary_variables=new_temporary_variables,
+                    domains=domains)
+            from loopy.transform.iname import tag_inames
+            kernel = tag_inames(kernel, red_realize_ctx.additional_iname_tags)
+
+            del insn_id_replacements
+
+            # }}}
+
+        else:
+            # nothing happened, we're done with insn
+            assert not red_realize_ctx.new_insn_add_depends_on
+
+            finished_insns.append(insn)
+
+    if kernel_changed:
+        kernel = kernel.copy(instructions=finished_insns)
+    else:
+        return orig_kernel, callables_table
+
+    kernel = _hackily_ensure_multi_assignment_return_values_are_scoped_private(
+                kernel)
+
+    return kernel, cb_mapper.callables_table
+
+# }}}
+
+
+def realize_reduction(t_unit, *args, **kwargs):
+    """Rewrites reductions into their imperative form. With *insn_id_filter*
+    specified, operate only on the instruction with an instruction id matching
+    *insn_id_filter*.
+
+    If *insn_id_filter* is given, only the outermost level of reductions will be
+    expanded, inner reductions will be left alone (because they end up in a new
+    instruction with a different ID, which doesn't match the filter).
+
+    If *insn_id_filter* is not given, all reductions in all instructions will
+    be realized.
+
+    If *automagic_scans_ok*, this function will attempt to rewrite triangular
+    reductions as scans automatically.
+
+    If *force_scan* is *True*, this function will attempt to rewrite *all*
+    candidate reductions as scans and raise an error if this is not possible
+    (this is most useful combined with *insn_id_filter*).
+
+    If *force_outer_iname_for_scan* is not *None*, this function will attempt
+    to realize candidate reductions as scans using the specified iname as the
+    outer (sweep) iname.
+    """
+
+    assert isinstance(t_unit, TranslationUnit)
+
+    callables_table = dict(t_unit.callables_table)
+    kernels_to_scan = [in_knl_callable.subkernel
+            for in_knl_callable in t_unit.callables_table.values()
+            if isinstance(in_knl_callable, CallableKernel)]
+
+    for knl in kernels_to_scan:
+        new_knl, callables_table = realize_reduction_for_single_kernel(
+                knl, callables_table, *args, **kwargs)
+        in_knl_callable = callables_table[knl.name].copy(
+                subkernel=new_knl)
+        callables_table[knl.name] = in_knl_callable
+
+    return t_unit.copy(callables_table=callables_table)
+
+# vim: foldmethod=marker

From f51acad8aba6ffd04454626c15d76081b0a8ff20 Mon Sep 17 00:00:00 2001
From: Andreas Kloeckner <inform@tiker.net>
Date: Tue, 8 Feb 2022 18:32:06 -0600
Subject: [PATCH 16/27] Fix stringification of arg{min,max}

---
 loopy/library/reduction.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/loopy/library/reduction.py b/loopy/library/reduction.py
index 091e4a2c1..9a9b1c6e9 100644
--- a/loopy/library/reduction.py
+++ b/loopy/library/reduction.py
@@ -444,7 +444,7 @@ def neutral_element(self, scalar_dtype, index_dtype, callables_table,
                 index_dtype.numpy_dtype.type(-1)), callables_table
 
     def __str__(self):
-        return self.which
+        return "arg" + self.which
 
     def __hash__(self):
         return hash(type(self))

From 39d7fd1187faab89f2ec0c45f585fcf6ef0f2915 Mon Sep 17 00:00:00 2001
From: Andreas Kloeckner <inform@tiker.net>
Date: Tue, 8 Feb 2022 18:32:26 -0600
Subject: [PATCH 17/27] Fix an error message in make_assignment

---
 loopy/kernel/instruction.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/loopy/kernel/instruction.py b/loopy/kernel/instruction.py
index 0721eccf0..09a0711a3 100644
--- a/loopy/kernel/instruction.py
+++ b/loopy/kernel/instruction.py
@@ -1248,7 +1248,8 @@ def make_assignment(assignees, expression, temp_var_types=None, **kwargs):
         from loopy.symbolic import Reduction
         if not isinstance(expression, (Call, Reduction)):
             raise LoopyError("right-hand side in multiple assignment must be "
-                    "function call or reduction, got: '%s'" % expression)
+                    "function call or reduction, got: "
+                    f"'{type(expression).__name__}'")
 
         if not is_array_call(assignees, expression):
             return CallInstruction(

From 059fef724ffefbed3bbff7976e68a0016b8dd1b8 Mon Sep 17 00:00:00 2001
From: Andreas Kloeckner <inform@tiker.net>
Date: Fri, 4 Feb 2022 01:08:03 -0600
Subject: [PATCH 18/27] Make realize_reduction actually recursive (closes
 gh-533)

---
 loopy/transform/realize_reduction.py | 849 +++++++++++++++------------
 test/test_scan.py                    |   5 -
 2 files changed, 476 insertions(+), 378 deletions(-)

diff --git a/loopy/transform/realize_reduction.py b/loopy/transform/realize_reduction.py
index c02c05fdf..67aa627f8 100644
--- a/loopy/transform/realize_reduction.py
+++ b/loopy/transform/realize_reduction.py
@@ -24,8 +24,9 @@
 """
 
 
-from dataclasses import dataclass
-from typing import Tuple, Dict, Callable, List, Optional, Set, Sequence
+from dataclasses import dataclass, replace
+from typing import (Tuple, Dict, Callable, List, Optional, Set, Sequence,
+        FrozenSet)
 
 import logging
 logger = logging.getLogger(__name__)
@@ -33,10 +34,11 @@
 from pytools import memoize_on_first_arg
 from pytools.tag import Tag
 import islpy as isl
+from pymbolic.primitives import Expression
+
+from pyrsistent import PMap
 
 from loopy.kernel.data import make_assignment
-from loopy.kernel.tools import (
-        kernel_has_global_barriers, find_most_recent_global_barrier)
 from loopy.symbolic import ReductionCallbackMapper
 from loopy.translation_unit import TranslationUnit
 from loopy.kernel.function_interface import CallableKernel
@@ -51,30 +53,34 @@
 
 # {{{ reduction realization context
 
+@dataclass
+class _ChangeFlag:
+    changes_made: bool
+
+
 @dataclass(frozen=True)
 class _ReductionRealizationContext:
     # {{{ read-only
 
+    mapper: "RealizeReductionCallbackMapper"
+
     force_scan: bool
     automagic_scans_ok: bool
     unknown_types_ok: bool
 
-    # FIXME: This feels like a broken-by-design concept
+    # FIXME: This feels like a broken-by-design concept.
     force_outer_iname_for_scan: Optional[str]
 
     # We use the original kernel for a number of lookups whose value
     # we do not change and which might be already cached on it.
     orig_kernel: LoopKernel
-
     kernel: LoopKernel
 
-    # FIXME: This shouldn't be here. We might generate multiple instructions
-    # in a nested manner. Why should the 'top-level' instruction be special?
-    insn: InstructionBase
+    id_prefix: str
 
     # }}}
 
-    # {{{ internally mutable
+    # {{{ internally mutable, same across entire recursion
 
     insn_id_gen: Callable[[str], str]
     var_name_gen: Callable[[str], str]
@@ -83,28 +89,84 @@ class _ReductionRealizationContext:
     additional_insns: List[InstructionBase]
     domains: List[isl.BasicSet]
     additional_iname_tags: Dict[str, Sequence[Tag]]
+    # list used only as a mutable box; element 0 is the current callables table
+    boxed_callables_table: List[PMap]
 
     # FIXME: This is a broken-by-design concept. Local-parallel scans emit a
     # reduction internally. This serves to avoid force_scan acting on that
     # reduction.
     inames_added_for_scan: Set[str]
 
-    # FIXME: Clarify how these relate to recursively generated instructions.
-    new_insn_add_depends_on: Set[str]
-    new_insn_add_no_sync_with: Set[Tuple[str, str]]
-    new_insn_add_within_inames: Set[str]
+    # }}}
+
+    # {{{ surrounding instruction, read-only (different at each recursive level)
+
+    # These are attributes of the 'surrounding' instruction, for generated
+    # instructions to potentially inherit.
+    surrounding_within_inames: FrozenSet[str]
+    surrounding_depends_on: FrozenSet[str]
+    surrounding_no_sync_with: FrozenSet[Tuple[str, str]]
+    surrounding_predicates: FrozenSet[Expression]
 
     # }}}
 
-    # {{{ change tracking
+    # {{{ surrounding instruction, internally mutable
+    # (different at each recursive level)
+
+    # These are requested additions to attributes of the surrounding instruction.
+
+    # FIXME add_within_inames seems broken by design.
+    surrounding_insn_add_within_inames: Set[str]
+
+    surrounding_insn_add_depends_on: Set[str]
+    surrounding_insn_add_no_sync_with: Set[Tuple[str, str]]
+
+    # }}}
+
+    # {{{ change tracking (same across entire recursion)
+
+    _change_flag: _ChangeFlag
 
-    were_changes_made: bool
+    @property
+    def were_changes_made(self):
+        return self._change_flag.changes_made
 
     def changes_made(self):
-        object.__setattr__(self, "were_changes_made", True)
+        self._change_flag.changes_made = True
 
     # }}}
 
+    def new_subinstruction(self, *, within_inames, depends_on,
+            no_sync_with=None, predicates=None):
+        if no_sync_with is None:
+            no_sync_with = self.surrounding_no_sync_with
+        if predicates is None:
+            predicates = self.surrounding_predicates
+
+        return replace(self,
+                surrounding_within_inames=within_inames,
+                surrounding_depends_on=depends_on,
+                surrounding_no_sync_with=no_sync_with,
+                surrounding_predicates=predicates,
+
+                surrounding_insn_add_within_inames=set(),
+                surrounding_insn_add_depends_on=set(),
+                surrounding_insn_add_no_sync_with=set())
+
+    def get_insn_kwargs(self):
+        return dict(
+                within_inames=(
+                    self.surrounding_within_inames
+                    | frozenset(self.surrounding_insn_add_within_inames)),
+                within_inames_is_final=True,
+                depends_on=(
+                    self.surrounding_depends_on
+                    | frozenset(self.surrounding_insn_add_depends_on)),
+                no_sync_with=(
+                    self.surrounding_no_sync_with
+                    | frozenset(self.surrounding_insn_add_no_sync_with)),
+                predicates=self.surrounding_predicates)
+
 # }}}
 
 
@@ -117,7 +179,7 @@ class _InameClassification:
     nonlocal_parallel: Tuple[str, ...]
 
 
-def _classify_reduction_inames(kernel, inames):
+def _classify_reduction_inames(red_realize_ctx, inames):
     sequential = []
     local_par = []
     nonlocal_par = []
@@ -127,7 +189,10 @@ def _classify_reduction_inames(kernel, inames):
             ConcurrentTag, filter_iname_tags_by_type)
 
     for iname in inames:
-        iname_tags = kernel.iname_tags(iname)
+        try:
+            iname_tags = red_realize_ctx.additional_iname_tags[iname]
+        except KeyError:
+            iname_tags = red_realize_ctx.kernel.iname_tags(iname)
 
         if filter_iname_tags_by_type(iname_tags, (UnrollTag, UnrolledIlpTag)):
             # These are nominally parallel, but we can live with
@@ -333,8 +398,13 @@ def _try_infer_scan_candidate_from_expr(
                 "(sweep iname: '%s', scan iname: '%s'): %s"
                 % (expr, sweep_iname, scan_iname, v))
 
-    return _ScanCandidateParameters(sweep_iname, scan_iname, sweep_lower_bound,
-            sweep_upper_bound, scan_lower_bound, stride)
+    return _ScanCandidateParameters(
+            sweep_iname=sweep_iname,
+            scan_iname=scan_iname,
+            sweep_lower_bound=sweep_lower_bound,
+            sweep_upper_bound=sweep_upper_bound,
+            scan_lower_bound=scan_lower_bound,
+            stride=stride)
 
 
 def _try_infer_sweep_iname(domain, scan_iname, candidate_inames):
@@ -499,15 +569,16 @@ def _get_domain_with_iname_as_param(domain, iname):
         dim_type.set, iname_idx, 1)
 
 
-def _create_domain_for_sweep_tracking(orig_domain,
-        tracking_iname, sweep_iname, sweep_min_value, scan_min_value, stride):
+def _create_domain_for_sweep_tracking(orig_domain, tracking_iname, scan_param):
+    sp = scan_param
+
     dim_type = isl.dim_type
 
     subd = isl.BasicSet.universe(orig_domain.params().space)
 
     # Add tracking_iname and sweep iname.
 
-    subd = _add_params_to_domain(subd, (sweep_iname, tracking_iname))
+    subd = _add_params_to_domain(subd, (sp.sweep_iname, tracking_iname))
 
     # Here we realize the domain:
     #
@@ -526,11 +597,11 @@ def _create_domain_for_sweep_tracking(orig_domain,
     #
     affs = isl.affs_from_space(subd.space)
 
-    subd &= (affs[tracking_iname] - scan_min_value).ge_set(affs[0])
-    subd &= (affs[tracking_iname] - scan_min_value)\
-            .le_set(stride * (affs[sweep_iname] - sweep_min_value))
-    subd &= (affs[tracking_iname] - scan_min_value)\
-            .gt_set(stride * (affs[sweep_iname] - sweep_min_value - 1))
+    subd &= (affs[tracking_iname] - sp.scan_lower_bound).ge_set(affs[0])
+    subd &= (affs[tracking_iname] - sp.scan_lower_bound)\
+            .le_set(sp.stride * (affs[sp.sweep_iname] - sp.sweep_lower_bound))
+    subd &= (affs[tracking_iname] - sp.scan_lower_bound)\
+            .gt_set(sp.stride * (affs[sp.sweep_iname] - sp.sweep_lower_bound - 1))
 
     # Move tracking_iname into a set dim (NOT sweep iname).
     subd = subd.move_dims(
@@ -539,7 +610,7 @@ def _create_domain_for_sweep_tracking(orig_domain,
 
     # Simplify (maybe).
     orig_domain_with_sweep_param = (
-            _get_domain_with_iname_as_param(orig_domain, sweep_iname))
+            _get_domain_with_iname_as_param(orig_domain, sp.sweep_iname))
     subd = subd.gist_params(orig_domain_with_sweep_param.params())
 
     subd, = subd.get_basic_sets()
@@ -738,39 +809,56 @@ def _add_to_depends_on(insn_id, new_depends_on_params):
 # {{{ RealizeReductionCallbackMapper
 
 class RealizeReductionCallbackMapper(ReductionCallbackMapper):
-    def __init__(self, callback, callables_table):
+    def __init__(self, callback):
         super().__init__(callback)
-        self.callables_table = callables_table
 
     def map_reduction(self, expr, **kwargs):
-        result, self.callables_table = self.callback(expr, rec=self.rec,
-                **kwargs)
-        return result
+        return self.callback(expr, **kwargs)
 
-    def map_if(self, expr, *,
-            callables_table, red_realize_ctx,
-            guarding_predicates, nresults):
+    def map_if(self, expr, *, red_realize_ctx, nresults):
+        common_kwargs = dict(nresults=nresults)
 
-        common_kwargs = dict(
-                callables_table=callables_table,
-                red_realize_ctx=red_realize_ctx,
-                nresults=nresults)
+        # {{{ generate code for condition
 
+        rrc_cond = replace(red_realize_ctx,
+                surrounding_insn_add_depends_on=set(),
+                surrounding_insn_add_no_sync_with=set(),
+                surrounding_insn_add_within_inames=set())
         import pymbolic.primitives as prim
         rec_cond = self.rec(
                 expr.condition,
-                guarding_predicates=guarding_predicates,
+                red_realize_ctx=rrc_cond,
                 **common_kwargs)
+        assert not rrc_cond.surrounding_insn_add_no_sync_with
+        assert not rrc_cond.surrounding_insn_add_within_inames
+
+        cond_dep_on = rrc_cond.surrounding_insn_add_depends_on
+        red_realize_ctx.surrounding_insn_add_depends_on.update(cond_dep_on)
+
+        # }}}
+
         return prim.If(rec_cond,
                        self.rec(expr.then,
-                           guarding_predicates=(
-                               guarding_predicates
-                               | frozenset([rec_cond])),
+                           red_realize_ctx=replace(
+                               red_realize_ctx,
+                               surrounding_depends_on=(
+                                   red_realize_ctx.surrounding_depends_on
+                                   | cond_dep_on),
+                               surrounding_predicates=(
+                                   red_realize_ctx.surrounding_predicates
+                                   | frozenset([rec_cond])
+                                   )),
                            **common_kwargs),
                        self.rec(expr.else_,
-                           guarding_predicates=(
-                               guarding_predicates
-                               | frozenset([prim.LogicalNot(rec_cond)])),
+                           red_realize_ctx=replace(
+                               red_realize_ctx,
+                               surrounding_depends_on=(
+                                   red_realize_ctx.surrounding_depends_on
+                                   | cond_dep_on),
+                               surrounding_predicates=(
+                                   red_realize_ctx.surrounding_predicates
+                                   | frozenset([prim.LogicalNot(rec_cond)])
+                                   )),
                            **common_kwargs))
 
 # }}}
@@ -788,13 +876,10 @@ def _strip_if_scalar(reference, val):
 def _preprocess_scan_arguments(
         red_realize_ctx,
         expr, nresults, scan_iname, track_iname,
-        newly_generated_insn_id_set,
-        insn_id_gen):
+        newly_generated_insn_id_set):
     """Does iname substitution within scan arguments and returns a set of values
     suitable to be passed to the binary op. Returns a tuple."""
 
-    insn = red_realize_ctx.insn
-
     if nresults > 1:
         inner_expr = expr
 
@@ -802,21 +887,21 @@ def _preprocess_scan_arguments(
         # the arguments in order to pass them to the binary op - so we expand
         # items that are not "plain" tuples here.
         if not isinstance(inner_expr, tuple):
-            get_args_insn_id = insn_id_gen(
-                    "{}_{}_get".format(insn.id, "_".join(expr.inames)))
+            get_args_insn_id = red_realize_ctx.insn_id_gen(
+                    f"{red_realize_ctx.id_prefix}_{'_'.join(expr.inames)}_get")
 
             inner_expr = expand_inner_reduction(
                     red_realize_ctx=red_realize_ctx,
                     id=get_args_insn_id,
                     expr=inner_expr,
                     nresults=nresults,
-                    depends_on=insn.depends_on,
-                    within_inames=insn.within_inames | expr.inames,
-                    within_inames_is_final=insn.within_inames_is_final,
-                    predicates=insn.predicates,
+                    depends_on=red_realize_ctx.surrounding_depends_on,
+                    within_inames=red_realize_ctx.surrounding_within_inames,
+                    predicates=red_realize_ctx.surrounding_predicates,
                     )
 
-            newly_generated_insn_id_set.add(get_args_insn_id)
+            newly_generated_insn_id_set = (
+                    newly_generated_insn_id_set | frozenset({get_args_insn_id}))
 
         updated_inner_exprs = tuple(
                 replace_var_within_expr(
@@ -829,14 +914,13 @@ def _preprocess_scan_arguments(
                     red_realize_ctx.kernel, red_realize_ctx.var_name_gen,
                     expr, scan_iname, track_iname),)
 
-    return updated_inner_exprs
+    return updated_inner_exprs, newly_generated_insn_id_set
 
 # }}}
 
 
 def expand_inner_reduction(
-        red_realize_ctx, id, expr, nresults, depends_on, within_inames,
-        within_inames_is_final, predicates):
+        red_realize_ctx, id, expr, nresults, depends_on, within_inames, predicates):
     # FIXME: use _make_temporaries
     from pymbolic.primitives import Call
     from loopy.symbolic import Reduction
@@ -862,7 +946,7 @@ def expand_inner_reduction(
             expression=expr,
             depends_on=depends_on,
             within_inames=within_inames,
-            within_inames_is_final=within_inames_is_final,
+            within_inames_is_final=True,
             predicates=predicates)
 
     red_realize_ctx.additional_insns.append(call_insn)
@@ -872,13 +956,8 @@ def expand_inner_reduction(
 
 # {{{ reduction type: sequential
 
-def map_reduction_seq(
-        red_realize_ctx, expr, rec, callables_table, nresults, arg_dtypes,
-        reduction_dtypes, guarding_predicates):
+def map_reduction_seq(red_realize_ctx, expr, nresults, arg_dtypes, reduction_dtypes):
     orig_kernel = red_realize_ctx.orig_kernel
-    insn = red_realize_ctx.insn
-
-    outer_insn_inames = red_realize_ctx.insn.within_inames
 
     acc_var_names = _make_temporaries(
             red_realize_ctx=red_realize_ctx,
@@ -888,37 +967,24 @@ def map_reduction_seq(
             dtypes=reduction_dtypes,
             address_space=AddressSpace.PRIVATE)
 
-    init_insn_depends_on = frozenset()
-
-    # check first that the original kernel had global barriers
-    # if not, we don't need to check. Since the function
-    # kernel_has_global_barriers is cached, we don't do
-    # extra work compared to not checking.
-    # FIXME: Explain why we care about global barriers here
-    if kernel_has_global_barriers(orig_kernel):
-        global_barrier = find_most_recent_global_barrier(
-                red_realize_ctx.kernel,
-                insn.id)
-
-        if global_barrier is not None:
-            init_insn_depends_on |= frozenset([global_barrier])
-
     from pymbolic import var
     acc_vars = tuple(var(n) for n in acc_var_names)
 
     init_id = red_realize_ctx.insn_id_gen(
-            "{}_{}_init".format(insn.id, "_".join(expr.inames)))
+            f"{red_realize_ctx.id_prefix}_{'_'.join(expr.inames)}_init")
 
-    expression, callables_table = expr.operation.neutral_element(
-            *arg_dtypes, callables_table=callables_table,
-            target=red_realize_ctx.orig_kernel.target)
+    expression, red_realize_ctx.boxed_callables_table[0] = \
+            expr.operation.neutral_element(
+                    *arg_dtypes,
+                    callables_table=red_realize_ctx.boxed_callables_table[0],
+                    target=red_realize_ctx.orig_kernel.target)
 
     init_insn = make_assignment(
             id=init_id,
             assignees=acc_vars,
-            within_inames=outer_insn_inames - frozenset(expr.inames),
-            within_inames_is_final=insn.within_inames_is_final,
-            depends_on=init_insn_depends_on,
+            within_inames=red_realize_ctx.surrounding_within_inames,
+            within_inames_is_final=True,
+            depends_on=frozenset(),
             expression=expression,
 
             # Do not inherit predicates: Those might read variables
@@ -934,61 +1000,60 @@ def map_reduction_seq(
     red_realize_ctx.additional_insns.append(init_insn)
 
     update_id = red_realize_ctx.insn_id_gen(
-            based_on="{}_{}_update".format(insn.id, "_".join(expr.inames)))
+            based_on=f"{red_realize_ctx.id_prefix}_{'_'.join(expr.inames)}_update")
 
-    update_insn_iname_deps = insn.within_inames | set(expr.inames)
-    if insn.within_inames_is_final:
-        update_insn_iname_deps = insn.within_inames | set(expr.inames)
+    update_red_realize_ctx = red_realize_ctx.new_subinstruction(
+            within_inames=(
+                red_realize_ctx.surrounding_within_inames
+                | frozenset(expr.inames)),
+            depends_on=(
+                frozenset({init_id})
+                | red_realize_ctx.surrounding_depends_on))
 
-    reduction_insn_depends_on = {init_id}
+    reduction_expr = red_realize_ctx.mapper(
+            expr.expr, red_realize_ctx=update_red_realize_ctx,
+            nresults=1)
 
     # In the case of a multi-argument reduction, we need a name for each of
     # the arguments in order to pass them to the binary op - so we expand
     # items that are not "plain" tuples here.
-    if nresults > 1 and not isinstance(expr.expr, tuple):
+    if nresults > 1 and not isinstance(reduction_expr, tuple):
         get_args_insn_id = red_realize_ctx.insn_id_gen(
-                "{}_{}_get".format(insn.id, "_".join(expr.inames)))
+                f"{red_realize_ctx.id_prefix}_{'_'.join(expr.inames)}_get")
 
         reduction_expr = expand_inner_reduction(
                 red_realize_ctx=red_realize_ctx,
                 id=get_args_insn_id,
-                expr=expr.expr,
+                expr=reduction_expr,
                 nresults=nresults,
-                depends_on=insn.depends_on,
-                within_inames=update_insn_iname_deps,
-                within_inames_is_final=insn.within_inames_is_final,
-                predicates=guarding_predicates,
+                depends_on=red_realize_ctx.surrounding_depends_on,
+                within_inames=update_red_realize_ctx.surrounding_within_inames,
+                predicates=red_realize_ctx.surrounding_predicates,
                 )
 
-        reduction_insn_depends_on.add(get_args_insn_id)
-    else:
-        reduction_expr = expr.expr
+        update_red_realize_ctx.surrounding_insn_add_depends_on.add(get_args_insn_id)
 
-    expression, callables_table = expr.operation(
+    expression, red_realize_ctx.boxed_callables_table[0] = expr.operation(
             arg_dtypes,
             _strip_if_scalar(acc_vars, acc_vars),
             reduction_expr,
-            callables_table,
+            red_realize_ctx.boxed_callables_table[0],
             orig_kernel.target)
 
     reduction_insn = make_assignment(
             id=update_id,
             assignees=acc_vars,
             expression=expression,
-            depends_on=frozenset(reduction_insn_depends_on) | insn.depends_on,
-            within_inames=update_insn_iname_deps,
-            within_inames_is_final=insn.within_inames_is_final,
-            predicates=guarding_predicates,)
+            **update_red_realize_ctx.get_insn_kwargs())
 
     red_realize_ctx.additional_insns.append(reduction_insn)
-
-    red_realize_ctx.new_insn_add_depends_on.add(reduction_insn.id)
+    red_realize_ctx.surrounding_insn_add_depends_on.add(reduction_insn.id)
 
     if nresults == 1:
         assert len(acc_vars) == 1
-        return acc_vars[0], callables_table
+        return acc_vars[0]
     else:
-        return acc_vars, callables_table
+        return acc_vars
 
 # }}}
 
@@ -1024,30 +1089,26 @@ def _make_slab_set_from_range(iname, lbound, ubound):
     return bs
 
 
-def map_reduction_local(
-        red_realize_ctx,
-        expr, rec, callables_table, nresults, arg_dtypes,
-        reduction_dtypes, guarding_predicates):
+def map_reduction_local(red_realize_ctx, expr, nresults, arg_dtypes,
+        reduction_dtypes):
     orig_kernel = red_realize_ctx.orig_kernel
-    insn = red_realize_ctx.insn
 
     red_iname, = expr.inames
 
     size = _get_int_iname_size(orig_kernel, red_iname)
 
-    outer_insn_inames = insn.within_inames
-
     from loopy.kernel.data import LocalInameTagBase
-    outer_local_inames = tuple(oiname for oiname in outer_insn_inames
+    surrounding_local_inames = tuple(
+            oiname for oiname in red_realize_ctx.surrounding_within_inames
             if orig_kernel.iname_tags_of_type(oiname, LocalInameTagBase))
 
     from pymbolic import var
     outer_local_iname_vars = tuple(
-            var(oiname) for oiname in outer_local_inames)
+            var(oiname) for oiname in surrounding_local_inames)
 
     outer_local_iname_sizes = tuple(
             _get_int_iname_size(orig_kernel, oiname)
-            for oiname in outer_local_inames)
+            for oiname in surrounding_local_inames)
 
     neutral_var_names = _make_temporaries(
             red_realize_ctx=red_realize_ctx,
@@ -1079,19 +1140,22 @@ def map_reduction_local(
 
     # }}}
 
-    base_iname_deps = outer_insn_inames - frozenset(expr.inames)
-
-    neutral, callables_table = expr.operation.neutral_element(*arg_dtypes,
-            callables_table=callables_table, target=orig_kernel.target)
-    init_id = red_realize_ctx.insn_id_gen(f"{insn.id}_{red_iname}_init")
+    neutral, red_realize_ctx.boxed_callables_table[0] = \
+            expr.operation.neutral_element(*arg_dtypes,
+                    callables_table=red_realize_ctx.boxed_callables_table[0],
+                    target=orig_kernel.target)
+    init_id = red_realize_ctx.insn_id_gen(
+            f"{red_realize_ctx.id_prefix}_{red_iname}_init")
     init_insn = make_assignment(
             id=init_id,
             assignees=tuple(
                 acc_var[outer_local_iname_vars + (var(base_exec_iname),)]
                 for acc_var in acc_vars),
             expression=neutral,
-            within_inames=base_iname_deps | frozenset([base_exec_iname]),
-            within_inames_is_final=insn.within_inames_is_final,
+            within_inames=(
+                red_realize_ctx.surrounding_within_inames
+                | frozenset([base_exec_iname])),
+            within_inames_is_final=True,
             depends_on=frozenset(),
             # Do not inherit predicates: Those might read variables
             # that may not yet be set, and we don't have a great way
@@ -1105,52 +1169,65 @@ def map_reduction_local(
     red_realize_ctx.additional_insns.append(init_insn)
 
     init_neutral_id = red_realize_ctx.insn_id_gen(
-            f"{insn.id}_{red_iname}_init_neutral")
+            f"{red_realize_ctx.id_prefix}_{red_iname}_init_neutral")
     init_neutral_insn = make_assignment(
             id=init_neutral_id,
             assignees=tuple(var(nvn) for nvn in neutral_var_names),
             expression=neutral,
-            within_inames=base_iname_deps | frozenset([base_exec_iname]),
-            within_inames_is_final=insn.within_inames_is_final,
+            within_inames=(
+                red_realize_ctx.surrounding_within_inames
+                | frozenset([base_exec_iname])),
+            within_inames_is_final=True,
             depends_on=frozenset(),
-            predicates=guarding_predicates,
+            predicates=red_realize_ctx.surrounding_predicates,
             )
     red_realize_ctx.additional_insns.append(init_neutral_insn)
 
     transfer_depends_on = {init_neutral_id, init_id}
 
+    transfer_red_realize_ctx = red_realize_ctx.new_subinstruction(
+            within_inames=(
+                    red_realize_ctx.surrounding_within_inames
+                    | frozenset([red_iname])),
+            depends_on=(
+                red_realize_ctx.surrounding_depends_on
+                | frozenset([init_id, init_neutral_id])),
+            no_sync_with=(
+                red_realize_ctx.surrounding_no_sync_with
+                | frozenset([(init_id, "any")])))
+
+    reduction_expr = red_realize_ctx.mapper(
+            expr.expr, red_realize_ctx=transfer_red_realize_ctx,
+            nresults=1)
+
     # In the case of a multi-argument reduction, we need a name for each of
     # the arguments in order to pass them to the binary op - so we expand
     # items that are not "plain" tuples here.
-    if nresults > 1 and not isinstance(expr.expr, tuple):
+    if nresults > 1 and not isinstance(reduction_expr, tuple):
         get_args_insn_id = red_realize_ctx.insn_id_gen(
-                f"{insn.id}_{red_iname}_get")
+                f"{red_realize_ctx.id_prefix}_{red_iname}_get")
 
         reduction_expr = expand_inner_reduction(
                 red_realize_ctx=red_realize_ctx,
                 id=get_args_insn_id,
-                expr=expr.expr,
+                expr=reduction_expr,
                 nresults=nresults,
-                depends_on=insn.depends_on,
-                within_inames=(
-                    (outer_insn_inames - frozenset(expr.inames))
-                    | frozenset([red_iname])),
-                within_inames_is_final=insn.within_inames_is_final,
-                predicates=guarding_predicates,
+                depends_on=red_realize_ctx.surrounding_depends_on,
+                within_inames=transfer_red_realize_ctx.surrounding_within_inames,
+                predicates=red_realize_ctx.surrounding_predicates,
                 )
 
         transfer_depends_on.add(get_args_insn_id)
-    else:
-        reduction_expr = expr.expr
 
-    transfer_id = red_realize_ctx.insn_id_gen(f"{insn.id}_{red_iname}_transfer")
-    expression, callables_table = expr.operation(
+    transfer_id = red_realize_ctx.insn_id_gen(
+            f"{red_realize_ctx.id_prefix}_{red_iname}_transfer")
+    expression, red_realize_ctx.boxed_callables_table[0] = expr.operation(
             arg_dtypes,
             _strip_if_scalar(
                 neutral_var_names,
                 tuple(var(nvn) for nvn in neutral_var_names)),
             reduction_expr,
-            callables_table,
+            red_realize_ctx.boxed_callables_table[0],
             orig_kernel.target)
     transfer_insn = make_assignment(
             id=transfer_id,
@@ -1158,14 +1235,7 @@ def map_reduction_local(
                 acc_var[outer_local_iname_vars + (var(red_iname),)]
                 for acc_var in acc_vars),
             expression=expression,
-            within_inames=(
-                (outer_insn_inames - frozenset(expr.inames))
-                | frozenset([red_iname])),
-            within_inames_is_final=insn.within_inames_is_final,
-            depends_on=frozenset([init_id, init_neutral_id]) | insn.depends_on,
-            no_sync_with=frozenset([(init_id, "any")]),
-            predicates=insn.predicates,
-            )
+            **transfer_red_realize_ctx.get_insn_kwargs())
     red_realize_ctx.additional_insns.append(transfer_insn)
 
     cur_size = 1
@@ -1193,7 +1263,7 @@ def map_reduction_local(
         stage_id = red_realize_ctx.insn_id_gen(
                 "red_%s_stage_%d" % (red_iname, istage))
 
-        expression, callables_table = expr.operation(
+        expression, red_realize_ctx.boxed_callables_table[0] = expr.operation(
                 arg_dtypes,
                 _strip_if_scalar(acc_vars, tuple(
                     acc_var[
@@ -1204,7 +1274,7 @@ def map_reduction_local(
                         outer_local_iname_vars + (
                             var(stage_exec_iname) + new_size,)]
                     for acc_var in acc_vars)),
-                callables_table,
+                red_realize_ctx.boxed_callables_table[0],
                 orig_kernel.target)
 
         stage_insn = make_assignment(
@@ -1214,10 +1284,11 @@ def map_reduction_local(
                     for acc_var in acc_vars),
                 expression=expression,
                 within_inames=(
-                    base_iname_deps | frozenset([stage_exec_iname])),
-                within_inames_is_final=insn.within_inames_is_final,
+                    red_realize_ctx.surrounding_within_inames
+                    | frozenset([stage_exec_iname])),
+                within_inames_is_final=True,
                 depends_on=frozenset([prev_id]),
-                predicates=insn.predicates,
+                predicates=red_realize_ctx.surrounding_predicates,
                 )
 
         red_realize_ctx.additional_insns.append(stage_insn)
@@ -1227,17 +1298,17 @@ def map_reduction_local(
         bound = cur_size
         istage += 1
 
-    red_realize_ctx.new_insn_add_depends_on.add(prev_id)
-    red_realize_ctx.new_insn_add_no_sync_with.add((prev_id, "any"))
-    red_realize_ctx.new_insn_add_within_inames.add(
+    red_realize_ctx.surrounding_insn_add_depends_on.add(prev_id)
+    red_realize_ctx.surrounding_insn_add_no_sync_with.add((prev_id, "any"))
+    red_realize_ctx.surrounding_insn_add_within_inames.add(
             stage_exec_iname or base_exec_iname)
 
     if nresults == 1:
         assert len(acc_vars) == 1
-        return acc_vars[0][outer_local_iname_vars + (0,)], callables_table
+        return acc_vars[0][outer_local_iname_vars + (0,)]
     else:
         return [acc_var[outer_local_iname_vars + (0,)] for acc_var in
-                acc_vars], callables_table
+                acc_vars]
 # }}}
 
 
@@ -1246,16 +1317,17 @@ def map_reduction_local(
 @memoize_on_first_arg
 def _get_or_add_sweep_tracking_iname_and_domain(
         red_realize_ctx,
-        scan_iname, sweep_iname, sweep_min_value, scan_min_value, stride,
+        scan_param,
         tracking_iname):
     kernel = red_realize_ctx.kernel
 
-    domain = kernel.get_inames_domain(frozenset((scan_iname, sweep_iname)))
+    domain = kernel.get_inames_domain(
+            frozenset((scan_param.scan_iname, scan_param.sweep_iname)))
 
     red_realize_ctx.inames_added_for_scan.add(tracking_iname)
 
-    new_domain = _create_domain_for_sweep_tracking(domain,
-            tracking_iname, sweep_iname, sweep_min_value, scan_min_value, stride)
+    new_domain = _create_domain_for_sweep_tracking(
+            domain, tracking_iname, scan_param)
 
     _insert_subdomain_into_domain_tree(kernel, red_realize_ctx.domains, new_domain)
 
@@ -1268,6 +1340,9 @@ def replace_var_within_expr(kernel, var_name_gen, expr, from_var, to_var):
     from loopy.symbolic import (
         SubstitutionRuleMappingContext, RuleAwareSubstitutionMapper)
 
+    # FIXME: This is broken. SubstitutionRuleMappingContext produces a new
+    # kernel (via finish_kernel) with new subst rules. These get dropped on the
+    # floor here.
     rule_mapping_context = SubstitutionRuleMappingContext(
         kernel.substitutions, var_name_gen)
 
@@ -1302,28 +1377,21 @@ def _make_temporaries(
 
 # {{{ reduction type: sequential scan
 
-def map_scan_seq(
-        red_realize_ctx,
-        expr, rec, callables_table, nresults, arg_dtypes,
-        reduction_dtypes, sweep_iname, scan_iname, sweep_min_value,
-        scan_min_value, stride, guarding_predicates):
-    insn = red_realize_ctx.insn
-
-    outer_insn_inames = insn.within_inames
+def map_scan_seq(red_realize_ctx, expr, nresults, arg_dtypes,
+        reduction_dtypes, scan_param):
 
     track_iname = red_realize_ctx.var_name_gen(
             "{sweep_iname}__seq_scan"
-            .format(sweep_iname=sweep_iname))
+            .format(sweep_iname=scan_param.sweep_iname))
 
     _get_or_add_sweep_tracking_iname_and_domain(
-            red_realize_ctx,
-            scan_iname, sweep_iname, sweep_min_value, scan_min_value,
-            stride, track_iname)
+            red_realize_ctx, scan_param, track_iname)
+    red_realize_ctx.additional_iname_tags[track_iname] = frozenset()
 
     from loopy.kernel.data import AddressSpace
     acc_var_names = _make_temporaries(
             red_realize_ctx=red_realize_ctx,
-            name_based_on="acc_" + scan_iname,
+            name_based_on="acc_" + scan_param.scan_iname,
             nvars=nresults,
             shape=(),
             dtypes=reduction_dtypes,
@@ -1333,28 +1401,22 @@ def map_scan_seq(
     acc_vars = tuple(var(n) for n in acc_var_names)
 
     init_id = red_realize_ctx.insn_id_gen(
-            "{}_{}_init".format(insn.id, "_".join(expr.inames)))
+            f"{red_realize_ctx.id_prefix}_{'_'.join(expr.inames)}_init")
 
     init_insn_depends_on = frozenset()
 
-    # FIXME: Explain why we care about global barriers here
-    if kernel_has_global_barriers(red_realize_ctx.orig_kernel):
-        global_barrier = find_most_recent_global_barrier(
-                red_realize_ctx.kernel, insn.id)
-
-        if global_barrier is not None:
-            init_insn_depends_on |= frozenset([global_barrier])
-
-    expression, callables_table = expr.operation.neutral_element(
-            *arg_dtypes, callables_table=callables_table,
-            target=red_realize_ctx.orig_kernel.target)
+    expression, red_realize_ctx.boxed_callables_table[0] = \
+            expr.operation.neutral_element(*arg_dtypes,
+                    callables_table=red_realize_ctx.boxed_callables_table[0],
+                    target=red_realize_ctx.orig_kernel.target)
 
     init_insn = make_assignment(
             id=init_id,
             assignees=acc_vars,
-            within_inames=outer_insn_inames - frozenset(
-                (sweep_iname,) + expr.inames),
-            within_inames_is_final=insn.within_inames_is_final,
+            within_inames=(
+                red_realize_ctx.surrounding_within_inames
+                - frozenset((scan_param.sweep_iname,) + expr.inames)),
+            within_inames_is_final=True,
             depends_on=init_insn_depends_on,
             expression=expression,
             # Do not inherit predicates: Those might read variables
@@ -1369,78 +1431,86 @@ def map_scan_seq(
 
     red_realize_ctx.additional_insns.append(init_insn)
 
-    update_insn_depends_on = {init_insn.id} | insn.depends_on
+    scan_insn_depends_on = {init_insn.id} | red_realize_ctx.surrounding_depends_on
 
-    updated_inner_exprs = _preprocess_scan_arguments(
-            red_realize_ctx,
-            expr.expr, nresults,
-            scan_iname, track_iname, update_insn_depends_on,
-            insn_id_gen=red_realize_ctx.insn_id_gen)
+    scan_red_realize_ctx = red_realize_ctx.new_subinstruction(
+            within_inames=(
+                red_realize_ctx.surrounding_within_inames
+                | frozenset({scan_param.scan_iname})),
+            depends_on=red_realize_ctx.surrounding_depends_on)
 
-    update_id = red_realize_ctx.insn_id_gen(
-            based_on="{}_{}_update".format(insn.id, "_".join(expr.inames)))
+    reduction_expr = red_realize_ctx.mapper(
+            expr.expr, red_realize_ctx=scan_red_realize_ctx,
+            nresults=1)
+
+    updated_inner_exprs, scan_insn_depends_on = _preprocess_scan_arguments(
+            scan_red_realize_ctx,
+            reduction_expr, nresults,
+            scan_param.scan_iname, track_iname, scan_insn_depends_on)
 
-    update_insn_iname_deps = insn.within_inames | {track_iname}
-    if insn.within_inames_is_final:
-        update_insn_iname_deps = insn.within_inames | {track_iname}
+    scan_id = red_realize_ctx.insn_id_gen(
+            based_on=f"{red_realize_ctx.id_prefix}_{'_'.join(expr.inames)}_scan")
 
-    expression, callables_table = expr.operation(
+    expression, red_realize_ctx.boxed_callables_table[0] = expr.operation(
             arg_dtypes,
             _strip_if_scalar(acc_vars, acc_vars),
             _strip_if_scalar(acc_vars, updated_inner_exprs),
-            callables_table,
+            red_realize_ctx.boxed_callables_table[0],
             red_realize_ctx.orig_kernel.target)
 
     scan_insn = make_assignment(
-            id=update_id,
+            id=scan_id,
             assignees=acc_vars,
             expression=expression,
-            depends_on=frozenset(update_insn_depends_on),
-            within_inames=update_insn_iname_deps,
-            no_sync_with=insn.no_sync_with,
-            within_inames_is_final=insn.within_inames_is_final,
-            predicates=guarding_predicates,
+            within_inames=(
+                red_realize_ctx.surrounding_within_inames
+                | frozenset(
+                    scan_red_realize_ctx.surrounding_insn_add_within_inames)
+                | {track_iname}),
+            depends_on=(
+                frozenset(scan_insn_depends_on)
+                | frozenset(scan_red_realize_ctx.surrounding_insn_add_depends_on)
+                ),
+            no_sync_with=(
+                red_realize_ctx.surrounding_no_sync_with
+                | frozenset(scan_red_realize_ctx.surrounding_insn_add_no_sync_with)
+                ),
+            within_inames_is_final=True,
+            predicates=red_realize_ctx.surrounding_predicates,
             )
 
     red_realize_ctx.additional_insns.append(scan_insn)
-    red_realize_ctx.new_insn_add_depends_on.add(scan_insn.id)
+    red_realize_ctx.surrounding_insn_add_depends_on.add(scan_insn.id)
 
     if nresults == 1:
         assert len(acc_vars) == 1
-        return acc_vars[0], callables_table
+        return acc_vars[0]
     else:
-        return acc_vars, callables_table
+        return acc_vars
 
 # }}}
 
 
 # {{{ reduction type: local-parallel scan
 
-def map_scan_local(
-        red_realize_ctx,
-        expr, rec, callables_table, nresults, arg_dtypes,
-        reduction_dtypes, sweep_iname, scan_iname, sweep_min_value,
-        scan_min_value, stride, guarding_predicates):
+def map_scan_local(red_realize_ctx, expr, nresults, arg_dtypes,
+        reduction_dtypes, scan_param):
 
     orig_kernel = red_realize_ctx.orig_kernel
-    insn = red_realize_ctx.insn
 
-    scan_size = _get_int_iname_size(orig_kernel, sweep_iname)
+    scan_size = _get_int_iname_size(orig_kernel, scan_param.sweep_iname)
 
     assert scan_size > 0
 
     if scan_size == 1:
         return map_reduction_seq(red_realize_ctx,
-                expr, rec, callables_table,
-                nresults, arg_dtypes, reduction_dtypes,
-                guarding_predicates)
-
-    outer_insn_inames = insn.within_inames
+                expr, nresults, arg_dtypes, reduction_dtypes)
 
     from loopy.kernel.data import LocalInameTagBase
-    outer_local_inames = tuple(oiname for oiname in outer_insn_inames
+    outer_local_inames = tuple(
+            oiname for oiname in red_realize_ctx.surrounding_within_inames
             if orig_kernel.iname_tags_of_type(oiname, LocalInameTagBase)
-            and oiname != sweep_iname)
+            and oiname != scan_param.sweep_iname)
 
     from pymbolic import var
     outer_local_iname_vars = tuple(
@@ -1452,28 +1522,29 @@ def map_scan_local(
 
     track_iname = red_realize_ctx.var_name_gen(
             "{sweep_iname}__pre_scan"
-            .format(sweep_iname=sweep_iname))
+            .format(sweep_iname=scan_param.sweep_iname))
 
     _get_or_add_sweep_tracking_iname_and_domain(
             red_realize_ctx,
-            scan_iname, sweep_iname, sweep_min_value, scan_min_value, stride,
+            scan_param,
             track_iname)
+    red_realize_ctx.additional_iname_tags[track_iname] = frozenset()
 
     # {{{ add separate iname to carry out the scan
 
     # Doing this sheds any odd conditionals that may be active
     # on our scan_iname.
 
-    base_exec_iname = red_realize_ctx.var_name_gen(sweep_iname + "__scan")
+    base_exec_iname = red_realize_ctx.var_name_gen(scan_param.sweep_iname + "__scan")
     red_realize_ctx.domains.append(_make_slab_set(base_exec_iname, scan_size))
     red_realize_ctx.additional_iname_tags[base_exec_iname] \
-            = orig_kernel.iname_tags(sweep_iname)
+            = orig_kernel.iname_tags(scan_param.sweep_iname)
 
     # }}}
 
     read_var_names = _make_temporaries(
             red_realize_ctx=red_realize_ctx,
-            name_based_on="read_"+scan_iname+"_arg_{index}",
+            name_based_on="read_"+scan_param.scan_iname+"_arg_{index}",
             nvars=nresults,
             shape=(),
             dtypes=reduction_dtypes,
@@ -1481,7 +1552,7 @@ def map_scan_local(
 
     acc_var_names = _make_temporaries(
             red_realize_ctx=red_realize_ctx,
-            name_based_on="acc_"+scan_iname,
+            name_based_on="acc_"+scan_param.scan_iname,
             nvars=nresults,
             shape=outer_local_iname_sizes + (scan_size,),
             dtypes=reduction_dtypes,
@@ -1490,24 +1561,17 @@ def map_scan_local(
     acc_vars = tuple(var(n) for n in acc_var_names)
     read_vars = tuple(var(n) for n in read_var_names)
 
-    base_iname_deps = (outer_insn_inames
-            - frozenset(expr.inames) - frozenset([sweep_iname]))
-
-    neutral, callables_table = expr.operation.neutral_element(
-            *arg_dtypes, callables_table=callables_table,
-            target=orig_kernel.target)
-
-    init_insn_depends_on = insn.depends_on
-
-    # FIXME: Explain why we care about global barriers here
-    if kernel_has_global_barriers(orig_kernel):
-        global_barrier = find_most_recent_global_barrier(
-                red_realize_ctx.kernel, insn.id)
+    base_iname_deps = (
+            red_realize_ctx.surrounding_within_inames
+            - frozenset([scan_param.sweep_iname]))
 
-        if global_barrier is not None:
-            init_insn_depends_on |= frozenset([global_barrier])
+    neutral, red_realize_ctx.boxed_callables_table[0] = \
+            expr.operation.neutral_element(*arg_dtypes,
+                    callables_table=red_realize_ctx.boxed_callables_table[0],
+                    target=orig_kernel.target)
 
-    init_id = red_realize_ctx.insn_id_gen(f"{insn.id}_{scan_iname}_init")
+    init_id = red_realize_ctx.insn_id_gen(
+            f"{red_realize_ctx.id_prefix}_{scan_param.scan_iname}_init")
     init_insn = make_assignment(
             id=init_id,
             assignees=tuple(
@@ -1515,8 +1579,8 @@ def map_scan_local(
                 for acc_var in acc_vars),
             expression=neutral,
             within_inames=base_iname_deps | frozenset([base_exec_iname]),
-            within_inames_is_final=insn.within_inames_is_final,
-            depends_on=init_insn_depends_on,
+            within_inames_is_final=True,
+            depends_on=frozenset(),
             # Do not inherit predicates: Those might read variables
             # that may not yet be set, and we don't have a great way
             # of figuring out what the dependencies of the accumulator
@@ -1528,57 +1592,88 @@ def map_scan_local(
             )
     red_realize_ctx.additional_insns.append(init_insn)
 
-    transfer_insn_depends_on = {init_insn.id} | insn.depends_on
+    transfer_insn_depends_on = (
+            frozenset({init_insn.id})
+            | red_realize_ctx.surrounding_depends_on)
 
-    updated_inner_exprs = _preprocess_scan_arguments(
-            red_realize_ctx,
-            expr.expr, nresults,
-            scan_iname, track_iname, transfer_insn_depends_on,
-            insn_id_gen=red_realize_ctx.insn_id_gen)
+    transfer_red_realize_ctx = red_realize_ctx.new_subinstruction(
+            within_inames=(
+                red_realize_ctx.surrounding_within_inames
+                | frozenset({scan_param.scan_iname})),
+            depends_on=red_realize_ctx.surrounding_depends_on)
 
-    from loopy.symbolic import Reduction
+    reduction_expr = red_realize_ctx.mapper(
+            expr.expr, red_realize_ctx=transfer_red_realize_ctx,
+            nresults=1)
 
-    from loopy.symbolic import pw_aff_to_expr
-    sweep_min_value_expr = pw_aff_to_expr(sweep_min_value)
+    updated_inner_exprs, transfer_insn_depends_on = _preprocess_scan_arguments(
+            red_realize_ctx,
+            reduction_expr, nresults,
+            scan_param.scan_iname, track_iname, transfer_insn_depends_on)
 
-    transfer_id = red_realize_ctx.insn_id_gen(f"{insn.id}_{scan_iname}_transfer")
-    transfer_insn = make_assignment(
-            id=transfer_id,
-            assignees=tuple(
-                acc_var[outer_local_iname_vars
-                        + (var(sweep_iname) - sweep_min_value_expr,)]
-                for acc_var in acc_vars),
-            expression=Reduction(
+    from loopy.symbolic import Reduction
+    pre_scan_reduction = Reduction(
                 operation=expr.operation,
                 inames=(track_iname,),
                 expr=_strip_if_scalar(acc_vars, updated_inner_exprs),
                 allow_simultaneous=False,
-                ),
-            within_inames=outer_insn_inames - frozenset(expr.inames),
-            within_inames_is_final=insn.within_inames_is_final,
-            depends_on=frozenset(transfer_insn_depends_on),
-            no_sync_with=frozenset([(init_id, "any")]) | insn.no_sync_with,
-            predicates=insn.predicates,
-            )
+                )
 
-    red_realize_ctx.additional_insns.append(transfer_insn)
+    pre_scan_result = red_realize_ctx.mapper(
+            pre_scan_reduction, red_realize_ctx=transfer_red_realize_ctx,
+            nresults=len(acc_vars))
 
-    prev_id = transfer_id
+    from loopy.symbolic import pw_aff_to_expr
+    sweep_lower_bound_expr = pw_aff_to_expr(scan_param.sweep_lower_bound)
+
+    if nresults == 1:
+        assert not isinstance(pre_scan_result, tuple)
+        pre_scan_result = (pre_scan_result,)
+
+    transfer_ids = frozenset()
+    for acc_var, pre_scan_result_i in zip(acc_vars, pre_scan_result):
+        transfer_id = red_realize_ctx.insn_id_gen(
+                f"{red_realize_ctx.id_prefix}_{scan_param.scan_iname}_transfer")
+        transfer_insn = make_assignment(
+                id=transfer_id,
+                assignees=(acc_var[outer_local_iname_vars
+                    + (var(scan_param.sweep_iname) - sweep_lower_bound_expr,)],),
+                expression=pre_scan_result_i,
+                within_inames=(
+                    red_realize_ctx.surrounding_within_inames
+                    | transfer_red_realize_ctx.surrounding_insn_add_within_inames
+                    | frozenset({scan_param.sweep_iname})),
+                within_inames_is_final=True,
+                depends_on=(
+                    transfer_insn_depends_on
+                    | transfer_red_realize_ctx.surrounding_insn_add_depends_on),
+                no_sync_with=(
+                    frozenset([(init_id, "any")])
+                    | transfer_red_realize_ctx.surrounding_insn_add_no_sync_with),
+                predicates=red_realize_ctx.surrounding_predicates,
+                )
+
+        red_realize_ctx.additional_insns.append(transfer_insn)
+        transfer_ids = transfer_ids | frozenset({transfer_id})
+
+    del transfer_id
+
+    prev_ids = transfer_ids
 
     istage = 0
     cur_size = 1
 
     while cur_size < scan_size:
         stage_exec_iname = red_realize_ctx.var_name_gen(
-                "%s__scan_s%d" % (sweep_iname, istage))
+                f"{scan_param.sweep_iname}__scan_s{istage}")
         red_realize_ctx.domains.append(
                 _make_slab_set_from_range(stage_exec_iname, cur_size, scan_size))
         red_realize_ctx.additional_iname_tags[stage_exec_iname] \
-                = orig_kernel.iname_tags(sweep_iname)
+                = orig_kernel.iname_tags(scan_param.sweep_iname)
 
         for read_var, acc_var in zip(read_vars, acc_vars):
             read_stage_id = red_realize_ctx.insn_id_gen(
-                    "scan_%s_read_stage_%d" % (scan_iname, istage))
+                    f"scan_{scan_param.scan_iname}_read_stage_{istage}")
 
             read_stage_insn = make_assignment(
                     id=read_stage_id,
@@ -1589,9 +1684,9 @@ def map_scan_local(
                                 + (var(stage_exec_iname) - cur_size,)]),
                     within_inames=(
                         base_iname_deps | frozenset([stage_exec_iname])),
-                    within_inames_is_final=insn.within_inames_is_final,
-                    depends_on=frozenset([prev_id]),
-                    predicates=insn.predicates,
+                    within_inames_is_final=True,
+                    depends_on=prev_ids,
+                    predicates=red_realize_ctx.surrounding_predicates,
                     )
 
             if cur_size == 1:
@@ -1601,22 +1696,22 @@ def map_scan_local(
                 read_stage_insn = read_stage_insn.copy(
                         no_sync_with=(
                             read_stage_insn.no_sync_with
-                            | frozenset([(transfer_id, "any")])))
+                            | frozenset([(tid, "any") for tid in transfer_ids])))
 
             red_realize_ctx.additional_insns.append(read_stage_insn)
-            prev_id = read_stage_id
+            prev_ids = frozenset({read_stage_id})
 
         write_stage_id = red_realize_ctx.insn_id_gen(
-                "scan_%s_write_stage_%d" % (scan_iname, istage))
+                f"scan_{scan_param.scan_iname}_write_stage_{istage}")
 
-        expression, callables_table = expr.operation(
+        expression, red_realize_ctx.boxed_callables_table[0] = expr.operation(
             arg_dtypes,
             _strip_if_scalar(acc_vars, read_vars),
             _strip_if_scalar(acc_vars, tuple(
                 acc_var[
                     outer_local_iname_vars + (var(stage_exec_iname),)]
                 for acc_var in acc_vars)),
-            callables_table,
+            red_realize_ctx.boxed_callables_table[0],
             orig_kernel.target)
 
         write_stage_insn = make_assignment(
@@ -1627,58 +1722,52 @@ def map_scan_local(
                 expression=expression,
                 within_inames=(
                     base_iname_deps | frozenset([stage_exec_iname])),
-                within_inames_is_final=insn.within_inames_is_final,
-                depends_on=frozenset([prev_id]),
-                predicates=insn.predicates,
+                within_inames_is_final=True,
+                depends_on=prev_ids,
+                predicates=red_realize_ctx.surrounding_predicates,
                 )
 
         red_realize_ctx.additional_insns.append(write_stage_insn)
-        prev_id = write_stage_id
+        prev_ids = frozenset({write_stage_id})
 
         cur_size *= 2
         istage += 1
 
-    red_realize_ctx.new_insn_add_depends_on.add(prev_id)
-    red_realize_ctx.new_insn_add_within_inames.add(sweep_iname)
+    red_realize_ctx.surrounding_insn_add_depends_on.update(prev_ids)
+    red_realize_ctx.surrounding_insn_add_within_inames.add(scan_param.sweep_iname)
 
-    output_idx = var(sweep_iname) - sweep_min_value_expr
+    output_idx = var(scan_param.sweep_iname) - sweep_lower_bound_expr
 
     if nresults == 1:
         assert len(acc_vars) == 1
-        return (acc_vars[0][outer_local_iname_vars + (output_idx,)],
-                callables_table)
+        return acc_vars[0][outer_local_iname_vars + (output_idx,)]
     else:
         return [acc_var[outer_local_iname_vars + (output_idx,)]
-                for acc_var in acc_vars], callables_table
+                for acc_var in acc_vars]
 
 # }}}
 
 
 # {{{ top-level dispatch among reduction types
 
-def map_reduction(
-        expr, *, rec,
-        callables_table, red_realize_ctx,
-        guarding_predicates, nresults):
-    insn = red_realize_ctx.insn
-
-    # Only expand one level of reduction at a time, going from outermost to
-    # innermost. Otherwise we get the (iname + insn) dependencies wrong.
+def map_reduction(expr, *, red_realize_ctx, nresults):
+    kernel_with_updated_domains = red_realize_ctx.kernel.copy(
+            domains=red_realize_ctx.domains)
 
     from loopy.type_inference import (
             infer_arg_and_reduction_dtypes_for_reduction_expression)
     arg_dtypes, reduction_dtypes = (
             infer_arg_and_reduction_dtypes_for_reduction_expression(
-                red_realize_ctx.kernel, expr, callables_table,
+                kernel_with_updated_domains, expr,
+                red_realize_ctx.boxed_callables_table[0],
                 red_realize_ctx.unknown_types_ok))
 
-    outer_insn_inames = insn.within_inames
-    bad_inames = frozenset(expr.inames) & outer_insn_inames
+    bad_inames = frozenset(expr.inames) & red_realize_ctx.surrounding_within_inames
     if bad_inames:
         raise LoopyError("reduction used within loop(s) that it was "
                 "supposed to reduce over: " + ", ".join(bad_inames))
 
-    iname_classes = _classify_reduction_inames(red_realize_ctx.kernel, expr.inames)
+    iname_classes = _classify_reduction_inames(red_realize_ctx, expr.inames)
 
     n_sequential = len(iname_classes.sequential)
     n_local_par = len(iname_classes.local_parallel)
@@ -1698,7 +1787,8 @@ def _error_if_force_scan_on(cls, msg):
             # Try to determine scan candidate information (sweep iname, scan
             # iname, etc).
             scan_param = _try_infer_scan_candidate_from_expr(
-                    red_realize_ctx.kernel, expr, outer_insn_inames,
+                    kernel_with_updated_domains, expr,
+                    red_realize_ctx.surrounding_within_inames,
                     sweep_iname=red_realize_ctx.force_outer_iname_for_scan)
 
         except ValueError as v:
@@ -1707,7 +1797,7 @@ def _error_if_force_scan_on(cls, msg):
         else:
             # Ensures the reduction is triangular (somewhat expensive).
             may_be_implemented_as_scan, error = _check_reduction_is_triangular(
-                        red_realize_ctx.kernel, expr, scan_param)
+                        kernel_with_updated_domains, expr, scan_param)
 
         if not may_be_implemented_as_scan:
             _error_if_force_scan_on(ReductionIsNotTriangularError, error)
@@ -1751,7 +1841,7 @@ def _error_if_force_scan_on(cls, msg):
         # to reduce over. It's rather similar to an array with () shape in
         # numpy.)
 
-        return expr.expr, callables_table
+        return expr.expr
 
     if may_be_implemented_as_scan:
         assert red_realize_ctx.force_scan or red_realize_ctx.automagic_scans_ok
@@ -1759,14 +1849,13 @@ def _error_if_force_scan_on(cls, msg):
         # We require the "scan" iname to be tagged sequential.
         if n_sequential:
             sweep_iname = scan_param.sweep_iname
-            sweep_class = _classify_reduction_inames(
-                    red_realize_ctx.orig_kernel, (sweep_iname,))
+            sweep_class = _classify_reduction_inames(red_realize_ctx, (sweep_iname,))
 
             sequential = sweep_iname in sweep_class.sequential
             parallel = sweep_iname in sweep_class.local_parallel
             bad_parallel = sweep_iname in sweep_class.nonlocal_parallel
 
-            if sweep_iname not in outer_insn_inames:
+            if sweep_iname not in red_realize_ctx.surrounding_within_inames:
                 _error_if_force_scan_on(LoopyError,
                         "Sweep iname '%s' was detected, but is not an iname "
                         "for the instruction." % sweep_iname)
@@ -1778,25 +1867,11 @@ def _error_if_force_scan_on(cls, msg):
                          ", ".join(tag.key
                         for tag in red_realize_ctx.kernel.iname_tags(sweep_iname))))
             elif parallel:
-                return map_scan_local(
-                        red_realize_ctx,
-                        expr, rec, callables_table, nresults,
-                        arg_dtypes, reduction_dtypes,
-                        sweep_iname, scan_param.scan_iname,
-                        scan_param.sweep_lower_bound,
-                        scan_param.scan_lower_bound,
-                        scan_param.stride,
-                        guarding_predicates)
+                return map_scan_local(red_realize_ctx, expr, nresults,
+                        arg_dtypes, reduction_dtypes, scan_param)
             elif sequential:
-                return map_scan_seq(
-                        red_realize_ctx,
-                        expr, rec, callables_table, nresults,
-                        arg_dtypes, reduction_dtypes, sweep_iname,
-                        scan_param.scan_iname,
-                        scan_param.sweep_lower_bound,
-                        scan_param.scan_lower_bound,
-                        scan_param.stride,
-                        guarding_predicates)
+                return map_scan_seq(red_realize_ctx, expr, nresults,
+                        arg_dtypes, reduction_dtypes, scan_param)
 
             # fallthrough to reduction implementation
 
@@ -1814,15 +1889,13 @@ def _error_if_force_scan_on(cls, msg):
         assert n_local_par == 0
         return map_reduction_seq(
                 red_realize_ctx,
-                expr, rec, callables_table,
-                nresults, arg_dtypes, reduction_dtypes,
-                guarding_predicates)
+                expr, nresults, arg_dtypes, reduction_dtypes)
     else:
         assert n_local_par > 0
         return map_reduction_local(
                 red_realize_ctx,
-                expr, rec, callables_table, nresults, arg_dtypes,
-                reduction_dtypes, guarding_predicates)
+                expr, nresults, arg_dtypes,
+                reduction_dtypes)
 
 # }}}
 
@@ -1842,7 +1915,7 @@ def realize_reduction_for_single_kernel(kernel, callables_table,
     insn_id_gen = kernel.get_instruction_id_generator()
     var_name_gen = kernel.get_var_name_generator()
 
-    cb_mapper = RealizeReductionCallbackMapper(map_reduction, callables_table)
+    cb_mapper = RealizeReductionCallbackMapper(map_reduction)
 
     insn_queue = kernel.instructions[:]
     domains = kernel.domains[:]
@@ -1855,6 +1928,8 @@ def realize_reduction_for_single_kernel(kernel, callables_table,
         insn = insn_queue.pop(0)
 
         red_realize_ctx = _ReductionRealizationContext(
+                mapper=cb_mapper,
+
                 force_scan=force_scan,
                 automagic_scans_ok=automagic_scans_ok,
                 unknown_types_ok=unknown_types_ok,
@@ -1862,7 +1937,8 @@ def realize_reduction_for_single_kernel(kernel, callables_table,
 
                 orig_kernel=orig_kernel,
                 kernel=kernel,
-                insn=insn,
+
+                id_prefix=insn.id,
 
                 insn_id_gen=insn_id_gen,
                 var_name_gen=var_name_gen,
@@ -1871,14 +1947,20 @@ def realize_reduction_for_single_kernel(kernel, callables_table,
                 additional_insns=[],
                 domains=domains,
                 additional_iname_tags={},
+                boxed_callables_table=[callables_table],
 
                 inames_added_for_scan=inames_added_for_scan,
 
-                new_insn_add_depends_on=set(),
-                new_insn_add_no_sync_with=set(),
-                new_insn_add_within_inames=set(),
+                surrounding_within_inames=insn.within_inames,
+                surrounding_depends_on=insn.depends_on,
+                surrounding_no_sync_with=insn.no_sync_with,
+                surrounding_predicates=insn.predicates,
 
-                were_changes_made=False,
+                surrounding_insn_add_within_inames=set(),
+                surrounding_insn_add_depends_on=set(),
+                surrounding_insn_add_no_sync_with=set(),
+
+                _change_flag=_ChangeFlag(changes_made=False)
                 )
 
         if insn_id_filter is not None and insn.id != insn_id_filter \
@@ -1892,15 +1974,11 @@ def realize_reduction_for_single_kernel(kernel, callables_table,
         from loopy.symbolic import Reduction
         if isinstance(insn.expression, Reduction) and nresults > 1:
             new_expressions = cb_mapper(insn.expression,
-                    callables_table=cb_mapper.callables_table,
                     red_realize_ctx=red_realize_ctx,
-                    guarding_predicates=insn.predicates,
                     nresults=nresults)
         else:
             new_expressions = cb_mapper(insn.expression,
-                    callables_table=cb_mapper.callables_table,
                     red_realize_ctx=red_realize_ctx,
-                    guarding_predicates=insn.predicates,
                     nresults=1),
 
         if red_realize_ctx.were_changes_made:
@@ -1911,17 +1989,17 @@ def realize_reduction_for_single_kernel(kernel, callables_table,
 
             kernel_changed = True
 
-            insn_id_replacements = {}
+            callables_table = red_realize_ctx.boxed_callables_table[0]
 
             result_assignment_dep_on = (
                     insn.depends_on
-                    | frozenset(red_realize_ctx.new_insn_add_depends_on))
+                    | frozenset(red_realize_ctx.surrounding_insn_add_depends_on))
             kwargs = insn.get_copy_kwargs(
                     no_sync_with=insn.no_sync_with
-                    | frozenset(red_realize_ctx.new_insn_add_no_sync_with),
+                    | frozenset(red_realize_ctx.surrounding_insn_add_no_sync_with),
                     within_inames=(
                         insn.within_inames
-                        | red_realize_ctx.new_insn_add_within_inames))
+                        | red_realize_ctx.surrounding_insn_add_within_inames))
 
             kwargs.pop("id")
             kwargs.pop("depends_on")
@@ -1931,6 +2009,8 @@ def realize_reduction_for_single_kernel(kernel, callables_table,
             kwargs.pop("temp_var_type", None)
             kwargs.pop("temp_var_types", None)
 
+            insn_id_replacements = {}
+
             if isinstance(insn.expression, Reduction) and nresults > 1:
                 result_assignment_ids = [
                         insn_id_gen(insn.id) for i in range(nresults)]
@@ -1962,10 +2042,32 @@ def realize_reduction_for_single_kernel(kernel, callables_table,
                             **kwargs)
                         ]
 
-            insn_queue = (
-                    red_realize_ctx.additional_insns
-                    + replacement_insns
-                    + insn_queue)
+            additional_insns = red_realize_ctx.additional_insns
+
+            # {{{ make additional insns depend on most recent global barrier
+
+            # FIXME This is weird and hokey and ad-hoc and probably broken.
+            # I *think* the idea is to keep a reduction/scan implementation
+            # from crossing a global barrier, because that would be costly.
+
+            # check first that the original kernel had global barriers
+            # if not, we don't need to check. Since the function
+            # kernel_has_global_barriers is cached, we don't do
+            # extra work compared to not checking.
+
+            from loopy.kernel.tools import (
+                    kernel_has_global_barriers, find_most_recent_global_barrier)
+
+            if kernel_has_global_barriers(orig_kernel):
+                global_barrier = find_most_recent_global_barrier(kernel, insn.id)
+
+                if global_barrier is not None:
+                    gb_dep = frozenset([global_barrier])
+                    additional_insns = [addl_insn.copy(
+                        depends_on=addl_insn.depends_on | gb_dep)
+                        for addl_insn in additional_insns]
+
+            # }}}
 
             # The reduction expander needs an up-to-date kernel
             # object to find dependencies. Keep kernel up-to-date.
@@ -1980,6 +2082,9 @@ def realize_reduction_for_single_kernel(kernel, callables_table,
                     replace_instruction_ids_in_insn(insn, insn_id_replacements)
                     for insn in insn_queue]
 
+            finished_insns.extend(additional_insns)
+            finished_insns.extend(replacement_insns)
+
             kernel = kernel.copy(
                     instructions=finished_insns + insn_queue,
                     temporary_variables=new_temporary_variables,
@@ -1993,19 +2098,17 @@ def realize_reduction_for_single_kernel(kernel, callables_table,
 
         else:
             # nothing happened, we're done with insn
-            assert not red_realize_ctx.new_insn_add_depends_on
+            assert not red_realize_ctx.surrounding_insn_add_depends_on
 
             finished_insns.append(insn)
 
-    if kernel_changed:
-        kernel = kernel.copy(instructions=finished_insns)
-    else:
+    if not kernel_changed:
         return orig_kernel, callables_table
 
     kernel = _hackily_ensure_multi_assignment_return_values_are_scoped_private(
                 kernel)
 
-    return kernel, cb_mapper.callables_table
+    return kernel, callables_table
 
 # }}}
 
diff --git a/test/test_scan.py b/test/test_scan.py
index 94778ef4d..f5aa8a7c2 100644
--- a/test/test_scan.py
+++ b/test/test_scan.py
@@ -221,12 +221,8 @@ def test_local_parallel_scan(ctx_factory, n):
     knl = lp.tag_inames(knl, dict(i="l.0"))
     knl = lp.realize_reduction(knl, force_scan=True)
 
-    knl = lp.realize_reduction(knl)
-
     knl = lp.add_dtypes(knl, dict(a=int))
 
-    print(knl)
-
     evt, (a,) = knl(queue, a=np.arange(n))
     assert (a == np.cumsum(np.arange(n)**2)).all()
 
@@ -246,7 +242,6 @@ def test_local_parallel_scan_with_nonzero_lower_bounds(ctx_factory):
     knl = lp.fix_parameters(knl, n=16)
     knl = lp.tag_inames(knl, dict(i="l.0"))
     knl = lp.realize_reduction(knl, force_scan=True)
-    knl = lp.realize_reduction(knl)
 
     knl = lp.add_dtypes(knl, dict(a=int))
     evt, (out,) = knl(queue, a=np.arange(1, 17))

From 6a4a99b39c5d135d65b750f3c54e249f54cd77c6 Mon Sep 17 00:00:00 2001
From: Andreas Kloeckner <inform@tiker.net>
Date: Fri, 4 Feb 2022 01:06:52 -0600
Subject: [PATCH 19/27] Add test_reduction_in_conditional (gh-533)

---
 test/test_reduction.py | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/test/test_reduction.py b/test/test_reduction.py
index 1aa3b52b6..065d3de46 100644
--- a/test/test_reduction.py
+++ b/test/test_reduction.py
@@ -480,6 +480,26 @@ def test_reduction_without_inames(ctx_factory):
     assert out_dict["out"].get() == 5
 
 
+def test_reduction_in_conditional(ctx_factory):
+    # https://github.com/inducer/loopy/issues/533#issuecomment-1028472366
+    ctx = ctx_factory()
+    cq = cl.CommandQueue(ctx)
+
+    knl = lp.make_kernel(
+        "{[i, j, k]: 0<=i,j,k<10}",
+        """
+        y[i] = 1729 if (sum(j, j) == 0) else sum(k, k)
+        """)
+
+    knl = lp.set_options(knl, write_cl=True)
+
+    knl = lp.preprocess_program(knl)
+
+    evt, (out,) = knl(cq)
+    # sum(j, j) is 45 for 0<=j<10, i.e. nonzero, so y[i] = sum(k, k) = 45.
+    assert (out == 45).all()
+
+
 if __name__ == "__main__":
     if len(sys.argv) > 1:
         exec(sys.argv[1])

From bf1ceda994b4a8d4354c3226eff531840a1606c5 Mon Sep 17 00:00:00 2001
From: Isuru Fernando <idf2@illinois.edu>
Date: Sun, 13 Feb 2022 15:01:05 -0600
Subject: [PATCH 20/27] Fix path to tasksys.cpp

---
 examples/python/ispc-stream-harness.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/examples/python/ispc-stream-harness.py b/examples/python/ispc-stream-harness.py
index 722cd917c..f603aabbe 100644
--- a/examples/python/ispc-stream-harness.py
+++ b/examples/python/ispc-stream-harness.py
@@ -54,7 +54,8 @@ def gen_code(knl):
 
 
 def main():
-    with open("tasksys.cpp") as ts_file:
+    this_dir = os.path.dirname(__file__)
+    with open(os.path.join(this_dir, "tasksys.cpp")) as ts_file:
         tasksys_source = ts_file.read()
 
     def make_knl(name, insn, vars):

From efcb5598d637655d820898f9d9965fb2ea3cb8fd Mon Sep 17 00:00:00 2001
From: Isuru Fernando <isuruf@gmail.com>
Date: Tue, 15 Feb 2022 12:47:33 -0600
Subject: [PATCH 21/27] Remove redundant multiplication by one

---
 loopy/target/pyopencl.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py
index 7871eadf2..9a6be115c 100644
--- a/loopy/target/pyopencl.py
+++ b/loopy/target/pyopencl.py
@@ -247,7 +247,7 @@ def binary_tree_add(start, end):
 
             complex_sum = binary_tree_add(0, len(c_applied))
 
-            if real_sum:
+            if reals:
                 return p.Variable("%s_radd" % tgt_name)(real_sum, complex_sum)
             else:
                 return complex_sum

From b77e416e399ea3079a635d9e7583ec3f56153afa Mon Sep 17 00:00:00 2001
From: Kaushik Kulkarni <kaushikcfd@gmail.com>
Date: Wed, 16 Feb 2022 11:33:47 -0600
Subject: [PATCH 22/27] guards passing unused variables in
 privatize_temporaries_with_inames

---
 loopy/transform/privatize.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/loopy/transform/privatize.py b/loopy/transform/privatize.py
index fb2ce37a1..e9d696481 100644
--- a/loopy/transform/privatize.py
+++ b/loopy/transform/privatize.py
@@ -122,6 +122,16 @@ def privatize_temporaries_with_inames(
                 s.strip()
                 for s in only_var_names.split(","))
 
+    # {{{ sanity checks
+
+    if (only_var_names is not None
+            and privatizing_inames <= kernel.all_inames()
+            and not (frozenset(only_var_names) <= kernel.all_variable_names())):
+        raise LoopyError(f"Some variables in '{only_var_names}'"
+                         f" not used in kernel '{kernel.name}'.")
+
+    # }}}
+
     wmap = kernel.writer_map()
 
     var_to_new_priv_axis_iname = {}

From 1f9cd4b2cea5ee11efbd8ed2af646ccd11234d1f Mon Sep 17 00:00:00 2001
From: Isuru Fernando <isuruf@gmail.com>
Date: Wed, 16 Feb 2022 13:05:14 -0600
Subject: [PATCH 23/27] Remove redundant multiplication by one

---
 loopy/target/pyopencl.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py
index 9a6be115c..06ff41908 100644
--- a/loopy/target/pyopencl.py
+++ b/loopy/target/pyopencl.py
@@ -293,7 +293,7 @@ def binary_tree_mul(start, end):
 
             complex_prd = binary_tree_mul(0, len(complexes))
 
-            if real_prd:
+            if reals:
                 return p.Variable("%s_rmul" % tgt_name)(real_prd, complex_prd)
             else:
                 return complex_prd

From 7241bd636afe82566aa0e80b7c7b2dbb9e49312a Mon Sep 17 00:00:00 2001
From: Kaushik Kulkarni <kaushikcfd@gmail.com>
Date: Wed, 16 Feb 2022 17:21:28 -0600
Subject: [PATCH 24/27] [bugfix]: precompute over insns after a gbarrier

---
 loopy/transform/precompute.py | 14 ++++++++++++++
 test/test_transform.py        | 26 ++++++++++++++++++++++++++
 2 files changed, 40 insertions(+)

diff --git a/loopy/transform/precompute.py b/loopy/transform/precompute.py
index 7c20d7a01..201abd470 100644
--- a/loopy/transform/precompute.py
+++ b/loopy/transform/precompute.py
@@ -29,6 +29,8 @@
 from pymbolic.mapper.substitutor import make_subst_func
 from loopy.translation_unit import TranslationUnit
 from loopy.kernel.function_interface import CallableKernel, ScalarCallable
+from loopy.kernel.tools import (kernel_has_global_barriers,
+                                find_most_recent_global_barrier)
 import numpy as np
 
 from pymbolic import var
@@ -217,6 +219,18 @@ def map_substitution(self, name, tag, arguments, expn_state):
 
         self.replaced_something = True
 
+        # {{{ add gbarriers that the replaced insn depends-on to compute insn's deps
+
+        if (kernel_has_global_barriers(expn_state.kernel)
+                and (find_most_recent_global_barrier(expn_state.kernel,
+                                                     expn_state.instruction.id
+                                                     ) is not None)):
+            self.compute_insn_depends_on.add(
+                find_most_recent_global_barrier(expn_state.kernel,
+                                                expn_state.instruction.id))
+
+        # }}}
+
         return new_outer_expr
 
     def map_kernel(self, kernel):
diff --git a/test/test_transform.py b/test/test_transform.py
index e42eeb498..2043b127e 100644
--- a/test/test_transform.py
+++ b/test/test_transform.py
@@ -1366,6 +1366,32 @@ def test_rename_inames_existing_ok(ctx_factory):
     lp.auto_test_vs_ref(knl, ctx, ref_knl)
 
 
+def test_precompute_with_gbarrier(ctx_factory):
+    # See https://github.com/inducer/loopy/issues/543
+    ctx = ctx_factory()
+
+    t_unit = lp.make_kernel(
+        ["{[i0, j0]: 0<=i0<100 and 0<=j0<10}",
+         "{[i1, j1]: 0<=i1<100 and 0<=j1<10}"],
+        """
+        out0[i0] = sum(j0, A[i0] * x[j0])
+        ... gbarrier {id=gbarrier}
+        out1[i1] = sum(j1, A[i1] * x[j1])
+        """, seq_dependencies=True)
+    t_unit = lp.add_dtypes(t_unit, {"A": np.float64,
+                                    "x": np.float64})
+    ref_t_unit = t_unit
+    # Prefetch x only for the insn writing out1, which follows the gbarrier.
+    t_unit = lp.add_prefetch(t_unit,
+                             "x",
+                             sweep_inames=["j1"],
+                             within="writes:out1",
+                             prefetch_insn_id="x_fetch")
+    assert "gbarrier" in t_unit.default_entrypoint.id_to_insn["x_fetch"].depends_on
+
+    lp.auto_test_vs_ref(ref_t_unit, ctx, t_unit)
+
+
 if __name__ == "__main__":
     if len(sys.argv) > 1:
         exec(sys.argv[1])

From c58d075c07f9cb158a2d9484e6e37c2e5f0588c0 Mon Sep 17 00:00:00 2001
From: Andreas Kloeckner <inform@tiker.net>
Date: Wed, 16 Feb 2022 19:19:00 -0600
Subject: [PATCH 25/27] Drop uses of islpy.SuppressedWarnings (deprecated, now
 a no-op)

---
 loopy/isl_helpers.py                 |  3 +--
 loopy/kernel/tools.py                |  6 ++---
 loopy/symbolic.py                    | 34 ++++++++++++++--------------
 loopy/transform/realize_reduction.py | 16 ++++++-------
 4 files changed, 27 insertions(+), 32 deletions(-)

diff --git a/loopy/isl_helpers.py b/loopy/isl_helpers.py
index 57183109b..45f74d70a 100644
--- a/loopy/isl_helpers.py
+++ b/loopy/isl_helpers.py
@@ -349,8 +349,7 @@ def is_nonnegative(expr, over_set):
     space = over_set.get_space()
     from loopy.symbolic import aff_from_expr
     try:
-        with isl.SuppressedWarnings(space.get_ctx()):
-            aff = aff_from_expr(space, -expr-1)
+        aff = aff_from_expr(space, -expr-1)
     except Exception:
         return None
     expr_neg_set = isl.BasicSet.universe(space).add_constraint(
diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py
index 64e3cd84b..9806fbe8d 100644
--- a/loopy/kernel/tools.py
+++ b/loopy/kernel/tools.py
@@ -818,8 +818,7 @@ def assign_axis(recursion_axis, iname, axis=None):
         If *axis* is None, find a suitable axis automatically.
         """
         try:
-            with isl.SuppressedWarnings(kernel.isl_context):
-                desired_length = kernel.get_constant_iname_length(iname)
+            desired_length = kernel.get_constant_iname_length(iname)
         except isl.Error:
             # Likely unbounded, automatic assignment is not
             # going to happen for this iname.
@@ -947,8 +946,7 @@ def assign_axis(recursion_axis, iname, axis=None):
 
             def get_iname_length(iname):
                 try:
-                    with isl.SuppressedWarnings(kernel.isl_context):
-                        return kernel.get_constant_iname_length(iname)
+                    return kernel.get_constant_iname_length(iname)
                 except isl.Error:
                     return -1
             # assign longest auto axis inames first
diff --git a/loopy/symbolic.py b/loopy/symbolic.py
index b47fe9266..8f702f783 100644
--- a/loopy/symbolic.py
+++ b/loopy/symbolic.py
@@ -1875,23 +1875,23 @@ def with_aff_conversion_guard(f, space, expr, *args):
     from loopy.diagnostic import ExpressionNotAffineError
 
     err = None
-    with isl.SuppressedWarnings(space.get_ctx()):
-        try:
-            return f(space, expr, *args)
-        except TypeError as e:
-            err = e
-        except isl.Error as e:
-            err = e
-        except UnknownVariableError as e:
-            err = e
-        except ExpressionNotAffineError as e:
-            err = e
-
-        assert err is not None
-        from loopy.diagnostic import ExpressionToAffineConversionError
-        raise ExpressionToAffineConversionError(
-                "could not convert expression '%s' to affine representation: "
-                "%s: %s" % (expr, type(err).__name__, str(err)))
+
+    try:
+        return f(space, expr, *args)
+    except TypeError as e:
+        err = e
+    except isl.Error as e:
+        err = e
+    except UnknownVariableError as e:
+        err = e
+    except ExpressionNotAffineError as e:
+        err = e
+
+    assert err is not None
+    from loopy.diagnostic import ExpressionToAffineConversionError
+    raise ExpressionToAffineConversionError(
+            "could not convert expression '%s' to affine representation: "
+            "%s: %s" % (expr, type(err).__name__, str(err)))
 
 
 def guarded_aff_from_expr(space, expr, vars_to_zero=None):
diff --git a/loopy/transform/realize_reduction.py b/loopy/transform/realize_reduction.py
index 67aa627f8..2f8e3abe8 100644
--- a/loopy/transform/realize_reduction.py
+++ b/loopy/transform/realize_reduction.py
@@ -469,10 +469,9 @@ def _try_infer_scan_and_sweep_bounds(kernel, scan_iname, sweep_iname, within_ina
             within_inames | kernel.non_iname_variable_names(), (isl.dim_type.param,))
 
     try:
-        with isl.SuppressedWarnings(domain.get_ctx()):
-            sweep_lower_bound = domain.dim_min(sweep_idx)
-            sweep_upper_bound = domain.dim_max(sweep_idx)
-            scan_lower_bound = domain.dim_min(scan_idx)
+        sweep_lower_bound = domain.dim_min(sweep_idx)
+        sweep_upper_bound = domain.dim_max(sweep_idx)
+        scan_lower_bound = domain.dim_min(scan_idx)
     except isl.Error as e:
         raise ValueError("isl error: %s" % e)
 
@@ -499,11 +498,10 @@ def _try_infer_scan_stride(kernel, scan_iname, sweep_iname, sweep_lower_bound):
     # Should be equal to k * sweep_iname, where k is the stride.
 
     try:
-        with isl.SuppressedWarnings(domain_with_sweep_param.get_ctx()):
-            scan_iname_range = (
-                    domain_with_sweep_param.dim_max(scan_iname_idx)
-                    - domain_with_sweep_param.dim_min(scan_iname_idx)
-                    ).gist(domain_with_sweep_param.params())
+        scan_iname_range = (
+                domain_with_sweep_param.dim_max(scan_iname_idx)
+                - domain_with_sweep_param.dim_min(scan_iname_idx)
+                ).gist(domain_with_sweep_param.params())
     except isl.Error as e:
         raise ValueError("isl error: '%s'" % e)
 

From d2cd0d89c68b03cde169c9351a17aa376c8ef427 Mon Sep 17 00:00:00 2001
From: Kaushik Kulkarni <kaushikcfd@gmail.com>
Date: Wed, 16 Feb 2022 19:01:45 -0600
Subject: [PATCH 26/27] preserve rev. depends for buffer array's store
 instructions

---
 loopy/transform/buffer.py | 13 +++++++++++--
 test/test_transform.py    | 24 ++++++++++++++++++++++++
 2 files changed, 35 insertions(+), 2 deletions(-)

diff --git a/loopy/transform/buffer.py b/loopy/transform/buffer.py
index a6e25457d..e3dbeeb51 100644
--- a/loopy/transform/buffer.py
+++ b/loopy/transform/buffer.py
@@ -499,14 +499,23 @@ def none_to_empty_set(s):
 
     new_insns.append(init_instruction)
     if did_write:
-        new_insns.append(store_instruction)
+        # new_insns_with_redirected_deps: if an insn depends on a modified
+        # insn, then it should also depend on the store insn.
+        new_insns_with_redirected_deps = [
+            insn.copy(depends_on=(insn.depends_on | {store_instruction.id}))
+            if insn.depends_on & aar.modified_insn_ids
+            else insn
+            for insn in new_insns
+        ] + [store_instruction]
     else:
         for iname in store_inames:
             del new_iname_to_tag[iname]
 
+        new_insns_with_redirected_deps = new_insns
+
     kernel = kernel.copy(
             domains=new_kernel_domains,
-            instructions=new_insns,
+            instructions=new_insns_with_redirected_deps,
             temporary_variables=new_temporary_variables)
 
     from loopy import tag_inames
diff --git a/test/test_transform.py b/test/test_transform.py
index 2043b127e..2aa07dabb 100644
--- a/test/test_transform.py
+++ b/test/test_transform.py
@@ -1351,6 +1351,30 @@ def test_rename_inames(ctx_factory):
     lp.auto_test_vs_ref(knl, ctx, ref_knl)
 
 
+def test_buffer_array_preserves_rev_deps(ctx_factory):
+    # See https://github.com/inducer/loopy/issues/546
+    ctx = ctx_factory()
+    knl = lp.make_kernel(
+        ["{[i0, j0]: 0<=i0<100 and 0<=j0<10}",
+         "{[i1, j1]: 0<=i1<100 and 0<=j1<10}"],
+        """
+        out0[i0] = sum(j0, A[i0] * x[j0])
+        ... gbarrier {id=gbarrier}
+        out1[i1] = sum(j1, A[i1] * x[j1])
+        """, seq_dependencies=True)
+    knl = lp.add_dtypes(knl, {"A": np.float64,
+                              "x": np.float64})
+    ref_knl = knl
+    # Buffer out0; the gbarrier must then depend on the buffer's store insn.
+    knl = lp.split_iname(knl, "j0", 2)
+    knl = lp.split_iname(knl, "i0", 2, outer_tag="g.0")
+    knl = lp.buffer_array(knl, "out0",
+                          buffer_inames=["i0_inner"],
+                          init_expression="0")
+    assert "store_out0" in knl.default_entrypoint.id_to_insn["gbarrier"].depends_on
+    lp.auto_test_vs_ref(ref_knl, ctx, knl)
+
+
 def test_rename_inames_existing_ok(ctx_factory):
     ctx = ctx_factory()
 

From 21e2fb6899285b22e2943a64b34186aea18cbdd3 Mon Sep 17 00:00:00 2001
From: Kaushik Kulkarni <kaushikcfd@gmail.com>
Date: Wed, 17 Nov 2021 10:01:32 -0600
Subject: [PATCH 27/27] rename_inames should use remove_unused inames

---
 loopy/transform/iname.py | 268 +++++++++++++++++++--------------------
 1 file changed, 134 insertions(+), 134 deletions(-)

diff --git a/loopy/transform/iname.py b/loopy/transform/iname.py
index 3712d678b..d82b2b352 100644
--- a/loopy/transform/iname.py
+++ b/loopy/transform/iname.py
@@ -1125,140 +1125,6 @@ def has_schedulable_iname_nesting(kernel):
 # }}}
 
 
-# {{{ rename_inames
-
-@for_each_kernel
-def rename_inames(kernel, old_inames, new_iname, existing_ok=False, within=None):
-    """
-    :arg old_inames: A collection of inames that must be renamed to **new_iname**.
-    :arg within: a stack match as understood by
-        :func:`loopy.match.parse_stack_match`.
-    :arg existing_ok: execute even if *new_iname* already exists
-    """
-    from collections.abc import Collection
-    if (isinstance(old_inames, str)
-            or not isinstance(old_inames, Collection)):
-        raise LoopyError("'old_inames' must be a collection of strings, "
-                         f"got '{type(old_inames)}'.")
-
-    if new_iname in old_inames:
-        raise LoopyError("new iname is part of inames being renamed")
-
-    if new_iname in (kernel.all_variable_names() - kernel.all_inames()):
-        raise LoopyError(f"New iname '{new_iname}' is already a variable in the"
-                         "kernel")
-
-    if any((len(insn.within_inames & frozenset(old_inames)) > 1)
-           for insn in kernel.instructions):
-        raise LoopyError("old_inames contains nested inames"
-                         " -- renaming is illegal.")
-
-    # sort to have deterministic implementation.
-    old_inames = sorted(old_inames)
-
-    var_name_gen = kernel.get_var_name_generator()
-
-    # FIXME: Distinguish existing iname vs. existing other variable
-    does_exist = new_iname in kernel.all_inames()
-
-    if not (frozenset(old_inames) <= kernel.all_inames()):
-        raise LoopyError(f"old inames {frozenset(old_inames) - kernel.all_inames()}"
-                         " do not exist.")
-
-    if does_exist and not existing_ok:
-        raise LoopyError(f"iname '{new_iname}' conflicts with an existing identifier"
-                         " --cannot rename")
-
-    if not does_exist:
-        # {{{ rename old_inames[0] -> new_iname
-        # so that the code below can focus on "merging" inames that already exist
-
-        kernel = duplicate_inames(
-                kernel, old_inames[0], within=within, new_inames=[new_iname])
-        kernel = remove_unused_inames(kernel, old_inames[0])
-
-        # old_iname[0] is already renamed to new_iname => do not rename again.
-        old_inames = old_inames[1:]
-
-        # }}}
-
-    del does_exist
-    assert new_iname in kernel.all_inames()
-
-    for old_iname in old_inames:
-        # {{{ check that the domains match up
-
-        dom = kernel.get_inames_domain(frozenset((old_iname, new_iname)))
-
-        var_dict = dom.get_var_dict()
-        _, old_idx = var_dict[old_iname]
-        _, new_idx = var_dict[new_iname]
-
-        par_idx = dom.dim(dim_type.param)
-        dom_old = dom.move_dims(
-                dim_type.param, par_idx, dim_type.set, old_idx, 1)
-        dom_old = dom_old.move_dims(
-                dim_type.set, dom_old.dim(dim_type.set), dim_type.param, par_idx, 1)
-        dom_old = dom_old.project_out(
-                dim_type.set, new_idx if new_idx < old_idx else new_idx - 1, 1)
-
-        par_idx = dom.dim(dim_type.param)
-        dom_new = dom.move_dims(
-                dim_type.param, par_idx, dim_type.set, new_idx, 1)
-        dom_new = dom_new.move_dims(
-                dim_type.set, dom_new.dim(dim_type.set), dim_type.param, par_idx, 1)
-        dom_new = dom_new.project_out(
-                dim_type.set, old_idx if old_idx < new_idx else old_idx - 1, 1)
-
-        if not (dom_old <= dom_new and dom_new <= dom_old):
-            raise LoopyError(
-                    "inames {old} and {new} do not iterate over the same domain"
-                    .format(old=old_iname, new=new_iname))
-
-        # }}}
-
-    from pymbolic import var
-    subst_dict = {old_iname: var(new_iname) for old_iname in old_inames}
-
-    from loopy.match import parse_stack_match
-    within = parse_stack_match(within)
-
-    from pymbolic.mapper.substitutor import make_subst_func
-    rule_mapping_context = SubstitutionRuleMappingContext(
-            kernel.substitutions, var_name_gen)
-    smap = RuleAwareSubstitutionMapper(rule_mapping_context,
-                    make_subst_func(subst_dict), within)
-
-    from loopy.kernel.instruction import MultiAssignmentBase
-
-    def does_insn_involve_iname(kernel, insn, *args):
-        return (not isinstance(insn, MultiAssignmentBase)
-                or frozenset(old_inames) & insn.dependency_names()
-                or frozenset(old_inames) & insn.reduction_inames())
-
-    kernel = rule_mapping_context.finish_kernel(
-            smap.map_kernel(kernel, within=does_insn_involve_iname))
-
-    new_instructions = [insn.copy(within_inames=((insn.within_inames
-                                                  - frozenset(old_inames))
-                                                 | frozenset([new_iname])))
-                        if ((len(frozenset(old_inames) & insn.within_inames) != 0)
-                            and within(kernel, insn, ()))
-                        else insn
-                        for insn in kernel.instructions]
-
-    kernel = kernel.copy(instructions=new_instructions)
-    kernel = remove_unused_inames(kernel, old_inames)
-
-    return kernel
-
-
-def rename_iname(kernel, old_iname, new_iname, existing_ok=False, within=None):
-    return rename_inames(kernel, [old_iname], new_iname, existing_ok, within)
-
-# }}}
-
-
 # {{{ remove unused inames
 
 def get_used_inames(kernel):
@@ -2422,4 +2288,138 @@ def add_inames_for_unused_hw_axes(kernel, within=None):
 
     return kernel.copy(instructions=new_insns)
 
+
+# {{{ rename_inames
+
+@for_each_kernel
+@remove_any_newly_unused_inames
+def rename_inames(kernel, old_inames, new_iname, existing_ok=False, within=None):
+    """Rename every iname in *old_inames* to the single iname *new_iname*.
+
+    :arg old_inames: A collection (not a :class:`str`) of inames to rename.
+    :arg within: a stack match as understood by
+        :func:`loopy.match.parse_stack_match`.
+    :arg existing_ok: execute even if *new_iname* already exists
+    :raises LoopyError: if the arguments or iname domains do not permit this.
+    """
+    from collections.abc import Collection
+    if isinstance(old_inames, str) or not isinstance(old_inames, Collection):
+        raise LoopyError("'old_inames' must be a collection of strings, "
+                         f"got '{type(old_inames)}'.")
+
+    if new_iname in old_inames:
+        raise LoopyError("new iname is part of inames being renamed")
+
+    if new_iname in (kernel.all_variable_names() - kernel.all_inames()):
+        raise LoopyError(f"New iname '{new_iname}' is already a variable in the"
+                         " kernel")
+
+    if any((len(insn.within_inames & frozenset(old_inames)) > 1)
+           for insn in kernel.instructions):
+        raise LoopyError("old_inames contains nested inames"
+                         " -- renaming is illegal.")
+
+    # sort to have deterministic implementation.
+    old_inames = sorted(old_inames)
+
+    var_name_gen = kernel.get_var_name_generator()
+
+    # FIXME: Distinguish existing iname vs. existing other variable
+    does_exist = new_iname in kernel.all_inames()
+
+    if not (frozenset(old_inames) <= kernel.all_inames()):
+        raise LoopyError(f"old inames {frozenset(old_inames) - kernel.all_inames()}"
+                         " do not exist.")
+
+    if does_exist and not existing_ok:
+        raise LoopyError(f"iname '{new_iname}' conflicts with an existing identifier"
+                         " --cannot rename")
+
+    if not does_exist:
+        # {{{ rename old_inames[0] -> new_iname
+        # so that the code below can focus on "merging" inames that already exist
+
+        kernel = duplicate_inames(
+                kernel, old_inames[0], within=within, new_inames=[new_iname])
+
+        # old_inames[0] is already renamed to new_iname => do not rename again.
+        old_inames = old_inames[1:]
+
+        # }}}
+
+    del does_exist
+    assert new_iname in kernel.all_inames()
+
+    for old_iname in old_inames:
+        # {{{ check that the domains match up
+
+        dom = kernel.get_inames_domain(frozenset((old_iname, new_iname)))
+
+        var_dict = dom.get_var_dict()
+        _, old_idx = var_dict[old_iname]
+        _, new_idx = var_dict[new_iname]
+
+        par_idx = dom.dim(dim_type.param)
+        dom_old = dom.move_dims(
+                dim_type.param, par_idx, dim_type.set, old_idx, 1)
+        dom_old = dom_old.move_dims(
+                dim_type.set, dom_old.dim(dim_type.set), dim_type.param, par_idx, 1)
+        dom_old = dom_old.project_out(
+                dim_type.set, new_idx if new_idx < old_idx else new_idx - 1, 1)
+
+        par_idx = dom.dim(dim_type.param)
+        dom_new = dom.move_dims(
+                dim_type.param, par_idx, dim_type.set, new_idx, 1)
+        dom_new = dom_new.move_dims(
+                dim_type.set, dom_new.dim(dim_type.set), dim_type.param, par_idx, 1)
+        dom_new = dom_new.project_out(
+                dim_type.set, old_idx if old_idx < new_idx else old_idx - 1, 1)
+
+        if not (dom_old <= dom_new and dom_new <= dom_old):
+            raise LoopyError(f"inames {old_iname} and {new_iname} do not"
+                             " iterate over the same domain")
+
+        # }}}
+
+    from pymbolic import var
+    subst_dict = {old_iname: var(new_iname) for old_iname in old_inames}
+
+    from loopy.match import parse_stack_match
+    within = parse_stack_match(within)
+
+    from pymbolic.mapper.substitutor import make_subst_func
+    rule_mapping_context = SubstitutionRuleMappingContext(
+            kernel.substitutions, var_name_gen)
+    smap = RuleAwareSubstitutionMapper(rule_mapping_context,
+                    make_subst_func(subst_dict), within)
+
+    from loopy.kernel.instruction import MultiAssignmentBase
+
+    def does_insn_involve_iname(kernel, insn, *args):
+        return (not isinstance(insn, MultiAssignmentBase)
+                or frozenset(old_inames) & insn.dependency_names()
+                or frozenset(old_inames) & insn.reduction_inames())
+
+    kernel = rule_mapping_context.finish_kernel(
+            smap.map_kernel(kernel, within=does_insn_involve_iname))
+
+    new_instructions = [insn.copy(within_inames=((insn.within_inames
+                                                  - frozenset(old_inames))
+                                                 | frozenset([new_iname])))
+                        if ((len(frozenset(old_inames) & insn.within_inames) != 0)
+                            and within(kernel, insn, ()))
+                        else insn
+                        for insn in kernel.instructions]
+
+    kernel = kernel.copy(instructions=new_instructions)
+
+    return kernel
+
+
+def rename_iname(kernel, old_iname, new_iname, existing_ok=False, within=None):
+    """Rename one iname: calls :func:`rename_inames` with ``[old_iname]``."""
+    return rename_inames(kernel, [old_iname], new_iname, existing_ok, within)
+# }}}
+
+
 # vim: foldmethod=marker