From 0b514ae8abf8e6f4e07c5b724f18433dcfe525cf Mon Sep 17 00:00:00 2001 From: Marcin Zalewski Date: Fri, 30 Sep 2022 11:30:06 -0700 Subject: [PATCH 001/121] Run CI on schedule (#404) Co-authored-by: Marcin Zalewski --- .github/workflows/ci.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 6afe451d2..32a514e00 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -6,6 +6,9 @@ on: pull_request: branches-ignore: - gh-pages # deployment target branch (this workflow should not exist on that branch anyway) + schedule: + # * is a special character in YAML so you have to quote this string + - cron: '* */4 * * *' env: COMMIT: ${{ github.event.pull_request.head.sha || github.sha }} PROJECT: github-core-ci From 2a1a13c6e541885d2a83fe9d682b017865922ab4 Mon Sep 17 00:00:00 2001 From: Wonchan Lee Date: Fri, 30 Sep 2022 11:37:04 -0700 Subject: [PATCH 002/121] Adjust consensus match frequency based on field sizes (#402) (#406) * Perform consensus match more frequently for bigger free fields * Minor cleanup --- legate/core/runtime.py | 39 ++++++++++++++++++++++++++++++--------- 1 file changed, 30 insertions(+), 9 deletions(-) diff --git a/legate/core/runtime.py b/legate/core/runtime.py index fa4fdaad9..b47624378 100644 --- a/legate/core/runtime.py +++ b/legate/core/runtime.py @@ -200,9 +200,9 @@ def add_free_field( ) -> None: self._freed_fields.append(FreeFieldInfo(manager, region, field_id)) - def issue_field_match(self) -> None: + def issue_field_match(self, credit: int) -> None: # Increment our match counter - self._match_counter += 1 + self._match_counter += credit if self._match_counter < self._match_frequency: return # If the match counter equals our match frequency then do an exchange @@ -342,9 +342,29 @@ def __init__( ) -> None: super().__init__(runtime, shape, field_size) self._field_match_manager = runtime.field_match_manager + self._update_match_credit() + + def _update_match_credit(self) -> None: + if self.shape.fixed: + size = self.shape.volume() * self.field_size + self._match_credit = ( + size + self.runtime.max_field_reuse_size - 1 + if size > self.runtime.max_field_reuse_size + else self.runtime.max_field_reuse_size + ) // self.runtime.max_field_reuse_size + # No need to update the credit as the exact size is known + self._need_to_update_match_credit = False + # If the shape is unknown, we set the credit such that every new + # free field leads to a consensus match, and ask the manager + # to update the credit. 
+ else: + self._match_credit = self.runtime.max_field_reuse_frequency + self._need_to_update_match_credit = True def try_reuse_field(self) -> Optional[tuple[Region, int]]: - self._field_match_manager.issue_field_match() + if self._need_to_update_match_credit: + self._update_match_credit() + self._field_match_manager.issue_field_match(self._match_credit) # First, if we have a free field then we know everyone has one of those if len(self.free_fields) > 0: @@ -915,6 +935,12 @@ def __init__(self, core_library: CoreLib) -> None: ty.uint32, ) ) + self.max_field_reuse_size = int( + self._core_context.get_tunable( + legion.LEGATE_CORE_TUNABLE_FIELD_REUSE_SIZE, + ty.uint64, + ) + ) self._field_manager_class = ( ConsensusMatchingFieldManager if self._num_nodes > 1 or self._args.consensus @@ -1246,12 +1272,7 @@ def find_region_manager(self, region: Region) -> RegionManager: return self.region_managers_by_region[region] def revive_manager(self, region_mgr: RegionManager) -> None: - lru_managers: Deque[RegionManager] = deque() - for to_check in self.lru_managers: - if to_check is not region_mgr: - lru_managers.append(to_check) - assert len(lru_managers) < len(self.lru_managers) - self.lru_managers = lru_managers + self.lru_managers.remove(region_mgr) def free_region_manager( self, shape: Shape, region: Region, unordered: bool = False From b66abe0f368ff0634bbd66b2c225ec61fe2bd3fd Mon Sep 17 00:00:00 2001 From: Bryan Van de Ven Date: Fri, 30 Sep 2022 12:20:28 -0700 Subject: [PATCH 003/121] Driver verbose only for rank 0 or "none" launcher (#403) --- legate/driver/driver.py | 4 +++- tests/unit/legate/driver/test_driver.py | 31 ++++++++++++++++++++++++- 2 files changed, 33 insertions(+), 2 deletions(-) diff --git a/legate/driver/driver.py b/legate/driver/driver.py index 9548f197b..57dd653d0 100644 --- a/legate/driver/driver.py +++ b/legate/driver/driver.py @@ -88,7 +88,9 @@ def run(self) -> int: """ if self.config.info.verbose: - print_verbose(self.system, self) + # we only want to print verbose output on a "head" node + if self.launcher.kind != "none" or self.launcher.rank_id == "0": + print_verbose(self.system, self) self._darwin_gdb_warn() diff --git a/tests/unit/legate/driver/test_driver.py b/tests/unit/legate/driver/test_driver.py index f6aea4a0b..e346210d3 100644 --- a/tests/unit/legate/driver/test_driver.py +++ b/tests/unit/legate/driver/test_driver.py @@ -23,7 +23,7 @@ import legate.driver.driver as m from legate.driver.args import LAUNCHERS from legate.driver.command import CMD_PARTS -from legate.driver.launcher import Launcher +from legate.driver.launcher import RANK_ENV_VARS, Launcher from legate.driver.system import System from legate.driver.types import LauncherType from legate.driver.ui import scrub @@ -129,6 +129,35 @@ def test_verbose( assert pv_out in run_out + @pytest.mark.parametrize("rank_var", RANK_ENV_VARS) + def test_verbose_nonero_rank_id( + self, + monkeypatch: pytest.MonkeyPatch, + capsys: Capsys, + genconfig: GenConfig, + rank_var: str, + ) -> None: + for name in RANK_ENV_VARS: + monkeypatch.delenv(name, raising=False) + monkeypatch.setenv(name, "1") + + # set --dry-run to avoid needing to mock anything + config = genconfig( + ["--launcher", "none", "--verbose", "--dry-run"], multi_rank=(2, 2) + ) + system = System() + driver = m.Driver(config, system) + + driver.run() + + run_out = scrub(capsys.readouterr()[0]).strip() + + print_verbose(driver.system, driver) + + pv_out = scrub(capsys.readouterr()[0]).strip() + + assert pv_out not in run_out + @pytest.mark.parametrize("launch", 
LAUNCHERS) def test_darwin_gdb_warning( self, From 7671dd7ab5ca780baddb18b915f9d9faa29919bf Mon Sep 17 00:00:00 2001 From: Manolis Papadakis Date: Tue, 4 Oct 2022 16:50:04 -0700 Subject: [PATCH 004/121] Legion bug WAR: don't instantiate futures on framebuffer (#413) * Legion bug WAR: don't instantiate futures on framebuffer * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Wrap the workaround with a define Co-authored-by: Manolis Papadakis Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- src/core/data/store.cc | 6 +++++- src/core/mapping/base_mapper.cc | 6 +++++- src/legate_defines.h | 3 +++ 3 files changed, 13 insertions(+), 2 deletions(-) diff --git a/src/core/data/store.cc b/src/core/data/store.cc index 76d4b405f..a4ca73f4b 100644 --- a/src/core/data/store.cc +++ b/src/core/data/store.cc @@ -136,9 +136,13 @@ FutureWrapper::FutureWrapper( #ifdef DEBUG_LEGATE assert(!initialize || future_.get_untyped_size() == field_size); #endif - auto proc = Processor::get_executing_processor(); + auto proc = Processor::get_executing_processor(); +#ifdef LEGATE_NO_FUTURES_ON_FB + auto mem_kind = find_memory_kind_for_executing_processor(); +#else auto mem_kind = proc.kind() == Processor::Kind::TOC_PROC ? Memory::Kind::GPU_FB_MEM : Memory::Kind::SYSTEM_MEM; +#endif if (initialize) { auto p_init_value = future_.get_buffer(mem_kind); #ifdef LEGATE_USE_CUDA diff --git a/src/core/mapping/base_mapper.cc b/src/core/mapping/base_mapper.cc index 739cd8f97..f975fe3f8 100644 --- a/src/core/mapping/base_mapper.cc +++ b/src/core/mapping/base_mapper.cc @@ -610,7 +610,11 @@ void BaseMapper::map_task(const MapperContext ctx, if (req_indices.empty()) { // This is a mapping for futures - output.future_locations.push_back(get_target_memory(task.target_proc, mapping.policy.target)); + StoreTarget target = mapping.policy.target; +#ifdef LEGATE_NO_FUTURES_ON_FB + if (target == StoreTarget::FBMEM) target = StoreTarget::ZCMEM; +#endif + output.future_locations.push_back(get_target_memory(task.target_proc, target)); continue; } else if (mapping.for_unbound_stores()) { for (auto req_idx : req_indices) { diff --git a/src/legate_defines.h b/src/legate_defines.h index b7c3dacba..fa215e8e7 100644 --- a/src/legate_defines.h +++ b/src/legate_defines.h @@ -50,3 +50,6 @@ #define LEGATE_USE_NETWORK #endif #endif + +// TODO: 2022-10-04: Work around a Legion bug, by not instantiating futures on framebuffer. 
+#define LEGATE_NO_FUTURES_ON_FB From 3db762906a6948be29e50f41d3ebef555a3eab3e Mon Sep 17 00:00:00 2001 From: Marcin Zalewski Date: Wed, 5 Oct 2022 12:13:09 -0700 Subject: [PATCH 005/121] Adjust the schedule of the CI runs (#414) Co-authored-by: Marcin Zalewski --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 32a514e00..5a90f5520 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -8,7 +8,7 @@ on: - gh-pages # deployment target branch (this workflow should not exist on that branch anyway) schedule: # * is a special character in YAML so you have to quote this string - - cron: '* */4 * * *' + - cron: '0 */6 * * *' env: COMMIT: ${{ github.event.pull_request.head.sha || github.sha }} PROJECT: github-core-ci From 9b9e59cdbaeb22661f11a2babc491620ea3362d5 Mon Sep 17 00:00:00 2001 From: Bryan Van de Ven Date: Wed, 5 Oct 2022 13:17:52 -0700 Subject: [PATCH 006/121] Consolidate driver and test driver codebases (#397) * initial import of test driver code * consolidate some utils and types * ignore vscode workspace for now at least * parse_command_args -> parse_library_command_args * factor out colorama * Consolidate types * consolidate ui modules * consolidate system classes * get rid of driver.util * temp compat imports * probable fix for https://github.com/nv-legate/legate.core/issues/393 * bail if legate_module cannot be determined * use singular util * use cwd for default test_root * move custom argparse action to util * fix test after merge --- .gitignore | 2 + legate/core/__init__.py | 3 +- legate/core/runtime.py | 5 +- legate/driver/__init__.py | 1 - legate/driver/args.py | 2 +- legate/driver/command.py | 6 +- legate/driver/config.py | 10 +- legate/driver/driver.py | 62 ++- legate/driver/launcher.py | 13 +- legate/driver/logs.py | 14 +- legate/driver/main.py | 7 +- legate/driver/ui.py | 246 ------------ legate/rc.py | 108 +---- legate/tester/__init__.py | 71 ++++ legate/tester/args.py | 223 +++++++++++ legate/tester/config.py | 163 ++++++++ legate/tester/logger.py | 67 ++++ legate/tester/stages/__init__.py | 41 ++ legate/tester/stages/_linux/__init__.py | 24 ++ legate/tester/stages/_linux/cpu.py | 83 ++++ legate/tester/stages/_linux/eager.py | 75 ++++ legate/tester/stages/_linux/gpu.py | 85 ++++ legate/tester/stages/_linux/omp.py | 87 ++++ legate/tester/stages/_osx/__init__.py | 24 ++ legate/tester/stages/_osx/cpu.py | 68 ++++ legate/tester/stages/_osx/eager.py | 68 ++++ legate/tester/stages/_osx/gpu.py | 54 +++ legate/tester/stages/_osx/omp.py | 74 ++++ legate/tester/stages/test_stage.py | 268 +++++++++++++ legate/tester/stages/util.py | 115 ++++++ legate/tester/test_plan.py | 132 ++++++ legate/tester/test_system.py | 123 ++++++ .../test_types.py => legate/util/__init__.py | 0 legate/util/args.py | 182 +++++++++ legate/util/colors.py | 95 +++++ legate/{driver/util.py => util/fs.py} | 103 +---- legate/{driver => util}/system.py | 53 ++- legate/{driver => util}/types.py | 51 ++- legate/util/ui.py | 345 ++++++++++++++++ tests/unit/__init__.py | 15 + tests/unit/legate/__init__.py | 15 + tests/unit/legate/driver/__init__.py | 15 + tests/unit/legate/driver/conftest.py | 6 +- tests/unit/legate/driver/test_command.py | 8 +- tests/unit/legate/driver/test_config.py | 7 +- tests/unit/legate/driver/test_driver.py | 62 ++- tests/unit/legate/driver/test_launcher.py | 8 +- tests/unit/legate/driver/test_logs.py | 6 +- tests/unit/legate/driver/test_main.py | 6 +- 
tests/unit/legate/driver/test_ui.py | 254 ------------ tests/unit/legate/driver/test_util.py | 131 ------ tests/unit/legate/driver/util.py | 16 +- tests/unit/legate/test_rc.py | 122 +----- tests/unit/legate/tester/__init__.py | 15 + tests/unit/legate/tester/stages/__init__.py | 38 ++ .../legate/tester/stages/_linux/__init__.py | 22 + .../legate/tester/stages/_linux/test_cpu.py | 132 ++++++ .../legate/tester/stages/_linux/test_eager.py | 82 ++++ .../legate/tester/stages/_linux/test_gpu.py | 101 +++++ .../legate/tester/stages/_linux/test_omp.py | 164 ++++++++ .../legate/tester/stages/test_test_stage.py | 88 ++++ tests/unit/legate/tester/stages/test_util.py | 48 +++ tests/unit/legate/tester/test___init__.py | 69 ++++ tests/unit/legate/tester/test_args.py | 89 +++++ tests/unit/legate/tester/test_config.py | 182 +++++++++ tests/unit/legate/tester/test_logger.py | 74 ++++ tests/unit/legate/tester/test_test_system.py | 65 +++ tests/unit/legate/util/__init__.py | 15 + .../{driver => util}/sample_cmake_cache.txt | 0 .../legate/{driver => util}/sample_header.h | 0 tests/unit/legate/util/test_args.py | 187 +++++++++ tests/unit/legate/util/test_colors.py | 103 +++++ tests/unit/legate/util/test_fs.py | 53 +++ .../legate/{driver => util}/test_system.py | 19 +- tests/unit/legate/util/test_types.py | 57 +++ tests/unit/legate/util/test_ui.py | 375 ++++++++++++++++++ tests/unit/util.py | 33 ++ 77 files changed, 4821 insertions(+), 1014 deletions(-) delete mode 100644 legate/driver/ui.py create mode 100644 legate/tester/__init__.py create mode 100644 legate/tester/args.py create mode 100644 legate/tester/config.py create mode 100644 legate/tester/logger.py create mode 100644 legate/tester/stages/__init__.py create mode 100644 legate/tester/stages/_linux/__init__.py create mode 100644 legate/tester/stages/_linux/cpu.py create mode 100644 legate/tester/stages/_linux/eager.py create mode 100644 legate/tester/stages/_linux/gpu.py create mode 100644 legate/tester/stages/_linux/omp.py create mode 100644 legate/tester/stages/_osx/__init__.py create mode 100644 legate/tester/stages/_osx/cpu.py create mode 100644 legate/tester/stages/_osx/eager.py create mode 100644 legate/tester/stages/_osx/gpu.py create mode 100644 legate/tester/stages/_osx/omp.py create mode 100644 legate/tester/stages/test_stage.py create mode 100644 legate/tester/stages/util.py create mode 100644 legate/tester/test_plan.py create mode 100644 legate/tester/test_system.py rename tests/unit/legate/driver/test_types.py => legate/util/__init__.py (100%) create mode 100644 legate/util/args.py create mode 100644 legate/util/colors.py rename legate/{driver/util.py => util/fs.py} (81%) rename legate/{driver => util}/system.py (51%) rename legate/{driver => util}/types.py (68%) create mode 100644 legate/util/ui.py create mode 100644 tests/unit/__init__.py create mode 100644 tests/unit/legate/__init__.py create mode 100644 tests/unit/legate/driver/__init__.py delete mode 100644 tests/unit/legate/driver/test_ui.py delete mode 100644 tests/unit/legate/driver/test_util.py create mode 100644 tests/unit/legate/tester/__init__.py create mode 100644 tests/unit/legate/tester/stages/__init__.py create mode 100644 tests/unit/legate/tester/stages/_linux/__init__.py create mode 100644 tests/unit/legate/tester/stages/_linux/test_cpu.py create mode 100644 tests/unit/legate/tester/stages/_linux/test_eager.py create mode 100644 tests/unit/legate/tester/stages/_linux/test_gpu.py create mode 100644 tests/unit/legate/tester/stages/_linux/test_omp.py create mode 100644 
tests/unit/legate/tester/stages/test_test_stage.py create mode 100644 tests/unit/legate/tester/stages/test_util.py create mode 100644 tests/unit/legate/tester/test___init__.py create mode 100644 tests/unit/legate/tester/test_args.py create mode 100644 tests/unit/legate/tester/test_config.py create mode 100644 tests/unit/legate/tester/test_logger.py create mode 100644 tests/unit/legate/tester/test_test_system.py create mode 100644 tests/unit/legate/util/__init__.py rename tests/unit/legate/{driver => util}/sample_cmake_cache.txt (100%) rename tests/unit/legate/{driver => util}/sample_header.h (100%) create mode 100644 tests/unit/legate/util/test_args.py create mode 100644 tests/unit/legate/util/test_colors.py create mode 100644 tests/unit/legate/util/test_fs.py rename tests/unit/legate/{driver => util}/test_system.py (83%) create mode 100644 tests/unit/legate/util/test_types.py create mode 100644 tests/unit/legate/util/test_ui.py create mode 100644 tests/unit/util.py diff --git a/.gitignore b/.gitignore index 42f7cc262..f7cd920b2 100644 --- a/.gitignore +++ b/.gitignore @@ -35,3 +35,5 @@ config.mk .vscode _cmake_test_compile !cmake/versions.json +legate.core.code-workspace + diff --git a/legate/core/__init__.py b/legate/core/__init__.py index 4ad4c308b..8a6beee0a 100644 --- a/legate/core/__init__.py +++ b/legate/core/__init__.py @@ -14,7 +14,8 @@ # from __future__ import annotations -from ..rc import check_legion, parse_command_args +from ..rc import check_legion +from ..util.args import parse_library_command_args check_legion() diff --git a/legate/core/runtime.py b/legate/core/runtime.py index b47624378..c30bc6237 100644 --- a/legate/core/runtime.py +++ b/legate/core/runtime.py @@ -24,8 +24,7 @@ from legion_top import add_cleanup_item, top_level -from legate.rc import ArgSpec, Argument, parse_command_args - +from ..util.args import ArgSpec, Argument, parse_library_command_args from . import ffi # Make sure we only have one ffi instance from . import ( Fence, @@ -855,7 +854,7 @@ def __init__(self, core_library: CoreLib) -> None: focus on implementing their domain logic. """ - self._args = parse_command_args("legate", ARGS) + self._args = parse_library_command_args("legate", ARGS) try: self._legion_context = top_level.context[0] diff --git a/legate/driver/__init__.py b/legate/driver/__init__.py index f5803f8a8..b8496597d 100644 --- a/legate/driver/__init__.py +++ b/legate/driver/__init__.py @@ -18,4 +18,3 @@ from .driver import Driver from .launcher import Launcher from .main import main -from .system import System diff --git a/legate/driver/args.py b/legate/driver/args.py index 2e919a2bc..739722170 100755 --- a/legate/driver/args.py +++ b/legate/driver/args.py @@ -18,8 +18,8 @@ from argparse import ArgumentDefaultsHelpFormatter, ArgumentParser +from ..util.types import LauncherType from . 
import defaults -from .types import LauncherType __all__ = ("parser",) diff --git a/legate/driver/command.py b/legate/driver/command.py index 2c582e4b7..7d11c2c9b 100644 --- a/legate/driver/command.py +++ b/legate/driver/command.py @@ -16,13 +16,13 @@ from typing import TYPE_CHECKING -from .ui import warn +from ..util.ui import warn if TYPE_CHECKING: + from ..util.system import System + from ..util.types import CommandPart from .config import Config from .launcher import Launcher - from .system import System - from .types import CommandPart __all__ = ("CMD_PARTS",) diff --git a/legate/driver/config.py b/legate/driver/config.py index e90a3454c..c4acb3c41 100644 --- a/legate/driver/config.py +++ b/legate/driver/config.py @@ -23,10 +23,14 @@ from pathlib import Path from typing import Any +from ..util.types import ( + ArgList, + DataclassMixin, + LauncherType, + object_to_dataclass, +) +from ..util.ui import warn from .args import parser -from .types import ArgList, DataclassMixin, LauncherType -from .ui import warn -from .util import object_to_dataclass __all__ = ("Config",) diff --git a/legate/driver/driver.py b/legate/driver/driver.py index 57dd653d0..7a3e00c40 100644 --- a/legate/driver/driver.py +++ b/legate/driver/driver.py @@ -14,18 +14,22 @@ # from __future__ import annotations +from shlex import quote from subprocess import run +from textwrap import indent +from typing import TYPE_CHECKING +from ..util.system import System +from ..util.ui import kvtable, rule, section, value, warn from .command import CMD_PARTS from .config import Config from .launcher import Launcher from .logs import process_logs -from .system import System -from .types import Command, EnvDict -from .ui import warn -from .util import print_verbose -__all__ = ("Driver",) +if TYPE_CHECKING: + from ..util.types import Command, EnvDict + +__all__ = ("Driver", "print_verbose") _DARWIN_GDB_WARN = """\ You must start the debugging session with the following command, @@ -113,3 +117,51 @@ def _darwin_gdb_warn(self) -> None: ) ) ) + + +def print_verbose( + system: System, + driver: Driver | None = None, +) -> None: + """Print system and driver configuration values. 
+ + Parameters + ---------- + system : System + A System instance to obtain Legate and Legion paths from + + driver : Driver or None, optional + If not None, a Driver instance to obtain command invocation and + environment from (default: None) + + Returns + ------- + None + + """ + + print(f"\n{rule('Legion Python Configuration')}") + + print(section("\nLegate paths:")) + print(indent(str(system.legate_paths), prefix=" ")) + + print(section("\nLegion paths:")) + print(indent(str(system.legion_paths), prefix=" ")) + + if driver: + print(section("\nCommand:")) + cmd = " ".join(quote(t) for t in driver.cmd) + print(f" {value(cmd)}") + + if keys := sorted(driver.custom_env_vars): + print(section("\nCustomized Environment:")) + print( + indent( + kvtable(driver.env, delim="=", align=False, keys=keys), + prefix=" ", + ) + ) + + print(f"\n{rule()}") + + print(flush=True) diff --git a/legate/driver/launcher.py b/legate/driver/launcher.py index 922eb4f6f..009b0cf6b 100644 --- a/legate/driver/launcher.py +++ b/legate/driver/launcher.py @@ -17,12 +17,15 @@ import os import sys from pathlib import Path +from typing import TYPE_CHECKING -from .config import Config -from .system import System -from .types import Command, EnvDict, LauncherType -from .ui import warn -from .util import read_c_define +from ..util.fs import read_c_define +from ..util.ui import warn + +if TYPE_CHECKING: + from ..util.system import System + from ..util.types import Command, EnvDict, LauncherType + from .config import Config __all__ = ("Launcher",) diff --git a/legate/driver/logs.py b/legate/driver/logs.py index 1173e8486..261ab6dd5 100644 --- a/legate/driver/logs.py +++ b/legate/driver/logs.py @@ -22,13 +22,15 @@ from contextlib import contextmanager from shlex import quote from subprocess import run -from typing import Iterator +from typing import TYPE_CHECKING, Iterator -from .config import Config -from .launcher import Launcher -from .system import System -from .types import Command -from .ui import warn +from ..util.ui import warn + +if TYPE_CHECKING: + from ..util.system import System + from ..util.types import Command + from .config import Config + from .launcher import Launcher __all__ = ( "DebuggingHandler", diff --git a/legate/driver/main.py b/legate/driver/main.py index c2e0ac577..2ca3f04be 100644 --- a/legate/driver/main.py +++ b/legate/driver/main.py @@ -34,9 +34,10 @@ def main(argv: list[str]) -> int: int, a process return code """ - from . import Config, Driver, System - from .ui import error - from .util import print_verbose + from ..util.system import System + from ..util.ui import error + from . import Config, Driver + from .driver import print_verbose try: config = Config(argv) diff --git a/legate/driver/ui.py b/legate/driver/ui.py deleted file mode 100644 index e6f5ee37d..000000000 --- a/legate/driver/ui.py +++ /dev/null @@ -1,246 +0,0 @@ -# Copyright 2022 NVIDIA Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -"""Helper functions for simple text UI output. 
- -The color functions in this module require ``colorama`` to be installed in -order to generate color output. If ``colorama`` is not available, plain -text output (i.e. without ANSI color codes) will be generated. - -""" -from __future__ import annotations - -import re -import sys -from typing import Any, Iterable - -__all__ = ( - "bright", - "cyan", - "dim", - "error", - "green", - "key", - "kvtable", - "magenta", - "red", - "rule", - "scrub", - "section", - "value", - "warn", - "white", - "yellow", -) - - -def _text(text: str) -> str: - return text - - -try: - import colorama # type: ignore[import] - - def bright(text: str) -> str: - return f"{colorama.Style.BRIGHT}{text}{colorama.Style.RESET_ALL}" - - def dim(text: str) -> str: - return f"{colorama.Style.DIM}{text}{colorama.Style.RESET_ALL}" - - def white(text: str) -> str: - return f"{colorama.Fore.WHITE}{text}{colorama.Style.RESET_ALL}" - - def cyan(text: str) -> str: - return f"{colorama.Fore.CYAN}{text}{colorama.Style.RESET_ALL}" - - def red(text: str) -> str: - return f"{colorama.Fore.RED}{text}{colorama.Style.RESET_ALL}" - - def magenta(text: str) -> str: - return f"{colorama.Fore.MAGENTA}{text}{colorama.Style.RESET_ALL}" - - def green(text: str) -> str: - return f"{colorama.Fore.GREEN}{text}{colorama.Style.RESET_ALL}" - - def yellow(text: str) -> str: - return f"{colorama.Fore.YELLOW}{text}{colorama.Style.RESET_ALL}" - - if sys.platform == "win32": - colorama.init() - -except ImportError: - - bright = dim = white = cyan = red = magenta = green = yellow = _text - -# ref: https://stackoverflow.com/a/14693789 -_ANSI_ESCAPE = re.compile(r"\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])") - - -def error(text: str) -> str: - """Format text as an error. - - Parameters - ---------- - text : str - The text to format - - Returns - ------- - str - - """ - return red(f"ERROR: {text}") - - -def key(text: str) -> str: - """Format a 'key' from a key-value pair. - - Parameters - ---------- - text : str - The key to format - - Returns - ------- - str - - """ - return dim(green(text)) - - -def value(text: str) -> str: - """Format a 'value' from of a key-value pair. - - Parameters - ---------- - text : str - The key to format - - Returns - ------- - str - - """ - return yellow(text) - - -def kvtable( - items: dict[str, Any], - *, - delim: str = " : ", - align: bool = True, - keys: Iterable[str] | None = None, -) -> str: - """Format a dictionay as a table of key-value pairs. 
- - Parameters - ---------- - items : dict[str, Any] - The dictionary of items to format - - delim : str, optional - A delimiter to display between keys and values (default: " : ") - - align : bool, optional - Whether to align delimiters to the longest key length (default: True) - - keys : Iterable[str] or None, optional - If not None, only the specified subset of keys is included in the - table output (default: None) - - Returns - ------- - str - - """ - # annoying but necessary to take len on color-formatted version - N = max(len(key(k)) for k in items) if align else 0 - - keys = items.keys() if keys is None else keys - - return "\n".join( - f"{key(k): <{N}}{delim}{value(str(items[k]))}" for k in keys - ) - - -def rule(text: str | None = None, *, char: str = "-", N: int = 80) -> str: - """Format a horizontal rule, optionally with text - - Parameters - ---------- - text : str or None, optional - If not None, display this text inline in the rule (default: None) - - char: str, optional - A character to use for the rule (default: "-") - - N : int, optional - Character width for the rule (default: 80) - - Returns - ------- - str - - """ - if text is None: - return cyan(char * N) - return cyan(char * 3 + f"{f' {text} ' :{char}<{N-3}}") - - -def section(text: str) -> str: - """Format text as a section header - - Parameters - ---------- - text : str - The text to format - - Returns - ------- - str - - """ - return bright(white(text)) - - -def scrub(text: str) -> str: - """Remove ANSI color codes from a text string. - - Parameters - ---------- - text : str - The text to scrub - - Returns - ------- - str - - """ - return _ANSI_ESCAPE.sub("", text) - - -def warn(text: str) -> str: - """Format text as a warning. - - Parameters - ---------- - text : str - The text to format - - Returns - ------- - str - - """ - return magenta(f"WARNING: {text}") diff --git a/legate/rc.py b/legate/rc.py index 6a54cc530..bd4abca51 100644 --- a/legate/rc.py +++ b/legate/rc.py @@ -14,14 +14,6 @@ # from __future__ import annotations -import sys -import warnings -from argparse import ArgumentParser, Namespace -from dataclasses import dataclass, fields -from typing import Any, Iterable, Literal, Sequence, Type, TypeVar, Union - -from typing_extensions import TypeAlias - LEGION_WARNING = """ All Legate programs must be run with a legion_python interperter. We @@ -35,6 +27,13 @@ legion_python directly. """ +# TODO (bv) temp transitive imports until cunumeric is updated +from .util.args import ( # noqa + ArgSpec, + Argument, + parse_library_command_args as parse_command_args, +) + def has_legion_context() -> bool: """Determine whether we are running in legion_python. 
@@ -55,96 +54,3 @@ def check_legion(msg: str = LEGION_WARNING) -> None: """Raise an error if we are not running in legion_python.""" if not has_legion_context(): raise RuntimeError(msg) - - -class _UnsetType: - pass - - -Unset = _UnsetType() - -_T = TypeVar("_T") -NotRequired = Union[_UnsetType, _T] - - -def entries(obj: Any) -> Iterable[tuple[str, Any]]: - for f in fields(obj): - value = getattr(obj, f.name) - if value is not Unset: - yield (f.name, value) - - -# https://docs.python.org/3/library/argparse.html#action -ActionType: TypeAlias = Literal[ - "store", - "store_const", - "store_true", - "append", - "append_const", - "count", - "help", - "version", - "extend", -] - -# https://docs.python.org/3/library/argparse.html#nargs -NargsType: TypeAlias = Literal["?", "*", "+", "..."] - - -@dataclass(frozen=True) -class ArgSpec: - dest: str - action: NotRequired[ActionType] = "store_true" - nargs: NotRequired[Union[int, NargsType]] = Unset - const: NotRequired[Any] = Unset - default: NotRequired[Any] = Unset - type: NotRequired[Type[Any]] = Unset - choices: NotRequired[Sequence[Any]] = Unset - help: NotRequired[str] = Unset - metavar: NotRequired[str] = Unset - - -@dataclass(frozen=True) -class Argument: - name: str - spec: ArgSpec - - -def parse_command_args(libname: str, args: Iterable[Argument]) -> Namespace: - """ """ - if not libname.isidentifier(): - raise ValueError( - f"Invalid library {libname!r} for command line arguments" - ) - - parser = ArgumentParser( - prog=f"<{libname} program>", add_help=False, allow_abbrev=False - ) - - lib_prefix = f"-{libname}:" - - argnames = [arg.name for arg in args] - - for arg in args: - argname = f"{lib_prefix}{arg.name}" - kwargs = dict(entries(arg.spec)) - parser.add_argument(argname, **kwargs) - - has_custom_help = "help" in argnames - - if f"{lib_prefix}help" in sys.argv and not has_custom_help: - parser.print_help() - sys.exit() - - args, extra = parser.parse_known_args() - - for item in extra: - if item.startswith(lib_prefix): - warnings.warn( - f"Unrecognized argument {item!r} for {libname} (passed on as-is)" # noqa: E501 - ) - break - - sys.argv = sys.argv[:1] + extra - - return args diff --git a/legate/tester/__init__.py b/legate/tester/__init__.py new file mode 100644 index 000000000..270abcf8d --- /dev/null +++ b/legate/tester/__init__.py @@ -0,0 +1,71 @@ +# Copyright 2022 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +"""Utilities and helpers for implementing the Cunumeric custom test runner. + +""" +from __future__ import annotations + +from typing import Union +from typing_extensions import Literal, TypeAlias + +#: Define the available feature types for tests +FeatureType: TypeAlias = Union[ + Literal["cpus"], Literal["cuda"], Literal["eager"], Literal["openmp"] +] + +#: Value to use if --cpus is not specified. +DEFAULT_CPUS_PER_NODE = 4 + +#: Value to use if --gpus is not specified. 
+DEFAULT_GPUS_PER_NODE = 1 + +# Delay to introduce between GPU test invocations (ms) +DEFAULT_GPU_DELAY = 2000 + +# Value to use if --fbmem is not specified (MB) +DEFAULT_GPU_MEMORY_BUDGET = 4096 + +#: Value to use if --omps is not specified. +DEFAULT_OMPS_PER_NODE = 1 + +#: Value to use if --ompthreads is not specified. +DEFAULT_OMPTHREADS = 4 + +#: Default values to apply to normalize the testing environment. +DEFAULT_PROCESS_ENV = { + "LEGATE_TEST": "1", +} + +#: Feature values that are accepted for --use, in the relative order +#: that the corresponding test stages should always execute in +FEATURES: tuple[FeatureType, ...] = ( + "cpus", + "cuda", + "eager", + "openmp", +) + +#: Paths to example files that should be skipped. +SKIPPED_EXAMPLES = { + "examples/ingest.py", + "examples/kmeans_sort.py", + "examples/lstm_full.py", + "examples/wgrad.py", +} + +#: Extra arguments to supply when specific examples are executed. +PER_FILE_ARGS = { + "examples/lstm_full.py": ["--file", "resources/lstm_input.txt"], +} diff --git a/legate/tester/args.py b/legate/tester/args.py new file mode 100644 index 000000000..6c3f24962 --- /dev/null +++ b/legate/tester/args.py @@ -0,0 +1,223 @@ +# Copyright 2022 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +"""Provide an argparse ArgumentParser for the test runner. + +""" +from __future__ import annotations + +from argparse import ArgumentParser +from typing import Literal, Union + +from typing_extensions import TypeAlias + +from ..util.args import ExtendAction, MultipleChoices +from . import ( + DEFAULT_CPUS_PER_NODE, + DEFAULT_GPU_DELAY, + DEFAULT_GPU_MEMORY_BUDGET, + DEFAULT_GPUS_PER_NODE, + DEFAULT_OMPS_PER_NODE, + DEFAULT_OMPTHREADS, + FEATURES, +) + +PinOptionsType: TypeAlias = Union[ + Literal["partial"], + Literal["none"], + Literal["strict"], +] + +PIN_OPTIONS: tuple[PinOptionsType, ...] 
= ( + "partial", + "none", + "strict", +) + + +#: The argument parser for test.py +parser = ArgumentParser( + description="Run the Cunumeric test suite", + epilog="Any extra arguments will be forwarded to the Legate script", +) + + +stages = parser.add_argument_group("Feature stage selection") + + +stages.add_argument( + "--use", + dest="features", + action=ExtendAction, + choices=MultipleChoices(sorted(FEATURES)), + type=lambda s: s.split(","), # type: ignore + help="Test Legate with features (also via USE_*)", +) + + +selection = parser.add_argument_group("Test file selection") + + +selection.add_argument( + "--files", + nargs="+", + default=None, + help="Explicit list of test files to run", +) + + +selection.add_argument( + "--unit", + dest="unit", + action="store_true", + default=False, + help="Include unit tests", +) + + +feature_opts = parser.add_argument_group("Feature stage configuration options") + + +feature_opts.add_argument( + "--cpus", + dest="cpus", + type=int, + default=DEFAULT_CPUS_PER_NODE, + help="Number of CPUs per node to use", +) + + +feature_opts.add_argument( + "--gpus", + dest="gpus", + type=int, + default=DEFAULT_GPUS_PER_NODE, + help="Number of GPUs per node to use", +) + + +feature_opts.add_argument( + "--omps", + dest="omps", + type=int, + default=DEFAULT_OMPS_PER_NODE, + help="Number OpenMP processors per node to use", +) + + +feature_opts.add_argument( + "--utility", + dest="utility", + type=int, + default=1, + help="Number of of utility CPUs to reserve for runtime services", +) + + +feature_opts.add_argument( + "--cpu-pin", + dest="cpu_pin", + choices=PIN_OPTIONS, + default="partial", + help="CPU pinning behavior on platforms that support CPU pinning", +) + +feature_opts.add_argument( + "--gpu-delay", + dest="gpu_delay", + type=int, + default=DEFAULT_GPU_DELAY, + help="Delay to introduce between GPU tests (ms)", +) + + +feature_opts.add_argument( + "--fbmem", + dest="fbmem", + type=int, + default=DEFAULT_GPU_MEMORY_BUDGET, + help="GPU framebuffer memory (MB)", +) + + +feature_opts.add_argument( + "--ompthreads", + dest="ompthreads", + metavar="THREADS", + type=int, + default=DEFAULT_OMPTHREADS, + help="Number of threads per OpenMP processor", +) + + +test_opts = parser.add_argument_group("Test run configuration options") + + +test_opts.add_argument( + "--legate", + dest="legate_dir", + metavar="LEGATE_DIR", + action="store", + default=None, + required=False, + help="Path to Legate installation directory", +) + + +test_opts.add_argument( + "-C", + "--directory", + dest="test_root", + metavar="DIR", + action="store", + default=None, + required=False, + help="Root directory containing the tests subdirectory", +) + + +test_opts.add_argument( + "-j", + "--workers", + dest="workers", + type=int, + default=None, + help="Number of parallel workers for testing", +) + + +test_opts.add_argument( + "-v", + "--verbose", + dest="verbose", + action="count", + default=0, + help="Display verbose output. 
Use -vv for even more output (test stdout)", +) + + +test_opts.add_argument( + "--dry-run", + dest="dry_run", + action="store_true", + help="Print the test plan but don't run anything", +) + + +test_opts.add_argument( + "--debug", + dest="debug", + action="store_true", + help="Print out the commands that are to be executed", +) diff --git a/legate/tester/config.py b/legate/tester/config.py new file mode 100644 index 000000000..a621ba8c3 --- /dev/null +++ b/legate/tester/config.py @@ -0,0 +1,163 @@ +# Copyright 2022 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +"""Consolidate test configuration from command-line and environment. + +""" +from __future__ import annotations + +import os +from argparse import Namespace +from pathlib import Path + +from ..util.types import ArgList, EnvDict +from . import DEFAULT_PROCESS_ENV, FEATURES, SKIPPED_EXAMPLES, FeatureType +from .args import parser + + +class Config: + """A centralized configuration object that provides the information + needed by test stages in order to run. + + Parameters + ---------- + argv : ArgList + command-line arguments to use when building the configuration + + """ + + def __init__(self, argv: ArgList) -> None: + args, self._extra_args = parser.parse_known_args(argv[1:]) + + # which tests to run + self.examples = True + self.integration = True + self.unit = args.unit + self.files = args.files + + # feature configuration + self.features = self._compute_features(args) + + # feature options for integration tests + self.cpus = args.cpus + self.gpus = args.gpus + self.omps = args.omps + self.utility = args.utility + self.cpu_pin = args.cpu_pin + self.fbmem = args.fbmem + self.gpu_delay = args.gpu_delay + self.ompthreads = args.ompthreads + + # test run configuration + self.debug = args.debug + self.dry_run = args.dry_run + self.verbose = args.verbose + self.test_root = args.test_root + self.requested_workers = args.workers + self.legate_dir = self._compute_legate_dir(args) + + @property + def env(self) -> EnvDict: + """Custom environment settings used for process exectution.""" + return dict(DEFAULT_PROCESS_ENV) + + @property + def extra_args(self) -> ArgList: + """Extra command-line arguments to pass on to individual test files.""" + return self._extra_args + + @property + def root_dir(self) -> Path: + """Path to the directory containing the tests.""" + if self.test_root: + return Path(self.test_root) + + # if not explicitly given, just use cwd assuming we are at a repo top + return Path(os.getcwd()) + + @property + def test_files(self) -> tuple[Path, ...]: + """List of all test files to use for each stage. + + An explicit list of files from the command line will take precedence. + + Otherwise, the files are computed based on command-line options, etc. 
+ + """ + if self.files: + return self.files + + files = [] + + if self.examples: + examples = ( + path.relative_to(self.root_dir) + for path in self.root_dir.joinpath("examples").glob("*.py") + if str(path.relative_to(self.root_dir)) not in SKIPPED_EXAMPLES + ) + files.extend(sorted(examples)) + + if self.integration: + integration_tests = ( + path.relative_to(self.root_dir) + for path in self.root_dir.joinpath("tests/integration").glob( + "*.py" + ) + ) + files.extend(sorted(integration_tests)) + + if self.unit: + unit_tests = ( + path.relative_to(self.root_dir) + for path in self.root_dir.joinpath("tests/unit").glob( + "**/*.py" + ) + ) + files.extend(sorted(unit_tests)) + + return tuple(files) + + @property + def legate_path(self) -> str: + """Computed path to the legate driver script""" + if self.legate_dir is None: + return "legate" + return str(self.legate_dir / "bin" / "legate") + + def _compute_features(self, args: Namespace) -> tuple[FeatureType, ...]: + if args.features is not None: + computed = args.features + else: + computed = [ + feature + for feature in FEATURES + if os.environ.get(f"USE_{feature.upper()}", None) == "1" + ] + + # if nothing is specified any other way, at least run CPU stage + if len(computed) == 0: + computed.append("cpus") + + return tuple(computed) + + def _compute_legate_dir(self, args: Namespace) -> Path | None: + # self._legate_source below is purely for testing + if args.legate_dir: + self._legate_source = "cmd" + return Path(args.legate_dir) + elif "LEGATE_DIR" in os.environ: + self._legate_source = "env" + return Path(os.environ["LEGATE_DIR"]) + self._legate_source = "install" + return None diff --git a/legate/tester/logger.py b/legate/tester/logger.py new file mode 100644 index 000000000..f40904219 --- /dev/null +++ b/legate/tester/logger.py @@ -0,0 +1,67 @@ +# Copyright 2022 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +"""Provide a basic logger that can scrub ANSI color codes. 
+ +""" +from __future__ import annotations + +import re + +# ref: https://stackoverflow.com/a/14693789 +_ANSI_ESCAPE = re.compile(r"\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])") + + +class Log: + def __init__(self) -> None: + self._record: list[str] = [] + + def __call__(self, *lines: str) -> tuple[int, int]: + return self.record(*lines) + + def record(self, *lines: str) -> tuple[int, int]: + if len(lines) == 1 and "\n" in lines[0]: + lines = tuple(lines[0].split("\n")) + + start = len(self._record) + for line in lines: + self._record.append(line) + print(line, flush=True) + return (start, len(self._record)) + + def clear(self) -> None: + self._record = [] + + def dump( + self, + *, + start: int = 0, + end: int | None = None, + filter_ansi: bool = True, + ) -> str: + lines = self._record[start:end] + + if filter_ansi: + full_text = _ANSI_ESCAPE.sub("", "\n".join(lines)) + else: + full_text = "\n".join(lines) + + return full_text + + @property + def lines(self) -> tuple[str, ...]: + return tuple(self._record) + + +LOG = Log() diff --git a/legate/tester/stages/__init__.py b/legate/tester/stages/__init__.py new file mode 100644 index 000000000..fa8f916d5 --- /dev/null +++ b/legate/tester/stages/__init__.py @@ -0,0 +1,41 @@ +# Copyright 2022 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +"""Provide TestStage subclasses for running configured test files using +specific features. + +""" +from __future__ import annotations + +import sys +from typing import Dict, Type + +from .. import FeatureType +from .test_stage import TestStage +from .util import log_proc + +if sys.platform == "darwin": + from ._osx import CPU, Eager, GPU, OMP +elif sys.platform.startswith("linux"): + from ._linux import CPU, Eager, GPU, OMP +else: + raise RuntimeError(f"unsupported platform: {sys.platform}") + +#: All the available test stages that can be selected +STAGES: Dict[FeatureType, Type[TestStage]] = { + "cpus": CPU, + "cuda": GPU, + "openmp": OMP, + "eager": Eager, +} diff --git a/legate/tester/stages/_linux/__init__.py b/legate/tester/stages/_linux/__init__.py new file mode 100644 index 000000000..032305f9c --- /dev/null +++ b/legate/tester/stages/_linux/__init__.py @@ -0,0 +1,24 @@ +# Copyright 2022 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +"""Provide TestStage subclasses for running configured test files using +specific features on linux platforms. 
+ +""" +from __future__ import annotations + +from .cpu import CPU +from .gpu import GPU +from .eager import Eager +from .omp import OMP diff --git a/legate/tester/stages/_linux/cpu.py b/legate/tester/stages/_linux/cpu.py new file mode 100644 index 000000000..deb5610a6 --- /dev/null +++ b/legate/tester/stages/_linux/cpu.py @@ -0,0 +1,83 @@ +# Copyright 2022 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +from __future__ import annotations + +from itertools import chain +from typing import TYPE_CHECKING + +from ..test_stage import TestStage +from ..util import ( + CUNUMERIC_TEST_ARG, + UNPIN_ENV, + Shard, + StageSpec, + adjust_workers, +) + +if TYPE_CHECKING: + from ....util.types import ArgList, EnvDict + from ... import FeatureType + from ...config import Config + from ...test_system import TestSystem + + +class CPU(TestStage): + """A test stage for exercising CPU features. + + Parameters + ---------- + config: Config + Test runner configuration + + system: TestSystem + Process execution wrapper + + """ + + kind: FeatureType = "cpus" + + args = [CUNUMERIC_TEST_ARG] + + def __init__(self, config: Config, system: TestSystem) -> None: + self._init(config, system) + + def env(self, config: Config, system: TestSystem) -> EnvDict: + return {} if config.cpu_pin == "strict" else dict(UNPIN_ENV) + + def shard_args(self, shard: Shard, config: Config) -> ArgList: + args = [ + "--cpus", + str(config.cpus), + ] + if config.cpu_pin != "none": + args += [ + "--cpu-bind", + ",".join(str(x) for x in shard), + ] + return args + + def compute_spec(self, config: Config, system: TestSystem) -> StageSpec: + cpus = system.cpus + + procs = config.cpus + config.utility + int(config.cpu_pin == "strict") + workers = adjust_workers(len(cpus) // procs, config.requested_workers) + + shards: list[tuple[int, ...]] = [] + for i in range(workers): + shard_cpus = range(i * procs, (i + 1) * procs) + shard = chain.from_iterable(cpus[j].ids for j in shard_cpus) + shards.append(tuple(sorted(shard))) + + return StageSpec(workers, shards) diff --git a/legate/tester/stages/_linux/eager.py b/legate/tester/stages/_linux/eager.py new file mode 100644 index 000000000..cc9a08d5a --- /dev/null +++ b/legate/tester/stages/_linux/eager.py @@ -0,0 +1,75 @@ +# Copyright 2022 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +from __future__ import annotations + +from typing import TYPE_CHECKING + +from ..test_stage import TestStage +from ..util import Shard, StageSpec, adjust_workers + +if TYPE_CHECKING: + from ....util.types import ArgList, EnvDict + from ... import FeatureType + from ...config import Config + from ...test_system import TestSystem + + +class Eager(TestStage): + """A test stage for exercising Eager Numpy execution features. + + Parameters + ---------- + config: Config + Test runner configuration + + system: TestSystem + Process execution wrapper + + """ + + kind: FeatureType = "eager" + + args: ArgList = [] + + def __init__(self, config: Config, system: TestSystem) -> None: + self._init(config, system) + + def env(self, config: Config, system: TestSystem) -> EnvDict: + # Raise min chunk sizes for deferred codepaths to force eager execution + env = { + "CUNUMERIC_MIN_CPU_CHUNK": "2000000000", + "CUNUMERIC_MIN_OMP_CHUNK": "2000000000", + "CUNUMERIC_MIN_GPU_CHUNK": "2000000000", + } + return env + + def shard_args(self, shard: Shard, config: Config) -> ArgList: + return [ + "--cpus", + "1", + "--cpu-bind", + ",".join(str(x) for x in shard), + ] + + def compute_spec(self, config: Config, system: TestSystem) -> StageSpec: + N = len(system.cpus) + + degree = min(N, 60) # ~LEGION_MAX_NUM_PROCS just in case + workers = adjust_workers(degree, config.requested_workers) + + # Just put each worker on its own full CPU for eager tests + shards = [cpu.ids for cpu in system.cpus] + + return StageSpec(workers, shards) diff --git a/legate/tester/stages/_linux/gpu.py b/legate/tester/stages/_linux/gpu.py new file mode 100644 index 000000000..f1a222fc0 --- /dev/null +++ b/legate/tester/stages/_linux/gpu.py @@ -0,0 +1,85 @@ +# Copyright 2022 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +from __future__ import annotations + +import time +from typing import TYPE_CHECKING + +from ..test_stage import TestStage +from ..util import CUNUMERIC_TEST_ARG, Shard, StageSpec, adjust_workers + +if TYPE_CHECKING: + from ....util.types import ArgList, EnvDict + from ... import FeatureType + from ...config import Config + from ...test_system import TestSystem + +BLOAT_FACTOR = 1.5 # hard coded for now + + +class GPU(TestStage): + """A test stage for exercising GPU features. 
+ + Parameters + ---------- + config: Config + Test runner configuration + + system: TestSystem + Process execution wrapper + + """ + + kind: FeatureType = "cuda" + + args = [CUNUMERIC_TEST_ARG] + + def __init__(self, config: Config, system: TestSystem) -> None: + self._init(config, system) + + def env(self, config: Config, system: TestSystem) -> EnvDict: + return {} + + def delay(self, shard: Shard, config: Config, system: TestSystem) -> None: + time.sleep(config.gpu_delay / 1000) + + def shard_args(self, shard: Shard, config: Config) -> ArgList: + return [ + "--fbmem", + str(config.fbmem), + "--gpus", + str(len(shard)), + "--gpu-bind", + ",".join(str(x) for x in shard), + ] + + def compute_spec(self, config: Config, system: TestSystem) -> StageSpec: + N = len(system.gpus) + degree = N // config.gpus + + fbsize = min(gpu.total for gpu in system.gpus) / (2 << 20) # MB + oversub_factor = int(fbsize // (config.fbmem * BLOAT_FACTOR)) + workers = adjust_workers( + degree * oversub_factor, config.requested_workers + ) + + # https://docs.python.org/3/library/itertools.html#itertools-recipes + # grouper('ABCDEF', 3) --> ABC DEF + args = [iter(range(degree * config.gpus))] * config.gpus + per_worker_shards = list(zip(*args)) + + shards = per_worker_shards * workers + + return StageSpec(workers, shards) diff --git a/legate/tester/stages/_linux/omp.py b/legate/tester/stages/_linux/omp.py new file mode 100644 index 000000000..f7af3e9d0 --- /dev/null +++ b/legate/tester/stages/_linux/omp.py @@ -0,0 +1,87 @@ +# Copyright 2022 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +from __future__ import annotations + +from itertools import chain +from typing import TYPE_CHECKING + +from ..test_stage import TestStage +from ..util import ( + CUNUMERIC_TEST_ARG, + UNPIN_ENV, + Shard, + StageSpec, + adjust_workers, +) + +if TYPE_CHECKING: + from ....util.types import ArgList, EnvDict + from ... import FeatureType + from ...config import Config + from ...test_system import TestSystem + + +class OMP(TestStage): + """A test stage for exercising OpenMP features. 
+ + Parameters + ---------- + config: Config + Test runner configuration + + system: TestSystem + Process execution wrapper + + """ + + kind: FeatureType = "openmp" + + args = [CUNUMERIC_TEST_ARG] + + def __init__(self, config: Config, system: TestSystem) -> None: + self._init(config, system) + + def env(self, config: Config, system: TestSystem) -> EnvDict: + return {} if config.cpu_pin == "strict" else dict(UNPIN_ENV) + + def shard_args(self, shard: Shard, config: Config) -> ArgList: + args = [ + "--omps", + str(config.omps), + "--ompthreads", + str(config.ompthreads), + ] + if config.cpu_pin != "none": + args += [ + "--cpu-bind", + ",".join(str(x) for x in shard), + ] + return args + + def compute_spec(self, config: Config, system: TestSystem) -> StageSpec: + cpus = system.cpus + omps, threads = config.omps, config.ompthreads + procs = ( + omps * threads + config.utility + int(config.cpu_pin == "strict") + ) + workers = adjust_workers(len(cpus) // procs, config.requested_workers) + + shards: list[tuple[int, ...]] = [] + for i in range(workers): + shard_cpus = range(i * procs, (i + 1) * procs) + shard = chain.from_iterable(cpus[j].ids for j in shard_cpus) + shards.append(tuple(sorted(shard))) + + return StageSpec(workers, shards) diff --git a/legate/tester/stages/_osx/__init__.py b/legate/tester/stages/_osx/__init__.py new file mode 100644 index 000000000..80a7c368d --- /dev/null +++ b/legate/tester/stages/_osx/__init__.py @@ -0,0 +1,24 @@ +# Copyright 2022 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +"""Provide TestStage subclasses for running configured test files using +specific features on OSX. + +""" +from __future__ import annotations + +from .cpu import CPU +from .gpu import GPU +from .eager import Eager +from .omp import OMP diff --git a/legate/tester/stages/_osx/cpu.py b/legate/tester/stages/_osx/cpu.py new file mode 100644 index 000000000..182a6d76b --- /dev/null +++ b/legate/tester/stages/_osx/cpu.py @@ -0,0 +1,68 @@ +# Copyright 2022 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +from __future__ import annotations + +from typing import TYPE_CHECKING + +from ..test_stage import TestStage +from ..util import ( + CUNUMERIC_TEST_ARG, + UNPIN_ENV, + Shard, + StageSpec, + adjust_workers, +) + +if TYPE_CHECKING: + from ....util.types import ArgList, EnvDict + from ... import FeatureType + from ...config import Config + from ...test_system import TestSystem + + +class CPU(TestStage): + """A test stage for exercising CPU features. 
+ + Parameters + ---------- + config: Config + Test runner configuration + + system: TestSystem + Process execution wrapper + + """ + + kind: FeatureType = "cpus" + + args = [CUNUMERIC_TEST_ARG] + + def __init__(self, config: Config, system: TestSystem) -> None: + self._init(config, system) + + def env(self, config: Config, system: TestSystem) -> EnvDict: + return UNPIN_ENV + + def shard_args(self, shard: Shard, config: Config) -> ArgList: + return ["--cpus", str(config.cpus)] + + def compute_spec(self, config: Config, system: TestSystem) -> StageSpec: + procs = config.cpus + config.utility + workers = adjust_workers( + len(system.cpus) // procs, config.requested_workers + ) + + # return a dummy set of shards just for the runner to iterate over + return StageSpec(workers, [(i,) for i in range(workers)]) diff --git a/legate/tester/stages/_osx/eager.py b/legate/tester/stages/_osx/eager.py new file mode 100644 index 000000000..b32feb17d --- /dev/null +++ b/legate/tester/stages/_osx/eager.py @@ -0,0 +1,68 @@ +# Copyright 2022 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +from __future__ import annotations + +from typing import TYPE_CHECKING + +from ..test_stage import TestStage +from ..util import UNPIN_ENV, Shard, StageSpec, adjust_workers + +if TYPE_CHECKING: + from ....util.types import ArgList, EnvDict + from ... import FeatureType + from ...config import Config + from ...test_system import TestSystem + + +class Eager(TestStage): + """A test stage for exercising Eager Numpy execution features. + + Parameters + ---------- + config: Config + Test runner configuration + + system: TestSystem + Process execution wrapper + + """ + + kind: FeatureType = "eager" + + args: ArgList = [] + + def __init__(self, config: Config, system: TestSystem) -> None: + self._init(config, system) + + def env(self, config: Config, system: TestSystem) -> EnvDict: + # Raise min chunk sizes for deferred codepaths to force eager execution + env = { + "CUNUMERIC_MIN_CPU_CHUNK": "2000000000", + "CUNUMERIC_MIN_OMP_CHUNK": "2000000000", + "CUNUMERIC_MIN_GPU_CHUNK": "2000000000", + } + env.update(UNPIN_ENV) + return env + + def shard_args(self, shard: Shard, config: Config) -> ArgList: + return ["--cpus", "1"] + + def compute_spec(self, config: Config, system: TestSystem) -> StageSpec: + N = len(system.cpus) + degree = min(N, 60) # ~LEGION_MAX_NUM_PROCS just in case + workers = adjust_workers(degree, config.requested_workers) + + # return a dummy set of shards just for the runner to iterate over + return StageSpec(workers, [(i,) for i in range(workers)]) diff --git a/legate/tester/stages/_osx/gpu.py b/legate/tester/stages/_osx/gpu.py new file mode 100644 index 000000000..2a1597494 --- /dev/null +++ b/legate/tester/stages/_osx/gpu.py @@ -0,0 +1,54 @@ +# Copyright 2022 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +from __future__ import annotations + +import time +from typing import TYPE_CHECKING + +from ..test_stage import TestStage +from ..util import CUNUMERIC_TEST_ARG, UNPIN_ENV, Shard + +if TYPE_CHECKING: + from ....util.types import ArgList, EnvDict + from ... import FeatureType + from ...config import Config + from ...test_system import TestSystem + + +class GPU(TestStage): + """A test stage for exercising GPU features. + + Parameters + ---------- + config: Config + Test runner configuration + + system: TestSystem + Process execution wrapper + + """ + + kind: FeatureType = "cuda" + + args: ArgList = [CUNUMERIC_TEST_ARG] + + def __init__(self, config: Config, system: TestSystem) -> None: + raise RuntimeError("GPU test are not supported on OSX") + + def env(self, config: Config, system: TestSystem) -> EnvDict: + return UNPIN_ENV + + def delay(self, shard: Shard, config: Config, system: TestSystem) -> None: + time.sleep(config.gpu_delay / 1000) diff --git a/legate/tester/stages/_osx/omp.py b/legate/tester/stages/_osx/omp.py new file mode 100644 index 000000000..eb279791a --- /dev/null +++ b/legate/tester/stages/_osx/omp.py @@ -0,0 +1,74 @@ +# Copyright 2022 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +from __future__ import annotations + +from typing import TYPE_CHECKING + +from ..test_stage import TestStage +from ..util import ( + CUNUMERIC_TEST_ARG, + UNPIN_ENV, + Shard, + StageSpec, + adjust_workers, +) + +if TYPE_CHECKING: + from ....util.types import ArgList, EnvDict + from ... import FeatureType + from ...config import Config + from ...test_system import TestSystem + + +class OMP(TestStage): + """A test stage for exercising OpenMP features. 
+ + Parameters + ---------- + config: Config + Test runner configuration + + system: TestSystem + Process execution wrapper + + """ + + kind: FeatureType = "openmp" + + args = [CUNUMERIC_TEST_ARG] + + def __init__(self, config: Config, system: TestSystem) -> None: + self._init(config, system) + + def env(self, config: Config, system: TestSystem) -> EnvDict: + return UNPIN_ENV + + def shard_args(self, shard: Shard, config: Config) -> ArgList: + return [ + "--omps", + str(config.omps), + "--ompthreads", + str(config.ompthreads), + ] + + def compute_spec(self, config: Config, system: TestSystem) -> StageSpec: + omps, threads = config.omps, config.ompthreads + procs = omps * threads + config.utility + workers = adjust_workers( + len(system.cpus) // procs, config.requested_workers + ) + + # return a dummy set of shards just for the runner to iterate over + return StageSpec(workers, [(i,) for i in range(workers)]) diff --git a/legate/tester/stages/test_stage.py b/legate/tester/stages/test_stage.py new file mode 100644 index 000000000..c21fdd630 --- /dev/null +++ b/legate/tester/stages/test_stage.py @@ -0,0 +1,268 @@ +# Copyright 2022 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +from __future__ import annotations + +import multiprocessing +from datetime import datetime +from pathlib import Path + +from typing_extensions import Protocol + +from ...util.colors import yellow +from ...util.types import ArgList, EnvDict +from ...util.ui import banner, summary +from .. import PER_FILE_ARGS, FeatureType +from ..config import Config +from ..test_system import ProcessResult, TestSystem +from .util import Shard, StageResult, StageSpec, log_proc + + +class TestStage(Protocol): + """Encapsulate running configured test files using specific features. + + Parameters + ---------- + config: Config + Test runner configuration + + system: TestSystem + Process execution wrapper + + """ + + kind: FeatureType + + #: The computed specification for processes to launch to run the + #: configured test files. + spec: StageSpec + + #: The computed sharding id sets to use for job runs + shards: multiprocessing.Queue[Shard] + + #: After the stage completes, results will be stored here + result: StageResult + + #: Any fixed stage-specific command-line args to pass + args: ArgList + + # --- Protocol methods + + def __init__(self, config: Config, system: TestSystem) -> None: + ... + + def env(self, config: Config, system: TestSystem) -> EnvDict: + """Generate stage-specific customizations to the process env + + Parameters + ---------- + config: Config + Test runner configuration + + system: TestSystem + Process execution wrapper + + """ + ... + + def delay(self, shard: Shard, config: Config, system: TestSystem) -> None: + """Wait any delay that should be applied before running the next + test. + + Parameters + ---------- + shard: Shard + The shard to be used for the next test that is run + + config: Config + Test runner configuration + + system: TestSystem + Process execution wrapper + + """ + ... 
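[Editor's note] The TestStage protocol being defined here is easiest to see with a concrete instance. Below is a minimal sketch, not part of this patch; the class name ToyStage and its choices are hypothetical, and it assumes the legate.tester modules added by this patch series are importable. It implements the protocol methods declared around this point (env, delay, shard_args, compute_spec) in the simplest possible way.

    from legate.tester.stages.test_stage import TestStage
    from legate.tester.stages.util import StageSpec, adjust_workers

    class ToyStage(TestStage):
        """Hypothetical stage: run every test file on a single CPU."""

        kind = "cpus"
        args = []

        def __init__(self, config, system):
            self._init(config, system)  # computes self.spec and fills the shard queue

        def env(self, config, system):
            return {}  # no stage-specific environment overrides

        def delay(self, shard, config, system):
            pass  # no delay needed between test launches

        def shard_args(self, shard, config):
            return ["--cpus", "1"]  # fixed per-process launch arguments

        def compute_spec(self, config, system):
            # one dummy shard per worker, at most one worker per available CPU
            workers = adjust_workers(len(system.cpus), config.requested_workers)
            return StageSpec(workers, [(i,) for i in range(workers)])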
+ + def shard_args(self, shard: Shard, config: Config) -> ArgList: + """Generate the command line arguments necessary to launch + the next test process on the given shard. + + Parameters + ---------- + shard: Shard + The shard to be used for the next test that is run + + config: Config + Test runner configuration + + """ + ... + + def compute_spec(self, config: Config, system: TestSystem) -> StageSpec: + """Compute the number of worker processes to launch and stage shards + to use for running the configured test files. + + Parameters + ---------- + config: Config + Test runner configuration + + system: TestSystem + Process execution wrapper + + """ + ... + + # --- Shared implementation methods + + def __call__(self, config: Config, system: TestSystem) -> None: + """Execute this test stage. + + Parameters + ---------- + config: Config + Test runner configuration + + system: TestSystem + Process execution wrapper + + """ + t0 = datetime.now() + procs = self._launch(config, system) + t1 = datetime.now() + + self.result = StageResult(procs, t1 - t0) + + @property + def name(self) -> str: + """A stage name to display for tests in this stage.""" + return self.__class__.__name__ + + @property + def intro(self) -> str: + """An informative banner to display at stage end.""" + workers = self.spec.workers + workers_text = f"{workers} worker{'s' if workers > 1 else ''}" + return ( + banner(f"Entering stage: {self.name} (with {workers_text})") + "\n" + ) + + @property + def outro(self) -> str: + """An informative banner to display at stage end.""" + total, passed = self.result.total, self.result.passed + + result = summary(self.name, total, passed, self.result.time) + + footer = banner( + f"Exiting stage: {self.name}", + details=( + "* Results : " + + yellow( + f"{passed} / {total} files passed " # noqa E500 + f"({passed/total*100:0.1f}%)" + if total > 0 + else "0 tests are running, Please check " + ), + "* Elapsed time : " + yellow(f"{self.result.time}"), + ), + ) + + return f"{result}\n{footer}" + + def file_args(self, test_file: Path, config: Config) -> ArgList: + """Extra command line arguments based on the test file. + + Parameters + ---------- + test_file : Path + Path to a test file + + config: Config + Test runner configuration + + """ + test_file_string = str(test_file) + args = PER_FILE_ARGS.get(test_file_string, []) + + # These are a bit ugly but necessary in order to make pytest generate + # more verbose output for integration tests when -v, -vv is specified + if "integration" in test_file_string and config.verbose > 0: + args += ["-v"] + if "integration" in test_file_string and config.verbose > 1: + args += ["-s"] + + return args + + def run( + self, test_file: Path, config: Config, system: TestSystem + ) -> ProcessResult: + """Execute a single test files with appropriate environment and + command-line options for a feature test stage. 
+ + Parameters + ---------- + test_file : Path + Test file to execute + + config: Config + Test runner configuration + + system: TestSystem + Process execution wrapper + + """ + test_path = config.root_dir / test_file + + shard = self.shards.get() + + stage_args = self.args + self.shard_args(shard, config) + file_args = self.file_args(test_file, config) + + cmd = [str(config.legate_path), str(test_path)] + cmd += stage_args + file_args + config.extra_args + + self.delay(shard, config, system) + + result = system.run(cmd, test_file, env=self._env(config, system)) + log_proc(self.name, result, config, verbose=config.verbose) + + self.shards.put(shard) + + return result + + def _env(self, config: Config, system: TestSystem) -> EnvDict: + env = dict(config.env) + env.update(self.env(config, system)) + return env + + def _init(self, config: Config, system: TestSystem) -> None: + self.spec = self.compute_spec(config, system) + self.shards = system.manager.Queue(len(self.spec.shards)) + for shard in self.spec.shards: + self.shards.put(shard) + + def _launch( + self, config: Config, system: TestSystem + ) -> list[ProcessResult]: + + pool = multiprocessing.pool.ThreadPool(self.spec.workers) + + jobs = [ + pool.apply_async(self.run, (path, config, system)) + for path in config.test_files + ] + pool.close() + + return [job.get() for job in jobs] diff --git a/legate/tester/stages/util.py b/legate/tester/stages/util.py new file mode 100644 index 000000000..2d6514877 --- /dev/null +++ b/legate/tester/stages/util.py @@ -0,0 +1,115 @@ +# Copyright 2022 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +from __future__ import annotations + +from dataclasses import dataclass +from datetime import timedelta +from typing import Tuple, Union + +from typing_extensions import TypeAlias + +from ...util.ui import failed, passed, shell, skipped +from ..config import Config +from ..logger import LOG +from ..test_system import ProcessResult + +CUNUMERIC_TEST_ARG = "-cunumeric:test" + +UNPIN_ENV = {"REALM_SYNTHETIC_CORE_MAP": ""} + +Shard: TypeAlias = Tuple[int, ...] + + +@dataclass(frozen=True) +class StageSpec: + """Specify the operation of a test run""" + + #: The number of worker processes to start for running tests + workers: int + + # A list of (cpu or gpu) shards to draw on for each test + shards: list[Shard] + + +@dataclass(frozen=True) +class StageResult: + """Collect results from all tests in a TestStage.""" + + #: Individual test process results including return code and stdout. + procs: list[ProcessResult] + + #: Cumulative execution time for all tests in a stage. 
+ time: timedelta + + @property + def total(self) -> int: + """The total number of tests run in this stage.""" + return len(self.procs) + + @property + def passed(self) -> int: + """The number of tests in this stage that passed.""" + return sum(p.returncode == 0 for p in self.procs) + + +def adjust_workers(workers: int, requested_workers: Union[int, None]) -> int: + """Adjust computed workers according to command line requested workers. + + The final number of workers will only be adjusted down by this function. + + Parameters + ---------- + workers: int + The computed number of workers to use + + requested_workers: int | None, optional + Requested number of workers from the user, if supplied (default: None) + + Returns + ------- + int + The number of workers to actually use + + """ + if requested_workers is not None and requested_workers < 0: + raise ValueError("requested workers must be non-negative") + + if requested_workers is not None: + if requested_workers > workers: + raise RuntimeError( + "Requested workers greater than assignable workers" + ) + workers = requested_workers + + if workers == 0: + raise RuntimeError("Current configuration results in zero workers") + + return workers + + +def log_proc( + name: str, proc: ProcessResult, config: Config, *, verbose: bool +) -> None: + """Log a process result according to the current configuration""" + if config.debug or config.dry_run: + LOG(shell(proc.invocation)) + msg = f"({name}) {proc.test_file}" + details = proc.output.split("\n") if verbose else None + if proc.skipped: + LOG(skipped(msg)) + elif proc.returncode == 0: + LOG(passed(msg, details=details)) + else: + LOG(failed(msg, details=details)) diff --git a/legate/tester/test_plan.py b/legate/tester/test_plan.py new file mode 100644 index 000000000..cc877f7a4 --- /dev/null +++ b/legate/tester/test_plan.py @@ -0,0 +1,132 @@ +# Copyright 2022 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +"""Provide a TestPlan class to coordinate multiple feature test stages. + +""" +from __future__ import annotations + +from datetime import timedelta +from itertools import chain + +from ..util.colors import yellow +from ..util.ui import banner, rule, summary +from .config import Config +from .logger import LOG +from .stages import STAGES, log_proc +from .test_system import TestSystem + + +class TestPlan: + """Encapsulate an entire test run with multiple feature test stages. 
+ + Parameters + ---------- + config: Config + Test runner configuration + + system: TestSystem + Process execution wrapper + + """ + + def __init__(self, config: Config, system: TestSystem) -> None: + self._config = config + self._system = system + self._stages = [ + STAGES[feature](config, system) for feature in config.features + ] + + def execute(self) -> int: + """Execute the entire test run with all configured feature stages.""" + LOG.clear() + + LOG(self.intro) + + for stage in self._stages: + LOG(stage.intro) + stage(self._config, self._system) + LOG(stage.outro) + + all_procs = tuple( + chain.from_iterable(s.result.procs for s in self._stages) + ) + total = len(all_procs) + passed = sum(proc.returncode == 0 for proc in all_procs) + + LOG(f"\n{rule(pad=4)}") + + self._log_failures(total, passed) + + LOG(self.outro(total, passed)) + + return int((total - passed) > 0) + + @property + def intro(self) -> str: + """An informative banner to display at test run start.""" + + cpus = len(self._system.cpus) + try: + gpus = len(self._system.gpus) + except ImportError: + gpus = 0 + + details = ( + f"* Feature stages : {', '.join(yellow(x) for x in self._config.features)}", # noqa E501 + f"* Test files per stage : {yellow(str(len(self._config.test_files)))}", # noqa E501 + f"* TestSystem description : {yellow(str(cpus) + ' cpus')} / {yellow(str(gpus) + ' gpus')}", # noqa E501 + ) + return banner("Test Suite Configuration", details=details) + + def outro(self, total: int, passed: int) -> str: + """An informative banner to display at test run end. + + Parameters + ---------- + total: int + Number of total tests that ran in all stages + + passed: int + Number of tests that passed in all stages + + """ + details = [ + f"* {s.name: <6}: " + + yellow( + f"{s.result.passed} / {s.result.total} passed in {s.result.time.total_seconds():0.2f}s" # noqa E501 + ) + for s in self._stages + ] + + time = sum((s.result.time for s in self._stages), timedelta(0, 0)) + details.append("") + details.append( + summary("All tests", total, passed, time, justify=False) + ) + + overall = banner("Overall summary", details=details) + + return f"{overall}\n" + + def _log_failures(self, total: int, passed: int) -> None: + if total == passed: + return + + LOG(f"{banner('FAILURES')}\n") + + for stage in self._stages: + procs = (proc for proc in stage.result.procs if proc.returncode) + for proc in procs: + log_proc(stage.name, proc, self._config, verbose=True) diff --git a/legate/tester/test_system.py b/legate/tester/test_system.py new file mode 100644 index 000000000..2c4e9949f --- /dev/null +++ b/legate/tester/test_system.py @@ -0,0 +1,123 @@ +# Copyright 2022 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +"""Provide a System class to encapsulate process execution and reporting +system information (number of CPUs present, etc). 
+ +""" +from __future__ import annotations + +import multiprocessing +import os +from dataclasses import dataclass +from pathlib import Path +from subprocess import PIPE, STDOUT, run as stdlib_run +from typing import Sequence + +from ..util.system import System +from ..util.types import EnvDict + +__all__ = ("TestSystem",) + + +@dataclass +class ProcessResult: + + #: The command invovation, including relevant environment vars + invocation: str + + # User-friendly test file path to use in reported output + test_file: Path + + #: Whether this process was actually invoked + skipped: bool = False + + #: The returncode from the process + returncode: int = 0 + + #: The collected stdout and stderr output from the process + output: str = "" + + +class TestSystem(System): + """A facade class for system-related functions. + + Parameters + ---------- + dry_run : bool, optional + If True, no commands will be executed, but a log of any commands + submitted to ``run`` will be made. (default: False) + + """ + + def __init__( + self, + *, + dry_run: bool = False, + ) -> None: + self.manager = multiprocessing.Manager() + self.dry_run: bool = dry_run + + def run( + self, + cmd: Sequence[str], + test_file: Path, + *, + env: EnvDict | None = None, + cwd: str | None = None, + ) -> ProcessResult: + """Wrapper for subprocess.run that encapsulates logging. + + Parameters + ---------- + cmd : sequence of str + The command to run, split on whitespace into a sequence + of strings + + test_file : Path + User-friendly test file path to use in reported output + + env : dict[str, str] or None, optional, default: None + Environment variables to apply when running the command + + cwd: str or None, optional, default: None + A current working directory to pass to stdlib ``run``. + + """ + + env = env or {} + + envstr = ( + " ".join(f"{k}={v}" for k, v in env.items()) + + min(len(env), 1) * " " + ) + + invocation = envstr + " ".join(cmd) + + if self.dry_run: + return ProcessResult(invocation, test_file, skipped=True) + + full_env = dict(os.environ) + full_env.update(env) + + proc = stdlib_run( + cmd, cwd=cwd, env=full_env, stdout=PIPE, stderr=STDOUT, text=True + ) + + return ProcessResult( + invocation, + test_file, + returncode=proc.returncode, + output=proc.stdout, + ) diff --git a/tests/unit/legate/driver/test_types.py b/legate/util/__init__.py similarity index 100% rename from tests/unit/legate/driver/test_types.py rename to legate/util/__init__.py diff --git a/legate/util/args.py b/legate/util/args.py new file mode 100644 index 000000000..e8fdc0c34 --- /dev/null +++ b/legate/util/args.py @@ -0,0 +1,182 @@ +# Copyright 2021-2022 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +from __future__ import annotations + +import sys +import warnings +from argparse import Action, ArgumentParser, Namespace +from dataclasses import dataclass, fields +from typing import ( + Any, + Generic, + Iterable, + Iterator, + Literal, + Sequence, + Type, + TypeVar, + Union, +) + +from typing_extensions import TypeAlias + + +class _UnsetType: + pass + + +Unset = _UnsetType() + + +T = TypeVar("T") + +NotRequired = Union[_UnsetType, T] + + +# https://docs.python.org/3/library/argparse.html#action +ActionType: TypeAlias = Literal[ + "store", + "store_const", + "store_true", + "append", + "append_const", + "count", + "help", + "version", + "extend", +] + +# https://docs.python.org/3/library/argparse.html#nargs +NargsType: TypeAlias = Literal["?", "*", "+", "..."] + + +@dataclass(frozen=True) +class ArgSpec: + dest: str + action: NotRequired[ActionType] = "store_true" + nargs: NotRequired[Union[int, NargsType]] = Unset + const: NotRequired[Any] = Unset + default: NotRequired[Any] = Unset + type: NotRequired[Type[Any]] = Unset + choices: NotRequired[Sequence[Any]] = Unset + help: NotRequired[str] = Unset + metavar: NotRequired[str] = Unset + + +@dataclass(frozen=True) +class Argument: + name: str + spec: ArgSpec + + +def entries(obj: Any) -> Iterable[tuple[str, Any]]: + for f in fields(obj): + value = getattr(obj, f.name) + if value is not Unset: + yield (f.name, value) + + +class MultipleChoices(Generic[T]): + """A container that reports True for any item or subset inclusion. + + Parameters + ---------- + choices: Iterable[T] + The values to populate the containter. + + Examples + -------- + + >>> choices = MultipleChoices(["a", "b", "c"]) + + >>> "a" in choices + True + + >>> ("b", "c") in choices + True + + """ + + def __init__(self, choices: Iterable[T]) -> None: + self._choices = set(choices) + + def __contains__(self, x: Union[T, Sequence[T]]) -> bool: + if isinstance(x, (list, tuple)): + return set(x).issubset(self._choices) + return x in self._choices + + def __iter__(self) -> Iterator[T]: + return self._choices.__iter__() + + +class ExtendAction(Action, Generic[T]): + """A custom argparse action to collect multiple values into a list.""" + + def __call__( + self, + parser: ArgumentParser, + namespace: Namespace, + values: Union[str, Sequence[T], None], + option_string: Union[str, None] = None, + ) -> None: + items = getattr(namespace, self.dest) or [] + if isinstance(values, (list, tuple)): + items.extend(values) + else: + items.append(values) + # removing any duplicates before storing + setattr(namespace, self.dest, list(set(items))) + + +def parse_library_command_args( + libname: str, args: Iterable[Argument] +) -> Namespace: + """ """ + if not libname.isidentifier(): + raise ValueError( + f"Invalid library {libname!r} for command line arguments" + ) + + parser = ArgumentParser( + prog=f"<{libname} program>", add_help=False, allow_abbrev=False + ) + + lib_prefix = f"-{libname}:" + + argnames = [arg.name for arg in args] + + for arg in args: + argname = f"{lib_prefix}{arg.name}" + kwargs = dict(entries(arg.spec)) + parser.add_argument(argname, **kwargs) + + has_custom_help = "help" in argnames + + if f"{lib_prefix}help" in sys.argv and not has_custom_help: + parser.print_help() + sys.exit() + + args, extra = parser.parse_known_args() + + for item in extra: + if item.startswith(lib_prefix): + warnings.warn( + f"Unrecognized argument {item!r} for {libname} (passed on as-is)" # noqa: E501 + ) + break + + sys.argv = sys.argv[:1] + extra + + return args diff --git 
a/legate/util/colors.py b/legate/util/colors.py new file mode 100644 index 000000000..5bb0b14b3 --- /dev/null +++ b/legate/util/colors.py @@ -0,0 +1,95 @@ +# Copyright 2022 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +"""Helper functions for adding colors to simple text UI output. + +The color functions in this module require ``colorama`` to be installed in +order to generate color output. If ``colorama`` is not available, plain +text output (i.e. without ANSI color codes) will be generated. + +""" +from __future__ import annotations + +import re +import sys + +__all__ = ( + "bright", + "cyan", + "dim", + "green", + "magenta", + "red", + "scrub", + "white", + "yellow", +) + + +def _text(text: str) -> str: + return text + + +try: + import colorama # type: ignore[import] + + def bright(text: str) -> str: + return f"{colorama.Style.BRIGHT}{text}{colorama.Style.RESET_ALL}" + + def dim(text: str) -> str: + return f"{colorama.Style.DIM}{text}{colorama.Style.RESET_ALL}" + + def white(text: str) -> str: + return f"{colorama.Fore.WHITE}{text}{colorama.Style.RESET_ALL}" + + def cyan(text: str) -> str: + return f"{colorama.Fore.CYAN}{text}{colorama.Style.RESET_ALL}" + + def red(text: str) -> str: + return f"{colorama.Fore.RED}{text}{colorama.Style.RESET_ALL}" + + def magenta(text: str) -> str: + return f"{colorama.Fore.MAGENTA}{text}{colorama.Style.RESET_ALL}" + + def green(text: str) -> str: + return f"{colorama.Fore.GREEN}{text}{colorama.Style.RESET_ALL}" + + def yellow(text: str) -> str: + return f"{colorama.Fore.YELLOW}{text}{colorama.Style.RESET_ALL}" + + if sys.platform == "win32": + colorama.init() + +except ImportError: + + bright = dim = white = cyan = red = magenta = green = yellow = _text + +# ref: https://stackoverflow.com/a/14693789 +_ANSI_ESCAPE = re.compile(r"\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])") + + +def scrub(text: str) -> str: + """Remove ANSI color codes from a text string. 
+ + Parameters + ---------- + text : str + The text to scrub + + Returns + ------- + str + + """ + return _ANSI_ESCAPE.sub("", text) diff --git a/legate/driver/util.py b/legate/util/fs.py similarity index 81% rename from legate/driver/util.py rename to legate/util/fs.py index 499b250e3..e05e15279 100644 --- a/legate/driver/util.py +++ b/legate/util/fs.py @@ -17,100 +17,18 @@ import re import sys from pathlib import Path -from shlex import quote -from textwrap import indent -from typing import TYPE_CHECKING, Type, TypeVar -from .types import DataclassProtocol, LegatePaths, LegionPaths -from .ui import kvtable, rule, section, value - -if TYPE_CHECKING: - from .driver import Driver - from .system import System +from .types import LegatePaths, LegionPaths __all__ = ( "get_legate_build_dir", "get_legate_paths", "get_legion_paths", - "object_to_dataclass", - "print_verbose", "read_c_define", "read_cmake_cache_value", ) -T = TypeVar("T", bound=DataclassProtocol) - - -def object_to_dataclass(obj: object, typ: Type[T]) -> T: - """Automatically generate a dataclass from an object with appropriate - attributes. - - Parameters - ---------- - obj: object - An object to pull values from (e.g. an argparse Namespace) - - typ: - A dataclass type to generate from ``obj`` - - Returns - ------- - The generated dataclass instance - - """ - kws = {name: getattr(obj, name) for name in typ.__dataclass_fields__} - return typ(**kws) - - -def print_verbose( - system: System, - driver: Driver | None = None, -) -> None: - """Print system and driver configuration values. - - Parameters - ---------- - system : System - A System instance to obtain Legate and Legion paths from - - driver : Driver or None, optional - If not None, a Driver instance to obtain command invocation and - environment from (default: None) - - Returns - ------- - None - - """ - - print(f"\n{rule('Legion Python Configuration')}") - - print(section("\nLegate paths:")) - print(indent(str(system.legate_paths), prefix=" ")) - - print(section("\nLegion paths:")) - print(indent(str(system.legion_paths), prefix=" ")) - - if driver: - print(section("\nCommand:")) - cmd = " ".join(quote(t) for t in driver.cmd) - print(f" {value(cmd)}") - - if keys := sorted(driver.custom_env_vars): - print(section("\nCustomized Environment:")) - print( - indent( - kvtable(driver.env, delim="=", align=False, keys=keys), - prefix=" ", - ) - ) - - print(f"\n{rule()}") - - print(flush=True) - - def read_c_define(header_path: Path, name: str) -> str | None: """Open a C header file and read the value of a #define @@ -321,15 +239,16 @@ def get_legion_paths(legate_paths: LegatePaths) -> LegionPaths: # local builds over global installations. This allows devs to work in the # source tree and re-run without overwriting existing installations. - def installed_legion_paths( - legion_dir: Path, legion_module: Path | None = None - ) -> LegionPaths: - if legion_module is None: - legion_lib_dir = legion_dir / "lib" - for f in legion_lib_dir.iterdir(): - if f.joinpath("site-packages").exists(): - legion_module = f / "site-packages" - break + def installed_legion_paths(legion_dir: Path) -> LegionPaths: + legion_lib_dir = legion_dir / "lib" + for f in legion_lib_dir.iterdir(): + legion_module = f / "site-packages" + if legion_module.exists(): + break + + # NB: for-else clause! 
(executes if NO loop break) + else: + raise RuntimeError("could not determine legion module location") legion_bin_path = legion_dir / "bin" legion_include_path = legion_dir / "include" diff --git a/legate/driver/system.py b/legate/util/system.py similarity index 51% rename from legate/driver/system.py rename to legate/util/system.py index 57f9ec226..702514cc2 100644 --- a/legate/driver/system.py +++ b/legate/util/system.py @@ -14,11 +14,14 @@ # from __future__ import annotations +import multiprocessing import os import platform +import sys from functools import cached_property -from .util import LegatePaths, LegionPaths, get_legate_paths, get_legion_paths +from .fs import get_legate_paths, get_legion_paths +from .types import CPUInfo, GPUInfo, LegatePaths, LegionPaths __all__ = ("System",) @@ -78,3 +81,51 @@ def LIB_PATH(self) -> str: """ return "LD_LIBRARY_PATH" if self.os == "Linux" else "DYLD_LIBRARY_PATH" + + @cached_property + def cpus(self) -> tuple[CPUInfo, ...]: + """A list of CPUs on the system.""" + + N = multiprocessing.cpu_count() + + if sys.platform == "darwin": + return tuple(CPUInfo((i,)) for i in range(N)) + + sibling_sets: set[tuple[int, ...]] = set() + for i in range(N): + line = open( + f"/sys/devices/system/cpu/cpu{i}/topology/thread_siblings_list" + ).read() + sibling_sets.add( + tuple(sorted(int(x) for x in line.strip().split(","))) + ) + return tuple(CPUInfo(siblings) for siblings in sorted(sibling_sets)) + + @cached_property + def gpus(self) -> tuple[GPUInfo, ...]: + """A list of GPUs on the system, including total memory information.""" + + try: + # This pynvml import is protected inside this method so that in + # case pynvml is not installed, tests stages that don't need gpu + # info (e.g. cpus, eager) will proceed unaffected. Test stages + # that do require gpu info will fail here with an ImportError. + import pynvml # type: ignore[import] + + # Also a pynvml package is available on some platforms that won't + # have GPUs for some reason. In which case this init call will + # fail. + pynvml.nvmlInit() + except Exception: + return () + + num_gpus = pynvml.nvmlDeviceGetCount() + + results = [] + for i in range(num_gpus): + info = pynvml.nvmlDeviceGetMemoryInfo( + pynvml.nvmlDeviceGetHandleByIndex(i) + ) + results.append(GPUInfo(i, info.total)) + + return tuple(results) diff --git a/legate/driver/types.py b/legate/util/types.py similarity index 68% rename from legate/driver/types.py rename to legate/util/types.py index 0bde4643b..2a8166373 100644 --- a/legate/driver/types.py +++ b/legate/util/types.py @@ -12,14 +12,14 @@ # See the License for the specific language governing permissions and # limitations under the License. # -"""Provide types that are useful throughout the driver code. +"""Provide types that are useful throughout the test driver code. """ from __future__ import annotations from dataclasses import Field, dataclass from pathlib import Path -from typing import Any, Dict, List, Protocol, Tuple, Union +from typing import Any, Dict, List, Protocol, Tuple, Type, TypeVar, Union from typing_extensions import Literal, TypeAlias @@ -29,14 +29,37 @@ "ArgList", "Command", "CommandPart", + "CPUInfo", "DataclassMixin", "DataclassProtocol", "EnvDict", + "GPUInfo", "LauncherType", "LegatePaths", "LegionPaths", + "object_to_dataclass", ) + +@dataclass(frozen=True) +class CPUInfo: + """Encapsulate information about a single CPU""" + + #: IDs of hypterthreading sibling cores for a given physscal core + ids: tuple[int, ...] 
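[Editor's note] A tiny usage sketch, not part of the patch, showing how the CPUInfo dataclass just added groups a physical core with its hyperthread siblings; the sibling IDs below are made up, and the join mirrors how stages build their --cpu-bind arguments.

    from legate.util.types import CPUInfo

    # hypothetical machine where logical CPUs 0 and 8 share a physical core
    core = CPUInfo(ids=(0, 8))
    bind = ",".join(str(x) for x in core.ids)  # "0,8"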
+
+
+@dataclass(frozen=True)
+class GPUInfo:
+    """Encapsulate information about a single GPU"""
+
+    #: ID of the GPU to specify in test shards
+    id: int
+
+    #: The total framebuffer memory of this GPU
+    total: int
+
+
 #: Define the available launcher for the driver to use
 LauncherType: TypeAlias = Union[
     Literal["mpirun"], Literal["jsrun"], Literal["srun"], Literal["none"]
 ]
@@ -73,6 +96,30 @@ def __str__(self) -> str:
         return kvtable(self.__dict__)
+
+T = TypeVar("T", bound=DataclassProtocol)
+
+
+def object_to_dataclass(obj: object, typ: Type[T]) -> T:
+    """Automatically generate a dataclass from an object with appropriate
+    attributes.
+
+    Parameters
+    ----------
+    obj: object
+        An object to pull values from (e.g. an argparse Namespace)
+
+    typ:
+        A dataclass type to generate from ``obj``
+
+    Returns
+    -------
+        The generated dataclass instance
+
+    """
+    kws = {name: getattr(obj, name) for name in typ.__dataclass_fields__}
+    return typ(**kws)
+
+
 @dataclass(frozen=True)
 class LegatePaths(DataclassMixin):
     """Collect all the filesystem paths relevant for Legate."""
diff --git a/legate/util/ui.py b/legate/util/ui.py
new file mode 100644
index 000000000..9cf74b094
--- /dev/null
+++ b/legate/util/ui.py
@@ -0,0 +1,345 @@
+# Copyright 2022 NVIDIA Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+"""Helper functions for simple text UI output.
+
+The color functions in this module require ``colorama`` to be installed in
+order to generate color output. If ``colorama`` is not available, plain
+text output (i.e. without ANSI color codes) will be generated.
+
+"""
+from __future__ import annotations
+
+from datetime import timedelta
+from typing import Any, Iterable
+
+from typing_extensions import TypeAlias
+
+from .colors import bright, cyan, dim, green, magenta, red, white, yellow
+
+Details: TypeAlias = Iterable[str]
+
+__all__ = (
+    "UI_WIDTH",
+    "banner",
+    "error",
+    "key",
+    "kvtable",
+    "rule",
+    "section",
+    "value",
+    "warn",
+)
+
+
+#: Width for terminal output headers and footers.
+UI_WIDTH = 80
+
+
+def _format_details(
+    details: Iterable[str] | None = None, pre: str = " "
+) -> str:
+    if details:
+        return f"{pre}" + f"\n{pre}".join(f"{line}" for line in details)
+    return ""
+
+
+def banner(
+    heading: str,
+    *,
+    char: str = "#",
+    width: int = UI_WIDTH,
+    details: Iterable[str] | None = None,
+) -> str:
+    """Generate a title banner, with optional details included.
+
+    Parameters
+    ----------
+    heading : str
+        Text to use for the title
+
+    char : str, optional
+        A character to use to frame the banner. (default: "#")
+
+    width : int, optional
+        How wide to draw the banner.
+        (Note: user-supplied heading or details will not be truncated
+        if they exceed this width)
+
+    details : Iterable[str], optional
+        A list of lines to display inside the banner area below the heading
+
+    """
+    pre = f"{char*3} "
+    divider = char * width
+    if not details:
+        return f"\n{divider}\n{pre}{heading}\n{divider}"
+    return f"""
+{divider}
+{pre}
+{pre}{heading}
+{pre}
+{_format_details(details, pre)}
+{pre}
+{divider}"""
+
+
+def error(text: str) -> str:
+    """Format text as an error.
+
+    Parameters
+    ----------
+    text : str
+        The text to format
+
+    Returns
+    -------
+    str
+
+    """
+    return red(f"ERROR: {text}")
+
+
+def failed(msg: str, *, details: Details | None = None) -> str:
+    """Report a failed test result with a bright red [FAIL].
+
+    Parameters
+    ----------
+    msg : str
+        Text to display after [FAIL]
+
+    details : Iterable[str], optional
+        A sequence of text lines to display below the ``msg`` line
+
+    """
+    if details:
+        return f"{bright(red('[FAIL]'))} {msg}\n{_format_details(details)}"
+    return f"{bright(red('[FAIL]'))} {msg}"
+
+
+def passed(msg: str, *, details: Details | None = None) -> str:
+    """Report a passed test result with a bright green [PASS].
+
+    Parameters
+    ----------
+    msg : str
+        Text to display after [PASS]
+
+    details : Iterable[str], optional
+        A sequence of text lines to display below the ``msg`` line
+
+    """
+    if details:
+        return f"{bright(green('[PASS]'))} {msg}\n{_format_details(details)}"
+    return f"{bright(green('[PASS]'))} {msg}"
+
+
+def key(text: str) -> str:
+    """Format a 'key' from a key-value pair.
+
+    Parameters
+    ----------
+    text : str
+        The key to format
+
+    Returns
+    -------
+    str
+
+    """
+    return dim(green(text))
+
+
+def value(text: str) -> str:
+    """Format a 'value' of a key-value pair.
+
+    Parameters
+    ----------
+    text : str
+        The value to format
+
+    Returns
+    -------
+    str
+
+    """
+    return yellow(text)
+
+
+def kvtable(
+    items: dict[str, Any],
+    *,
+    delim: str = " : ",
+    align: bool = True,
+    keys: Iterable[str] | None = None,
+) -> str:
+    """Format a dictionary as a table of key-value pairs.
+ + Parameters + ---------- + items : dict[str, Any] + The dictionary of items to format + + delim : str, optional + A delimiter to display between keys and values (default: " : ") + + align : bool, optional + Whether to align delimiters to the longest key length (default: True) + + keys : Iterable[str] or None, optional + If not None, only the specified subset of keys is included in the + table output (default: None) + + Returns + ------- + str + + """ + # annoying but necessary to take len on color-formatted version + N = max(len(key(k)) for k in items) if align else 0 + + keys = items.keys() if keys is None else keys + + return "\n".join( + f"{key(k): <{N}}{delim}{value(str(items[k]))}" for k in keys + ) + + +def rule( + text: str | None = None, + *, + pad: int = 0, + char: str = "-", + N: int = UI_WIDTH, +) -> str: + """Format a horizontal rule, optionally with text + + Parameters + ---------- + text : str or None, optional + If not None, display this text inline in the rule (default: None) + + pad : int, optional + An amount of padding to put in front of the rule + + char: str, optional + A character to use for the rule (default: "-") + + N : int, optional + Character width for the rule (default: 80) + + Returns + ------- + str + + """ + width = N - pad + if text is None: + return cyan(f"{char*width: >{N}}") + return cyan(" " * pad + char * 3 + f"{f' {text} ' :{char}<{width-3}}") + + +def section(text: str) -> str: + """Format text as a section header + + Parameters + ---------- + text : str + The text to format + + Returns + ------- + str + + """ + return bright(white(text)) + + +def shell(cmd: str, *, char: str = "+") -> str: + """Report a shell command in a dim white color. + + Parameters + ---------- + cmd : str + The shell command string to display + + char : str, optional + A character to prefix the ``cmd`` with. (default: "+") + + """ + return dim(white(f"{char}{cmd}")) + + +def skipped(msg: str) -> str: + """Report a skipped test with a cyan [SKIP] + + Parameters + ---------- + msg : str + Text to display after [SKIP] + + """ + return f"{cyan('[SKIP]')} {msg}" + + +def summary( + name: str, + total: int, + passed: int, + time: timedelta, + *, + justify: bool = True, +) -> str: + """Generate a test result summary line. + + The output is bright green if all tests passed, otherwise bright red. + + Parameters + ---------- + name : str + A name to display in this summary line. + + total : int + The total number of tests to report. + + passed : int + The number of passed tests to report. + + time : timedelta + The time taken to run the tests + + """ + summary = ( + f"{name}: Passed {passed} of {total} tests ({passed/total*100:0.1f}%) " + f"in {time.total_seconds():0.2f}s" + if total > 0 + else f"{name}: 0 tests are running, Please check" + ) + color = green if passed == total and total > 0 else red + return bright(color(f"{summary: >{UI_WIDTH}}" if justify else summary)) + + +def warn(text: str) -> str: + """Format text as a warning. + + Parameters + ---------- + text : str + The text to format + + Returns + ------- + str + + """ + return magenta(f"WARNING: {text}") diff --git a/tests/unit/__init__.py b/tests/unit/__init__.py new file mode 100644 index 000000000..f0b271624 --- /dev/null +++ b/tests/unit/__init__.py @@ -0,0 +1,15 @@ +# Copyright 2022 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +from __future__ import annotations diff --git a/tests/unit/legate/__init__.py b/tests/unit/legate/__init__.py new file mode 100644 index 000000000..f0b271624 --- /dev/null +++ b/tests/unit/legate/__init__.py @@ -0,0 +1,15 @@ +# Copyright 2022 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +from __future__ import annotations diff --git a/tests/unit/legate/driver/__init__.py b/tests/unit/legate/driver/__init__.py new file mode 100644 index 000000000..f0b271624 --- /dev/null +++ b/tests/unit/legate/driver/__init__.py @@ -0,0 +1,15 @@ +# Copyright 2022 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +from __future__ import annotations diff --git a/tests/unit/legate/driver/conftest.py b/tests/unit/legate/driver/conftest.py index 1b1f31e48..09c8c7d18 100644 --- a/tests/unit/legate/driver/conftest.py +++ b/tests/unit/legate/driver/conftest.py @@ -19,10 +19,12 @@ from typing import Any, Callable, Iterable import pytest -from util import GenConfig, GenSystem -from legate.driver import Config, Launcher, System +from legate.driver import Config, Launcher from legate.driver.config import MultiNode +from legate.util.system import System + +from .util import GenConfig, GenSystem @pytest.fixture diff --git a/tests/unit/legate/driver/test_command.py b/tests/unit/legate/driver/test_command.py index f7188990f..29d4a8632 100644 --- a/tests/unit/legate/driver/test_command.py +++ b/tests/unit/legate/driver/test_command.py @@ -18,12 +18,14 @@ from pathlib import Path import pytest -from util import Capsys, GenObjs, powerset_nonempty import legate.driver.command as m from legate.driver.launcher import RANK_ENV_VARS -from legate.driver.types import LauncherType -from legate.driver.ui import scrub +from legate.util.colors import scrub +from legate.util.types import LauncherType + +from ...util import Capsys, powerset_nonempty +from .util import GenObjs def test___all__() -> None: diff --git a/tests/unit/legate/driver/test_config.py b/tests/unit/legate/driver/test_config.py index 0523b1db9..536289221 100644 --- a/tests/unit/legate/driver/test_config.py +++ b/tests/unit/legate/driver/test_config.py @@ -20,12 +20,13 @@ import pytest from pytest_mock import MockerFixture -from util import Capsys, powerset, powerset_nonempty import legate.driver.config as m import legate.driver.defaults as defaults -from legate.driver.types import DataclassMixin -from legate.driver.ui import scrub +from legate.util.colors import scrub +from legate.util.types import DataclassMixin + +from ...util import Capsys, powerset, powerset_nonempty DEFAULTS_ENV_VARS = ( "LEGATE_EAGER_ALLOC_PERCENTAGE", diff --git a/tests/unit/legate/driver/test_driver.py b/tests/unit/legate/driver/test_driver.py index e346210d3..fad492a2f 100644 --- a/tests/unit/legate/driver/test_driver.py +++ b/tests/unit/legate/driver/test_driver.py @@ -15,19 +15,22 @@ from __future__ import annotations import re +from shlex import quote import pytest from pytest_mock import MockerFixture -from util import Capsys, GenConfig import legate.driver.driver as m from legate.driver.args import LAUNCHERS from legate.driver.command import CMD_PARTS +from legate.driver.config import Config from legate.driver.launcher import RANK_ENV_VARS, Launcher -from legate.driver.system import System -from legate.driver.types import LauncherType -from legate.driver.ui import scrub -from legate.driver.util import print_verbose +from legate.util.colors import scrub +from legate.util.system import System +from legate.util.types import LauncherType + +from ...util import Capsys +from .util import GenConfig SYSTEM = System() @@ -123,7 +126,7 @@ def test_verbose( run_out = scrub(capsys.readouterr()[0]).strip() - print_verbose(driver.system, driver) + m.print_verbose(driver.system, driver) pv_out = scrub(capsys.readouterr()[0]).strip() @@ -152,7 +155,7 @@ def test_verbose_nonero_rank_id( run_out = scrub(capsys.readouterr()[0]).strip() - print_verbose(driver.system, driver) + m.print_verbose(driver.system, driver) pv_out = scrub(capsys.readouterr()[0]).strip() @@ -180,3 +183,48 @@ def test_darwin_gdb_warning( out, _ = capsys.readouterr() assert re.search(DARWIN_GDB_WARN_EXPECTED_PAT, scrub(out)) 
+ + +class Test_print_verbose: + def test_system_only(self, capsys: Capsys) -> None: + system = System() + + m.print_verbose(system) + + out = scrub(capsys.readouterr()[0]).strip() + + assert out.startswith(f"{'--- Legion Python Configuration ':-<80}") + assert "Legate paths:" in out + for line in scrub(str(system.legate_paths)).split(): + assert line in out + + assert "Legion paths:" in out + for line in scrub(str(system.legion_paths)).split(): + assert line in out + + def test_system_and_driver(self, capsys: Capsys) -> None: + config = Config(["legate", "--no-replicate"]) + system = System() + driver = m.Driver(config, system) + + m.print_verbose(system, driver) + + out = scrub(capsys.readouterr()[0]).strip() + + assert out.startswith(f"{'--- Legion Python Configuration ':-<80}") + assert "Legate paths:" in out + for line in scrub(str(system.legate_paths)).split(): + assert line in out + + assert "Legion paths:" in out + for line in scrub(str(system.legion_paths)).split(): + assert line in out + + assert "Command:" in out + assert f" {' '.join(quote(t) for t in driver.cmd)}" in out + + assert "Customized Environment:" in out + for k in driver.custom_env_vars: + assert f"{k}={driver.env[k]}" in out + + assert out.endswith(f"\n{'-':-<80}") diff --git a/tests/unit/legate/driver/test_launcher.py b/tests/unit/legate/driver/test_launcher.py index 1c5b451af..ecf980d87 100644 --- a/tests/unit/legate/driver/test_launcher.py +++ b/tests/unit/legate/driver/test_launcher.py @@ -17,12 +17,14 @@ import os import pytest -from util import GenConfig, GenObjs, powerset_nonempty import legate.driver.launcher as m from legate.driver.args import LAUNCHERS -from legate.driver.system import System -from legate.driver.types import LauncherType +from legate.util.system import System +from legate.util.types import LauncherType + +from ...util import powerset_nonempty +from .util import GenConfig, GenObjs SYSTEM = System() diff --git a/tests/unit/legate/driver/test_logs.py b/tests/unit/legate/driver/test_logs.py index 918dfc283..44e89a364 100644 --- a/tests/unit/legate/driver/test_logs.py +++ b/tests/unit/legate/driver/test_logs.py @@ -16,12 +16,14 @@ import pytest from pytest_mock import MockerFixture -from util import Capsys, GenObjs, powerset_nonempty import legate.driver.logs as m from legate.driver.config import Config from legate.driver.launcher import RANK_ENV_VARS -from legate.driver.ui import scrub +from legate.util.colors import scrub + +from ...util import Capsys, powerset_nonempty +from .util import GenObjs class MockHandler(m.LogHandler): diff --git a/tests/unit/legate/driver/test_main.py b/tests/unit/legate/driver/test_main.py index 0784246a3..4c0260abb 100644 --- a/tests/unit/legate/driver/test_main.py +++ b/tests/unit/legate/driver/test_main.py @@ -28,10 +28,10 @@ def test_main(mocker: MockerFixture) -> None: import legate.driver.config import legate.driver.driver - import legate.driver.system + import legate.util.system config_spy = mocker.spy(legate.driver.config.Config, "__init__") - system_spy = mocker.spy(legate.driver.system.System, "__init__") + system_spy = mocker.spy(legate.util.system.System, "__init__") driver_spy = mocker.spy(legate.driver.driver.Driver, "__init__") mocker.patch("legate.driver.driver.Driver.run", return_value=123) @@ -48,7 +48,7 @@ def test_main(mocker: MockerFixture) -> None: assert driver_spy.call_count == 1 assert len(driver_spy.call_args[0]) == 3 assert isinstance(driver_spy.call_args[0][1], legate.driver.config.Config) - assert 
isinstance(driver_spy.call_args[0][2], legate.driver.system.System) + assert isinstance(driver_spy.call_args[0][2], legate.util.system.System) assert driver_spy.call_args[1] == {} assert result == 123 diff --git a/tests/unit/legate/driver/test_ui.py b/tests/unit/legate/driver/test_ui.py deleted file mode 100644 index 33b8b03eb..000000000 --- a/tests/unit/legate/driver/test_ui.py +++ /dev/null @@ -1,254 +0,0 @@ -# Copyright 2021-2022 NVIDIA Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -from __future__ import annotations - -from typing import Any - -import pytest -from pytest_mock import MockerFixture -from typing_extensions import TypeAlias - -import legate.driver.ui as m - -try: - import colorama # type: ignore -except ImportError: - colorama = None - -UsePlainTextFixture: TypeAlias = Any - - -@pytest.fixture -def use_plain_text(mocker: MockerFixture) -> None: - mocker.patch.object(m, "bright", m._text) - mocker.patch.object(m, "dim", m._text) - mocker.patch.object(m, "white", m._text) - mocker.patch.object(m, "cyan", m._text) - mocker.patch.object(m, "red", m._text) - mocker.patch.object(m, "green", m._text) - mocker.patch.object(m, "yellow", m._text) - mocker.patch.object(m, "magenta", m._text) - - -COLOR_FUNCS = ( - "cyan", - "green", - "magenta", - "red", - "white", - "yellow", -) - -STYLE_FUNCS = ( - "bright", - "dim", -) - - -@pytest.mark.skipif(colorama is None, reason="colorama required") -@pytest.mark.parametrize("color", COLOR_FUNCS) -def test_color_functions(color: str) -> None: - cfunc = getattr(m, color) - cprop = getattr(colorama.Fore, color.upper()) - - out = cfunc("some text") - - assert out == f"{cprop}some text{colorama.Style.RESET_ALL}" - - -@pytest.mark.skipif(colorama is None, reason="colorama required") -@pytest.mark.parametrize("style", STYLE_FUNCS) -def test_style_functions(style: str) -> None: - sfunc = getattr(m, style) - sprop = getattr(colorama.Style, style.upper()) - - out = sfunc("some text") - - assert out == f"{sprop}some text{colorama.Style.RESET_ALL}" - - -@pytest.mark.skipif(colorama is None, reason="colorama required") -def test_error(use_plain_text: UsePlainTextFixture) -> None: - assert m.error("some message") == m.red("ERROR: some message") - - -def test_error_plain(use_plain_text: UsePlainTextFixture) -> None: - assert m.error("some message") == "ERROR: some message" - - -@pytest.mark.skipif(colorama is None, reason="colorama required") -def test_key(use_plain_text: UsePlainTextFixture) -> None: - assert m.key("some key") == m.dim(m.green("some key")) - - -def test_key_plain(use_plain_text: UsePlainTextFixture) -> None: - assert m.key("some key") == "some key" - - -@pytest.mark.skipif(colorama is None, reason="colorama required") -def test_value(use_plain_text: UsePlainTextFixture) -> None: - assert m.value("some value") == m.yellow("some value") - - -def test_value_plain(use_plain_text: UsePlainTextFixture) -> None: - assert m.value("some value") == "some value" - - -class Test_kvtable: - ONE = {"foo": 10} - TWO = {"foo": 10, "barbaz": 
"some value"} - THREE = {"foo": 10, "barbaz": "some value", "a": 1.2} - - @pytest.mark.skipif(colorama is None, reason="colorama required") - @pytest.mark.parametrize("items", (ONE, TWO, THREE)) - def test_default(self, items: dict[str, Any]) -> None: - N = max(len(m.key(k)) for k in items) - assert m.kvtable(items) == "\n".join( - f"{m.key(k): <{N}} : {m.value(str(items[k]))}" for k in items - ) - - @pytest.mark.parametrize("items", (ONE, TWO, THREE)) - def test_default_plain( - self, use_plain_text: UsePlainTextFixture, items: dict[str, Any] - ) -> None: - N = max(len(k) for k in items) - assert m.kvtable(items) == "\n".join( - f"{k: <{N}} : {items[k]}" for k in items - ) - - @pytest.mark.skipif(colorama is None, reason="colorama required") - @pytest.mark.parametrize("items", (ONE, TWO, THREE)) - def test_delim(self, items: dict[str, Any]) -> None: - N = max(len(m.key(k)) for k in items) - assert m.kvtable(items, delim="/") == "\n".join( - f"{m.key(k): <{N}}/{m.value(str(items[k]))}" for k in items - ) - - @pytest.mark.parametrize("items", (ONE, TWO, THREE)) - def test_delim_plain( - self, use_plain_text: UsePlainTextFixture, items: dict[str, Any] - ) -> None: - N = max(len(k) for k in items) - assert m.kvtable(items, delim="/") == "\n".join( - f"{k: <{N}}/{items[k]}" for k in items - ) - - @pytest.mark.skipif(colorama is None, reason="colorama required") - @pytest.mark.parametrize("items", (ONE, TWO, THREE)) - def test_align_False(self, items: dict[str, Any]) -> None: - assert m.kvtable(items, align=False) == "\n".join( - f"{m.key(k)} : {m.value(str(items[k]))}" for k in items - ) - - @pytest.mark.parametrize("items", (ONE, TWO, THREE)) - def test_align_False_plain( - self, use_plain_text: UsePlainTextFixture, items: dict[str, Any] - ) -> None: - assert m.kvtable(items, align=False) == "\n".join( - f"{k} : {items[k]}" for k in items - ) - - @pytest.mark.skipif(colorama is None, reason="colorama required") - def test_keys(self) -> None: - items = self.THREE - keys = ("foo", "a") - N = max(len(m.key(k)) for k in items) - - assert m.kvtable(self.THREE, keys=keys) == "\n".join( - f"{m.key(k): <{N}} : {m.value(str(items[k]))}" for k in keys - ) - - def test_keys_plain(self, use_plain_text: UsePlainTextFixture) -> None: - items = self.THREE - keys = ("foo", "a") - N = max(len(m.key(k)) for k in items) - - assert m.kvtable(items, keys=keys) == "\n".join( - f"{k: <{N}} : {items[k]}" for k in keys - ) - - -class Test_rule: - @pytest.mark.skipif(colorama is None, reason="colorama required") - def test_text(self) -> None: - assert m.rule("foo bar") == m.cyan("--- foo bar " + "-" * 68) - - @pytest.mark.skipif(colorama is None, reason="colorama required") - def test_char(self) -> None: - assert m.rule(char="a") == m.cyan("a" * 80) - - @pytest.mark.skipif(colorama is None, reason="colorama required") - def test_N(self) -> None: - assert m.rule(N=60) == m.cyan("-" * 60) - - @pytest.mark.skipif(colorama is None, reason="colorama required") - def test_N_with_text(self) -> None: - assert m.rule("foo bar", N=65) == m.cyan("--- foo bar " + "-" * 53) - - def test_text_plain(self, use_plain_text: UsePlainTextFixture) -> None: - assert m.rule("foo bar") == "--- foo bar " + "-" * 68 - - def test_char_plain(self, use_plain_text: UsePlainTextFixture) -> None: - assert m.rule(char="a") == "a" * 80 - - def test_N_plain(self, use_plain_text: UsePlainTextFixture) -> None: - assert m.rule(N=60) == "-" * 60 - - def test_N_with_text_plain( - self, use_plain_text: UsePlainTextFixture - ) -> None: - assert m.rule("foo 
bar", N=65) == "--- foo bar " + "-" * 53 - - -@pytest.mark.skipif(colorama is None, reason="colorama required") -@pytest.mark.parametrize("color", COLOR_FUNCS) -@pytest.mark.parametrize("style", STYLE_FUNCS) -def test_scrub(style: str, color: str) -> None: - cfunc = getattr(m, color) - sfunc = getattr(m, style) - - assert m.scrub(cfunc(sfunc("some text"))) == "some text" - assert m.scrub(sfunc(cfunc("some text"))) == "some text" - - -@pytest.mark.skipif(colorama is None, reason="colorama required") -@pytest.mark.parametrize("color", COLOR_FUNCS) -@pytest.mark.parametrize("style", STYLE_FUNCS) -def test_scrub_plain( - use_plain_text: UsePlainTextFixture, style: str, color: str -) -> None: - cfunc = getattr(m, color) - sfunc = getattr(m, style) - - assert m.scrub(cfunc(sfunc("some text"))) == "some text" - assert m.scrub(sfunc(cfunc("some text"))) == "some text" - - -@pytest.mark.skipif(colorama is None, reason="colorama required") -def test_section(use_plain_text: UsePlainTextFixture) -> None: - assert m.section("some section") == m.bright(m.white("some section")) - - -def test_section_plain(use_plain_text: UsePlainTextFixture) -> None: - assert m.section("some section") == "some section" - - -@pytest.mark.skipif(colorama is None, reason="colorama required") -def test_warn(use_plain_text: UsePlainTextFixture) -> None: - assert m.warn("some message") == m.magenta("WARNING: some message") - - -def test_warn_plain(use_plain_text: UsePlainTextFixture) -> None: - assert m.warn("some message") == "WARNING: some message" diff --git a/tests/unit/legate/driver/test_util.py b/tests/unit/legate/driver/test_util.py deleted file mode 100644 index a864ddc8c..000000000 --- a/tests/unit/legate/driver/test_util.py +++ /dev/null @@ -1,131 +0,0 @@ -# Copyright 2021-2022 NVIDIA Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# -from __future__ import annotations - -from dataclasses import dataclass -from pathlib import Path -from shlex import quote - -import pytest -from util import Capsys - -import legate.driver.util as m -from legate.driver.config import Config -from legate.driver.driver import Driver -from legate.driver.system import System -from legate.driver.ui import scrub - - -class Source: - foo = 10 - bar = 10.2 - baz = "test" - quux = ["a", "b", "c"] - extra = (1, 2, 3) - - -@dataclass(frozen=True) -class Target: - foo: int - bar: float - baz: str - quux: list[str] - - -def test_object_to_dataclass() -> None: - source = Source() - target = m.object_to_dataclass(source, Target) - - assert set(target.__dict__) == set(Target.__dataclass_fields__) - for k, v in target.__dict__.items(): - assert getattr(source, k) == v - - -class Test_print_verbose: - def test_system_only(self, capsys: Capsys) -> None: - system = System() - - m.print_verbose(system) - - out = scrub(capsys.readouterr()[0]).strip() - - assert out.startswith(f"{'--- Legion Python Configuration ':-<80}") - assert "Legate paths:" in out - for line in scrub(str(system.legate_paths)).split(): - assert line in out - - assert "Legion paths:" in out - for line in scrub(str(system.legion_paths)).split(): - assert line in out - - def test_system_and_driver(self, capsys: Capsys) -> None: - config = Config(["legate", "--no-replicate"]) - system = System() - driver = Driver(config, system) - - m.print_verbose(system, driver) - - out = scrub(capsys.readouterr()[0]).strip() - - assert out.startswith(f"{'--- Legion Python Configuration ':-<80}") - assert "Legate paths:" in out - for line in scrub(str(system.legate_paths)).split(): - assert line in out - - assert "Legion paths:" in out - for line in scrub(str(system.legion_paths)).split(): - assert line in out - - assert "Command:" in out - assert f" {' '.join(quote(t) for t in driver.cmd)}" in out - - assert "Customized Environment:" in out - for k in driver.custom_env_vars: - assert f"{k}={driver.env[k]}" in out - - assert out.endswith(f"\n{'-':-<80}") - - -HEADER_PATH = Path(__file__).parent / "sample_header.h" - - -def test_read_c_define_hit() -> None: - assert m.read_c_define(HEADER_PATH, "FOO") == "10" - assert m.read_c_define(HEADER_PATH, "BAR") == '"bar"' - - -def test_read_c_define_miss() -> None: - assert m.read_c_define(HEADER_PATH, "JUNK") is None - - -CMAKE_CACHE_PATH = Path(__file__).parent / "sample_cmake_cache.txt" - - -def test_read_cmake_cache_value_hit() -> None: - assert ( - m.read_cmake_cache_value(CMAKE_CACHE_PATH, "Legion_SOURCE_DIR:STATIC=") - == '"foo/bar"' - ) - assert ( - m.read_cmake_cache_value( - CMAKE_CACHE_PATH, "FIND_LEGATE_CORE_CPP:BOOL=OFF" - ) - == "OFF" - ) - - -def test_read_cmake_cache_value_miss() -> None: - with pytest.raises(RuntimeError): - assert m.read_cmake_cache_value(CMAKE_CACHE_PATH, "JUNK") is None diff --git a/tests/unit/legate/driver/util.py b/tests/unit/legate/driver/util.py index d91896977..fad7a9f76 100644 --- a/tests/unit/legate/driver/util.py +++ b/tests/unit/legate/driver/util.py @@ -14,26 +14,12 @@ # from __future__ import annotations -from itertools import chain, combinations -from typing import Any, Iterable, Iterator +from typing import Any -import pytest from typing_extensions import TypeAlias -Capsys: TypeAlias = pytest.CaptureFixture[str] - GenConfig: TypeAlias = Any GenSystem: TypeAlias = Any GenObjs: TypeAlias = Any - - -# ref: https://docs.python.org/3/library/itertools.html -def powerset(iterable: Iterable[Any]) -> Iterator[Any]: - s = 
list(iterable) - return chain.from_iterable(combinations(s, r) for r in range(len(s) + 1)) - - -def powerset_nonempty(iterable: Iterable[Any]) -> Iterator[Any]: - return (x for x in powerset(iterable) if len(x)) diff --git a/tests/unit/legate/test_rc.py b/tests/unit/legate/test_rc.py index d497163ef..74cea3092 100644 --- a/tests/unit/legate/test_rc.py +++ b/tests/unit/legate/test_rc.py @@ -14,7 +14,6 @@ # import sys -from dataclasses import dataclass from unittest.mock import MagicMock import pytest @@ -30,27 +29,28 @@ def mock_has_legion_context(monkeypatch: pytest.MonkeyPatch) -> MagicMock: class Test_check_legion: - def test_True(self, mock_has_legion_context) -> None: + def test_True(self, mock_has_legion_context: MagicMock) -> None: mock_has_legion_context.return_value = True - assert m.check_legion() is None + assert m.check_legion() is None # type: ignore[func-returns-value] - def test_True_with_msg(self, mock_has_legion_context) -> None: + def test_True_with_msg(self, mock_has_legion_context: MagicMock) -> None: mock_has_legion_context.return_value = True - assert m.check_legion(msg="custom") is None + assert m.check_legion(msg="custom") is None # type: ignore[func-returns-value] # noqa - def test_False(self, mock_has_legion_context) -> None: + def test_False(self, mock_has_legion_context: MagicMock) -> None: mock_has_legion_context.return_value = False with pytest.raises(RuntimeError) as e: m.check_legion() assert str(e) == m.LEGION_WARNING - def test_False_with_msg(self, mock_has_legion_context) -> None: + def test_False_with_msg(self, mock_has_legion_context: MagicMock) -> None: mock_has_legion_context.return_value = False with pytest.raises(RuntimeError) as e: m.check_legion(msg="custom") assert str(e) == "custom" +@pytest.mark.skip class Test_has_legion_context: def test_True(self) -> None: assert m.has_legion_context() is True @@ -62,113 +62,5 @@ def test_False(self) -> None: pass -@dataclass(frozen=True) -class _TestObj: - a: int = 10 - b: m.NotRequired[int] = m.Unset - c: m.NotRequired[str] = "foo" - d: m.NotRequired[str] = m.Unset - - -def test_entries() -> None: - assert set(m.entries(_TestObj())) == {("a", 10), ("c", "foo")} - - -class TestArgSpec: - def test_dest_required(self): - with pytest.raises(TypeError) as e: - m.ArgSpec() - assert ( - str(e.value) - == "__init__() missing 1 required positional argument: 'dest'" - ) - - def test_default(self): - spec = m.ArgSpec("dest") - assert spec.dest == "dest" - assert spec.action == "store_true" - - # all others are unset - assert set(m.entries(spec)) == { - ("dest", "dest"), - ("action", "store_true"), - } - - -class Test_parse_command_args: - @pytest.mark.parametrize("name", ("1foo", "a.b", "a/b", "a[", "a(")) - def test_bad_libname(self, name): - with pytest.raises(ValueError): - m.parse_command_args(name, []) - - def test_default_help(self, monkeypatch, capsys): - monkeypatch.setattr("sys.argv", ["app", "-foo:help"]) - with pytest.raises(SystemExit) as e: - m.parse_command_args("foo", []) - assert e.value.code is None - out, err = capsys.readouterr() - assert out.startswith("usage: ") - - def test_default_help_precedence(self, monkeypatch, capsys): - monkeypatch.setattr("sys.argv", ["app", "-foo:help", "-foo:bar"]) - args = [m.Argument("bar", m.ArgSpec(dest="help"))] - with pytest.raises(SystemExit) as e: - m.parse_command_args("foo", args) - assert e.value.code is None - out, err = capsys.readouterr() - assert out.startswith("usage: ") - - def test_help_override(self, monkeypatch, capsys): - 
monkeypatch.setattr("sys.argv", ["app", "-foo:help"]) - args = [m.Argument("help", m.ArgSpec(dest="help"))] - ns = m.parse_command_args("foo", args) - out, err = capsys.readouterr() - assert out == "" - assert vars(ns) == {"help": True} - assert sys.argv == ["app"] - - def test_basic(self, monkeypatch, capsys): - monkeypatch.setattr("sys.argv", ["app", "-foo:bar", "-foo:quux", "1"]) - args = [ - m.Argument("bar", m.ArgSpec(dest="bar")), - m.Argument( - "quux", m.ArgSpec(dest="quux", action="store", type=int) - ), - ] - ns = m.parse_command_args("foo", args) - out, err = capsys.readouterr() - assert out == "" - assert vars(ns) == {"bar": True, "quux": 1} - assert sys.argv == ["app"] - - def test_extra_args_passed_on(self, monkeypatch, capsys): - monkeypatch.setattr("sys.argv", ["app", "-foo:bar", "--extra", "1"]) - args = [m.Argument("bar", m.ArgSpec(dest="bar"))] - ns = m.parse_command_args("foo", args) - out, err = capsys.readouterr() - assert out == "" - assert vars(ns) == {"bar": True} - assert sys.argv == ["app", "--extra", "1"] - - def test_unrecognized_libname_arg(self, monkeypatch, capsys): - monkeypatch.setattr("sys.argv", ["app", "-foo:bar", "-foo:baz"]) - with pytest.warns(UserWarning) as record: - ns = m.parse_command_args("foo", []) - out, err = capsys.readouterr() - assert out == "" - assert vars(ns) == {} - assert sys.argv == ["app", "-foo:bar", "-foo:baz"] - - # issues one warning for the first encountered - assert len(record) == 1 - assert ( - record[0].message.args[0] - == "Unrecognized argument '-foo:bar' for foo (passed on as-is)" - ) - assert out == "" - assert vars(ns) == {} - assert sys.argv == ["app", "-foo:bar", "-foo:baz"] - - if __name__ == "__main__": sys.exit(pytest.main(sys.argv)) diff --git a/tests/unit/legate/tester/__init__.py b/tests/unit/legate/tester/__init__.py new file mode 100644 index 000000000..f0b271624 --- /dev/null +++ b/tests/unit/legate/tester/__init__.py @@ -0,0 +1,15 @@ +# Copyright 2022 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +from __future__ import annotations diff --git a/tests/unit/legate/tester/stages/__init__.py b/tests/unit/legate/tester/stages/__init__.py new file mode 100644 index 000000000..a955e39e0 --- /dev/null +++ b/tests/unit/legate/tester/stages/__init__.py @@ -0,0 +1,38 @@ +# Copyright 2022 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +from __future__ import annotations + +from typing import Any + +from legate.tester.test_system import TestSystem +from legate.util.types import CPUInfo, GPUInfo + + +class FakeSystem(TestSystem): + def __init__( + self, cpus: int = 6, gpus: int = 6, fbmem: int = 6 << 32, **kwargs: Any + ) -> None: + self._cpus = cpus + self._gpus = gpus + self._fbmem = fbmem + super().__init__(**kwargs) + + @property + def cpus(self) -> tuple[CPUInfo, ...]: + return tuple(CPUInfo((i,)) for i in range(self._cpus)) + + @property + def gpus(self) -> tuple[GPUInfo, ...]: + return tuple(GPUInfo(i, self._fbmem) for i in range(self._gpus)) diff --git a/tests/unit/legate/tester/stages/_linux/__init__.py b/tests/unit/legate/tester/stages/_linux/__init__.py new file mode 100644 index 000000000..345983919 --- /dev/null +++ b/tests/unit/legate/tester/stages/_linux/__init__.py @@ -0,0 +1,22 @@ +# Copyright 2022 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +from __future__ import annotations + +import sys + +import pytest + +if sys.platform != "linux": + pytestmark = pytest.mark.skip() diff --git a/tests/unit/legate/tester/stages/_linux/test_cpu.py b/tests/unit/legate/tester/stages/_linux/test_cpu.py new file mode 100644 index 000000000..24a4eef3d --- /dev/null +++ b/tests/unit/legate/tester/stages/_linux/test_cpu.py @@ -0,0 +1,132 @@ +# Copyright 2022 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +"""Consolidate test configuration from command-line and environment. + +""" +from __future__ import annotations + +import pytest + +from legate.tester.config import Config +from legate.tester.stages._linux import cpu as m +from legate.tester.stages.util import UNPIN_ENV + +from .. 
import FakeSystem + + +def test_default() -> None: + c = Config([]) + s = FakeSystem(cpus=12) + stage = m.CPU(c, s) + assert stage.kind == "cpus" + assert stage.args == ["-cunumeric:test"] + assert stage.env(c, s) == UNPIN_ENV + assert stage.spec.workers > 0 + + shard = (1, 2, 3) + assert "--cpu-bind" in stage.shard_args(shard, c) + + +def test_cpu_pin_strict() -> None: + c = Config(["test.py", "--cpu-pin", "strict"]) + s = FakeSystem(cpus=12) + stage = m.CPU(c, s) + assert stage.kind == "cpus" + assert stage.args == ["-cunumeric:test"] + assert stage.env(c, s) == {} + assert stage.spec.workers > 0 + + shard = (1, 2, 3) + assert "--cpu-bind" in stage.shard_args(shard, c) + + +def test_cpu_pin_none() -> None: + c = Config(["test.py", "--cpu-pin", "none"]) + s = FakeSystem(cpus=12) + stage = m.CPU(c, s) + assert stage.kind == "cpus" + assert stage.args == ["-cunumeric:test"] + assert stage.env(c, s) == UNPIN_ENV + assert stage.spec.workers > 0 + + shard = (1, 2, 3) + assert "--cpu-bind" not in stage.shard_args(shard, c) + + +@pytest.mark.parametrize("shard,expected", [[(2,), "2"], [(1, 2, 3), "1,2,3"]]) +def test_shard_args(shard: tuple[int, ...], expected: str) -> None: + c = Config([]) + s = FakeSystem() + stage = m.CPU(c, s) + result = stage.shard_args(shard, c) + assert result == ["--cpus", f"{c.cpus}", "--cpu-bind", expected] + + +def test_spec_with_cpus_1() -> None: + c = Config(["test.py", "--cpus", "1"]) + s = FakeSystem() + stage = m.CPU(c, s) + assert stage.spec.workers == 3 + assert stage.spec.shards == [(0, 1), (2, 3), (4, 5)] + + +def test_spec_with_cpus_2() -> None: + c = Config(["test.py", "--cpus", "2"]) + s = FakeSystem() + stage = m.CPU(c, s) + assert stage.spec.workers == 2 + assert stage.spec.shards == [(0, 1, 2), (3, 4, 5)] + + +def test_spec_with_utility() -> None: + c = Config(["test.py", "--cpus", "1", "--utility", "2"]) + s = FakeSystem() + stage = m.CPU(c, s) + assert stage.spec.workers == 2 + assert stage.spec.shards == [(0, 1, 2), (3, 4, 5)] + + +def test_spec_with_requested_workers() -> None: + c = Config(["test.py", "--cpus", "1", "-j", "2"]) + s = FakeSystem() + stage = m.CPU(c, s) + assert stage.spec.workers == 2 + assert stage.spec.shards == [(0, 1), (2, 3)] + + +def test_spec_with_requested_workers_zero() -> None: + s = FakeSystem() + c = Config(["test.py", "-j", "0"]) + assert c.requested_workers == 0 + with pytest.raises(RuntimeError): + m.CPU(c, s) + + +def test_spec_with_requested_workers_bad() -> None: + s = FakeSystem() + c = Config(["test.py", "-j", f"{len(s.cpus)+1}"]) + assert c.requested_workers > len(s.cpus) + with pytest.raises(RuntimeError): + m.CPU(c, s) + + +def test_spec_with_verbose() -> None: + args = ["test.py", "--cpus", "2"] + c = Config(args) + cv = Config(args + ["--verbose"]) + s = FakeSystem() + + spec, vspec = m.CPU(c, s).spec, m.CPU(cv, s).spec + assert vspec == spec diff --git a/tests/unit/legate/tester/stages/_linux/test_eager.py b/tests/unit/legate/tester/stages/_linux/test_eager.py new file mode 100644 index 000000000..eb8c48629 --- /dev/null +++ b/tests/unit/legate/tester/stages/_linux/test_eager.py @@ -0,0 +1,82 @@ +# Copyright 2022 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +"""Consolidate test configuration from command-line and environment. + +""" +from __future__ import annotations + +import pytest + +from legate.tester.config import Config +from legate.tester.stages._linux import eager as m + +from .. import FakeSystem + + +def test_default() -> None: + c = Config([]) + s = FakeSystem() + stage = m.Eager(c, s) + assert stage.kind == "eager" + assert stage.args == [] + assert stage.env(c, s) == { + "CUNUMERIC_MIN_CPU_CHUNK": "2000000000", + "CUNUMERIC_MIN_OMP_CHUNK": "2000000000", + "CUNUMERIC_MIN_GPU_CHUNK": "2000000000", + } + assert stage.spec.workers > 0 + + +@pytest.mark.parametrize("shard,expected", [[(2,), "2"], [(1, 2, 3), "1,2,3"]]) +def test_shard_args(shard: tuple[int, ...], expected: str) -> None: + c = Config([]) + s = FakeSystem() + stage = m.Eager(c, s) + result = stage.shard_args(shard, c) + assert result == ["--cpus", "1", "--cpu-bind", expected] + + +def test_spec() -> None: + c = Config([]) + s = FakeSystem() + stage = m.Eager(c, s) + assert stage.spec.workers == len(s.cpus) + # [cpu.ids for cpu in system.cpus] + assert stage.spec.shards == [(i,) for i in range(stage.spec.workers)] + + +def test_spec_with_requested_workers_zero() -> None: + s = FakeSystem() + c = Config(["test.py", "-j", "0"]) + assert c.requested_workers == 0 + with pytest.raises(RuntimeError): + m.Eager(c, s) + + +def test_spec_with_requested_workers_bad() -> None: + s = FakeSystem() + c = Config(["test.py", "-j", f"{len(s.cpus)+1}"]) + assert c.requested_workers > len(s.cpus) + with pytest.raises(RuntimeError): + m.Eager(c, s) + + +def test_spec_with_verbose() -> None: + c = Config(["test.py"]) + cv = Config(["test.py", "--verbose"]) + s = FakeSystem() + + spec, vspec = m.Eager(c, s).spec, m.Eager(cv, s).spec + assert vspec == spec diff --git a/tests/unit/legate/tester/stages/_linux/test_gpu.py b/tests/unit/legate/tester/stages/_linux/test_gpu.py new file mode 100644 index 000000000..df1441c65 --- /dev/null +++ b/tests/unit/legate/tester/stages/_linux/test_gpu.py @@ -0,0 +1,101 @@ +# Copyright 2022 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +"""Consolidate test configuration from command-line and environment. + +""" +from __future__ import annotations + +import pytest + +from legate.tester.config import Config +from legate.tester.stages._linux import gpu as m + +from .. 
import FakeSystem + + +def test_default() -> None: + c = Config([]) + s = FakeSystem() + stage = m.GPU(c, s) + assert stage.kind == "cuda" + assert stage.args == ["-cunumeric:test"] + assert stage.env(c, s) == {} + assert stage.spec.workers > 0 + + +@pytest.mark.parametrize("shard,expected", [[(2,), "2"], [(1, 2, 3), "1,2,3"]]) +def test_shard_args(shard: tuple[int, ...], expected: str) -> None: + c = Config([]) + s = FakeSystem() + stage = m.GPU(c, s) + result = stage.shard_args(shard, c) + assert result == [ + "--fbmem", + "4096", + "--gpus", + f"{len(shard)}", + "--gpu-bind", + expected, + ] + + +def test_spec_with_gpus_1() -> None: + c = Config(["test.py", "--gpus", "1"]) + s = FakeSystem() + stage = m.GPU(c, s) + assert stage.spec.workers == 12 + assert stage.spec.shards == [(0,), (1,), (2,), (3,), (4,), (5,)] * 12 + + +def test_spec_with_gpus_2() -> None: + c = Config(["test.py", "--gpus", "2"]) + s = FakeSystem() + stage = m.GPU(c, s) + assert stage.spec.workers == 6 + assert stage.spec.shards == [(0, 1), (2, 3), (4, 5)] * 6 + + +def test_spec_with_requested_workers() -> None: + c = Config(["test.py", "--gpus", "1", "-j", "2"]) + s = FakeSystem() + stage = m.GPU(c, s) + assert stage.spec.workers == 2 + assert stage.spec.shards == [(0,), (1,), (2,), (3,), (4,), (5,)] * 2 + + +def test_spec_with_requested_workers_zero() -> None: + s = FakeSystem() + c = Config(["test.py", "-j", "0"]) + assert c.requested_workers == 0 + with pytest.raises(RuntimeError): + m.GPU(c, s) + + +def test_spec_with_requested_workers_bad() -> None: + s = FakeSystem() + c = Config(["test.py", "-j", f"{len(s.gpus)+100}"]) + assert c.requested_workers > len(s.gpus) + with pytest.raises(RuntimeError): + m.GPU(c, s) + + +def test_spec_with_verbose() -> None: + args = ["test.py", "--gpus", "2"] + c = Config(args) + cv = Config(args + ["--verbose"]) + s = FakeSystem() + + spec, vspec = m.GPU(c, s).spec, m.GPU(cv, s).spec + assert vspec == spec diff --git a/tests/unit/legate/tester/stages/_linux/test_omp.py b/tests/unit/legate/tester/stages/_linux/test_omp.py new file mode 100644 index 000000000..a4d319fc0 --- /dev/null +++ b/tests/unit/legate/tester/stages/_linux/test_omp.py @@ -0,0 +1,164 @@ +# Copyright 2022 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +"""Consolidate test configuration from command-line and environment. + +""" +from __future__ import annotations + +import pytest + +from legate.tester.config import Config +from legate.tester.stages._linux import omp as m +from legate.tester.stages.util import UNPIN_ENV + +from .. 
import FakeSystem + + +def test_default() -> None: + c = Config([]) + s = FakeSystem(cpus=12) + stage = m.OMP(c, s) + assert stage.kind == "openmp" + assert stage.args == ["-cunumeric:test"] + assert stage.env(c, s) == UNPIN_ENV + assert stage.spec.workers > 0 + + shard = (1, 2, 3) + assert "--cpu-bind" in stage.shard_args(shard, c) + + +def test_cpu_pin_strict() -> None: + c = Config(["test.py", "--cpu-pin", "strict"]) + s = FakeSystem(cpus=12) + stage = m.OMP(c, s) + assert stage.kind == "openmp" + assert stage.args == ["-cunumeric:test"] + assert stage.env(c, s) == {} + assert stage.spec.workers > 0 + + shard = (1, 2, 3) + assert "--cpu-bind" in stage.shard_args(shard, c) + + +def test_cpu_pin_none() -> None: + c = Config(["test.py", "--cpu-pin", "none"]) + s = FakeSystem(cpus=12) + stage = m.OMP(c, s) + assert stage.kind == "openmp" + assert stage.args == ["-cunumeric:test"] + assert stage.env(c, s) == UNPIN_ENV + assert stage.spec.workers > 0 + + shard = (1, 2, 3) + assert "--cpu-bind" not in stage.shard_args(shard, c) + + +@pytest.mark.parametrize("shard,expected", [[(2,), "2"], [(1, 2, 3), "1,2,3"]]) +def test_shard_args(shard: tuple[int, ...], expected: str) -> None: + c = Config([]) + s = FakeSystem(cpus=12) + stage = m.OMP(c, s) + result = stage.shard_args(shard, c) + assert result == [ + "--omps", + f"{c.omps}", + "--ompthreads", + f"{c.ompthreads}", + "--cpu-bind", + expected, + ] + + +def test_spec_with_omps_1_threads_1() -> None: + c = Config(["test.py", "--omps", "1", "--ompthreads", "1"]) + s = FakeSystem(cpus=12) + stage = m.OMP(c, s) + assert stage.spec.workers == 6 + assert stage.spec.shards == [ + (0, 1), + (2, 3), + (4, 5), + (6, 7), + (8, 9), + (10, 11), + ] + + +def test_spec_with_omps_1_threads_2() -> None: + c = Config(["test.py", "--omps", "1", "--ompthreads", "2"]) + s = FakeSystem(cpus=12) + stage = m.OMP(c, s) + assert stage.spec.workers == 4 + assert stage.spec.shards == [(0, 1, 2), (3, 4, 5), (6, 7, 8), (9, 10, 11)] + + +def test_spec_with_omps_2_threads_1() -> None: + c = Config(["test.py", "--omps", "2", "--ompthreads", "1"]) + s = FakeSystem(cpus=12) + stage = m.OMP(c, s) + assert stage.spec.workers == 4 + assert stage.spec.shards == [(0, 1, 2), (3, 4, 5), (6, 7, 8), (9, 10, 11)] + + +def test_spec_with_omps_2_threads_2() -> None: + c = Config(["test.py", "--omps", "2", "--ompthreads", "2"]) + s = FakeSystem(cpus=12) + stage = m.OMP(c, s) + assert stage.spec.workers == 2 + assert stage.spec.shards == [(0, 1, 2, 3, 4), (5, 6, 7, 8, 9)] + + +def test_spec_with_utility() -> None: + c = Config( + ["test.py", "--omps", "2", "--ompthreads", "2", "--utility", "3"] + ) + s = FakeSystem(cpus=12) + stage = m.OMP(c, s) + assert stage.spec.workers == 1 + assert stage.spec.shards == [(0, 1, 2, 3, 4, 5, 6)] + + +def test_spec_with_requested_workers() -> None: + c = Config(["test.py", "--omps", "1", "--ompthreads", "1", "-j", "2"]) + s = FakeSystem(cpus=12) + stage = m.OMP(c, s) + assert stage.spec.workers == 2 + assert stage.spec.shards == [(0, 1), (2, 3)] + + +def test_spec_with_requested_workers_zero() -> None: + s = FakeSystem(cpus=12) + c = Config(["test.py", "-j", "0"]) + assert c.requested_workers == 0 + with pytest.raises(RuntimeError): + m.OMP(c, s) + + +def test_spec_with_requested_workers_bad() -> None: + s = FakeSystem(cpus=12) + c = Config(["test.py", "-j", f"{len(s.cpus)+1}"]) + assert c.requested_workers > len(s.cpus) + with pytest.raises(RuntimeError): + m.OMP(c, s) + + +def test_spec_with_verbose() -> None: + args = ["test.py", "--cpus", "2"] + c = 
Config(args) + cv = Config(args + ["--verbose"]) + s = FakeSystem(cpus=12) + + spec, vspec = m.OMP(c, s).spec, m.OMP(cv, s).spec + assert vspec == spec diff --git a/tests/unit/legate/tester/stages/test_test_stage.py b/tests/unit/legate/tester/stages/test_test_stage.py new file mode 100644 index 000000000..90edfaed4 --- /dev/null +++ b/tests/unit/legate/tester/stages/test_test_stage.py @@ -0,0 +1,88 @@ +# Copyright 2022 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +"""Consolidate test configuration from command-line and environment. + +""" +from __future__ import annotations + +from datetime import timedelta +from pathlib import Path + +from legate.tester import FeatureType +from legate.tester.config import Config +from legate.tester.stages import test_stage as m +from legate.tester.stages.util import StageResult, StageSpec +from legate.tester.test_system import ProcessResult, TestSystem as _TestSystem + +from . import FakeSystem + +s = FakeSystem() + + +class MockTestStage(m.TestStage): + + kind: FeatureType = "eager" + + name = "mock" + + args = ["-foo", "-bar"] + + def __init__(self, config: Config, system: _TestSystem) -> None: + self._init(config, system) + + def compute_spec(self, config: Config, system: _TestSystem) -> StageSpec: + return StageSpec(2, [(0,), (1,), (2,)]) + + +class TestTestStage: + def test_name(self) -> None: + c = Config([]) + stage = MockTestStage(c, s) + assert stage.name == "mock" + + def test_intro(self) -> None: + c = Config([]) + stage = MockTestStage(c, s) + assert "Entering stage: mock" in stage.intro + + def test_outro(self) -> None: + c = Config([]) + stage = MockTestStage(c, s) + stage.result = StageResult( + [ProcessResult("invoke", Path("test/file"))], + timedelta(seconds=2.123), + ) + outro = stage.outro + assert "Exiting stage: mock" in outro + assert "Passed 1 of 1 tests (100.0%)" in outro + assert "2.123" in outro + + def test_file_args_default(self) -> None: + c = Config([]) + stage = MockTestStage(c, s) + assert stage.file_args(Path("integration/foo"), c) == [] + assert stage.file_args(Path("unit/foo"), c) == [] + + def test_file_args_v(self) -> None: + c = Config(["test.py", "-v"]) + stage = MockTestStage(c, s) + assert stage.file_args(Path("integration/foo"), c) == ["-v"] + assert stage.file_args(Path("unit/foo"), c) == [] + + def test_file_args_vv(self) -> None: + c = Config(["test.py", "-vv"]) + stage = MockTestStage(c, s) + assert stage.file_args(Path("integration/foo"), c) == ["-v", "-s"] + assert stage.file_args(Path("unit/foo"), c) == [] diff --git a/tests/unit/legate/tester/stages/test_util.py b/tests/unit/legate/tester/stages/test_util.py new file mode 100644 index 000000000..b4c528d06 --- /dev/null +++ b/tests/unit/legate/tester/stages/test_util.py @@ -0,0 +1,48 @@ +# Copyright 2022 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +"""Consolidate test configuration from command-line and environment. + +""" +from __future__ import annotations + +import pytest + +from legate.tester.stages import util as m + + +class Test_adjust_workers: + @pytest.mark.parametrize("n", (1, 5, 100)) + def test_None_requested(self, n: int) -> None: + assert m.adjust_workers(n, None) == n + + @pytest.mark.parametrize("n", (1, 2, 9)) + def test_requested(self, n: int) -> None: + assert m.adjust_workers(10, n) == n + + def test_negative_requested(self) -> None: + with pytest.raises(ValueError): + assert m.adjust_workers(10, -1) + + def test_zero_requested(self) -> None: + with pytest.raises(RuntimeError): + assert m.adjust_workers(10, 0) + + def test_zero_computed(self) -> None: + with pytest.raises(RuntimeError): + assert m.adjust_workers(0, None) + + def test_requested_too_large(self) -> None: + with pytest.raises(RuntimeError): + assert m.adjust_workers(10, 11) diff --git a/tests/unit/legate/tester/test___init__.py b/tests/unit/legate/tester/test___init__.py new file mode 100644 index 000000000..6431469ff --- /dev/null +++ b/tests/unit/legate/tester/test___init__.py @@ -0,0 +1,69 @@ +# Copyright 2022 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +"""Consolidate test configuration from command-line and environment. 
+ +""" +from __future__ import annotations + +from legate.tester import ( + DEFAULT_CPUS_PER_NODE, + DEFAULT_GPU_DELAY, + DEFAULT_GPU_MEMORY_BUDGET, + DEFAULT_GPUS_PER_NODE, + DEFAULT_OMPS_PER_NODE, + DEFAULT_OMPTHREADS, + DEFAULT_PROCESS_ENV, + FEATURES, + PER_FILE_ARGS, + SKIPPED_EXAMPLES, +) + + +class TestConsts: + def test_DEFAULT_CPUS_PER_NODE(self) -> None: + assert DEFAULT_CPUS_PER_NODE == 4 + + def test_DEFAULT_GPUS_PER_NODE(self) -> None: + assert DEFAULT_GPUS_PER_NODE == 1 + + def test_DEFAULT_GPU_DELAY(self) -> None: + assert DEFAULT_GPU_DELAY == 2000 + + def test_DEFAULT_GPU_MEMORY_BUDGET(self) -> None: + assert DEFAULT_GPU_MEMORY_BUDGET == 4096 + + def test_DEFAULT_OMPS_PER_NODE(self) -> None: + assert DEFAULT_OMPS_PER_NODE == 1 + + def test_DEFAULT_OMPTHREADS(self) -> None: + assert DEFAULT_OMPTHREADS == 4 + + def test_DEFAULT_PROCESS_ENV(self) -> None: + assert DEFAULT_PROCESS_ENV == { + "LEGATE_TEST": "1", + } + + def test_FEATURES(self) -> None: + assert FEATURES == ("cpus", "cuda", "eager", "openmp") + + def test_SKIPPED_EXAMPLES(self) -> None: + assert isinstance(SKIPPED_EXAMPLES, set) + assert all(isinstance(x, str) for x in SKIPPED_EXAMPLES) + assert all(x.startswith("examples") for x in SKIPPED_EXAMPLES) + + def test_PER_FILE_ARGS(self) -> None: + assert isinstance(PER_FILE_ARGS, dict) + assert all(isinstance(x, str) for x in PER_FILE_ARGS.keys()) + assert all(isinstance(x, list) for x in PER_FILE_ARGS.values()) diff --git a/tests/unit/legate/tester/test_args.py b/tests/unit/legate/tester/test_args.py new file mode 100644 index 000000000..c307a7080 --- /dev/null +++ b/tests/unit/legate/tester/test_args.py @@ -0,0 +1,89 @@ +# Copyright 2022 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +"""Consolidate test configuration from command-line and environment. 
+
+"""
+from __future__ import annotations
+
+from legate.tester import (
+    DEFAULT_CPUS_PER_NODE,
+    DEFAULT_GPU_DELAY,
+    DEFAULT_GPU_MEMORY_BUDGET,
+    DEFAULT_GPUS_PER_NODE,
+    DEFAULT_OMPS_PER_NODE,
+    DEFAULT_OMPTHREADS,
+    args as m,
+)
+
+
+class TestParserDefaults:
+    def test_features(self) -> None:
+        assert m.parser.get_default("features") is None
+
+    def test_files(self) -> None:
+        assert m.parser.get_default("files") is None
+
+    def test_unit(self) -> None:
+        assert m.parser.get_default("unit") is False
+
+    def test_cpus(self) -> None:
+        assert m.parser.get_default("cpus") == DEFAULT_CPUS_PER_NODE
+
+    def test_gpus(self) -> None:
+        assert m.parser.get_default("gpus") == DEFAULT_GPUS_PER_NODE
+
+    def test_cpu_pin(self) -> None:
+        assert m.parser.get_default("cpu_pin") == "partial"
+
+    def test_gpu_delay(self) -> None:
+        assert m.parser.get_default("gpu_delay") == DEFAULT_GPU_DELAY
+
+    def test_fbmem(self) -> None:
+        assert m.parser.get_default("fbmem") == DEFAULT_GPU_MEMORY_BUDGET
+
+    def test_omps(self) -> None:
+        assert m.parser.get_default("omps") == DEFAULT_OMPS_PER_NODE
+
+    def test_ompthreads(self) -> None:
+        assert m.parser.get_default("ompthreads") == DEFAULT_OMPTHREADS
+
+    def test_legate_dir(self) -> None:
+        assert m.parser.get_default("legate_dir") is None
+
+    def test_test_root(self) -> None:
+        assert m.parser.get_default("test_root") is None
+
+    def test_workers(self) -> None:
+        assert m.parser.get_default("workers") is None
+
+    def test_verbose(self) -> None:
+        assert m.parser.get_default("verbose") == 0
+
+    def test_dry_run(self) -> None:
+        assert m.parser.get_default("dry_run") is False
+
+    def test_debug(self) -> None:
+        assert m.parser.get_default("debug") is False
+
+
+class TestParserConfig:
+    def test_parser_epilog(self) -> None:
+        assert (
+            m.parser.epilog
+            == "Any extra arguments will be forwarded to the Legate script"
+        )
+
+    def test_parser_description(self) -> None:
+        assert m.parser.description == "Run the Cunumeric test suite"
diff --git a/tests/unit/legate/tester/test_config.py b/tests/unit/legate/tester/test_config.py
new file mode 100644
index 000000000..d55104980
--- /dev/null
+++ b/tests/unit/legate/tester/test_config.py
@@ -0,0 +1,182 @@
+# Copyright 2022 NVIDIA Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+"""Consolidate test configuration from command-line and environment.
+ +""" +from __future__ import annotations + +from pathlib import Path, PurePath + +import pytest + +from legate.tester import ( + DEFAULT_CPUS_PER_NODE, + DEFAULT_GPU_DELAY, + DEFAULT_GPU_MEMORY_BUDGET, + DEFAULT_GPUS_PER_NODE, + DEFAULT_OMPS_PER_NODE, + DEFAULT_OMPTHREADS, + FEATURES, + config as m, +) +from legate.tester.args import PIN_OPTIONS, PinOptionsType + + +class TestConfig: + def test_default_init(self) -> None: + c = m.Config([]) + + assert c.examples is True + assert c.integration is True + assert c.unit is False + assert c.files is None + + assert c.features == ("cpus",) + + assert c.cpus == DEFAULT_CPUS_PER_NODE + assert c.gpus == DEFAULT_GPUS_PER_NODE + assert c.cpu_pin == "partial" + assert c.gpu_delay == DEFAULT_GPU_DELAY + assert c.fbmem == DEFAULT_GPU_MEMORY_BUDGET + assert c.omps == DEFAULT_OMPS_PER_NODE + assert c.ompthreads == DEFAULT_OMPTHREADS + + assert c.debug is False + assert c.dry_run is False + assert c.verbose == 0 + assert c.test_root is None + assert c.requested_workers is None + assert c.legate_dir is None + + assert c.extra_args == [] + assert c.root_dir == PurePath(m.__file__).parents[2] + + # TODO (bv) restore when generalized + # assert len(c.test_files) > 0 + # assert any("examples" in str(x) for x in c.test_files) + # assert any("integration" in str(x) for x in c.test_files) + # assert all("unit" not in str(x) for x in c.test_files) + + assert c.legate_path == "legate" + + @pytest.mark.parametrize("feature", FEATURES) + def test_env_features( + self, monkeypatch: pytest.MonkeyPatch, feature: str + ) -> None: + monkeypatch.setenv(f"USE_{feature.upper()}", "1") + + # test default config + c = m.Config([]) + assert set(c.features) == {feature} + + # also test with a --use value provided + c = m.Config(["test.py", "--use", "cuda"]) + assert set(c.features) == {"cuda"} + + @pytest.mark.parametrize("feature", FEATURES) + def test_cmd_features(self, feature: str) -> None: + + # test a single value + c = m.Config(["test.py", "--use", feature]) + assert set(c.features) == {feature} + + # also test with multiple / duplication + c = m.Config(["test.py", "--use", f"cpus,{feature}"]) + assert set(c.features) == {"cpus", feature} + + # TODO (bv) restore when generalized + @pytest.mark.skip + def test_unit(self) -> None: + c = m.Config(["test.py", "--unit"]) + assert len(c.test_files) > 0 + assert any("examples" in str(x) for x in c.test_files) + assert any("integration" in str(x) for x in c.test_files) + assert any("unit" in str(x) for x in c.test_files) + + def test_files(self) -> None: + c = m.Config(["test.py", "--files", "a", "b", "c"]) + assert c.files == ["a", "b", "c"] + + @pytest.mark.parametrize( + "opt", ("cpus", "gpus", "gpu-delay", "fbmem", "omps", "ompthreads") + ) + def test_feature_options(self, opt: str) -> None: + c = m.Config(["test.py", f"--{opt}", "1234"]) + assert getattr(c, opt.replace("-", "_")) == 1234 + + @pytest.mark.parametrize("value", PIN_OPTIONS) + def test_cpu_pin(self, value: PinOptionsType) -> None: + c = m.Config(["test.py", "--cpu-pin", value]) + assert c.cpu_pin == value + + def test_workers(self) -> None: + c = m.Config(["test.py", "-j", "1234"]) + assert c.requested_workers == 1234 + + def test_debug(self) -> None: + c = m.Config(["test.py", "--debug"]) + assert c.debug is True + + def test_dry_run(self) -> None: + c = m.Config(["test.py", "--dry-run"]) + assert c.dry_run is True + + @pytest.mark.parametrize("arg", ("-v", "--verbose")) + def test_verbose1(self, arg: str) -> None: + c = m.Config(["test.py", arg]) + assert 
c.verbose == 1 + + def test_verbose2(self) -> None: + c = m.Config(["test.py", "-vv"]) + assert c.verbose == 2 + + @pytest.mark.parametrize("arg", ("-C", "--directory")) + def test_test_root(self, arg: str) -> None: + c = m.Config(["test.py", arg, "some/path"]) + assert c.test_root == "some/path" + + def test_legate_dir(self) -> None: + c = m.Config([]) + assert c.legate_dir is None + assert c.legate_path == "legate" + assert c._legate_source == "install" + + def test_cmd_legate_dir_good(self) -> None: + legate_dir = Path("/usr/local") + c = m.Config(["test.py", "--legate", str(legate_dir)]) + assert c.legate_dir == legate_dir + assert c.legate_path == str(legate_dir / "bin" / "legate") + assert c._legate_source == "cmd" + + def test_env_legate_dir_good( + self, monkeypatch: pytest.MonkeyPatch + ) -> None: + legate_dir = Path("/usr/local") + monkeypatch.setenv("LEGATE_DIR", str(legate_dir)) + c = m.Config([]) + assert c.legate_dir == legate_dir + assert c.legate_path == str(legate_dir / "bin" / "legate") + assert c._legate_source == "env" + + def test_extra_args(self) -> None: + extra = ["-foo", "--bar", "--baz", "10"] + c = m.Config(["test.py"] + extra) + assert c.extra_args == extra + + # also test with --files since that option collects arguments + c = m.Config(["test.py", "--files", "a", "b"] + extra) + assert c.extra_args == extra + c = m.Config(["test.py"] + extra + ["--files", "a", "b"]) + assert c.extra_args == extra diff --git a/tests/unit/legate/tester/test_logger.py b/tests/unit/legate/tester/test_logger.py new file mode 100644 index 000000000..40228c2f4 --- /dev/null +++ b/tests/unit/legate/tester/test_logger.py @@ -0,0 +1,74 @@ +# Copyright 2022 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +"""Consolidate test configuration from command-line and environment. 
+ +""" +from __future__ import annotations + +from legate.tester import logger as m + +TEST_LINES = ( + "line 1", + "\x1b[31mfoo\x1b[0m", # ui.red("foo") + "bar", + "last line", +) + + +class TestLogger: + def test_init(self) -> None: + log = m.Log() + assert log.lines == () + assert log.dump() == "" + + def test_record_lines(self) -> None: + log = m.Log() + log.record(*TEST_LINES) + assert log.lines == TEST_LINES + assert log.dump(filter_ansi=False) == "\n".join(TEST_LINES) + + def test_record_line_with_newlines(self) -> None: + log = m.Log() + log.record("\n".join(TEST_LINES)) + assert log.lines == TEST_LINES + assert log.dump(filter_ansi=False) == "\n".join(TEST_LINES) + + def test_call(self) -> None: + log = m.Log() + log(*TEST_LINES) + assert log.lines == TEST_LINES + assert log.dump() == "line 1\nfoo\nbar\nlast line" + + def test_dump_filter(self) -> None: + log = m.Log() + log.record(*TEST_LINES) + assert log.lines == TEST_LINES + assert log.dump() == "line 1\nfoo\nbar\nlast line" + + def test_dump_index(self) -> None: + log = m.Log() + log.record(*TEST_LINES) + assert log.dump(start=1, end=3) == "foo\nbar" + + def test_clear(self) -> None: + log = m.Log() + log.record(*TEST_LINES) + assert len(log.lines) > 0 + log.clear() + assert len(log.lines) == 0 + + +def test_LOG() -> None: + assert isinstance(m.LOG, m.Log) diff --git a/tests/unit/legate/tester/test_test_system.py b/tests/unit/legate/tester/test_test_system.py new file mode 100644 index 000000000..268a6a32f --- /dev/null +++ b/tests/unit/legate/tester/test_test_system.py @@ -0,0 +1,65 @@ +# Copyright 2022 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +"""Consolidate test configuration from command-line and environment. 
+ +""" +from __future__ import annotations + +from pathlib import Path +from subprocess import CompletedProcess +from unittest.mock import MagicMock + +import pytest +from pytest_mock import MockerFixture + +from legate.tester import test_system as m + + +@pytest.fixture +def mock_subprocess_run(mocker: MockerFixture) -> MagicMock: + return mocker.patch.object(m, "stdlib_run") + + +CMD = "legate script.py --cpus 4" + + +class TestSystem: + def test_init(self) -> None: + s = m.TestSystem() + assert s.dry_run is False + + def test_run(self, mock_subprocess_run: MagicMock) -> None: + s = m.TestSystem() + + expected = m.ProcessResult( + CMD, Path("test/file"), returncode=10, output="" + ) + mock_subprocess_run.return_value = CompletedProcess( + CMD, 10, stdout="" + ) + + result = s.run(CMD.split(), Path("test/file")) + mock_subprocess_run.assert_called() + + assert result == expected + + def test_dry_run(self, mock_subprocess_run: MagicMock) -> None: + s = m.TestSystem(dry_run=True) + + result = s.run(CMD.split(), Path("test/file")) + mock_subprocess_run.assert_not_called() + + assert result.output == "" + assert result.skipped diff --git a/tests/unit/legate/util/__init__.py b/tests/unit/legate/util/__init__.py new file mode 100644 index 000000000..f0b271624 --- /dev/null +++ b/tests/unit/legate/util/__init__.py @@ -0,0 +1,15 @@ +# Copyright 2022 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +from __future__ import annotations diff --git a/tests/unit/legate/driver/sample_cmake_cache.txt b/tests/unit/legate/util/sample_cmake_cache.txt similarity index 100% rename from tests/unit/legate/driver/sample_cmake_cache.txt rename to tests/unit/legate/util/sample_cmake_cache.txt diff --git a/tests/unit/legate/driver/sample_header.h b/tests/unit/legate/util/sample_header.h similarity index 100% rename from tests/unit/legate/driver/sample_header.h rename to tests/unit/legate/util/sample_header.h diff --git a/tests/unit/legate/util/test_args.py b/tests/unit/legate/util/test_args.py new file mode 100644 index 000000000..02d01a58c --- /dev/null +++ b/tests/unit/legate/util/test_args.py @@ -0,0 +1,187 @@ +# Copyright 2021-2022 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +import sys +from argparse import ArgumentParser +from dataclasses import dataclass +from typing import Iterable, TypeVar + +import pytest + +import legate.util.args as m + +from ...util import Capsys, powerset + +T = TypeVar("T") + + +class TestMultipleChoices: + @pytest.mark.parametrize("choices", ([1, 2, 3], range(4), ("a", "b"))) + def test_init(self, choices: Iterable[T]) -> None: + mc = m.MultipleChoices(choices) + assert mc._choices == set(choices) + + def test_contains_item(self) -> None: + choices = [1, 2, 3] + mc = m.MultipleChoices(choices) + for item in choices: + assert item in mc + + def test_contains_subset(self) -> None: + choices = [1, 2, 3] + mc = m.MultipleChoices(choices) + for subset in powerset(choices): + assert subset in mc + + def test_iter(self) -> None: + choices = [1, 2, 3] + mc = m.MultipleChoices(choices) + assert list(mc) == choices + + +class TestExtendAction: + parser = ArgumentParser() + parser.add_argument( + "--foo", dest="foo", action=m.ExtendAction, choices=("a", "b", "c") + ) + + def test_single(self) -> None: + ns = self.parser.parse_args(["--foo", "a"]) + assert ns.foo == ["a"] + + def test_multi(self) -> None: + ns = self.parser.parse_args(["--foo", "a", "--foo", "b"]) + assert sorted(ns.foo) == ["a", "b"] + + def test_repeat(self) -> None: + ns = self.parser.parse_args(["--foo", "a", "--foo", "a"]) + assert ns.foo == ["a"] + + +@dataclass(frozen=True) +class _TestObj: + a: int = 10 + b: m.NotRequired[int] = m.Unset + c: m.NotRequired[str] = "foo" + d: m.NotRequired[str] = m.Unset + + +class TestArgSpec: + def test_default(self) -> None: + spec = m.ArgSpec("dest") + assert spec.dest == "dest" + assert spec.action == "store_true" + + # all others are unset + assert set(m.entries(spec)) == { + ("dest", "dest"), + ("action", "store_true"), + } + + +def test_entries() -> None: + assert set(m.entries(_TestObj())) == {("a", 10), ("c", "foo")} + + +class Test_parse_library_command_args: + @pytest.mark.parametrize("name", ("1foo", "a.b", "a/b", "a[", "a(")) + def test_bad_libname(self, name: str) -> None: + with pytest.raises(ValueError): + m.parse_library_command_args(name, []) + + def test_default_help( + self, monkeypatch: pytest.MonkeyPatch, capsys: Capsys + ) -> None: + monkeypatch.setattr("sys.argv", ["app", "-foo:help"]) + with pytest.raises(SystemExit) as e: + m.parse_library_command_args("foo", []) + assert e.value.code is None + out, err = capsys.readouterr() # type: ignore[unreachable] + assert out.startswith("usage: ") + + def test_default_help_precedence( + self, monkeypatch: pytest.MonkeyPatch, capsys: Capsys + ) -> None: + monkeypatch.setattr("sys.argv", ["app", "-foo:help", "-foo:bar"]) + args = [m.Argument("bar", m.ArgSpec(dest="help"))] + with pytest.raises(SystemExit) as e: + m.parse_library_command_args("foo", args) + assert e.value.code is None + out, err = capsys.readouterr() # type: ignore[unreachable] + assert out.startswith("usage: ") + + def test_help_override( + self, monkeypatch: pytest.MonkeyPatch, capsys: Capsys + ) -> None: + monkeypatch.setattr("sys.argv", ["app", "-foo:help"]) + args = [m.Argument("help", m.ArgSpec(dest="help"))] + ns = m.parse_library_command_args("foo", args) + out, err = capsys.readouterr() + assert out == "" + assert vars(ns) == {"help": True} + assert sys.argv == ["app"] + + def test_basic( + self, monkeypatch: pytest.MonkeyPatch, capsys: Capsys + ) -> None: + monkeypatch.setattr("sys.argv", ["app", "-foo:bar", "-foo:quux", "1"]) + args = [ + m.Argument("bar", m.ArgSpec(dest="bar")), + m.Argument( 
+ "quux", m.ArgSpec(dest="quux", action="store", type=int) + ), + ] + ns = m.parse_library_command_args("foo", args) + out, err = capsys.readouterr() + assert out == "" + assert vars(ns) == {"bar": True, "quux": 1} + assert sys.argv == ["app"] + + def test_extra_args_passed_on( + self, monkeypatch: pytest.MonkeyPatch, capsys: Capsys + ) -> None: + monkeypatch.setattr("sys.argv", ["app", "-foo:bar", "--extra", "1"]) + args = [m.Argument("bar", m.ArgSpec(dest="bar"))] + ns = m.parse_library_command_args("foo", args) + out, err = capsys.readouterr() + assert out == "" + assert vars(ns) == {"bar": True} + assert sys.argv == ["app", "--extra", "1"] + + def test_unrecognized_libname_arg( + self, monkeypatch: pytest.MonkeyPatch, capsys: Capsys + ) -> None: + monkeypatch.setattr("sys.argv", ["app", "-foo:bar", "-foo:baz"]) + with pytest.warns(UserWarning) as record: + ns = m.parse_library_command_args("foo", []) + out, err = capsys.readouterr() + assert out == "" + assert vars(ns) == {} + assert sys.argv == ["app", "-foo:bar", "-foo:baz"] + + # issues one warning for the first encountered + assert len(record) == 1 + assert isinstance(record[0].message, Warning) + assert ( + record[0].message.args[0] + == "Unrecognized argument '-foo:bar' for foo (passed on as-is)" + ) + assert out == "" + assert vars(ns) == {} + assert sys.argv == ["app", "-foo:bar", "-foo:baz"] + + +if __name__ == "__main__": + sys.exit(pytest.main(sys.argv)) diff --git a/tests/unit/legate/util/test_colors.py b/tests/unit/legate/util/test_colors.py new file mode 100644 index 000000000..873f3dc53 --- /dev/null +++ b/tests/unit/legate/util/test_colors.py @@ -0,0 +1,103 @@ +# Copyright 2021-2022 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +from __future__ import annotations + +from typing import Any + +import pytest +from pytest_mock import MockerFixture +from typing_extensions import TypeAlias + +import legate.util.colors as m + +try: + import colorama # type: ignore +except ImportError: + colorama = None + +UsePlainTextFixture: TypeAlias = Any + + +@pytest.fixture +def use_plain_text(mocker: MockerFixture) -> None: + mocker.patch.object(m, "bright", m._text) + mocker.patch.object(m, "dim", m._text) + mocker.patch.object(m, "white", m._text) + mocker.patch.object(m, "cyan", m._text) + mocker.patch.object(m, "red", m._text) + mocker.patch.object(m, "green", m._text) + mocker.patch.object(m, "yellow", m._text) + mocker.patch.object(m, "magenta", m._text) + + +COLOR_FUNCS = ( + "cyan", + "green", + "magenta", + "red", + "white", + "yellow", +) + +STYLE_FUNCS = ( + "bright", + "dim", +) + + +@pytest.mark.skipif(colorama is None, reason="colorama required") +@pytest.mark.parametrize("color", COLOR_FUNCS) +def test_color_functions(color: str) -> None: + cfunc = getattr(m, color) + cprop = getattr(colorama.Fore, color.upper()) + + out = cfunc("some text") + + assert out == f"{cprop}some text{colorama.Style.RESET_ALL}" + + +@pytest.mark.skipif(colorama is None, reason="colorama required") +@pytest.mark.parametrize("style", STYLE_FUNCS) +def test_style_functions(style: str) -> None: + sfunc = getattr(m, style) + sprop = getattr(colorama.Style, style.upper()) + + out = sfunc("some text") + + assert out == f"{sprop}some text{colorama.Style.RESET_ALL}" + + +@pytest.mark.skipif(colorama is None, reason="colorama required") +@pytest.mark.parametrize("color", COLOR_FUNCS) +@pytest.mark.parametrize("style", STYLE_FUNCS) +def test_scrub(style: str, color: str) -> None: + cfunc = getattr(m, color) + sfunc = getattr(m, style) + + assert m.scrub(cfunc(sfunc("some text"))) == "some text" + assert m.scrub(sfunc(cfunc("some text"))) == "some text" + + +@pytest.mark.skipif(colorama is None, reason="colorama required") +@pytest.mark.parametrize("color", COLOR_FUNCS) +@pytest.mark.parametrize("style", STYLE_FUNCS) +def test_scrub_plain( + use_plain_text: UsePlainTextFixture, style: str, color: str +) -> None: + cfunc = getattr(m, color) + sfunc = getattr(m, style) + + assert m.scrub(cfunc(sfunc("some text"))) == "some text" + assert m.scrub(sfunc(cfunc("some text"))) == "some text" diff --git a/tests/unit/legate/util/test_fs.py b/tests/unit/legate/util/test_fs.py new file mode 100644 index 000000000..32cd452b3 --- /dev/null +++ b/tests/unit/legate/util/test_fs.py @@ -0,0 +1,53 @@ +# Copyright 2021-2022 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +from __future__ import annotations + +from pathlib import Path + +import pytest + +import legate.util.fs as m + +HEADER_PATH = Path(__file__).parent / "sample_header.h" + + +def test_read_c_define_hit() -> None: + assert m.read_c_define(HEADER_PATH, "FOO") == "10" + assert m.read_c_define(HEADER_PATH, "BAR") == '"bar"' + + +def test_read_c_define_miss() -> None: + assert m.read_c_define(HEADER_PATH, "JUNK") is None + + +CMAKE_CACHE_PATH = Path(__file__).parent / "sample_cmake_cache.txt" + + +def test_read_cmake_cache_value_hit() -> None: + assert ( + m.read_cmake_cache_value(CMAKE_CACHE_PATH, "Legion_SOURCE_DIR:STATIC=") + == '"foo/bar"' + ) + assert ( + m.read_cmake_cache_value( + CMAKE_CACHE_PATH, "FIND_LEGATE_CORE_CPP:BOOL=OFF" + ) + == "OFF" + ) + + +def test_read_cmake_cache_value_miss() -> None: + with pytest.raises(RuntimeError): + assert m.read_cmake_cache_value(CMAKE_CACHE_PATH, "JUNK") is None diff --git a/tests/unit/legate/driver/test_system.py b/tests/unit/legate/util/test_system.py similarity index 83% rename from tests/unit/legate/driver/test_system.py rename to tests/unit/legate/util/test_system.py index a1b905496..3ae242b6f 100644 --- a/tests/unit/legate/driver/test_system.py +++ b/tests/unit/legate/util/test_system.py @@ -15,11 +15,12 @@ from __future__ import annotations import os +import sys import pytest from pytest_mock import MockerFixture -import legate.driver.system as m +import legate.util.system as m def test___all__() -> None: @@ -73,7 +74,7 @@ def test_LIBPATH_Darwin(self, mocker: MockerFixture) -> None: def test_legate_paths(self, mocker: MockerFixture) -> None: mocker.patch( - "legate.driver.system.get_legate_paths", + "legate.util.system.get_legate_paths", return_value="legate paths", ) @@ -83,10 +84,22 @@ def test_legate_paths(self, mocker: MockerFixture) -> None: def test_legion_paths(self, mocker: MockerFixture) -> None: mocker.patch( - "legate.driver.system.get_legion_paths", + "legate.util.system.get_legion_paths", return_value="legion paths", ) s = m.System() assert s.legion_paths == "legion paths" # type: ignore + + def test_cpus(self) -> None: + s = m.System() + cpus = s.cpus + assert len(cpus) > 0 + assert all(len(cpu.ids) > 0 for cpu in cpus) + + @pytest.mark.skipif(sys.platform != "linux", reason="pynvml required") + def test_gpus(self) -> None: + s = m.System() + # can't really assume / test much here + s.gpus diff --git a/tests/unit/legate/util/test_types.py b/tests/unit/legate/util/test_types.py new file mode 100644 index 000000000..01835f882 --- /dev/null +++ b/tests/unit/legate/util/test_types.py @@ -0,0 +1,57 @@ +# Copyright 2022 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +"""Consolidate test configuration from command-line and environment. 
+ +""" +from __future__ import annotations + +from dataclasses import dataclass + +import legate.util.types as m + + +class TestCPUInfo: + def test_fields(self) -> None: + assert set(m.CPUInfo.__dataclass_fields__) == {"ids"} + + +class TestGPUInfo: + def test_fields(self) -> None: + assert set(m.GPUInfo.__dataclass_fields__) == {"id", "total"} + + +class Source: + foo = 10 + bar = 10.2 + baz = "test" + quux = ["a", "b", "c"] + extra = (1, 2, 3) + + +@dataclass(frozen=True) +class Target: + foo: int + bar: float + baz: str + quux: list[str] + + +def test_object_to_dataclass() -> None: + source = Source() + target = m.object_to_dataclass(source, Target) + + assert set(target.__dict__) == set(Target.__dataclass_fields__) + for k, v in target.__dict__.items(): + assert getattr(source, k) == v diff --git a/tests/unit/legate/util/test_ui.py b/tests/unit/legate/util/test_ui.py new file mode 100644 index 000000000..a9ac7d890 --- /dev/null +++ b/tests/unit/legate/util/test_ui.py @@ -0,0 +1,375 @@ +# Copyright 2021-2022 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +from __future__ import annotations + +from datetime import timedelta +from typing import Any + +import pytest +from pytest_mock import MockerFixture +from typing_extensions import TypeAlias + +from legate.util import colors, ui as m + +try: + import colorama # type: ignore +except ImportError: + colorama = None + +UsePlainTextFixture: TypeAlias = Any + + +@pytest.fixture +def use_plain_text(mocker: MockerFixture) -> None: + mocker.patch.object(m, "bright", colors._text) + mocker.patch.object(m, "dim", colors._text) + mocker.patch.object(m, "white", colors._text) + mocker.patch.object(m, "cyan", colors._text) + mocker.patch.object(m, "red", colors._text) + mocker.patch.object(m, "green", colors._text) + mocker.patch.object(m, "yellow", colors._text) + mocker.patch.object(m, "magenta", colors._text) + + +def test_UI_WIDTH() -> None: + assert m.UI_WIDTH == 80 + + +@pytest.mark.skipif(colorama is None, reason="colorama required") +def test_banner_simple() -> None: + assert ( + m.banner("some text") + == "\n" + "#" * m.UI_WIDTH + "\n### some text\n" + "#" * m.UI_WIDTH + ) + + +def test_banner_simple_plain(use_plain_text: UsePlainTextFixture) -> None: + assert ( + m.banner("some text") + == "\n" + "#" * m.UI_WIDTH + "\n### some text\n" + "#" * m.UI_WIDTH + ) + + +@pytest.mark.skipif(colorama is None, reason="colorama required") +def test_banner_full() -> None: + assert ( + m.banner("some text", char="*", width=100, details=["a", "b"]) + == "\n" + + "*" * 100 + + "\n*** \n*** some text\n*** \n*** a\n*** b\n*** \n" + + "*" * 100 + ) + + +def test_banner_full_plain(use_plain_text: UsePlainTextFixture) -> None: + assert ( + m.banner("some text", char="*", width=100, details=["a", "b"]) + == "\n" + + "*" * 100 + + "\n*** \n*** some text\n*** \n*** a\n*** b\n*** \n" + + "*" * 100 + ) + + +@pytest.mark.skipif(colorama is None, reason="colorama required") +def test_error() -> None: + assert m.error("some message") == 
colors.red("ERROR: some message") + + +def test_error_plain(use_plain_text: UsePlainTextFixture) -> None: + assert m.error("some message") == "ERROR: some message" + + +@pytest.mark.skipif(colorama is None, reason="colorama required") +def test_key() -> None: + assert m.key("some key") == colors.dim(colors.green("some key")) + + +def test_key_plain(use_plain_text: UsePlainTextFixture) -> None: + assert m.key("some key") == "some key" + + +@pytest.mark.skipif(colorama is None, reason="colorama required") +def test_value() -> None: + assert m.value("some value") == m.yellow("some value") + + +def test_value_plain(use_plain_text: UsePlainTextFixture) -> None: + assert m.value("some value") == "some value" + + +class Test_kvtable: + ONE = {"foo": 10} + TWO = {"foo": 10, "barbaz": "some value"} + THREE = {"foo": 10, "barbaz": "some value", "a": 1.2} + + @pytest.mark.skipif(colorama is None, reason="colorama required") + @pytest.mark.parametrize("items", (ONE, TWO, THREE)) + def test_default(self, items: dict[str, Any]) -> None: + N = max(len(m.key(k)) for k in items) + assert m.kvtable(items) == "\n".join( + f"{m.key(k): <{N}} : {m.value(str(items[k]))}" for k in items + ) + + @pytest.mark.parametrize("items", (ONE, TWO, THREE)) + def test_default_plain( + self, use_plain_text: UsePlainTextFixture, items: dict[str, Any] + ) -> None: + N = max(len(k) for k in items) + assert m.kvtable(items) == "\n".join( + f"{k: <{N}} : {items[k]}" for k in items + ) + + @pytest.mark.skipif(colorama is None, reason="colorama required") + @pytest.mark.parametrize("items", (ONE, TWO, THREE)) + def test_delim(self, items: dict[str, Any]) -> None: + N = max(len(m.key(k)) for k in items) + assert m.kvtable(items, delim="/") == "\n".join( + f"{m.key(k): <{N}}/{m.value(str(items[k]))}" for k in items + ) + + @pytest.mark.parametrize("items", (ONE, TWO, THREE)) + def test_delim_plain( + self, use_plain_text: UsePlainTextFixture, items: dict[str, Any] + ) -> None: + N = max(len(k) for k in items) + assert m.kvtable(items, delim="/") == "\n".join( + f"{k: <{N}}/{items[k]}" for k in items + ) + + @pytest.mark.skipif(colorama is None, reason="colorama required") + @pytest.mark.parametrize("items", (ONE, TWO, THREE)) + def test_align_False(self, items: dict[str, Any]) -> None: + assert m.kvtable(items, align=False) == "\n".join( + f"{m.key(k)} : {m.value(str(items[k]))}" for k in items + ) + + @pytest.mark.parametrize("items", (ONE, TWO, THREE)) + def test_align_False_plain( + self, use_plain_text: UsePlainTextFixture, items: dict[str, Any] + ) -> None: + assert m.kvtable(items, align=False) == "\n".join( + f"{k} : {items[k]}" for k in items + ) + + @pytest.mark.skipif(colorama is None, reason="colorama required") + def test_keys(self) -> None: + items = self.THREE + keys = ("foo", "a") + N = max(len(m.key(k)) for k in items) + + assert m.kvtable(self.THREE, keys=keys) == "\n".join( + f"{m.key(k): <{N}} : {m.value(str(items[k]))}" for k in keys + ) + + def test_keys_plain(self, use_plain_text: UsePlainTextFixture) -> None: + items = self.THREE + keys = ("foo", "a") + N = max(len(m.key(k)) for k in items) + + assert m.kvtable(items, keys=keys) == "\n".join( + f"{k: <{N}} : {items[k]}" for k in keys + ) + + +class Test_rule: + @pytest.mark.skipif(colorama is None, reason="colorama required") + def test_pad(self) -> None: + assert m.rule(pad=4) == colors.cyan(" " + "-" * (m.UI_WIDTH - 4)) + + def test_pad_with_text( + self, + ) -> None: + front = " --- foo bar " + assert m.rule("foo bar", pad=4) == colors.cyan( + front + "-" * 
(m.UI_WIDTH - len(front)) + ) + + @pytest.mark.skipif(colorama is None, reason="colorama required") + def test_text(self) -> None: + front = "--- foo bar " + assert m.rule("foo bar") == colors.cyan( + front + "-" * (m.UI_WIDTH - len(front)) + ) + + @pytest.mark.skipif(colorama is None, reason="colorama required") + def test_char(self) -> None: + assert m.rule(char="a") == colors.cyan("a" * m.UI_WIDTH) + + @pytest.mark.skipif(colorama is None, reason="colorama required") + def test_N(self) -> None: + assert m.rule(N=60) == colors.cyan("-" * 60) + + @pytest.mark.skipif(colorama is None, reason="colorama required") + def test_N_with_text(self) -> None: + front = "--- foo bar " + assert m.rule("foo bar", N=65) == colors.cyan( + front + "-" * (65 - len(front)) + ) + + @pytest.mark.skipif(colorama is None, reason="colorama required") + def test_pad_plain(self, use_plain_text: UsePlainTextFixture) -> None: + assert m.rule(pad=4) == " " + "-" * (m.UI_WIDTH - 4) + + def test_pad_with_text_plain( + self, use_plain_text: UsePlainTextFixture + ) -> None: + front = " --- foo bar " + assert m.rule("foo bar", pad=4) == front + "-" * ( + m.UI_WIDTH - len(front) + ) + + def test_text_plain(self, use_plain_text: UsePlainTextFixture) -> None: + front = "--- foo bar " + assert m.rule("foo bar") == "--- foo bar " + "-" * ( + m.UI_WIDTH - len(front) + ) + + def test_char_plain(self, use_plain_text: UsePlainTextFixture) -> None: + assert m.rule(char="a") == "a" * m.UI_WIDTH + + def test_N_plain(self, use_plain_text: UsePlainTextFixture) -> None: + assert m.rule(N=60) == "-" * 60 + + def test_N_with_text_plain( + self, use_plain_text: UsePlainTextFixture + ) -> None: + front = "--- foo bar " + assert m.rule("foo bar", N=65) == front + "-" * (65 - len(front)) + + +@pytest.mark.skipif(colorama is None, reason="colorama required") +def test_section() -> None: + assert m.section("some section") == m.bright(m.white("some section")) + + +def test_section_plain(use_plain_text: UsePlainTextFixture) -> None: + assert m.section("some section") == "some section" + + +@pytest.mark.skipif(colorama is None, reason="colorama required") +def test_warn() -> None: + assert m.warn("some message") == m.magenta("WARNING: some message") + + +def test_warn_plain(use_plain_text: UsePlainTextFixture) -> None: + assert m.warn("some message") == "WARNING: some message" + + +@pytest.mark.skipif(colorama is None, reason="colorama required") +def test_shell() -> None: + assert m.shell("cmd --foo") == colors.dim(colors.white("+cmd --foo")) + + +def test_shell_plain(use_plain_text: UsePlainTextFixture) -> None: + assert m.shell("cmd --foo") == "+cmd --foo" + + +@pytest.mark.skipif(colorama is None, reason="colorama required") +def test_shell_with_char() -> None: + assert m.shell("cmd --foo", char="") == colors.dim( + colors.white("cmd --foo") + ) + + +def test_shell_with_char_plain(use_plain_text: UsePlainTextFixture) -> None: + assert m.shell("cmd --foo", char="") == "cmd --foo" + + +@pytest.mark.skipif(colorama is None, reason="colorama required") +def test_passed() -> None: + assert m.passed("msg") == f"{colors.bright(colors.green('[PASS]'))} msg" + + +def test_passed_plain(use_plain_text: UsePlainTextFixture) -> None: + assert m.passed("msg") == "[PASS] msg" + + +@pytest.mark.skipif(colorama is None, reason="colorama required") +def test_passed_with_details() -> None: + assert ( + m.passed("msg", details=["a", "b"]) + == f"{colors.bright(colors.green('[PASS]'))} msg\n a\n b" + ) + + +def test_passed_with_details_plain( + use_plain_text: 
UsePlainTextFixture, +) -> None: + assert m.passed("msg", details=["a", "b"]) == "[PASS] msg\n a\n b" + + +@pytest.mark.skipif(colorama is None, reason="colorama required") +def test_failed() -> None: + assert m.failed("msg") == f"{colors.bright(colors.red('[FAIL]'))} msg" + + +def test_failed_plain(use_plain_text: UsePlainTextFixture) -> None: + assert m.failed("msg") == "[FAIL] msg" + + +@pytest.mark.skipif(colorama is None, reason="colorama required") +def test_failed_with_details() -> None: + assert ( + m.failed("msg", details=["a", "b"]) + == f"{colors.bright(colors.red('[FAIL]'))} msg\n a\n b" + ) + + +def test_failed_with_details_plain( + use_plain_text: UsePlainTextFixture, +) -> None: + assert m.failed("msg", details=["a", "b"]) == "[FAIL] msg\n a\n b" + + +@pytest.mark.skipif(colorama is None, reason="colorama required") +def test_skipped() -> None: + assert m.skipped("msg") == f"{colors.cyan('[SKIP]')} msg" + + +def test_skipped_plain(use_plain_text: UsePlainTextFixture) -> None: + assert m.skipped("msg") == "[SKIP] msg" + + +@pytest.mark.skipif(colorama is None, reason="colorama required") +def test_summary() -> None: + assert m.summary("foo", 12, 11, timedelta(seconds=2.123)) == colors.bright( + colors.red( + f"{'foo: Passed 11 of 12 tests (91.7%) in 2.12s': >{m.UI_WIDTH}}" + ) + ) + + +def test_summary_plain(use_plain_text: UsePlainTextFixture) -> None: + assert ( + m.summary("foo", 12, 11, timedelta(seconds=2.123)) + == f"{'foo: Passed 11 of 12 tests (91.7%) in 2.12s': >{m.UI_WIDTH}}" + ) + + +@pytest.mark.skipif(colorama is None, reason="colorama required") +def test_summary_no_justify() -> None: + assert m.summary( + "foo", 12, 11, timedelta(seconds=2.123), justify=False + ) == colors.bright( + colors.red("foo: Passed 11 of 12 tests (91.7%) in 2.12s") + ) + + +def test_summary_no_justify_plain(use_plain_text: UsePlainTextFixture) -> None: + assert ( + m.summary("foo", 12, 11, timedelta(seconds=2.123), justify=False) + == "foo: Passed 11 of 12 tests (91.7%) in 2.12s" + ) diff --git a/tests/unit/util.py b/tests/unit/util.py new file mode 100644 index 000000000..b6ce793c0 --- /dev/null +++ b/tests/unit/util.py @@ -0,0 +1,33 @@ +# Copyright 2021-2022 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +from __future__ import annotations + +from itertools import chain, combinations +from typing import Any, Iterable, Iterator + +import pytest +from typing_extensions import TypeAlias + +Capsys: TypeAlias = pytest.CaptureFixture[str] + + +# ref: https://docs.python.org/3/library/itertools.html +def powerset(iterable: Iterable[Any]) -> Iterator[Any]: + s = list(iterable) + return chain.from_iterable(combinations(s, r) for r in range(len(s) + 1)) + + +def powerset_nonempty(iterable: Iterable[Any]) -> Iterator[Any]: + return (x for x in powerset(iterable) if len(x)) From 7224c9de6d17044a07d0bdc425039da518a3f0d3 Mon Sep 17 00:00:00 2001 From: Bryan Van de Ven Date: Thu, 6 Oct 2022 18:28:25 -0700 Subject: [PATCH 007/121] Handle conflicts for library-level args (#416) * Handle conflicts for library-level args * fix copy-pasta --- legate/util/args.py | 42 ++++++++++++++++++++++++----- tests/unit/legate/util/test_args.py | 35 +++++++++++++++++++++++- 2 files changed, 69 insertions(+), 8 deletions(-) diff --git a/legate/util/args.py b/legate/util/args.py index e8fdc0c34..88cd73193 100644 --- a/legate/util/args.py +++ b/legate/util/args.py @@ -14,6 +14,7 @@ # from __future__ import annotations +import re import sys import warnings from argparse import Action, ArgumentParser, Namespace @@ -73,6 +74,7 @@ class ArgSpec: choices: NotRequired[Sequence[Any]] = Unset help: NotRequired[str] = Unset metavar: NotRequired[str] = Unset + required: NotRequired[bool] = Unset @dataclass(frozen=True) @@ -80,6 +82,10 @@ class Argument: name: str spec: ArgSpec + @property + def kwargs(self) -> dict[str, Any]: + return dict(entries(self.spec)) + def entries(obj: Any) -> Iterable[tuple[str, Any]]: for f in fields(obj): @@ -153,25 +159,47 @@ def parse_library_command_args( prog=f"<{libname} program>", add_help=False, allow_abbrev=False ) - lib_prefix = f"-{libname}:" + # Some explanation is in order. Argparse treats arguments with a single + # dash differently, e.g. "-xyz" is interpreted as "-x -y -z". This can + # cause confusion and clashes when there are multiple single-dash args + # with identical prefixes. TLDR; we want "-legate:foo" to behave just + # as if it was "--legate:foo". In order to do this, we configure a parser + # for "long argumens" and then munge the values in sys.argv to update + # any "short prefix" arguments to be "long prefix" arguments first, before + # parsing. We also take care to update any output. The alternative here + # would be to abandon argparse entirely, and parse sys.argv manually. 
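+    # For example (purely illustrative; "foo" stands in for any libname):
+    #
+    #   sys.argv == ["app", "-foo:bar", "--extra"]
+    #
+    # is rewritten to ["app", "--foo:bar", "--extra"] before parsing, so
+    # argparse treats "-foo:bar" as a single long option rather than a
+    # cluster of short flags, and any unconsumed "--foo:*" leftovers are
+    # converted back to "-foo:*" afterwards so callers see the arguments
+    # exactly as they passed them.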
+ # + # ref: https://github.com/nv-legate/legate.core/issues/415 + + short_prefix = f"-{libname}:" + long_prefix = f"-{short_prefix}" argnames = [arg.name for arg in args] for arg in args: - argname = f"{lib_prefix}{arg.name}" - kwargs = dict(entries(arg.spec)) - parser.add_argument(argname, **kwargs) + argname = f"{long_prefix}{arg.name}" + parser.add_argument(argname, **arg.kwargs) has_custom_help = "help" in argnames - if f"{lib_prefix}help" in sys.argv and not has_custom_help: - parser.print_help() + if f"{short_prefix}help" in sys.argv and not has_custom_help: + help_string = parser.format_help() + + # this is a little sloppy but should suffice in practice + print(help_string.replace(long_prefix, short_prefix)) + sys.exit() + # convert any short-prefix args to be long-prefix + sys.argv = [re.sub(f"^{short_prefix}", long_prefix, x) for x in sys.argv] + args, extra = parser.parse_known_args() + # put any unconsumed args back they way they were + extra = [re.sub(f"^{long_prefix}", short_prefix, x) for x in extra] + for item in extra: - if item.startswith(lib_prefix): + if item.startswith(short_prefix): warnings.warn( f"Unrecognized argument {item!r} for {libname} (passed on as-is)" # noqa: E501 ) diff --git a/tests/unit/legate/util/test_args.py b/tests/unit/legate/util/test_args.py index 02d01a58c..83e3e02b3 100644 --- a/tests/unit/legate/util/test_args.py +++ b/tests/unit/legate/util/test_args.py @@ -91,6 +91,13 @@ def test_default(self) -> None: } +class TestArgument: + def test_kwargs(self) -> None: + arg = m.Argument("arg", m.ArgSpec("dest", default=2, help="help")) + + assert arg.kwargs == dict(m.entries(arg.spec)) + + def test_entries() -> None: assert set(m.entries(_TestObj())) == {("a", 10), ("c", "foo")} @@ -115,13 +122,26 @@ def test_default_help_precedence( self, monkeypatch: pytest.MonkeyPatch, capsys: Capsys ) -> None: monkeypatch.setattr("sys.argv", ["app", "-foo:help", "-foo:bar"]) - args = [m.Argument("bar", m.ArgSpec(dest="help"))] + args = [m.Argument("bar", m.ArgSpec(dest="bar"))] with pytest.raises(SystemExit) as e: m.parse_library_command_args("foo", args) assert e.value.code is None out, err = capsys.readouterr() # type: ignore[unreachable] assert out.startswith("usage: ") + def test_default_help_patches_short_args( + self, monkeypatch: pytest.MonkeyPatch, capsys: Capsys + ) -> None: + monkeypatch.setattr("sys.argv", ["app", "-foo:help", "-foo:bar"]) + args = [m.Argument("bar", m.ArgSpec(dest="bar"))] + with pytest.raises(SystemExit) as e: + m.parse_library_command_args("foo", args) + assert e.value.code is None + out, err = capsys.readouterr() # type: ignore[unreachable] + assert out.startswith("usage: ") + assert "-foo:bar" in out + assert "--foo:bar" not in out + def test_help_override( self, monkeypatch: pytest.MonkeyPatch, capsys: Capsys ) -> None: @@ -182,6 +202,19 @@ def test_unrecognized_libname_arg( assert vars(ns) == {} assert sys.argv == ["app", "-foo:bar", "-foo:baz"] + def test_no_prefix_conflict( + self, monkeypatch: pytest.MonkeyPatch, capsys: Capsys + ) -> None: + monkeypatch.setattr( + "sys.argv", ["app", "-foo:bar", "--foo", "-f", "1", "-ff"] + ) + args = [m.Argument("bar", m.ArgSpec(dest="bar"))] + ns = m.parse_library_command_args("foo", args) + out, err = capsys.readouterr() + assert out == "" + assert vars(ns) == {"bar": True} + assert sys.argv == ["app", "--foo", "-f", "1", "-ff"] + if __name__ == "__main__": sys.exit(pytest.main(sys.argv)) From c24487280d0046147ec3d2c9e38390dcafb57faf Mon Sep 17 00:00:00 2001 From: Wonchan Lee Date: Thu, 6 
Oct 2022 20:43:37 -0700 Subject: [PATCH 008/121] Revive dead region managers on field allocations (#418) (#419) * Make sure we test LRU mechanism in debug mode * Make sure we don't create fields on a dead region manager --- legate/core/runtime.py | 32 ++++++++++++++++++++++---------- src/core/mapping/core_mapper.cc | 2 +- 2 files changed, 23 insertions(+), 11 deletions(-) diff --git a/legate/core/runtime.py b/legate/core/runtime.py index c30bc6237..22b3815e2 100644 --- a/legate/core/runtime.py +++ b/legate/core/runtime.py @@ -263,15 +263,21 @@ def destroy(self, unordered: bool) -> None: # unordered destructions self._region.destroy(unordered) - def increase_field_count(self) -> bool: + def increase_active_field_count(self) -> bool: revived = self._active_field_count == 0 self._active_field_count += 1 return revived - def decrease_field_count(self) -> bool: + def decrease_active_field_count(self) -> bool: self._active_field_count -= 1 return self._active_field_count == 0 + def increase_field_count(self) -> bool: + fresh = self._alloc_field_count == 0 + self._alloc_field_count += 1 + revived = self.increase_active_field_count() + return not fresh and revived + @property def has_space(self) -> bool: return self._alloc_field_count < LEGATE_MAX_FIELDS @@ -281,13 +287,12 @@ def get_next_field_id(self) -> int: self._next_field_id += 1 return field_id - def allocate_field(self, field_size: Any) -> tuple[Region, int]: + def allocate_field(self, field_size: Any) -> tuple[Region, int, bool]: field_id = self._region.field_space.allocate_field( field_size, self.get_next_field_id() ) - self._alloc_field_count += 1 - self.increase_field_count() - return self._region, field_id + revived = self.increase_field_count() + return self._region, field_id, revived # This class manages the allocation and reuse of fields @@ -315,18 +320,23 @@ def try_reuse_field(self) -> Optional[tuple[Region, int]]: def allocate_field(self) -> tuple[Region, int]: if (result := self.try_reuse_field()) is not None: region_manager = self.runtime.find_region_manager(result[0]) - if region_manager.increase_field_count(): + if region_manager.increase_active_field_count(): self.runtime.revive_manager(region_manager) return result region_manager = self.runtime.find_or_create_region_manager(self.shape) - return region_manager.allocate_field(self.field_size) + region, field_id, revived = region_manager.allocate_field( + self.field_size + ) + if revived: + self.runtime.revive_manager(region_manager) + return region, field_id def free_field( self, region: Region, field_id: int, ordered: bool = False ) -> None: self.free_fields.append((region, field_id)) region_manager = self.runtime.find_region_manager(region) - if region_manager.decrease_field_count(): + if region_manager.decrease_active_field_count(): self.runtime.free_region_manager( self.shape, region, unordered=not ordered ) @@ -1361,7 +1371,9 @@ def import_output_region( self.region_managers_by_region[region] = region_mgr self.find_or_create_field_manager(shape, dtype.size) - region_mgr.increase_field_count() + revived = region_mgr.increase_field_count() + if revived: + self.revive_manager(region_mgr) return RegionField.create(region, field_id, dtype.size, shape) def create_output_region( diff --git a/src/core/mapping/core_mapper.cc b/src/core/mapping/core_mapper.cc index 26e140879..da3f7414b 100644 --- a/src/core/mapping/core_mapper.cc +++ b/src/core/mapping/core_mapper.cc @@ -145,7 +145,7 @@ CoreMapper::CoreMapper(MapperRuntime* rt, Machine m, const LibraryContext& c) 
precise_exception_trace(static_cast(extract_env("LEGATE_PRECISE_EXCEPTION_TRACE", 0, 0))), field_reuse_frac(extract_env("LEGATE_FIELD_REUSE_FRAC", 256, 256)), field_reuse_freq(extract_env("LEGATE_FIELD_REUSE_FREQ", 32, 32)), - max_lru_length(extract_env("LEGATE_MAX_LRU_LENGTH", 5, 0)), + max_lru_length(extract_env("LEGATE_MAX_LRU_LENGTH", 5, 1)), has_socket_mem(false) { // Query to find all our local processors From 13f5f38f96ea64d3ea34de1fdc63a378b253d979 Mon Sep 17 00:00:00 2001 From: Manolis Papadakis Date: Fri, 7 Oct 2022 09:11:09 -0700 Subject: [PATCH 009/121] Fix typo in driver script (#421) --- legate/driver/logs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/legate/driver/logs.py b/legate/driver/logs.py index 261ab6dd5..70696d3e1 100644 --- a/legate/driver/logs.py +++ b/legate/driver/logs.py @@ -145,7 +145,7 @@ def process(self) -> bool: dflag = "d" if self.config.debugging.dataflow else "" eflag = "e" if self.config.debugging.event else "" if dflag or eflag: - cmd += ("-{dflag}{eflag}",) + cmd += (f"-{dflag}{eflag}",) cmd += tuple(f"legate_{n}.log" for n in range(ranks)) From 8c7552231f75ca548fec5865422a56fa43927c28 Mon Sep 17 00:00:00 2001 From: Manolis Papadakis Date: Fri, 7 Oct 2022 16:11:07 -0700 Subject: [PATCH 010/121] On mapping failure retry after tightening non-RO reqs (#423) * Acquire instances eagerly * Add option to allow map_legate_store to fail * Fixes to 'handled' check * On failed mapping, retry with tight write requirements * Use original polarity for map_legate_store * Reduce the diff * Uninitialized variable * Some debugging output * Add more debug logging to mapper * Mapper name and rank id is obvious from log message header * Note task id on mapper debug messages * Skip non-existent (?) reqs when setting chosen_instances * Report sizes of newly created reduction instances Co-authored-by: Manolis Papadakis --- src/core/mapping/base_mapper.cc | 212 ++++++++++++++++++++++---------- src/core/mapping/base_mapper.h | 3 +- 2 files changed, 151 insertions(+), 64 deletions(-) diff --git a/src/core/mapping/base_mapper.cc b/src/core/mapping/base_mapper.cc index f975fe3f8..66c1f4f80 100644 --- a/src/core/mapping/base_mapper.cc +++ b/src/core/mapping/base_mapper.cc @@ -18,6 +18,7 @@ #include #include "legion/legion_mapping.h" +#include "mappers/mapping_utilities.h" #include "core/data/store.h" #include "core/mapping/base_mapper.h" @@ -502,6 +503,10 @@ void BaseMapper::map_task(const MapperContext ctx, const MapTaskInput& input, MapTaskOutput& output) { +#ifdef DEBUG_LEGATE + logger.debug() << "Entering map_task for " << Utilities::to_string(runtime, ctx, task); +#endif + // Should never be mapping the top-level task here assert(task.get_depth() > 0); @@ -601,10 +606,51 @@ void BaseMapper::map_task(const MapperContext ctx, output.chosen_instances.resize(task.regions.size()); - // Map each field separately for each of the logical regions - std::vector needed_acquires; - std::map> instances_to_mappings; - for (uint32_t mapping_idx = 0; mapping_idx < mappings.size(); ++mapping_idx) { + bool can_fail = true; + std::map> instance_to_mappings; + std::map mapping_to_instance; + std::vector handled(mappings.size(), false); + + // See case of failed instance creation below + auto tighten_write_reqs = [&]() { + for (int32_t mapping_idx = 0; mapping_idx < mappings.size(); ++mapping_idx) { + auto& mapping = mappings[mapping_idx]; + PrivilegeMode priv = LEGION_NO_ACCESS; +#ifdef DEBUG_LEGATE + std::stringstream reqs_ss; +#endif + for (auto req_idx : 
mapping.requirement_indices()) { + const RegionRequirement& req = task.regions[req_idx]; + if (!req.region.exists()) continue; + priv |= req.privilege; +#ifdef DEBUG_LEGATE + reqs_ss << " " << req_idx; +#endif + } + if (!(priv & LEGION_WRITE_PRIV) || mapping.policy.exact) continue; +#ifdef DEBUG_LEGATE + logger.debug() << "Task " << task.get_unique_id() + << ": tightened mapping policy for reqs:" << reqs_ss.str(); +#endif + mapping.policy.exact = true; + if (!handled[mapping_idx]) continue; + handled[mapping_idx] = false; + auto m2i_it = mapping_to_instance.find(mapping_idx); + if (m2i_it == mapping_to_instance.end()) continue; + PhysicalInstance inst = m2i_it->second; + mapping_to_instance.erase(m2i_it); + auto i2m_it = instance_to_mappings.find(inst); + i2m_it->second.erase(mapping_idx); + if (i2m_it->second.empty()) { + runtime->release_instance(ctx, inst); + instance_to_mappings.erase(i2m_it); + } + } + }; + + // Mapping each field separately for each of the logical regions + for (int32_t mapping_idx = 0; mapping_idx < mappings.size(); ++mapping_idx) { + if (handled[mapping_idx]) continue; auto& mapping = mappings[mapping_idx]; auto req_indices = mapping.requirement_indices(); @@ -615,12 +661,14 @@ void BaseMapper::map_task(const MapperContext ctx, if (target == StoreTarget::FBMEM) target = StoreTarget::ZCMEM; #endif output.future_locations.push_back(get_target_memory(task.target_proc, target)); + handled[mapping_idx] = true; continue; - } else if (mapping.for_unbound_stores()) { + } + + if (mapping.for_unbound_stores()) { for (auto req_idx : req_indices) { output.output_targets[req_idx] = get_target_memory(task.target_proc, mapping.policy.target); auto ndim = mapping.stores.front().dim(); - // FIXME: Unbound stores can have more than one dimension later std::vector dimension_ordering; for (int32_t dim = ndim - 1; dim >= 0; --dim) @@ -630,65 +678,79 @@ void BaseMapper::map_task(const MapperContext ctx, output.output_constraints[req_idx].ordering_constraint = OrderingConstraint(dimension_ordering, false); } + handled[mapping_idx] = true; continue; } std::vector> reqs; +#ifdef DEBUG_LEGATE + std::stringstream reqs_ss; +#endif for (auto req_idx : req_indices) { const auto& req = task.regions[req_idx]; if (!req.region.exists()) continue; reqs.push_back(std::cref(req)); +#ifdef DEBUG_LEGATE + reqs_ss << " " << req_idx; +#endif + } + if (reqs.empty()) { + handled[mapping_idx] = true; + continue; } - if (reqs.empty()) continue; - - // Get the reference to our valid instances in case we decide to use them + // Get an instance and acquire it if necessary. If the acquire fails then prune it from the + // mapper's data structures and retry, until we succeed or map_legate_store fails with an out of + // memory error. 
PhysicalInstance result; - if (map_legate_store(ctx, task, mapping, reqs, task.target_proc, result)) - needed_acquires.push_back(result); - - for (auto req_idx : req_indices) output.chosen_instances[req_idx].push_back(result); - instances_to_mappings[result].insert(mapping_idx); - } - - // Do an acquire on all the instances so we have our result - // Keep doing this until we succed or we get an out of memory error - while (!needed_acquires.empty() && - !runtime->acquire_and_filter_instances(ctx, needed_acquires, true /*filter on acquire*/)) { - assert(!needed_acquires.empty()); - // If we failed to acquire any of the instances we need to prune them - // out of the mapper's data structure so do that first - std::set failed_acquires; - filter_failed_acquires(ctx, needed_acquires, failed_acquires); - - for (auto failed_acquire : failed_acquires) { - auto affected_mappings = instances_to_mappings[failed_acquire]; - instances_to_mappings.erase(failed_acquire); - - for (auto& mapping_idx : affected_mappings) { - auto& mapping = mappings[mapping_idx]; - auto req_indices = mapping.requirement_indices(); - - std::vector> reqs; - for (auto req_idx : req_indices) reqs.push_back(std::cref(task.regions[req_idx])); - - for (auto req_idx : req_indices) { - auto& instances = output.chosen_instances[req_idx]; - uint32_t inst_idx = 0; - for (; inst_idx < instances.size(); ++inst_idx) - if (instances[inst_idx] == failed_acquire) break; - instances.erase(instances.begin() + inst_idx); - } - - PhysicalInstance result; - if (map_legate_store(ctx, task, mapping, reqs, task.target_proc, result)) - needed_acquires.push_back(result); - - for (auto req_idx : req_indices) output.chosen_instances[req_idx].push_back(result); - instances_to_mappings[result].insert(mapping_idx); + while (map_legate_store(ctx, task, mapping, reqs, task.target_proc, result, can_fail)) { + if (result == PhysicalInstance()) break; + if (instance_to_mappings.count(result) > 0 || runtime->acquire_instance(ctx, result)) { +#ifdef DEBUG_LEGATE + logger.debug() << "Task " << task.get_unique_id() << ": acquired instance " << result + << " for reqs:" << reqs_ss.str(); +#endif + break; } +#ifdef DEBUG_LEGATE + logger.debug() << "Task " << task.get_unique_id() << ": failed to acquire instance " << result + << " for reqs:" << reqs_ss.str(); +#endif + AutoLock lock(ctx, local_instances->manager_lock()); + local_instances->erase(result); } + + // If instance creation failed we try mapping all stores again, but request tight instances for + // write requirements. The hope is that these write requirements cover the entire region (i.e. + // they use a complete partition), so the new tight instances will invalidate any pre-existing + // "bloated" instances for the same region, freeing up enough memory so that mapping can succeed + if (result == PhysicalInstance()) { +#ifdef DEBUG_LEGATE + logger.debug() << "Task " << task.get_unique_id() + << ": failed mapping for reqs:" << reqs_ss.str(); +#endif + assert(can_fail); + tighten_write_reqs(); + mapping_idx = -1; + can_fail = false; + continue; + } + + // Success; record the instance for this mapping. +#ifdef DEBUG_LEGATE + logger.debug() << "Task " << task.get_unique_id() + << ": completed mapping for reqs:" << reqs_ss.str(); +#endif + instance_to_mappings[result].insert(mapping_idx); + mapping_to_instance[mapping_idx] = result; + handled[mapping_idx] = true; } + + // Succeeded in mapping all stores, record it on map_task output. 
+ for (const auto& m2i : mapping_to_instance) + for (auto req_idx : mappings[m2i.first].requirement_indices()) + if (task.regions[req_idx].region.exists()) + output.chosen_instances[req_idx].push_back(m2i.second); } void BaseMapper::map_replicate_task(const MapperContext ctx, @@ -747,7 +809,8 @@ bool BaseMapper::map_legate_store(const MapperContext ctx, const StoreMapping& mapping, std::vector> reqs, Processor target_proc, - PhysicalInstance& result) + PhysicalInstance& result, + bool can_fail) { const auto& policy = mapping.policy; std::vector regions; @@ -776,12 +839,29 @@ bool BaseMapper::map_legate_store(const MapperContext ctx, // If we're making a reduction instance, we should just make it now if (redop != 0) { layout_constraints.add_constraint(SpecializedConstraint(REDUCTION_FOLD_SPECIALIZE, redop)); - - if (!runtime->create_physical_instance( - ctx, target_memory, layout_constraints, regions, result, true /*acquire*/)) + size_t footprint = 0; + if (runtime->create_physical_instance(ctx, + target_memory, + layout_constraints, + regions, + result, + true /*acquire*/, + LEGION_GC_DEFAULT_PRIORITY, + false /*tight bounds*/, + &footprint)) { +#ifdef DEBUG_LEGATE + Realm::LoggerMessage msg = logger.debug(); + msg << "Operation " << mappable.get_unique_id() << ": created reduction instance " << result + << " for"; + for (LogicalRegion r : regions) msg << " " << r; + msg << " (size: " << footprint << " bytes, memory: " << target_memory << ")"; +#endif + // We already did the acquire + return false; + } + if (!can_fail) report_failed_mapping(mappable, mapping.requirement_index(), target_memory, redop); - // We already did the acquire - return false; + return true; } auto& fields = layout_constraints.field_constraint.field_set; @@ -797,8 +877,8 @@ bool BaseMapper::map_legate_store(const MapperContext ctx, local_instances->find_instance( regions.front(), fields.front(), target_memory, result, policy)) { #ifdef DEBUG_LEGATE - logger.debug() << get_mapper_name() << " found instance " << result << " for " - << regions.front(); + logger.debug() << "Operation " << mappable.get_unique_id() << ": reused cached instance " + << result << " for " << regions.front(); #endif runtime->enable_reentrant(ctx); // Needs acquire to keep the runtime happy @@ -861,8 +941,12 @@ bool BaseMapper::map_legate_store(const MapperContext ctx, assert(result.exists()); #ifdef DEBUG_LEGATE if (created) { - logger.debug() << get_mapper_name() << " created instance " << result << " for " << *group - << " (size: " << footprint << " bytes, memory: " << target_memory << ")"; + logger.debug() << "Operation " << mappable.get_unique_id() << ": created instance " << result + << " for " << *group << " (size: " << footprint + << " bytes, memory: " << target_memory << ")"; + } else { + logger.debug() << "Operation " << mappable.get_unique_id() << ": found instance " << result + << " for " << *group; } #endif // Only save the result for future use if it is not an external instance @@ -879,8 +963,10 @@ bool BaseMapper::map_legate_store(const MapperContext ctx, runtime->enable_reentrant(ctx); // If we make it here then we failed entirely - auto req_indices = mapping.requirement_indices(); - for (auto req_idx : req_indices) report_failed_mapping(mappable, req_idx, target_memory, redop); + if (!can_fail) { + auto req_indices = mapping.requirement_indices(); + for (auto req_idx : req_indices) report_failed_mapping(mappable, req_idx, target_memory, redop); + } return true; } diff --git a/src/core/mapping/base_mapper.h 
b/src/core/mapping/base_mapper.h index fac2c7304..d81898411 100644 --- a/src/core/mapping/base_mapper.h +++ b/src/core/mapping/base_mapper.h @@ -268,7 +268,8 @@ class BaseMapper : public Legion::Mapping::Mapper, public LegateMapper { const StoreMapping& mapping, std::vector> reqs, Legion::Processor target_proc, - Legion::Mapping::PhysicalInstance& result); + Legion::Mapping::PhysicalInstance& result, + bool can_fail); bool map_raw_array(const Legion::Mapping::MapperContext ctx, const Legion::Mappable& mappable, unsigned index, From 15cf1dcdafdc7da8b93668fded778f9633f3cb14 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 10 Oct 2022 16:35:27 -0700 Subject: [PATCH 011/121] [pre-commit.ci] pre-commit autoupdate (#408) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/psf/black: 22.8.0 → 22.10.0](https://github.com/psf/black/compare/22.8.0...22.10.0) - [github.com/pre-commit/mirrors-mypy: v0.971 → v0.982](https://github.com/pre-commit/mirrors-mypy/compare/v0.971...v0.982) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .pre-commit-config.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 2adbe47d7..04478d01c 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -4,7 +4,7 @@ repos: hooks: - id: isort - repo: https://github.com/psf/black - rev: 22.8.0 + rev: 22.10.0 hooks: - id: black - repo: https://github.com/PyCQA/flake8 @@ -18,7 +18,7 @@ repos: files: \.(cu|cuh|h|cc|inl)$ types_or: [] - repo: https://github.com/pre-commit/mirrors-mypy - rev: 'v0.971' + rev: 'v0.982' hooks: - id: mypy pass_filenames: false From f1ee9680da0d0cbbecaadc7cdd232cb0febde1d4 Mon Sep 17 00:00:00 2001 From: Manolis Papadakis Date: Tue, 11 Oct 2022 17:10:07 -0700 Subject: [PATCH 012/121] Fix Transform class hierarchy (#427) --- legate/core/transform.py | 36 ++++++++++++++++++++---------------- 1 file changed, 20 insertions(+), 16 deletions(-) diff --git a/legate/core/transform.py b/legate/core/transform.py index 4cd7475c6..817f08e25 100644 --- a/legate/core/transform.py +++ b/legate/core/transform.py @@ -60,21 +60,6 @@ def invert_extent(self, extent: Shape) -> Shape: def invert_point(self, point: Shape) -> Shape: ... - def invert(self, partition: PartitionBase) -> PartitionBase: - ... - - def convert(self, partition: PartitionBase) -> PartitionBase: - ... - - def convert_partition(self, partition: PartitionBase) -> PartitionBase: - ... - - def _invert_partition(self, partition: PartitionBase) -> PartitionBase: - ... - - def invert_partition(self, partition: PartitionBase) -> PartitionBase: - ... - def invert_symbolic_point(self, dims: SymbolicPoint) -> SymbolicPoint: ... @@ -89,7 +74,11 @@ def get_inverse_transform(self, ndim: int) -> AffineTransform: class Transform(TransformProto, Protocol): - pass + def invert(self, partition: PartitionBase) -> PartitionBase: + ... + + def convert(self, partition: PartitionBase) -> PartitionBase: + ... 
class Shift(Transform): @@ -546,6 +535,9 @@ def invert_restrictions(self, restrictions: Restrictions) -> Restrictions: right = restrictions[self._dim + self._shape.ndim :] return left + right + def convert(self, partition: PartitionBase) -> PartitionBase: + raise NonInvertibleError() + def convert_restrictions(self, restrictions: Restrictions) -> Restrictions: left = restrictions[: self._dim] right = restrictions[self._dim + 1 :] @@ -585,6 +577,18 @@ class TransformStackBase(TransformProto, Protocol): def bottom(self) -> bool: ... + def stack(self, transform: Transform) -> TransformStack: + ... + + def convert_partition(self, partition: PartitionBase) -> PartitionBase: + ... + + def _invert_partition(self, partition: PartitionBase) -> PartitionBase: + ... + + def invert_partition(self, partition: PartitionBase) -> PartitionBase: + ... + class TransformStack(TransformStackBase): def __init__( From 054a589fb29dbfc7be1c7d36bef1b7e5998c72e9 Mon Sep 17 00:00:00 2001 From: Wonchan Lee Date: Tue, 11 Oct 2022 18:20:24 -0700 Subject: [PATCH 013/121] More changes for provenance (#417) * Provenance for inline mappings and attach ops * Show provenance in the mapping failure * Minor tweak to the error message --- legate/core/_legion/operation.py | 15 +++++++++++++++ legate/core/store.py | 3 +++ src/core/mapping/base_mapper.cc | 20 ++++++++++++++------ typings/legion_cffi/lib.pyi | 3 +++ 4 files changed, 35 insertions(+), 6 deletions(-) diff --git a/legate/core/_legion/operation.py b/legate/core/_legion/operation.py index 2ccde2f5c..e07b4ba8b 100644 --- a/legate/core/_legion/operation.py +++ b/legate/core/_legion/operation.py @@ -39,6 +39,7 @@ def __init__( tag: int = 0, parent: Optional[Region] = None, coherence: int = legion.LEGION_EXCLUSIVE, + provenance: Optional[str] = None, ) -> None: """ An InlineMapping object provides a mechanism for creating a mapped @@ -87,6 +88,10 @@ def __init__( mapper, tag, ) + if provenance is not None: + legion.legion_inline_launcher_set_provenance( + self.launcher, provenance.encode() + ) self.region = region self._launcher = ffi.gc( self.launcher, legion.legion_inline_launcher_destroy @@ -1091,6 +1096,7 @@ def __init__( mapper: int = 0, tag: int = 0, read_only: bool = False, + provenance: Optional[str] = None, ) -> None: """ An Attach object provides a mechanism for attaching external data to @@ -1115,6 +1121,10 @@ def __init__( self.launcher = legion.legion_attach_launcher_create( region.handle, region.handle, legion.LEGION_EXTERNAL_INSTANCE ) + if provenance is not None: + legion.legion_attach_launcher_set_provenance( + self.launcher, provenance.encode() + ) self.region = region self._launcher = ffi.gc( self.launcher, legion.legion_attach_launcher_destroy @@ -1232,6 +1242,7 @@ def __init__( shard_local_data: dict[Region, Any], mapper: int = 0, tag: int = 0, + provenance: Optional[str] = None, ) -> None: """ A variant of Attach that allows attaching multiple pieces of external @@ -1260,6 +1271,10 @@ def __init__( legion.LEGION_EXTERNAL_INSTANCE, True, # restricted ) + if provenance is not None: + legion.legion_index_attach_launcher_set_provenance( + self.launcher, provenance.encode() + ) self._launcher = ffi.gc( self.launcher, legion.legion_index_attach_launcher_destroy ) diff --git a/legate/core/store.py b/legate/core/store.py index 26e648989..4c947829b 100644 --- a/legate/core/store.py +++ b/legate/core/store.py @@ -178,6 +178,7 @@ def record_detach(detach: Union[Detach, IndexDetach]) -> None: self.field.field_id, alloc, mapper=context.mapper_id, + 
provenance=context.provenance, ) # If we're not sharing then there is no need to map or restrict the # attachment @@ -229,6 +230,7 @@ def record_detach(detach: Union[Detach, IndexDetach]) -> None: self.field.field_id, shard_local_data, mapper=context.mapper_id, + provenance=context.provenance, ) index_attach.set_deduplicate_across_shards(True) # If we're not sharing there is no need to restrict the attachment @@ -265,6 +267,7 @@ def get_inline_mapped_region(self, context: Context) -> PhysicalRegion: self.region, self.field.field_id, mapper=context.mapper_id, + provenance=context.provenance, ) self.physical_region = runtime.dispatch(mapping) self.physical_region_mapped = True diff --git a/src/core/mapping/base_mapper.cc b/src/core/mapping/base_mapper.cc index 66c1f4f80..983043322 100644 --- a/src/core/mapping/base_mapper.cc +++ b/src/core/mapping/base_mapper.cc @@ -1152,27 +1152,31 @@ void BaseMapper::report_failed_mapping(const Mappable& mappable, REALM_MEMORY_KINDS(MEM_NAMES) #undef MEM_NAMES }; + std::string provenance = mappable.get_provenance_string(); + if (provenance.empty()) provenance = "unknown provenance"; switch (mappable.get_mappable_type()) { case Mappable::TASK_MAPPABLE: { const auto task = mappable.as_task(); if (redop > 0) logger.error( "Mapper %s failed to map reduction (%d) region " - "requirement %d of task %s (UID %lld) into %s memory " IDFMT, + "requirement %d of task %s [%s] (UID %lld) into %s memory " IDFMT, get_mapper_name(), redop, index, task->get_task_name(), + provenance.c_str(), mappable.get_unique_id(), memory_kinds[target_memory.kind()], target_memory.id); else logger.error( "Mapper %s failed to map region requirement %d of " - "task %s (UID %lld) into %s memory " IDFMT, + "task %s [%s] (UID %lld) into %s memory " IDFMT, get_mapper_name(), index, task->get_task_name(), + provenance.c_str(), mappable.get_unique_id(), memory_kinds[target_memory.kind()], target_memory.id); @@ -1182,19 +1186,21 @@ void BaseMapper::report_failed_mapping(const Mappable& mappable, if (redop > 0) logger.error( "Mapper %s failed to map reduction (%d) region " - "requirement %d of copy (UID %lld) into %s memory " IDFMT, + "requirement %d of copy [%s] (UID %lld) into %s memory " IDFMT, get_mapper_name(), redop, index, + provenance.c_str(), mappable.get_unique_id(), memory_kinds[target_memory.kind()], target_memory.id); else logger.error( "Mapper %s failed to map region requirement %d of " - "copy (UID %lld) into %s memory " IDFMT, + "copy [%s] (UID %lld) into %s memory " IDFMT, get_mapper_name(), index, + provenance.c_str(), mappable.get_unique_id(), memory_kinds[target_memory.kind()], target_memory.id); @@ -1204,19 +1210,21 @@ void BaseMapper::report_failed_mapping(const Mappable& mappable, if (redop > 0) logger.error( "Mapper %s failed to map reduction (%d) region " - "requirement %d of inline mapping (UID %lld) into %s memory " IDFMT, + "requirement %d of inline mapping [%s] (UID %lld) into %s memory " IDFMT, get_mapper_name(), redop, index, + provenance.c_str(), mappable.get_unique_id(), memory_kinds[target_memory.kind()], target_memory.id); else logger.error( "Mapper %s failed to map region requirement %d of " - "inline mapping (UID %lld) into %s memory " IDFMT, + "inline mapping [%s] (UID %lld) into %s memory " IDFMT, get_mapper_name(), index, + provenance.c_str(), mappable.get_unique_id(), memory_kinds[target_memory.kind()], target_memory.id); diff --git a/typings/legion_cffi/lib.pyi b/typings/legion_cffi/lib.pyi index 11b43a823..2326b327c 100644 --- a/typings/legion_cffi/lib.pyi 
+++ b/typings/legion_cffi/lib.pyi @@ -93,6 +93,7 @@ def legion_attach_launcher_destroy(*args: Any) -> Any: ... def legion_attach_launcher_execute(*args: Any) -> Any: ... def legion_attach_launcher_set_mapped(*args: Any) -> Any: ... def legion_attach_launcher_set_restricted(*args: Any) -> Any: ... +def legion_attach_launcher_set_provenance(*args: Any) -> Any: ... def legion_auto_generate_id(*args: Any) -> Any: ... def legion_context_consensus_match(*args: Any) -> Any: ... def legion_context_progress_unordered_operations(*args: Any) -> Any: ... @@ -166,6 +167,7 @@ def legion_index_attach_launcher_set_deduplicate_across_shards( *args: Any, ) -> Any: ... def legion_index_attach_launcher_set_restricted(*args: Any) -> Any: ... +def legion_index_attach_launcher_set_provenance(*args: Any) -> Any: ... def legion_index_copy_launcher_add_dst_field(*args: Any) -> Any: ... def legion_index_copy_launcher_add_dst_indirect_region_requirement_logical_partition( *args: Any, @@ -268,6 +270,7 @@ def legion_inline_launcher_add_field(*args: Any) -> Any: ... def legion_inline_launcher_create_logical_region(*args: Any) -> Any: ... def legion_inline_launcher_destroy(*args: Any) -> Any: ... def legion_inline_launcher_execute(*args: Any) -> Any: ... +def legion_inline_launcher_set_provenance(*args: Any) -> Any: ... def legion_issue_timing_op_seconds(*args: Any) -> Any: ... def legion_issue_timing_op_microseconds(*args: Any) -> Any: ... def legion_issue_timing_op_nanoseconds(*args: Any) -> Any: ... From 07d272313526810ebc5a8d8968f3b0db2deb55f5 Mon Sep 17 00:00:00 2001 From: Wonchan Lee Date: Wed, 12 Oct 2022 00:10:31 -0700 Subject: [PATCH 014/121] Handle scalar outputs correctly in manual tasks (#432) * Handle scalar outputs correctly in manual tasks * Increase the max return size --- legate/core/operation.py | 2 ++ src/core/task/task.h | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/legate/core/operation.py b/legate/core/operation.py index 324c341e3..b53612d5e 100644 --- a/legate/core/operation.py +++ b/legate/core/operation.py @@ -694,6 +694,7 @@ def add_output( ) if arg.kind is Future: self._scalar_outputs.append(len(self._outputs)) + self._outputs.append(arg) self._output_parts.append(arg.partition(REPLICATE)) else: self._output_parts.append(arg) @@ -709,6 +710,7 @@ def add_reduction( if isinstance(arg, Store): if arg.kind is Future: self._scalar_reductions.append(len(self._reductions)) + self._reductions.append((arg, redop)) self._reduction_parts.append((arg.partition(REPLICATE), redop)) else: self._reduction_parts.append((arg, redop)) diff --git a/src/core/task/task.h b/src/core/task/task.h index 904d71f51..f86e9987c 100644 --- a/src/core/task/task.h +++ b/src/core/task/task.h @@ -32,8 +32,8 @@ namespace legate { -// We're going to allow for each task to use only up to 170 scalar output stores -constexpr size_t LEGATE_MAX_SIZE_SCALAR_RETURN = 2048; +// We're going to allow for each task to use only up to 341 scalar output stores +constexpr size_t LEGATE_MAX_SIZE_SCALAR_RETURN = 4096; using LegateVariantImpl = void (*)(TaskContext&); From c304e1f834a40fac8ab376ccaddfb7aacd3fa3a5 Mon Sep 17 00:00:00 2001 From: Paul Taylor Date: Wed, 12 Oct 2022 10:11:30 -0700 Subject: [PATCH 015/121] force CPM to download Legion if legion_dir or legion_src_dir is not explicitly provided (#411) Co-authored-by: Bryan Van de Ven --- cmake/thirdparty/get_legion.cmake | 20 +++++++++++--------- install.py | 4 +++- 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/cmake/thirdparty/get_legion.cmake 
b/cmake/thirdparty/get_legion.cmake index 344fc2d2c..5faf54023 100644 --- a/cmake/thirdparty/get_legion.cmake +++ b/cmake/thirdparty/get_legion.cmake @@ -37,16 +37,18 @@ function(find_or_configure_legion) BUILD_EXPORT_SET legate-core-exports INSTALL_EXPORT_SET legate-core-exports) - # First try to find Legion via find_package() - # so the `Legion_USE_*` variables are visible - # Use QUIET find by default. - set(_find_mode QUIET) - # If Legion_DIR/Legion_ROOT are defined as something other than empty or NOTFOUND - # use a REQUIRED find so that the build does not silently download Legion. - if(Legion_DIR OR Legion_ROOT) - set(_find_mode REQUIRED) + if((NOT CPM_Legion_SOURCE) AND (NOT CPM_DOWNLOAD_Legion)) + # First try to find Legion via find_package() + # so the `Legion_USE_*` variables are visible + # Use QUIET find by default. + set(_find_mode QUIET) + # If Legion_DIR/Legion_ROOT are defined as something other than empty or NOTFOUND + # use a REQUIRED find so that the build does not silently download Legion. + if(Legion_DIR OR Legion_ROOT) + set(_find_mode REQUIRED) + endif() + rapids_find_package(Legion ${PKG_VERSION} EXACT CONFIG ${_find_mode} ${FIND_PKG_ARGS}) endif() - rapids_find_package(Legion ${PKG_VERSION} EXACT CONFIG ${_find_mode} ${FIND_PKG_ARGS}) if(Legion_FOUND) message(STATUS "CPM: using local package Legion@${PKG_VERSION}") diff --git a/install.py b/install.py index 6566ca68b..6cd0ca3a7 100755 --- a/install.py +++ b/install.py @@ -454,8 +454,10 @@ def validate_path(path): cmake_flags += ["-DThrust_ROOT=%s" % thrust_dir] if legion_dir: cmake_flags += ["-DLegion_ROOT=%s" % legion_dir] - if legion_src_dir: + elif legion_src_dir: cmake_flags += ["-DCPM_Legion_SOURCE=%s" % legion_src_dir] + else: + cmake_flags += ["-DCPM_DOWNLOAD_Legion=ON"] if legion_url: cmake_flags += ["-Dlegate_core_LEGION_REPOSITORY=%s" % legion_url] if legion_branch: From eb0fcc3bebb6b6682108cd9bad6d14f7e5ed277b Mon Sep 17 00:00:00 2001 From: Bryan Van de Ven Date: Wed, 12 Oct 2022 11:21:01 -0700 Subject: [PATCH 016/121] remove --install-dir option (#430) * remove --install-dir option * remove --python-lib --- bin/legate | 23 ------------ install.py | 76 ++++++++++++--------------------------- legate/__main__.py | 24 ------------- legate/driver/__init__.py | 9 ++++- setup.py | 6 ++-- 5 files changed, 32 insertions(+), 106 deletions(-) delete mode 100644 bin/legate delete mode 100755 legate/__main__.py diff --git a/bin/legate b/bin/legate deleted file mode 100644 index caa4983ec..000000000 --- a/bin/legate +++ /dev/null @@ -1,23 +0,0 @@ -#! 
/usr/bin/env python3 -# -*- coding: utf-8 -*- -import re -import sys -import os -import pathlib - -prefix_dir = os.path.dirname(os.path.dirname(__file__)) -for path in sys.path[:]: # slice to void inf append loop - parts = pathlib.Path(path).parts - if "lib" in parts: - relative_path = parts[parts.index("lib"):] - new_prefix_path = os.path.join(prefix_dir, *relative_path) - if os.path.exists(new_prefix_path): - sys.path.append(new_prefix_path) - -from legate.driver import main - -if __name__ == '__main__': - # if legate is installed into a non-standard location, the legate - # libraries may not be available in the python import path - sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0]) - sys.exit(main(sys.argv)) diff --git a/install.py b/install.py index 6cd0ca3a7..6105411e3 100755 --- a/install.py +++ b/install.py @@ -19,7 +19,6 @@ import multiprocessing import os import platform -import re import shutil import subprocess import sys @@ -145,26 +144,26 @@ def was_previously_built_with_different_build_isolation( return False -def get_install_dir_or_default(install_dir): - # If no install dir was passed on the command line, infer the location - # of where to install the Legion Python bindings, otherwise they'll only - # be installed into the local scikit-build cmake-install dir - if install_dir is None: - # Install into conda prefix if defined - if "CONDA_PREFIX" in os.environ: - install_dir = os.environ["CONDA_PREFIX"] - else: - import site - - # Try to install into user site packages first? - if site.ENABLE_USER_SITE and os.path.exists( - site_pkgs := site.getusersitepackages() - ): - install_dir = site_pkgs - # Otherwise fallback to regular site-packages? - elif os.path.exists(site_pkgs := site.getsitepackages()): - install_dir = site_pkgs - return install_dir +def get_install_dir(): + # Infer the location where to install the Legion Python bindings, + # otherwise they'll only be installed into the local scikit-build + # cmake-install dir + + # Install into conda prefix if defined + if "CONDA_PREFIX" in os.environ: + return os.environ["CONDA_PREFIX"] + + import site + + # Try to install into user site packages first? + if site.ENABLE_USER_SITE and os.path.exists( + user_site_pkgs := site.getusersitepackages() + ): + return user_site_pkgs + + # Otherwise fallback to regular site-packages? 
+ if os.path.exists(site_pkgs := site.getsitepackages()): + return site_pkgs def install_legion_python_bindings( @@ -246,9 +245,7 @@ def install( nccl_dir, cmake_exe, cmake_generator, - install_dir, gasnet_dir, - pylib_name, cuda_dir, maxdim, maxfields, @@ -291,9 +288,7 @@ def install( print("nccl_dir:", nccl_dir) print("cmake_exe:", cmake_exe) print("cmake_generator:", cmake_generator) - print("install_dir:", install_dir) print("gasnet_dir:", gasnet_dir) - print("pylib_name:", pylib_name) print("cuda_dir:", cuda_dir) print("maxdim:", maxdim) print("maxfields:", maxfields) @@ -319,14 +314,7 @@ def install( legate_core_dir = dirname(realpath(__file__)) - if pylib_name is None: - pyversion, pylib_name = find_active_python_version_and_path() - else: - f_name = os.path.split(pylib_name)[-1] - match = re.match(r"^libpython(\d\d?\.\d\d?)", f_name) - e = "Unable to get version from library name {}".format(pylib_name) - assert match, e - pyversion = match.group(1) + pyversion, pylib_name = find_active_python_version_and_path() print("Using python lib and version: {}, {}".format(pylib_name, pyversion)) def validate_path(path): @@ -388,7 +376,7 @@ def validate_path(path): except Exception: pass - install_dir = get_install_dir_or_default(validate_path(install_dir)) + install_dir = get_install_dir() if verbose: print("install_dir: ", install_dir) @@ -485,14 +473,6 @@ def validate_path(path): def driver(): parser = argparse.ArgumentParser(description="Install Legate front end.") - parser.add_argument( - "--install-dir", - dest="install_dir", - metavar="DIR", - required=False, - default=None, - help="Path to install all Legate-related software", - ) parser.add_argument( "--debug", dest="debug", @@ -630,18 +610,6 @@ def driver(): default=os.environ.get("NCCL_PATH"), help="Path to NCCL installation directory.", ) - parser.add_argument( - "--python-lib", - dest="pylib_name", - action="store", - required=False, - default=None, - help=( - "Build Legate against the specified Python shared library. " - "Default is to use the Python library currently executing this " - "install script." - ), - ) parser.add_argument( "--with-cmake", dest="cmake_exe", diff --git a/legate/__main__.py b/legate/__main__.py deleted file mode 100755 index e7cc19d7b..000000000 --- a/legate/__main__.py +++ /dev/null @@ -1,24 +0,0 @@ -#!/usr/bin/env python - -# Copyright 2021-2022 NVIDIA Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# -from __future__ import annotations - -import sys - -from .driver import main - -if __name__ == "__main__": - sys.exit(main(sys.argv)) diff --git a/legate/driver/__init__.py b/legate/driver/__init__.py index b8496597d..67ce493b8 100644 --- a/legate/driver/__init__.py +++ b/legate/driver/__init__.py @@ -17,4 +17,11 @@ from .config import Config from .driver import Driver from .launcher import Launcher -from .main import main + + +def main() -> int: + import sys + + from .main import main as _main + + return _main(sys.argv) diff --git a/setup.py b/setup.py index a59dadd8f..24e358eb0 100755 --- a/setup.py +++ b/setup.py @@ -62,13 +62,11 @@ include_package_data=True, entry_points={ "console_scripts": [ + "legate = legate.driver:main", "lgpatch = legate.lgpatch:main", ], }, - scripts=[ - "bind.sh", - "bin/legate", - ], + scripts=["bind.sh"], cmdclass=versioneer.get_cmdclass(), install_requires=["numpy>=1.22"], zip_safe=False, From 1be0d978eb823ba233af4c35501d6c06bc7043be Mon Sep 17 00:00:00 2001 From: Bryan Van de Ven Date: Wed, 12 Oct 2022 15:03:55 -0700 Subject: [PATCH 017/121] Fix GPU shard computation (#433) --- legate/tester/stages/_linux/gpu.py | 2 +- tests/unit/legate/tester/stages/_linux/test_gpu.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/legate/tester/stages/_linux/gpu.py b/legate/tester/stages/_linux/gpu.py index f1a222fc0..64f625c00 100644 --- a/legate/tester/stages/_linux/gpu.py +++ b/legate/tester/stages/_linux/gpu.py @@ -69,7 +69,7 @@ def compute_spec(self, config: Config, system: TestSystem) -> StageSpec: N = len(system.gpus) degree = N // config.gpus - fbsize = min(gpu.total for gpu in system.gpus) / (2 << 20) # MB + fbsize = min(gpu.total for gpu in system.gpus) / (1 << 20) # MB oversub_factor = int(fbsize // (config.fbmem * BLOAT_FACTOR)) workers = adjust_workers( degree * oversub_factor, config.requested_workers diff --git a/tests/unit/legate/tester/stages/_linux/test_gpu.py b/tests/unit/legate/tester/stages/_linux/test_gpu.py index df1441c65..8d792b7b3 100644 --- a/tests/unit/legate/tester/stages/_linux/test_gpu.py +++ b/tests/unit/legate/tester/stages/_linux/test_gpu.py @@ -55,16 +55,16 @@ def test_spec_with_gpus_1() -> None: c = Config(["test.py", "--gpus", "1"]) s = FakeSystem() stage = m.GPU(c, s) - assert stage.spec.workers == 12 - assert stage.spec.shards == [(0,), (1,), (2,), (3,), (4,), (5,)] * 12 + assert stage.spec.workers == 24 + assert stage.spec.shards == [(0,), (1,), (2,), (3,), (4,), (5,)] * 24 def test_spec_with_gpus_2() -> None: c = Config(["test.py", "--gpus", "2"]) s = FakeSystem() stage = m.GPU(c, s) - assert stage.spec.workers == 6 - assert stage.spec.shards == [(0, 1), (2, 3), (4, 5)] * 6 + assert stage.spec.workers == 12 + assert stage.spec.shards == [(0, 1), (2, 3), (4, 5)] * 12 def test_spec_with_requested_workers() -> None: From b2b6228cc30b58b72b6644a09866bb016063bc74 Mon Sep 17 00:00:00 2001 From: Jeremy Date: Wed, 12 Oct 2022 15:28:08 -0700 Subject: [PATCH 018/121] Only set default CMake generator if Ninja is available: Issue #374 (#379) * Only set default generator if Ninja is available * Address PR comments, fix typos Co-authored-by: Manolis Papadakis Co-authored-by: Manolis Papadakis --- install.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/install.py b/install.py index 6105411e3..93cd215d2 100755 --- a/install.py +++ b/install.py @@ -401,7 +401,7 @@ def validate_path(path): cmake_flags = [] if cmake_generator: - cmake_flags += [f"-G{cmake_generator}"] + cmake_flags += 
[f"-G'{cmake_generator}'"] if debug or verbose: cmake_flags += ["--log-level=%s" % ("DEBUG" if debug else "VERBOSE")] @@ -622,8 +622,8 @@ def driver(): "--cmake-generator", dest="cmake_generator", required=False, - default="Ninja", - choices=["Ninja", "Unix Makefiles"], + default=(None if shutil.which("ninja") is None else "Ninja"), + choices=["Ninja", "Unix Makefiles", None], help="The CMake makefiles generator", ) parser.add_argument( From c48b62cf427a05027bc75ea9a97149b8624362ef Mon Sep 17 00:00:00 2001 From: Bryan Van de Ven Date: Wed, 12 Oct 2022 15:59:29 -0700 Subject: [PATCH 019/121] . (#434) --- legate/util/fs.py | 80 ++++++++++++++------------- tests/unit/legate/driver/test_main.py | 5 +- 2 files changed, 45 insertions(+), 40 deletions(-) diff --git a/legate/util/fs.py b/legate/util/fs.py index e05e15279..e3ea9e958 100644 --- a/legate/util/fs.py +++ b/legate/util/fs.py @@ -281,50 +281,52 @@ def installed_legion_paths(legion_dir: Path) -> LegionPaths: cmake_cache_txt = legate_build_dir / "CMakeCache.txt" try: - # Test whether Legion_DIR is set. If it isn't, then we built Legion as - # a side-effect of building legate_core - read_cmake_cache_value( - cmake_cache_txt, "Legion_DIR:PATH=Legion_DIR-NOTFOUND" - ) - except Exception: - # If Legion_DIR is a valid path, check whether it's a - # Legion build dir, i.e. `-D Legion_ROOT=/legion/build` - legion_dir = Path( - read_cmake_cache_value(cmake_cache_txt, "Legion_DIR:PATH=") - ) - if legion_dir.joinpath("CMakeCache.txt").exists(): - cmake_cache_txt = legion_dir / "CMakeCache.txt" - - try: - # If Legion_SOURCE_DIR and Legion_BINARY_DIR are in CMakeCache.txt, - # return the paths to Legion in the legate_core build dir. - legion_source_dir = Path( + try: + # Test whether Legion_DIR is set. If it isn't, then we built + # Legion as a side-effect of building legate_core read_cmake_cache_value( - cmake_cache_txt, "Legion_SOURCE_DIR:STATIC=" + cmake_cache_txt, "Legion_DIR:PATH=Legion_DIR-NOTFOUND" ) - ) - legion_binary_dir = Path( - read_cmake_cache_value( - cmake_cache_txt, "Legion_BINARY_DIR:STATIC=" + except Exception: + # If Legion_DIR is a valid path, check whether it's a + # Legion build dir, i.e. `-D Legion_ROOT=/legion/build` + legion_dir = Path( + read_cmake_cache_value(cmake_cache_txt, "Legion_DIR:PATH=") ) - ) + if legion_dir.joinpath("CMakeCache.txt").exists(): + cmake_cache_txt = legion_dir / "CMakeCache.txt" - legion_runtime_dir = legion_binary_dir / "runtime" - legion_bindings_dir = legion_source_dir / "bindings" - - return LegionPaths( - legion_bin_path=legion_binary_dir / "bin", - legion_lib_path=legion_binary_dir / "lib", - realm_defines_h=legion_runtime_dir / "realm_defines.h", - legion_defines_h=legion_runtime_dir / "legion_defines.h", - legion_spy_py=legion_source_dir / "tools" / "legion_spy.py", - legion_prof_py=legion_source_dir / "tools" / "legion_prof.py", - legion_python=legion_binary_dir / "bin" / "legion_python", - legion_module=legion_bindings_dir / "python" / "build" / "lib", - legion_jupyter_module=legion_source_dir / "jupyter_notebook", - ) except Exception: - pass + try: + # If Legion_SOURCE_DIR and Legion_BINARY_DIR are in CMakeCache.txt, + # return the paths to Legion in the legate_core build dir. 
+ legion_source_dir = Path( + read_cmake_cache_value( + cmake_cache_txt, "Legion_SOURCE_DIR:STATIC=" + ) + ) + legion_binary_dir = Path( + read_cmake_cache_value( + cmake_cache_txt, "Legion_BINARY_DIR:STATIC=" + ) + ) + + legion_runtime_dir = legion_binary_dir / "runtime" + legion_bindings_dir = legion_source_dir / "bindings" + + return LegionPaths( + legion_bin_path=legion_binary_dir / "bin", + legion_lib_path=legion_binary_dir / "lib", + realm_defines_h=legion_runtime_dir / "realm_defines.h", + legion_defines_h=legion_runtime_dir / "legion_defines.h", + legion_spy_py=legion_source_dir / "tools" / "legion_spy.py", + legion_prof_py=legion_source_dir / "tools" / "legion_prof.py", + legion_python=legion_binary_dir / "bin" / "legion_python", + legion_module=legion_bindings_dir / "python" / "build" / "lib", + legion_jupyter_module=legion_source_dir / "jupyter_notebook", + ) + except Exception: + pass # Otherwise return the installation paths. return installed_legion_paths(Path(sys.argv[0]).parents[1]) diff --git a/tests/unit/legate/driver/test_main.py b/tests/unit/legate/driver/test_main.py index 4c0260abb..0992a226b 100644 --- a/tests/unit/legate/driver/test_main.py +++ b/tests/unit/legate/driver/test_main.py @@ -14,6 +14,8 @@ # from __future__ import annotations +import sys + from pytest_mock import MockerFixture import legate.driver as m @@ -34,8 +36,9 @@ def test_main(mocker: MockerFixture) -> None: system_spy = mocker.spy(legate.util.system.System, "__init__") driver_spy = mocker.spy(legate.driver.driver.Driver, "__init__") mocker.patch("legate.driver.driver.Driver.run", return_value=123) + mocker.patch.object(sys, "argv", ["foo", "bar"]) - result = m.main(["foo", "bar"]) + result = m.main() assert config_spy.call_count == 1 assert config_spy.call_args[0][1:] == (["foo", "bar"],) From 9f3894dff7010b02fc6bb8d771c4cf4c3680ca3a Mon Sep 17 00:00:00 2001 From: Jeremy Date: Thu, 13 Oct 2022 13:45:21 -0700 Subject: [PATCH 020/121] Allow only one of --legion-dir and --legion-src-dir (#387) * skip finding legion installation if given legion source override * check that only one of legion-dir and legion-src-dir is given * Revert "skip finding legion installation if given legion source override" This reverts commit 80a0f7ebdd7b6cc9eddfe3a8e4ebacfd08928845. 
Co-authored-by: Manolis Papadakis --- install.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/install.py b/install.py index 93cd215d2..e3303ae72 100755 --- a/install.py +++ b/install.py @@ -274,6 +274,9 @@ def install( if clean_first is None: clean_first = not editable + if legion_dir is not None and legion_src_dir is not None: + sys.exit("Cannot specify both --legion-dir and --legion-src-dir") + print("Verbose build is ", "on" if verbose else "off") if verbose: print("networks:", networks) From db41dbb0e578cc74b25c917cd6de6f9c66dcee74 Mon Sep 17 00:00:00 2001 From: Bryan Van de Ven Date: Thu, 13 Oct 2022 15:28:41 -0700 Subject: [PATCH 021/121] Report pytest exit code on fail (#436) * report process exit code on test failure * report process exit code on test failure --- legate/tester/stages/util.py | 2 +- legate/util/ui.py | 10 +- tests/unit/legate/tester/stages/test_util.py | 109 +++++++++++++++++++ tests/unit/legate/util/test_ui.py | 32 ++++++ 4 files changed, 149 insertions(+), 4 deletions(-) diff --git a/legate/tester/stages/util.py b/legate/tester/stages/util.py index 2d6514877..27d53bbd1 100644 --- a/legate/tester/stages/util.py +++ b/legate/tester/stages/util.py @@ -112,4 +112,4 @@ def log_proc( elif proc.returncode == 0: LOG(passed(msg, details=details)) else: - LOG(failed(msg, details=details)) + LOG(failed(msg, details=details, exit_code=proc.returncode)) diff --git a/legate/util/ui.py b/legate/util/ui.py index 9cf74b094..2e4af8fbc 100644 --- a/legate/util/ui.py +++ b/legate/util/ui.py @@ -110,7 +110,9 @@ def error(text: str) -> str: return red(f"ERROR: {text}") -def failed(msg: str, *, details: Details | None = None) -> str: +def failed( + msg: str, *, details: Details | None = None, exit_code: int | None = None +) -> str: """Report a failed test result with a bright red [FAIL]. 
Parameters @@ -122,9 +124,11 @@ def failed(msg: str, *, details: Details | None = None) -> str: A sequenece of text lines to diplay below the ``msg`` line """ + fail = f"{bright(red('[FAIL]'))}" + exit = f"{bright(white(f' (exit: {exit_code}) '))}" if exit_code else "" if details: - return f"{bright(red('[FAIL]'))} {msg}\n{_format_details(details)}" - return f"{bright(red('[FAIL]'))} {msg}" + return f"{fail} {msg}{exit}\n{_format_details(details)}" + return f"{fail} {msg}{exit}" def passed(msg: str, *, details: Details | None = None) -> str: diff --git a/tests/unit/legate/tester/stages/test_util.py b/tests/unit/legate/tester/stages/test_util.py index b4c528d06..f97174de8 100644 --- a/tests/unit/legate/tester/stages/test_util.py +++ b/tests/unit/legate/tester/stages/test_util.py @@ -17,9 +17,26 @@ """ from __future__ import annotations +from pathlib import Path + import pytest +from legate.tester.config import Config +from legate.tester.logger import LOG from legate.tester.stages import util as m +from legate.tester.test_system import ProcessResult +from legate.util.ui import failed, passed, shell, skipped + + +def test_StageResult() -> None: + procs = [ProcessResult(f"run{i}", Path(f"test{i}")) for i in range(10)] + procs[2].returncode = 10 + procs[7].returncode = -2 + + result = m.StageResult(procs=procs, time=0) + + assert result.total == 10 + assert result.passed == 8 class Test_adjust_workers: @@ -46,3 +63,95 @@ def test_zero_computed(self) -> None: def test_requested_too_large(self) -> None: with pytest.raises(RuntimeError): assert m.adjust_workers(10, 11) + + +class Test_log_proc: + @pytest.mark.parametrize("returncode", (-23, -1, 0, 1, 17)) + def test_skipped(self, returncode) -> None: + config = Config([]) + proc = ProcessResult( + "proc", Path("proc"), skipped=True, returncode=returncode + ) + + LOG.clear() + m.log_proc("foo", proc, config, verbose=False) + + assert LOG.lines == (skipped(f"(foo) {proc.test_file}"),) + + def test_passed(self) -> None: + config = Config([]) + proc = ProcessResult("proc", Path("proc")) + + LOG.clear() + m.log_proc("foo", proc, config, verbose=False) + + assert LOG.lines == (passed(f"(foo) {proc.test_file}"),) + + def test_passed_verbose(self) -> None: + config = Config([]) + proc = ProcessResult("proc", Path("proc"), output="foo\nbar") + details = proc.output.split("\n") + + LOG.clear() + m.log_proc("foo", proc, config, verbose=True) + + assert LOG.lines == tuple( + passed(f"(foo) {proc.test_file}", details=details).split("\n") + ) + + @pytest.mark.parametrize("returncode", (-23, -1, 1, 17)) + def test_failed(self, returncode) -> None: + config = Config([]) + proc = ProcessResult("proc", Path("proc"), returncode=returncode) + + LOG.clear() + m.log_proc("foo", proc, config, verbose=False) + + assert LOG.lines == ( + failed(f"(foo) {proc.test_file}", exit_code=returncode), + ) + + @pytest.mark.parametrize("returncode", (-23, -1, 1, 17)) + def test_failed_verbose(self, returncode) -> None: + config = Config([]) + proc = ProcessResult( + "proc", Path("proc"), returncode=returncode, output="foo\nbar" + ) + details = proc.output.split("\n") + + LOG.clear() + m.log_proc("foo", proc, config, verbose=True) + + assert LOG.lines == tuple( + failed( + f"(foo) {proc.test_file}", + details=details, + exit_code=returncode, + ).split("\n") + ) + + def test_dry_run(self) -> None: + config = Config([]) + config.dry_run = True + proc = ProcessResult("proc", Path("proc")) + + LOG.clear() + m.log_proc("foo", proc, config, verbose=False) + + assert LOG.lines == ( + 
shell(proc.invocation), + passed(f"(foo) {proc.test_file}"), + ) + + def test_debug(self) -> None: + config = Config([]) + config.debug = True + proc = ProcessResult("proc", Path("proc")) + + LOG.clear() + m.log_proc("foo", proc, config, verbose=False) + + assert LOG.lines == ( + shell(proc.invocation), + passed(f"(foo) {proc.test_file}"), + ) diff --git a/tests/unit/legate/util/test_ui.py b/tests/unit/legate/util/test_ui.py index a9ac7d890..4603c053c 100644 --- a/tests/unit/legate/util/test_ui.py +++ b/tests/unit/legate/util/test_ui.py @@ -320,6 +320,19 @@ def test_failed_plain(use_plain_text: UsePlainTextFixture) -> None: assert m.failed("msg") == "[FAIL] msg" +@pytest.mark.skipif(colorama is None, reason="colorama required") +def test_failed_with_exit_code() -> None: + fail = colors.bright(colors.red("[FAIL]")) + exit = colors.bright(colors.white(" (exit: 10) ")) + assert m.failed("msg", exit_code=10) == f"{fail} msg{exit}" # noqa + + +def test_failed_with_exit_code_plain( + use_plain_text: UsePlainTextFixture, +) -> None: + assert m.failed("msg", exit_code=10) == "[FAIL] msg (exit: 10) " + + @pytest.mark.skipif(colorama is None, reason="colorama required") def test_failed_with_details() -> None: assert ( @@ -334,6 +347,25 @@ def test_failed_with_details_plain( assert m.failed("msg", details=["a", "b"]) == "[FAIL] msg\n a\n b" +@pytest.mark.skipif(colorama is None, reason="colorama required") +def test_failed_with_details_and_exit_code() -> None: + fail = colors.bright(colors.red("[FAIL]")) + exit = colors.bright(colors.white(" (exit: 10) ")) + assert ( + m.failed("msg", details=["a", "b"], exit_code=10) + == f"{fail} msg{exit}\n a\n b" + ) + + +def test_failed_with_details_and_exit_code_plain( + use_plain_text: UsePlainTextFixture, +) -> None: + assert ( + m.failed("msg", details=["a", "b"], exit_code=10) + == "[FAIL] msg (exit: 10) \n a\n b" + ) + + @pytest.mark.skipif(colorama is None, reason="colorama required") def test_skipped() -> None: assert m.skipped("msg") == f"{colors.cyan('[SKIP]')} msg" From 39a3b4c9f0e9c9b5e4e4dd39ffe2e9394f7c6df1 Mon Sep 17 00:00:00 2001 From: Wonchan Lee Date: Thu, 13 Oct 2022 23:30:01 -0700 Subject: [PATCH 022/121] API to declare tasks with side effects (#437) --- legate/core/operation.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/legate/core/operation.py b/legate/core/operation.py index b53612d5e..a3b2aef17 100644 --- a/legate/core/operation.py +++ b/legate/core/operation.py @@ -245,6 +245,7 @@ def __init__( self._comm_args: list[Communicator] = [] self._exn_types: list[type] = [] self._tb: Union[None, TracebackType] = None + self._side_effect = False @property def uses_communicator(self) -> bool: @@ -557,6 +558,7 @@ def launch(self, strategy: Strategy) -> None: self.context, self._task_id, self.mapper_id, + side_effect=self._side_effect, provenance=self.provenance, ) @@ -742,8 +744,9 @@ def launch(self, strategy: Strategy) -> None: self.context, self._task_id, self.mapper_id, - error_on_interference=False, tag=tag, + error_on_interference=False, + side_effect=self._side_effect, provenance=self.provenance, ) From 349182ba218603fcf43c73ed1c4d489f94084097 Mon Sep 17 00:00:00 2001 From: Rohan Yadav Date: Fri, 14 Oct 2022 11:40:35 -0700 Subject: [PATCH 023/121] legate/util: fix a mypy error on MacOS (#438) Fixes #435. 
Signed-off-by: Rohan Yadav Signed-off-by: Rohan Yadav --- legate/util/system.py | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/legate/util/system.py b/legate/util/system.py index 702514cc2..5fbabe1d0 100644 --- a/legate/util/system.py +++ b/legate/util/system.py @@ -90,16 +90,20 @@ def cpus(self) -> tuple[CPUInfo, ...]: if sys.platform == "darwin": return tuple(CPUInfo((i,)) for i in range(N)) - - sibling_sets: set[tuple[int, ...]] = set() - for i in range(N): - line = open( - f"/sys/devices/system/cpu/cpu{i}/topology/thread_siblings_list" - ).read() - sibling_sets.add( - tuple(sorted(int(x) for x in line.strip().split(","))) + else: + # This explicit else is needed for mypy to not raise a type + # error on MacOS. + sibling_sets: set[tuple[int, ...]] = set() + for i in range(N): + line = open( + f"/sys/devices/system/cpu/cpu{i}/topology/thread_siblings_list" # noqa E501 + ).read() + sibling_sets.add( + tuple(sorted(int(x) for x in line.strip().split(","))) + ) + return tuple( + CPUInfo(siblings) for siblings in sorted(sibling_sets) ) - return tuple(CPUInfo(siblings) for siblings in sorted(sibling_sets)) @cached_property def gpus(self) -> tuple[GPUInfo, ...]: From ad493f111c96a5d008c809f740a75c0995807575 Mon Sep 17 00:00:00 2001 From: Bryan Van de Ven Date: Mon, 17 Oct 2022 12:49:51 -0700 Subject: [PATCH 024/121] Improvements to legate.jupyter (#425) * Improvements to legate.jupyter * use absolute paths * define entry point * update docs * always print minimal status * first batch of tests * add kernelspec tests * Apply suggestions from code review * Update README.md --- README.md | 68 ++++--- legate/driver/args.py | 176 ++++-------------- legate/driver/command.py | 54 +++--- legate/driver/config.py | 24 ++- legate/driver/driver.py | 4 +- legate/driver/launcher.py | 16 +- legate/driver/logs.py | 8 +- legate/jupyter/__init__.py | 22 ++- legate/jupyter/__main__.py | 39 ---- legate/jupyter/_legion_kernel.py | 60 +++++++ legate/jupyter/_magic_cmd.py | 78 -------- legate/jupyter/args.py | 107 +++++++++++ legate/jupyter/config.py | 87 +++++++++ legate/jupyter/kernel.py | 128 +++++++++++++ legate/jupyter/magic.py | 103 +++++++++++ legate/jupyter/main.py | 37 ++++ legate/tester/config.py | 4 +- legate/util/args.py | 2 +- legate/util/shared_args.py | 207 ++++++++++++++++++++++ setup.py | 1 + tests/unit/legate/driver/test_args.py | 4 - tests/unit/legate/driver/test_driver.py | 2 +- tests/unit/legate/driver/test_launcher.py | 2 +- tests/unit/legate/driver/test_main.py | 2 +- tests/unit/legate/jupyter/__init__.py | 15 ++ tests/unit/legate/jupyter/test_args.py | 104 +++++++++++ tests/unit/legate/jupyter/test_config.py | 129 ++++++++++++++ tests/unit/legate/jupyter/test_kernel.py | 172 ++++++++++++++++++ tests/unit/legate/jupyter/test_main.py | 73 ++++++++ tests/unit/legate/util/test_args.py | 13 +- typings/IPython/__init__.pyi | 20 +++ typings/IPython/core/magic.pyi | 28 +++ typings/jupyter_client/__init__.pyi | 0 typings/jupyter_client/kernelspec.pyi | 40 +++++ 34 files changed, 1469 insertions(+), 360 deletions(-) delete mode 100644 legate/jupyter/__main__.py create mode 100644 legate/jupyter/_legion_kernel.py delete mode 100644 legate/jupyter/_magic_cmd.py create mode 100755 legate/jupyter/args.py create mode 100644 legate/jupyter/config.py create mode 100644 legate/jupyter/kernel.py create mode 100644 legate/jupyter/magic.py create mode 100644 legate/jupyter/main.py create mode 100644 legate/util/shared_args.py create mode 100644 
tests/unit/legate/jupyter/__init__.py create mode 100644 tests/unit/legate/jupyter/test_args.py create mode 100644 tests/unit/legate/jupyter/test_config.py create mode 100644 tests/unit/legate/jupyter/test_kernel.py create mode 100644 tests/unit/legate/jupyter/test_main.py create mode 100644 typings/IPython/__init__.pyi create mode 100644 typings/IPython/core/magic.pyi create mode 100644 typings/jupyter_client/__init__.pyi create mode 100644 typings/jupyter_client/kernelspec.pyi diff --git a/README.md b/README.md index 019655092..ff1142695 100644 --- a/README.md +++ b/README.md @@ -452,28 +452,22 @@ that can adversely effect the performance of the application. Same as normal Python programs, Legate programs can be run using Jupyter Notebook. Currently we support single node execution with multiple CPUs and GPUs, and plan to support multi-node execution in the future. -We leverage Legion's Jupyter support, so you may want to refer to the +We leverage Legion's Jupyter support, so you may want to refer to the [relevant section in Legion's README](https://github.com/StanfordLegion/legion/blob/master/jupyter_notebook/README.md). -To simplify the installation, we provide a script specifically for Legate libraries. +To simplify the installation, we provide a script specifically for Legate libraries. ### Installation of the Legate IPython Kernel -Please install Legate, then run the following command to install the IPython -kernel: +Please install Legate, then run the following command to install a default +Jupyter kernel: ``` -python -m legate.jupyter --json=legate_jupyter.json +legate-jupyter ``` -If `--json=` is not provided, the installation script will look for a file -named `legate_jupyter.json` in the current directory. A sample -`legate_jupyter.json` file is provided in the legate.core source directory. - If installation is successful, you will see some output like the following: ``` -IPython kernel: legate_kernel_nocr(Legate_SM_GPU) has been installed +Jupyter kernel spec Legate_SM_GPU (Legate_SM_GPU) has been installed ``` -`Legate_SM_GPU` is the kernel name, and you will need to provide it -when starting the Jupyter Notebook. `SM` means the kernel is only for -shared memory execution; `GPU` means GPU support is enabled. +`Legate_SM_GPU` is the default kernel name. ### Running with Jupyter Notebook @@ -486,22 +480,13 @@ the Legion Jupyter Notebook extension: ### Configuring the Jupyter Notebook -The Legate IPython kernel is configured according to the json file provided at -install time. Here is an example of an entry in the json file: +The Legate Jupyter kernel is configured according to the command line arguments +provided at install time. Standard `legate` options for Core, Memory, and +Mult-node configuration may be provided, as well as a name for the kernel: ``` -"cpus": { - "cmd": "--cpus", - "value": 1 -} +legate-jupyter --name legate_cpus_2 --cpus 2 ``` -* `cpus` is the name of the field. - -* `cmd` is used to tell Jupyter how to pass the value for that field to Legate through the -CLI, in this case using `--cpus` to set the number of CPUs. - -* `value` is the value of the field. - -Other configuration options can be added by using the `other_options` field of the json file. +Other configuration options can be seen by using the `--help` command line option. ### Magic Command @@ -509,17 +494,24 @@ We provide a Jupyter magic command to display the IPython kernel configuration. 
``` %load_ext legate.jupyter %legate_info -Number of CPUs to use per rank: 4 -Number of GPUs to use per rank: 1 -Number of OpenMP groups to use per rank: 0 -Number of threads per OpenMP group: 4 -Number of Utility processors per rank: 2 -Amount of DRAM memory per rank (in MBs): 4000 -Amount of DRAM memory per NUMA domain per rank (in MBs): 0 -Amount of framebuffer memory per GPU (in MBs): 4000 -Amount of zero-copy memory per rank (in MBs): 32 -Amount of registered CPU-side pinned memory per rank (in MBs): 0 -Number of nodes to use: 1 +``` +results in output: +``` +Kernel 'Legate_SM_GPU' configured for 1 node(s) + +Cores: + CPUs to use per rank : 4 + GPUs to use per rank : 0 + OpenMP groups to use per rank : 0 + Threads per OpenMP group : 4 + Utility processors per rank : 2 + +Memory: + DRAM memory per rank (in MBs) : 4000 + DRAM memory per NUMA domain per rank (in MBs) : 0 + Framebuffer memory per GPU (in MBs) : 4000 + Zero-copy memory per rank (in MBs) : 32 + Registered CPU-side pinned memory per rank (in MBs) : 0 ``` ## Other FAQs diff --git a/legate/driver/args.py b/legate/driver/args.py index 739722170..cc8667384 100755 --- a/legate/driver/args.py +++ b/legate/driver/args.py @@ -18,13 +18,27 @@ from argparse import ArgumentDefaultsHelpFormatter, ArgumentParser -from ..util.types import LauncherType +from ..util.shared_args import ( + CPUS, + FBMEM, + GPUS, + LAUNCHER, + LAUNCHER_EXTRA, + NOCR, + NODES, + NUMAMEM, + OMPS, + OMPTHREADS, + RANKS_PER_NODE, + REGMEM, + SYSMEM, + UTILITY, + ZCMEM, +) from . import defaults __all__ = ("parser",) -LAUNCHERS: tuple[LauncherType, ...] = ("mpirun", "jsrun", "srun", "none") - parser = ArgumentParser( description="Legate Driver", allow_abbrev=False, @@ -33,58 +47,11 @@ multi_node = parser.add_argument_group("Multi-node configuration") - - -multi_node.add_argument( - "--nodes", - type=int, - default=defaults.LEGATE_NODES, - dest="nodes", - help="Number of nodes to use", -) - - -multi_node.add_argument( - "--ranks-per-node", - type=int, - default=defaults.LEGATE_RANKS_PER_NODE, - dest="ranks_per_node", - help="Number of ranks (processes running copies of the program) to " - "launch per node. The default (1 rank per node) will typically result " - "in the best performance.", -) - - -multi_node.add_argument( - "--no-replicate", - dest="not_control_replicable", - action="store_true", - required=False, - help="Execute this program without control replication. Most of the " - "time, this is not recommended. This option should be used for " - "debugging. 
The -lg:safe_ctrlrepl Legion option may be helpful " - "with discovering issues with replicated control.", -) - -multi_node.add_argument( - "--launcher", - dest="launcher", - choices=LAUNCHERS, - default="none", - help='launcher program to use (set to "none" for local runs, or if ' - "the launch has already happened by the time legate is invoked)", -) - - -multi_node.add_argument( - "--launcher-extra", - dest="launcher_extra", - action="append", - default=[], - required=False, - help="additional argument to pass to the launcher (can appear more " - "than once)", -) +multi_node.add_argument(NODES.name, **NODES.kwargs) +multi_node.add_argument(RANKS_PER_NODE.name, **RANKS_PER_NODE.kwargs) +multi_node.add_argument(NOCR.name, **NOCR.kwargs) +multi_node.add_argument(LAUNCHER.name, **LAUNCHER.kwargs) +multi_node.add_argument(LAUNCHER_EXTRA.name, **LAUNCHER_EXTRA.kwargs) binding = parser.add_argument_group("Hardware binding") @@ -124,98 +91,19 @@ core = parser.add_argument_group("Core alloction") - - -core.add_argument( - "--cpus", - type=int, - default=defaults.LEGATE_CPUS, - dest="cpus", - help="Number of CPUs to use per rank", -) - - -core.add_argument( - "--gpus", - type=int, - default=defaults.LEGATE_GPUS, - dest="gpus", - help="Number of GPUs to use per rank", -) - - -core.add_argument( - "--omps", - type=int, - default=defaults.LEGATE_OMP_PROCS, - dest="openmp", - help="Number of OpenMP groups to use per rank", -) - - -core.add_argument( - "--ompthreads", - type=int, - default=defaults.LEGATE_OMP_THREADS, - dest="ompthreads", - help="Number of threads per OpenMP group", -) - - -core.add_argument( - "--utility", - type=int, - default=defaults.LEGATE_UTILITY_CORES, - dest="utility", - help="Number of Utility processors per rank to request for meta-work", -) +core.add_argument(CPUS.name, **CPUS.kwargs) +core.add_argument(GPUS.name, **GPUS.kwargs) +core.add_argument(OMPS.name, **OMPS.kwargs) +core.add_argument(OMPTHREADS.name, **OMPTHREADS.kwargs) +core.add_argument(UTILITY.name, **UTILITY.kwargs) memory = parser.add_argument_group("Memory alloction") - -memory.add_argument( - "--sysmem", - type=int, - default=defaults.LEGATE_SYSMEM, - dest="sysmem", - help="Amount of DRAM memory per rank (in MBs)", -) - - -memory.add_argument( - "--numamem", - type=int, - default=defaults.LEGATE_NUMAMEM, - dest="numamem", - help="Amount of DRAM memory per NUMA domain per rank (in MBs)", -) - - -memory.add_argument( - "--fbmem", - type=int, - default=defaults.LEGATE_FBMEM, - dest="fbmem", - help="Amount of framebuffer memory per GPU (in MBs)", -) - - -memory.add_argument( - "--zcmem", - type=int, - default=defaults.LEGATE_ZCMEM, - dest="zcmem", - help="Amount of zero-copy memory per rank (in MBs)", -) - - -memory.add_argument( - "--regmem", - type=int, - default=defaults.LEGATE_REGMEM, - dest="regmem", - help="Amount of registered CPU-side pinned memory per rank (in MBs)", -) +memory.add_argument(SYSMEM.name, **SYSMEM.kwargs) +memory.add_argument(NUMAMEM.name, **NUMAMEM.kwargs) +memory.add_argument(FBMEM.name, **FBMEM.kwargs) +memory.add_argument(ZCMEM.name, **ZCMEM.kwargs) +memory.add_argument(REGMEM.name, **REGMEM.kwargs) # FIXME: We set the eager pool size to 50% of the total size for now. 
diff --git a/legate/driver/command.py b/legate/driver/command.py index 7d11c2c9b..f45a10c7c 100644 --- a/legate/driver/command.py +++ b/legate/driver/command.py @@ -21,14 +21,14 @@ if TYPE_CHECKING: from ..util.system import System from ..util.types import CommandPart - from .config import Config + from .config import ConfigProtocol from .launcher import Launcher __all__ = ("CMD_PARTS",) def cmd_bind( - config: Config, system: System, launcher: Launcher + config: ConfigProtocol, system: System, launcher: Launcher ) -> CommandPart: cpu_bind = config.binding.cpu_bind mem_bind = config.binding.mem_bind @@ -69,7 +69,9 @@ def check_bind_ranks(name: str, binding: str) -> None: return opts -def cmd_gdb(config: Config, system: System, launcher: Launcher) -> CommandPart: +def cmd_gdb( + config: ConfigProtocol, system: System, launcher: Launcher +) -> CommandPart: if not config.debugging.gdb: return () @@ -81,7 +83,7 @@ def cmd_gdb(config: Config, system: System, launcher: Launcher) -> CommandPart: def cmd_cuda_gdb( - config: Config, system: System, launcher: Launcher + config: ConfigProtocol, system: System, launcher: Launcher ) -> CommandPart: if not config.debugging.cuda_gdb: return () @@ -94,7 +96,7 @@ def cmd_cuda_gdb( def cmd_nvprof( - config: Config, system: System, launcher: Launcher + config: ConfigProtocol, system: System, launcher: Launcher ) -> CommandPart: if not config.profiling.nvprof: return () @@ -105,7 +107,7 @@ def cmd_nvprof( def cmd_nsys( - config: Config, system: System, launcher: Launcher + config: ConfigProtocol, system: System, launcher: Launcher ) -> CommandPart: if not config.profiling.nsys: return () @@ -123,7 +125,7 @@ def cmd_nsys( def cmd_memcheck( - config: Config, system: System, launcher: Launcher + config: ConfigProtocol, system: System, launcher: Launcher ) -> CommandPart: memcheck = config.debugging.memcheck @@ -131,7 +133,7 @@ def cmd_memcheck( def cmd_nocr( - config: Config, system: System, launcher: Launcher + config: ConfigProtocol, system: System, launcher: Launcher ) -> CommandPart: control_replicable = not config.multi_node.not_control_replicable @@ -139,7 +141,7 @@ def cmd_nocr( def cmd_module( - config: Config, system: System, launcher: Launcher + config: ConfigProtocol, system: System, launcher: Launcher ) -> CommandPart: module = config.other.module @@ -147,26 +149,26 @@ def cmd_module( def cmd_rlwrap( - config: Config, system: System, launcher: Launcher + config: ConfigProtocol, system: System, launcher: Launcher ) -> CommandPart: return ("rlwrap",) if config.other.rlwrap else () def cmd_legion( - config: Config, system: System, launcher: Launcher + config: ConfigProtocol, system: System, launcher: Launcher ) -> CommandPart: return (str(system.legion_paths.legion_python),) def cmd_processor( - config: Config, system: System, launcher: Launcher + config: ConfigProtocol, system: System, launcher: Launcher ) -> CommandPart: # We always need one python processor per rank and no local fields return ("-ll:py", "1", "-lg:local", "0") def cmd_kthreads( - config: Config, system: System, launcher: Launcher + config: ConfigProtocol, system: System, launcher: Launcher ) -> CommandPart: freeze_on_error = config.debugging.freeze_on_error gdb = config.debugging.gdb @@ -181,7 +183,7 @@ def cmd_kthreads( def cmd_cpus( - config: Config, system: System, launcher: Launcher + config: ConfigProtocol, system: System, launcher: Launcher ) -> CommandPart: cpus = config.core.cpus @@ -189,7 +191,7 @@ def cmd_cpus( def cmd_gpus( - config: Config, system: System, launcher: 
Launcher + config: ConfigProtocol, system: System, launcher: Launcher ) -> CommandPart: gpus = config.core.gpus @@ -198,7 +200,7 @@ def cmd_gpus( def cmd_openmp( - config: Config, system: System, launcher: Launcher + config: ConfigProtocol, system: System, launcher: Launcher ) -> CommandPart: openmp = config.core.openmp ompthreads = config.core.ompthreads @@ -228,7 +230,7 @@ def cmd_openmp( def cmd_utility( - config: Config, system: System, launcher: Launcher + config: ConfigProtocol, system: System, launcher: Launcher ) -> CommandPart: utility = config.core.utility ranks = config.multi_node.ranks @@ -247,20 +249,22 @@ def cmd_utility( return opts -def cmd_mem(config: Config, system: System, launcher: Launcher) -> CommandPart: +def cmd_mem( + config: ConfigProtocol, system: System, launcher: Launcher +) -> CommandPart: # Always specify the csize return ("-ll:csize", str(config.memory.sysmem)) def cmd_numamem( - config: Config, system: System, launcher: Launcher + config: ConfigProtocol, system: System, launcher: Launcher ) -> CommandPart: numamem = config.memory.numamem return () if numamem == 0 else ("-ll:nsize", str(numamem)) def cmd_fbmem( - config: Config, system: System, launcher: Launcher + config: ConfigProtocol, system: System, launcher: Launcher ) -> CommandPart: if config.core.gpus == 0: return () @@ -270,14 +274,14 @@ def cmd_fbmem( def cmd_regmem( - config: Config, system: System, launcher: Launcher + config: ConfigProtocol, system: System, launcher: Launcher ) -> CommandPart: regmem = config.memory.regmem return () if regmem == 0 else ("-ll:rsize", str(regmem)) def cmd_log_levels( - config: Config, system: System, launcher: Launcher + config: ConfigProtocol, system: System, launcher: Launcher ) -> CommandPart: log_dir = config.logging.logdir @@ -308,7 +312,7 @@ def cmd_log_levels( def cmd_log_file( - config: Config, system: System, launcher: Launcher + config: ConfigProtocol, system: System, launcher: Launcher ) -> CommandPart: log_dir = config.logging.logdir log_to_file = config.logging.log_to_file @@ -320,7 +324,7 @@ def cmd_log_file( def cmd_eager_alloc( - config: Config, system: System, launcher: Launcher + config: ConfigProtocol, system: System, launcher: Launcher ) -> CommandPart: eager_alloc = config.memory.eager_alloc @@ -328,7 +332,7 @@ def cmd_eager_alloc( def cmd_user_opts( - config: Config, system: System, launcher: Launcher + config: ConfigProtocol, system: System, launcher: Launcher ) -> CommandPart: return config.user_opts diff --git a/legate/driver/config.py b/legate/driver/config.py index c4acb3c41..5e42bc584 100644 --- a/legate/driver/config.py +++ b/legate/driver/config.py @@ -21,7 +21,7 @@ from dataclasses import dataclass from functools import cached_property from pathlib import Path -from typing import Any +from typing import Any, Protocol from ..util.types import ( ArgList, @@ -123,6 +123,24 @@ class Other(DataclassMixin): rlwrap: bool +class ConfigProtocol(Protocol): + + _args: Namespace + + argv: ArgList + + user_opts: tuple[str, ...] + multi_node: MultiNode + binding: Binding + core: Core + memory: Memory + profiling: Profiling + logging: Logging + debugging: Debugging + info: Info + other: Other + + class Config: """A centralized configuration object that provides the information needed by the Legate driver in order to run. 
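The new ConfigProtocol is a typing.Protocol, so any object exposing the listed attributes satisfies the Driver and Launcher signatures without inheriting from the driver's Config; this is what lets the Jupyter-specific Config added later in this patch be handed straight to Driver. A self-contained sketch of the idea, using toy attribute and class names rather than the actual legate fields:

```
from dataclasses import dataclass
from typing import Protocol


class HasCpus(Protocol):
    """Structural type: anything exposing a matching `cpus` attribute conforms."""

    cpus: int


@dataclass
class DriverStyleConfig:
    cpus: int = 4


@dataclass
class JupyterStyleConfig:
    cpus: int = 2
    display_name: str = "Legate_SM_GPU"  # extra attributes are allowed


def describe(config: HasCpus) -> str:
    # mypy accepts both classes below because the check is structural,
    # not inheritance-based; neither class subclasses HasCpus.
    return f"cpus={config.cpus}"


print(describe(DriverStyleConfig()))   # cpus=4
print(describe(JupyterStyleConfig()))  # cpus=2
```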
@@ -135,7 +153,9 @@ class Config: """ def __init__(self, argv: ArgList) -> None: - args, extra = parser.parse_known_args(argv[1:]) + self.argv = argv + + args, extra = parser.parse_known_args(self.argv[1:]) # only saving this for help with testing self._args = args diff --git a/legate/driver/driver.py b/legate/driver/driver.py index 7a3e00c40..5329b951f 100644 --- a/legate/driver/driver.py +++ b/legate/driver/driver.py @@ -22,7 +22,7 @@ from ..util.system import System from ..util.ui import kvtable, rule, section, value, warn from .command import CMD_PARTS -from .config import Config +from .config import ConfigProtocol from .launcher import Launcher from .logs import process_logs @@ -53,7 +53,7 @@ class Driver: """ - def __init__(self, config: Config, system: System) -> None: + def __init__(self, config: ConfigProtocol, system: System) -> None: self.config = config self.system = system self.launcher = Launcher.create(config, system) diff --git a/legate/driver/launcher.py b/legate/driver/launcher.py index 009b0cf6b..e41b0a2e1 100644 --- a/legate/driver/launcher.py +++ b/legate/driver/launcher.py @@ -25,7 +25,7 @@ if TYPE_CHECKING: from ..util.system import System from ..util.types import Command, EnvDict, LauncherType - from .config import Config + from .config import ConfigProtocol __all__ = ("Launcher",) @@ -71,7 +71,7 @@ class Launcher: cmd: Command - _config: Config + _config: ConfigProtocol _system: System @@ -79,7 +79,7 @@ class Launcher: _custom_env_vars: set[str] | None = None - def __init__(self, config: Config, system: System) -> None: + def __init__(self, config: ConfigProtocol, system: System) -> None: self._config = config self._system = system @@ -95,7 +95,7 @@ def __eq__(self, other: object) -> bool: ) @classmethod - def create(cls, config: Config, system: System) -> Launcher: + def create(cls, config: ConfigProtocol, system: System) -> Launcher: """Factory method for creating appropriate Launcher subclass based on user configuration. 
@@ -291,7 +291,7 @@ class SimpleLauncher(Launcher): kind: LauncherType = "none" - def __init__(self, config: Config, system: System) -> None: + def __init__(self, config: ConfigProtocol, system: System) -> None: super().__init__(config, system) if config.multi_node.ranks == 1: @@ -319,7 +319,7 @@ class MPILauncher(Launcher): kind: LauncherType = "mpirun" - def __init__(self, config: Config, system: System) -> None: + def __init__(self, config: ConfigProtocol, system: System) -> None: super().__init__(config, system) self.rank_id = "%q{OMPI_COMM_WORLD_RANK}" @@ -349,7 +349,7 @@ class JSRunLauncher(Launcher): kind: LauncherType = "jsrun" - def __init__(self, config: Config, system: System) -> None: + def __init__(self, config: ConfigProtocol, system: System) -> None: super().__init__(config, system) self.rank_id = "%q{OMPI_COMM_WORLD_RANK}" @@ -377,7 +377,7 @@ class SRunLauncher(Launcher): kind: LauncherType = "srun" - def __init__(self, config: Config, system: System) -> None: + def __init__(self, config: ConfigProtocol, system: System) -> None: super().__init__(config, system) self.rank_id = "%q{SLURM_PROCID}" diff --git a/legate/driver/logs.py b/legate/driver/logs.py index 70696d3e1..95b2ed46f 100644 --- a/legate/driver/logs.py +++ b/legate/driver/logs.py @@ -29,7 +29,7 @@ if TYPE_CHECKING: from ..util.system import System from ..util.types import Command - from .config import Config + from .config import ConfigProtocol from .launcher import Launcher __all__ = ( @@ -53,10 +53,10 @@ class LogHandler(metaclass=ABCMeta): """ - config: Config + config: ConfigProtocol system: System - def __init__(self, config: Config, system: System) -> None: + def __init__(self, config: ConfigProtocol, system: System) -> None: self.config = config self.system = system @@ -164,7 +164,7 @@ def cleanup(self, keep_logs: bool) -> None: @contextmanager def process_logs( - config: Config, system: System, launcher: Launcher + config: ConfigProtocol, system: System, launcher: Launcher ) -> Iterator[tuple[LogHandler, ...]]: """A context manager for log initializion and processing/cleanup, based on the user configuration. diff --git a/legate/jupyter/__init__.py b/legate/jupyter/__init__.py index 318a79cc5..c9530a071 100644 --- a/legate/jupyter/__init__.py +++ b/legate/jupyter/__init__.py @@ -12,11 +12,23 @@ # See the License for the specific language governing permissions and # limitations under the License. # +from __future__ import annotations -# mypy: ignore-errors -from ._magic_cmd import LegateInfoMagics +from typing import TYPE_CHECKING +from legate.jupyter.magic import LegateInfoMagics -def load_ipython_extension(ipython) -> None: - legate_info_magic = LegateInfoMagics(ipython) - ipython.register_magics(legate_info_magic) +if TYPE_CHECKING: + from IPython import InteractiveShell + + +def load_ipython_extension(ipython: InteractiveShell) -> None: + ipython.register_magics(LegateInfoMagics(ipython)) + + +def main() -> int: + import sys + + from .main import main as _main + + return _main(sys.argv) diff --git a/legate/jupyter/__main__.py b/legate/jupyter/__main__.py deleted file mode 100644 index 26cc4e4cf..000000000 --- a/legate/jupyter/__main__.py +++ /dev/null @@ -1,39 +0,0 @@ -#!/usr/bin/env python - -# Copyright 2021-2022 NVIDIA Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# mypy: ignore-errors -import shutil -from pathlib import Path - -import install_jupyter -from jupyter_client.kernelspec import KernelSpecManager - -if __name__ == "__main__": - legate_exe = Path(shutil.which("legate")) - legate_dir = legate_exe.parent.absolute() - args, opts = install_jupyter.parse_args() - if args.json == "legion_python.json": - # override the default one - args.json = "legate_jupyter.json" - args.legion_prefix = str(legate_dir) - legion_jupyter_file = Path(install_jupyter.__file__) - kernel_file_dir = str(legion_jupyter_file.parent.absolute()) - kernel_name = install_jupyter.driver(args, opts, kernel_file_dir) - # copy the json file into ipython kernel directory - ksm = KernelSpecManager() - spec = ksm.get_kernel_spec(kernel_name) - shutil.copy(args.json, spec.resource_dir) diff --git a/legate/jupyter/_legion_kernel.py b/legate/jupyter/_legion_kernel.py new file mode 100644 index 000000000..b88d23f30 --- /dev/null +++ b/legate/jupyter/_legion_kernel.py @@ -0,0 +1,60 @@ +#!/usr/bin/env python + +# Copyright 2022 Los Alamos National Laboratory, NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +from __future__ import annotations + +import sys +from contextlib import contextmanager +from typing import Any, Iterator, TextIO + +from ipykernel.ipkernel import IPythonKernel # type: ignore + +__version__ = "0.1" + + +@contextmanager +def reset_stdout(stdout: TextIO) -> Iterator[None]: + _stdout = sys.stdout + sys.stdout = stdout + yield + sys.stdout = _stdout + + +class LegionKernel(IPythonKernel): # type: ignore + implementation = "legion_kernel" + implementation_version = __version__ + banner = "Legion IPython Kernel for SM" + language = "python" + language_version = __version__ + language_info = { + "name": "legion_kernel", + "mimetype": "text/x-python", + "codemirror_mode": {"name": "ipython", "version": 3}, + "pygments_lexer": "ipython3", + "nbconvert_exporter": "python", + "file_extension": ".py", + } + + def __init__(self, **kwargs: Any) -> None: + with reset_stdout(open("/dev/stdout", "w")): + print("Initializing Legion kernel for single- or multi-node.") + super().__init__(**kwargs) + + +if __name__ == "__main__": + from ipykernel.kernelapp import IPKernelApp # type: ignore + + IPKernelApp.launch_instance(kernel_class=LegionKernel) diff --git a/legate/jupyter/_magic_cmd.py b/legate/jupyter/_magic_cmd.py deleted file mode 100644 index 06e653a51..000000000 --- a/legate/jupyter/_magic_cmd.py +++ /dev/null @@ -1,78 +0,0 @@ -# Copyright 2021-2022 NVIDIA Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# mypy: ignore-errors -import json -import os -import sys - -from IPython.core.magic import Magics, line_magic, magics_class -from jupyter_client.kernelspec import KernelSpecManager, NoSuchKernel - -cmd_dict = { - "cpus": "Number of CPUs to use per rank", - "gpus": "Number of GPUs to use per rank", - "omps": "Number of OpenMP groups to use per rank", - "ompthreads": "Number of threads per OpenMP group", - "utility": "Number of Utility processors per rank", - "sysmem": "Amount of DRAM memory per rank (in MBs)", - "numamem": "Amount of DRAM memory per NUMA domain per rank (in MBs)", - "fbmem": "Amount of framebuffer memory per GPU (in MBs)", - "zcmem": "Amount of zero-copy memory per rank (in MBs)", - "regmem": "Amount of registered CPU-side pinned memory per rank (in MBs)", - "nodes": "Number of nodes to use", -} - - -class LegateInfo(object): - def __init__(self, filename: str) -> None: - self.config_dict = dict() - # check if the json file is in the ipython kernel directory - try: - ksm = KernelSpecManager() - spec = ksm.get_kernel_spec("legate_kernel_nocr") - except NoSuchKernel: - print( - "Can not find the json file in the " - "IPython kernel directory, please " - "make sure the kernel has been installed." 
- ) - sys.exit(1) - filename_with_path = os.path.join(spec.resource_dir, filename) - with open(filename_with_path) as json_file: - json_dict = json.load(json_file) - if missing := (set(cmd_dict) - set(json_dict)): - raise RuntimeError(f"Expected keys {missing!r} are missing") - for key in cmd_dict.keys(): - self.config_dict[key] = json_dict[key]["value"] - - def __repr__(self) -> str: - out_str = "" - for key, value in self.config_dict.items(): - out_str += f"{cmd_dict[key]}: {value}\n" - return out_str[:-1] - - -@magics_class -class LegateInfoMagics(Magics): - __slots__ = ["legate_json"] - - def __init__(self, shell): - super(LegateInfoMagics, self).__init__(shell) - self.legate_json = LegateInfo("legate_jupyter.json") - - @line_magic - def legate_info(self, line: str) -> None: - print(self.legate_json) diff --git a/legate/jupyter/args.py b/legate/jupyter/args.py new file mode 100755 index 000000000..77c16b66a --- /dev/null +++ b/legate/jupyter/args.py @@ -0,0 +1,107 @@ +#!/usr/bin/env python + +# Copyright 2021-2022 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +from __future__ import annotations + +from argparse import ArgumentDefaultsHelpFormatter, ArgumentParser + +from legate.driver import defaults +from legate.util import shared_args as sa + +__all__ = ("parser",) + + +parser = ArgumentParser( + description="Install a Legate Jupyter Kernel", + allow_abbrev=False, + formatter_class=ArgumentDefaultsHelpFormatter, +) + +kernel = parser.add_argument_group("Kernel configuration") + +kernel.add_argument( + "--user", + action="store_true", + default=True, + dest="user", + help="Install the kernel in user home directory", +) + +kernel.add_argument( + "--name", + default="Legate_SM_GPU", + dest="spec_name", + help="A name for the kernel spec", +) + +kernel.add_argument( + "--display-name", + default=None, + dest="display_name", + help="A display name for the kernel (if not provided, --name is used)", +) + +kernel.add_argument( + "--prefix", + default=None, + dest="prefix", + help="A prefix to install the kernel into", +) + + +multi_node = parser.add_argument_group("Multi-node configuration") +multi_node.add_argument(sa.NODES.name, **sa.NODES.kwargs) +multi_node.add_argument(sa.RANKS_PER_NODE.name, **sa.RANKS_PER_NODE.kwargs) +multi_node.add_argument(sa.NOCR.name, **sa.NOCR.kwargs) +multi_node.add_argument(sa.LAUNCHER.name, **sa.LAUNCHER.kwargs) +multi_node.add_argument(sa.LAUNCHER_EXTRA.name, **sa.LAUNCHER_EXTRA.kwargs) + + +core = parser.add_argument_group("Core alloction") +core.add_argument(sa.CPUS.name, **sa.CPUS.kwargs) +core.add_argument(sa.GPUS.name, **sa.GPUS.kwargs) +core.add_argument(sa.OMPS.name, **sa.OMPS.kwargs) +core.add_argument(sa.OMPTHREADS.name, **sa.OMPTHREADS.kwargs) +core.add_argument(sa.UTILITY.name, **sa.UTILITY.kwargs) + + +memory = parser.add_argument_group("Memory alloction") +memory.add_argument(sa.SYSMEM.name, **sa.SYSMEM.kwargs) +memory.add_argument(sa.NUMAMEM.name, **sa.NUMAMEM.kwargs) +memory.add_argument(sa.FBMEM.name, **sa.FBMEM.kwargs) 
+memory.add_argument(sa.ZCMEM.name, **sa.ZCMEM.kwargs) +memory.add_argument(sa.REGMEM.name, **sa.REGMEM.kwargs) + +# FIXME: We set the eager pool size to 50% of the total size for now. +# This flag will be gone once we roll out a new allocation scheme. +memory.add_argument( + "--eager-alloc-percentage", + dest="eager_alloc", + default=defaults.LEGATE_EAGER_ALLOC_PERCENTAGE, + required=False, + help="Specify the size of eager allocation pool in percentage", +) + +info = parser.add_argument_group("Informational") + +info.add_argument( + "-v", + "--verbose", + dest="verbose", + action="count", + default=0, + help="Display verbose output. Use -vv for even more output (test stdout)", +) diff --git a/legate/jupyter/config.py b/legate/jupyter/config.py new file mode 100644 index 000000000..52c44c00f --- /dev/null +++ b/legate/jupyter/config.py @@ -0,0 +1,87 @@ +# Copyright 2022 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +"""Consolidate driver configuration from command-line and environment. + +""" +from __future__ import annotations + +from dataclasses import dataclass +from pathlib import Path + +from legate.driver.config import ( + Binding, + Core, + Debugging, + Info, + Logging, + Memory, + MultiNode, + Other, + Profiling, +) +from legate.jupyter.args import parser +from legate.util.types import ArgList, DataclassMixin, object_to_dataclass + +__all__ = ("Config",) + + +@dataclass(frozen=True) +class Kernel(DataclassMixin): + user: bool + prefix: str | None + spec_name: str + display_name: str + + +class Config: + """A Jupyter-specific configuration object that provides the information + needed by the Legate driver in order to run. + + Parameters + ---------- + argv : ArgList + command-line arguments to use when building the configuration + + """ + + def __init__(self, argv: ArgList) -> None: + self.argv = argv + + args = parser.parse_args(self.argv[1:]) + + # only saving these for help with testing + self._args = args + + if args.display_name is None: + args.display_name = args.spec_name + + self.kernel = object_to_dataclass(args, Kernel) + self.verbose = args.verbose + + # these are the values we leave configurable for the kernel + self.multi_node = object_to_dataclass(args, MultiNode) + self.core = object_to_dataclass(args, Core) + self.memory = object_to_dataclass(args, Memory) + + # turn everything else off + self.user_opts: tuple[str, ...] 
= () + self.binding = Binding(None, None, None, None) + self.profiling = Profiling(False, False, False, "", []) + self.logging = Logging(None, Path(), False, False) + self.debugging = Debugging( + False, False, False, False, False, False, False + ) + self.info = Info(False, False, self.verbose > 0) + self.other = Other(None, False, False) diff --git a/legate/jupyter/kernel.py b/legate/jupyter/kernel.py new file mode 100644 index 000000000..e71604ed8 --- /dev/null +++ b/legate/jupyter/kernel.py @@ -0,0 +1,128 @@ +# Copyright 2022 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +"""Consolidate driver configuration from command-line and environment. + +""" +from __future__ import annotations + +import json +import os +import sys +from dataclasses import asdict +from pathlib import Path +from tempfile import TemporaryDirectory +from typing import Any, Literal, TypedDict + +from jupyter_client.kernelspec import ( + KernelSpec, + KernelSpecManager, + NoSuchKernel, +) + +from legate.driver import Driver +from legate.jupyter.config import Config +from legate.util.types import ArgList +from legate.util.ui import error + + +class LegateMetadata(TypedDict): + argv: ArgList + multi_node: dict[str, Any] + memory: dict[str, Any] + core: dict[str, Any] + + +LEGATE_JUPYTER_KERNEL_SPEC_KEY = "__LEGATE_JUPYTER_KERNEL_SPEC__" +LEGATE_JUPYTER_METADATA_KEY: Literal["legate"] = "legate" + + +def generate_kernel_spec(driver: Driver, config: Config) -> KernelSpec: + legion_kernel = Path(__file__).parent / "_legion_kernel.py" + argv = list(driver.cmd) + [str(legion_kernel), "-f", "{connection_file}"] + + env = {k: v for k, v in driver.env.items() if k in driver.custom_env_vars} + + # Inexplicably, there is apparently no reasonable or supported way to + # determine the name of the currently running/connected Jupyter kernel. + # Instead, tunnel an env var with the name through, so that our LegateInfo + # line magic can actually find the right kernel spec to report on. + assert LEGATE_JUPYTER_KERNEL_SPEC_KEY not in env + env[LEGATE_JUPYTER_KERNEL_SPEC_KEY] = config.kernel.spec_name + + return KernelSpec( + display_name=config.kernel.display_name, + language="python", + argv=argv, + env=env, + metadata={ + LEGATE_JUPYTER_METADATA_KEY: LegateMetadata( + { + "argv": config.argv[1:], + "multi_node": asdict(config.multi_node), + "memory": asdict(config.memory), + "core": asdict(config.core), + } + ) + }, + ) + + +def install_kernel_spec(spec: KernelSpec, config: Config) -> None: + ksm = KernelSpecManager() + + spec_name = config.kernel.spec_name + display_name = spec.display_name + + try: + ksm.get_kernel_spec(spec_name) + except NoSuchKernel: + pass + else: + msg = error( + f"kernel spec {spec_name!r} already exists. Remove it by " + f"running 'jupyter kernelspec uninstall {spec_name!r}, " + "or choose a new kernel name." 
+ ) + print(msg) + sys.exit(1) + + with TemporaryDirectory() as tmpdir: + os.chmod(tmpdir, 0o755) + with open(Path(tmpdir).joinpath("kernel.json"), "w") as f: + out = json.dumps(spec.to_dict(), sort_keys=True, indent=2) + if config.verbose > 0: + print(f"Wrote kernel spec file {spec_name}/kernel.json\n") + if config.verbose > 1: + print(f"\n{out}\n") + f.write(out) + + try: + ksm.install_kernel_spec( + tmpdir, + spec_name, + user=config.kernel.user, + prefix=config.kernel.prefix, + ) + print( + f"Jupyter kernel spec {spec_name} ({display_name}) " + "has been installed" + ) + except Exception as e: + msg = error( + "Failed to install the Jupyter kernel spec " + f"{spec_name} ({display_name}) with error: {e}" + ) + print(msg) + sys.exit(1) diff --git a/legate/jupyter/magic.py b/legate/jupyter/magic.py new file mode 100644 index 000000000..b5b82784c --- /dev/null +++ b/legate/jupyter/magic.py @@ -0,0 +1,103 @@ +# Copyright 2021-2022 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +from __future__ import annotations + +import os +from textwrap import indent +from typing import TYPE_CHECKING + +from IPython.core.magic import Magics, line_magic, magics_class +from jupyter_client.kernelspec import KernelSpecManager, NoSuchKernel + +from legate.jupyter.kernel import ( + LEGATE_JUPYTER_KERNEL_SPEC_KEY, + LEGATE_JUPYTER_METADATA_KEY, + LegateMetadata, +) +from legate.util.colors import scrub +from legate.util.ui import kvtable + +if TYPE_CHECKING: + from IPython import InteractiveShell + + +core = { + "cpus": "CPUs to use per rank", + "gpus": "GPUs to use per rank", + "openmp": "OpenMP groups to use per rank", + "ompthreads": "Threads per OpenMP group", + "utility": "Utility processors per rank", +} + +memory = { + "sysmem": "DRAM memory per rank (in MBs)", + "numamem": "DRAM memory per NUMA domain per rank (in MBs)", + "fbmem": "Framebuffer memory per GPU (in MBs)", + "zcmem": "Zero-copy memory per rank (in MBs)", + "regmem": "Registered CPU-side pinned memory per rank (in MBs)", +} + + +class LegateInfo(object): + config: LegateMetadata + + def __init__(self) -> None: + if LEGATE_JUPYTER_KERNEL_SPEC_KEY not in os.environ: + raise RuntimeError("Cannot determine currently running kernel") + + spec_name = os.environ[LEGATE_JUPYTER_KERNEL_SPEC_KEY] + + try: + spec = KernelSpecManager().get_kernel_spec(spec_name) + except NoSuchKernel: + raise RuntimeError( + f"Cannot find a Legate Jupyter kernel named {spec_name!r}" + ) + + self.spec_name = spec_name + self.config = spec.metadata[LEGATE_JUPYTER_METADATA_KEY] + + def __str__(self) -> str: + nodes = self.config["multi_node"]["nodes"] + header = f"Kernel {self.spec_name!r} configured for {nodes} node(s)" + core_table = { + desc: self.config["core"][field] for field, desc in core.items() + } + memory_table = { + desc: self.config["memory"][field] + for field, desc in memory.items() + } + + out = f"""{header} + +Cores: +{indent(kvtable(core_table, align=False), prefix=' ')} + +Memory: +{indent(kvtable(memory_table, align=False), prefix=' 
')} +""" + # remove any text colors in notebook + return scrub(out) + + +@magics_class +class LegateInfoMagics(Magics): + def __init__(self, shell: InteractiveShell | None = None) -> None: + super().__init__(shell=shell) + self.info = LegateInfo() + + @line_magic + def legate_info(self, line: str) -> None: + print(self.info) diff --git a/legate/jupyter/main.py b/legate/jupyter/main.py new file mode 100644 index 000000000..494fdf421 --- /dev/null +++ b/legate/jupyter/main.py @@ -0,0 +1,37 @@ +#!/usr/bin/env python + +# Copyright 2021-2022 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +from __future__ import annotations + +from legate.driver import Driver +from legate.jupyter.config import Config +from legate.jupyter.kernel import generate_kernel_spec, install_kernel_spec +from legate.util.system import System + +__all__ = ("main",) + + +def main(argv: list[str]) -> int: + config = Config(argv) + system = System() + + driver = Driver(config, system) + + spec = generate_kernel_spec(driver, config) + + install_kernel_spec(spec, config) + + return 0 diff --git a/legate/tester/config.py b/legate/tester/config.py index a621ba8c3..497c3a385 100644 --- a/legate/tester/config.py +++ b/legate/tester/config.py @@ -38,7 +38,9 @@ class Config: """ def __init__(self, argv: ArgList) -> None: - args, self._extra_args = parser.parse_known_args(argv[1:]) + self.argv = argv + + args, self._extra_args = parser.parse_known_args(self.argv[1:]) # which tests to run self.examples = True diff --git a/legate/util/args.py b/legate/util/args.py index 88cd73193..4485d6db2 100644 --- a/legate/util/args.py +++ b/legate/util/args.py @@ -66,7 +66,7 @@ class _UnsetType: @dataclass(frozen=True) class ArgSpec: dest: str - action: NotRequired[ActionType] = "store_true" + action: NotRequired[ActionType] = Unset nargs: NotRequired[Union[int, NargsType]] = Unset const: NotRequired[Any] = Unset default: NotRequired[Any] = Unset diff --git a/legate/util/shared_args.py b/legate/util/shared_args.py new file mode 100644 index 000000000..46def8642 --- /dev/null +++ b/legate/util/shared_args.py @@ -0,0 +1,207 @@ +# Copyright 2021-2022 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
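
The patch defines the LegateInfoMagics class above but does not show where it gets registered with IPython. A hypothetical wiring sketch, assuming the standard IPython extension hook (the hook name and module layout here are illustrative, not part of the patch):

    # Hypothetical registration sketch: exposing %legate_info in a session.
    from __future__ import annotations

    from typing import TYPE_CHECKING

    from legate.jupyter.magic import LegateInfoMagics

    if TYPE_CHECKING:
        from IPython import InteractiveShell


    def load_ipython_extension(shell: InteractiveShell) -> None:
        # Standard IPython extension hook; after loading, %legate_info prints
        # the core/memory settings that `legate-jupyter` recorded in the
        # kernel spec metadata. This is only meaningful inside a kernel
        # installed by legate-jupyter, since LegateInfo reads the spec name
        # from the tunneled environment variable.
        shell.register_magics(LegateInfoMagics)
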
+#
+from __future__ import annotations
+
+from ..driver import defaults
+from .args import ArgSpec, Argument
+from .types import LauncherType
+
+__all__ = (
+    "CPUS",
+    "FBMEM",
+    "GPUS",
+    "LAUNCHER_EXTRA",
+    "LAUNCHER",
+    "LAUNCHERS",
+    "NOCR",
+    "NODES",
+    "NUMAMEM",
+    "OMPS",
+    "OMPTHREADS",
+    "RANKS_PER_NODE",
+    "REGMEM",
+    "SYSMEM",
+    "UTILITY",
+    "ZCMEM",
+)
+
+LAUNCHERS: tuple[LauncherType, ...] = ("mpirun", "jsrun", "srun", "none")
+
+NODES = Argument(
+    "--nodes",
+    ArgSpec(
+        type=int,
+        default=defaults.LEGATE_NODES,
+        dest="nodes",
+        help="Number of nodes to use",
+    ),
+)
+
+
+RANKS_PER_NODE = Argument(
+    "--ranks-per-node",
+    ArgSpec(
+        type=int,
+        default=defaults.LEGATE_RANKS_PER_NODE,
+        dest="ranks_per_node",
+        help="Number of ranks (processes running copies of the program) to "
+        "launch per node. The default (1 rank per node) will typically result "
+        "in the best performance.",
+    ),
+)
+
+
+NOCR = Argument(
+    "--no-replicate",
+    ArgSpec(
+        dest="not_control_replicable",
+        action="store_true",
+        required=False,
+        help="Execute this program without control replication. Most of the "
+        "time, this is not recommended. This option should be used for "
+        "debugging. The -lg:safe_ctrlrepl Legion option may be helpful "
+        "with discovering issues with replicated control.",
+    ),
+)
+
+LAUNCHER = Argument(
+    "--launcher",
+    ArgSpec(
+        dest="launcher",
+        choices=LAUNCHERS,
+        default="none",
+        help='launcher program to use (set to "none" for local runs, or if '
+        "the launch has already happened by the time legate is invoked)",
+    ),
+)
+
+
+LAUNCHER_EXTRA = Argument(
+    "--launcher-extra",
+    ArgSpec(
+        dest="launcher_extra",
+        action="append",
+        default=[],
+        required=False,
+        help="additional argument to pass to the launcher (can appear more "
+        "than once)",
+    ),
+)
+
+
+CPUS = Argument(
+    "--cpus",
+    ArgSpec(
+        type=int,
+        default=defaults.LEGATE_CPUS,
+        dest="cpus",
+        help="Number of CPUs to use per rank",
+    ),
+)
+
+GPUS = Argument(
+    "--gpus",
+    ArgSpec(
+        type=int,
+        default=defaults.LEGATE_GPUS,
+        dest="gpus",
+        help="Number of GPUs to use per rank",
+    ),
+)
+
+OMPS = Argument(
+    "--omps",
+    ArgSpec(
+        type=int,
+        default=defaults.LEGATE_OMP_PROCS,
+        dest="openmp",
+        help="Number of OpenMP groups to use per rank",
+    ),
+)
+
+
+OMPTHREADS = Argument(
+    "--ompthreads",
+    ArgSpec(
+        type=int,
+        default=defaults.LEGATE_OMP_THREADS,
+        dest="ompthreads",
+        help="Number of threads per OpenMP group",
+    ),
+)
+
+UTILITY = Argument(
+    "--utility",
+    ArgSpec(
+        type=int,
+        default=defaults.LEGATE_UTILITY_CORES,
+        dest="utility",
+        help="Number of Utility processors per rank to request for meta-work",
+    ),
+)
+
+SYSMEM = Argument(
+    "--sysmem",
+    ArgSpec(
+        type=int,
+        default=defaults.LEGATE_SYSMEM,
+        dest="sysmem",
+        help="Amount of DRAM memory per rank (in MBs)",
+    ),
+)
+
+
+NUMAMEM = Argument(
+    "--numamem",
+    ArgSpec(
+        type=int,
+        default=defaults.LEGATE_NUMAMEM,
+        dest="numamem",
+        help="Amount of DRAM memory per NUMA domain per rank (in MBs)",
+    ),
+)
+
+
+FBMEM = Argument(
+    "--fbmem",
+    ArgSpec(
+        type=int,
+        default=defaults.LEGATE_FBMEM,
+        dest="fbmem",
+        help="Amount of framebuffer memory per GPU (in MBs)",
+    ),
+)
+
+
+ZCMEM = Argument(
+    "--zcmem",
+    ArgSpec(
+        type=int,
+        default=defaults.LEGATE_ZCMEM,
+        dest="zcmem",
+        help="Amount of zero-copy memory per rank (in MBs)",
+    ),
+)
+
+
+REGMEM = Argument(
+    "--regmem",
+    ArgSpec(
+        type=int,
+        default=defaults.LEGATE_REGMEM,
+        dest="regmem",
+        help="Amount of registered CPU-side pinned memory per rank (in MBs)",
+ ), +) diff --git a/setup.py b/setup.py index 24e358eb0..89583411a 100755 --- a/setup.py +++ b/setup.py @@ -63,6 +63,7 @@ entry_points={ "console_scripts": [ "legate = legate.driver:main", + "legate-jupyter = legate.jupyter:main", "lgpatch = legate.lgpatch:main", ], }, diff --git a/tests/unit/legate/driver/test_args.py b/tests/unit/legate/driver/test_args.py index 76a29fd9b..fa9d36929 100644 --- a/tests/unit/legate/driver/test_args.py +++ b/tests/unit/legate/driver/test_args.py @@ -18,10 +18,6 @@ import legate.driver.defaults as defaults -def test_LAUNCHERS() -> None: - assert m.LAUNCHERS == ("mpirun", "jsrun", "srun", "none") - - class TestParserDefaults: def test_allow_abbrev(self) -> None: assert not m.parser.allow_abbrev diff --git a/tests/unit/legate/driver/test_driver.py b/tests/unit/legate/driver/test_driver.py index fad492a2f..652f73627 100644 --- a/tests/unit/legate/driver/test_driver.py +++ b/tests/unit/legate/driver/test_driver.py @@ -21,11 +21,11 @@ from pytest_mock import MockerFixture import legate.driver.driver as m -from legate.driver.args import LAUNCHERS from legate.driver.command import CMD_PARTS from legate.driver.config import Config from legate.driver.launcher import RANK_ENV_VARS, Launcher from legate.util.colors import scrub +from legate.util.shared_args import LAUNCHERS from legate.util.system import System from legate.util.types import LauncherType diff --git a/tests/unit/legate/driver/test_launcher.py b/tests/unit/legate/driver/test_launcher.py index ecf980d87..ebfc793c5 100644 --- a/tests/unit/legate/driver/test_launcher.py +++ b/tests/unit/legate/driver/test_launcher.py @@ -19,7 +19,7 @@ import pytest import legate.driver.launcher as m -from legate.driver.args import LAUNCHERS +from legate.util.shared_args import LAUNCHERS from legate.util.system import System from legate.util.types import LauncherType diff --git a/tests/unit/legate/driver/test_main.py b/tests/unit/legate/driver/test_main.py index 0992a226b..a5537afba 100644 --- a/tests/unit/legate/driver/test_main.py +++ b/tests/unit/legate/driver/test_main.py @@ -41,7 +41,7 @@ def test_main(mocker: MockerFixture) -> None: result = m.main() assert config_spy.call_count == 1 - assert config_spy.call_args[0][1:] == (["foo", "bar"],) + assert config_spy.call_args[0][1] == sys.argv assert config_spy.call_args[1] == {} assert system_spy.call_count == 1 diff --git a/tests/unit/legate/jupyter/__init__.py b/tests/unit/legate/jupyter/__init__.py new file mode 100644 index 000000000..f0b271624 --- /dev/null +++ b/tests/unit/legate/jupyter/__init__.py @@ -0,0 +1,15 @@ +# Copyright 2022 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
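
For reference, the console script declared in setup.py above is roughly equivalent to the small wrapper below (a sketch; `legate.jupyter:main` is the entry point named in setup.py and returns an exit code):

    # Rough equivalent of the new `legate-jupyter` console script.
    import sys

    from legate.jupyter import main

    if __name__ == "__main__":
        sys.exit(main(sys.argv))
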
+# +from __future__ import annotations diff --git a/tests/unit/legate/jupyter/test_args.py b/tests/unit/legate/jupyter/test_args.py new file mode 100644 index 000000000..c0904927a --- /dev/null +++ b/tests/unit/legate/jupyter/test_args.py @@ -0,0 +1,104 @@ +# Copyright 2021-2022 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +from __future__ import annotations + +import legate.driver.args as m +import legate.driver.defaults as defaults + + +class TestParserDefaults: + def test_allow_abbrev(self) -> None: + assert not m.parser.allow_abbrev + + # kernel + + def test_no_user(self) -> None: + assert m.parser.get_default("user") is None + + def test_name(self) -> None: + assert m.parser.get_default("name") is None + + def test_display_name(self) -> None: + assert m.parser.get_default("display_name") is None + + def test_prefix(self) -> None: + assert m.parser.get_default("prefix") is None + + # multi_node + + def test_nodes(self) -> None: + assert m.parser.get_default("nodes") == defaults.LEGATE_NODES + + def test_ranks_per_node(self) -> None: + assert ( + m.parser.get_default("ranks_per_node") + == defaults.LEGATE_RANKS_PER_NODE + ) + + def test_no_replicate(self) -> None: + assert m.parser.get_default("not_control_replicable") is False + + def test_launcher(self) -> None: + assert m.parser.get_default("launcher") == "none" + + def test_launcher_extra(self) -> None: + assert m.parser.get_default("launcher_extra") == [] + + # core + + def test_cpus(self) -> None: + assert m.parser.get_default("cpus") == defaults.LEGATE_CPUS + + def test_gpus(self) -> None: + assert m.parser.get_default("gpus") == defaults.LEGATE_GPUS + + def test_omps(self) -> None: + assert m.parser.get_default("openmp") == defaults.LEGATE_OMP_PROCS + + def test_ompthreads(self) -> None: + assert ( + m.parser.get_default("ompthreads") == defaults.LEGATE_OMP_THREADS + ) + + def test_utility(self) -> None: + assert m.parser.get_default("utility") == defaults.LEGATE_UTILITY_CORES + + # memory + + def test_sysmem(self) -> None: + assert m.parser.get_default("sysmem") == defaults.LEGATE_SYSMEM + + def test_numamem(self) -> None: + assert m.parser.get_default("numamem") == defaults.LEGATE_NUMAMEM + + def test_fbmem(self) -> None: + assert m.parser.get_default("fbmem") == defaults.LEGATE_FBMEM + + def test_zcmem(self) -> None: + assert m.parser.get_default("zcmem") == defaults.LEGATE_ZCMEM + + def test_regmem(self) -> None: + assert m.parser.get_default("regmem") == defaults.LEGATE_REGMEM + + def test_eager_alloc(self) -> None: + assert ( + m.parser.get_default("eager_alloc") + == defaults.LEGATE_EAGER_ALLOC_PERCENTAGE + ) + + # info + + def test_verbose(self) -> None: + assert m.parser.get_default("verbose") is False diff --git a/tests/unit/legate/jupyter/test_config.py b/tests/unit/legate/jupyter/test_config.py new file mode 100644 index 000000000..3ee258a14 --- /dev/null +++ b/tests/unit/legate/jupyter/test_config.py @@ -0,0 +1,129 @@ +# Copyright 2021-2022 NVIDIA Corporation +# +# Licensed under the Apache 
License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+from __future__ import annotations
+
+from pathlib import Path
+from unittest.mock import call
+
+from pytest_mock import MockerFixture
+
+import legate.driver.defaults as defaults
+import legate.jupyter.config as m
+from legate.driver.config import Core, Memory, MultiNode
+from legate.util.types import DataclassMixin
+
+
+class TestKernel:
+    def test_fields(self) -> None:
+        assert set(m.Kernel.__dataclass_fields__) == {
+            "user",
+            "prefix",
+            "spec_name",
+            "display_name",
+        }
+
+    def test_mixin(self) -> None:
+        assert issubclass(m.Kernel, DataclassMixin)
+
+
+class TestConfig:
+    def test_default_init(self) -> None:
+
+        # Note this test does not clear the environment. Default values from
+        # the defaults module can depend on the environment, but what matters
+        # is that the generated config matches those values, whatever they are.
+
+        c = m.Config(["legate-jupyter"])
+
+        assert c.multi_node == m.MultiNode(
+            nodes=defaults.LEGATE_NODES,
+            ranks_per_node=defaults.LEGATE_RANKS_PER_NODE,
+            not_control_replicable=False,
+            launcher="none",
+            launcher_extra=[],
+        )
+        assert c.core == m.Core(
+            cpus=4,
+            gpus=0,
+            openmp=defaults.LEGATE_OMP_PROCS,
+            ompthreads=defaults.LEGATE_OMP_THREADS,
+            utility=defaults.LEGATE_UTILITY_CORES,
+        )
+        assert c.memory == m.Memory(
+            sysmem=defaults.LEGATE_SYSMEM,
+            numamem=defaults.LEGATE_NUMAMEM,
+            fbmem=defaults.LEGATE_FBMEM,
+            zcmem=defaults.LEGATE_ZCMEM,
+            regmem=defaults.LEGATE_REGMEM,
+            eager_alloc=defaults.LEGATE_EAGER_ALLOC_PERCENTAGE,
+        )
+
+        # These are all "turned off"
+
+        assert c.binding == m.Binding(
+            cpu_bind=None,
+            mem_bind=None,
+            gpu_bind=None,
+            nic_bind=None,
+        )
+
+        assert c.profiling == m.Profiling(
+            profile=False,
+            nvprof=False,
+            nsys=False,
+            nsys_targets="",
+            nsys_extra=[],
+        )
+
+        assert c.logging == m.Logging(
+            user_logging_levels=None,
+            logdir=Path("."),
+            log_to_file=False,
+            keep_logs=False,
+        )
+
+        assert c.debugging == m.Debugging(
+            gdb=False,
+            cuda_gdb=False,
+            memcheck=False,
+            freeze_on_error=False,
+            gasnet_trace=False,
+            dataflow=False,
+            event=False,
+        )
+
+        assert c.info == m.Info(progress=False, mem_usage=False, verbose=False)
+
+        assert c.other == m.Other(module=None, dry_run=False, rlwrap=False)
+
+    def test_arg_conversions(self, mocker: MockerFixture) -> None:
+
+        # This is kind of a dumb short-cut test, but if we believe that
+        # object_to_dataclass works as advertised, then this test ensures that
+        # it is being used for all the sub-configs that it should be used for
+
+        spy = mocker.spy(m, "object_to_dataclass")
+
+        c = m.Config(["legate"])
+
+        assert spy.call_count == 4
+        spy.assert_has_calls(
+            [
+                call(c._args, m.Kernel),
+                call(c._args, MultiNode),
+                call(c._args, Core),
+                call(c._args, Memory),
+            ]
+        )
diff --git a/tests/unit/legate/jupyter/test_kernel.py b/tests/unit/legate/jupyter/test_kernel.py
new file mode 100644
index 000000000..42925387b
--- /dev/null
+++ b/tests/unit/legate/jupyter/test_kernel.py
@@ -0,0 +1,172 @@
+# Copyright 2021-2022 NVIDIA Corporation
+#
+# Licensed under the Apache 
License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +from __future__ import annotations + +import json +from dataclasses import asdict + +from pytest_mock import MockerFixture + +import legate.jupyter.kernel as m +from legate.driver import Driver +from legate.jupyter.config import Config +from legate.util.system import System + +from ...util import Capsys + + +def test_LEGATE_JUPYTER_KERNEL_SPEC_KEY() -> None: + assert m.LEGATE_JUPYTER_KERNEL_SPEC_KEY == "__LEGATE_JUPYTER_KERNEL_SPEC__" + + +def test_LEGATE_JUPYTER_METADATA_KEY() -> None: + assert m.LEGATE_JUPYTER_METADATA_KEY == "legate" + + +system = System() + + +class Test_generate_kernel_spec: + def test_defatul(self) -> None: + config = Config([]) + driver = Driver(config, system) + + spec = m.generate_kernel_spec(driver, config) + + expected_env = { + k: v for k, v in driver.env.items() if k in driver.custom_env_vars + } + expected_env[ + m.LEGATE_JUPYTER_KERNEL_SPEC_KEY + ] = config.kernel.spec_name + + assert spec.display_name == config.kernel.display_name + assert spec.language == "python" + assert spec.argv[:-3] == list(driver.cmd) + assert spec.argv[-3].endswith("_legion_kernel.py") + assert spec.argv[-2:] == ["-f", "{connection_file}"] + assert spec.env == expected_env + assert m.LEGATE_JUPYTER_METADATA_KEY in spec.metadata + metadata = spec.metadata[m.LEGATE_JUPYTER_METADATA_KEY] + assert metadata == { + "argv": config.argv[1:], + "multi_node": asdict(config.multi_node), + "memory": asdict(config.memory), + "core": asdict(config.core), + } + + +class Test_install_kernel_spec: + def test_install(self, mocker: MockerFixture, capsys: Capsys) -> None: + install_mock = mocker.patch( + "jupyter_client.kernelspec.KernelSpecManager.install_kernel_spec" + ) + + config = Config( + ["legate-jupyter", "--name", "____fake_test_kernel_123abc_____"] + ) + driver = Driver(config, system) + + spec = m.generate_kernel_spec(driver, config) + + m.install_kernel_spec(spec, config) + + assert install_mock.call_count == 1 + assert install_mock.call_args[0][1] == config.kernel.spec_name + assert install_mock.call_args[1] == { + "user": config.kernel.user, + "prefix": config.kernel.prefix, + } + + out, _ = capsys.readouterr() + assert out == ( + f"Jupyter kernel spec {config.kernel.spec_name} " + f"({config.kernel.display_name}) " + "has been installed\n" + ) + + def test_install_verbose( + self, mocker: MockerFixture, capsys: Capsys + ) -> None: + install_mock = mocker.patch( + "jupyter_client.kernelspec.KernelSpecManager.install_kernel_spec" + ) + + config = Config( + [ + "legate-jupyter", + "-v", + "--name", + "____fake_test_kernel_123abc_____", + ] + ) + driver = Driver(config, system) + + spec = m.generate_kernel_spec(driver, config) + + m.install_kernel_spec(spec, config) + + assert install_mock.call_count == 1 + assert install_mock.call_args[0][1] == config.kernel.spec_name + assert install_mock.call_args[1] == { + "user": config.kernel.user, + "prefix": config.kernel.prefix, + } + + out, _ = capsys.readouterr() + assert out == ( + f"Wrote kernel spec file 
{config.kernel.spec_name}/kernel.json\n\n" + f"Jupyter kernel spec {config.kernel.spec_name} " + f"({config.kernel.display_name}) " + "has been installed\n" + ) + + def test_install_verbose2( + self, mocker: MockerFixture, capsys: Capsys + ) -> None: + install_mock = mocker.patch( + "jupyter_client.kernelspec.KernelSpecManager.install_kernel_spec" + ) + + config = Config( + [ + "legate-jupyter", + "-vv", + "--name", + "____fake_test_kernel_123abc_____", + ] + ) + driver = Driver(config, system) + + spec = m.generate_kernel_spec(driver, config) + + m.install_kernel_spec(spec, config) + + assert install_mock.call_count == 1 + assert install_mock.call_args[0][1] == config.kernel.spec_name + assert install_mock.call_args[1] == { + "user": config.kernel.user, + "prefix": config.kernel.prefix, + } + + out, _ = capsys.readouterr() + spec_json = json.dumps(spec.to_dict(), sort_keys=True, indent=2) + assert out == ( + f"Wrote kernel spec file {config.kernel.spec_name}/kernel.json\n\n" + f"\n{spec_json}\n\n" + f"Jupyter kernel spec {config.kernel.spec_name} " + f"({config.kernel.display_name}) " + "has been installed\n" + ) diff --git a/tests/unit/legate/jupyter/test_main.py b/tests/unit/legate/jupyter/test_main.py new file mode 100644 index 000000000..0e0159dc9 --- /dev/null +++ b/tests/unit/legate/jupyter/test_main.py @@ -0,0 +1,73 @@ +# Copyright 2021-2022 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
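
The tests above drive install_kernel_spec end-to-end with the KernelSpecManager mocked out. Against a real installation, the recorded spec can be inspected the same way the %legate_info magic does; a sketch, where the kernel name is illustrative and assumes a prior `legate-jupyter --name legate_test` run:

    # Sketch: inspecting a previously installed Legate kernel spec.
    from jupyter_client.kernelspec import KernelSpecManager

    spec = KernelSpecManager().get_kernel_spec("legate_test")
    print(spec.display_name)                # equals the spec name unless overridden
    print(spec.metadata["legate"]["core"])  # cpus/gpus/... captured at install time
    print(spec.metadata["legate"]["memory"])
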
+# +from __future__ import annotations + +import sys + +from pytest_mock import MockerFixture + +import legate.jupyter as m + +# main function shadows main module +# def test___all__() -> None: + +# The main() function is very simple, this test just confirms that +# all the expected plumbing is hooked up as it is supposed to be + + +def test_main(mocker: MockerFixture) -> None: + import legate.driver.driver + import legate.jupyter.config + import legate.util.system + + config_spy = mocker.spy(legate.jupyter.config.Config, "__init__") + system_spy = mocker.spy(legate.util.system.System, "__init__") + driver_spy = mocker.spy(legate.driver.driver.Driver, "__init__") + generate_spy = mocker.spy(legate.jupyter.kernel, "generate_kernel_spec") + install_mock = mocker.patch("legate.jupyter.kernel.install_kernel_spec") + mocker.patch.object(sys, "argv", ["legate-jupyter", "--name", "foo"]) + + m.main() + + assert config_spy.call_count == 1 + assert config_spy.call_args[0][1] == sys.argv + assert config_spy.call_args[1] == {} + + assert system_spy.call_count == 1 + assert system_spy.call_args[0][1:] == () + assert system_spy.call_args[1] == {} + + assert driver_spy.call_count == 1 + assert len(driver_spy.call_args[0]) == 3 + assert isinstance(driver_spy.call_args[0][1], legate.jupyter.config.Config) + assert isinstance(driver_spy.call_args[0][2], legate.util.system.System) + assert driver_spy.call_args[1] == {} + + assert generate_spy.call_count == 1 + assert len(generate_spy.call_args[0]) == 2 + assert isinstance( + generate_spy.call_args[0][0], legate.driver.driver.Driver + ) + assert isinstance( + generate_spy.call_args[0][1], legate.jupyter.config.Config + ) + assert generate_spy.call_args[1] == {} + + assert install_mock.call_count == 1 + assert install_mock.call_args[0][0] == generate_spy.spy_return + assert isinstance( + install_mock.call_args[0][1], legate.jupyter.config.Config + ) + assert install_mock.call_args[1] == {} diff --git a/tests/unit/legate/util/test_args.py b/tests/unit/legate/util/test_args.py index 83e3e02b3..f6c97f4ed 100644 --- a/tests/unit/legate/util/test_args.py +++ b/tests/unit/legate/util/test_args.py @@ -82,12 +82,11 @@ class TestArgSpec: def test_default(self) -> None: spec = m.ArgSpec("dest") assert spec.dest == "dest" - assert spec.action == "store_true" + assert spec.action == m.Unset # all others are unset assert set(m.entries(spec)) == { ("dest", "dest"), - ("action", "store_true"), } @@ -146,7 +145,9 @@ def test_help_override( self, monkeypatch: pytest.MonkeyPatch, capsys: Capsys ) -> None: monkeypatch.setattr("sys.argv", ["app", "-foo:help"]) - args = [m.Argument("help", m.ArgSpec(dest="help"))] + args = [ + m.Argument("help", m.ArgSpec(action="store_true", dest="help")) + ] ns = m.parse_library_command_args("foo", args) out, err = capsys.readouterr() assert out == "" @@ -158,7 +159,7 @@ def test_basic( ) -> None: monkeypatch.setattr("sys.argv", ["app", "-foo:bar", "-foo:quux", "1"]) args = [ - m.Argument("bar", m.ArgSpec(dest="bar")), + m.Argument("bar", m.ArgSpec(action="store_true", dest="bar")), m.Argument( "quux", m.ArgSpec(dest="quux", action="store", type=int) ), @@ -173,7 +174,7 @@ def test_extra_args_passed_on( self, monkeypatch: pytest.MonkeyPatch, capsys: Capsys ) -> None: monkeypatch.setattr("sys.argv", ["app", "-foo:bar", "--extra", "1"]) - args = [m.Argument("bar", m.ArgSpec(dest="bar"))] + args = [m.Argument("bar", m.ArgSpec(action="store_true", dest="bar"))] ns = m.parse_library_command_args("foo", args) out, err = capsys.readouterr() assert out 
== "" @@ -208,7 +209,7 @@ def test_no_prefix_conflict( monkeypatch.setattr( "sys.argv", ["app", "-foo:bar", "--foo", "-f", "1", "-ff"] ) - args = [m.Argument("bar", m.ArgSpec(dest="bar"))] + args = [m.Argument("bar", m.ArgSpec(action="store_true", dest="bar"))] ns = m.parse_library_command_args("foo", args) out, err = capsys.readouterr() assert out == "" diff --git a/typings/IPython/__init__.pyi b/typings/IPython/__init__.pyi new file mode 100644 index 000000000..13b35e47e --- /dev/null +++ b/typings/IPython/__init__.pyi @@ -0,0 +1,20 @@ +# Copyright 2022 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +from __future__ import annotations + +from .core.magic import Magics + +class InteractiveShell: + def register_magics(self, *objs: Magics) -> None: ... diff --git a/typings/IPython/core/magic.pyi b/typings/IPython/core/magic.pyi new file mode 100644 index 000000000..354c7ce2c --- /dev/null +++ b/typings/IPython/core/magic.pyi @@ -0,0 +1,28 @@ +# Copyright 2022 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +from __future__ import annotations + +from typing import Any, Callable, TypeVar + +from typing_extensions import ParamSpec + +class Magics: + def __init__(self, shell: Any) -> None: ... + +R = TypeVar("R") +P = ParamSpec("P") + +line_magic: Callable[[Callable[P, R]], Callable[P, R]] +magics_class: Callable[[Callable[P, R]], Callable[P, R]] diff --git a/typings/jupyter_client/__init__.pyi b/typings/jupyter_client/__init__.pyi new file mode 100644 index 000000000..e69de29bb diff --git a/typings/jupyter_client/kernelspec.pyi b/typings/jupyter_client/kernelspec.pyi new file mode 100644 index 000000000..d69b0e3b7 --- /dev/null +++ b/typings/jupyter_client/kernelspec.pyi @@ -0,0 +1,40 @@ +# Copyright 2022 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
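
The test updates above follow from the ArgSpec change earlier in this series: `action` no longer defaults to "store_true", so boolean flags must request it explicitly, while value-taking options can leave `action` unset and fall back to argparse's default "store" behavior. A small sketch of the distinction, reusing the option names from the tests:

    # Sketch of the two kinds of options after the ArgSpec default change.
    from legate.util.args import ArgSpec, Argument

    # A boolean flag now has to ask for "store_true" explicitly ...
    BAR = Argument("bar", ArgSpec(dest="bar", action="store_true"))

    # ... while a value option may leave `action` unset (argparse "store").
    QUUX = Argument("quux", ArgSpec(dest="quux", type=int))
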
+# +from __future__ import annotations + +from typing import Any + +class KernelSpec: + display_name: str + metadata: dict[str, Any] + + def __init__( + self, + argv: list[str], + env: dict[str, str], + display_name: str, + language: str, + metadata: dict[str, Any], + ) -> None: ... + def to_dict(self) -> dict[str, Any]: ... + +class NoSuchKernel(Exception): ... + +class KernelSpecManager: + def __init__(self, **kwargs: Any) -> None: ... + def get_kernel_spec(self, kernel_name: str) -> KernelSpec: ... + def install_kernel_spec( + self, source_dir: str, kernel_name: str, user: bool, prefix: str | None + ) -> None: ... From 2a9617c3af0d4a2e0eed54b45b1e5160dcb351bc Mon Sep 17 00:00:00 2001 From: Wonchan Lee Date: Mon, 17 Oct 2022 20:13:57 -0700 Subject: [PATCH 025/121] APIs that GH 437 should have included (#443) --- legate/core/operation.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/legate/core/operation.py b/legate/core/operation.py index a3b2aef17..fcd626acb 100644 --- a/legate/core/operation.py +++ b/legate/core/operation.py @@ -247,6 +247,13 @@ def __init__( self._tb: Union[None, TracebackType] = None self._side_effect = False + @property + def side_effect(self) -> bool: + return self._side_effect + + def set_side_effect(self, side_effect: bool) -> None: + self._side_effect = side_effect + @property def uses_communicator(self) -> bool: return len(self._comm_args) > 0 From 180c23ed8a63d1646c9b1b0be65a1ba2bd1da23c Mon Sep 17 00:00:00 2001 From: robinw0928 <104830875+robinw0928@users.noreply.github.com> Date: Tue, 18 Oct 2022 16:38:17 +0800 Subject: [PATCH 026/121] Support python coverage test. (#431) * Support python coverage test. * Address comments * Address comments - part-2 * Add unit tests for cov_args related. --- legate/tester/args.py | 27 +++++++++++++++++ legate/tester/config.py | 5 +++- legate/tester/stages/test_stage.py | 23 ++++++++++++++- .../legate/tester/stages/test_test_stage.py | 29 +++++++++++++++++++ tests/unit/legate/tester/test_config.py | 9 ++++++ 5 files changed, 91 insertions(+), 2 deletions(-) diff --git a/legate/tester/args.py b/legate/tester/args.py index 6c3f24962..fcb680db1 100644 --- a/legate/tester/args.py +++ b/legate/tester/args.py @@ -187,6 +187,33 @@ ) +test_opts.add_argument( + "--cov-bin", + default=None, + help=( + "coverage binary location, " + "e.g. /conda_path/envs/env_name/bin/coverage" + ), +) + + +test_opts.add_argument( + "--cov-args", + default="run -a --branch", + help="coverage run command arguments, e.g. run -a --branch", +) + + +test_opts.add_argument( + "--cov-src-path", + default=None, + help=( + "path value of --source in coverage run command, " + "e.g. 
/project_path/cunumeric/cunumeric" + ), +) + + test_opts.add_argument( "-j", "--workers", diff --git a/legate/tester/config.py b/legate/tester/config.py index 497c3a385..e5cf412fc 100644 --- a/legate/tester/config.py +++ b/legate/tester/config.py @@ -43,7 +43,7 @@ def __init__(self, argv: ArgList) -> None: args, self._extra_args = parser.parse_known_args(self.argv[1:]) # which tests to run - self.examples = True + self.examples = False if args.cov_bin else True self.integration = True self.unit = args.unit self.files = args.files @@ -68,6 +68,9 @@ def __init__(self, argv: ArgList) -> None: self.test_root = args.test_root self.requested_workers = args.workers self.legate_dir = self._compute_legate_dir(args) + self.cov_bin = args.cov_bin + self.cov_args = args.cov_args + self.cov_src_path = args.cov_src_path @property def env(self) -> EnvDict: diff --git a/legate/tester/stages/test_stage.py b/legate/tester/stages/test_stage.py index c21fdd630..5962500bf 100644 --- a/legate/tester/stages/test_stage.py +++ b/legate/tester/stages/test_stage.py @@ -205,6 +205,24 @@ def file_args(self, test_file: Path, config: Config) -> ArgList: return args + def cov_args(self, config: Config) -> ArgList: + """Coverage binary and coverage arguments. + + Parameters + ---------- + config: Config + Test runner configuration + + """ + if config.cov_bin: + args = [str(config.cov_bin)] + config.cov_args.split() + if config.cov_src_path: + args += ["--source", str(config.cov_src_path)] + else: + args = [] + + return args + def run( self, test_file: Path, config: Config, system: TestSystem ) -> ProcessResult: @@ -227,10 +245,13 @@ def run( shard = self.shards.get() + cov_args = self.cov_args(config) + + cmd = [str(config.legate_path)] + cov_args + [str(test_path)] + stage_args = self.args + self.shard_args(shard, config) file_args = self.file_args(test_file, config) - cmd = [str(config.legate_path), str(test_path)] cmd += stage_args + file_args + config.extra_args self.delay(shard, config, system) diff --git a/tests/unit/legate/tester/stages/test_test_stage.py b/tests/unit/legate/tester/stages/test_test_stage.py index 90edfaed4..6a5678c22 100644 --- a/tests/unit/legate/tester/stages/test_test_stage.py +++ b/tests/unit/legate/tester/stages/test_test_stage.py @@ -86,3 +86,32 @@ def test_file_args_vv(self) -> None: stage = MockTestStage(c, s) assert stage.file_args(Path("integration/foo"), c) == ["-v", "-s"] assert stage.file_args(Path("unit/foo"), c) == [] + + def test_cov_args_without_cov_bin(self) -> None: + c = m.Config(["test.py", "--cov-args", "run -a"]) + stage = MockTestStage(c, s) + assert stage.cov_args(c) == [] + + def test_cov_args_with_cov_bin(self) -> None: + cov_bin = "conda/envs/legate/bin/coverage" + args = ["--cov-bin", cov_bin] + c = m.Config(["test.py"] + args) + expected_result = [cov_bin] + c.cov_args.split() + stage = MockTestStage(c, s) + assert stage.cov_args(c) == expected_result + + def test_cov_args_with_cov_bin_args_and_src_path(self) -> None: + cov_bin = "conda/envs/legate/bin/coverage" + cov_args = "run -a" + cov_src_path = "source_path" + args = ( + ["--cov-bin", cov_bin] + + ["--cov-args", cov_args] + + ["--cov-src-path", cov_src_path] + ) + c = m.Config(["test.py"] + args) + expected_result = ( + [cov_bin] + cov_args.split() + ["--source", cov_src_path] + ) + stage = MockTestStage(c, s) + assert stage.cov_args(c) == expected_result diff --git a/tests/unit/legate/tester/test_config.py b/tests/unit/legate/tester/test_config.py index d55104980..2d4326d69 100644 --- 
a/tests/unit/legate/tester/test_config.py +++ b/tests/unit/legate/tester/test_config.py @@ -71,6 +71,10 @@ def test_default_init(self) -> None: assert c.legate_path == "legate" + assert c.cov_bin is None + assert c.cov_args == "run -a --branch" + assert c.cov_src_path is None + @pytest.mark.parametrize("feature", FEATURES) def test_env_features( self, monkeypatch: pytest.MonkeyPatch, feature: str @@ -180,3 +184,8 @@ def test_extra_args(self) -> None: assert c.extra_args == extra c = m.Config(["test.py"] + extra + ["--files", "a", "b"]) assert c.extra_args == extra + + def test_cov_args(self) -> None: + cov_args = ["--cov-args", "run -a"] + c = m.Config(["test.py"] + cov_args) + assert c.cov_args == "run -a" From d30f6ca47b0bfa0843833413f1d008d1ae030fbd Mon Sep 17 00:00:00 2001 From: Bryan Van de Ven Date: Tue, 18 Oct 2022 15:32:08 -0700 Subject: [PATCH 027/121] Make terminal colors explicitly opt-in (#445) * checkpoint * Make terminal colors explicitly opt-in --- legate/driver/args.py | 8 +++++ legate/driver/config.py | 3 ++ legate/jupyter/args.py | 8 +++++ legate/jupyter/config.py | 3 ++ legate/tester/args.py | 8 +++++ legate/tester/config.py | 3 ++ legate/util/colors.py | 22 +++++++++++++ tests/unit/legate/driver/test_config.py | 8 +++++ tests/unit/legate/jupyter/test_config.py | 8 +++++ tests/unit/legate/tester/test_config.py | 8 +++++ tests/unit/legate/util/test_colors.py | 42 ++++++++++++++++++++++-- 11 files changed, 119 insertions(+), 2 deletions(-) diff --git a/legate/driver/args.py b/legate/driver/args.py index cc8667384..f281d22e3 100755 --- a/legate/driver/args.py +++ b/legate/driver/args.py @@ -330,3 +330,11 @@ required=False, help="Whether to run with rlwrap to improve readline ability", ) + +other.add_argument( + "--color", + dest="color", + action="store_true", + required=False, + help="Whether to use color terminal output (if colorama is installed)", +) diff --git a/legate/driver/config.py b/legate/driver/config.py index 5e42bc584..b1192b665 100644 --- a/legate/driver/config.py +++ b/legate/driver/config.py @@ -23,6 +23,7 @@ from pathlib import Path from typing import Any, Protocol +from ..util import colors from ..util.types import ( ArgList, DataclassMixin, @@ -157,6 +158,8 @@ def __init__(self, argv: ArgList) -> None: args, extra = parser.parse_known_args(self.argv[1:]) + colors.ENABLED = args.color + # only saving this for help with testing self._args = args diff --git a/legate/jupyter/args.py b/legate/jupyter/args.py index 77c16b66a..7f80a49a1 100755 --- a/legate/jupyter/args.py +++ b/legate/jupyter/args.py @@ -105,3 +105,11 @@ default=0, help="Display verbose output. 
Use -vv for even more output (test stdout)", ) + +info.add_argument( + "--color", + dest="color", + action="store_true", + required=False, + help="Whether to use color terminal output (if colorama is installed)", +) diff --git a/legate/jupyter/config.py b/legate/jupyter/config.py index 52c44c00f..745238b63 100644 --- a/legate/jupyter/config.py +++ b/legate/jupyter/config.py @@ -20,6 +20,7 @@ from dataclasses import dataclass from pathlib import Path +import legate.util.colors as colors from legate.driver.config import ( Binding, Core, @@ -64,6 +65,8 @@ def __init__(self, argv: ArgList) -> None: # only saving these for help with testing self._args = args + colors.ENABLED = args.color + if args.display_name is None: args.display_name = args.spec_name diff --git a/legate/tester/args.py b/legate/tester/args.py index fcb680db1..0645fea9e 100644 --- a/legate/tester/args.py +++ b/legate/tester/args.py @@ -248,3 +248,11 @@ action="store_true", help="Print out the commands that are to be executed", ) + +parser.add_argument( + "--color", + dest="color", + action="store_true", + required=False, + help="Whether to use color terminal output (if colorama is installed)", +) diff --git a/legate/tester/config.py b/legate/tester/config.py index e5cf412fc..39441e433 100644 --- a/legate/tester/config.py +++ b/legate/tester/config.py @@ -21,6 +21,7 @@ from argparse import Namespace from pathlib import Path +from ..util import colors from ..util.types import ArgList, EnvDict from . import DEFAULT_PROCESS_ENV, FEATURES, SKIPPED_EXAMPLES, FeatureType from .args import parser @@ -42,6 +43,8 @@ def __init__(self, argv: ArgList) -> None: args, self._extra_args = parser.parse_known_args(self.argv[1:]) + colors.ENABLED = args.color + # which tests to run self.examples = False if args.cov_bin else True self.integration = True diff --git a/legate/util/colors.py b/legate/util/colors.py index 5bb0b14b3..6c417c221 100644 --- a/legate/util/colors.py +++ b/legate/util/colors.py @@ -37,6 +37,12 @@ ) +# Color terminal output needs to be explicitly opt-in. 
Applications that want +# to enable it should set this global flag to True, e.g based on a command line +# argument or other user-supplied configuration +ENABLED = False + + def _text(text: str) -> str: return text @@ -45,27 +51,43 @@ def _text(text: str) -> str: import colorama # type: ignore[import] def bright(text: str) -> str: + if not ENABLED: + return text return f"{colorama.Style.BRIGHT}{text}{colorama.Style.RESET_ALL}" def dim(text: str) -> str: + if not ENABLED: + return text return f"{colorama.Style.DIM}{text}{colorama.Style.RESET_ALL}" def white(text: str) -> str: + if not ENABLED: + return text return f"{colorama.Fore.WHITE}{text}{colorama.Style.RESET_ALL}" def cyan(text: str) -> str: + if not ENABLED: + return text return f"{colorama.Fore.CYAN}{text}{colorama.Style.RESET_ALL}" def red(text: str) -> str: + if not ENABLED: + return text return f"{colorama.Fore.RED}{text}{colorama.Style.RESET_ALL}" def magenta(text: str) -> str: + if not ENABLED: + return text return f"{colorama.Fore.MAGENTA}{text}{colorama.Style.RESET_ALL}" def green(text: str) -> str: + if not ENABLED: + return text return f"{colorama.Fore.GREEN}{text}{colorama.Style.RESET_ALL}" def yellow(text: str) -> str: + if not ENABLED: + return text return f"{colorama.Fore.YELLOW}{text}{colorama.Style.RESET_ALL}" if sys.platform == "win32": diff --git a/tests/unit/legate/driver/test_config.py b/tests/unit/legate/driver/test_config.py index 536289221..104f95f58 100644 --- a/tests/unit/legate/driver/test_config.py +++ b/tests/unit/legate/driver/test_config.py @@ -23,6 +23,7 @@ import legate.driver.config as m import legate.driver.defaults as defaults +from legate.util import colors from legate.util.colors import scrub from legate.util.types import DataclassMixin @@ -173,6 +174,8 @@ def test_default_init(self) -> None: c = m.Config(["legate"]) + assert colors.ENABLED is False + assert c.multi_node == m.MultiNode( nodes=defaults.LEGATE_NODES, ranks_per_node=defaults.LEGATE_RANKS_PER_NODE, @@ -232,6 +235,11 @@ def test_default_init(self) -> None: assert c.other == m.Other(module=None, dry_run=False, rlwrap=False) + def test_color_arg(self) -> None: + m.Config(["legate", "--color"]) + + assert colors.ENABLED is True + def test_arg_conversions(self, mocker: MockerFixture) -> None: # This is kind of a dumb short-cut test, but if we believe that diff --git a/tests/unit/legate/jupyter/test_config.py b/tests/unit/legate/jupyter/test_config.py index 3ee258a14..4e956ff85 100644 --- a/tests/unit/legate/jupyter/test_config.py +++ b/tests/unit/legate/jupyter/test_config.py @@ -22,6 +22,7 @@ import legate.driver.defaults as defaults import legate.jupyter.config as m from legate.driver.config import Core, Memory, MultiNode +from legate.util import colors from legate.util.types import DataclassMixin @@ -47,6 +48,8 @@ def test_default_init(self) -> None: c = m.Config(["legate-jupyter"]) + assert colors.ENABLED is False + assert c.multi_node == m.MultiNode( nodes=defaults.LEGATE_NODES, ranks_per_node=defaults.LEGATE_RANKS_PER_NODE, @@ -108,6 +111,11 @@ def test_default_init(self) -> None: assert c.other == m.Other(module=None, dry_run=False, rlwrap=False) + def test_color_arg(self) -> None: + m.Config(["legate-jupyter", "--color"]) + + assert colors.ENABLED is True + def test_arg_conversions(self, mocker: MockerFixture) -> None: # This is kind of a dumb short-cut test, but if we believe that diff --git a/tests/unit/legate/tester/test_config.py b/tests/unit/legate/tester/test_config.py index 2d4326d69..f0e351caf 100644 --- 
a/tests/unit/legate/tester/test_config.py +++ b/tests/unit/legate/tester/test_config.py @@ -32,12 +32,15 @@ config as m, ) from legate.tester.args import PIN_OPTIONS, PinOptionsType +from legate.util import colors class TestConfig: def test_default_init(self) -> None: c = m.Config([]) + assert colors.ENABLED is False + assert c.examples is True assert c.integration is True assert c.unit is False @@ -75,6 +78,11 @@ def test_default_init(self) -> None: assert c.cov_args == "run -a --branch" assert c.cov_src_path is None + def test_color_arg(self) -> None: + m.Config(["test.py", "--color"]) + + assert colors.ENABLED is True + @pytest.mark.parametrize("feature", FEATURES) def test_env_features( self, monkeypatch: pytest.MonkeyPatch, feature: str diff --git a/tests/unit/legate/util/test_colors.py b/tests/unit/legate/util/test_colors.py index 873f3dc53..60dce0ec3 100644 --- a/tests/unit/legate/util/test_colors.py +++ b/tests/unit/legate/util/test_colors.py @@ -57,9 +57,17 @@ def use_plain_text(mocker: MockerFixture) -> None: ) +def test_default_ENABLED() -> None: + assert m.ENABLED is False + + @pytest.mark.skipif(colorama is None, reason="colorama required") @pytest.mark.parametrize("color", COLOR_FUNCS) -def test_color_functions(color: str) -> None: +def test_color_functions_ENABLED_True( + mocker: MockerFixture, color: str +) -> None: + mocker.patch.object(m, "ENABLED", True) + cfunc = getattr(m, color) cprop = getattr(colorama.Fore, color.upper()) @@ -68,9 +76,26 @@ def test_color_functions(color: str) -> None: assert out == f"{cprop}some text{colorama.Style.RESET_ALL}" +@pytest.mark.parametrize("color", COLOR_FUNCS) +def test_color_functions_ENABLED_False( + mocker: MockerFixture, color: str +) -> None: + mocker.patch.object(m, "ENABLED", False) + + cfunc = getattr(m, color) + + out = cfunc("some text") + + assert out == "some text" + + @pytest.mark.skipif(colorama is None, reason="colorama required") @pytest.mark.parametrize("style", STYLE_FUNCS) -def test_style_functions(style: str) -> None: +def test_style_functions_ENABLED_True( + mocker: MockerFixture, style: str +) -> None: + mocker.patch.object(m, "ENABLED", True) + sfunc = getattr(m, style) sprop = getattr(colorama.Style, style.upper()) @@ -79,6 +104,19 @@ def test_style_functions(style: str) -> None: assert out == f"{sprop}some text{colorama.Style.RESET_ALL}" +@pytest.mark.parametrize("style", STYLE_FUNCS) +def test_style_functions_ENABLED_False( + mocker: MockerFixture, style: str +) -> None: + mocker.patch.object(m, "ENABLED", False) + + sfunc = getattr(m, style) + + out = sfunc("some text") + + assert out == "some text" + + @pytest.mark.skipif(colorama is None, reason="colorama required") @pytest.mark.parametrize("color", COLOR_FUNCS) @pytest.mark.parametrize("style", STYLE_FUNCS) From 90fb1ee91eaa6492a7d5fa709c5e8f138aa4459c Mon Sep 17 00:00:00 2001 From: Bryan Van de Ven Date: Wed, 19 Oct 2022 10:05:07 -0700 Subject: [PATCH 028/121] Allow launcher_extra to split quoted values (#444) * Allow launcher_extra to split quoted values * use shlex.split to preserve sub-quotes * avoid over-quoting * same treatment for nsys_extra * docs --- legate/driver/args.py | 4 +- legate/driver/config.py | 19 +++++ legate/util/shared_args.py | 3 +- tests/unit/legate/driver/test_config.py | 100 ++++++++++++++++++++++++ 4 files changed, 124 insertions(+), 2 deletions(-) diff --git a/legate/driver/args.py b/legate/driver/args.py index f281d22e3..c473efa5f 100755 --- a/legate/driver/args.py +++ b/legate/driver/args.py @@ -162,7 +162,9 @@ 
action="append", default=[], required=False, - help="Specify extra flags for Nsight Systems", + help="Specify extra flags for Nsight Systems (can appear more than once). " + "Multiple arguments may be provided together in a quoted string " + "(arguments with spaces inside must be additionally quoted)", ) logging = parser.add_argument_group("Logging") diff --git a/legate/driver/config.py b/legate/driver/config.py index b1192b665..470cca123 100644 --- a/legate/driver/config.py +++ b/legate/driver/config.py @@ -17,6 +17,7 @@ """ from __future__ import annotations +import shlex from argparse import Namespace from dataclasses import dataclass from functools import cached_property @@ -44,6 +45,16 @@ class MultiNode(DataclassMixin): launcher: LauncherType launcher_extra: list[str] + def __post_init__(self, **kw: dict[str, Any]) -> None: + # fix up launcher_extra to automaticaly handle quoted strings with + # internal whitespace, have to use __setattr__ for frozen + # https://docs.python.org/3/library/dataclasses.html#frozen-instances + if self.launcher_extra: + ex: list[str] = sum( + (shlex.split(x) for x in self.launcher_extra), [] + ) + object.__setattr__(self, "launcher_extra", ex) + @property def ranks(self) -> int: return self.nodes * self.ranks_per_node @@ -84,6 +95,14 @@ class Profiling(DataclassMixin): nsys_targets: str # TODO: multi-choice nsys_extra: list[str] + def __post_init__(self, **kw: dict[str, Any]) -> None: + # fix up nsys_extra to automaticaly handle quoted strings with + # internal whitespace, have to use __setattr__ for frozen + # https://docs.python.org/3/library/dataclasses.html#frozen-instances + if self.nsys_extra: + ex: list[str] = sum((shlex.split(x) for x in self.nsys_extra), []) + object.__setattr__(self, "nsys_extra", ex) + @dataclass(frozen=True) class Logging(DataclassMixin): diff --git a/legate/util/shared_args.py b/legate/util/shared_args.py index 46def8642..688c0bfa3 100644 --- a/legate/util/shared_args.py +++ b/legate/util/shared_args.py @@ -96,7 +96,8 @@ default=[], required=False, help="additional argument to pass to the launcher (can appear more " - "than once)", + "than once). 
Multiple arguments may be provided together in a quoted " + "string (arguments with spaces inside must be additionally quoted)", ), ) diff --git a/tests/unit/legate/driver/test_config.py b/tests/unit/legate/driver/test_config.py index 104f95f58..2408bfe08 100644 --- a/tests/unit/legate/driver/test_config.py +++ b/tests/unit/legate/driver/test_config.py @@ -55,6 +55,56 @@ def test_fields(self) -> None: def test_mixin(self) -> None: assert issubclass(m.MultiNode, DataclassMixin) + @pytest.mark.parametrize( + "extra", + (["a"], ["a", "b c"], ["a", "b c", "d e"], ["a", "b c", "d e", "f"]), + ) + def test_launcher_extra_fixup_basic(self, extra) -> None: + mn = m.MultiNode( + nodes=1, + ranks_per_node=1, + not_control_replicable=False, + launcher="launcher", + launcher_extra=extra, + ) + assert mn.launcher_extra == sum((x.split() for x in extra), []) + + def test_launcher_extra_fixup_complex(self) -> None: + mn = m.MultiNode( + nodes=1, + ranks_per_node=1, + not_control_replicable=False, + launcher="launcher", + launcher_extra=[ + "-H g0002,g0002 -X SOMEENV --fork", + "-bind-to none", + ], + ) + assert mn.launcher_extra == [ + "-H", + "g0002,g0002", + "-X", + "SOMEENV", + "--fork", + "-bind-to", + "none", + ] + + def test_launcher_extra_fixup_quoted(self) -> None: + mn = m.MultiNode( + nodes=1, + ranks_per_node=1, + not_control_replicable=False, + launcher="launcher", + launcher_extra=[ + "-f 'some path with spaces/foo.txt'", + ], + ) + assert mn.launcher_extra == [ + "-f", + "some path with spaces/foo.txt", + ] + class TestBinding: def test_fields(self) -> None: @@ -111,6 +161,56 @@ def test_fields(self) -> None: def test_mixin(self) -> None: assert issubclass(m.Profiling, DataclassMixin) + @pytest.mark.parametrize( + "extra", + (["a"], ["a", "b c"], ["a", "b c", "d e"], ["a", "b c", "d e", "f"]), + ) + def test_nsys_extra_fixup_basic(self, extra) -> None: + p = m.Profiling( + profile=True, + nvprof=True, + nsys=True, + nsys_targets="foo,bar", + nsys_extra=extra, + ) + assert p.nsys_extra == sum((x.split() for x in extra), []) + + def test_nsys_extra_fixup_complex(self) -> None: + p = m.Profiling( + profile=True, + nvprof=True, + nsys=True, + nsys_targets="foo,bar", + nsys_extra=[ + "-H g0002,g0002 -X SOMEENV --fork", + "-bind-to none", + ], + ) + assert p.nsys_extra == [ + "-H", + "g0002,g0002", + "-X", + "SOMEENV", + "--fork", + "-bind-to", + "none", + ] + + def test_nsys_extra_fixup_quoted(self) -> None: + p = m.Profiling( + profile=True, + nvprof=True, + nsys=True, + nsys_targets="foo,bar", + nsys_extra=[ + "-f 'some path with spaces/foo.txt'", + ], + ) + assert p.nsys_extra == [ + "-f", + "some path with spaces/foo.txt", + ] + class TestLogging: def test_fields(self) -> None: From 6792d4d2ae8a6f0c79a5370ce40b91edb9c8d335 Mon Sep 17 00:00:00 2001 From: Manolis Papadakis Date: Wed, 19 Oct 2022 11:23:53 -0700 Subject: [PATCH 029/121] Update CMakeLists.txt (#446) --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 74e640445..fa2ce2cf4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -65,7 +65,7 @@ include(rapids-cuda) include(rapids-export) include(rapids-find) -set(legate_core_version 22.10.00) +set(legate_core_version 22.12.00) # For now we want the optimization flags to match on both normal make and cmake # builds so we override the cmake defaults here for release, this changes From b5ce4428719a41c1113ec536a63acf68767fa9fd Mon Sep 17 00:00:00 2001 From: Bryan Van de Ven Date: Fri, 21 Oct 2022 14:30:09 -0700 Subject: 
[PATCH 030/121] remove unncecessary exec bit (#451) --- legate/driver/args.py | 0 legate/jupyter/args.py | 0 legate/lgpatch.py | 0 3 files changed, 0 insertions(+), 0 deletions(-) mode change 100755 => 100644 legate/driver/args.py mode change 100755 => 100644 legate/jupyter/args.py mode change 100755 => 100644 legate/lgpatch.py diff --git a/legate/driver/args.py b/legate/driver/args.py old mode 100755 new mode 100644 diff --git a/legate/jupyter/args.py b/legate/jupyter/args.py old mode 100755 new mode 100644 diff --git a/legate/lgpatch.py b/legate/lgpatch.py old mode 100755 new mode 100644 From bbc69e5383a0abcb5e2099a6eb8912578bf099db Mon Sep 17 00:00:00 2001 From: Bryan Van de Ven Date: Tue, 25 Oct 2022 08:49:57 -0700 Subject: [PATCH 031/121] Better error when GPU detection fails (#448) --- legate/tester/test_plan.py | 6 +++--- legate/util/system.py | 8 +++++++- tests/unit/legate/util/test_system.py | 14 +++++++++++--- 3 files changed, 21 insertions(+), 7 deletions(-) diff --git a/legate/tester/test_plan.py b/legate/tester/test_plan.py index cc877f7a4..634790758 100644 --- a/legate/tester/test_plan.py +++ b/legate/tester/test_plan.py @@ -79,9 +79,9 @@ def intro(self) -> str: cpus = len(self._system.cpus) try: - gpus = len(self._system.gpus) - except ImportError: - gpus = 0 + gpus: int | str = len(self._system.gpus) + except RuntimeError: + gpus = "N/A" details = ( f"* Feature stages : {', '.join(yellow(x) for x in self._config.features)}", # noqa E501 diff --git a/legate/util/system.py b/legate/util/system.py index 5fbabe1d0..ba48e6ac5 100644 --- a/legate/util/system.py +++ b/legate/util/system.py @@ -121,7 +121,13 @@ def gpus(self) -> tuple[GPUInfo, ...]: # fail. pynvml.nvmlInit() except Exception: - return () + if platform.system() == "Darwin": + raise RuntimeError("GPU execution is not available on OSX.") + else: + raise RuntimeError( + "GPU detection failed. Make sure nvml and pynvml are " + "both installed." + ) num_gpus = pynvml.nvmlDeviceGetCount() diff --git a/tests/unit/legate/util/test_system.py b/tests/unit/legate/util/test_system.py index 3ae242b6f..c3a5d6184 100644 --- a/tests/unit/legate/util/test_system.py +++ b/tests/unit/legate/util/test_system.py @@ -15,7 +15,7 @@ from __future__ import annotations import os -import sys +import platform import pytest from pytest_mock import MockerFixture @@ -98,8 +98,16 @@ def test_cpus(self) -> None: assert len(cpus) > 0 assert all(len(cpu.ids) > 0 for cpu in cpus) - @pytest.mark.skipif(sys.platform != "linux", reason="pynvml required") - def test_gpus(self) -> None: + @pytest.mark.skipif(platform.system() != "Linux", reason="Linux test") + def test_gpus_linux(self) -> None: s = m.System() # can't really assume / test much here s.gpus + + @pytest.mark.skipif(platform.system() != "Darwin", reason="OSX test") + def test_gpus_osx(self) -> None: + s = m.System() + + msg = "GPU execution is not available on OSX." 
+ with pytest.raises(RuntimeError, msg=msg): + s.gpus From 5512ffb0e2c7bdecb522fa9b98c9276111c4cf61 Mon Sep 17 00:00:00 2001 From: Bryan Van de Ven Date: Tue, 25 Oct 2022 10:44:32 -0700 Subject: [PATCH 032/121] Add script to generate conda envs (#367) * Add script to generate conda envs * remove gcc and sysroot pkgs * split out openmpi and compilers options * Adjust consensus match frequency based on field sizes (#402) * Perform consensus match more frequently for bigger free fields * Minor cleanup * add command line args for selection * help wording * Make script executable * Fixes for python 3.8 * Remove old environment files * Unify file naming for "compilers" and "openmpi" * Fix typo * Remove optional ninja dependency * Not just for the core, include cunumeric also * Update build documentation * Fix a file link * Fix formatting * remove typing_extensions dependency * remove jinja dependency * slight vertical whitespace improvemetn * Use custom BooleanFlag action, for the benefit of py3.8 * Update build instructions * Fix intra-document reference * Revise file naming scheme * Update BUILD.md Co-authored-by: Wonchan Lee Co-authored-by: Manolis Papadakis Co-authored-by: Manolis Papadakis --- BUILD.md | 240 ++++++++++++++++++-- README.md | 140 ++---------- conda/environment-test-3.10.yml | 58 ----- conda/environment-test-3.8.yml | 58 ----- conda/environment-test-3.9.yml | 58 ----- scripts/generate-conda-envs.py | 381 ++++++++++++++++++++++++++++++++ 6 files changed, 618 insertions(+), 317 deletions(-) delete mode 100644 conda/environment-test-3.10.yml delete mode 100644 conda/environment-test-3.8.yml delete mode 100644 conda/environment-test-3.9.yml create mode 100755 scripts/generate-conda-envs.py diff --git a/BUILD.md b/BUILD.md index 8bf5f1ac7..5abb0af8d 100644 --- a/BUILD.md +++ b/BUILD.md @@ -15,40 +15,227 @@ limitations under the License. --> -# Overview +# TL;DR -The build system is designed to enable two different modes of use: -1. Simple `install.py` helper script or `pip install` for users -2. Highly customizable incremental builds for developers +1) Check if there are specialized scripts available for your cluster at https://github.com/nv-legate/quickstart. +2) [Install dependencies from conda](#getting-dependencies-through-conda) +3) [Build using install.py](#using-installpy) -We review each of these modes with examples. +# Getting dependencies +## Getting dependencies through conda + +The primary method of retrieving dependencies for Legate Core and downstream +libraries is through [conda](https://conda.io). You will need an installation of +conda to follow the instructions below. + +Please use the `scripts/generate-conda-envs.py` script to create a conda +environment file listing all the packages that are required to build, run and +test Legate Core and all downstream libraries. For example: + +``` +$ ./scripts/generate-conda-envs.py --python 3.10 --ctk 11.7 --os linux --compilers --openmpi +--- generating: environment-test-linux-py310-cuda-11.7-compilers-openmpi.yaml +``` + +Run this script with `-h` to see all available configuration options for the +generated environment file (e.g. all the supported Python versions). See the +[Notable Dependencies](#notable-dependencies) section for more details. 
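If you need environment files for several configurations at once, the generator can also be driven from a small script. The sketch below is purely illustrative (it is not part of the repository); it only assumes the script location and the flag names shown above.

```python
#!/usr/bin/env python3
# Illustrative helper (not part of the repository): batch-generate environment
# files for a few Python/CTK combinations by shelling out to the generator.
import subprocess

for python in ("3.9", "3.10"):
    for ctk in ("none", "11.7"):
        subprocess.run(
            [
                "./scripts/generate-conda-envs.py",
                "--python", python,
                "--ctk", ctk,
                "--os", "linux",
                "--compilers",
                "--openmpi",
            ],
            check=True,
        )
```

In the common case a single invocation like the one shown earlier is all you need; each run prints the name of the file it generates.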
+ +Once you have this environment file, you can install the required packages by +creating a new conda environment: + +``` +conda env create -n legate -f .yaml +``` + +or by updating an existing environment: + +``` +conda env update -f .yaml +``` + +## Notable dependencies + +### OS (`--os` option) + +Legate has been tested on Linux and MacOS, although only a few flavors of Linux +such as Ubuntu have been thoroughly tested. There is currently no support for +Windows. + +### Python >= 3.8 (`--python` option) + +In terms of Python compatibility, Legate *roughly* follows the timeline outlined +in [NEP 29](https://numpy.org/neps/nep-0029-deprecation_policy.html). + +### C++17 compatible compiler (`--compilers` option) + +For example: g++, clang, or nvc++. When creating an environment using the +`--compilers` flag, an appropriate compiler for the current system will be +pulled from conda. + +If you need/prefer to use the system-provided compilers (typical for HPC +installations), please use a conda environment generated with `--no-compilers`. +Note that this will likely result in a +[conda/system library conflict](#alternative-sources-for-dependencies), +since the system compilers will typically produce executables +that link against the system-provided libraries, which can shadow the +conda-provided equivalents. + +### CUDA >= 10.2 (`--ctk` flag; optional) + +Only necessary if you wish to run with Nvidia GPUs. + +Some CUDA components necessary for building, e.g. the `nvcc` compiler and driver +stubs, are not distributed through conda. These must instead be installed using +[system-level packages](https://developer.nvidia.com/cuda-downloads). + +Independent of the system-level CUDA installation, conda will need to install an +environment-local copy of the CUDA toolkit (which is what the `--ctk` option +controls). To avoid versioning conflicts it is safest to match the version of +CUDA installed system-wide on your machine + +Legate is tested and guaranteed to be compatible with Volta and later GPU +architectures. You can use Legate with Pascal GPUs as well, but there could +be issues due to lack of independent thread scheduling. Please report any such +issues on GitHub. + +### Fortran compiler (optional) + +Only necessary if you wish to build OpenBLAS from source. + +Not included by default in the generated conda environment files; install +`fortran-compiler` from `conda-forge` if you need it. + +### Numactl (optional) + +Required to support CPU and memory binding in the Legate launcher. + +Not available on conda; typically available through the system-level package +manager. + +### MPI (`--openmpi` option) + +Only necessary if you wish to run on multiple nodes. + +Conda distributes a generic build of OpenMPI, but you may need to use a more +specialized build, e.g. the one distributed by +[MOFED](https://network.nvidia.com/products/infiniband-drivers/linux/mlnx_ofed/), +or one provided by your HPC vendor. In that case you should use an environment +file generated with `--no-openmpi`. + +Legate requires a build of MPI that supports `MPI_THREAD_MULTIPLE`. + +### Networking libraries (e.g. Infiniband, RoCE, UCX; optional) + +Only necessary if you wish to run on multiple nodes. + +Not available on conda; typically available through MOFED or the system-level +package manager. + +If using UCX, a build configured with `--enable-mt` is required. 
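If you are unsure whether a given MPI build satisfies the `MPI_THREAD_MULTIPLE` requirement above, one quick sanity check is to query the provided thread level. The snippet below is only an illustration and assumes `mpi4py` is installed alongside that MPI build (`mpi4py` is not one of the dependencies listed here).

```python
# Illustrative check only; requires mpi4py, which is not part of the generated
# environments. Reports the thread support level provided by the MPI library.
from mpi4py import MPI

provided = MPI.Query_thread()
if provided < MPI.THREAD_MULTIPLE:
    raise SystemExit(f"MPI thread level {provided} is below MPI_THREAD_MULTIPLE")
print("MPI build supports MPI_THREAD_MULTIPLE")
```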
+ +## Alternative sources for dependencies + +If you do not wish to use conda for some (or all) of the dependencies, you can +remove the corresponding entries from the environment file before passing it to +conda. See [the `install.py` section](#using-installpy) for instructions on how +to provide alternative locations for these dependencies to the build process. + +Note that this is likely to result in conflicts between conda-provided and +system-provided libraries. + +Conda distributes its own version of certain common libraries (in particular the +C++ standard library), which are also typically available system-wide. Any +system package you include will typically link to the system version, while +conda packages link to the conda version. Often these two different versions, +although incompatible, carry the same version number (`SONAME`), and are +therefore indistinguishable to the dynamic linker. Then, the first component to +specify a link location for this library will cause it to be loaded from there, +and any subsequent link requests for the same library, even if suggesting a +different link location, will get served using the previously linked version. + +This can cause link failures at runtime, e.g. when a system-level library +happens to be the first to load GLIBC, causing any conda library that comes +after to trip GLIBC's internal version checks, since the conda library expects +to find symbols with more recent version numbers than what is available on the +system-wide GLIBC: + +``` +/lib/x86_64-linux-gnu/libstdc++.so.6: version `GLIBCXX_3.4.30' not found (required by /opt/conda/envs/legate/lib/libarrow.so) +``` + +You can usually work around this issue by putting the conda library directory +first in the dynamic library resolution path: + +``` +LD_LIBRARY_PATH="$CONDA_PREFIX/lib:$LD_LIBRARY_PATH" +``` + +This way you can make sure that the (typically more recent) conda version of any +common library will be preferred over the system-wide one, no matter which +component requests it first. # Building for Users ## Using install.py -For releases <= 22.07, the main method for building Legate was the `install.py` script. -Although the underlying implementation has significantly changed, `install.py` still supports the -same usage and same set of flags. For a full list of flags, users can run: +The Legate Core repository comes with a helper `install.py` script in the +top-level directory, that will build the C++ parts of the library and install +the C++ and Python components under the currently active Python environment. + +To add GPU support, use the `--cuda` flag: + +``` +./install.py --cuda +``` + +You can specify the CUDA toolkit directory and the CUDA architecture you want to +target using the `--with-cuda` and `--arch` flags, e.g.: ``` -$ ./install.py --help +./install.py --cuda --with-cuda /usr/local/cuda/ --arch ampere ``` -## Using Conda +By default the script relies on CMake's auto-detection for these settings. +CMake will first search the currently active Python/conda environment +for dependencies, then any common system-wide installation directories (e.g. +`/usr/lib`). If a dependency cannot be found but is publicly available in source +form (e.g. OpenBLAS), cmake will fetch and build it automatically. You can +override this search by providing an install location for any dependency +explicitly, using a `--with-dep` flag, e.g. `--with-nccl` and +`--with-openblas`. 
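As a concrete (hypothetical) illustration of the `--with-dep` flags mentioned above, the following drives `install.py` with NCCL and OpenBLAS locations taken from the active conda environment; the flag names are the ones named above, and the paths are placeholders to adapt to wherever your copies actually live.

```python
# Hypothetical invocation of install.py with explicit dependency locations;
# CONDA_PREFIX is used only as a placeholder for wherever the libraries live.
import os
import subprocess

prefix = os.environ["CONDA_PREFIX"]
subprocess.run(
    ["./install.py", "--cuda", "--with-nccl", prefix, "--with-openblas", prefix],
    check=True,
)
```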
+ +For multi-node execution Legate uses [GASNet](https://gasnet.lbl.gov/) which can be +requested using the `--network gasnet1` or `--network gasnetex` flag. By default +GASNet will be automatically downloaded and built, but if you have an existing +installation then you can inform the install script using the `--with-gasnet` flag. +You also need to specify the interconnect network of the target machine using the +`--conduit` flag. + +For example this would be an installation for a +[DGX SuperPOD](https://www.nvidia.com/en-us/data-center/dgx-superpod/): +``` +./install.py --network gasnet1 --conduit ibv --cuda --arch ampere +``` +Alternatively, here is an install line for the +[Piz-Daint](https://www.cscs.ch/computers/dismissed/piz-daint-piz-dora/) supercomputer: +``` +./install.py --network gasnet1 --conduit aries --cuda --arch pascal +``` -Legate can be installed using Conda by pointing to the required channels (`-c`): +To see all available configuration options, run with the `--help` flag: ``` -conda install -c nvidia -c conda-forge -c legate legate-core +./install.py --help ``` ## Using pip -Legate is not yet registered in a standard pip repository. However, users can still use the -pip installer to build and install Legate. After downloading or cloning the legate.core source, -users can run the following in the legate.core folder: +Legate Core is not yet registered in a standard pip repository. However, users +can still use the pip installer to build and install Legate Core. The following +command will trigger a single-node, CPU-only build of Legate Core, then install +it into the currently active Python environment: ``` $ pip install . @@ -58,18 +245,20 @@ or $ python3 -m pip install . ``` -This will install Legate in the standard packages directory for the environment Python. +## Advanced Customization -### Advanced Customization - -If users need to customize details of the underlying CMake build, they can pass -CMake flags through the `SKBUILD_CONFIGURE_OPTIONS` environment variable: +Legate relies on CMake to select its toolchain and build flags. Users can set +the environment variables `CXX` or `CXXFLAGS` prior to building to override the +CMake defaults. Alternatively, CMake values can be overridden through the +`SKBUILD_CONFIGURE_OPTIONS` variable: ``` $ SKBUILD_CONFIGURE_OPTIONS="-D Legion_USE_CUDA:BOOL=ON" \ pip install . ``` + An alternative syntax using `setup.py` with `scikit-build` is + ``` $ python setup.py install -- -DLegion_USE_CUDA:BOOL=ON ``` @@ -86,15 +275,17 @@ in `setup.py` to drive the build and installation. A `pip install` will trigger 3. pip installation of Python files The CMake build can be configured independently of `pip`, allowing incremental C++ builds directly through CMake. -This simplifies rebuilding `libcunumeric.so` either via command-line or via IDE. +This simplifies rebuilding the C++ shared libraries either via command-line or via IDE. After building the C++ libraries, the `pip install` can be done in "editable" mode using the `-e` flag. This configures the Python site packages to import the Python source tree directly. The Python source can then be edited and used directly for testing without requiring another `pip install`. ## Example -There are several examples in the `scripts` folder. We walk through the steps in the `build-separately-no-install.sh` here. -First, the CMake build needs to be configured, e.g.: +There are several examples in the `scripts` folder. We walk through the steps in +`build-separately-no-install.sh` here. 
+ +First, the CMake build needs to be configured: ``` $ cmake -S . -B build -GNinja -D Legion_USE_CUDA=ON @@ -118,6 +309,7 @@ $ SKBUILD_BUILD_OPTIONS="-D FIND_LEGATE_CORE_CPP=ON -D legate_core_ROOT=$(pwd)/b The Python source tree and CMake build tree are now available with the environment Python for running Legate programs. The diagram below illustrates the -complete workflow for building both Legate core and a downstream package [cuNumeric]() +complete workflow for building both Legate core and a downstream package, +[cuNumeric](https://github.com/nv-legate/cunumeric) drawing diff --git a/README.md b/README.md index ff1142695..fe0d5b5e4 100644 --- a/README.md +++ b/README.md @@ -50,15 +50,23 @@ Pull requests are welcomed. If you have questions, please contact us at legate(at)nvidia.com. -1. [Why Legate?](#why-legate) -1. [What is the Legate Core?](#what-is-the-legate-core) -1. [How Does Legate Work?](#how-does-legate-work) -1. [How Do I Install Legate?](#how-do-i-install-legate) -1. [How Do I Use Legate?](#how-do-i-use-legate) -1. [Other FAQs](#other-faqs) -1. [Contributing](#contributing) -1. [Documentation](#documentation) -1. [Next Steps](#next-steps) +- [Legate](#legate) + - [Why Legate?](#why-legate) + - [What is the Legate Core?](#what-is-the-legate-core) + - [How Does Legate Work?](#how-does-legate-work) + - [How Do I Install Legate?](#how-do-i-install-legate) + - [How Do I Use Legate?](#how-do-i-use-legate) + - [Distributed Launch](#distributed-launch) + - [Debugging and Profiling](#debugging-and-profiling) + - [Running Legate programs with Jupyter Notebook](#running-legate-programs-with-jupyter-notebook) + - [Installation of the Legate IPython Kernel](#installation-of-the-legate-ipython-kernel) + - [Running with Jupyter Notebook](#running-with-jupyter-notebook) + - [Configuring the Jupyter Notebook](#configuring-the-jupyter-notebook) + - [Magic Command](#magic-command) + - [Other FAQs](#other-faqs) + - [Contributing](#contributing) + - [Documentation](#documentation) + - [Next Steps](#next-steps) ## Why Legate? @@ -215,120 +223,14 @@ Legate Core is available [on conda](https://anaconda.org/legate/legate-core): conda install -c nvidia -c conda-forge -c legate legate-core ``` +The conda package is compatible with CUDA >= 11.4 (CUDA driver version >= r470), +and Volta or later GPU architectures. + Docker image build scripts, as well as specialized install scripts for supported clusters are available on the [quickstart](https://github.com/nv-legate/quickstart) repo. -Read on for general instructions on building Legate Core from source. - -### Dependencies - -Legate has been tested on Linux and MacOS, although only a few flavors of Linux -such as Ubuntu have been thoroughly tested. There is currently no support for -Windows. 
- -Legate Core requires the following: - - - Python >= 3.8 - - [CUDA](https://developer.nvidia.com/cuda-downloads) >= 10.2 - - GNU Make - - C++17 compatible compiler (g++, clang, or nvc++) - - numactl (optional, to support CPU and memory binding) - - the Python packages listed in any one of the conda environment files: - - `conda/environment-test-3.8.yml` - - `conda/environment-test-3.9.yml` - - `conda/environment-test-3.10.yml` - -You can install the required Python packages by creating a new conda environment: - -``` -conda env create -n legate -f conda/environment-test-3.10.yml -``` - -or by updating an existing environment: - -``` -conda env update -f conda/environment-test-3.10.yml -``` - -Note that conda will need to install an environment-local copy of the CUDA -toolkit, and by default it will choose the latest available version. To avoid -versioning conflicts, however, it is safer to match the version of CUDA -installed system-wide on your machine. Therefore, we suggest that you add this -as an explicit dependency at the bottom of the conda environment file. For -example, if your system-wide CUDA installation is at version 10.2, add: - -``` - - cudatoolkit=10.2 -``` - -### Installation - -The Legate Core library comes with both a standard `setup.py` script and a -custom `install.py` script in the top-level directory of the repository that -will build and install the Legate Core library. Users can use either script -to install Legate as they will produce the same effect. Users can do a simple -pip installation of a single-node, CPU-only Legate configuration by navigating -to the Legate source directory and running: -``` -pip install . -``` -or -``` -python3 -m pip install . -``` - -This will install Legate into the standard packages of the Python environment. - -To add GPU support or do more complicated customization, Legate provides a -helper `install.py` script. For GPU support, simply use the `--cuda` flag: - -``` -./install.py --cuda -``` - -The first time you request GPU support you may need to use the `--with-cuda` flag to -specify the location of your CUDA installation and the `--with-nccl` flag to specify -the path to your NCCL installation, if these cannot be automatically located by the build system. -You can also specify the name of the CUDA architecture you want to target with the `--arch` -flag. By default the script relies on CMake's auto-detection. -``` -./install.py --cuda --with-cuda /usr/local/cuda/ --with-nccl "$CONDA_PREFIX" --arch ampere -``` -For multi-node support Legate uses [GASNet](https://gasnet.lbl.gov/) which can be -requested using the `--network gasnet1` or `--network gasnetex` flag. By default -GASNet will be automatically downloaded and built, but if you have an existing -installation then you can inform the install script using the `--with-gasnet` flag. -You also need to specify the interconnect network of the target machine using the -`--conduit` flag. 
- -For example this would be an installation for a -[DGX SuperPOD](https://www.nvidia.com/en-us/data-center/dgx-superpod/): -``` -./install.py --network gasnet1 --conduit ibv --cuda --arch ampere -``` -Alternatively here is an install line for the -[Piz-Daint](https://www.cscs.ch/computers/dismissed/piz-daint-piz-dora/) supercomputer: -``` -./install.py --network gasnet1 --conduit aries --cuda --arch pascal -``` -To see all the options available for installing Legate, run with the `--help` flag: -``` -./install.py --help -``` - -### Toolchain Selection - -Legate relies on CMake to select its toolchain and build flags. -Users can set the environment variables `CXX` or `CXXFLAGS` -prior to building to override the CMake defaults. Alternatively, CMake values -can be overriden through the `SKBUILD_CONFIGURE_OPTIONS` variable, -which is discussed in more detail in the [developer build instructions](BUILD.md). - -### Developer Workflow - -Details on doing incremental CMake builds and editable pip installations can be -found in the [developer build instructions](BUILD.md). +See [BUILD.md]() for instructions on building Legate Core from source. ## How Do I Use Legate? diff --git a/conda/environment-test-3.10.yml b/conda/environment-test-3.10.yml deleted file mode 100644 index 736e4c5e0..000000000 --- a/conda/environment-test-3.10.yml +++ /dev/null @@ -1,58 +0,0 @@ -name: legate-core-test -channels: - - conda-forge -dependencies: - - python=3.10 - - # build - - git - - nccl - - make - - zlib - - cmake>=3.24 - - ninja - - openmpi - - c-compiler - - cxx-compiler - - gcc_linux-64 # [linux64] - - sysroot_linux-64==2.17 # [linux64] - - setuptools>=60 - - scikit-build>=0.13.1 - - # runtime - - cffi - - numpy>=1.22 - - opt_einsum - - pyarrow>=5 - - scipy - - typing_extensions - - llvm-openmp - - # tests - - clang>=8 - - clang-tools>=8 - - colorama - - coverage - - mock - - mypy>=0.961 - - pre-commit - - pynvml - - pytest - - pytest-cov - - pytest-lazy-fixture - - types-docutils - - # pip dependencies - - pip - - pip: - # docs - - jinja2 - - pydata-sphinx-theme - - recommonmark - - markdown<3.4.0 - - sphinx>=4.4.0 - - sphinx-copybutton - - sphinx-markdown-tables - - # examples - - tifffile diff --git a/conda/environment-test-3.8.yml b/conda/environment-test-3.8.yml deleted file mode 100644 index 9f58e9b5d..000000000 --- a/conda/environment-test-3.8.yml +++ /dev/null @@ -1,58 +0,0 @@ -name: legate-core-test -channels: - - conda-forge -dependencies: - - python=3.8 - - # build - - git - - nccl - - make - - zlib - - cmake>=3.24 - - ninja - - openmpi - - c-compiler - - cxx-compiler - - gcc_linux-64 # [linux64] - - sysroot_linux-64==2.17 # [linux64] - - setuptools>=60 - - scikit-build>=0.13.1 - - # runtime - - cffi - - numpy>=1.22 - - opt_einsum - - pyarrow>=5 - - scipy - - typing_extensions - - llvm-openmp - - # tests - - clang>=8 - - clang-tools>=8 - - colorama - - coverage - - mock - - mypy>=0.961 - - pre-commit - - pynvml - - pytest - - pytest-cov - - pytest-lazy-fixture - - types-docutils - - # pip dependencies - - pip - - pip: - # docs - - jinja2 - - pydata-sphinx-theme - - recommonmark - - markdown<3.4.0 - - sphinx>=4.4.0 - - sphinx-copybutton - - sphinx-markdown-tables - - # examples - - tifffile diff --git a/conda/environment-test-3.9.yml b/conda/environment-test-3.9.yml deleted file mode 100644 index 9d4eea27d..000000000 --- a/conda/environment-test-3.9.yml +++ /dev/null @@ -1,58 +0,0 @@ -name: legate-core-test -channels: - - conda-forge -dependencies: - - python=3.9 - - # build - - git - - nccl - - 
make - - zlib - - cmake>=3.24 - - ninja - - openmpi - - c-compiler - - cxx-compiler - - gcc_linux-64 # [linux64] - - sysroot_linux-64==2.17 # [linux64] - - setuptools>=60 - - scikit-build>=0.13.1 - - # runtime - - cffi - - numpy>=1.22 - - opt_einsum - - pyarrow>=5 - - scipy - - typing_extensions - - llvm-openmp - - # tests - - clang>=8 - - clang-tools>=8 - - colorama - - coverage - - mock - - mypy>=0.961 - - pre-commit - - pynvml - - pytest - - pytest-cov - - pytest-lazy-fixture - - types-docutils - - # pip dependencies - - pip - - pip: - # docs - - jinja2 - - pydata-sphinx-theme - - recommonmark - - markdown<3.4.0 - - sphinx>=4.4.0 - - sphinx-copybutton - - sphinx-markdown-tables - - # examples - - tifffile diff --git a/scripts/generate-conda-envs.py b/scripts/generate-conda-envs.py new file mode 100755 index 000000000..a5cd426ee --- /dev/null +++ b/scripts/generate-conda-envs.py @@ -0,0 +1,381 @@ +#!/usr/bin/env python3 + +# Copyright (c) 2020-2022, NVIDIA CORPORATION. All rights reserved. +# +# NVIDIA CORPORATION and its licensors retain all intellectual property +# and proprietary rights in and to this software, related documentation +# and any modifications thereto. Any use, reproduction, disclosure or +# distribution of this software and related documentation without an express +# license agreement from NVIDIA CORPORATION is strictly prohibited. +# +# See the LICENSE file for details. +# +from __future__ import annotations + +from argparse import Action, ArgumentParser +from dataclasses import dataclass +from textwrap import indent +from typing import Literal, Protocol, Tuple + +# --- Types ------------------------------------------------------------------- + +Req = str +Reqs = Tuple[Req, ...] +OSType = Literal["linux", "darwin"] + + +class SectionConfig(Protocol): + header: str + + @property + def conda(self) -> Reqs: + return () + + @property + def pip(self) -> Reqs: + return () + + def __str__(self) -> str: + return self.header + + def format(self, kind: str) -> str: + return SECTION_TEMPLATE.format( + header=self.header, + reqs="- " + + "\n- ".join(self.conda if kind == "conda" else self.pip), + ) + + +@dataclass(frozen=True) +class CUDAConfig(SectionConfig): + ctk_version: str | None + + header = "cuda" + + @property + def conda(self) -> Reqs: + if self.ctk_version is None: + return () + + return ( + f"cudatoolkit={self.ctk_version}", # runtime + "cutensor>=1.3.3", # runtime + "nccl", # runtime + "pynvml", # tests + ) + + def __str__(self) -> str: + if self.ctk_version == "none": + return "" + + return f"-cuda{self.ctk_version}" + + +@dataclass(frozen=True) +class BuildConfig(SectionConfig): + compilers: bool = True + openmpi: bool = True + + header = "build" + + @property + def conda(self) -> Reqs: + pkgs = ( + "cmake>=3.24", + "git", + "make", + "scikit-build>=0.13.1", + "setuptools>=60", + "zlib", + ) + if self.compilers: + pkgs += ("c-compiler", "cxx-compiler") + if self.openmpi: + pkgs += ("openmpi",) + return sorted(pkgs) + + def __str__(self) -> str: + val = "-compilers" if self.compilers else "" + val += "-openmpi" if self.openmpi else "" + return val + + +@dataclass(frozen=True) +class RuntimeConfig(SectionConfig): + header = "runtime" + + @property + def conda(self) -> Reqs: + return ( + "cffi", + "llvm-openmp", + "numpy>=1.22", + "openblas=*=*openmp*", + "opt_einsum", + "pyarrow>=5", + "scipy", + "typing_extensions", + ) + + +@dataclass(frozen=True) +class TestsConfig(SectionConfig): + header = "tests" + + @property + def conda(self) -> Reqs: + return ( + 
"clang-tools>=8", + "clang>=8", + "colorama", + "coverage", + "mock", + "mypy>=0.961", + "pre-commit", + "pytest-cov", + "pytest-lazy-fixture", + "pytest-mock", + "pytest", + "types-docutils", + ) + + @property + def pip(self) -> Reqs: + return ("tifffile",) + + +@dataclass(frozen=True) +class DocsConfig(SectionConfig): + header = "docs" + + @property + def pip(self) -> Reqs: + return ( + "jinja2", + "markdown<3.4.0", + "pydata-sphinx-theme", + "recommonmark", + "sphinx-copybutton", + "sphinx-markdown-tables", + "sphinx>=4.4.0", + ) + + +@dataclass(frozen=True) +class EnvConfig: + use: str + python: str + os: OSType + ctk: str | None + compilers: bool + openmpi: bool + + @property + def sections(self) -> Tuple[SectionConfig, ...]: + return ( + self.cuda, + self.build, + self.runtime, + self.tests, + self.docs, + ) + + @property + def cuda(self) -> CUDAConfig: + return CUDAConfig(self.ctk) + + @property + def build(self) -> BuildConfig: + return BuildConfig(self.compilers, self.openmpi) + + @property + def runtime(self) -> RuntimeConfig: + return RuntimeConfig() + + @property + def tests(self) -> TestsConfig: + return TestsConfig() + + @property + def docs(self) -> DocsConfig: + return DocsConfig() + + @property + def filename(self) -> str: + return f"environment-{self.use}-{self.os}-py{self.python}{self.cuda}{self.build}.yaml" # noqa + + +# --- Setup ------------------------------------------------------------------- + +PYTHON_VERSIONS = ("3.8", "3.9", "3.10") + +CTK_VERSIONS = ( + "none", + "10.2", + "11.0", + "11.1", + "11.2", + "11.3", + "11.4", + "11.5", + "11.6", + "11.7", +) + +OS_NAMES: Tuple[OSType, ...] = ("linux", "osx") + + +ENV_TEMPLATE = """\ +name: legate-{use} +channels: + - conda-forge +dependencies: + + - python={python} + +{conda_sections}{pip} +""" + +SECTION_TEMPLATE = """\ +# {header} +{reqs} + +""" + +PIP_TEMPLATE = """\ + - pip + - pip: +{pip_sections} +""" + +ALL_CONFIGS = [ + EnvConfig("test", python, "linux", ctk, compilers, openmpi) + for python in PYTHON_VERSIONS + for ctk in CTK_VERSIONS + for compilers in (True, False) + for openmpi in (True, False) +] + [ + EnvConfig("test", python, "darwin", "none", compilers, openmpi) + for python in PYTHON_VERSIONS + for compilers in (True, False) + for openmpi in (True, False) +] + +# --- Code -------------------------------------------------------------------- + + +class BooleanFlag(Action): + def __init__( + self, + option_strings, + dest, + default, + required=False, + help="", + metavar=None, + ): + assert all(not opt.startswith("--no") for opt in option_strings) + + def flatten(list): + return [item for sublist in list for item in sublist] + + option_strings = flatten( + [ + [opt, "--no-" + opt[2:], "--no" + opt[2:]] + if opt.startswith("--") + else [opt] + for opt in option_strings + ] + ) + super().__init__( + option_strings, + dest, + nargs=0, + const=None, + default=default, + type=bool, + choices=None, + required=required, + help=help, + metavar=metavar, + ) + + def __call__(self, parser, namespace, values, option_string): + setattr(namespace, self.dest, not option_string.startswith("--no")) + + +if __name__ == "__main__": + + import sys + + parser = ArgumentParser() + parser.add_argument( + "--python", + choices=PYTHON_VERSIONS, + default=None, + help="Python version to generate for, (default: all python versions)", + ) + parser.add_argument( + "--ctk", + choices=CTK_VERSIONS, + default=None, + dest="ctk_version", + help="CTK version to generate for (default: all CTK versions)", + ) + parser.add_argument( + 
"--os", + choices=OS_NAMES, + default=None, + help="OS to generate for (default: all OSes)", + ) + parser.add_argument( + "--compilers", + action=BooleanFlag, + dest="compilers", + default=None, + help="Whether to include conda compilers or not (default: both)", + ) + parser.add_argument( + "--openmpi", + action=BooleanFlag, + dest="openmpi", + default=None, + help="Whether to include openmpi or not (default: both)", + ) + + args = parser.parse_args(sys.argv[1:]) + + configs = ALL_CONFIGS + + if args.python is not None: + configs = (x for x in configs if x.python == args.python) + if args.ctk_version is not None: + configs = ( + x for x in configs if x.cuda.ctk_version == args.ctk_version + ) + if args.compilers is not None: + configs = (x for x in configs if x.build.compilers == args.compilers) + if args.os is not None: + configs = (x for x in configs if x.os == args.os) + if args.openmpi is not None: + configs = (x for x in configs if x.build.openmpi == args.openmpi) + + for config in configs: + conda_sections = indent( + "".join(s.format("conda") for s in config.sections if s.conda), + " ", + ) + + pip_sections = indent( + "".join(s.format("pip") for s in config.sections if s.pip), " " + ) + + print(f"--- generating: {config.filename}") + out = ENV_TEMPLATE.format( + use=config.use, + python=config.python, + conda_sections=conda_sections, + pip=PIP_TEMPLATE.format(pip_sections=pip_sections), + ) + with open(f"{config.filename}", "w") as f: + f.write(out) From 5c430039ffe4accf89149ea5ae42c7b68dd73ddf Mon Sep 17 00:00:00 2001 From: Manolis Papadakis Date: Tue, 25 Oct 2022 11:01:52 -0700 Subject: [PATCH 033/121] Minor fix in documentation --- BUILD.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/BUILD.md b/BUILD.md index 5abb0af8d..320059f9f 100644 --- a/BUILD.md +++ b/BUILD.md @@ -114,7 +114,7 @@ Required to support CPU and memory binding in the Legate launcher. Not available on conda; typically available through the system-level package manager. -### MPI (`--openmpi` option) +### MPI (`--openmpi` option; optional) Only necessary if you wish to run on multiple nodes. From b7cb465758d1c91537eddf31f849147a4c10382c Mon Sep 17 00:00:00 2001 From: Manolis Papadakis Date: Tue, 25 Oct 2022 13:45:43 -0700 Subject: [PATCH 034/121] Fix for cunumeric#668 (#453) Co-authored-by: Manolis Papadakis --- legate_core_cpp.cmake | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/legate_core_cpp.cmake b/legate_core_cpp.cmake index 3860d85dc..bf89ff01b 100644 --- a/legate_core_cpp.cmake +++ b/legate_core_cpp.cmake @@ -117,7 +117,7 @@ if(Legion_USE_Python AND (NOT Python3_FOUND)) endif() if(Legion_NETWORKS) - find_package(MPI REQUIRED) + find_package(MPI REQUIRED COMPONENTS CXX) endif() if(Legion_USE_CUDA) @@ -266,8 +266,8 @@ target_link_libraries(legate_core PUBLIC Legion::Legion legate::Thrust $ - PRIVATE $ - $) + $ + PRIVATE $) target_compile_options(legate_core PRIVATE "$<$:${legate_core_CXX_OPTIONS}>" @@ -394,6 +394,11 @@ endif() "set(Legion_USE_Python ${Legion_USE_Python})" "set(Legion_NETWORKS ${Legion_NETWORKS})" "set(Legion_BOUNDS_CHECKS ${Legion_BOUNDS_CHECKS})" +[=[ +if(Legion_NETWORKS) + find_package(MPI REQUIRED COMPONENTS CXX) +endif() +]=] ) rapids_export( From fcc0dcc505645a2c1696d6402c8ea662ac7e995d Mon Sep 17 00:00:00 2001 From: Marcin Zalewski Date: Wed, 26 Oct 2022 06:23:46 -0700 Subject: [PATCH 035/121] Update upload artifact action version (#454) v2 -> v3 to avoid GitHub warnings. 
--- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 5a90f5520..11b700eab 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -60,7 +60,7 @@ jobs: if: always() - name: Upload Build Log if: always() - uses: actions/upload-artifact@v2 + uses: actions/upload-artifact@v3 with: name: build-log path: ./**/${{ env.COMMIT }}-build.log.gpg \ No newline at end of file From e4b94ce4e231b74f96946df59351091c11057051 Mon Sep 17 00:00:00 2001 From: Bryan Van de Ven Date: Wed, 26 Oct 2022 13:26:50 -0700 Subject: [PATCH 036/121] Only keep traceback reprs, to avoid cycles (#447) * only keep traceback reprs, to avoid cycles * fix attr name * use format_tb instead of repr * Small format change Co-authored-by: Manolis Papadakis --- legate/core/exception.py | 9 +++++---- legate/core/operation.py | 16 +++++++--------- legate/core/runtime.py | 6 ++---- legate/core/utils.py | 6 +++--- 4 files changed, 17 insertions(+), 20 deletions(-) diff --git a/legate/core/exception.py b/legate/core/exception.py index a1d4daae9..5b8bace1e 100644 --- a/legate/core/exception.py +++ b/legate/core/exception.py @@ -19,7 +19,6 @@ from typing import TYPE_CHECKING if TYPE_CHECKING: - from types import TracebackType from typing import Optional from ._legion import Future @@ -30,11 +29,11 @@ def __init__( self, exn_types: list[type], future: Future, - tb: Optional[TracebackType] = None, + tb_repr: Optional[str] = None, ): self._exn_types = exn_types self._future = future - self._tb = tb + self._tb_repr = tb_repr def raise_exception(self) -> None: buf = self._future.get_buffer() @@ -45,5 +44,7 @@ def raise_exception(self) -> None: error_message = buf[9 : 9 + error_size].decode() exn_type = self._exn_types[exn_index] exn_reraised = exn_type(error_message) - exn_original = exn_type(error_message).with_traceback(self._tb) + if self._tb_repr is not None: + error_message += "\n" + self._tb_repr[:-1] # remove extra newline + exn_original = exn_type(error_message) raise exn_reraised from exn_original diff --git a/legate/core/operation.py b/legate/core/operation.py index fcd626acb..c079afd0d 100644 --- a/legate/core/operation.py +++ b/legate/core/operation.py @@ -32,11 +32,9 @@ from .partition import REPLICATE, Weighted from .shape import Shape from .store import Store, StorePartition -from .utils import OrderedSet, capture_traceback +from .utils import OrderedSet, capture_traceback_repr if TYPE_CHECKING: - from types import TracebackType - from .communicator import Communicator from .constraints import Constraint from .context import Context @@ -244,7 +242,7 @@ def __init__( self._scalar_args: list[tuple[Any, Union[DTType, tuple[DTType]]]] = [] self._comm_args: list[Communicator] = [] self._exn_types: list[type] = [] - self._tb: Union[None, TracebackType] = None + self._tb_repr: Union[None, str] = None self._side_effect = False @property @@ -279,7 +277,7 @@ def can_raise_exception(self) -> bool: return len(self._exn_types) > 0 def capture_traceback(self) -> None: - self._tb = capture_traceback() + self._tb_repr = capture_traceback_repr() def _add_scalar_args_to_launcher(self, launcher: TaskLauncher) -> None: for (arg, dtype) in self._scalar_args: @@ -309,7 +307,7 @@ def _demux_scalar_stores_future(self, result: Future) -> None: output.set_storage(result) elif self.can_raise_exception: runtime.record_pending_exception( - self._exn_types, result, self._tb + self._exn_types, result, self._tb_repr ) else: assert num_unbound_outs == 
1 @@ -327,7 +325,7 @@ def _demux_scalar_stores_future(self, result: Future) -> None: runtime.record_pending_exception( self._exn_types, runtime.extract_scalar(result, idx), - self._tb, + self._tb_repr, ) def _demux_scalar_stores_future_map( @@ -366,7 +364,7 @@ def _demux_scalar_stores_future_map( runtime.record_pending_exception( self._exn_types, runtime.reduce_exception_future_map(result), - self._tb, + self._tb_repr, ) else: assert False @@ -400,7 +398,7 @@ def _demux_scalar_stores_future_map( runtime.record_pending_exception( self._exn_types, runtime.reduce_exception_future_map(exn_fut_map), - self._tb, + self._tb_repr, ) def _demux_scalar_stores( diff --git a/legate/core/runtime.py b/legate/core/runtime.py index 22b3815e2..4d12a6591 100644 --- a/legate/core/runtime.py +++ b/legate/core/runtime.py @@ -51,8 +51,6 @@ from .shape import Shape if TYPE_CHECKING: - from types import TracebackType - from . import ArgumentMap, Detach, IndexDetach, IndexPartition, Library from ._legion import FieldListLike, PhysicalRegion from .communicator import Communicator @@ -1551,9 +1549,9 @@ def record_pending_exception( self, exn_types: list[type], future: Future, - tb: Optional[TracebackType] = None, + tb_repr: Optional[str] = None, ) -> None: - exn = PendingException(exn_types, future, tb) + exn = PendingException(exn_types, future, tb_repr) self._pending_exceptions.append(exn) def raise_exceptions(self) -> None: diff --git a/legate/core/utils.py b/legate/core/utils.py index a9fa0e9e1..6a59ca02f 100644 --- a/legate/core/utils.py +++ b/legate/core/utils.py @@ -72,9 +72,9 @@ def cast_tuple(value: Any) -> tuple[Any, ...]: return value if isinstance(value, tuple) else tuple(value) -def capture_traceback( +def capture_traceback_repr( skip_core_frames: bool = True, -) -> Optional[TracebackType]: +) -> Optional[str]: tb = None for frame, _ in traceback.walk_stack(None): if frame.f_globals["__name__"].startswith("legate.core"): @@ -85,4 +85,4 @@ def capture_traceback( tb_lasti=frame.f_lasti, tb_lineno=frame.f_lineno, ) - return tb + return "".join(traceback.format_tb(tb)) if tb is not None else None From 95b91eab63ffad606d0884b355593925a879d9d0 Mon Sep 17 00:00:00 2001 From: Bryan Van de Ven Date: Wed, 26 Oct 2022 14:47:15 -0700 Subject: [PATCH 037/121] Fix up mypy errors in tests (#456) --- .pre-commit-config.yaml | 4 ++-- legate/tester/stages/test_stage.py | 2 +- tests/unit/legate/driver/test_config.py | 10 +++++----- tests/unit/legate/jupyter/test_kernel.py | 10 +++++----- tests/unit/legate/tester/stages/test_test_stage.py | 9 ++++++++- tests/unit/legate/tester/stages/test_util.py | 9 +++++---- tests/unit/legate/util/test_system.py | 2 +- 7 files changed, 27 insertions(+), 19 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 04478d01c..402bffb64 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -22,7 +22,7 @@ repos: hooks: - id: mypy pass_filenames: false - args: ['legate'] - additional_dependencies: [numpy] + args: ['legate', 'tests'] + additional_dependencies: [numpy,pytest,pytest_mock] default_language_version: python: python3 diff --git a/legate/tester/stages/test_stage.py b/legate/tester/stages/test_stage.py index 5962500bf..f9c871461 100644 --- a/legate/tester/stages/test_stage.py +++ b/legate/tester/stages/test_stage.py @@ -92,7 +92,7 @@ def delay(self, shard: Shard, config: Config, system: TestSystem) -> None: Process execution wrapper """ - ... 
+ return def shard_args(self, shard: Shard, config: Config) -> ArgList: """Generate the command line arguments necessary to launch diff --git a/tests/unit/legate/driver/test_config.py b/tests/unit/legate/driver/test_config.py index 2408bfe08..249107ed2 100644 --- a/tests/unit/legate/driver/test_config.py +++ b/tests/unit/legate/driver/test_config.py @@ -59,12 +59,12 @@ def test_mixin(self) -> None: "extra", (["a"], ["a", "b c"], ["a", "b c", "d e"], ["a", "b c", "d e", "f"]), ) - def test_launcher_extra_fixup_basic(self, extra) -> None: + def test_launcher_extra_fixup_basic(self, extra: list[str]) -> None: mn = m.MultiNode( nodes=1, ranks_per_node=1, not_control_replicable=False, - launcher="launcher", + launcher="mpirun", launcher_extra=extra, ) assert mn.launcher_extra == sum((x.split() for x in extra), []) @@ -74,7 +74,7 @@ def test_launcher_extra_fixup_complex(self) -> None: nodes=1, ranks_per_node=1, not_control_replicable=False, - launcher="launcher", + launcher="mpirun", launcher_extra=[ "-H g0002,g0002 -X SOMEENV --fork", "-bind-to none", @@ -95,7 +95,7 @@ def test_launcher_extra_fixup_quoted(self) -> None: nodes=1, ranks_per_node=1, not_control_replicable=False, - launcher="launcher", + launcher="mpirun", launcher_extra=[ "-f 'some path with spaces/foo.txt'", ], @@ -165,7 +165,7 @@ def test_mixin(self) -> None: "extra", (["a"], ["a", "b c"], ["a", "b c", "d e"], ["a", "b c", "d e", "f"]), ) - def test_nsys_extra_fixup_basic(self, extra) -> None: + def test_nsys_extra_fixup_basic(self, extra: list[str]) -> None: p = m.Profiling( profile=True, nvprof=True, diff --git a/tests/unit/legate/jupyter/test_kernel.py b/tests/unit/legate/jupyter/test_kernel.py index 42925387b..d176ba23d 100644 --- a/tests/unit/legate/jupyter/test_kernel.py +++ b/tests/unit/legate/jupyter/test_kernel.py @@ -53,11 +53,11 @@ def test_defatul(self) -> None: ] = config.kernel.spec_name assert spec.display_name == config.kernel.display_name - assert spec.language == "python" - assert spec.argv[:-3] == list(driver.cmd) - assert spec.argv[-3].endswith("_legion_kernel.py") - assert spec.argv[-2:] == ["-f", "{connection_file}"] - assert spec.env == expected_env + assert spec.language == "python" # type: ignore + assert spec.argv[:-3] == list(driver.cmd) # type: ignore + assert spec.argv[-3].endswith("_legion_kernel.py") # type: ignore + assert spec.argv[-2:] == ["-f", "{connection_file}"] # type: ignore + assert spec.env == expected_env # type: ignore assert m.LEGATE_JUPYTER_METADATA_KEY in spec.metadata metadata = spec.metadata[m.LEGATE_JUPYTER_METADATA_KEY] assert metadata == { diff --git a/tests/unit/legate/tester/stages/test_test_stage.py b/tests/unit/legate/tester/stages/test_test_stage.py index 6a5678c22..fcdc7a934 100644 --- a/tests/unit/legate/tester/stages/test_test_stage.py +++ b/tests/unit/legate/tester/stages/test_test_stage.py @@ -23,8 +23,9 @@ from legate.tester import FeatureType from legate.tester.config import Config from legate.tester.stages import test_stage as m -from legate.tester.stages.util import StageResult, StageSpec +from legate.tester.stages.util import Shard, StageResult, StageSpec from legate.tester.test_system import ProcessResult, TestSystem as _TestSystem +from legate.util.types import ArgList, EnvDict from . 
import FakeSystem @@ -45,6 +46,12 @@ def __init__(self, config: Config, system: _TestSystem) -> None: def compute_spec(self, config: Config, system: _TestSystem) -> StageSpec: return StageSpec(2, [(0,), (1,), (2,)]) + def shard_args(self, shard: Shard, config: Config) -> ArgList: + return [] + + def env(self, config: Config, system: _TestSystem) -> EnvDict: + return {} + class TestTestStage: def test_name(self) -> None: diff --git a/tests/unit/legate/tester/stages/test_util.py b/tests/unit/legate/tester/stages/test_util.py index f97174de8..0729253dd 100644 --- a/tests/unit/legate/tester/stages/test_util.py +++ b/tests/unit/legate/tester/stages/test_util.py @@ -17,6 +17,7 @@ """ from __future__ import annotations +from datetime import timedelta from pathlib import Path import pytest @@ -33,7 +34,7 @@ def test_StageResult() -> None: procs[2].returncode = 10 procs[7].returncode = -2 - result = m.StageResult(procs=procs, time=0) + result = m.StageResult(procs=procs, time=timedelta(0)) assert result.total == 10 assert result.passed == 8 @@ -67,7 +68,7 @@ def test_requested_too_large(self) -> None: class Test_log_proc: @pytest.mark.parametrize("returncode", (-23, -1, 0, 1, 17)) - def test_skipped(self, returncode) -> None: + def test_skipped(self, returncode: int) -> None: config = Config([]) proc = ProcessResult( "proc", Path("proc"), skipped=True, returncode=returncode @@ -100,7 +101,7 @@ def test_passed_verbose(self) -> None: ) @pytest.mark.parametrize("returncode", (-23, -1, 1, 17)) - def test_failed(self, returncode) -> None: + def test_failed(self, returncode: int) -> None: config = Config([]) proc = ProcessResult("proc", Path("proc"), returncode=returncode) @@ -112,7 +113,7 @@ def test_failed(self, returncode) -> None: ) @pytest.mark.parametrize("returncode", (-23, -1, 1, 17)) - def test_failed_verbose(self, returncode) -> None: + def test_failed_verbose(self, returncode: int) -> None: config = Config([]) proc = ProcessResult( "proc", Path("proc"), returncode=returncode, output="foo\nbar" diff --git a/tests/unit/legate/util/test_system.py b/tests/unit/legate/util/test_system.py index c3a5d6184..38db9cc0b 100644 --- a/tests/unit/legate/util/test_system.py +++ b/tests/unit/legate/util/test_system.py @@ -109,5 +109,5 @@ def test_gpus_osx(self) -> None: s = m.System() msg = "GPU execution is not available on OSX." - with pytest.raises(RuntimeError, msg=msg): + with pytest.raises(RuntimeError, msg=msg): # type: ignore s.gpus From f6fb68fc64d962a6878a621f662bc9546db2b1af Mon Sep 17 00:00:00 2001 From: Jeremy Date: Thu, 27 Oct 2022 07:20:56 -0700 Subject: [PATCH 038/121] Fix returned legion paths for editable install with separate legion build (#442) --- legate/util/fs.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/legate/util/fs.py b/legate/util/fs.py index e3ea9e958..15338d783 100644 --- a/legate/util/fs.py +++ b/legate/util/fs.py @@ -295,8 +295,9 @@ def installed_legion_paths(legion_dir: Path) -> LegionPaths: ) if legion_dir.joinpath("CMakeCache.txt").exists(): cmake_cache_txt = legion_dir / "CMakeCache.txt" - - except Exception: + finally: + # Hopefully at this point we have a valid cmake_cache_txt with a + # valid Legion_SOURCE_DIR and Legion_BINARY_DIR try: # If Legion_SOURCE_DIR and Legion_BINARY_DIR are in CMakeCache.txt, # return the paths to Legion in the legate_core build dir. 
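For context on the fix above: `installed_legion_paths` works by pulling `Legion_SOURCE_DIR` and `Legion_BINARY_DIR` out of a `CMakeCache.txt`. A simplified sketch of that kind of cache lookup (not the actual helper in `legate/util/fs.py`) looks like this:

```python
# Simplified sketch, not the actual implementation in legate/util/fs.py:
# read a single KEY:TYPE=VALUE entry out of a CMakeCache.txt.
from pathlib import Path
from typing import Optional


def read_cmake_cache_value(cache: Path, key: str) -> Optional[str]:
    # CMakeCache.txt entries look like: Legion_SOURCE_DIR:STATIC=/path/to/legion
    for line in cache.read_text().splitlines():
        if line.startswith(f"{key}:"):
            return line.split("=", 1)[1]
    return None


# Example usage (hypothetical paths):
# legion_src = read_cmake_cache_value(build_dir / "CMakeCache.txt", "Legion_SOURCE_DIR")
```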
From 7a20ea85a4ea9c5aa4c201ccd7dfc5073c0bd47f Mon Sep 17 00:00:00 2001 From: Manolis Papadakis Date: Thu, 27 Oct 2022 10:10:43 -0700 Subject: [PATCH 039/121] Fix BUILD.md link --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index fe0d5b5e4..f713191e9 100644 --- a/README.md +++ b/README.md @@ -230,7 +230,7 @@ Docker image build scripts, as well as specialized install scripts for supported clusters are available on the [quickstart](https://github.com/nv-legate/quickstart) repo. -See [BUILD.md]() for instructions on building Legate Core from source. +See [BUILD.md](BUILD.md) for instructions on building Legate Core from source. ## How Do I Use Legate? From f8052ccda03b860e5f1dfa963909a515bdef8c21 Mon Sep 17 00:00:00 2001 From: Wonchan Lee Date: Thu, 27 Oct 2022 23:21:08 -0700 Subject: [PATCH 040/121] Mapper improvements (#452) * Make mapping::RegionField agnostic to the owning operator * Rename MapperDeserializer to mapping::TaskDeserializer * Plumbing to use Legate Copy objects in the mapper * Rename mapping/task.* to mapping/operation.* * Use the right target processors for point copies * Use requirements directly instead of their indices in the store mapping * Make sure there is only up to one indirection field in a copy * Assign the right requirement index in CopyReqAnalyzer * Refactor map_copy to use map_legate_store instead of the legacy mapping call * Use map_legate_store in map_inline * Use map_legate_store in map_partition * Remove the obsolete map_raw_array finally * Clean up default store target lookup * Refactor several switch statements into dispatch templates * Unify sharding functor handling * Minor tweak to task slicing for manually parallelized tasks * Clean up variant lookups using std::optional * Forbid colocations on unbound stores (reduction stores can colocate though) * Massive clean-up on map_task * Unify the code to map multiple legate stores * One last DRY * Missing include * Remove dead code * Make sure the output variable is initialized before calling legate_map_store --- legate/core/_legion/operation.py | 16 + legate/core/launcher.py | 260 ++-- legate/core/operation.py | 10 + legate_core_cpp.cmake | 6 +- src/core/mapping/base_mapper.cc | 1309 ++++++------------ src/core/mapping/base_mapper.h | 73 +- src/core/mapping/mapping.cc | 35 +- src/core/mapping/mapping.h | 9 +- src/core/mapping/{task.cc => operation.cc} | 79 +- src/core/mapping/{task.h => operation.h} | 44 +- src/core/mapping/{task.inl => operation.inl} | 0 src/core/utilities/deserializer.cc | 76 +- src/core/utilities/deserializer.h | 50 +- src/core/utilities/deserializer.inl | 8 +- typings/legion_cffi/lib.pyi | 2 + 15 files changed, 916 insertions(+), 1061 deletions(-) rename src/core/mapping/{task.cc => operation.cc} (57%) rename src/core/mapping/{task.h => operation.h} (77%) rename src/core/mapping/{task.inl => operation.inl} (100%) diff --git a/legate/core/_legion/operation.py b/legate/core/_legion/operation.py index e07b4ba8b..cf13a7bff 100644 --- a/legate/core/_legion/operation.py +++ b/legate/core/_legion/operation.py @@ -648,6 +648,14 @@ def set_sharding_space(self, space: IndexSpace) -> None: self.launcher, space.handle ) + def set_mapper_arg(self, data: Any, size: int) -> None: + legion.legion_copy_launcher_set_mapper_arg( + self.launcher, + (ffi.from_buffer(data), size), + ) + # Hold a reference to the data to prevent collection + self.data = data + @dispatch def launch( self, @@ -1070,6 +1078,14 @@ def set_sharding_space(self, space: 
IndexSpace) -> None: self.launcher, space.handle ) + def set_mapper_arg(self, data: Any, size: int) -> None: + legion.legion_index_copy_launcher_set_mapper_arg( + self.launcher, + (ffi.from_buffer(data), size), + ) + # Hold a reference to the data to prevent collection + self.data = data + @dispatch def launch( self, diff --git a/legate/core/launcher.py b/legate/core/launcher.py index 8b4e01fbd..ce87ffada 100644 --- a/legate/core/launcher.py +++ b/legate/core/launcher.py @@ -24,7 +24,6 @@ Sequence, Tuple, Union, - overload, ) from . import ( @@ -104,6 +103,13 @@ def _pack(buf: BufferBuilder, value: Any, dtype: Any, is_tuple: bool) -> None: serializer(buf, value) +class RequirementIndexer(Protocol): + def get_requirement_index( + self, req: Union[RegionReq, OutputReq], field_id: int + ) -> int: + ... + + class LauncherArg(Protocol): def pack(self, buf: BufferBuilder) -> None: ... @@ -164,40 +170,16 @@ def __str__(self) -> str: class RegionFieldArg: - @overload def __init__( self, - analyzer: RequirementAnalyzer, - store: Store, - dim: int, - req: RegionReq, - field_id: int, - redop: int, - ) -> None: - ... - - @overload - def __init__( - self, - analyzer: OutputAnalyzer, - store: Store, - dim: int, - req: OutputReq, - field_id: int, - redop: int, - ) -> None: - ... - - def __init__( - self, - analyzer: Union[OutputAnalyzer, RequirementAnalyzer], + indexer: RequirementIndexer, store: Store, dim: int, req: Union[OutputReq, RegionReq], field_id: int, redop: int, ) -> None: - self._analyzer = analyzer + self._indexer = indexer self._store = store self._dim = dim self._req = req @@ -210,9 +192,7 @@ def pack(self, buf: BufferBuilder) -> None: buf.pack_32bit_int(self._dim) buf.pack_32bit_uint( - self._analyzer.get_requirement_index( - self._req, self._field_id # type: ignore [arg-type] - ) + self._indexer.get_requirement_index(self._req, self._field_id) ) buf.pack_32bit_uint(self._field_id) @@ -220,6 +200,15 @@ def __str__(self) -> str: return f"RegionFieldArg({self._dim}, {self._req}, {self._field_id})" +def pack_args( + argbuf: BufferBuilder, + args: Sequence[LauncherArg], +) -> None: + argbuf.pack_32bit_uint(len(args)) + for arg in args: + arg.pack(argbuf) + + AddReqMethod = Any @@ -575,7 +564,7 @@ def coalesce(self, error_on_interference: bool) -> dict[Any, list[int]]: return coalesced -class RequirementAnalyzer: +class RequirementAnalyzer(RequirementIndexer): def __init__(self, error_on_interference: bool = True) -> None: self._field_sets: dict[Region, FieldSet] = {} self._requirements: list[tuple[RegionReq, list[int]]] = [] @@ -614,7 +603,10 @@ def analyze_requirements(self) -> None: self._requirement_map[(req, field_id)] = req_idx self._requirements.append((req, fields)) - def get_requirement_index(self, req: RegionReq, field_id: int) -> int: + def get_requirement_index( + self, req: Union[RegionReq, OutputReq], field_id: int + ) -> int: + assert isinstance(req, RegionReq) try: return self._requirement_map[(req, field_id)] except KeyError: @@ -622,7 +614,7 @@ def get_requirement_index(self, req: RegionReq, field_id: int) -> int: return self._requirement_map[(req, field_id)] -class OutputAnalyzer: +class OutputAnalyzer(RequirementIndexer): def __init__(self) -> None: self._groups: dict[Any, OrderedSet[tuple[int, Store]]] = {} self._requirements: list[tuple[OutputReq, list[int]]] = [] @@ -664,7 +656,10 @@ def analyze_requirements(self) -> None: self._requirements.append((req, fields)) - def get_requirement_index(self, req: OutputReq, field_id: int) -> int: + def get_requirement_index( + self, 
req: Union[RegionReq, OutputReq], field_id: int + ) -> int: + assert isinstance(req, OutputReq) return self._requirement_map[(req, field_id)] def update_storages(self) -> None: @@ -673,6 +668,28 @@ def update_storages(self) -> None: req.update_storage(store, field_id) +# A simple analyzer that does not coalesce requirements +class CopyReqAnalyzer(RequirementIndexer): + def __init__(self) -> None: + self._requirements: list[tuple[RegionReq, int]] = [] + self._requirement_map: dict[tuple[RegionReq, int], int] = {} + + @property + def requirements(self) -> list[tuple[RegionReq, int]]: + return self._requirements + + def insert(self, req: RegionReq, field_id: int) -> None: + entry = (req, field_id) + self._requirement_map[entry] = len(self._requirements) + self._requirements.append(entry) + + def get_requirement_index( + self, req: Union[RegionReq, OutputReq], field_id: int + ) -> int: + assert isinstance(req, RegionReq) + return self._requirement_map[(req, field_id)] + + class TaskLauncher: def __init__( self, @@ -864,25 +881,16 @@ def set_sharding_space(self, space: IndexSpace) -> None: def set_point(self, point: Point) -> None: self._point = point - @staticmethod - def pack_args( - argbuf: BufferBuilder, - args: Sequence[LauncherArg], - ) -> None: - argbuf.pack_32bit_uint(len(args)) - for arg in args: - arg.pack(argbuf) - def build_task( self, launch_domain: Rect, argbuf: BufferBuilder ) -> IndexTask: self._req_analyzer.analyze_requirements() self._out_analyzer.analyze_requirements() - self.pack_args(argbuf, self._inputs) - self.pack_args(argbuf, self._outputs) - self.pack_args(argbuf, self._reductions) - self.pack_args(argbuf, self._scalars) + pack_args(argbuf, self._inputs) + pack_args(argbuf, self._outputs) + pack_args(argbuf, self._reductions) + pack_args(argbuf, self._scalars) argbuf.pack_bool(self._can_raise_exception) argbuf.pack_bool(self._insert_barrier) argbuf.pack_32bit_uint(len(self._comms)) @@ -921,10 +929,10 @@ def build_single_task(self, argbuf: BufferBuilder) -> SingleTask: self._req_analyzer.analyze_requirements() self._out_analyzer.analyze_requirements() - self.pack_args(argbuf, self._inputs) - self.pack_args(argbuf, self._outputs) - self.pack_args(argbuf, self._reductions) - self.pack_args(argbuf, self._scalars) + pack_args(argbuf, self._inputs) + pack_args(argbuf, self._outputs) + pack_args(argbuf, self._reductions) + pack_args(argbuf, self._scalars) argbuf.pack_bool(self._can_raise_exception) assert len(self._comms) == 0 @@ -982,7 +990,15 @@ def __init__( assert type(tag) != bool self._context = context self._mapper_id = mapper_id - self._req_analyzer = RequirementAnalyzer() + self._inputs: list[LauncherArg] = [] + self._outputs: list[LauncherArg] = [] + self._reductions: list[LauncherArg] = [] + self._source_indirects: list[LauncherArg] = [] + self._target_indirects: list[LauncherArg] = [] + self._input_reqs = CopyReqAnalyzer() + self._output_reqs = CopyReqAnalyzer() + self._source_indirect_reqs = CopyReqAnalyzer() + self._target_indirect_reqs = CopyReqAnalyzer() self._tag = tag self._sharding_space: Union[IndexSpace, None] = None self._point: Union[Point, None] = None @@ -998,11 +1014,15 @@ def library_mapper_id(self) -> int: def legion_mapper_id(self) -> int: return self._context.get_mapper_id(self._mapper_id) - def __del__(self) -> None: - del self._req_analyzer - def add_store( - self, store: Store, proj: Proj, perm: Permission, tag: int, flags: int + self, + args: list[LauncherArg], + req_analyzer: CopyReqAnalyzer, + store: Store, + proj: Proj, + perm: Permission, 
+ tag: int, + flags: int, ) -> None: assert store.kind is not Future assert store._transform.bottom @@ -1015,37 +1035,97 @@ def add_store( req = RegionReq(region, perm, proj, tag, flags) - self._req_analyzer.insert(req, field_id) + req_analyzer.insert(req, field_id) + + redop = -1 if proj.redop is None else proj.redop + args.append( + RegionFieldArg( + req_analyzer, + store, + region.index_space.get_dim(), + req, + field_id, + redop, + ) + ) def add_input( self, store: Store, proj: Proj, tag: int = 0, flags: int = 0 ) -> None: - self.add_store(store, proj, Permission.READ, tag, flags) + self.add_store( + self._inputs, + self._input_reqs, + store, + proj, + Permission.READ, + tag, + flags, + ) def add_output( self, store: Store, proj: Proj, tag: int = 0, flags: int = 0 ) -> None: - self.add_store(store, proj, Permission.WRITE, tag, flags) + self.add_store( + self._outputs, + self._output_reqs, + store, + proj, + Permission.WRITE, + tag, + flags, + ) def add_inout( self, store: Store, proj: Proj, tag: int = 0, flags: int = 0 ) -> None: - self.add_store(store, proj, Permission.READ_WRITE, tag, flags) + self.add_store( + self._outputs, + self._output_reqs, + store, + proj, + Permission.READ_WRITE, + tag, + flags, + ) def add_reduction( self, store: Store, proj: Proj, tag: int = 0, flags: int = 0 ) -> None: - self.add_store(store, proj, Permission.REDUCTION, tag, flags) + self.add_store( + self._reductions, + self._output_reqs, + store, + proj, + Permission.REDUCTION, + tag, + flags, + ) def add_source_indirect( self, store: Store, proj: Proj, tag: int = 0, flags: int = 0 ) -> None: - self.add_store(store, proj, Permission.SOURCE_INDIRECT, tag, flags) + self.add_store( + self._source_indirects, + self._source_indirect_reqs, + store, + proj, + Permission.SOURCE_INDIRECT, + tag, + flags, + ) def add_target_indirect( self, store: Store, proj: Proj, tag: int = 0, flags: int = 0 ) -> None: - self.add_store(store, proj, Permission.TARGET_INDIRECT, tag, flags) + self.add_store( + self._target_indirects, + self._target_indirect_reqs, + store, + proj, + Permission.TARGET_INDIRECT, + tag, + flags, + ) def set_sharding_space(self, space: IndexSpace) -> None: self._sharding_space = space @@ -1054,7 +1134,11 @@ def set_point(self, point: Point) -> None: self._point = point def build_copy(self, launch_domain: Rect) -> IndexCopy: - self._req_analyzer.analyze_requirements() + argbuf = BufferBuilder() + pack_args(argbuf, self._inputs) + pack_args(argbuf, self._outputs + self._reductions) + pack_args(argbuf, self._source_indirects) + pack_args(argbuf, self._target_indirects) copy = IndexCopy( launch_domain, @@ -1062,39 +1146,48 @@ def build_copy(self, launch_domain: Rect) -> IndexCopy: tag=self._tag, provenance=self._provenance, ) - for (req, fields) in self._req_analyzer.requirements: - if req.permission in ( - Permission.SOURCE_INDIRECT, - Permission.TARGET_INDIRECT, - ): - assert len(fields) == 1 - req.proj.add(copy, req, fields[0], _index_copy_calls) - else: - req.proj.add(copy, req, fields, _index_copy_calls) + + def add_requirements( + requirements: list[tuple[RegionReq, int]] + ) -> None: + for (req, field) in requirements: + req.proj.add(copy, req, field, _index_copy_calls) + + add_requirements(self._input_reqs.requirements) + add_requirements(self._output_reqs.requirements) + add_requirements(self._source_indirect_reqs.requirements) + add_requirements(self._target_indirect_reqs.requirements) if self._sharding_space is not None: copy.set_sharding_space(self._sharding_space) 
copy.set_possible_src_indirect_out_of_range(self._source_oor) copy.set_possible_dst_indirect_out_of_range(self._target_oor) + copy.set_mapper_arg(argbuf.get_string(), argbuf.get_size()) return copy def build_single_copy(self) -> SingleCopy: - self._req_analyzer.analyze_requirements() + argbuf = BufferBuilder() + pack_args(argbuf, self._inputs) + pack_args(argbuf, self._outputs + self._reductions) + pack_args(argbuf, self._source_indirects) + pack_args(argbuf, self._target_indirects) copy = SingleCopy( mapper=self.legion_mapper_id, tag=self._tag, provenance=self._provenance, ) - for (req, fields) in self._req_analyzer.requirements: - if req.permission in ( - Permission.SOURCE_INDIRECT, - Permission.TARGET_INDIRECT, - ): - assert len(fields) == 1 - req.proj.add_single(copy, req, fields[0], _single_copy_calls) - else: - req.proj.add_single(copy, req, fields, _single_copy_calls) + + def add_requirements( + requirements: list[tuple[RegionReq, int]] + ) -> None: + for (req, field) in requirements: + req.proj.add_single(copy, req, field, _single_copy_calls) + + add_requirements(self._input_reqs.requirements) + add_requirements(self._output_reqs.requirements) + add_requirements(self._source_indirect_reqs.requirements) + add_requirements(self._target_indirect_reqs.requirements) if self._sharding_space is not None: copy.set_sharding_space(self._sharding_space) @@ -1102,6 +1195,7 @@ def build_single_copy(self) -> SingleCopy: copy.set_point(self._point) copy.set_possible_src_indirect_out_of_range(self._source_oor) copy.set_possible_dst_indirect_out_of_range(self._target_oor) + copy.set_mapper_arg(argbuf.get_string(), argbuf.get_size()) return copy def execute( diff --git a/legate/core/operation.py b/legate/core/operation.py index c079afd0d..a158ece7d 100644 --- a/legate/core/operation.py +++ b/legate/core/operation.py @@ -837,6 +837,11 @@ def add_reduction( def add_source_indirect( self, store: Store, partition: Optional[PartSym] = None ) -> None: + if len(self._source_indirects) != 0: + raise RuntimeError( + "There can be only up to one source indirection store for " + "a Copy operation" + ) self._check_store(store) if partition is None: partition = self._get_unique_partition(store) @@ -846,6 +851,11 @@ def add_source_indirect( def add_target_indirect( self, store: Store, partition: Optional[PartSym] = None ) -> None: + if len(self._target_indirects) != 0: + raise RuntimeError( + "There can be only up to one target indirection store for " + "a Copy operation" + ) self._check_store(store) if partition is None: partition = self._get_unique_partition(store) diff --git a/legate_core_cpp.cmake b/legate_core_cpp.cmake index bf89ff01b..6150a1908 100644 --- a/legate_core_cpp.cmake +++ b/legate_core_cpp.cmake @@ -196,7 +196,7 @@ list(APPEND legate_core_SOURCES src/core/mapping/core_mapper.cc src/core/mapping/instance_manager.cc src/core/mapping/mapping.cc - src/core/mapping/task.cc + src/core/mapping/operation.cc src/core/runtime/context.cc src/core/runtime/projection.cc src/core/runtime/runtime.cc @@ -344,8 +344,8 @@ install( install( FILES src/core/mapping/base_mapper.h src/core/mapping/mapping.h - src/core/mapping/task.h - src/core/mapping/task.inl + src/core/mapping/operation.h + src/core/mapping/operation.inl DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/legate/core/mapping) install( diff --git a/src/core/mapping/base_mapper.cc b/src/core/mapping/base_mapper.cc index 983043322..9ae61f62b 100644 --- a/src/core/mapping/base_mapper.cc +++ b/src/core/mapping/base_mapper.cc @@ -16,6 +16,7 @@ #include #include 
+#include #include "legion/legion_mapping.h" #include "mappers/mapping_utilities.h" @@ -23,13 +24,14 @@ #include "core/data/store.h" #include "core/mapping/base_mapper.h" #include "core/mapping/instance_manager.h" -#include "core/mapping/task.h" +#include "core/mapping/operation.h" #include "core/runtime/projection.h" #include "core/runtime/shard.h" #include "core/utilities/linearize.h" #include "legate_defines.h" using LegionTask = Legion::Task; +using LegionCopy = Legion::Copy; using namespace Legion; using namespace Legion::Mapping; @@ -37,6 +39,42 @@ using namespace Legion::Mapping; namespace legate { namespace mapping { +namespace { + +const std::vector& default_store_targets(Processor::Kind kind) +{ + static const std::map> defaults = { + {Processor::LOC_PROC, {StoreTarget::SYSMEM}}, + {Processor::TOC_PROC, {StoreTarget::FBMEM, StoreTarget::ZCMEM}}, + {Processor::OMP_PROC, {StoreTarget::SOCKETMEM, StoreTarget::SYSMEM}}, + }; + + auto finder = defaults.find(kind); + if (defaults.end() == finder) LEGATE_ABORT; + return finder->second; +} + +std::string log_mappable(const Mappable& mappable, bool prefix_only = false) +{ + static const std::map prefixes = { + {LEGION_TASK_MAPPABLE, "Task "}, + {LEGION_COPY_MAPPABLE, "Copy "}, + {LEGION_INLINE_MAPPABLE, "Inline mapping "}, + {LEGION_PARTITION_MAPPABLE, "Partition "}, + }; + auto finder = prefixes.find(mappable.get_mappable_type()); +#ifdef DEBUG_LEGATE + assert(finder != prefixes.end()); +#endif + if (prefix_only) return finder->second; + + std::stringstream ss; + ss << finder->second << mappable.get_unique_id(); + return ss.str(); +} + +} // namespace + BaseMapper::BaseMapper(Runtime* rt, Machine m, const LibraryContext& ctx) : Mapper(rt->get_mapper_runtime()), legion_runtime(rt), @@ -65,14 +103,6 @@ BaseMapper::BaseMapper(Runtime* rt, Machine m, const LibraryContext& ctx) local_omps.push_back(local_proc); break; } - case Processor::IO_PROC: { - local_ios.push_back(local_proc); - break; - } - case Processor::PY_PROC: { - local_pys.push_back(local_proc); - break; - } default: break; } } @@ -187,21 +217,8 @@ void BaseMapper::select_task_options(const MapperContext ctx, Task legate_task(&task, context, runtime, ctx); auto target = task_target(legate_task, options); + dispatch(target, [&output](auto& procs) { output.initial_proc = procs.front(); }); // We never want valid instances - switch (target) { - case TaskTarget::CPU: { - output.initial_proc = local_cpus.front(); - break; - } - case TaskTarget::GPU: { - output.initial_proc = local_gpus.front(); - break; - } - case TaskTarget::OMP: { - output.initial_proc = local_omps.front(); - break; - } - } output.valid_instances = false; } @@ -256,21 +273,7 @@ void BaseMapper::slice_auto_task(const MapperContext ctx, } }; - switch (task.target_proc.kind()) { - case Processor::LOC_PROC: { - round_robin(local_cpus); - break; - } - case Processor::TOC_PROC: { - round_robin(local_gpus); - break; - } - case Processor::OMP_PROC: { - round_robin(local_omps); - break; - } - default: LEGATE_ABORT; - } + dispatch(task.target_proc.kind(), round_robin); } void BaseMapper::generate_prime_factor(const std::vector& processors, @@ -306,22 +309,7 @@ const std::vector BaseMapper::get_processor_grid(Legion::Processor::Kin auto finder = proc_grids.find(key); if (finder != proc_grids.end()) return finder->second; - int32_t num_procs = 1; - switch (kind) { - case Processor::LOC_PROC: { - num_procs = static_cast(local_cpus.size()); - break; - } - case Processor::TOC_PROC: { - num_procs = static_cast(local_gpus.size()); - 
break; - } - case Processor::OMP_PROC: { - num_procs = static_cast(local_omps.size()); - break; - } - default: LEGATE_ABORT; - } + int32_t num_procs = dispatch(kind, [](auto& procs) { return procs.size(); }); std::vector grid; auto factor_it = all_factors[kind].begin(); @@ -348,11 +336,6 @@ void BaseMapper::slice_manual_task(const MapperContext ctx, { output.slices.reserve(input.domain.get_volume()); - // Get the domain for the sharding space also - Domain sharding_domain = task.index_domain; - if (task.sharding_space.exists()) - sharding_domain = runtime->get_index_space_domain(ctx, task.sharding_space); - auto distribute = [&](auto& procs) { auto ndim = input.domain.dim; auto& proc_grid = get_processor_grid(task.target_proc.kind(), ndim); @@ -364,21 +347,7 @@ void BaseMapper::slice_manual_task(const MapperContext ctx, } }; - switch (task.target_proc.kind()) { - case Processor::LOC_PROC: { - distribute(local_cpus); - break; - } - case Processor::TOC_PROC: { - distribute(local_gpus); - break; - } - case Processor::OMP_PROC: { - distribute(local_omps); - break; - } - default: LEGATE_ABORT; - } + dispatch(task.target_proc.kind(), distribute); } void BaseMapper::slice_round_robin_task(const MapperContext ctx, @@ -405,21 +374,7 @@ void BaseMapper::slice_round_robin_task(const MapperContext ctx, } }; - switch (task.target_proc.kind()) { - case Processor::LOC_PROC: { - distribute(local_cpus); - break; - } - case Processor::TOC_PROC: { - distribute(local_gpus); - break; - } - case Processor::OMP_PROC: { - distribute(local_omps); - break; - } - default: LEGATE_ABORT; - } + dispatch(task.target_proc.kind(), distribute); } void BaseMapper::slice_task(const MapperContext ctx, @@ -427,74 +382,46 @@ void BaseMapper::slice_task(const MapperContext ctx, const SliceTaskInput& input, SliceTaskOutput& output) { - if (task.tag == LEGATE_CORE_MANUAL_PARALLEL_LAUNCH_TAG) { - if (task.regions.size() == 0) - slice_round_robin_task(ctx, task, input, output); - else - slice_manual_task(ctx, task, input, output); - } else + if (task.tag == LEGATE_CORE_MANUAL_PARALLEL_LAUNCH_TAG) + slice_manual_task(ctx, task, input, output); + else if (task.regions.size() == 0) + slice_round_robin_task(ctx, task, input, output); + else slice_auto_task(ctx, task, input, output); } bool BaseMapper::has_variant(const MapperContext ctx, const LegionTask& task, Processor::Kind kind) { - const std::pair key(task.task_id, kind); - // Check to see if we already have it - auto finder = leaf_variants.find(key); - if ((finder != leaf_variants.end()) && (finder->second != 0)) return true; - std::vector variants; - runtime->find_valid_variants(ctx, key.first, variants, key.second); - // Process all the results, record if we found what we were looking for - bool has_leaf = false; - for (auto vid : variants) { - assert(vid > 0); - switch (vid) { - case LEGATE_CPU_VARIANT: - case LEGATE_OMP_VARIANT: - case LEGATE_GPU_VARIANT: { - has_leaf = true; - leaf_variants[key] = vid; - break; - } - default: // TODO: handle vectorized variants - LEGATE_ABORT; // unhandled variant kind - } - } - if (!has_leaf) leaf_variants[key] = 0; - return has_leaf; + return find_variant(ctx, task, kind).has_value(); } -VariantID BaseMapper::find_variant(const MapperContext ctx, - const LegionTask& task, - Processor::Kind kind) +std::optional BaseMapper::find_variant(const MapperContext ctx, + const LegionTask& task, + Processor::Kind kind) { - const std::pair key(task.task_id, kind); - auto finder = leaf_variants.find(key); - if ((finder != leaf_variants.end()) && 
(finder->second != 0)) return finder->second; + const VariantCacheKey key(task.task_id, kind); + auto finder = variants.find(key); + if (finder != variants.end()) return finder->second; + // Haven't seen it before so let's look it up to make sure it exists - std::vector variants; - runtime->find_valid_variants(ctx, key.first, variants, key.second); - VariantID result = 0; // 0 is reserved - bool has_leaf = false; - // Process all the results, record if we found what we were looking for - for (auto vid : variants) { + std::vector avail_variants; + runtime->find_valid_variants(ctx, key.first, avail_variants, key.second); + std::optional result; + for (auto vid : avail_variants) { +#ifdef DEBUG_LEGATE assert(vid > 0); +#endif switch (vid) { case LEGATE_CPU_VARIANT: case LEGATE_OMP_VARIANT: case LEGATE_GPU_VARIANT: { - has_leaf = true; - leaf_variants[key] = vid; - result = vid; + result = vid; break; } - default: // TODO: handle vectorized variants - LEGATE_ABORT; // unhandled variant kind + default: LEGATE_ABORT; // unhandled variant kind } } - if (!has_leaf) leaf_variants[key] = 0; - // We must always be able to find the variant; - assert(result != 0); + variants[key] = result; return result; } @@ -511,246 +438,139 @@ void BaseMapper::map_task(const MapperContext ctx, assert(task.get_depth() > 0); // Let's populate easy outputs first - output.chosen_variant = find_variant(ctx, task, task.target_proc.kind()); + auto variant = find_variant(ctx, task, task.target_proc.kind()); +#ifdef DEBUG_LEGATE + assert(variant.has_value()); +#endif + output.chosen_variant = *variant; // Just put our target proc in the target processors for now output.target_procs.push_back(task.target_proc); Task legate_task(&task, context, runtime, ctx); - std::vector options; - switch (task.target_proc.kind()) { - case Processor::LOC_PROC: { - options = {StoreTarget::SYSMEM}; - break; - } - case Processor::TOC_PROC: { - options = {StoreTarget::FBMEM, StoreTarget::ZCMEM}; - break; - } - case Processor::OMP_PROC: { - options = {StoreTarget::SOCKETMEM, StoreTarget::SYSMEM}; - break; - } - default: LEGATE_ABORT; - } + const auto& options = default_store_targets(task.target_proc.kind()); auto mappings = store_mappings(legate_task, options); - std::map client_mapped_regions; - std::map client_mapped_futures; - for (uint32_t mapping_idx = 0; mapping_idx < mappings.size(); ++mapping_idx) { - auto& mapping = mappings[mapping_idx]; - - assert(mapping.stores.size() > 0); - for (uint32_t store_idx = 1; store_idx < mapping.stores.size(); ++store_idx) { - if (!mapping.stores[store_idx].can_colocate_with(mapping.stores[0])) { - logger.error("Mapper %s tried to colocate stores that cannot colocate", get_mapper_name()); - LEGATE_ABORT; - } + auto validate_colocation = [this](const auto& mapping) { + if (mapping.stores.empty()) { + logger.error("Store mapping must contain at least one store"); + LEGATE_ABORT; } - if (mapping.stores.size() > 1 && mapping.policy.ordering.relative) { logger.error("Colocation with relative dimension ordering is illegal"); LEGATE_ABORT; } - - for (auto& store : mapping.stores) { - if (store.is_future()) { - auto fut_idx = store.future().index(); - client_mapped_futures[fut_idx] = mapping_idx; - continue; + auto& first_store = mapping.stores.front(); + for (auto it = mapping.stores.begin() + 1; it != mapping.stores.end(); ++it) { + if (!it->can_colocate_with(first_store)) { + logger.error("Mapper %s tried to colocate stores that cannot colocate", get_mapper_name()); + LEGATE_ABORT; } + } + 
assert(!(mapping.for_future() || mapping.for_unbound_store()) || mapping.stores.size() == 1); + }; + +#ifdef DEBUG_LEGATE + for (auto& mapping : mappings) validate_colocation(mapping); +#endif - auto& rf = store.region_field(); - auto key = rf.unique_id(); - - auto finder = client_mapped_regions.find(key); - // If this is the first store mapping for this requirement, - // we record the mapping index for future reference. - if (finder == client_mapped_regions.end()) client_mapped_regions[key] = mapping_idx; - // If we're still in the same store mapping, we know for sure - // that the mapping is consistent. - else { - if (finder->second == mapping_idx) continue; - // Otherwise, we do consistency checking - auto& other_mapping = mappings[finder->second]; - if (mapping.policy != other_mapping.policy) { + std::vector for_futures, for_unbound_stores, for_stores; + std::set mapped_futures; + std::set mapped_regions; + + for (auto& mapping : mappings) { + if (mapping.for_future()) { + mapped_futures.insert(mapping.store().future_index()); + for_futures.push_back(std::move(mapping)); + } else if (mapping.for_unbound_store()) { + mapped_regions.insert(mapping.store().unique_region_field_id()); + for_unbound_stores.push_back(std::move(mapping)); + } else { + for (auto& store : mapping.stores) mapped_regions.insert(store.unique_region_field_id()); + for_stores.push_back(std::move(mapping)); + } + } + + auto check_consistency = [this](const auto& mappings) { + std::map policies; + for (const auto& mapping : mappings) + for (auto& store : mapping.stores) { + auto key = store.unique_region_field_id(); + auto finder = policies.find(key); + if (policies.end() == finder) + policies[key] = mapping.policy; + else if (mapping.policy != finder->second) { logger.error("Mapper %s returned inconsistent store mappings", get_mapper_name()); LEGATE_ABORT; } } - } - } + }; +#ifdef DEBUG_LEGATE + check_consistency(for_stores); +#endif // Generate default mappings for stores that are not yet mapped by the client mapper auto default_option = options.front(); auto generate_default_mappings = [&](auto& stores, bool exact) { for (auto& store : stores) { + auto mapping = StoreMapping::default_mapping(store, default_option, exact); if (store.is_future()) { - auto fut_idx = store.future().index(); - if (client_mapped_futures.find(fut_idx) == client_mapped_futures.end()) - mappings.push_back(StoreMapping::default_mapping(store, default_option, exact)); - continue; + auto fut_idx = store.future_index(); + if (mapped_futures.find(fut_idx) != mapped_futures.end()) continue; + mapped_futures.insert(fut_idx); + for_futures.push_back(std::move(mapping)); } else { - auto key = store.region_field().unique_id(); - if (client_mapped_regions.find(key) != client_mapped_regions.end()) continue; - client_mapped_regions[key] = static_cast(mappings.size()); - mappings.push_back(StoreMapping::default_mapping(store, default_option, exact)); + auto key = store.unique_region_field_id(); + if (mapped_regions.find(key) != mapped_regions.end()) continue; + mapped_regions.insert(key); + if (store.unbound()) + for_unbound_stores.push_back(std::move(mapping)); + else + for_stores.push_back(std::move(mapping)); } } }; - generate_default_mappings(legate_task.inputs(), false); generate_default_mappings(legate_task.outputs(), false); generate_default_mappings(legate_task.reductions(), false); - output.chosen_instances.resize(task.regions.size()); - - bool can_fail = true; - std::map> instance_to_mappings; - std::map mapping_to_instance; - std::vector 
handled(mappings.size(), false); - - // See case of failed instance creation below - auto tighten_write_reqs = [&]() { - for (int32_t mapping_idx = 0; mapping_idx < mappings.size(); ++mapping_idx) { - auto& mapping = mappings[mapping_idx]; - PrivilegeMode priv = LEGION_NO_ACCESS; -#ifdef DEBUG_LEGATE - std::stringstream reqs_ss; -#endif - for (auto req_idx : mapping.requirement_indices()) { - const RegionRequirement& req = task.regions[req_idx]; - if (!req.region.exists()) continue; - priv |= req.privilege; -#ifdef DEBUG_LEGATE - reqs_ss << " " << req_idx; -#endif - } - if (!(priv & LEGION_WRITE_PRIV) || mapping.policy.exact) continue; -#ifdef DEBUG_LEGATE - logger.debug() << "Task " << task.get_unique_id() - << ": tightened mapping policy for reqs:" << reqs_ss.str(); -#endif - mapping.policy.exact = true; - if (!handled[mapping_idx]) continue; - handled[mapping_idx] = false; - auto m2i_it = mapping_to_instance.find(mapping_idx); - if (m2i_it == mapping_to_instance.end()) continue; - PhysicalInstance inst = m2i_it->second; - mapping_to_instance.erase(m2i_it); - auto i2m_it = instance_to_mappings.find(inst); - i2m_it->second.erase(mapping_idx); - if (i2m_it->second.empty()) { - runtime->release_instance(ctx, inst); - instance_to_mappings.erase(i2m_it); - } - } - }; - - // Mapping each field separately for each of the logical regions - for (int32_t mapping_idx = 0; mapping_idx < mappings.size(); ++mapping_idx) { - if (handled[mapping_idx]) continue; - auto& mapping = mappings[mapping_idx]; - auto req_indices = mapping.requirement_indices(); - - if (req_indices.empty()) { - // This is a mapping for futures + // Map future-backed stores + auto map_futures = [&](auto& mappings) { + for (auto& mapping : mappings) { StoreTarget target = mapping.policy.target; #ifdef LEGATE_NO_FUTURES_ON_FB if (target == StoreTarget::FBMEM) target = StoreTarget::ZCMEM; #endif output.future_locations.push_back(get_target_memory(task.target_proc, target)); - handled[mapping_idx] = true; - continue; } - - if (mapping.for_unbound_stores()) { - for (auto req_idx : req_indices) { - output.output_targets[req_idx] = get_target_memory(task.target_proc, mapping.policy.target); - auto ndim = mapping.stores.front().dim(); - // FIXME: Unbound stores can have more than one dimension later - std::vector dimension_ordering; - for (int32_t dim = ndim - 1; dim >= 0; --dim) - dimension_ordering.push_back( - static_cast(static_cast(DimensionKind::LEGION_DIM_X) + dim)); - dimension_ordering.push_back(DimensionKind::LEGION_DIM_F); - output.output_constraints[req_idx].ordering_constraint = - OrderingConstraint(dimension_ordering, false); - } - handled[mapping_idx] = true; - continue; - } - - std::vector> reqs; -#ifdef DEBUG_LEGATE - std::stringstream reqs_ss; -#endif - for (auto req_idx : req_indices) { - const auto& req = task.regions[req_idx]; - if (!req.region.exists()) continue; - reqs.push_back(std::cref(req)); -#ifdef DEBUG_LEGATE - reqs_ss << " " << req_idx; -#endif - } - if (reqs.empty()) { - handled[mapping_idx] = true; - continue; - } - - // Get an instance and acquire it if necessary. If the acquire fails then prune it from the - // mapper's data structures and retry, until we succeed or map_legate_store fails with an out of - // memory error. 
- PhysicalInstance result; - while (map_legate_store(ctx, task, mapping, reqs, task.target_proc, result, can_fail)) { - if (result == PhysicalInstance()) break; - if (instance_to_mappings.count(result) > 0 || runtime->acquire_instance(ctx, result)) { -#ifdef DEBUG_LEGATE - logger.debug() << "Task " << task.get_unique_id() << ": acquired instance " << result - << " for reqs:" << reqs_ss.str(); -#endif - break; - } -#ifdef DEBUG_LEGATE - logger.debug() << "Task " << task.get_unique_id() << ": failed to acquire instance " << result - << " for reqs:" << reqs_ss.str(); -#endif - AutoLock lock(ctx, local_instances->manager_lock()); - local_instances->erase(result); - } - - // If instance creation failed we try mapping all stores again, but request tight instances for - // write requirements. The hope is that these write requirements cover the entire region (i.e. - // they use a complete partition), so the new tight instances will invalidate any pre-existing - // "bloated" instances for the same region, freeing up enough memory so that mapping can succeed - if (result == PhysicalInstance()) { -#ifdef DEBUG_LEGATE - logger.debug() << "Task " << task.get_unique_id() - << ": failed mapping for reqs:" << reqs_ss.str(); -#endif - assert(can_fail); - tighten_write_reqs(); - mapping_idx = -1; - can_fail = false; - continue; + }; + map_futures(for_futures); + + // Map unbound stores + auto map_unbound_stores = [&](auto& mappings) { + for (auto& mapping : mappings) { + auto req_idx = mapping.requirement_index(); + output.output_targets[req_idx] = get_target_memory(task.target_proc, mapping.policy.target); + auto ndim = mapping.store().dim(); + // FIXME: Unbound stores can have more than one dimension later + std::vector dimension_ordering; + for (int32_t dim = ndim - 1; dim >= 0; --dim) + dimension_ordering.push_back( + static_cast(static_cast(DimensionKind::LEGION_DIM_X) + dim)); + dimension_ordering.push_back(DimensionKind::LEGION_DIM_F); + output.output_constraints[req_idx].ordering_constraint = + OrderingConstraint(dimension_ordering, false); } + }; + map_unbound_stores(for_unbound_stores); - // Success; record the instance for this mapping. -#ifdef DEBUG_LEGATE - logger.debug() << "Task " << task.get_unique_id() - << ": completed mapping for reqs:" << reqs_ss.str(); -#endif - instance_to_mappings[result].insert(mapping_idx); - mapping_to_instance[mapping_idx] = result; - handled[mapping_idx] = true; - } + output.chosen_instances.resize(task.regions.size()); + std::map*> output_map; + for (uint32_t idx = 0; idx < task.regions.size(); ++idx) + output_map[&task.regions[idx]] = &output.chosen_instances[idx]; - // Succeeded in mapping all stores, record it on map_task output. - for (const auto& m2i : mapping_to_instance) - for (auto req_idx : mappings[m2i.first].requirement_indices()) - if (task.regions[req_idx].region.exists()) - output.chosen_instances[req_idx].push_back(m2i.second); + map_legate_stores(ctx, task, for_stores, task.target_proc, output_map); } void BaseMapper::map_replicate_task(const MapperContext ctx, @@ -762,35 +582,6 @@ void BaseMapper::map_replicate_task(const MapperContext ctx, LEGATE_ABORT; } -bool BaseMapper::find_existing_instance(const MapperContext ctx, - LogicalRegion region, - FieldID fid, - Memory target_memory, - PhysicalInstance& result, - Strictness strictness, - bool acquire_instance_lock) -{ - std::unique_ptr lock = - acquire_instance_lock ? 
std::make_unique(ctx, local_instances->manager_lock()) - : nullptr; - // See if we already have it in our local instances - if (local_instances->find_instance(region, fid, target_memory, result)) - return true; - else if (strictness == Strictness::strict) - return false; - - // See if we can find an existing instance in any memory - if (local_instances->find_instance(region, fid, local_system_memory, result)) return true; - - for (auto& pair : local_frame_buffers) - if (local_instances->find_instance(region, fid, pair.second, result)) return true; - - for (auto& pair : local_numa_domains) - if (local_instances->find_instance(region, fid, pair.second, result)) return true; - - return false; -} - Memory BaseMapper::get_target_memory(Processor proc, StoreTarget target) { switch (target) { @@ -804,26 +595,120 @@ Memory BaseMapper::get_target_memory(Processor proc, StoreTarget target) return Memory::NO_MEMORY; } +void BaseMapper::map_legate_stores(const MapperContext ctx, + const Mappable& mappable, + std::vector& mappings, + Processor target_proc, + OutputMap& output_map) +{ + auto try_mapping = [&](bool can_fail) { + const PhysicalInstance NO_INST{}; + std::vector instances; + for (auto& mapping : mappings) { + PhysicalInstance result = NO_INST; + auto reqs = mapping.requirements(); + while (map_legate_store(ctx, mappable, mapping, reqs, target_proc, result, can_fail)) { + if (NO_INST == result) { +#ifdef DEBUG_LEGATE + assert(can_fail); +#endif + for (auto& instance : instances) runtime->release_instance(ctx, instance); + return false; + } +#ifdef DEBUG_LEGATE + std::stringstream reqs_ss; + for (auto req_idx : mapping.requirement_indices()) reqs_ss << " " << req_idx; +#endif + if (runtime->acquire_instance(ctx, result)) { +#ifdef DEBUG_LEGATE + logger.debug() << log_mappable(mappable) << ": acquired instance " << result + << " for reqs:" << reqs_ss.str(); +#endif + break; + } +#ifdef DEBUG_LEGATE + logger.debug() << log_mappable(mappable) << ": failed to acquire instance " << result + << " for reqs:" << reqs_ss.str(); +#endif + AutoLock lock(ctx, local_instances->manager_lock()); + local_instances->erase(result); + result = NO_INST; + } + instances.push_back(result); + } + + // If we're here, all stores are mapped and instances are all acquired + for (uint32_t idx = 0; idx < mappings.size(); ++idx) { + auto& mapping = mappings[idx]; + auto& instance = instances[idx]; + for (auto& req : mapping.requirements()) output_map[req]->push_back(instance); + } + return true; + }; + + // We can retry the mapping with tightened policies only if at least one of the policies + // is lenient + bool can_fail = false; + for (auto& mapping : mappings) can_fail = can_fail || !mapping.policy.exact; + + if (!try_mapping(can_fail)) { +#ifdef DEBUG_LEGATE + logger.debug() << log_mappable(mappable) << " failed to map all stores, retrying with " + << "tighter policies"; +#endif + // If instance creation failed we try mapping all stores again, but request tight instances for + // write requirements. The hope is that these write requirements cover the entire region (i.e. 
+ // they use a complete partition), so the new tight instances will invalidate any pre-existing + // "bloated" instances for the same region, freeing up enough memory so that mapping can succeed + tighten_write_policies(mappable, mappings); + try_mapping(false); + } +} + +void BaseMapper::tighten_write_policies(const Mappable& mappable, + std::vector& mappings) +{ + for (auto& mapping : mappings) { + // If the policy is exact, there's nothing we can tighten + if (mapping.policy.exact) continue; + + PrivilegeMode priv = LEGION_NO_ACCESS; + for (auto* req : mapping.requirements()) priv |= req->privilege; + // We tighten only write requirements + if (!(priv & LEGION_WRITE_PRIV)) continue; + +#ifdef DEBUG_LEGATE + std::stringstream reqs_ss; + for (auto req_idx : mapping.requirement_indices()) reqs_ss << " " << req_idx; + logger.debug() << log_mappable(mappable) + << ": tightened mapping policy for reqs:" << reqs_ss.str(); +#endif + mapping.policy.exact = true; + } +} + bool BaseMapper::map_legate_store(const MapperContext ctx, const Mappable& mappable, const StoreMapping& mapping, - std::vector> reqs, + const std::set& reqs, Processor target_proc, PhysicalInstance& result, bool can_fail) { + if (reqs.empty()) return false; + const auto& policy = mapping.policy; std::vector regions; - for (auto& req : reqs) regions.push_back(req.get().region); + for (auto* req : reqs) regions.push_back(req->region); auto target_memory = get_target_memory(target_proc, policy.target); ReductionOpID redop = 0; bool first = true; - for (auto& req : reqs) { + for (auto* req : reqs) { if (first) - redop = req.get().redop; + redop = req->redop; else { - if (redop != req.get().redop) { + if (redop != req->redop) { logger.error( "Colocated stores should be either non-reduction arguments " "or reductions with the same reduction operator."); @@ -970,281 +855,41 @@ bool BaseMapper::map_legate_store(const MapperContext ctx, return true; } -bool BaseMapper::map_raw_array(const MapperContext ctx, - const Mappable& mappable, - uint32_t index, - LogicalRegion region, - FieldID fid, - Memory target_memory, - Processor target_proc, - const std::vector& valid, - PhysicalInstance& result, - bool memoize_result, - ReductionOpID redop /*=0*/) -{ - // If we're making a reduction instance, we should just make it now - if (redop != 0) { - // Switch the target memory if we're going to a GPU because - // Realm's DMA system still does not support reductions - const std::vector regions(1, region); - LayoutConstraintSet layout_constraints; - // No specialization - layout_constraints.add_constraint(SpecializedConstraint(REDUCTION_FOLD_SPECIALIZE, redop)); - // SOA-C dimension ordering - std::vector dimension_ordering(4); - dimension_ordering[0] = DIM_Z; - dimension_ordering[1] = DIM_Y; - dimension_ordering[2] = DIM_X; - dimension_ordering[3] = DIM_F; - layout_constraints.add_constraint(OrderingConstraint(dimension_ordering, false /*contiguous*/)); - // Constraint for the kind of memory - layout_constraints.add_constraint(MemoryConstraint(target_memory.kind())); - // Make sure we have our field - const std::vector fields(1, fid); - layout_constraints.add_constraint(FieldConstraint(fields, true /*contiguous*/)); - if (!runtime->create_physical_instance( - ctx, target_memory, layout_constraints, regions, result, true /*acquire*/)) - report_failed_mapping(mappable, index, target_memory, redop); - // We already did the acquire - return false; - } - - AutoLock lock(ctx, local_instances->manager_lock()); - - // See if we already have it in our 
local instances - if (local_instances->find_instance(region, fid, target_memory, result)) - // Needs acquire to keep the runtime happy - return true; - - // There's a little asymmetry here between CPUs and GPUs for NUMA effects - // For CPUs NUMA-effects are within a factor of 2X additional latency and - // reduced bandwidth, so it's better to just use data where it is rather - // than move it. For GPUs though, the difference between local framebuffer - // and remote can be on the order of 800 GB/s versus 20 GB/s over NVLink - // so it's better to move things local, so we'll always try to make a local - // instance before checking for a nearby instance in a different GPU. - if (target_proc.exists() && ((target_proc.kind() == Processor::LOC_PROC) || - (target_proc.kind() == Processor::OMP_PROC))) { - Machine::MemoryQuery affinity_mems(machine); - affinity_mems.has_affinity_to(target_proc); - for (auto memory : affinity_mems) { - if (local_instances->find_instance(region, fid, memory, result)) - // Needs acquire to keep the runtime happy - return true; - } - } - // This whole process has to appear atomic - runtime->disable_reentrant(ctx); - // Haven't made this instance before, so make it now - // We can do an interesting optimization here to try to reduce unnecessary - // inter-memory copies. For logical regions that are overlapping we try - // to accumulate as many as possible into one physical instance and use - // that instance for all the tasks for the different regions. - // First we have to see if there is anything we overlap with - const IndexSpace is = region.get_index_space(); - const Domain domain = runtime->get_index_space_domain(ctx, is); - auto group = local_instances->find_region_group(region, domain, fid, target_memory); - - // We're going to need some of this constraint information no matter - // which path we end up taking below - LayoutConstraintSet layout_constraints; - // No specialization - layout_constraints.add_constraint(SpecializedConstraint()); - // SOA-C dimension ordering - std::vector dimension_ordering(4); - dimension_ordering[0] = DIM_Z; - dimension_ordering[1] = DIM_Y; - dimension_ordering[2] = DIM_X; - dimension_ordering[3] = DIM_F; - layout_constraints.add_constraint(OrderingConstraint(dimension_ordering, false /*contiguous*/)); - // Constraint for the kind of memory - layout_constraints.add_constraint(MemoryConstraint(target_memory.kind())); - // Make sure we have our field - const std::vector fields(1, fid); - layout_constraints.add_constraint(FieldConstraint(fields, true /*contiguous*/)); - - bool created; - size_t footprint; - if (runtime->find_or_create_physical_instance(ctx, - target_memory, - layout_constraints, - group->get_regions(), - result, - created, - true /*acquire*/, - memoize_result ? 
GC_NEVER_PRIORITY : 0, - false /*tight bounds*/, - &footprint)) { - // We succeeded in making the instance where we want it - assert(result.exists()); - if (created) - logger.info("%s created instance %lx containing %zd bytes in memory " IDFMT, - get_mapper_name(), - result.get_instance_id(), - footprint, - target_memory.id); - // Only save the result for future use if it is not an external instance - if (memoize_result && !result.is_external_instance()) { - auto replaced = local_instances->record_instance(group, fid, result); - for (auto& instance : replaced) { - if (!instance.is_external_instance()) - runtime->set_garbage_collection_priority(ctx, instance, 0); - } - } - // We made it so no need for an acquire - runtime->enable_reentrant(ctx); - return false; - } - // Done with the atomic part - runtime->enable_reentrant(ctx); - - // If we get here it's because we failed to make the instance, we still - // have a few more tricks that we can try - // First see if we can find an existing valid instance that we can use - // with affinity to our target processor - if (!valid.empty()) - for (auto& instance : valid) { - // If it doesn't have the field then we don't care - if (instance.has_field(fid)) continue; - if (!target_proc.exists() || machine.has_affinity(target_proc, instance.get_location())) { - result = instance; - return true; - } - } - - // Still couldn't find an instance, see if we can find any instances - // in memories that are local to our node that we can use - if (target_proc.exists()) { - Machine::MemoryQuery affinity_mems(machine); - affinity_mems.has_affinity_to(target_proc); - for (auto mem : affinity_mems) - if (local_instances->find_instance(region, fid, mem, result)) - // Needs acquire to keep the runtime happy - return true; - } else if (find_existing_instance( - ctx, region, fid, target_memory, result, Strictness::strict, false)) - return true; - // If we make it here then we failed entirely - report_failed_mapping(mappable, index, target_memory, redop); - return true; -} - -void BaseMapper::filter_failed_acquires(const MapperContext ctx, - std::vector& needed_acquires, - std::set& failed_acquires) -{ - AutoLock lock(ctx, local_instances->manager_lock()); - for (auto& instance : needed_acquires) { - if (failed_acquires.find(instance) != failed_acquires.end()) continue; - failed_acquires.insert(instance); - local_instances->erase(instance); - } - needed_acquires.clear(); -} - void BaseMapper::report_failed_mapping(const Mappable& mappable, uint32_t index, Memory target_memory, ReductionOpID redop) { - const char* memory_kinds[] = { + static const char* memory_kinds[] = { #define MEM_NAMES(name, desc) desc, REALM_MEMORY_KINDS(MEM_NAMES) #undef MEM_NAMES }; + + std::string opname = ""; + if (mappable.get_mappable_type() == Mappable::TASK_MAPPABLE) { + const auto task = mappable.as_task(); + opname = task->get_task_name(); + } + std::string provenance = mappable.get_provenance_string(); if (provenance.empty()) provenance = "unknown provenance"; - switch (mappable.get_mappable_type()) { - case Mappable::TASK_MAPPABLE: { - const auto task = mappable.as_task(); - if (redop > 0) - logger.error( - "Mapper %s failed to map reduction (%d) region " - "requirement %d of task %s [%s] (UID %lld) into %s memory " IDFMT, - get_mapper_name(), - redop, - index, - task->get_task_name(), - provenance.c_str(), - mappable.get_unique_id(), - memory_kinds[target_memory.kind()], - target_memory.id); - else - logger.error( - "Mapper %s failed to map region requirement %d of " - "task %s [%s] 
(UID %lld) into %s memory " IDFMT, - get_mapper_name(), - index, - task->get_task_name(), - provenance.c_str(), - mappable.get_unique_id(), - memory_kinds[target_memory.kind()], - target_memory.id); - break; - } - case Mappable::COPY_MAPPABLE: { - if (redop > 0) - logger.error( - "Mapper %s failed to map reduction (%d) region " - "requirement %d of copy [%s] (UID %lld) into %s memory " IDFMT, - get_mapper_name(), - redop, - index, - provenance.c_str(), - mappable.get_unique_id(), - memory_kinds[target_memory.kind()], - target_memory.id); - else - logger.error( - "Mapper %s failed to map region requirement %d of " - "copy [%s] (UID %lld) into %s memory " IDFMT, - get_mapper_name(), - index, - provenance.c_str(), - mappable.get_unique_id(), - memory_kinds[target_memory.kind()], - target_memory.id); - break; - } - case Mappable::INLINE_MAPPABLE: { - if (redop > 0) - logger.error( - "Mapper %s failed to map reduction (%d) region " - "requirement %d of inline mapping [%s] (UID %lld) into %s memory " IDFMT, - get_mapper_name(), - redop, - index, - provenance.c_str(), - mappable.get_unique_id(), - memory_kinds[target_memory.kind()], - target_memory.id); - else - logger.error( - "Mapper %s failed to map region requirement %d of " - "inline mapping [%s] (UID %lld) into %s memory " IDFMT, - get_mapper_name(), - index, - provenance.c_str(), - mappable.get_unique_id(), - memory_kinds[target_memory.kind()], - target_memory.id); - break; - } - case Mappable::PARTITION_MAPPABLE: { - assert(redop == 0); - logger.error( - "Mapper %s failed to map region requirement %d of " - "partition (UID %lld) into %s memory " IDFMT, - get_mapper_name(), - index, - mappable.get_unique_id(), - memory_kinds[target_memory.kind()], - target_memory.id); - break; - } - default: LEGATE_ABORT; // should never get here - } - LEGATE_ABORT; + + std::stringstream req_ss; + if (redop > 0) + req_ss << "reduction (" << redop << ") requirement " << index; + else + req_ss << "region requirement " << index; + + logger.error("Mapper %s failed to map %s of %s%s[%s] (UID %lld) into %s memory " IDFMT, + get_mapper_name(), + req_ss.str().c_str(), + log_mappable(mappable, true /*prefix_only*/).c_str(), + opname.c_str(), + provenance.c_str(), + mappable.get_unique_id(), + memory_kinds[target_memory.kind()], + target_memory.id); } void BaseMapper::select_task_variant(const MapperContext ctx, @@ -1252,7 +897,11 @@ void BaseMapper::select_task_variant(const MapperContext ctx, const SelectVariantInput& input, SelectVariantOutput& output) { - output.chosen_variant = find_variant(ctx, task, input.processor.kind()); + auto variant = find_variant(ctx, task, input.processor.kind()); +#ifdef DEBUG_LEGATE + assert(variant.has_value()); +#endif + output.chosen_variant = *variant; } void BaseMapper::postmap_task(const MapperContext ctx, @@ -1307,14 +956,6 @@ void BaseMapper::legate_select_sources(const MapperContext ctx, if (!affinity.empty()) { assert(affinity.size() == 1); memory_bandwidth = affinity[0].bandwidth; -#if 0 - } else { - // TODO: More graceful way of dealing with multi-hop copies - logger.warning("Legate mapper is potentially " - "requesting a multi-hop copy between memories " - IDFMT " and " IDFMT "!", location.id, - destination_memory.id); -#endif } source_memories[location] = memory_bandwidth; band_ranking.push_back(std::pair(instance, memory_bandwidth)); @@ -1349,18 +990,26 @@ void BaseMapper::report_profiling(const MapperContext ctx, LEGATE_ABORT; } +ShardingID BaseMapper::find_sharding_functor_by_key_store_projection( + const 
std::vector& requirements) +{ + ProjectionID proj_id = 0; + for (auto& requirement : requirements) + if (LEGATE_CORE_KEY_STORE_TAG == requirement.tag) { + proj_id = requirement.projection; + break; + } + return find_sharding_functor_by_projection_functor(proj_id); +} + void BaseMapper::select_sharding_functor(const MapperContext ctx, const LegionTask& task, const SelectShardingFunctorInput& input, SelectShardingFunctorOutput& output) { - for (auto& req : task.regions) - if (req.tag == LEGATE_CORE_KEY_STORE_TAG) { - output.chosen_functor = find_sharding_functor_by_projection_functor(req.projection); - return; - } - - output.chosen_functor = 0; + output.chosen_functor = task.is_index_space + ? find_sharding_functor_by_key_store_projection(task.regions) + : find_sharding_functor_by_projection_functor(0); } void BaseMapper::map_inline(const MapperContext ctx, @@ -1368,49 +1017,26 @@ void BaseMapper::map_inline(const MapperContext ctx, const MapInlineInput& input, MapInlineOutput& output) { - const std::vector& valid = input.valid_instances; - const RegionRequirement& req = inline_op.requirement; - output.chosen_instances.resize(req.privilege_fields.size()); - uint32_t index = 0; - std::vector needed_acquires; - for (auto fid : req.privilege_fields) { - if (map_raw_array(ctx, - inline_op, - 0, - req.region, - fid, - local_system_memory, - inline_op.parent_task->current_proc, - valid, - output.chosen_instances[index], - false /*memoize*/, - req.redop)) - needed_acquires.push_back(output.chosen_instances[index]); - ++index; - } - while (!needed_acquires.empty() && - !runtime->acquire_and_filter_instances(ctx, needed_acquires, true /*filter on acquire*/)) { - assert(!needed_acquires.empty()); - std::set failed_instances; - filter_failed_acquires(ctx, needed_acquires, failed_instances); - // Now go through all the fields for the instances and try and remap - std::set::const_iterator fit = req.privilege_fields.begin(); - for (uint32_t idx = 0; idx < output.chosen_instances.size(); idx++, fit++) { - if (failed_instances.find(output.chosen_instances[idx]) == failed_instances.end()) continue; - // Now try to remap it - if (map_raw_array(ctx, - inline_op, - 0 /*idx*/, - req.region, - *fit, - local_system_memory, - inline_op.parent_task->current_proc, - valid, - output.chosen_instances[idx], - false /*memoize*/)) - needed_acquires.push_back(output.chosen_instances[idx]); - } - } + Processor target_proc{Processor::NO_PROC}; + if (!local_omps.empty()) + target_proc = local_omps.front(); + else + target_proc = local_cpus.front(); + + auto store_target = default_store_targets(target_proc.kind()).front(); + +#ifdef DEBUG_LEGATE + assert(inline_op.requirement.instance_fields.size() == 1); +#endif + + Store store(legion_runtime->get_mapper_runtime(), ctx, &inline_op.requirement); + std::vector mappings; + mappings.push_back(StoreMapping::default_mapping(store, store_target, false)); + + std::map*> output_map; + for (auto* req : mappings.front().requirements()) output_map[req] = &output.chosen_instances; + + map_legate_stores(ctx, inline_op, mappings, target_proc, output_map); } void BaseMapper::select_inline_sources(const MapperContext ctx, @@ -1430,181 +1056,97 @@ void BaseMapper::report_profiling(const MapperContext ctx, } void BaseMapper::map_copy(const MapperContext ctx, - const Copy& copy, + const LegionCopy& copy, const MapCopyInput& input, MapCopyOutput& output) { - // We should always be able to materialize instances of the things - // we are copying so make concrete source instances - std::vector 
needed_acquires; - Memory target_memory = local_system_memory; - /* - if (copy.is_index_space) { - // If we've got GPUs, assume we're using them - if (!local_gpus.empty() || !local_omps.empty()) { - const ShardingID sid = select_sharding_functor(copy); - NumPyShardingFunctor* functor = find_sharding_functor(sid); - Domain sharding_domain = copy.index_domain; - if (copy.sharding_space.exists()) - sharding_domain = runtime->get_index_space_domain(ctx, copy.sharding_space); - const uint32_t local_index = - functor->localize(copy.index_point, sharding_domain, total_nodes, local_node); - if (!local_gpus.empty()) { - const Processor proc = local_gpus[local_index % local_gpus.size()]; - target_memory = local_frame_buffers[proc]; - } else { - const Processor proc = local_omps[local_index % local_omps.size()]; - target_memory = local_numa_domains[proc]; - } - } - } else { - */ - { - // If we have just one local GPU then let's use it, otherwise punt to CPU - // since it's not clear which one we should use - if (local_frame_buffers.size() == 1) target_memory = local_frame_buffers.begin()->second; - } + Processor target_proc{Processor::NO_PROC}; - auto map_stores = [&](auto idx, auto& req, auto& inputs, auto& outputs) { - auto& region = req.region; - outputs.resize(req.privilege_fields.size()); - const auto& valid = inputs; - uint32_t fidx = 0; - const bool memoize = req.privilege != LEGION_REDUCE; - for (auto fid : req.privilege_fields) { - if (req.redop != 0) { - ++fidx; - continue; - } - if (find_existing_instance(ctx, region, fid, target_memory, outputs[fidx]) || - map_raw_array(ctx, - copy, - idx, - region, - fid, - target_memory, - Processor::NO_PROC, - valid, - outputs[fidx], - memoize)) - needed_acquires.push_back(outputs[fidx]); - ++fidx; - } - }; + uint32_t proc_id = 0; + if (copy.is_index_space) { + Domain sharding_domain = copy.index_domain; + if (copy.sharding_space.exists()) + sharding_domain = runtime->get_index_space_domain(ctx, copy.sharding_space); - auto dst_offset = copy.src_requirements.size(); - auto src_indirect_offset = dst_offset + copy.dst_requirements.size(); - auto dst_indirect_offset = src_indirect_offset + copy.src_indirect_requirements.size(); - - for (uint32_t idx = 0; idx < copy.src_requirements.size(); idx++) { - map_stores( - idx, copy.src_requirements[idx], input.src_instances[idx], output.src_instances[idx]); - - map_stores(idx + dst_offset, - copy.dst_requirements[idx], - input.dst_instances[idx], - output.dst_instances[idx]); - - if (idx < copy.src_indirect_requirements.size()) { - std::vector outputs; - map_stores(idx + src_indirect_offset, - copy.src_indirect_requirements[idx], - input.src_indirect_instances[idx], - outputs); - output.src_indirect_instances[idx] = outputs[0]; - } + // FIXME: We might later have non-identity projections for copy requirements, + // in which case we should find the key store and use its projection functor + // for the linearization + auto* key_functor = find_legate_projection_functor(0); - if (idx < copy.dst_indirect_requirements.size()) { - std::vector outputs; - map_stores(idx + dst_indirect_offset, - copy.dst_indirect_requirements[idx], - input.dst_indirect_instances[idx], - outputs); - output.dst_indirect_instances[idx] = outputs[0]; + if (key_functor != nullptr) { + auto lo = key_functor->project_point(sharding_domain.lo(), sharding_domain); + auto hi = key_functor->project_point(sharding_domain.hi(), sharding_domain); + auto p = key_functor->project_point(copy.index_point, sharding_domain); + proc_id = linearize(lo, hi, p); 
+ } else { + proc_id = linearize(sharding_domain.lo(), sharding_domain.hi(), copy.index_point); } } + if (!local_gpus.empty()) + target_proc = local_gpus[proc_id % local_gpus.size()]; + else if (!local_omps.empty()) + target_proc = local_omps[proc_id % local_omps.size()]; + else + target_proc = local_cpus[proc_id % local_cpus.size()]; - auto remap_stores = [&](auto idx, auto& req, auto& inputs, auto& outputs, auto& failed_acquires) { - auto& region = req.region; - const auto& valid = inputs; - uint32_t fidx = 0; - const bool memoize = req.privilege != LEGION_REDUCE; - for (auto fid : req.privilege_fields) { - if (failed_acquires.find(outputs[fidx]) == failed_acquires.end()) { - ++fidx; - continue; - } - if (map_raw_array(ctx, - copy, - idx, - region, - fid, - target_memory, - Processor::NO_PROC, - valid, - outputs[fidx], - memoize)) - needed_acquires.push_back(outputs[fidx]); - ++fidx; - } + auto store_target = default_store_targets(target_proc.kind()).front(); + + Copy legate_copy(&copy, runtime, ctx); + + std::map*> output_map; + auto add_to_output_map = [&output_map](auto& reqs, auto& instances) { + instances.resize(reqs.size()); + for (uint32_t idx = 0; idx < reqs.size(); ++idx) output_map[&reqs[idx]] = &instances[idx]; }; + add_to_output_map(copy.src_requirements, output.src_instances); + add_to_output_map(copy.dst_requirements, output.dst_instances); - while (!needed_acquires.empty() && - !runtime->acquire_and_filter_instances(ctx, needed_acquires, true /*filter on acquire*/)) { - assert(!needed_acquires.empty()); - // If we failed to acquire any of the instances we need to prune them - // out of the mapper's data structure so do that first - std::set failed_acquires; - filter_failed_acquires(ctx, needed_acquires, failed_acquires); - - // Now go through and try to remap region requirements with failed acquisitions - for (uint32_t idx = 0; idx < copy.src_requirements.size(); idx++) { - remap_stores(idx, - copy.src_requirements[idx], - input.src_instances[idx], - output.src_instances[idx], - failed_acquires); - - remap_stores(idx + dst_offset, - copy.dst_requirements[idx], - input.dst_instances[idx], - output.dst_instances[idx], - failed_acquires); - if (idx < copy.src_indirect_requirements.size()) { - std::vector outputs(1, output.src_indirect_instances[idx]); - remap_stores(idx + src_indirect_offset, - copy.src_indirect_requirements[idx], - input.src_indirect_instances[idx], - outputs, - failed_acquires); - } - if (idx < copy.dst_indirect_requirements.size()) { - std::vector outputs(1, output.dst_indirect_instances[idx]); - remap_stores(idx + dst_indirect_offset, - copy.dst_indirect_requirements[idx], - input.dst_indirect_instances[idx], - outputs, - failed_acquires); - } - } +#ifdef DEBUG_LEGATE + assert(copy.src_indirect_requirements.size() <= 1); + assert(copy.dst_indirect_requirements.size() <= 1); +#endif + if (!copy.src_indirect_requirements.empty()) { + // This is to make the push_back call later add the instance to the right place + output.src_indirect_instances.clear(); + output_map[&copy.src_indirect_requirements.front()] = &output.src_indirect_instances; } + if (!copy.dst_indirect_requirements.empty()) { + // This is to make the push_back call later add the instance to the right place + output.dst_indirect_instances.clear(); + output_map[&copy.dst_indirect_requirements.front()] = &output.dst_indirect_instances; + } + + std::vector mappings; + + for (auto& store : legate_copy.inputs()) + mappings.push_back(StoreMapping::default_mapping(store, store_target, false)); + for (auto& store : 
legate_copy.outputs()) + mappings.push_back(StoreMapping::default_mapping(store, store_target, false)); + for (auto& store : legate_copy.input_indirections()) + mappings.push_back(StoreMapping::default_mapping(store, store_target, false)); + for (auto& store : legate_copy.output_indirections()) + mappings.push_back(StoreMapping::default_mapping(store, store_target, false)); + + map_legate_stores(ctx, copy, mappings, target_proc, output_map); } void BaseMapper::select_copy_sources(const MapperContext ctx, - const Copy& copy, + const LegionCopy& copy, const SelectCopySrcInput& input, SelectCopySrcOutput& output) { legate_select_sources(ctx, input.target, input.source_instances, output.chosen_ranking); } -void BaseMapper::speculate(const MapperContext ctx, const Copy& copy, SpeculativeOutput& output) +void BaseMapper::speculate(const MapperContext ctx, + const LegionCopy& copy, + SpeculativeOutput& output) { output.speculate = false; } void BaseMapper::report_profiling(const MapperContext ctx, - const Copy& copy, + const LegionCopy& copy, const CopyProfilingInfo& input) { // No profiling for copies yet @@ -1612,11 +1154,12 @@ void BaseMapper::report_profiling(const MapperContext ctx, } void BaseMapper::select_sharding_functor(const MapperContext ctx, - const Copy& copy, + const LegionCopy& copy, const SelectShardingFunctorInput& input, SelectShardingFunctorOutput& output) { - output.chosen_functor = 0; + // TODO: Copies can have key stores in the future + output.chosen_functor = find_sharding_functor_by_projection_functor(0); } void BaseMapper::select_close_sources(const MapperContext ctx, @@ -1730,56 +1273,26 @@ void BaseMapper::map_partition(const MapperContext ctx, const MapPartitionInput& input, MapPartitionOutput& output) { - const RegionRequirement& req = partition.requirement; - output.chosen_instances.resize(req.privilege_fields.size()); - const std::vector& valid = input.valid_instances; - std::vector needed_acquires; - uint32_t fidx = 0; - const bool memoize = true; - for (auto fid : req.privilege_fields) { - if (find_existing_instance(ctx, - req.region, - fid, - local_system_memory, - output.chosen_instances[fidx], - Strictness::strict) || - map_raw_array(ctx, - partition, - 0, - req.region, - fid, - local_system_memory, - Processor::NO_PROC, - valid, - output.chosen_instances[fidx], - memoize)) { - needed_acquires.push_back(output.chosen_instances[fidx]); - } - ++fidx; - } - while (!needed_acquires.empty() && - !runtime->acquire_and_filter_instances(ctx, needed_acquires, true /*filter on acquire*/)) { - assert(!needed_acquires.empty()); - std::set failed_instances; - filter_failed_acquires(ctx, needed_acquires, failed_instances); - // Now go through all the fields for the instances and try and remap - auto fit = req.privilege_fields.begin(); - for (uint32_t idx = 0; idx < output.chosen_instances.size(); idx++, fit++) { - if (failed_instances.find(output.chosen_instances[idx]) == failed_instances.end()) continue; - // Now try to remap it - if (map_raw_array(ctx, - partition, - 0 /*idx*/, - req.region, - *fit, - local_system_memory, - Processor::NO_PROC, - valid, - output.chosen_instances[idx], - memoize)) - needed_acquires.push_back(output.chosen_instances[idx]); - } - } + Processor target_proc{Processor::NO_PROC}; + if (!local_omps.empty()) + target_proc = local_omps.front(); + else + target_proc = local_cpus.front(); + + auto store_target = default_store_targets(target_proc.kind()).front(); + +#ifdef DEBUG_LEGATE + assert(partition.requirement.instance_fields.size() == 1); 
+#endif + + Store store(legion_runtime->get_mapper_runtime(), ctx, &partition.requirement); + std::vector mappings; + mappings.push_back(StoreMapping::default_mapping(store, store_target, false)); + + std::map*> output_map; + for (auto* req : mappings.front().requirements()) output_map[req] = &output.chosen_instances; + + map_legate_stores(ctx, partition, mappings, target_proc, output_map); } void BaseMapper::select_partition_sources(const MapperContext ctx, @@ -1803,7 +1316,7 @@ void BaseMapper::select_sharding_functor(const MapperContext ctx, const SelectShardingFunctorInput& input, SelectShardingFunctorOutput& output) { - output.chosen_functor = 0; + output.chosen_functor = find_sharding_functor_by_projection_functor(0); } void BaseMapper::select_sharding_functor(const MapperContext ctx, @@ -1811,7 +1324,9 @@ void BaseMapper::select_sharding_functor(const MapperContext ctx, const SelectShardingFunctorInput& input, SelectShardingFunctorOutput& output) { - output.chosen_functor = 0; + output.chosen_functor = fill.is_index_space + ? find_sharding_functor_by_key_store_projection({fill.requirement}) + : find_sharding_functor_by_projection_functor(0); } void BaseMapper::configure_context(const MapperContext ctx, diff --git a/src/core/mapping/base_mapper.h b/src/core/mapping/base_mapper.h index d81898411..850427b6d 100644 --- a/src/core/mapping/base_mapper.h +++ b/src/core/mapping/base_mapper.h @@ -18,6 +18,7 @@ #include #include +#include #include "legion.h" @@ -256,34 +257,22 @@ class BaseMapper : public Legion::Mapping::Mapper, public LegateMapper { protected: Legion::Memory get_target_memory(Legion::Processor proc, StoreTarget target); - bool find_existing_instance(const Legion::Mapping::MapperContext ctx, - Legion::LogicalRegion region, - Legion::FieldID fid, - Legion::Memory target_memory, - Legion::Mapping::PhysicalInstance& result, - Strictness strictness = Strictness::hint, - bool acquire_instance_lock = true); + using OutputMap = + std::map*>; + void map_legate_stores(const Legion::Mapping::MapperContext ctx, + const Legion::Mappable& mappable, + std::vector& mappings, + Legion::Processor target_proc, + OutputMap& output_map); + void tighten_write_policies(const Legion::Mappable& mappable, + std::vector& mappings); bool map_legate_store(const Legion::Mapping::MapperContext ctx, const Legion::Mappable& mappable, const StoreMapping& mapping, - std::vector> reqs, + const std::set& reqs, Legion::Processor target_proc, Legion::Mapping::PhysicalInstance& result, bool can_fail); - bool map_raw_array(const Legion::Mapping::MapperContext ctx, - const Legion::Mappable& mappable, - unsigned index, - Legion::LogicalRegion region, - Legion::FieldID fid, - Legion::Memory target_memory, - Legion::Processor target_proc, - const std::vector& valid, - Legion::Mapping::PhysicalInstance& result, - bool memoize, - Legion::ReductionOpID redop = 0); - void filter_failed_acquires(const Legion::Mapping::MapperContext ctx, - std::vector& needed_acquires, - std::set& failed_acquires); void report_failed_mapping(const Legion::Mappable& mappable, unsigned index, Legion::Memory target_memory, @@ -297,15 +286,40 @@ class BaseMapper : public Legion::Mapping::Mapper, public LegateMapper { bool has_variant(const Legion::Mapping::MapperContext ctx, const Legion::Task& task, Legion::Processor::Kind kind); - Legion::VariantID find_variant(const Legion::Mapping::MapperContext ctx, - const Legion::Task& task, - Legion::Processor::Kind kind); + std::optional find_variant(const Legion::Mapping::MapperContext ctx, + const 
Legion::Task& task, + Legion::Processor::Kind kind); private: void generate_prime_factors(); void generate_prime_factor(const std::vector& processors, Legion::Processor::Kind kind); + protected: + template + decltype(auto) dispatch(TaskTarget target, Functor functor) + { + switch (target) { + case TaskTarget::CPU: return functor(local_cpus); + case TaskTarget::GPU: return functor(local_gpus); + case TaskTarget::OMP: return functor(local_omps); + } + assert(false); + return functor(local_cpus); + } + template + decltype(auto) dispatch(Legion::Processor::Kind kind, Functor functor) + { + switch (kind) { + case Legion::Processor::LOC_PROC: return functor(local_cpus); + case Legion::Processor::TOC_PROC: return functor(local_gpus); + case Legion::Processor::OMP_PROC: return functor(local_omps); + default: LEGATE_ABORT; + } + assert(false); + return functor(local_cpus); + } + protected: const std::vector get_processor_grid(Legion::Processor::Kind kind, int32_t ndim); void slice_auto_task(const Legion::Mapping::MapperContext ctx, @@ -321,6 +335,10 @@ class BaseMapper : public Legion::Mapping::Mapper, public LegateMapper { const SliceTaskInput& input, SliceTaskOutput& output); + protected: + Legion::ShardingID find_sharding_functor_by_key_store_projection( + const std::vector& requirements); + protected: static inline bool physical_sort_func( const std::pair& left, @@ -343,15 +361,14 @@ class BaseMapper : public Legion::Mapping::Mapper, public LegateMapper { std::vector local_cpus; std::vector local_gpus; std::vector local_omps; // OpenMP processors - std::vector local_ios; // I/O processors - std::vector local_pys; // Python processors protected: Legion::Memory local_system_memory, local_zerocopy_memory; std::map local_frame_buffers; std::map local_numa_domains; protected: - std::map, Legion::VariantID> leaf_variants; + using VariantCacheKey = std::pair; + std::map> variants; protected: InstanceManager* local_instances; diff --git a/src/core/mapping/mapping.cc b/src/core/mapping/mapping.cc index bbd75ee53..5d1aa971c 100644 --- a/src/core/mapping/mapping.cc +++ b/src/core/mapping/mapping.cc @@ -112,23 +112,42 @@ void InstanceMappingPolicy::populate_layout_constraints( return std::move(policy); } -bool StoreMapping::for_unbound_stores() const +bool StoreMapping::for_future() const +{ + for (auto& store : stores) return store.is_future(); + assert(false); + return false; +} + +bool StoreMapping::for_unbound_store() const { for (auto& store : stores) return store.unbound(); assert(false); return false; } +const Store& StoreMapping::store() const +{ +#ifdef DEBUG_LEGATE + assert(stores.size() == 1); +#endif + return stores.front(); +} + uint32_t StoreMapping::requirement_index() const { +#ifdef DEBUG_LEGATE assert(stores.size() > 0); uint32_t result = -1U; for (auto& store : stores) { - auto idx = store.region_field().index(); + auto idx = store.requirement_index(); assert(result == -1U || result == idx); result = idx; } return result; +#else + return stores.front().requirement_index(); +#endif } std::set StoreMapping::requirement_indices() const @@ -141,6 +160,18 @@ std::set StoreMapping::requirement_indices() const return std::move(indices); } +std::set StoreMapping::requirements() const +{ + std::set reqs; + for (auto& store : stores) { + if (store.is_future()) continue; + auto* req = store.region_field().get_requirement(); + if (!req->region.exists()) continue; + reqs.insert(req); + } + return std::move(reqs); +} + void StoreMapping::populate_layout_constraints( Legion::LayoutConstraintSet& 
layout_constraints) const { diff --git a/src/core/mapping/mapping.h b/src/core/mapping/mapping.h index 5d5ab8466..2d56d1a67 100644 --- a/src/core/mapping/mapping.h +++ b/src/core/mapping/mapping.h @@ -16,7 +16,7 @@ #pragma once -#include "core/mapping/task.h" +#include "core/mapping/operation.h" namespace legate { namespace mapping { @@ -133,9 +133,14 @@ struct StoreMapping { StoreMapping& operator=(StoreMapping&&) = default; public: - bool for_unbound_stores() const; + bool for_future() const; + bool for_unbound_store() const; + const Store& store() const; + + public: uint32_t requirement_index() const; std::set requirement_indices() const; + std::set requirements() const; public: void populate_layout_constraints(Legion::LayoutConstraintSet& layout_constraints) const; diff --git a/src/core/mapping/task.cc b/src/core/mapping/operation.cc similarity index 57% rename from src/core/mapping/task.cc rename to src/core/mapping/operation.cc index 42549b826..03f34d5b2 100644 --- a/src/core/mapping/task.cc +++ b/src/core/mapping/operation.cc @@ -14,27 +14,28 @@ * */ -#include "core/mapping/task.h" +#include "core/mapping/operation.h" #include "core/utilities/deserializer.h" namespace legate { namespace mapping { using LegionTask = Legion::Task; +using LegionCopy = Legion::Copy; using namespace Legion; using namespace Legion::Mapping; -RegionField::RegionField(const LegionTask* task, int32_t dim, uint32_t idx, FieldID fid) - : task_(task), dim_(dim), idx_(idx), fid_(fid) +RegionField::RegionField(const RegionRequirement* req, int32_t dim, uint32_t idx, FieldID fid) + : req_(req), dim_(dim), idx_(idx), fid_(fid) { } bool RegionField::can_colocate_with(const RegionField& other) const { - auto& my_req = get_requirement(); - auto& other_req = other.get_requirement(); - return my_req.region.get_tree_id() == other_req.region.get_tree_id(); + auto* my_req = get_requirement(); + auto* other_req = other.get_requirement(); + return my_req->region.get_tree_id() == other_req->region.get_tree_id(); } Domain RegionField::domain(MapperRuntime* runtime, const MapperContext context) const @@ -42,15 +43,7 @@ Domain RegionField::domain(MapperRuntime* runtime, const MapperContext context) return runtime->get_index_space_domain(context, get_index_space()); } -const RegionRequirement& RegionField::get_requirement() const -{ - return dim_ > 0 ? 
task_->regions[idx_] : task_->output_regions[idx_]; -} - -IndexSpace RegionField::get_index_space() const -{ - return get_requirement().region.get_index_space(); -} +IndexSpace RegionField::get_index_space() const { return req_->region.get_index_space(); } FutureWrapper::FutureWrapper(uint32_t idx, const Domain& domain) : idx_(idx), domain_(domain) {} @@ -90,27 +83,53 @@ Store::Store(Legion::Mapping::MapperRuntime* runtime, { } +Store::Store(Legion::Mapping::MapperRuntime* runtime, + const Legion::Mapping::MapperContext context, + const Legion::RegionRequirement* requirement) + : is_future_(false), + is_output_store_(false), + dim_(requirement->region.get_dim()), + code_(LegateTypeCode::MAX_TYPE_NUMBER), + redop_id_(-1), + runtime_(runtime), + context_(context) +{ + region_field_ = RegionField(requirement, dim_, 0, requirement->instance_fields.front()); +} + bool Store::can_colocate_with(const Store& other) const { if (is_future() || other.is_future()) return false; - else if (is_reduction() || other.is_reduction()) + else if (unbound() || other.unbound()) return false; + else if (is_reduction() || other.is_reduction()) + return redop() == other.redop() && region_field_.can_colocate_with(other.region_field_); return region_field_.can_colocate_with(other.region_field_); } const RegionField& Store::region_field() const { +#ifdef DEBUG_LEGATE assert(!is_future()); +#endif return region_field_; } const FutureWrapper& Store::future() const { +#ifdef DEBUG_LEGATE assert(is_future()); +#endif return future_; } +RegionField::Id Store::unique_region_field_id() const { return region_field().unique_id(); } + +uint32_t Store::requirement_index() const { return region_field().index(); } + +uint32_t Store::future_index() const { return future().index(); } + Domain Store::domain() const { assert(!unbound()); @@ -126,7 +145,7 @@ Task::Task(const LegionTask* task, const MapperContext context) : task_(task), library_(library) { - MapperDeserializer dez(task, runtime, context); + TaskDeserializer dez(task, runtime, context); inputs_ = dez.unpack>(); outputs_ = dez.unpack>(); reductions_ = dez.unpack>(); @@ -135,5 +154,31 @@ Task::Task(const LegionTask* task, int64_t Task::task_id() const { return library_.get_local_task_id(task_->task_id); } +Copy::Copy(const LegionCopy* copy, MapperRuntime* runtime, const MapperContext context) + : copy_(copy) +{ + CopyDeserializer dez(copy->mapper_data, + copy->mapper_data_size, + {copy->src_requirements, + copy->dst_requirements, + copy->src_indirect_requirements, + copy->dst_indirect_requirements}, + runtime, + context); + inputs_ = dez.unpack>(); + dez.next_requirement_list(); + outputs_ = dez.unpack>(); + dez.next_requirement_list(); + input_indirections_ = dez.unpack>(); + dez.next_requirement_list(); + output_indirections_ = dez.unpack>(); +#ifdef DEBUG_LEGATE + for (auto& input : inputs_) assert(!input.is_future()); + for (auto& output : outputs_) assert(!output.is_future()); + for (auto& input_indirection : input_indirections_) assert(!input_indirection.is_future()); + for (auto& output_indirection : output_indirections_) assert(!output_indirection.is_future()); +#endif +} + } // namespace mapping } // namespace legate diff --git a/src/core/mapping/task.h b/src/core/mapping/operation.h similarity index 77% rename from src/core/mapping/task.h rename to src/core/mapping/operation.h index 2d3ad9d6b..0cc5dc267 100644 --- a/src/core/mapping/task.h +++ b/src/core/mapping/operation.h @@ -32,7 +32,7 @@ class RegionField { public: RegionField() {} - RegionField(const 
Legion::Task* task, int32_t dim, uint32_t idx, Legion::FieldID fid); + RegionField(const Legion::RegionRequirement* req, int32_t dim, uint32_t idx, Legion::FieldID fid); public: RegionField(const RegionField& other) = default; @@ -62,12 +62,12 @@ class RegionField { Legion::FieldID field_id() const { return fid_; } bool unbound() const { return dim_ < 0; } - private: - const Legion::RegionRequirement& get_requirement() const; + public: + const Legion::RegionRequirement* get_requirement() const { return req_; } Legion::IndexSpace get_index_space() const; private: - const Legion::Task* task_{nullptr}; + const Legion::RegionRequirement* req_{nullptr}; int32_t dim_{-1}; uint32_t idx_{-1U}; Legion::FieldID fid_{-1U}; @@ -111,6 +111,10 @@ class Store { const RegionField& region_field, bool is_output_store = false, std::shared_ptr&& transform = nullptr); + // A special constructor to create a mapper view of a store from a region requirement + Store(Legion::Mapping::MapperRuntime* runtime, + const Legion::Mapping::MapperContext context, + const Legion::RegionRequirement* requirement); public: Store(const Store& other) = default; @@ -134,6 +138,11 @@ class Store { const RegionField& region_field() const; const FutureWrapper& future() const; + public: + RegionField::Id unique_region_field_id() const; + uint32_t requirement_index() const; + uint32_t future_index() const; + public: template Legion::Rect shape() const; @@ -188,7 +197,32 @@ class Task { std::vector scalars_; }; +class Copy { + public: + Copy(const Legion::Copy* copy, + Legion::Mapping::MapperRuntime* runtime, + const Legion::Mapping::MapperContext context); + + public: + const std::vector& inputs() const { return inputs_; } + const std::vector& outputs() const { return outputs_; } + const std::vector& input_indirections() const { return input_indirections_; } + const std::vector& output_indirections() const { return output_indirections_; } + + public: + Legion::DomainPoint point() const { return copy_->index_point; } + + private: + const Legion::Copy* copy_; + + private: + std::vector inputs_; + std::vector outputs_; + std::vector input_indirections_; + std::vector output_indirections_; +}; + } // namespace mapping } // namespace legate -#include "core/mapping/task.inl" +#include "core/mapping/operation.inl" diff --git a/src/core/mapping/task.inl b/src/core/mapping/operation.inl similarity index 100% rename from src/core/mapping/task.inl rename to src/core/mapping/operation.inl diff --git a/src/core/utilities/deserializer.cc b/src/core/utilities/deserializer.cc index 1fb0c8b0e..f62f5b50a 100644 --- a/src/core/utilities/deserializer.cc +++ b/src/core/utilities/deserializer.cc @@ -17,7 +17,6 @@ #include "core/utilities/deserializer.h" #include "core/data/scalar.h" #include "core/data/store.h" -#include "core/mapping/task.h" #include "core/utilities/machine.h" #include "legion/legion_c.h" @@ -32,7 +31,7 @@ namespace legate { TaskDeserializer::TaskDeserializer(const LegionTask* task, const std::vector& regions) - : BaseDeserializer(task), + : BaseDeserializer(static_cast(task->args), task->arglen), futures_{task->futures.data(), task->futures.size()}, regions_{regions.data(), regions.size()}, outputs_() @@ -128,15 +127,19 @@ void TaskDeserializer::_unpack(Legion::PhaseBarrier& barrier) namespace mapping { -MapperDeserializer::MapperDeserializer(const LegionTask* task, - MapperRuntime* runtime, - MapperContext context) - : BaseDeserializer(task), runtime_(runtime), context_(context), future_index_(0) +TaskDeserializer::TaskDeserializer(const 
Legion::Task* task, + MapperRuntime* runtime, + MapperContext context) + : BaseDeserializer(static_cast(task->args), task->arglen), + task_(task), + runtime_(runtime), + context_(context), + future_index_(0) { first_task_ = false; } -void MapperDeserializer::_unpack(Store& value) +void TaskDeserializer::_unpack(Store& value) { auto is_future = unpack(); auto is_output_region = unpack(); @@ -159,7 +162,7 @@ void MapperDeserializer::_unpack(Store& value) } } -void MapperDeserializer::_unpack(FutureWrapper& value) +void TaskDeserializer::_unpack(FutureWrapper& value) { // We still need to deserialize these fields to get to the domain unpack(); @@ -177,13 +180,66 @@ void MapperDeserializer::_unpack(FutureWrapper& value) value = FutureWrapper(future_index_++, domain); } -void MapperDeserializer::_unpack(RegionField& value, bool is_output_region) +void TaskDeserializer::_unpack(RegionField& value, bool is_output_region) +{ + auto dim = unpack(); + auto idx = unpack(); + auto fid = unpack(); + + auto req = is_output_region ? &task_->output_regions[idx] : &task_->regions[idx]; + value = RegionField(req, dim, idx, fid); +} + +CopyDeserializer::CopyDeserializer(const void* args, + size_t arglen, + std::vector&& all_requirements, + MapperRuntime* runtime, + MapperContext context) + : BaseDeserializer(static_cast(args), arglen), + all_reqs_(std::forward>(all_requirements)), + curr_reqs_(all_reqs_.begin()), + runtime_(runtime), + context_(context), + req_index_offset_(0) +{ +} + +void CopyDeserializer::next_requirement_list() +{ +#ifdef DEBUG_LEGATE + assert(curr_reqs_ != all_reqs_.end()); +#endif + req_index_offset_ += curr_reqs_->get().size(); + ++curr_reqs_; +} + +void CopyDeserializer::_unpack(Store& value) +{ + auto is_future = unpack(); + auto is_output_region = unpack(); + auto dim = unpack(); + auto code = unpack(); + + auto transform = unpack_transform(); + +#ifdef DEBUG_LEGATE + assert(!is_future && !is_output_region); +#endif + auto redop_id = unpack(); + RegionField rf; + _unpack(rf); + value = + Store(runtime_, context_, dim, code, redop_id, rf, is_output_region, std::move(transform)); +} + +void CopyDeserializer::_unpack(RegionField& value) { auto dim = unpack(); auto idx = unpack(); auto fid = unpack(); - value = RegionField(task_, dim, idx, fid); + auto req = &curr_reqs_->get()[idx]; + value = RegionField(req, dim, idx + req_index_offset_, fid); } } // namespace mapping diff --git a/src/core/utilities/deserializer.h b/src/core/utilities/deserializer.h index 72ed67852..cd0e6aba7 100644 --- a/src/core/utilities/deserializer.h +++ b/src/core/utilities/deserializer.h @@ -23,7 +23,7 @@ #include "core/comm/communicator.h" #include "core/data/scalar.h" #include "core/data/store.h" -#include "core/mapping/task.h" +#include "core/mapping/operation.h" #include "core/utilities/span.h" #include "core/utilities/type_traits.h" #include "core/utilities/typedefs.h" @@ -34,7 +34,7 @@ namespace legate { template class BaseDeserializer { public: - BaseDeserializer(const Legion::Task* task); + BaseDeserializer(const int8_t* args, size_t arglen); public: template @@ -49,8 +49,8 @@ class BaseDeserializer { template != MAX_TYPE_NUMBER>* = nullptr> void _unpack(T& value) { - value = *reinterpret_cast(task_args_.ptr()); - task_args_ = task_args_.subspan(sizeof(T)); + value = *reinterpret_cast(args_.ptr()); + args_ = args_.subspan(sizeof(T)); } public: @@ -69,11 +69,10 @@ class BaseDeserializer { std::shared_ptr unpack_transform(); protected: - const Legion::Task* task_; bool first_task_; private: - Span 
task_args_; + Span args_; }; class TaskDeserializer : public BaseDeserializer { @@ -99,11 +98,11 @@ class TaskDeserializer : public BaseDeserializer { namespace mapping { -class MapperDeserializer : public BaseDeserializer { +class TaskDeserializer : public BaseDeserializer { public: - MapperDeserializer(const Legion::Task* task, - Legion::Mapping::MapperRuntime* runtime, - Legion::Mapping::MapperContext context); + TaskDeserializer(const Legion::Task* task, + Legion::Mapping::MapperRuntime* runtime, + Legion::Mapping::MapperContext context); public: using BaseDeserializer::_unpack; @@ -114,11 +113,42 @@ class MapperDeserializer : public BaseDeserializer { void _unpack(RegionField& value, bool is_output_region); private: + const Legion::Task* task_; Legion::Mapping::MapperRuntime* runtime_; Legion::Mapping::MapperContext context_; uint32_t future_index_; }; +class CopyDeserializer : public BaseDeserializer { + private: + using Requirements = std::vector; + using ReqsRef = std::reference_wrapper; + + public: + CopyDeserializer(const void* args, + size_t arglen, + std::vector&& all_requirements, + Legion::Mapping::MapperRuntime* runtime, + Legion::Mapping::MapperContext context); + + public: + using BaseDeserializer::_unpack; + + public: + void next_requirement_list(); + + public: + void _unpack(Store& value); + void _unpack(RegionField& value); + + private: + std::vector all_reqs_; + std::vector::iterator curr_reqs_; + Legion::Mapping::MapperRuntime* runtime_; + Legion::Mapping::MapperContext context_; + uint32_t req_index_offset_; +}; + } // namespace mapping } // namespace legate diff --git a/src/core/utilities/deserializer.inl b/src/core/utilities/deserializer.inl index caebd1a0d..feeed5238 100644 --- a/src/core/utilities/deserializer.inl +++ b/src/core/utilities/deserializer.inl @@ -17,8 +17,8 @@ namespace legate { template -BaseDeserializer::BaseDeserializer(const Legion::Task* task) - : task_(task), task_args_{static_cast(task->args), task->arglen} +BaseDeserializer::BaseDeserializer(const int8_t* args, size_t arglen) + : args_(Span(args, arglen)) { } @@ -33,8 +33,8 @@ void BaseDeserializer::_unpack(Scalar& value) { auto tuple = unpack(); auto code = unpack(); - value = Scalar(tuple, code, task_args_.ptr()); - task_args_ = task_args_.subspan(value.size()); + value = Scalar(tuple, code, args_.ptr()); + args_ = args_.subspan(value.size()); } template diff --git a/typings/legion_cffi/lib.pyi b/typings/legion_cffi/lib.pyi index 2326b327c..364d43414 100644 --- a/typings/legion_cffi/lib.pyi +++ b/typings/legion_cffi/lib.pyi @@ -126,6 +126,7 @@ def legion_copy_launcher_set_possible_src_indirect_out_of_range( ) -> Any: ... def legion_copy_launcher_set_sharding_space(*args: Any) -> Any: ... def legion_copy_launcher_set_provenance(*args: Any) -> Any: ... +def legion_copy_launcher_set_mapper_arg(*args: Any) -> Any: ... def legion_detach_external_resources(*args: Any) -> Any: ... def legion_domain_affine_transform_identity(*args: Any) -> Any: ... def legion_domain_empty(*args: Any) -> Any: ... @@ -211,6 +212,7 @@ def legion_index_copy_launcher_set_possible_src_indirect_out_of_range( ) -> Any: ... def legion_index_copy_launcher_set_sharding_space(*args: Any) -> Any: ... def legion_index_copy_launcher_set_provenance(*args: Any) -> Any: ... +def legion_index_copy_launcher_set_mapper_arg(*args: Any) -> Any: ... def legion_index_fill_launcher_create_from_future_with_domain( *args: Any, ) -> Any: ... 
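For reference, the reworked copy mapping in the patch above picks a target processor by flattening the copy's index point within its sharding domain (via `linearize(lo, hi, point)`) and indexing the local processor list with that value modulo its size. The sketch below illustrates that scheme in isolation; the standalone `linearize` helper, its row-major ordering, and the GPU ids used here are assumptions made for the example, not the actual Legate/Legion implementation.

```cpp
// Minimal sketch (not the actual Legate helper): flatten a 2-D point that lies
// inside the inclusive rectangle [lo, hi] in row-major order, then pick a local
// processor round-robin from the flattened index.
#include <array>
#include <cstdint>
#include <iostream>
#include <vector>

using Point2 = std::array<int64_t, 2>;

int64_t linearize(const Point2& lo, const Point2& hi, const Point2& p)
{
  int64_t idx = 0;
  for (int dim = 0; dim < 2; ++dim) {
    const int64_t extent = hi[dim] - lo[dim] + 1;  // bounds are inclusive
    idx = idx * extent + (p[dim] - lo[dim]);       // row-major order (an assumption)
  }
  return idx;
}

int main()
{
  const Point2 lo{0, 0}, hi{1, 2};                // a 2 x 3 launch domain
  const std::vector<int> local_gpus{0, 1, 2, 3};  // hypothetical local GPU ids
  for (int64_t i = lo[0]; i <= hi[0]; ++i)
    for (int64_t j = lo[1]; j <= hi[1]; ++j) {
      const int64_t proc_id = linearize(lo, hi, {i, j});
      std::cout << "point (" << i << "," << j << ") -> gpu "
                << local_gpus[proc_id % local_gpus.size()] << "\n";
    }
  return 0;
}
```

Under these assumptions each point of a small launch domain lands on a distinct local processor and wraps around once the domain outgrows the processor list, which mirrors the `proc_id % local_gpus.size()` selection in the mapper change above.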
From eae28ac46a421d395ddc6bdf338a5357574444c5 Mon Sep 17 00:00:00 2001 From: Wonchan Lee Date: Tue, 1 Nov 2022 11:02:06 -0700 Subject: [PATCH 041/121] Support for concurrent launches (#459) * Fixes to use concurrent task launches: * Start using concurrent launches for communicators * Extend the variant registration API for concurrent variants * Python API to mark concurrent tasks * Helper methods in VariantOptions * Mark communicator metatasks concurrent and remove obsolete fences --- legate/core/_legion/task.py | 16 +++++++ legate/core/communicator.py | 7 ++- legate/core/launcher.py | 5 +++ legate/core/operation.py | 28 ++++-------- src/core/task/task.cc | 14 +++--- src/core/task/task.h | 90 ++++++++++++++++++++++--------------- typings/legion_cffi/lib.pyi | 2 + 7 files changed, 96 insertions(+), 66 deletions(-) diff --git a/legate/core/_legion/task.py b/legate/core/_legion/task.py index 7e463abec..670b2796c 100644 --- a/legate/core/_legion/task.py +++ b/legate/core/_legion/task.py @@ -1028,6 +1028,22 @@ def set_sharding_space(self, space: IndexSpace) -> None: self.launcher, space.handle ) + def set_concurrent(self, concurrent: bool) -> None: + """ + Set a flag indicating whether point tasks must execute + concurrently. Setting true to the flag directs the runtime + to make sure the tasks are using a concurrent variant and + also mapped to distinct processors with concurrent + execution guarantee (i.e., no subset of the processors execute + other tasks). + + Parameters + ---------- + concurrent : bool + Whether the point tasks must run concurrently + """ + legion.legion_index_launcher_set_concurrent(self.launcher, concurrent) + @dispatch def launch( self, diff --git a/legate/core/communicator.py b/legate/core/communicator.py index a794ab478..019258a51 100644 --- a/legate/core/communicator.py +++ b/legate/core/communicator.py @@ -101,14 +101,16 @@ def _initialize(self, volume: int) -> FutureMap: task = Task(self._context, self._init_nccl, tag=self._tag) task.add_future(nccl_id) + task.set_concurrent(True) handle = task.execute(Rect([volume])) - self._runtime.issue_execution_fence() return handle def _finalize(self, volume: int, handle: FutureMap) -> None: from .launcher import TaskLauncher as Task task = Task(self._context, self._finalize_nccl, tag=self._tag) + # Finalize may not need to be concurrent, but set it just in case + task.set_concurrent(True) task.add_future_map(handle) task.execute(Rect([volume])) @@ -161,8 +163,8 @@ def _initialize(self, volume: int) -> FutureMap: for i in range(volume): f = mapping_table_fm.get_future(Point([i])) task.add_future(f) + task.set_concurrent(True) handle = task.execute(Rect([volume])) - self._runtime.issue_execution_fence() return handle def _finalize(self, volume: int, handle: FutureMap) -> None: @@ -170,5 +172,6 @@ def _finalize(self, volume: int, handle: FutureMap) -> None: task = Task(self._context, self._finalize_cpucoll, tag=self._tag) task.add_future_map(handle) + task.set_concurrent(True) task.execute(Rect([volume])) self._runtime.issue_execution_fence() diff --git a/legate/core/launcher.py b/legate/core/launcher.py index ce87ffada..b19b357c0 100644 --- a/legate/core/launcher.py +++ b/legate/core/launcher.py @@ -724,6 +724,7 @@ def __init__( self._insert_barrier = False self._can_raise_exception = False self._provenance = provenance + self._concurrent = False @property def library_task_id(self) -> int: @@ -875,6 +876,9 @@ def insert_barrier(self) -> None: def set_can_raise_exception(self, can_raise_exception: bool) -> None: 
self._can_raise_exception = can_raise_exception + def set_concurrent(self, concurrent: bool) -> None: + self._concurrent = concurrent + def set_sharding_space(self, space: IndexSpace) -> None: self._sharding_space = space @@ -921,6 +925,7 @@ def build_task( out_req.add(task, fields) for comm in self._comms: task.add_point_future(ArgumentMap(future_map=comm)) + task.set_concurrent(len(self._comms) > 0 or self._concurrent) for future_map in self._future_map_args: task.add_point_future(ArgumentMap(future_map=future_map)) return task diff --git a/legate/core/operation.py b/legate/core/operation.py index a158ece7d..998276e6a 100644 --- a/legate/core/operation.py +++ b/legate/core/operation.py @@ -244,6 +244,7 @@ def __init__( self._exn_types: list[type] = [] self._tb_repr: Union[None, str] = None self._side_effect = False + self._concurrent = False @property def side_effect(self) -> bool: @@ -253,8 +254,11 @@ def set_side_effect(self, side_effect: bool) -> None: self._side_effect = side_effect @property - def uses_communicator(self) -> bool: - return len(self._comm_args) > 0 + def concurrent(self) -> bool: + return self._concurrent + + def set_concurrent(self, concurrent: bool) -> None: + self._concurrent = concurrent def get_name(self) -> str: libname = self.context.library.get_name() @@ -615,25 +619,17 @@ def get_requirement( self._add_scalar_args_to_launcher(launcher) launcher.set_can_raise_exception(self.can_raise_exception) + launcher.set_concurrent(self.concurrent) launch_domain = strategy.launch_domain if strategy.parallel else None self._add_communicators(launcher, launch_domain) - # TODO: For now we make sure no other operations are interleaved with - # the set of tasks that use a communicator. In the future, the - # communicator monad will do this for us. - if self.uses_communicator: - self._context.issue_execution_fence() - result: Union[Future, FutureMap] if launch_domain is not None: result = launcher.execute(launch_domain) else: result = launcher.execute_single() - if self.uses_communicator: - self._context.issue_execution_fence() - self._demux_scalar_stores(result, launch_domain) @@ -776,20 +772,12 @@ def launch(self, strategy: Strategy) -> None: self._add_scalar_args_to_launcher(launcher) launcher.set_can_raise_exception(self.can_raise_exception) + launcher.set_concurrent(self.concurrent) self._add_communicators(launcher, self._launch_domain) - # TODO: For now we make sure no other operations are interleaved with - # the set of tasks that use a communicator. In the future, the - # communicator monad will do this for us. 
- if self.uses_communicator: - self._context.issue_execution_fence() - result = launcher.execute(self._launch_domain) - if self.uses_communicator: - self._context.issue_execution_fence() - self._demux_scalar_stores(result, self._launch_domain) diff --git a/src/core/task/task.cc b/src/core/task/task.cc index 014c6f801..51ac3e1c1 100644 --- a/src/core/task/task.cc +++ b/src/core/task/task.cc @@ -27,10 +27,7 @@ void LegateTaskRegistrar::record_variant(TaskID tid, TaskLayoutConstraintSet& layout_constraints, LegateVariantCode var, Processor::Kind kind, - bool leaf, - bool inner, - bool idempotent, - size_t ret_size) + const VariantOptions& options) { assert((kind == Processor::LOC_PROC) || (kind == Processor::TOC_PROC) || (kind == Processor::OMP_PROC)); @@ -44,15 +41,16 @@ void LegateTaskRegistrar::record_variant(TaskID tid, task_name, descriptor, var, - ret_size)); + options.return_size)); auto& registrar = pending_task_variants_.back(); registrar.execution_constraints.swap(execution_constraints); registrar.layout_constraints.swap(layout_constraints); registrar.add_constraint(ProcessorConstraint(kind)); - registrar.set_leaf(leaf); - registrar.set_inner(inner); - registrar.set_idempotent(idempotent); + registrar.set_leaf(options.leaf); + registrar.set_inner(options.inner); + registrar.set_idempotent(options.idempotent); + registrar.set_concurrent(options.concurrent); } void LegateTaskRegistrar::register_all_tasks(Runtime* runtime, LibraryContext& context) diff --git a/src/core/task/task.h b/src/core/task/task.h index f86e9987c..c06006f49 100644 --- a/src/core/task/task.h +++ b/src/core/task/task.h @@ -35,6 +35,35 @@ namespace legate { // We're going to allow for each task to use only up to 341 scalar output stores constexpr size_t LEGATE_MAX_SIZE_SCALAR_RETURN = 4096; +struct VariantOptions { + bool leaf{true}; + bool inner{false}; + bool idempotent{false}; + bool concurrent{false}; + size_t return_size{LEGATE_MAX_SIZE_SCALAR_RETURN}; + + VariantOptions& with_leaf(bool _leaf) + { + leaf = _leaf; + return *this; + } + VariantOptions& with_inner(bool _inner) + { + inner = _inner; + return *this; + } + VariantOptions& with_idempotent(bool _idempotent) + { + idempotent = _idempotent; + return *this; + } + VariantOptions& with_concurrent(bool _concurrent) + { + concurrent = _concurrent; + return *this; + } +}; + using LegateVariantImpl = void (*)(TaskContext&); template @@ -65,11 +94,6 @@ class LegateTask { static const bool value = (sizeof(test(0)) == sizeof(__yes)); }; - public: - static void register_variants(); - template - static void register_variants_with_return(); - public: static const char* task_name() { @@ -129,33 +153,24 @@ class LegateTask { Legion::TaskLayoutConstraintSet& layout_constraints, LegateVariantCode var, Legion::Processor::Kind kind, - bool leaf = false, - bool inner = false, - bool idempotent = false) + const VariantOptions& options) { // Construct the code descriptor for this task so that the library // can register it later when it is ready Legion::CodeDescriptor desc(legate_task_wrapper); auto task_id = T::TASK_ID; - T::Registrar::record_variant(task_id, - T::task_name(), - desc, - execution_constraints, - layout_constraints, - var, - kind, - leaf, - inner, - idempotent, - LEGATE_MAX_SIZE_SCALAR_RETURN); + T::Registrar::record_variant( + task_id, T::task_name(), desc, execution_constraints, layout_constraints, var, kind, options); } + static void register_variants( + const std::map& all_options = {}); }; template class RegisterCPUVariant { public: - static void 
register_variant() + static void register_variant(const VariantOptions& options) { Legion::ExecutionConstraintSet execution_constraints; Legion::TaskLayoutConstraintSet layout_constraints; @@ -163,14 +178,14 @@ class RegisterCPUVariant { layout_constraints, LEGATE_CPU_VARIANT, Legion::Processor::LOC_PROC, - true /*leaf*/); + options); } }; template class RegisterCPUVariant { public: - static void register_variant() + static void register_variant(const VariantOptions& options) { // Do nothing } @@ -179,7 +194,7 @@ class RegisterCPUVariant { template class RegisterOMPVariant { public: - static void register_variant() + static void register_variant(const VariantOptions& options) { Legion::ExecutionConstraintSet execution_constraints; Legion::TaskLayoutConstraintSet layout_constraints; @@ -187,14 +202,14 @@ class RegisterOMPVariant { layout_constraints, LEGATE_OMP_VARIANT, Legion::Processor::OMP_PROC, - true /*leaf*/); + options); } }; template class RegisterOMPVariant { public: - static void register_variant() + static void register_variant(const VariantOptions& options) { // Do nothing } @@ -203,7 +218,7 @@ class RegisterOMPVariant { template class RegisterGPUVariant { public: - static void register_variant() + static void register_variant(const VariantOptions& options) { Legion::ExecutionConstraintSet execution_constraints; Legion::TaskLayoutConstraintSet layout_constraints; @@ -211,25 +226,31 @@ class RegisterGPUVariant { layout_constraints, LEGATE_GPU_VARIANT, Legion::Processor::TOC_PROC, - true /*leaf*/); + options); } }; template class RegisterGPUVariant { public: - static void register_variant() + static void register_variant(const VariantOptions& options) { // Do nothing } }; template -/*static*/ void LegateTask::register_variants() +/*static*/ void LegateTask::register_variants( + const std::map& all_options) { - RegisterCPUVariant, HasCPUVariant::value>::register_variant(); - RegisterOMPVariant, HasOMPVariant::value>::register_variant(); - RegisterGPUVariant, HasGPUVariant::value>::register_variant(); + // Make a copy of the map of options so that we can do find-or-create on it + auto all_options_copy = all_options; + RegisterCPUVariant, HasCPUVariant::value>::register_variant( + all_options_copy[LEGATE_CPU_VARIANT]); + RegisterOMPVariant, HasOMPVariant::value>::register_variant( + all_options_copy[LEGATE_OMP_VARIANT]); + RegisterGPUVariant, HasGPUVariant::value>::register_variant( + all_options_copy[LEGATE_GPU_VARIANT]); } class LegateTaskRegistrar { @@ -241,10 +262,7 @@ class LegateTaskRegistrar { Legion::TaskLayoutConstraintSet& layout_constraints, LegateVariantCode var, Legion::Processor::Kind kind, - bool leaf, - bool inner, - bool idempotent, - size_t ret_size); + const VariantOptions& options); public: void register_all_tasks(Legion::Runtime* runtime, LibraryContext& context); diff --git a/typings/legion_cffi/lib.pyi b/typings/legion_cffi/lib.pyi index 364d43414..6ae3b017a 100644 --- a/typings/legion_cffi/lib.pyi +++ b/typings/legion_cffi/lib.pyi @@ -249,6 +249,7 @@ def legion_index_launcher_execute_outputs(*args: Any) -> Any: ... def legion_index_launcher_execute_reduction_and_outputs(*args: Any) -> Any: ... def legion_index_launcher_set_sharding_space(*args: Any) -> Any: ... def legion_index_launcher_set_provenance(*args: Any) -> Any: ... +def legion_index_launcher_set_concurrent(*args: Any) -> Any: ... def legion_index_partition_create_by_domain(*args: Any) -> Any: ... def legion_index_partition_create_by_domain_future_map(*args: Any) -> Any: ... 
def legion_index_partition_create_by_image(*args: Any) -> Any: ... @@ -501,6 +502,7 @@ __all__ = ( "legion_index_launcher_execute_reduction_and_outputs", "legion_index_launcher_set_sharding_space", "legion_index_launcher_set_provenance", + "legion_index_launcher_set_concurrent", "legion_index_partition_create_by_domain", "legion_index_partition_create_by_domain_future_map", "legion_index_partition_create_by_image", From f934afea8123ba10ee45f9afd8754207e9f5c17e Mon Sep 17 00:00:00 2001 From: Wonchan Lee Date: Tue, 1 Nov 2022 15:44:17 -0700 Subject: [PATCH 042/121] Some quality-of-life changes (#458) * Make is_complex take type codes just like the others and add is_complex_type instead * Minimize the header dependencies for span.h --- src/core/utilities/span.h | 3 ++- src/core/utilities/type_traits.h | 18 +++++++++++++++--- 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/src/core/utilities/span.h b/src/core/utilities/span.h index f35caf41c..a4fd12a8c 100644 --- a/src/core/utilities/span.h +++ b/src/core/utilities/span.h @@ -16,7 +16,8 @@ #pragma once -#include "legion.h" +#include +#include namespace legate { diff --git a/src/core/utilities/type_traits.h b/src/core/utilities/type_traits.h index b172f5b2c..4d8b324b4 100644 --- a/src/core/utilities/type_traits.h +++ b/src/core/utilities/type_traits.h @@ -171,16 +171,28 @@ struct is_floating_point { static constexpr bool value = std::is_floating_point>::value; }; -template +template struct is_complex : std::false_type { }; template <> -struct is_complex> : std::true_type { +struct is_complex : std::true_type { +}; + +template <> +struct is_complex : std::true_type { +}; + +template +struct is_complex_type : std::false_type { +}; + +template <> +struct is_complex_type> : std::true_type { }; template <> -struct is_complex> : std::true_type { +struct is_complex_type> : std::true_type { }; } // namespace legate From 1c616ccfb65b5eb0a3775f6728b868a9ffc3a76f Mon Sep 17 00:00:00 2001 From: Wonchan Lee Date: Wed, 2 Nov 2022 11:23:44 -0700 Subject: [PATCH 043/121] Missing LEGATE_ABORT (#462) --- src/core/mapping/base_mapper.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/src/core/mapping/base_mapper.cc b/src/core/mapping/base_mapper.cc index 9ae61f62b..dcc393023 100644 --- a/src/core/mapping/base_mapper.cc +++ b/src/core/mapping/base_mapper.cc @@ -890,6 +890,7 @@ void BaseMapper::report_failed_mapping(const Mappable& mappable, mappable.get_unique_id(), memory_kinds[target_memory.kind()], target_memory.id); + LEGATE_ABORT; } void BaseMapper::select_task_variant(const MapperContext ctx, From 2ede57f728a74479c207cd12c5e68924ab6b6185 Mon Sep 17 00:00:00 2001 From: Bryan Van de Ven Date: Wed, 2 Nov 2022 16:49:25 -0700 Subject: [PATCH 044/121] Call bind.sh unconditionally (#461) --- bind.sh | 193 ++++++++++++++--------- legate/driver/command.py | 28 ++-- tests/unit/legate/driver/test_command.py | 30 +++- 3 files changed, 155 insertions(+), 96 deletions(-) diff --git a/bind.sh b/bind.sh index edb5fd05b..7e23b7acf 100755 --- a/bind.sh +++ b/bind.sh @@ -17,88 +17,131 @@ set -euo pipefail -# Usage: bind.sh [--cpus ] [--gpus ] [--mems ] [--nics ] ... -# specifies the resources to bind each node-local rank to, with ranks -# separated by /, e.g. 0,1/2,3/4,5/6,7 for 4 ranks per node. 
- -# Detect node-local rank based on launcher -IDX=none -case "$1" in - mpirun) IDX="$OMPI_COMM_WORLD_LOCAL_RANK" ;; - jsrun) IDX="$OMPI_COMM_WORLD_LOCAL_RANK" ;; - srun) IDX="$SLURM_LOCALID" ;; - local) IDX=0 ;; - none) IDX="${SLURM_LOCALID:-${OMPI_COMM_WORLD_LOCAL_RANK:-${MV2_COMM_WORLD_LOCAL_RANK:-none}}}" ;; +help() { + cat 1>&2 <&2 + help + ;; + esac + shift 2 +done + +case "$launcher" in + mpirun) rank="${OMPI_COMM_WORLD_LOCAL_RANK:-unknown}" ;; + jsrun ) rank="${OMPI_COMM_WORLD_LOCAL_RANK:-unknown}" ;; + srun ) rank="${SLURM_LOCALID:-unknown}" ;; + auto ) rank="${SLURM_LOCALID:-${OMPI_COMM_WORLD_LOCAL_RANK:-${MV2_COMM_WORLD_LOCAL_RANK:-unknown}}}" ;; + local ) rank="0" ;; + *) + echo "Unexpected launcher value: $launcher" 1>&2 + help + ;; esac -shift -if [[ "$IDX" == "none" ]]; then - echo "Error: Cannot detect node-local rank" 1>&2 + +if [[ "$rank" == "unknown" ]]; then + echo "Error: Could not determine node-local rank" 1>&2 exit 1 fi -# Read binding specifications -while [[ $# -gt 0 ]]; do - case "$1" in - --cpus) - CPUS=(${2//\// }) - if [[ "$IDX" -ge "${#CPUS[@]}" ]]; then - echo "Error: Incomplete CPU binding specification" 1>&2 - exit 1 - fi - ;; - --gpus) - GPUS=(${2//\// }) - if [[ "$IDX" -ge "${#GPUS[@]}" ]]; then - echo "Error: Incomplete GPU binding specification" 1>&2 - exit 1 - fi - ;; - --mems) - MEMS=(${2//\// }) - if [[ "$IDX" -ge "${#MEMS[@]}" ]]; then - echo "Error: Incomplete MEM binding specification" 1>&2 - exit 1 - fi - ;; - --nics) - NICS=(${2//\// }) - if [[ "$IDX" -ge "${#NICS[@]}" ]]; then - echo "Error: Incomplete NIC binding specification" 1>&2 - exit 1 - fi - ;; - *) - break - ;; - esac - shift 2 -done +export LEGATE_RANK="$rank" + +if [ -n "${cpus+x}" ]; then + cpus=(${cpus//\// }) + if [[ "$rank" -ge "${#cpus[@]}" ]]; then + echo "Error: Incomplete CPU binding specification" 1>&2 + exit 1 + fi +fi + +if [ -n "${gpus+x}" ]; then + gpus=(${gpus//\// }) + if [[ "$rank" -ge "${#gpus[@]}" ]]; then + echo "Error: Incomplete GPU binding specification" 1>&2 + exit 1 + fi + export CUDA_VISIBLE_DEVICES="${gpus[$rank]}" +fi -# Prepare environment -if [[ -n "${GPUS+x}" ]]; then - export CUDA_VISIBLE_DEVICES="${GPUS[$IDX]}" +if [ -n "${mems+x}" ]; then + mems=(${mems//\// }) + if [[ "$rank" -ge "${#mems[@]}" ]]; then + echo "Error: Incomplete MEM binding specification" 1>&2 + exit 1 + fi fi -if [[ -n "${NICS+x}" ]]; then - # Set all potentially relevant variables, hopefully they are ignored if we - # are not using the corresponding network. 
- NIC="${NICS[$IDX]}" - export UCX_NET_DEVICES="${NIC//,/:1,}":1 - export NCCL_IB_HCA="$NIC" - NIC_ARR=(${NIC//,/ }) - export GASNET_NUM_QPS="${#NIC_ARR[@]}" - export GASNET_IBV_PORTS="${NIC//,/+}" + +if [ -n "${nics+x}" ]; then + nics=(${nics//\// }) + if [[ "$rank" -ge "${#nics[@]}" ]]; then + echo "Error: Incomplete NIC binding specification" 1>&2 + exit 1 + fi + + # set all potentially relevant variables (hopefully they are ignored if we + # are not using the corresponding network) + nic="${nics[$rank]}" + nic_array=(${nic//,/ }) + export UCX_NET_DEVICES="${nic//,/:1,}":1 + export NCCL_IB_HCA="$nic" + export GASNET_NUM_QPS="${#nic_array[@]}" + export GASNET_IBV_PORTS="${nic//,/+}" fi -# Prepare command -if command -v numactl &> /dev/null; then - if [[ -n "${CPUS+x}" ]]; then - set -- --physcpubind "${CPUS[$IDX]}" "$@" - fi - if [[ -n "${MEMS+x}" ]]; then - set -- --membind "${MEMS[$IDX]}" "$@" - fi - set -- numactl "$@" -elif [[ -n "${CPUS+x}" || -n "${MEMS+x}" ]]; then - echo "Warning: numactl is not available, cannot bind to cores or memories" 1>&2 +# numactl is only needed if cpu or memory pinning was requested +if [[ -n "${cpus+x}" || -n "${mems+x}" ]]; then + if command -v numactl &> /dev/null; then + if [[ -n "${cpus+x}" ]]; then + set -- --physcpubind "${cpus[$rank]}" "$@" + fi + if [[ -n "${mems+x}" ]]; then + set -- --membind "${mems[$rank]}" "$@" + fi + set -- numactl "$@" + else + echo "Warning: numactl is not available, cannot bind to cores or memories" 1>&2 + fi fi + exec "$@" diff --git a/legate/driver/command.py b/legate/driver/command.py index f45a10c7c..0e72cfe7b 100644 --- a/legate/driver/command.py +++ b/legate/driver/command.py @@ -30,21 +30,17 @@ def cmd_bind( config: ConfigProtocol, system: System, launcher: Launcher ) -> CommandPart: - cpu_bind = config.binding.cpu_bind - mem_bind = config.binding.mem_bind - gpu_bind = config.binding.gpu_bind - nic_bind = config.binding.nic_bind - - if all(x is None for x in (cpu_bind, mem_bind, gpu_bind, nic_bind)): - return () - ranks = config.multi_node.ranks + if launcher.kind == "none": + bind_launcher_arg = "local" if ranks == 1 else "auto" + else: + bind_launcher_arg = launcher.kind + opts: CommandPart = ( str(system.legate_paths.bind_sh_path), - "local" - if launcher.kind == "none" and ranks == 1 - else str(launcher.kind), + "--launcher", + bind_launcher_arg, ) ranks_per_node = config.multi_node.ranks_per_node @@ -56,17 +52,17 @@ def check_bind_ranks(name: str, binding: str) -> None: raise RuntimeError(errmsg.format(name=name)) bindings = ( - ("cpu", cpu_bind), - ("gpu", gpu_bind), - ("mem", mem_bind), - ("nic", nic_bind), + ("cpu", config.binding.cpu_bind), + ("gpu", config.binding.gpu_bind), + ("mem", config.binding.mem_bind), + ("nic", config.binding.nic_bind), ) for name, binding in bindings: if binding is not None: check_bind_ranks(name, binding) opts += (f"--{name}s", binding) - return opts + return opts + ("--",) def cmd_gdb( diff --git a/tests/unit/legate/driver/test_command.py b/tests/unit/legate/driver/test_command.py index 29d4a8632..739dd7f9c 100644 --- a/tests/unit/legate/driver/test_command.py +++ b/tests/unit/legate/driver/test_command.py @@ -67,7 +67,8 @@ def test_default(self, genobjs: GenObjs) -> None: result = m.cmd_bind(config, system, launcher) - assert result == () + bind_sh = str(system.legate_paths.bind_sh_path) + assert result == (bind_sh, "--launcher", "local", "--") @pytest.mark.parametrize("kind", ("cpu", "gpu", "mem", "nic")) def test_basic_local(self, genobjs: GenObjs, kind: str) -> None: @@ 
-76,7 +77,14 @@ def test_basic_local(self, genobjs: GenObjs, kind: str) -> None: result = m.cmd_bind(config, system, launcher) bind_sh = str(system.legate_paths.bind_sh_path) - assert result == (bind_sh, "local", f"--{kind}s", "1") + assert result == ( + bind_sh, + "--launcher", + "local", + f"--{kind}s", + "1", + "--", + ) @pytest.mark.parametrize("launch", ("none", "mpirun", "jsrun", "srun")) def test_combo_local( @@ -101,14 +109,17 @@ def test_combo_local( result = m.cmd_bind(config, system, launcher) bind_sh = str(system.legate_paths.bind_sh_path) - assert result[:2] == ( + assert result[:3] == ( bind_sh, + "--launcher", "local" if launch == "none" else launch, ) - x = iter(result[2:]) + x = iter(result[3:]) for name, binding in zip(x, x): # pairwise assert f"{name} {binding}" in "--cpus 1 --gpus 1 --nics 1 --mems 1" + assert result[-1] == "--" + @pytest.mark.parametrize("launch", ("none", "mpirun", "jsrun", "srun")) @pytest.mark.parametrize("rank_var", RANK_ENV_VARS) @pytest.mark.parametrize("kind", ("cpu", "gpu", "mem", "nic")) @@ -127,8 +138,17 @@ def test_ranks_good( result = m.cmd_bind(config, system, launcher) + launcher_arg = "auto" if launch == "none" else launch + bind_sh = str(system.legate_paths.bind_sh_path) - assert result == (bind_sh, launch, f"--{kind}s", "1/2") + assert result == ( + bind_sh, + "--launcher", + launcher_arg, + f"--{kind}s", + "1/2", + "--", + ) @pytest.mark.parametrize("binding", ("1", "1/2/3")) @pytest.mark.parametrize("rank_var", RANK_ENV_VARS) From 2a1ca972d5bb942fc6c3a2cc84777844a75b20b9 Mon Sep 17 00:00:00 2001 From: Paul Taylor Date: Fri, 4 Nov 2022 13:18:29 -0700 Subject: [PATCH 045/121] Make `install.py` reconfigure editable installs when build type changes (#455) * pass -mindepth 1 so we don't accidentally delete the search root if it matches one of the `-d` names * pass unknown flags to `pip install` command * use CMAKE_ARGS instead of SKBUILD_CONFIGURE_OPTIONS to work around scikit-build bug (fixes #372) * replace SKBUILD_CONFIGURE_OPTIONS with CMAKE_ARGS everywhere --- BUILD.md | 50 +++++++++++-------- install.py | 14 ++++-- legate/util/fs.py | 4 +- scripts/build-install.sh | 4 +- scripts/build-no-install.sh | 4 +- scripts/build-separately-no-install.sh | 4 +- scripts/build-with-legion-no-install.sh | 4 +- ...build-with-legion-separately-no-install.sh | 4 +- ...uninstall-global-legion-and-legate-core.sh | 36 ++++++------- 9 files changed, 70 insertions(+), 54 deletions(-) diff --git a/BUILD.md b/BUILD.md index 320059f9f..2495f4d3b 100644 --- a/BUILD.md +++ b/BUILD.md @@ -17,7 +17,7 @@ limitations under the License. # TL;DR -1) Check if there are specialized scripts available for your cluster at https://github.com/nv-legate/quickstart. +1) Check if there are specialized scripts available for your cluster at [nv-legate/quickstart](https://github.com/nv-legate/quickstart). 2) [Install dependencies from conda](#getting-dependencies-through-conda) 3) [Build using install.py](#using-installpy) @@ -33,7 +33,7 @@ Please use the `scripts/generate-conda-envs.py` script to create a conda environment file listing all the packages that are required to build, run and test Legate Core and all downstream libraries. For example: -``` +```shell $ ./scripts/generate-conda-envs.py --python 3.10 --ctk 11.7 --os linux --compilers --openmpi --- generating: environment-test-linux-py310-cuda-11.7-compilers-openmpi.yaml ``` @@ -45,13 +45,13 @@ generated environment file (e.g. all the supported Python versions). 
See the Once you have this environment file, you can install the required packages by creating a new conda environment: -``` +```shell conda env create -n legate -f .yaml ``` or by updating an existing environment: -``` +```shell conda env update -f .yaml ``` @@ -161,14 +161,14 @@ after to trip GLIBC's internal version checks, since the conda library expects to find symbols with more recent version numbers than what is available on the system-wide GLIBC: -``` +```shell /lib/x86_64-linux-gnu/libstdc++.so.6: version `GLIBCXX_3.4.30' not found (required by /opt/conda/envs/legate/lib/libarrow.so) ``` You can usually work around this issue by putting the conda library directory first in the dynamic library resolution path: -``` +```shell LD_LIBRARY_PATH="$CONDA_PREFIX/lib:$LD_LIBRARY_PATH" ``` @@ -186,14 +186,14 @@ the C++ and Python components under the currently active Python environment. To add GPU support, use the `--cuda` flag: -``` +```shell ./install.py --cuda ``` You can specify the CUDA toolkit directory and the CUDA architecture you want to target using the `--with-cuda` and `--arch` flags, e.g.: -``` +```shell ./install.py --cuda --with-cuda /usr/local/cuda/ --arch ampere ``` @@ -215,18 +215,21 @@ You also need to specify the interconnect network of the target machine using th For example this would be an installation for a [DGX SuperPOD](https://www.nvidia.com/en-us/data-center/dgx-superpod/): -``` + +```shell ./install.py --network gasnet1 --conduit ibv --cuda --arch ampere ``` + Alternatively, here is an install line for the [Piz-Daint](https://www.cscs.ch/computers/dismissed/piz-daint-piz-dora/) supercomputer: -``` + +```shell ./install.py --network gasnet1 --conduit aries --cuda --arch pascal ``` To see all available configuration options, run with the `--help` flag: -``` +```shell ./install.py --help ``` @@ -237,11 +240,13 @@ can still use the pip installer to build and install Legate Core. The following command will trigger a single-node, CPU-only build of Legate Core, then install it into the currently active Python environment: -``` +```shell $ pip install . ``` + or -``` + +```shell $ python3 -m pip install . ``` @@ -249,17 +254,20 @@ $ python3 -m pip install . Legate relies on CMake to select its toolchain and build flags. Users can set the environment variables `CXX` or `CXXFLAGS` prior to building to override the -CMake defaults. Alternatively, CMake values can be overridden through the -`SKBUILD_CONFIGURE_OPTIONS` variable: +CMake defaults. -``` -$ SKBUILD_CONFIGURE_OPTIONS="-D Legion_USE_CUDA:BOOL=ON" \ +Alternatively, CMake and build tool arguments can be passed via the +`CMAKE_ARGS`/`SKBUILD_CONFIGURE_OPTIONS` and `SKBUILD_BUILD_OPTIONS` +[environment variables](https://scikit-build.readthedocs.io/en/latest/usage.html#environment-variable-configuration): + +```shell +$ CMAKE_ARGS="${CMAKE_ARGS:-} -D Legion_USE_CUDA:BOOL=ON" \ pip install . ``` An alternative syntax using `setup.py` with `scikit-build` is -``` +```shell $ python setup.py install -- -DLegion_USE_CUDA:BOOL=ON ``` @@ -287,20 +295,20 @@ There are several examples in the `scripts` folder. We walk through the steps in First, the CMake build needs to be configured: -``` +```shell $ cmake -S . -B build -GNinja -D Legion_USE_CUDA=ON ``` Once configured, we can build the C++ libraries: -``` +```shell $ cmake --build build ``` This will invoke Ninja (or make) to execute the build. Once the C++ libraries are available, we can do an editable (development) pip installation. 
-``` +```shell $ SKBUILD_BUILD_OPTIONS="-D FIND_LEGATE_CORE_CPP=ON -D legate_core_ROOT=$(pwd)/build" \ python3 -m pip install \ --root / --no-deps --no-build-isolation diff --git a/install.py b/install.py index e3303ae72..dad033c42 100755 --- a/install.py +++ b/install.py @@ -309,6 +309,7 @@ def install( print("legion_src_dir:", legion_src_dir) print("legion_url:", legion_url) print("legion_branch:", legion_branch) + print("unknown:", str(unknown)) join = os.path.join exists = os.path.exists @@ -396,15 +397,22 @@ def validate_path(path): pip_install_cmd += ["--no-deps", "--no-build-isolation"] pip_install_cmd += ["--upgrade"] + if unknown is not None: + pip_install_cmd += unknown + pip_install_cmd += ["."] if verbose: pip_install_cmd += ["-vv"] - cmake_flags = [] + # Also use preexisting CMAKE_ARGS from conda if set + cmake_flags = cmd_env.get("CMAKE_ARGS", "").split(" ") if cmake_generator: - cmake_flags += [f"-G'{cmake_generator}'"] + if " " not in cmake_generator: + cmake_flags += [f"-G{cmake_generator}"] + else: + cmake_flags += [f"-G'{cmake_generator}'"] if debug or verbose: cmake_flags += ["--log-level=%s" % ("DEBUG" if debug else "VERBOSE")] @@ -458,7 +466,7 @@ def validate_path(path): cmd_env.update( { "SKBUILD_BUILD_OPTIONS": f"-j{str(thread_count)}", - "SKBUILD_CONFIGURE_OPTIONS": "\n".join(cmake_flags), + "CMAKE_ARGS": " ".join(cmake_flags), } ) diff --git a/legate/util/fs.py b/legate/util/fs.py index 15338d783..4b7465799 100644 --- a/legate/util/fs.py +++ b/legate/util/fs.py @@ -220,13 +220,13 @@ def get_legion_paths(legate_paths: LegatePaths) -> LegionPaths: # 1. Legion was found in a standard system location (/usr, $CONDA_PREFIX) # 2. Legion was built as a side-effect of building legate_core: # ``` - # SKBUILD_CONFIGURE_OPTIONS="" python -m pip install . + # CMAKE_ARGS="" python -m pip install . # ``` # 3. Legion was built in a separate directory independent of legate_core # and the path to its build directory was given when configuring # legate_core: # ``` - # SKBUILD_CONFIGURE_OPTIONS="-D Legion_ROOT=/legion/build" \ + # CMAKE_ARGS="-D Legion_ROOT=/legion/build" \ # python -m pip install . 
# ``` # diff --git a/scripts/build-install.sh b/scripts/build-install.sh index b0aa91925..f7b5a3854 100755 --- a/scripts/build-install.sh +++ b/scripts/build-install.sh @@ -13,7 +13,7 @@ source ./scripts/util/uninstall-global-legion-and-legate-core.sh rm -rf ./{build,_skbuild,dist,legate.core.egg-info} # Define CMake configuration arguments -cmake_args= +cmake_args="${CMAKE_ARGS:-}" # Use ninja-build if installed if [[ -n "$(which ninja)" ]]; then cmake_args+="-GNinja"; fi @@ -30,7 +30,7 @@ ninja_args="-j$(nproc --ignore=2)" # Build legion_core + legion_core_python and install into the current Python environment SKBUILD_BUILD_OPTIONS="$ninja_args" \ -SKBUILD_CONFIGURE_OPTIONS="$cmake_args" \ +CMAKE_ARGS="$cmake_args" \ python -m pip install \ --root / --prefix "$CONDA_PREFIX" \ --no-deps --no-build-isolation \ diff --git a/scripts/build-no-install.sh b/scripts/build-no-install.sh index b6ced5da5..8cb6665e4 100755 --- a/scripts/build-no-install.sh +++ b/scripts/build-no-install.sh @@ -11,7 +11,7 @@ source ./scripts/util/compiler-flags.sh rm -rf ./{build,_skbuild,dist,legate.core.egg-info} # Define CMake configuration arguments -cmake_args= +cmake_args="${CMAKE_ARGS:-}" # Use ninja-build if installed if [[ -n "$(which ninja)" ]]; then cmake_args+="-GNinja"; fi @@ -28,7 +28,7 @@ ninja_args="-j$(nproc --ignore=2)" # Build legion_core + legion_core_python and perform an "editable" install SKBUILD_BUILD_OPTIONS="$ninja_args" \ -SKBUILD_CONFIGURE_OPTIONS="$cmake_args" \ +CMAKE_ARGS="$cmake_args" \ SETUPTOOLS_ENABLE_FEATURES="legacy-editable" \ python -m pip install \ --root / --prefix "$CONDA_PREFIX" \ diff --git a/scripts/build-separately-no-install.sh b/scripts/build-separately-no-install.sh index f2b0188c7..1ffacde26 100755 --- a/scripts/build-separately-no-install.sh +++ b/scripts/build-separately-no-install.sh @@ -11,7 +11,7 @@ source ./scripts/util/compiler-flags.sh rm -rf ./{build,_skbuild,dist,legate.core.egg-info} # Define CMake configuration arguments -cmake_args= +cmake_args="${CMAKE_ARGS:-}" # Use ninja-build if installed if [[ -n "$(which ninja)" ]]; then cmake_args+="-GNinja"; fi @@ -48,7 +48,7 @@ cmake_args+=" # Build legion_core_python and perform an "editable" install SKBUILD_BUILD_OPTIONS="$ninja_args" \ -SKBUILD_CONFIGURE_OPTIONS="$cmake_args" \ +CMAKE_ARGS="$cmake_args" \ SETUPTOOLS_ENABLE_FEATURES="legacy-editable" \ python -m pip install \ --root / --prefix "$CONDA_PREFIX" \ diff --git a/scripts/build-with-legion-no-install.sh b/scripts/build-with-legion-no-install.sh index 5d52c2c00..5cc03b624 100755 --- a/scripts/build-with-legion-no-install.sh +++ b/scripts/build-with-legion-no-install.sh @@ -26,7 +26,7 @@ if [[ -f "$Legion_ROOT/CMakeCache.txt" ]]; then fi # Define CMake configuration arguments -cmake_args= +cmake_args="${CMAKE_ARGS:-}" # Use ninja-build if installed if [[ -n "$(which ninja)" ]]; then cmake_args+="-GNinja"; fi @@ -41,7 +41,7 @@ cmake_args+=" # Build legion_core + legion_core_python and perform an "editable" install SKBUILD_BUILD_OPTIONS="$ninja_args" \ -SKBUILD_CONFIGURE_OPTIONS="$cmake_args" \ +CMAKE_ARGS="$cmake_args" \ SETUPTOOLS_ENABLE_FEATURES="legacy-editable" \ python -m pip install \ --root / --prefix "$CONDA_PREFIX" \ diff --git a/scripts/build-with-legion-separately-no-install.sh b/scripts/build-with-legion-separately-no-install.sh index 200b67aa9..a497af581 100755 --- a/scripts/build-with-legion-separately-no-install.sh +++ b/scripts/build-with-legion-separately-no-install.sh @@ -26,7 +26,7 @@ if [[ -f "$Legion_ROOT/CMakeCache.txt" ]]; then fi 
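# Seeding cmake_args from the environment below means any CMAKE_ARGS the
# caller has already exported, e.g. from a conda compiler activation script or
# set by hand, is extended rather than discarded. A sketch of a hypothetical
# invocation relying on this behavior:
#
#   CMAKE_ARGS="-D Legion_USE_CUDA:BOOL=ON" ./scripts/build-with-legion-separately-no-install.sh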
# Define CMake configuration arguments -cmake_args= +cmake_args="${CMAKE_ARGS:-}" # Use ninja-build if installed if [[ -n "$(which ninja)" ]]; then cmake_args+="-GNinja"; fi @@ -49,7 +49,7 @@ cmake_args+=" # Build legion_core_python and perform an "editable" install SKBUILD_BUILD_OPTIONS="$ninja_args" \ -SKBUILD_CONFIGURE_OPTIONS="$cmake_args" \ +CMAKE_ARGS="$cmake_args" \ SETUPTOOLS_ENABLE_FEATURES="legacy-editable" \ python -m pip install \ --root / --prefix "$CONDA_PREFIX" \ diff --git a/scripts/util/uninstall-global-legion-and-legate-core.sh b/scripts/util/uninstall-global-legion-and-legate-core.sh index 17e17bd5d..916f3e993 100755 --- a/scripts/util/uninstall-global-legion-and-legate-core.sh +++ b/scripts/util/uninstall-global-legion-and-legate-core.sh @@ -1,21 +1,21 @@ #! /usr/bin/env bash -rm -rf $(find "$CONDA_PREFIX" -type d -name '*realm*') \ - $(find "$CONDA_PREFIX" -type d -name '*legion*') \ - $(find "$CONDA_PREFIX" -type d -name '*legate*') \ - $(find "$CONDA_PREFIX" -type d -name '*Legion*') \ - $(find "$CONDA_PREFIX" -type f -name 'realm*.h') \ - $(find "$CONDA_PREFIX" -type f -name 'legion*.h') \ - $(find "$CONDA_PREFIX" -type f -name 'pygion.py') \ - $(find "$CONDA_PREFIX" -type f -name 'legion_top.py') \ - $(find "$CONDA_PREFIX" -type f -name 'legion_cffi.py') \ - $(find "$CONDA_PREFIX/lib" -type f -name 'librealm*') \ - $(find "$CONDA_PREFIX/lib" -type f -name 'libregent*') \ - $(find "$CONDA_PREFIX/lib" -type f -name 'liblegion*') \ - $(find "$CONDA_PREFIX/lib" -type f -name 'liblgcore*') \ - $(find "$CONDA_PREFIX/lib" -type f -name 'legate.core.egg-link') \ - $(find "$CONDA_PREFIX/bin" -type f -name '*legion*') \ - $(find "$CONDA_PREFIX/bin" -type f -name 'legate') \ - $(find "$CONDA_PREFIX/bin" -type f -name 'bind.sh') \ - $(find "$CONDA_PREFIX/bin" -type f -name 'lgpatch') \ +rm -rf $(find "$CONDA_PREFIX" -mindepth 1 -type d -name '*realm*') \ + $(find "$CONDA_PREFIX" -mindepth 1 -type d -name '*legion*') \ + $(find "$CONDA_PREFIX" -mindepth 1 -type d -name '*legate*') \ + $(find "$CONDA_PREFIX" -mindepth 1 -type d -name '*Legion*') \ + $(find "$CONDA_PREFIX" -mindepth 1 -type f -name 'realm*.h') \ + $(find "$CONDA_PREFIX" -mindepth 1 -type f -name 'legion*.h') \ + $(find "$CONDA_PREFIX" -mindepth 1 -type f -name 'pygion.py') \ + $(find "$CONDA_PREFIX" -mindepth 1 -type f -name 'legion_top.py') \ + $(find "$CONDA_PREFIX" -mindepth 1 -type f -name 'legion_cffi.py') \ + $(find "$CONDA_PREFIX/lib" -mindepth 1 -type f -name 'librealm*') \ + $(find "$CONDA_PREFIX/lib" -mindepth 1 -type f -name 'libregent*') \ + $(find "$CONDA_PREFIX/lib" -mindepth 1 -type f -name 'liblegion*') \ + $(find "$CONDA_PREFIX/lib" -mindepth 1 -type f -name 'liblgcore*') \ + $(find "$CONDA_PREFIX/lib" -mindepth 1 -type f -name 'legate.core.egg-link') \ + $(find "$CONDA_PREFIX/bin" -mindepth 1 -type f -name '*legion*') \ + $(find "$CONDA_PREFIX/bin" -mindepth 1 -type f -name 'legate') \ + $(find "$CONDA_PREFIX/bin" -mindepth 1 -type f -name 'bind.sh') \ + $(find "$CONDA_PREFIX/bin" -mindepth 1 -type f -name 'lgpatch') \ ; From dbe9ebec522a3bef4f427a3aa4f4b6bd4ffa9058 Mon Sep 17 00:00:00 2001 From: Rohan Yadav Date: Fri, 4 Nov 2022 21:07:05 -0700 Subject: [PATCH 046/121] legate/core/types: add missing `to_pandas_type` on Complex types (#467) Signed-off-by: Rohan Yadav Signed-off-by: Rohan Yadav --- legate/core/types.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/legate/core/types.py b/legate/core/types.py index fdce689de..0b226275e 100644 --- a/legate/core/types.py +++ 
b/legate/core/types.py @@ -17,6 +17,7 @@ from enum import IntEnum, unique from typing import Any, Iterable, Type, Union +import numpy as np import pyarrow as pa from . import legion @@ -41,6 +42,9 @@ def __arrow_ext_deserialize__( def __hash__(self) -> int: return hash(self.__class__) + def to_pandas_dtype(self) -> np.dtype[Any]: + return np.dtype(np.complex64) + class Complex128Dtype(pa.ExtensionType): def __init__(self) -> None: @@ -58,6 +62,9 @@ def __arrow_ext_deserialize__( def __hash__(self) -> int: return hash(self.__class__) + def to_pandas_dtype(self) -> np.dtype[Any]: + return np.dtype(np.complex128) + bool_ = pa.bool_() int8 = pa.int8() From d7f8f99952f32486b0292fa448bf9679a099563e Mon Sep 17 00:00:00 2001 From: Bryan Van de Ven Date: Mon, 7 Nov 2022 11:53:19 -0800 Subject: [PATCH 047/121] Python optimization experiments (#460) * checkpoint * only compute PartSym hash once * don't define unneeded closure on every call * try caching get_subregion_size? --- legate/core/constraints.py | 4 +- legate/core/operation.py | 64 ++++++++++++------------ legate/core/partition.py | 2 + legate/core/shape.py | 100 +++++++++++++++++++++++++------------ 4 files changed, 104 insertions(+), 66 deletions(-) diff --git a/legate/core/constraints.py b/legate/core/constraints.py index b8fdc6c49..d18b5fab0 100644 --- a/legate/core/constraints.py +++ b/legate/core/constraints.py @@ -102,6 +102,8 @@ def __init__( self._disjoint = disjoint self._complete = complete + self._hash = hash((self._op_hash, self._id)) + @property def ndim(self) -> int: return self._store.ndim @@ -120,7 +122,7 @@ def __repr__(self) -> str: return f"X{self._id}({disj},{comp})@{self._op_name}" def __hash__(self) -> int: - return hash((self._op_hash, self._id)) + return self._hash def subst(self, mapping: dict[PartSym, PartitionBase]) -> Expr: return Lit(mapping[self]) diff --git a/legate/core/operation.py b/legate/core/operation.py index 998276e6a..d6788cf60 100644 --- a/legate/core/operation.py +++ b/legate/core/operation.py @@ -457,6 +457,14 @@ def __init__( self._output_parts: list[PartSym] = [] self._reduction_parts: list[PartSym] = [] + def get_requirement( + self, store: Store, part_symb: PartSym, strategy: Strategy + ) -> tuple[Proj, int, StorePartition]: + store_part = store.partition(strategy.get_partition(part_symb)) + req = store_part.get_requirement(strategy.launch_ndim) + tag = self.get_tag(strategy, part_symb) + return req, tag, store_part + def add_input( self, store: Store, partition: Optional[PartSym] = None ) -> None: @@ -571,18 +579,10 @@ def launch(self, strategy: Strategy) -> None: provenance=self.provenance, ) - def get_requirement( - store: Store, part_symb: PartSym - ) -> tuple[Proj, int, StorePartition]: - store_part = store.partition(strategy.get_partition(part_symb)) - req = store_part.get_requirement(strategy.launch_ndim) - tag = self.get_tag(strategy, part_symb) - return req, tag, store_part - self.find_all_reusable_store_pairs(strategy) for store, part_symb in zip(self._inputs, self._input_parts): - req, tag, _ = get_requirement(store, part_symb) + req, tag, _ = self.get_requirement(store, part_symb, strategy) launcher.add_input(store, req, tag=tag) for idx, (store, part_symb) in enumerate( @@ -592,7 +592,9 @@ def get_requirement( continue if idx in self._reuse_map: store.move_data(self._reuse_map[idx]) - req, tag, store_part = get_requirement(store, part_symb) + req, tag, store_part = self.get_requirement( + store, part_symb, strategy + ) launcher.add_output(store, req, tag=tag) # We update the key 
partition of a store only when it gets updated store.set_key_partition(store_part.partition) @@ -600,7 +602,9 @@ def get_requirement( for ((store, redop), part_symb) in zip( self._reductions, self._reduction_parts ): - req, tag, store_part = get_requirement(store, part_symb) + req, tag, store_part = self.get_requirement( + store, part_symb, strategy + ) can_read_write = store_part.is_disjoint_for(strategy.launch_domain) req.redop = store.type.reduction_op_id(redop) @@ -941,21 +945,15 @@ def launch(self, strategy: Strategy) -> None: # will need to be extended accordingly. scatter = len(self._target_indirects) > 0 - def get_requirement( - store: Store, part_symb: PartSym - ) -> tuple[Proj, int, StorePartition]: - store_part = store.partition(strategy.get_partition(part_symb)) - req = store_part.get_requirement(strategy.launch_ndim) - tag = self.get_tag(strategy, part_symb) - return req, tag, store_part - for store, part_symb in zip(self._inputs, self._input_parts): - req, tag, _ = get_requirement(store, part_symb) + req, tag, _ = self.get_requirement(store, part_symb, strategy) launcher.add_input(store, req, tag=tag) for store, part_symb in zip(self._outputs, self._output_parts): assert not store.unbound - req, tag, store_part = get_requirement(store, part_symb) + req, tag, store_part = self.get_requirement( + store, part_symb, strategy + ) if scatter: launcher.add_inout(store, req, tag=tag) else: @@ -964,18 +962,24 @@ def get_requirement( for ((store, redop), part_symb) in zip( self._reductions, self._reduction_parts ): - req, tag, store_part = get_requirement(store, part_symb) + req, tag, store_part = self.get_requirement( + store, part_symb, strategy + ) req.redop = store.type.reduction_op_id(redop) launcher.add_reduction(store, req, tag=tag) for store, part_symb in zip( self._source_indirects, self._source_indirect_parts ): - req, tag, store_part = get_requirement(store, part_symb) + req, tag, store_part = self.get_requirement( + store, part_symb, strategy + ) launcher.add_source_indirect(store, req, tag=tag) for store, part_symb in zip( self._target_indirects, self._target_indirect_parts ): - req, tag, store_part = get_requirement(store, part_symb) + req, tag, store_part = self.get_requirement( + store, part_symb, strategy + ) launcher.add_target_indirect(store, req, tag=tag) if strategy.launch_domain is not None: @@ -1040,17 +1044,11 @@ def add_reduction( raise TypeError("No reductions can be added to fills") def launch(self, strategy: Strategy) -> None: - def get_requirement( - store: Store, part_symb: PartSym - ) -> tuple[Proj, int, StorePartition]: - store_part = store.partition(strategy.get_partition(part_symb)) - req = store_part.get_requirement(strategy.launch_ndim) - tag = self.get_tag(strategy, part_symb) - return req, tag, store_part - lhs = self._outputs[0] lhs_part_sym = self._output_parts[0] - lhs_proj, _, lhs_part = get_requirement(lhs, lhs_part_sym) + lhs_proj, _, lhs_part = self.get_requirement( + lhs, lhs_part_sym, strategy + ) lhs.set_key_partition(lhs_part.partition) launcher = FillLauncher( self.context, diff --git a/legate/core/partition.py b/legate/core/partition.py index ba13b5351..162e7fb6a 100644 --- a/legate/core/partition.py +++ b/legate/core/partition.py @@ -15,6 +15,7 @@ from __future__ import annotations from abc import ABC, abstractmethod, abstractproperty +from functools import lru_cache from typing import TYPE_CHECKING, Optional, Sequence, Type, Union from . 
import ( @@ -237,6 +238,7 @@ def is_disjoint_for(self, launch_domain: Optional[Rect]) -> bool: def has_color(self, color: Shape) -> bool: return color >= 0 and color < self._color_shape + @lru_cache def get_subregion_size(self, extents: Shape, color: Shape) -> Shape: lo = self._tile_shape * color + self._offset hi = self._tile_shape * (color + 1) + self._offset diff --git a/legate/core/shape.py b/legate/core/shape.py index 9147d0865..98207191f 100644 --- a/legate/core/shape.py +++ b/legate/core/shape.py @@ -26,15 +26,10 @@ ExtentLike: TypeAlias = Union["Shape", int, Iterable[int]] -def _cast_tuple(value: ExtentLike, ndim: int) -> tuple[int, ...]: - if isinstance(value, Shape): - return value.extents - elif isinstance(value, Iterable): - return tuple(value) - elif isinstance(value, int): +def _cast_tuple(value: int | Iterable[int], ndim: int) -> tuple[int, ...]: + if isinstance(value, int): return (value,) * ndim - else: - raise ValueError(f"Cannot cast {type(value).__name__} to tuple") + return tuple(value) class Shape: @@ -46,8 +41,11 @@ def __init__( extents: Optional[ExtentLike] = None, ispace: Optional[IndexSpace] = None, ) -> None: - if extents is not None: - self._extents = _cast_tuple(extents, 1) + if isinstance(extents, int): + self._extents = (extents,) + self._ispace = None + elif extents is not None: + self._extents = tuple(extents) self._ispace = None else: assert ispace is not None @@ -59,9 +57,7 @@ def extents(self) -> tuple[int, ...]: if self._extents is None: assert self._ispace is not None bounds = self._ispace.get_bounds() - lo = bounds.lo hi = bounds.hi - assert all(lo[idx] == 0 for idx in range(lo.dim)) self._extents = tuple(hi[idx] + 1 for idx in range(hi.dim)) return self._extents @@ -148,55 +144,95 @@ def __eq__(self, other: object) -> bool: else: return self.extents == other.extents elif isinstance(other, (int, Iterable)): - lh = _cast_tuple(self, self.ndim) - rh = _cast_tuple(other, self.ndim) + lh = self.extents + rh = ( + other.extents + if isinstance(other, Shape) + else _cast_tuple(other, self.ndim) + ) return lh == rh else: return False def __le__(self, other: ExtentLike) -> bool: - lh = _cast_tuple(self, self.ndim) - rh = _cast_tuple(other, self.ndim) + lh = self.extents + rh = ( + other.extents + if isinstance(other, Shape) + else _cast_tuple(other, self.ndim) + ) return len(lh) == len(rh) and lh <= rh def __lt__(self, other: ExtentLike) -> bool: - lh = _cast_tuple(self, self.ndim) - rh = _cast_tuple(other, self.ndim) + lh = self.extents + rh = ( + other.extents + if isinstance(other, Shape) + else _cast_tuple(other, self.ndim) + ) return len(lh) == len(rh) and lh < rh def __ge__(self, other: ExtentLike) -> bool: - lh = _cast_tuple(self, self.ndim) - rh = _cast_tuple(other, self.ndim) + lh = self.extents + rh = ( + other.extents + if isinstance(other, Shape) + else _cast_tuple(other, self.ndim) + ) return len(lh) == len(rh) and lh >= rh def __gt__(self, other: ExtentLike) -> bool: - lh = _cast_tuple(self, self.ndim) - rh = _cast_tuple(other, self.ndim) + lh = self.extents + rh = ( + other.extents + if isinstance(other, Shape) + else _cast_tuple(other, self.ndim) + ) return len(lh) == len(rh) and lh > rh def __add__(self, other: ExtentLike) -> Shape: - lh = _cast_tuple(self, self.ndim) - rh = _cast_tuple(other, self.ndim) + lh = self.extents + rh = ( + other.extents + if isinstance(other, Shape) + else _cast_tuple(other, self.ndim) + ) return Shape(tuple(a + b for (a, b) in zip(lh, rh))) def __sub__(self, other: ExtentLike) -> Shape: - lh = _cast_tuple(self, 
self.ndim) - rh = _cast_tuple(other, self.ndim) + lh = self.extents + rh = ( + other.extents + if isinstance(other, Shape) + else _cast_tuple(other, self.ndim) + ) return Shape(tuple(a - b for (a, b) in zip(lh, rh))) def __mul__(self, other: ExtentLike) -> Shape: - lh = _cast_tuple(self, self.ndim) - rh = _cast_tuple(other, self.ndim) + lh = self.extents + rh = ( + other.extents + if isinstance(other, Shape) + else _cast_tuple(other, self.ndim) + ) return Shape(tuple(a * b for (a, b) in zip(lh, rh))) def __mod__(self, other: ExtentLike) -> Shape: - lh = _cast_tuple(self, self.ndim) - rh = _cast_tuple(other, self.ndim) + lh = self.extents + rh = ( + other.extents + if isinstance(other, Shape) + else _cast_tuple(other, self.ndim) + ) return Shape(tuple(a % b for (a, b) in zip(lh, rh))) def __floordiv__(self, other: ExtentLike) -> Shape: - lh = _cast_tuple(self, self.ndim) - rh = _cast_tuple(other, self.ndim) + lh = self.extents + rh = ( + other.extents + if isinstance(other, Shape) + else _cast_tuple(other, self.ndim) + ) return Shape(tuple(a // b for (a, b) in zip(lh, rh))) def drop(self, dim: int) -> Shape: From 1ec34b41696b0c712358b13c34b073cdf55856e6 Mon Sep 17 00:00:00 2001 From: Wei Wu Date: Mon, 7 Nov 2022 15:47:52 -0700 Subject: [PATCH 048/121] fix for -ll:networks none, we will init MPI if it has not been initialized (#465) * fix for -ll:networks none, we will init MPI if it has not been initialized. * add the self mpi finalize * fix for LEGATE_NEED_NETWORK --- src/core/comm/coll.cc | 33 ++++++++++++++++++++++----------- 1 file changed, 22 insertions(+), 11 deletions(-) diff --git a/src/core/comm/coll.cc b/src/core/comm/coll.cc index 8f0f14104..6ca5d6787 100644 --- a/src/core/comm/coll.cc +++ b/src/core/comm/coll.cc @@ -59,6 +59,8 @@ static int current_unique_id = 0; static bool coll_inited = false; +static bool self_mpi_init = false; + // functions start here #ifdef LEGATE_USE_NETWORK static inline std::pair mostFrequent(const int* arr, int n); @@ -242,23 +244,31 @@ int collInit(int argc, char* argv[]) { current_unique_id = 0; #ifdef LEGATE_USE_NETWORK - int provided, init_flag = 0; + int init_flag = 0; CHECK_MPI(MPI_Initialized(&init_flag)); if (!init_flag) { - log_coll.fatal( - "MPI has not been initialized, it should be initialized by " - "the networking backend"); - LEGATE_ABORT; - } else { - int mpi_thread_model; - MPI_Query_thread(&mpi_thread_model); - if (mpi_thread_model != MPI_THREAD_MULTIPLE) { + char* network = getenv("LEGATE_NEED_NETWORK"); + int need_network = 0; + if (network != nullptr) { need_network = atoi(network); } + if (need_network) { log_coll.fatal( - "MPI has been initialized by others, but is not initialized with " - "MPI_THREAD_MULTIPLE"); + "MPI has not been initialized, it should be initialized by " + "the networking backend."); LEGATE_ABORT; + } else { + int provided; + MPI_Init_thread(0, 0, MPI_THREAD_MULTIPLE, &provided); + self_mpi_init = true; } } + int mpi_thread_model; + MPI_Query_thread(&mpi_thread_model); + if (mpi_thread_model != MPI_THREAD_MULTIPLE) { + log_coll.fatal( + "MPI has been initialized by others, but is not initialized with " + "MPI_THREAD_MULTIPLE"); + LEGATE_ABORT; + } // check int *tag_ub, flag; CHECK_MPI(MPI_Comm_get_attr(MPI_COMM_WORLD, MPI_TAG_UB, &tag_ub, &flag)); @@ -285,6 +295,7 @@ int collFinalize() log_coll.fatal("MPI should not have been finalized"); LEGATE_ABORT; } + if (self_mpi_init) { CHECK_MPI(MPI_Finalize()); } #else for (ThreadComm* thread_comm : thread_comms) { assert(!thread_comm->ready_flag); From 
734c2da289ccb863db24754df0026fcbc4b0df50 Mon Sep 17 00:00:00 2001 From: Bryan Van de Ven Date: Tue, 8 Nov 2022 13:21:21 -0800 Subject: [PATCH 049/121] Add option to have per-file, per-stage test overrides (#469) * leave override config to client projects * Add option to have per-file, per-stage test overrides * remove example custom test --- legate/tester/__init__.py | 38 +++++++++++++++++++++--------- legate/tester/stages/test_stage.py | 23 +++++++++++++++--- 2 files changed, 47 insertions(+), 14 deletions(-) diff --git a/legate/tester/__init__.py b/legate/tester/__init__.py index 270abcf8d..045eca19d 100644 --- a/legate/tester/__init__.py +++ b/legate/tester/__init__.py @@ -17,9 +17,12 @@ """ from __future__ import annotations +from dataclasses import dataclass from typing import Union from typing_extensions import Literal, TypeAlias +from ..util.types import ArgList + #: Define the available feature types for tests FeatureType: TypeAlias = Union[ Literal["cpus"], Literal["cuda"], Literal["eager"], Literal["openmp"] @@ -57,15 +60,28 @@ "openmp", ) -#: Paths to example files that should be skipped. -SKIPPED_EXAMPLES = { - "examples/ingest.py", - "examples/kmeans_sort.py", - "examples/lstm_full.py", - "examples/wgrad.py", -} +#: Paths to test files that should be skipped entirely in all stages. +#: +#: Client test scripts should udpate this set with their own customizations. +SKIPPED_EXAMPLES: set[str] = set() -#: Extra arguments to supply when specific examples are executed. -PER_FILE_ARGS = { - "examples/lstm_full.py": ["--file", "resources/lstm_input.txt"], -} +#: Extra arguments to add when specific test files are executed (in any stage). +#: +#: Client test scripts should udpate this dict with their own customizations. +PER_FILE_ARGS: dict[str, ArgList] = {} + + +@dataclass +class CustomTest: + file: str + kind: FeatureType + args: ArgList + + +#: Customized configurations for specific test files. Each entry will result +#: in the specified test file being run in the specified stage, with the given +#: command line arguments appended (overriding default stage arguments). These +#: files are run serially, after the sharded, parallelized tests. +#: +#: Client test scripts should udpate this set with their own customizations. +CUSTOM_FILES: list[CustomTest] = [] diff --git a/legate/tester/stages/test_stage.py b/legate/tester/stages/test_stage.py index f9c871461..ed24ae461 100644 --- a/legate/tester/stages/test_stage.py +++ b/legate/tester/stages/test_stage.py @@ -23,7 +23,7 @@ from ...util.colors import yellow from ...util.types import ArgList, EnvDict from ...util.ui import banner, summary -from .. import PER_FILE_ARGS, FeatureType +from .. import CUSTOM_FILES, PER_FILE_ARGS, FeatureType from ..config import Config from ..test_system import ProcessResult, TestSystem from .util import Shard, StageResult, StageSpec, log_proc @@ -224,7 +224,12 @@ def cov_args(self, config: Config) -> ArgList: return args def run( - self, test_file: Path, config: Config, system: TestSystem + self, + test_file: Path, + config: Config, + system: TestSystem, + *, + custom_args: ArgList | None = None, ) -> ProcessResult: """Execute a single test files with appropriate environment and command-line options for a feature test stage. 
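# A sketch of how a client project's test script might use the hooks defined
# above; the file names and arguments here are invented purely for
# illustration:
#
#     from legate.tester import (
#         CUSTOM_FILES,
#         PER_FILE_ARGS,
#         SKIPPED_EXAMPLES,
#         CustomTest,
#     )
#
#     SKIPPED_EXAMPLES.add("examples/not_ported_yet.py")
#     PER_FILE_ARGS["examples/needs_input.py"] = ["--file", "resources/input.txt"]
#     CUSTOM_FILES.append(
#         CustomTest("examples/big_case.py", "cuda", ["--gpus", "2"])
#     )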
@@ -254,6 +259,9 @@ def run( cmd += stage_args + file_args + config.extra_args + if custom_args: + cmd += custom_args + self.delay(shard, config, system) result = system.run(cmd, test_file, env=self._env(config, system)) @@ -286,4 +294,13 @@ def _launch( ] pool.close() - return [job.get() for job in jobs] + sharded_results = [job.get() for job in jobs] + + custom = (x for x in CUSTOM_FILES if x.kind == self.kind) + + custom_results = [ + self.run(Path(x.file), config, system, custom_args=x.args) + for x in custom + ] + + return sharded_results + custom_results From e6cc081336982a916678bb822b446244e1b273b4 Mon Sep 17 00:00:00 2001 From: Wonchan Lee Date: Tue, 8 Nov 2022 23:38:52 -0800 Subject: [PATCH 050/121] Construct region-backed 0D stores in a correct way (#450) --- legate/core/launcher.py | 9 ++++++++- legate/core/runtime.py | 19 ++++++++++++++++++- legate/core/store.py | 17 +++++++---------- 3 files changed, 33 insertions(+), 12 deletions(-) diff --git a/legate/core/launcher.py b/legate/core/launcher.py index b19b357c0..2f8ff886d 100644 --- a/legate/core/launcher.py +++ b/legate/core/launcher.py @@ -1030,7 +1030,14 @@ def add_store( flags: int, ) -> None: assert store.kind is not Future - assert store._transform.bottom + assert ( + store._transform.bottom + # Although we should not allow any transformed stores for copies, + # as affine transformations in copies are not yet supported, + # the 0D-to-1D case is benign and the backing region is guaranteed + # to be singleton, so we can accept (i.e., ignore) it. + or (store.ndim == 0 and store._storage.ndim == 1) + ) if TYPE_CHECKING: assert isinstance(store.storage, RegionField) diff --git a/legate/core/runtime.py b/legate/core/runtime.py index 4d12a6591..6e0796a7a 100644 --- a/legate/core/runtime.py +++ b/legate/core/runtime.py @@ -1266,10 +1266,27 @@ def create_store( if optimize_scalar and shape is not None and shape.volume() == 1 else RegionField ) - storage = Storage(shape, 0, dtype, data=data, kind=kind) + + sanitized_shape: Optional[Shape] + if kind is RegionField and shape is not None and shape.ndim == 0: + from .transform import Project, identity + + # If the client requested a 0D region-backed store, we need to + # promote the shape to 1D to create the storage, as Legion + # doesn't allow 0D regions. And we also need to set up a transform + # to map "0D" points back to 1D so that the store looks like 0D + # to the client. + sanitized_shape = Shape([1]) + transform = identity.stack(Project(0, 0)) + else: + sanitized_shape = shape + transform = None + + storage = Storage(sanitized_shape, 0, dtype, data=data, kind=kind) return Store( dtype, storage, + transform=transform, shape=shape, ndim=ndim, ) diff --git a/legate/core/store.py b/legate/core/store.py index 4c947829b..a8d01546b 100644 --- a/legate/core/store.py +++ b/legate/core/store.py @@ -18,6 +18,7 @@ from typing import TYPE_CHECKING, Any, Optional, Sequence, Type, Union from . import ( + AffineTransform, Attach, Detach, Future, @@ -42,7 +43,6 @@ Project, Promote, Shift, - TransformStack, Transpose, identity, ) @@ -50,7 +50,6 @@ if TYPE_CHECKING: from . 
import ( - AffineTransform, BufferBuilder, Partition as LegionPartition, PhysicalRegion, @@ -824,9 +823,7 @@ def get_child_store(self, *indices: int) -> Store: child_storage = self._storage_partition.get_child(color) child_transform = self.transform for dim, offset in enumerate(child_storage.offsets): - child_transform = TransformStack( - Shift(dim, -offset), child_transform - ) + child_transform = child_transform.stack(Shift(dim, -offset)) return Store( self._store.type, child_storage, @@ -1077,7 +1074,7 @@ def promote(self, extra_dim: int, dim_size: int = 1) -> Store: return Store( self._dtype, self._storage, - TransformStack(transform, self._transform), + self._transform.stack(transform), shape=shape, ) @@ -1116,7 +1113,7 @@ def project(self, dim: int, index: int) -> Store: return Store( self._dtype, storage, - TransformStack(transform, self._transform), + self._transform.stack(transform), shape=shape, ) @@ -1160,7 +1157,7 @@ def slice(self, dim: int, sl: slice) -> Store: transform = ( self._transform if start == 0 - else TransformStack(Shift(dim, -start), self._transform) + else self._transform.stack(Shift(dim, -start)) ) return Store( self._dtype, @@ -1192,7 +1189,7 @@ def transpose(self, axes: tuple[int, ...]) -> Store: return Store( self._dtype, self._storage, - TransformStack(transform, self._transform), + self._transform.stack(transform), shape=shape, ) @@ -1218,7 +1215,7 @@ def delinearize(self, dim: int, shape: tuple[int, ...]) -> Store: return Store( self._dtype, self._storage, - TransformStack(transform, self._transform), + self._transform.stack(transform), shape=new_shape, ) From bdcb603ee19fc6f36e265475cc9075ac8f73849d Mon Sep 17 00:00:00 2001 From: Wonchan Lee Date: Wed, 9 Nov 2022 01:26:15 -0800 Subject: [PATCH 051/121] Show provenance strings in the progress logs (#473) --- src/core/runtime/runtime.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/core/runtime/runtime.cc b/src/core/runtime/runtime.cc index 78329649b..4ef4d831d 100644 --- a/src/core/runtime/runtime.cc +++ b/src/core/runtime/runtime.cc @@ -123,9 +123,10 @@ static void extract_scalar_task( point_str << point[0]; for (int32_t dim = 1; dim < task->index_point.dim; ++dim) point_str << "," << point[dim]; - log_legate.print("%s %s task, pt = (%s), proc = " IDFMT, + log_legate.print("%s %s task [%s], pt = (%s), proc = " IDFMT, task_name, proc_kind_str, + task->get_provenance_string().c_str(), point_str.str().c_str(), exec_proc.id); } From eabdbefc2b66c91d76ab2e1c307df908fd0c070b Mon Sep 17 00:00:00 2001 From: Bryan Van de Ven Date: Wed, 9 Nov 2022 11:22:02 -0800 Subject: [PATCH 052/121] Use bind.sh for all rank detection (#471) * use bind.sh for all rank detection * add tests for detected_rank * add --bind-detail option * address review comments * remove Launcher.rank_id * more careful debug echo * docs --- bind.sh | 80 +++++++++++++++++------ legate/driver/args.py | 9 +++ legate/driver/command.py | 16 ++++- legate/driver/config.py | 1 + legate/driver/driver.py | 5 +- legate/driver/launcher.py | 41 +++++------- legate/driver/logs.py | 2 +- legate/jupyter/config.py | 2 +- tests/unit/legate/driver/test_args.py | 3 + tests/unit/legate/driver/test_command.py | 50 +++++++++++--- tests/unit/legate/driver/test_config.py | 5 +- tests/unit/legate/driver/test_launcher.py | 40 ++++++------ tests/unit/legate/jupyter/test_config.py | 4 +- 13 files changed, 179 insertions(+), 79 deletions(-) diff --git a/bind.sh b/bind.sh index 7e23b7acf..394deed3c 100755 --- a/bind.sh +++ b/bind.sh @@ -32,6 +32,7 @@ 
Options: --nics=SPEC Network interface binding specification, used to set all of: UCX_NET_DEVICES, NCCL_IB_HCA, GASNET_NUM_QPS, and GASNET_IBV_PORTS + --debug print out the final computed invocation before exectuting SPEC specifies the resources to bind each node-local rank to, with ranks separated by /, e.g. '0,1/2,3/4,5/6,7' for 4 ranks per node. @@ -46,15 +47,17 @@ EOM exit 2 } +debug="0" launcher=auto while : do case "$1" in - --launcher) launcher="$2" ;; - --cpus) cpus="$2" ;; - --gpus) gpus="$2" ;; - --mems) mems="$2" ;; - --nics) nics="$2" ;; + --launcher) launcher="$2"; shift 2 ;; + --cpus) cpus="$2"; shift 2 ;; + --gpus) gpus="$2"; shift 2 ;; + --mems) mems="$2"; shift 2 ;; + --nics) nics="$2"; shift 2 ;; + --debug) debug="1"; shift ;; --help) help ;; --) shift; @@ -65,31 +68,51 @@ do help ;; esac - shift 2 done case "$launcher" in - mpirun) rank="${OMPI_COMM_WORLD_LOCAL_RANK:-unknown}" ;; - jsrun ) rank="${OMPI_COMM_WORLD_LOCAL_RANK:-unknown}" ;; - srun ) rank="${SLURM_LOCALID:-unknown}" ;; - auto ) rank="${SLURM_LOCALID:-${OMPI_COMM_WORLD_LOCAL_RANK:-${MV2_COMM_WORLD_LOCAL_RANK:-unknown}}}" ;; - local ) rank="0" ;; + mpirun) + local_rank="${OMPI_COMM_WORLD_LOCAL_RANK:-unknown}" + global_rank="${OMPI_COMM_WORLD_RANK:-unknown}" + ;; + jsrun ) + local_rank="${OMPI_COMM_WORLD_LOCAL_RANK:-unknown}" + gloabl_rank="${OMPI_COMM_WORLD_RANK:-unknown}" + ;; + srun ) + local_rank="${SLURM_LOCALID:-unknown}" + global_rank="${SLURM_PROCID:-unknown}" + ;; + auto ) + local_rank="${SLURM_LOCALID:-${OMPI_COMM_WORLD_LOCAL_RANK:-${MV2_COMM_WORLD_LOCAL_RANK:-unknown}}}" + global_rank="${OMPI_COMM_WORLD_RANK:-${PMI_RANK:-${MV2_COMM_WORLD_RANK:-${SLURM_PROCID:-unknown}}}}" + ;; + local ) + local_rank="0" + global_rank="0" + ;; *) echo "Unexpected launcher value: $launcher" 1>&2 help ;; esac -if [[ "$rank" == "unknown" ]]; then +if [[ "$local_rank" == "unknown" ]]; then echo "Error: Could not determine node-local rank" 1>&2 exit 1 fi -export LEGATE_RANK="$rank" +if [[ "$global_rank" == "unknown" ]]; then + echo "Error: Could not determine global rank" 1>&2 + exit 1 +fi + +export LEGATE_LOCAL_RANK="$local_rank" +export LEGATE_GLOBAL_RANK="$global_rank" if [ -n "${cpus+x}" ]; then cpus=(${cpus//\// }) - if [[ "$rank" -ge "${#cpus[@]}" ]]; then + if [[ "$local_rank" -ge "${#cpus[@]}" ]]; then echo "Error: Incomplete CPU binding specification" 1>&2 exit 1 fi @@ -97,16 +120,16 @@ fi if [ -n "${gpus+x}" ]; then gpus=(${gpus//\// }) - if [[ "$rank" -ge "${#gpus[@]}" ]]; then + if [[ "$local_rank" -ge "${#gpus[@]}" ]]; then echo "Error: Incomplete GPU binding specification" 1>&2 exit 1 fi - export CUDA_VISIBLE_DEVICES="${gpus[$rank]}" + export CUDA_VISIBLE_DEVICES="${gpus[$local_rank]}" fi if [ -n "${mems+x}" ]; then mems=(${mems//\// }) - if [[ "$rank" -ge "${#mems[@]}" ]]; then + if [[ "$local_rank" -ge "${#mems[@]}" ]]; then echo "Error: Incomplete MEM binding specification" 1>&2 exit 1 fi @@ -114,14 +137,14 @@ fi if [ -n "${nics+x}" ]; then nics=(${nics//\// }) - if [[ "$rank" -ge "${#nics[@]}" ]]; then + if [[ "$local_rank" -ge "${#nics[@]}" ]]; then echo "Error: Incomplete NIC binding specification" 1>&2 exit 1 fi # set all potentially relevant variables (hopefully they are ignored if we # are not using the corresponding network) - nic="${nics[$rank]}" + nic="${nics[$local_rank]}" nic_array=(${nic//,/ }) export UCX_NET_DEVICES="${nic//,/:1,}":1 export NCCL_IB_HCA="$nic" @@ -133,10 +156,10 @@ fi if [[ -n "${cpus+x}" || -n "${mems+x}" ]]; then if command -v numactl &> /dev/null; then if [[ -n "${cpus+x}" 
]]; then - set -- --physcpubind "${cpus[$rank]}" "$@" + set -- --physcpubind "${cpus[$local_rank]}" "$@" fi if [[ -n "${mems+x}" ]]; then - set -- --membind "${mems[$rank]}" "$@" + set -- --membind "${mems[$local_rank]}" "$@" fi set -- numactl "$@" else @@ -144,4 +167,19 @@ if [[ -n "${cpus+x}" || -n "${mems+x}" ]]; then fi fi +# arguments may contain the substring %%LEGATE_GLOBAL_RANK%% which needs to be +# be replaced with the actual computed rank for downstream processes to use +updated=() +for arg in "$@"; do + updated+=("${arg/\%\%LEGATE_GLOBAL_RANK\%\%/$LEGATE_GLOBAL_RANK}") +done + +set -- "${updated[@]}" + +if [ "$debug" == "1" ]; then + echo -n "bind.sh: $@" 1>&2 + for TOK in "$@"; do printf " %q" "$TOK" 1>&2; done + echo +fi + exec "$@" diff --git a/legate/driver/args.py b/legate/driver/args.py index c473efa5f..e36f783b1 100644 --- a/legate/driver/args.py +++ b/legate/driver/args.py @@ -304,6 +304,15 @@ ) +info.add_argument( + "--bind-detail", + dest="bind_detail", + action="store_true", + required=False, + help="print out the final invocation run by bind.sh", +) + + other = parser.add_argument_group("Other options") diff --git a/legate/driver/command.py b/legate/driver/command.py index 0e72cfe7b..0c7909564 100644 --- a/legate/driver/command.py +++ b/legate/driver/command.py @@ -27,6 +27,10 @@ __all__ = ("CMD_PARTS",) +# this will be replaced by bind.sh with the actual computed rank at runtime +LEGATE_GLOBAL_RANK_SUBSTITUTION = "%%LEGATE_GLOBAL_RANK%%" + + def cmd_bind( config: ConfigProtocol, system: System, launcher: Launcher ) -> CommandPart: @@ -62,6 +66,9 @@ def check_bind_ranks(name: str, binding: str) -> None: check_bind_ranks(name, binding) opts += (f"--{name}s", binding) + if config.info.bind_detail: + opts += ("--debug",) + return opts + ("--",) @@ -97,7 +104,10 @@ def cmd_nvprof( if not config.profiling.nvprof: return () - log_path = str(config.logging.logdir / f"legate_{launcher.rank_id}.nvvp") + log_path = str( + config.logging.logdir + / f"legate_{LEGATE_GLOBAL_RANK_SUBSTITUTION}.nvvp" + ) return ("nvprof", "-o", log_path) @@ -108,7 +118,9 @@ def cmd_nsys( if not config.profiling.nsys: return () - log_path = str(config.logging.logdir / f"legate_{launcher.rank_id}") + log_path = str( + config.logging.logdir / f"legate_{LEGATE_GLOBAL_RANK_SUBSTITUTION}" + ) targets = config.profiling.nsys_targets extra = config.profiling.nsys_extra diff --git a/legate/driver/config.py b/legate/driver/config.py index 470cca123..711162ac5 100644 --- a/legate/driver/config.py +++ b/legate/driver/config.py @@ -134,6 +134,7 @@ class Info(DataclassMixin): progress: bool mem_usage: bool verbose: bool + bind_detail: bool @dataclass(frozen=True) diff --git a/legate/driver/driver.py b/legate/driver/driver.py index 5329b951f..7f3e17d33 100644 --- a/legate/driver/driver.py +++ b/legate/driver/driver.py @@ -93,7 +93,10 @@ def run(self) -> int: """ if self.config.info.verbose: # we only want to print verbose output on a "head" node - if self.launcher.kind != "none" or self.launcher.rank_id == "0": + if ( + self.launcher.kind != "none" + or self.launcher.detected_rank_id == "0" + ): print_verbose(self.system, self) self._darwin_gdb_warn() diff --git a/legate/driver/launcher.py b/legate/driver/launcher.py index e41b0a2e1..1f67046b5 100644 --- a/legate/driver/launcher.py +++ b/legate/driver/launcher.py @@ -54,8 +54,8 @@ class Launcher: """A base class for custom launch handlers for Legate. - Subclasses should set ``kind``, ``rank_id``, and ``cmd`` properties during - their initialization. 
+ Subclasses should set ``kind`` and ``cmd`` properties during their + initialization. Parameters ---------- @@ -67,10 +67,11 @@ class Launcher: kind: LauncherType - rank_id: str - cmd: Command + # base class will attempt to set this + detected_rank_id: str | None = None + _config: ConfigProtocol _system: System @@ -83,13 +84,20 @@ def __init__(self, config: ConfigProtocol, system: System) -> None: self._config = config self._system = system + if config.multi_node.ranks == 1: + self.detected_rank_id = "0" + else: + for var in RANK_ENV_VARS: + if var in system.env: + self.detected_rank_id = system.env[var] + break + self._check_realm_python() def __eq__(self, other: object) -> bool: return ( isinstance(other, type(self)) and self.kind == other.kind - and self.rank_id == other.rank_id and self.cmd == other.cmd and self.env == other.env ) @@ -294,18 +302,11 @@ class SimpleLauncher(Launcher): def __init__(self, config: ConfigProtocol, system: System) -> None: super().__init__(config, system) - if config.multi_node.ranks == 1: - self.rank_id = "0" - - else: - for var in RANK_ENV_VARS: - if var in system.env: - self.rank_id = system.env[var] - break - - # NB: for-else clause! (executes if NO loop break) - else: - raise RuntimeError(RANK_ERR_MSG) + # bind.sh handles computing local and global rank id, even in the + # simple case, just for consistency. But we do still check the known + # rank env vars below in order to issue RANK_ERR_MSG if needed + if config.multi_node.ranks > 1 and self.detected_rank_id is None: + raise RuntimeError(RANK_ERR_MSG) self.cmd = () @@ -322,8 +323,6 @@ class MPILauncher(Launcher): def __init__(self, config: ConfigProtocol, system: System) -> None: super().__init__(config, system) - self.rank_id = "%q{OMPI_COMM_WORLD_RANK}" - ranks = config.multi_node.ranks ranks_per_node = config.multi_node.ranks_per_node @@ -352,8 +351,6 @@ class JSRunLauncher(Launcher): def __init__(self, config: ConfigProtocol, system: System) -> None: super().__init__(config, system) - self.rank_id = "%q{OMPI_COMM_WORLD_RANK}" - ranks = config.multi_node.ranks ranks_per_node = config.multi_node.ranks_per_node @@ -380,8 +377,6 @@ class SRunLauncher(Launcher): def __init__(self, config: ConfigProtocol, system: System) -> None: super().__init__(config, system) - self.rank_id = "%q{SLURM_PROCID}" - ranks = config.multi_node.ranks ranks_per_node = config.multi_node.ranks_per_node diff --git a/legate/driver/logs.py b/legate/driver/logs.py index 95b2ed46f..8dad3a3b4 100644 --- a/legate/driver/logs.py +++ b/legate/driver/logs.py @@ -185,7 +185,7 @@ def process_logs( handlers: list[LogHandler] = [] - if launcher.kind != "none" or launcher.rank_id == "0": + if launcher.kind != "none" or launcher.detected_rank_id == "0": if config.profiling.profile: handlers.append(ProfilingHandler(config, system)) diff --git a/legate/jupyter/config.py b/legate/jupyter/config.py index 745238b63..77ee521cd 100644 --- a/legate/jupyter/config.py +++ b/legate/jupyter/config.py @@ -86,5 +86,5 @@ def __init__(self, argv: ArgList) -> None: self.debugging = Debugging( False, False, False, False, False, False, False ) - self.info = Info(False, False, self.verbose > 0) + self.info = Info(False, False, self.verbose > 0, False) self.other = Other(None, False, False) diff --git a/tests/unit/legate/driver/test_args.py b/tests/unit/legate/driver/test_args.py index fa9d36929..4881521e8 100644 --- a/tests/unit/legate/driver/test_args.py +++ b/tests/unit/legate/driver/test_args.py @@ -166,6 +166,9 @@ def test_mem_usage(self) -> None: def 
test_verbose(self) -> None: assert m.parser.get_default("verbose") is False + def test_bind_detail(self) -> None: + assert m.parser.get_default("bind_detail") is False + # other def test_module(self) -> None: diff --git a/tests/unit/legate/driver/test_command.py b/tests/unit/legate/driver/test_command.py index 739dd7f9c..fede85b11 100644 --- a/tests/unit/legate/driver/test_command.py +++ b/tests/unit/legate/driver/test_command.py @@ -32,6 +32,10 @@ def test___all__() -> None: assert m.__all__ == ("CMD_PARTS",) +def test_LEGATE_GLOBAL_RANK_SUBSTITUTION() -> None: + assert m.LEGATE_GLOBAL_RANK_SUBSTITUTION == "%%LEGATE_GLOBAL_RANK%%" + + def test_CMD_PARTS() -> None: assert m.CMD_PARTS == ( m.cmd_bind, @@ -70,6 +74,14 @@ def test_default(self, genobjs: GenObjs) -> None: bind_sh = str(system.legate_paths.bind_sh_path) assert result == (bind_sh, "--launcher", "local", "--") + def test_bind_detail(self, genobjs: GenObjs) -> None: + config, system, launcher = genobjs(["--bind-detail"]) + + result = m.cmd_bind(config, system, launcher) + + bind_sh = str(system.legate_paths.bind_sh_path) + assert result == (bind_sh, "--launcher", "local", "--debug", "--") + @pytest.mark.parametrize("kind", ("cpu", "gpu", "mem", "nic")) def test_basic_local(self, genobjs: GenObjs, kind: str) -> None: config, system, launcher = genobjs([f"--{kind}-bind", "1"]) @@ -261,7 +273,10 @@ def test_with_option(self, genobjs: GenObjs) -> None: result = m.cmd_nvprof(config, system, launcher) - log_path = str(config.logging.logdir / "legate_0.nvvp") + log_path = str( + config.logging.logdir + / f"legate_{m.LEGATE_GLOBAL_RANK_SUBSTITUTION}.nvvp" + ) assert result == ("nvprof", "-o", log_path) @pytest.mark.parametrize("rank_var", RANK_ENV_VARS) @@ -277,7 +292,10 @@ def test_multi_rank_no_launcher( result = m.cmd_nvprof(config, system, launcher) - log_path = str(config.logging.logdir / f"legate_{rank}.nvvp") + log_path = str( + config.logging.logdir + / f"legate_{m.LEGATE_GLOBAL_RANK_SUBSTITUTION}.nvvp" + ) assert result == ("nvprof", "-o", log_path) @pytest.mark.parametrize("launch", ("mpirun", "jsrun", "srun")) @@ -294,7 +312,8 @@ def test_multi_rank_with_launcher( result = m.cmd_nvprof(config, system, launcher) log_path = str( - config.logging.logdir / f"legate_{launcher.rank_id}.nvvp" + config.logging.logdir + / f"legate_{m.LEGATE_GLOBAL_RANK_SUBSTITUTION}.nvvp" ) assert result == ("nvprof", "-o", log_path) @@ -320,7 +339,10 @@ def test_multi_rank_no_launcher( result = m.cmd_nsys(config, system, launcher) - log_path = str(config.logging.logdir / f"legate_{rank}") + log_path = str( + config.logging.logdir + / f"legate_{m.LEGATE_GLOBAL_RANK_SUBSTITUTION}" + ) assert result == ( "nsys", "profile", @@ -343,7 +365,10 @@ def test_multi_rank_with_launcher( result = m.cmd_nsys(config, system, launcher) - log_path = str(config.logging.logdir / f"legate_{launcher.rank_id}") + log_path = str( + config.logging.logdir + / f"legate_{m.LEGATE_GLOBAL_RANK_SUBSTITUTION}" + ) assert result == ( "nsys", "profile", @@ -376,7 +401,10 @@ def test_multi_rank_extra_no_s( result = m.cmd_nsys(config, system, launcher) - log_path = str(config.logging.logdir / f"legate_{rank}") + log_path = str( + config.logging.logdir + / f"legate_{m.LEGATE_GLOBAL_RANK_SUBSTITUTION}" + ) assert result == ( "nsys", "profile", @@ -414,7 +442,10 @@ def test_multi_rank_extra_with_s( result = m.cmd_nsys(config, system, launcher) - log_path = str(config.logging.logdir / f"legate_{rank}") + log_path = str( + config.logging.logdir + / 
f"legate_{m.LEGATE_GLOBAL_RANK_SUBSTITUTION}" + ) assert result == ( "nsys", "profile", @@ -447,7 +478,10 @@ def test_multi_rank_targets( result = m.cmd_nsys(config, system, launcher) - log_path = str(config.logging.logdir / f"legate_{rank}") + log_path = str( + config.logging.logdir + / f"legate_{m.LEGATE_GLOBAL_RANK_SUBSTITUTION}" + ) assert result == ( "nsys", "profile", diff --git a/tests/unit/legate/driver/test_config.py b/tests/unit/legate/driver/test_config.py index 249107ed2..67e0d2473 100644 --- a/tests/unit/legate/driver/test_config.py +++ b/tests/unit/legate/driver/test_config.py @@ -247,6 +247,7 @@ def test_fields(self) -> None: "progress", "mem_usage", "verbose", + "bind_detail", } def test_mixin(self) -> None: @@ -331,7 +332,9 @@ def test_default_init(self) -> None: event=False, ) - assert c.info == m.Info(progress=False, mem_usage=False, verbose=False) + assert c.info == m.Info( + progress=False, mem_usage=False, verbose=False, bind_detail=False + ) assert c.other == m.Other(module=None, dry_run=False, rlwrap=False) diff --git a/tests/unit/legate/driver/test_launcher.py b/tests/unit/legate/driver/test_launcher.py index ebfc793c5..f9a1b9d1a 100644 --- a/tests/unit/legate/driver/test_launcher.py +++ b/tests/unit/legate/driver/test_launcher.py @@ -126,7 +126,7 @@ def test_identical_config( assert launcher1 == launcher2 assert launcher1.kind == launcher2.kind - assert launcher1.rank_id == launcher2.rank_id + assert launcher1.detected_rank_id == launcher2.detected_rank_id assert launcher1.cmd == launcher2.cmd assert launcher1.env == launcher2.env @@ -358,7 +358,7 @@ def test_single_rank(self, genconfig: GenConfig) -> None: launcher = m.Launcher.create(config, SYSTEM) - assert launcher.rank_id == "0" + assert launcher.detected_rank_id == "0" assert launcher.cmd == () def test_single_rank_launcher_extra_ignored( @@ -370,7 +370,7 @@ def test_single_rank_launcher_extra_ignored( launcher = m.Launcher.create(config, SYSTEM) - assert launcher.rank_id == "0" + assert launcher.detected_rank_id == "0" assert launcher.cmd == () @pytest.mark.parametrize("rank_var", m.RANK_ENV_VARS) @@ -388,7 +388,7 @@ def test_multi_rank( system = System() launcher = m.Launcher.create(config, system) - assert launcher.rank_id == "123" + assert launcher.detected_rank_id == "123" assert launcher.cmd == () def test_multi_rank_bad(self, genconfig: GenConfig) -> None: @@ -416,7 +416,7 @@ def test_multi_rank_launcher_extra_ignored( system = System() launcher = m.Launcher.create(config, system) - assert launcher.rank_id == "123" + assert launcher.detected_rank_id == "123" assert launcher.cmd == () @@ -465,7 +465,7 @@ def test_single_rank(self, genconfig: GenConfig) -> None: launcher = m.Launcher.create(config, SYSTEM) - assert launcher.rank_id == "%q{OMPI_COMM_WORLD_RANK}" + assert launcher.detected_rank_id == "0" # TODO (bv) -x env args currnetly too fragile to test assert launcher.cmd[:10] == ( @@ -490,9 +490,9 @@ def test_single_rank_launcher_extra(self, genconfig: GenConfig) -> None: launcher = m.Launcher.create(config, SYSTEM) - assert launcher.rank_id == "%q{OMPI_COMM_WORLD_RANK}" + assert launcher.detected_rank_id == "0" - # TODO (bv) -x env args currnetly too fragile to test + # TODO (bv) -x env args currently too fragile to test assert launcher.cmd[:10] == ( ("mpirun",) + ("-n", "1", "--npernode", "1", "--bind-to", "none") @@ -517,7 +517,7 @@ def test_multi_rank( system = System() launcher = m.Launcher.create(config, system) - assert launcher.rank_id == "%q{OMPI_COMM_WORLD_RANK}" + assert 
launcher.detected_rank_id == "123" # TODO (bv) -x env args currnetly too fragile to test assert launcher.cmd[:10] == ( @@ -555,7 +555,7 @@ def test_multi_rank_launcher_extra( system = System() launcher = m.Launcher.create(config, system) - assert launcher.rank_id == "%q{OMPI_COMM_WORLD_RANK}" + assert launcher.detected_rank_id == "123" # TODO (bv) -x env args currnetly too fragile to test assert launcher.cmd[:10] == ( @@ -575,7 +575,7 @@ def test_single_rank(self, genconfig: GenConfig) -> None: launcher = m.Launcher.create(config, SYSTEM) - assert launcher.rank_id == "%q{OMPI_COMM_WORLD_RANK}" + assert launcher.detected_rank_id == "0" assert launcher.cmd == ( ("jsrun",) + ("-n", "1", "-r", "1", "-a", "1") @@ -596,7 +596,7 @@ def test_single_rank_launcher_extra(self, genconfig: GenConfig) -> None: launcher = m.Launcher.create(config, SYSTEM) - assert launcher.rank_id == "%q{OMPI_COMM_WORLD_RANK}" + assert launcher.detected_rank_id == "0" assert launcher.cmd == ( ("jsrun",) + ("-n", "1", "-r", "1", "-a", "1") @@ -619,7 +619,7 @@ def test_multi_rank( system = System() launcher = m.Launcher.create(config, system) - assert launcher.rank_id == "%q{OMPI_COMM_WORLD_RANK}" + assert launcher.detected_rank_id == "123" assert launcher.cmd == ( ("jsrun",) + ("-n", "100", "-r", "1", "-a", "2") @@ -652,7 +652,7 @@ def test_multi_rank_launcher_extra( system = System() launcher = m.Launcher.create(config, system) - assert launcher.rank_id == "%q{OMPI_COMM_WORLD_RANK}" + assert launcher.detected_rank_id == "123" assert launcher.cmd == ( ("jsrun",) + ("-n", "100", "-r", "1", "-a", "2") @@ -667,7 +667,7 @@ def test_single_rank(self, genconfig: GenConfig) -> None: launcher = m.Launcher.create(config, SYSTEM) - assert launcher.rank_id == "%q{SLURM_PROCID}" + assert launcher.detected_rank_id == "0" assert launcher.cmd == ("srun", "-n", "1", "--ntasks-per-node", "1") def test_single_rank_launcher_extra(self, genconfig: GenConfig) -> None: @@ -684,7 +684,7 @@ def test_single_rank_launcher_extra(self, genconfig: GenConfig) -> None: launcher = m.Launcher.create(config, SYSTEM) - assert launcher.rank_id == "%q{SLURM_PROCID}" + assert launcher.detected_rank_id == "0" assert launcher.cmd == ( "srun", "-n", @@ -705,7 +705,7 @@ def test_single_rank_debugging( launcher = m.Launcher.create(config, SYSTEM) - assert launcher.rank_id == "%q{SLURM_PROCID}" + assert launcher.detected_rank_id == "0" assert launcher.cmd == ( "srun", "-n", @@ -730,7 +730,7 @@ def test_multi_rank( system = System() launcher = m.Launcher.create(config, system) - assert launcher.rank_id == "%q{SLURM_PROCID}" + assert launcher.detected_rank_id == "123" assert launcher.cmd == ("srun", "-n", "200", "--ntasks-per-node", "2") @pytest.mark.parametrize("rank_var", m.RANK_ENV_VARS) @@ -758,7 +758,7 @@ def test_multi_rank_launcher_extra( system = System() launcher = m.Launcher.create(config, system) - assert launcher.rank_id == "%q{SLURM_PROCID}" + assert launcher.detected_rank_id == "123" assert launcher.cmd == ( "srun", "-n", @@ -790,7 +790,7 @@ def test_multi_rank_debugging( system = System() launcher = m.Launcher.create(config, system) - assert launcher.rank_id == "%q{SLURM_PROCID}" + assert launcher.detected_rank_id == "123" assert launcher.cmd == ( "srun", "-n", diff --git a/tests/unit/legate/jupyter/test_config.py b/tests/unit/legate/jupyter/test_config.py index 4e956ff85..17931ed5c 100644 --- a/tests/unit/legate/jupyter/test_config.py +++ b/tests/unit/legate/jupyter/test_config.py @@ -107,7 +107,9 @@ def test_default_init(self) -> None: event=False, ) 
- assert c.info == m.Info(progress=False, mem_usage=False, verbose=False) + assert c.info == m.Info( + progress=False, mem_usage=False, verbose=False, bind_detail=False + ) assert c.other == m.Other(module=None, dry_run=False, rlwrap=False) From d23493efbe1a1be3f0029ee80ae830185a750264 Mon Sep 17 00:00:00 2001 From: Marcin Zalewski Date: Thu, 10 Nov 2022 11:47:45 -0800 Subject: [PATCH 053/121] Print build start and end time (#474) Co-authored-by: Marcin Zalewski --- conda/conda-build/build.sh | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/conda/conda-build/build.sh b/conda/conda-build/build.sh index 96fbdc851..27b5aead1 100644 --- a/conda/conda-build/build.sh +++ b/conda/conda-build/build.sh @@ -1,7 +1,5 @@ #!/bin/bash -set -x; - # Rewrite conda's -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=ONLY to # -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH CMAKE_ARGS="$(echo "$CMAKE_ARGS" | sed -r "s@_INCLUDE=ONLY@_INCLUDE=BOTH@g")" @@ -30,6 +28,8 @@ export CUDAFLAGS="-UNDEBUG" export CMAKE_GENERATOR=Ninja export CUDAHOSTCXX=${CXX} +echo "Build starting on $(date)" + cmake -S . -B build ${CMAKE_ARGS} cmake --build build -j$CPU_COUNT cmake --install build --prefix "$PREFIX" @@ -49,6 +49,8 @@ $PYTHON -m pip install \ --disable-pip-version-check \ . -vv +echo "Build ending on $(date)" + # Legion leaves an egg-info file which will confuse conda trying to pick up the information # Remove it so the legate-core is the only egg-info file added rm -rf $SP_DIR/legion*egg-info From 3cdc2cc6f755d0a3eb0d72aa802c865cba831592 Mon Sep 17 00:00:00 2001 From: Manolis Papadakis Date: Thu, 10 Nov 2022 13:15:14 -0800 Subject: [PATCH 054/121] Pass a sufficiently high default value for gasnet's ibv-max-hcas (#477) Previously we would pass this setting through an envvar in quickstart build.sh, but Legion's cmake workflow overrides this value, so we need to pass it as a cmake flag. Co-authored-by: Manolis Papadakis --- install.py | 1 + 1 file changed, 1 insertion(+) diff --git a/install.py b/install.py index dad033c42..fdff8e692 100755 --- a/install.py +++ b/install.py @@ -439,6 +439,7 @@ def validate_path(path): -DLegion_REDOP_HALF=ON -DLegion_BUILD_BINDINGS=ON -DLegion_BUILD_JUPYTER=ON +-DLegion_EMBED_GASNet_CONFIGURE_ARGS="--with-ibv-max-hcas=8" """.splitlines() if nccl_dir: From 5792d5f376c12c8481272b4fd61fd0a2a4d8e1a3 Mon Sep 17 00:00:00 2001 From: Manolis Papadakis Date: Fri, 11 Nov 2022 10:08:22 -0800 Subject: [PATCH 055/121] Clarification in docs --- BUILD.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/BUILD.md b/BUILD.md index 2495f4d3b..f14dd8710 100644 --- a/BUILD.md +++ b/BUILD.md @@ -133,7 +133,7 @@ Only necessary if you wish to run on multiple nodes. Not available on conda; typically available through MOFED or the system-level package manager. -If using UCX, a build configured with `--enable-mt` is required. +If using UCX, a build of UCX configured with `--enable-mt` is required. 
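If you are unsure whether an existing UCX build was configured with this flag,
one way to check (assuming the `ucx_info` utility from that installation is on
your `PATH` and reports its configure line, as recent UCX releases do) is:

```shell
$ ucx_info -v | grep -- --enable-mt
```

An empty result suggests the build does not have `--enable-mt` enabled.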
## Alternative sources for dependencies From 002f828b469d8bc287ce5b87aac8f38f8f15ebe0 Mon Sep 17 00:00:00 2001 From: Wonchan Lee Date: Mon, 14 Nov 2022 11:32:08 -0800 Subject: [PATCH 056/121] Support for library specific annotations (#464) * Support for library specific annotations * Address review comments * Replace itertools.chain with a simpler code --- legate/core/__init__.py | 2 +- legate/core/context.py | 63 +++++++++++++++++++++++++++++++++++----- legate/core/operation.py | 7 ++++- 3 files changed, 63 insertions(+), 9 deletions(-) diff --git a/legate/core/__init__.py b/legate/core/__init__.py index 8a6beee0a..0c5eda106 100644 --- a/legate/core/__init__.py +++ b/legate/core/__init__.py @@ -69,7 +69,7 @@ # Import select types for Legate library construction from .allocation import DistributedAllocation -from .context import track_provenance +from .context import Annotation, track_provenance from .legate import ( Array, Library, diff --git a/legate/core/context.py b/legate/core/context.py index d242d3718..e1aac4536 100644 --- a/legate/core/context.py +++ b/legate/core/context.py @@ -66,6 +66,34 @@ def find_last_user_frame(libname: str) -> str: return f"{frame.f_code.co_filename}:{frame.f_lineno}" +class LibraryAnnotations: + def __init__(self) -> None: + self._entries: dict[str, str] = {} + self._provenance: Union[str, None] = None + + @property + def provenance(self) -> Optional[str]: + return self._provenance + + def set_provenance(self, provenance: str) -> None: + self._provenance = provenance + + def reset_provenance(self) -> None: + self._provenance = None + + def update(self, **kwargs: Any) -> None: + self._entries.update(**kwargs) + + def remove(self, key: str) -> None: + del self._entries[key] + + def __repr__(self) -> str: + pairs = [f"{key},{value}" for key, value in self._entries.items()] + if self._provenance is not None: + pairs.append(f"Provenance,{self._provenance}") + return "|".join(pairs) + + class Context: def __init__( self, @@ -125,7 +153,7 @@ def _create_scope( ) self._libname = library.get_name() - self._provenance: list[Union[str, None]] = [None] + self._annotations: list[LibraryAnnotations] = [LibraryAnnotations()] def destroy(self) -> None: self._library.destroy() @@ -162,9 +190,16 @@ def empty_argmap(self) -> ArgumentMap: def type_system(self) -> TypeSystem: return self._type_system + @property + def annotation(self) -> LibraryAnnotations: + return self._annotations[-1] + + def get_all_annotations(self) -> str: + return str(self.annotation) + @property def provenance(self) -> Optional[str]: - return self._provenance[-1] + return self.annotation.provenance def get_task_id(self, task_id: int) -> int: return self._task_scope.translate(task_id) @@ -209,18 +244,19 @@ def get_unique_op_id(self) -> int: return self._runtime.get_unique_op_id() def set_provenance(self, provenance: str) -> None: - self._provenance[-1] = provenance + self._annotations[-1].set_provenance(provenance) def reset_provenance(self) -> None: - self._provenance[-1] = None + self._annotations[-1].reset_provenance() def push_provenance(self, provenance: str) -> None: - self._provenance.append(provenance) + self._annotations.append(LibraryAnnotations()) + self.set_provenance(provenance) def pop_provenance(self) -> None: - if len(self._provenance) == 1: + if len(self._annotations) == 1: raise ValueError("Provenance stack underflow") - self._provenance.pop(-1) + self._annotations.pop(-1) def track_provenance( self, func: AnyCallable, nested: bool = False @@ -380,3 +416,16 @@ def decorator(func: 
AnyCallable) -> AnyCallable: return context.track_provenance(func, nested=nested) return decorator + + +class Annotation: + def __init__(self, context: Context, pairs: dict[str, str]) -> None: + self._annotation = context.annotation + self._pairs = pairs + + def __enter__(self) -> None: + self._annotation.update(**self._pairs) + + def __exit__(self, exc_type: Any, exc_value: Any, traceback: Any) -> None: + for key in self._pairs.keys(): + self._annotation.remove(key) diff --git a/legate/core/operation.py b/legate/core/operation.py index d6788cf60..07738aa51 100644 --- a/legate/core/operation.py +++ b/legate/core/operation.py @@ -140,10 +140,15 @@ def __init__( self._all_parts: list[PartSym] = [] self._launch_domain: Union[Rect, None] = None self._error_on_interference = True + self._provenance = ( + None + if context.provenance is None + else (f"{context.provenance}$" f"{context.get_all_annotations()}") + ) @property def provenance(self) -> Optional[str]: - return self._context.provenance + return self._provenance def get_all_stores(self) -> OrderedSet[Store]: result: OrderedSet[Store] = OrderedSet() From 75e46db05a341c8fe6cc85c8d6645e8adff56036 Mon Sep 17 00:00:00 2001 From: Bryan Van de Ven Date: Mon, 14 Nov 2022 14:32:17 -0800 Subject: [PATCH 057/121] Add --cprofile driver option (#475) * stdout for debug output * add --cprofile driver option --- bind.sh | 4 ++-- legate/driver/args.py | 9 +++++++++ legate/driver/command.py | 16 +++++++++++++++- legate/driver/config.py | 1 + legate/jupyter/config.py | 2 +- tests/unit/legate/driver/test_command.py | 18 ++++++++++++++++++ tests/unit/legate/driver/test_config.py | 6 ++++++ tests/unit/legate/jupyter/test_config.py | 1 + 8 files changed, 53 insertions(+), 4 deletions(-) diff --git a/bind.sh b/bind.sh index 394deed3c..86a9bf44b 100755 --- a/bind.sh +++ b/bind.sh @@ -177,8 +177,8 @@ done set -- "${updated[@]}" if [ "$debug" == "1" ]; then - echo -n "bind.sh: $@" 1>&2 - for TOK in "$@"; do printf " %q" "$TOK" 1>&2; done + echo -n "bind.sh: $@" + for TOK in "$@"; do printf " %q" "$TOK"; done echo fi diff --git a/legate/driver/args.py b/legate/driver/args.py index e36f783b1..9d6758a07 100644 --- a/legate/driver/args.py +++ b/legate/driver/args.py @@ -129,6 +129,15 @@ ) +profiling.add_argument( + "--cprofile", + dest="cprofile", + action="store_true", + required=False, + help="profile Python execution with the cprofile module", +) + + profiling.add_argument( "--nvprof", dest="nvprof", diff --git a/legate/driver/command.py b/legate/driver/command.py index 0c7909564..3ff4bbef7 100644 --- a/legate/driver/command.py +++ b/legate/driver/command.py @@ -152,8 +152,22 @@ def cmd_module( config: ConfigProtocol, system: System, launcher: Launcher ) -> CommandPart: module = config.other.module + cprofile = config.profiling.cprofile - return () if module is None else ("-m", module) + if cprofile and module is not None: + raise ValueError("Only one of --module or --cprofile may be used") + + if module is not None: + return ("-m", module) + + if cprofile: + log_path = str( + config.logging.logdir + / f"legate_{LEGATE_GLOBAL_RANK_SUBSTITUTION}.cprof" + ) + return ("-m", "cProfile", "-o", log_path) + + return () def cmd_rlwrap( diff --git a/legate/driver/config.py b/legate/driver/config.py index 711162ac5..6a526214f 100644 --- a/legate/driver/config.py +++ b/legate/driver/config.py @@ -90,6 +90,7 @@ class Memory(DataclassMixin): @dataclass(frozen=True) class Profiling(DataclassMixin): profile: bool + cprofile: bool nvprof: bool nsys: bool nsys_targets: str # TODO: 
multi-choice diff --git a/legate/jupyter/config.py b/legate/jupyter/config.py index 77ee521cd..2acbc6dcb 100644 --- a/legate/jupyter/config.py +++ b/legate/jupyter/config.py @@ -81,7 +81,7 @@ def __init__(self, argv: ArgList) -> None: # turn everything else off self.user_opts: tuple[str, ...] = () self.binding = Binding(None, None, None, None) - self.profiling = Profiling(False, False, False, "", []) + self.profiling = Profiling(False, False, False, False, "", []) self.logging = Logging(None, Path(), False, False) self.debugging = Debugging( False, False, False, False, False, False, False diff --git a/tests/unit/legate/driver/test_command.py b/tests/unit/legate/driver/test_command.py index fede85b11..e79310c93 100644 --- a/tests/unit/legate/driver/test_command.py +++ b/tests/unit/legate/driver/test_command.py @@ -548,6 +548,24 @@ def test_with_module(self, genobjs: GenObjs) -> None: assert result == ("-m", "foo") + def test_with_cprofile(self, genobjs: GenObjs) -> None: + config, system, launcher = genobjs(["--cprofile"]) + + result = m.cmd_module(config, system, launcher) + + log_path = str( + config.logging.logdir + / f"legate_{m.LEGATE_GLOBAL_RANK_SUBSTITUTION}.cprof" + ) + assert result == ("-m", "cProfile", "-o", log_path) + + def test_module_and_cprofile_error(self, genobjs: GenObjs) -> None: + config, system, launcher = genobjs(["--module", "foo", "--cprofile"]) + + err = "Only one of --module or --cprofile may be used" + with pytest.raises(ValueError, match=err): + m.cmd_module(config, system, launcher) + class Test_cmd_rlwrap: def test_default(self, genobjs: GenObjs) -> None: diff --git a/tests/unit/legate/driver/test_config.py b/tests/unit/legate/driver/test_config.py index 67e0d2473..483719a6e 100644 --- a/tests/unit/legate/driver/test_config.py +++ b/tests/unit/legate/driver/test_config.py @@ -152,6 +152,7 @@ class TestProfiling: def test_fields(self) -> None: assert set(m.Profiling.__dataclass_fields__) == { "profile", + "cprofile", "nvprof", "nsys", "nsys_targets", @@ -168,6 +169,7 @@ def test_mixin(self) -> None: def test_nsys_extra_fixup_basic(self, extra: list[str]) -> None: p = m.Profiling( profile=True, + cprofile=True, nvprof=True, nsys=True, nsys_targets="foo,bar", @@ -178,6 +180,7 @@ def test_nsys_extra_fixup_basic(self, extra: list[str]) -> None: def test_nsys_extra_fixup_complex(self) -> None: p = m.Profiling( profile=True, + cprofile=True, nvprof=True, nsys=True, nsys_targets="foo,bar", @@ -199,6 +202,7 @@ def test_nsys_extra_fixup_complex(self) -> None: def test_nsys_extra_fixup_quoted(self) -> None: p = m.Profiling( profile=True, + cprofile=True, nvprof=True, nsys=True, nsys_targets="foo,bar", @@ -309,6 +313,7 @@ def test_default_init(self) -> None: c.profiling == m.Profiling( profile=False, + cprofile=False, nvprof=False, nsys=False, nsys_targets="", @@ -414,6 +419,7 @@ def test_log_to_file_fixup( "--gdb", "--keep-logs", "--profile", + "--cprofile", ) ), ) diff --git a/tests/unit/legate/jupyter/test_config.py b/tests/unit/legate/jupyter/test_config.py index 17931ed5c..d1c425237 100644 --- a/tests/unit/legate/jupyter/test_config.py +++ b/tests/unit/legate/jupyter/test_config.py @@ -84,6 +84,7 @@ def test_default_init(self) -> None: c.profiling == m.Profiling( profile=False, + cprofile=False, nvprof=False, nsys=False, nsys_targets="", From ad2c8fe5cdd6b8ef7da17474caf36b9a2b148bb0 Mon Sep 17 00:00:00 2001 From: Wonchan Lee Date: Mon, 14 Nov 2022 23:15:27 -0800 Subject: [PATCH 058/121] Optimize scalar extraction (#472) * Make the scalar extraction a constant time 
operation * Comment about the format for packed return values * Use a read-write accessor to extract the pointer of a ReturnValue * Pick the right variant of extract_scalar based on machine configuration --- legate/core/runtime.py | 13 ++- src/core/runtime/runtime.cc | 23 +++-- src/core/task/return.cc | 165 +++++++++++++++++++++--------------- src/core/task/return.h | 30 ++++++- src/core/task/task.h | 5 ++ 5 files changed, 158 insertions(+), 78 deletions(-) diff --git a/legate/core/runtime.py b/legate/core/runtime.py index 6e0796a7a..56c106471 100644 --- a/legate/core/runtime.py +++ b/legate/core/runtime.py @@ -1045,6 +1045,15 @@ def num_omps(self) -> int: def num_gpus(self) -> int: return self._num_gpus + @property + def core_task_variant_id(self) -> int: + if self.num_gpus > 0: + return self.core_library.LEGATE_GPU_VARIANT + elif self.num_omps > 0: + return self.core_library.LEGATE_OMP_VARIANT + else: + return self.core_library.LEGATE_CPU_VARIANT + @property def attachment_manager(self) -> AttachmentManager: return self._attachment_manager @@ -1468,7 +1477,7 @@ def extract_scalar(self, future: Future, idx: int) -> Future: launcher = TaskLauncher( self.core_context, self.core_library.LEGATE_CORE_EXTRACT_SCALAR_TASK_ID, - tag=self.core_library.LEGATE_CPU_VARIANT, + tag=self.core_task_variant_id, ) launcher.add_future(future) launcher.add_scalar_arg(idx, ty.int32) @@ -1482,7 +1491,7 @@ def extract_scalar_with_domain( launcher = TaskLauncher( self.core_context, self.core_library.LEGATE_CORE_EXTRACT_SCALAR_TASK_ID, - tag=self.core_library.LEGATE_CPU_VARIANT, + tag=self.core_task_variant_id, ) launcher.add_future_map(future) launcher.add_scalar_arg(idx, ty.int32) diff --git a/src/core/runtime/runtime.cc b/src/core/runtime/runtime.cc index 4ef4d831d..fb1549cf7 100644 --- a/src/core/runtime/runtime.cc +++ b/src/core/runtime/runtime.cc @@ -22,6 +22,7 @@ #include "core/task/exception.h" #include "core/task/task.h" #include "core/utilities/deserializer.h" +#include "core/utilities/machine.h" #include "legate.h" namespace legate { @@ -95,11 +96,11 @@ static void extract_scalar_task( Core::show_progress(task, legion_context, runtime, task->get_task_name()); TaskContext context(task, *regions, legion_context, runtime); - auto values = task->futures[0].get_result(); - auto idx = context.scalars()[0].value(); + auto idx = context.scalars()[0].value(); + auto value_and_size = ReturnValues::extract(task->futures[0], idx); // Legion postamble - ReturnValues({values[idx]}).finalize(legion_context); + value_and_size.finalize(legion_context); } /*static*/ void Core::shutdown(void) @@ -159,13 +160,19 @@ void register_legate_core_tasks(Machine machine, Runtime* runtime, const Library }; // Register the task variants - { - auto registrar = - make_registrar(extract_scalar_task_id, extract_scalar_task_name, Processor::LOC_PROC); + auto register_extract_scalar = [&](auto proc_kind, auto variant_id) { + auto registrar = make_registrar(extract_scalar_task_id, extract_scalar_task_name, proc_kind); Legion::CodeDescriptor desc(extract_scalar_task); runtime->register_task_variant( - registrar, desc, nullptr, 0, LEGATE_MAX_SIZE_SCALAR_RETURN, LEGATE_CPU_VARIANT); - } + registrar, desc, nullptr, 0, LEGATE_MAX_SIZE_SCALAR_RETURN, variant_id); + }; + register_extract_scalar(Processor::LOC_PROC, LEGATE_CPU_VARIANT); +#ifdef LEGATE_USE_CUDA + register_extract_scalar(Processor::TOC_PROC, LEGATE_GPU_VARIANT); +#endif +#ifdef LEGATE_USE_OPENMP + register_extract_scalar(Processor::OMP_PROC, LEGATE_OMP_VARIANT); +#endif 
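The variants registered above let the extract_scalar task run on whichever processor kind the machine is using, and the new ReturnValues layout (see the comment diagram further below) makes pulling one scalar out of a packed future a constant-time operation: a count, a table of cumulative end offsets, then the raw values. A minimal Python sketch of that multi-value layout, editorial rather than part of the patch (the single-value case is written to the future without a header and is not modeled here):

```python
import struct

def pack_return_values(values: list[bytes]) -> bytes:
    # Count, then one cumulative end offset per value, then the raw bytes.
    header = struct.pack("<I", len(values))
    end, ends = 0, []
    for v in values:
        end += len(v)
        ends.append(end)
    header += struct.pack(f"<{len(values)}I", *ends)
    return header + b"".join(values)

def extract_scalar(buf: bytes, idx: int) -> bytes:
    # Constant time: two offset lookups and one slice, regardless of idx.
    (num,) = struct.unpack_from("<I", buf, 0)
    ends = struct.unpack_from(f"<{num}I", buf, 4)
    values_start = 4 + 4 * num
    begin = 0 if idx == 0 else ends[idx - 1]
    return buf[values_start + begin : values_start + ends[idx]]

# The second of two packed values comes back untouched:
assert extract_scalar(pack_return_values([b"ab", b"cdef"]), 1) == b"cdef"
```

The offsets are 32-bit here only because the sketch mirrors the uint32_t fields used in the patch; the width is otherwise incidental.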
comm::register_tasks(machine, runtime, context); } diff --git a/src/core/task/return.cc b/src/core/task/return.cc index 3a9cd9216..c44365516 100644 --- a/src/core/task/return.cc +++ b/src/core/task/return.cc @@ -35,6 +35,40 @@ using namespace Legion; namespace legate { +ReturnValue::ReturnValue(Legion::UntypedDeferredValue value, size_t size) + : value_(value), size_(size) +{ + is_device_value_ = value.get_instance().get_location().kind() == Memory::Kind::GPU_FB_MEM; +} + +/*static*/ ReturnValue ReturnValue::unpack(const void* ptr, size_t size, Memory::Kind memory_kind) +{ + ReturnValue result(UntypedDeferredValue(size, memory_kind), size); +#ifdef DEBUG_LEGATE + assert(!result.is_device_value()); +#endif + memcpy(result.ptr(), ptr, size); + + return result; +} + +void ReturnValue::finalize(Legion::Context legion_context) const +{ + value_.finalize(legion_context); +} + +void* ReturnValue::ptr() +{ + AccessorRW acc(value_, size_, false); + return acc.ptr(0); +} + +const void* ReturnValue::ptr() const +{ + AccessorRO acc(value_, size_, false); + return acc.ptr(0); +} + struct JoinReturnedException { using LHS = ReturnedException; using RHS = LHS; @@ -145,53 +179,6 @@ ReturnValue ReturnedException::pack() const return ReturnValue(buffer, buffer_size); } -namespace { - -template -int8_t* pack_return_value(int8_t* target, const ReturnValue& value) -{ - if constexpr (PACK_SIZE) { - *reinterpret_cast(target) = value.second; - target += sizeof(uint32_t); - } - - AccessorRO acc(value.first, value.second, false); - memcpy(target, acc.ptr(0), value.second); - return target + value.second; -} - -#ifdef LEGATE_USE_CUDA - -template -int8_t* pack_return_value(int8_t* target, const ReturnValue& value, cuda::StreamView& stream) -{ - if constexpr (PACK_SIZE) { - *reinterpret_cast(target) = value.second; - target += sizeof(uint32_t); - } - - AccessorRO acc(value.first, value.second, false); - CHECK_CUDA(cudaMemcpyAsync(target, acc.ptr(0), value.second, cudaMemcpyDeviceToHost, stream)); - return target + value.second; -} - -#endif - -ReturnValue unpack_return_value(const int8_t*& ptr, Memory::Kind memory_kind) -{ - auto size = *reinterpret_cast(ptr); - ptr += sizeof(uint32_t); - - UntypedDeferredValue value(size, memory_kind); - AccessorWO acc(value, size, false); - memcpy(acc.ptr(0), ptr, size); - ptr += size; - - return ReturnValue(value, size); -} - -} // namespace - ReturnValues::ReturnValues() {} ReturnValues::ReturnValues(std::vector&& return_values) @@ -199,9 +186,9 @@ ReturnValues::ReturnValues(std::vector&& return_values) { if (return_values_.size() > 1) { buffer_size_ += sizeof(uint32_t); - for (auto& ret : return_values_) buffer_size_ += sizeof(uint32_t) + ret.second; + for (auto& ret : return_values_) buffer_size_ += sizeof(uint32_t) + ret.size(); } else if (return_values_.size() > 0) - buffer_size_ = return_values_[0].second; + buffer_size_ = return_values_[0].size(); } ReturnValue ReturnValues::operator[](int32_t idx) const { return return_values_[idx]; } @@ -210,31 +197,51 @@ size_t ReturnValues::legion_buffer_size() const { return buffer_size_; } void ReturnValues::legion_serialize(void* buffer) const { + // We pack N return values into the buffer in the following format: + // + // +--------+-----------+-----+------------+-------+-------+-------+----- + // | # | offset to | | offset to | total | value | value | ... + // | values | scalar 1 | ... 
| scalar N-1 | value | 1 | 2 | + // | | | | | size | | | + // +--------+-----------+-----+------------+-------+-------+-------+----- + // <============ offsets ===============> <==== values =======> + // + // the size of value i is computed by offsets[i] - (i == 0 ? 0 : offsets[i-1]) + #ifdef LEGATE_USE_CUDA auto stream = cuda::StreamPool::get_stream_pool().get_stream(); #endif - auto ptr = static_cast(buffer); if (return_values_.size() == 1) { auto& ret = return_values_.front(); #ifdef LEGATE_USE_CUDA - if (ret.first.get_instance().get_location().kind() == Memory::Kind::GPU_FB_MEM) - ptr = pack_return_value(ptr, ret, stream); + if (ret.is_device_value()) + CHECK_CUDA(cudaMemcpyAsync(buffer, ret.ptr(), ret.size(), cudaMemcpyDeviceToHost, stream)); else #endif - ptr = pack_return_value(ptr, ret); - } else { - *reinterpret_cast(ptr) = return_values_.size(); - ptr += sizeof(uint32_t); + memcpy(buffer, ret.ptr(), ret.size()); + return; + } + + *static_cast(buffer) = return_values_.size(); + auto ptr = static_cast(buffer) + sizeof(uint32_t); - for (auto& ret : return_values_) { + uint32_t offset = 0; + for (auto ret : return_values_) { + offset += ret.size(); + *reinterpret_cast(ptr) = offset; + ptr = ptr + sizeof(uint32_t); + } + + for (auto ret : return_values_) { + uint32_t size = ret.size(); #ifdef LEGATE_USE_CUDA - if (ret.first.get_instance().get_location().kind() == Memory::Kind::GPU_FB_MEM) - ptr = pack_return_value(ptr, ret, stream); - else + if (ret.is_device_value()) + CHECK_CUDA(cudaMemcpyAsync(ptr, ret.ptr(), size, cudaMemcpyDeviceToHost, stream)); + else #endif - ptr = pack_return_value(ptr, ret); - } + memcpy(ptr, ret.ptr(), size); + ptr += size; } } @@ -244,11 +251,35 @@ void ReturnValues::legion_deserialize(const void* buffer) auto ptr = static_cast(buffer); auto num_values = *reinterpret_cast(ptr); - ptr += sizeof(uint32_t); - return_values_.resize(num_values); - for (auto& ret : return_values_) ret = unpack_return_value(ptr, mem_kind); - buffer_size_ = ptr - static_cast(buffer); + auto offsets = reinterpret_cast(ptr + sizeof(uint32_t)); + auto values = ptr + sizeof(uint32_t) + sizeof(uint32_t) * num_values; + + uint32_t offset = 0; + for (uint32_t idx = 0; idx < num_values; ++idx) { + uint32_t next_offset = offsets[idx]; + uint32_t size = next_offset - offset; + return_values_.push_back(ReturnValue::unpack(values + offset, size, mem_kind)); + offset = next_offset; + } +} + +/*static*/ ReturnValue ReturnValues::extract(Legion::Future future, uint32_t to_extract) +{ + auto kind = find_memory_kind_for_executing_processor(); + const auto* buffer = future.get_buffer(kind); + + auto ptr = static_cast(buffer); + auto num_values = *reinterpret_cast(ptr); + + auto offsets = reinterpret_cast(ptr + sizeof(uint32_t)); + auto values = ptr + sizeof(uint32_t) + sizeof(uint32_t) * num_values; + + uint32_t next_offset = offsets[to_extract]; + uint32_t offset = to_extract == 0 ? 
0 : offsets[to_extract - 1]; + uint32_t size = next_offset - offset; + + return ReturnValue::unpack(values + offset, size, kind); } void ReturnValues::finalize(Context legion_context) const @@ -257,7 +288,7 @@ void ReturnValues::finalize(Context legion_context) const Runtime::legion_task_postamble(legion_context); return; } else if (return_values_.size() == 1) { - return_values_.front().first.finalize(legion_context); + return_values_.front().finalize(legion_context); return; } diff --git a/src/core/task/return.h b/src/core/task/return.h index f767f6161..9fa558e64 100644 --- a/src/core/task/return.h +++ b/src/core/task/return.h @@ -20,7 +20,32 @@ namespace legate { -using ReturnValue = std::pair; +struct ReturnValue { + public: + ReturnValue(Legion::UntypedDeferredValue value, size_t size); + + public: + ReturnValue(const ReturnValue&) = default; + ReturnValue& operator=(const ReturnValue&) = default; + + public: + static ReturnValue unpack(const void* ptr, size_t size, Legion::Memory::Kind memory_kind); + + public: + void* ptr(); + const void* ptr() const; + const size_t size() const { return size_; } + const bool is_device_value() const { return is_device_value_; } + + public: + // Calls the Legion postamble with an instance + void finalize(Legion::Context legion_context) const; + + private: + Legion::UntypedDeferredValue value_{}; + size_t size_{0}; + bool is_device_value_{false}; +}; struct ReturnedException { public: @@ -65,6 +90,9 @@ struct ReturnValues { void legion_serialize(void* buffer) const; void legion_deserialize(const void* buffer); + public: + static ReturnValue extract(Legion::Future future, uint32_t to_extract); + public: // Calls the Legion postamble with an instance that packs all return values void finalize(Legion::Context legion_context) const; diff --git a/src/core/task/task.h b/src/core/task/task.h index c06006f49..b90a4e86b 100644 --- a/src/core/task/task.h +++ b/src/core/task/task.h @@ -62,6 +62,11 @@ struct VariantOptions { concurrent = _concurrent; return *this; } + VariantOptions& with_return_size(size_t _return_size) + { + return_size = _return_size; + return *this; + } }; using LegateVariantImpl = void (*)(TaskContext&); From 878b6b893f9b73dc57f7e24ada38ea1667c70b0e Mon Sep 17 00:00:00 2001 From: Manolis Papadakis Date: Tue, 15 Nov 2022 10:16:16 -0800 Subject: [PATCH 059/121] Make overlap check tight (#479) Co-authored-by: Manolis Papadakis --- legate/core/store.py | 42 +++++++++--------------------------------- 1 file changed, 9 insertions(+), 33 deletions(-) diff --git a/legate/core/store.py b/legate/core/store.py index a8d01546b..7b09516f1 100644 --- a/legate/core/store.py +++ b/legate/core/store.py @@ -655,42 +655,18 @@ def overlaps(self, other: Storage) -> bool: lhs = self rhs = other - lhs_root = lhs.get_root() - rhs_root = rhs.get_root() - - if lhs_root is not rhs_root: + if lhs.get_root() is not rhs.get_root(): return False - lhs_lvl = lhs.level - rhs_lvl = rhs.level - - if lhs_lvl > rhs_lvl: - lhs, rhs = rhs, lhs - lhs_lvl, rhs_lvl = rhs_lvl, lhs_lvl - - while lhs_lvl < rhs_lvl: - rhs_parent = rhs.parent - assert rhs_parent is not None - rhs = rhs_parent.parent - rhs_lvl -= 2 + if lhs.volume() == 0 or rhs.volume() == 0: + return False - if lhs is rhs: - return True - else: - assert lhs.has_parent and rhs.has_parent - assert self.parent is not None - # Legion doesn't allow passing aliased partitions to a task - if lhs.parent is not rhs.parent: - return True - else: - # TODO: This check is incorrect if the partition is aliased. 
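The replacement check, shown just below in the added lines, first rules out storages with different roots or zero volume and then reduces overlap testing to a per-dimension intersection of half-open intervals [offset, offset + extent). A small editorial example of that per-dimension condition:

```python
def intervals_overlap(loff: int, lext: int, roff: int, rext: int) -> bool:
    # Same condition as the new per-dimension test: whichever interval
    # starts first must extend past the start of the other.
    return roff < loff + lext if loff <= roff else loff < roff + rext

# Adjacent 1-D tiles [0, 4) and [4, 8) do not overlap; [0, 4) and [3, 6) do.
assert not intervals_overlap(0, 4, 4, 4)
assert intervals_overlap(0, 4, 3, 3)
```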
- # Since we only have a tiling, which is a disjoint - # partition, we put this assertion here to remember - # that we need to exdtend this logic if we have other - # partitions. (We need to carry around the disjointness - # of each partition.) - assert isinstance(self.parent._partition, Tiling) - return lhs.color == rhs.color + return all( + roff < loff + lext if loff <= roff else loff < roff + rext + for (loff, lext, roff, rext) in zip( + lhs.offsets, lhs.extents, rhs.offsets, rhs.extents + ) + ) def attach_external_allocation( self, context: Context, alloc: Attachable, share: bool From 59e1a1ca4fd31e709c360200b59dcebfea9c4398 Mon Sep 17 00:00:00 2001 From: Wei Wu Date: Tue, 15 Nov 2022 13:41:24 -0700 Subject: [PATCH 060/121] Refactor CPU collective communicator (#468) * fix for -ll:networks none, we will init MPI if it has not been initialized. * refactor mpi comm * add the missing file * refactor local comm * remove unused files * always build local comm * now the both mpi and local comms are working * move common used functions into the base class * add a virtual destructor for BackendNetwork base class * select network based on LEGATE_NEED_NETWORK * minor fix for local network * mute printf * use debug instead of info --- legate_core_cpp.cmake | 11 +- src/core/comm/allgather_thread_local.cc | 82 ---- src/core/comm/allgather_thread_mpi.cc | 55 --- src/core/comm/alltoall_thread_local.cc | 87 ---- src/core/comm/alltoall_thread_mpi.cc | 94 ---- src/core/comm/alltoallv_thread_local.cc | 100 ----- src/core/comm/alltoallv_thread_mpi.cc | 103 ----- src/core/comm/bcast_thread_mpi.cc | 85 ---- src/core/comm/coll.cc | 406 +---------------- src/core/comm/coll.h | 243 ++++++---- src/core/comm/comm_cpu.cc | 27 +- src/core/comm/gather_thread_mpi.cc | 100 ----- src/core/comm/local_comm.cc | 351 +++++++++++++++ src/core/comm/mpi_comm.cc | 575 ++++++++++++++++++++++++ 14 files changed, 1124 insertions(+), 1195 deletions(-) delete mode 100644 src/core/comm/allgather_thread_local.cc delete mode 100644 src/core/comm/allgather_thread_mpi.cc delete mode 100644 src/core/comm/alltoall_thread_local.cc delete mode 100644 src/core/comm/alltoall_thread_mpi.cc delete mode 100644 src/core/comm/alltoallv_thread_local.cc delete mode 100644 src/core/comm/alltoallv_thread_mpi.cc delete mode 100644 src/core/comm/bcast_thread_mpi.cc delete mode 100644 src/core/comm/gather_thread_mpi.cc create mode 100644 src/core/comm/local_comm.cc create mode 100644 src/core/comm/mpi_comm.cc diff --git a/legate_core_cpp.cmake b/legate_core_cpp.cmake index 6150a1908..7502e501d 100644 --- a/legate_core_cpp.cmake +++ b/legate_core_cpp.cmake @@ -211,16 +211,11 @@ list(APPEND legate_core_SOURCES if(Legion_NETWORKS) list(APPEND legate_core_SOURCES - src/core/comm/alltoall_thread_mpi.cc - src/core/comm/alltoallv_thread_mpi.cc - src/core/comm/gather_thread_mpi.cc - src/core/comm/allgather_thread_mpi.cc - src/core/comm/bcast_thread_mpi.cc) + src/core/comm/mpi_comm.cc + src/core/comm/local_comm.cc) else() list(APPEND legate_core_SOURCES - src/core/comm/alltoall_thread_local.cc - src/core/comm/alltoallv_thread_local.cc - src/core/comm/allgather_thread_local.cc) + src/core/comm/local_comm.cc) endif() if(Legion_USE_CUDA) diff --git a/src/core/comm/allgather_thread_local.cc b/src/core/comm/allgather_thread_local.cc deleted file mode 100644 index c2a3587b9..000000000 --- a/src/core/comm/allgather_thread_local.cc +++ /dev/null @@ -1,82 +0,0 @@ -/* Copyright 2022 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); 
- * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * - */ - -#include -#include -#include -#include - -#include "coll.h" -#include "legion.h" - -namespace legate { -namespace comm { -namespace coll { - -using namespace Legion; -extern Logger log_coll; - -int allgatherLocal( - const void* sendbuf, void* recvbuf, int count, CollDataType type, CollComm global_comm) -{ - int total_size = global_comm->global_comm_size; - int global_rank = global_comm->global_rank; - - int type_extent = getDtypeSize(type); - - const void* sendbuf_tmp = sendbuf; - - // MPI_IN_PLACE - if (sendbuf == recvbuf) { sendbuf_tmp = allocateInplaceBuffer(recvbuf, type_extent * count); } - - global_comm->comm->buffers[global_rank] = sendbuf_tmp; - __sync_synchronize(); - - for (int recvfrom_global_rank = 0; recvfrom_global_rank < total_size; recvfrom_global_rank++) { - // wait for other threads to update the buffer address - while (global_comm->comm->buffers[recvfrom_global_rank] == nullptr) - ; - const void* src = global_comm->comm->buffers[recvfrom_global_rank]; - char* dst = static_cast(recvbuf) + - static_cast(recvfrom_global_rank) * type_extent * count; -#ifdef DEBUG_LEGATE - log_coll.debug( - "AllgatherLocal i: %d === global_rank %d, dtype %d, copy rank %d (%p) to rank %d (%p)", - recvfrom_global_rank, - global_rank, - type_extent, - recvfrom_global_rank, - src, - global_rank, - dst); -#endif - memcpy(dst, src, count * type_extent); - } - - barrierLocal(global_comm); - if (sendbuf == recvbuf) { free(const_cast(sendbuf_tmp)); } - - __sync_synchronize(); - - resetLocalBuffer(global_comm); - barrierLocal(global_comm); - - return CollSuccess; -} - -} // namespace coll -} // namespace comm -} // namespace legate \ No newline at end of file diff --git a/src/core/comm/allgather_thread_mpi.cc b/src/core/comm/allgather_thread_mpi.cc deleted file mode 100644 index 4e256ce12..000000000 --- a/src/core/comm/allgather_thread_mpi.cc +++ /dev/null @@ -1,55 +0,0 @@ -/* Copyright 2022 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- * - */ - -#include -#include -#include -#include - -#include "coll.h" - -namespace legate { -namespace comm { -namespace coll { - -int allgatherMPI( - const void* sendbuf, void* recvbuf, int count, CollDataType type, CollComm global_comm) -{ - int total_size = global_comm->global_comm_size; - int global_rank = global_comm->global_rank; - - MPI_Datatype mpi_type = dtypeToMPIDtype(type); - - MPI_Aint lb, type_extent; - MPI_Type_get_extent(mpi_type, &lb, &type_extent); - - void* sendbuf_tmp = const_cast(sendbuf); - - // MPI_IN_PLACE - if (sendbuf == recvbuf) { sendbuf_tmp = allocateInplaceBuffer(recvbuf, type_extent * count); } - - gatherMPI(sendbuf_tmp, recvbuf, count, type, 0, global_comm); - - bcastMPI(recvbuf, count * total_size, type, 0, global_comm); - - if (sendbuf == recvbuf) { free(sendbuf_tmp); } - - return CollSuccess; -} - -} // namespace coll -} // namespace comm -} // namespace legate \ No newline at end of file diff --git a/src/core/comm/alltoall_thread_local.cc b/src/core/comm/alltoall_thread_local.cc deleted file mode 100644 index bffb1061a..000000000 --- a/src/core/comm/alltoall_thread_local.cc +++ /dev/null @@ -1,87 +0,0 @@ -/* Copyright 2022 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * - */ - -#include -#include -#include -#include - -#include "coll.h" -#include "legion.h" - -namespace legate { -namespace comm { -namespace coll { - -using namespace Legion; -extern Logger log_coll; - -int alltoallLocal( - const void* sendbuf, void* recvbuf, int count, CollDataType type, CollComm global_comm) -{ - int res; - - int total_size = global_comm->global_comm_size; - int global_rank = global_comm->global_rank; - - int type_extent = getDtypeSize(type); - - global_comm->comm->buffers[global_rank] = sendbuf; - __sync_synchronize(); - - int recvfrom_global_rank; - int recvfrom_seg_id = global_rank; - const void* src_base = nullptr; - for (int i = 1; i < total_size + 1; i++) { - recvfrom_global_rank = (global_rank + total_size - i) % total_size; - // wait for other threads to update the buffer address - while (global_comm->comm->buffers[recvfrom_global_rank] == nullptr) - ; - src_base = global_comm->comm->buffers[recvfrom_global_rank]; - char* src = static_cast(const_cast(src_base)) + - static_cast(recvfrom_seg_id) * type_extent * count; - char* dst = static_cast(recvbuf) + - static_cast(recvfrom_global_rank) * type_extent * count; -#ifdef DEBUG_LEGATE - log_coll.debug( - "AlltoallLocal i: %d === global_rank %d, dtype %d, copy rank %d (seg %d, %p) to rank %d (seg " - "%d, %p)", - i, - global_rank, - type_extent, - recvfrom_global_rank, - recvfrom_seg_id, - src, - global_rank, - recvfrom_global_rank, - dst); -#endif - memcpy(dst, src, count * type_extent); - } - - barrierLocal(global_comm); - - __sync_synchronize(); - - resetLocalBuffer(global_comm); - barrierLocal(global_comm); - - return CollSuccess; -} - -} // namespace coll -} // namespace comm -} // namespace legate \ No newline at end of file diff --git a/src/core/comm/alltoall_thread_mpi.cc 
b/src/core/comm/alltoall_thread_mpi.cc deleted file mode 100644 index d151dd5d7..000000000 --- a/src/core/comm/alltoall_thread_mpi.cc +++ /dev/null @@ -1,94 +0,0 @@ -/* Copyright 2022 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * - */ - -#include -#include -#include -#include - -#include "coll.h" -#include "legion.h" - -namespace legate { -namespace comm { -namespace coll { - -using namespace Legion; -extern Logger log_coll; - -int alltoallMPI( - const void* sendbuf, void* recvbuf, int count, CollDataType type, CollComm global_comm) -{ - MPI_Status status; - - int total_size = global_comm->global_comm_size; - int global_rank = global_comm->global_rank; - - MPI_Datatype mpi_type = dtypeToMPIDtype(type); - - MPI_Aint lb, type_extent; - MPI_Type_get_extent(mpi_type, &lb, &type_extent); - - int sendto_global_rank, recvfrom_global_rank, sendto_mpi_rank, recvfrom_mpi_rank; - for (int i = 1; i < total_size + 1; i++) { - sendto_global_rank = (global_rank + i) % total_size; - recvfrom_global_rank = (global_rank + total_size - i) % total_size; - char* src = static_cast(const_cast(sendbuf)) + - static_cast(sendto_global_rank) * type_extent * count; - char* dst = static_cast(recvbuf) + - static_cast(recvfrom_global_rank) * type_extent * count; - sendto_mpi_rank = global_comm->mapping_table.mpi_rank[sendto_global_rank]; - recvfrom_mpi_rank = global_comm->mapping_table.mpi_rank[recvfrom_global_rank]; - assert(sendto_global_rank == global_comm->mapping_table.global_rank[sendto_global_rank]); - assert(recvfrom_global_rank == global_comm->mapping_table.global_rank[recvfrom_global_rank]); - // tag: seg idx + rank_idx + tag - int send_tag = generateAlltoallTag(sendto_global_rank, global_rank, global_comm); - int recv_tag = generateAlltoallTag(global_rank, recvfrom_global_rank, global_comm); -#ifdef DEBUG_LEGATE - log_coll.debug( - "AlltoallMPI i: %d === global_rank %d, mpi rank %d, send to %d (%d), send_tag %d, " - "recv from %d (%d), " - "recv_tag %d", - i, - global_rank, - global_comm->mpi_rank, - sendto_global_rank, - sendto_mpi_rank, - send_tag, - recvfrom_global_rank, - recvfrom_mpi_rank, - recv_tag); -#endif - CHECK_MPI(MPI_Sendrecv(src, - count, - mpi_type, - sendto_mpi_rank, - send_tag, - dst, - count, - mpi_type, - recvfrom_mpi_rank, - recv_tag, - global_comm->comm, - &status)); - } - - return CollSuccess; -} - -} // namespace coll -} // namespace comm -} // namespace legate \ No newline at end of file diff --git a/src/core/comm/alltoallv_thread_local.cc b/src/core/comm/alltoallv_thread_local.cc deleted file mode 100644 index 6615e7459..000000000 --- a/src/core/comm/alltoallv_thread_local.cc +++ /dev/null @@ -1,100 +0,0 @@ -/* Copyright 2022 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * - */ - -#include -#include -#include -#include - -#include "coll.h" -#include "legion.h" - -namespace legate { -namespace comm { -namespace coll { - -using namespace Legion; -extern Logger log_coll; - -int alltoallvLocal(const void* sendbuf, - const int sendcounts[], - const int sdispls[], - void* recvbuf, - const int recvcounts[], - const int rdispls[], - CollDataType type, - CollComm global_comm) -{ - int res; - - int total_size = global_comm->global_comm_size; - int global_rank = global_comm->global_rank; - - int type_extent = getDtypeSize(type); - - global_comm->comm->displs[global_rank] = sdispls; - global_comm->comm->buffers[global_rank] = sendbuf; - __sync_synchronize(); - - int recvfrom_global_rank; - int recvfrom_seg_id = global_rank; - const void* src_base = nullptr; - const int* displs = nullptr; - for (int i = 1; i < total_size + 1; i++) { - recvfrom_global_rank = (global_rank + total_size - i) % total_size; - // wait for other threads to update the buffer address - while (global_comm->comm->buffers[recvfrom_global_rank] == nullptr || - global_comm->comm->displs[recvfrom_global_rank] == nullptr) - ; - src_base = global_comm->comm->buffers[recvfrom_global_rank]; - displs = global_comm->comm->displs[recvfrom_global_rank]; - char* src = static_cast(const_cast(src_base)) + - static_cast(displs[recvfrom_seg_id]) * type_extent; - char* dst = static_cast(recvbuf) + - static_cast(rdispls[recvfrom_global_rank]) * type_extent; -#ifdef DEBUG_LEGATE - log_coll.debug( - "AlltoallvLocal i: %d === global_rank %d, dtype %d, copy rank %d (seg %d, sdispls %d, %p) to " - "rank %d (seg " - "%d, rdispls %d, %p)", - i, - global_rank, - type_extent, - recvfrom_global_rank, - recvfrom_seg_id, - sdispls[recvfrom_seg_id], - src, - global_rank, - recvfrom_global_rank, - rdispls[recvfrom_global_rank], - dst); -#endif - memcpy(dst, src, recvcounts[recvfrom_global_rank] * type_extent); - } - - barrierLocal(global_comm); - - __sync_synchronize(); - - resetLocalBuffer(global_comm); - barrierLocal(global_comm); - - return CollSuccess; -} - -} // namespace coll -} // namespace comm -} // namespace legate \ No newline at end of file diff --git a/src/core/comm/alltoallv_thread_mpi.cc b/src/core/comm/alltoallv_thread_mpi.cc deleted file mode 100644 index 1bcc8806e..000000000 --- a/src/core/comm/alltoallv_thread_mpi.cc +++ /dev/null @@ -1,103 +0,0 @@ -/* Copyright 2022 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- * - */ - -#include -#include -#include -#include -#include - -#include "coll.h" -#include "legion.h" - -namespace legate { -namespace comm { -namespace coll { - -using namespace Legion; -extern Logger log_coll; - -int alltoallvMPI(const void* sendbuf, - const int sendcounts[], - const int sdispls[], - void* recvbuf, - const int recvcounts[], - const int rdispls[], - CollDataType type, - CollComm global_comm) -{ - MPI_Status status; - - int total_size = global_comm->global_comm_size; - int global_rank = global_comm->global_rank; - - MPI_Datatype mpi_type = dtypeToMPIDtype(type); - - MPI_Aint lb, type_extent; - MPI_Type_get_extent(mpi_type, &lb, &type_extent); - - int sendto_global_rank, recvfrom_global_rank, sendto_mpi_rank, recvfrom_mpi_rank; - for (int i = 1; i < total_size + 1; i++) { - sendto_global_rank = (global_rank + i) % total_size; - recvfrom_global_rank = (global_rank + total_size - i) % total_size; - char* src = static_cast(const_cast(sendbuf)) + - static_cast(sdispls[sendto_global_rank]) * type_extent; - char* dst = static_cast(recvbuf) + - static_cast(rdispls[recvfrom_global_rank]) * type_extent; - int scount = sendcounts[sendto_global_rank]; - int rcount = recvcounts[recvfrom_global_rank]; - sendto_mpi_rank = global_comm->mapping_table.mpi_rank[sendto_global_rank]; - recvfrom_mpi_rank = global_comm->mapping_table.mpi_rank[recvfrom_global_rank]; - assert(sendto_global_rank == global_comm->mapping_table.global_rank[sendto_global_rank]); - assert(recvfrom_global_rank == global_comm->mapping_table.global_rank[recvfrom_global_rank]); - // tag: seg idx + rank_idx + tag - int send_tag = generateAlltoallvTag(sendto_global_rank, global_rank, global_comm); - int recv_tag = generateAlltoallvTag(global_rank, recvfrom_global_rank, global_comm); -#ifdef DEBUG_LEGATE - log_coll.debug( - "AlltoallvMPI i: %d === global_rank %d, mpi rank %d, send to %d (%d), send_tag %d, " - "recv from %d (%d), " - "recv_tag %d", - i, - global_rank, - global_comm->mpi_rank, - sendto_global_rank, - sendto_mpi_rank, - send_tag, - recvfrom_global_rank, - recvfrom_mpi_rank, - recv_tag); -#endif - CHECK_MPI(MPI_Sendrecv(src, - scount, - mpi_type, - sendto_mpi_rank, - send_tag, - dst, - rcount, - mpi_type, - recvfrom_mpi_rank, - recv_tag, - global_comm->comm, - &status)); - } - - return CollSuccess; -} - -} // namespace coll -} // namespace comm -} // namespace legate \ No newline at end of file diff --git a/src/core/comm/bcast_thread_mpi.cc b/src/core/comm/bcast_thread_mpi.cc deleted file mode 100644 index 6c7f77092..000000000 --- a/src/core/comm/bcast_thread_mpi.cc +++ /dev/null @@ -1,85 +0,0 @@ -/* Copyright 2022 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- * - */ - -#include -#include -#include -#include - -#include "coll.h" -#include "legion.h" - -namespace legate { -namespace comm { -namespace coll { - -using namespace Legion; -extern Logger log_coll; - -int bcastMPI(void* buf, int count, CollDataType type, int root, CollComm global_comm) -{ - int tag; - MPI_Status status; - - int total_size = global_comm->global_comm_size; - int global_rank = global_comm->global_rank; - - int root_mpi_rank = global_comm->mapping_table.mpi_rank[root]; - assert(root == global_comm->mapping_table.global_rank[root]); - - MPI_Datatype mpi_type = dtypeToMPIDtype(type); - - // non-root - if (global_rank != root) { - tag = generateBcastTag(global_rank, global_comm); -#ifdef DEBUG_LEGATE - log_coll.debug("BcastMPI: non-root recv global_rank %d, mpi rank %d, send to %d (%d), tag %d", - global_rank, - global_comm->mpi_rank, - root, - root_mpi_rank, - tag); -#endif - CHECK_MPI(MPI_Recv(buf, count, mpi_type, root_mpi_rank, tag, global_comm->comm, &status)); - return CollSuccess; - } - - // root - int sendto_mpi_rank; - for (int i = 0; i < total_size; i++) { - sendto_mpi_rank = global_comm->mapping_table.mpi_rank[i]; - assert(i == global_comm->mapping_table.global_rank[i]); - tag = generateBcastTag(i, global_comm); -#ifdef DEBUG_LEGATE - log_coll.debug("BcastMPI: root i %d === global_rank %d, mpi rank %d, send to %d (%d), tag %d", - i, - global_rank, - global_comm->mpi_rank, - i, - sendto_mpi_rank, - tag); -#endif - if (global_rank != i) { - CHECK_MPI(MPI_Send(buf, count, mpi_type, sendto_mpi_rank, tag, global_comm->comm)); - } - } - - return CollSuccess; -} - -} // namespace coll -} // namespace comm -} // namespace legate \ No newline at end of file diff --git a/src/core/comm/coll.cc b/src/core/comm/coll.cc index 6ca5d6787..5f1a1f4e9 100644 --- a/src/core/comm/coll.cc +++ b/src/core/comm/coll.cc @@ -38,128 +38,20 @@ namespace coll { using namespace Legion; Logger log_coll("coll"); -#ifdef LEGATE_USE_NETWORK - -enum CollTag : int { - BCAST_TAG = 0, - GATHER_TAG = 1, - ALLTOALL_TAG = 2, - ALLTOALLV_TAG = 3, - MAX_TAG = 10, -}; - -static int mpi_tag_ub = 0; - -static std::vector mpi_comms; -#else // undef LEGATE_USE_NETWORK -static std::vector thread_comms; -#endif - -static int current_unique_id = 0; - -static bool coll_inited = false; - -static bool self_mpi_init = false; +BackendNetwork* backend_network = nullptr; // functions start here -#ifdef LEGATE_USE_NETWORK -static inline std::pair mostFrequent(const int* arr, int n); -static inline int match2ranks(int rank1, int rank2, CollComm global_comm); -#endif - int collCommCreate(CollComm global_comm, int global_comm_size, int global_rank, int unique_id, const int* mapping_table) { - global_comm->global_comm_size = global_comm_size; - global_comm->global_rank = global_rank; - global_comm->status = true; - global_comm->unique_id = unique_id; -#ifdef LEGATE_USE_NETWORK - int mpi_rank, mpi_comm_size; - int *tag_ub, flag; - int compare_result; - MPI_Comm comm = mpi_comms[unique_id]; - CHECK_MPI(MPI_Comm_compare(comm, MPI_COMM_WORLD, &compare_result)); - assert(MPI_CONGRUENT == compare_result); - - CHECK_MPI(MPI_Comm_rank(comm, &mpi_rank)); - CHECK_MPI(MPI_Comm_size(comm, &mpi_comm_size)); - global_comm->mpi_comm_size = mpi_comm_size; - global_comm->mpi_rank = mpi_rank; - global_comm->comm = comm; - assert(mapping_table != nullptr); - global_comm->mapping_table.global_rank = (int*)malloc(sizeof(int) * global_comm_size); - global_comm->mapping_table.mpi_rank = (int*)malloc(sizeof(int) * global_comm_size); - 
memcpy(global_comm->mapping_table.mpi_rank, mapping_table, sizeof(int) * global_comm_size); - for (int i = 0; i < global_comm_size; i++) { global_comm->mapping_table.global_rank[i] = i; } - std::pair p = mostFrequent(mapping_table, global_comm_size); - global_comm->nb_threads = p.first; - global_comm->mpi_comm_size_actual = p.second; -#else - assert(mapping_table == nullptr); - global_comm->mpi_comm_size = 1; - global_comm->mpi_comm_size_actual = 1; - global_comm->mpi_rank = 0; - if (global_comm->global_rank == 0) { - pthread_barrier_init((pthread_barrier_t*)&(thread_comms[global_comm->unique_id]->barrier), - nullptr, - global_comm->global_comm_size); - thread_comms[global_comm->unique_id]->buffers = - (const void**)malloc(sizeof(void*) * global_comm_size); - thread_comms[global_comm->unique_id]->displs = - (const int**)malloc(sizeof(int*) * global_comm_size); - for (int i = 0; i < global_comm_size; i++) { - thread_comms[global_comm->unique_id]->buffers[i] = nullptr; - thread_comms[global_comm->unique_id]->displs[i] = nullptr; - } - __sync_synchronize(); - thread_comms[global_comm->unique_id]->ready_flag = true; - } - __sync_synchronize(); - volatile ThreadComm* data = thread_comms[global_comm->unique_id]; - while (data->ready_flag != true) { data = thread_comms[global_comm->unique_id]; } - global_comm->comm = thread_comms[global_comm->unique_id]; - barrierLocal(global_comm); - assert(global_comm->comm->ready_flag == true); - assert(global_comm->comm->buffers != nullptr); - assert(global_comm->comm->displs != nullptr); - global_comm->nb_threads = global_comm->global_comm_size; -#endif - return CollSuccess; + return backend_network->comm_create( + global_comm, global_comm_size, global_rank, unique_id, mapping_table); } -int collCommDestroy(CollComm global_comm) -{ -#ifdef LEGATE_USE_NETWORK - if (global_comm->mapping_table.global_rank != nullptr) { - free(global_comm->mapping_table.global_rank); - global_comm->mapping_table.global_rank = nullptr; - } - if (global_comm->mapping_table.mpi_rank != nullptr) { - free(global_comm->mapping_table.mpi_rank); - global_comm->mapping_table.mpi_rank = nullptr; - } -#else - barrierLocal(global_comm); - if (global_comm->global_rank == 0) { - pthread_barrier_destroy((pthread_barrier_t*)&(thread_comms[global_comm->unique_id]->barrier)); - free(thread_comms[global_comm->unique_id]->buffers); - thread_comms[global_comm->unique_id]->buffers = nullptr; - free(thread_comms[global_comm->unique_id]->displs); - thread_comms[global_comm->unique_id]->displs = nullptr; - __sync_synchronize(); - thread_comms[global_comm->unique_id]->ready_flag = false; - } - __sync_synchronize(); - volatile ThreadComm* data = thread_comms[global_comm->unique_id]; - while (data->ready_flag != false) { data = thread_comms[global_comm->unique_id]; } -#endif - global_comm->status = false; - return CollSuccess; -} +int collCommDestroy(CollComm global_comm) { return backend_network->comm_destroy(global_comm); } int collAlltoallv(const void* sendbuf, const int sendcounts[], @@ -185,13 +77,8 @@ int collAlltoallv(const void* sendbuf, global_comm->mpi_comm_size, global_comm->mpi_comm_size_actual, global_comm->nb_threads); -#ifdef LEGATE_USE_NETWORK - return alltoallvMPI( - sendbuf, sendcounts, sdispls, recvbuf, recvcounts, rdispls, type, global_comm); -#else - return alltoallvLocal( + return backend_network->alltoallv( sendbuf, sendcounts, sdispls, recvbuf, recvcounts, rdispls, type, global_comm); -#endif } int collAlltoall( @@ -212,11 +99,7 @@ int collAlltoall( global_comm->mpi_comm_size, 
global_comm->mpi_comm_size_actual, global_comm->nb_threads); -#ifdef LEGATE_USE_NETWORK - return alltoallMPI(sendbuf, recvbuf, count, type, global_comm); -#else - return alltoallLocal(sendbuf, recvbuf, count, type, global_comm); -#endif + return backend_network->alltoall(sendbuf, recvbuf, count, type, global_comm); } int collAllgather( @@ -232,288 +115,47 @@ int collAllgather( global_comm->mpi_comm_size, global_comm->mpi_comm_size_actual, global_comm->nb_threads); -#ifdef LEGATE_USE_NETWORK - return allgatherMPI(sendbuf, recvbuf, count, type, global_comm); -#else - return allgatherLocal(sendbuf, recvbuf, count, type, global_comm); -#endif + return backend_network->allgather(sendbuf, recvbuf, count, type, global_comm); } // called from main thread int collInit(int argc, char* argv[]) { - current_unique_id = 0; #ifdef LEGATE_USE_NETWORK - int init_flag = 0; - CHECK_MPI(MPI_Initialized(&init_flag)); - if (!init_flag) { - char* network = getenv("LEGATE_NEED_NETWORK"); - int need_network = 0; - if (network != nullptr) { need_network = atoi(network); } - if (need_network) { - log_coll.fatal( - "MPI has not been initialized, it should be initialized by " - "the networking backend."); - LEGATE_ABORT; - } else { - int provided; - MPI_Init_thread(0, 0, MPI_THREAD_MULTIPLE, &provided); - self_mpi_init = true; - } - } - int mpi_thread_model; - MPI_Query_thread(&mpi_thread_model); - if (mpi_thread_model != MPI_THREAD_MULTIPLE) { - log_coll.fatal( - "MPI has been initialized by others, but is not initialized with " - "MPI_THREAD_MULTIPLE"); - LEGATE_ABORT; + char* network = getenv("LEGATE_NEED_NETWORK"); + int need_network = 0; + if (network != nullptr) { need_network = atoi(network); } + if (need_network) { + backend_network = new MPINetwork(argc, argv); + } else { + backend_network = new LocalNetwork(argc, argv); } - // check - int *tag_ub, flag; - CHECK_MPI(MPI_Comm_get_attr(MPI_COMM_WORLD, MPI_TAG_UB, &tag_ub, &flag)); - assert(flag); - mpi_tag_ub = *tag_ub; - assert(mpi_comms.empty()); #else - assert(thread_comms.empty()); + backend_network = new LocalNetwork(argc, argv); #endif - coll_inited = true; return CollSuccess; } int collFinalize() { - assert(coll_inited == true); - coll_inited = false; -#ifdef LEGATE_USE_NETWORK - for (MPI_Comm& mpi_comm : mpi_comms) { CHECK_MPI(MPI_Comm_free(&mpi_comm)); } - mpi_comms.clear(); - int fina_flag = 0; - CHECK_MPI(MPI_Finalized(&fina_flag)); - if (fina_flag == 1) { - log_coll.fatal("MPI should not have been finalized"); - LEGATE_ABORT; - } - if (self_mpi_init) { CHECK_MPI(MPI_Finalize()); } -#else - for (ThreadComm* thread_comm : thread_comms) { - assert(!thread_comm->ready_flag); - free(thread_comm); - } - thread_comms.clear(); -#endif + delete backend_network; return CollSuccess; } -int collGetUniqueId(int* id) -{ - *id = current_unique_id; - current_unique_id++; - return CollSuccess; -} +int collInitComm() { return backend_network->init_comm(); } -int collInitComm() -{ - int id = 0; - collGetUniqueId(&id); -#ifdef LEGATE_USE_NETWORK -#ifdef DEBUG_LEGATE - int mpi_rank; - int send_id = id; - // check if all ranks get the same unique id - CHECK_MPI(MPI_Bcast(&send_id, 1, MPI_INT, 0, MPI_COMM_WORLD)); - assert(send_id == id); -#endif - assert(mpi_comms.size() == id); - // create mpi comm - MPI_Comm mpi_comm; - CHECK_MPI(MPI_Comm_dup(MPI_COMM_WORLD, &mpi_comm)); - mpi_comms.push_back(mpi_comm); -#else - assert(thread_comms.size() == id); - // create thread comm - ThreadComm* thread_comm = (ThreadComm*)malloc(sizeof(ThreadComm)); - thread_comm->ready_flag = 
false; - thread_comm->buffers = nullptr; - thread_comm->displs = nullptr; - thread_comms.push_back(thread_comm); -#endif - log_coll.debug("Init comm id %d", id); - return id; -} - -#ifdef LEGATE_USE_NETWORK -static inline std::pair mostFrequent(const int* arr, int n) -{ - std::unordered_map hash; - for (int i = 0; i < n; i++) hash[arr[i]]++; - - // find the max frequency - int max_count = 0; - std::unordered_map::iterator it; - for (it = hash.begin(); it != hash.end(); it++) { - if (max_count < it->second) { max_count = it->second; } - } - - return std::make_pair(max_count, hash.size()); -} - -static inline int match2ranks(int rank1, int rank2, CollComm global_comm) -{ - // tag: seg idx + rank_idx + tag - // send_tag = sendto_global_rank * 10000 + global_rank (concat 2 ranks) - // which dst seg it sends to (in dst rank) - // recv_tag = global_rank * 10000 + recvfrom_global_rank (concat 2 ranks) - // idx of current seg we are receving (in src/my rank) - // example: - // 00 | 01 | 02 | 03 - // 10 | 11 | 12 | 13 - // 20 | 21 | 22 | 23 - // 30 | 31 | 32 | 33 - // 01's send_tag = 10, 10's recv_tag = 10, match - // 12's send_tag = 21, 21's recv_tag = 21, match - - int tag; - // old tagging system for debug - // constexpr int const max_ranks = 10000; - // tag = rank1 * max_ranks + rank2; - - // new tagging system, if crash, switch to the old one - - tag = rank1 % global_comm->nb_threads * global_comm->global_comm_size + rank2; - - // Szudzik's Function, two numbers < 32768 - // if (rank1 >= rank2) { - // tag = rank1*rank1 + rank1 + rank2; - // } else { - // tag = rank1 + rank2*rank2; - // } - - // Cantor Pairing Function, two numbers < 32768 - // tag = (rank1 + rank2) * (rank1 + rank2 + 1) / 2 + rank1; - - return tag; -} - -MPI_Datatype dtypeToMPIDtype(CollDataType dtype) -{ - switch (dtype) { - case CollDataType::CollInt8: { - return MPI_INT8_T; - } - case CollDataType::CollChar: { - return MPI_CHAR; - } - case CollDataType::CollUint8: { - return MPI_UINT8_T; - } - case CollDataType::CollInt: { - return MPI_INT; - } - case CollDataType::CollUint32: { - return MPI_UINT32_T; - } - case CollDataType::CollInt64: { - return MPI_INT64_T; - } - case CollDataType::CollUint64: { - return MPI_UINT64_T; - } - case CollDataType::CollFloat: { - return MPI_FLOAT; - } - case CollDataType::CollDouble: { - return MPI_DOUBLE; - } - default: { - log_coll.fatal("Unknown datatype"); - LEGATE_ABORT; - return MPI_BYTE; - } - } -} - -int generateAlltoallTag(int rank1, int rank2, CollComm global_comm) -{ - int tag = match2ranks(rank1, rank2, global_comm) * CollTag::MAX_TAG + CollTag::ALLTOALL_TAG; - assert(tag <= mpi_tag_ub && tag > 0); - return tag; -} - -int generateAlltoallvTag(int rank1, int rank2, CollComm global_comm) -{ - int tag = match2ranks(rank1, rank2, global_comm) * CollTag::MAX_TAG + CollTag::ALLTOALLV_TAG; - assert(tag <= mpi_tag_ub && tag > 0); - return tag; -} - -int generateBcastTag(int rank, CollComm global_comm) -{ - int tag = rank * CollTag::MAX_TAG + CollTag::BCAST_TAG; - assert(tag <= mpi_tag_ub && tag >= 0); - return tag; -} - -int generateGatherTag(int rank, CollComm global_comm) -{ - int tag = rank * CollTag::MAX_TAG + CollTag::GATHER_TAG; - assert(tag <= mpi_tag_ub && tag > 0); - return tag; -} - -#else // undef LEGATE_USE_NETWORK -size_t getDtypeSize(CollDataType dtype) -{ - switch (dtype) { - case CollDataType::CollInt8: - case CollDataType::CollChar: { - return sizeof(char); - } - case CollDataType::CollUint8: { - return sizeof(uint8_t); - } - case CollDataType::CollInt: { - return 
sizeof(int); - } - case CollDataType::CollUint32: { - return sizeof(uint32_t); - } - case CollDataType::CollInt64: { - return sizeof(int64_t); - } - case CollDataType::CollUint64: { - return sizeof(uint64_t); - } - case CollDataType::CollFloat: { - return sizeof(float); - } - case CollDataType::CollDouble: { - return sizeof(double); - } - default: { - log_coll.fatal("Unknown datatype"); - LEGATE_ABORT; - return 0; - } - } -} +BackendNetwork::BackendNetwork() : coll_inited(false), current_unique_id(0) {} -void resetLocalBuffer(CollComm global_comm) -{ - int global_rank = global_comm->global_rank; - global_comm->comm->buffers[global_rank] = nullptr; - global_comm->comm->displs[global_rank] = nullptr; -} +BackendNetwork::~BackendNetwork() {} -void barrierLocal(CollComm global_comm) +int BackendNetwork::collGetUniqueId(int* id) { - assert(coll_inited == true); - pthread_barrier_wait(const_cast(&(global_comm->comm->barrier))); + *id = current_unique_id; + current_unique_id++; + return CollSuccess; } -#endif -void* allocateInplaceBuffer(const void* recvbuf, size_t size) +void* BackendNetwork::allocateInplaceBuffer(const void* recvbuf, size_t size) { void* sendbuf_tmp = malloc(size); assert(sendbuf_tmp != nullptr); diff --git a/src/core/comm/coll.h b/src/core/comm/coll.h index 0efd39add..08397f53e 100644 --- a/src/core/comm/coll.h +++ b/src/core/comm/coll.h @@ -22,7 +22,8 @@ #ifdef LEGATE_USE_NETWORK #include -#else +#endif + // If we aren't building with networking, we'll use pthread_barrier to // construct a communicator for thread-local communication. Mac OS // does not implement pthread barriers, so we need to include an @@ -32,26 +33,17 @@ #if !defined(_POSIX_BARRIERS) || (_POSIX_BARRIERS < 0) #include "core/comm/pthread_barrier.h" #endif -#endif namespace legate { namespace comm { namespace coll { #ifdef LEGATE_USE_NETWORK - -#define CHECK_MPI(expr) \ - do { \ - int result = (expr); \ - check_mpi(result, __FILE__, __LINE__); \ - } while (false) - struct RankMappingTable { int* mpi_rank; int* global_rank; }; - -#else +#endif struct ThreadComm { pthread_barrier_t barrier; @@ -59,7 +51,6 @@ struct ThreadComm { const void** buffers; const int** displs; }; -#endif enum class CollDataType : int { CollInt8 = 0, @@ -78,13 +69,17 @@ enum CollStatus : int { CollError = 1, }; +enum CollCommType : int { + CollMPI = 0, + CollLocal = 1, +}; + struct Coll_Comm { #ifdef LEGATE_USE_NETWORK - MPI_Comm comm; + MPI_Comm mpi_comm; RankMappingTable mapping_table; -#else - volatile ThreadComm* comm; #endif + volatile ThreadComm* local_comm; int mpi_rank; int mpi_comm_size; int mpi_comm_size_actual; @@ -97,6 +92,151 @@ struct Coll_Comm { typedef Coll_Comm* CollComm; +class BackendNetwork { + public: + BackendNetwork(); + virtual ~BackendNetwork(); + virtual int init_comm() = 0; + + virtual int comm_create(CollComm global_comm, + int global_comm_size, + int global_rank, + int unique_id, + const int* mapping_table) = 0; + + virtual int comm_destroy(CollComm global_comm) = 0; + + virtual int alltoallv(const void* sendbuf, + const int sendcounts[], + const int sdispls[], + void* recvbuf, + const int recvcounts[], + const int rdispls[], + CollDataType type, + CollComm global_comm) = 0; + + virtual int alltoall( + const void* sendbuf, void* recvbuf, int count, CollDataType type, CollComm global_comm) = 0; + + virtual int allgather( + const void* sendbuf, void* recvbuf, int count, CollDataType type, CollComm global_comm) = 0; + + protected: + int collGetUniqueId(int* id); + + void* allocateInplaceBuffer(const void* 
recvbuf, size_t size); + + public: + CollCommType comm_type; + + protected: + bool coll_inited; + int current_unique_id; +}; + +#ifdef LEGATE_USE_NETWORK +class MPINetwork : public BackendNetwork { + public: + MPINetwork(int argc, char* argv[]); + + ~MPINetwork(); + + int init_comm(); + + int comm_create(CollComm global_comm, + int global_comm_size, + int global_rank, + int unique_id, + const int* mapping_table); + + int comm_destroy(CollComm global_comm); + + int alltoallv(const void* sendbuf, + const int sendcounts[], + const int sdispls[], + void* recvbuf, + const int recvcounts[], + const int rdispls[], + CollDataType type, + CollComm global_comm); + + int alltoall( + const void* sendbuf, void* recvbuf, int count, CollDataType type, CollComm global_comm); + + int allgather( + const void* sendbuf, void* recvbuf, int count, CollDataType type, CollComm global_comm); + + protected: + int gather(const void* sendbuf, + void* recvbuf, + int count, + CollDataType type, + int root, + CollComm global_comm); + + int bcast(void* buf, int count, CollDataType type, int root, CollComm global_comm); + + MPI_Datatype dtypeToMPIDtype(CollDataType dtype); + + int generateAlltoallTag(int rank1, int rank2, CollComm global_comm); + + int generateAlltoallvTag(int rank1, int rank2, CollComm global_comm); + + int generateBcastTag(int rank, CollComm global_comm); + + int generateGatherTag(int rank, CollComm global_comm); + + private: + int mpi_tag_ub; + bool self_init_mpi; + std::vector mpi_comms; +}; +#endif + +class LocalNetwork : public BackendNetwork { + public: + LocalNetwork(int argc, char* argv[]); + + ~LocalNetwork(); + + int init_comm(); + + int comm_create(CollComm global_comm, + int global_comm_size, + int global_rank, + int unique_id, + const int* mapping_table); + + int comm_destroy(CollComm global_comm); + + int alltoallv(const void* sendbuf, + const int sendcounts[], + const int sdispls[], + void* recvbuf, + const int recvcounts[], + const int rdispls[], + CollDataType type, + CollComm global_comm); + + int alltoall( + const void* sendbuf, void* recvbuf, int count, CollDataType type, CollComm global_comm); + + int allgather( + const void* sendbuf, void* recvbuf, int count, CollDataType type, CollComm global_comm); + + protected: + size_t getDtypeSize(CollDataType dtype); + + void resetLocalBuffer(CollComm global_comm); + + void barrierLocal(CollComm global_comm); + + private: + std::vector thread_comms; +}; + +extern BackendNetwork* backend_network; + int collCommCreate(CollComm global_comm, int global_comm_size, int global_rank, @@ -124,81 +264,8 @@ int collInit(int argc, char* argv[]); int collFinalize(); -int collGetUniqueId(int* id); - int collInitComm(); -// The following functions should not be called by users -#ifdef LEGATE_USE_NETWORK -int alltoallvMPI(const void* sendbuf, - const int sendcounts[], - const int sdispls[], - void* recvbuf, - const int recvcounts[], - const int rdispls[], - CollDataType type, - CollComm global_comm); - -int alltoallMPI( - const void* sendbuf, void* recvbuf, int count, CollDataType type, CollComm global_comm); - -int gatherMPI( - const void* sendbuf, void* recvbuf, int count, CollDataType type, int root, CollComm global_comm); - -int allgatherMPI( - const void* sendbuf, void* recvbuf, int count, CollDataType type, CollComm global_comm); - -int bcastMPI(void* buf, int count, CollDataType type, int root, CollComm global_comm); - -MPI_Datatype dtypeToMPIDtype(CollDataType dtype); - -int generateAlltoallTag(int rank1, int rank2, CollComm global_comm); - -int 
generateAlltoallvTag(int rank1, int rank2, CollComm global_comm); - -int generateBcastTag(int rank, CollComm global_comm); - -int generateGatherTag(int rank, CollComm global_comm); -#else -size_t getDtypeSize(CollDataType dtype); - -int alltoallvLocal(const void* sendbuf, - const int sendcounts[], - const int sdispls[], - void* recvbuf, - const int recvcounts[], - const int rdispls[], - CollDataType type, - CollComm global_comm); - -int alltoallLocal( - const void* sendbuf, void* recvbuf, int count, CollDataType type, CollComm global_comm); - -int allgatherLocal( - const void* sendbuf, void* recvbuf, int count, CollDataType type, CollComm global_comm); - -void resetLocalBuffer(CollComm global_comm); - -void barrierLocal(CollComm global_comm); -#endif - -void* allocateInplaceBuffer(const void* recvbuf, size_t size); - -#ifdef LEGATE_USE_NETWORK -inline void check_mpi(int error, const char* file, int line) -{ - if (error != MPI_SUCCESS) { - fprintf( - stderr, "Internal MPI failure with error code %d in file %s at line %d\n", error, file, line); -#ifdef DEBUG_LEGATE - assert(false); -#else - exit(error); -#endif - } -} -#endif - } // namespace coll } // namespace comm } // namespace legate diff --git a/src/core/comm/comm_cpu.cc b/src/core/comm/comm_cpu.cc index 161bce446..05c2f6283 100644 --- a/src/core/comm/comm_cpu.cc +++ b/src/core/comm/comm_cpu.cc @@ -33,7 +33,9 @@ static int init_cpucoll_mapping(const Legion::Task* task, Core::show_progress(task, context, runtime, task->get_task_name()); int mpi_rank = 0; #if defined(LEGATE_USE_NETWORK) - MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); + if (coll::backend_network->comm_type == coll::CollCommType::CollMPI) { + MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); + } #endif return mpi_rank; @@ -55,17 +57,20 @@ static coll::CollComm init_cpucoll(const Legion::Task* task, coll::CollComm comm = (coll::CollComm)malloc(sizeof(coll::Coll_Comm)); #ifdef LEGATE_USE_NETWORK - int* mapping_table = (int*)malloc(sizeof(int) * num_ranks); - for (int i = 0; i < num_ranks; i++) { - const int mapping_table_element = task->futures[i + 1].get_result(); - mapping_table[i] = mapping_table_element; - } - coll::collCommCreate(comm, num_ranks, point, unique_id, mapping_table); - assert(mapping_table[point] == comm->mpi_rank); - free(mapping_table); -#else - coll::collCommCreate(comm, num_ranks, point, unique_id, nullptr); + if (coll::backend_network->comm_type == coll::CollCommType::CollMPI) { + int* mapping_table = (int*)malloc(sizeof(int) * num_ranks); + for (int i = 0; i < num_ranks; i++) { + const int mapping_table_element = task->futures[i + 1].get_result(); + mapping_table[i] = mapping_table_element; + } + coll::collCommCreate(comm, num_ranks, point, unique_id, mapping_table); + assert(mapping_table[point] == comm->mpi_rank); + free(mapping_table); + } else #endif + { + coll::collCommCreate(comm, num_ranks, point, unique_id, nullptr); + } return comm; } diff --git a/src/core/comm/gather_thread_mpi.cc b/src/core/comm/gather_thread_mpi.cc deleted file mode 100644 index 4ba7fe455..000000000 --- a/src/core/comm/gather_thread_mpi.cc +++ /dev/null @@ -1,100 +0,0 @@ -/* Copyright 2022 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * - */ - -#include -#include -#include -#include - -#include "coll.h" -#include "legion.h" - -namespace legate { -namespace comm { -namespace coll { - -using namespace Legion; -extern Logger log_coll; - -int gatherMPI( - const void* sendbuf, void* recvbuf, int count, CollDataType type, int root, CollComm global_comm) -{ - MPI_Status status; - - int total_size = global_comm->global_comm_size; - int global_rank = global_comm->global_rank; - - MPI_Datatype mpi_type = dtypeToMPIDtype(type); - - // Should not see inplace here - if (sendbuf == recvbuf) { assert(0); } - - int root_mpi_rank = global_comm->mapping_table.mpi_rank[root]; - assert(root == global_comm->mapping_table.global_rank[root]); - - int tag; - - // non-root - if (global_rank != root) { - tag = generateGatherTag(global_rank, global_comm); -#ifdef DEBUG_LEGATE - log_coll.debug("GatherMPI: non-root send global_rank %d, mpi rank %d, send to %d (%d), tag %d", - global_rank, - global_comm->mpi_rank, - root, - root_mpi_rank, - tag); -#endif - CHECK_MPI(MPI_Send(sendbuf, count, mpi_type, root_mpi_rank, tag, global_comm->comm)); - return CollSuccess; - } - - // root - MPI_Aint incr, lb, type_extent; - MPI_Type_get_extent(mpi_type, &lb, &type_extent); - incr = type_extent * static_cast(count); - char* dst = static_cast(recvbuf); - int recvfrom_mpi_rank; - for (int i = 0; i < total_size; i++) { - recvfrom_mpi_rank = global_comm->mapping_table.mpi_rank[i]; - assert(i == global_comm->mapping_table.global_rank[i]); - tag = generateGatherTag(i, global_comm); -#ifdef DEBUG_LEGATE - log_coll.debug( - "GatherMPI: root i %d === global_rank %d, mpi rank %d, recv %p, from %d (%d), tag %d", - i, - global_rank, - global_comm->mpi_rank, - dst, - i, - recvfrom_mpi_rank, - tag); -#endif - assert(dst != nullptr); - if (global_rank == i) { - memcpy(dst, sendbuf, incr); - } else { - CHECK_MPI(MPI_Recv(dst, count, mpi_type, recvfrom_mpi_rank, tag, global_comm->comm, &status)); - } - dst += incr; - } - - return CollSuccess; -} - -} // namespace coll -} // namespace comm -} // namespace legate \ No newline at end of file diff --git a/src/core/comm/local_comm.cc b/src/core/comm/local_comm.cc new file mode 100644 index 000000000..8adc4a2f3 --- /dev/null +++ b/src/core/comm/local_comm.cc @@ -0,0 +1,351 @@ +/* Copyright 2022 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ + +#include +#include +#include +#include + +#include "coll.h" +#include "legate.h" +#include "legion.h" + +namespace legate { +namespace comm { +namespace coll { + +using namespace Legion; +extern Logger log_coll; + +// public functions start from here + +LocalNetwork::LocalNetwork(int argc, char* argv[]) : BackendNetwork() +{ + log_coll.debug("Enable LocalNetwork"); + assert(current_unique_id == 0); + assert(thread_comms.empty()); + BackendNetwork::coll_inited = true; + BackendNetwork::comm_type = CollCommType::CollLocal; +} + +LocalNetwork::~LocalNetwork() +{ + log_coll.debug("Finalize LocalNetwork"); + assert(BackendNetwork::coll_inited == true); + for (ThreadComm* thread_comm : thread_comms) { + assert(!thread_comm->ready_flag); + free(thread_comm); + } + thread_comms.clear(); + BackendNetwork::coll_inited = false; +} + +int LocalNetwork::comm_create(CollComm global_comm, + int global_comm_size, + int global_rank, + int unique_id, + const int* mapping_table) +{ + global_comm->global_comm_size = global_comm_size; + global_comm->global_rank = global_rank; + global_comm->status = true; + global_comm->unique_id = unique_id; + assert(mapping_table == nullptr); + global_comm->mpi_comm_size = 1; + global_comm->mpi_comm_size_actual = 1; + global_comm->mpi_rank = 0; + if (global_comm->global_rank == 0) { + pthread_barrier_init((pthread_barrier_t*)&(thread_comms[global_comm->unique_id]->barrier), + nullptr, + global_comm->global_comm_size); + thread_comms[global_comm->unique_id]->buffers = + (const void**)malloc(sizeof(void*) * global_comm_size); + thread_comms[global_comm->unique_id]->displs = + (const int**)malloc(sizeof(int*) * global_comm_size); + for (int i = 0; i < global_comm_size; i++) { + thread_comms[global_comm->unique_id]->buffers[i] = nullptr; + thread_comms[global_comm->unique_id]->displs[i] = nullptr; + } + __sync_synchronize(); + thread_comms[global_comm->unique_id]->ready_flag = true; + } + __sync_synchronize(); + volatile ThreadComm* data = thread_comms[global_comm->unique_id]; + while (data->ready_flag != true) { data = thread_comms[global_comm->unique_id]; } + global_comm->local_comm = thread_comms[global_comm->unique_id]; + barrierLocal(global_comm); + assert(global_comm->local_comm->ready_flag == true); + assert(global_comm->local_comm->buffers != nullptr); + assert(global_comm->local_comm->displs != nullptr); + global_comm->nb_threads = global_comm->global_comm_size; + return CollSuccess; +} + +int LocalNetwork::comm_destroy(CollComm global_comm) +{ + barrierLocal(global_comm); + if (global_comm->global_rank == 0) { + pthread_barrier_destroy((pthread_barrier_t*)&(thread_comms[global_comm->unique_id]->barrier)); + free(thread_comms[global_comm->unique_id]->buffers); + thread_comms[global_comm->unique_id]->buffers = nullptr; + free(thread_comms[global_comm->unique_id]->displs); + thread_comms[global_comm->unique_id]->displs = nullptr; + __sync_synchronize(); + thread_comms[global_comm->unique_id]->ready_flag = false; + } + __sync_synchronize(); + volatile ThreadComm* data = thread_comms[global_comm->unique_id]; + while (data->ready_flag != false) { data = thread_comms[global_comm->unique_id]; } + global_comm->status = false; + return CollSuccess; +} + +int LocalNetwork::init_comm() +{ + int id = 0; + collGetUniqueId(&id); + assert(thread_comms.size() == id); + // create thread comm + ThreadComm* thread_comm = (ThreadComm*)malloc(sizeof(ThreadComm)); + thread_comm->ready_flag = false; + thread_comm->buffers = nullptr; + thread_comm->displs = nullptr; + 
thread_comms.push_back(thread_comm); + log_coll.debug("Init comm id %d", id); + return id; +} + +int LocalNetwork::alltoallv(const void* sendbuf, + const int sendcounts[], + const int sdispls[], + void* recvbuf, + const int recvcounts[], + const int rdispls[], + CollDataType type, + CollComm global_comm) +{ + int res; + + int total_size = global_comm->global_comm_size; + int global_rank = global_comm->global_rank; + + int type_extent = getDtypeSize(type); + + global_comm->local_comm->displs[global_rank] = sdispls; + global_comm->local_comm->buffers[global_rank] = sendbuf; + __sync_synchronize(); + + int recvfrom_global_rank; + int recvfrom_seg_id = global_rank; + const void* src_base = nullptr; + const int* displs = nullptr; + for (int i = 1; i < total_size + 1; i++) { + recvfrom_global_rank = (global_rank + total_size - i) % total_size; + // wait for other threads to update the buffer address + while (global_comm->local_comm->buffers[recvfrom_global_rank] == nullptr || + global_comm->local_comm->displs[recvfrom_global_rank] == nullptr) + ; + src_base = global_comm->local_comm->buffers[recvfrom_global_rank]; + displs = global_comm->local_comm->displs[recvfrom_global_rank]; + char* src = static_cast(const_cast(src_base)) + + static_cast(displs[recvfrom_seg_id]) * type_extent; + char* dst = static_cast(recvbuf) + + static_cast(rdispls[recvfrom_global_rank]) * type_extent; +#ifdef DEBUG_LEGATE + log_coll.debug( + "AlltoallvLocal i: %d === global_rank %d, dtype %d, copy rank %d (seg %d, sdispls %d, %p) to " + "rank %d (seg " + "%d, rdispls %d, %p)", + i, + global_rank, + type_extent, + recvfrom_global_rank, + recvfrom_seg_id, + sdispls[recvfrom_seg_id], + src, + global_rank, + recvfrom_global_rank, + rdispls[recvfrom_global_rank], + dst); +#endif + memcpy(dst, src, recvcounts[recvfrom_global_rank] * type_extent); + } + + barrierLocal(global_comm); + + __sync_synchronize(); + + resetLocalBuffer(global_comm); + barrierLocal(global_comm); + + return CollSuccess; +} + +int LocalNetwork::alltoall( + const void* sendbuf, void* recvbuf, int count, CollDataType type, CollComm global_comm) +{ + int res; + + int total_size = global_comm->global_comm_size; + int global_rank = global_comm->global_rank; + + int type_extent = getDtypeSize(type); + + global_comm->local_comm->buffers[global_rank] = sendbuf; + __sync_synchronize(); + + int recvfrom_global_rank; + int recvfrom_seg_id = global_rank; + const void* src_base = nullptr; + for (int i = 1; i < total_size + 1; i++) { + recvfrom_global_rank = (global_rank + total_size - i) % total_size; + // wait for other threads to update the buffer address + while (global_comm->local_comm->buffers[recvfrom_global_rank] == nullptr) + ; + src_base = global_comm->local_comm->buffers[recvfrom_global_rank]; + char* src = static_cast(const_cast(src_base)) + + static_cast(recvfrom_seg_id) * type_extent * count; + char* dst = static_cast(recvbuf) + + static_cast(recvfrom_global_rank) * type_extent * count; +#ifdef DEBUG_LEGATE + log_coll.debug( + "AlltoallLocal i: %d === global_rank %d, dtype %d, copy rank %d (seg %d, %p) to rank %d (seg " + "%d, %p)", + i, + global_rank, + type_extent, + recvfrom_global_rank, + recvfrom_seg_id, + src, + global_rank, + recvfrom_global_rank, + dst); +#endif + memcpy(dst, src, count * type_extent); + } + + barrierLocal(global_comm); + + __sync_synchronize(); + + resetLocalBuffer(global_comm); + barrierLocal(global_comm); + + return CollSuccess; +} + +int LocalNetwork::allgather( + const void* sendbuf, void* recvbuf, int count, CollDataType 
type, CollComm global_comm) +{ + int total_size = global_comm->global_comm_size; + int global_rank = global_comm->global_rank; + + int type_extent = getDtypeSize(type); + + const void* sendbuf_tmp = sendbuf; + + // MPI_IN_PLACE + if (sendbuf == recvbuf) { sendbuf_tmp = allocateInplaceBuffer(recvbuf, type_extent * count); } + + global_comm->local_comm->buffers[global_rank] = sendbuf_tmp; + __sync_synchronize(); + + for (int recvfrom_global_rank = 0; recvfrom_global_rank < total_size; recvfrom_global_rank++) { + // wait for other threads to update the buffer address + while (global_comm->local_comm->buffers[recvfrom_global_rank] == nullptr) + ; + const void* src = global_comm->local_comm->buffers[recvfrom_global_rank]; + char* dst = static_cast(recvbuf) + + static_cast(recvfrom_global_rank) * type_extent * count; +#ifdef DEBUG_LEGATE + log_coll.debug( + "AllgatherLocal i: %d === global_rank %d, dtype %d, copy rank %d (%p) to rank %d (%p)", + recvfrom_global_rank, + global_rank, + type_extent, + recvfrom_global_rank, + src, + global_rank, + dst); +#endif + memcpy(dst, src, count * type_extent); + } + + barrierLocal(global_comm); + if (sendbuf == recvbuf) { free(const_cast(sendbuf_tmp)); } + + __sync_synchronize(); + + resetLocalBuffer(global_comm); + barrierLocal(global_comm); + + return CollSuccess; +} + +// protected functions start from here + +size_t LocalNetwork::getDtypeSize(CollDataType dtype) +{ + switch (dtype) { + case CollDataType::CollInt8: + case CollDataType::CollChar: { + return sizeof(char); + } + case CollDataType::CollUint8: { + return sizeof(uint8_t); + } + case CollDataType::CollInt: { + return sizeof(int); + } + case CollDataType::CollUint32: { + return sizeof(uint32_t); + } + case CollDataType::CollInt64: { + return sizeof(int64_t); + } + case CollDataType::CollUint64: { + return sizeof(uint64_t); + } + case CollDataType::CollFloat: { + return sizeof(float); + } + case CollDataType::CollDouble: { + return sizeof(double); + } + default: { + log_coll.fatal("Unknown datatype"); + LEGATE_ABORT; + return 0; + } + } +} + +void LocalNetwork::resetLocalBuffer(CollComm global_comm) +{ + int global_rank = global_comm->global_rank; + global_comm->local_comm->buffers[global_rank] = nullptr; + global_comm->local_comm->displs[global_rank] = nullptr; +} + +void LocalNetwork::barrierLocal(CollComm global_comm) +{ + assert(BackendNetwork::coll_inited == true); + pthread_barrier_wait(const_cast(&(global_comm->local_comm->barrier))); +} + +} // namespace coll +} // namespace comm +} // namespace legate \ No newline at end of file diff --git a/src/core/comm/mpi_comm.cc b/src/core/comm/mpi_comm.cc new file mode 100644 index 000000000..1761701ff --- /dev/null +++ b/src/core/comm/mpi_comm.cc @@ -0,0 +1,575 @@ +/* Copyright 2022 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ + +#include +#include +#include +#include + +#include "coll.h" +#include "legate.h" +#include "legion.h" + +namespace legate { +namespace comm { +namespace coll { + +using namespace Legion; +extern Logger log_coll; + +enum CollTag : int { + BCAST_TAG = 0, + GATHER_TAG = 1, + ALLTOALL_TAG = 2, + ALLTOALLV_TAG = 3, + MAX_TAG = 10, +}; + +static inline std::pair mostFrequent(const int* arr, int n); +static inline int match2ranks(int rank1, int rank2, CollComm global_comm); + +inline void check_mpi(int error, const char* file, int line) +{ + if (error != MPI_SUCCESS) { + fprintf( + stderr, "Internal MPI failure with error code %d in file %s at line %d\n", error, file, line); +#ifdef DEBUG_LEGATE + assert(false); +#else + exit(error); +#endif + } +} + +#define CHECK_MPI(expr) \ + do { \ + int result = (expr); \ + check_mpi(result, __FILE__, __LINE__); \ + } while (false) + +// public functions start from here + +MPINetwork::MPINetwork(int argc, char* argv[]) + : BackendNetwork(), mpi_tag_ub(0), self_init_mpi(false) +{ + log_coll.debug("Enable MPINetwork"); + assert(current_unique_id == 0); + int provided, init_flag = 0; + CHECK_MPI(MPI_Initialized(&init_flag)); + if (!init_flag) { + log_coll.fatal( + "MPI has not been initialized, it should be initialized by " + "the networking backend."); + LEGATE_ABORT; + } + int mpi_thread_model; + MPI_Query_thread(&mpi_thread_model); + if (mpi_thread_model != MPI_THREAD_MULTIPLE) { + log_coll.fatal( + "MPI has been initialized by others, but is not initialized with " + "MPI_THREAD_MULTIPLE"); + LEGATE_ABORT; + } + // check + int *tag_ub, flag; + CHECK_MPI(MPI_Comm_get_attr(MPI_COMM_WORLD, MPI_TAG_UB, &tag_ub, &flag)); + assert(flag); + mpi_tag_ub = *tag_ub; + assert(mpi_comms.empty()); + BackendNetwork::coll_inited = true; + BackendNetwork::comm_type = CollCommType::CollMPI; +} + +MPINetwork::~MPINetwork() +{ + log_coll.debug("Finalize MPINetwork"); + assert(BackendNetwork::coll_inited == true); + for (MPI_Comm& mpi_comm : mpi_comms) { CHECK_MPI(MPI_Comm_free(&mpi_comm)); } + mpi_comms.clear(); + int fina_flag = 0; + CHECK_MPI(MPI_Finalized(&fina_flag)); + if (fina_flag == 1) { + log_coll.fatal("MPI should not have been finalized"); + LEGATE_ABORT; + } + if (self_init_mpi) { + MPI_Finalize(); + printf("finalize mpi\n"); + } + BackendNetwork::coll_inited = false; +} + +int MPINetwork::init_comm() +{ + int id = 0; + collGetUniqueId(&id); +#ifdef DEBUG_LEGATE + int mpi_rank; + int send_id = id; + // check if all ranks get the same unique id + CHECK_MPI(MPI_Bcast(&send_id, 1, MPI_INT, 0, MPI_COMM_WORLD)); + assert(send_id == id); +#endif + assert(mpi_comms.size() == id); + // create mpi comm + MPI_Comm mpi_comm; + CHECK_MPI(MPI_Comm_dup(MPI_COMM_WORLD, &mpi_comm)); + mpi_comms.push_back(mpi_comm); + log_coll.debug("Init comm id %d", id); + return id; +} + +int MPINetwork::comm_create(CollComm global_comm, + int global_comm_size, + int global_rank, + int unique_id, + const int* mapping_table) +{ + global_comm->global_comm_size = global_comm_size; + global_comm->global_rank = global_rank; + global_comm->status = true; + global_comm->unique_id = unique_id; + int mpi_rank, mpi_comm_size; + int *tag_ub, flag; + int compare_result; + MPI_Comm comm = mpi_comms[unique_id]; + CHECK_MPI(MPI_Comm_compare(comm, MPI_COMM_WORLD, &compare_result)); + assert(MPI_CONGRUENT == compare_result); + + CHECK_MPI(MPI_Comm_rank(comm, &mpi_rank)); + CHECK_MPI(MPI_Comm_size(comm, &mpi_comm_size)); + global_comm->mpi_comm_size = mpi_comm_size; + global_comm->mpi_rank = mpi_rank; 
+ global_comm->mpi_comm = comm; + assert(mapping_table != nullptr); + global_comm->mapping_table.global_rank = (int*)malloc(sizeof(int) * global_comm_size); + global_comm->mapping_table.mpi_rank = (int*)malloc(sizeof(int) * global_comm_size); + memcpy(global_comm->mapping_table.mpi_rank, mapping_table, sizeof(int) * global_comm_size); + for (int i = 0; i < global_comm_size; i++) { global_comm->mapping_table.global_rank[i] = i; } + std::pair p = mostFrequent(mapping_table, global_comm_size); + global_comm->nb_threads = p.first; + global_comm->mpi_comm_size_actual = p.second; + return CollSuccess; +} + +int MPINetwork::comm_destroy(CollComm global_comm) +{ + if (global_comm->mapping_table.global_rank != nullptr) { + free(global_comm->mapping_table.global_rank); + global_comm->mapping_table.global_rank = nullptr; + } + if (global_comm->mapping_table.mpi_rank != nullptr) { + free(global_comm->mapping_table.mpi_rank); + global_comm->mapping_table.mpi_rank = nullptr; + } + global_comm->status = false; + return CollSuccess; +} + +int MPINetwork::alltoallv(const void* sendbuf, + const int sendcounts[], + const int sdispls[], + void* recvbuf, + const int recvcounts[], + const int rdispls[], + CollDataType type, + CollComm global_comm) +{ + MPI_Status status; + + int total_size = global_comm->global_comm_size; + int global_rank = global_comm->global_rank; + + MPI_Datatype mpi_type = dtypeToMPIDtype(type); + + MPI_Aint lb, type_extent; + MPI_Type_get_extent(mpi_type, &lb, &type_extent); + + int sendto_global_rank, recvfrom_global_rank, sendto_mpi_rank, recvfrom_mpi_rank; + for (int i = 1; i < total_size + 1; i++) { + sendto_global_rank = (global_rank + i) % total_size; + recvfrom_global_rank = (global_rank + total_size - i) % total_size; + char* src = static_cast(const_cast(sendbuf)) + + static_cast(sdispls[sendto_global_rank]) * type_extent; + char* dst = static_cast(recvbuf) + + static_cast(rdispls[recvfrom_global_rank]) * type_extent; + int scount = sendcounts[sendto_global_rank]; + int rcount = recvcounts[recvfrom_global_rank]; + sendto_mpi_rank = global_comm->mapping_table.mpi_rank[sendto_global_rank]; + recvfrom_mpi_rank = global_comm->mapping_table.mpi_rank[recvfrom_global_rank]; + assert(sendto_global_rank == global_comm->mapping_table.global_rank[sendto_global_rank]); + assert(recvfrom_global_rank == global_comm->mapping_table.global_rank[recvfrom_global_rank]); + // tag: seg idx + rank_idx + tag + int send_tag = generateAlltoallvTag(sendto_global_rank, global_rank, global_comm); + int recv_tag = generateAlltoallvTag(global_rank, recvfrom_global_rank, global_comm); +#ifdef DEBUG_LEGATE + log_coll.debug( + "AlltoallvMPI i: %d === global_rank %d, mpi rank %d, send to %d (%d), send_tag %d, " + "recv from %d (%d), " + "recv_tag %d", + i, + global_rank, + global_comm->mpi_rank, + sendto_global_rank, + sendto_mpi_rank, + send_tag, + recvfrom_global_rank, + recvfrom_mpi_rank, + recv_tag); +#endif + CHECK_MPI(MPI_Sendrecv(src, + scount, + mpi_type, + sendto_mpi_rank, + send_tag, + dst, + rcount, + mpi_type, + recvfrom_mpi_rank, + recv_tag, + global_comm->mpi_comm, + &status)); + } + + return CollSuccess; +} + +int MPINetwork::alltoall( + const void* sendbuf, void* recvbuf, int count, CollDataType type, CollComm global_comm) +{ + MPI_Status status; + + int total_size = global_comm->global_comm_size; + int global_rank = global_comm->global_rank; + + MPI_Datatype mpi_type = dtypeToMPIDtype(type); + + MPI_Aint lb, type_extent; + MPI_Type_get_extent(mpi_type, &lb, &type_extent); + + int 
sendto_global_rank, recvfrom_global_rank, sendto_mpi_rank, recvfrom_mpi_rank; + for (int i = 1; i < total_size + 1; i++) { + sendto_global_rank = (global_rank + i) % total_size; + recvfrom_global_rank = (global_rank + total_size - i) % total_size; + char* src = static_cast(const_cast(sendbuf)) + + static_cast(sendto_global_rank) * type_extent * count; + char* dst = static_cast(recvbuf) + + static_cast(recvfrom_global_rank) * type_extent * count; + sendto_mpi_rank = global_comm->mapping_table.mpi_rank[sendto_global_rank]; + recvfrom_mpi_rank = global_comm->mapping_table.mpi_rank[recvfrom_global_rank]; + assert(sendto_global_rank == global_comm->mapping_table.global_rank[sendto_global_rank]); + assert(recvfrom_global_rank == global_comm->mapping_table.global_rank[recvfrom_global_rank]); + // tag: seg idx + rank_idx + tag + int send_tag = generateAlltoallTag(sendto_global_rank, global_rank, global_comm); + int recv_tag = generateAlltoallTag(global_rank, recvfrom_global_rank, global_comm); +#ifdef DEBUG_LEGATE + log_coll.debug( + "AlltoallMPI i: %d === global_rank %d, mpi rank %d, send to %d (%d), send_tag %d, " + "recv from %d (%d), " + "recv_tag %d", + i, + global_rank, + global_comm->mpi_rank, + sendto_global_rank, + sendto_mpi_rank, + send_tag, + recvfrom_global_rank, + recvfrom_mpi_rank, + recv_tag); +#endif + CHECK_MPI(MPI_Sendrecv(src, + count, + mpi_type, + sendto_mpi_rank, + send_tag, + dst, + count, + mpi_type, + recvfrom_mpi_rank, + recv_tag, + global_comm->mpi_comm, + &status)); + } + + return CollSuccess; +} + +int MPINetwork::allgather( + const void* sendbuf, void* recvbuf, int count, CollDataType type, CollComm global_comm) +{ + int total_size = global_comm->global_comm_size; + int global_rank = global_comm->global_rank; + + MPI_Datatype mpi_type = dtypeToMPIDtype(type); + + MPI_Aint lb, type_extent; + MPI_Type_get_extent(mpi_type, &lb, &type_extent); + + void* sendbuf_tmp = const_cast(sendbuf); + + // MPI_IN_PLACE + if (sendbuf == recvbuf) { sendbuf_tmp = allocateInplaceBuffer(recvbuf, type_extent * count); } + + gather(sendbuf_tmp, recvbuf, count, type, 0, global_comm); + + bcast(recvbuf, count * total_size, type, 0, global_comm); + + if (sendbuf == recvbuf) { free(sendbuf_tmp); } + + return CollSuccess; +} + +int MPINetwork::gather( + const void* sendbuf, void* recvbuf, int count, CollDataType type, int root, CollComm global_comm) +{ + MPI_Status status; + + int total_size = global_comm->global_comm_size; + int global_rank = global_comm->global_rank; + + MPI_Datatype mpi_type = dtypeToMPIDtype(type); + + // Should not see inplace here + if (sendbuf == recvbuf) { assert(0); } + + int root_mpi_rank = global_comm->mapping_table.mpi_rank[root]; + assert(root == global_comm->mapping_table.global_rank[root]); + + int tag; + + // non-root + if (global_rank != root) { + tag = generateGatherTag(global_rank, global_comm); +#ifdef DEBUG_LEGATE + log_coll.debug("GatherMPI: non-root send global_rank %d, mpi rank %d, send to %d (%d), tag %d", + global_rank, + global_comm->mpi_rank, + root, + root_mpi_rank, + tag); +#endif + CHECK_MPI(MPI_Send(sendbuf, count, mpi_type, root_mpi_rank, tag, global_comm->mpi_comm)); + return CollSuccess; + } + + // root + MPI_Aint incr, lb, type_extent; + MPI_Type_get_extent(mpi_type, &lb, &type_extent); + incr = type_extent * static_cast(count); + char* dst = static_cast(recvbuf); + int recvfrom_mpi_rank; + for (int i = 0; i < total_size; i++) { + recvfrom_mpi_rank = global_comm->mapping_table.mpi_rank[i]; + assert(i == 
global_comm->mapping_table.global_rank[i]); + tag = generateGatherTag(i, global_comm); +#ifdef DEBUG_LEGATE + log_coll.debug( + "GatherMPI: root i %d === global_rank %d, mpi rank %d, recv %p, from %d (%d), tag %d", + i, + global_rank, + global_comm->mpi_rank, + dst, + i, + recvfrom_mpi_rank, + tag); +#endif + assert(dst != nullptr); + if (global_rank == i) { + memcpy(dst, sendbuf, incr); + } else { + CHECK_MPI( + MPI_Recv(dst, count, mpi_type, recvfrom_mpi_rank, tag, global_comm->mpi_comm, &status)); + } + dst += incr; + } + + return CollSuccess; +} + +int MPINetwork::bcast(void* buf, int count, CollDataType type, int root, CollComm global_comm) +{ + int tag; + MPI_Status status; + + int total_size = global_comm->global_comm_size; + int global_rank = global_comm->global_rank; + + int root_mpi_rank = global_comm->mapping_table.mpi_rank[root]; + assert(root == global_comm->mapping_table.global_rank[root]); + + MPI_Datatype mpi_type = dtypeToMPIDtype(type); + + // non-root + if (global_rank != root) { + tag = generateBcastTag(global_rank, global_comm); +#ifdef DEBUG_LEGATE + log_coll.debug("BcastMPI: non-root recv global_rank %d, mpi rank %d, send to %d (%d), tag %d", + global_rank, + global_comm->mpi_rank, + root, + root_mpi_rank, + tag); +#endif + CHECK_MPI(MPI_Recv(buf, count, mpi_type, root_mpi_rank, tag, global_comm->mpi_comm, &status)); + return CollSuccess; + } + + // root + int sendto_mpi_rank; + for (int i = 0; i < total_size; i++) { + sendto_mpi_rank = global_comm->mapping_table.mpi_rank[i]; + assert(i == global_comm->mapping_table.global_rank[i]); + tag = generateBcastTag(i, global_comm); +#ifdef DEBUG_LEGATE + log_coll.debug("BcastMPI: root i %d === global_rank %d, mpi rank %d, send to %d (%d), tag %d", + i, + global_rank, + global_comm->mpi_rank, + i, + sendto_mpi_rank, + tag); +#endif + if (global_rank != i) { + CHECK_MPI(MPI_Send(buf, count, mpi_type, sendto_mpi_rank, tag, global_comm->mpi_comm)); + } + } + + return CollSuccess; +} + +static inline std::pair mostFrequent(const int* arr, int n) +{ + std::unordered_map hash; + for (int i = 0; i < n; i++) hash[arr[i]]++; + + // find the max frequency + int max_count = 0; + std::unordered_map::iterator it; + for (it = hash.begin(); it != hash.end(); it++) { + if (max_count < it->second) { max_count = it->second; } + } + + return std::make_pair(max_count, hash.size()); +} + +static inline int match2ranks(int rank1, int rank2, CollComm global_comm) +{ + // tag: seg idx + rank_idx + tag + // send_tag = sendto_global_rank * 10000 + global_rank (concat 2 ranks) + // which dst seg it sends to (in dst rank) + // recv_tag = global_rank * 10000 + recvfrom_global_rank (concat 2 ranks) + // idx of current seg we are receving (in src/my rank) + // example: + // 00 | 01 | 02 | 03 + // 10 | 11 | 12 | 13 + // 20 | 21 | 22 | 23 + // 30 | 31 | 32 | 33 + // 01's send_tag = 10, 10's recv_tag = 10, match + // 12's send_tag = 21, 21's recv_tag = 21, match + + int tag; + // old tagging system for debug + // constexpr int const max_ranks = 10000; + // tag = rank1 * max_ranks + rank2; + + // new tagging system, if crash, switch to the old one + + tag = rank1 % global_comm->nb_threads * global_comm->global_comm_size + rank2; + + // Szudzik's Function, two numbers < 32768 + // if (rank1 >= rank2) { + // tag = rank1*rank1 + rank1 + rank2; + // } else { + // tag = rank1 + rank2*rank2; + // } + + // Cantor Pairing Function, two numbers < 32768 + // tag = (rank1 + rank2) * (rank1 + rank2 + 1) / 2 + rank1; + + return tag; +} + +// protected functions start from 
here + +MPI_Datatype MPINetwork::dtypeToMPIDtype(CollDataType dtype) +{ + switch (dtype) { + case CollDataType::CollInt8: { + return MPI_INT8_T; + } + case CollDataType::CollChar: { + return MPI_CHAR; + } + case CollDataType::CollUint8: { + return MPI_UINT8_T; + } + case CollDataType::CollInt: { + return MPI_INT; + } + case CollDataType::CollUint32: { + return MPI_UINT32_T; + } + case CollDataType::CollInt64: { + return MPI_INT64_T; + } + case CollDataType::CollUint64: { + return MPI_UINT64_T; + } + case CollDataType::CollFloat: { + return MPI_FLOAT; + } + case CollDataType::CollDouble: { + return MPI_DOUBLE; + } + default: { + log_coll.fatal("Unknown datatype"); + LEGATE_ABORT; + return MPI_BYTE; + } + } +} + +int MPINetwork::generateAlltoallTag(int rank1, int rank2, CollComm global_comm) +{ + int tag = match2ranks(rank1, rank2, global_comm) * CollTag::MAX_TAG + CollTag::ALLTOALL_TAG; + assert(tag <= mpi_tag_ub && tag > 0); + return tag; +} + +int MPINetwork::generateAlltoallvTag(int rank1, int rank2, CollComm global_comm) +{ + int tag = match2ranks(rank1, rank2, global_comm) * CollTag::MAX_TAG + CollTag::ALLTOALLV_TAG; + assert(tag <= mpi_tag_ub && tag > 0); + return tag; +} + +int MPINetwork::generateBcastTag(int rank, CollComm global_comm) +{ + int tag = rank * CollTag::MAX_TAG + CollTag::BCAST_TAG; + assert(tag <= mpi_tag_ub && tag >= 0); + return tag; +} + +int MPINetwork::generateGatherTag(int rank, CollComm global_comm) +{ + int tag = rank * CollTag::MAX_TAG + CollTag::GATHER_TAG; + assert(tag <= mpi_tag_ub && tag > 0); + return tag; +} + +} // namespace coll +} // namespace comm +} // namespace legate \ No newline at end of file From 4d5417e6295464d867a5a2ba8f1ea7dd5bfd4fae Mon Sep 17 00:00:00 2001 From: Manolis Papadakis Date: Thu, 17 Nov 2022 12:35:11 -0800 Subject: [PATCH 061/121] Conda env script fixes (#481) * Disallow cmake 3.25.0 * CUDAConfig.ctk can be "none" but not None --- scripts/generate-conda-envs.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/scripts/generate-conda-envs.py b/scripts/generate-conda-envs.py index a5cd426ee..494b408ba 100755 --- a/scripts/generate-conda-envs.py +++ b/scripts/generate-conda-envs.py @@ -48,13 +48,13 @@ def format(self, kind: str) -> str: @dataclass(frozen=True) class CUDAConfig(SectionConfig): - ctk_version: str | None + ctk_version: str header = "cuda" @property def conda(self) -> Reqs: - if self.ctk_version is None: + if self.ctk_version == "none": return () return ( @@ -81,7 +81,8 @@ class BuildConfig(SectionConfig): @property def conda(self) -> Reqs: pkgs = ( - "cmake>=3.24", + # 3.25.0 triggers gitlab.kitware.com/cmake/cmake/-/issues/24119 + "cmake>=3.24,!=3.25.0", "git", "make", "scikit-build>=0.13.1", @@ -166,7 +167,7 @@ class EnvConfig: use: str python: str os: OSType - ctk: str | None + ctk: str compilers: bool openmpi: bool From 8f6fc0697418a3719814e9f9c6842746b5c5b7ec Mon Sep 17 00:00:00 2001 From: Wonchan Lee Date: Thu, 17 Nov 2022 13:41:45 -0800 Subject: [PATCH 062/121] Refactoring changes (#478) * Start assigning unique ids to stores and storages * Move partition caches to the runtime * Two mapper changes * Start using a special Legate functor for identity projections * Refactor the core mapper's slice_task using dispatch * Make constructors fetch the unique ids instead of passing them as arguments * Stop using offsets in linearization --- legate/core/partition.py | 11 ++-- legate/core/runtime.py | 103 ++++++++++++++++++++++++++------ legate/core/store.py | 74 ++++++++++++----------- 
src/core/mapping/base_mapper.cc | 71 +++++----------------- src/core/mapping/base_mapper.h | 4 -- src/core/mapping/core_mapper.cc | 91 ++++++++++------------------ src/core/runtime/projection.cc | 36 +++++++---- 7 files changed, 196 insertions(+), 194 deletions(-) diff --git a/legate/core/partition.py b/legate/core/partition.py index 162e7fb6a..be243911d 100644 --- a/legate/core/partition.py +++ b/legate/core/partition.py @@ -38,6 +38,9 @@ RequirementType = Union[Type[Broadcast], Type[Partition]] +part_mgr = runtime.partition_manager + + class PartitionBase(ABC): @abstractproperty def color_shape(self) -> Optional[Shape]: @@ -295,7 +298,7 @@ def construct( self, region: Region, complete: bool = False ) -> Optional[LegionPartition]: index_space = region.index_space - index_partition = runtime.find_partition(index_space, self) + index_partition = part_mgr.find_index_partition(index_space, self) if index_partition is None: tile_shape = self._tile_shape transform = Transform(tile_shape.ndim, tile_shape.ndim) @@ -322,7 +325,7 @@ def construct( kind=kind, keep=True, # export this partition functor to other libraries ) - runtime.record_partition(index_space, self, index_partition) + part_mgr.record_index_partition(index_space, self, index_partition) return region.get_child(index_partition) @@ -406,7 +409,7 @@ def construct( assert complete index_space = region.index_space - index_partition = runtime.find_partition(index_space, self) + index_partition = part_mgr.find_index_partition(index_space, self) if index_partition is None: color_space = runtime.find_or_create_index_space(self._color_shape) functor = PartitionByWeights(self._weights) @@ -420,5 +423,5 @@ def construct( kind=kind, keep=True, # export this partition functor to other libraries ) - runtime.record_partition(index_space, self, index_partition) + part_mgr.record_index_partition(index_space, self, index_partition) return region.get_child(index_partition) diff --git a/legate/core/runtime.py b/legate/core/runtime.py index 56c106471..3b6c5ddec 100644 --- a/legate/core/runtime.py +++ b/legate/core/runtime.py @@ -52,7 +52,11 @@ if TYPE_CHECKING: from . import ArgumentMap, Detach, IndexDetach, IndexPartition, Library - from ._legion import FieldListLike, PhysicalRegion + from ._legion import ( + FieldListLike, + PhysicalRegion, + Partition as LegionPartition, + ) from .communicator import Communicator from .context import Context from .corelib import CoreLib @@ -619,6 +623,12 @@ def __init__(self, runtime: Runtime) -> None: self._index_partitions: dict[ tuple[IndexSpace, PartitionBase], IndexPartition ] = {} + # Maps storage id-partition pairs to Legion partitions + self._legion_partitions: dict[ + tuple[int, PartitionBase], Union[None, LegionPartition] + ] = {} + self._storage_key_partitions: dict[int, PartitionBase] = {} + self._store_key_partitions: dict[int, PartitionBase] = {} def compute_launch_shape( self, store: Store, restrictions: tuple[Restriction, ...] 
@@ -815,13 +825,13 @@ def use_complete_tiling(self, shape: Shape, tile_shape: Shape) -> bool: num_tiles = (shape // tile_shape).volume() return not (num_tiles > 256 and num_tiles > 16 * self._num_pieces) - def find_partition( + def find_index_partition( self, index_space: IndexSpace, functor: PartitionBase ) -> Union[IndexPartition, None]: key = (index_space, functor) return self._index_partitions.get(key) - def record_partition( + def record_index_partition( self, index_space: IndexSpace, functor: PartitionBase, @@ -831,6 +841,59 @@ def record_partition( assert key not in self._index_partitions self._index_partitions[key] = index_partition + def find_store_key_partition( + self, store_id: int, restrictions: tuple[Restriction, ...] + ) -> Union[None, PartitionBase]: + partition = self._store_key_partitions.get(store_id) + if partition is not None and not partition.satisfies_restriction( + restrictions + ): + partition = None + return partition + + def record_store_key_partition( + self, store_id: int, key_partition: PartitionBase + ) -> None: + self._store_key_partitions[store_id] = key_partition + + def reset_store_key_partition(self, store_id: int) -> None: + del self._store_key_partitions[store_id] + + def find_storage_key_partition( + self, storage_id: int, restrictions: tuple[Restriction, ...] + ) -> Union[None, PartitionBase]: + partition = self._storage_key_partitions.get(storage_id) + if partition is not None and not partition.satisfies_restriction( + restrictions + ): + partition = None + return partition + + def record_storage_key_partition( + self, storage_id: int, key_partition: PartitionBase + ) -> None: + self._storage_key_partitions[storage_id] = key_partition + + def reset_storage_key_partition(self, storage_id: int) -> None: + del self._storage_key_partitions[storage_id] + + def find_legion_partition( + self, storage_id: int, functor: PartitionBase + ) -> tuple[Optional[LegionPartition], bool]: + key = (storage_id, functor) + found = key in self._legion_partitions + part = self._legion_partitions.get(key) + return part, found + + def record_legion_partition( + self, + storage_id: int, + functor: PartitionBase, + legion_partition: Optional[LegionPartition], + ) -> None: + key = (storage_id, functor) + self._legion_partitions[key] = legion_partition + class CommunicatorManager: def __init__(self, runtime: Runtime) -> None: @@ -904,6 +967,9 @@ def __init__(self, core_library: CoreLib) -> None: ty.uint32, ) + self._next_store_id = 0 + self._next_storage_id = 0 + self._barriers: List[legion.legion_phase_barrier_t] = [] self.nccl_needs_barrier = bool( self._core_context.get_tunable( @@ -1138,6 +1204,14 @@ def get_unique_op_id(self) -> int: self._unique_op_id += 1 return op_id + def get_next_store_id(self) -> int: + self._next_store_id += 1 + return self._next_store_id + + def get_next_storage_id(self) -> int: + self._next_storage_id += 1 + return self._next_storage_id + def dispatch(self, op: Dispatchable[T]) -> T: self._attachment_manager.perform_detachments() self._attachment_manager.prune_detachments() @@ -1291,7 +1365,13 @@ def create_store( sanitized_shape = shape transform = None - storage = Storage(sanitized_shape, 0, dtype, data=data, kind=kind) + storage = Storage( + sanitized_shape, + 0, + dtype, + data=data, + kind=kind, + ) return Store( dtype, storage, @@ -1456,21 +1536,6 @@ def create_region( handle, ) - def find_partition( - self, index_space: IndexSpace, functor: PartitionBase - ) -> Union[IndexPartition, None]: - return 
self._partition_manager.find_partition(index_space, functor) - - def record_partition( - self, - index_space: IndexSpace, - functor: PartitionBase, - index_partition: IndexPartition, - ) -> None: - self._partition_manager.record_partition( - index_space, functor, index_partition - ) - def extract_scalar(self, future: Future, idx: int) -> Future: from .launcher import TaskLauncher diff --git a/legate/core/store.py b/legate/core/store.py index 7b09516f1..be7e44ea1 100644 --- a/legate/core/store.py +++ b/legate/core/store.py @@ -511,6 +511,7 @@ def __init__( ) assert not isinstance(data, Future) or parent is None assert parent is None or color is not None + self._unique_id = runtime.get_next_storage_id() self._extents = extents self._offsets = offsets self._level = level @@ -519,8 +520,6 @@ def __init__( self._kind = kind self._parent = parent self._color = color - self._partitions: dict[PartitionBase, Optional[LegionPartition]] = {} - self._key_partition: Union[None, PartitionBase] = None if self._offsets is None and self._extents is not None: self._offsets = Shape((0,) * self._extents.ndim) @@ -738,21 +737,20 @@ def get_inline_allocation( def find_key_partition( self, restrictions: tuple[Restriction, ...] ) -> Optional[PartitionBase]: - if ( - self._key_partition is not None - and self._key_partition.satisfies_restriction(restrictions) - ): - return self._key_partition - elif self._parent is not None: - return self._parent.find_key_partition(restrictions) - else: - return None + partition = partition_manager.find_storage_key_partition( + self._unique_id, restrictions + ) + if partition is None and self._parent is not None: + partition = self._parent.find_key_partition(restrictions) + return partition def set_key_partition(self, partition: PartitionBase) -> None: - self._key_partition = partition + partition_manager.record_storage_key_partition( + self._unique_id, partition + ) def reset_key_partition(self) -> None: - self._key_partition = None + partition_manager.reset_storage_key_partition(self._unique_id) def find_or_create_legion_partition( self, functor: PartitionBase, complete: bool @@ -762,12 +760,14 @@ def find_or_create_legion_partition( assert isinstance(self.data, RegionField) - if functor in self._partitions: - return self._partitions[functor] - - part = functor.construct(self.data.region, complete=complete) - self._partitions[functor] = part - + part, found = partition_manager.find_legion_partition( + self._unique_id, functor + ) + if not found: + part = functor.construct(self.data.region, complete=complete) + partition_manager.record_legion_partition( + self._unique_id, functor, part + ) return part @@ -862,12 +862,12 @@ def __init__( else: sanitized_transform = identity assert isinstance(shape, Shape) or shape is None + self._unique_id = runtime.get_next_store_id() self._shape = shape self._ndim = ndim self._dtype = dtype self._storage = storage self._transform: TransformStackBase = sanitized_transform - self._key_partition: Union[None, PartitionBase] = None # This is a cache for the projection functor id # when no custom functor is given self._projection: Union[None, int] = None @@ -1022,6 +1022,7 @@ def invert_partition(self, partition: PartitionBase) -> PartitionBase: def __str__(self) -> str: return ( f"Store(" + f"id: {self._unique_id}, " f"shape: {self._shape}, " f"ndim: {self._ndim}, " f"type: {self._dtype}, " @@ -1220,23 +1221,24 @@ def get_key_partition(self) -> Optional[PartitionBase]: # registered correctly runtime.flush_scheduling_window() - restrictions = 
self.find_restrictions() - - if ( - self._key_partition is not None - and self._key_partition.satisfies_restriction(restrictions) - ): - return self._key_partition - - return None + return partition_manager.find_store_key_partition( + self._unique_id, self.find_restrictions() + ) def has_key_partition(self, restrictions: tuple[Restriction, ...]) -> bool: + key_partition = partition_manager.find_store_key_partition( + self._unique_id, restrictions + ) + if key_partition is not None: + return True restrictions = self._transform.invert_restrictions(restrictions) part = self._storage.find_key_partition(restrictions) return (part is not None) and (part.even or self._transform.bottom) def set_key_partition(self, partition: PartitionBase) -> None: - self._key_partition = partition + partition_manager.record_store_key_partition( + self._unique_id, partition + ) # We also update the storage's key partition for other stores # sharing the same storage self._storage.set_key_partition( @@ -1244,16 +1246,16 @@ def set_key_partition(self, partition: PartitionBase) -> None: ) def reset_key_partition(self) -> None: - self._storage.reset_key_partition() + partition_manager.reset_store_key_partition(self._unique_id) def compute_key_partition( self, restrictions: tuple[Restriction, ...] ) -> PartitionBase: - if ( - self._key_partition is not None - and self._key_partition.satisfies_restriction(restrictions) - ): - return self._key_partition + key_partition = partition_manager.find_store_key_partition( + self._unique_id, restrictions + ) + if key_partition is not None: + return key_partition # If this is effectively a scalar store, we don't need to partition it if self.kind is Future or self.ndim == 0: diff --git a/src/core/mapping/base_mapper.cc b/src/core/mapping/base_mapper.cc index dcc393023..2f5f5788d 100644 --- a/src/core/mapping/base_mapper.cc +++ b/src/core/mapping/base_mapper.cc @@ -235,12 +235,13 @@ void BaseMapper::slice_auto_task(const MapperContext ctx, const SliceTaskInput& input, SliceTaskOutput& output) { - LegateProjectionFunctor* key_functor = nullptr; + ProjectionID projection = 0; for (auto& req : task.regions) if (req.tag == LEGATE_CORE_KEY_STORE_TAG) { - key_functor = find_legate_projection_functor(req.projection); + projection = req.projection; break; } + auto key_functor = find_legate_projection_functor(projection); // For multi-node cases we should already have been sharded so we // should just have one or a few points here on this node, so iterate @@ -253,23 +254,13 @@ void BaseMapper::slice_auto_task(const MapperContext ctx, sharding_domain = runtime->get_index_space_domain(ctx, task.sharding_space); auto round_robin = [&](auto& procs) { - if (nullptr != key_functor) { - auto lo = key_functor->project_point(sharding_domain.lo(), sharding_domain); - auto hi = key_functor->project_point(sharding_domain.hi(), sharding_domain); - for (Domain::DomainPointIterator itr(input.domain); itr; itr++) { - auto p = key_functor->project_point(itr.p, sharding_domain); - auto idx = linearize(lo, hi, p); - output.slices.push_back(TaskSlice( - Domain(itr.p, itr.p), procs[idx % procs.size()], false /*recurse*/, false /*stealable*/)); - } - } else { - auto lo = sharding_domain.lo(); - auto hi = sharding_domain.hi(); - for (Domain::DomainPointIterator itr(input.domain); itr; itr++) { - auto idx = linearize(lo, hi, itr.p); - output.slices.push_back(TaskSlice( - Domain(itr.p, itr.p), procs[idx % procs.size()], false /*recurse*/, false /*stealable*/)); - } + auto lo = 
key_functor->project_point(sharding_domain.lo(), sharding_domain); + auto hi = key_functor->project_point(sharding_domain.hi(), sharding_domain); + for (Domain::DomainPointIterator itr(input.domain); itr; itr++) { + auto p = key_functor->project_point(itr.p, sharding_domain); + auto idx = linearize(lo, hi, p); + output.slices.push_back(TaskSlice( + Domain(itr.p, itr.p), procs[idx % procs.size()], false /*recurse*/, false /*stealable*/)); } }; @@ -350,33 +341,6 @@ void BaseMapper::slice_manual_task(const MapperContext ctx, dispatch(task.target_proc.kind(), distribute); } -void BaseMapper::slice_round_robin_task(const MapperContext ctx, - const LegionTask& task, - const SliceTaskInput& input, - SliceTaskOutput& output) -{ - // If we're here, that means that the task has no region that we can key off - // to distribute them reasonably. In this case, we just do a round-robin - // assignment. - - output.slices.reserve(input.domain.get_volume()); - - // Get the domain for the sharding space also - Domain sharding_domain = task.index_domain; - if (task.sharding_space.exists()) - sharding_domain = runtime->get_index_space_domain(ctx, task.sharding_space); - - auto distribute = [&](auto& procs) { - size_t idx = 0; - for (Domain::DomainPointIterator itr(input.domain); itr; itr++) { - output.slices.push_back(TaskSlice( - Domain(itr.p, itr.p), procs[idx++ % procs.size()], false /*recurse*/, false /*stealable*/)); - } - }; - - dispatch(task.target_proc.kind(), distribute); -} - void BaseMapper::slice_task(const MapperContext ctx, const LegionTask& task, const SliceTaskInput& input, @@ -384,8 +348,6 @@ void BaseMapper::slice_task(const MapperContext ctx, { if (task.tag == LEGATE_CORE_MANUAL_PARALLEL_LAUNCH_TAG) slice_manual_task(ctx, task, input, output); - else if (task.regions.size() == 0) - slice_round_robin_task(ctx, task, input, output); else slice_auto_task(ctx, task, input, output); } @@ -1073,15 +1035,10 @@ void BaseMapper::map_copy(const MapperContext ctx, // in which case we should find the key store and use its projection functor // for the linearization auto* key_functor = find_legate_projection_functor(0); - - if (key_functor != nullptr) { - auto lo = key_functor->project_point(sharding_domain.lo(), sharding_domain); - auto hi = key_functor->project_point(sharding_domain.hi(), sharding_domain); - auto p = key_functor->project_point(copy.index_point, sharding_domain); - proc_id = linearize(lo, hi, p); - } else { - proc_id = linearize(sharding_domain.lo(), sharding_domain.hi(), copy.index_point); - } + auto lo = key_functor->project_point(sharding_domain.lo(), sharding_domain); + auto hi = key_functor->project_point(sharding_domain.hi(), sharding_domain); + auto p = key_functor->project_point(copy.index_point, sharding_domain); + proc_id = linearize(lo, hi, p); } if (!local_gpus.empty()) target_proc = local_gpus[proc_id % local_gpus.size()]; diff --git a/src/core/mapping/base_mapper.h b/src/core/mapping/base_mapper.h index 850427b6d..17fdb2045 100644 --- a/src/core/mapping/base_mapper.h +++ b/src/core/mapping/base_mapper.h @@ -330,10 +330,6 @@ class BaseMapper : public Legion::Mapping::Mapper, public LegateMapper { const Legion::Task& task, const SliceTaskInput& input, SliceTaskOutput& output); - void slice_round_robin_task(const Legion::Mapping::MapperContext ctx, - const Legion::Task& task, - const SliceTaskInput& input, - SliceTaskOutput& output); protected: Legion::ShardingID find_sharding_functor_by_key_store_projection( diff --git a/src/core/mapping/core_mapper.cc 
b/src/core/mapping/core_mapper.cc index da3f7414b..ccc738983 100644 --- a/src/core/mapping/core_mapper.cc +++ b/src/core/mapping/core_mapper.cc @@ -23,6 +23,7 @@ #include "core/comm/comm_nccl.h" #endif #include "core/task/task.h" +#include "core/utilities/linearize.h" namespace legate { @@ -94,6 +95,20 @@ class CoreMapper : public Legion::Mapping::NullMapper { const SelectTunableInput& input, SelectTunableOutput& output); + protected: + template + decltype(auto) dispatch(Legion::Processor::Kind kind, Functor functor) + { + switch (kind) { + case Legion::Processor::LOC_PROC: return functor(local_cpus); + case Legion::Processor::TOC_PROC: return functor(local_gpus); + case Legion::Processor::OMP_PROC: return functor(local_omps); + default: LEGATE_ABORT; + } + assert(false); + return functor(local_cpus); + } + public: const AddressSpace local_node; const size_t total_nodes; @@ -258,68 +273,22 @@ void CoreMapper::slice_task(const MapperContext ctx, { assert(context.valid_task_id(task.task_id)); output.slices.reserve(input.domain.get_volume()); - // Check to see if we're control replicated or not. If we are then - // we'll already have been sharded. - Machine::ProcessorQuery all_procs(machine); - all_procs.only_kind(task.target_proc.kind()); - if (all_procs.count() == input.domain.get_volume()) { - Machine::ProcessorQuery::iterator pit = all_procs.begin(); - for (Domain::DomainPointIterator itr(input.domain); itr; itr++, pit++) - output.slices.push_back( - TaskSlice(Domain(itr.p, itr.p), *pit, false /*recurse*/, false /*stealable*/)); - } else { - // Control-replicated because we've already been sharded - Domain sharding_domain = task.index_domain; - if (task.sharding_space.exists()) - sharding_domain = runtime->get_index_space_domain(ctx, task.sharding_space); - assert(sharding_domain.get_dim() == 1); - assert(input.domain.get_dim() == 1); - const Rect<1> space = sharding_domain; - const Rect<1> local = input.domain; - const size_t size = (space.hi[0] - space.lo[0]) + 1; - // Assume that if we're control replicated there is one shard per space - const coord_t chunk = (size + total_nodes - 1) / total_nodes; - const coord_t start = local_node * chunk + space.lo[0]; - switch (task.target_proc.kind()) { - case Processor::LOC_PROC: { - for (Domain::DomainPointIterator itr(input.domain); itr; itr++) { - const Point<1> point = itr.p; - assert(point[0] >= start); - assert(point[0] < (start + chunk)); - const unsigned local_index = point[0] - start; - assert(local_index < local_cpus.size()); - output.slices.push_back(TaskSlice( - Domain(itr.p, itr.p), local_cpus[local_index], false /*recurse*/, false /*stealable*/)); - } - break; - } - case Processor::TOC_PROC: { - for (Domain::DomainPointIterator itr(input.domain); itr; itr++) { - const Point<1> point = itr.p; - assert(point[0] >= start); - assert(point[0] < (start + chunk)); - const unsigned local_index = point[0] - start; - assert(local_index < local_gpus.size()); - output.slices.push_back(TaskSlice( - Domain(itr.p, itr.p), local_gpus[local_index], false /*recurse*/, false /*stealable*/)); - } - break; - } - case Processor::OMP_PROC: { - for (Domain::DomainPointIterator itr(input.domain); itr; itr++) { - const Point<1> point = itr.p; - assert(point[0] >= start); - assert(point[0] < (start + chunk)); - const unsigned local_index = point[0] - start; - assert(local_index < local_omps.size()); - output.slices.push_back(TaskSlice( - Domain(itr.p, itr.p), local_omps[local_index], false /*recurse*/, false /*stealable*/)); - } - break; - } - default: 
LEGATE_ABORT; + + Domain sharding_domain = task.index_domain; + if (task.sharding_space.exists()) + sharding_domain = runtime->get_index_space_domain(ctx, task.sharding_space); + + auto round_robin = [&](auto& procs) { + auto lo = sharding_domain.lo(); + auto hi = sharding_domain.hi(); + for (Domain::DomainPointIterator itr(input.domain); itr; itr++) { + auto idx = linearize(lo, hi, itr.p); + output.slices.push_back(TaskSlice( + Domain(itr.p, itr.p), procs[idx % procs.size()], false /*recurse*/, false /*stealable*/)); } - } + }; + + dispatch(task.target_proc.kind(), round_robin); } void CoreMapper::map_task(const MapperContext ctx, diff --git a/src/core/runtime/projection.cc b/src/core/runtime/projection.cc index 1fde46ea0..5b5809866 100644 --- a/src/core/runtime/projection.cc +++ b/src/core/runtime/projection.cc @@ -74,14 +74,6 @@ LogicalRegion DelinearizationFunctor::project(LogicalPartition upper_bound, return LogicalRegion::NO_REGION; } -void register_legate_core_projection_functors(Legion::Runtime* runtime, - const LibraryContext& context) -{ - auto proj_id = context.get_projection_id(LEGATE_CORE_DELINEARIZE_PROJ_ID); - auto functor = new DelinearizationFunctor(runtime); - runtime->register_projection_functor(proj_id, functor, true /*silence warnings*/); -} - LegateProjectionFunctor::LegateProjectionFunctor(Runtime* rt) : ProjectionFunctor(rt) {} LogicalRegion LegateProjectionFunctor::project(LogicalPartition upper_bound, @@ -101,8 +93,7 @@ class AffineFunctor : public LegateProjectionFunctor { AffineFunctor(Runtime* runtime, int32_t* dims, int32_t* weights, int32_t* offsets); public: - virtual DomainPoint project_point(const DomainPoint& point, - const Domain& launch_domain) const override + DomainPoint project_point(const DomainPoint& point, const Domain& launch_domain) const override { return DomainPoint(transform_ * Point(point) + offsets_); } @@ -142,8 +133,17 @@ template return transform; } -static std::unordered_map functor_table; -static std::mutex functor_table_lock; +struct IdentityFunctor : public LegateProjectionFunctor { + IdentityFunctor(Runtime* runtime) : LegateProjectionFunctor(runtime) {} + DomainPoint project_point(const DomainPoint& point, const Domain&) const override + { + return point; + } +}; + +static LegateProjectionFunctor* identity_functor{nullptr}; +static std::unordered_map functor_table{}; +static std::mutex functor_table_lock{}; struct create_affine_functor_fn { template @@ -158,9 +158,19 @@ struct create_affine_functor_fn { } }; +void register_legate_core_projection_functors(Legion::Runtime* runtime, + const LibraryContext& context) +{ + auto proj_id = context.get_projection_id(LEGATE_CORE_DELINEARIZE_PROJ_ID); + auto functor = new DelinearizationFunctor(runtime); + runtime->register_projection_functor(proj_id, functor, true /*silence warnings*/); + + identity_functor = new IdentityFunctor(runtime); +} + LegateProjectionFunctor* find_legate_projection_functor(ProjectionID proj_id) { - if (0 == proj_id) return nullptr; + if (0 == proj_id) return identity_functor; const std::lock_guard lock(functor_table_lock); return functor_table[proj_id]; } From c3cc9fb58fd3491bdf4fc9b1fede086f72cdfcd4 Mon Sep 17 00:00:00 2001 From: Manolis Papadakis Date: Thu, 17 Nov 2022 17:41:43 -0800 Subject: [PATCH 063/121] Cycle detection check (#361) * WIP on cycle detection check * Don't consider cycles going through sys.modules * Return True if cycles are found * Remove some dead code * Add -legate:cycle-check flag, to perform cycle check at exit * Print out list indices on 
cycle listings * Ignore getset_descriptor objects in cycle check They seem to be triggering false positives * Instead just skip the __globals__ pointer * Skip based on ModuleType, vs inclusion sys.modules * Print name for functions and class objects * Ignore cycles going through types Self-type references and class.__init__.__closure__ seem to produce a lot of false positives. * Look for ny Store, not just RegionField * Filter for more specific Legion handle classes * Add unit test for cycle checker * Restrict to RegionField's, as some Futures naturally leak * Update documentation Co-authored-by: Manolis Papadakis --- legate/core/cycle_detector.py | 153 ++++++++++++++++++++++++++ legate/core/runtime.py | 45 +++++++- tests/unit/legate/test_cycle_check.py | 47 ++++++++ 3 files changed, 239 insertions(+), 6 deletions(-) create mode 100644 legate/core/cycle_detector.py create mode 100644 tests/unit/legate/test_cycle_check.py diff --git a/legate/core/cycle_detector.py b/legate/core/cycle_detector.py new file mode 100644 index 000000000..a90a6531a --- /dev/null +++ b/legate/core/cycle_detector.py @@ -0,0 +1,153 @@ +# Copyright 2022 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import gc +import inspect +from collections import deque +from types import FunctionType, ModuleType +from typing import Any, Set, Union + + +def _skip(src: Any, dst: Any) -> bool: + return ( + isinstance(src, type) + or isinstance(src, ModuleType) + or isinstance(src, FunctionType) + and hasattr(src, "__globals__") + and src.__globals__ is dst + ) + + +def _find_cycles(root: Any, all_ids: Set[int]) -> bool: + opened: dict[int, int] = {} + closed: Set[int] = set() + stack = [root] + while len(stack) > 0: + dst = stack[-1] + if id(dst) in opened: + if opened[id(dst)] == len(stack): + del opened[id(dst)] + closed.add(id(dst)) + else: + print("found cycle!") + print(" tail:") + _bfs(dst, root, all_ids) + print(" cycle:") + _bfs(dst, dst, all_ids) + return True + stack.pop() + elif id(dst) in closed: + stack.pop() + else: + opened[id(dst)] = len(stack) + for src in gc.get_referrers(dst): + if id(src) in all_ids and not _skip(src, dst): + stack.append(src) + return False + + +def _find_field(src: Any, dst: Any) -> Union[str, None]: + if type(src) == dict: + for k, v in src.items(): + if v is dst and isinstance(k, str): + return f'["{k}"]' + if type(src) == tuple: + for k, v in enumerate(src): + if v is dst: + return f"[{k}]" + if type(src) == list: + for i, v in enumerate(src): + if v is dst: + return f"[{i}]" + try: + for fld in dir(src): + try: + if hasattr(src, fld) and getattr(src, fld) is dst: + return "." + fld + except Exception: + pass + except Exception: + pass + try: + for fld in vars(src): + try: + if hasattr(src, fld) and getattr(src, fld) is dst: + return "." + fld + except Exception: + pass + except Exception: + pass + try: + for fld, val in inspect.getmembers(src): + if val is dst: + return "." 
+ fld + except Exception: + pass + return None + + +def _obj_str(obj: Any) -> str: + res = f"{hex(id(obj))}: {type(obj)}" + if hasattr(obj, "__name__"): + res += f" {obj.__name__}" + return res + + +def _bfs(begin: Any, end: Any, all_ids: Set[int]) -> None: + parent = {} + q = deque([begin]) + while len(q) > 0: + src = q.popleft() + for dst in gc.get_referents(src): + if id(dst) not in all_ids or id(dst) in parent or _skip(src, dst): + continue + parent[id(dst)] = src + if dst is end: + print(f" {_obj_str(dst)}") + while True: + src = parent[id(dst)] + fld = _find_field(src, dst) + if fld is None: + print(" ^") + else: + print(f" ^ {fld}") + print(f" {_obj_str(src)}") + dst = src + if dst is begin: + break + return + q.append(dst) + print(f" {_obj_str(end)}") + print(" ^") + print(" ???") + print(" ^") + print(f" {_obj_str(begin)}") + + +def find_cycles() -> bool: + from .store import RegionField + + found_cycles = False + all_objs = gc.get_objects() + all_ids = set(id(obj) for obj in all_objs) + for obj in all_objs: + if isinstance(obj, RegionField): + print( + f"looking for cycles involving {hex(id(obj))}, " + f"of type {type(obj)}" + ) + if _find_cycles(obj, all_ids): + found_cycles = True + return found_cycles diff --git a/legate/core/runtime.py b/legate/core/runtime.py index 3b6c5ddec..534862b61 100644 --- a/legate/core/runtime.py +++ b/legate/core/runtime.py @@ -17,9 +17,11 @@ import gc import math import struct +import sys import weakref from collections import deque from dataclasses import dataclass +from types import ModuleType from typing import TYPE_CHECKING, Any, Deque, List, Optional, TypeVar, Union from legion_top import add_cleanup_item, top_level @@ -45,6 +47,7 @@ from .allocation import Attachable from .communicator import CPUCommunicator, NCCLCommunicator from .corelib import core_library +from .cycle_detector import find_cycles from .exception import PendingException from .projection import is_identity_projection, pack_symbolic_projection_repr from .restriction import Restriction @@ -83,7 +86,24 @@ action="store_true", default=False, dest="consensus", - help="Turn on consensus match on single node. (for testing)", + help="Turn on consensus match on single node (for testing).", + ), + ), + Argument( + "cycle-check", + ArgSpec( + action="store_true", + default=False, + dest="cycle_check", + help=( + "Check for reference cycles involving RegionField objects on " + "program exit (developer option). Such cycles have the effect " + "of stopping used RegionFields from being repurposed for " + "other Stores, thus increasing memory pressure. By default " + "this mode will miss any cycles already collected by the " + "garbage collector; run gc.disable() at the beginning of the " + "program to avoid this." 
+ ), ), ), ] @@ -927,11 +947,6 @@ def __init__(self, core_library: CoreLib) -> None: self._args = parse_library_command_args("legate", ARGS) - try: - self._legion_context = top_level.context[0] - except AttributeError: - pass - # Record whether we need to run finalize tasks # Key off whether we are being loaded in a context or not try: @@ -1665,6 +1680,24 @@ def _cleanup_legate_runtime() -> None: add_cleanup_item(_cleanup_legate_runtime) +class _CycleCheckWrapper(ModuleType): + def __init__(self, wrapped_mod: ModuleType): + self._wrapped_mod = wrapped_mod + + def __getattr__(self, attr: str) -> Any: + return getattr(self._wrapped_mod, attr) + + def __del__(self) -> None: + find_cycles() + + +if runtime._args.cycle_check: + # The first thing that legion_top does after executing the user script + # is to remove the newly created "__main__" module. We intercept this + # deletion operation to perform our check. + sys.modules["__main__"] = _CycleCheckWrapper(sys.modules["__main__"]) + + def get_legion_runtime() -> legion.legion_runtime_t: return runtime.legion_runtime diff --git a/tests/unit/legate/test_cycle_check.py b/tests/unit/legate/test_cycle_check.py new file mode 100644 index 000000000..5839bd917 --- /dev/null +++ b/tests/unit/legate/test_cycle_check.py @@ -0,0 +1,47 @@ +# Copyright 2022 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import subprocess +from pathlib import Path + +import pytest + +PROG_TEXT = """ +import numpy as np +from legate.core import get_legate_runtime, types as ty +store = get_legate_runtime().core_context.create_store( + ty.int32, shape=(4,), optimize_scalar=False +) +# initialize the RegionField backing the store +store.storage +# create a cycle +x = [store] +x.append(x) +""" + + +def test_cycle_check(tmp_path: Path) -> None: + prog_file = tmp_path / "prog.py" + prog_file.write_text(PROG_TEXT) + output = subprocess.check_output( + ["legate", prog_file, "--cpus", "1", "-legate:cycle-check"] + ) + assert "found cycle!" 
in output.decode("utf-8") + + +if __name__ == "__main__": + import sys + + sys.exit(pytest.main(sys.argv)) From 278259aa091f820503d23794e630b2bb65f4f064 Mon Sep 17 00:00:00 2001 From: Wonchan Lee Date: Thu, 17 Nov 2022 20:01:56 -0800 Subject: [PATCH 064/121] Fixes for mypy>=0.990 (#482) * Fixes for mypy>=0.990 * Bump up the mypy version * Fix the mypy error by inheriting DataclassMixin explicitly --- .pre-commit-config.yaml | 2 +- legate/core/_legion/util.py | 2 ++ legate/core/constraints.py | 2 +- pyproject.toml | 1 - tests/unit/legate/util/test_args.py | 6 +++--- tests/unit/legate/util/test_types.py | 2 +- 6 files changed, 8 insertions(+), 7 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 402bffb64..a22015b82 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -18,7 +18,7 @@ repos: files: \.(cu|cuh|h|cc|inl)$ types_or: [] - repo: https://github.com/pre-commit/mirrors-mypy - rev: 'v0.982' + rev: 'v0.991' hooks: - id: mypy pass_filenames: false diff --git a/legate/core/_legion/util.py b/legate/core/_legion/util.py index af4d07cf2..c4b2e9705 100644 --- a/legate/core/_legion/util.py +++ b/legate/core/_legion/util.py @@ -15,6 +15,7 @@ from __future__ import annotations import struct +from abc import abstractmethod from typing import TYPE_CHECKING, Any, Generic, List, Optional, TypeVar, Union import numpy as np @@ -159,6 +160,7 @@ def launch( class Dispatchable(Generic[T]): + @abstractmethod def launch( self, runtime: legion.legion_runtime_t, diff --git a/legate/core/constraints.py b/legate/core/constraints.py index d18b5fab0..5b910c3e3 100644 --- a/legate/core/constraints.py +++ b/legate/core/constraints.py @@ -82,7 +82,7 @@ def reduce(self) -> Lit: return self def unknowns(self) -> Iterator[PartSym]: - pass + return iter([]) class PartSym(Expr): diff --git a/pyproject.toml b/pyproject.toml index 18325b3c6..89a6da552 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -79,7 +79,6 @@ warn_no_return = true warn_return_any = false warn_unreachable = true -show_none_errors = true ignore_errors = false allow_untyped_globals = false diff --git a/tests/unit/legate/util/test_args.py b/tests/unit/legate/util/test_args.py index f6c97f4ed..190ff3c6b 100644 --- a/tests/unit/legate/util/test_args.py +++ b/tests/unit/legate/util/test_args.py @@ -114,7 +114,7 @@ def test_default_help( with pytest.raises(SystemExit) as e: m.parse_library_command_args("foo", []) assert e.value.code is None - out, err = capsys.readouterr() # type: ignore[unreachable] + out, err = capsys.readouterr() assert out.startswith("usage: ") def test_default_help_precedence( @@ -125,7 +125,7 @@ def test_default_help_precedence( with pytest.raises(SystemExit) as e: m.parse_library_command_args("foo", args) assert e.value.code is None - out, err = capsys.readouterr() # type: ignore[unreachable] + out, err = capsys.readouterr() assert out.startswith("usage: ") def test_default_help_patches_short_args( @@ -136,7 +136,7 @@ def test_default_help_patches_short_args( with pytest.raises(SystemExit) as e: m.parse_library_command_args("foo", args) assert e.value.code is None - out, err = capsys.readouterr() # type: ignore[unreachable] + out, err = capsys.readouterr() assert out.startswith("usage: ") assert "-foo:bar" in out assert "--foo:bar" not in out diff --git a/tests/unit/legate/util/test_types.py b/tests/unit/legate/util/test_types.py index 01835f882..070bc458f 100644 --- a/tests/unit/legate/util/test_types.py +++ b/tests/unit/legate/util/test_types.py @@ -41,7 +41,7 @@ class Source: 
@dataclass(frozen=True) -class Target: +class Target(m.DataclassMixin): foo: int bar: float baz: str From 9d7b0cf0daf259fa22b6f18abecb1169107e6ae9 Mon Sep 17 00:00:00 2001 From: Manolis Papadakis Date: Mon, 21 Nov 2022 11:36:57 -0800 Subject: [PATCH 065/121] Fix some typos (#485) Co-authored-by: Manolis Papadakis --- legate/core/launcher.py | 22 +++++++++++++++++++--- legate/core/store.py | 2 +- 2 files changed, 20 insertions(+), 4 deletions(-) diff --git a/legate/core/launcher.py b/legate/core/launcher.py index 2f8ff886d..69d6833c7 100644 --- a/legate/core/launcher.py +++ b/legate/core/launcher.py @@ -277,7 +277,15 @@ def add( parent_partition = parent.parent parent = parent_partition.parent if req.permission != Permission.REDUCTION: - f(task, req.region, fields, 0, parent=parent, tag=req.tag) + f( + task, + req.region, + fields, + 0, + parent=parent, + tag=req.tag, + flags=req.flags, + ) else: f( task, @@ -287,6 +295,7 @@ def add( 0, parent=parent, tag=req.tag, + flags=req.flags, ) def add_single( @@ -355,9 +364,16 @@ def add_single( ) -> None: f = methods[req.permission] if req.permission != Permission.REDUCTION: - f(task, req.region, fields, tag=req.tag) + f(task, req.region, fields, tag=req.tag, flags=req.flags) else: - f(task, req.region, fields, self.redop, tag=req.tag) + f( + task, + req.region, + fields, + self.redop, + tag=req.tag, + flags=req.flags, + ) def __hash__(self) -> int: return hash((self.part, self.proj, self.redop)) diff --git a/legate/core/store.py b/legate/core/store.py index be7e44ea1..63040901b 100644 --- a/legate/core/store.py +++ b/legate/core/store.py @@ -374,7 +374,7 @@ def register_consumer(self, consumer: Any) -> None: # so that we don't create reference cycles. def callback() -> None: - self.decrement_inline_mapped_ref_count() + self.decrement_inline_mapped_ref_count(unordered=True) weakref.finalize(consumer, callback) From 97e2edfabb01667d28dd003e7a3d5b6976a7647a Mon Sep 17 00:00:00 2001 From: Rohan Yadav Date: Mon, 21 Nov 2022 11:39:13 -0800 Subject: [PATCH 066/121] fix several reference cycle / leak related bugs (#488) * legate/core: don't hoist partition manager to top level These declarations appear to cause the partition manager to not be cleaned up, leading to leaks of futures and future maps in the partitions that it points to. Signed-off-by: Rohan Yadav * legate/core/runtime: remove PartitionManager reference on shutdown This enables the `PartitionManager` to get collected on runtime shutdown, even if there are cycles that point to the core's runtime. 
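Both changes above address the same lifetime problem: any long-lived strong reference to the PartitionManager, whether a module-level alias created at import time or the runtime's own attribute surviving shutdown, keeps its cached Futures and FutureMaps reachable after the runtime is destroyed. A minimal sketch of the pattern, using hypothetical stand-in classes rather than the real legate Runtime/PartitionManager:

import gc
import weakref


class Manager:
    def __init__(self) -> None:
        # Stands in for the caches of Futures/FutureMaps held by the real manager.
        self.cached = ["future-like objects"]


class Runtime:
    def __init__(self) -> None:
        self.manager = Manager()

    def destroy(self) -> None:
        # Dropping the attribute lets the manager (and everything it caches)
        # be collected even if the runtime object itself stays reachable.
        del self.manager


runtime = Runtime()

# A module-level alias such as `mgr = runtime.manager` would pin the manager
# for the life of the module; looking it up through `runtime` at each use
# (as the patch does) adds no extra strong reference.
ref = weakref.ref(runtime.manager)

runtime.destroy()
gc.collect()
assert ref() is None  # the manager became collectable once the runtime let go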
Signed-off-by: Rohan Yadav Signed-off-by: Rohan Yadav --- legate/core/partition.py | 19 ++++++++++++------- legate/core/runtime.py | 3 +++ legate/core/store.py | 32 +++++++++++++++++--------------- 3 files changed, 32 insertions(+), 22 deletions(-) diff --git a/legate/core/partition.py b/legate/core/partition.py index be243911d..d5904c319 100644 --- a/legate/core/partition.py +++ b/legate/core/partition.py @@ -38,9 +38,6 @@ RequirementType = Union[Type[Broadcast], Type[Partition]] -part_mgr = runtime.partition_manager - - class PartitionBase(ABC): @abstractproperty def color_shape(self) -> Optional[Shape]: @@ -298,7 +295,9 @@ def construct( self, region: Region, complete: bool = False ) -> Optional[LegionPartition]: index_space = region.index_space - index_partition = part_mgr.find_index_partition(index_space, self) + index_partition = runtime.partition_manager.find_index_partition( + index_space, self + ) if index_partition is None: tile_shape = self._tile_shape transform = Transform(tile_shape.ndim, tile_shape.ndim) @@ -325,7 +324,9 @@ def construct( kind=kind, keep=True, # export this partition functor to other libraries ) - part_mgr.record_index_partition(index_space, self, index_partition) + runtime.partition_manager.record_index_partition( + index_space, self, index_partition + ) return region.get_child(index_partition) @@ -409,7 +410,9 @@ def construct( assert complete index_space = region.index_space - index_partition = part_mgr.find_index_partition(index_space, self) + index_partition = runtime.partition_manager.find_index_partition( + index_space, self + ) if index_partition is None: color_space = runtime.find_or_create_index_space(self._color_shape) functor = PartitionByWeights(self._weights) @@ -423,5 +426,7 @@ def construct( kind=kind, keep=True, # export this partition functor to other libraries ) - part_mgr.record_index_partition(index_space, self, index_partition) + runtime.partition_manager.record_index_partition( + index_space, self, index_partition + ) return region.get_child(index_partition) diff --git a/legate/core/runtime.py b/legate/core/runtime.py index 534862b61..8306f8b8a 100644 --- a/legate/core/runtime.py +++ b/legate/core/runtime.py @@ -1206,6 +1206,9 @@ def destroy(self) -> None: self.region_managers_by_region = {} self.field_managers = {} self.index_spaces = {} + # Explicitly release the reference to the partition manager so that + # it may be collected, releasing references to Futures and FutureMaps. + self._partition_manager = None # type: ignore if self._finalize_tasks: # Run a gc and then end the legate task diff --git a/legate/core/store.py b/legate/core/store.py index 63040901b..2f4e2f9a7 100644 --- a/legate/core/store.py +++ b/legate/core/store.py @@ -64,7 +64,6 @@ from math import prod attachment_manager = runtime.attachment_manager -partition_manager = runtime.partition_manager # A Field holds a reference to a field in a region tree @@ -695,8 +694,11 @@ def slice(self, tile_shape: Shape, offsets: Shape) -> Storage: shape % tile_shape ).sum() == 0 - if can_tile_completely and partition_manager.use_complete_tiling( - shape, tile_shape + if ( + can_tile_completely + and runtime.partition_manager.use_complete_tiling( + shape, tile_shape + ) ): color_shape = shape // tile_shape color = offsets // tile_shape @@ -737,7 +739,7 @@ def get_inline_allocation( def find_key_partition( self, restrictions: tuple[Restriction, ...] 
) -> Optional[PartitionBase]: - partition = partition_manager.find_storage_key_partition( + partition = runtime.partition_manager.find_storage_key_partition( self._unique_id, restrictions ) if partition is None and self._parent is not None: @@ -745,12 +747,12 @@ def find_key_partition( return partition def set_key_partition(self, partition: PartitionBase) -> None: - partition_manager.record_storage_key_partition( + runtime.partition_manager.record_storage_key_partition( self._unique_id, partition ) def reset_key_partition(self) -> None: - partition_manager.reset_storage_key_partition(self._unique_id) + runtime.partition_manager.reset_storage_key_partition(self._unique_id) def find_or_create_legion_partition( self, functor: PartitionBase, complete: bool @@ -760,12 +762,12 @@ def find_or_create_legion_partition( assert isinstance(self.data, RegionField) - part, found = partition_manager.find_legion_partition( + part, found = runtime.partition_manager.find_legion_partition( self._unique_id, functor ) if not found: part = functor.construct(self.data.region, complete=complete) - partition_manager.record_legion_partition( + runtime.partition_manager.record_legion_partition( self._unique_id, functor, part ) return part @@ -1221,12 +1223,12 @@ def get_key_partition(self) -> Optional[PartitionBase]: # registered correctly runtime.flush_scheduling_window() - return partition_manager.find_store_key_partition( + return runtime.partition_manager.find_store_key_partition( self._unique_id, self.find_restrictions() ) def has_key_partition(self, restrictions: tuple[Restriction, ...]) -> bool: - key_partition = partition_manager.find_store_key_partition( + key_partition = runtime.partition_manager.find_store_key_partition( self._unique_id, restrictions ) if key_partition is not None: @@ -1236,7 +1238,7 @@ def has_key_partition(self, restrictions: tuple[Restriction, ...]) -> bool: return (part is not None) and (part.even or self._transform.bottom) def set_key_partition(self, partition: PartitionBase) -> None: - partition_manager.record_store_key_partition( + runtime.partition_manager.record_store_key_partition( self._unique_id, partition ) # We also update the storage's key partition for other stores @@ -1246,12 +1248,12 @@ def set_key_partition(self, partition: PartitionBase) -> None: ) def reset_key_partition(self) -> None: - partition_manager.reset_store_key_partition(self._unique_id) + runtime.partition_manager.reset_store_key_partition(self._unique_id) def compute_key_partition( self, restrictions: tuple[Restriction, ...] 
) -> PartitionBase: - key_partition = partition_manager.find_store_key_partition( + key_partition = runtime.partition_manager.find_store_key_partition( self._unique_id, restrictions ) if key_partition is not None: @@ -1274,14 +1276,14 @@ def compute_key_partition( partition = self._transform.convert_partition(partition) return partition else: - launch_shape = partition_manager.compute_launch_shape( + launch_shape = runtime.partition_manager.compute_launch_shape( self, restrictions, ) if launch_shape is None: partition = REPLICATE else: - tile_shape = partition_manager.compute_tile_shape( + tile_shape = runtime.partition_manager.compute_tile_shape( self.shape, launch_shape ) partition = Tiling(tile_shape, launch_shape) From 2c64d1cc60d1dcc5a5c7f449f7011156958f687d Mon Sep 17 00:00:00 2001 From: Paul Taylor Date: Mon, 21 Nov 2022 12:01:09 -0800 Subject: [PATCH 067/121] Regenerate `install_info.py` on every build (#486) * regenerate install_info.py on every build * specify custom target dependencies correctly * fix typo --- CMakeLists.txt | 13 ++++---- cmake/generate_install_info_py.cmake | 31 +++++++++++++++++++ legate_core_python.cmake | 20 ++++-------- scripts/build-install.sh | 2 +- scripts/build-no-install.sh | 2 +- scripts/build-separately-no-install.sh | 2 +- scripts/build-with-legion-no-install.sh | 2 +- ...build-with-legion-separately-no-install.sh | 2 +- 8 files changed, 48 insertions(+), 26 deletions(-) create mode 100644 cmake/generate_install_info_py.cmake diff --git a/CMakeLists.txt b/CMakeLists.txt index fa2ce2cf4..e83b9a779 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -97,23 +97,22 @@ endif() if(CMAKE_GENERATOR STREQUAL "Ninja") function(add_touch_legate_core_ninja_build_target) set(_suf ) - set(_depends ) if(SKBUILD) set(_suf "_python") endif() + add_custom_target("touch_legate_core${_suf}_ninja_build" ALL + COMMAND ${CMAKE_COMMAND} -E touch_nocreate "${CMAKE_CURRENT_BINARY_DIR}/build.ninja" + COMMENT "touch build.ninja so ninja doesn't re-run CMake on rebuild" + VERBATIM + ) foreach(_dep IN ITEMS legion_core legion_core_python Legion LegionRuntime Realm RealmRuntime Regent) if(TARGET ${_dep}) - list(APPEND _depends ${_dep}) + add_dependencies("touch_legate_core${_suf}_ninja_build" ${_dep}) endif() endforeach() - add_custom_target("touch_legion_core${_suf}_ninja_build" ALL - COMMAND ${CMAKE_COMMAND} -E touch_nocreate "${CMAKE_CURRENT_BINARY_DIR}/build.ninja" - COMMENT "touch build.ninja so ninja doesn't re-run CMake on rebuild" - VERBATIM DEPENDS ${_depends} - ) endfunction() add_touch_legate_core_ninja_build_target() endif() diff --git a/cmake/generate_install_info_py.cmake b/cmake/generate_install_info_py.cmake new file mode 100644 index 000000000..408500ac9 --- /dev/null +++ b/cmake/generate_install_info_py.cmake @@ -0,0 +1,31 @@ +#============================================================================= +# Copyright 2022 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+#============================================================================= + +execute_process( + COMMAND ${CMAKE_C_COMPILER} + -E -DLEGATE_USE_PYTHON_CFFI + -I "${CMAKE_CURRENT_LIST_DIR}/../src/core" + -P "${CMAKE_CURRENT_LIST_DIR}/../src/core/legate_c.h" + ECHO_ERROR_VARIABLE + OUTPUT_VARIABLE header + COMMAND_ERROR_IS_FATAL ANY +) + +set(libpath "") +configure_file( + "${CMAKE_CURRENT_LIST_DIR}/../legate/install_info.py.in" + "${CMAKE_CURRENT_LIST_DIR}/../legate/install_info.py" +@ONLY) diff --git a/legate_core_python.cmake b/legate_core_python.cmake index 85e57d8d2..05d92853e 100644 --- a/legate_core_python.cmake +++ b/legate_core_python.cmake @@ -43,22 +43,14 @@ if(NOT legate_core_FOUND) set(SKBUILD ON) endif() -execute_process( - COMMAND ${CMAKE_C_COMPILER} - -E -DLEGATE_USE_PYTHON_CFFI - -I "${CMAKE_CURRENT_SOURCE_DIR}/core/src" - -P "${CMAKE_CURRENT_SOURCE_DIR}/src/core/legate_c.h" - ECHO_ERROR_VARIABLE - OUTPUT_VARIABLE header - COMMAND_ERROR_IS_FATAL ANY +add_custom_target("generate_install_info_py" ALL + COMMAND ${CMAKE_COMMAND} + -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + -P "${CMAKE_CURRENT_SOURCE_DIR}/cmake/generate_install_info_py.cmake" + COMMENT "Generate install_info.py" + VERBATIM ) -set(libpath "") -configure_file( - "${CMAKE_CURRENT_SOURCE_DIR}/legate/install_info.py.in" - "${CMAKE_CURRENT_SOURCE_DIR}/legate/install_info.py" -@ONLY) - add_library(legate_core_python INTERFACE) add_library(legate::core_python ALIAS legate_core_python) target_link_libraries(legate_core_python INTERFACE legate::core) diff --git a/scripts/build-install.sh b/scripts/build-install.sh index f7b5a3854..a4d671cc3 100755 --- a/scripts/build-install.sh +++ b/scripts/build-install.sh @@ -16,7 +16,7 @@ rm -rf ./{build,_skbuild,dist,legate.core.egg-info} cmake_args="${CMAKE_ARGS:-}" # Use ninja-build if installed -if [[ -n "$(which ninja)" ]]; then cmake_args+="-GNinja"; fi +if [[ -n "$(which ninja)" ]]; then cmake_args+=" -GNinja"; fi # Add other build options here as desired cmake_args+=" diff --git a/scripts/build-no-install.sh b/scripts/build-no-install.sh index 8cb6665e4..50827ce7a 100755 --- a/scripts/build-no-install.sh +++ b/scripts/build-no-install.sh @@ -14,7 +14,7 @@ rm -rf ./{build,_skbuild,dist,legate.core.egg-info} cmake_args="${CMAKE_ARGS:-}" # Use ninja-build if installed -if [[ -n "$(which ninja)" ]]; then cmake_args+="-GNinja"; fi +if [[ -n "$(which ninja)" ]]; then cmake_args+=" -GNinja"; fi # Add other build options here as desired cmake_args+=" diff --git a/scripts/build-separately-no-install.sh b/scripts/build-separately-no-install.sh index 1ffacde26..e8d1e64c5 100755 --- a/scripts/build-separately-no-install.sh +++ b/scripts/build-separately-no-install.sh @@ -14,7 +14,7 @@ rm -rf ./{build,_skbuild,dist,legate.core.egg-info} cmake_args="${CMAKE_ARGS:-}" # Use ninja-build if installed -if [[ -n "$(which ninja)" ]]; then cmake_args+="-GNinja"; fi +if [[ -n "$(which ninja)" ]]; then cmake_args+=" -GNinja"; fi # Add other build options here as desired cmake_args+=" diff --git a/scripts/build-with-legion-no-install.sh b/scripts/build-with-legion-no-install.sh index 5cc03b624..2f3a1e397 100755 --- a/scripts/build-with-legion-no-install.sh +++ b/scripts/build-with-legion-no-install.sh @@ -29,7 +29,7 @@ fi cmake_args="${CMAKE_ARGS:-}" # Use ninja-build if installed -if [[ -n "$(which ninja)" ]]; then cmake_args+="-GNinja"; fi +if [[ -n "$(which ninja)" ]]; then cmake_args+=" -GNinja"; fi # Add other build options here as desired cmake_args+=" diff --git 
a/scripts/build-with-legion-separately-no-install.sh b/scripts/build-with-legion-separately-no-install.sh index a497af581..d15180dc0 100755 --- a/scripts/build-with-legion-separately-no-install.sh +++ b/scripts/build-with-legion-separately-no-install.sh @@ -29,7 +29,7 @@ fi cmake_args="${CMAKE_ARGS:-}" # Use ninja-build if installed -if [[ -n "$(which ninja)" ]]; then cmake_args+="-GNinja"; fi +if [[ -n "$(which ninja)" ]]; then cmake_args+=" -GNinja"; fi # Add other build options here as desired cmake_args+=" From 52047b62abc7f67da18ff6f81b43f3c75fc3d406 Mon Sep 17 00:00:00 2001 From: Manolis Papadakis Date: Mon, 21 Nov 2022 12:08:21 -0800 Subject: [PATCH 068/121] Don't load a Realm network module if running on 1 rank (#484) * Don't load a Realm network module if running on 1 rank * Add a unit test Co-authored-by: Manolis Papadakis --- legate/driver/command.py | 8 ++++ tests/unit/legate/driver/test_command.py | 47 ++++++++++++++++++++++++ 2 files changed, 55 insertions(+) diff --git a/legate/driver/command.py b/legate/driver/command.py index 3ff4bbef7..7aeac6dd9 100644 --- a/legate/driver/command.py +++ b/legate/driver/command.py @@ -302,6 +302,13 @@ def cmd_regmem( return () if regmem == 0 else ("-ll:rsize", str(regmem)) +def cmd_network( + config: ConfigProtocol, system: System, launcher: Launcher +) -> CommandPart: + # Don't initialize a Realm network module if running on a single rank + return () if config.multi_node.ranks > 1 else ("-ll:networks", "none") + + def cmd_log_levels( config: ConfigProtocol, system: System, launcher: Launcher ) -> CommandPart: @@ -384,6 +391,7 @@ def cmd_user_opts( cmd_numamem, cmd_fbmem, cmd_regmem, + cmd_network, cmd_log_levels, cmd_log_file, cmd_eager_alloc, diff --git a/tests/unit/legate/driver/test_command.py b/tests/unit/legate/driver/test_command.py index e79310c93..1c2aa5355 100644 --- a/tests/unit/legate/driver/test_command.py +++ b/tests/unit/legate/driver/test_command.py @@ -58,6 +58,7 @@ def test_CMD_PARTS() -> None: m.cmd_numamem, m.cmd_fbmem, m.cmd_regmem, + m.cmd_network, m.cmd_log_levels, m.cmd_log_file, m.cmd_eager_alloc, @@ -946,6 +947,52 @@ def test_nonzero(self, genobjs: GenObjs, value: str) -> None: assert result == ("-ll:rsize", value) +class Test_cmd_network: + def test_no_launcher_single_rank( + self, + genobjs: GenObjs, + ) -> None: + config, system, launcher = genobjs() + result = m.cmd_network(config, system, launcher) + assert result == ("-ll:networks", "none") + + @pytest.mark.parametrize("rank_var", RANK_ENV_VARS) + def test_no_launcher_multi_rank( + self, + genobjs: GenObjs, + rank_var: dict[str, str], + ) -> None: + config, system, launcher = genobjs( + multi_rank=(2, 2), + rank_env={rank_var: "1"}, + ) + result = m.cmd_network(config, system, launcher) + assert result == () + + @pytest.mark.parametrize("launch", ("mpirun", "jsrun", "srun")) + def test_launcher_single_rank( + self, + genobjs: GenObjs, + launch: LauncherType, + ) -> None: + config, system, launcher = genobjs(["--launcher", launch]) + result = m.cmd_network(config, system, launcher) + assert result == ("-ll:networks", "none") + + @pytest.mark.parametrize("launch", ("mpirun", "jsrun", "srun")) + def test_launcher_multi_rank( + self, + genobjs: GenObjs, + launch: LauncherType, + ) -> None: + config, system, launcher = genobjs( + ["--launcher", launch], + multi_rank=(2, 2), + ) + result = m.cmd_network(config, system, launcher) + assert result == () + + class Test_cmd_log_levels: def test_default(self, genobjs: GenObjs) -> None: config, system, launcher = 
genobjs([]) From 056723f25b8e17fe4c6dd6e68134d98dd054157d Mon Sep 17 00:00:00 2001 From: Manolis Papadakis Date: Mon, 21 Nov 2022 17:27:26 -0800 Subject: [PATCH 069/121] Don't filter out cycles going through __globals__ (#489) Co-authored-by: Manolis Papadakis --- legate/core/cycle_detector.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/legate/core/cycle_detector.py b/legate/core/cycle_detector.py index a90a6531a..e5dc8d855 100644 --- a/legate/core/cycle_detector.py +++ b/legate/core/cycle_detector.py @@ -16,18 +16,12 @@ import gc import inspect from collections import deque -from types import FunctionType, ModuleType +from types import ModuleType from typing import Any, Set, Union def _skip(src: Any, dst: Any) -> bool: - return ( - isinstance(src, type) - or isinstance(src, ModuleType) - or isinstance(src, FunctionType) - and hasattr(src, "__globals__") - and src.__globals__ is dst - ) + return isinstance(src, type) or isinstance(src, ModuleType) def _find_cycles(root: Any, all_ids: Set[int]) -> bool: From 18721950a154821318fb4f20a4a1250447c3a81e Mon Sep 17 00:00:00 2001 From: Manolis Papadakis Date: Mon, 21 Nov 2022 17:33:53 -0800 Subject: [PATCH 070/121] Fix typo in bind.sh --- bind.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bind.sh b/bind.sh index 86a9bf44b..8ccf56203 100755 --- a/bind.sh +++ b/bind.sh @@ -77,7 +77,7 @@ case "$launcher" in ;; jsrun ) local_rank="${OMPI_COMM_WORLD_LOCAL_RANK:-unknown}" - gloabl_rank="${OMPI_COMM_WORLD_RANK:-unknown}" + global_rank="${OMPI_COMM_WORLD_RANK:-unknown}" ;; srun ) local_rank="${SLURM_LOCALID:-unknown}" From 16624698a25e0d3f70c4d76e432987510b1d7027 Mon Sep 17 00:00:00 2001 From: Wonchan Lee Date: Mon, 21 Nov 2022 21:10:10 -0800 Subject: [PATCH 071/121] Update create_buffer to use socket memories whenever available (#487) --- src/core/data/buffer.h | 8 +++----- src/core/legate_c.h | 1 + src/core/mapping/core_mapper.cc | 4 ++++ src/core/runtime/runtime.cc | 6 ++++++ src/core/runtime/runtime.h | 1 + src/core/task/return.h | 4 ++-- src/core/utilities/machine.cc | 20 +++++++++++++++++--- src/core/utilities/machine.h | 2 +- 8 files changed, 35 insertions(+), 11 deletions(-) diff --git a/src/core/data/buffer.h b/src/core/data/buffer.h index f2b5e81e9..91550f69d 100644 --- a/src/core/data/buffer.h +++ b/src/core/data/buffer.h @@ -18,6 +18,8 @@ #include "legion.h" +#include "core/utilities/machine.h" + namespace legate { template @@ -29,11 +31,7 @@ Buffer create_buffer(const Legion::Point& extents, size_t alignment = 16) { using namespace Legion; - if (Memory::Kind::NO_MEMKIND == kind) { - auto proc = Processor::get_executing_processor(); - kind = proc.kind() == Processor::Kind::TOC_PROC ? Memory::Kind::GPU_FB_MEM - : Memory::Kind::SYSTEM_MEM; - } + if (Memory::Kind::NO_MEMKIND == kind) kind = find_memory_kind_for_executing_processor(false); auto hi = extents - Point::ONES(); // We just avoid creating empty buffers, as they cause all sorts of headaches. 
for (int32_t idx = 0; idx < DIM; ++idx) hi[idx] = std::max(hi[idx], 0); diff --git a/src/core/legate_c.h b/src/core/legate_c.h index 5b64b67fd..b29fbf056 100644 --- a/src/core/legate_c.h +++ b/src/core/legate_c.h @@ -55,6 +55,7 @@ typedef enum legate_core_tunable_t { LEGATE_CORE_TUNABLE_TOTAL_GPUS, LEGATE_CORE_TUNABLE_NUM_PIECES, LEGATE_CORE_TUNABLE_NUM_NODES, + LEGATE_CORE_TUNABLE_HAS_SOCKET_MEM, LEGATE_CORE_TUNABLE_MIN_SHARD_VOLUME, LEGATE_CORE_TUNABLE_WINDOW_SIZE, LEGATE_CORE_TUNABLE_MAX_PENDING_EXCEPTIONS, diff --git a/src/core/mapping/core_mapper.cc b/src/core/mapping/core_mapper.cc index ccc738983..7f7fbbaf3 100644 --- a/src/core/mapping/core_mapper.cc +++ b/src/core/mapping/core_mapper.cc @@ -409,6 +409,10 @@ void CoreMapper::select_tunable_value(const MapperContext ctx, pack_tunable(min_cpu_chunk, output); return; } + case LEGATE_CORE_TUNABLE_HAS_SOCKET_MEM: { + pack_tunable(has_socket_mem, output); + return; + } case LEGATE_CORE_TUNABLE_WINDOW_SIZE: { pack_tunable(window_size, output); return; diff --git a/src/core/runtime/runtime.cc b/src/core/runtime/runtime.cc index fb1549cf7..27e14a7da 100644 --- a/src/core/runtime/runtime.cc +++ b/src/core/runtime/runtime.cc @@ -43,6 +43,8 @@ static const char* const core_library_name = "legate.core"; /*static*/ bool Core::log_mapping_decisions = false; +/*static*/ bool Core::has_socket_mem = false; + /*static*/ void Core::parse_config(void) { #ifndef LEGATE_USE_CUDA @@ -199,6 +201,10 @@ extern void register_exception_reduction_op(Runtime* runtime, const LibraryConte register_legate_core_projection_functors(runtime, context); register_legate_core_sharding_functors(runtime, context); + + auto fut = runtime->select_tunable_value( + Runtime::get_context(), LEGATE_CORE_TUNABLE_HAS_SOCKET_MEM, context.get_mapper_id(0)); + Core::has_socket_mem = fut.get_result(); } } // namespace legate diff --git a/src/core/runtime/runtime.h b/src/core/runtime/runtime.h index c92f18091..b7b86c836 100644 --- a/src/core/runtime/runtime.h +++ b/src/core/runtime/runtime.h @@ -43,6 +43,7 @@ class Core { static bool use_empty_task; static bool synchronize_stream_view; static bool log_mapping_decisions; + static bool has_socket_mem; }; } // namespace legate diff --git a/src/core/task/return.h b/src/core/task/return.h index 9fa558e64..031bb71f2 100644 --- a/src/core/task/return.h +++ b/src/core/task/return.h @@ -34,8 +34,8 @@ struct ReturnValue { public: void* ptr(); const void* ptr() const; - const size_t size() const { return size_; } - const bool is_device_value() const { return is_device_value_; } + size_t size() const { return size_; } + bool is_device_value() const { return is_device_value_; } public: // Calls the Legion postamble with an instance diff --git a/src/core/utilities/machine.cc b/src/core/utilities/machine.cc index 843f6d7eb..9d7e31cdd 100644 --- a/src/core/utilities/machine.cc +++ b/src/core/utilities/machine.cc @@ -16,15 +16,29 @@ #include "core/utilities/machine.h" +#include "core/runtime/runtime.h" +#include "legate_defines.h" + using namespace Legion; namespace legate { -Memory::Kind find_memory_kind_for_executing_processor() +Memory::Kind find_memory_kind_for_executing_processor(bool host_accessible) { auto proc = Processor::get_executing_processor(); - return proc.kind() == Processor::Kind::TOC_PROC ? Memory::Kind::Z_COPY_MEM - : Memory::Kind::SYSTEM_MEM; + switch (proc.kind()) { + case Processor::Kind::LOC_PROC: { + return Memory::Kind::SYSTEM_MEM; + } + case Processor::Kind::TOC_PROC: { + return host_accessible ? 
Memory::Kind::Z_COPY_MEM : Memory::Kind::GPU_FB_MEM; + } + case Processor::Kind::OMP_PROC: { + return Core::has_socket_mem ? Memory::Kind::SOCKET_MEM : Memory::Kind::SYSTEM_MEM; + } + } + LEGATE_ABORT; + return Memory::Kind::SYSTEM_MEM; } } // namespace legate diff --git a/src/core/utilities/machine.h b/src/core/utilities/machine.h index a1862336c..b61824542 100644 --- a/src/core/utilities/machine.h +++ b/src/core/utilities/machine.h @@ -20,6 +20,6 @@ namespace legate { -Legion::Memory::Kind find_memory_kind_for_executing_processor(); +Legion::Memory::Kind find_memory_kind_for_executing_processor(bool host_accessible = true); } // namespace legate From e52d20636593e47510a23ad6ba4a6e3fdc0e5fbf Mon Sep 17 00:00:00 2001 From: Manolis Papadakis Date: Tue, 22 Nov 2022 14:36:28 -0800 Subject: [PATCH 072/121] Don't use cmake 3.25.0 in build-isolation mode (#492) Co-authored-by: Manolis Papadakis --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 89a6da552..8f82a0d13 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,7 +18,7 @@ requires = [ "ninja", "setuptools", "scikit-build>=0.13.1", - "cmake>=3.22.1,!=3.23.0", + "cmake>=3.22.1,!=3.23.0,!=3.25.0", ] [tool.black] From dc8228104ed82caf69812b3bbab8afcdcc6a50fd Mon Sep 17 00:00:00 2001 From: Manolis Papadakis Date: Wed, 23 Nov 2022 13:00:45 -0800 Subject: [PATCH 073/121] Throw an error if an allocation is not consumed (#490) * Throw an error if an allocation is not consumed * Adjust allocation count instead of erroring out if unconsumed * Deletion in finalizer needs to be unordered * Use the correct parameter name Co-authored-by: Manolis Papadakis --- legate/core/allocation.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/legate/core/allocation.py b/legate/core/allocation.py index ebfcd97a9..4f7354e42 100644 --- a/legate/core/allocation.py +++ b/legate/core/allocation.py @@ -40,6 +40,12 @@ def __init__( self._strides = strides self._consumed = False + def __del__(self) -> None: + if not self._consumed: + self._region_field.decrement_inline_mapped_ref_count( + unordered=True + ) + def consume( self, ctor: Callable[[tuple[int, ...], int, tuple[int, ...]], Any] ) -> Any: From c555632566eeee530408015df305a171d0bd7a27 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 23 Nov 2022 13:28:07 -0800 Subject: [PATCH 074/121] [pre-commit.ci] pre-commit autoupdate (#480) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * [pre-commit.ci] pre-commit autoupdate updates: - [github.com/pre-commit/mirrors-clang-format: v14.0.6 → v15.0.4](https://github.com/pre-commit/mirrors-clang-format/compare/v14.0.6...v15.0.4) * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .pre-commit-config.yaml | 2 +- src/core/utilities/type_traits.h | 18 ++++++------------ 2 files changed, 7 insertions(+), 13 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index a22015b82..0b75f30a1 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -12,7 +12,7 @@ repos: hooks: - id: flake8 - repo: https://github.com/pre-commit/mirrors-clang-format - rev: 'v14.0.6' + rev: 'v15.0.4' hooks: - id: clang-format files: \.(cu|cuh|h|cc|inl)$ diff --git a/src/core/utilities/type_traits.h b/src/core/utilities/type_traits.h index 
4d8b324b4..84197f528 100644 --- a/src/core/utilities/type_traits.h +++ b/src/core/utilities/type_traits.h @@ -172,27 +172,21 @@ struct is_floating_point { }; template -struct is_complex : std::false_type { -}; +struct is_complex : std::false_type {}; template <> -struct is_complex : std::true_type { -}; +struct is_complex : std::true_type {}; template <> -struct is_complex : std::true_type { -}; +struct is_complex : std::true_type {}; template -struct is_complex_type : std::false_type { -}; +struct is_complex_type : std::false_type {}; template <> -struct is_complex_type> : std::true_type { -}; +struct is_complex_type> : std::true_type {}; template <> -struct is_complex_type> : std::true_type { -}; +struct is_complex_type> : std::true_type {}; } // namespace legate From 91a2ff890c65be62b6625d5be0d82c592863862c Mon Sep 17 00:00:00 2001 From: Rohan Yadav Date: Wed, 23 Nov 2022 15:27:50 -0800 Subject: [PATCH 075/121] legate/core: fix FutureMap leak in communicator shutdown (#495) This commit fixes the following leak, leading to shutdown hangs. ``` found cycle! tail: 0x200092ca1e10: ^ 0x200058dd65c0: ^ ["_handles"] 0x2000588dff80: ^ .__dict__ 0x2000889b11b0: ^ ["_nccl"] 0x2000588dd540: ^ .__dict__ 0x2000887b8430: ^ ["_comm_manager"] 0x2000889f7d00: ^ .__dict__ 0x2000887b9f90: ^ ["runtime"] 0x200173d7b180: cycle: 0x200173d7b180: ^ .__dict__ 0x200173d84370: ^ ["manager"] 0x200173e30cc0: ^ .__dict__ 0x200173db2e00: ^ [85] 0x200173df0f80: ^ ["_freed_fields"] 0x2000588ddb40: ^ .__dict__ 0x2000889b0fa0: ^ ["_field_match_manager"] 0x200173d7b180: ``` Signed-off-by: Rohan Yadav Signed-off-by: Rohan Yadav --- legate/core/communicator.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/legate/core/communicator.py b/legate/core/communicator.py index 019258a51..caa40243c 100644 --- a/legate/core/communicator.py +++ b/legate/core/communicator.py @@ -61,6 +61,10 @@ def initialize(self, volume: int) -> None: def destroy(self) -> None: for volume, handle in self._handles.items(): self._finalize(volume, handle) + # Drop the references to the handles dict after + # all handles have been finalized to ensure that + # no references to FutureMaps are kept. 
+ self._handles = {} @abstractproperty def needs_barrier(self) -> bool: From fd57fdae73e5547a3394255a74342608df0d3686 Mon Sep 17 00:00:00 2001 From: Bryan Van de Ven Date: Mon, 28 Nov 2022 08:54:16 -0800 Subject: [PATCH 076/121] Add type for LegateDataInterface (#494) * add LegateDataInterface protocol and remove object base classes * specific ignore types * tighten up LegateDataInterface definition * remove LDI from Time class * fix circular import * restore type ignore --- legate/core/corelib.py | 2 +- legate/core/legate.py | 36 ++++++++++++++++++++----------- legate/core/runtime.py | 2 +- legate/jupyter/_legion_kernel.py | 6 +++--- legate/jupyter/magic.py | 2 +- legate/tester/args.py | 2 +- legate/timing/timing.py | 10 ++------- tests/unit/legate/util/test_ui.py | 2 +- 8 files changed, 34 insertions(+), 28 deletions(-) diff --git a/legate/core/corelib.py b/legate/core/corelib.py index 4b27bc512..6ccd7cff7 100644 --- a/legate/core/corelib.py +++ b/legate/core/corelib.py @@ -17,7 +17,7 @@ import os from typing import Any, Union -from ..install_info import header, libpath # type: ignore +from ..install_info import header, libpath # type: ignore [import] from .legate import Library from .resource import ResourceConfig diff --git a/legate/core/legate.py b/legate/core/legate.py index 38760f41e..044a41c7e 100644 --- a/legate/core/legate.py +++ b/legate/core/legate.py @@ -15,9 +15,10 @@ from __future__ import annotations import platform -from typing import TYPE_CHECKING, Any, Iterator, Optional, Union +from typing import TYPE_CHECKING, Any, Iterator, Optional, TypedDict, Union import pyarrow +from typing_extensions import Protocol from .resource import ResourceConfig @@ -25,6 +26,17 @@ from .store import Store +class LegateDataInterfaceItem(TypedDict): + version: int + data: dict[pyarrow.Field, Array] + + +class LegateDataInterface(Protocol): + @property + def __legate_data_interface__(self) -> LegateDataInterfaceItem: + ... + + class Array: def __init__( self, @@ -109,7 +121,7 @@ def __len__(self) -> int: raise NotImplementedError("Array.__len__") -class Table: +class Table(LegateDataInterface): def __init__(self, schema: pyarrow.Schema, columns: list[Array]) -> None: """ A Table is a collection of top-level, equal-length Array @@ -127,30 +139,30 @@ def __init__(self, schema: pyarrow.Schema, columns: list[Array]) -> None: self._columns = columns @property - def __legate_data_interface__(self) -> dict[str, Any]: + def __legate_data_interface__(self) -> LegateDataInterfaceItem: """ The Legate data interface allows for different Legate libraries to get access to the base Legion primitives that back objects from different Legate libraries. It currently requires objects that implement it to - return a dictionary that contains two integer members: + return a dictionary that contains two members: Returns ------- A dictionary with the following entries: + 'version' (required) : int An integer showing the version number of this implementation of the interface (i.e. 
1 for this version) - 'data' (required) : OrderedDict[Field,Array] - An ordered dictionary mapping 'Field' objects that represent the - names and types of the field data to 'Array' objects containing + + 'data' (required) : dict[Field, Array] + An dictionary mapping ``pyarrow.Field`` objects that represent the + names and types of the field data to ``Array`` objects containing Store objects + """ - result: dict[str, Any] = dict() - result["version"] = 1 - data = {} + result: LegateDataInterfaceItem = {"version": 1, "data": {}} for index, column in enumerate(self._columns): - data[self._schema.field(index)] = column - result["data"] = data + result["data"][self._schema.field(index)] = column return result def add_column( diff --git a/legate/core/runtime.py b/legate/core/runtime.py index 8306f8b8a..cddc49813 100644 --- a/legate/core/runtime.py +++ b/legate/core/runtime.py @@ -1208,7 +1208,7 @@ def destroy(self) -> None: self.index_spaces = {} # Explicitly release the reference to the partition manager so that # it may be collected, releasing references to Futures and FutureMaps. - self._partition_manager = None # type: ignore + del self._partition_manager if self._finalize_tasks: # Run a gc and then end the legate task diff --git a/legate/jupyter/_legion_kernel.py b/legate/jupyter/_legion_kernel.py index b88d23f30..812f81cf6 100644 --- a/legate/jupyter/_legion_kernel.py +++ b/legate/jupyter/_legion_kernel.py @@ -20,7 +20,7 @@ from contextlib import contextmanager from typing import Any, Iterator, TextIO -from ipykernel.ipkernel import IPythonKernel # type: ignore +from ipykernel.ipkernel import IPythonKernel # type: ignore [import] __version__ = "0.1" @@ -33,7 +33,7 @@ def reset_stdout(stdout: TextIO) -> Iterator[None]: sys.stdout = _stdout -class LegionKernel(IPythonKernel): # type: ignore +class LegionKernel(IPythonKernel): # type: ignore [misc,no-any-unimported] implementation = "legion_kernel" implementation_version = __version__ banner = "Legion IPython Kernel for SM" @@ -55,6 +55,6 @@ def __init__(self, **kwargs: Any) -> None: if __name__ == "__main__": - from ipykernel.kernelapp import IPKernelApp # type: ignore + from ipykernel.kernelapp import IPKernelApp # type: ignore [import] IPKernelApp.launch_instance(kernel_class=LegionKernel) diff --git a/legate/jupyter/magic.py b/legate/jupyter/magic.py index b5b82784c..ba80f9c20 100644 --- a/legate/jupyter/magic.py +++ b/legate/jupyter/magic.py @@ -50,7 +50,7 @@ } -class LegateInfo(object): +class LegateInfo: config: LegateMetadata def __init__(self) -> None: diff --git a/legate/tester/args.py b/legate/tester/args.py index 0645fea9e..4b24077a0 100644 --- a/legate/tester/args.py +++ b/legate/tester/args.py @@ -61,7 +61,7 @@ dest="features", action=ExtendAction, choices=MultipleChoices(sorted(FEATURES)), - type=lambda s: s.split(","), # type: ignore + type=lambda s: s.split(","), # type: ignore [arg-type,return-value] help="Test Legate with features (also via USE_*)", ) diff --git a/legate/timing/timing.py b/legate/timing/timing.py index 9d40409a1..9838f7cb4 100644 --- a/legate/timing/timing.py +++ b/legate/timing/timing.py @@ -25,7 +25,7 @@ import pyarrow -class TimingRuntime(object): +class TimingRuntime: def __init__(self) -> None: self.runtime = get_legion_runtime() self.context = get_legion_context() @@ -53,18 +53,12 @@ def measure_nanoseconds(self) -> Future: ) -class Time(object): +class Time: def __init__(self, future: Future, dtype: Any) -> None: self.future = future self.dtype = dtype self.value: Union[int, float, None] = None - 
@property - def __legate_data_interface__(self) -> dict[str, Any]: - result: dict[str, Any] = {"version": 1, "data": dict()} - result["data"]["Legate Timestamp"] = self - return result - @property def type(self) -> Any: return self.dtype diff --git a/tests/unit/legate/util/test_ui.py b/tests/unit/legate/util/test_ui.py index 4603c053c..56c53dfe7 100644 --- a/tests/unit/legate/util/test_ui.py +++ b/tests/unit/legate/util/test_ui.py @@ -24,7 +24,7 @@ from legate.util import colors, ui as m try: - import colorama # type: ignore + import colorama # type: ignore [import] except ImportError: colorama = None From 25c9771909fcd52a00584cc9dc4dc4b17d3db453 Mon Sep 17 00:00:00 2001 From: Manolis Papadakis Date: Mon, 28 Nov 2022 19:09:24 -0800 Subject: [PATCH 077/121] Check for cycles involving Futures after runtime shutdown (#496) * Check for cycles involving Futures after runtime shutdown * Separate Future cycle check into separate option --- legate/core/cycle_detector.py | 18 +++++++++++++--- legate/core/runtime.py | 39 ++++++++++++++++++++++++++++------- 2 files changed, 47 insertions(+), 10 deletions(-) diff --git a/legate/core/cycle_detector.py b/legate/core/cycle_detector.py index e5dc8d855..14fb73e5e 100644 --- a/legate/core/cycle_detector.py +++ b/legate/core/cycle_detector.py @@ -130,14 +130,26 @@ def _bfs(begin: Any, end: Any, all_ids: Set[int]) -> None: print(f" {_obj_str(begin)}") -def find_cycles() -> bool: - from .store import RegionField +def find_cycles(for_futures: bool) -> bool: + # Avoid importing RegionField when looking for cycles after Runtime + # deletion, because at that point it is impossible to import store.py. + if for_futures: + from ._legion import Future, FutureMap + + def is_interesting(obj: Any) -> bool: + return isinstance(obj, (Future, FutureMap)) + + else: + from .store import RegionField + + def is_interesting(obj: Any) -> bool: + return isinstance(obj, RegionField) found_cycles = False all_objs = gc.get_objects() all_ids = set(id(obj) for obj in all_objs) for obj in all_objs: - if isinstance(obj, RegionField): + if is_interesting(obj): print( f"looking for cycles involving {hex(id(obj))}, " f"of type {type(obj)}" diff --git a/legate/core/runtime.py b/legate/core/runtime.py index cddc49813..48f4391e9 100644 --- a/legate/core/runtime.py +++ b/legate/core/runtime.py @@ -97,12 +97,26 @@ dest="cycle_check", help=( "Check for reference cycles involving RegionField objects on " - "program exit (developer option). Such cycles have the effect " - "of stopping used RegionFields from being repurposed for " - "other Stores, thus increasing memory pressure. By default " - "this mode will miss any cycles already collected by the " - "garbage collector; run gc.disable() at the beginning of the " - "program to avoid this." + "script exit (developer option). When such cycles arise " + "during execution, they stop used RegionFields from getting " + "collected and reused for new Stores, thus increasing memory " + "pressure. By default this check will miss any RegionField " + "cycles the garbage collector collected during execution; " + "run gc.disable() at the beginning of the program to avoid " + "this." + ), + ), + ), + Argument( + "future-leak-check", + ArgSpec( + action="store_true", + default=False, + dest="future_leak_check", + help=( + "Check for reference cycles keeping Future/FutureMap objects " + "alive after Legate runtime exit (developer option). Such " + "leaks can result in Legion runtime shutdown hangs." 
), ), ), @@ -1675,9 +1689,16 @@ def raise_exceptions(self) -> None: def _cleanup_legate_runtime() -> None: global runtime + future_leak_check = runtime._args.future_leak_check runtime.destroy() del runtime gc.collect() + if future_leak_check: + print( + "Looking for cycles that are keeping Future/FutureMap objects " + "alive after Legate runtime exit." + ) + find_cycles(True) add_cleanup_item(_cleanup_legate_runtime) @@ -1691,7 +1712,11 @@ def __getattr__(self, attr: str) -> Any: return getattr(self._wrapped_mod, attr) def __del__(self) -> None: - find_cycles() + print( + "Looking for cycles that are stopping RegionFields from getting " + "collected and reused." + ) + find_cycles(False) if runtime._args.cycle_check: From 472abc966df5da0943c7260931695e32ad4494e8 Mon Sep 17 00:00:00 2001 From: Rohan Yadav Date: Tue, 29 Nov 2022 15:44:42 -0800 Subject: [PATCH 078/121] src/core/mapping: adjust indirect copy mapping for GPUs (#499) Signed-off-by: Rohan Yadav Signed-off-by: Rohan Yadav --- src/core/mapping/base_mapper.cc | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/core/mapping/base_mapper.cc b/src/core/mapping/base_mapper.cc index 2f5f5788d..33681abc0 100644 --- a/src/core/mapping/base_mapper.cc +++ b/src/core/mapping/base_mapper.cc @@ -1049,6 +1049,16 @@ void BaseMapper::map_copy(const MapperContext ctx, auto store_target = default_store_targets(target_proc.kind()).front(); + // If we're mapping an indirect copy and have data resident in GPU memory, + // map everything to CPU memory, as indirect copies on GPUs are currently + // extremely slow. + auto indirect = + !copy.src_indirect_requirements.empty() || !copy.dst_indirect_requirements.empty(); + if (indirect && target_proc.kind() == Processor::TOC_PROC) { + target_proc = local_cpus.front(); + store_target = StoreTarget::SYSMEM; + } + Copy legate_copy(©, runtime, ctx); std::map*> output_map; From 09b43106c87fcb26428155fbd51e9bf20a00c975 Mon Sep 17 00:00:00 2001 From: Rohan Yadav Date: Tue, 29 Nov 2022 15:44:51 -0800 Subject: [PATCH 079/121] legate/core: guard deletion in reset_{store,storage}_key_partition (#498) Signed-off-by: Rohan Yadav Signed-off-by: Rohan Yadav --- legate/core/runtime.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/legate/core/runtime.py b/legate/core/runtime.py index 48f4391e9..637b09f72 100644 --- a/legate/core/runtime.py +++ b/legate/core/runtime.py @@ -891,7 +891,8 @@ def record_store_key_partition( self._store_key_partitions[store_id] = key_partition def reset_store_key_partition(self, store_id: int) -> None: - del self._store_key_partitions[store_id] + if store_id in self._store_key_partitions: + del self._store_key_partitions[store_id] def find_storage_key_partition( self, storage_id: int, restrictions: tuple[Restriction, ...] @@ -909,7 +910,8 @@ def record_storage_key_partition( self._storage_key_partitions[storage_id] = key_partition def reset_storage_key_partition(self, storage_id: int) -> None: - del self._storage_key_partitions[storage_id] + if storage_id in self._storage_key_partitions: + del self._storage_key_partitions[storage_id] def find_legion_partition( self, storage_id: int, functor: PartitionBase From b85a1dff9f1d6a10cf73d2acccc71f3fe827c619 Mon Sep 17 00:00:00 2001 From: Rohan Yadav Date: Tue, 29 Nov 2022 15:44:57 -0800 Subject: [PATCH 080/121] src/core/mapping: temporary fix to base_mapper for collective branch (#493) This commit adjusts an assertion in the base mapper for use with the collective branch of legion. 
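With collective views in the mix, legate_select_sources can legitimately find no candidate source instances, so the hard assertion is relaxed into an early return that lets the runtime pick a source on its own.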
Signed-off-by: Rohan Yadav Signed-off-by: Rohan Yadav --- src/core/mapping/base_mapper.cc | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/core/mapping/base_mapper.cc b/src/core/mapping/base_mapper.cc index 33681abc0..c48428de4 100644 --- a/src/core/mapping/base_mapper.cc +++ b/src/core/mapping/base_mapper.cc @@ -925,7 +925,10 @@ void BaseMapper::legate_select_sources(const MapperContext ctx, } else band_ranking.push_back(std::pair(instance, finder->second)); } - assert(!band_ranking.empty()); + // If there aren't any sources (for example if there are some collective views + // to choose from, not yet in this branch), just return nothing and let the + // runtime pick something for us. + if (band_ranking.empty()) { return; } // Easy case of only one instance if (band_ranking.size() == 1) { ranking.push_back(band_ranking.begin()->first); From f5fac94ab3b230ca05cb25944eaf91872ffaa6c7 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 29 Nov 2022 17:50:54 -0800 Subject: [PATCH 081/121] [pre-commit.ci] pre-commit autoupdate (#502) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/PyCQA/flake8: 5.0.4 → 6.0.0](https://github.com/PyCQA/flake8/compare/5.0.4...6.0.0) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 0b75f30a1..4635440a3 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -8,7 +8,7 @@ repos: hooks: - id: black - repo: https://github.com/PyCQA/flake8 - rev: 5.0.4 + rev: 6.0.0 hooks: - id: flake8 - repo: https://github.com/pre-commit/mirrors-clang-format From 3cdd7252cf485f712e8801f08f3878b7437ffc3b Mon Sep 17 00:00:00 2001 From: Wonchan Lee Date: Tue, 29 Nov 2022 23:36:38 -0800 Subject: [PATCH 082/121] Don't access stream pools unless we're on GPUs (#503) * Don't access stream pools unless we're on GPUs * Fix for the dangling else --- src/core/task/return.cc | 40 ++++++++++++++++++++++++++-------------- 1 file changed, 26 insertions(+), 14 deletions(-) diff --git a/src/core/task/return.cc b/src/core/task/return.cc index c44365516..e11640616 100644 --- a/src/core/task/return.cc +++ b/src/core/task/return.cc @@ -208,17 +208,19 @@ void ReturnValues::legion_serialize(void* buffer) const // // the size of value i is computed by offsets[i] - (i == 0 ? 
0 : offsets[i-1]) -#ifdef LEGATE_USE_CUDA - auto stream = cuda::StreamPool::get_stream_pool().get_stream(); -#endif - + // Special case with a single scalar if (return_values_.size() == 1) { auto& ret = return_values_.front(); -#ifdef LEGATE_USE_CUDA - if (ret.is_device_value()) - CHECK_CUDA(cudaMemcpyAsync(buffer, ret.ptr(), ret.size(), cudaMemcpyDeviceToHost, stream)); - else + if (ret.is_device_value()) { +#ifdef DEBUG_LEGATE + assert(Processor::get_executing_processor().kind() == Processor::Kind::TOC_PROC); #endif + CHECK_CUDA(cudaMemcpyAsync(buffer, + ret.ptr(), + ret.size(), + cudaMemcpyDeviceToHost, + cuda::StreamPool::get_stream_pool().get_stream())); + } else memcpy(buffer, ret.ptr(), ret.size()); return; } @@ -233,15 +235,25 @@ void ReturnValues::legion_serialize(void* buffer) const ptr = ptr + sizeof(uint32_t); } - for (auto ret : return_values_) { - uint32_t size = ret.size(); #ifdef LEGATE_USE_CUDA - if (ret.is_device_value()) - CHECK_CUDA(cudaMemcpyAsync(ptr, ret.ptr(), size, cudaMemcpyDeviceToHost, stream)); - else + if (Processor::get_executing_processor().kind() == Processor::Kind::TOC_PROC) { + auto stream = cuda::StreamPool::get_stream_pool().get_stream(); + for (auto ret : return_values_) { + uint32_t size = ret.size(); + if (ret.is_device_value()) + CHECK_CUDA(cudaMemcpyAsync(ptr, ret.ptr(), size, cudaMemcpyDeviceToHost, stream)); + else + memcpy(ptr, ret.ptr(), size); + ptr += size; + } + } else #endif + { + for (auto ret : return_values_) { + uint32_t size = ret.size(); memcpy(ptr, ret.ptr(), size); - ptr += size; + ptr += size; + } } } From d7d2d8207178e3ec542326e64f4ee578ed12ab73 Mon Sep 17 00:00:00 2001 From: Marcin Zalewski Date: Wed, 30 Nov 2022 14:49:53 -0800 Subject: [PATCH 083/121] Add typing-extensions run export to conda build (#504) Add typing-extensions run export to conda build Co-authored-by: Marcin Zalewski --- conda/conda-build/meta.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/conda/conda-build/meta.yaml b/conda/conda-build/meta.yaml index 32d765ead..51f56180d 100644 --- a/conda/conda-build/meta.yaml +++ b/conda/conda-build/meta.yaml @@ -119,6 +119,7 @@ requirements: - llvm-openmp - numpy {{ numpy_version }} - pyarrow {{ pyarrow_version }} + - typing_extensions {% if gpu_enabled_bool %} - cuda-cudart >={{ cuda_version }} - nccl From 4ca97f7ecb620f198a6aac1a83daeee391756fc1 Mon Sep 17 00:00:00 2001 From: Manolis Papadakis Date: Thu, 1 Dec 2022 10:10:34 -0800 Subject: [PATCH 084/121] Add CTK 11.8 to environment generation script (#505) --- scripts/generate-conda-envs.py | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/generate-conda-envs.py b/scripts/generate-conda-envs.py index 494b408ba..dc92d373d 100755 --- a/scripts/generate-conda-envs.py +++ b/scripts/generate-conda-envs.py @@ -221,6 +221,7 @@ def filename(self) -> str: "11.5", "11.6", "11.7", + "11.8", ) OS_NAMES: Tuple[OSType, ...] 
= ("linux", "osx") From 0a556f23f7ed53cd3d54efcc386ff0bb85776eda Mon Sep 17 00:00:00 2001 From: Bryan Van de Ven Date: Thu, 1 Dec 2022 14:39:36 -0800 Subject: [PATCH 085/121] Add machinery for legate.core base tests (#500) * Add machinery for legate.core base tests * fixes to tests * skip main plumbing tests for now --- .github/workflows/ci.yml | 66 ++++++++++++++++++++++++- tests/unit/legate/driver/test_main.py | 6 ++- tests/unit/legate/jupyter/test_main.py | 8 ++- tests/unit/legate/tester/test_config.py | 3 +- tests/unit/legate/util/test_system.py | 6 --- 5 files changed, 78 insertions(+), 11 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 11b700eab..94373c39a 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -1,5 +1,5 @@ name: Build legate.core -on: +on: push: branches-ignore: - gh-pages # deployment target branch (this workflow should not exist on that branch anyway) @@ -63,4 +63,66 @@ jobs: uses: actions/upload-artifact@v3 with: name: build-log - path: ./**/${{ env.COMMIT }}-build.log.gpg \ No newline at end of file + path: ./**/${{ env.COMMIT }}-build.log.gpg + test: + if: ${{ github.repository == 'nv-legate/legate.core' }} + runs-on: self-hosted + needs: build + strategy: + fail-fast: false + matrix: + include: + - {name: mypy, options: mypy, log: mypy} + - {name: pytest unit tests, options: unit, log: pytest} + name: ${{ matrix.name }} + steps: + - name: Dump GitHub context + env: + GITHUB_CONTEXT: ${{ toJSON(github) }} + run: echo "$GITHUB_CONTEXT" + - name: Dump job context + env: + JOB_CONTEXT: ${{ toJSON(job) }} + run: echo "$JOB_CONTEXT" + - name: Dump steps context + env: + STEPS_CONTEXT: ${{ toJSON(steps) }} + run: echo "$STEPS_CONTEXT" + - name: Dump runner context + env: + RUNNER_CONTEXT: ${{ toJSON(runner) }} + run: echo "$RUNNER_CONTEXT" + - name: Dump strategy context + env: + STRATEGY_CONTEXT: ${{ toJSON(strategy) }} + run: echo "$STRATEGY_CONTEXT" + - name: Dump matrix context + env: + MATRIX_CONTEXT: ${{ toJSON(matrix) }} + run: echo "$MATRIX_CONTEXT" + - name: Prepare + run: | + /data/github-runner/legate-bin/setup.sh + cd legate-ci/github-ci/legate.core + if [[ ! -d ngc-artifacts ]] + then + mkdir ngc-artifacts + else + rm -rf ngc-artifacts/* + fi + - name: Test + run: | + cd legate-ci/github-ci/legate.core + ./test.sh ${{ matrix.options }} > ${COMMIT}-test-${{ matrix.log }}.log 2>&1 + - name: Process output + if: always() + run: | + cd legate-ci/github-ci/legate.core + /data/github-runner/legate-bin/encrypt.sh ${COMMIT}-test-${{ matrix.log }}.log + cat *artifacts/*/* + - name: Upload Log + if: always() + uses: actions/upload-artifact@v3 + with: + name: test-${{ matrix.log }}-log + path: ./**/${{ env.COMMIT }}-test-${{ matrix.log }}.log.gpg diff --git a/tests/unit/legate/driver/test_main.py b/tests/unit/legate/driver/test_main.py index a5537afba..c0dfd07d5 100644 --- a/tests/unit/legate/driver/test_main.py +++ b/tests/unit/legate/driver/test_main.py @@ -16,6 +16,7 @@ import sys +import pytest from pytest_mock import MockerFixture import legate.driver as m @@ -27,6 +28,9 @@ # all the expected plumbing is hooked up as it is supposed to be +# TODO: this test with the fake argv path does not work for the way +# legate is installed in CI, so skip for now. 
+@pytest.mark.skip def test_main(mocker: MockerFixture) -> None: import legate.driver.config import legate.driver.driver @@ -36,7 +40,7 @@ def test_main(mocker: MockerFixture) -> None: system_spy = mocker.spy(legate.util.system.System, "__init__") driver_spy = mocker.spy(legate.driver.driver.Driver, "__init__") mocker.patch("legate.driver.driver.Driver.run", return_value=123) - mocker.patch.object(sys, "argv", ["foo", "bar"]) + mocker.patch.object(sys, "argv", ["/some/path/foo", "bar"]) result = m.main() diff --git a/tests/unit/legate/jupyter/test_main.py b/tests/unit/legate/jupyter/test_main.py index 0e0159dc9..a92b071c7 100644 --- a/tests/unit/legate/jupyter/test_main.py +++ b/tests/unit/legate/jupyter/test_main.py @@ -16,6 +16,7 @@ import sys +import pytest from pytest_mock import MockerFixture import legate.jupyter as m @@ -27,6 +28,9 @@ # all the expected plumbing is hooked up as it is supposed to be +# TODO: this test with the fake argv path does not work for the way +# legate is installed in CI, so skip for now. +@pytest.mark.skip def test_main(mocker: MockerFixture) -> None: import legate.driver.driver import legate.jupyter.config @@ -37,7 +41,9 @@ def test_main(mocker: MockerFixture) -> None: driver_spy = mocker.spy(legate.driver.driver.Driver, "__init__") generate_spy = mocker.spy(legate.jupyter.kernel, "generate_kernel_spec") install_mock = mocker.patch("legate.jupyter.kernel.install_kernel_spec") - mocker.patch.object(sys, "argv", ["legate-jupyter", "--name", "foo"]) + mocker.patch.object( + sys, "argv", ["/some/path/legate-jupyter", "--name", "foo"] + ) m.main() diff --git a/tests/unit/legate/tester/test_config.py b/tests/unit/legate/tester/test_config.py index f0e351caf..ac7b30f07 100644 --- a/tests/unit/legate/tester/test_config.py +++ b/tests/unit/legate/tester/test_config.py @@ -17,6 +17,7 @@ """ from __future__ import annotations +import os from pathlib import Path, PurePath import pytest @@ -64,7 +65,7 @@ def test_default_init(self) -> None: assert c.legate_dir is None assert c.extra_args == [] - assert c.root_dir == PurePath(m.__file__).parents[2] + assert c.root_dir == PurePath(os.getcwd()) # TODO (bv) restore when generalized # assert len(c.test_files) > 0 diff --git a/tests/unit/legate/util/test_system.py b/tests/unit/legate/util/test_system.py index 38db9cc0b..0115e417f 100644 --- a/tests/unit/legate/util/test_system.py +++ b/tests/unit/legate/util/test_system.py @@ -98,12 +98,6 @@ def test_cpus(self) -> None: assert len(cpus) > 0 assert all(len(cpu.ids) > 0 for cpu in cpus) - @pytest.mark.skipif(platform.system() != "Linux", reason="Linux test") - def test_gpus_linux(self) -> None: - s = m.System() - # can't really assume / test much here - s.gpus - @pytest.mark.skipif(platform.system() != "Darwin", reason="OSX test") def test_gpus_osx(self) -> None: s = m.System() From 0c68b4c9868cec53a3fb612269ea8282d7144b32 Mon Sep 17 00:00:00 2001 From: Manolis Papadakis Date: Thu, 1 Dec 2022 15:28:02 -0800 Subject: [PATCH 086/121] Fix non-CUDA build (#506) --- src/core/task/return.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/core/task/return.cc b/src/core/task/return.cc index e11640616..98c70419a 100644 --- a/src/core/task/return.cc +++ b/src/core/task/return.cc @@ -211,6 +211,7 @@ void ReturnValues::legion_serialize(void* buffer) const // Special case with a single scalar if (return_values_.size() == 1) { auto& ret = return_values_.front(); +#ifdef LEGATE_USE_CUDA if (ret.is_device_value()) { #ifdef DEBUG_LEGATE assert(Processor::get_executing_processor().kind() 
== Processor::Kind::TOC_PROC); @@ -221,6 +222,7 @@ void ReturnValues::legion_serialize(void* buffer) const cudaMemcpyDeviceToHost, cuda::StreamPool::get_stream_pool().get_stream())); } else +#endif memcpy(buffer, ret.ptr(), ret.size()); return; } From 8a9d35b4e58dbee74bfbc38028568baf30a4e4ba Mon Sep 17 00:00:00 2001 From: Rohan Yadav Date: Thu, 1 Dec 2022 16:27:21 -0800 Subject: [PATCH 087/121] legate/core: reset storage key partition in Store::reset_key_partition (#507) Signed-off-by: Rohan Yadav Signed-off-by: Rohan Yadav --- legate/core/store.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/legate/core/store.py b/legate/core/store.py index 2f4e2f9a7..0d55cbf62 100644 --- a/legate/core/store.py +++ b/legate/core/store.py @@ -1249,6 +1249,8 @@ def set_key_partition(self, partition: PartitionBase) -> None: def reset_key_partition(self) -> None: runtime.partition_manager.reset_store_key_partition(self._unique_id) + # Also reset the storage's key partition. + self._storage.reset_key_partition() def compute_key_partition( self, restrictions: tuple[Restriction, ...] From ab2f34fc0d354dfa1f627fcd857bad49303241f8 Mon Sep 17 00:00:00 2001 From: Bryan Van de Ven Date: Fri, 2 Dec 2022 12:41:40 -0800 Subject: [PATCH 088/121] Don't nocr for single node (#508) * Don't nocr for single node * handle multi-rank more generally --- legate/driver/config.py | 6 +++- tests/unit/legate/driver/test_command.py | 37 +++++++++++++++++++++++- tests/unit/legate/driver/test_config.py | 25 ++++++++++++++-- 3 files changed, 64 insertions(+), 4 deletions(-) diff --git a/legate/driver/config.py b/legate/driver/config.py index 6a526214f..0be36f959 100644 --- a/legate/driver/config.py +++ b/legate/driver/config.py @@ -206,7 +206,11 @@ def console(self) -> bool: return not any(opt.endswith(".py") for opt in self.user_opts) def _fixup_nocr(self, args: Namespace) -> None: - if self.console and not args.not_control_replicable: + # this is slightly duplicative of MultiNode.ranks property, but fixup + # checks happen before sub-configs are initialized from args + ranks = int(args.nodes) * int(args.ranks_per_node) + + if self.console and not args.not_control_replicable and ranks > 1: print(warn("Disabling control replication for interactive run")) args.not_control_replicable = True diff --git a/tests/unit/legate/driver/test_command.py b/tests/unit/legate/driver/test_command.py index 1c2aa5355..38f247ea3 100644 --- a/tests/unit/legate/driver/test_command.py +++ b/tests/unit/legate/driver/test_command.py @@ -519,11 +519,46 @@ def test_default(self, genobjs: GenObjs) -> None: assert result == () - def test_console(self, genobjs: GenObjs) -> None: + def test_console_single_node(self, genobjs: GenObjs) -> None: config, system, launcher = genobjs([], fake_module=None) result = m.cmd_nocr(config, system, launcher) + assert result == () + + @pytest.mark.parametrize("rank_var", RANK_ENV_VARS) + @pytest.mark.parametrize("rank", ("0", "1", "2")) + def test_console_multi_node( + self, genobjs: GenObjs, rank: str, rank_var: dict[str, str] + ) -> None: + config, system, launcher = genobjs( + # passing --nodes is not usually necessary for genobjs but we + # are probing a "fixup" check that inspect args directly + ["--nodes", "2"], + multi_rank=(2, 1), + rank_env={rank_var: rank}, + fake_module=None, + ) + + result = m.cmd_nocr(config, system, launcher) + + assert result == ("--nocr",) + + @pytest.mark.parametrize("rank_var", RANK_ENV_VARS) + def test_console_multi_rank( + self, genobjs: GenObjs, rank_var: dict[str, str] + ) -> None: + 
config, system, launcher = genobjs( + # passing --ranks-per-node is not usually necessary for genobjs + # but we are probing a "fixup" check that inspect args directly + ["--ranks-per-node", "2"], + multi_rank=(1, 2), + rank_env={rank_var: "0"}, + fake_module=None, + ) + + result = m.cmd_nocr(config, system, launcher) + assert result == ("--nocr",) def test_with_option(self, genobjs: GenObjs) -> None: diff --git a/tests/unit/legate/driver/test_config.py b/tests/unit/legate/driver/test_config.py index 483719a6e..91b6f5056 100644 --- a/tests/unit/legate/driver/test_config.py +++ b/tests/unit/legate/driver/test_config.py @@ -284,7 +284,7 @@ def test_default_init(self) -> None: assert c.multi_node == m.MultiNode( nodes=defaults.LEGATE_NODES, ranks_per_node=defaults.LEGATE_RANKS_PER_NODE, - not_control_replicable=True, + not_control_replicable=False, launcher="none", launcher_extra=[], ) @@ -373,9 +373,30 @@ def test_arg_conversions(self, mocker: MockerFixture) -> None: ] ) - def test_nocr_fixup(self, capsys: Capsys) -> None: + def test_nocr_fixup_default_single_node(self, capsys: Capsys) -> None: c = m.Config(["legate"]) + assert c.console + assert not c.multi_node.not_control_replicable + + out, _ = capsys.readouterr() + assert scrub(out).strip() == "" + + def test_nocr_fixup_multi_node(self, capsys: Capsys) -> None: + c = m.Config(["legate", "--nodes", "2"]) + + assert c.console + assert c.multi_node.not_control_replicable + + out, _ = capsys.readouterr() + assert ( + scrub(out).strip() + == "WARNING: Disabling control replication for interactive run" + ) + + def test_nocr_fixup_multi_rank(self, capsys: Capsys) -> None: + c = m.Config(["legate", "--ranks-per-node", "2"]) + assert c.console assert c.multi_node.not_control_replicable From 363c478abda48aee0bcddfcf0e24beb1f7361439 Mon Sep 17 00:00:00 2001 From: Wonchan Lee Date: Mon, 12 Dec 2022 11:43:19 -0800 Subject: [PATCH 089/121] Fix for 509 (#510) * Retry constraint solving with fewer key partitions when the first round makes the operation sequential * Add comments to the new code --- legate/core/solver.py | 158 +++++++++++++++++++++++++++++------------- 1 file changed, 110 insertions(+), 48 deletions(-) diff --git a/legate/core/solver.py b/legate/core/solver.py index d34f0d62b..8efe88866 100644 --- a/legate/core/solver.py +++ b/legate/core/solver.py @@ -14,7 +14,7 @@ # from __future__ import annotations -from typing import TYPE_CHECKING, Generic, List, Optional, TypeVar +from typing import TYPE_CHECKING, Generic, List, Optional, Tuple, TypeVar from . import FieldSpace, Future, Rect from .constraints import Alignment, Broadcast, Containment, PartSym @@ -350,13 +350,16 @@ def compute_launch_shape( partitions: dict[PartSym, PartitionBase], all_outputs: set[Store], unbound_ndim: Optional[int], - ) -> Optional[Shape]: + # The Boolean return value denotes whether the computed launch shape + # is "final". If it's True, there's no room for the solver to improve + # the quality of parallelization. 
+ ) -> Tuple[Optional[Shape], bool]: # We filter out the cases where any of the outputs is assigned # to replication, in which case the operation must be performed # sequentially for unknown, part in partitions.items(): if unknown.store in all_outputs and part is REPLICATE: - return None + return None, True # If we're here, this means that replicated stores are safe to access # in parallel, so we filter those out to determine the launch domain @@ -364,7 +367,7 @@ def compute_launch_shape( # If all stores are replicated, we can't parallelize the operation if len(parts) == 0: - return None + return None, True # Here we check if all partitions agree on the color shape must_be_1d_launch = False @@ -382,7 +385,7 @@ def compute_launch_shape( # we can't use a 1-D launch domain, hence falling back to # a sequential launch if unbound_ndim is not None and unbound_ndim != 1: - return None + return None, True # If all color spaces don't have the same number of colors, # it means some inputs are much smaller than the others @@ -393,15 +396,80 @@ def compute_launch_shape( assert part.color_shape is not None volumes.add(part.color_shape.volume()) if len(volumes) > 1: - return None + return None, False else: - return Shape(volumes) + return Shape(volumes), True # If there is an unbound store, the store's dimensionality must be # the same as that of the launch domain elif unbound_ndim is None or unbound_ndim == launch_shape.ndim: - return launch_shape + return launch_shape, True else: - return None + return None, True + + def _solve_store_constraints( + self, + partitions: dict[PartSym, PartitionBase], + unknowns: list[PartSym], + dependent: dict[PartSym, Expr], + all_restrictions: dict[PartSym, Restrictions], + constraints: EqClass[PartSym], + must_be_even: OrderedSet[PartSym], + ) -> Tuple[dict[PartSym, PartitionBase], set[PartSym]]: + result = partitions.copy() + key_parts: set[PartSym] = set() + + for unknown in unknowns: + if unknown in result: + continue + elif unknown in dependent: + continue + + store = unknown.store + restrictions = all_restrictions[unknown] + cls = constraints.find(unknown) + + partition = store.compute_key_partition(restrictions) + if not partition.even and len(cls) > 1: + partition, unknown = self.maybe_find_alternative_key_partition( + partition, + unknown, + cls, + restrictions, + must_be_even, + ) + key_parts.add(unknown) + + for to_align in cls: + if to_align in result: + continue + result[to_align] = partition + + for rhs, lhs in dependent.items(): + expr = lhs.subst(result).reduce() + if TYPE_CHECKING: + assert isinstance(expr, Lit) + result[rhs] = expr._part + + return result, key_parts + + @staticmethod + def _reset_less_optimal_partitions( + partitions: dict[PartSym, PartitionBase], + ) -> bool: + valid_parts = [ + (part.color_shape.volume(), unknown) + for unknown, part in partitions.items() + if part.color_shape is not None + ] + max_dop = max(dop for dop, _ in valid_parts) + reset_any = False + for dop, unknown in valid_parts: + if dop == max_dop: + continue + reset_any = True + unknown.store.reset_key_partition() + + return reset_any def partition_stores(self) -> Strategy: unknowns: OrderedSet[PartSym] = OrderedSet() @@ -476,45 +544,39 @@ def cost(unknown: PartSym) -> tuple[int, bool]: not store.has_key_partition(all_restrictions[unknown]), ) - sorted_unknowns = sorted(unknowns, key=cost) - - key_parts = set() - for unknown in sorted_unknowns: - if unknown in partitions: - continue - elif unknown in dependent: - continue - - store = unknown.store - restrictions = 
all_restrictions[unknown] - cls = constraints.find(unknown) - - partition = store.compute_key_partition(restrictions) - if not partition.even and len(cls) > 1: - partition, unknown = self.maybe_find_alternative_key_partition( - partition, - unknown, - cls, - restrictions, - must_be_even, - ) - key_parts.add(unknown) + result: dict[PartSym, PartitionBase] + key_parts: set[PartSym] + launch_shape: Optional[Shape] + + can_retry = True + while True: + result, key_parts = self._solve_store_constraints( + partitions, + sorted(unknowns, key=cost), + dependent, + all_restrictions, + constraints, + must_be_even, + ) - for to_align in cls: - if to_align in partitions: + launch_shape, done = self.compute_launch_shape( + result, all_outputs, unbound_ndim + ) + # When partitions have different numbers of chunks, the solver + # normally decides to serialize the operation, as there's no + # obvious mapping between the partitions. However, it is + # sometimes possible to recover parallelism by searching for + # an alternative solution to the given set of partitioning + # constraints, especially when some of the stores have cached key + # partitions that are not computed for themselves but copied from + # others due to alignments. + if can_retry and not done: + # We only retry once because resetting the cached key + # partitions followed recomputing key partitions is + # idempotent. + can_retry = False + if self._reset_less_optimal_partitions(result): continue - partitions[to_align] = partition - - for rhs, lhs in dependent.items(): - expr = lhs.subst(partitions).reduce() - if TYPE_CHECKING: - assert isinstance(expr, Lit) - partitions[rhs] = expr._part - - launch_shape = self.compute_launch_shape( - partitions, all_outputs, unbound_ndim - ) + break - return Strategy( - launch_shape, partitions, fspaces, key_parts, constraints - ) + return Strategy(launch_shape, result, fspaces, key_parts, constraints) From bce684e6d7b651b3fbe6be89badeea8877c56067 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 12 Dec 2022 22:04:09 -0800 Subject: [PATCH 090/121] [pre-commit.ci] pre-commit autoupdate (#513) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/PyCQA/isort: 5.10.1 → 5.11.1](https://github.com/PyCQA/isort/compare/5.10.1...5.11.1) - [github.com/psf/black: 22.10.0 → 22.12.0](https://github.com/psf/black/compare/22.10.0...22.12.0) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .pre-commit-config.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 4635440a3..bb12ddb99 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,10 +1,10 @@ repos: - repo: https://github.com/PyCQA/isort - rev: 5.10.1 + rev: 5.11.1 hooks: - id: isort - repo: https://github.com/psf/black - rev: 22.10.0 + rev: 22.12.0 hooks: - id: black - repo: https://github.com/PyCQA/flake8 From 4d6fae5a008238324669487212c5b9a3583130d4 Mon Sep 17 00:00:00 2001 From: Bryan Van de Ven Date: Thu, 15 Dec 2022 10:46:47 -0800 Subject: [PATCH 091/121] fix uninstall instructions for jupyter extension (#514) --- legate/jupyter/kernel.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/legate/jupyter/kernel.py b/legate/jupyter/kernel.py index e71604ed8..daadae9ff 100644 --- a/legate/jupyter/kernel.py +++ b/legate/jupyter/kernel.py @@ -90,9 +90,11 @@ def install_kernel_spec(spec: 
KernelSpec, config: Config) -> None: except NoSuchKernel: pass else: + # inexplicably, install_kernel_spec calls lower on the supplied kernel + # name before using, so we need to call lower for this advice to work msg = error( f"kernel spec {spec_name!r} already exists. Remove it by " - f"running 'jupyter kernelspec uninstall {spec_name!r}, " + f"running: 'jupyter kernelspec uninstall {spec_name.lower()}', " "or choose a new kernel name." ) print(msg) From dd8b87f22d1773509f5895696f2460c7b54102ff Mon Sep 17 00:00:00 2001 From: Wonchan Lee Date: Fri, 16 Dec 2022 14:58:10 -0800 Subject: [PATCH 092/121] Change the default legion branch to collective for now (#515) --- install.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/install.py b/install.py index fdff8e692..e3dfdef1c 100755 --- a/install.py +++ b/install.py @@ -724,7 +724,7 @@ def driver(): "--legion-branch", dest="legion_branch", required=False, - default="control_replication", + default="collective", help="Legion branch to build Legate with.", ) args, unknown = parser.parse_known_args() From 080aa750cb1d9b2d6388417b63026cca78195934 Mon Sep 17 00:00:00 2001 From: Manolis Papadakis Date: Sat, 17 Dec 2022 02:25:54 +0200 Subject: [PATCH 093/121] Improve build documentation (#517) * Install ninja on dev envs, for better build error messages * Cosmetic changes * Add documentation for some more dependencies * Put the basic build workflow clearly on the top * Typo --- BUILD.md | 201 +++++++++++++++++++-------------- README.md | 4 - scripts/generate-conda-envs.py | 1 + 3 files changed, 118 insertions(+), 88 deletions(-) diff --git a/BUILD.md b/BUILD.md index f14dd8710..2191c2c7b 100644 --- a/BUILD.md +++ b/BUILD.md @@ -15,13 +15,12 @@ limitations under the License. --> -# TL;DR +# Basic build -1) Check if there are specialized scripts available for your cluster at [nv-legate/quickstart](https://github.com/nv-legate/quickstart). -2) [Install dependencies from conda](#getting-dependencies-through-conda) -3) [Build using install.py](#using-installpy) - -# Getting dependencies +If you are building on a cluster, first check if there are specialized scripts +available for your cluster at +[nv-legate/quickstart](https://github.com/nv-legate/quickstart). Even if your +specific cluster is not covered, you may be able to adapt an existing workflow. ## Getting dependencies through conda @@ -40,7 +39,7 @@ $ ./scripts/generate-conda-envs.py --python 3.10 --ctk 11.7 --os linux --compile Run this script with `-h` to see all available configuration options for the generated environment file (e.g. all the supported Python versions). See the -[Notable Dependencies](#notable-dependencies) section for more details. +[Dependencies](#dependency-listing) section for more details. Once you have this environment file, you can install the required packages by creating a new conda environment: @@ -55,20 +54,77 @@ or by updating an existing environment: conda env update -f .yaml ``` -## Notable dependencies +## Building through install.py + +The Legate Core repository comes with a helper `install.py` script in the +top-level directory, that will build the C++ parts of the library and install +the C++ and Python components under the currently active Python environment. 
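For a basic CPU-only build it is usually enough to run the script with no arguments (a minimal sketch; see the `--help` flag described below for the full list of options):

```shell
./install.py
```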
+ +To add GPU support, use the `--cuda` flag: + +```shell +./install.py --cuda +``` + +You can specify the CUDA toolkit directory and the CUDA architecture you want to +target using the `--with-cuda` and `--arch` flags, e.g.: + +```shell +./install.py --cuda --with-cuda /usr/local/cuda/ --arch ampere +``` + +By default the script relies on CMake's auto-detection for these settings. +CMake will first search the currently active Python/conda environment +for dependencies, then any common system-wide installation directories (e.g. +`/usr/lib`). If a dependency cannot be found but is publicly available in source +form (e.g. OpenBLAS), cmake will fetch and build it automatically. You can +override this search by providing an install location for any dependency +explicitly, using a `--with-` flag, e.g. `--with-nccl` and +`--with-openblas`. + +For multi-node execution Legate uses [GASNet](https://gasnet.lbl.gov/) which can be +requested using the `--network gasnet1` or `--network gasnetex` flag. By default +GASNet will be automatically downloaded and built, but if you have an existing +installation then you can inform the install script using the `--with-gasnet` flag. +You also need to specify the interconnect network of the target machine using the +`--conduit` flag. + +For example this would be an installation for a +[DGX SuperPOD](https://www.nvidia.com/en-us/data-center/dgx-superpod/): + +```shell +./install.py --network gasnet1 --conduit ibv --cuda --arch ampere +``` + +Alternatively, here is an install line for the +[Piz-Daint](https://www.cscs.ch/computers/dismissed/piz-daint-piz-dora/) supercomputer: -### OS (`--os` option) +```shell +./install.py --network gasnet1 --conduit aries --cuda --arch pascal +``` + +To see all available configuration options, run with the `--help` flag: + +```shell +./install.py --help +``` + +# Advanced topics + +## Dependency listing + +### OS (`--os` flag) Legate has been tested on Linux and MacOS, although only a few flavors of Linux such as Ubuntu have been thoroughly tested. There is currently no support for Windows. -### Python >= 3.8 (`--python` option) +### Python >= 3.8 (`--python` flag) In terms of Python compatibility, Legate *roughly* follows the timeline outlined in [NEP 29](https://numpy.org/neps/nep-0029-deprecation_policy.html). -### C++17 compatible compiler (`--compilers` option) +### C++17 compatible compiler (`--compilers` flag) For example: g++, clang, or nvc++. When creating an environment using the `--compilers` flag, an appropriate compiler for the current system will be @@ -91,7 +147,7 @@ stubs, are not distributed through conda. These must instead be installed using [system-level packages](https://developer.nvidia.com/cuda-downloads). Independent of the system-level CUDA installation, conda will need to install an -environment-local copy of the CUDA toolkit (which is what the `--ctk` option +environment-local copy of the CUDA toolkit (which is what the `--ctk` flag controls). To avoid versioning conflicts it is safest to match the version of CUDA installed system-wide on your machine @@ -100,12 +156,51 @@ architectures. You can use Legate with Pascal GPUs as well, but there could be issues due to lack of independent thread scheduling. Please report any such issues on GitHub. -### Fortran compiler (optional) +### CUDA Libraries (optional) + +Only necessary if you wish to run with Nvidia GPUs. 
+ +The following libraries are included automatically in CUDA-enabled environment +files: + +- `cutensor` +- `nccl` + +If you wish to provide alternative installations for these, then you can remove +them from the environment file and pass the corresponding `--with-` flag +to `install.py`. + +### Build tools + +The following tools are used for building Legate, and are automatically included +in the environment file: -Only necessary if you wish to build OpenBLAS from source. +- `cmake` +- `git` +- `make` +- `ninja` (this is optional, but produces more informative build output) +- `scikit-build` -Not included by default in the generated conda environment files; install -`fortran-compiler` from `conda-forge` if you need it. +### OpenBLAS + +This library is automatically pulled from conda. If you wish to provide an +alternative installation, then you can manually remove `openblas` from the +generated environment file and pass `--with-openblas` to `install.py`. + +Note that you will need to get a Fortran compiler before you can build OpenBLAS +from source, e.g. by pulling `fortran-compiler` from `conda-forge`. + +If you wish to compile Legate with OpenMP support, then you need a build of +OpenBLAS configured with the following options: + +- `USE_THREAD=1` +- `USE_OPENMP=1` +- `NUM_PARALLEL=32` (or at least as many as the NUMA domains on the target + machine) -- The `NUM_PARALLEL` flag defines how many instances of OpenBLAS's + calculation API can run in parallel. Legate will typically instantiate a + separate OpenMP group per NUMA domain, and each group can launch independent + BLAS work. If `NUM_PARALLEL` is not high enough, some of this parallel work + will be serialized. ### Numactl (optional) @@ -114,7 +209,7 @@ Required to support CPU and memory binding in the Legate launcher. Not available on conda; typically available through the system-level package manager. -### MPI (`--openmpi` option; optional) +### MPI (`--openmpi` flag; optional) Only necessary if you wish to run on multiple nodes. @@ -139,8 +234,9 @@ If using UCX, a build of UCX configured with `--enable-mt` is required. If you do not wish to use conda for some (or all) of the dependencies, you can remove the corresponding entries from the environment file before passing it to -conda. See [the `install.py` section](#using-installpy) for instructions on how -to provide alternative locations for these dependencies to the build process. +conda. See [the `install.py` section](#building-through-installpy) for +instructions on how to provide alternative locations for these dependencies to +the build process. Note that this is likely to result in conflicts between conda-provided and system-provided libraries. @@ -176,64 +272,7 @@ This way you can make sure that the (typically more recent) conda version of any common library will be preferred over the system-wide one, no matter which component requests it first. -# Building for Users - -## Using install.py - -The Legate Core repository comes with a helper `install.py` script in the -top-level directory, that will build the C++ parts of the library and install -the C++ and Python components under the currently active Python environment. 
- -To add GPU support, use the `--cuda` flag: - -```shell -./install.py --cuda -``` - -You can specify the CUDA toolkit directory and the CUDA architecture you want to -target using the `--with-cuda` and `--arch` flags, e.g.: - -```shell -./install.py --cuda --with-cuda /usr/local/cuda/ --arch ampere -``` - -By default the script relies on CMake's auto-detection for these settings. -CMake will first search the currently active Python/conda environment -for dependencies, then any common system-wide installation directories (e.g. -`/usr/lib`). If a dependency cannot be found but is publicly available in source -form (e.g. OpenBLAS), cmake will fetch and build it automatically. You can -override this search by providing an install location for any dependency -explicitly, using a `--with-dep` flag, e.g. `--with-nccl` and -`--with-openblas`. - -For multi-node execution Legate uses [GASNet](https://gasnet.lbl.gov/) which can be -requested using the `--network gasnet1` or `--network gasnetex` flag. By default -GASNet will be automatically downloaded and built, but if you have an existing -installation then you can inform the install script using the `--with-gasnet` flag. -You also need to specify the interconnect network of the target machine using the -`--conduit` flag. - -For example this would be an installation for a -[DGX SuperPOD](https://www.nvidia.com/en-us/data-center/dgx-superpod/): - -```shell -./install.py --network gasnet1 --conduit ibv --cuda --arch ampere -``` - -Alternatively, here is an install line for the -[Piz-Daint](https://www.cscs.ch/computers/dismissed/piz-daint-piz-dora/) supercomputer: - -```shell -./install.py --network gasnet1 --conduit aries --cuda --arch pascal -``` - -To see all available configuration options, run with the `--help` flag: - -```shell -./install.py --help -``` - -## Using pip +## Building through pip Legate Core is not yet registered in a standard pip repository. However, users can still use the pip installer to build and install Legate Core. The following @@ -250,8 +289,6 @@ or $ python3 -m pip install . ``` -## Advanced Customization - Legate relies on CMake to select its toolchain and build flags. Users can set the environment variables `CXX` or `CXXFLAGS` prior to building to override the CMake defaults. @@ -271,9 +308,7 @@ An alternative syntax using `setup.py` with `scikit-build` is $ python setup.py install -- -DLegion_USE_CUDA:BOOL=ON ``` -# Building for Developers - -## Overview +## Building through pip & cmake pip uses [scikit-build](https://scikit-build.readthedocs.io/en/latest/) in `setup.py` to drive the build and installation. A `pip install` will trigger three general actions: @@ -288,8 +323,6 @@ After building the C++ libraries, the `pip install` can be done in "editable" mo This configures the Python site packages to import the Python source tree directly. The Python source can then be edited and used directly for testing without requiring another `pip install`. -## Example - There are several examples in the `scripts` folder. We walk through the steps in `build-separately-no-install.sh` here. diff --git a/README.md b/README.md index f713191e9..05aa5b159 100644 --- a/README.md +++ b/README.md @@ -226,10 +226,6 @@ conda install -c nvidia -c conda-forge -c legate legate-core The conda package is compatible with CUDA >= 11.4 (CUDA driver version >= r470), and Volta or later GPU architectures. 
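If you prefer to keep Legate out of your base environment, the same channels can be used to create a dedicated environment instead (a sketch; the environment name here is only an example):

```shell
conda create -n legate -c nvidia -c conda-forge -c legate legate-core
conda activate legate
```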
-Docker image build scripts, as well as specialized -install scripts for supported clusters are available on the -[quickstart](https://github.com/nv-legate/quickstart) repo. - See [BUILD.md](BUILD.md) for instructions on building Legate Core from source. ## How Do I Use Legate? diff --git a/scripts/generate-conda-envs.py b/scripts/generate-conda-envs.py index dc92d373d..9798d7680 100755 --- a/scripts/generate-conda-envs.py +++ b/scripts/generate-conda-envs.py @@ -85,6 +85,7 @@ def conda(self) -> Reqs: "cmake>=3.24,!=3.25.0", "git", "make", + "ninja", "scikit-build>=0.13.1", "setuptools>=60", "zlib", From b93b57a115286b5be96da33f85e4002aadcea811 Mon Sep 17 00:00:00 2001 From: Wonchan Lee Date: Mon, 19 Dec 2022 15:13:11 -0800 Subject: [PATCH 094/121] Map futures to zero-copy memory on GPUs (#518) --- src/core/mapping/base_mapper.cc | 6 ++++-- src/core/mapping/core_mapper.cc | 11 +---------- 2 files changed, 5 insertions(+), 12 deletions(-) diff --git a/src/core/mapping/base_mapper.cc b/src/core/mapping/base_mapper.cc index c48428de4..bc80521bf 100644 --- a/src/core/mapping/base_mapper.cc +++ b/src/core/mapping/base_mapper.cc @@ -476,14 +476,16 @@ void BaseMapper::map_task(const MapperContext ctx, auto default_option = options.front(); auto generate_default_mappings = [&](auto& stores, bool exact) { for (auto& store : stores) { - auto mapping = StoreMapping::default_mapping(store, default_option, exact); if (store.is_future()) { + auto option = default_option == StoreTarget::FBMEM ? StoreTarget::ZCMEM : default_option; + auto mapping = StoreMapping::default_mapping(store, option, exact); auto fut_idx = store.future_index(); if (mapped_futures.find(fut_idx) != mapped_futures.end()) continue; mapped_futures.insert(fut_idx); for_futures.push_back(std::move(mapping)); } else { - auto key = store.unique_region_field_id(); + auto mapping = StoreMapping::default_mapping(store, default_option, exact); + auto key = store.unique_region_field_id(); if (mapped_regions.find(key) != mapped_regions.end()) continue; mapped_regions.insert(key); if (store.unbound()) diff --git a/src/core/mapping/core_mapper.cc b/src/core/mapping/core_mapper.cc index 7f7fbbaf3..89a5b4147 100644 --- a/src/core/mapping/core_mapper.cc +++ b/src/core/mapping/core_mapper.cc @@ -350,19 +350,10 @@ void CoreMapper::map_future_map_reduction(const MapperContext ctx, { output.serdez_upper_bound = LEGATE_MAX_SIZE_SCALAR_RETURN; -#ifdef LEGATE_MAP_FUTURE_MAP_REDUCTIONS_TO_GPU - // TODO: It's been reported that blindly mapping target instances of future map reductions - // to framebuffers hurts performance. Until we find a better mapping policy, we guard - // the current policy with a macro. 
- - // If this was joining exceptions, we don't want to put instances anywhere - // other than the system memory because they need serdez - if (input.tag == LEGATE_CORE_JOIN_EXCEPTION_TAG) return; if (!local_gpus.empty()) - for (auto& pair : local_frame_buffers) output.destination_memories.push_back(pair.second); + output.destination_memories.push_back(local_zerocopy_memory); else if (has_socket_mem) for (auto& pair : local_numa_domains) output.destination_memories.push_back(pair.second); -#endif } void CoreMapper::select_tunable_value(const MapperContext ctx, From 8381c518b30aec90af9b18564e535851c8f9370a Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Tue, 20 Dec 2022 12:07:30 -0800 Subject: [PATCH 095/121] Implementin logic for reuse of reduction instances (#511) Implementing logic for reuse of reduction instances --- src/core/mapping/base_mapper.cc | 45 +++++++++++++--- src/core/mapping/base_mapper.h | 2 + src/core/mapping/instance_manager.cc | 71 ++++++++++++++++++++++++ src/core/mapping/instance_manager.h | 81 +++++++++++++++++++++++++--- 4 files changed, 185 insertions(+), 14 deletions(-) diff --git a/src/core/mapping/base_mapper.cc b/src/core/mapping/base_mapper.cc index bc80521bf..aedaeceb3 100644 --- a/src/core/mapping/base_mapper.cc +++ b/src/core/mapping/base_mapper.cc @@ -84,7 +84,8 @@ BaseMapper::BaseMapper(Runtime* rt, Machine m, const LibraryContext& ctx) total_nodes(get_total_nodes(m)), mapper_name(std::move(create_name(local_node))), logger(create_logger_name().c_str()), - local_instances(InstanceManager::get_instance_manager()) + local_instances(InstanceManager::get_instance_manager()), + reduction_instances(ReductionInstanceManager::get_instance_manager()) { // Query to find all our local processors Machine::ProcessorQuery local_procs(machine); @@ -684,9 +685,35 @@ bool BaseMapper::map_legate_store(const MapperContext ctx, // Generate layout constraints from the store mapping LayoutConstraintSet layout_constraints; mapping.populate_layout_constraints(layout_constraints); + auto& fields = layout_constraints.field_constraint.field_set; - // If we're making a reduction instance, we should just make it now + // If we're making a reduction instance: if (redop != 0) { + // We need to hold the instance manager lock as we're about to try + // to find an instance + AutoLock reduction_lock(ctx, reduction_instances->manager_lock()); + + // This whole process has to appear atomic + runtime->disable_reentrant(ctx); + + // reuse reductions only for GPU tasks: + if (target_proc.kind() == Processor::TOC_PROC) { + // See if we already have it in our local instances + if (fields.size() == 1 && regions.size() == 1 && + reduction_instances->find_instance( + redop, regions.front(), fields.front(), target_memory, result, policy)) { +#ifdef DEBUG_LEGATE + logger.debug() << "Operation " << mappable.get_unique_id() + << ": reused cached reduction instance " << result << " for " + << regions.front(); +#endif + runtime->enable_reentrant(ctx); + // Needs acquire to keep the runtime happy + return true; + } + } + + // if we didn't find it, create one layout_constraints.add_constraint(SpecializedConstraint(REDUCTION_FOLD_SPECIALIZE, redop)); size_t footprint = 0; if (runtime->create_physical_instance(ctx, @@ -705,6 +732,14 @@ bool BaseMapper::map_legate_store(const MapperContext ctx, for (LogicalRegion r : regions) msg << " " << r; msg << " (size: " << footprint << " bytes, memory: " << target_memory << ")"; #endif + if (target_proc.kind() == Processor::TOC_PROC) { + // store reduction instance + 
if (fields.size() == 1 && regions.size() == 1) { + auto fid = fields.front(); + reduction_instances->record_instance(redop, regions.front(), fid, result, policy); + } + } + runtime->enable_reentrant(ctx); // We already did the acquire return false; } @@ -713,14 +748,8 @@ bool BaseMapper::map_legate_store(const MapperContext ctx, return true; } - auto& fields = layout_constraints.field_constraint.field_set; - - // We need to hold the instance manager lock as we're about to try to find an instance AutoLock lock(ctx, local_instances->manager_lock()); - - // This whole process has to appear atomic runtime->disable_reentrant(ctx); - // See if we already have it in our local instances if (fields.size() == 1 && regions.size() == 1 && local_instances->find_instance( diff --git a/src/core/mapping/base_mapper.h b/src/core/mapping/base_mapper.h index 17fdb2045..5e5bf1f49 100644 --- a/src/core/mapping/base_mapper.h +++ b/src/core/mapping/base_mapper.h @@ -31,6 +31,7 @@ namespace legate { namespace mapping { class InstanceManager; +class ReductionInstanceManager; enum class Strictness : bool { strict = true, @@ -368,6 +369,7 @@ class BaseMapper : public Legion::Mapping::Mapper, public LegateMapper { protected: InstanceManager* local_instances; + ReductionInstanceManager* reduction_instances; protected: // Used for n-D cyclic distribution diff --git a/src/core/mapping/instance_manager.cc b/src/core/mapping/instance_manager.cc index bc0adf0f6..511487892 100644 --- a/src/core/mapping/instance_manager.cc +++ b/src/core/mapping/instance_manager.cc @@ -302,6 +302,37 @@ void InstanceSet::dump_and_sanity_check() const for (auto& entry : instances_) assert(found_groups.count(entry.first) > 0); } +bool ReductionInstanceSet::find_instance(ReductionOpID& redop, + Region& region, + Instance& result, + const InstanceMappingPolicy& policy) const +{ + auto finder = instances_.find(region); + if (finder == instances_.end()) return false; + auto& spec = finder->second; + if (spec.policy == policy && spec.redop == redop) { + result = spec.instance; + return true; + } else + return false; +} + +void ReductionInstanceSet::record_instance(ReductionOpID& redop, + Region& region, + Instance& instance, + const InstanceMappingPolicy& policy) +{ + auto finder = instances_.find(region); + if (finder != instances_.end()) { + auto& spec = finder->second; + if (spec.policy != policy || spec.redop != redop) { + instances_.insert_or_assign(region, ReductionInstanceSpec(redop, instance, policy)); + } + } else { + instances_[region] = ReductionInstanceSpec(redop, instance, policy); + } +} + bool InstanceManager::find_instance(Region region, FieldID field_id, Memory memory, @@ -384,5 +415,45 @@ std::map InstanceManager::aggregate_instance_sizes() con return manager; } +bool ReductionInstanceManager::find_instance(ReductionOpID& redop, + Region region, + FieldID field_id, + Memory memory, + Instance& result, + const InstanceMappingPolicy& policy) +{ + auto finder = instance_sets_.find(FieldMemInfo(region.get_tree_id(), field_id, memory)); + return policy.allocation != AllocPolicy::MUST_ALLOC && finder != instance_sets_.end() && + finder->second.find_instance(redop, region, result, policy); +} + +void ReductionInstanceManager::record_instance(ReductionOpID& redop, + Region region, + FieldID fid, + Instance instance, + const InstanceMappingPolicy& policy) +{ + const auto mem = instance.get_location(); + const auto tid = instance.get_tree_id(); + + FieldMemInfo key(tid, fid, mem); + auto finder = instance_sets_.find(key); + if (finder != 
instance_sets_.end()) + instance_sets_[key].record_instance(redop, region, instance, policy); + else { + ReductionInstanceSet set; + set.record_instance(redop, region, instance, policy); + instance_sets_[key] = set; + } +} + +/*static*/ ReductionInstanceManager* ReductionInstanceManager::get_instance_manager() +{ + static ReductionInstanceManager* manager{nullptr}; + + if (nullptr == manager) manager = new ReductionInstanceManager(); + return manager; +} + } // namespace mapping } // namespace legate diff --git a/src/core/mapping/instance_manager.h b/src/core/mapping/instance_manager.h index 2861532cb..0b6b2918a 100644 --- a/src/core/mapping/instance_manager.h +++ b/src/core/mapping/instance_manager.h @@ -95,7 +95,45 @@ struct InstanceSet { std::map groups_; }; -class InstanceManager { +class ReductionInstanceSet { + public: + using Region = Legion::LogicalRegion; + using Instance = Legion::Mapping::PhysicalInstance; + using Domain = Legion::Domain; + using ReductionOpID = Legion::ReductionOpID; + + public: + struct ReductionInstanceSpec { + ReductionInstanceSpec() {} + ReductionInstanceSpec(const ReductionOpID& op, + const Instance& inst, + const InstanceMappingPolicy& po) + : redop(op), instance(inst), policy(po) + { + } + + ReductionOpID redop{0}; + Instance instance{}; + InstanceMappingPolicy policy{}; + }; + + public: + bool find_instance(ReductionOpID& redop, + Region& region, + Instance& result, + const InstanceMappingPolicy& policy) const; + + public: + void record_instance(ReductionOpID& redop, + Region& region, + Instance& instance, + const InstanceMappingPolicy& policy); + + private: + std::map instances_; +}; + +class BaseInstanceManager { public: using Region = Legion::LogicalRegion; using RegionTreeID = Legion::RegionTreeID; @@ -103,7 +141,6 @@ class InstanceManager { using Domain = Legion::Domain; using FieldID = Legion::FieldID; using Memory = Legion::Memory; - using RegionGroupP = std::shared_ptr; public: struct FieldMemInfo { @@ -132,6 +169,17 @@ class InstanceManager { Memory memory; }; + public: + Legion::Mapping::LocalLock& manager_lock() { return manager_lock_; } + + private: + Legion::Mapping::LocalLock manager_lock_{}; +}; + +class InstanceManager : public BaseInstanceManager { + public: + using RegionGroupP = std::shared_ptr; + public: bool find_instance(Region region, FieldID field_id, @@ -151,9 +199,6 @@ class InstanceManager { public: void erase(Instance inst); - public: - Legion::Mapping::LocalLock& manager_lock() { return manager_lock_; } - public: static InstanceManager* get_instance_manager(); @@ -162,7 +207,31 @@ class InstanceManager { private: std::map instance_sets_{}; - Legion::Mapping::LocalLock manager_lock_{}; +}; + +class ReductionInstanceManager : public BaseInstanceManager { + public: + using ReductionOpID = Legion::ReductionOpID; + + public: + bool find_instance(ReductionOpID& redop, + Region region, + FieldID field_id, + Memory memory, + Instance& result, + const InstanceMappingPolicy& policy = {}); + + void record_instance(ReductionOpID& redop, + Region region, + FieldID field_id, + Instance instance, + const InstanceMappingPolicy& policy = {}); + + public: + static ReductionInstanceManager* get_instance_manager(); + + private: + std::map instance_sets_{}; }; } // namespace mapping From 81e85341ed9fd4dfc3f29a76a35dda8f0bba5234 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 21 Dec 2022 14:58:24 -0800 Subject: [PATCH 096/121] [pre-commit.ci] pre-commit autoupdate (#519) 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/PyCQA/isort: 5.11.1 → v5.11.3](https://github.com/PyCQA/isort/compare/5.11.1...v5.11.3) - [github.com/pre-commit/mirrors-clang-format: v15.0.4 → v15.0.6](https://github.com/pre-commit/mirrors-clang-format/compare/v15.0.4...v15.0.6) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .pre-commit-config.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index bb12ddb99..48d53dba9 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,6 +1,6 @@ repos: - repo: https://github.com/PyCQA/isort - rev: 5.11.1 + rev: v5.11.3 hooks: - id: isort - repo: https://github.com/psf/black @@ -12,7 +12,7 @@ repos: hooks: - id: flake8 - repo: https://github.com/pre-commit/mirrors-clang-format - rev: 'v15.0.4' + rev: 'v15.0.6' hooks: - id: clang-format files: \.(cu|cuh|h|cc|inl)$ From 6d400aed6b92e1ec511fe6028391bb7309b42a1b Mon Sep 17 00:00:00 2001 From: Wonchan Lee Date: Wed, 21 Dec 2022 22:28:39 -0800 Subject: [PATCH 097/121] Sharding functor for delinearizing functor (#520) * Two changes to the projection functor code: * Delinearizing functor is now an instance of LegateProjectionFunctor so we can derive a sharding functor from it * Projection functor creations and queries are now logged. Debugging code that renders projection functors is also improved. * Remove dead code * Use the stream API to print the log message --- legate/core/projection.py | 3 + legate/core/transform.py | 6 +- src/core/runtime/projection.cc | 166 +++++++++++++++++++++------------ 3 files changed, 115 insertions(+), 60 deletions(-) diff --git a/legate/core/projection.py b/legate/core/projection.py index 68989f13f..18d21a8cf 100644 --- a/legate/core/projection.py +++ b/legate/core/projection.py @@ -47,6 +47,7 @@ def __repr__(self) -> str: if self._repr is None: s = "" if self._weight != 0: + assert self._dim != -1 if self._weight != 1: s += f"{self._weight} * " s += f"COORD{self._dim}" @@ -55,6 +56,8 @@ def __repr__(self) -> str: s += f" + {self._offset}" else: s += f" - {abs(self._offset)}" + elif self._weight == 0: + s += "0" self._repr = s return self._repr diff --git a/legate/core/transform.py b/legate/core/transform.py index 817f08e25..9a7d5baa0 100644 --- a/legate/core/transform.py +++ b/legate/core/transform.py @@ -313,7 +313,11 @@ def invert_point(self, point: Shape) -> Shape: return point.insert(self._dim, self._index) def invert_symbolic_point(self, dims: SymbolicPoint) -> SymbolicPoint: - return dims[: self._dim] + (ProjExpr(-1),) + dims[self._dim :] + return ( + dims[: self._dim] + + (ProjExpr(dim=-1, weight=0),) + + dims[self._dim :] + ) def invert_restrictions(self, restrictions: Restrictions) -> Restrictions: left = restrictions[: self._dim] diff --git a/src/core/runtime/projection.cc b/src/core/runtime/projection.cc index 5b5809866..f13ff8021 100644 --- a/src/core/runtime/projection.cc +++ b/src/core/runtime/projection.cc @@ -22,27 +22,64 @@ #include "core/runtime/projection.h" #include "core/utilities/dispatch.h" +#include "legate_defines.h" using namespace Legion; namespace legate { -class DelinearizationFunctor : public ProjectionFunctor { +extern Logger log_legate; + +// This special functor overrides the default projection implementation because it needs +// to know the the target color space for delinearization. 
Also note that this functor's +// project_point passes through input points, as we already know they are always 1D points +// and the output will be linearized back to integers. +class DelinearizationFunctor : public LegateProjectionFunctor { public: DelinearizationFunctor(Runtime* runtime); public: virtual Legion::LogicalRegion project(Legion::LogicalPartition upper_bound, const Legion::DomainPoint& point, - const Legion::Domain& launch_domain); + const Legion::Domain& launch_domain) override; public: - virtual bool is_functional(void) const { return true; } - virtual bool is_exclusive(void) const { return true; } - virtual unsigned get_depth(void) const { return 0; } + virtual Legion::DomainPoint project_point(const Legion::DomainPoint& point, + const Legion::Domain& launch_domain) const override; }; -DelinearizationFunctor::DelinearizationFunctor(Runtime* runtime) : ProjectionFunctor(runtime) {} +template +class AffineFunctor : public LegateProjectionFunctor { + public: + AffineFunctor(Runtime* runtime, int32_t* dims, int32_t* weights, int32_t* offsets); + + public: + DomainPoint project_point(const DomainPoint& point, const Domain& launch_domain) const override; + + public: + static Legion::Transform create_transform(int32_t* dims, int32_t* weights); + + private: + const Legion::Transform transform_; + Point offsets_; +}; + +LegateProjectionFunctor::LegateProjectionFunctor(Runtime* rt) : ProjectionFunctor(rt) {} + +LogicalRegion LegateProjectionFunctor::project(LogicalPartition upper_bound, + const DomainPoint& point, + const Domain& launch_domain) +{ + const DomainPoint dp = project_point(point, launch_domain); + if (runtime->has_logical_subregion_by_color(upper_bound, dp)) + return runtime->get_logical_subregion_by_color(upper_bound, dp); + else + return LogicalRegion::NO_REGION; +} + +DelinearizationFunctor::DelinearizationFunctor(Runtime* runtime) : LegateProjectionFunctor(runtime) +{ +} LogicalRegion DelinearizationFunctor::project(LogicalPartition upper_bound, const DomainPoint& point, @@ -74,38 +111,12 @@ LogicalRegion DelinearizationFunctor::project(LogicalPartition upper_bound, return LogicalRegion::NO_REGION; } -LegateProjectionFunctor::LegateProjectionFunctor(Runtime* rt) : ProjectionFunctor(rt) {} - -LogicalRegion LegateProjectionFunctor::project(LogicalPartition upper_bound, - const DomainPoint& point, - const Domain& launch_domain) +Legion::DomainPoint DelinearizationFunctor::project_point(const Legion::DomainPoint& point, + const Legion::Domain& launch_domain) const { - const DomainPoint dp = project_point(point, launch_domain); - if (runtime->has_logical_subregion_by_color(upper_bound, dp)) - return runtime->get_logical_subregion_by_color(upper_bound, dp); - else - return LogicalRegion::NO_REGION; + return point; } -template -class AffineFunctor : public LegateProjectionFunctor { - public: - AffineFunctor(Runtime* runtime, int32_t* dims, int32_t* weights, int32_t* offsets); - - public: - DomainPoint project_point(const DomainPoint& point, const Domain& launch_domain) const override - { - return DomainPoint(transform_ * Point(point) + offsets_); - } - - public: - static Transform create_transform(int32_t* dims, int32_t* weights); - - private: - const Transform transform_; - Point offsets_; -}; - template AffineFunctor::AffineFunctor(Runtime* runtime, int32_t* dims, @@ -117,10 +128,17 @@ AffineFunctor::AffineFunctor(Runtime* runtime, } template -/*static*/ Transform AffineFunctor::create_transform( +DomainPoint AffineFunctor::project_point(const DomainPoint& point, + 
const Domain& launch_domain) const +{ + return DomainPoint(transform_ * Point(point) + offsets_); +} + +template +/*static*/ Legion::Transform AffineFunctor::create_transform( int32_t* dims, int32_t* weights) { - Transform transform; + Legion::Transform transform; for (int32_t tgt_dim = 0; tgt_dim < TGT_DIM; ++tgt_dim) for (int32_t src_dim = 0; src_dim < SRC_DIM; ++src_dim) transform[tgt_dim][src_dim] = 0; @@ -146,11 +164,54 @@ static std::unordered_map functor_table{ static std::mutex functor_table_lock{}; struct create_affine_functor_fn { + static void spec_to_string(std::stringstream& ss, + int32_t src_ndim, + int32_t tgt_ndim, + int32_t* dims, + int32_t* weights, + int32_t* offsets) + { + ss << "\\("; + for (int32_t idx = 0; idx < src_ndim; ++idx) { + if (idx != 0) ss << ","; + ss << "x" << idx; + } + ss << ")->("; + for (int32_t idx = 0; idx < tgt_ndim; ++idx) { + if (idx != 0) ss << ","; + auto dim = dims[idx]; + auto weight = weights[idx]; + auto offset = offsets[idx]; + if (dim != -1) + if (weight != 0) { + assert(dim != -1); + if (weight != 1) ss << weight << "*"; + ss << "x" << dim; + } + if (offset != 0) { + if (offset > 0) + ss << "+" << offset; + else + ss << "-" << -offset; + } else if (weight == 0) + ss << "0"; + } + ss << ")"; + } + template void operator()( Runtime* runtime, int32_t* dims, int32_t* weights, int32_t* offsets, ProjectionID proj_id) { auto functor = new AffineFunctor(runtime, dims, weights, offsets); +#ifdef DEBUG_LEGATE + std::stringstream ss; + ss << "Register projection functor: functor: " << functor << ", id: " << proj_id << ", "; + spec_to_string(ss, SRC_DIM, TGT_DIM, dims, weights, offsets); + log_legate.debug() << ss.str(); +#else + log_legate.debug("Register projection functor: functor: %p, id: %d", functor, proj_id); +#endif runtime->register_projection_functor(proj_id, functor, true /*silence warnings*/); const std::lock_guard lock(functor_table_lock); @@ -163,8 +224,12 @@ void register_legate_core_projection_functors(Legion::Runtime* runtime, { auto proj_id = context.get_projection_id(LEGATE_CORE_DELINEARIZE_PROJ_ID); auto functor = new DelinearizationFunctor(runtime); + log_legate.debug("Register delinearizing functor: functor: %p, id: %d", functor, proj_id); runtime->register_projection_functor(proj_id, functor, true /*silence warnings*/); - + { + const std::lock_guard lock(functor_table_lock); + functor_table[proj_id] = functor; + } identity_functor = new IdentityFunctor(runtime); } @@ -172,28 +237,11 @@ LegateProjectionFunctor* find_legate_projection_functor(ProjectionID proj_id) { if (0 == proj_id) return identity_functor; const std::lock_guard lock(functor_table_lock); - return functor_table[proj_id]; -} - -DomainPoint delinearize_future_map_domain(const DomainPoint& point, - const Domain& domain, - const Domain& range) -{ - int32_t ndim = range.dim; - - DomainPoint result; - result.dim = ndim; - - auto lo = range.lo(); - auto hi = range.hi(); - - int64_t idx = point[0]; - for (int32_t dim = ndim - 1; dim >= 0; --dim) { - int64_t extent = hi[dim] - lo[dim] + 1; - result[dim] = idx % extent; - idx = idx / extent; + auto result = functor_table[proj_id]; + if (nullptr == result) { + log_legate.debug("Failed to find projection functor of id %d", proj_id); + LEGATE_ABORT; } - return result; } From dc882cbab1b97bb0e2ec658e97357c2a70e464c1 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 27 Dec 2022 14:25:44 +0000 Subject: [PATCH 098/121] [pre-commit.ci] pre-commit autoupdate 
(#522) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/PyCQA/isort: v5.11.3 → 5.11.4](https://github.com/PyCQA/isort/compare/v5.11.3...5.11.4) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 48d53dba9..6ebe994d9 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,6 +1,6 @@ repos: - repo: https://github.com/PyCQA/isort - rev: v5.11.3 + rev: 5.11.4 hooks: - id: isort - repo: https://github.com/psf/black From 69e208912bbc030fd736570667ac87be43634fc8 Mon Sep 17 00:00:00 2001 From: Manolis Papadakis Date: Wed, 28 Dec 2022 15:07:01 -0800 Subject: [PATCH 099/121] Fix a link in the README --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 05aa5b159..7f13cc33e 100644 --- a/README.md +++ b/README.md @@ -310,7 +310,8 @@ line options, and their default values are as follows. ### Distributed Launch -If Legate is compiled with networking support ([see the installation section](#Installation)), +If Legate is compiled with networking support (see the +[installation section](#how-do-i-install-legate)), it can be run in parallel by using the `--nodes` option followed by the number of nodes to be used. Whenever the `--nodes` option is used, Legate will be launched using `mpirun`, even with `--nodes 1`. Without the `--nodes` option, no launcher will From 81d0e309338180f0dfb1ccd060b96e3a62f07f11 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Thu, 29 Dec 2022 14:42:00 -0800 Subject: [PATCH 100/121] New collective (#466) * adding logic for collective instances * making collective logic be optional * making use of collective instances be ON by default * making collective branch be default for Legion --- cmake/Modules/legate_core_options.cmake | 2 ++ cmake/thirdparty/get_legion.cmake | 2 +- legate_core_cpp.cmake | 4 ++++ src/core/mapping/base_mapper.cc | 10 ++++++++++ src/core/runtime/projection.cc | 18 ++++++++++++++++++ src/core/runtime/projection.h | 5 +++++ 6 files changed, 40 insertions(+), 1 deletion(-) diff --git a/cmake/Modules/legate_core_options.cmake b/cmake/Modules/legate_core_options.cmake index 0bf870fe6..158eae120 100644 --- a/cmake/Modules/legate_core_options.cmake +++ b/cmake/Modules/legate_core_options.cmake @@ -78,6 +78,8 @@ endif() option(legate_core_STATIC_CUDA_RUNTIME "Statically link the cuda runtime library" OFF) option(legate_core_EXCLUDE_LEGION_FROM_ALL "Exclude Legion targets from legate.core's 'all' target" OFF) +option(legate_core_COLLECTIVE "Use of collective instances" ON) + set_or_default(NCCL_DIR NCCL_PATH) set_or_default(Thrust_DIR THRUST_PATH) diff --git a/cmake/thirdparty/get_legion.cmake b/cmake/thirdparty/get_legion.cmake index 5faf54023..92ec30247 100644 --- a/cmake/thirdparty/get_legion.cmake +++ b/cmake/thirdparty/get_legion.cmake @@ -175,7 +175,7 @@ function(find_or_configure_legion) endfunction() if(NOT DEFINED legate_core_LEGION_BRANCH) - set(legate_core_LEGION_BRANCH control_replication) + set(legate_core_LEGION_BRANCH collective) endif() if(NOT DEFINED legate_core_LEGION_REPOSITORY) diff --git a/legate_core_cpp.cmake b/legate_core_cpp.cmake index 7502e501d..642c9ceb7 100644 --- a/legate_core_cpp.cmake +++ b/legate_core_cpp.cmake @@ -142,6 +142,10 @@ set(legate_core_CUDA_OPTIONS "") include(cmake/Modules/set_cpu_arch_flags.cmake) 
set_cpu_arch_flags(legate_core_CXX_OPTIONS) +if (legate_core_COLLECTIVE) + list(APPEND legate_core_CXX_DEFS LEGATE_USE_COLLECTIVE) +endif() + if(NOT CMAKE_BUILD_TYPE STREQUAL "Release") list(APPEND legate_core_CXX_DEFS DEBUG_LEGATE) list(APPEND legate_core_CUDA_DEFS DEBUG_LEGATE) diff --git a/src/core/mapping/base_mapper.cc b/src/core/mapping/base_mapper.cc index aedaeceb3..670db0329 100644 --- a/src/core/mapping/base_mapper.cc +++ b/src/core/mapping/base_mapper.cc @@ -208,6 +208,16 @@ void BaseMapper::select_task_options(const MapperContext ctx, const LegionTask& task, TaskOptions& output) { +#ifdef LEGATE_USE_COLLECTIVE + for (uint32_t idx = 0; idx < task.regions.size(); ++idx) { + auto& req = task.regions[idx]; + if ((req.handle_type == LEGION_SINGULAR_PROJECTION) || + (find_legate_projection_functor(req.projection)->is_collective())) { + output.check_collective_regions.insert(idx); + } + } +#endif + std::vector options; if (!local_gpus.empty() && has_variant(ctx, task, Processor::TOC_PROC)) options.push_back(TaskTarget::GPU); diff --git a/src/core/runtime/projection.cc b/src/core/runtime/projection.cc index f13ff8021..9b6947277 100644 --- a/src/core/runtime/projection.cc +++ b/src/core/runtime/projection.cc @@ -125,6 +125,24 @@ AffineFunctor::AffineFunctor(Runtime* runtime, : LegateProjectionFunctor(runtime), transform_(create_transform(dims, weights)) { for (int32_t dim = 0; dim < TGT_DIM; ++dim) offsets_[dim] = offsets[dim]; + + // mapping to a different dimension + if (SRC_DIM > TGT_DIM) { + set_collective(); + return; + } + + // find if there is `-1` in the dimensions + std::set unique; + for (int32_t dim = 0; dim < SRC_DIM; ++dim) { + if (dims[dim] == -1) { + set_collective(); + return; + } + unique.insert(dims[dim]); + } + // if there are repeated dimensions + if (unique.size() != SRC_DIM) set_collective(); } template diff --git a/src/core/runtime/projection.h b/src/core/runtime/projection.h index 740c4bef5..b363b52c2 100644 --- a/src/core/runtime/projection.h +++ b/src/core/runtime/projection.h @@ -38,10 +38,15 @@ class LegateProjectionFunctor : public Legion::ProjectionFunctor { virtual bool is_functional(void) const { return true; } virtual bool is_exclusive(void) const { return true; } virtual unsigned get_depth(void) const { return 0; } + bool is_collective() const { return is_collective_; } + void set_collective() { is_collective_ = true; } public: virtual Legion::DomainPoint project_point(const Legion::DomainPoint& point, const Legion::Domain& launch_domain) const = 0; + + private: + bool is_collective_ = false; }; void register_legate_core_projection_functors(Legion::Runtime* runtime, From c7039056732a05a91b2197be6e761c5d3b2fb4ac Mon Sep 17 00:00:00 2001 From: Mark Vaz Date: Wed, 4 Jan 2023 06:12:08 +1100 Subject: [PATCH 101/121] Update env gen script so OS type works for mac (#523) * Update environment generation scripts so OS type works for osx --- scripts/generate-conda-envs.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/generate-conda-envs.py b/scripts/generate-conda-envs.py index 9798d7680..c4bc9059c 100755 --- a/scripts/generate-conda-envs.py +++ b/scripts/generate-conda-envs.py @@ -21,7 +21,7 @@ Req = str Reqs = Tuple[Req, ...] 
-OSType = Literal["linux", "darwin"] +OSType = Literal["linux", "osx"] class SectionConfig(Protocol): @@ -258,7 +258,7 @@ def filename(self) -> str: for compilers in (True, False) for openmpi in (True, False) ] + [ - EnvConfig("test", python, "darwin", "none", compilers, openmpi) + EnvConfig("test", python, "osx", "none", compilers, openmpi) for python in PYTHON_VERSIONS for compilers in (True, False) for openmpi in (True, False) From 2a3dc8336b285e450276e7c6376d44e3824a84f8 Mon Sep 17 00:00:00 2001 From: Bryan Van de Ven Date: Wed, 4 Jan 2023 13:19:58 -0800 Subject: [PATCH 102/121] Switch docs from recommonmark to myst-parser (#524) --- BUILD.md | 2 +- docs/legate/core/source/BUILD.md | 1 + docs/legate/core/source/conf.py | 5 +++-- docs/legate/core/source/index.rst | 1 + scripts/generate-conda-envs.py | 3 +-- 5 files changed, 7 insertions(+), 5 deletions(-) create mode 120000 docs/legate/core/source/BUILD.md diff --git a/BUILD.md b/BUILD.md index 2191c2c7b..8f749b0be 100644 --- a/BUILD.md +++ b/BUILD.md @@ -257,7 +257,7 @@ after to trip GLIBC's internal version checks, since the conda library expects to find symbols with more recent version numbers than what is available on the system-wide GLIBC: -```shell +``` /lib/x86_64-linux-gnu/libstdc++.so.6: version `GLIBCXX_3.4.30' not found (required by /opt/conda/envs/legate/lib/libarrow.so) ``` diff --git a/docs/legate/core/source/BUILD.md b/docs/legate/core/source/BUILD.md new file mode 120000 index 000000000..ad5438194 --- /dev/null +++ b/docs/legate/core/source/BUILD.md @@ -0,0 +1 @@ +../../../../BUILD.md \ No newline at end of file diff --git a/docs/legate/core/source/conf.py b/docs/legate/core/source/conf.py index 8cfdf75d8..6ebb74e89 100644 --- a/docs/legate/core/source/conf.py +++ b/docs/legate/core/source/conf.py @@ -38,10 +38,11 @@ "sphinx.ext.mathjax", "sphinx.ext.napoleon", "sphinx_copybutton", - "sphinx_markdown_tables", - "recommonmark", + "myst_parser", ] +suppress_warnings = ["ref.myst"] + # The master toctree document. master_doc = "index" diff --git a/docs/legate/core/source/index.rst b/docs/legate/core/source/index.rst index 4c6f1afd1..5475b6fa6 100644 --- a/docs/legate/core/source/index.rst +++ b/docs/legate/core/source/index.rst @@ -5,6 +5,7 @@ Welcome to Legate Core's documentation! 
:maxdepth: 1 Overview + Build instructions API Reference Contributing Versions diff --git a/scripts/generate-conda-envs.py b/scripts/generate-conda-envs.py index c4bc9059c..677b42f48 100755 --- a/scripts/generate-conda-envs.py +++ b/scripts/generate-conda-envs.py @@ -156,9 +156,8 @@ def pip(self) -> Reqs: "jinja2", "markdown<3.4.0", "pydata-sphinx-theme", - "recommonmark", + "myst-parser", "sphinx-copybutton", - "sphinx-markdown-tables", "sphinx>=4.4.0", ) From 026fb6e700ed4cad9b918bbd582b4316502e0beb Mon Sep 17 00:00:00 2001 From: Bryan Van de Ven Date: Thu, 5 Jan 2023 13:58:20 -0800 Subject: [PATCH 103/121] remove realm pylib check (#525) --- legate/driver/launcher.py | 25 ------------------------- 1 file changed, 25 deletions(-) diff --git a/legate/driver/launcher.py b/legate/driver/launcher.py index 1f67046b5..78f34f307 100644 --- a/legate/driver/launcher.py +++ b/legate/driver/launcher.py @@ -15,12 +15,10 @@ from __future__ import annotations import os -import sys from pathlib import Path from typing import TYPE_CHECKING from ..util.fs import read_c_define -from ..util.ui import warn if TYPE_CHECKING: from ..util.system import System @@ -92,8 +90,6 @@ def __init__(self, config: ConfigProtocol, system: System) -> None: self.detected_rank_id = system.env[var] break - self._check_realm_python() - def __eq__(self, other: object) -> bool: return ( isinstance(other, type(self)) @@ -154,27 +150,6 @@ def is_launcher_var(name: str) -> bool: name.startswith(prefix) for prefix in LAUNCHER_VAR_PREFIXES ) - def _check_realm_python(self) -> None: - - # Make sure the version of Python used by Realm is the same as what the - # user is using currently. - realm_pylib = read_c_define( - self._system.legion_paths.realm_defines_h, "REALM_PYTHON_LIB" - ) - - if realm_pylib is None: - raise RuntimeError("Cannot determine Realm Python Lib") - - realm_home = Path(realm_pylib[1:-1]).parents[1] - if (current_home := Path(sys.executable).parents[1]) != realm_home: - print( - warn( - "Legate was compiled against the Python installation at " - f"{realm_home}, but you are currently using the Python " - f"installation at {current_home}" - ) - ) - def _compute_env(self) -> tuple[EnvDict, set[str]]: config = self._config system = self._system From 12d3693eefd925bb26a4522d34cbcb1e0eb13e7a Mon Sep 17 00:00:00 2001 From: Manolis Papadakis Date: Thu, 5 Jan 2023 15:05:27 -0800 Subject: [PATCH 104/121] Typo --- legate/driver/args.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/legate/driver/args.py b/legate/driver/args.py index 9d6758a07..92bce362b 100644 --- a/legate/driver/args.py +++ b/legate/driver/args.py @@ -262,7 +262,7 @@ default=False, required=False, help="enable GASNet tracing (assumes GASNet was configured with " - "--enable--trace)", + "--enable-trace)", ) debugging.add_argument( From d0b89184288c529d7b3c1d700a0604050ce142f8 Mon Sep 17 00:00:00 2001 From: Wonchan Lee Date: Mon, 9 Jan 2023 14:40:33 +0900 Subject: [PATCH 105/121] More changes to map futures to zero-copy memory (#521) * More sensible placement of futures on zero-copy memory * Remove dead code --- src/core/data/store.cc | 8 ++++---- src/core/mapping/base_mapper.cc | 6 ++---- src/core/mapping/core_mapper.cc | 16 ++++++++++++++-- 3 files changed, 20 insertions(+), 10 deletions(-) diff --git a/src/core/data/store.cc b/src/core/data/store.cc index a4ca73f4b..c185ab602 100644 --- a/src/core/data/store.cc +++ b/src/core/data/store.cc @@ -136,13 +136,13 @@ FutureWrapper::FutureWrapper( #ifdef DEBUG_LEGATE assert(!initialize || 
future_.get_untyped_size() == field_size); #endif - auto proc = Processor::get_executing_processor(); + auto mem_kind = find_memory_kind_for_executing_processor( #ifdef LEGATE_NO_FUTURES_ON_FB - auto mem_kind = find_memory_kind_for_executing_processor(); + true #else - auto mem_kind = proc.kind() == Processor::Kind::TOC_PROC ? Memory::Kind::GPU_FB_MEM - : Memory::Kind::SYSTEM_MEM; + false #endif + ); if (initialize) { auto p_init_value = future_.get_buffer(mem_kind); #ifdef LEGATE_USE_CUDA diff --git a/src/core/mapping/base_mapper.cc b/src/core/mapping/base_mapper.cc index 670db0329..bd0b72b6a 100644 --- a/src/core/mapping/base_mapper.cc +++ b/src/core/mapping/base_mapper.cc @@ -487,16 +487,14 @@ void BaseMapper::map_task(const MapperContext ctx, auto default_option = options.front(); auto generate_default_mappings = [&](auto& stores, bool exact) { for (auto& store : stores) { + auto mapping = StoreMapping::default_mapping(store, default_option, exact); if (store.is_future()) { - auto option = default_option == StoreTarget::FBMEM ? StoreTarget::ZCMEM : default_option; - auto mapping = StoreMapping::default_mapping(store, option, exact); auto fut_idx = store.future_index(); if (mapped_futures.find(fut_idx) != mapped_futures.end()) continue; mapped_futures.insert(fut_idx); for_futures.push_back(std::move(mapping)); } else { - auto mapping = StoreMapping::default_mapping(store, default_option, exact); - auto key = store.unique_region_field_id(); + auto key = store.unique_region_field_id(); if (mapped_regions.find(key) != mapped_regions.end()) continue; mapped_regions.insert(key); if (store.unbound()) diff --git a/src/core/mapping/core_mapper.cc b/src/core/mapping/core_mapper.cc index 89a5b4147..bb9389108 100644 --- a/src/core/mapping/core_mapper.cc +++ b/src/core/mapping/core_mapper.cc @@ -350,9 +350,21 @@ void CoreMapper::map_future_map_reduction(const MapperContext ctx, { output.serdez_upper_bound = LEGATE_MAX_SIZE_SCALAR_RETURN; - if (!local_gpus.empty()) + if (!local_gpus.empty()) { + // TODO: It's been reported that blindly mapping target instances of future map reductions + // to framebuffers hurts performance. Until we find a better mapping policy, we guard + // the current policy with a macro. 
+#ifdef LEGATE_MAP_FUTURE_MAP_REDUCTIONS_TO_GPU + // If this was joining exceptions, we should put instances on a host-visible memory + // because they need serdez + if (input.tag == LEGATE_CORE_JOIN_EXCEPTION_TAG) + output.destination_memories.push_back(local_zerocopy_memory); + else + for (auto& pair : local_frame_buffers) output.destination_memories.push_back(pair.second); +#else output.destination_memories.push_back(local_zerocopy_memory); - else if (has_socket_mem) +#endif + } else if (has_socket_mem) for (auto& pair : local_numa_domains) output.destination_memories.push_back(pair.second); } From 340e15684052d4cb6d37362b0182180a8f893efc Mon Sep 17 00:00:00 2001 From: Manolis Papadakis Date: Tue, 10 Jan 2023 14:30:58 -0800 Subject: [PATCH 106/121] Don't check for collective behavior when we have WRITE reqs (#526) --- src/core/mapping/base_mapper.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/src/core/mapping/base_mapper.cc b/src/core/mapping/base_mapper.cc index bd0b72b6a..1fff1f157 100644 --- a/src/core/mapping/base_mapper.cc +++ b/src/core/mapping/base_mapper.cc @@ -211,6 +211,7 @@ void BaseMapper::select_task_options(const MapperContext ctx, #ifdef LEGATE_USE_COLLECTIVE for (uint32_t idx = 0; idx < task.regions.size(); ++idx) { auto& req = task.regions[idx]; + if (req.privilege & LEGION_WRITE_PRIV) continue; if ((req.handle_type == LEGION_SINGULAR_PROJECTION) || (find_legate_projection_functor(req.projection)->is_collective())) { output.check_collective_regions.insert(idx); From fdaac615f60147b4c3138f16597e8a5bd7244304 Mon Sep 17 00:00:00 2001 From: Manolis Papadakis Date: Thu, 12 Jan 2023 13:27:54 -0800 Subject: [PATCH 107/121] All NCCL ranks on the same node must get the same NCCL_IB_HCA (#528) * Fix info message; --foo=ARG isn't actually accepted * Fix double printing in debug message * All NCCL ranks on the same node must get the same NCCL_IB_HCA --- bind.sh | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/bind.sh b/bind.sh index 8ccf56203..a371be953 100755 --- a/bind.sh +++ b/bind.sh @@ -22,14 +22,14 @@ help() { Usage: bind.sh [OPTIONS]... -- APP... Options: - --launcher={mpirun|srun|jrun|auto|local} + --launcher {mpirun|srun|jrun|auto|local} Launcher type, used to set LEGATE_RANK If 'auto', attempt to find the launcher rank automatically If 'local', rank is set to "0". - --cpus=SPEC CPU binding specification, passed to numactl - --gpus=SPEC GPU binding specification, used to set CUDA_VISIBLE_DEVICES - --mems=SPEC Memory binding specification, passed to numactl - --nics=SPEC Network interface binding specification, used to set + --cpus SPEC CPU binding specification, passed to numactl + --gpus SPEC GPU binding specification, used to set CUDA_VISIBLE_DEVICES + --mems SPEC Memory binding specification, passed to numactl + --nics SPEC Network interface binding specification, used to set all of: UCX_NET_DEVICES, NCCL_IB_HCA, GASNET_NUM_QPS, and GASNET_IBV_PORTS --debug print out the final computed invocation before exectuting @@ -147,9 +147,15 @@ if [ -n "${nics+x}" ]; then nic="${nics[$local_rank]}" nic_array=(${nic//,/ }) export UCX_NET_DEVICES="${nic//,/:1,}":1 - export NCCL_IB_HCA="$nic" export GASNET_NUM_QPS="${#nic_array[@]}" export GASNET_IBV_PORTS="${nic//,/+}" + + # NCCL is supposed to detect the topology and use the right NIC automatically. + # NCCL env vars must be set the same way for all ranks on the same node, so + # the best we can do here is to constrain NCCL to the full set of NICs that + # the user specified. 
+ # Note the added "=", to do exact instead of prefix match. + export NCCL_IB_HCA="=$(IFS=, ; echo "${nics[*]}")" fi # numactl is only needed if cpu or memory pinning was requested @@ -177,7 +183,7 @@ done set -- "${updated[@]}" if [ "$debug" == "1" ]; then - echo -n "bind.sh: $@" + echo -n "bind.sh:" for TOK in "$@"; do printf " %q" "$TOK"; done echo fi From 02315a0c46dee81d15cc584d44deff5099a921b2 Mon Sep 17 00:00:00 2001 From: Rohan Yadav Date: Fri, 13 Jan 2023 09:31:10 -0800 Subject: [PATCH 108/121] legate/core/_legion: add default new argument to dep part functions (#527) Recent Legion changes added an argument to these functions, so we need to pass something through. Signed-off-by: Rohan Yadav Signed-off-by: Rohan Yadav --- legate/core/_legion/partition_functor.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/legate/core/_legion/partition_functor.py b/legate/core/_legion/partition_functor.py index 469a12c2c..47eec302d 100644 --- a/legate/core/_legion/partition_functor.py +++ b/legate/core/_legion/partition_functor.py @@ -140,6 +140,7 @@ def partition( part_id, self.mapper, self.tag, + (ffi.NULL, 0), ) @@ -185,6 +186,7 @@ def partition( part_id, self.mapper, self.tag, + (ffi.NULL, 0), ) @@ -231,6 +233,7 @@ def partition( part_id, self.mapper, self.tag, + (ffi.NULL, 0), ) @@ -277,6 +280,7 @@ def partition( part_id, self.mapper, self.tag, + (ffi.NULL, 0), ) From 367e26f39202b1faef992fe3298d00e19f12d4ee Mon Sep 17 00:00:00 2001 From: Manolis Papadakis Date: Wed, 18 Jan 2023 15:48:55 -0800 Subject: [PATCH 109/121] Don't turn on Legate debug checks on debug-rel builds (#533) --- legate_core_cpp.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/legate_core_cpp.cmake b/legate_core_cpp.cmake index 642c9ceb7..ef714345f 100644 --- a/legate_core_cpp.cmake +++ b/legate_core_cpp.cmake @@ -146,7 +146,7 @@ if (legate_core_COLLECTIVE) list(APPEND legate_core_CXX_DEFS LEGATE_USE_COLLECTIVE) endif() -if(NOT CMAKE_BUILD_TYPE STREQUAL "Release") +if(CMAKE_BUILD_TYPE STREQUAL "Debug") list(APPEND legate_core_CXX_DEFS DEBUG_LEGATE) list(APPEND legate_core_CUDA_DEFS DEBUG_LEGATE) endif() From 0e3d9cdc984557617d9391f1abcc542e6df7a5b3 Mon Sep 17 00:00:00 2001 From: Rohan Yadav Date: Wed, 18 Jan 2023 22:59:27 -0800 Subject: [PATCH 110/121] src/core: guard against missing projection functors in collective check (#534) Signed-off-by: Rohan Yadav Signed-off-by: Rohan Yadav --- src/core/mapping/base_mapper.cc | 7 ++++++- src/core/runtime/projection.cc | 5 +++-- src/core/runtime/projection.h | 3 ++- 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/src/core/mapping/base_mapper.cc b/src/core/mapping/base_mapper.cc index 1fff1f157..eaf37c83e 100644 --- a/src/core/mapping/base_mapper.cc +++ b/src/core/mapping/base_mapper.cc @@ -212,8 +212,13 @@ void BaseMapper::select_task_options(const MapperContext ctx, for (uint32_t idx = 0; idx < task.regions.size(); ++idx) { auto& req = task.regions[idx]; if (req.privilege & LEGION_WRITE_PRIV) continue; + // Look up the projection for the input region. There are cases where + // Legate libraries register their own projection functors that are + // not recorded by Legate Core. So, handle the case when these functors + // are not present and allow for them to be missing.
+ auto projection = find_legate_projection_functor(req.projection, true /* allow_missing */); if ((req.handle_type == LEGION_SINGULAR_PROJECTION) || - (find_legate_projection_functor(req.projection)->is_collective())) { + (projection != nullptr && projection->is_collective())) { output.check_collective_regions.insert(idx); } } diff --git a/src/core/runtime/projection.cc b/src/core/runtime/projection.cc index 9b6947277..9392cd051 100644 --- a/src/core/runtime/projection.cc +++ b/src/core/runtime/projection.cc @@ -251,12 +251,13 @@ void register_legate_core_projection_functors(Legion::Runtime* runtime, identity_functor = new IdentityFunctor(runtime); } -LegateProjectionFunctor* find_legate_projection_functor(ProjectionID proj_id) +LegateProjectionFunctor* find_legate_projection_functor(ProjectionID proj_id, bool allow_missing) { if (0 == proj_id) return identity_functor; const std::lock_guard lock(functor_table_lock); auto result = functor_table[proj_id]; - if (nullptr == result) { + // If we're not OK with a missing projection functor, then throw an error. + if (nullptr == result && !allow_missing) { log_legate.debug("Failed to find projection functor of id %d", proj_id); LEGATE_ABORT; } diff --git a/src/core/runtime/projection.h b/src/core/runtime/projection.h index b363b52c2..cf74d1689 100644 --- a/src/core/runtime/projection.h +++ b/src/core/runtime/projection.h @@ -52,6 +52,7 @@ class LegateProjectionFunctor : public Legion::ProjectionFunctor { void register_legate_core_projection_functors(Legion::Runtime* runtime, const LibraryContext& context); -LegateProjectionFunctor* find_legate_projection_functor(Legion::ProjectionID proj_id); +LegateProjectionFunctor* find_legate_projection_functor(Legion::ProjectionID proj_id, + bool allow_missing = false); } // namespace legate From 03372857480ad8567957d3157e35c007de12e6c3 Mon Sep 17 00:00:00 2001 From: Wonchan Lee Date: Thu, 19 Jan 2023 15:21:39 -0800 Subject: [PATCH 111/121] Erase cached reduction instances that cannot be acquired (#536) --- src/core/mapping/base_mapper.cc | 28 +++++++++++++++-------------- src/core/mapping/instance_manager.cc | 30 ++++++++++++++++++++++++++++++ src/core/mapping/instance_manager.h | 6 ++++++ 3 files changed, 51 insertions(+), 13 deletions(-) diff --git a/src/core/mapping/base_mapper.cc b/src/core/mapping/base_mapper.cc index eaf37c83e..5c6fc5dd1 100644 --- a/src/core/mapping/base_mapper.cc +++ b/src/core/mapping/base_mapper.cc @@ -609,8 +609,13 @@ void BaseMapper::map_legate_stores(const MapperContext ctx, logger.debug() << log_mappable(mappable) << ": failed to acquire instance " << result << " for reqs:" << reqs_ss.str(); #endif - AutoLock lock(ctx, local_instances->manager_lock()); - local_instances->erase(result); + if ((*reqs.begin())->redop != 0) { + AutoLock lock(ctx, reduction_instances->manager_lock()); + reduction_instances->erase(result); + } else { + AutoLock lock(ctx, local_instances->manager_lock()); + local_instances->erase(result); + } result = NO_INST; } instances.push_back(result); @@ -681,20 +686,17 @@ bool BaseMapper::map_legate_store(const MapperContext ctx, for (auto* req : reqs) regions.push_back(req->region); auto target_memory = get_target_memory(target_proc, policy.target); - ReductionOpID redop = 0; - bool first = true; + ReductionOpID redop = (*reqs.begin())->redop; +#ifdef DEBUG_LEGATE for (auto* req : reqs) { - if (first) - redop = req->redop; - else { - if (redop != req->redop) { - logger.error( - "Colocated stores should be either non-reduction arguments " - "or reductions with the
same reduction operator."); - LEGATE_ABORT; - } + if (redop != req->redop) { + logger.error( + "Colocated stores should be either non-reduction arguments " + "or reductions with the same reduction operator."); + LEGATE_ABORT; } } +#endif // Generate layout constraints from the store mapping LayoutConstraintSet layout_constraints; diff --git a/src/core/mapping/instance_manager.cc b/src/core/mapping/instance_manager.cc index 511487892..f732b0b79 100644 --- a/src/core/mapping/instance_manager.cc +++ b/src/core/mapping/instance_manager.cc @@ -333,6 +333,18 @@ void ReductionInstanceSet::record_instance(ReductionOpID& redop, } } +bool ReductionInstanceSet::erase(PhysicalInstance inst) +{ + for (auto it = instances_.begin(); it != instances_.end(); /*nothing*/) { + if (it->second.instance == inst) { + auto to_erase = it++; + instances_.erase(to_erase); + } else + it++; + } + return instances_.empty(); +} + bool InstanceManager::find_instance(Region region, FieldID field_id, Memory memory, @@ -447,6 +459,24 @@ void ReductionInstanceManager::record_instance(ReductionOpID& redop, } } +void ReductionInstanceManager::erase(PhysicalInstance inst) +{ + const auto mem = inst.get_location(); + const auto tid = inst.get_tree_id(); + + for (auto fit = instance_sets_.begin(); fit != instance_sets_.end(); /*nothing*/) { + if ((fit->first.memory != mem) || (fit->first.tid != tid)) { + fit++; + continue; + } + if (fit->second.erase(inst)) { + auto to_erase = fit++; + instance_sets_.erase(to_erase); + } else + fit++; + } +} + /*static*/ ReductionInstanceManager* ReductionInstanceManager::get_instance_manager() { static ReductionInstanceManager* manager{nullptr}; diff --git a/src/core/mapping/instance_manager.h b/src/core/mapping/instance_manager.h index 0b6b2918a..c42df3119 100644 --- a/src/core/mapping/instance_manager.h +++ b/src/core/mapping/instance_manager.h @@ -129,6 +129,9 @@ class ReductionInstanceSet { Instance& instance, const InstanceMappingPolicy& policy); + public: + bool erase(Instance inst); + private: std::map instances_; }; @@ -227,6 +230,9 @@ class ReductionInstanceManager : public BaseInstanceManager { Instance instance, const InstanceMappingPolicy& policy = {}); + public: + void erase(Instance inst); + public: static ReductionInstanceManager* get_instance_manager(); From 025a66ba7a2134c5f1652b99ef28969d6b7107aa Mon Sep 17 00:00:00 2001 From: Paul Taylor Date: Fri, 20 Jan 2023 09:47:58 -0800 Subject: [PATCH 112/121] Pass `CMAKE_GENERATOR` to scikit-build (#529) * pass cmake_generator to skbuild as envvar so it overrides skbuild's generator detection * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * change variable name Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- install.py | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/install.py b/install.py index e3dfdef1c..0e9816916 100755 --- a/install.py +++ b/install.py @@ -408,12 +408,6 @@ def validate_path(path): # Also use preexisting CMAKE_ARGS from conda if set cmake_flags = cmd_env.get("CMAKE_ARGS", "").split(" ") - if cmake_generator: - if " " not in cmake_generator: - cmake_flags += [f"-G{cmake_generator}"] - else: - cmake_flags += [f"-G'{cmake_generator}'"] - if debug or verbose: cmake_flags += ["--log-level=%s" % ("DEBUG" if debug else "VERBOSE")] @@ -464,10 +458,18 @@ def validate_path(path): cmake_flags += ["-Dlegate_core_LEGION_BRANCH=%s" % legion_branch] cmake_flags += extra_flags + build_flags = 
[f"-j{str(thread_count)}"] + if verbose: + if cmake_generator == "Unix Makefiles": + build_flags += ["VERBOSE=1"] + else: + build_flags += ["--verbose"] + cmd_env.update( { - "SKBUILD_BUILD_OPTIONS": f"-j{str(thread_count)}", "CMAKE_ARGS": " ".join(cmake_flags), + "CMAKE_GENERATOR": cmake_generator, + "SKBUILD_BUILD_OPTIONS": " ".join(build_flags), } ) @@ -634,7 +636,10 @@ def driver(): "--cmake-generator", dest="cmake_generator", required=False, - default=(None if shutil.which("ninja") is None else "Ninja"), + default=os.environ.get( + "CMAKE_GENERATOR", + "Unix Makefiles" if shutil.which("ninja") is None else "Ninja", + ), choices=["Ninja", "Unix Makefiles", None], help="The CMake makefiles generator", ) From e537b10ab1f112d1949b457b3967842bbd3049ad Mon Sep 17 00:00:00 2001 From: Marcin Zalewski Date: Mon, 23 Jan 2023 14:11:29 -0800 Subject: [PATCH 113/121] Change the default CPU architecture to haswell. (#538) When on an x86 platform, change the CPU architecture default to haswell --- install.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/install.py b/install.py index 0e9816916..3f599b301 100755 --- a/install.py +++ b/install.py @@ -576,7 +576,7 @@ def driver(): "--march", dest="march", required=False, - default="native", + default=("haswell" if platform.machine() == "x86_64" else "native"), help="Specify the target CPU architecture.", ) parser.add_argument( From 50084ce29ea33ad46e74749e7aa1f6598a34aedf Mon Sep 17 00:00:00 2001 From: Paul Taylor Date: Mon, 23 Jan 2023 14:25:44 -0800 Subject: [PATCH 114/121] Build rust `legion_prof` (#535) * build rust legion prof * add rust to generate-conda-envs.py and meta.yaml --- cmake/thirdparty/get_legion.cmake | 1 + conda/conda-build/meta.yaml | 1 + scripts/generate-conda-envs.py | 1 + 3 files changed, 3 insertions(+) diff --git a/cmake/thirdparty/get_legion.cmake b/cmake/thirdparty/get_legion.cmake index 92ec30247..e158391cc 100644 --- a/cmake/thirdparty/get_legion.cmake +++ b/cmake/thirdparty/get_legion.cmake @@ -159,6 +159,7 @@ function(find_or_configure_legion) "Legion_REDOP_HALF ON" "Legion_REDOP_COMPLEX ON" "Legion_GPU_REDUCTIONS OFF" + "Legion_BUILD_RUST_PROFILER ON" ) endif() diff --git a/conda/conda-build/meta.yaml b/conda/conda-build/meta.yaml index 51f56180d..77722cab1 100644 --- a/conda/conda-build/meta.yaml +++ b/conda/conda-build/meta.yaml @@ -94,6 +94,7 @@ build: requirements: build: - make + - rust - ninja - cmake {{ cmake_version }} - {{ compiler('c') }} =11.2 diff --git a/scripts/generate-conda-envs.py b/scripts/generate-conda-envs.py index 677b42f48..17c5fb2a7 100755 --- a/scripts/generate-conda-envs.py +++ b/scripts/generate-conda-envs.py @@ -85,6 +85,7 @@ def conda(self) -> Reqs: "cmake>=3.24,!=3.25.0", "git", "make", + "rust", "ninja", "scikit-build>=0.13.1", "setuptools>=60", From 3545b3ab38c6b6f1af5e59e53e9484c24f1e81b5 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Tue, 24 Jan 2023 08:37:02 -0800 Subject: [PATCH 115/121] adding logic for collective instances to the legate_select_sources (#532) * adding logic for collective instances to the legate_select_sources * fixing logic for legate_select_sources * removing all_local logic --- src/core/mapping/base_mapper.cc | 91 +++++++++++++++++++-------------- src/core/mapping/base_mapper.h | 1 + 2 files changed, 55 insertions(+), 37 deletions(-) diff --git a/src/core/mapping/base_mapper.cc b/src/core/mapping/base_mapper.cc index 5c6fc5dd1..0fe6075e9 100644 --- a/src/core/mapping/base_mapper.cc +++ b/src/core/mapping/base_mapper.cc @@ -928,54 +928,66 @@ 
void BaseMapper::select_task_sources(const MapperContext ctx, const SelectTaskSrcInput& input, SelectTaskSrcOutput& output) { - legate_select_sources(ctx, input.target, input.source_instances, output.chosen_ranking); + legate_select_sources( + ctx, input.target, input.source_instances, input.collective_views, output.chosen_ranking); +} + +void add_instance_to_band_ranking(const PhysicalInstance& instance, + const Legion::AddressSpace& local_node, + std::map& source_memories, + std::vector>& band_ranking, + const Memory& destination_memory, + const Legion::Machine& machine) +{ + Memory location = instance.get_location(); + auto finder = source_memories.find(location); + if (finder == source_memories.end()) { + std::vector affinity; + machine.get_mem_mem_affinity( + affinity, location, destination_memory, false /*not just local affinities*/); + uint32_t memory_bandwidth = 0; + if (!affinity.empty()) { +#ifdef DEBUG_LEGATE + assert(affinity.size() == 1); +#endif + memory_bandwidth = affinity[0].bandwidth; + } + source_memories[location] = memory_bandwidth; + band_ranking.push_back(std::pair(instance, memory_bandwidth)); + } else + band_ranking.push_back(std::pair(instance, finder->second)); } void BaseMapper::legate_select_sources(const MapperContext ctx, const PhysicalInstance& target, const std::vector& sources, + const std::vector& collective_sources, std::deque& ranking) { std::map source_memories; // For right now we'll rank instances by the bandwidth of the memory - // they are in to the destination, we'll only rank sources from the - // local node if there are any - bool all_local = false; + // they are in to the destination. // TODO: consider layouts when ranking source to help out the DMA system Memory destination_memory = target.get_location(); - std::vector affinity(1); // fill in a vector of the sources with their bandwidths and sort them std::vector> band_ranking; for (uint32_t idx = 0; idx < sources.size(); idx++) { const PhysicalInstance& instance = sources[idx]; - Memory location = instance.get_location(); - if (location.address_space() == local_node) { - if (!all_local) { - source_memories.clear(); - band_ranking.clear(); - all_local = true; - } - } else if (all_local) // Skip any remote instances once we're local - continue; - auto finder = source_memories.find(location); - if (finder == source_memories.end()) { - affinity.clear(); - machine.get_mem_mem_affinity( - affinity, location, destination_memory, false /*not just local affinities*/); - uint32_t memory_bandwidth = 0; - if (!affinity.empty()) { - assert(affinity.size() == 1); - memory_bandwidth = affinity[0].bandwidth; - } - source_memories[location] = memory_bandwidth; - band_ranking.push_back(std::pair(instance, memory_bandwidth)); - } else - band_ranking.push_back(std::pair(instance, finder->second)); + add_instance_to_band_ranking( + instance, local_node, source_memories, band_ranking, destination_memory, machine); } - // If there aren't any sources (for example if there are some collective views - // to choose from, not yet in this branch), just return nothing and let the - // runtime pick something for us. 
- if (band_ranking.empty()) { return; } + + for (uint32_t idx = 0; idx < collective_sources.size(); idx++) { + std::vector col_instances; + collective_sources[idx].find_instances_nearest_memory(destination_memory, col_instances); + // we need only first instance if there are several + const PhysicalInstance& instance = col_instances[0]; + add_instance_to_band_ranking( + instance, local_node, source_memories, band_ranking, destination_memory, machine); + } +#ifdef DEBUG_LEGATE + assert(!band_ranking.empty()); +#endif // Easy case of only one instance if (band_ranking.size() == 1) { ranking.push_back(band_ranking.begin()->first); @@ -1057,7 +1069,8 @@ void BaseMapper::select_inline_sources(const MapperContext ctx, const SelectInlineSrcInput& input, SelectInlineSrcOutput& output) { - legate_select_sources(ctx, input.target, input.source_instances, output.chosen_ranking); + legate_select_sources( + ctx, input.target, input.source_instances, input.collective_views, output.chosen_ranking); } void BaseMapper::report_profiling(const MapperContext ctx, @@ -1153,7 +1166,8 @@ void BaseMapper::select_copy_sources(const MapperContext ctx, const SelectCopySrcInput& input, SelectCopySrcOutput& output) { - legate_select_sources(ctx, input.target, input.source_instances, output.chosen_ranking); + legate_select_sources( + ctx, input.target, input.source_instances, input.collective_views, output.chosen_ranking); } void BaseMapper::speculate(const MapperContext ctx, @@ -1185,7 +1199,8 @@ void BaseMapper::select_close_sources(const MapperContext ctx, const SelectCloseSrcInput& input, SelectCloseSrcOutput& output) { - legate_select_sources(ctx, input.target, input.source_instances, output.chosen_ranking); + legate_select_sources( + ctx, input.target, input.source_instances, input.collective_views, output.chosen_ranking); } void BaseMapper::report_profiling(const MapperContext ctx, @@ -1248,7 +1263,8 @@ void BaseMapper::select_release_sources(const MapperContext ctx, const SelectReleaseSrcInput& input, SelectReleaseSrcOutput& output) { - legate_select_sources(ctx, input.target, input.source_instances, output.chosen_ranking); + legate_select_sources( + ctx, input.target, input.source_instances, input.collective_views, output.chosen_ranking); } void BaseMapper::speculate(const MapperContext ctx, @@ -1318,7 +1334,8 @@ void BaseMapper::select_partition_sources(const MapperContext ctx, const SelectPartitionSrcInput& input, SelectPartitionSrcOutput& output) { - legate_select_sources(ctx, input.target, input.source_instances, output.chosen_ranking); + legate_select_sources( + ctx, input.target, input.source_instances, input.collective_views, output.chosen_ranking); } void BaseMapper::report_profiling(const MapperContext ctx, diff --git a/src/core/mapping/base_mapper.h b/src/core/mapping/base_mapper.h index 5e5bf1f49..86e558e0b 100644 --- a/src/core/mapping/base_mapper.h +++ b/src/core/mapping/base_mapper.h @@ -281,6 +281,7 @@ class BaseMapper : public Legion::Mapping::Mapper, public LegateMapper { void legate_select_sources(const Legion::Mapping::MapperContext ctx, const Legion::Mapping::PhysicalInstance& target, const std::vector& sources, + const std::vector& collective_sources, std::deque& ranking); protected: From 6e04a53e3e7b810371ebb74a7975c2fad9f116d9 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 24 Jan 2023 09:43:56 -0800 Subject: [PATCH 116/121] [pre-commit.ci] pre-commit autoupdate (#542) MIME-Version: 1.0 Content-Type: text/plain; 
charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/pre-commit/mirrors-clang-format: v15.0.6 → v15.0.7](https://github.com/pre-commit/mirrors-clang-format/compare/v15.0.6...v15.0.7) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 6ebe994d9..c6003f5fc 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -12,7 +12,7 @@ repos: hooks: - id: flake8 - repo: https://github.com/pre-commit/mirrors-clang-format - rev: 'v15.0.6' + rev: 'v15.0.7' hooks: - id: clang-format files: \.(cu|cuh|h|cc|inl)$ From f5152c1d4478b6c2dc1e66a3f1ff8a1485a8971c Mon Sep 17 00:00:00 2001 From: Manolis Papadakis Date: Tue, 24 Jan 2023 10:39:35 -0800 Subject: [PATCH 117/121] Temporarily disable collectives, to work around CI failures (#544) Co-authored-by: Manolis Papadakis --- cmake/Modules/legate_core_options.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/Modules/legate_core_options.cmake b/cmake/Modules/legate_core_options.cmake index 158eae120..7e4b80261 100644 --- a/cmake/Modules/legate_core_options.cmake +++ b/cmake/Modules/legate_core_options.cmake @@ -78,7 +78,7 @@ endif() option(legate_core_STATIC_CUDA_RUNTIME "Statically link the cuda runtime library" OFF) option(legate_core_EXCLUDE_LEGION_FROM_ALL "Exclude Legion targets from legate.core's 'all' target" OFF) -option(legate_core_COLLECTIVE "Use of collective instances" ON) +option(legate_core_COLLECTIVE "Use of collective instances" OFF) set_or_default(NCCL_DIR NCCL_PATH) From 1a29d33660c8c0aeb972e7a5cdea3cedb8544bb4 Mon Sep 17 00:00:00 2001 From: Seyed Mirsadeghi Date: Wed, 25 Jan 2023 16:21:16 -0500 Subject: [PATCH 118/121] Add support for building Legion with the UCX backend (#516) * Add support for building Legion with the UCX backend * Add UCX-specific command-line options and env vars * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Set -ucx:tls_host to ^dc,ud The latest UCX now fixes a bug which allows us to list disble dc and ud in the correct way, that is, by ^dc,ud instead of explicily naming all other transports. * Fix typo * Enable passing of UCX_ROOT * Include UCX in our environment generation script * More documentation on UCX and CUDA prereqs * Typo Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Manolis Papadakis Co-authored-by: Manolis Papadakis --- BUILD.md | 57 +++++++++++++++++------- CMakeLists.txt | 7 +++ README.md | 2 +- install.py | 16 ++++++- legate/driver/command.py | 7 +++ legate/driver/launcher.py | 11 +++++ scripts/generate-conda-envs.py | 21 +++++++-- src/legate_defines.h | 3 +- tests/unit/legate/driver/test_command.py | 1 + 9 files changed, 102 insertions(+), 23 deletions(-) diff --git a/BUILD.md b/BUILD.md index 8f749b0be..406953fa7 100644 --- a/BUILD.md +++ b/BUILD.md @@ -82,12 +82,18 @@ override this search by providing an install location for any dependency explicitly, using a `--with-` flag, e.g. `--with-nccl` and `--with-openblas`. -For multi-node execution Legate uses [GASNet](https://gasnet.lbl.gov/) which can be -requested using the `--network gasnet1` or `--network gasnetex` flag. By default -GASNet will be automatically downloaded and built, but if you have an existing -installation then you can inform the install script using the `--with-gasnet` flag. 
-You also need to specify the interconnect network of the target machine using the -`--conduit` flag. +For multi-node execution Legate can use [GASNet](https://gasnet.lbl.gov/) (use +`--network gasnet1` or `--network gasnetex`) or [UCX](https://openucx.org) (use +`--network ucx`). +With gasnet1 or gasnetex, GASNet will be automatically downloaded and built, +but if you have an existing installation then you can inform the install script +using the `--with-gasnet` flag. You also need to specify the interconnect network +of the target machine using the `--conduit` flag. +With UCX, the library must be already installed and `--with-ucx` can be used +to point to the installation path if UCX is not installed under common system paths. +At least version 1.14 is required, configured with `--enable-mt`. + +Compiling with networking support requires MPI. For example this would be an installation for a [DGX SuperPOD](https://www.nvidia.com/en-us/data-center/dgx-superpod/): @@ -113,7 +119,7 @@ To see all available configuration options, run with the `--help` flag: ## Dependency listing -### OS (`--os` flag) +### OS (`--os` flag on `generate-conda-envs.py`) Legate has been tested on Linux and MacOS, although only a few flavors of Linux such as Ubuntu have been thoroughly tested. There is currently no support for @@ -144,7 +150,9 @@ Only necessary if you wish to run with Nvidia GPUs. Some CUDA components necessary for building, e.g. the `nvcc` compiler and driver stubs, are not distributed through conda. These must instead be installed using -[system-level packages](https://developer.nvidia.com/cuda-downloads). +[system-level packages](https://developer.nvidia.com/cuda-downloads). If these +are not installed under a standard system location, you will need to inform +`install.py` of their location using `--with-cuda`. Independent of the system-level CUDA installation, conda will need to install an environment-local copy of the CUDA toolkit (which is what the `--ctk` flag @@ -160,15 +168,17 @@ issues on GitHub. Only necessary if you wish to run with Nvidia GPUs. -The following libraries are included automatically in CUDA-enabled environment -files: +The following additional CUDA libraries are required: -- `cutensor` -- `nccl` +- `curand` (only necessary to provide this if building without CUDA support; + CUDA-enabled installations will use the version bundled with CUDA) +- `cutensor` >= 1.3.3 (included in conda environment file) +- `nccl` (included in conda environment file) +- `thrust` >= 1.15 (pulled from github) If you wish to provide alternative installations for these, then you can remove -them from the environment file and pass the corresponding `--with-` flag -to `install.py`. +them from the environment file (if necessary) and pass the corresponding +`--with-` flag to `install.py`. ### Build tools @@ -221,14 +231,27 @@ file generated with `--no-openmpi`. Legate requires a build of MPI that supports `MPI_THREAD_MULTIPLE`. -### Networking libraries (e.g. Infiniband, RoCE, UCX; optional) +### Infiniband/RoCE networking libraries (optional) -Only necessary if you wish to run on multiple nodes. +Only necessary if you wish to run on multiple nodes, using the corresponding +networking hardware. Not available on conda; typically available through MOFED or the system-level package manager. -If using UCX, a build of UCX configured with `--enable-mt` is required. +### UCX >= 1.14 (`--ucx` flag; optional) + +Only necessary if you wish to run on multiple nodes, using the UCX Realm +networking backend. 
+ +A build of UCX configured with `--enable-mt` is required. + +The build of UCX available on conda might not include support for the particular +networking hardware on your machine (or may not be optimally tuned for such). In +that case you may want to use an environment file generated with `--no-ucx`, +get UCX from another source (e.g. MOFED, the system-level package manager, or +compiled manually from source), and pass the location of your installation to +`install.py` (if necessary) using `--with-ucx`. ## Alternative sources for dependencies diff --git a/CMakeLists.txt b/CMakeLists.txt index e83b9a779..c83e6b7c1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -16,6 +16,13 @@ cmake_minimum_required(VERSION 3.22.1 FATAL_ERROR) +if(POLICY CMP0074) + # find_package() uses _ROOT variables + # https://cmake.org/cmake/help/latest/policy/CMP0074.html#policy:CMP0074 + cmake_policy(SET CMP0074 NEW) + set(CMAKE_POLICY_DEFAULT_CMP0074 NEW) +endif() + if(POLICY CMP0060) # Link libraries by full path even in implicit directories # https://cmake.org/cmake/help/latest/policy/CMP0060.html#policy:CMP0060 diff --git a/README.md b/README.md index 7f13cc33e..040142e6f 100644 --- a/README.md +++ b/README.md @@ -417,7 +417,7 @@ Memory: * *Does Legate only work on NVIDIA hardware?* No, Legate will run on any processor supported by Legion (e.g. x86, ARM, and - PowerPC CPUs), and any network supported by GASNet (e.g. Infiniband, + PowerPC CPUs), and any network supported by GASNet or UCX (e.g. Infiniband, Cray, Omnipath, and (ROC-)Ethernet based interconnects). * *What languages does the Legate Core API have bindings for?* diff --git a/install.py b/install.py index 3f599b301..e2506a68d 100755 --- a/install.py +++ b/install.py @@ -246,6 +246,7 @@ def install( cmake_exe, cmake_generator, gasnet_dir, + ucx_dir, cuda_dir, maxdim, maxfields, @@ -292,6 +293,7 @@ def install( print("cmake_exe:", cmake_exe) print("cmake_generator:", cmake_generator) print("gasnet_dir:", gasnet_dir) + print("ucx_dir:", ucx_dir) print("cuda_dir:", cuda_dir) print("maxdim:", maxdim) print("maxfields:", maxfields) @@ -334,6 +336,7 @@ def validate_path(path): legion_dir = validate_path(legion_dir) legion_src_dir = validate_path(legion_src_dir) gasnet_dir = validate_path(gasnet_dir) + ucx_dir = validate_path(ucx_dir) thrust_dir = validate_path(thrust_dir) if verbose: @@ -343,6 +346,7 @@ def validate_path(path): print("legion_dir: ", legion_dir) print("legion_src_dir: ", legion_src_dir) print("gasnet_dir: ", gasnet_dir) + print("ucx_dir: ", ucx_dir) print("thrust_dir: ", thrust_dir) if thread_count is None: @@ -440,6 +444,8 @@ def validate_path(path): cmake_flags += ["-DNCCL_DIR=%s" % nccl_dir] if gasnet_dir: cmake_flags += ["-DGASNet_ROOT_DIR=%s" % gasnet_dir] + if ucx_dir: + cmake_flags += ["-DUCX_ROOT=%s" % ucx_dir] if conduit: cmake_flags += ["-DGASNet_CONDUIT=%s" % conduit] if cuda_dir: @@ -532,7 +538,7 @@ def driver(): dest="networks", action="append", required=False, - choices=["gasnet1", "gasnetex", "mpi"], + choices=["gasnet1", "gasnetex", "ucx", "mpi"], default=[], help="Realm networking backend to use for multi-node execution.", ) @@ -544,6 +550,14 @@ def driver(): default=os.environ.get("GASNET"), help="Path to GASNet installation directory.", ) + parser.add_argument( + "--with-ucx", + dest="ucx_dir", + metavar="DIR", + required=False, + default=os.environ.get("UCX_ROOT"), + help="Path to UCX installation directory.", + ) parser.add_argument( "--cuda", action=BooleanFlag, diff --git a/legate/driver/command.py 
b/legate/driver/command.py index 7aeac6dd9..2fc8cc1ed 100644 --- a/legate/driver/command.py +++ b/legate/driver/command.py @@ -360,6 +360,12 @@ def cmd_eager_alloc( return ("-lg:eager_alloc_percentage", str(eager_alloc)) +def cmd_ucx( + config: ConfigProtocol, system: System, launcher: Launcher +) -> CommandPart: + return ("-ucx:tls_host", "^dc,ud") + + def cmd_user_opts( config: ConfigProtocol, system: System, launcher: Launcher ) -> CommandPart: @@ -395,6 +401,7 @@ def cmd_user_opts( cmd_log_levels, cmd_log_file, cmd_eager_alloc, + cmd_ucx, # Append user flags so they can override whatever we provided cmd_user_opts, ) diff --git a/legate/driver/launcher.py b/legate/driver/launcher.py index 78f34f307..c5e55cfc2 100644 --- a/legate/driver/launcher.py +++ b/legate/driver/launcher.py @@ -186,6 +186,17 @@ def _compute_env(self) -> tuple[EnvDict, set[str]]: # threading support env["GASNET_MPI_THREAD"] = "MPI_THREAD_MULTIPLE" + # UCX-related environment variables + env["UCX_CUDA_COPY_MAX_REG_RATIO"] = "1.0" + env["UCX_MULTI_LANE_MAX_RATIO"] = "1.0" + env["UCX_IB_RCACHE_PURGE_ON_FORK"] = "n" + env["UCX_RC_TX_POLL_ALWAYS"] = "y" + + # Link to the UCX bootstrap plugin, in case Realm is using UCX + env["REALM_UCP_BOOTSTRAP_PLUGIN"] = str( + system.legion_paths.legion_lib_path / "realm_ucp_bootstrap_mpi.so" + ) + # Set some environment variables depending on our configuration that # we will check in the Legate binary to ensure that it is properly. # configured. Always make sure we include the Legion library diff --git a/scripts/generate-conda-envs.py b/scripts/generate-conda-envs.py index 17c5fb2a7..361f35149 100755 --- a/scripts/generate-conda-envs.py +++ b/scripts/generate-conda-envs.py @@ -75,6 +75,7 @@ def __str__(self) -> str: class BuildConfig(SectionConfig): compilers: bool = True openmpi: bool = True + ucx: bool = True header = "build" @@ -95,11 +96,14 @@ def conda(self) -> Reqs: pkgs += ("c-compiler", "cxx-compiler") if self.openmpi: pkgs += ("openmpi",) + if self.ucx: + pkgs += ("ucx>=1.14",) return sorted(pkgs) def __str__(self) -> str: val = "-compilers" if self.compilers else "" val += "-openmpi" if self.openmpi else "" + val += "-ucx" if self.ucx else "" return val @@ -171,6 +175,7 @@ class EnvConfig: ctk: str compilers: bool openmpi: bool + ucx: bool @property def sections(self) -> Tuple[SectionConfig, ...]: @@ -188,7 +193,7 @@ def cuda(self) -> CUDAConfig: @property def build(self) -> BuildConfig: - return BuildConfig(self.compilers, self.openmpi) + return BuildConfig(self.compilers, self.openmpi, self.ucx) @property def runtime(self) -> RuntimeConfig: @@ -252,13 +257,14 @@ def filename(self) -> str: """ ALL_CONFIGS = [ - EnvConfig("test", python, "linux", ctk, compilers, openmpi) + EnvConfig("test", python, "linux", ctk, compilers, openmpi, ucx) for python in PYTHON_VERSIONS for ctk in CTK_VERSIONS for compilers in (True, False) for openmpi in (True, False) + for ucx in (True, False) ] + [ - EnvConfig("test", python, "osx", "none", compilers, openmpi) + EnvConfig("test", python, "osx", "none", compilers, openmpi, False) for python in PYTHON_VERSIONS for compilers in (True, False) for openmpi in (True, False) @@ -345,6 +351,13 @@ def __call__(self, parser, namespace, values, option_string): default=None, help="Whether to include openmpi or not (default: both)", ) + parser.add_argument( + "--ucx", + action=BooleanFlag, + dest="ucx", + default=None, + help="Whether to include UCX or not (default: both)", + ) args = parser.parse_args(sys.argv[1:]) @@ -362,6 +375,8 @@ def __call__(self, 
parser, namespace, values, option_string): configs = (x for x in configs if x.os == args.os) if args.openmpi is not None: configs = (x for x in configs if x.build.openmpi == args.openmpi) + if args.ucx is not None: + configs = (x for x in configs if x.build.ucx == args.ucx) for config in configs: conda_sections = indent( diff --git a/src/legate_defines.h b/src/legate_defines.h index fa215e8e7..de272dde0 100644 --- a/src/legate_defines.h +++ b/src/legate_defines.h @@ -46,7 +46,8 @@ #endif #ifndef LEGATE_USE_NETWORK -#if defined(REALM_USE_GASNET1) || defined(REALM_USE_GASNETEX) || defined(REALM_USE_MPI) +#if defined(REALM_USE_GASNET1) || defined(REALM_USE_GASNETEX) || defined(REALM_USE_MPI) || \ + defined(REALM_USE_UCX) #define LEGATE_USE_NETWORK #endif #endif diff --git a/tests/unit/legate/driver/test_command.py b/tests/unit/legate/driver/test_command.py index 38f247ea3..436d53bbf 100644 --- a/tests/unit/legate/driver/test_command.py +++ b/tests/unit/legate/driver/test_command.py @@ -62,6 +62,7 @@ def test_CMD_PARTS() -> None: m.cmd_log_levels, m.cmd_log_file, m.cmd_eager_alloc, + m.cmd_ucx, m.cmd_user_opts, ) From 6c718de3c7a986a8a5003d0964dfcc810756bc6b Mon Sep 17 00:00:00 2001 From: Marcin Zalewski Date: Fri, 27 Jan 2023 10:31:48 -0800 Subject: [PATCH 119/121] Update the architectures built in conda package (#545) (#546) Co-authored-by: Marcin Zalewski --- conda/conda-build/build.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conda/conda-build/build.sh b/conda/conda-build/build.sh index 27b5aead1..317947dc4 100644 --- a/conda/conda-build/build.sh +++ b/conda/conda-build/build.sh @@ -16,7 +16,7 @@ CMAKE_ARGS+=" if [ -z "$CPU_ONLY" ]; then CMAKE_ARGS+=" -DLegion_USE_CUDA=ON --DCMAKE_CUDA_ARCHITECTURES:LIST=60-real;70-real;75-real;80-real;86 +-DCMAKE_CUDA_ARCHITECTURES:LIST=60-real;70-real;75-real;80-real;90 " fi From 90153d289519437f22393af03ed077b6fad3e0a7 Mon Sep 17 00:00:00 2001 From: Marcin Zalewski Date: Mon, 30 Jan 2023 13:55:09 -0800 Subject: [PATCH 120/121] Revert "Update the architectures built in conda package (#545) (#546)" (#550) This reverts commit 6c718de3c7a986a8a5003d0964dfcc810756bc6b. Co-authored-by: Marcin Zalewski --- conda/conda-build/build.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conda/conda-build/build.sh b/conda/conda-build/build.sh index 317947dc4..27b5aead1 100644 --- a/conda/conda-build/build.sh +++ b/conda/conda-build/build.sh @@ -16,7 +16,7 @@ CMAKE_ARGS+=" if [ -z "$CPU_ONLY" ]; then CMAKE_ARGS+=" -DLegion_USE_CUDA=ON --DCMAKE_CUDA_ARCHITECTURES:LIST=60-real;70-real;75-real;80-real;90 +-DCMAKE_CUDA_ARCHITECTURES:LIST=60-real;70-real;75-real;80-real;86 " fi From 3b5f69a372e23d26e574475663212ea8977bd17d Mon Sep 17 00:00:00 2001 From: Marcin Zalewski Date: Mon, 30 Jan 2023 14:14:04 -0800 Subject: [PATCH 121/121] Fix the default Legion version (#547) Co-authored-by: Marcin Zalewski --- install.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/install.py b/install.py index e2506a68d..56c508ad9 100755 --- a/install.py +++ b/install.py @@ -743,7 +743,7 @@ def driver(): "--legion-branch", dest="legion_branch", required=False, - default="collective", + default="04cf06a2", help="Legion branch to build Legate with.", ) args, unknown = parser.parse_known_args()
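
To make the multi-node options added in this series concrete, here is a minimal
sketch of how `install.py` might be invoked once these patches are applied. The
flags themselves (`--network`, `--with-ucx`, `--with-gasnet`, `--conduit`) are
the ones introduced or documented above; the installation path and the conduit
name are illustrative placeholders rather than values taken from the patches.

```shell
# UCX backend: UCX >= 1.14, configured with --enable-mt, must already be
# installed; point the build at it if it is not in a standard location.
./install.py --network ucx --with-ucx /path/to/ucx

# GASNet backend: GASNet is downloaded and built automatically unless
# --with-gasnet points at an existing installation; the interconnect is
# selected with --conduit (the conduit name here is only an example).
./install.py --network gasnetex --conduit ibv
```

In both cases an MPI installation is required to compile with networking
support, as the documentation change above notes.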
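The new `--ucx` selector on `scripts/generate-conda-envs.py` can likewise be
sketched as follows; this assumes the script's other selectors can be left at
their defaults, which this series does not show.

```shell
# Generate an environment file that pulls ucx>=1.14 from conda
./scripts/generate-conda-envs.py --os linux --ucx

# Omit UCX from the environment file, e.g. when UCX comes from MOFED or the
# system-level package manager and is passed to install.py via --with-ucx
./scripts/generate-conda-envs.py --os linux --no-ucx
```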