diff --git a/.github/workflows/require-labels.yml b/.github/workflows/require-labels.yml index d6634be4f..6f1f751d2 100644 --- a/.github/workflows/require-labels.yml +++ b/.github/workflows/require-labels.yml @@ -6,13 +6,9 @@ jobs: label: runs-on: ubuntu-latest steps: - - name: Delay checking labels if PR is just created - if: ${{ github.event.action == 'opened' }} - run: sleep 300s - shell: bash - name: Check Labels - uses: mheap/github-action-required-labels@v2 + uses: mheap/github-action-required-labels@v3 with: mode: exactly count: 1 - labels: "category:new-feature, category:improvement, category:bug-fix, category:task" \ No newline at end of file + labels: "category:new-feature, category:improvement, category:bug-fix, category:task, category:documentation" \ No newline at end of file diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index c6003f5fc..a8ac54762 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,10 +1,10 @@ repos: - repo: https://github.com/PyCQA/isort - rev: 5.11.4 + rev: 5.12.0 hooks: - id: isort - repo: https://github.com/psf/black - rev: 22.12.0 + rev: 23.1.0 hooks: - id: black - repo: https://github.com/PyCQA/flake8 @@ -18,7 +18,7 @@ repos: files: \.(cu|cuh|h|cc|inl)$ types_or: [] - repo: https://github.com/pre-commit/mirrors-mypy - rev: 'v0.991' + rev: 'v1.0.1' hooks: - id: mypy pass_filenames: false diff --git a/BUILD.md b/BUILD.md index 406953fa7..43c054fe9 100644 --- a/BUILD.md +++ b/BUILD.md @@ -125,7 +125,7 @@ Legate has been tested on Linux and MacOS, although only a few flavors of Linux such as Ubuntu have been thoroughly tested. There is currently no support for Windows. -### Python >= 3.8 (`--python` flag) +### Python >= 3.9 (`--python` flag) In terms of Python compatibility, Legate *roughly* follows the timeline outlined in [NEP 29](https://numpy.org/neps/nep-0029-deprecation_policy.html). 
@@ -189,6 +189,7 @@ in the environment file: - `git` - `make` - `ninja` (this is optional, but produces more informative build output) +- `rust` - `scikit-build` ### OpenBLAS diff --git a/CMakeLists.txt b/CMakeLists.txt index c83e6b7c1..62f4f3cba 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -62,7 +62,7 @@ endif() # - Download and initialize RAPIDS CMake helpers ----------------------------- if(NOT EXISTS ${CMAKE_BINARY_DIR}/RAPIDS.cmake) - file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-22.08/RAPIDS.cmake + file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-23.02/RAPIDS.cmake ${CMAKE_BINARY_DIR}/RAPIDS.cmake) endif() include(${CMAKE_BINARY_DIR}/RAPIDS.cmake) @@ -72,7 +72,7 @@ include(rapids-cuda) include(rapids-export) include(rapids-find) -set(legate_core_version 22.12.00) +set(legate_core_version 23.03.00) # For now we want the optimization flags to match on both normal make and cmake # builds so we override the cmake defaults here for release, this changes @@ -123,3 +123,8 @@ if(CMAKE_GENERATOR STREQUAL "Ninja") endfunction() add_touch_legate_core_ninja_build_target() endif() + +option(legate_core_EXAMPLE_BUILD_TESTS OFF) +if (legate_core_EXAMPLE_BUILD_TESTS) + add_subdirectory(examples) +endif() diff --git a/README.md b/README.md index 040142e6f..576222033 100644 --- a/README.md +++ b/README.md @@ -23,9 +23,7 @@ by running the same code that runs on a desktop or a laptop at scale. Using this technology, computational and data scientists can develop and test programs on moderately sized data sets on local machines and then immediately scale up to larger data sets deployed on many nodes in the cloud or on a -supercomputer without any code modifications. In visual terms: - -drawing +supercomputer without any code modifications. 
The Legate project is built upon two foundational principles: @@ -223,8 +221,12 @@ Legate Core is available [on conda](https://anaconda.org/legate/legate-core): conda install -c nvidia -c conda-forge -c legate legate-core ``` -The conda package is compatible with CUDA >= 11.4 (CUDA driver version >= r470), -and Volta or later GPU architectures. +Only linux-64 packages are available at the moment. + +The default package contains GPU support, and is compatible with CUDA >= 11.4 +(CUDA driver version >= r470), and Volta or later GPU architectures. There are +also CPU-only packages available, which will be automatically selected by `conda` +when installing on a machine without GPUs. See [BUILD.md](BUILD.md) for instructions on building Legate Core from source. @@ -461,15 +463,14 @@ See the discussion of contributing in [CONTRIBUTING.md](CONTRIBUTING.md). ## Documentation -A complete list of available features can is provided in the [API -reference](https://nv-legate.github.io/legate.core/api.html). +A complete list of available features can be found in the [Legate Core +documentation](https://nv-legate.github.io/legate.core). ## Next Steps We recommend starting by experimenting with at least one Legate application library to test out performance and see how Legate works. If you are interested in building your own Legate application library, we recommend that you -investigate our [Legate Hello World application -library](https://github.com/nv-legate/legate.hello) that provides a small -example of how to get started developing your own drop-in replacement library -on top of Legion using the Legate Core library. +investigate our [Legate Hello World application library](examples/hello) that +provides a small example of how to get started developing your own drop-in +replacement library on top of Legion using the Legate Core library. 
diff --git a/cmake/Modules/cuda_arch_helpers.cmake b/cmake/Modules/cuda_arch_helpers.cmake index c70235f68..9a2206f69 100644 --- a/cmake/Modules/cuda_arch_helpers.cmake +++ b/cmake/Modules/cuda_arch_helpers.cmake @@ -44,6 +44,9 @@ function(set_cuda_arch_from_names) if(CMAKE_CUDA_ARCHITECTURES MATCHES "ampere") list(APPEND cuda_archs 80) endif() + if(CMAKE_CUDA_ARCHITECTURES MATCHES "hopper") + list(APPEND cuda_archs 90) + endif() if(cuda_archs) list(LENGTH cuda_archs num_archs) @@ -83,6 +86,7 @@ function(add_cuda_architecture_defines defs) add_def_if_arch_enabled("70" "VOLTA_ARCH") add_def_if_arch_enabled("75" "TURING_ARCH") add_def_if_arch_enabled("80" "AMPERE_ARCH") + add_def_if_arch_enabled("90" "HOPPER_ARCH") set(${defs} ${_defs} PARENT_SCOPE) endfunction() diff --git a/cmake/Modules/legate_core_options.cmake b/cmake/Modules/legate_core_options.cmake index 7e4b80261..62fd1e759 100644 --- a/cmake/Modules/legate_core_options.cmake +++ b/cmake/Modules/legate_core_options.cmake @@ -78,7 +78,8 @@ endif() option(legate_core_STATIC_CUDA_RUNTIME "Statically link the cuda runtime library" OFF) option(legate_core_EXCLUDE_LEGION_FROM_ALL "Exclude Legion targets from legate.core's 'all' target" OFF) -option(legate_core_COLLECTIVE "Use of collective instances" OFF) +option(legate_core_COLLECTIVE "Use of collective instances" ON) +option(legate_core_BUILD_DOCS "Build doxygen docs" OFF) set_or_default(NCCL_DIR NCCL_PATH) diff --git a/cmake/legate_helper_functions.cmake b/cmake/legate_helper_functions.cmake new file mode 100644 index 000000000..db269fa78 --- /dev/null +++ b/cmake/legate_helper_functions.cmake @@ -0,0 +1,442 @@ +macro(legate_include_rapids) + if (NOT _LEGATE_HAS_RAPIDS) + if(NOT EXISTS ${CMAKE_BINARY_DIR}/LEGATE_RAPIDS.cmake) + file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-23.02/RAPIDS.cmake + ${CMAKE_BINARY_DIR}/LEGATE_RAPIDS.cmake) + endif() + include(${CMAKE_BINARY_DIR}/LEGATE_RAPIDS.cmake) + include(rapids-cmake) + 
include(rapids-cpm) + include(rapids-cuda) + include(rapids-export) + include(rapids-find) + set(_LEGATE_HAS_RAPIDS ON) + endif() +endmacro() + +function(legate_default_cpp_install target) + set(options) + set(one_value_args EXPORT) + set(multi_value_args) + cmake_parse_arguments( + LEGATE_OPT + "${options}" + "${one_value_args}" + "${multi_value_args}" + ${ARGN} + ) + + if (NOT LEGATE_OPT_EXPORT) + message(FATAL_ERROR "Need EXPORT name for legate_default_install") + endif() + + legate_include_rapids() + + rapids_cmake_install_lib_dir(lib_dir) + + install(TARGETS ${target} + DESTINATION ${lib_dir} + EXPORT ${LEGATE_OPT_EXPORT}) + + set(final_code_block + "set(${target}_BUILD_LIBDIR ${CMAKE_BINARY_DIR}/legate_${target})" + ) + + rapids_export( + INSTALL ${target} + EXPORT_SET ${LEGATE_OPT_EXPORT} + GLOBAL_TARGETS ${target} + NAMESPACE legate:: + LANGUAGES ${ENABLED_LANGUAGES} + ) + + # build export targets + rapids_export( + BUILD ${target} + EXPORT_SET ${LEGATE_OPT_EXPORT} + GLOBAL_TARGETS ${target} + NAMESPACE legate:: + FINAL_CODE_BLOCK final_code_block + LANGUAGES ${ENABLED_LANGUAGES} + ) +endfunction() + +function(legate_add_cffi header) + if (NOT DEFINED CMAKE_C_COMPILER) + message(FATAL_ERROR "Must enable C language to build Legate projects") + endif() + + set(options) + set(one_value_args TARGET) + set(multi_value_args) + cmake_parse_arguments( + LEGATE_OPT + "${options}" + "${one_value_args}" + "${multi_value_args}" + ${ARGN} + ) + # abbreviate for the function below + set(target ${LEGATE_OPT_TARGET}) + set(install_info_in +[=[ +from pathlib import Path + +def get_libpath(): + import os, sys, platform + join = os.path.join + exists = os.path.exists + dirname = os.path.dirname + cn_path = dirname(dirname(__file__)) + so_ext = { + "": "", + "Java": ".jar", + "Linux": ".so", + "Darwin": ".dylib", + "Windows": ".dll" + }[platform.system()] + + def find_lib(libdir): + target = f"lib@target@{so_ext}*" + search_path = Path(libdir) + matches = [m for m in 
search_path.rglob(target)] + if matches: + return matches[0].parent + return None + + return ( + find_lib("@libdir@") or + find_lib(join(dirname(dirname(dirname(cn_path))), "lib")) or + find_lib(join(dirname(dirname(sys.executable)), "lib")) or + "" + ) + +libpath: str = get_libpath() + +header: str = """ + @header@ + void @target@_perform_registration(); +""" +]=]) + set(install_info_py_in ${CMAKE_BINARY_DIR}/legate_${target}/install_info.py.in) + set(install_info_py ${CMAKE_SOURCE_DIR}/${target}/install_info.py) + file(WRITE ${install_info_py_in} "${install_info_in}") + + set(generate_script_content + [=[ + execute_process( + COMMAND ${CMAKE_C_COMPILER} + -E + -P @header@ + ECHO_ERROR_VARIABLE + OUTPUT_VARIABLE header + COMMAND_ERROR_IS_FATAL ANY + ) + configure_file( + @install_info_py_in@ + @install_info_py@ + @ONLY) + ]=]) + + set(generate_script ${CMAKE_BINARY_DIR}/gen_install_info.cmake) + file(CONFIGURE + OUTPUT ${generate_script} + CONTENT "${generate_script_content}" + @ONLY + ) + + if (DEFINED ${target}_BUILD_LIBDIR) + # this must have been imported from an existing editable build + set(libdir ${${target}_BUILD_LIBDIR}) + else() + # libraries are built in a common spot + set(libdir ${CMAKE_BINARY_DIR}/legate_${target}) + message("libdir to binary dir") + endif() + add_custom_target("generate_install_info_py" ALL + COMMAND ${CMAKE_COMMAND} + -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + -Dtarget=${target} + -Dlibdir=${libdir} + -P ${generate_script} + OUTPUT ${install_info_py} + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} + COMMENT "Generating install_info.py" + DEPENDS ${header} + ) +endfunction() + +function(legate_default_python_install target) + set(options) + set(one_value_args EXPORT) + set(multi_value_args) + cmake_parse_arguments( + LEGATE_OPT + "${options}" + "${one_value_args}" + "${multi_value_args}" + ${ARGN} + ) + + if (NOT LEGATE_OPT_EXPORT) + message(FATAL_ERROR "Need EXPORT name for legate_default_python_install") + endif() + + if (SKBUILD) + 
add_library(${target}_python INTERFACE) + add_library(legate::${target}_python ALIAS ${target}_python) + target_link_libraries(${target}_python INTERFACE legate::core legate::${target}) + + install(TARGETS ${target}_python + DESTINATION ${lib_dir} + EXPORT ${LEGATE_OPT_EXPORT}) + + legate_include_rapids() + rapids_export( + INSTALL ${target}_python + EXPORT_SET ${LEGATE_OPT_EXPORT} + GLOBAL_TARGETS ${target}_python + NAMESPACE legate:: + ) + endif() +endfunction() + +function(legate_add_cpp_subdirectory dir) + set(options) + set(one_value_args EXPORT TARGET) + set(multi_value_args) + cmake_parse_arguments( + LEGATE_OPT + "${options}" + "${one_value_args}" + "${multi_value_args}" + ${ARGN} + ) + + if (NOT LEGATE_OPT_EXPORT) + message(FATAL_ERROR "Need EXPORT name for legate_default_install") + endif() + + if (NOT LEGATE_OPT_TARGET) + message(FATAL_ERROR "Need TARGET name for Legate package") + endif() + # abbreviate for the function + set(target ${LEGATE_OPT_TARGET}) + + legate_include_rapids() + + rapids_find_package(legate_core CONFIG + GLOBAL_TARGETS legate::core + BUILD_EXPORT_SET ${LEGATE_OPT_EXPORT} + INSTALL_EXPORT_SET ${LEGATE_OPT_EXPORT}) + + if (SKBUILD) + if (NOT DEFINED ${target}_ROOT) + set(${target}_ROOT ${CMAKE_SOURCE_DIR}/build) + endif() + rapids_find_package(${target} CONFIG + GLOBAL_TARGETS legate::${target} + BUILD_EXPORT_SET ${LEGATE_OPT_EXPORT} + INSTALL_EXPORT_SET ${LEGATE_OPT_EXPORT}) + if (NOT ${target}_FOUND) + add_subdirectory(${dir} ${CMAKE_BINARY_DIR}/legate_${target}) + legate_default_cpp_install(${target} EXPORT ${LEGATE_OPT_EXPORT}) + else() + # Make sure the libdir is visible to other functions + set(${target}_BUILD_LIBDIR "${${target}_BUILD_LIBDIR}" PARENT_SCOPE) + endif() + else() + add_subdirectory(${dir} ${CMAKE_BINARY_DIR}/legate_${target}) + legate_default_cpp_install(${target} EXPORT ${LEGATE_OPT_EXPORT}) + endif() + +endfunction() + +function(legate_cpp_library_template target output_sources_variable) + set(file_template +[=[ 
+#pragma once + +#include "legate.h" + +namespace @target@ { + +struct Registry { + public: + template + static void record_variant(Args&&... args) + { + get_registrar().record_variant(std::forward(args)...); + } + static legate::TaskRegistrar& get_registrar(); +}; + +template +struct Task : public legate::LegateTask { + using Registrar = Registry; + static constexpr int TASK_ID = ID; +}; + +} +]=]) + string(CONFIGURE "${file_template}" file_content @ONLY) + file(WRITE ${CMAKE_CURRENT_SOURCE_DIR}/legate_library.h "${file_content}") + + set(file_template +[=[ +#include "legate_library.h" +#include "core/mapping/mapping.h" + +namespace @target@ { + +class Mapper : public legate::mapping::LegateMapper { + public: + Mapper(){} + + private: + Mapper(const Mapper& rhs) = delete; + Mapper& operator=(const Mapper& rhs) = delete; + + // Legate mapping functions + public: + void set_machine(const legate::mapping::MachineQueryInterface* machine) override { + machine_ = machine; + } + + legate::mapping::TaskTarget task_target( + const legate::mapping::Task& task, + const std::vector& options) override { + return *options.begin(); + } + + std::vector store_mappings( + const legate::mapping::Task& task, + const std::vector& options) override { + using legate::mapping::StoreMapping; + std::vector mappings; + auto& inputs = task.inputs(); + auto& outputs = task.outputs(); + for (auto& input : inputs) { + mappings.push_back(StoreMapping::default_mapping(input, options.front())); + mappings.back().policy.exact = true; + } + for (auto& output : outputs) { + mappings.push_back(StoreMapping::default_mapping(output, options.front())); + mappings.back().policy.exact = true; + } + return std::move(mappings); + } + + legate::Scalar tunable_value(legate::TunableID tunable_id) override { + return 0; + } + + private: + const legate::mapping::MachineQueryInterface* machine_; +}; + +static const char* const library_name = "@target@"; + +Legion::Logger log_@target@(library_name); + +/*static*/ 
legate::TaskRegistrar& Registry::get_registrar() +{ + static legate::TaskRegistrar registrar; + return registrar; +} + +void registration_callback() +{ + legate::ResourceConfig config; + config.max_mappers = 1; + config.max_tasks = 1024; + config.max_reduction_ops = 8; + legate::LibraryContext context(library_name, config); + + Registry::get_registrar().register_all_tasks(context); + + // Now we can register our mapper with the runtime + context.register_mapper(std::make_unique(), 0); +} + +} // namespace @target@ + +extern "C" { + +void @target@_perform_registration(void) +{ + // Tell the runtime about our registration callback so we hook it + // in before the runtime starts and make it global so that we know + // that this call back is invoked everywhere across all nodes + legate::Core::perform_registration<@target@::registration_callback>(); +} + +} +]=]) + string(CONFIGURE "${file_template}" file_content @ONLY) + file(WRITE ${CMAKE_CURRENT_SOURCE_DIR}/legate_library.cc "${file_content}") + + set(${output_sources_variable} + legate_library.h + legate_library.cc + PARENT_SCOPE + ) +endfunction() + +function(legate_python_library_template target) +set(file_template +[=[ +from legate.core import ( + Library, + ResourceConfig, + get_legate_runtime, +) +import os +from typing import Any + +class UserLibrary(Library): + def __init__(self, name: str) -> None: + self.name = name + self.shared_object: Any = None + + @property + def cffi(self) -> Any: + return self.shared_object + + def get_name(self) -> str: + return self.name + + def get_shared_library(self) -> str: + from @target@.install_info import libpath + return os.path.join(libpath, f"lib@target@{self.get_library_extension()}") + + def get_c_header(self) -> str: + from @target@.install_info import header + + return header + + def get_registration_callback(self) -> str: + return "@target@_perform_registration" + + def get_resource_configuration(self) -> ResourceConfig: + assert self.shared_object is not None + 
config = ResourceConfig() + config.max_tasks = 1024 + config.max_mappers = 1 + config.max_reduction_ops = 8 + config.max_projections = 0 + config.max_shardings = 0 + return config + + def initialize(self, shared_object: Any) -> None: + self.shared_object = shared_object + + def destroy(self) -> None: + pass + +user_lib = UserLibrary("@target@") +user_context = get_legate_runtime().register_library(user_lib) +]=]) + string(CONFIGURE "${file_template}" file_content @ONLY) + file(WRITE ${CMAKE_SOURCE_DIR}/${target}/library.py "${file_content}") +endfunction() diff --git a/cmake/thirdparty/get_legion.cmake b/cmake/thirdparty/get_legion.cmake index e158391cc..71ce33fb1 100644 --- a/cmake/thirdparty/get_legion.cmake +++ b/cmake/thirdparty/get_legion.cmake @@ -1,5 +1,5 @@ #============================================================================= -# Copyright 2022 NVIDIA Corporation +# Copyright 2022-2023 NVIDIA Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,6 +14,8 @@ # limitations under the License. 
#============================================================================= +include_guard(GLOBAL) + function(find_or_configure_legion) set(oneValueArgs VERSION REPOSITORY BRANCH EXCLUDE_FROM_ALL) cmake_parse_arguments(PKG "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) @@ -21,6 +23,22 @@ function(find_or_configure_legion) include("${rapids-cmake-dir}/export/detail/parse_version.cmake") rapids_export_parse_version(${PKG_VERSION} Legion PKG_VERSION) + string(REGEX REPLACE "^0([0-9]+)?$" "\\1" Legion_major_version "${Legion_major_version}") + string(REGEX REPLACE "^0([0-9]+)?$" "\\1" Legion_minor_version "${Legion_minor_version}") + string(REGEX REPLACE "^0([0-9]+)?$" "\\1" Legion_patch_version "${Legion_patch_version}") + + include("${rapids-cmake-dir}/cpm/detail/package_details.cmake") + rapids_cpm_package_details(Legion version git_repo git_branch shallow exclude_from_all) + + set(version "${Legion_major_version}.${Legion_minor_version}.${Legion_patch_version}") + set(exclude_from_all ${PKG_EXCLUDE_FROM_ALL}) + if(PKG_BRANCH) + set(git_branch "${PKG_BRANCH}") + endif() + if(PKG_REPOSITORY) + set(git_repo "${PKG_REPOSITORY}") + endif() + set(Legion_CUDA_ARCH "") if(Legion_USE_CUDA) set(Legion_CUDA_ARCH ${CMAKE_CUDA_ARCHITECTURES}) @@ -47,14 +65,15 @@ function(find_or_configure_legion) if(Legion_DIR OR Legion_ROOT) set(_find_mode REQUIRED) endif() - rapids_find_package(Legion ${PKG_VERSION} EXACT CONFIG ${_find_mode} ${FIND_PKG_ARGS}) + rapids_find_package(Legion ${version} EXACT CONFIG ${_find_mode} ${FIND_PKG_ARGS}) endif() if(Legion_FOUND) - message(STATUS "CPM: using local package Legion@${PKG_VERSION}") + message(STATUS "CPM: using local package Legion@${version}") else() + include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/Modules/cpm_helpers.cmake) - get_cpm_git_args(legion_cpm_git_args REPOSITORY ${PKG_REPOSITORY} BRANCH ${PKG_BRANCH}) + get_cpm_git_args(legion_cpm_git_args REPOSITORY ${git_repo} BRANCH ${git_branch}) if(NOT DEFINED 
Legion_PYTHON_EXTRA_INSTALL_ARGS) set(Legion_PYTHON_EXTRA_INSTALL_ARGS "--single-version-externally-managed --root=/") endif() @@ -86,38 +105,41 @@ function(find_or_configure_legion) endif() # Get the `stubs/libcuda.so` path so we can set CMAKE_LIBRARY_PATH for FindCUDA.cmake + set(_libdir "lib64") + if(CMAKE_SIZEOF_VOID_P LESS 8) + set(_libdir "lib") + endif() - # Prefer users' CUDA_PATH envvar (if set) - set(_cuda_stubs "$ENV{CUDA_PATH}") - if(NOT _cuda_stubs) - if(DEFINED ENV{CUDA_LIB_PATH}) - # Prefer users' CUDA_LIB_PATH envvar (if set) - list(APPEND _cuda_stubs "$ENV{CUDA_LIB_PATH}") - message(VERBOSE "legate.core: Path(s) to CUDA stubs: ${_cuda_stubs}") - elseif(EXISTS "${CUDAToolkit_LIBRARY_DIR}/stubs/libcuda.so") - # This might be the path to the `$CONDA_PREFIX/lib` - # If it is (and it has the libcuda.so driver stub), - # then we know we're using the cuda-toolkit package - # and should link to that driver stub instead of the - # one potentially in `/usr/local/cuda/lib[64]/stubs` - list(APPEND _cuda_stubs "${CUDAToolkit_LIBRARY_DIR}/stubs") - message(VERBOSE "legate.core: Path(s) to CUDA stubs: ${_cuda_stubs}") - elseif(DEFINED ENV{LIBRARY_PATH}) - # LIBRARY_PATH is set automatically in the `nvidia/cuda` containers. - # Only use it if the conda env doesn't have the `stubs/libcuda.so` lib. 
- list(APPEND _cuda_stubs "$ENV{LIBRARY_PATH}") - message(VERBOSE "legate.core: Path(s) to CUDA stubs: ${_cuda_stubs}") - elseif(CMAKE_SIZEOF_VOID_P LESS 8) - # Otherwise assume stubs are relative to the CUDA toolkit root dir - list(APPEND _cuda_stubs "${CUDAToolkit_LIBRARY_ROOT}/lib/stubs") - message(VERBOSE "legate.core: Path(s) to CUDA stubs: ${_cuda_stubs}") - else() - # Otherwise assume stubs are relative to the CUDA toolkit root dir - list(APPEND _cuda_stubs "${CUDAToolkit_LIBRARY_ROOT}/lib64/stubs") - message(VERBOSE "legate.core: Path(s) to CUDA stubs: ${_cuda_stubs}") - endif() + if(EXISTS "${CUDAToolkit_LIBRARY_DIR}/stubs/libcuda.so") + # This might be the path to the `$CONDA_PREFIX/lib` + # If it is (and it has the libcuda.so driver stub), + # then we know we're using the cuda-toolkit package + # and should link to that driver stub instead of the + # one potentially in `/usr/local/cuda/lib[64]/stubs` + list(APPEND _cuda_stubs "${CUDAToolkit_LIBRARY_DIR}/stubs") + elseif(EXISTS "${CUDAToolkit_TARGET_DIR}/${_libdir}/stubs/libcuda.so") + # Otherwise assume stubs are relative to the CUDA toolkit root dir + list(APPEND _cuda_stubs "${CUDAToolkit_TARGET_DIR}/${_libdir}/stubs") + elseif(EXISTS "${CUDAToolkit_LIBRARY_ROOT}/${_libdir}/stubs/libcuda.so") + list(APPEND _cuda_stubs "${CUDAToolkit_LIBRARY_ROOT}/${_libdir}/stubs") + elseif(DEFINED ENV{CUDA_PATH} AND EXISTS "$ENV{CUDA_PATH}/${_libdir}/stubs/libcuda.so") + # Use CUDA_PATH envvar (if set) + list(APPEND _cuda_stubs "$ENV{CUDA_PATH}/${_libdir}/stubs/libcuda.so") + elseif(DEFINED ENV{CUDA_LIB_PATH} AND EXISTS "$ENV{CUDA_LIB_PATH}/stubs/libcuda.so") + # Use CUDA_LIB_PATH envvar (if set) + list(APPEND _cuda_stubs "$ENV{CUDA_LIB_PATH}/stubs/libcuda.so") + elseif(DEFINED ENV{LIBRARY_PATH} AND + ("$ENV{LIBRARY_PATH}" STREQUAL "/usr/local/cuda/${_libdir}/stubs")) + # LIBRARY_PATH is set in the `nvidia/cuda` containers to /usr/local/cuda/lib64/stubs + list(APPEND _cuda_stubs "$ENV{LIBRARY_PATH}") + else() + 
message(FATAL_ERROR "Could not find the libcuda.so driver stub. " + "Please reconfigure with -DCUDAToolkit_ROOT= " + "set to a valid CUDA Toolkit installation.") endif() + message(VERBOSE "legate.core: Path(s) to CUDA stubs: ${_cuda_stubs}") + list(APPEND _legion_cuda_options "CUDA_NVCC_FLAGS ${_nvcc_flags}") list(APPEND _legion_cuda_options "CMAKE_CUDA_STANDARD ${_cuda_std}") # Set this so Legion correctly finds the CUDA toolkit. @@ -131,6 +153,8 @@ function(find_or_configure_legion) list(APPEND CMAKE_C_IMPLICIT_LINK_DIRECTORIES "${_cuda_stubs}") list(APPEND CMAKE_CXX_IMPLICIT_LINK_DIRECTORIES "${_cuda_stubs}") list(APPEND CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES "${_cuda_stubs}") + set(legate_core_cuda_stubs_path "${_cuda_stubs}" PARENT_SCOPE) + set(legate_core_cuda_stubs_path "${_cuda_stubs}" CACHE STRING "" FORCE) endif() # Because legion sets these as cache variables, we need to force set this as a cache variable here @@ -144,14 +168,19 @@ function(find_or_configure_legion) set(Legion_CUDA_ARCH ${Legion_CUDA_ARCH} CACHE STRING "Comma-separated list of CUDA architectures to build for (e.g. 
60,70)" FORCE) - rapids_cpm_find(Legion ${PKG_VERSION} ${FIND_PKG_ARGS} + message(VERBOSE "legate.core: Legion version: ${version}") + message(VERBOSE "legate.core: Legion git_repo: ${git_repo}") + message(VERBOSE "legate.core: Legion git_branch: ${git_branch}") + message(VERBOSE "legate.core: Legion exclude_from_all: ${exclude_from_all}") + + rapids_cpm_find(Legion ${version} ${FIND_PKG_ARGS} CPM_ARGS ${legion_cpm_git_args} FIND_PACKAGE_ARGUMENTS EXACT - EXCLUDE_FROM_ALL ${PKG_EXCLUDE_FROM_ALL} + EXCLUDE_FROM_ALL ${exclude_from_all} OPTIONS ${_legion_cuda_options} "CMAKE_CXX_STANDARD ${_cxx_std}" - "Legion_VERSION ${PKG_VERSION}" + "Legion_VERSION ${version}" "Legion_BUILD_BINDINGS ON" "Legion_BUILD_APPS OFF" "Legion_BUILD_TESTS OFF" @@ -160,6 +189,14 @@ function(find_or_configure_legion) "Legion_REDOP_COMPLEX ON" "Legion_GPU_REDUCTIONS OFF" "Legion_BUILD_RUST_PROFILER ON" + "Legion_SPY ${Legion_SPY}" + "Legion_USE_LLVM ${Legion_USE_LLVM}" + "Legion_USE_HDF5 ${Legion_USE_HDF5}" + "Legion_USE_CUDA ${Legion_USE_CUDA}" + "Legion_NETWORKS ${Legion_NETWORKS}" + "Legion_USE_OpenMP ${Legion_USE_OpenMP}" + "Legion_USE_Python ${Legion_USE_Python}" + "Legion_BOUNDS_CHECKS ${Legion_BOUNDS_CHECKS}" ) endif() @@ -175,16 +212,23 @@ function(find_or_configure_legion) endfunction() -if(NOT DEFINED legate_core_LEGION_BRANCH) - set(legate_core_LEGION_BRANCH collective) -endif() - -if(NOT DEFINED legate_core_LEGION_REPOSITORY) - set(legate_core_LEGION_REPOSITORY https://gitlab.com/StanfordLegion/legion.git) -endif() +foreach(_var IN ITEMS "legate_core_LEGION_VERSION" + "legate_core_LEGION_BRANCH" + "legate_core_LEGION_REPOSITORY" + "legate_core_EXCLUDE_LEGION_FROM_ALL") + if(DEFINED ${_var}) + # Create a legate_core_LEGION_BRANCH variable in the current scope either from the existing + # current-scope variable, or the cache variable. + set(${_var} "${${_var}}") + # Remove legate_core_LEGION_BRANCH from the CMakeCache.txt. 
This ensures reconfiguring the same + # build dir without passing `-Dlegate_core_LEGION_BRANCH=` reverts to the value in versions.json + # instead of reusing the previous `-Dlegate_core_LEGION_BRANCH=` value. + unset(${_var} CACHE) + endif() +endforeach() if(NOT DEFINED legate_core_LEGION_VERSION) - set(legate_core_LEGION_VERSION "${legate_core_VERSION_MAJOR}.${legate_core_VERSION_MINOR}.0") + set(legate_core_LEGION_VERSION "${legate_core_VERSION}") endif() find_or_configure_legion(VERSION ${legate_core_LEGION_VERSION} diff --git a/cmake/versions.json b/cmake/versions.json index 02d16cf16..c5bf2d31a 100644 --- a/cmake/versions.json +++ b/cmake/versions.json @@ -1,9 +1,13 @@ { "packages" : { - "Thrust" : { - "version" : "1.15.0.0", - "git_url" : "https://github.com/NVIDIA/thrust.git", - "git_tag" : "1.15.0" + "Thrust" : { + "version" : "1.17.0.0", + "git_url" : "https://github.com/NVIDIA/thrust.git", + "git_tag" : "1.17.0" + }, + "Legion": { + "git_url" : "https://gitlab.com/StanfordLegion/legion.git", + "git_tag" : "e1f1ef61e29c3160419d0cd528950b2d565c2a0d" } } } diff --git a/conda/conda-build/build.sh b/conda/conda-build/build.sh index 27b5aead1..317947dc4 100644 --- a/conda/conda-build/build.sh +++ b/conda/conda-build/build.sh @@ -16,7 +16,7 @@ CMAKE_ARGS+=" if [ -z "$CPU_ONLY" ]; then CMAKE_ARGS+=" -DLegion_USE_CUDA=ON --DCMAKE_CUDA_ARCHITECTURES:LIST=60-real;70-real;75-real;80-real;86 +-DCMAKE_CUDA_ARCHITECTURES:LIST=60-real;70-real;75-real;80-real;90 " fi diff --git a/conda/conda-build/conda_build_config.yaml b/conda/conda-build/conda_build_config.yaml index e970f469e..81326c39a 100644 --- a/conda/conda-build/conda_build_config.yaml +++ b/conda/conda-build/conda_build_config.yaml @@ -3,9 +3,9 @@ gpu_enabled: - false python: - - 3.8 - - 3.9 + - "3.9,!=3.9.7" - 3.10 + - 3.11 numpy_version: - ">=1.22" diff --git a/conda/conda-build/meta.yaml b/conda/conda-build/meta.yaml index 77722cab1..ace526e84 100644 --- a/conda/conda-build/meta.yaml +++ 
b/conda/conda-build/meta.yaml @@ -112,6 +112,7 @@ requirements: - cuda-nvtx ={{ cuda_version }} - cuda-cccl ={{ cuda_version }} - cuda-cudart ={{ cuda_version }} + - cuda-nvml-dev ={{ cuda_version }} - cuda-driver-dev ={{ cuda_version }} - cuda-cudart-dev ={{ cuda_version }} {% endif %} diff --git a/docs/figures/vision.png b/docs/figures/vision.png deleted file mode 100644 index 6cd072bac..000000000 Binary files a/docs/figures/vision.png and /dev/null differ diff --git a/docs/legate/core/Makefile b/docs/legate/core/Makefile index bff64057f..72ea3d7ea 100644 --- a/docs/legate/core/Makefile +++ b/docs/legate/core/Makefile @@ -19,6 +19,7 @@ # You can set these variables from the command line, and also # from the environment for the first two. SPHINXOPTS ?= -v -W +PARALLEL_BUILD ?= 1 SPHINXBUILD ?= legate $(shell which sphinx-build) SOURCEDIR = source BUILDDIR = build @@ -32,7 +33,7 @@ help: # Catch-all target: route all unknown targets to Sphinx using the new # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). %: Makefile - @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) -j $(PARALLEL_BUILD) rm -rf build/html/docs mkdir -p build/html/docs cp -r ../../figures build/html/docs/ diff --git a/docs/legate/core/source/api/allocation.rst b/docs/legate/core/source/api/allocation.rst new file mode 100644 index 000000000..5f71df89c --- /dev/null +++ b/docs/legate/core/source/api/allocation.rst @@ -0,0 +1,20 @@ +.. _label_allocation: + +.. currentmodule:: legate.core.allocation + +Inline mapping +============== + +When a client requests an immediate allocation of a store with +:meth:`legate.core.store.Store.get_inline_allocation`, the runtime gives you +back an ``InlineMappedAllocation`` object, which is a thin wrapper around the +allocation. 
Since the runtime needs to keep track of lifetimes of Python +objects using the allocation, the wrapper reveals the allocation to a callback +and not directly. Doing it this way allows the runtime to capture the object +constructed from the allocation and tie their lifetimes. + + +.. autosummary:: + :toctree: generated/ + + InlineMappedAllocation.consume diff --git a/docs/legate/core/source/api/classes.rst b/docs/legate/core/source/api/classes.rst new file mode 100644 index 000000000..f46811a82 --- /dev/null +++ b/docs/legate/core/source/api/classes.rst @@ -0,0 +1,14 @@ +------- +Classes +------- + +.. toctree:: + :maxdepth: 2 + + runtime + operation + store + allocation + shape +.. partition +.. legion diff --git a/docs/legate/core/source/api/index.rst b/docs/legate/core/source/api/index.rst index d5516f6dc..abe6a5ab9 100644 --- a/docs/legate/core/source/api/index.rst +++ b/docs/legate/core/source/api/index.rst @@ -5,43 +5,7 @@ API Reference .. currentmodule:: legate.core .. toctree:: - :maxdepth: 1 + :maxdepth: 2 -.. autosummary:: - :toctree: generated/ - - Point - Rect - Domain - Transform - AffineTransform - IndexSpace - PartitionFunctor - PartitionByRestriction - PartitionByImage - PartitionByImageRange - EqualPartition - PartitionByWeights - IndexPartition - FieldSpace - FieldID - Region - Partition - Fill - IndexFill - Copy - IndexCopy - Attach - Detach - Acquire - Release - Future - OutputRegion - PhysicalRegion - InlineMapping - Task - FutureMap - IndexTask - Fence - ArgumentMap - BufferBuilder + routines + classes diff --git a/docs/legate/core/source/api/operation.rst b/docs/legate/core/source/api/operation.rst new file mode 100644 index 000000000..f84a1bced --- /dev/null +++ b/docs/legate/core/source/api/operation.rst @@ -0,0 +1,120 @@ +.. _label_operation: + +.. currentmodule:: legate.core.operation + +Operations +========== + +Operations in Legate are by default automatically parallelized. 
Legate extracts +parallelism from an operation by partitioning its store arguments. Operations +usually require the partitions to be aligned in some way; e.g., partitioning +vectors across multiple addition tasks requires the vectors to be partitioned +in the same way. Legate provides APIs for developers to control how stores are +partitioned via `partitioning constraints`. + +When an operation needs a store to be partitioned more than one way, the +operation can create `partition symbols` and use them in partitioning +constraints. In that case, a partition symbol must be passed along with the +store when the store is added. Stores can be partitioned in multiple ways when +they are used only for read accesses or reductions. + +AutoTask +-------- + +``AutoTask`` is a type of tasks that are automatically parallelized. Each +Legate task is associated with a task id that uniquely names a task to invoke. +The actual task implementation resides on the C++ side. + +.. autosummary:: + :toctree: generated/ + + AutoTask.add_input + AutoTask.add_output + AutoTask.add_reduction + AutoTask.add_scalar_arg + AutoTask.declare_partition + AutoTask.add_constraint + AutoTask.add_alignment + AutoTask.add_broadcast + AutoTask.throws_exception + AutoTask.can_raise_exception + AutoTask.add_nccl_communicator + AutoTask.add_cpu_communicator + AutoTask.side_effect + AutoTask.set_concurrent + AutoTask.set_side_effect + AutoTask.execute + + +Copy +---- + +``Copy`` is a special kind of operation for copying data from one store to +another. Unlike tasks that are mapped to and run on application processors, +copies are performed by the DMA engine in the runtime. Also, unlike tasks that +are user-defined, copies have well-defined semantics and come with predefined +partitioning assumptions on stores. Hence, copies need not take partitioning +constraints from developers. + +A copy can optionally take a store for indices that need to be used in +accessing the source or target. 
With an `indirection` store on the source, the
+copy performs a gather operation, and with an indirection on the target, the
+copy does a scatter; when indirections exist for both the source and target,
+the copy turns into a full gather-scatter copy. Out-of-bounds indices are not
+checked and can produce undefined behavior. The caller therefore is responsible
+for making sure the indices are within bounds.
+
+.. autosummary::
+   :toctree: generated/
+
+   Copy.add_input
+   Copy.add_output
+   Copy.add_reduction
+   Copy.add_source_indirect
+   Copy.add_target_indirect
+   Copy.execute
+
+Fill
+----
+
+``Fill`` is a special kind of operation for filling a store with constant values.
+Like copies, fills are performed by the DMA engine and their partitioning
+constraints are predefined.
+
+.. autosummary::
+   :toctree: generated/
+
+   Fill.execute
+
+
+Manually Parallelized Tasks
+---------------------------
+
+On some occasions, tasks are unnatural or even impossible to write in the
+auto-parallelized style. For those occasions, Legate provides explicit control
+on how tasks are parallelized via ``ManualTask``. Each manual task requires the
+caller to provide a `launch domain` that determines the degree of parallelism
+and also names task instances initiated by the task. Direct store arguments to a
+manual task are assumed to be replicated across task instances, and it's the
+developer's responsibility to partition stores. Mapping between points in the
+launch domain and colors in the color space of a store partition is assumed to
+be an identity mapping by default, but it can be configured with a `projection
+function`, a Python function on tuples of coordinates. (See
+:ref:`StorePartition <label_store_partition>` for definitions of color,
+color space, and store partition.)
+
+.. 
autosummary:: + :toctree: generated/ + + ManualTask.side_effect + ManualTask.set_concurrent + ManualTask.set_side_effect + ManualTask.add_input + ManualTask.add_output + ManualTask.add_reduction + ManualTask.add_scalar_arg + ManualTask.throws_exception + ManualTask.can_raise_exception + ManualTask.add_nccl_communicator + ManualTask.add_cpu_communicator + ManualTask.execute diff --git a/docs/legate/core/source/api/routines.rst b/docs/legate/core/source/api/routines.rst new file mode 100644 index 000000000..78dd1609a --- /dev/null +++ b/docs/legate/core/source/api/routines.rst @@ -0,0 +1,11 @@ +.. currentmodule:: legate.core + +-------- +Routines +-------- + +.. autosummary:: + :toctree: generated/ + + get_legate_runtime + track_provenance diff --git a/docs/legate/core/source/api/runtime.rst b/docs/legate/core/source/api/runtime.rst new file mode 100644 index 000000000..8a4033e43 --- /dev/null +++ b/docs/legate/core/source/api/runtime.rst @@ -0,0 +1,101 @@ +.. _label_runtime: + +.. currentmodule:: legate.core + +Runtime and Library Contexts +============================ + +Library +------- + +A ``Library`` class is an interface that every library descriptor needs to +implement. Each library should tell the Legate runtime how to initialize and +configure the library, and this class provides a common way to reveal that +information to the runtime. Each library should register to the runtime a +library descriptor object that implements ``Library`` directly or via duck +typing. (See :meth:`legate.core.runtime.Runtime.register_library`.) + +.. autosummary:: + :toctree: generated/ + + Library.get_name + Library.get_shared_library + Library.get_c_header + Library.get_registration_callback + Library.get_resource_configuration + + +Resource configuration +---------------------- + +A ``ResourceConfig`` object describes the maximum number of handles that a +library uses. + +.. 
autosummary:: + :toctree: generated/ + + ResourceConfig.max_tasks + ResourceConfig.max_reduction_ops + ResourceConfig.max_mappers + + +Context +------- + +A ``Context`` object provides APIs for creating stores and issuing tasks and +other kinds of operations. When a library registers itself to the Legate +runtime, the runtime gives back a context object unique to the library. + +.. autosummary:: + :toctree: generated/ + + context.Context.create_store + context.Context.create_task + context.Context.create_manual_task + context.Context.create_auto_task + context.Context.create_copy + context.Context.create_fill + context.Context.issue_execution_fence + context.Context.tree_reduce + context.Context.get_tunable + context.Context.provenance + context.Context.annotation + context.Context.set_provenance + context.Context.reset_provenance + context.Context.push_provenance + context.Context.pop_provenance + context.Context.track_provenance + + +Legate Runtime +-------------- + +.. autosummary:: + :toctree: generated/ + + runtime.Runtime.num_cpus + runtime.Runtime.num_omps + runtime.Runtime.num_gpus + runtime.Runtime.register_library + runtime.Runtime.create_future + + +Annotation +---------- + +An ``Annotation`` is a context manager to set library specific annotations that +are to be attached to operations issued within a scope. A typical usage of +``Annotation`` would look like this: + +:: + + with Annotation(lib_context, { "key1" : "value1", "key2" : "value2", ... }: + ... + +Then each operation in the scope is annotated with the key-value pairs, +which are later rendered in execution profiles. + +.. autosummary:: + :toctree: generated/ + + context.Annotation.__init__ diff --git a/docs/legate/core/source/api/shape.rst b/docs/legate/core/source/api/shape.rst new file mode 100644 index 000000000..46b4aeba1 --- /dev/null +++ b/docs/legate/core/source/api/shape.rst @@ -0,0 +1,62 @@ +.. 
currentmodule:: legate.core.shape + +Shape +===== + +A ``Shape`` is used in expressing the shape of a certain entity in Legate. The +reason Legate introduces this indirection to the shape metadata is that stores +in Legate can have unknown shapes at creation time; the shape of an unbound +store is determined only when the producer task finishes. The shape object can +help the runtime query the store's metadata or construct another store +isomorphic to the store without getting blocked. + +Shape objects should behave just like an array of integers, but operations that +introspect the values implicitly block on completion of the producer task. + + +.. autosummary:: + :toctree: generated/ + + Shape.__init__ + + +Properties +---------- +.. autosummary:: + :toctree: generated/ + + Shape.extents + Shape.fixed + Shape.ndim + Shape.volume + Shape.sum + Shape.strides + + +Manipulation Methods +-------------------- +.. autosummary:: + :toctree: generated/ + + Shape.drop + Shape.update + Shape.replace + Shape.insert + Shape.map + + +Arithmetic and comparison +------------------------- +.. autosummary:: + :toctree: generated/ + + Shape.__eq__ + Shape.__le__ + Shape.__lt__ + Shape.__ge__ + Shape.__gt__ + Shape.__add__ + Shape.__sub__ + Shape.__mul__ + Shape.__mod__ + Shape.__floordiv__ diff --git a/docs/legate/core/source/api/store.rst b/docs/legate/core/source/api/store.rst new file mode 100644 index 000000000..0b95ca443 --- /dev/null +++ b/docs/legate/core/source/api/store.rst @@ -0,0 +1,125 @@ +.. currentmodule:: legate.core.store + +Store +===== + +`Store` is a multi-dimensional data container for fixed-size elements. Stores +are internally partitioned and distributed across the system. By default, +Legate clients need not create nor maintain the partitions explicitly, and the +Legate runtime is responsible for managing them. 
Legate clients can control how +stores should be partitioned for a given task by attaching partitioning +constraints to the task (see section :ref:`label_operation` for partitioning +constraint APIs). + +Each Store object is a logical handle to the data and is not immediately +associated with a physical allocation. To access the data, a client must +`map` the store to a physical instance. A client can map a store by passing +it to a task, in which case the task body can see the allocation, or calling +``get_inline_allocation``, which gives the client a linear handle to the +physical allocation (see section :ref:`label_allocation` for details about +inline allocations). + +Normally, a store gets a fixed shape upon creation. However, there is a special +type of stores called `unbound` stores whose shapes are unknown at creation +time. (see section :ref:`label_runtime` for the store creation API.) The shape +of an unbound store is determined by a task that first updates the store; upon +the submission of the task, the store becomes a normal store. Passing an +unbound store as a read-only argument or requesting an inline allocation of an +unbound store are invalid. + +One consequence due to the nature of unbound stores is that querying the shape +of a previously unbound store can block the client's control flow for an +obvious reason; to know the shape of the store whose shape was unknown at +creation time, the client must wait until the updater task to finish. However, +passing a previously unbound store to a downstream operation can be +non-blocking, as long as the operation requires no changes in the partitioning +and mapping for the store. + + +Basic Properties +---------------- + +.. autosummary:: + :toctree: generated/ + + Store.shape + Store.ndim + Store.size + Store.type + Store.kind + Store.unbound + Store.scalar +.. Store.extents + + +Transformation +-------------- + +Legate provides several API calls to transform stores. 
A store after a
+transformation is a view to the original store; i.e., any changes made to the
+transformed store are visible via the original one and vice versa.
+
+.. autosummary::
+   :toctree: generated/
+
+   Store.transform
+   Store.transformed
+   Store.promote
+   Store.project
+   Store.slice
+   Store.transpose
+   Store.delinearize
+
+
+Storage management
+------------------
+
+.. autosummary::
+   :toctree: generated/
+
+   Store.get_inline_allocation
+.. Store.storage
+.. Store.has_storage
+
+
+Partition management
+--------------------
+
+In most cases, Legate clients need not create nor manage partitions manually by
+themselves. However, there are occasions where the clients need to parallelize
+tasks manually, for which stores need to be partitioned manually as well. For
+those occasions, clients may want to query and update the `key` partition of
+each store, i.e., the partition used for updating the store for the last time.
+The following are the API calls for manual partition management.
+
+.. autosummary::
+   :toctree: generated/
+
+   Store.get_key_partition
+   Store.set_key_partition
+   Store.reset_key_partition
+   Store.partition_by_tiling
+
+
+.. _label_store_partition:
+
+StorePartition
+==============
+
+A ``StorePartition`` is an object that represents a partitioned state of a
+store. A store partition is a name of a collection of `sub-stores`, each of
+which contains a subset of elements in the store. Sub-stores in a store
+partition are uniquely identified by their `colors`, and a set of all colors
+of a given store partition is called a `color space`.
+
+It is recommended that store partitions and their sub-stores be used as
+arguments to ``ManualTask`` (see section :ref:`label_operation` for APIs for
+manual parallelization).
+
+
+.. 
autosummary:: + :toctree: generated/ + + StorePartition.store + StorePartition.partition + StorePartition.get_child_store diff --git a/docs/legate/core/source/index.rst b/docs/legate/core/source/index.rst index 5475b6fa6..b968d3738 100644 --- a/docs/legate/core/source/index.rst +++ b/docs/legate/core/source/index.rst @@ -6,7 +6,8 @@ Welcome to Legate Core's documentation! Overview Build instructions - API Reference + Python API Reference + C++ API Reference Contributing Versions diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt new file mode 100644 index 000000000..fdee93f24 --- /dev/null +++ b/examples/CMakeLists.txt @@ -0,0 +1,5 @@ +# We abuse find package for testing purposes here to +# 'find' the current build tree to test package builds +set(legate_core_ROOT ${CMAKE_BINARY_DIR}) + +add_subdirectory(hello) diff --git a/examples/hello/CMakeLists.txt b/examples/hello/CMakeLists.txt new file mode 100644 index 000000000..f163cfc7c --- /dev/null +++ b/examples/hello/CMakeLists.txt @@ -0,0 +1,14 @@ +cmake_minimum_required(VERSION 3.22.1 FATAL_ERROR) + +project(hello VERSION 1.0 LANGUAGES C CXX) + +set(CMAKE_CXX_STANDARD 17) +set(BUILD_SHARED_LIBS ON) + +find_package(legate_core REQUIRED) + +legate_add_cpp_subdirectory(src TARGET hello EXPORT hello-export) + +legate_add_cffi(${CMAKE_CURRENT_SOURCE_DIR}/src/hello_cffi.h TARGET hello) +legate_python_library_template(hello) +legate_default_python_install(hello EXPORT hello-export) diff --git a/examples/hello/README.md b/examples/hello/README.md new file mode 100644 index 000000000..d2a26a40e --- /dev/null +++ b/examples/hello/README.md @@ -0,0 +1,218 @@ +# Legate Hello World Application + +Here we illustrate a minimal example to get a Legate library up and running. +The example here shows how to get started with the minimum amount of boilerplate. +For advanced use cases, the boilerplate generated can be customized as needed. +In general, a Legate application will need to implement three pieces. + +1. 
Build system
+1. C++ tasks
+1. Python library
+
+Please refer to the README in the [Legate repo](https://github.com/nv-legate/legate.core/blob/HEAD/README.md)
+for first installing `legate.core`. We strongly recommend creating a Conda environment for development and testing.
+
+# Build System
+
+## Build Steps
+
+To build the project, the user can do the following:
+
+```
+$ cmake -S . -B build
+$ cmake --build build
+$ python -m pip install -e .
+```
+
+This performs an editable install of the project, which we recommend for development.
+If `cmake` fails to find Legate, the path to the installed Legate can be manually
+specified as `-Dlegate_core_ROOT=<...>` to the `cmake` configuration.
+Alternatively, the user can just do a regular pip installation:
+
+```
+$ python -m pip install .
+```
+
+These approaches are illustrated in the `editable-install.sh` and `install.sh` scripts.
+In particular, `editable-install.sh` shows how to use Legate install info to
+point CMake to the correct installation root.
+
+## CMake
+CMake is the officially supported mechanism for building Legate libraries.
+Legate exports a CMake target and helper functions for building libraries and provides by-far the easiest onboarding.
+There are only a few main steps in setting up a build system.
+First, the user should initialize a CMake project.
+
+```cmake
+cmake_minimum_required(VERSION 3.24.0 FATAL_ERROR)
+
+project(hello VERSION 1.0 LANGUAGES C CXX)
+```
+
+Next the user needs to find an existing Legate core:
+
+```cmake
+find_package(legate_core REQUIRED)
+```
+
+Once the `legate_core` package is located, a number of helper functions will be available.
+In a source folder, the user can define a library that will implement the C++ tasks: + +```cmake +legate_cpp_library_template(hello TEMPLATE_SOURCES) + +add_library( + hello + hello_world.cc + hello_world.h + ${TEMPLATE_SOURCES} +) +target_link_libraries(hello PRIVATE legate::core) +``` + +First, a helper function is invoked to generate the Legate C++ boilerplate files. +The list of generated files is returned in the `TEMPLATE_SOURCES` variable. +Second, the CMake library is linked against the imported `legate::core` target. + +Two helper functions are provided to generate the Python boilerplate. +In the top-level CMakeLists.txt, the Python-C bindings can be generated using CFFI: + +```cmake +legate_add_cffi(${CMAKE_SOURCE_DIR}/src/hello_world.h TARGET hello) +``` + +The header file is implemented by the user and contains all the enums required +to implement a Legate library. The necessary Python file is generated in the `hello` +subdirectory. Additionally, the user may want to generate a standard `library.py` +in the Python `hello` folder: + +```cmake +legate_python_library_template(hello) +``` + +Finally, default pip installation hooks (via scikit-build) can be added: + +```cmake +legate_default_python_install(hello EXPORT hello-export) +``` + +## Editable Builds + +Although the final user Python library will likely be installed with `pip`, +the user will usually need to iterate on the C++ implementation of tasks +for debugging and optmization. The user will therefore want to be able +to first build the C++ pieces of the project and then install the Python. +To support this workflow, legate provides a helper function: + +```cmake +legate_add_cpp_subdirectory(src hello EXPORT hello-export) +``` +This encapsulates the build target `hello` so that the C++ library can +be first built with CMake and then pip-installed in a separate step. 
+This is optional, though, and the entire build can always be executed by +doing a regular pip install: + +``` +$ python -m pip install . +``` + +# C++ tasks + +First, a `hello_world.h` header is needed to define all enums. In this case, +we have enums identifying the different task types: + +```cpp +enum HelloOpCode { + _OP_CODE_BASE = 0, + HELLO_WORLD_TASK = 1, +}; +``` + +We implement this CPU-only task in a `hello_world.cc`. + +```cpp +#include "legate_library.h" +#include "hello_world.h" + +namespace hello { +``` + +The source file should include the library header and the generated file `legate_library.h`. +Because the target was named `hello` in the build files, all generated files create types +in the `hello` namespace. + +The task implementation is simple: + +```cpp +class HelloWorldTask : public Task { + public: + static void cpu_variant(legate::TaskContext& context){ + std::string message = context.scalars()[0].value(); + std::cout << message << std::endl; + } +}; +``` +Here we define a CPU variant. The task is given the unique enum ID from `hello_world.h`. +The task unpacks a string from the input context and prints it. +Task types needed to be statically registered, which requires a bit of extra boilerplate: + +```cpp +namespace +{ + +static void __attribute__((constructor)) register_tasks(void) +{ + hello::HelloWorldTask::register_variants(); +} + +} +``` + +Any tasks instantiated in the Python library will ultimately invoke this C++ task. + +# Python library + +The example uses two generated files `library.py` and `install_info.py`. +The implementation of tasks is provided in the `hello.py` file. +First, we have to import a few types and a context object for creating tasks. +The context object is automatically created in the generated boilerplate. 
+ +```python +from .library import user_context, user_lib +from enum import IntEnum +from legate.core import Rect +import legate.core.types as types +``` + +The C++ enums can be mapped into Python: + +```python +class HelloOpCode(IntEnum): + HELLO_WORLD = user_lib.cffi.HELLO_WORLD_TASK +``` + +The example here provides two library functions. The first prints a single message. +The second prints a fixed number of of messages. For `print_hello`, +a new task is created in `user_context`. The message string is added as a scalar argument. +In the second example, a launch domain for a fixed `n` is provided. + +These library functions can now be imported and used in python. +This is shown in `examples/hello.py`: + +``` +from hello import print_hello + +print_hello("Hello, world") +``` + + +# Examples + +The tutorial contains a few examples that illustate key Legate concepts: + +1. [Hello World](examples/hello-world.md): Shows the basics of creating tasks and adding task arguments. +1. [Variance](examples/variance.md): Shows how to create input arrays and tasks operating on partitioned data. +Also shows how to perform reduction tasks like summation. + + + diff --git a/examples/hello/editable-install.sh b/examples/hello/editable-install.sh new file mode 100755 index 000000000..5c5774ece --- /dev/null +++ b/examples/hello/editable-install.sh @@ -0,0 +1,5 @@ +legate_root=`python -c 'import legate.install_info as i; from pathlib import Path; print(Path(i.libpath).parent.resolve())'` +echo "Using Legate at $legate_root" +cmake -S . -B build -D legate_core_ROOT=$legate_root +cmake --build build +python -m pip install -e . 
diff --git a/examples/hello/examples/cunumeric-variance.py b/examples/hello/examples/cunumeric-variance.py new file mode 100644 index 000000000..cfaed064d --- /dev/null +++ b/examples/hello/examples/cunumeric-variance.py @@ -0,0 +1,26 @@ +from typing import Any + +import cunumeric +import numpy as np +from hello import square, sum, to_scalar + +from legate.core import Store + + +def mean_and_variance(a: Any, n: int) -> float: + a_sq: Store = square(a) # A 1-D array of shape (4,) + sum_sq: Store = sum(a_sq) # A scalar sum + sum_a: Store = sum(a) # A scalar sum + + # Extract scalar values from the Legate stores + mean_a: float = to_scalar(sum_a) / n + mean_sum_sq: float = to_scalar(sum_sq) / n + variance = mean_sum_sq - mean_a * mean_a + return mean_a, variance + + +# Example: Use a random array from cunumeric +n = 4 +a = cunumeric.random.randn(n).astype(np.float32) +print(a) +print(mean_and_variance(a, n)) diff --git a/examples/hello/examples/hello-world.md b/examples/hello/examples/hello-world.md new file mode 100644 index 000000000..5df75a854 --- /dev/null +++ b/examples/hello/examples/hello-world.md @@ -0,0 +1,37 @@ +# Basic Hello, World Application + +The code for this example can be found in the [library file](../hello/hello.py) and [example](hello-world.py). + +## Single, auto task + +Generally auto tasks should be preferred that automatically +partition and parallelize task launches. +In the hello world example, only a single scalar argument +is added and the task is enqueued with `execute`: + +``` +task = user_context.create_auto_task(HelloOpCode.HELLO_WORLD) +task.add_scalar_arg(message, types.string) +task.execute() +``` + +In this case, the cost heuristic in the runtime will notice +that the task is inexpensive and launch a single instance. + +## Manual task with explicit launch domain + +It is possibly to manually specify the launch domain for a task, +overriding the internal heuristics. 
+ +``` +launch_domain = Rect(lo=[0], hi=[n], exclusive=True) +task = user_context.create_manual_task( + HelloOpCode.HELLO_WORLD, launch_domain=launch_domain +) +task.add_scalar_arg(message, types.string) +task.execute() +``` + +Now `n` replica tasks will be launched. In this case, +the `Rect` launch domain is linear, but multi-dimensional domains +are also possible. diff --git a/examples/hello/examples/hello-world.py b/examples/hello/examples/hello-world.py new file mode 100644 index 000000000..15728fb48 --- /dev/null +++ b/examples/hello/examples/hello-world.py @@ -0,0 +1,5 @@ +from hello import print_hello, print_hellos + +print_hello("Hello, world") + +print_hellos(message="Romanes eunt domus", n=2) diff --git a/examples/hello/examples/variance.md b/examples/hello/examples/variance.md new file mode 100644 index 000000000..c11042823 --- /dev/null +++ b/examples/hello/examples/variance.md @@ -0,0 +1,67 @@ +# Variance Example + +The code for this example can be found in the [library file](../hello/hello.py) and [example](variance.py). + +## Creating a store + +As seen in the `iota` task, a store can be created from a context as, e.g. + +``` +output = user_context.create_store( + types.float32, + shape=(size,), + optimize_scalar=True, +) +``` + +At this point, the store may not be allocated or contain data, +but can still be passed to tasks as a valid output handle. + +## Elementwise task with aligned partitions + +Tasks are also created on a context: + +``` +task = user_context.create_auto_task(HelloOpCode.SQUARE) + +task.add_input(input) +task.add_output(output) +task.add_alignment(input, output) +task.execute() +``` + +An auto task indicates Legate should auto-partition based +on cost heuristics and partitioning constraints. +An input and output array are added. +The most critical step here, though, is the alignment of +the input and output. Since we want to do elementwise operations, +we need the input and output partitions to be aligned. 
+This expresses an auto-partitioning constraint. +Finally, the task is enqueued by calling its `execute` method. + +## Reduction (Summation) + +We similarly set up a task, but now add the output +as a reduction. + +``` +task = user_context.create_auto_task(HelloOpCode.SUM) + +task.add_input(input) +task.add_reduction(output, types.ReductionOp.ADD) +task.execute() +``` + +The output is a scalar, which means there is no partitioning +alignment constraint with input and output. + +## Using data from other Legate libraries + +Data structures from other libraries (e.g. cunumeric) +can be passed into functions from other Legate libraries, +even if the libraries are unaware of each other. +Legate provides a common interface for data structures +to provide a schema and access to its underlying stores. +This is shown in the `_get_legate_store` function via +the `__legate_data_interface__`. + diff --git a/examples/hello/examples/variance.py b/examples/hello/examples/variance.py new file mode 100644 index 000000000..a8272eec9 --- /dev/null +++ b/examples/hello/examples/variance.py @@ -0,0 +1,23 @@ +from typing import Any + +from hello import iota, square, sum, to_scalar + +from legate.core import Store + + +def mean_and_variance(a: Any, n: int) -> float: + a_sq: Store = square(a) # A 1-D array of shape (4,) + sum_sq: Store = sum(a_sq) # A scalar sum + sum_a: Store = sum(a) # A scalar sum + + # Extract scalar values from the Legate stores + mean_a: float = to_scalar(sum_a) / n + mean_sum_sq: float = to_scalar(sum_sq) / n + variance = mean_sum_sq - mean_a * mean_a + return mean_a, variance + + +# Example: Use a basic 1,2,3,4 array +n = 4 +a = iota(n) +print(mean_and_variance(a, n)) diff --git a/examples/hello/hello/__init__.py b/examples/hello/hello/__init__.py new file mode 100644 index 000000000..6e38a3cde --- /dev/null +++ b/examples/hello/hello/__init__.py @@ -0,0 +1,2 @@ +from .library import user_lib +from .hello import iota, print_hello, print_hellos, square, sum, 
to_scalar diff --git a/examples/hello/hello/hello.py b/examples/hello/hello/hello.py new file mode 100644 index 000000000..6548806b2 --- /dev/null +++ b/examples/hello/hello/hello.py @@ -0,0 +1,169 @@ +import struct +from enum import IntEnum +from typing import Any + +import numpy as np + +import legate.core.types as types +from legate.core import Rect, Store, get_legate_runtime + +from .library import user_context, user_lib + + +class HelloOpCode(IntEnum): + HELLO_WORLD = user_lib.cffi.HELLO_WORLD + SUM = user_lib.cffi.SUM + SQUARE = user_lib.cffi.SQUARE + IOTA = user_lib.cffi.IOTA + + +def print_hello(message: str) -> None: + """Create a Legate task launch to print a message + + Args: + message (str): The message to print + """ + task = user_context.create_auto_task(HelloOpCode.HELLO_WORLD) + task.add_scalar_arg(message, types.string) + task.execute() + + +def print_hellos(message: str, n: int) -> None: + """Create a Legate task launch to print a message n times, + using n replicas of the task + + Args: + message (str): The message to print + n (int): The number of times to print + """ + launch_domain = Rect(lo=[0], hi=[n]) + task = user_context.create_manual_task( + HelloOpCode.HELLO_WORLD, launch_domain=launch_domain + ) + task.add_scalar_arg(message, types.string) + task.execute() + + +def _get_legate_store(input: Any) -> Store: + """Extracts a Legate store from any object + implementing the legete data interface + + Args: + input (Any): The input object + + Returns: + Store: The extracted Legate store + """ + if isinstance(input, Store): + return input + data = input.__legate_data_interface__["data"] + field = next(iter(data)) + array = data[field] + _, store = array.stores() + return store + + +def to_scalar(input: Store) -> float: + """Extracts a Python scalar value from a Legate store + encapsulating a single scalar + + Args: + input (Store): The Legate store encapsulating a scalar + + Returns: + float: A Python scalar + """ + # This operation blocks until 
the data in the Store + # is available and correct + buf = input.storage.get_buffer(np.float32().itemsize) + result = np.frombuffer(buf, dtype=np.float32, count=1) + return float(result[0]) + + +def zero() -> Store: + """Creates a Legate store representing a single zero scalar + + Returns: + Store: A Legate store representing a scalar zero + """ + data = bytearray(4) + buf = struct.pack(f"{len(data)}s", data) + future = get_legate_runtime().create_future(buf, len(buf)) + return user_context.create_store( + types.float32, + shape=(1,), + storage=future, + optimize_scalar=True, + ) + + +def iota(size: int) -> Store: + """Enqueues a task that will generate a 1-D array + 1,2,...size. + + Args: + size (int): The number of elements to generate + + Returns: + Store: The Legate store that will hold the iota values + """ + output = user_context.create_store( + types.float32, + shape=(size,), + optimize_scalar=True, + ) + task = user_context.create_auto_task( + HelloOpCode.IOTA, + ) + task.add_output(output) + task.execute() + return output + + +def sum(input: Any) -> Store: + """Sums a 1-D array into a single scalar + + Args: + input (Any): A Legate store or any object implementing + the Legate data interface. + + Returns: + Store: A Legate store encapsulating the array sum + """ + input_store = _get_legate_store(input) + + task = user_context.create_auto_task(HelloOpCode.SUM) + + # zero-initialize the output for the summation + output = zero() + + task.add_input(input_store) + task.add_reduction(output, types.ReductionOp.ADD) + task.execute() + return output + + +def square(input: Any) -> Store: + """Computes the elementwise square of a 1-D array + + Args: + input (Any): A Legate store or any object implementing + the Legate data interface. 
+ + Returns: + Store: A Legate store encapsulating a 1-D array + holding the elementwise square values + """ + input_store = _get_legate_store(input) + + output = user_context.create_store( + types.float32, shape=input_store.shape, optimize_scalar=True + ) + task = user_context.create_auto_task(HelloOpCode.SQUARE) + + task.add_input(input_store) + task.add_output(output) + task.add_alignment(input_store, output) + task.execute() + + return output diff --git a/examples/hello/install.sh b/examples/hello/install.sh new file mode 100755 index 000000000..b4d1f47d4 --- /dev/null +++ b/examples/hello/install.sh @@ -0,0 +1 @@ +python -m pip install . diff --git a/examples/hello/setup.py b/examples/hello/setup.py new file mode 100644 index 000000000..f919989b5 --- /dev/null +++ b/examples/hello/setup.py @@ -0,0 +1,59 @@ +#!/usr/bin/env python3 + +# Copyright 2021-2022 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +import os +from pathlib import Path + +from setuptools import find_packages +from skbuild import setup + +import legate.install_info as lg_install_info + +legate_dir = Path(lg_install_info.libpath).parent.as_posix() + +cmake_flags = [ + f"-Dlegate_core_ROOT:STRING={legate_dir}", +] + +env_cmake_args = os.environ.get("CMAKE_ARGS") +if env_cmake_args is not None: + cmake_flags.append(env_cmake_args) +os.environ["CMAKE_ARGS"] = " ".join(cmake_flags) + + +setup( + name="Legate Hello", + version="0.1", + description="A Hello World for Legate", + author="NVIDIA Corporation", + license="Apache 2.0", + classifiers=[ + "Intended Audience :: Developers", + "Topic :: Database", + "Topic :: Scientific/Engineering", + "License :: OSI Approved :: Apache Software License", + "Programming Language :: Python", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + ], + packages=find_packages( + where=".", + include=["hello", "hello.*"], + ), + include_package_data=True, + zip_safe=False, +) diff --git a/examples/hello/src/CMakeLists.txt b/examples/hello/src/CMakeLists.txt new file mode 100644 index 000000000..dfe343722 --- /dev/null +++ b/examples/hello/src/CMakeLists.txt @@ -0,0 +1,22 @@ + +legate_cpp_library_template(hello TEMPLATE_SOURCES) + +add_library( + hello + hello_world.cc + hello_world.h + iota.cc + sum.cc + square.cc + ${TEMPLATE_SOURCES} +) + +target_include_directories(hello + PRIVATE + $ + INTERFACE + $ +) + +target_link_libraries(hello PRIVATE legate::core) + diff --git a/examples/hello/src/hello_cffi.h b/examples/hello/src/hello_cffi.h new file mode 100644 index 000000000..c9b8a2c33 --- /dev/null +++ b/examples/hello/src/hello_cffi.h @@ -0,0 +1,7 @@ +enum HelloOpCode { + _OP_CODE_BASE = 0, + HELLO_WORLD = 1, + SUM = 2, + SQUARE = 3, + IOTA = 4, +}; diff --git a/examples/hello/src/hello_world.cc b/examples/hello/src/hello_world.cc new file mode 100644 index 000000000..9cea64c42 --- 
/dev/null +++ b/examples/hello/src/hello_world.cc @@ -0,0 +1,25 @@ +#include "hello_world.h" +#include "legate_library.h" + +namespace hello { + +Legion::Logger logger("legate.hello"); + +class HelloWorldTask : public Task { +public: + static void cpu_variant(legate::TaskContext &context) { + std::string message = context.scalars()[0].value(); + std::cout << message << std::endl; + } +}; + +} // namespace hello + +namespace // unnamed +{ + +static void __attribute__((constructor)) register_tasks(void) { + hello::HelloWorldTask::register_variants(); +} + +} // namespace diff --git a/examples/hello/src/hello_world.h b/examples/hello/src/hello_world.h new file mode 100644 index 000000000..8b1c7fb8e --- /dev/null +++ b/examples/hello/src/hello_world.h @@ -0,0 +1,10 @@ +#pragma once + +#include "hello_cffi.h" +#include "legate_library.h" + +namespace hello { + +extern Legion::Logger logger; + +} \ No newline at end of file diff --git a/examples/hello/src/iota.cc b/examples/hello/src/iota.cc new file mode 100644 index 000000000..4f408008b --- /dev/null +++ b/examples/hello/src/iota.cc @@ -0,0 +1,33 @@ +#include "hello_world.h" +#include "legate_library.h" + +namespace hello { + +class IotaTask : public Task { +public: + static void cpu_variant(legate::TaskContext &context) { + + legate::Store &output = context.outputs()[0]; + legate::Rect<1> output_shape = output.shape<1>(); + auto out = output.write_accessor(); + + logger.info() << "Iota task [" << output_shape.lo << "," << output_shape.hi + << "]"; + + // i is a global index for the complete array + for (size_t i = output_shape.lo; i <= output_shape.hi; ++i) { + out[i] = i + 1; + } + } +}; + +} // namespace hello + +namespace // unnamed +{ + +static void __attribute__((constructor)) register_tasks(void) { + hello::IotaTask::register_variants(); +} + +} // namespace diff --git a/examples/hello/src/square.cc b/examples/hello/src/square.cc new file mode 100644 index 000000000..eb44365dc --- /dev/null +++ 
b/examples/hello/src/square.cc @@ -0,0 +1,44 @@ +#include "hello_world.h" +#include "legate_library.h" + +namespace hello { + +class SquareTask : public Task { +public: + static void cpu_variant(legate::TaskContext &context) { + legate::Store &output = context.outputs()[0]; + // Best-practice to validate the store types + assert(output.code() == FLOAT_LT); + assert(output.dim() == 1); + legate::Rect<1> output_shape = output.shape<1>(); + auto out = output.write_accessor(); + + legate::Store &input = context.inputs()[0]; + // Best-practice to validate the store types + assert(input.code() == FLOAT_LT); + assert(input.dim() == 1); + legate::Rect<1> input_shape = input.shape<1>(); // should be a 1-Dim array + auto in = input.read_accessor(); + + assert(input_shape == output_shape); + + logger.info() << "Elementwise square [" << output_shape.lo << "," + << output_shape.hi << "]"; + + // i is a global index for the complete array + for (size_t i = input_shape.lo; i <= input_shape.hi; ++i) { + out[i] = in[i] * in[i]; + } + } +}; + +} // namespace hello + +namespace // unnamed +{ + +static void __attribute__((constructor)) register_tasks(void) { + hello::SquareTask::register_variants(); +} + +} // namespace diff --git a/examples/hello/src/sum.cc b/examples/hello/src/sum.cc new file mode 100644 index 000000000..e736600f2 --- /dev/null +++ b/examples/hello/src/sum.cc @@ -0,0 +1,49 @@ +#include "hello_world.h" +#include "legate_library.h" + +namespace hello { + +class SumTask : public Task { +public: + static void cpu_variant(legate::TaskContext &context) { + legate::Store &input = context.inputs()[0]; + legate::Rect<1> input_shape = input.shape<1>(); // should be a 1-Dim array + auto in = input.read_accessor(); + + logger.info() << "Sum [" << input_shape.lo << "," << input_shape.hi << "]"; + + float total = 0; + // i is a global index for the complete array + for (size_t i = input_shape.lo; i <= input_shape.hi; ++i) { + total += in[i]; + } + + /** + The task launch as a 
whole will return a single value (Store of size 1) + to the caller. However, each point task gets a separate Store of the + same size as the result, to reduce into. This "local accumulator" will + be initialized by the runtime, and all we need to do is call .reduce() + to add our local contribution. After all point tasks return, the runtime + will make sure to combine all their buffers into the single final result. + */ + using Reduce = Legion::SumReduction; + legate::Store &output = context.reductions()[0]; + auto sum = output.reduce_accessor(); + // Best-practice is to validate types + assert(output.code() == FLOAT_LT); + assert(output.dim() == 1); + assert(output.shape<1>() == legate::Rect<1>(0, 0)); + sum.reduce(0, total); + } +}; + +} // namespace hello + +namespace // unnamed +{ + +static void __attribute__((constructor)) register_tasks(void) { + hello::SumTask::register_variants(); +} + +} // namespace diff --git a/install.py b/install.py index 56c508ad9..4c94b3b32 100755 --- a/install.py +++ b/install.py @@ -241,6 +241,7 @@ def install( hdf, llvm, spy, + build_docs, conduit, nccl_dir, cmake_exe, @@ -288,6 +289,7 @@ def install( print("hdf:", hdf) print("llvm:", llvm) print("spy:", spy) + print("build_docs:", build_docs) print("conduit:", conduit) print("nccl_dir:", nccl_dir) print("cmake_exe:", cmake_exe) @@ -324,12 +326,14 @@ def install( print("Using python lib and version: {}, {}".format(pylib_name, pyversion)) def validate_path(path): - if path is not None and (path := str(path)) != "": - if not os.path.isabs(path): - path = join(legate_core_dir, path) - if exists(path := realpath(path)): - return path - return None + if path is None or (path := str(path)) == "": + return None + if not os.path.isabs(path): + path = join(legate_core_dir, path) + if not exists(path := realpath(path)): + print(f"Error: path does not exist: {path}") + sys.exit(1) + return path cuda_dir = validate_path(cuda_dir) nccl_dir = validate_path(nccl_dir) @@ -413,7 +417,7 @@ def 
validate_path(path): cmake_flags = cmd_env.get("CMAKE_ARGS", "").split(" ") if debug or verbose: - cmake_flags += ["--log-level=%s" % ("DEBUG" if debug else "VERBOSE")] + cmake_flags += [f"--log-level={'DEBUG' if debug else 'VERBOSE'}"] cmake_flags += f"""\ -DCMAKE_BUILD_TYPE={( @@ -441,27 +445,29 @@ def validate_path(path): """.splitlines() if nccl_dir: - cmake_flags += ["-DNCCL_DIR=%s" % nccl_dir] + cmake_flags += [f"-DNCCL_DIR={nccl_dir}"] if gasnet_dir: - cmake_flags += ["-DGASNet_ROOT_DIR=%s" % gasnet_dir] + cmake_flags += [f"-DGASNet_ROOT_DIR={gasnet_dir}"] if ucx_dir: - cmake_flags += ["-DUCX_ROOT=%s" % ucx_dir] + cmake_flags += [f"-DUCX_ROOT={ucx_dir}"] if conduit: - cmake_flags += ["-DGASNet_CONDUIT=%s" % conduit] + cmake_flags += [f"-DGASNet_CONDUIT={conduit}"] if cuda_dir: - cmake_flags += ["-DCUDA_TOOLKIT_ROOT_DIR=%s" % cuda_dir] + cmake_flags += [f"-DCUDAToolkit_ROOT={cuda_dir}"] if thrust_dir: - cmake_flags += ["-DThrust_ROOT=%s" % thrust_dir] + cmake_flags += [f"-DThrust_ROOT={thrust_dir}"] if legion_dir: - cmake_flags += ["-DLegion_ROOT=%s" % legion_dir] + cmake_flags += [f"-DLegion_ROOT={legion_dir}"] elif legion_src_dir: - cmake_flags += ["-DCPM_Legion_SOURCE=%s" % legion_src_dir] + cmake_flags += [f"-DCPM_Legion_SOURCE={legion_src_dir}"] else: cmake_flags += ["-DCPM_DOWNLOAD_Legion=ON"] if legion_url: - cmake_flags += ["-Dlegate_core_LEGION_REPOSITORY=%s" % legion_url] + cmake_flags += [f"-Dlegate_core_LEGION_REPOSITORY={legion_url}"] if legion_branch: - cmake_flags += ["-Dlegate_core_LEGION_BRANCH=%s" % legion_branch] + cmake_flags += [f"-Dlegate_core_LEGION_BRANCH={legion_branch}"] + if build_docs: + cmake_flags += ["-Dlegate_core_BUILD_DOCS=ON"] cmake_flags += extra_flags build_flags = [f"-j{str(thread_count)}"] @@ -618,6 +624,14 @@ def driver(): default=os.environ.get("USE_SPY", "0") == "1", help="Build Legate with detailed Legion Spy enabled.", ) + parser.add_argument( + "--docs", + dest="build_docs", + action="store_true", + required=False, 
+ default=False, + help="Build Doxygen docs.", + ) parser.add_argument( "--conduit", dest="conduit", @@ -736,14 +750,14 @@ def driver(): "--legion-url", dest="legion_url", required=False, - default="https://gitlab.com/StanfordLegion/legion.git", + default=None, help="Legion git URL to build Legate with.", ) parser.add_argument( "--legion-branch", dest="legion_branch", required=False, - default="04cf06a2", + default=None, help="Legion branch to build Legate with.", ) args, unknown = parser.parse_known_args() @@ -761,7 +775,7 @@ def driver(): ) print("to specify the CMake executable if it is not on PATH.") print() - print("Attempted to execute: %s" % args.cmake_exe) + print(f"Attempted to execute: {args.cmake_exe}") sys.exit(1) install(unknown=unknown, **vars(args)) diff --git a/legate/_sphinxext/__init__.py b/legate/_sphinxext/__init__.py new file mode 100644 index 000000000..dca0f1315 --- /dev/null +++ b/legate/_sphinxext/__init__.py @@ -0,0 +1,14 @@ +# Copyright 2023 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# diff --git a/legate/_sphinxext/settings.py b/legate/_sphinxext/settings.py new file mode 100644 index 000000000..0cba5d1b1 --- /dev/null +++ b/legate/_sphinxext/settings.py @@ -0,0 +1,108 @@ +# Copyright 2023 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +from __future__ import annotations + +import importlib +import textwrap + +from docutils import nodes +from docutils.parsers.rst.directives import unchanged +from docutils.statemachine import ViewList +from jinja2 import Template +from sphinx.errors import SphinxError +from sphinx.util.docutils import SphinxDirective +from sphinx.util.nodes import nested_parse_with_titles + +from legate.util.settings import PrioritizedSetting, _Unset + +SETTINGS_DETAIL = Template( + """ +{% for setting in settings %} + +``{{ setting['name'] }}`` +{{ "''''" + "'" * setting['name']|length }} + +:**Type**: {{ setting['type'] }} +:**Env var**: ``{{ setting['env_var'] }}`` +:**Default**: {{ setting['default'] }} + +{{ setting['help'] }} + +{% endfor %} +""" +) + + +class SettingsDirective(SphinxDirective): + has_content = True + required_arguments = 1 + optional_arguments = 1 + option_spec = {"module": unchanged} + + def run(self): + obj_name = " ".join(self.arguments) + module_name = self.options["module"] + + try: + module = importlib.import_module(module_name) + except ImportError: + raise SphinxError( + f"Unable to generate reference docs for {obj_name}: " + f"couldn't import module {module_name}" + ) + + obj = getattr(module, obj_name, None) + if obj is None: + raise SphinxError( + f"Unable to generate reference docs for {obj_name}: " + f"no model {obj_name} in module {module_name}" + ) + + settings = [] + for x in obj.__class__.__dict__.values(): + if not isinstance(x, PrioritizedSetting): + continue + # help = [line.strip() for line in x.help.strip().split("\n")] + 
setting = { + "name": x.name, + "env_var": x.env_var, + "type": x.convert_type, + "help": textwrap.dedent(x.help), + "default": "(Unset)" + if x.default is _Unset + else repr(x.default), + } + settings.append(setting) + + rst_text = SETTINGS_DETAIL.render( + name=obj_name, module_name=module_name, settings=settings + ) + return self.parse(rst_text, "") + + def parse(self, rst_text, annotation): + result = ViewList() + for line in rst_text.split("\n"): + result.append(line, annotation) + node = nodes.paragraph() + node.document = self.state.document + nested_parse_with_titles(self.state, result, node) + return node.children + + +def setup(app): + """Required Sphinx extension setup function.""" + app.add_directive_to_domain("py", "settings", SettingsDirective) + + return dict(parallel_read_safe=True, parallel_write_safe=True) diff --git a/legate/core/__init__.py b/legate/core/__init__.py index 0c5eda106..8a278ed92 100644 --- a/legate/core/__init__.py +++ b/legate/core/__init__.py @@ -14,12 +14,27 @@ # from __future__ import annotations -from ..rc import check_legion -from ..util.args import parse_library_command_args +from legion_cffi import is_legion_python, ffi, lib as legion -check_legion() +if is_legion_python == False: + from legion_top import ( + legion_canonical_python_main, + legion_canonical_python_cleanup, + ) + from ..driver.main import prepare_driver, CanonicalDriver + import atexit, os, shlex, sys -from legion_cffi import ffi, lib as legion + argv = ["legate"] + shlex.split(os.environ.get("LEGATE_CONFIG", "")) + + driver = prepare_driver(argv, CanonicalDriver) + + if driver.dry_run: + sys.exit(0) + + os.environ.update(driver.env) + + legion_canonical_python_main(driver.cmd) + atexit.register(legion_canonical_python_cleanup) from ._legion import ( LEGATE_MAX_DIM, diff --git a/legate/core/_legion/future.py b/legate/core/_legion/future.py index f4d98c882..470af4dd0 100644 --- a/legate/core/_legion/future.py +++ b/legate/core/_legion/future.py @@ -386,7 
+386,7 @@ def from_dict( num_futures = len(futures) points = ffi.new("legion_domain_point_t[%d]" % num_futures) futures_ = ffi.new("legion_future_t[%d]" % num_futures) - for (i, (point, future)) in enumerate(futures.items()): + for i, (point, future) in enumerate(futures.items()): points[i] = point.raw() futures_[i] = future.handle handle = legion.legion_future_map_construct_from_futures( diff --git a/legate/core/_legion/operation.py b/legate/core/_legion/operation.py index cf13a7bff..95cf96fbb 100644 --- a/legate/core/_legion/operation.py +++ b/legate/core/_legion/operation.py @@ -1309,7 +1309,7 @@ def __init__( mem = legion.legion_memory_query_next(query, mem) legion.legion_memory_query_destroy(query) legion.legion_machine_destroy(machine) - for (sub_region, buf) in shard_local_data.items(): + for sub_region, buf in shard_local_data.items(): if sub_region.parent is not None: assert sub_region.parent.parent is parent legion.legion_index_attach_launcher_attach_array_soa( diff --git a/legate/core/_legion/partition_functor.py b/legate/core/_legion/partition_functor.py index 47eec302d..2f9822441 100644 --- a/legate/core/_legion/partition_functor.py +++ b/legate/core/_legion/partition_functor.py @@ -379,7 +379,7 @@ def partition( assert num_domains <= color_space.get_volume() colors = ffi.new("legion_domain_point_t[%d]" % num_domains) domains = ffi.new("legion_domain_t[%d]" % num_domains) - for (i, (point, rect)) in enumerate(self.domains.items()): + for i, (point, rect) in enumerate(self.domains.items()): colors[i] = point.raw() domains[i] = rect.raw() return legion.legion_index_partition_create_by_domain( diff --git a/legate/core/_legion/region.py b/legate/core/_legion/region.py index a1eaf3cfb..60577ad1d 100644 --- a/legate/core/_legion/region.py +++ b/legate/core/_legion/region.py @@ -28,7 +28,6 @@ class Region: - handle: Any def __init__( diff --git a/legate/core/_legion/task.py b/legate/core/_legion/task.py index 670b2796c..b086523fd 100644 --- 
a/legate/core/_legion/task.py +++ b/legate/core/_legion/task.py @@ -471,7 +471,6 @@ def launch( class IndexTask(Dispatchable[Union[Future, FutureMap]]): - point_args: Union[list[Any], None] def __init__( diff --git a/legate/core/allocation.py b/legate/core/allocation.py index 4f7354e42..b2b86f6c2 100644 --- a/legate/core/allocation.py +++ b/legate/core/allocation.py @@ -49,6 +49,23 @@ def __del__(self) -> None: def consume( self, ctor: Callable[[tuple[int, ...], int, tuple[int, ...]], Any] ) -> Any: + """ + Consumes the allocation. Each allocation can be consumed only once. + + Parameters + ---------- + ctor : Callback + Callback that constructs a Python object from the allocation. + Each callback gets the shape, the physical address, and the strides + of the allocation, and is supposed to return a Python object + using the allocation. Leaking the three arguments in some other way + will lead to an undefined behavior. + + Returns + ------- + Any + Python object the callback constructs from the allocation + """ if self._consumed: raise RuntimeError("Each inline mapping can be consumed only once") self._consumed = True diff --git a/legate/core/context.py b/legate/core/context.py index e1aac4536..e044271ae 100644 --- a/legate/core/context.py +++ b/legate/core/context.py @@ -54,7 +54,7 @@ def __call__(self, *args: Any, **kwargs: Any) -> Any: def find_last_user_frame(libname: str) -> str: - for (frame, _) in traceback.walk_stack(None): + for frame, _ in traceback.walk_stack(None): if "__name__" not in frame.f_globals: continue if not any( @@ -160,6 +160,14 @@ def destroy(self) -> None: @property def runtime(self) -> Runtime: + """ + Returns the runtime + + Returns + ------- + Runtime + The runtime object + """ return self._runtime @property @@ -192,6 +200,15 @@ def type_system(self) -> TypeSystem: @property def annotation(self) -> LibraryAnnotations: + """ + Returns the current set of annotations. Provenance string is one + entry in the set. 
+ + Returns + ------- + LibraryAnnotations + Library annotations + """ return self._annotations[-1] def get_all_annotations(self) -> str: @@ -199,6 +216,15 @@ def get_all_annotations(self) -> str: @property def provenance(self) -> Optional[str]: + """ + Returns the current provenance string. Attached to every operation + issued with the context. + + Returns + ------- + str or None + Provenance string + """ return self.annotation.provenance def get_task_id(self, task_id: int) -> int: @@ -226,6 +252,25 @@ def get_sharding_id(self, shard_id: int) -> int: def get_tunable( self, tunable_id: int, dtype: DataType, mapper_id: int = 0 ) -> npt.NDArray[Any]: + """ + Queries a tunable parameter to the mapper. + + Parameters + ---------- + tunable_id : int + Tunable id. Local to each mapper. + + dtype : DataType + Value type + + mapper_id : int + Id of the mapper that should handle the tunable query + + Returns + ------- + np.ndarray + A NumPy array holding the value of the tunable parameter + """ dt = np.dtype(dtype.to_pandas_dtype()) mapper_id = self.get_mapper_id(mapper_id) fut = Future( @@ -244,16 +289,38 @@ def get_unique_op_id(self) -> int: return self._runtime.get_unique_op_id() def set_provenance(self, provenance: str) -> None: + """ + Sets a new provenance string + + Parameters + ---------- + provenance : str + Provenance string + """ self._annotations[-1].set_provenance(provenance) def reset_provenance(self) -> None: + """ + Clears the provenance string that is currently set + """ self._annotations[-1].reset_provenance() def push_provenance(self, provenance: str) -> None: + """ + Pushes a provenance string to the stack + + Parameters + ---------- + provenance : str + Provenance string + """ self._annotations.append(LibraryAnnotations()) self.set_provenance(provenance) def pop_provenance(self) -> None: + """ + Pops the provenance string on top the stack + """ if len(self._annotations) == 1: raise ValueError("Provenance stack underflow") self._annotations.pop(-1) @@ 
-261,6 +328,25 @@ def pop_provenance(self) -> None: def track_provenance( self, func: AnyCallable, nested: bool = False ) -> AnyCallable: + """ + Wraps a function with provenance tracking. Provenance of each operation + issued within the wrapped function will be tracked automatically. + + Parameters + ---------- + func : AnyCallable + Function to wrap + + nested : bool + If ``True``, each invocation to a wrapped function within another + wrapped function updates the provenance string. Otherwise, the + provenance is tracked only for the outermost wrapped function. + + Returns + ------- + AnyCallable + Wrapped function + """ if nested: def wrapper(*args: Any, **kwargs: Any) -> Any: @@ -286,9 +372,37 @@ def create_task( self, task_id: int, mapper_id: int = 0, - manual: Optional[bool] = False, + manual: bool = False, launch_domain: Optional[Rect] = None, ) -> Union[AutoTask, ManualTask]: + """ + Creates a task. The type of the returned task is determined by the + value of ``manual``. + + Parameters + ---------- + task_id : int + Task id. Scoped locally within the context; i.e., different + libraries can use the same task id. There must be a task + implementation corresponding to the task id. + + mapper_id : int, optional + Id of the mapper that should determine mapping policies for the + task. Used only when the library has more than one mapper. + + manual : bool + Indicates whether the task should be manually parallelized; + if ``True``, the task is parallelized manually by the caller. + + launch_domain : Rect, optional + Launch domain of the task. Ignored if the task is automatically + parallelized, mandatory otherwise. + + Returns + ------- + AutoTask or ManualTask + A new task + """ from .operation import AutoTask, ManualTask unique_op_id = self.get_unique_op_id() @@ -314,6 +428,30 @@ def create_manual_task( mapper_id: int = 0, launch_domain: Optional[Rect] = None, ) -> ManualTask: + """ + Type safe version of ``Context.create_task``. 
Always returns a + `ManualTask`. + + Parameters + ---------- + task_id : int + Task id + + mapper_id : int, optional + Mapper id + + launch_domain : Rect, optional + Launch domain of the task. + + Returns + ------- + ManualTask + A new manually parallelized task + + See Also + -------- + Context.create_task + """ from .operation import ManualTask return cast( @@ -330,8 +468,32 @@ def create_auto_task( self, task_id: int, mapper_id: int = 0, - launch_domain: Optional[Rect] = None, ) -> AutoTask: + """ + Type safe version of ``Context.create_task``. Always returns an + `AutoTask`. + + Parameters + ---------- + task_id : int + Task id + + mapper_id : int, optional + Mapper id + + launch_domain : Rect, optional + Launch domain of the task. + + Returns + ------- + AutoTask + A new auto-parallelized task + + See Also + -------- + Context.create_task + """ + from .operation import AutoTask return cast( @@ -340,11 +502,25 @@ def create_auto_task( task_id=task_id, mapper_id=mapper_id, manual=False, - launch_domain=launch_domain, ), ) def create_copy(self, mapper_id: int = 0) -> Copy: + """ + Creates a copy operation. + + Parameters + ---------- + mapper_id : int, optional + Id of the mapper that should determine mapping policies for the + copy. Used only when the library has more than one mapper. + + Returns + ------- + Copy + A new copy operation + """ + from .operation import Copy return Copy(self, mapper_id, self.get_unique_op_id()) @@ -352,6 +528,32 @@ def create_copy(self, mapper_id: int = 0) -> Copy: def create_fill( self, lhs: Store, value: Store, mapper_id: int = 0 ) -> Fill: + """ + Creates a fill operation. + + Parameters + ---------- + lhs : Store + Store to fill + + value : Store + Store holding the constant value to fill the ``lhs`` with + + mapper_id : int, optional + Id of the mapper that should determine mapping policies for the + fill. Used only when the library has more than one mapper. 
+ + Returns + ------- + Fill + A new fill operation + + Raises + ------ + ValueError + If the ``value`` is not scalar or the ``lhs`` is either unbound or + scalar + """ from .operation import Fill return Fill(self, lhs, value, mapper_id, self.get_unique_op_id()) @@ -370,6 +572,34 @@ def create_store( self, ty: Any, shape: Optional[Union[Shape, tuple[int, ...]]] = None, storage: Optional[Union[RegionField, Future]] = None, optimize_scalar: bool = False, ndim: Optional[int] = None, ) -> Store: + """ + Creates a fresh store. + + Parameters + ---------- + ty : Dtype + Type of the elements + + shape : Shape or tuple[int], optional + Shape of the store. The store becomes unbound if no shape is + given. + + storage : RegionField or Future, optional + Optional storage to initialize the store with. Used only when the + store is constructed from a future holding a scalar value. + + optimize_scalar : bool + If ``True``, the runtime will use a ``Future`` when the store's + size is 1 + + ndim : int, optional + Dimension of the store. Must be passed if the store is unbound. + + Returns + ------- + Store + A new store + """ dtype = self.type_system[ty] return self._runtime.create_store( dtype, @@ -386,11 +616,50 @@ def get_cpu_communicator(self) -> Communicator: return self._runtime.get_cpu_communicator() def issue_execution_fence(self, block: bool = False) -> None: + """ + Issues an execution fence. A fence is a special operation that + guarantees that all upstream operations finish before any of the + downstream operations start. The caller can optionally block on + completion of all upstream operations. + + Parameters + ---------- + block : bool + If ``True``, the call blocks until all upstream operations finish. + """ self._runtime.issue_execution_fence(block=block) def tree_reduce( self, task_id: int, store: Store, mapper_id: int = 0, radix: int = 4 ) -> Store: + """ + Performs a user-defined reduction by building a tree of reduction + tasks. At each step, the reducer task gets up to ``radix`` input stores + and is supposed to produce outputs in a single unbound store. 
+ + Parameters + ---------- + task_id : int + Id of the reducer task + + store : Store + Store to perform reductions on + + mapper_id : int + Id of the mapper that should decide mapping policies for reducer + tasks + + radix : int + Fan-in of each reducer task. If the store is partitioned into + :math:`N` sub-stores by the runtime, then the first level of + reduction tree has :math:`\\ceil{N / \\mathtt{radix}}` reducer + tasks. + + Returns + ------- + Store + Store that contains reduction results + """ from .operation import Reduce result = self.create_store(store.type) @@ -412,6 +681,31 @@ def track_provenance( context: Context, nested: bool = False, ) -> Callable[[AnyCallable], AnyCallable]: + """ + Decorator that adds provenance tracking to functions. Provenance of each + operation issued within the wrapped function will be tracked automatically. + + Parameters + ---------- + context : Context + Context that the function uses to issue operations + + nested : bool + If ``True``, each invocation to a wrapped function within another + wrapped function updates the provenance string. Otherwise, the + provenance is tracked only for the outermost wrapped function. 
+ + Returns + ------- + Decorator + Function that takes a function and returns a one with provenance + tracking + + See Also + -------- + legate.core.context.Context.track_provenance + """ + def decorator(func: AnyCallable) -> AnyCallable: return context.track_provenance(func, nested=nested) @@ -420,6 +714,16 @@ def decorator(func: AnyCallable) -> AnyCallable: class Annotation: def __init__(self, context: Context, pairs: dict[str, str]) -> None: + """ + Constructs a new annotation object + + Parameters + ---------- + context : Context + Context to which the annotations should be added + pairs : dict[str, str] + Annotations as key-value pairs + """ self._annotation = context.annotation self._pairs = pairs diff --git a/legate/core/launcher.py b/legate/core/launcher.py index 69d6833c7..5dc8bb34b 100644 --- a/legate/core/launcher.py +++ b/legate/core/launcher.py @@ -928,7 +928,7 @@ def build_task( if self._sharding_space is not None: task.set_sharding_space(self._sharding_space) - for (req, fields) in self._req_analyzer.requirements: + for req, fields in self._req_analyzer.requirements: req.proj.add(task, req, fields, _index_task_calls) for future in self._future_args: task.add_future(future) @@ -937,7 +937,7 @@ def build_task( arrival, wait = runtime.get_barriers(volume) task.add_future(arrival) task.add_future(wait) - for (out_req, fields) in self._out_analyzer.requirements: + for out_req, fields in self._out_analyzer.requirements: out_req.add(task, fields) for comm in self._comms: task.add_point_future(ArgumentMap(future_map=comm)) @@ -966,11 +966,11 @@ def build_single_task(self, argbuf: BufferBuilder) -> SingleTask: tag=self._tag, provenance=self._provenance, ) - for (req, fields) in self._req_analyzer.requirements: + for req, fields in self._req_analyzer.requirements: req.proj.add_single(task, req, fields, _single_task_calls) for future in self._future_args: task.add_future(future) - for (out_req, fields) in self._out_analyzer.requirements: + for out_req, fields 
in self._out_analyzer.requirements: out_req.add_single(task, fields) if ( not self._has_side_effect @@ -1178,7 +1178,7 @@ def build_copy(self, launch_domain: Rect) -> IndexCopy: def add_requirements( requirements: list[tuple[RegionReq, int]] ) -> None: - for (req, field) in requirements: + for req, field in requirements: req.proj.add(copy, req, field, _index_copy_calls) add_requirements(self._input_reqs.requirements) @@ -1209,7 +1209,7 @@ def build_single_copy(self) -> SingleCopy: def add_requirements( requirements: list[tuple[RegionReq, int]] ) -> None: - for (req, field) in requirements: + for req, field in requirements: req.proj.add_single(copy, req, field, _single_copy_calls) add_requirements(self._input_reqs.requirements) diff --git a/legate/core/legate.py b/legate/core/legate.py index 044a41c7e..48b5c3de7 100644 --- a/legate/core/legate.py +++ b/legate/core/legate.py @@ -520,31 +520,56 @@ def __init__(self) -> None: def get_name(self) -> str: """ - Return a string name describing this library + Returns a name of the library + + Returns + ------- + str + Library name """ raise NotImplementedError("Implement in derived classes") - def get_shared_library(self) -> Any: + def get_shared_library(self) -> Optional[str]: """ - Return the name of the shared library + Returns the path to the shared library + + Returns + ------- + str or ``None`` + Path to the shared library """ raise NotImplementedError("Implement in derived classes") def get_c_header(self) -> str: """ - Return a compiled C string header for this library + Returns a compiled C header string for the library + + Returns + ------- + str + C header string """ raise NotImplementedError("Implement in derived classes") def get_registration_callback(self) -> str: """ - Return the name of a C registration callback for this library + Returns the name of a C registration callback for the library + + Returns + ------- + str + The name of the C registration callback """ raise NotImplementedError("Implement in 
derived classes") def get_resource_configuration(self) -> ResourceConfig: """ - Return a ResourceConfig object that configures the library + Returns a ResourceConfig object that configures the library + + Returns + ------- + ResourceConfig + A ``ResourceConfig`` object """ # Return the default configuration return ResourceConfig() diff --git a/legate/core/operation.py b/legate/core/operation.py index 07738aa51..bdc282809 100644 --- a/legate/core/operation.py +++ b/legate/core/operation.py @@ -158,6 +158,26 @@ def get_all_stores(self) -> OrderedSet[Store]: return result def add_alignment(self, store1: Store, store2: Store) -> None: + """ + Sets an alignment between stores. Equivalent to the following code: + + :: + + symb1 = op.declare_partition(store1) + symb2 = op.declare_partition(store2) + op.add_constraint(symb1 == symb2) + + Parameters + ---------- + store1, store2 : Store + Stores to align + + Raises + ------ + ValueError + If the stores don't have the same shape or only one of them is + unbound + """ self._check_store(store1, allow_unbound=True) self._check_store(store2, allow_unbound=True) if not ( @@ -175,14 +195,44 @@ def add_alignment(self, store1: Store, store2: Store) -> None: def add_broadcast( self, store: Store, axes: Optional[Union[int, Iterable[int]]] = None ) -> None: + """ + Sets a broadcasting constraint on the store. Equivalent to the + following code: + + :: + + symb = op.declare_partition(store) + op.add_constraint(symb.broadcast(axes)) + + Parameters + ---------- + store : Store + Store to set a broadcasting constraint on + axes : int or Iterable[int], optional + Axes to broadcast. The entire store is replicated if no axes are + given. 
+ """ self._check_store(store) part = self._get_unique_partition(store) self.add_constraint(part.broadcast(axes=axes)) def add_constraint(self, constraint: Constraint) -> None: + """ + Adds a partitioning constraint to the operation + + Parameters + ---------- + constraint : Constraint + Partitioning constraint + """ self._constraints.append(constraint) def execute(self) -> None: + """ + Submits the operation to the runtime. There is no guarantee that the + operation will start the execution right upon the return of this + method. + """ self._context.runtime.submit(self) @staticmethod @@ -220,6 +270,23 @@ def get_name(self) -> str: def declare_partition( self, store: Store, disjoint: bool = True, complete: bool = True ) -> PartSym: + """ + Creates a partition symbol for the store + + Parameters + ---------- + store : Store + Store to associate the partition symbol with + disjoint : bool, optional + ``True`` (by default) means the partition must be disjoint + complete : bool, optional + ``True`` (by default) means the partition must be complete + + Returns + ------- + PartSym + A partition symbol + """ sym = PartSym( self._op_id, self.get_name(), @@ -253,16 +320,62 @@ def __init__( @property def side_effect(self) -> bool: + """ + Indicates whether the task has side effects + + Returns + ------- + bool + ``True`` if the task has side efects + """ return self._side_effect def set_side_effect(self, side_effect: bool) -> None: + """ + Sets whether the task has side effects or not. A task is assumed to be + free of side effects by default if the task only has scalar arguments. + + Parameters + ---------- + side_effect : bool + A new boolean value indicating whether the task has side effects + """ self._side_effect = side_effect @property def concurrent(self) -> bool: + """ + Indicates whether the task needs a concurrent task launch. + + A concurrent task launch guarantees that all tasks will be active at + the same time and make progress concurrently. 
This means that the tasks + will and should be mapped to distinct processors and that no other + tasks will be interleaved at any given point in time during execution + of the concurrent tasks. This operational guarantee is useful + when the tasks need to perform collective operations or explicit + communication outside Legate, but comes with performance overhead + due to distributed rendezvous used in the launch. + + Returns + ------- + bool + ``True`` if the task needs a concurrent task launch + """ return self._concurrent def set_concurrent(self, concurrent: bool) -> None: + """ + Sets whether the task needs a concurrent task launch. Any task with at + least one communicator will implicitly use concurrent task launch, so + this method is to be used when the task needs a concurrent task launch + for a reason unknown to Legate. + + Parameters + ---------- + concurrent : bool + A new boolean value indicating whether the task needs a concurrent + task launch + """ self._concurrent = concurrent def get_name(self) -> str: @@ -272,6 +385,18 @@ def get_name(self) -> str: def add_scalar_arg( self, value: Any, dtype: Union[DTType, tuple[DTType]] ) -> None: + """ + Adds a by-value argument to the task + + Parameters + ---------- + value : Any + Scalar value or a tuple of scalars (but no nested tuples) + dtype : DType + Data type descriptor for the scalar value. A descriptor ``(T,)`` + means that the value is a tuple of elements of type ``T``. + """ + self._scalar_args.append((value, dtype)) def add_dtype_arg(self, dtype: DTType) -> None: @@ -279,17 +404,36 @@ def add_dtype_arg(self, dtype: DTType) -> None: self._scalar_args.append((code, ty.int32)) def throws_exception(self, exn_type: type) -> None: + """ + Declares that the task can raise an exception. If more than one + exception is added to the task, they are numbered by the order in which + they are added, and those numbers are used to refer to them in the C++ + task. 
+ + Parameters + ---------- + exn_type : Type + Type of exception + """ self._exn_types.append(exn_type) @property def can_raise_exception(self) -> bool: + """ + Indicates whether the task can raise an exception + + Returns + ------- + bool + ``True`` if the task can raise an exception + """ return len(self._exn_types) > 0 def capture_traceback(self) -> None: self._tb_repr = capture_traceback_repr() def _add_scalar_args_to_launcher(self, launcher: TaskLauncher) -> None: - for (arg, dtype) in self._scalar_args: + for arg, dtype in self._scalar_args: launcher.add_scalar_arg(arg, dtype) def _demux_scalar_stores_future(self, result: Future) -> None: @@ -427,10 +571,16 @@ def _demux_scalar_stores( self._demux_scalar_stores_future_map(result, launch_domain) def add_nccl_communicator(self) -> None: + """ + Adds a NCCL communicator to the task + """ comm = self._context.get_nccl_communicator() self._comm_args.append(comm) def add_cpu_communicator(self) -> None: + """ + Adds a CPU communicator to the task + """ comm = self._context.get_cpu_communicator() self._comm_args.append(comm) @@ -470,9 +620,42 @@ def get_requirement( tag = self.get_tag(strategy, part_symb) return req, tag, store_part + +class AutoTask(AutoOperation, Task): + """ + A type of tasks that are automatically parallelized + """ + + def __init__( + self, + context: Context, + task_id: int, + mapper_id: int, + op_id: int, + ) -> None: + super().__init__( + context=context, + task_id=task_id, + mapper_id=mapper_id, + op_id=op_id, + ) + self._reusable_stores: list[Tuple[Store, PartSym]] = [] + self._reuse_map: dict[int, Store] = {} + def add_input( self, store: Store, partition: Optional[PartSym] = None ) -> None: + """ + Adds a store as input to the task + + Parameters + ---------- + store : Store + Store to pass as input + partition : PartSym, optional + Partition to associate with the store. The default partition is + picked if none is given. 
+ """ self._check_store(store) if partition is None: partition = self._get_unique_partition(store) @@ -482,6 +665,17 @@ def add_input( def add_output( self, store: Store, partition: Optional[PartSym] = None ) -> None: + """ + Adds a store as output to the task + + Parameters + ---------- + store : Store + Store to pass as output + partition : PartSym, optional + Partition to associate with the store. The default partition is + picked if none is given. + """ self._check_store(store, allow_unbound=True) if store.kind is Future: self._scalar_outputs.append(len(self._outputs)) @@ -495,6 +689,19 @@ def add_output( def add_reduction( self, store: Store, redop: int, partition: Optional[PartSym] = None ) -> None: + """ + Adds a store to the task for reduction + + Parameters + ---------- + store : Store + Store to pass for reduction + redop : int + Reduction operator ID + partition : PartSym, optional + Partition to associate with the store. The default partition is + picked if none is given. + """ self._check_store(store) if store.kind is Future: self._scalar_reductions.append(len(self._reductions)) @@ -503,24 +710,6 @@ def add_reduction( self._reductions.append((store, redop)) self._reduction_parts.append(partition) - -class AutoTask(AutoOperation, Task): - def __init__( - self, - context: Context, - task_id: int, - mapper_id: int, - op_id: int, - ) -> None: - super().__init__( - context=context, - task_id=task_id, - mapper_id=mapper_id, - op_id=op_id, - ) - self._reusable_stores: list[Tuple[Store, PartSym]] = [] - self._reuse_map: dict[int, Store] = {} - def record_reuse( self, strategy: Strategy, @@ -604,7 +793,7 @@ def launch(self, strategy: Strategy) -> None: # We update the key partition of a store only when it gets updated store.set_key_partition(store_part.partition) - for ((store, redop), part_symb) in zip( + for (store, redop), part_symb in zip( self._reductions, self._reduction_parts ): req, tag, store_part = self.get_requirement( @@ -618,7 +807,7 @@ def 
launch(self, strategy: Strategy) -> None: store, req, tag=tag, read_write=can_read_write ) - for (store, part_symb) in zip(self._outputs, self._output_parts): + for store, part_symb in zip(self._outputs, self._output_parts): if not store.unbound: continue fspace = strategy.get_field_space(part_symb) @@ -643,6 +832,10 @@ def launch(self, strategy: Strategy) -> None: class ManualTask(Operation, Task): + """ + A type of tasks that need explicit parallelization + """ + def __init__( self, context: Context, @@ -685,6 +878,16 @@ def add_input( arg: Union[Store, StorePartition], proj: Optional[ProjFn] = None, ) -> None: + """ + Adds a store as input to the task + + Parameters + ---------- + arg : Store or StorePartition + Store or store partition to pass as input + proj : ProjFn, optional + Projection function + """ self._check_arg(arg) if isinstance(arg, Store): self._input_parts.append(arg.partition(REPLICATE)) @@ -697,10 +900,25 @@ def add_output( arg: Union[Store, StorePartition], proj: Optional[ProjFn] = None, ) -> None: + """ + Adds a store as output to the task + + Parameters + ---------- + arg : Store or StorePartition + Store or store partition to pass as output + proj : ProjFn, optional + Projection function + + Raises + ------ + NotImplementedError + If the store is unbound + """ self._check_arg(arg) if isinstance(arg, Store): if arg.unbound: - raise ValueError( + raise NotImplementedError( "Unbound store cannot be used with " "manually parallelized task" ) @@ -718,6 +936,16 @@ def add_reduction( redop: int, proj: Optional[ProjFn] = None, ) -> None: + """ + Adds a store to the task for reduction + + Parameters + ---------- + arg : Store or StorePartition + Store or store partition to pass for reduction + proj : ProjFn, optional + Projection function + """ self._check_arg(arg) if isinstance(arg, Store): if arg.kind is Future: @@ -791,6 +1019,10 @@ def launch(self, strategy: Strategy) -> None: class Copy(AutoOperation): + """ + A special kind of operation for 
copying data from one store to another. + """ + def __init__( self, context: Context, @@ -813,49 +1045,141 @@ def get_name(self) -> str: def inputs(self) -> list[Store]: return super().inputs + self._source_indirects + self._target_indirects - def add_output( - self, store: Store, partition: Optional[PartSym] = None - ) -> None: + def add_input(self, store: Store) -> None: + """ + Adds a store as a source of the copy + + Parameters + ---------- + store : Store + Source store + + Raises + ------ + ValueError + If the store is scalar or unbound + """ + if store.kind is Future or store.unbound: + raise ValueError( + "Copy input must be a normal, region-backed store" + ) + self._check_store(store) + partition = self._get_unique_partition(store) + self._inputs.append(store) + self._input_parts.append(partition) + + def add_output(self, store: Store) -> None: + """ + Adds a store as a target of the copy. To avoid ambiguity in matching + sources and targets, one copy cannot have both normal targets and + reduction targets. + + Parameters + ---------- + store : Store + Target store + + Raises + ------ + RuntimeError + If the copy already has a reduction target + ValueError + If the store is scalar or unbound + """ if len(self._reductions) > 0: raise RuntimeError( "Copy targets must be either all normal outputs or reductions" ) - super().add_output(store, partition) + if store.kind is Future or store.unbound: + raise ValueError( + "Copy target must be a normal, region-backed store" + ) - def add_reduction( - self, store: Store, redop: int, partition: Optional[PartSym] = None - ) -> None: + self._check_store(store) + partition = self._get_unique_partition(store) + self._outputs.append(store) + self._output_parts.append(partition) + + def add_reduction(self, store: Store, redop: int) -> None: + """ + Adds a store as a reduction target of the copy. To avoid ambiguity in + matching sources and targets, one copy cannot have both normal targets + and reduction targets. 
+ + Parameters + ---------- + store : Store + Reduction target store + redop : int + Reduction operator ID + + Raises + ------ + RuntimeError + If the copy already has a normal target + ValueError + If the store is scalar or unbound + """ if len(self._outputs) > 0: raise RuntimeError( "Copy targets must be either all normal outputs or reductions" ) - super().add_reduction(store, redop, partition) + if store.kind is Future or store.unbound: + raise ValueError( + "Copy target must be a normal, region-backed store" + ) + self._check_store(store) + partition = self._get_unique_partition(store) + self._reductions.append((store, redop)) + self._reduction_parts.append(partition) - def add_source_indirect( - self, store: Store, partition: Optional[PartSym] = None - ) -> None: + def add_source_indirect(self, store: Store) -> None: + """ + Adds an indirection for sources. A copy can have only up to one source + indirection. + + Parameters + ---------- + store : Store + Source indirection store + + Raises + ------ + RuntimeError + If the copy already has a source indirection + """ if len(self._source_indirects) != 0: raise RuntimeError( "There can be only up to one source indirection store for " "a Copy operation" ) self._check_store(store) - if partition is None: - partition = self._get_unique_partition(store) + partition = self._get_unique_partition(store) self._source_indirects.append(store) self._source_indirect_parts.append(partition) - def add_target_indirect( - self, store: Store, partition: Optional[PartSym] = None - ) -> None: + def add_target_indirect(self, store: Store) -> None: + """ + Adds an indirection for targets. A copy can have only up to one target + indirection. 
+ + Parameters + ---------- + store : Store + Target indirection store + + Raises + ------ + RuntimeError + If the copy already has a target indirection + """ if len(self._target_indirects) != 0: raise RuntimeError( "There can be only up to one target indirection store for " "a Copy operation" ) self._check_store(store) - if partition is None: - partition = self._get_unique_partition(store) + partition = self._get_unique_partition(store) self._target_indirects.append(store) self._target_indirect_parts.append(partition) @@ -964,7 +1288,7 @@ def launch(self, strategy: Strategy) -> None: else: launcher.add_output(store, req, tag=tag) - for ((store, redop), part_symb) in zip( + for (store, redop), part_symb in zip( self._reductions, self._reduction_parts ): req, tag, store_part = self.get_requirement( @@ -994,6 +1318,10 @@ def launch(self, strategy: Strategy) -> None: class Fill(AutoOperation): + """ + A special kind of operation for filling a store with constant values + """ + def __init__( self, context: Context, @@ -1009,8 +1337,18 @@ def __init__( raise ValueError("Fill lhs must be a bound Store") if lhs.kind is Future: raise ValueError("Fill lhs must be a RegionField-backed Store") - super().add_input(value) - super().add_output(lhs) + self._add_value(value) + self._add_lhs(lhs) + + def _add_value(self, value: Store) -> None: + partition = self._get_unique_partition(value) + self._inputs.append(value) + self._input_parts.append(partition) + + def _add_lhs(self, lhs: Store) -> None: + partition = self._get_unique_partition(lhs) + self._outputs.append(lhs) + self._output_parts.append(partition) def get_name(self) -> str: libname = self.context.library.get_name() @@ -1033,21 +1371,6 @@ def add_constraint(self, constraint: Constraint) -> None: "User partitioning constraints are not allowed for fills" ) - def add_input( - self, store: Store, partition: Optional[PartSym] = None - ) -> None: - raise TypeError("No further inputs can be added to fills") - - def add_output( 
- self, store: Store, partition: Optional[PartSym] = None - ) -> None: - raise TypeError("No further outputs can be added to fills") - - def add_reduction( - self, store: Store, redop: int, partition: Optional[PartSym] = None - ) -> None: - raise TypeError("No reductions can be added to fills") - def launch(self, strategy: Strategy) -> None: lhs = self._outputs[0] lhs_part_sym = self._output_parts[0] @@ -1096,6 +1419,19 @@ def __init__( self._radix = radix self._task_id = task_id + def add_input(self, store: Store) -> None: + self._check_store(store) + partition = self._get_unique_partition(store) + self._inputs.append(store) + self._input_parts.append(partition) + + def add_output(self, store: Store) -> None: + assert store.unbound + partition = self._get_unique_partition(store) + self._unbound_outputs.append(len(self._outputs)) + self._outputs.append(store) + self._output_parts.append(partition) + def launch(self, strategy: Strategy) -> None: assert len(self._inputs) == 1 and len(self._outputs) == 1 diff --git a/legate/core/partition.py b/legate/core/partition.py index d5904c319..c0eb24ad6 100644 --- a/legate/core/partition.py +++ b/legate/core/partition.py @@ -227,7 +227,7 @@ def is_complete_for(self, extents: Shape, offsets: Shape) -> bool: my_lo = self._offset my_hi = self._offset + self.tile_shape * self._color_shape - return my_lo <= offsets and offsets + extents <= my_hi + return all(my_lo <= offsets) and all(offsets + extents <= my_hi) def is_disjoint_for(self, launch_domain: Optional[Rect]) -> bool: return ( @@ -236,7 +236,7 @@ def is_disjoint_for(self, launch_domain: Optional[Rect]) -> bool: ) def has_color(self, color: Shape) -> bool: - return color >= 0 and color < self._color_shape + return all(color >= 0) and all(color < self._color_shape) @lru_cache def get_subregion_size(self, extents: Shape, color: Shape) -> Shape: @@ -396,7 +396,7 @@ def is_disjoint_for(self, launch_domain: Optional[Rect]) -> bool: return True def has_color(self, color: Shape) -> 
bool: - return color >= 0 and color < self._color_shape + return all(color >= 0) and all(color < self._color_shape) def translate(self, offset: Shape) -> None: raise NotImplementedError("This method shouldn't be invoked") diff --git a/legate/core/resource.py b/legate/core/resource.py index 240c7b6b8..e3d85e43e 100644 --- a/legate/core/resource.py +++ b/legate/core/resource.py @@ -22,20 +22,11 @@ class ResourceConfig: - __slots__ = ( - "max_tasks", - "max_mappers", - "max_reduction_ops", - "max_projections", - "max_shardings", - ) - - def __init__(self) -> None: - self.max_tasks = 1_000_000 - self.max_mappers = 1 - self.max_reduction_ops = 0 - self.max_projections = 0 - self.max_shardings = 0 + max_tasks = 1_000_000 + max_reduction_ops = 0 + max_mappers = 1 + max_projections = 0 + max_shardings = 0 class ResourceScope: diff --git a/legate/core/runtime.py b/legate/core/runtime.py index 637b09f72..00b48510f 100644 --- a/legate/core/runtime.py +++ b/legate/core/runtime.py @@ -26,7 +26,7 @@ from legion_top import add_cleanup_item, top_level -from ..util.args import ArgSpec, Argument, parse_library_command_args +from ..settings import settings from . import ffi # Make sure we only have one ffi instance from . import ( Fence, @@ -79,49 +79,6 @@ _LEGATE_FIELD_ID_BASE = 1000 -ARGS = [ - Argument( - "consensus", - ArgSpec( - action="store_true", - default=False, - dest="consensus", - help="Turn on consensus match on single node (for testing).", - ), - ), - Argument( - "cycle-check", - ArgSpec( - action="store_true", - default=False, - dest="cycle_check", - help=( - "Check for reference cycles involving RegionField objects on " - "script exit (developer option). When such cycles arise " - "during execution, they stop used RegionFields from getting " - "collected and reused for new Stores, thus increasing memory " - "pressure. 
By default this check will miss any RegionField " - "cycles the garbage collector collected during execution; " - "run gc.disable() at the beginning of the program to avoid " - "this." - ), - ), - ), - Argument( - "future-leak-check", - ArgSpec( - action="store_true", - default=False, - dest="future_leak_check", - help=( - "Check for reference cycles keeping Future/FutureMap objects " - "alive after Legate runtime exit (developer option). Such " - "leaks can result in Legion runtime shutdown hangs." - ), - ), - ), -] - # A helper class for doing field management with control replication @dataclass(frozen=True) @@ -961,8 +918,6 @@ def __init__(self, core_library: CoreLib) -> None: focus on implementing their domain logic. """ - self._args = parse_library_command_args("legate", ARGS) - # Record whether we need to run finalize tasks # Key off whether we are being loaded in a context or not try: @@ -1047,7 +1002,7 @@ def __init__(self, core_library: CoreLib) -> None: ) self._field_manager_class = ( ConsensusMatchingFieldManager - if self._num_nodes > 1 or self._args.consensus + if self._num_nodes > 1 or settings.consensus() else FieldManager ) self._max_lru_length = int( @@ -1132,14 +1087,38 @@ def empty_argmap(self) -> ArgumentMap: @property def num_cpus(self) -> int: + """ + Returns the total number of CPUs in the system + + Returns + ------- + int + Number of CPUs + """ return self._num_cpus @property def num_omps(self) -> int: + """ + Returns the total number of OpenMP processors in the system + + Returns + ------- + int + Number of OpenMP processors + """ return self._num_omps @property def num_gpus(self) -> int: + """ + Returns the total number of GPUs in the system + + Returns + ------- + int + Number of GPUs + """ return self._num_gpus @property @@ -1164,6 +1143,19 @@ def field_match_manager(self) -> FieldMatchManager: return self._field_match_manager def register_library(self, library: Library) -> Context: + """ + Registers a library to the runtime. 
+ + Parameters + ---------- + library : Library + Library object + + Returns + ------- + Context + A new context for the library + """ from .context import Context libname = library.get_name() @@ -1356,6 +1348,23 @@ def get_transform_code(self, name: str) -> int: ) def create_future(self, data: Any, size: int) -> Future: + """ + Creates a future from a buffer holding a scalar value. The value is + copied to the future. + + Parameters + ---------- + data : buffer + Buffer that holds a scalar value + + size : int + Size of the value + + Returns + ------- + Future + A new future + """ future = Future() future.set_value(self.legion_runtime, data, size) return future @@ -1691,7 +1700,7 @@ def raise_exceptions(self) -> None: def _cleanup_legate_runtime() -> None: global runtime - future_leak_check = runtime._args.future_leak_check + future_leak_check = settings.future_leak_check() runtime.destroy() del runtime gc.collect() @@ -1721,7 +1730,7 @@ def __del__(self) -> None: find_cycles(False) -if runtime._args.cycle_check: +if settings.cycle_check(): # The first thing that legion_top does after executing the user script # is to remove the newly created "__main__" module. We intercept this # deletion operation to perform our check. 
@@ -1741,4 +1750,11 @@ def legate_add_library(library: Library) -> None: def get_legate_runtime() -> Runtime: + """ + Returns the Legate runtime + + Returns + ------- + Legate runtime object + """ return runtime diff --git a/legate/core/shape.py b/legate/core/shape.py index 98207191f..af84a6132 100644 --- a/legate/core/shape.py +++ b/legate/core/shape.py @@ -32,6 +32,11 @@ def _cast_tuple(value: int | Iterable[int], ndim: int) -> tuple[int, ...]: return tuple(value) +class _ShapeComparisonResult(tuple[bool, ...]): + def __bool__(self) -> bool: + assert False, "use any() or all()" + + class Shape: _extents: Union[tuple[int, ...], None] _ispace: Union[IndexSpace, None] @@ -41,6 +46,19 @@ def __init__( extents: Optional[ExtentLike] = None, ispace: Optional[IndexSpace] = None, ) -> None: + """ + Constructs a new shape object + + Parameters + ---------- + extents: int, Iterable[int], or Shape + Extents to construct the shape object with. Must be passed unless an + ``ispace`` is given. + ispace : IndexSpace, optional + A Legion index space handle to construct the shape object with. + Must not be used by clients explicitly, as they don't have access + to index spaces. 
+ """ if isinstance(extents, int): self._extents = (extents,) self._ispace = None @@ -54,6 +72,18 @@ def __init__( @property def extents(self) -> tuple[int, ...]: + """ + Returns the extents of the shape in a tuple + + Returns + ------- + tuple[int] + Extents of the shape + + Notes + ----- + Can block on the producer task + """ if self._extents is None: assert self._ispace is not None bounds = self._ispace.get_bounds() @@ -95,6 +125,14 @@ def __contains__(self, value: object) -> bool: @property def fixed(self) -> bool: + """ + Indicates whether the shape's extents are already computed + + Returns + ------ + bool + If ``True``, the shape has fixed extents + """ return self._extents is not None @property @@ -103,6 +141,15 @@ def ispace(self) -> Union[IndexSpace, None]: @property def ndim(self) -> int: + """ + Dimension of the shape. Unlike the ``extents`` property, this is + non-blocking. + + Returns + ------ + int + Dimension of the shape + """ if self._extents is None: assert self._ispace is not None return self._ispace.get_dim() @@ -122,9 +169,33 @@ def get_index_space(self, runtime: Runtime) -> IndexSpace: return self._ispace def volume(self) -> int: + """ + Returns the shape's volume + + Returns + ------ + int + Volume of the shape + + Notes + ----- + Can block on the producer task + """ return reduce(lambda x, y: x * y, self.extents, 1) def sum(self) -> int: + """ + Returns a sum of the extents + + Returns + ------ + int + Sum of the extents + + Notes + ----- + Can block on the producer task + """ return reduce(lambda x, y: x + y, self.extents, 0) def __hash__(self) -> int: @@ -134,6 +205,23 @@ def __hash__(self) -> int: return hash((self.__class__, True, self._extents)) def __eq__(self, other: object) -> bool: + """ + Checks whether the shape is identical to a given shape + + Parameters + ---------- + other : Shape or Iterable[int] + Shape to compare with + + Returns + ------ + bool + ``True`` if the shapes are identical + + Notes + ----- + Can block on the 
producer task + """ if isinstance(other, Shape): if ( self._ispace is not None @@ -154,43 +242,132 @@ def __eq__(self, other: object) -> bool: else: return False - def __le__(self, other: ExtentLike) -> bool: + def __le__(self, other: ExtentLike) -> _ShapeComparisonResult: + """ + Returns the result of element-wise ``<=``. + + Parameters + ---------- + other : Shape or Iterable[int] + Shape to compare with + + Returns + ------ + tuple[bool] + Result of element-wise ``<=``. + + Notes + ----- + Can block on the producer task + """ lh = self.extents rh = ( other.extents if isinstance(other, Shape) else _cast_tuple(other, self.ndim) ) - return len(lh) == len(rh) and lh <= rh - - def __lt__(self, other: ExtentLike) -> bool: + assert len(lh) == len(rh) + return _ShapeComparisonResult(l <= r for (l, r) in zip(lh, rh)) + + def __lt__(self, other: ExtentLike) -> _ShapeComparisonResult: + """ + Returns the result of element-wise ``<``. + + Parameters + ---------- + other : Shape or Iterable[int] + Shape to compare with + + Returns + ------ + tuple[bool] + Result of element-wise ``<``. + + Notes + ----- + Can block on the producer task + """ lh = self.extents rh = ( other.extents if isinstance(other, Shape) else _cast_tuple(other, self.ndim) ) - return len(lh) == len(rh) and lh < rh - - def __ge__(self, other: ExtentLike) -> bool: + assert len(lh) == len(rh) + return _ShapeComparisonResult(l < r for (l, r) in zip(lh, rh)) + + def __ge__(self, other: ExtentLike) -> _ShapeComparisonResult: + """ + Returns the result of element-wise ``<=``. + + Parameters + ---------- + other : Shape or Iterable[int] + Shape to compare with + + Returns + ------ + tuple[bool] + Result of element-wise ``<=``. 
+ + Notes + ----- + Can block on the producer task + """ lh = self.extents rh = ( other.extents if isinstance(other, Shape) else _cast_tuple(other, self.ndim) ) - return len(lh) == len(rh) and lh >= rh - - def __gt__(self, other: ExtentLike) -> bool: + assert len(lh) == len(rh) + return _ShapeComparisonResult(l >= r for (l, r) in zip(lh, rh)) + + def __gt__(self, other: ExtentLike) -> _ShapeComparisonResult: + """ + Returns the result of element-wise ``<=``. + + Parameters + ---------- + other : Shape or Iterable[int] + Shape to compare with + + Returns + ------ + tuple[bool] + Result of element-wise ``<=``. + + Notes + ----- + Can block on the producer task + """ lh = self.extents rh = ( other.extents if isinstance(other, Shape) else _cast_tuple(other, self.ndim) ) - return len(lh) == len(rh) and lh > rh + assert len(lh) == len(rh) + return _ShapeComparisonResult(l > r for (l, r) in zip(lh, rh)) def __add__(self, other: ExtentLike) -> Shape: + """ + Returns an element-wise addition of the shapes + + Parameters + ---------- + other : Shape or Iterable[int] + A shape to add to this shape + + Returns + ------ + bool + Result of element-wise addition + + Notes + ----- + Can block on the producer task + """ lh = self.extents rh = ( other.extents @@ -200,6 +377,23 @@ def __add__(self, other: ExtentLike) -> Shape: return Shape(tuple(a + b for (a, b) in zip(lh, rh))) def __sub__(self, other: ExtentLike) -> Shape: + """ + Returns an element-wise subtraction between the shapes + + Parameters + ---------- + other : Shape or Iterable[int] + A shape to subtract from this shape + + Returns + ------ + bool + Result of element-wise subtraction + + Notes + ----- + Can block on the producer task + """ lh = self.extents rh = ( other.extents @@ -209,6 +403,23 @@ def __sub__(self, other: ExtentLike) -> Shape: return Shape(tuple(a - b for (a, b) in zip(lh, rh))) def __mul__(self, other: ExtentLike) -> Shape: + """ + Returns an element-wise multiplication of the shapes + + Parameters + 
---------- + other : Shape or Iterable[int] + A shape to multiply with this shape + + Returns + ------ + bool + Result of element-wise multiplication + + Notes + ----- + Can block on the producer task + """ lh = self.extents rh = ( other.extents @@ -218,6 +429,23 @@ def __mul__(self, other: ExtentLike) -> Shape: return Shape(tuple(a * b for (a, b) in zip(lh, rh))) def __mod__(self, other: ExtentLike) -> Shape: + """ + Returns the result of element-wise modulo operation + + Parameters + ---------- + other : Shape or Iterable[int] + Shape to compare with + + Returns + ------ + bool + Result of element-wise modulo operation + + Notes + ----- + Can block on the producer task + """ lh = self.extents rh = ( other.extents @@ -227,6 +455,23 @@ def __mod__(self, other: ExtentLike) -> Shape: return Shape(tuple(a % b for (a, b) in zip(lh, rh))) def __floordiv__(self, other: ExtentLike) -> Shape: + """ + Returns the result of element-wise integer division + + Parameters + ---------- + other : Shape or Iterable[int] + A shape to divide this shape by + + Returns + ------ + bool + Result of element-wise integer division + + Notes + ----- + Can block on the producer task + """ lh = self.extents rh = ( other.extents @@ -236,26 +481,138 @@ def __floordiv__(self, other: ExtentLike) -> Shape: return Shape(tuple(a // b for (a, b) in zip(lh, rh))) def drop(self, dim: int) -> Shape: + """ + Removes a dimension from the shape + + Parameters + ---------- + dim : int + Dimension to remove + + Returns + ------ + Shape + Shape with one less dimension + + Notes + ----- + Can block on the producer task + """ extents = self.extents return Shape(extents[:dim] + extents[dim + 1 :]) def update(self, dim: int, new_value: int) -> Shape: + """ + Replaces the extent of a dimension with a new extent + + Parameters + ---------- + dim : int + Dimension to replace + + new_value : int + New extent + + Returns + ------ + Shape + Shape with the chosen dimension updated + + Notes + ----- + Can block on the 
producer task + """ return self.replace(dim, (new_value,)) def replace(self, dim: int, new_values: Iterable[int]) -> Shape: + """ + Replaces a dimension with multiple dimensions + + Parameters + ---------- + dim : int + Dimension to replace + + new_values : Iterable[int] + Extents of the new dimensions + + Returns + ------- + Shape + Shape with the chosen dimension replaced + + Notes + ----- + Can block on the producer task + """ if not isinstance(new_values, tuple): new_values = tuple(new_values) extents = self.extents return Shape(extents[:dim] + new_values + extents[dim + 1 :]) def insert(self, dim: int, new_value: int) -> Shape: + """ + Inserts a new dimension + + Parameters + ---------- + dim : int + Location to insert the new dimension + + new_value : int + Extent of the new dimension + + Returns + ------- + Shape + Shape with one more dimension + + Notes + ----- + Can block on the producer task + """ extents = self.extents return Shape(extents[:dim] + (new_value,) + extents[dim:]) def map(self, mapping: tuple[int, ...]) -> Shape: + """ + Applies a mapping to each extent in the shape + + Parameters + ---------- + mapping : tuple[int] + New values for dimensions + + Returns + ------- + Shape + Shape with the extents replaced + + Notes + ----- + Can block on the producer task + """ return Shape(tuple(self[mapping[dim]] for dim in range(self.ndim))) def strides(self) -> Shape: + """ + Computes strides of the shape. The last dimension is considered the + most rapidly changing one. For example, if the shape is ``(3, 4, 5)``, + the strides are + + :: + + (20, 5, 1) + + Returns + ------- + Shape + Strides of the shape + + Notes + ----- + Can block on the producer task + """ strides: tuple[int, ...] 
= () stride = 1 for size in reversed(self.extents): diff --git a/legate/core/store.py b/legate/core/store.py index 0d55cbf62..309585999 100644 --- a/legate/core/store.py +++ b/legate/core/store.py @@ -178,10 +178,9 @@ def record_detach(detach: Union[Detach, IndexDetach]) -> None: mapper=context.mapper_id, provenance=context.provenance, ) - # If we're not sharing then there is no need to map or restrict the - # attachment + attach.set_restricted(False) + # If we're not sharing then there is no need to map the attachment if not share: - attach.set_restricted(False) attach.set_mapped(False) else: self.physical_region_mapped = True @@ -209,7 +208,7 @@ def record_detach(detach: Union[Detach, IndexDetach]) -> None: else field_type ) shard_local_data = {} - for (c, buf) in alloc.shard_local_buffers.items(): + for c, buf in alloc.shard_local_buffers.items(): subregion = alloc.partition.get_child(c) bounds = subregion.index_space.get_bounds() if buf.shape != tuple( @@ -231,9 +230,7 @@ def record_detach(detach: Union[Detach, IndexDetach]) -> None: provenance=context.provenance, ) index_attach.set_deduplicate_across_shards(True) - # If we're not sharing there is no need to restrict the attachment - if not share: - index_attach.set_restricted(False) + index_attach.set_restricted(False) external_resources = runtime.dispatch(index_attach) # We don't need to flush the contents back to the attached memory # if this is an internal temporary allocation. 
@@ -786,10 +783,26 @@ def __init__( @property def store(self) -> Store: + """ + Returns the store of the store partition + + Returns + ------- + Store + A ``Store`` object wrapped in the store partition + """ return self._store @property def partition(self) -> PartitionBase: + """ + Returns the partition descriptor of the store partition + + Returns + ------- + PartitionBase + A ``PartitionBase`` object wrapped in the store partition + """ return self._partition @property @@ -797,6 +810,19 @@ def transform(self) -> TransformStackBase: return self._store.transform def get_child_store(self, *indices: int) -> Store: + """ + Returns the sub-store of a given color + + Parameters + ---------- + indices : tuple[int] + Color of the sub-store + + Returns + ------- + Store + The sub-store of the chosen color + """ color = self.transform.invert_color(Shape(indices)) child_storage = self._storage_partition.get_child(color) child_transform = self.transform @@ -895,6 +921,15 @@ def move_data(self, other: Store) -> None: @property def shape(self) -> Shape: + """ + Returns the shape of the store. Flushes the scheduling window if the + store is unbound and has no shape assigned. + + Returns + ------- + Shape + The store's shape + """ if self._shape is None: # If someone wants to access the shape of an unbound # store before it is set, that means the producer task is @@ -915,6 +950,14 @@ def shape(self) -> Shape: @property def ndim(self) -> int: + """ + Returns the number of dimensions of the store. + + Returns + ------- + int + The number of dimensions + """ if self._shape is None: assert self._ndim is not None return self._ndim @@ -923,12 +966,25 @@ def ndim(self) -> int: @property def size(self) -> int: + """ + Returns the number of elements in the store. 
+ + Returns + ------- + int + The store's size + """ return prod(self.shape) if self.ndim > 0 else 1 @property def type(self) -> _Dtype: """ - Return the type of the data in this storage primitive + Returns the element type of the store. + + Returns + ------- + _Dtype + Type of elements in the store """ return self._dtype @@ -938,24 +994,42 @@ def get_dtype(self) -> _Dtype: @property def kind(self) -> Union[Type[RegionField], Type[Future]]: """ - Return the type of the Legion storage object backing the data in this - storage object: either Future, or RegionField. + Returns the kind of backing storage + + Returns + ------- + Type + `RegionField` or `Future` """ return self._storage.kind @property def unbound(self) -> bool: + """ + Indicates whether the store is unbound + + Returns + ------- + bool + ``True`` if the store is unbound + """ return self._shape is None @property def scalar(self) -> bool: + """ + Indicates whether the store is scalar (i.e., backed by a `Future` and + of size 1) + + Returns + ------- + bool + ``True`` if the store is scalar + """ return self.kind is Future and self.shape.volume() == 1 @property def storage(self) -> Union[RegionField, Future]: - """ - Return the Legion container backing this Store. - """ if self.unbound: raise RuntimeError( "Storage of a variable size store cannot be retrieved " @@ -976,10 +1050,26 @@ def extents(self) -> Shape: @property def transform(self) -> TransformStackBase: + """ + Returns a transformation attached to the store + + Returns + ------- + TransformStackBase + Transformation attached to the store + """ return self._transform @property def transformed(self) -> bool: + """ + Indicates whether the store is transformed + + Returns + ------- + bool + If ``True``, the store is transformed + """ return not self._transform.bottom def attach_external_allocation( @@ -1038,6 +1128,46 @@ def __repr__(self) -> str: # Convert a store in N-D space to that in (N+1)-D space. 
# The extra_dim specifies the added dimension def promote(self, extra_dim: int, dim_size: int = 1) -> Store: + """ + Adds an extra dimension to the store. Value of ``extra_dim`` decides + where a new dimension should be added, and each dimension `i`, where + `i` >= ``extra_dim``, is mapped to dimension `i+1` in a returned store. + A returned store provides a view to the input store where the values + are broadcasted along the new dimension. + + For example, for a 1D store ``A`` contains ``[1, 2, 3]``, + ``A.promote(0, 2)`` yields a store equivalent to: + + :: + + [[1, 2, 3], + [1, 2, 3]] + + whereas ``A.promote(1, 2)`` yields: + + :: + + [[1, 1], + [2, 2], + [3, 3]] + + Parameters + ---------- + extra_dim : int + Position for a new dimension + dim_size : int, optional + Extent of the new dimension + + Returns + ------- + Store + A new store with an extra dimension + + Raises + ------ + ValueError + If ``extra_dim`` is not a valid dimension name + """ extra_dim = extra_dim + self.ndim if extra_dim < 0 else extra_dim if extra_dim < 0 or extra_dim > self.ndim: raise ValueError( @@ -1060,6 +1190,34 @@ def promote(self, extra_dim: int, dim_size: int = 1) -> Store: # Take a hyperplane of an N-D store for a given index # to create an (N-1)-D store def project(self, dim: int, index: int) -> Store: + """ + Projects out a dimension of the store. Each dimension `i`, where + `i` > ``dim``, is mapped to dimension `i-1` in a returned store. + A returned store provides a view to the input store where the values + are on hyperplane :math:`x_\\mathtt{dim} = \\mathtt{index}`. + + For example, if a 2D store ``A`` contains ``[[1, 2], [3, 4]]``, + ``A.project(0, 1)`` yields a store equivalent to ``[3, 4]``, whereas + ``A.project(1, 0)`` yields ``[1, 3]``. 
+ + Parameters + ---------- + dim : int + Dimension to project out + index : int + Index on the chosen dimension + + Returns + ------- + Store + A new store with one fewer dimension + + Raises + ------ + ValueError + If ``dim`` is not a valid dimension name or ``index`` is + out of bounds + """ dim = dim + self.ndim if dim < 0 else dim if dim < 0 or dim >= self.ndim: raise ValueError( @@ -1097,6 +1255,68 @@ def project(self, dim: int, index: int) -> Store: ) def slice(self, dim: int, sl: slice) -> Store: + """ + Slices a contiguous sub-section of the store. + + For example, consider a 2D store ``A`` + + :: + + [[1, 2, 3], + [4, 5, 6], + [7, 8, 9]] + + A slicing ``A.slice(0, slice(1, None))`` yields: + + :: + + [[4, 5, 6], + [7, 8, 9]] + + The result store will look like this on a different slicing call + ``A.slice(1, slice(None, 2))``: + + :: + + [[1, 2], + [4, 5], + [7, 8]] + + Finally, chained slicing calls + + :: + + A.slice(0, slice(1, None)).slice(1, slice(None, 2)) + + results in: + + :: + + [[4, 5], + [7, 8]] + + + Parameters + ---------- + dim : int + Dimension to slice + sl : slice + Slice that expresses a sub-section + + Returns + ------- + Store + A new store that corresponds to the sliced section + + Notes + ----- + Slicing with a non-unit step is currently not supported. + + Raises + ------ + ValueError + If ``sl.step`` is not a unit or ``sl`` is out of bounds + """ dim = dim + self.ndim if dim < 0 else dim if dim < 0 or dim >= self.ndim: raise ValueError( @@ -1146,6 +1366,60 @@ def slice(self, dim: int, sl: slice) -> Store: ) def transpose(self, axes: tuple[int, ...]) -> Store: + """ + Reorders dimensions of the store. Dimension ``i`` of the resulting + store is mapped to dimension ``axes[i]`` of the input store. 
+ + For example, for a 3D store ``A`` + + :: + + [[[1, 2], + [3, 4]], + + [[5, 6], + [7, 8]]] + + transpose calls ``A.transpose([1, 2, 0])`` and ``A.transpose([2, 1, + 0])`` yield the following stores, respectively: + + :: + + [[[1, 5], + [2, 6]], + + [[3, 7], + [4, 8]]] + + + :: + + [[[1, 5], + [3, 7]], + + [[2, 6], + [4, 8]]] + + + Parameters + ---------- + axes : tuple[int] + Mapping from dimensions of the resulting store to those of the + input + + Returns + ------- + Store + A new store with the dimensions transposed + + Raises + ------ + ValueError + If any of the following happens: 1) The length of ``axes`` doesn't + match the store's dimension; 2) ``axes`` has duplicates; 3) Any + value in ``axes`` is negative, or greater than or equal to the + store's dimension + """ if len(axes) != self.ndim: raise ValueError( f"dimension mismatch: expected {self.ndim} axes, " @@ -1173,6 +1447,54 @@ def transpose(self, axes: tuple[int, ...]) -> Store: ) def delinearize(self, dim: int, shape: tuple[int, ...]) -> Store: + """ + Delinearizes a dimension into multiple dimensions. Each dimension + `i` of the store, where `i` > ``dim``, will be mapped to dimension + `i+N` of the resulting store, where `N` is the length of ``shape``. + A delinearization that does not preserve the size of the store is + invalid. + + For example, consider a 2D store ``A`` + + :: + + [[1, 2, 3, 4], + [5, 6, 7, 8]] + + A delinearizing call `A.delinearize(1, [2, 2]))` yields: + + :: + + [[[1, 2], + [3, 4]], + + [[5, 6], + [7, 8]]] + + Parameters + ---------- + dim : int + Dimension to delinearize + shape : tuple[int] + New shape for the chosen dimension + + Returns + ------- + Store + A new store with the chosen dimension delinearized + + Notes + ----- + Unlike other transformations, delinearization is not an affine + transformation. Due to this nature, delinearized stores can raise + `NonInvertibleError` in places where they cannot be used. 
+ + Raises + ------ + ValueError + If ``dim`` is invalid for the store or ``shape`` does not preserve + the size of the chosen dimension + """ dim = dim + self.ndim if dim < 0 else dim if dim < 0 or dim >= self.ndim: raise ValueError( @@ -1201,6 +1523,24 @@ def delinearize(self, dim: int, shape: tuple[int, ...]) -> Store: def get_inline_allocation( self, context: Optional[Context] = None ) -> InlineMappedAllocation: + """ + Creates an inline allocation for the store. + + Parameters + ---------- + context : Context, optional + Library context within which the allocation is created + + Notes + ----- + This call blocks the client's control flow, and it fetches the data + for the whole store on a single node. + + Returns + ------- + InlineMappedAllocation + A helper object wrapping the allocation + """ assert self.kind is RegionField return self._storage.get_inline_allocation( self.shape, @@ -1219,6 +1559,14 @@ def serialize(self, buf: BufferBuilder) -> None: self._transform.serialize(buf) def get_key_partition(self) -> Optional[PartitionBase]: + """ + Returns the current key partition of the store + + Returns + ------- + PartitionBase + The store's key partition + """ # Flush outstanding operations to have the key partition of this store # registered correctly runtime.flush_scheduling_window() @@ -1238,6 +1586,14 @@ def has_key_partition(self, restrictions: tuple[Restriction, ...]) -> bool: return (part is not None) and (part.even or self._transform.bottom) def set_key_partition(self, partition: PartitionBase) -> None: + """ + Sets a new key partition for the store + + Parameters + ---------- + partition : PartitionBase + A new key partition + """ runtime.partition_manager.record_store_key_partition( self._unique_id, partition ) @@ -1248,6 +1604,9 @@ def set_key_partition(self, partition: PartitionBase) -> None: ) def reset_key_partition(self) -> None: + """ + Clears the store's key partition + """ runtime.partition_manager.reset_store_key_partition(self._unique_id) 
# Also reset the storage's key partition. self._storage.reset_key_partition() @@ -1346,6 +1705,19 @@ def partition(self, partition: PartitionBase) -> StorePartition: def partition_by_tiling( self, tile_shape: Union[Shape, Sequence[int]] ) -> StorePartition: + """ + Creates a tiled partition of the store + + Parameters + ---------- + tile_shape : Shape or Sequence[int] + Shape of tiles + + Returns + ------- + StorePartition + A ``StorePartition`` object + """ if self.unbound: raise TypeError("Unbound store cannot be manually partitioned") if not isinstance(tile_shape, Shape): diff --git a/legate/driver/__init__.py b/legate/driver/__init__.py index 67ce493b8..786b0f069 100644 --- a/legate/driver/__init__.py +++ b/legate/driver/__init__.py @@ -15,13 +15,23 @@ from __future__ import annotations from .config import Config -from .driver import Driver +from .driver import LegateDriver, CanonicalDriver from .launcher import Launcher def main() -> int: - import sys + import os, shlex, sys - from .main import main as _main + from .main import legate_main as _main - return _main(sys.argv) + # A little explanation. We want to encourage configuration options to be + # passed via LEGATE_CONFIG, in order to be considerate to user scripts. + # But we still need to accept actual command line args for compatibility, + # and those should also take precedence. Here we splice the options from + # LEGATE_CONFIG in before sys.argv, and take advantage of the fact that if + # there are any options repeated in both places, argparse will use the + # latter (i.e. the actual command line provided ones). 
+ env_args = shlex.split(os.environ.get("LEGATE_CONFIG", "")) + argv = sys.argv[:1] + env_args + sys.argv[1:] + + return _main(argv) diff --git a/legate/driver/args.py b/legate/driver/args.py index 92bce362b..daee26ab0 100644 --- a/legate/driver/args.py +++ b/legate/driver/args.py @@ -16,8 +16,9 @@ # from __future__ import annotations -from argparse import ArgumentDefaultsHelpFormatter, ArgumentParser +from argparse import REMAINDER, ArgumentDefaultsHelpFormatter, ArgumentParser +from .. import __version__ from ..util.shared_args import ( CPUS, FBMEM, @@ -39,12 +40,20 @@ __all__ = ("parser",) + parser = ArgumentParser( description="Legate Driver", allow_abbrev=False, formatter_class=ArgumentDefaultsHelpFormatter, ) +parser.add_argument( + "command", + nargs=REMAINDER, + help="A python script to run, plus any arguments for the script. " + "Any arguments after the script will be passed to the script, i.e. " + "NOT used as arguments to legate itself.", +) multi_node = parser.add_argument_group("Multi-node configuration") multi_node.add_argument(NODES.name, **NODES.kwargs) @@ -62,7 +71,8 @@ help="CPU cores to bind each rank to. Comma-separated core IDs as " "well as ranges are accepted, as reported by `numactl`. Binding " "instructions for all ranks should be listed in one string, separated " - "by `/`.", + "by `/`. " + "[legate-only, not supported with standard Python invocation]", ) @@ -70,7 +80,8 @@ "--mem-bind", help="NUMA memories to bind each rank to. Use comma-separated integer " "IDs as reported by `numactl`. Binding instructions for all ranks " - "should be listed in one string, separated by `/`.", + "should be listed in one string, separated by `/`. " + "[legate-only, not supported with standard Python invocation]", ) @@ -78,7 +89,8 @@ "--gpu-bind", help="GPUs to bind each rank to. Use comma-separated integer IDs as " "reported by `nvidia-smi`. 
Binding instructions for all ranks " - "should be listed in one string, separated by `/`.", + "should be listed in one string, separated by `/`. " + "[legate-only, not supported with standard Python invocation]", ) @@ -86,11 +98,12 @@ "--nic-bind", help="NICs to bind each rank to. Use comma-separated device names as " "appropriate for the network in use. Binding instructions for all ranks " - "should be listed in one string, separated by `/`.", + "should be listed in one string, separated by `/`. " + "[legate-only, not supported with standard Python invocation]", ) -core = parser.add_argument_group("Core alloction") +core = parser.add_argument_group("Core allocation") core.add_argument(CPUS.name, **CPUS.kwargs) core.add_argument(GPUS.name, **GPUS.kwargs) core.add_argument(OMPS.name, **OMPS.kwargs) @@ -98,7 +111,7 @@ core.add_argument(UTILITY.name, **UTILITY.kwargs) -memory = parser.add_argument_group("Memory alloction") +memory = parser.add_argument_group("Memory allocation") memory.add_argument(SYSMEM.name, **SYSMEM.kwargs) memory.add_argument(NUMAMEM.name, **NUMAMEM.kwargs) memory.add_argument(FBMEM.name, **FBMEM.kwargs) @@ -134,7 +147,8 @@ dest="cprofile", action="store_true", required=False, - help="profile Python execution with the cprofile module", + help="profile Python execution with the cprofile module, " + "[legate-only, not supported with standard Python invocation]", ) @@ -143,7 +157,8 @@ dest="nvprof", action="store_true", required=False, - help="run Legate with nvprof", + help="run Legate with nvprof, " + "[legate-only, not supported with standard Python invocation]", ) @@ -152,7 +167,8 @@ dest="nsys", action="store_true", required=False, - help="run Legate with Nsight Systems", + help="run Legate with Nsight Systems, " + "[legate-only, not supported with standard Python invocation]", ) @@ -161,7 +177,8 @@ dest="nsys_targets", default="cublas,cuda,cudnn,nvtx,ucx", required=False, - help="Specify profiling targets for Nsight Systems", + help="Specify 
profiling targets for Nsight Systems, " + "[legate-only, not supported with standard Python invocation]", ) @@ -173,7 +190,8 @@ required=False, help="Specify extra flags for Nsight Systems (can appear more than once). " "Multiple arguments may be provided together in a quoted string " - "(arguments with spaces inside must be additionally quoted)", + "(arguments with spaces inside must be additionally quoted), " + "[legate-only, not supported with standard Python invocation]", ) logging = parser.add_argument_group("Logging") @@ -223,7 +241,8 @@ dest="gdb", action="store_true", required=False, - help="run Legate inside gdb", + help="run Legate inside gdb, " + "[legate-only, not supported with standard Python invocation]", ) @@ -232,7 +251,8 @@ dest="cuda_gdb", action="store_true", required=False, - help="run Legate inside cuda-gdb", + help="run Legate inside cuda-gdb, " + "[legate-only, not supported with standard Python invocation]", ) @@ -241,7 +261,8 @@ dest="memcheck", action="store_true", required=False, - help="run Legate with cuda-memcheck", + help="run Legate with cuda-memcheck, " + "[legate-only, not supported with standard Python invocation]", ) @@ -318,7 +339,8 @@ dest="bind_detail", action="store_true", required=False, - help="print out the final invocation run by bind.sh", + help="print out the final invocation run by bind.sh, " + "[legate-only, not supported with standard Python invocation]", ) @@ -330,7 +352,8 @@ dest="module", default=None, required=False, - help="Specify a Python module to load before running", + help="Specify a Python module to load before running, " + "[legate-only, not supported with standard Python invocation]", ) @@ -348,7 +371,8 @@ dest="rlwrap", action="store_true", required=False, - help="Whether to run with rlwrap to improve readline ability", + help="Whether to run with rlwrap to improve readline ability, " + "[legate-only, not supported with standard Python invocation]", ) other.add_argument( @@ -358,3 +382,9 @@ 
required=False, help="Whether to use color terminal output (if colorama is installed)", ) + +other.add_argument( + "--version", + action="version", + version=__version__, +) diff --git a/legate/driver/command.py b/legate/driver/command.py index 2fc8cc1ed..b4ab348fe 100644 --- a/legate/driver/command.py +++ b/legate/driver/command.py @@ -16,6 +16,7 @@ from typing import TYPE_CHECKING +from .. import install_info from ..util.ui import warn if TYPE_CHECKING: @@ -24,7 +25,7 @@ from .config import ConfigProtocol from .launcher import Launcher -__all__ = ("CMD_PARTS",) +__all__ = ("CMD_PARTS_LEGION", "CMD_PARTS_CANONICAL") # this will be replaced by bind.sh with the actual computed rank at runtime @@ -182,11 +183,18 @@ def cmd_legion( return (str(system.legion_paths.legion_python),) -def cmd_processor( +def cmd_python_processor( config: ConfigProtocol, system: System, launcher: Launcher ) -> CommandPart: - # We always need one python processor per rank and no local fields - return ("-ll:py", "1", "-lg:local", "0") + # We always need one python processor per rank + return ("-ll:py", "1") + + +def cmd_local_field( + config: ConfigProtocol, system: System, launcher: Launcher +) -> CommandPart: + # We always need no local fields + return ("-lg:local", "0") def cmd_kthreads( @@ -251,26 +259,37 @@ def cmd_openmp( ) -def cmd_utility( +def cmd_bgwork( config: ConfigProtocol, system: System, launcher: Launcher ) -> CommandPart: - utility = config.core.utility ranks = config.multi_node.ranks + utility = config.core.utility - if utility == 1: - return () - - opts: CommandPart = ("-ll:util", str(utility)) + opts: CommandPart = () # If we are running multi-rank then make the number of active # message handler threads equal to our number of utility # processors in order to prevent head-of-line blocking if ranks > 1: - opts += ("-ll:bgwork", str(utility)) + opts += ("-ll:bgwork", str(max(utility, 2))) + + if ranks > 1 and "ucx" in install_info.networks: + opts += ("-ll:bgworkpin", "1") 
return opts +def cmd_utility( + config: ConfigProtocol, system: System, launcher: Launcher +) -> CommandPart: + utility = config.core.utility + + if utility == 1: + return () + + return ("-ll:util", str(utility)) + + def cmd_mem( config: ConfigProtocol, system: System, launcher: Launcher ) -> CommandPart: @@ -347,7 +366,7 @@ def cmd_log_file( log_to_file = config.logging.log_to_file if log_to_file: - return ("-logfile", str(log_dir / "legate_%.log")) + return ("-logfile", str(log_dir / "legate_%.log"), "-errlevel", "4") return () @@ -363,7 +382,13 @@ def cmd_eager_alloc( def cmd_ucx( config: ConfigProtocol, system: System, launcher: Launcher ) -> CommandPart: - return ("-ucx:tls_host", "^dc,ud") + return ("-ucx:tls_host", "rc,tcp,cuda_copy,cuda_ipc,sm,self") + + +def cmd_user_script( + config: ConfigProtocol, system: System, launcher: Launcher +) -> CommandPart: + return () if config.user_script is None else (config.user_script,) def cmd_user_opts( @@ -372,27 +397,17 @@ def cmd_user_opts( return config.user_opts -CMD_PARTS = ( - cmd_bind, - cmd_rlwrap, - cmd_gdb, - cmd_cuda_gdb, - cmd_nvprof, - cmd_nsys, - # Add memcheck right before the binary - cmd_memcheck, - # Now we're ready to build the actual command to run - cmd_legion, +_CMD_PARTS_SHARED = ( # This has to go before script name cmd_nocr, - cmd_module, - cmd_processor, + cmd_local_field, cmd_kthreads, # Translate the requests to Realm command line parameters cmd_cpus, cmd_gpus, cmd_openmp, cmd_utility, + cmd_bgwork, cmd_mem, cmd_numamem, cmd_fbmem, @@ -402,6 +417,41 @@ def cmd_user_opts( cmd_log_file, cmd_eager_alloc, cmd_ucx, - # Append user flags so they can override whatever we provided - cmd_user_opts, +) + +CMD_PARTS_LEGION = ( + ( + cmd_bind, + cmd_rlwrap, + cmd_gdb, + cmd_cuda_gdb, + cmd_nvprof, + cmd_nsys, + # Add memcheck right before the binary + cmd_memcheck, + # Now we're ready to build the actual command to run + cmd_legion, + # This has to go before script name + cmd_python_processor, + 
cmd_module, + ) + + _CMD_PARTS_SHARED + + ( + # User script + cmd_user_script, + # Append user flags so they can override whatever we provided + cmd_user_opts, + ) +) + +CMD_PARTS_CANONICAL = ( + ( + # User script + cmd_user_script, + ) + + _CMD_PARTS_SHARED + + ( + # Append user flags so they can override whatever we provided + cmd_user_opts, + ) ) diff --git a/legate/driver/config.py b/legate/driver/config.py index 0be36f959..de0394a7d 100644 --- a/legate/driver/config.py +++ b/legate/driver/config.py @@ -22,7 +22,7 @@ from dataclasses import dataclass from functools import cached_property from pathlib import Path -from typing import Any, Protocol +from typing import Any, Optional, Protocol from ..util import colors from ..util.types import ( @@ -146,11 +146,11 @@ class Other(DataclassMixin): class ConfigProtocol(Protocol): - _args: Namespace argv: ArgList + user_script: Optional[str] user_opts: tuple[str, ...] multi_node: MultiNode binding: Binding @@ -177,14 +177,15 @@ class Config: def __init__(self, argv: ArgList) -> None: self.argv = argv - args, extra = parser.parse_known_args(self.argv[1:]) + args = parser.parse_args(self.argv[1:]) colors.ENABLED = args.color # only saving this for help with testing self._args = args - self.user_opts = tuple(extra) + self.user_script = args.command[0] if args.command else None + self.user_opts = tuple(args.command[1:]) if self.user_script else () # these may modify the args, so apply before dataclass conversions self._fixup_nocr(args) @@ -203,7 +204,7 @@ def __init__(self, argv: ArgList) -> None: @cached_property def console(self) -> bool: """Whether we are starting Legate as an interactive console.""" - return not any(opt.endswith(".py") for opt in self.user_opts) + return self.user_script is None def _fixup_nocr(self, args: Namespace) -> None: # this is slightly duplicative of MultiNode.ranks property, but fixup diff --git a/legate/driver/driver.py b/legate/driver/driver.py index 7f3e17d33..a534bf09f 100644 --- 
a/legate/driver/driver.py +++ b/legate/driver/driver.py @@ -14,22 +14,24 @@ # from __future__ import annotations +from dataclasses import dataclass from shlex import quote from subprocess import run from textwrap import indent from typing import TYPE_CHECKING from ..util.system import System +from ..util.types import DataclassMixin from ..util.ui import kvtable, rule, section, value, warn -from .command import CMD_PARTS +from .command import CMD_PARTS_CANONICAL, CMD_PARTS_LEGION from .config import ConfigProtocol -from .launcher import Launcher +from .launcher import Launcher, SimpleLauncher from .logs import process_logs if TYPE_CHECKING: from ..util.types import Command, EnvDict -__all__ = ("Driver", "print_verbose") +__all__ = ("LegateDriver", "CanonicalDriver", "print_verbose") _DARWIN_GDB_WARN = """\ You must start the debugging session with the following command, @@ -41,7 +43,14 @@ """ -class Driver: +@dataclass(frozen=True) +class LegateVersions(DataclassMixin): + """Collect package versions relevant to Legate.""" + + legate_version: str + + +class LegateDriver: """Coordinate the system, user-configuration, and launcher to appropriately execute the Legate process. @@ -65,7 +74,7 @@ def cmd(self) -> Command: launcher = self.launcher system = self.system - parts = (part(config, system, launcher) for part in CMD_PARTS) + parts = (part(config, system, launcher) for part in CMD_PARTS_LEGION) return launcher.cmd + sum(parts, ()) @property @@ -83,12 +92,13 @@ def custom_env_vars(self) -> set[str]: # in case we want to augment the launcher env we could do it here return self.launcher.custom_env_vars - def run(self) -> int: - """Run the Legate process. + @property + def dry_run(self) -> bool: + """Check verbose and dry run. 
Returns ------- - int : process return code + bool : whether dry run is enabled """ if self.config.info.verbose: @@ -101,7 +111,17 @@ def run(self) -> int: self._darwin_gdb_warn() - if self.config.other.dry_run: + return self.config.other.dry_run + + def run(self) -> int: + """Run the Legate process. + + Returns + ------- + int : process return code + + """ + if self.dry_run: return 0 with process_logs(self.config, self.system, self.launcher): @@ -122,9 +142,55 @@ def _darwin_gdb_warn(self) -> None: ) +class CanonicalDriver(LegateDriver): + """Coordinate the system, user-configuration, and launcher to appropriately + execute the Legate process. + + Parameters + ---------- + config : Config + + system : System + + """ + + def __init__(self, config: ConfigProtocol, system: System) -> None: + self.config = config + self.system = system + self.launcher = SimpleLauncher(config, system) + + @property + def cmd(self) -> Command: + """The full command invocation that should be used to start Legate.""" + config = self.config + launcher = self.launcher + system = self.system + + parts = ( + part(config, system, launcher) for part in CMD_PARTS_CANONICAL + ) + return sum(parts, ()) + + def run(self) -> int: + """Run the Legate process. + + Returns + ------- + int : process return code + + """ + assert False, "This function should not be invoked." + + +def get_versions() -> LegateVersions: + from legate import __version__ as lg_version + + return LegateVersions(legate_version=lg_version) + + def print_verbose( system: System, - driver: Driver | None = None, + driver: LegateDriver | None = None, ) -> None: """Print system and driver configuration values. 
@@ -151,6 +217,9 @@ def print_verbose( print(section("\nLegion paths:")) print(indent(str(system.legion_paths), prefix=" ")) + print(section("\nVersions:")) + print(indent(str(get_versions()), prefix=" ")) + if driver: print(section("\nCommand:")) cmd = " ".join(quote(t) for t in driver.cmd) diff --git a/legate/driver/main.py b/legate/driver/main.py index 2ca3f04be..bb02c7cbb 100644 --- a/legate/driver/main.py +++ b/legate/driver/main.py @@ -17,32 +17,26 @@ """ from __future__ import annotations -__all__ = ("main",) +from typing import Type, Union +from . import CanonicalDriver, LegateDriver -def main(argv: list[str]) -> int: - """A main function for the Legate driver that can be used programmatically - or by entry-points. +__all__ = ("legate_main",) - Parameters - ---------- - argv : list[str] - Command-line arguments to start the Legate driver with - Returns - ------- - int, a process return code - - """ +def prepare_driver( + argv: list[str], + driver_type: Union[Type[CanonicalDriver], Type[LegateDriver]], +) -> Union[CanonicalDriver, LegateDriver]: from ..util.system import System from ..util.ui import error - from . import Config, Driver + from . import Config from .driver import print_verbose try: config = Config(argv) except Exception as e: - print(error("Could not configure Legate driver:\n")) + print(error("Could not configure driver:\n")) raise e try: @@ -52,11 +46,29 @@ def main(argv: list[str]) -> int: raise e try: - driver = Driver(config, system) + driver = driver_type(config, system) except Exception as e: - msg = "Could not initialize Legate driver, path config and exception follow:" # noqa + msg = "Could not initialize driver, path config and exception follow:" # noqa print(error(msg)) print_verbose(system) raise e + return driver + + +def legate_main(argv: list[str]) -> int: + """A main function for the Legate driver that can be used programmatically + or by entry-points. 
+ + Parameters + ---------- + argv : list[str] + Command-line arguments to start the Legate driver with + + Returns + ------- + int, a process return code + + """ + driver = prepare_driver(argv, LegateDriver) return driver.run() diff --git a/legate/install_info.py.in b/legate/install_info.py.in index 4f0416979..bc38c4083 100644 --- a/legate/install_info.py.in +++ b/legate/install_info.py.in @@ -42,3 +42,5 @@ def get_libpath(): libpath: str = get_libpath() header: str = """@header@""" + +networks: list[str] = "@Legion_NETWORKS@".split() diff --git a/legate/jupyter/config.py b/legate/jupyter/config.py index 2acbc6dcb..ebec279be 100644 --- a/legate/jupyter/config.py +++ b/legate/jupyter/config.py @@ -19,6 +19,7 @@ from dataclasses import dataclass from pathlib import Path +from typing import Optional import legate.util.colors as colors from legate.driver.config import ( @@ -79,6 +80,7 @@ def __init__(self, argv: ArgList) -> None: self.memory = object_to_dataclass(args, Memory) # turn everything else off + self.user_script: Optional[str] = None self.user_opts: tuple[str, ...] 
= () self.binding = Binding(None, None, None, None) self.profiling = Profiling(False, False, False, False, "", []) diff --git a/legate/jupyter/kernel.py b/legate/jupyter/kernel.py index daadae9ff..ae371e28f 100644 --- a/legate/jupyter/kernel.py +++ b/legate/jupyter/kernel.py @@ -31,7 +31,7 @@ NoSuchKernel, ) -from legate.driver import Driver +from legate.driver import LegateDriver from legate.jupyter.config import Config from legate.util.types import ArgList from legate.util.ui import error @@ -48,7 +48,7 @@ class LegateMetadata(TypedDict): LEGATE_JUPYTER_METADATA_KEY: Literal["legate"] = "legate" -def generate_kernel_spec(driver: Driver, config: Config) -> KernelSpec: +def generate_kernel_spec(driver: LegateDriver, config: Config) -> KernelSpec: legion_kernel = Path(__file__).parent / "_legion_kernel.py" argv = list(driver.cmd) + [str(legion_kernel), "-f", "{connection_file}"] diff --git a/legate/jupyter/main.py b/legate/jupyter/main.py index 494fdf421..d287022d3 100644 --- a/legate/jupyter/main.py +++ b/legate/jupyter/main.py @@ -16,7 +16,7 @@ # from __future__ import annotations -from legate.driver import Driver +from legate.driver import LegateDriver from legate.jupyter.config import Config from legate.jupyter.kernel import generate_kernel_spec, install_kernel_spec from legate.util.system import System @@ -28,7 +28,7 @@ def main(argv: list[str]) -> int: config = Config(argv) system = System() - driver = Driver(config, system) + driver = LegateDriver(config, system) spec = generate_kernel_spec(driver, config) diff --git a/legate/rc.py b/legate/rc.py deleted file mode 100644 index bd4abca51..000000000 --- a/legate/rc.py +++ /dev/null @@ -1,56 +0,0 @@ -# Copyright 2021-2022 NVIDIA Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -from __future__ import annotations - -LEGION_WARNING = """ - -All Legate programs must be run with a legion_python interperter. We -recommend that you use the Legate driver script "bin/legate" found -in the installation directory to launch Legate programs as it -provides easy-to-use flags for invoking legion_python. You can see -options for using the driver script with "bin/legate --help". You -can also invoke legion_python directly. - -Use "bin/legate --verbose ..." to see some examples of how to call -legion_python directly. -""" - -# TODO (bv) temp transitive imports until cunumeric is updated -from .util.args import ( # noqa - ArgSpec, - Argument, - parse_library_command_args as parse_command_args, -) - - -def has_legion_context() -> bool: - """Determine whether we are running in legion_python. - - Returns - bool : True if running in legion_python, otherwise False - - """ - try: - from legion_cffi import lib - - return lib.legion_runtime_has_context() - except (ModuleNotFoundError, AttributeError): - return False - - -def check_legion(msg: str = LEGION_WARNING) -> None: - """Raise an error if we are not running in legion_python.""" - if not has_legion_context(): - raise RuntimeError(msg) diff --git a/legate/settings.py b/legate/settings.py new file mode 100644 index 000000000..c3b89ece0 --- /dev/null +++ b/legate/settings.py @@ -0,0 +1,62 @@ +# Copyright 2023 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +from __future__ import annotations + +from .util.settings import PrioritizedSetting, Settings, convert_bool + +__all__ = ("settings",) + + +class LegateRuntimeSettings(Settings): + consensus: PrioritizedSetting[bool] = PrioritizedSetting( + "consensus", + "LEGATE_CONSENSUS", + default=False, + convert=convert_bool, + help=""" + Whether to enable consensus match on single node (for testing). + """, + ) + + cycle_check: PrioritizedSetting[bool] = PrioritizedSetting( + "cycle_check", + "LEGATE_CYCLE_CHECK", + default=False, + convert=convert_bool, + help=""" + Whether to check for reference cycles involving RegionField objects on + exit (developer option). When such cycles arise during execution they + stop used RegionFields from being collected and reused for new Stores, + thus increasing memory pressure. By default this check will miss any + RegionField cycles the garbage collector collected during execution. + + Run gc.disable() at the beginning of the program to avoid this. + """, + ) + + future_leak_check: PrioritizedSetting[bool] = PrioritizedSetting( + "future_leak_check", + "LEGATE_FUTURE_LEAK_CHECK", + default=False, + convert=convert_bool, + help=""" + Whether to check for reference cycles keeping Future/FutureMap objects + alive after Legate runtime exit (developer option). Such leaks can + result in Legion runtime shutdown hangs. 
+ """, + ) + + +settings = LegateRuntimeSettings() diff --git a/legate/tester/__init__.py b/legate/tester/__init__.py index 045eca19d..29f8d0b20 100644 --- a/legate/tester/__init__.py +++ b/legate/tester/__init__.py @@ -29,7 +29,7 @@ ] #: Value to use if --cpus is not specified. -DEFAULT_CPUS_PER_NODE = 4 +DEFAULT_CPUS_PER_NODE = 2 #: Value to use if --gpus is not specified. DEFAULT_GPUS_PER_NODE = 1 @@ -46,6 +46,9 @@ #: Value to use if --ompthreads is not specified. DEFAULT_OMPTHREADS = 4 +#: Value to use if --numamem is not specified. +DEFAULT_NUMAMEM = 0 + #: Default values to apply to normalize the testing environment. DEFAULT_PROCESS_ENV = { "LEGATE_TEST": "1", diff --git a/legate/tester/args.py b/legate/tester/args.py index 4b24077a0..5119a1994 100644 --- a/legate/tester/args.py +++ b/legate/tester/args.py @@ -28,6 +28,7 @@ DEFAULT_GPU_DELAY, DEFAULT_GPU_MEMORY_BUDGET, DEFAULT_GPUS_PER_NODE, + DEFAULT_NUMAMEM, DEFAULT_OMPS_PER_NODE, DEFAULT_OMPTHREADS, FEATURES, @@ -161,6 +162,15 @@ ) +feature_opts.add_argument( + "--numamem", + dest="numamem", + type=int, + default=DEFAULT_NUMAMEM, + help="NUMA memory for OpenMP processors (MB)", +) + + test_opts = parser.add_argument_group("Test run configuration options") diff --git a/legate/tester/config.py b/legate/tester/config.py index 39441e433..d943b4b14 100644 --- a/legate/tester/config.py +++ b/legate/tester/config.py @@ -63,6 +63,7 @@ def __init__(self, argv: ArgList) -> None: self.fbmem = args.fbmem self.gpu_delay = args.gpu_delay self.ompthreads = args.ompthreads + self.numamem = args.numamem # test run configuration self.debug = args.debug diff --git a/legate/tester/stages/_linux/cpu.py b/legate/tester/stages/_linux/cpu.py index deb5610a6..8c17343c7 100644 --- a/legate/tester/stages/_linux/cpu.py +++ b/legate/tester/stages/_linux/cpu.py @@ -19,7 +19,7 @@ from ..test_stage import TestStage from ..util import ( - CUNUMERIC_TEST_ARG, + CUNUMERIC_TEST_ENV, UNPIN_ENV, Shard, StageSpec, @@ -48,13 +48,15 @@ class 
CPU(TestStage): kind: FeatureType = "cpus" - args = [CUNUMERIC_TEST_ARG] + args: ArgList = [] def __init__(self, config: Config, system: TestSystem) -> None: self._init(config, system) def env(self, config: Config, system: TestSystem) -> EnvDict: - return {} if config.cpu_pin == "strict" else dict(UNPIN_ENV) + env = {} if config.cpu_pin == "strict" else dict(UNPIN_ENV) + env.update(CUNUMERIC_TEST_ENV) + return env def shard_args(self, shard: Shard, config: Config) -> ArgList: args = [ diff --git a/legate/tester/stages/_linux/eager.py b/legate/tester/stages/_linux/eager.py index cc9a08d5a..12a19c24b 100644 --- a/legate/tester/stages/_linux/eager.py +++ b/legate/tester/stages/_linux/eager.py @@ -17,7 +17,7 @@ from typing import TYPE_CHECKING from ..test_stage import TestStage -from ..util import Shard, StageSpec, adjust_workers +from ..util import EAGER_ENV, Shard, StageSpec, adjust_workers if TYPE_CHECKING: from ....util.types import ArgList, EnvDict @@ -47,12 +47,7 @@ def __init__(self, config: Config, system: TestSystem) -> None: self._init(config, system) def env(self, config: Config, system: TestSystem) -> EnvDict: - # Raise min chunk sizes for deferred codepaths to force eager execution - env = { - "CUNUMERIC_MIN_CPU_CHUNK": "2000000000", - "CUNUMERIC_MIN_OMP_CHUNK": "2000000000", - "CUNUMERIC_MIN_GPU_CHUNK": "2000000000", - } + env = dict(EAGER_ENV) return env def shard_args(self, shard: Shard, config: Config) -> ArgList: diff --git a/legate/tester/stages/_linux/gpu.py b/legate/tester/stages/_linux/gpu.py index 64f625c00..751118dcf 100644 --- a/legate/tester/stages/_linux/gpu.py +++ b/legate/tester/stages/_linux/gpu.py @@ -18,7 +18,7 @@ from typing import TYPE_CHECKING from ..test_stage import TestStage -from ..util import CUNUMERIC_TEST_ARG, Shard, StageSpec, adjust_workers +from ..util import CUNUMERIC_TEST_ENV, Shard, StageSpec, adjust_workers if TYPE_CHECKING: from ....util.types import ArgList, EnvDict @@ -44,13 +44,13 @@ class GPU(TestStage): kind: 
FeatureType = "cuda" - args = [CUNUMERIC_TEST_ARG] + args: ArgList = [] def __init__(self, config: Config, system: TestSystem) -> None: self._init(config, system) def env(self, config: Config, system: TestSystem) -> EnvDict: - return {} + return dict(CUNUMERIC_TEST_ENV) def delay(self, shard: Shard, config: Config, system: TestSystem) -> None: time.sleep(config.gpu_delay / 1000) diff --git a/legate/tester/stages/_linux/omp.py b/legate/tester/stages/_linux/omp.py index f7af3e9d0..09101bafd 100644 --- a/legate/tester/stages/_linux/omp.py +++ b/legate/tester/stages/_linux/omp.py @@ -19,7 +19,7 @@ from ..test_stage import TestStage from ..util import ( - CUNUMERIC_TEST_ARG, + CUNUMERIC_TEST_ENV, UNPIN_ENV, Shard, StageSpec, @@ -48,13 +48,15 @@ class OMP(TestStage): kind: FeatureType = "openmp" - args = [CUNUMERIC_TEST_ARG] + args: ArgList = [] def __init__(self, config: Config, system: TestSystem) -> None: self._init(config, system) def env(self, config: Config, system: TestSystem) -> EnvDict: - return {} if config.cpu_pin == "strict" else dict(UNPIN_ENV) + env = {} if config.cpu_pin == "strict" else dict(UNPIN_ENV) + env.update(CUNUMERIC_TEST_ENV) + return env def shard_args(self, shard: Shard, config: Config) -> ArgList: args = [ @@ -62,6 +64,8 @@ def shard_args(self, shard: Shard, config: Config) -> ArgList: str(config.omps), "--ompthreads", str(config.ompthreads), + "--numamem", + str(config.numamem), ] if config.cpu_pin != "none": args += [ diff --git a/legate/tester/stages/_osx/cpu.py b/legate/tester/stages/_osx/cpu.py index 182a6d76b..e911892de 100644 --- a/legate/tester/stages/_osx/cpu.py +++ b/legate/tester/stages/_osx/cpu.py @@ -18,7 +18,7 @@ from ..test_stage import TestStage from ..util import ( - CUNUMERIC_TEST_ARG, + CUNUMERIC_TEST_ENV, UNPIN_ENV, Shard, StageSpec, @@ -47,13 +47,15 @@ class CPU(TestStage): kind: FeatureType = "cpus" - args = [CUNUMERIC_TEST_ARG] + args: ArgList = [] def __init__(self, config: Config, system: TestSystem) -> None: 
self._init(config, system) def env(self, config: Config, system: TestSystem) -> EnvDict: - return UNPIN_ENV + env = dict(UNPIN_ENV) + env.update(CUNUMERIC_TEST_ENV) + return env def shard_args(self, shard: Shard, config: Config) -> ArgList: return ["--cpus", str(config.cpus)] diff --git a/legate/tester/stages/_osx/eager.py b/legate/tester/stages/_osx/eager.py index b32feb17d..4cb0be16b 100644 --- a/legate/tester/stages/_osx/eager.py +++ b/legate/tester/stages/_osx/eager.py @@ -17,7 +17,7 @@ from typing import TYPE_CHECKING from ..test_stage import TestStage -from ..util import UNPIN_ENV, Shard, StageSpec, adjust_workers +from ..util import EAGER_ENV, UNPIN_ENV, Shard, StageSpec, adjust_workers if TYPE_CHECKING: from ....util.types import ArgList, EnvDict @@ -47,12 +47,7 @@ def __init__(self, config: Config, system: TestSystem) -> None: self._init(config, system) def env(self, config: Config, system: TestSystem) -> EnvDict: - # Raise min chunk sizes for deferred codepaths to force eager execution - env = { - "CUNUMERIC_MIN_CPU_CHUNK": "2000000000", - "CUNUMERIC_MIN_OMP_CHUNK": "2000000000", - "CUNUMERIC_MIN_GPU_CHUNK": "2000000000", - } + env = dict(EAGER_ENV) env.update(UNPIN_ENV) return env diff --git a/legate/tester/stages/_osx/gpu.py b/legate/tester/stages/_osx/gpu.py index 2a1597494..1e54ba737 100644 --- a/legate/tester/stages/_osx/gpu.py +++ b/legate/tester/stages/_osx/gpu.py @@ -18,7 +18,7 @@ from typing import TYPE_CHECKING from ..test_stage import TestStage -from ..util import CUNUMERIC_TEST_ARG, UNPIN_ENV, Shard +from ..util import CUNUMERIC_TEST_ENV, UNPIN_ENV, Shard if TYPE_CHECKING: from ....util.types import ArgList, EnvDict @@ -42,13 +42,15 @@ class GPU(TestStage): kind: FeatureType = "cuda" - args: ArgList = [CUNUMERIC_TEST_ARG] + args: ArgList = [] def __init__(self, config: Config, system: TestSystem) -> None: raise RuntimeError("GPU test are not supported on OSX") def env(self, config: Config, system: TestSystem) -> EnvDict: - return UNPIN_ENV + 
env = dict(UNPIN_ENV) + env.update(CUNUMERIC_TEST_ENV) + return env def delay(self, shard: Shard, config: Config, system: TestSystem) -> None: time.sleep(config.gpu_delay / 1000) diff --git a/legate/tester/stages/_osx/omp.py b/legate/tester/stages/_osx/omp.py index eb279791a..1d1a8d24b 100644 --- a/legate/tester/stages/_osx/omp.py +++ b/legate/tester/stages/_osx/omp.py @@ -18,7 +18,7 @@ from ..test_stage import TestStage from ..util import ( - CUNUMERIC_TEST_ARG, + CUNUMERIC_TEST_ENV, UNPIN_ENV, Shard, StageSpec, @@ -47,13 +47,15 @@ class OMP(TestStage): kind: FeatureType = "openmp" - args = [CUNUMERIC_TEST_ARG] + args: ArgList = [] def __init__(self, config: Config, system: TestSystem) -> None: self._init(config, system) def env(self, config: Config, system: TestSystem) -> EnvDict: - return UNPIN_ENV + env = dict(UNPIN_ENV) + env.update(CUNUMERIC_TEST_ENV) + return env def shard_args(self, shard: Shard, config: Config) -> ArgList: return [ diff --git a/legate/tester/stages/test_stage.py b/legate/tester/stages/test_stage.py index ed24ae461..e9080394c 100644 --- a/legate/tester/stages/test_stage.py +++ b/legate/tester/stages/test_stage.py @@ -252,12 +252,17 @@ def run( cov_args = self.cov_args(config) - cmd = [str(config.legate_path)] + cov_args + [str(test_path)] - stage_args = self.args + self.shard_args(shard, config) file_args = self.file_args(test_file, config) - cmd += stage_args + file_args + config.extra_args + cmd = ( + [str(config.legate_path)] + + stage_args + + cov_args + + [str(test_path)] + + file_args + + config.extra_args + ) if custom_args: cmd += custom_args @@ -285,7 +290,6 @@ def _init(self, config: Config, system: TestSystem) -> None: def _launch( self, config: Config, system: TestSystem ) -> list[ProcessResult]: - pool = multiprocessing.pool.ThreadPool(self.spec.workers) jobs = [ diff --git a/legate/tester/stages/util.py b/legate/tester/stages/util.py index 27d53bbd1..fb38d77c1 100644 --- a/legate/tester/stages/util.py +++ 
b/legate/tester/stages/util.py @@ -25,10 +25,19 @@ from ..logger import LOG from ..test_system import ProcessResult -CUNUMERIC_TEST_ARG = "-cunumeric:test" - UNPIN_ENV = {"REALM_SYNTHETIC_CORE_MAP": ""} +CUNUMERIC_TEST_ENV = {"CUNUMERIC_TEST": "1"} + +# Raise min chunk sizes for deferred codepaths to force eager execution +EAGER_ENV = { + "CUNUMERIC_TEST": "0", + "CUNUMERIC_MIN_CPU_CHUNK": "2000000000", + "CUNUMERIC_MIN_OMP_CHUNK": "2000000000", + "CUNUMERIC_MIN_GPU_CHUNK": "2000000000", +} + + Shard: TypeAlias = Tuple[int, ...] diff --git a/legate/tester/test_system.py b/legate/tester/test_system.py index 2c4e9949f..9f416782b 100644 --- a/legate/tester/test_system.py +++ b/legate/tester/test_system.py @@ -33,7 +33,6 @@ @dataclass class ProcessResult: - #: The command invovation, including relevant environment vars invocation: str diff --git a/legate/util/args.py b/legate/util/args.py index 4485d6db2..d23dda932 100644 --- a/legate/util/args.py +++ b/legate/util/args.py @@ -14,9 +14,6 @@ # from __future__ import annotations -import re -import sys -import warnings from argparse import Action, ArgumentParser, Namespace from dataclasses import dataclass, fields from typing import ( @@ -144,67 +141,3 @@ def __call__( items.append(values) # removing any duplicates before storing setattr(namespace, self.dest, list(set(items))) - - -def parse_library_command_args( - libname: str, args: Iterable[Argument] -) -> Namespace: - """ """ - if not libname.isidentifier(): - raise ValueError( - f"Invalid library {libname!r} for command line arguments" - ) - - parser = ArgumentParser( - prog=f"<{libname} program>", add_help=False, allow_abbrev=False - ) - - # Some explanation is in order. Argparse treats arguments with a single - # dash differently, e.g. "-xyz" is interpreted as "-x -y -z". This can - # cause confusion and clashes when there are multiple single-dash args - # with identical prefixes. TLDR; we want "-legate:foo" to behave just - # as if it was "--legate:foo". 
In order to do this, we configure a parser - # for "long argumens" and then munge the values in sys.argv to update - # any "short prefix" arguments to be "long prefix" arguments first, before - # parsing. We also take care to update any output. The alternative here - # would be to abandon argparse entirely, and parse sys.argv manually. - # - # ref: https://github.com/nv-legate/legate.core/issues/415 - - short_prefix = f"-{libname}:" - long_prefix = f"-{short_prefix}" - - argnames = [arg.name for arg in args] - - for arg in args: - argname = f"{long_prefix}{arg.name}" - parser.add_argument(argname, **arg.kwargs) - - has_custom_help = "help" in argnames - - if f"{short_prefix}help" in sys.argv and not has_custom_help: - help_string = parser.format_help() - - # this is a little sloppy but should suffice in practice - print(help_string.replace(long_prefix, short_prefix)) - - sys.exit() - - # convert any short-prefix args to be long-prefix - sys.argv = [re.sub(f"^{short_prefix}", long_prefix, x) for x in sys.argv] - - args, extra = parser.parse_known_args() - - # put any unconsumed args back they way they were - extra = [re.sub(f"^{long_prefix}", short_prefix, x) for x in extra] - - for item in extra: - if item.startswith(short_prefix): - warnings.warn( - f"Unrecognized argument {item!r} for {libname} (passed on as-is)" # noqa: E501 - ) - break - - sys.argv = sys.argv[:1] + extra - - return args diff --git a/legate/util/colors.py b/legate/util/colors.py index 6c417c221..547f0e015 100644 --- a/legate/util/colors.py +++ b/legate/util/colors.py @@ -94,7 +94,6 @@ def yellow(text: str) -> str: colorama.init() except ImportError: - bright = dim = white = cyan = red = magenta = green = yellow = _text # ref: https://stackoverflow.com/a/14693789 diff --git a/legate/util/fs.py b/legate/util/fs.py index 4b7465799..ffc5f5ab8 100644 --- a/legate/util/fs.py +++ b/legate/util/fs.py @@ -114,7 +114,6 @@ def get_legate_build_dir(legate_dir: Path) -> Path | None: return None for f in 
skbuild_dir.iterdir(): - # If using a local scikit-build dir at _skbuild//cmake-build, # read Legion_BINARY_DIR and Legion_SOURCE_DIR from CMakeCache.txt diff --git a/legate/util/settings.py b/legate/util/settings.py new file mode 100644 index 000000000..c4ef60899 --- /dev/null +++ b/legate/util/settings.py @@ -0,0 +1,310 @@ +# Copyright 2023 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +""" Control global configuration options with environment variables. + +Precedence +~~~~~~~~~~ + +Setting values are always looked up in the following prescribed order: + +immediately supplied values + These are values that are passed to the setting: + + .. code-block:: python + + settings.consensus(value) + + If ``value`` is not None, then it will be returned, as-is. Otherwise, if + None is passed, then the setting will continue to look down the search + order for a value. This is useful for passing optional function parameters + that are None by default. If the parameter is passed to the function, then + it will take precedence. + +previously user-set values + If the value is set explicitly in code: + + .. code-block:: python + + settings.minified = False + + Then this value will take precedence over other sources. Applications + may use this ability to set values supplied on the command line, so that + they take precedence over environment variables. + +environment variable + Values found in the associated environment variables: + + .. 
code-block:: sh + + LEGATE_CONSENSUS=yes legate script.py + +local defaults + These are default values defined when accessing the setting: + + .. code-block:: python + + settings.consensus(default=True) + + Local defaults have lower precedence than every other setting mechanism + except global defaults. + +global defaults + These are default values defined by the setting declarations. They have + lower precedence than every other setting mechanism. + +If no value is obtained after searching all of these locations, then a +RuntimeError will be raised. + +""" +from __future__ import annotations + +import os +from typing import Any, Generic, Type, TypeVar, Union + +from typing_extensions import TypeAlias + +__all__ = ( + "convert_str", + "convert_bool", + "convert_str_seq", + "PrioritizedSetting", + "Settings", +) + + +class _Unset: + pass + + +T = TypeVar("T") + + +Unset: TypeAlias = Union[T, Type[_Unset]] + + +def convert_str(value: str) -> str: + """Return a string as-is.""" + return value + + +def convert_bool(value: bool | str) -> bool: + """Convert a string to True or False. + + If a boolean is passed in, it is returned as-is. Otherwise the function + maps the following strings, ignoring case: + + * "yes", "1", "on", "true" -> True + * "no", "0", "off", "false" -> False + + Args: + value (str): + A string value to convert to bool + + Returns: + bool + + Raises: + ValueError + + """ + if isinstance(value, bool): + return value + + val = value.lower() + if val in ("yes", "1", "on", "true"): + return True + if val in ("no", "0", "off", "false"): + return False + + raise ValueError(f"Cannot convert {value} to boolean value") + + +def convert_str_seq( + value: list[str] | tuple[str, ...] | str +) -> tuple[str, ...]: + """Convert a string to a list of strings. + + If a list or tuple is passed in, it is returned as-is. 
+ + Args: + value (seq[str] or str) : + A string to convert to a list of strings + + Returns: + tuple[str, ...] + + Raises: + ValueError + + """ + if isinstance(value, (list, tuple)): + return tuple(value) + + try: + return tuple(value.split(",")) + except Exception: + raise ValueError(f"Cannot convert {value} to list value") + + +class PrioritizedSetting(Generic[T]): + """Return a value for a global setting according to configuration + precedence. + + The following methods are searched in order for the setting: + + 4. immediately supplied values + 3. previously user-set values (e.g. set from command line) + 2. environment variable + 1. local defaults + 0. global defaults + + If a value cannot be determined, a RuntimeError is raised. + + The ``env_var`` argument specifies the name of an environment variable to + check for setting values, e.g. ``"LEGATE_CYCLE_CHECK"``. + + The optional ``default`` argument specifies an implicit default value for + the setting that is returned if no other methods provide a value. + + A ``convert`` argument may be provided to convert values before they are + returned. + """ + + _parent: Settings | None + _user_value: Unset[str | T] + + def __init__( + self, + name: str, + env_var: str | None = None, + default: Unset[T] = _Unset, + convert: Any | None = None, + help: str = "", + ) -> None: + self._convert = convert if convert else convert_str + self._default = default + self._env_var = env_var + self._help = help + self._name = name + self._parent = None + self._user_value = _Unset + + def __call__( + self, value: T | str | None = None, default: Unset[T] = _Unset + ) -> T: + """Return the setting value according to the standard precedence. + + Args: + value (any, optional): + An optional immediate value. If not None, the value will + be converted, then returned. + + default (any, optional): + An optional default value that only takes precedence over + implicit default values specified on the property itself. 
+ + Returns: + str or int or float + + Raises: + RuntimeError + """ + + # 4. immediate values + if value is not None: + return self._convert(value) + + # 3. previously user-set value + if self._user_value is not _Unset: + return self._convert(self._user_value) + + # 2. environment variable + if self._env_var and self._env_var in os.environ: + return self._convert(os.environ[self._env_var]) + + # 1. local defaults + if default is not _Unset: + return self._convert(default) + + # 0. global defaults + if self._default is not _Unset: + return self._convert(self._default) + + raise RuntimeError( + f"No configured value found for setting {self._name!r}" + ) + + def __get__( + self, instance: Any, owner: type[Any] + ) -> PrioritizedSetting[T]: + return self + + def __set__(self, instance: Any, value: str | T) -> None: + self.set_value(value) + + def set_value(self, value: str | T) -> None: + """Specify a value for this setting programmatically. + + A value set this way takes precedence over all other methods except + immediate values. + + Args: + value (str or int or float): + A user-set value for this setting + + Returns: + None + """ + # It is usually not advised to store any data directly on descriptors, + # since they are shared by all instances. But in our case we only ever + # have a single instance of a given settings object. 
+ self._user_value = value + + def unset_value(self) -> None: + """Unset the previous user value such that the priority is reset.""" + self._user_value = _Unset + + @property + def env_var(self) -> str | None: + return self._env_var + + @property + def default(self) -> Unset[T]: + return self._default + + @property + def name(self) -> str: + return self._name + + @property + def help(self) -> str: + return self._help + + @property + def convert_type(self) -> str: + if self._convert is convert_str: + return "str" + if self._convert is convert_bool: + return "bool" + if self._convert is convert_str_seq: + return "tuple[str, ...]" + raise RuntimeError("unreachable") + + +class Settings: + def __init__(self) -> None: + for x in self.__class__.__dict__.values(): + if isinstance(x, PrioritizedSetting): + x._parent = self diff --git a/legate/util/shared_args.py b/legate/util/shared_args.py index 688c0bfa3..ac3a561f6 100644 --- a/legate/util/shared_args.py +++ b/legate/util/shared_args.py @@ -83,7 +83,8 @@ choices=LAUNCHERS, default="none", help='launcher program to use (set to "none" for local runs, or if ' - "the launch has already happened by the time legate is invoked)", + "the launch has already happened by the time legate is invoked), " + "[legate-only, not supported with standard Python invocation]", ), ) @@ -97,7 +98,8 @@ required=False, help="additional argument to pass to the launcher (can appear more " "than once). 
Multiple arguments may be provided together in a quoted " - "string (arguments with spaces inside must be additionally quoted)", + "string (arguments with spaces inside must be additionally quoted), " + "[legate-only, not supported with standard Python invocation]", ), ) diff --git a/legate_core_cpp.cmake b/legate_core_cpp.cmake index ef714345f..499db234f 100644 --- a/legate_core_cpp.cmake +++ b/legate_core_cpp.cmake @@ -205,8 +205,10 @@ list(APPEND legate_core_SOURCES src/core/runtime/projection.cc src/core/runtime/runtime.cc src/core/runtime/shard.cc + src/core/task/registrar.cc src/core/task/return.cc src/core/task/task.cc + src/core/task/variant.cc src/core/utilities/debug.cc src/core/utilities/deserializer.cc src/core/utilities/machine.cc @@ -231,11 +233,17 @@ endif() add_library(legate_core ${legate_core_SOURCES}) add_library(legate::core ALIAS legate_core) +if (CMAKE_SYSTEM_NAME STREQUAL "Linux") + set(platform_rpath_origin "\$ORIGIN") +elseif (CMAKE_SYSTEM_NAME STREQUAL "Darwin") + set(platform_rpath_origin "@loader_path") +endif () + set_target_properties(legate_core PROPERTIES EXPORT_NAME core LIBRARY_OUTPUT_NAME lgcore - BUILD_RPATH "\$ORIGIN" - INSTALL_RPATH "\$ORIGIN" + BUILD_RPATH "${platform_rpath_origin}" + INSTALL_RPATH "${platform_rpath_origin}" CXX_STANDARD 17 CXX_STANDARD_REQUIRED ON CUDA_STANDARD 17 @@ -297,6 +305,59 @@ SECTIONS target_link_options(legate_core PRIVATE "${CMAKE_CURRENT_BINARY_DIR}/fatbin.ld") endif() +############################################################################## +# - Doxygen target------------------------------------------------------------ + +if (legate_core_BUILD_DOCS) + find_package(Doxygen) + if(Doxygen_FOUND) + set(legate_core_DOC_SOURCES "") + list(APPEND legate_core_DOC_SOURCES + # task + src/core/task/task.h + src/core/task/registrar.h + src/core/task/variant.h + src/core/task/exception.h + src/core/cuda/stream_pool.h + # data + src/core/data/store.h + src/core/data/scalar.h + src/core/data/buffer.h + 
src/core/utilities/span.h + src/core/data/allocator.h + # runtime + src/core/runtime/runtime.h + src/core/runtime/runtime.inl + src/core/runtime/context.h + # mapping + src/core/mapping/mapping.h + src/core/mapping/operation.h + # aliases + src/core/utilities/typedefs.h + # utilities + src/core/utilities/debug.h + src/core/utilities/dispatch.h + src/core/utilities/type_traits.h + # main page + src/legate.h + ) + set(DOXYGEN_PROJECT_NAME "Legate") + set(DOXYGEN_FULL_PATH_NAMES NO) + set(DOXYGEN_GENERATE_HTML YES) + set(DOXYGEN_GENERATE_LATEX NO) + set(DOXYGEN_EXTENSION_MAPPING cu=C++ cuh=C++) + set(DOXYGEN_HIDE_UNDOC_MEMBERS YES) + set(DOXYGEN_HIDE_UNDOC_CLASSES YES) + set(DOXYGEN_STRIP_FROM_INC_PATH ${CMAKE_SOURCE_DIR}/src) + doxygen_add_docs("doxygen_legate" ALL + ${legate_core_DOC_SOURCES} + COMMENT "Custom command for building Doxygen docs." + ) + else() + message(STATUS "cannot find Doxygen. not generating docs.") + endif() +endif() + ############################################################################## # - install targets----------------------------------------------------------- @@ -349,13 +410,18 @@ install( install( FILES src/core/runtime/context.h + src/core/runtime/context.inl src/core/runtime/runtime.h + src/core/runtime/runtime.inl DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/legate/core/runtime) install( FILES src/core/task/exception.h + src/core/task/registrar.h src/core/task/return.h src/core/task/task.h + src/core/task/task.inl + src/core/task/variant.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/legate/core/task) install( @@ -382,6 +448,8 @@ Imported Targets: ]=]) +file(READ ${CMAKE_SOURCE_DIR}/cmake/legate_helper_functions.cmake helper_functions) + string(JOIN "\n" code_string [=[ if(NOT TARGET legate::Thrust) @@ -398,15 +466,25 @@ if(Legion_NETWORKS) find_package(MPI REQUIRED COMPONENTS CXX) endif() ]=] +"${helper_functions}" ) +if(DEFINED legate_core_cuda_stubs_path) + string(JOIN "\n" code_string "${code_string}" + "list(APPEND 
CMAKE_C_IMPLICIT_LINK_DIRECTORIES ${legate_core_cuda_stubs_path})" + "list(APPEND CMAKE_CXX_IMPLICIT_LINK_DIRECTORIES ${legate_core_cuda_stubs_path})" + "list(APPEND CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES ${legate_core_cuda_stubs_path})") +endif() + rapids_export( INSTALL legate_core EXPORT_SET legate-core-exports GLOBAL_TARGETS core NAMESPACE legate:: DOCUMENTATION doc_string - FINAL_CODE_BLOCK code_string) + FINAL_CODE_BLOCK code_string + LANGUAGES ${ENABLED_LANGUAGES} +) # build export targets rapids_export( @@ -415,4 +493,6 @@ rapids_export( GLOBAL_TARGETS core NAMESPACE legate:: DOCUMENTATION doc_string - FINAL_CODE_BLOCK code_string) + FINAL_CODE_BLOCK code_string + LANGUAGES ${ENABLED_LANGUAGES} +) diff --git a/legate_core_python.cmake b/legate_core_python.cmake index 05d92853e..c3db1b60e 100644 --- a/legate_core_python.cmake +++ b/legate_core_python.cmake @@ -45,10 +45,10 @@ endif() add_custom_target("generate_install_info_py" ALL COMMAND ${CMAKE_COMMAND} + -DLegion_NETWORKS="${Legion_NETWORKS}" -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -P "${CMAKE_CURRENT_SOURCE_DIR}/cmake/generate_install_info_py.cmake" COMMENT "Generate install_info.py" - VERBATIM ) add_library(legate_core_python INTERFACE) diff --git a/pyproject.toml b/pyproject.toml index 8f82a0d13..c186fc785 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -88,11 +88,14 @@ strict_equality = true warn_unused_configs = true +exclude = ['tests/examples'] + [[tool.mypy.overrides]] # ignore certain auto-generated and utility files module = [ "legate._version", "legate.__main__", "legate.install_info", + "legate._sphinxext.*", ] ignore_errors = true diff --git a/scripts/generate-conda-envs.py b/scripts/generate-conda-envs.py index 361f35149..18e35d2b3 100755 --- a/scripts/generate-conda-envs.py +++ b/scripts/generate-conda-envs.py @@ -155,13 +155,19 @@ def pip(self) -> Reqs: class DocsConfig(SectionConfig): header = "docs" + @property + def conda(self) -> Reqs: + return ("pandoc", "doxygen") + @property + def 
pip(self) -> Reqs: return ( + "ipython", "jinja2", "markdown<3.4.0", - "pydata-sphinx-theme", + "pydata-sphinx-theme>=0.13", "myst-parser", + "nbsphinx", "sphinx-copybutton", "sphinx>=4.4.0", ) @@ -214,7 +220,7 @@ def filename(self) -> str: # --- Setup ------------------------------------------------------------------- -PYTHON_VERSIONS = ("3.8", "3.9", "3.10") +PYTHON_VERSIONS = ("3.9", "3.10", "3.11") CTK_VERSIONS = ( "none", @@ -239,7 +245,7 @@ def filename(self) -> str: - conda-forge dependencies: - - python={python} + - python={python},!=3.9.7 # avoid https://bugs.python.org/issue45121 {conda_sections}{pip} """ @@ -314,7 +320,6 @@ def __call__(self, parser, namespace, values, option_string): if __name__ == "__main__": - import sys parser = ArgumentParser() diff --git a/setup.cfg b/setup.cfg index ce00187b8..65a47048c 100644 --- a/setup.cfg +++ b/setup.cfg @@ -61,4 +61,4 @@ packages = find: install_requires = numpy>=1.22 # TODO: Add rest of install dependencies -python_requires = >=3.8 +python_requires = >=3.9,!=3.9.7 diff --git a/setup.py b/setup.py index 89583411a..83912f31f 100755 --- a/setup.py +++ b/setup.py @@ -33,9 +33,9 @@ "Topic :: Scientific/Engineering", "License :: OSI Approved :: Apache Software License", "Programming Language :: Python", - "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", ], extras_require={ "test": [ diff --git a/src/core/comm/coll.cc b/src/core/comm/coll.cc index 5f1a1f4e9..4bd76a758 100644 --- a/src/core/comm/coll.cc +++ b/src/core/comm/coll.cc @@ -35,7 +35,6 @@ namespace legate { namespace comm { namespace coll { -using namespace Legion; Logger log_coll("coll"); BackendNetwork* backend_network = nullptr; diff --git a/src/core/comm/comm_cpu.cc b/src/core/comm/comm_cpu.cc index 05c2f6283..7b0393f91 100644 --- a/src/core/comm/comm_cpu.cc +++ b/src/core/comm/comm_cpu.cc @@ -19,8 +19,6 @@ #include "core/comm/coll.h" 
-using namespace Legion; - namespace legate { namespace comm { namespace cpu { @@ -30,7 +28,7 @@ static int init_cpucoll_mapping(const Legion::Task* task, Legion::Context context, Legion::Runtime* runtime) { - Core::show_progress(task, context, runtime, task->get_task_name()); + Core::show_progress(task, context, runtime); int mpi_rank = 0; #if defined(LEGATE_USE_NETWORK) if (coll::backend_network->comm_type == coll::CollCommType::CollMPI) { @@ -46,7 +44,7 @@ static coll::CollComm init_cpucoll(const Legion::Task* task, Legion::Context context, Legion::Runtime* runtime) { - Core::show_progress(task, context, runtime, task->get_task_name()); + Core::show_progress(task, context, runtime); const int point = task->index_point[0]; int num_ranks = task->index_domain.get_volume(); @@ -80,7 +78,7 @@ static void finalize_cpucoll(const Legion::Task* task, Legion::Context context, Legion::Runtime* runtime) { - Core::show_progress(task, context, runtime, task->get_task_name()); + Core::show_progress(task, context, runtime); assert(task->futures.size() == 1); coll::CollComm comm = task->futures[0].get_result(); @@ -95,32 +93,29 @@ void register_tasks(Legion::Machine machine, Legion::Runtime* runtime, const LibraryContext& context) { - const InputArgs& command_args = Legion::Runtime::get_input_args(); - int argc = command_args.argc; - char** argv = command_args.argv; - coll::collInit(argc, argv); + const auto& command_args = Legion::Runtime::get_input_args(); + coll::collInit(command_args.argc, command_args.argv); - const TaskID init_cpucoll_mapping_task_id = - context.get_task_id(LEGATE_CORE_INIT_CPUCOLL_MAPPING_TASK_ID); + auto init_cpucoll_mapping_task_id = context.get_task_id(LEGATE_CORE_INIT_CPUCOLL_MAPPING_TASK_ID); const char* init_cpucoll_mapping_task_name = "core::comm::cpu::init_mapping"; runtime->attach_name(init_cpucoll_mapping_task_id, init_cpucoll_mapping_task_name, false /*mutable*/, true /*local only*/); - const TaskID init_cpucoll_task_id = 
context.get_task_id(LEGATE_CORE_INIT_CPUCOLL_TASK_ID); + auto init_cpucoll_task_id = context.get_task_id(LEGATE_CORE_INIT_CPUCOLL_TASK_ID); const char* init_cpucoll_task_name = "core::comm::cpu::init"; runtime->attach_name( init_cpucoll_task_id, init_cpucoll_task_name, false /*mutable*/, true /*local only*/); - const TaskID finalize_cpucoll_task_id = context.get_task_id(LEGATE_CORE_FINALIZE_CPUCOLL_TASK_ID); + auto finalize_cpucoll_task_id = context.get_task_id(LEGATE_CORE_FINALIZE_CPUCOLL_TASK_ID); const char* finalize_cpucoll_task_name = "core::comm::cpu::finalize"; runtime->attach_name( finalize_cpucoll_task_id, finalize_cpucoll_task_name, false /*mutable*/, true /*local only*/); auto make_registrar = [&](auto task_id, auto* task_name, auto proc_kind) { - TaskVariantRegistrar registrar(task_id, task_name); - registrar.add_constraint(ProcessorConstraint(proc_kind)); + Legion::TaskVariantRegistrar registrar(task_id, task_name); + registrar.add_constraint(Legion::ProcessorConstraint(proc_kind)); registrar.set_leaf(true); registrar.global_registration = false; return registrar; diff --git a/src/core/comm/comm_nccl.cu b/src/core/comm/comm_nccl.cu index 2e22b6e92..0f95fe74e 100644 --- a/src/core/comm/comm_nccl.cu +++ b/src/core/comm/comm_nccl.cu @@ -17,14 +17,14 @@ #include "core/comm/comm_nccl.h" #include "core/cuda/cuda_help.h" #include "core/cuda/stream_pool.h" +#include "core/data/buffer.h" #include "core/utilities/nvtx_help.h" +#include "core/utilities/typedefs.h" #include "legate.h" #include #include -using namespace Legion; - namespace legate { namespace comm { namespace nccl { @@ -59,7 +59,7 @@ static ncclUniqueId init_nccl_id(const Legion::Task* task, { legate::nvtx::Range auto_range("core::comm::nccl::init_id"); - Core::show_progress(task, context, runtime, task->get_task_name()); + Core::show_progress(task, context, runtime); ncclUniqueId id; CHECK_NCCL(ncclGetUniqueId(&id)); @@ -74,7 +74,7 @@ static ncclComm_t* init_nccl(const Legion::Task* task, { 
legate::nvtx::Range auto_range("core::comm::nccl::init"); - Core::show_progress(task, context, runtime, task->get_task_name()); + Core::show_progress(task, context, runtime); assert(task->futures.size() == 1); @@ -92,13 +92,8 @@ static ncclComm_t* init_nccl(const Legion::Task* task, // Perform a warm-up all-to-all - using namespace Legion; - - DeferredBuffer<_Payload, 1> src_buffer(Memory::GPU_FB_MEM, - Domain(Rect<1>{Point<1>{0}, Point<1>{num_ranks - 1}})); - - DeferredBuffer<_Payload, 1> tgt_buffer(Memory::GPU_FB_MEM, - Domain(Rect<1>{Point<1>{0}, Point<1>{num_ranks - 1}})); + auto src_buffer = create_buffer<_Payload>(num_ranks, Memory::Kind::GPU_FB_MEM); + auto tgt_buffer = create_buffer<_Payload>(num_ranks, Memory::Kind::GPU_FB_MEM); CHECK_NCCL(ncclGroupStart()); for (auto idx = 0; idx < num_ranks; ++idx) { @@ -119,7 +114,7 @@ static void finalize_nccl(const Legion::Task* task, { legate::nvtx::Range auto_range("core::comm::nccl::finalize"); - Core::show_progress(task, context, runtime, task->get_task_name()); + Core::show_progress(task, context, runtime); assert(task->futures.size() == 1); auto comm = task->futures[0].get_result(); @@ -131,24 +126,24 @@ void register_tasks(Legion::Machine machine, Legion::Runtime* runtime, const LibraryContext& context) { - const TaskID init_nccl_id_task_id = context.get_task_id(LEGATE_CORE_INIT_NCCL_ID_TASK_ID); + auto init_nccl_id_task_id = context.get_task_id(LEGATE_CORE_INIT_NCCL_ID_TASK_ID); const char* init_nccl_id_task_name = "core::comm::nccl::init_id"; runtime->attach_name( init_nccl_id_task_id, init_nccl_id_task_name, false /*mutable*/, true /*local only*/); - const TaskID init_nccl_task_id = context.get_task_id(LEGATE_CORE_INIT_NCCL_TASK_ID); + auto init_nccl_task_id = context.get_task_id(LEGATE_CORE_INIT_NCCL_TASK_ID); const char* init_nccl_task_name = "core::comm::nccl::init"; runtime->attach_name( init_nccl_task_id, init_nccl_task_name, false /*mutable*/, true /*local only*/); - const TaskID finalize_nccl_task_id 
= context.get_task_id(LEGATE_CORE_FINALIZE_NCCL_TASK_ID); + auto finalize_nccl_task_id = context.get_task_id(LEGATE_CORE_FINALIZE_NCCL_TASK_ID); const char* finalize_nccl_task_name = "core::comm::nccl::finalize"; runtime->attach_name( finalize_nccl_task_id, finalize_nccl_task_name, false /*mutable*/, true /*local only*/); auto make_registrar = [&](auto task_id, auto* task_name, auto proc_kind) { - TaskVariantRegistrar registrar(task_id, task_name); - registrar.add_constraint(ProcessorConstraint(proc_kind)); + Legion::TaskVariantRegistrar registrar(task_id, task_name); + registrar.add_constraint(Legion::ProcessorConstraint(proc_kind)); registrar.set_leaf(true); registrar.global_registration = false; return registrar; diff --git a/src/core/comm/communicator.h b/src/core/comm/communicator.h index ed75aeb66..4da3cf7b4 100644 --- a/src/core/comm/communicator.h +++ b/src/core/comm/communicator.h @@ -18,17 +18,25 @@ #include "legion.h" +/** + * @file + * @brief Class definition for legate::comm::Communicator + */ + namespace legate { namespace comm { -// This is a thin class wrapping a future that contains a communicator. -// This class only provides a template member function for retrieving the handle -// and the client is expected to use a correct type for the communicators that it uses. -// -// The following is the list of handle types for supported communicators: -// -// - NCCL: ncclComm_t* -// +/** + * @ingroup task + * @brief A thin wrapper class for communicators stored in futures. This class only provides + * a template method to retrieve the communicator handle and the client is expected to pass + * the right handle type. 
+ * + * The following is the list of handle types for communicators supported in Legate: + * + * - NCCL: ncclComm_t* + * - CPU communicator in Legate: legate::comm::coll::CollComm* + */ class Communicator { public: Communicator() {} @@ -39,6 +47,13 @@ class Communicator { Communicator& operator=(const Communicator&) = default; public: + /** + * @brief Returns the communicator stored in the wrapper + * + * @tparam T The type of communicator handle to get (see valid types above) + * + * @return A communicator + */ template T get() const { diff --git a/src/core/comm/local_comm.cc b/src/core/comm/local_comm.cc index 8adc4a2f3..29d317c38 100644 --- a/src/core/comm/local_comm.cc +++ b/src/core/comm/local_comm.cc @@ -27,7 +27,6 @@ namespace legate { namespace comm { namespace coll { -using namespace Legion; extern Logger log_coll; // public functions start from here @@ -348,4 +347,4 @@ void LocalNetwork::barrierLocal(CollComm global_comm) } // namespace coll } // namespace comm -} // namespace legate \ No newline at end of file +} // namespace legate diff --git a/src/core/comm/mpi_comm.cc b/src/core/comm/mpi_comm.cc index 1761701ff..114c82171 100644 --- a/src/core/comm/mpi_comm.cc +++ b/src/core/comm/mpi_comm.cc @@ -27,7 +27,6 @@ namespace legate { namespace comm { namespace coll { -using namespace Legion; extern Logger log_coll; enum CollTag : int { @@ -572,4 +571,4 @@ int MPINetwork::generateGatherTag(int rank, CollComm global_comm) } // namespace coll } // namespace comm -} // namespace legate \ No newline at end of file +} // namespace legate diff --git a/src/core/cuda/stream_pool.h b/src/core/cuda/stream_pool.h index e2eeb86ee..27c55fa90 100644 --- a/src/core/cuda/stream_pool.h +++ b/src/core/cuda/stream_pool.h @@ -21,11 +21,28 @@ #include #include "legion.h" +/** + * @file + * @brief Class definition for legate::cuda::StreamPool + */ + namespace legate { namespace cuda { +/** + * @ingroup task + * @brief A simple wrapper around CUDA streams to inject auxiliary 
features + * + * When `LEGATE_SYNC_STREAM_VIEW` is set to 1, every `StreamView` synchronizes the CUDA stream + * that it wraps when it is destroyed. + */ struct StreamView { public: + /** + * @brief Creates a `StreamView` with a raw CUDA stream + * + * @param stream Raw CUDA stream to wrap + */ StreamView(cudaStream_t stream) : valid_(true), stream_(stream) {} ~StreamView(); @@ -38,6 +55,11 @@ struct StreamView { StreamView& operator=(StreamView&&); public: + /** + * @brief Unwraps the raw CUDA stream + * + * @return Raw CUDA stream wrapped by the `StreamView` + */ operator cudaStream_t() const { return stream_; } private: @@ -45,19 +67,37 @@ struct StreamView { cudaStream_t stream_; }; +/** + * @brief A stream pool + */ struct StreamPool { public: StreamPool() {} ~StreamPool(); public: + /** + * @brief Returns a `StreamView` in the pool + * + * @return A `StreamView` object. Currently, all stream views returned from this pool are backed + * by the same CUDA stream. + */ StreamView get_stream(); public: + /** + * @brief Returns a singleton stream pool + * + * The stream pool is alive throughout the program execution. + * + * @return A `StreamPool` object + */ static StreamPool& get_stream_pool(); private: // For now we keep only one stream in the pool + // TODO: If this ever changes, the use of non-stream-ordered `DeferredBuffer`s + // in `core/data/buffer.h` will no longer be safe. 
std::unique_ptr cached_stream_{nullptr}; }; diff --git a/src/core/data/allocator.cc b/src/core/data/allocator.cc index 62051d05e..7f4512064 100644 --- a/src/core/data/allocator.cc +++ b/src/core/data/allocator.cc @@ -19,7 +19,7 @@ namespace legate { -ScopedAllocator::ScopedAllocator(Legion::Memory::Kind kind, bool scoped, size_t alignment) +ScopedAllocator::ScopedAllocator(Memory::Kind kind, bool scoped, size_t alignment) : target_kind_(kind), scoped_(scoped), alignment_(alignment) { } @@ -59,4 +59,4 @@ void ScopedAllocator::deallocate(void* ptr) buffer.destroy(); } -} // namespace legate \ No newline at end of file +} // namespace legate diff --git a/src/core/data/allocator.h b/src/core/data/allocator.h index 47d3c1a32..f9c80a64f 100644 --- a/src/core/data/allocator.h +++ b/src/core/data/allocator.h @@ -20,8 +20,22 @@ #include +/** + * @file + * @brief Class definition for legate::ScopedAllocator + */ + namespace legate { +/** + * @ingroup data + * @brief A simple allocator backed by `Buffer` objects + * + * For each allocation request, this allocator creates a 1D `Buffer` of `int8_t` and returns + * the raw pointer to it. By default, all allocations are deallocated when the allocator is + * destroyed, and can optionally be made alive until the task finishes by making the allocator + * unscoped. + */ class ScopedAllocator { public: using ByteBuffer = Buffer; @@ -31,18 +45,43 @@ class ScopedAllocator { // Iff 'scoped', all allocations will be released upon destruction. // Otherwise this is up to the runtime after the task has finished. - ScopedAllocator(Legion::Memory::Kind kind, bool scoped = true, size_t alignment = 16); + /** + * @brief Create a `ScopedAllocator` for a specific memory kind + * + * @param kind Kind of the memory on which the `Buffer`s should be created + * @param scoped If true, the allocator is scoped; i.e., lifetimes of allocations are tied to + * the allocator's lifetime. 
Otherwise, the allocations are alive until the task finishes + * (and unless explicitly deallocated). + * @param alignment Alignment for the allocations + */ + ScopedAllocator(Memory::Kind kind, bool scoped = true, size_t alignment = 16); ~ScopedAllocator(); public: + /** + * @brief Allocates a contiguous buffer of the given Memory::Kind + * + * When the allocator runs out of memory, the runtime will fail with an error message. + * Otherwise, the function returns a valid pointer. + * + * @param bytes Size of the allocation in bytes + * + * @return A raw pointer to the allocation + */ void* allocate(size_t bytes); + /** + * @brief Deallocates an allocation. The input pointer must be one that was previously + * returned by an `allocate` call, otherwise the code will fail with an error message. + * + * @param ptr Pointer to the allocation to deallocate + */ void deallocate(void* ptr); private: - Legion::Memory::Kind target_kind_{Legion::Memory::Kind::SYSTEM_MEM}; + Memory::Kind target_kind_{Memory::Kind::SYSTEM_MEM}; bool scoped_; size_t alignment_; std::unordered_map buffers_{}; }; -} // namespace legate \ No newline at end of file +} // namespace legate diff --git a/src/core/data/buffer.h b/src/core/data/buffer.h index 91550f69d..f706c232c 100644 --- a/src/core/data/buffer.h +++ b/src/core/data/buffer.h @@ -19,18 +19,62 @@ #include "legion.h" #include "core/utilities/machine.h" +#include "core/utilities/typedefs.h" + +/** + * @file + * @brief Type alias definition for legate::Buffer and utility functions for it + */ namespace legate { +/** + * @ingroup data + * @brief A typed buffer class for intra-task temporary allocations + * + * Values in a buffer can be accessed by index expressions with legate::Point objects, + * or via a raw pointer to the underlying allocation, which can be queried with the `ptr` method. 
+ * + * `legate::Buffer` is an alias to + * [`Legion::DeferredBuffer`](https://github.com/StanfordLegion/legion/blob/9ed6f4d6b579c4f17e0298462e89548a4f0ed6e5/runtime/legion.h#L3509-L3609). + * + * Note on using temporary buffers in CUDA tasks: + * + * We use Legion `DeferredBuffer`, whose lifetime is not connected with the CUDA stream(s) used to + * launch kernels. The buffer is allocated immediately at the point when `create_buffer` is called, + * whereas the kernel that uses it is placed on a stream, and may run at a later point. Normally + * a `DeferredBuffer` is deallocated automatically by Legion once all the kernels launched in the + * task are complete. However, a `DeferredBuffer` can also be deallocated immediately using + * `destroy()`, which is useful for operations that want to deallocate intermediate memory as soon + * as possible. This deallocation is not synchronized with the task stream, i.e. it may happen + * before a kernel which uses the buffer has actually completed. This is safe as long as we use the + * same stream on all GPU tasks running on the same device (which is guaranteed by the current + * implementation of `get_cached_stream`), because then all the actual uses of the buffer are done + * in order on the one stream. It is important that all library CUDA code uses + * `get_cached_stream()`, and all CUDA operations (including library calls) are enqueued on that + * stream exclusively. This analysis additionally assumes that no code outside of Legate is + * concurrently allocating from the eager pool, and that it's OK for kernels to access a buffer even + * after it's technically been deallocated. + */ template using Buffer = Legion::DeferredBuffer; +/** + * @ingroup data + * @brief Creates a `Buffer` of specific extents + * + * @param extents Extents of the buffer + * @param kind Kind of the target memory (optional). 
If not given, the runtime will pick + * automatically based on the executing processor + * @param alignment Alignment for the memory allocation (optional) + * + * @return A `Buffer` object + */ template -Buffer create_buffer(const Legion::Point& extents, - Legion::Memory::Kind kind = Legion::Memory::Kind::NO_MEMKIND, - size_t alignment = 16) +Buffer create_buffer(const Point& extents, + Memory::Kind kind = Memory::Kind::NO_MEMKIND, + size_t alignment = 16) { - using namespace Legion; if (Memory::Kind::NO_MEMKIND == kind) kind = find_memory_kind_for_executing_processor(false); auto hi = extents - Point::ONES(); // We just avoid creating empty buffers, as they cause all sorts of headaches. @@ -39,12 +83,23 @@ Buffer create_buffer(const Legion::Point& extents, return Buffer(bounds, kind, nullptr, alignment); } +/** + * @ingroup data + * @brief Creates a `Buffer` of a specific size. Always returns a 1D buffer. + * + * @param size Size of the buffer + * @param kind Kind of the target memory (optional). 
If not given, the runtime will pick + * automatically based on the executing processor + * @param alignment Alignment for the memory allocation (optional) + * + * @return A 1D `Buffer` object + */ template Buffer create_buffer(size_t size, - Legion::Memory::Kind kind = Legion::Memory::Kind::NO_MEMKIND, - size_t alignment = 16) + Memory::Kind kind = Memory::Kind::NO_MEMKIND, + size_t alignment = 16) { - return create_buffer(Legion::Point<1>(size), kind, alignment); + return create_buffer(Point<1>(size), kind, alignment); } } // namespace legate diff --git a/src/core/data/scalar.cc b/src/core/data/scalar.cc index 2282c2f8a..13300e60e 100644 --- a/src/core/data/scalar.cc +++ b/src/core/data/scalar.cc @@ -71,7 +71,7 @@ size_t Scalar::size() const auto elem_size = type_dispatch(code_, elem_size_fn{}); if (tuple_) { auto num_elements = *static_cast(data_); - return sizeof(int32_t) + num_elements * elem_size; + return sizeof(uint32_t) + num_elements * elem_size; } else return elem_size; } diff --git a/src/core/data/scalar.h b/src/core/data/scalar.h index 5fc782271..63db09fbd 100644 --- a/src/core/data/scalar.h +++ b/src/core/data/scalar.h @@ -20,18 +20,58 @@ #include "core/utilities/type_traits.h" #include "core/utilities/typedefs.h" +/** + * @file + * @brief Class definition for legate::Scalar + */ + namespace legate { +/** + * @ingroup data + * @brief A type-erased container for scalars and tuples of scalars. + * + * A Scalar can be owned or shared, depending on whether it owns the backing allocation: + * If a `Scalar` is shared, it does not own the allocation and any of its copies are also + * shared. If a `Scalar` is owned, it owns the backing allocation and releases it upon + * destruction. Any copy of an owned `Scalar` is owned as well. + * + * A `Scalar` that stores a tuple of scalars has an allocation big enough to contain both + * the number of elements and the elements themselves. 
The number of elements should be + * stored in the first four bytes of the allocation. + * + */ class Scalar { public: Scalar() = default; Scalar(const Scalar& other); + /** + * @brief Creates a shared `Scalar` with an existing allocation. The caller is responsible + * for passing in a sufficiently big allocation. + * + * @param tuple If true, the allocation contains a tuple of scalars. + * @param code Type code of the scalar(s) + * @param data Allocation containing the data. + */ Scalar(bool tuple, LegateTypeCode code, const void* data); ~Scalar(); public: + /** + * @brief Creates an owned scalar from a scalar value + * + * @tparam T The scalar type to wrap + * + * @param value A scalar value to create a `Scalar` with + */ template Scalar(T value); + /** + * @brief Creates an owned scalar from a tuple of scalars. The values in the input vector + * will be copied. + * + * @param values A vector that contains elements of a tuple + */ template Scalar(const std::vector& values); @@ -42,14 +82,44 @@ class Scalar { void copy(const Scalar& other); public: + /** + * @brief Indicates if the `Scalar` object represents a tuple + * + * @return true The `Scalar` is a tuple + * @return false The `Scalar` is a scalar + */ bool is_tuple() const { return tuple_; } + /** + * @brief Returns the size of allocation for the `Scalar`. + * + * @return The size of allocation + */ size_t size() const; public: + /** + * @brief Returns the value stored in the `Scalar`. The call does no type checking; + * i.e., passing a wrong type parameter will not be caught by the call. + * + * @tparam VAL Type of the value to unwrap + * + * @return The value stored in the `Scalar` + */ template VAL value() const; + /** + * @brief Returns values stored in the `Scalar`. If the `Scalar` contains a scalar, + * a unit span will be returned. 
+ * + * @return Values stored in the `Scalar` + */ template Span values() const; + /** + * @brief Returns a raw pointer to the backing allocation + * + * @return A raw pointer to the `Scalar`'s data + */ const void* ptr() const { return data_; } private: diff --git a/src/core/data/store.cc b/src/core/data/store.cc index c185ab602..bc7592c33 100644 --- a/src/core/data/store.cc +++ b/src/core/data/store.cc @@ -28,9 +28,7 @@ namespace legate { -using namespace Legion; - -RegionField::RegionField(int32_t dim, const PhysicalRegion& pr, FieldID fid) +RegionField::RegionField(int32_t dim, const Legion::PhysicalRegion& pr, Legion::FieldID fid) : dim_(dim), pr_(pr), fid_(fid) { auto priv = pr.get_privilege(); @@ -60,14 +58,18 @@ RegionField& RegionField::operator=(RegionField&& other) noexcept return *this; } -bool RegionField::valid() const { return pr_.get_logical_region() != LogicalRegion::NO_REGION; } +bool RegionField::valid() const +{ + return pr_.get_logical_region() != Legion::LogicalRegion::NO_REGION; +} Domain RegionField::domain() const { return dim_dispatch(dim_, get_domain_fn{}, pr_); } -OutputRegionField::OutputRegionField(const OutputRegion& out, FieldID fid) +OutputRegionField::OutputRegionField(const Legion::OutputRegion& out, Legion::FieldID fid) : out_(out), fid_(fid), - num_elements_(UntypedDeferredValue(sizeof(size_t), find_memory_kind_for_executing_processor())) + num_elements_( + Legion::UntypedDeferredValue(sizeof(size_t), find_memory_kind_for_executing_processor())) { } @@ -75,9 +77,9 @@ OutputRegionField::OutputRegionField(OutputRegionField&& other) noexcept : bound_(other.bound_), out_(other.out_), fid_(other.fid_), num_elements_(other.num_elements_) { other.bound_ = false; - other.out_ = OutputRegion(); + other.out_ = Legion::OutputRegion(); other.fid_ = -1; - other.num_elements_ = UntypedDeferredValue(); + other.num_elements_ = Legion::UntypedDeferredValue(); } OutputRegionField& OutputRegionField::operator=(OutputRegionField&& other) noexcept 
@@ -88,9 +90,9 @@ OutputRegionField& OutputRegionField::operator=(OutputRegionField&& other) noexc num_elements_ = other.num_elements_; other.bound_ = false; - other.out_ = OutputRegion(); + other.out_ = Legion::OutputRegion(); other.fid_ = -1; - other.num_elements_ = UntypedDeferredValue(); + other.num_elements_ = Legion::UntypedDeferredValue(); return *this; } @@ -125,8 +127,11 @@ void OutputRegionField::update_num_elements(size_t num_elements) acc[0] = num_elements; } -FutureWrapper::FutureWrapper( - bool read_only, int32_t field_size, Domain domain, Future future, bool initialize /*= false*/) +FutureWrapper::FutureWrapper(bool read_only, + int32_t field_size, + Domain domain, + Legion::Future future, + bool initialize /*= false*/) : read_only_(read_only), field_size_(field_size), domain_(domain), future_(future) { #ifdef DEBUG_LEGATE @@ -148,16 +153,16 @@ FutureWrapper::FutureWrapper( #ifdef LEGATE_USE_CUDA if (mem_kind == Memory::Kind::GPU_FB_MEM) { // TODO: This should be done by Legion - buffer_ = UntypedDeferredValue(field_size, mem_kind); + buffer_ = Legion::UntypedDeferredValue(field_size, mem_kind); AccessorWO acc(buffer_, field_size, false); auto stream = cuda::StreamPool::get_stream_pool().get_stream(); CHECK_CUDA( cudaMemcpyAsync(acc.ptr(0), p_init_value, field_size, cudaMemcpyDeviceToDevice, stream)); } else #endif - buffer_ = UntypedDeferredValue(field_size, mem_kind, p_init_value); + buffer_ = Legion::UntypedDeferredValue(field_size, mem_kind, p_init_value); } else - buffer_ = UntypedDeferredValue(field_size, mem_kind); + buffer_ = Legion::UntypedDeferredValue(field_size, mem_kind); } } @@ -187,7 +192,7 @@ void FutureWrapper::initialize_with_identity(int32_t redop_id) auto untyped_acc = AccessorWO(buffer_, field_size_); auto ptr = untyped_acc.ptr(0); - auto redop = Runtime::get_reduction_op(redop_id); + auto redop = Legion::Runtime::get_reduction_op(redop_id); #ifdef DEBUG_LEGATE assert(redop->sizeof_lhs == field_size_); #endif diff --git 
a/src/core/data/store.h b/src/core/data/store.h index f21c820fc..a66617b71 100644 --- a/src/core/data/store.h +++ b/src/core/data/store.h @@ -24,6 +24,14 @@ #include "legate_defines.h" #include "legion.h" +/** @defgroup data Data abstractions and allocators + */ + +/** + * @file + * @brief Class definition for legate::Store + */ + namespace legate { class RegionField { @@ -59,7 +67,7 @@ class RegionField { ACC operator()(const Legion::PhysicalRegion& pr, Legion::FieldID fid, const Legion::AffineTransform& transform, - const Legion::Rect& bounds) + const Rect& bounds) { return ACC(pr, fid, transform, bounds); } @@ -76,7 +84,7 @@ class RegionField { Legion::FieldID fid, int32_t redop_id, const Legion::AffineTransform& transform, - const Legion::Rect& bounds) + const Rect& bounds) { return ACC(pr, fid, redop_id, transform, bounds); } @@ -84,9 +92,9 @@ class RegionField { struct get_domain_fn { template - Legion::Domain operator()(const Legion::PhysicalRegion& pr) + Domain operator()(const Legion::PhysicalRegion& pr) { - return Legion::Domain(pr.get_bounds()); + return Domain(pr.get_bounds()); } }; @@ -113,35 +121,34 @@ class RegionField { public: template - AccessorRO read_accessor(const Legion::Rect& bounds) const; + AccessorRO read_accessor(const Rect& bounds) const; template - AccessorWO write_accessor(const Legion::Rect& bounds) const; + AccessorWO write_accessor(const Rect& bounds) const; template - AccessorRW read_write_accessor(const Legion::Rect& bounds) const; + AccessorRW read_write_accessor(const Rect& bounds) const; template - AccessorRD reduce_accessor(int32_t redop_id, - const Legion::Rect& bounds) const; + AccessorRD reduce_accessor(int32_t redop_id, const Rect& bounds) const; public: template - AccessorRO read_accessor(const Legion::Rect& bounds, + AccessorRO read_accessor(const Rect& bounds, const Legion::DomainAffineTransform& transform) const; template - AccessorWO write_accessor(const Legion::Rect& bounds, + AccessorWO write_accessor(const Rect& 
bounds, const Legion::DomainAffineTransform& transform) const; template - AccessorRW read_write_accessor(const Legion::Rect& bounds, + AccessorRW read_write_accessor(const Rect& bounds, const Legion::DomainAffineTransform& transform) const; template AccessorRD reduce_accessor( int32_t redop_id, - const Legion::Rect& bounds, + const Rect& bounds, const Legion::DomainAffineTransform& transform) const; public: template - Legion::Rect shape() const; - Legion::Domain domain() const; + Rect shape() const; + Domain domain() const; public: bool is_readable() const { return readable_; } @@ -177,11 +184,11 @@ class OutputRegionField { public: template - Buffer create_output_buffer(const Legion::Point& extents, bool return_buffer); + Buffer create_output_buffer(const Point& extents, bool return_buffer); public: template - void return_data(Buffer& buffer, const Legion::Point& extents); + void return_data(Buffer& buffer, const Point& extents); void make_empty(int32_t dim); public: @@ -202,7 +209,7 @@ class FutureWrapper { FutureWrapper() {} FutureWrapper(bool read_only, int32_t field_size, - Legion::Domain domain, + Domain domain, Legion::Future future, bool initialize = false); @@ -225,14 +232,13 @@ class FutureWrapper { public: template - AccessorRO read_accessor(const Legion::Rect& bounds) const; + AccessorRO read_accessor(const Rect& bounds) const; template - AccessorWO write_accessor(const Legion::Rect& bounds) const; + AccessorWO write_accessor(const Rect& bounds) const; template - AccessorRW read_write_accessor(const Legion::Rect& bounds) const; + AccessorRW read_write_accessor(const Rect& bounds) const; template - AccessorRD reduce_accessor(int32_t redop_id, - const Legion::Rect& bounds) const; + AccessorRD reduce_accessor(int32_t redop_id, const Rect& bounds) const; public: template @@ -240,8 +246,8 @@ class FutureWrapper { public: template - Legion::Rect shape() const; - Legion::Domain domain() const; + Rect shape() const; + Domain domain() const; public: void 
initialize_with_identity(int32_t redop_id); @@ -252,11 +258,15 @@ class FutureWrapper { private: bool read_only_{true}; size_t field_size_{0}; - Legion::Domain domain_{}; + Domain domain_{}; Legion::Future future_{}; Legion::UntypedDeferredValue buffer_{}; }; +/** + * @ingroup data + * @brief A multi-dimensional data container storing task data + */ class Store { public: Store() {} @@ -284,11 +294,34 @@ class Store { Store& operator=(const Store& other) = delete; public: + /** + * @brief Indicates whether the store is valid. A store passed to a task can be invalid + * only for reducer tasks for tree reduction. + * + * @return true The store is valid + * @return false The store is invalid and cannot be used in any data access + */ bool valid() const; + /** + * @brief Indicates whether the store is transformed in any way. + * + * @return true The store is transformed + * @return false The store is not transformed + */ bool transformed() const { return !transform_->identity(); } public: + /** + * @brief Returns the dimension of the store + * + * @return The store's dimension + */ int32_t dim() const { return dim_; } + /** + * @brief Returns the type code of the store + * + * @return The store's type code + */ template TYPE_CODE code() const { @@ -296,51 +329,200 @@ class Store { } public: + /** + * @brief Returns a read-only accessor to the store for the entire domain + * + * @return A read-only accessor to the store + */ template AccessorRO read_accessor() const; + /** + * @brief Returns a write-only accessor to the store for the entire domain + * + * @return A write-only accessor to the store + */ template AccessorWO write_accessor() const; + /** + * @brief Returns a read-write accessor to the store for the entire domain + * + * @return A read-write accessor to the store + */ template AccessorRW read_write_accessor() const; + /** + * @brief Returns a reduction accessor to the store for the entire domain + * + * @tparam OP Reduction operator class. 
For details about reduction operators, See + * LibraryContext::register_reduction_operator. + * + * @tparam EXCLUSIVE Indicates whether reductions can be performed in exclusive mode. If + * `EXCLUSIVE` is `false`, every reduction via the acecssor is performed atomically. + * + * @return A reduction accessor to the store + */ template AccessorRD reduce_accessor() const; public: + /** + * @brief Returns a read-only accessor to the store for specific bounds. + * + * @param bounds Domain within which accesses should be allowed. + * The actual bounds for valid access are determined by an intersection between + * the store's domain and the bounds. + * + * @return A read-only accessor to the store + */ template - AccessorRO read_accessor(const Legion::Rect& bounds) const; + AccessorRO read_accessor(const Rect& bounds) const; + /** + * @brief Returns a write-only accessor to the store for the entire domain + * + * @param bounds Domain within which accesses should be allowed. + * The actual bounds for valid access are determined by an intersection between + * the store's domain and the bounds. + * + * @return A write-only accessor to the store + */ template - AccessorWO write_accessor(const Legion::Rect& bounds) const; + AccessorWO write_accessor(const Rect& bounds) const; + /** + * @brief Returns a read-write accessor to the store for the entire domain + * + * @param bounds Domain within which accesses should be allowed. + * The actual bounds for valid access are determined by an intersection between + * the store's domain and the bounds. + * + * @return A read-write accessor to the store + */ template - AccessorRW read_write_accessor(const Legion::Rect& bounds) const; + AccessorRW read_write_accessor(const Rect& bounds) const; + /** + * @brief Returns a reduction accessor to the store for the entire domain + * + * @param bounds Domain within which accesses should be allowed. 
+ * The actual bounds for valid access are determined by an intersection between + * the store's domain and the bounds. + * + * @tparam OP Reduction operator class. For details about reduction operators, See + * LibraryContext::register_reduction_operator. + * + * @tparam EXCLUSIVE Indicates whether reductions can be performed in exclusive mode. If + * `EXCLUSIVE` is `false`, every reduction via the acecssor is performed atomically. + * + * @return A reduction accessor to the store + */ template - AccessorRD reduce_accessor(const Legion::Rect& bounds) const; - - public: + AccessorRD reduce_accessor(const Rect& bounds) const; + + public: + /** + * @brief Creates a buffer of specified extents for the unbound store. The returned + * buffer is always consistent with the mapping policy for the store. Can be invoked + * multiple times unless `return_buffer` is true. + * + * @param extents Extents of the buffer + * + * @param return_buffer If the value is true, the created buffer will be bound + * to the store upon return + * + * @return A reduction accessor to the store + */ template - Buffer create_output_buffer(const Legion::Point& extents, - bool return_buffer = false); + Buffer create_output_buffer(const Point& extents, bool return_buffer = false); public: + /** + * @brief Returns the store's domain + * + * @return Store's domain + */ template - Legion::Rect shape() const; - Legion::Domain domain() const; - - public: + Rect shape() const; + /** + * @brief Returns the store's domain in a dimension-erased domain type + * + * @return Store's domain in a dimension-erased domain type + */ + Domain domain() const; + + public: + /** + * @brief Indicates whether the store can have a read accessor + * + * @return true The store can have a read accessor + * @return false The store cannot have a read accesor + */ bool is_readable() const { return readable_; } + /** + * @brief Indicates whether the store can have a write accessor + * + * @return true The store can have a write 
accessor + * @return false The store cannot have a write accesor + */ bool is_writable() const { return writable_; } + /** + * @brief Indicates whether the store can have a reduction accessor + * + * @return true The store can have a reduction accessor + * @return false The store cannot have a reduction accesor + */ bool is_reducible() const { return reducible_; } public: + /** + * @brief Returns the scalar value stored in the store. + * + * The requested type must match with the store's data type. If the store is not + * backed by the future, the runtime will fail with an error message. + * + * @tparam VAL Type of the scalar value + * + * @return The scalar value stored in the store + */ template VAL scalar() const; public: + /** + * @brief Binds a buffer to the store. Valid only when the store is unbound and + * has not yet been bound to another buffer. The buffer must be consistent with + * the mapping policy for the store. Recommend that the buffer be created by + * a `create_output_buffer` call. + * + * @param buffer Buffer to bind to the store + * + * @param extents Extents of the buffer. Passing extents smaller than the actual + * extents of the buffer is legal; the runtime uses the passed extents as the + * extents of this store. + * + */ template - void return_data(Buffer& buffer, const Legion::Point& extents); + void return_data(Buffer& buffer, const Point& extents); + /** + * @brief Makes the unbound store empty. Valid only when the store is unbound and + * has not yet been bound to another buffer. + */ void make_empty(); public: + /** + * @brief Indicates whether the store is backed by a future + * (i.e., a container for scalar value) + * + * @return true The store is backed by a future + * @return false The store is backed by a region field + */ bool is_future() const { return is_future_; } + /** + * @brief Indicates whether the store is an unbound store. 
The value DOES NOT indicate + * that the store has already assigned to a buffer; i.e., the store may have been assigned + * to a buffer even when this function returns `true`. + * + * @return true The store is an unbound store + * @return false The store is a normal store + */ bool is_output_store() const { return is_output_store_; } ReturnValue pack() const { return future_.pack(); } ReturnValue pack_weight() const { return output_field_.pack_weight(); } diff --git a/src/core/data/store.inl b/src/core/data/store.inl index 7dc1d38db..d3cd6e594 100644 --- a/src/core/data/store.inl +++ b/src/core/data/store.inl @@ -72,32 +72,32 @@ AccessorRD RegionField::reduce_accessor( } template -AccessorRO RegionField::read_accessor(const Legion::Rect& bounds) const +AccessorRO RegionField::read_accessor(const Rect& bounds) const { return AccessorRO(pr_, fid_, bounds); } template -AccessorWO RegionField::write_accessor(const Legion::Rect& bounds) const +AccessorWO RegionField::write_accessor(const Rect& bounds) const { return AccessorWO(pr_, fid_, bounds); } template -AccessorRW RegionField::read_write_accessor(const Legion::Rect& bounds) const +AccessorRW RegionField::read_write_accessor(const Rect& bounds) const { return AccessorRW(pr_, fid_, bounds); } template AccessorRD RegionField::reduce_accessor(int32_t redop_id, - const Legion::Rect& bounds) const + const Rect& bounds) const { return AccessorRD(pr_, fid_, redop_id, bounds); } template -AccessorRO RegionField::read_accessor(const Legion::Rect& bounds, +AccessorRO RegionField::read_accessor(const Rect& bounds, const Legion::DomainAffineTransform& transform) const { using ACC = AccessorRO; @@ -106,7 +106,7 @@ AccessorRO RegionField::read_accessor(const Legion::Rect& bounds, } template -AccessorWO RegionField::write_accessor(const Legion::Rect& bounds, +AccessorWO RegionField::write_accessor(const Rect& bounds, const Legion::DomainAffineTransform& transform) const { using ACC = AccessorWO; @@ -116,7 +116,7 @@ AccessorWO 
RegionField::write_accessor(const Legion::Rect& bounds, template AccessorRW RegionField::read_write_accessor( - const Legion::Rect& bounds, const Legion::DomainAffineTransform& transform) const + const Rect& bounds, const Legion::DomainAffineTransform& transform) const { using ACC = AccessorRW; return dim_dispatch( @@ -125,9 +125,7 @@ AccessorRW RegionField::read_write_accessor( template AccessorRD RegionField::reduce_accessor( - int32_t redop_id, - const Legion::Rect& bounds, - const Legion::DomainAffineTransform& transform) const + int32_t redop_id, const Rect& bounds, const Legion::DomainAffineTransform& transform) const { using ACC = AccessorRD; return dim_dispatch( @@ -135,9 +133,9 @@ AccessorRD RegionField::reduce_accessor( } template -Legion::Rect RegionField::shape() const +Rect RegionField::shape() const { - return Legion::Rect(pr_); + return Rect(pr_); } template @@ -147,7 +145,7 @@ AccessorRO FutureWrapper::read_accessor() const assert(sizeof(T) == field_size_); #endif if (read_only_) { - auto memkind = Legion::Memory::Kind::NO_MEMKIND; + auto memkind = Memory::Kind::NO_MEMKIND; return AccessorRO(future_, memkind); } else return AccessorRO(buffer_); @@ -184,20 +182,20 @@ AccessorRD FutureWrapper::reduce_accessor(int32_t redop_id) } template -AccessorRO FutureWrapper::read_accessor(const Legion::Rect& bounds) const +AccessorRO FutureWrapper::read_accessor(const Rect& bounds) const { #ifdef DEBUG_LEGATE assert(sizeof(T) == field_size_); #endif if (read_only_) { - auto memkind = Legion::Memory::Kind::NO_MEMKIND; + auto memkind = Memory::Kind::NO_MEMKIND; return AccessorRO(future_, bounds, memkind); } else return AccessorRO(buffer_, bounds); } template -AccessorWO FutureWrapper::write_accessor(const Legion::Rect& bounds) const +AccessorWO FutureWrapper::write_accessor(const Rect& bounds) const { #ifdef DEBUG_LEGATE assert(sizeof(T) == field_size_); @@ -207,7 +205,7 @@ AccessorWO FutureWrapper::write_accessor(const Legion::Rect& bounds } template -AccessorRW 
FutureWrapper::read_write_accessor(const Legion::Rect& bounds) const +AccessorRW FutureWrapper::read_write_accessor(const Rect& bounds) const { #ifdef DEBUG_LEGATE assert(sizeof(T) == field_size_); @@ -218,7 +216,7 @@ AccessorRW FutureWrapper::read_write_accessor(const Legion::Rect& b template AccessorRD FutureWrapper::reduce_accessor(int32_t redop_id, - const Legion::Rect& bounds) const + const Rect& bounds) const { #ifdef DEBUG_LEGATE assert(sizeof(typename OP::LHS) == field_size_); @@ -228,9 +226,9 @@ AccessorRD FutureWrapper::reduce_accessor(int32_t redop_id, } template -Legion::Rect FutureWrapper::shape() const +Rect FutureWrapper::shape() const { - return Legion::Rect(domain()); + return Rect(domain()); } template @@ -246,7 +244,7 @@ VAL FutureWrapper::scalar() const } template -Buffer OutputRegionField::create_output_buffer(const Legion::Point& extents, +Buffer OutputRegionField::create_output_buffer(const Point& extents, bool return_buffer) { if (return_buffer) { @@ -261,7 +259,7 @@ Buffer OutputRegionField::create_output_buffer(const Legion::Point& } template -void OutputRegionField::return_data(Buffer& buffer, const Legion::Point& extents) +void OutputRegionField::return_data(Buffer& buffer, const Point& extents) { #ifdef DEBUG_LEGATE assert(!bound_); @@ -337,7 +335,7 @@ AccessorRD Store::reduce_accessor() const } template -AccessorRO Store::read_accessor(const Legion::Rect& bounds) const +AccessorRO Store::read_accessor(const Rect& bounds) const { #ifdef DEBUG_LEGATE check_accessor_dimension(DIM); @@ -353,7 +351,7 @@ AccessorRO Store::read_accessor(const Legion::Rect& bounds) const } template -AccessorWO Store::write_accessor(const Legion::Rect& bounds) const +AccessorWO Store::write_accessor(const Rect& bounds) const { #ifdef DEBUG_LEGATE check_accessor_dimension(DIM); @@ -369,7 +367,7 @@ AccessorWO Store::write_accessor(const Legion::Rect& bounds) const } template -AccessorRW Store::read_write_accessor(const Legion::Rect& bounds) const +AccessorRW 
Store::read_write_accessor(const Rect& bounds) const { #ifdef DEBUG_LEGATE check_accessor_dimension(DIM); @@ -385,7 +383,7 @@ AccessorRW Store::read_write_accessor(const Legion::Rect& bounds) c } template -AccessorRD Store::reduce_accessor(const Legion::Rect& bounds) const +AccessorRD Store::reduce_accessor(const Rect& bounds) const { #ifdef DEBUG_LEGATE check_accessor_dimension(DIM); @@ -401,7 +399,7 @@ AccessorRD Store::reduce_accessor(const Legion::Rect& b } template -Buffer Store::create_output_buffer(const Legion::Point& extents, +Buffer Store::create_output_buffer(const Point& extents, bool return_buffer /*= false*/) { #ifdef DEBUG_LEGATE @@ -412,7 +410,7 @@ Buffer Store::create_output_buffer(const Legion::Point& extents, } template -Legion::Rect Store::shape() const +Rect Store::shape() const { #ifdef DEBUG_LEGATE if (!(DIM == dim_ || (dim_ == 0 && DIM == 1))) { @@ -426,8 +424,8 @@ Legion::Rect Store::shape() const if (dom.dim > 0) return dom.bounds(); else { - auto p = Legion::Point::ZEROES(); - return Legion::Rect(p, p); + auto p = Point::ZEROES(); + return Rect(p, p); } } @@ -441,7 +439,7 @@ VAL Store::scalar() const } template -void Store::return_data(Buffer& buffer, const Legion::Point& extents) +void Store::return_data(Buffer& buffer, const Point& extents) { #ifdef DEBUG_LEGATE check_valid_return(); diff --git a/src/core/data/transform.cc b/src/core/data/transform.cc index 07ab5f7ef..93d8e6956 100644 --- a/src/core/data/transform.cc +++ b/src/core/data/transform.cc @@ -18,11 +18,10 @@ namespace legate { -using namespace Legion; - -DomainAffineTransform combine(const DomainAffineTransform& lhs, const DomainAffineTransform& rhs) +Legion::DomainAffineTransform combine(const Legion::DomainAffineTransform& lhs, + const Legion::DomainAffineTransform& rhs) { - DomainAffineTransform result; + Legion::DomainAffineTransform result; auto transform = lhs.transform * rhs.transform; auto offset = lhs.transform * rhs.offset + lhs.offset; result.transform = transform; 
@@ -37,7 +36,7 @@ TransformStack::TransformStack(std::unique_ptr&& transform, { } -Legion::Domain TransformStack::transform(const Legion::Domain& input) const +Domain TransformStack::transform(const Domain& input) const { #ifdef DEBUG_LEGATE assert(transform_ != nullptr); @@ -100,12 +99,12 @@ Domain Shift::transform(const Domain& input) const return result; } -DomainAffineTransform Shift::inverse_transform(int32_t in_dim) const +Legion::DomainAffineTransform Shift::inverse_transform(int32_t in_dim) const { assert(dim_ < in_dim); auto out_dim = in_dim; - DomainTransform transform; + Legion::DomainTransform transform; transform.m = out_dim; transform.n = in_dim; for (int32_t i = 0; i < out_dim; ++i) @@ -116,7 +115,7 @@ DomainAffineTransform Shift::inverse_transform(int32_t in_dim) const offset.dim = out_dim; for (int32_t i = 0; i < out_dim; ++i) offset[i] = i == dim_ ? -offset_ : 0; - DomainAffineTransform result; + Legion::DomainAffineTransform result; result.transform = transform; result.offset = offset; return result; @@ -152,12 +151,12 @@ Domain Promote::transform(const Domain& input) const return output; } -DomainAffineTransform Promote::inverse_transform(int32_t in_dim) const +Legion::DomainAffineTransform Promote::inverse_transform(int32_t in_dim) const { assert(extra_dim_ < in_dim); auto out_dim = in_dim - 1; - DomainTransform transform; + Legion::DomainTransform transform; transform.m = std::max(out_dim, 1); transform.n = in_dim; for (int32_t i = 0; i < transform.m; ++i) @@ -171,7 +170,7 @@ DomainAffineTransform Promote::inverse_transform(int32_t in_dim) const offset.dim = std::max(out_dim, 1); for (int32_t i = 0; i < transform.m; ++i) offset[i] = 0; - DomainAffineTransform result; + Legion::DomainAffineTransform result; result.transform = transform; result.offset = offset; return result; @@ -202,12 +201,12 @@ Domain Project::transform(const Domain& input) const return output; } -DomainAffineTransform Project::inverse_transform(int32_t in_dim) const 
+Legion::DomainAffineTransform Project::inverse_transform(int32_t in_dim) const { auto out_dim = in_dim + 1; assert(dim_ < out_dim); - DomainTransform transform; + Legion::DomainTransform transform; transform.m = out_dim; if (in_dim == 0) { transform.n = out_dim; @@ -225,7 +224,7 @@ DomainAffineTransform Project::inverse_transform(int32_t in_dim) const offset.dim = out_dim; for (int32_t i = 0; i < out_dim; ++i) offset[i] = i == dim_ ? coord_ : 0; - DomainAffineTransform result; + Legion::DomainAffineTransform result; result.transform = transform; result.offset = offset; return result; @@ -254,9 +253,9 @@ Domain Transpose::transform(const Domain& input) const return output; } -DomainAffineTransform Transpose::inverse_transform(int32_t in_dim) const +Legion::DomainAffineTransform Transpose::inverse_transform(int32_t in_dim) const { - DomainTransform transform; + Legion::DomainTransform transform; transform.m = in_dim; transform.n = in_dim; for (int32_t i = 0; i < in_dim; ++i) @@ -268,7 +267,7 @@ DomainAffineTransform Transpose::inverse_transform(int32_t in_dim) const offset.dim = in_dim; for (int32_t i = 0; i < in_dim; ++i) offset[i] = 0; - DomainAffineTransform result; + Legion::DomainAffineTransform result; result.transform = transform; result.offset = offset; return result; @@ -338,9 +337,9 @@ Domain Delinearize::transform(const Domain& input) const return delinearize(dim_, sizes_.size(), strides_, input); } -DomainAffineTransform Delinearize::inverse_transform(int32_t in_dim) const +Legion::DomainAffineTransform Delinearize::inverse_transform(int32_t in_dim) const { - DomainTransform transform; + Legion::DomainTransform transform; int32_t out_dim = in_dim - strides_.size() + 1; transform.m = out_dim; transform.n = in_dim; @@ -357,7 +356,7 @@ DomainAffineTransform Delinearize::inverse_transform(int32_t in_dim) const offset.dim = out_dim; for (int32_t i = 0; i < out_dim; ++i) offset[i] = 0; - DomainAffineTransform result; + Legion::DomainAffineTransform result; 
result.transform = transform; result.offset = offset; return result; diff --git a/src/core/data/transform.h b/src/core/data/transform.h index fbdd2e4a8..a925b6bf0 100644 --- a/src/core/data/transform.h +++ b/src/core/data/transform.h @@ -18,12 +18,12 @@ #include -#include "legion.h" +#include "core/utilities/typedefs.h" namespace legate { struct Transform { - virtual Legion::Domain transform(const Legion::Domain& input) const = 0; + virtual Domain transform(const Domain& input) const = 0; virtual Legion::DomainAffineTransform inverse_transform(int32_t in_dim) const = 0; virtual void print(std::ostream& out) const = 0; }; @@ -40,7 +40,7 @@ struct TransformStack : public Transform { std::shared_ptr&& parent); public: - virtual Legion::Domain transform(const Legion::Domain& input) const override; + virtual Domain transform(const Domain& input) const override; virtual Legion::DomainAffineTransform inverse_transform(int32_t in_dim) const override; virtual void print(std::ostream& out) const override; @@ -61,7 +61,7 @@ class Shift : public StoreTransform { Shift(int32_t dim, int64_t offset); public: - virtual Legion::Domain transform(const Legion::Domain& input) const override; + virtual Domain transform(const Domain& input) const override; virtual Legion::DomainAffineTransform inverse_transform(int32_t in_dim) const override; virtual void print(std::ostream& out) const override; @@ -78,7 +78,7 @@ class Promote : public StoreTransform { Promote(int32_t extra_dim, int64_t dim_size); public: - virtual Legion::Domain transform(const Legion::Domain& input) const override; + virtual Domain transform(const Domain& input) const override; virtual Legion::DomainAffineTransform inverse_transform(int32_t in_dim) const override; virtual void print(std::ostream& out) const override; @@ -96,7 +96,7 @@ class Project : public StoreTransform { virtual ~Project() {} public: - virtual Legion::Domain transform(const Legion::Domain& domain) const override; + virtual Domain transform(const 
Domain& domain) const override; virtual Legion::DomainAffineTransform inverse_transform(int32_t in_dim) const override; virtual void print(std::ostream& out) const override; @@ -113,7 +113,7 @@ class Transpose : public StoreTransform { Transpose(std::vector&& axes); public: - virtual Legion::Domain transform(const Legion::Domain& domain) const override; + virtual Domain transform(const Domain& domain) const override; virtual Legion::DomainAffineTransform inverse_transform(int32_t in_dim) const override; virtual void print(std::ostream& out) const override; @@ -129,7 +129,7 @@ class Delinearize : public StoreTransform { Delinearize(int32_t dim, std::vector&& sizes); public: - virtual Legion::Domain transform(const Legion::Domain& domain) const override; + virtual Domain transform(const Domain& domain) const override; virtual Legion::DomainAffineTransform inverse_transform(int32_t in_dim) const override; virtual void print(std::ostream& out) const override; diff --git a/src/core/mapping/base_mapper.cc b/src/core/mapping/base_mapper.cc index 0fe6075e9..65c9f1537 100644 --- a/src/core/mapping/base_mapper.cc +++ b/src/core/mapping/base_mapper.cc @@ -30,12 +30,6 @@ #include "core/utilities/linearize.h" #include "legate_defines.h" -using LegionTask = Legion::Task; -using LegionCopy = Legion::Copy; - -using namespace Legion; -using namespace Legion::Mapping; - namespace legate { namespace mapping { @@ -54,9 +48,9 @@ const std::vector& default_store_targets(Processor::Kind kind) return finder->second; } -std::string log_mappable(const Mappable& mappable, bool prefix_only = false) +std::string log_mappable(const Legion::Mappable& mappable, bool prefix_only = false) { - static const std::map prefixes = { + static const std::map prefixes = { {LEGION_TASK_MAPPABLE, "Task "}, {LEGION_COPY_MAPPABLE, "Copy "}, {LEGION_INLINE_MAPPABLE, "Inline mapping "}, @@ -75,20 +69,24 @@ std::string log_mappable(const Mappable& mappable, bool prefix_only = false) } // namespace 
-BaseMapper::BaseMapper(Runtime* rt, Machine m, const LibraryContext& ctx) +BaseMapper::BaseMapper(std::unique_ptr legate_mapper, + Legion::Runtime* rt, + Legion::Machine m, + const LibraryContext& ctx) : Mapper(rt->get_mapper_runtime()), + legate_mapper_(std::move(legate_mapper)), legion_runtime(rt), machine(m), context(ctx), local_node(get_local_node()), - total_nodes(get_total_nodes(m)), + total_nodes_(get_total_nodes(m)), mapper_name(std::move(create_name(local_node))), logger(create_logger_name().c_str()), local_instances(InstanceManager::get_instance_manager()), reduction_instances(ReductionInstanceManager::get_instance_manager()) { // Query to find all our local processors - Machine::ProcessorQuery local_procs(machine); + Legion::Machine::ProcessorQuery local_procs(machine); local_procs.local_address_space(); for (auto local_proc : local_procs) { switch (local_proc.kind()) { @@ -108,20 +106,20 @@ BaseMapper::BaseMapper(Runtime* rt, Machine m, const LibraryContext& ctx) } } // Now do queries to find all our local memories - Machine::MemoryQuery local_sysmem(machine); + Legion::Machine::MemoryQuery local_sysmem(machine); local_sysmem.local_address_space(); local_sysmem.only_kind(Memory::SYSTEM_MEM); assert(local_sysmem.count() > 0); local_system_memory = local_sysmem.first(); if (!local_gpus.empty()) { - Machine::MemoryQuery local_zcmem(machine); + Legion::Machine::MemoryQuery local_zcmem(machine); local_zcmem.local_address_space(); local_zcmem.only_kind(Memory::Z_COPY_MEM); assert(local_zcmem.count() > 0); local_zerocopy_memory = local_zcmem.first(); } for (auto& local_gpu : local_gpus) { - Machine::MemoryQuery local_framebuffer(machine); + Legion::Machine::MemoryQuery local_framebuffer(machine); local_framebuffer.local_address_space(); local_framebuffer.only_kind(Memory::GPU_FB_MEM); local_framebuffer.best_affinity_to(local_gpu); @@ -129,7 +127,7 @@ BaseMapper::BaseMapper(Runtime* rt, Machine m, const LibraryContext& ctx) local_frame_buffers[local_gpu] = 
local_framebuffer.first(); } for (auto& local_omp : local_omps) { - Machine::MemoryQuery local_numa(machine); + Legion::Machine::MemoryQuery local_numa(machine); local_numa.local_address_space(); local_numa.only_kind(Memory::SOCKET_MEM); local_numa.best_affinity_to(local_omp); @@ -139,9 +137,11 @@ BaseMapper::BaseMapper(Runtime* rt, Machine m, const LibraryContext& ctx) local_numa_domains[local_omp] = local_system_memory; } generate_prime_factors(); + + legate_mapper_->set_machine(this); } -BaseMapper::~BaseMapper(void) +BaseMapper::~BaseMapper() { // Compute the size of all our remaining instances in each memory const char* show_usage = getenv("LEGATE_SHOW_USAGE"); @@ -168,22 +168,22 @@ BaseMapper::~BaseMapper(void) } } -/*static*/ AddressSpace BaseMapper::get_local_node(void) +/*static*/ Legion::AddressSpace BaseMapper::get_local_node() { Processor p = Processor::get_executing_processor(); return p.address_space(); } -/*static*/ size_t BaseMapper::get_total_nodes(Machine m) +/*static*/ size_t BaseMapper::get_total_nodes(Legion::Machine m) { - Machine::ProcessorQuery query(m); + Legion::Machine::ProcessorQuery query(m); query.only_kind(Processor::LOC_PROC); - std::set spaces; + std::set spaces; for (auto proc : query) spaces.insert(proc.address_space()); return spaces.size(); } -std::string BaseMapper::create_name(AddressSpace node) const +std::string BaseMapper::create_name(Legion::AddressSpace node) const { std::stringstream ss; ss << context.get_library_name() << " on Node " << node; @@ -197,15 +197,15 @@ std::string BaseMapper::create_logger_name() const return ss.str(); } -const char* BaseMapper::get_mapper_name(void) const { return mapper_name.c_str(); } +const char* BaseMapper::get_mapper_name() const { return mapper_name.c_str(); } -Mapper::MapperSyncModel BaseMapper::get_mapper_sync_model(void) const +Legion::Mapping::Mapper::MapperSyncModel BaseMapper::get_mapper_sync_model() const { return SERIALIZED_REENTRANT_MAPPER_MODEL; } -void 
BaseMapper::select_task_options(const MapperContext ctx, - const LegionTask& task, +void BaseMapper::select_task_options(const Legion::Mapping::MapperContext ctx, + const Legion::Task& task, TaskOptions& output) { #ifdef LEGATE_USE_COLLECTIVE @@ -232,27 +232,27 @@ void BaseMapper::select_task_options(const MapperContext ctx, options.push_back(TaskTarget::CPU); Task legate_task(&task, context, runtime, ctx); - auto target = task_target(legate_task, options); + auto target = legate_mapper_->task_target(legate_task, options); dispatch(target, [&output](auto& procs) { output.initial_proc = procs.front(); }); // We never want valid instances output.valid_instances = false; } -void BaseMapper::premap_task(const MapperContext ctx, - const LegionTask& task, +void BaseMapper::premap_task(const Legion::Mapping::MapperContext ctx, + const Legion::Task& task, const PremapTaskInput& input, PremapTaskOutput& output) { // NO-op since we know that all our futures should be mapped in the system memory } -void BaseMapper::slice_auto_task(const MapperContext ctx, - const LegionTask& task, +void BaseMapper::slice_auto_task(const Legion::Mapping::MapperContext ctx, + const Legion::Task& task, const SliceTaskInput& input, SliceTaskOutput& output) { - ProjectionID projection = 0; + Legion::ProjectionID projection = 0; for (auto& req : task.regions) if (req.tag == LEGATE_CORE_KEY_STORE_TAG) { projection = req.projection; @@ -310,8 +310,7 @@ void BaseMapper::generate_prime_factors() if (local_cpus.size() > 0) generate_prime_factor(local_cpus, Processor::LOC_PROC); } -const std::vector BaseMapper::get_processor_grid(Legion::Processor::Kind kind, - int32_t ndim) +const std::vector BaseMapper::get_processor_grid(Processor::Kind kind, int32_t ndim) { auto key = std::make_pair(kind, ndim); auto finder = proc_grids.find(key); @@ -337,8 +336,8 @@ const std::vector BaseMapper::get_processor_grid(Legion::Processor::Kin return pitches; } -void BaseMapper::slice_manual_task(const MapperContext ctx, - 
const LegionTask& task, +void BaseMapper::slice_manual_task(const Legion::Mapping::MapperContext ctx, + const Legion::Task& task, const SliceTaskInput& input, SliceTaskOutput& output) { @@ -358,8 +357,8 @@ void BaseMapper::slice_manual_task(const MapperContext ctx, dispatch(task.target_proc.kind(), distribute); } -void BaseMapper::slice_task(const MapperContext ctx, - const LegionTask& task, +void BaseMapper::slice_task(const Legion::Mapping::MapperContext ctx, + const Legion::Task& task, const SliceTaskInput& input, SliceTaskOutput& output) { @@ -369,23 +368,25 @@ void BaseMapper::slice_task(const MapperContext ctx, slice_auto_task(ctx, task, input, output); } -bool BaseMapper::has_variant(const MapperContext ctx, const LegionTask& task, Processor::Kind kind) +bool BaseMapper::has_variant(const Legion::Mapping::MapperContext ctx, + const Legion::Task& task, + Processor::Kind kind) { return find_variant(ctx, task, kind).has_value(); } -std::optional BaseMapper::find_variant(const MapperContext ctx, - const LegionTask& task, - Processor::Kind kind) +std::optional BaseMapper::find_variant(const Legion::Mapping::MapperContext ctx, + const Legion::Task& task, + Processor::Kind kind) { const VariantCacheKey key(task.task_id, kind); auto finder = variants.find(key); if (finder != variants.end()) return finder->second; // Haven't seen it before so let's look it up to make sure it exists - std::vector avail_variants; + std::vector avail_variants; runtime->find_valid_variants(ctx, key.first, avail_variants, key.second); - std::optional result; + std::optional result; for (auto vid : avail_variants) { #ifdef DEBUG_LEGATE assert(vid > 0); @@ -404,13 +405,14 @@ std::optional BaseMapper::find_variant(const MapperContext ctx, return result; } -void BaseMapper::map_task(const MapperContext ctx, - const LegionTask& task, +void BaseMapper::map_task(const Legion::Mapping::MapperContext ctx, + const Legion::Task& task, const MapTaskInput& input, MapTaskOutput& output) { #ifdef 
DEBUG_LEGATE - logger.debug() << "Entering map_task for " << Utilities::to_string(runtime, ctx, task); + logger.debug() << "Entering map_task for " + << Legion::Mapping::Utilities::to_string(runtime, ctx, task); #endif // Should never be mapping the top-level task here @@ -429,7 +431,7 @@ void BaseMapper::map_task(const MapperContext ctx, const auto& options = default_store_targets(task.target_proc.kind()); - auto mappings = store_mappings(legate_task, options); + auto mappings = legate_mapper_->store_mappings(legate_task, options); auto validate_colocation = [this](const auto& mapping) { if (mapping.stores.empty()) { @@ -533,27 +535,28 @@ void BaseMapper::map_task(const MapperContext ctx, output.output_targets[req_idx] = get_target_memory(task.target_proc, mapping.policy.target); auto ndim = mapping.store().dim(); // FIXME: Unbound stores can have more than one dimension later - std::vector dimension_ordering; + std::vector dimension_ordering; for (int32_t dim = ndim - 1; dim >= 0; --dim) - dimension_ordering.push_back( - static_cast(static_cast(DimensionKind::LEGION_DIM_X) + dim)); - dimension_ordering.push_back(DimensionKind::LEGION_DIM_F); + dimension_ordering.push_back(static_cast( + static_cast(Legion::DimensionKind::LEGION_DIM_X) + dim)); + dimension_ordering.push_back(Legion::DimensionKind::LEGION_DIM_F); output.output_constraints[req_idx].ordering_constraint = - OrderingConstraint(dimension_ordering, false); + Legion::OrderingConstraint(dimension_ordering, false); } }; map_unbound_stores(for_unbound_stores); output.chosen_instances.resize(task.regions.size()); - std::map*> output_map; + std::map*> + output_map; for (uint32_t idx = 0; idx < task.regions.size(); ++idx) output_map[&task.regions[idx]] = &output.chosen_instances[idx]; map_legate_stores(ctx, task, for_stores, task.target_proc, output_map); } -void BaseMapper::map_replicate_task(const MapperContext ctx, - const LegionTask& task, +void BaseMapper::map_replicate_task(const 
Legion::Mapping::MapperContext ctx, + const Legion::Task& task, const MapTaskInput& input, const MapTaskOutput& def_output, MapReplicateTaskOutput& output) @@ -574,18 +577,18 @@ Memory BaseMapper::get_target_memory(Processor proc, StoreTarget target) return Memory::NO_MEMORY; } -void BaseMapper::map_legate_stores(const MapperContext ctx, - const Mappable& mappable, +void BaseMapper::map_legate_stores(const Legion::Mapping::MapperContext ctx, + const Legion::Mappable& mappable, std::vector& mappings, Processor target_proc, OutputMap& output_map) { auto try_mapping = [&](bool can_fail) { - const PhysicalInstance NO_INST{}; - std::vector instances; + const Legion::Mapping::PhysicalInstance NO_INST{}; + std::vector instances; for (auto& mapping : mappings) { - PhysicalInstance result = NO_INST; - auto reqs = mapping.requirements(); + Legion::Mapping::PhysicalInstance result = NO_INST; + auto reqs = mapping.requirements(); while (map_legate_store(ctx, mappable, mapping, reqs, target_proc, result, can_fail)) { if (NO_INST == result) { #ifdef DEBUG_LEGATE @@ -610,10 +613,10 @@ void BaseMapper::map_legate_stores(const MapperContext ctx, << " for reqs:" << reqs_ss.str(); #endif if ((*reqs.begin())->redop != 0) { - AutoLock lock(ctx, reduction_instances->manager_lock()); + Legion::Mapping::AutoLock lock(ctx, reduction_instances->manager_lock()); reduction_instances->erase(result); } else { - AutoLock lock(ctx, local_instances->manager_lock()); + Legion::Mapping::AutoLock lock(ctx, local_instances->manager_lock()); local_instances->erase(result); } result = NO_INST; @@ -649,14 +652,14 @@ void BaseMapper::map_legate_stores(const MapperContext ctx, } } -void BaseMapper::tighten_write_policies(const Mappable& mappable, +void BaseMapper::tighten_write_policies(const Legion::Mappable& mappable, std::vector& mappings) { for (auto& mapping : mappings) { // If the policy is exact, there's nothing we can tighten if (mapping.policy.exact) continue; - PrivilegeMode priv = 
LEGION_NO_ACCESS; + int32_t priv = LEGION_NO_ACCESS; for (auto* req : mapping.requirements()) priv |= req->privilege; // We tighten only write requirements if (!(priv & LEGION_WRITE_PRIV)) continue; @@ -671,22 +674,22 @@ void BaseMapper::tighten_write_policies(const Mappable& mappable, } } -bool BaseMapper::map_legate_store(const MapperContext ctx, - const Mappable& mappable, +bool BaseMapper::map_legate_store(const Legion::Mapping::MapperContext ctx, + const Legion::Mappable& mappable, const StoreMapping& mapping, - const std::set& reqs, + const std::set& reqs, Processor target_proc, - PhysicalInstance& result, + Legion::Mapping::PhysicalInstance& result, bool can_fail) { if (reqs.empty()) return false; const auto& policy = mapping.policy; - std::vector regions; + std::vector regions; for (auto* req : reqs) regions.push_back(req->region); auto target_memory = get_target_memory(target_proc, policy.target); - ReductionOpID redop = (*reqs.begin())->redop; + auto redop = (*reqs.begin())->redop; #ifdef DEBUG_LEGATE for (auto* req : reqs) { if (redop != req->redop) { @@ -699,7 +702,7 @@ bool BaseMapper::map_legate_store(const MapperContext ctx, #endif // Generate layout constraints from the store mapping - LayoutConstraintSet layout_constraints; + Legion::LayoutConstraintSet layout_constraints; mapping.populate_layout_constraints(layout_constraints); auto& fields = layout_constraints.field_constraint.field_set; @@ -707,7 +710,7 @@ bool BaseMapper::map_legate_store(const MapperContext ctx, if (redop != 0) { // We need to hold the instance manager lock as we're about to try // to find an instance - AutoLock reduction_lock(ctx, reduction_instances->manager_lock()); + Legion::Mapping::AutoLock reduction_lock(ctx, reduction_instances->manager_lock()); // This whole process has to appear atomic runtime->disable_reentrant(ctx); @@ -730,7 +733,8 @@ bool BaseMapper::map_legate_store(const MapperContext ctx, } // if we didn't find it, create one - 
layout_constraints.add_constraint(SpecializedConstraint(REDUCTION_FOLD_SPECIALIZE, redop)); + layout_constraints.add_constraint( + Legion::SpecializedConstraint(REDUCTION_FOLD_SPECIALIZE, redop)); size_t footprint = 0; if (runtime->create_physical_instance(ctx, target_memory, @@ -745,7 +749,7 @@ bool BaseMapper::map_legate_store(const MapperContext ctx, Realm::LoggerMessage msg = logger.debug(); msg << "Operation " << mappable.get_unique_id() << ": created reduction instance " << result << " for"; - for (LogicalRegion r : regions) msg << " " << r; + for (auto& r : regions) msg << " " << r; msg << " (size: " << footprint << " bytes, memory: " << target_memory << ")"; #endif if (target_proc.kind() == Processor::TOC_PROC) { @@ -760,11 +764,11 @@ bool BaseMapper::map_legate_store(const MapperContext ctx, return false; } if (!can_fail) - report_failed_mapping(mappable, mapping.requirement_index(), target_memory, redop); + report_failed_mapping(mappable, mapping.requirement_index(), target_memory, redop, footprint); return true; } - AutoLock lock(ctx, local_instances->manager_lock()); + Legion::Mapping::AutoLock lock(ctx, local_instances->manager_lock()); runtime->disable_reentrant(ctx); // See if we already have it in our local instances if (fields.size() == 1 && regions.size() == 1 && @@ -790,7 +794,7 @@ bool BaseMapper::map_legate_store(const MapperContext ctx, // that instance for all the tasks for the different regions. 
// First we have to see if there is anything we overlap with auto fid = fields.front(); - const IndexSpace is = regions.front().get_index_space(); + auto is = regions.front().get_index_space(); const Domain domain = runtime->get_index_space_domain(ctx, is); group = local_instances->find_region_group(regions.front(), domain, fid, target_memory, policy.exact); @@ -859,15 +863,17 @@ bool BaseMapper::map_legate_store(const MapperContext ctx, // If we make it here then we failed entirely if (!can_fail) { auto req_indices = mapping.requirement_indices(); - for (auto req_idx : req_indices) report_failed_mapping(mappable, req_idx, target_memory, redop); + for (auto req_idx : req_indices) + report_failed_mapping(mappable, req_idx, target_memory, redop, footprint); } return true; } -void BaseMapper::report_failed_mapping(const Mappable& mappable, +void BaseMapper::report_failed_mapping(const Legion::Mappable& mappable, uint32_t index, Memory target_memory, - ReductionOpID redop) + Legion::ReductionOpID redop, + size_t footprint) { static const char* memory_kinds[] = { #define MEM_NAMES(name, desc) desc, @@ -876,7 +882,7 @@ void BaseMapper::report_failed_mapping(const Mappable& mappable, }; std::string opname = ""; - if (mappable.get_mappable_type() == Mappable::TASK_MAPPABLE) { + if (mappable.get_mappable_type() == Legion::Mappable::TASK_MAPPABLE) { const auto task = mappable.as_task(); opname = task->get_task_name(); } @@ -890,20 +896,37 @@ void BaseMapper::report_failed_mapping(const Mappable& mappable, else req_ss << "region requirement " << index; - logger.error("Mapper %s failed to map %s of %s%s[%s] (UID %lld) into %s memory " IDFMT, + logger.error("Mapper %s failed to allocate %zd bytes on memory " IDFMT + " (of kind %s: %s) for %s of %s%s[%s] (UID %lld).\n" + "This means Legate was unable to reserve ouf of its memory pool the full amount " + "required for the above operation. 
Here are some things to try:\n" + "* Make sure your code is not impeding the garbage collection of Legate-backed " + "objects, e.g. by storing references in caches, or creating reference cycles.\n" + "* Ask Legate to reserve more space on the above memory, using the appropriate " + "--*mem legate flag.\n" + "* Assign less memory to the eager pool, by reducing --eager-alloc-percentage.\n" + "* If running on multiple nodes, increase how often distributed garbage collection " + "runs, by reducing LEGATE_FIELD_REUSE_FREQ (default: 32, warning: may " + "incur overhead).\n" + "* Adapt your code to reduce temporary storage requirements, e.g. by breaking up " + "larger operations into batches.\n" + "* If the previous steps don't help, and you are confident Legate should be able to " + "handle your code's working set, please open an issue on Legate's bug tracker.", get_mapper_name(), + footprint, + target_memory.id, + Legion::Mapping::Utilities::to_string(target_memory.kind()), + memory_kinds[target_memory.kind()], req_ss.str().c_str(), log_mappable(mappable, true /*prefix_only*/).c_str(), opname.c_str(), provenance.c_str(), - mappable.get_unique_id(), - memory_kinds[target_memory.kind()], - target_memory.id); + mappable.get_unique_id()); LEGATE_ABORT; } -void BaseMapper::select_task_variant(const MapperContext ctx, - const LegionTask& task, +void BaseMapper::select_task_variant(const Legion::Mapping::MapperContext ctx, + const Legion::Task& task, const SelectVariantInput& input, SelectVariantOutput& output) { @@ -914,8 +937,8 @@ void BaseMapper::select_task_variant(const MapperContext ctx, output.chosen_variant = *variant; } -void BaseMapper::postmap_task(const MapperContext ctx, - const LegionTask& task, +void BaseMapper::postmap_task(const Legion::Mapping::MapperContext ctx, + const Legion::Task& task, const PostMapInput& input, PostMapOutput& output) { @@ -923,8 +946,8 @@ void BaseMapper::postmap_task(const MapperContext ctx, LEGATE_ABORT; } -void 
BaseMapper::select_task_sources(const MapperContext ctx, - const LegionTask& task, +void BaseMapper::select_task_sources(const Legion::Mapping::MapperContext ctx, + const Legion::Task& task, const SelectTaskSrcInput& input, SelectTaskSrcOutput& output) { @@ -932,17 +955,18 @@ void BaseMapper::select_task_sources(const MapperContext ctx, ctx, input.target, input.source_instances, input.collective_views, output.chosen_ranking); } -void add_instance_to_band_ranking(const PhysicalInstance& instance, - const Legion::AddressSpace& local_node, - std::map& source_memories, - std::vector>& band_ranking, - const Memory& destination_memory, - const Legion::Machine& machine) +void add_instance_to_band_ranking( + const Legion::Mapping::PhysicalInstance& instance, + const Legion::AddressSpace& local_node, + std::map& source_memories, + std::vector>& band_ranking, + const Memory& destination_memory, + const Legion::Machine& machine) { Memory location = instance.get_location(); auto finder = source_memories.find(location); if (finder == source_memories.end()) { - std::vector affinity; + std::vector affinity; machine.get_mem_mem_affinity( affinity, location, destination_memory, false /*not just local affinities*/); uint32_t memory_bandwidth = 0; @@ -953,16 +977,19 @@ void add_instance_to_band_ranking(const PhysicalInstance& instance, memory_bandwidth = affinity[0].bandwidth; } source_memories[location] = memory_bandwidth; - band_ranking.push_back(std::pair(instance, memory_bandwidth)); + band_ranking.push_back( + std::pair(instance, memory_bandwidth)); } else - band_ranking.push_back(std::pair(instance, finder->second)); + band_ranking.push_back( + std::pair(instance, finder->second)); } -void BaseMapper::legate_select_sources(const MapperContext ctx, - const PhysicalInstance& target, - const std::vector& sources, - const std::vector& collective_sources, - std::deque& ranking) +void BaseMapper::legate_select_sources( + const Legion::Mapping::MapperContext ctx, + const 
Legion::Mapping::PhysicalInstance& target, + const std::vector& sources, + const std::vector& collective_sources, + std::deque& ranking) { std::map source_memories; // For right now we'll rank instances by the bandwidth of the memory @@ -970,18 +997,22 @@ void BaseMapper::legate_select_sources(const MapperContext ctx, // TODO: consider layouts when ranking source to help out the DMA system Memory destination_memory = target.get_location(); // fill in a vector of the sources with their bandwidths and sort them - std::vector> band_ranking; + std::vector> band_ranking; for (uint32_t idx = 0; idx < sources.size(); idx++) { - const PhysicalInstance& instance = sources[idx]; + const Legion::Mapping::PhysicalInstance& instance = sources[idx]; add_instance_to_band_ranking( instance, local_node, source_memories, band_ranking, destination_memory, machine); } for (uint32_t idx = 0; idx < collective_sources.size(); idx++) { - std::vector col_instances; + std::vector col_instances; collective_sources[idx].find_instances_nearest_memory(destination_memory, col_instances); +#ifdef DEBUG_LEGATE + // there must exist at least one instance in the collective view + assert(!col_instances.empty()); +#endif // we need only first instance if there are several - const PhysicalInstance& instance = col_instances[0]; + const Legion::Mapping::PhysicalInstance& instance = col_instances[0]; add_instance_to_band_ranking( instance, local_node, source_memories, band_ranking, destination_memory, machine); } @@ -1000,25 +1031,25 @@ void BaseMapper::legate_select_sources(const MapperContext ctx, ranking.push_back(it->first); } -void BaseMapper::speculate(const MapperContext ctx, - const LegionTask& task, +void BaseMapper::speculate(const Legion::Mapping::MapperContext ctx, + const Legion::Task& task, SpeculativeOutput& output) { output.speculate = false; } -void BaseMapper::report_profiling(const MapperContext ctx, - const LegionTask& task, +void BaseMapper::report_profiling(const 
Legion::Mapping::MapperContext ctx, + const Legion::Task& task, const TaskProfilingInfo& input) { // Shouldn't get any profiling feedback currently LEGATE_ABORT; } -ShardingID BaseMapper::find_sharding_functor_by_key_store_projection( - const std::vector& requirements) +Legion::ShardingID BaseMapper::find_sharding_functor_by_key_store_projection( + const std::vector& requirements) { - ProjectionID proj_id = 0; + Legion::ProjectionID proj_id = 0; for (auto& requirement : requirements) if (LEGATE_CORE_KEY_STORE_TAG == requirement.tag) { proj_id = requirement.projection; @@ -1027,8 +1058,8 @@ ShardingID BaseMapper::find_sharding_functor_by_key_store_projection( return find_sharding_functor_by_projection_functor(proj_id); } -void BaseMapper::select_sharding_functor(const MapperContext ctx, - const LegionTask& task, +void BaseMapper::select_sharding_functor(const Legion::Mapping::MapperContext ctx, + const Legion::Task& task, const SelectShardingFunctorInput& input, SelectShardingFunctorOutput& output) { @@ -1037,8 +1068,8 @@ void BaseMapper::select_sharding_functor(const MapperContext ctx, : find_sharding_functor_by_projection_functor(0); } -void BaseMapper::map_inline(const MapperContext ctx, - const InlineMapping& inline_op, +void BaseMapper::map_inline(const Legion::Mapping::MapperContext ctx, + const Legion::InlineMapping& inline_op, const MapInlineInput& input, MapInlineOutput& output) { @@ -1058,14 +1089,15 @@ void BaseMapper::map_inline(const MapperContext ctx, std::vector mappings; mappings.push_back(StoreMapping::default_mapping(store, store_target, false)); - std::map*> output_map; + std::map*> + output_map; for (auto* req : mappings.front().requirements()) output_map[req] = &output.chosen_instances; map_legate_stores(ctx, inline_op, mappings, target_proc, output_map); } -void BaseMapper::select_inline_sources(const MapperContext ctx, - const InlineMapping& inline_op, +void BaseMapper::select_inline_sources(const Legion::Mapping::MapperContext ctx, + const 
Legion::InlineMapping& inline_op, const SelectInlineSrcInput& input, SelectInlineSrcOutput& output) { @@ -1073,16 +1105,16 @@ void BaseMapper::select_inline_sources(const MapperContext ctx, ctx, input.target, input.source_instances, input.collective_views, output.chosen_ranking); } -void BaseMapper::report_profiling(const MapperContext ctx, - const InlineMapping& inline_op, +void BaseMapper::report_profiling(const Legion::Mapping::MapperContext ctx, + const Legion::InlineMapping& inline_op, const InlineProfilingInfo& input) { // No profiling yet for inline mappings LEGATE_ABORT; } -void BaseMapper::map_copy(const MapperContext ctx, - const LegionCopy& copy, +void BaseMapper::map_copy(const Legion::Mapping::MapperContext ctx, + const Legion::Copy& copy, const MapCopyInput& input, MapCopyOutput& output) { @@ -1124,7 +1156,8 @@ void BaseMapper::map_copy(const MapperContext ctx, Copy legate_copy(©, runtime, ctx); - std::map*> output_map; + std::map*> + output_map; auto add_to_output_map = [&output_map](auto& reqs, auto& instances) { instances.resize(reqs.size()); for (uint32_t idx = 0; idx < reqs.size(); ++idx) output_map[&reqs[idx]] = &instances[idx]; @@ -1161,8 +1194,8 @@ void BaseMapper::map_copy(const MapperContext ctx, map_legate_stores(ctx, copy, mappings, target_proc, output_map); } -void BaseMapper::select_copy_sources(const MapperContext ctx, - const LegionCopy& copy, +void BaseMapper::select_copy_sources(const Legion::Mapping::MapperContext ctx, + const Legion::Copy& copy, const SelectCopySrcInput& input, SelectCopySrcOutput& output) { @@ -1170,23 +1203,23 @@ void BaseMapper::select_copy_sources(const MapperContext ctx, ctx, input.target, input.source_instances, input.collective_views, output.chosen_ranking); } -void BaseMapper::speculate(const MapperContext ctx, - const LegionCopy& copy, +void BaseMapper::speculate(const Legion::Mapping::MapperContext ctx, + const Legion::Copy& copy, SpeculativeOutput& output) { output.speculate = false; } -void 
BaseMapper::report_profiling(const MapperContext ctx, - const LegionCopy& copy, +void BaseMapper::report_profiling(const Legion::Mapping::MapperContext ctx, + const Legion::Copy& copy, const CopyProfilingInfo& input) { // No profiling for copies yet LEGATE_ABORT; } -void BaseMapper::select_sharding_functor(const MapperContext ctx, - const LegionCopy& copy, +void BaseMapper::select_sharding_functor(const Legion::Mapping::MapperContext ctx, + const Legion::Copy& copy, const SelectShardingFunctorInput& input, SelectShardingFunctorOutput& output) { @@ -1194,8 +1227,8 @@ void BaseMapper::select_sharding_functor(const MapperContext ctx, output.chosen_functor = find_sharding_functor_by_projection_functor(0); } -void BaseMapper::select_close_sources(const MapperContext ctx, - const Close& close, +void BaseMapper::select_close_sources(const Legion::Mapping::MapperContext ctx, + const Legion::Close& close, const SelectCloseSrcInput& input, SelectCloseSrcOutput& output) { @@ -1203,63 +1236,63 @@ void BaseMapper::select_close_sources(const MapperContext ctx, ctx, input.target, input.source_instances, input.collective_views, output.chosen_ranking); } -void BaseMapper::report_profiling(const MapperContext ctx, - const Close& close, +void BaseMapper::report_profiling(const Legion::Mapping::MapperContext ctx, + const Legion::Close& close, const CloseProfilingInfo& input) { // No profiling yet for legate LEGATE_ABORT; } -void BaseMapper::select_sharding_functor(const MapperContext ctx, - const Close& close, +void BaseMapper::select_sharding_functor(const Legion::Mapping::MapperContext ctx, + const Legion::Close& close, const SelectShardingFunctorInput& input, SelectShardingFunctorOutput& output) { LEGATE_ABORT; } -void BaseMapper::map_acquire(const MapperContext ctx, - const Acquire& acquire, +void BaseMapper::map_acquire(const Legion::Mapping::MapperContext ctx, + const Legion::Acquire& acquire, const MapAcquireInput& input, MapAcquireOutput& output) { // Nothing to do } -void 
BaseMapper::speculate(const MapperContext ctx, - const Acquire& acquire, +void BaseMapper::speculate(const Legion::Mapping::MapperContext ctx, + const Legion::Acquire& acquire, SpeculativeOutput& output) { output.speculate = false; } -void BaseMapper::report_profiling(const MapperContext ctx, - const Acquire& acquire, +void BaseMapper::report_profiling(const Legion::Mapping::MapperContext ctx, + const Legion::Acquire& acquire, const AcquireProfilingInfo& input) { // No profiling for legate yet LEGATE_ABORT; } -void BaseMapper::select_sharding_functor(const MapperContext ctx, - const Acquire& acquire, +void BaseMapper::select_sharding_functor(const Legion::Mapping::MapperContext ctx, + const Legion::Acquire& acquire, const SelectShardingFunctorInput& input, SelectShardingFunctorOutput& output) { LEGATE_ABORT; } -void BaseMapper::map_release(const MapperContext ctx, - const Release& release, +void BaseMapper::map_release(const Legion::Mapping::MapperContext ctx, + const Legion::Release& release, const MapReleaseInput& input, MapReleaseOutput& output) { // Nothing to do } -void BaseMapper::select_release_sources(const MapperContext ctx, - const Release& release, +void BaseMapper::select_release_sources(const Legion::Mapping::MapperContext ctx, + const Legion::Release& release, const SelectReleaseSrcInput& input, SelectReleaseSrcOutput& output) { @@ -1267,31 +1300,31 @@ void BaseMapper::select_release_sources(const MapperContext ctx, ctx, input.target, input.source_instances, input.collective_views, output.chosen_ranking); } -void BaseMapper::speculate(const MapperContext ctx, - const Release& release, +void BaseMapper::speculate(const Legion::Mapping::MapperContext ctx, + const Legion::Release& release, SpeculativeOutput& output) { output.speculate = false; } -void BaseMapper::report_profiling(const MapperContext ctx, - const Release& release, +void BaseMapper::report_profiling(const Legion::Mapping::MapperContext ctx, + const Legion::Release& release, const 
ReleaseProfilingInfo& input) { // No profiling for legate yet LEGATE_ABORT; } -void BaseMapper::select_sharding_functor(const MapperContext ctx, - const Release& release, +void BaseMapper::select_sharding_functor(const Legion::Mapping::MapperContext ctx, + const Legion::Release& release, const SelectShardingFunctorInput& input, SelectShardingFunctorOutput& output) { LEGATE_ABORT; } -void BaseMapper::select_partition_projection(const MapperContext ctx, - const Partition& partition, +void BaseMapper::select_partition_projection(const Legion::Mapping::MapperContext ctx, + const Legion::Partition& partition, const SelectPartitionProjectionInput& input, SelectPartitionProjectionOutput& output) { @@ -1299,11 +1332,11 @@ void BaseMapper::select_partition_projection(const MapperContext ctx, if (!input.open_complete_partitions.empty()) output.chosen_partition = input.open_complete_partitions[0]; else - output.chosen_partition = LogicalPartition::NO_PART; + output.chosen_partition = Legion::LogicalPartition::NO_PART; } -void BaseMapper::map_partition(const MapperContext ctx, - const Partition& partition, +void BaseMapper::map_partition(const Legion::Mapping::MapperContext ctx, + const Legion::Partition& partition, const MapPartitionInput& input, MapPartitionOutput& output) { @@ -1323,14 +1356,15 @@ void BaseMapper::map_partition(const MapperContext ctx, std::vector mappings; mappings.push_back(StoreMapping::default_mapping(store, store_target, false)); - std::map*> output_map; + std::map*> + output_map; for (auto* req : mappings.front().requirements()) output_map[req] = &output.chosen_instances; map_legate_stores(ctx, partition, mappings, target_proc, output_map); } -void BaseMapper::select_partition_sources(const MapperContext ctx, - const Partition& partition, +void BaseMapper::select_partition_sources(const Legion::Mapping::MapperContext ctx, + const Legion::Partition& partition, const SelectPartitionSrcInput& input, SelectPartitionSrcOutput& output) { @@ -1338,24 
+1372,24 @@ void BaseMapper::select_partition_sources(const MapperContext ctx, ctx, input.target, input.source_instances, input.collective_views, output.chosen_ranking); } -void BaseMapper::report_profiling(const MapperContext ctx, - const Partition& partition, +void BaseMapper::report_profiling(const Legion::Mapping::MapperContext ctx, + const Legion::Partition& partition, const PartitionProfilingInfo& input) { // No profiling yet LEGATE_ABORT; } -void BaseMapper::select_sharding_functor(const MapperContext ctx, - const Partition& partition, +void BaseMapper::select_sharding_functor(const Legion::Mapping::MapperContext ctx, + const Legion::Partition& partition, const SelectShardingFunctorInput& input, SelectShardingFunctorOutput& output) { output.chosen_functor = find_sharding_functor_by_projection_functor(0); } -void BaseMapper::select_sharding_functor(const MapperContext ctx, - const Fill& fill, +void BaseMapper::select_sharding_functor(const Legion::Mapping::MapperContext ctx, + const Legion::Fill& fill, const SelectShardingFunctorInput& input, SelectShardingFunctorOutput& output) { @@ -1364,26 +1398,26 @@ void BaseMapper::select_sharding_functor(const MapperContext ctx, : find_sharding_functor_by_projection_functor(0); } -void BaseMapper::configure_context(const MapperContext ctx, - const LegionTask& task, +void BaseMapper::configure_context(const Legion::Mapping::MapperContext ctx, + const Legion::Task& task, ContextConfigOutput& output) { // Use the defaults currently } -void BaseMapper::select_tunable_value(const MapperContext ctx, - const LegionTask& task, +void BaseMapper::select_tunable_value(const Legion::Mapping::MapperContext ctx, + const Legion::Task& task, const SelectTunableInput& input, SelectTunableOutput& output) { - auto value = tunable_value(input.tunable_id); + auto value = legate_mapper_->tunable_value(input.tunable_id); output.size = value.size(); output.value = malloc(output.size); memcpy(output.value, value.ptr(), output.size); } -void 
BaseMapper::select_sharding_functor(const MapperContext ctx, - const MustEpoch& epoch, +void BaseMapper::select_sharding_functor(const Legion::Mapping::MapperContext ctx, + const Legion::MustEpoch& epoch, const SelectShardingFunctorInput& input, MustEpochShardingFunctorOutput& output) { @@ -1391,15 +1425,15 @@ void BaseMapper::select_sharding_functor(const MapperContext ctx, LEGATE_ABORT; } -void BaseMapper::memoize_operation(const MapperContext ctx, - const Mappable& mappable, +void BaseMapper::memoize_operation(const Legion::Mapping::MapperContext ctx, + const Legion::Mappable& mappable, const MemoizeInput& input, MemoizeOutput& output) { LEGATE_ABORT; } -void BaseMapper::map_must_epoch(const MapperContext ctx, +void BaseMapper::map_must_epoch(const Legion::Mapping::MapperContext ctx, const MapMustEpochInput& input, MapMustEpochOutput& output) { @@ -1407,7 +1441,7 @@ void BaseMapper::map_must_epoch(const MapperContext ctx, LEGATE_ABORT; } -void BaseMapper::map_dataflow_graph(const MapperContext ctx, +void BaseMapper::map_dataflow_graph(const Legion::Mapping::MapperContext ctx, const MapDataflowGraphInput& input, MapDataflowGraphOutput& output) { @@ -1415,7 +1449,7 @@ void BaseMapper::map_dataflow_graph(const MapperContext ctx, LEGATE_ABORT; } -void BaseMapper::select_tasks_to_map(const MapperContext ctx, +void BaseMapper::select_tasks_to_map(const Legion::Mapping::MapperContext ctx, const SelectMappingInput& input, SelectMappingOutput& output) { @@ -1423,14 +1457,14 @@ void BaseMapper::select_tasks_to_map(const MapperContext ctx, for (auto task : input.ready_tasks) output.map_tasks.insert(task); } -void BaseMapper::select_steal_targets(const MapperContext ctx, +void BaseMapper::select_steal_targets(const Legion::Mapping::MapperContext ctx, const SelectStealingInput& input, SelectStealingOutput& output) { // Nothing to do, no stealing in the leagte mapper currently } -void BaseMapper::permit_steal_request(const MapperContext ctx, +void 
BaseMapper::permit_steal_request(const Legion::Mapping::MapperContext ctx, const StealRequestInput& input, StealRequestOutput& output) { @@ -1438,13 +1472,15 @@ void BaseMapper::permit_steal_request(const MapperContext ctx, LEGATE_ABORT; } -void BaseMapper::handle_message(const MapperContext ctx, const MapperMessage& message) +void BaseMapper::handle_message(const Legion::Mapping::MapperContext ctx, + const MapperMessage& message) { // We shouldn't be receiving any messages currently LEGATE_ABORT; } -void BaseMapper::handle_task_result(const MapperContext ctx, const MapperTaskResult& result) +void BaseMapper::handle_task_result(const Legion::Mapping::MapperContext ctx, + const MapperTaskResult& result) { // Nothing to do since we should never get one of these LEGATE_ABORT; diff --git a/src/core/mapping/base_mapper.h b/src/core/mapping/base_mapper.h index 86e558e0b..b4c6dcfd7 100644 --- a/src/core/mapping/base_mapper.h +++ b/src/core/mapping/base_mapper.h @@ -38,10 +38,13 @@ enum class Strictness : bool { hint = false, }; -class BaseMapper : public Legion::Mapping::Mapper, public LegateMapper { +class BaseMapper : public Legion::Mapping::Mapper, public MachineQueryInterface { public: - BaseMapper(Legion::Runtime* rt, Legion::Machine machine, const LibraryContext& context); - virtual ~BaseMapper(void); + BaseMapper(std::unique_ptr legate_mapper, + Legion::Runtime* rt, + Legion::Machine machine, + const LibraryContext& context); + virtual ~BaseMapper(); private: BaseMapper(const BaseMapper& rhs) = delete; @@ -49,15 +52,22 @@ class BaseMapper : public Legion::Mapping::Mapper, public LegateMapper { protected: // Start-up methods - static Legion::AddressSpaceID get_local_node(void); + static Legion::AddressSpaceID get_local_node(); static size_t get_total_nodes(Legion::Machine m); std::string create_name(Legion::AddressSpace node) const; std::string create_logger_name() const; public: - virtual const char* get_mapper_name(void) const override; - virtual 
Legion::Mapping::Mapper::MapperSyncModel get_mapper_sync_model(void) const override; - virtual bool request_valid_instances(void) const override { return false; } + // MachineQueryInterface + virtual const std::vector& cpus() const override { return local_cpus; } + virtual const std::vector& gpus() const override { return local_gpus; } + virtual const std::vector& omps() const override { return local_omps; } + virtual uint32_t total_nodes() const override { return total_nodes_; } + + public: + virtual const char* get_mapper_name() const override; + virtual Legion::Mapping::Mapper::MapperSyncModel get_mapper_sync_model() const override; + virtual bool request_valid_instances() const override { return false; } public: // Task mapping calls virtual void select_task_options(const Legion::Mapping::MapperContext ctx, @@ -257,13 +267,13 @@ class BaseMapper : public Legion::Mapping::Mapper, public LegateMapper { const MapperTaskResult& result) override; protected: - Legion::Memory get_target_memory(Legion::Processor proc, StoreTarget target); + Memory get_target_memory(Processor proc, StoreTarget target); using OutputMap = std::map*>; void map_legate_stores(const Legion::Mapping::MapperContext ctx, const Legion::Mappable& mappable, std::vector& mappings, - Legion::Processor target_proc, + Processor target_proc, OutputMap& output_map); void tighten_write_policies(const Legion::Mappable& mappable, std::vector& mappings); @@ -271,13 +281,14 @@ class BaseMapper : public Legion::Mapping::Mapper, public LegateMapper { const Legion::Mappable& mappable, const StoreMapping& mapping, const std::set& reqs, - Legion::Processor target_proc, + Processor target_proc, Legion::Mapping::PhysicalInstance& result, bool can_fail); void report_failed_mapping(const Legion::Mappable& mappable, unsigned index, - Legion::Memory target_memory, - Legion::ReductionOpID redop); + Memory target_memory, + Legion::ReductionOpID redop, + size_t footprint); void legate_select_sources(const 
Legion::Mapping::MapperContext ctx, const Legion::Mapping::PhysicalInstance& target, const std::vector& sources, @@ -287,15 +298,14 @@ class BaseMapper : public Legion::Mapping::Mapper, public LegateMapper { protected: bool has_variant(const Legion::Mapping::MapperContext ctx, const Legion::Task& task, - Legion::Processor::Kind kind); + Processor::Kind kind); std::optional find_variant(const Legion::Mapping::MapperContext ctx, const Legion::Task& task, - Legion::Processor::Kind kind); + Processor::Kind kind); private: void generate_prime_factors(); - void generate_prime_factor(const std::vector& processors, - Legion::Processor::Kind kind); + void generate_prime_factor(const std::vector& processors, Processor::Kind kind); protected: template @@ -310,12 +320,12 @@ class BaseMapper : public Legion::Mapping::Mapper, public LegateMapper { return functor(local_cpus); } template - decltype(auto) dispatch(Legion::Processor::Kind kind, Functor functor) + decltype(auto) dispatch(Processor::Kind kind, Functor functor) { switch (kind) { - case Legion::Processor::LOC_PROC: return functor(local_cpus); - case Legion::Processor::TOC_PROC: return functor(local_gpus); - case Legion::Processor::OMP_PROC: return functor(local_omps); + case Processor::LOC_PROC: return functor(local_cpus); + case Processor::TOC_PROC: return functor(local_gpus); + case Processor::OMP_PROC: return functor(local_omps); default: LEGATE_ABORT; } assert(false); @@ -323,7 +333,7 @@ class BaseMapper : public Legion::Mapping::Mapper, public LegateMapper { } protected: - const std::vector get_processor_grid(Legion::Processor::Kind kind, int32_t ndim); + const std::vector get_processor_grid(Processor::Kind kind, int32_t ndim); void slice_auto_task(const Legion::Mapping::MapperContext ctx, const Legion::Task& task, const SliceTaskInput& input, @@ -344,28 +354,30 @@ class BaseMapper : public Legion::Mapping::Mapper, public LegateMapper { { return (left.second < right.second); } - // NumPyOpCode 
decode_task_id(Legion::TaskID tid); + + private: + std::unique_ptr legate_mapper_; public: Legion::Runtime* const legion_runtime; const Legion::Machine machine; const LibraryContext context; const Legion::AddressSpace local_node; - const size_t total_nodes; const std::string mapper_name; Legion::Logger logger; protected: - std::vector local_cpus; - std::vector local_gpus; - std::vector local_omps; // OpenMP processors + const size_t total_nodes_; + std::vector local_cpus; + std::vector local_gpus; + std::vector local_omps; // OpenMP processors protected: - Legion::Memory local_system_memory, local_zerocopy_memory; - std::map local_frame_buffers; - std::map local_numa_domains; + Memory local_system_memory, local_zerocopy_memory; + std::map local_frame_buffers; + std::map local_numa_domains; protected: - using VariantCacheKey = std::pair; + using VariantCacheKey = std::pair; std::map> variants; protected: @@ -374,8 +386,8 @@ class BaseMapper : public Legion::Mapping::Mapper, public LegateMapper { protected: // Used for n-D cyclic distribution - std::map> all_factors; - std::map, std::vector> proc_grids; + std::map> all_factors; + std::map, std::vector> proc_grids; protected: // These are used for computing sharding functions diff --git a/src/core/mapping/core_mapper.cc b/src/core/mapping/core_mapper.cc index bb9389108..0e42875a1 100644 --- a/src/core/mapping/core_mapper.cc +++ b/src/core/mapping/core_mapper.cc @@ -24,12 +24,10 @@ #endif #include "core/task/task.h" #include "core/utilities/linearize.h" +#include "core/utilities/typedefs.h" namespace legate { -using namespace Legion; -using namespace Legion::Mapping; - uint32_t extract_env(const char* env_name, const uint32_t default_value, const uint32_t test_value) { const char* env_value = getenv(env_name); @@ -48,61 +46,66 @@ uint32_t extract_env(const char* env_name, const uint32_t default_value, const u // should be overriding this mapper so we burry it in here class CoreMapper : public 
Legion::Mapping::NullMapper { public: - CoreMapper(MapperRuntime* runtime, Machine machine, const LibraryContext& context); - virtual ~CoreMapper(void); + CoreMapper(Legion::Mapping::MapperRuntime* runtime, + Legion::Machine machine, + const LibraryContext& context); + + virtual ~CoreMapper(); public: // Start-up methods - static AddressSpaceID get_local_node(void); - static size_t get_total_nodes(Machine m); - static const char* create_name(AddressSpace node); + static Legion::AddressSpaceID get_local_node(); + static size_t get_total_nodes(Legion::Machine m); + static const char* create_name(Legion::AddressSpace node); public: - virtual const char* get_mapper_name(void) const; - virtual MapperSyncModel get_mapper_sync_model(void) const; - virtual bool request_valid_instances(void) const { return false; } + const char* get_mapper_name() const override; + Legion::Mapping::Mapper::MapperSyncModel get_mapper_sync_model() const override; + bool request_valid_instances() const override { return false; } public: // Task mapping calls - virtual void select_task_options(const MapperContext ctx, const Task& task, TaskOptions& output); - virtual void slice_task(const MapperContext ctx, - const Task& task, - const SliceTaskInput& input, - SliceTaskOutput& output); - virtual void map_task(const MapperContext ctx, - const Task& task, - const MapTaskInput& input, - MapTaskOutput& output); - virtual void select_sharding_functor(const MapperContext ctx, - const Task& task, - const SelectShardingFunctorInput& input, - SelectShardingFunctorOutput& output); - virtual void select_steal_targets(const MapperContext ctx, - const SelectStealingInput& input, - SelectStealingOutput& output); - virtual void select_tasks_to_map(const MapperContext ctx, - const SelectMappingInput& input, - SelectMappingOutput& output); + void select_task_options(const Legion::Mapping::MapperContext ctx, + const Legion::Task& task, + TaskOptions& output) override; + void slice_task(const 
Legion::Mapping::MapperContext ctx, + const Legion::Task& task, + const SliceTaskInput& input, + SliceTaskOutput& output) override; + void map_task(const Legion::Mapping::MapperContext ctx, + const Legion::Task& task, + const MapTaskInput& input, + MapTaskOutput& output) override; + void select_sharding_functor(const Legion::Mapping::MapperContext ctx, + const Legion::Task& task, + const SelectShardingFunctorInput& input, + SelectShardingFunctorOutput& output) override; + void select_steal_targets(const Legion::Mapping::MapperContext ctx, + const SelectStealingInput& input, + SelectStealingOutput& output) override; + void select_tasks_to_map(const Legion::Mapping::MapperContext ctx, + const SelectMappingInput& input, + SelectMappingOutput& output) override; public: - virtual void configure_context(const MapperContext ctx, - const Task& task, - ContextConfigOutput& output); - void map_future_map_reduction(const MapperContext ctx, + void configure_context(const Legion::Mapping::MapperContext ctx, + const Legion::Task& task, + ContextConfigOutput& output) override; + void map_future_map_reduction(const Legion::Mapping::MapperContext ctx, const FutureMapReductionInput& input, - FutureMapReductionOutput& output); - virtual void select_tunable_value(const MapperContext ctx, - const Task& task, - const SelectTunableInput& input, - SelectTunableOutput& output); + FutureMapReductionOutput& output) override; + void select_tunable_value(const Legion::Mapping::MapperContext ctx, + const Legion::Task& task, + const SelectTunableInput& input, + SelectTunableOutput& output) override; protected: template - decltype(auto) dispatch(Legion::Processor::Kind kind, Functor functor) + decltype(auto) dispatch(Processor::Kind kind, Functor functor) { switch (kind) { - case Legion::Processor::LOC_PROC: return functor(local_cpus); - case Legion::Processor::TOC_PROC: return functor(local_gpus); - case Legion::Processor::OMP_PROC: return functor(local_omps); + case Processor::LOC_PROC: return 
functor(local_cpus); + case Processor::TOC_PROC: return functor(local_gpus); + case Processor::OMP_PROC: return functor(local_omps); default: LEGATE_ABORT; } assert(false); @@ -110,7 +113,7 @@ class CoreMapper : public Legion::Mapping::NullMapper { } public: - const AddressSpace local_node; + const Legion::AddressSpace local_node; const size_t total_nodes; const char* const mapper_name; LibraryContext context; @@ -138,7 +141,9 @@ class CoreMapper : public Legion::Mapping::NullMapper { std::map local_numa_domains; }; -CoreMapper::CoreMapper(MapperRuntime* rt, Machine m, const LibraryContext& c) +CoreMapper::CoreMapper(Legion::Mapping::MapperRuntime* rt, + Legion::Machine m, + const LibraryContext& c) : NullMapper(rt, m), local_node(get_local_node()), total_nodes(get_total_nodes(m)), @@ -164,9 +169,10 @@ CoreMapper::CoreMapper(MapperRuntime* rt, Machine m, const LibraryContext& c) has_socket_mem(false) { // Query to find all our local processors - Machine::ProcessorQuery local_procs(machine); + Legion::Machine::ProcessorQuery local_procs(machine); local_procs.local_address_space(); - for (Machine::ProcessorQuery::iterator it = local_procs.begin(); it != local_procs.end(); it++) { + for (Legion::Machine::ProcessorQuery::iterator it = local_procs.begin(); it != local_procs.end(); + it++) { switch (it->kind()) { case Processor::LOC_PROC: { local_cpus.push_back(*it); @@ -184,20 +190,20 @@ CoreMapper::CoreMapper(MapperRuntime* rt, Machine m, const LibraryContext& c) } } // Now do queries to find all our local memories - Machine::MemoryQuery local_sysmem(machine); + Legion::Machine::MemoryQuery local_sysmem(machine); local_sysmem.local_address_space(); local_sysmem.only_kind(Memory::SYSTEM_MEM); assert(local_sysmem.count() > 0); local_system_memory = local_sysmem.first(); if (!local_gpus.empty()) { - Machine::MemoryQuery local_zcmem(machine); + Legion::Machine::MemoryQuery local_zcmem(machine); local_zcmem.local_address_space(); local_zcmem.only_kind(Memory::Z_COPY_MEM); 
assert(local_zcmem.count() > 0); local_zerocopy_memory = local_zcmem.first(); } for (auto local_gpu : local_gpus) { - Machine::MemoryQuery local_framebuffer(machine); + Legion::Machine::MemoryQuery local_framebuffer(machine); local_framebuffer.local_address_space(); local_framebuffer.only_kind(Memory::GPU_FB_MEM); local_framebuffer.best_affinity_to(local_gpu); @@ -205,7 +211,7 @@ CoreMapper::CoreMapper(MapperRuntime* rt, Machine m, const LibraryContext& c) local_frame_buffers[local_gpu] = local_framebuffer.first(); } for (auto local_omp : local_omps) { - Machine::MemoryQuery local_numa(machine); + Legion::Machine::MemoryQuery local_numa(machine); local_numa.local_address_space(); local_numa.only_kind(Memory::SOCKET_MEM); local_numa.best_affinity_to(local_omp); @@ -218,39 +224,40 @@ CoreMapper::CoreMapper(MapperRuntime* rt, Machine m, const LibraryContext& c) } } -CoreMapper::~CoreMapper(void) { free(const_cast(mapper_name)); } +CoreMapper::~CoreMapper() { free(const_cast(mapper_name)); } -/*static*/ AddressSpace CoreMapper::get_local_node(void) +/*static*/ Legion::AddressSpace CoreMapper::get_local_node() { Processor p = Processor::get_executing_processor(); return p.address_space(); } -/*static*/ size_t CoreMapper::get_total_nodes(Machine m) +/*static*/ size_t CoreMapper::get_total_nodes(Legion::Machine m) { - Machine::ProcessorQuery query(m); + Legion::Machine::ProcessorQuery query(m); query.only_kind(Processor::LOC_PROC); - std::set spaces; - for (Machine::ProcessorQuery::iterator it = query.begin(); it != query.end(); it++) - spaces.insert(it->address_space()); + std::set spaces; + for (auto it = query.begin(); it != query.end(); it++) spaces.insert(it->address_space()); return spaces.size(); } -/*static*/ const char* CoreMapper::create_name(AddressSpace node) +/*static*/ const char* CoreMapper::create_name(Legion::AddressSpace node) { char buffer[128]; snprintf(buffer, 127, "Legate Mapper on Node %d", node); return strdup(buffer); } -const char* 
CoreMapper::get_mapper_name(void) const { return mapper_name; } +const char* CoreMapper::get_mapper_name() const { return mapper_name; } -Mapper::MapperSyncModel CoreMapper::get_mapper_sync_model(void) const +Legion::Mapping::Mapper::MapperSyncModel CoreMapper::get_mapper_sync_model() const { return SERIALIZED_REENTRANT_MAPPER_MODEL; } -void CoreMapper::select_task_options(const MapperContext ctx, const Task& task, TaskOptions& output) +void CoreMapper::select_task_options(const Legion::Mapping::MapperContext ctx, + const Legion::Task& task, + TaskOptions& output) { assert(context.valid_task_id(task.task_id)); if (task.tag == LEGATE_CPU_VARIANT) { @@ -266,8 +273,8 @@ void CoreMapper::select_task_options(const MapperContext ctx, const Task& task, } } -void CoreMapper::slice_task(const MapperContext ctx, - const Task& task, +void CoreMapper::slice_task(const Legion::Mapping::MapperContext ctx, + const Legion::Task& task, const SliceTaskInput& input, SliceTaskOutput& output) { @@ -291,8 +298,8 @@ void CoreMapper::slice_task(const MapperContext ctx, dispatch(task.target_proc.kind(), round_robin); } -void CoreMapper::map_task(const MapperContext ctx, - const Task& task, +void CoreMapper::map_task(const Legion::Mapping::MapperContext ctx, + const Legion::Task& task, const MapTaskInput& input, MapTaskOutput& output) { @@ -302,8 +309,8 @@ void CoreMapper::map_task(const MapperContext ctx, output.chosen_variant = task.tag; } -void CoreMapper::select_sharding_functor(const MapperContext ctx, - const Task& task, +void CoreMapper::select_sharding_functor(const Legion::Mapping::MapperContext ctx, + const Legion::Task& task, const SelectShardingFunctorInput& input, SelectShardingFunctorOutput& output) { @@ -314,29 +321,29 @@ void CoreMapper::select_sharding_functor(const MapperContext ctx, output.chosen_functor = context.get_sharding_id(LEGATE_CORE_TOPLEVEL_TASK_SHARD_ID); } -void CoreMapper::select_steal_targets(const MapperContext ctx, +void 
CoreMapper::select_steal_targets(const Legion::Mapping::MapperContext ctx, const SelectStealingInput& input, SelectStealingOutput& output) { // Do nothing } -void CoreMapper::select_tasks_to_map(const MapperContext ctx, +void CoreMapper::select_tasks_to_map(const Legion::Mapping::MapperContext ctx, const SelectMappingInput& input, SelectMappingOutput& output) { output.map_tasks.insert(input.ready_tasks.begin(), input.ready_tasks.end()); } -void CoreMapper::configure_context(const MapperContext ctx, - const Task& task, +void CoreMapper::configure_context(const Legion::Mapping::MapperContext ctx, + const Legion::Task& task, ContextConfigOutput& output) { // Use the defaults currently } template -void pack_tunable(const T value, Mapper::SelectTunableOutput& output) +void pack_tunable(const T value, Legion::Mapping::Mapper::SelectTunableOutput& output) { T* result = static_cast(malloc(sizeof(value))); *result = value; @@ -344,7 +351,7 @@ void pack_tunable(const T value, Mapper::SelectTunableOutput& output) output.size = sizeof(value); } -void CoreMapper::map_future_map_reduction(const MapperContext ctx, +void CoreMapper::map_future_map_reduction(const Legion::Mapping::MapperContext ctx, const FutureMapReductionInput& input, FutureMapReductionOutput& output) { @@ -368,8 +375,8 @@ void CoreMapper::map_future_map_reduction(const MapperContext ctx, for (auto& pair : local_numa_domains) output.destination_memories.push_back(pair.second); } -void CoreMapper::select_tunable_value(const MapperContext ctx, - const Task& task, +void CoreMapper::select_tunable_value(const Legion::Mapping::MapperContext ctx, + const Legion::Task& task, const SelectTunableInput& input, SelectTunableOutput& output) { @@ -468,7 +475,9 @@ void CoreMapper::select_tunable_value(const MapperContext ctx, LEGATE_ABORT; } -void register_legate_core_mapper(Machine machine, Runtime* runtime, const LibraryContext& context) +void register_legate_core_mapper(Legion::Machine machine, + Legion::Runtime* runtime, + 
const LibraryContext& context) { // Replace all the default mappers with our custom mapper for the Legate // top-level task and init task diff --git a/src/core/mapping/instance_manager.cc b/src/core/mapping/instance_manager.cc index f732b0b79..c4416cd83 100644 --- a/src/core/mapping/instance_manager.cc +++ b/src/core/mapping/instance_manager.cc @@ -20,9 +20,6 @@ namespace legate { namespace mapping { -using namespace Legion; -using namespace Legion::Mapping; - using RegionGroupP = std::shared_ptr; static Legion::Logger log_instmgr("instmgr"); @@ -37,9 +34,9 @@ RegionGroup::RegionGroup(std::set&& rs, const Domain bound) { } -std::vector RegionGroup::get_regions() const +std::vector RegionGroup::get_regions() const { - std::vector result; + std::vector result; result.insert(result.end(), regions.begin(), regions.end()); return std::move(result); } @@ -106,7 +103,7 @@ static inline bool too_big(size_t union_volume, struct construct_overlapping_region_group_fn { template RegionGroupP operator()(const InstanceSet::Region& region, - const InstanceSet::Domain& domain, + const Domain& domain, const std::map& instances) { auto bound = domain.bounds(); @@ -159,7 +156,7 @@ struct construct_overlapping_region_group_fn { bound_vol = union_vol; } - return std::make_shared(std::move(regions), InstanceSet::Domain(bound)); + return std::make_shared(std::move(regions), Domain(bound)); } }; @@ -241,7 +238,7 @@ std::set InstanceSet::record_instance(RegionGroupP group, return std::move(replaced); } -bool InstanceSet::erase(PhysicalInstance inst) +bool InstanceSet::erase(Instance inst) { std::set filtered_groups; #ifdef DEBUG_LEGATE @@ -333,7 +330,7 @@ void ReductionInstanceSet::record_instance(ReductionOpID& redop, } } -bool ReductionInstanceSet::erase(PhysicalInstance inst) +bool ReductionInstanceSet::erase(Instance inst) { for (auto it = instances_.begin(); it != instances_.end(); /*nothing*/) { if (it->second.instance == inst) { @@ -390,7 +387,7 @@ std::set 
InstanceManager::record_instance( return instance_sets_[key].record_instance(group, instance, policy); } -void InstanceManager::erase(PhysicalInstance inst) +void InstanceManager::erase(Instance inst) { const auto mem = inst.get_location(); const auto tid = inst.get_tree_id(); @@ -408,9 +405,9 @@ void InstanceManager::erase(PhysicalInstance inst) } } -std::map InstanceManager::aggregate_instance_sizes() const +std::map InstanceManager::aggregate_instance_sizes() const { - std::map result; + std::map result; for (auto& pair : instance_sets_) { auto& memory = pair.first.memory; if (result.find(memory) == result.end()) result[memory] = 0; @@ -459,7 +456,7 @@ void ReductionInstanceManager::record_instance(ReductionOpID& redop, } } -void ReductionInstanceManager::erase(PhysicalInstance inst) +void ReductionInstanceManager::erase(Instance inst) { const auto mem = inst.get_location(); const auto tid = inst.get_tree_id(); diff --git a/src/core/mapping/instance_manager.h b/src/core/mapping/instance_manager.h index c42df3119..35a49befb 100644 --- a/src/core/mapping/instance_manager.h +++ b/src/core/mapping/instance_manager.h @@ -30,7 +30,6 @@ namespace mapping { struct RegionGroup { public: using Region = Legion::LogicalRegion; - using Domain = Legion::Domain; public: RegionGroup(const std::set& regions, const Domain bounding_box); @@ -56,7 +55,6 @@ struct InstanceSet { public: using Region = Legion::LogicalRegion; using Instance = Legion::Mapping::PhysicalInstance; - using Domain = Legion::Domain; using RegionGroupP = std::shared_ptr; public: @@ -99,7 +97,6 @@ class ReductionInstanceSet { public: using Region = Legion::LogicalRegion; using Instance = Legion::Mapping::PhysicalInstance; - using Domain = Legion::Domain; using ReductionOpID = Legion::ReductionOpID; public: @@ -141,9 +138,7 @@ class BaseInstanceManager { using Region = Legion::LogicalRegion; using RegionTreeID = Legion::RegionTreeID; using Instance = Legion::Mapping::PhysicalInstance; - using Domain = 
Legion::Domain; using FieldID = Legion::FieldID; - using Memory = Legion::Memory; public: struct FieldMemInfo { @@ -206,7 +201,7 @@ class InstanceManager : public BaseInstanceManager { static InstanceManager* get_instance_manager(); public: - std::map aggregate_instance_sizes() const; + std::map aggregate_instance_sizes() const; private: std::map instance_sets_{}; diff --git a/src/core/mapping/mapping.cc b/src/core/mapping/mapping.cc index 5d1aa971c..960cbfbc0 100644 --- a/src/core/mapping/mapping.cc +++ b/src/core/mapping/mapping.cc @@ -20,8 +20,6 @@ #include "core/mapping/mapping.h" -using namespace Legion; - namespace legate { namespace mapping { @@ -44,7 +42,7 @@ bool DimOrdering::operator==(const DimOrdering& other) const } void DimOrdering::populate_dimension_ordering(const Store& store, - std::vector& ordering) const + std::vector& ordering) const { // TODO: We need to implement the relative dimension ordering assert(!relative); @@ -52,17 +50,17 @@ void DimOrdering::populate_dimension_ordering(const Store& store, case Kind::C: { auto dim = store.region_field().dim(); for (int32_t idx = dim - 1; idx >= 0; --idx) - ordering.push_back(static_cast(DIM_X + idx)); + ordering.push_back(static_cast(DIM_X + idx)); break; } case Kind::FORTRAN: { auto dim = store.region_field().dim(); for (int32_t idx = 0; idx < dim; ++idx) - ordering.push_back(static_cast(DIM_X + idx)); + ordering.push_back(static_cast(DIM_X + idx)); break; } case Kind::CUSTOM: { - for (auto idx : dims) ordering.push_back(static_cast(DIM_X + idx)); + for (auto idx : dims) ordering.push_back(static_cast(DIM_X + idx)); break; } } @@ -92,15 +90,16 @@ bool InstanceMappingPolicy::operator!=(const InstanceMappingPolicy& other) const void InstanceMappingPolicy::populate_layout_constraints( const Store& store, Legion::LayoutConstraintSet& layout_constraints) const { - std::vector dimension_ordering{}; + std::vector dimension_ordering{}; if (layout == InstLayout::AOS) dimension_ordering.push_back(DIM_F); 
ordering.populate_dimension_ordering(store, dimension_ordering); if (layout == InstLayout::SOA) dimension_ordering.push_back(DIM_F); - layout_constraints.add_constraint(OrderingConstraint(dimension_ordering, false /*contiguous*/)); + layout_constraints.add_constraint( + Legion::OrderingConstraint(dimension_ordering, false /*contiguous*/)); - layout_constraints.add_constraint(MemoryConstraint(get_memory_kind(target))); + layout_constraints.add_constraint(Legion::MemoryConstraint(get_memory_kind(target))); } /*static*/ InstanceMappingPolicy InstanceMappingPolicy::default_policy(StoreTarget target, @@ -146,6 +145,8 @@ uint32_t StoreMapping::requirement_index() const } return result; #else + static constexpr uint32_t invalid = -1U; + if (stores.empty()) return invalid; return stores.front().requirement_index(); #endif } @@ -160,9 +161,9 @@ std::set StoreMapping::requirement_indices() const return std::move(indices); } -std::set StoreMapping::requirements() const +std::set StoreMapping::requirements() const { - std::set reqs; + std::set reqs; for (auto& store : stores) { if (store.is_future()) continue; auto* req = store.region_field().get_requirement(); @@ -177,9 +178,9 @@ void StoreMapping::populate_layout_constraints( { policy.populate_layout_constraints(stores.front(), layout_constraints); - std::vector fields{}; + std::vector fields{}; if (stores.size() > 1) { - std::set field_set{}; + std::set field_set{}; for (auto& store : stores) { auto field_id = store.region_field().field_id(); if (field_set.find(field_id) == field_set.end()) { @@ -189,7 +190,7 @@ void StoreMapping::populate_layout_constraints( } } else fields.push_back(stores.front().region_field().field_id()); - layout_constraints.add_constraint(FieldConstraint(fields, true /*contiguous*/)); + layout_constraints.add_constraint(Legion::FieldConstraint(fields, true /*contiguous*/)); } /*static*/ StoreMapping StoreMapping::default_mapping(const Store& store, diff --git a/src/core/mapping/mapping.h 
b/src/core/mapping/mapping.h index 2d56d1a67..bed1247ee 100644 --- a/src/core/mapping/mapping.h +++ b/src/core/mapping/mapping.h @@ -17,39 +17,116 @@ #pragma once #include "core/mapping/operation.h" +#include "core/utilities/typedefs.h" + +/** @defgroup mapping Mapping API + */ + +/** + * @file + * @brief Legate Mapping API + */ namespace legate { namespace mapping { +/** + * @ingroup mapping + * @brief An enum class for task targets + */ enum class TaskTarget : int32_t { + /** + * @brief Indicates the task be mapped to a CPU + */ CPU = 1, + /** + * @brief Indicates the task be mapped to a GPU + */ GPU = 2, + /** + * @brief Indicates the task be mapped to an OpenMP processor + */ OMP = 3, }; +/** + * @ingroup mapping + * @brief An enum class for store targets + */ enum class StoreTarget : int32_t { - SYSMEM = 1, - FBMEM = 2, - ZCMEM = 3, + /** + * @brief Indicates the store be mapped to the system memory (host memory) + */ + SYSMEM = 1, + /** + * @brief Indicates the store be mapped to the GPU framebuffer + */ + FBMEM = 2, + /** + * @brief Indicates the store be mapped to the pinned memory for zero-copy GPU accesses + */ + ZCMEM = 3, + /** + * @brief Indicates the store be mapped to the host memory closest to the target CPU + */ SOCKETMEM = 4, }; +/** + * @ingroup mapping + * @brief An enum class for instance allocation policies + */ enum class AllocPolicy : int32_t { - MAY_ALLOC = 1, + /** + * @brief Indicates the store can reuse an existing instance + */ + MAY_ALLOC = 1, + /** + * @brief Indicates the store must be mapped to a fresh instance + */ MUST_ALLOC = 2, }; +/** + * @ingroup mapping + * @brief An enum class for instant layouts + */ enum class InstLayout : int32_t { + /** + * @brief Indicates the store must be mapped to an SOA instance + */ SOA = 1, + /** + * @brief Indicates the store must be mapped to an AOS instance. 
No different than `SOA` in a + * store mapping for a single store + */ AOS = 2, }; +/** + * @ingroup mapping + * @brief A descriptor for dimension ordering + */ struct DimOrdering { public: + /** + * @brief An enum class for kinds of dimension ordering + */ enum class Kind : int32_t { - C = 1, + /** + * @brief Indicates the instance have C layout (i.e., the last dimension is the leading + * dimension in the instance) + */ + C = 1, + /** + * @brief Indicates the instance have Fortran layout (i.e., the first dimension is the leading + * dimension instance) + */ FORTRAN = 2, - CUSTOM = 3, + /** + * @brief Indicates the order of dimensions of the instance is manually specified + */ + CUSTOM = 3, }; public: @@ -71,26 +148,67 @@ struct DimOrdering { std::vector& ordering) const; public: + /** + * @brief Sets the dimension ordering to C + */ void c_order(); + /** + * @brief Sets the dimension ordering to Fortran + */ void fortran_order(); + /** + * @brief Sets a custom dimension ordering + * + * @param dims A vector that stores the order of dimensions. + */ void custom_order(std::vector&& dims); public: + /** + * @brief Dimension ordering type + */ Kind kind{Kind::C}; // When relative is true, 'dims' specifies the order of dimensions // for the store's local coordinate space, which will be mapped // back to the root store's original coordinate space. + /** + * @brief If true, the dimension ordering specifies the order of dimensions + * for the store's current domain, which will be transformed back to the root + * store's domain. + */ bool relative{false}; - // Used only when the kind is CUSTOM + /** + * @brief Dimension list. Used only when the `kind` is `CUSTOM`. 
+ */ std::vector dims{}; }; +/** + * @ingroup mapping + * @brief A descriptor for instance mapping policy + */ struct InstanceMappingPolicy { public: + /** + * @brief Target memory type for the instance + */ StoreTarget target{StoreTarget::SYSMEM}; + /** + * @brief Allocation policy + */ AllocPolicy allocation{AllocPolicy::MAY_ALLOC}; + /** + * @brief Instance layout for the instance + */ InstLayout layout{InstLayout::SOA}; + /** + * @brief Dimension ordering for the instance + */ DimOrdering ordering{}; + /** + * @brief If true, the instance must be tight to the store(s); i.e., the instance + * must not have any extra elements not included in the store(s). + */ bool exact{false}; public: @@ -108,7 +226,8 @@ struct InstanceMappingPolicy { bool operator==(const InstanceMappingPolicy&) const; bool operator!=(const InstanceMappingPolicy&) const; - public: + private: + friend class StoreMapping; void populate_layout_constraints(const Store& store, Legion::LayoutConstraintSet& layout_constraints) const; @@ -116,9 +235,19 @@ struct InstanceMappingPolicy { static InstanceMappingPolicy default_policy(StoreTarget target, bool exact = false); }; +/** + * @ingroup mapping + * @brief A mapping policy for stores + */ struct StoreMapping { public: + /** + * @brief Stores to which the `policy` should be applied + */ std::vector stores{}; + /** + * @brief Instance mapping policy + */ InstanceMappingPolicy policy; public: @@ -138,23 +267,124 @@ struct StoreMapping { const Store& store() const; public: + /** + * @brief Returns a region requirement index for the stores. + * + * Returns an undefined value if the store mapping has more than one store and the stores are + * mapped to different region requirements. + * + * @return Region requirement index + */ uint32_t requirement_index() const; + /** + * @brief Returns a set of region requirement indices for the stores. 
+ * + * @return A set of region requirement indices + */ std::set requirement_indices() const; + /** + * @brief Returns the stores' region requirements + * + * @return A set of region requirements + */ std::set requirements() const; - public: + private: + friend class BaseMapper; void populate_layout_constraints(Legion::LayoutConstraintSet& layout_constraints) const; public: + /** + * @brief Creates a `StoreMapping` object following the default mapping poicy + * + * @param store Target store for the mapping policy + * @param target Target memory type for the store + * @param exact Indicates whether the policy should request an exact instance + * + * @return A `StoreMapping` object + */ static StoreMapping default_mapping(const Store& store, StoreTarget target, bool exact = false); }; -struct LegateMapper { - virtual bool is_pure() const = 0; - virtual TaskTarget task_target(const Task& task, const std::vector& options) = 0; +/** + * @ingroup mapping + * @brief An abstract class that defines machine query APIs + */ +class MachineQueryInterface { + public: + virtual ~MachineQueryInterface() {} + /** + * @brief Returns local CPUs + * + * @return A vector of processors + */ + virtual const std::vector& cpus() const = 0; + /** + * @brief Returns local GPUs + * + * @return A vector of processors + */ + virtual const std::vector& gpus() const = 0; + /** + * @brief Returns local OpenMP processors + * + * @return A vector of processors + */ + virtual const std::vector& omps() const = 0; + /** + * @brief Returns the total number of nodes + * + * @return Total number of nodes + */ + virtual uint32_t total_nodes() const = 0; +}; + +/** + * @ingroup mapping + * @brief An abstract class that defines Legate mapping APIs + * + * The APIs give Legate libraries high-level control on task and store mappings + */ +class LegateMapper { + public: + virtual ~LegateMapper() {} + /** + * @brief Sets a machine query interface. 
This call gives the mapper a chance + * to cache the machine query interface. + * + * @param machine Machine query interface + */ + virtual void set_machine(const MachineQueryInterface* machine) = 0; + /** + * @brief Picks the target processor type for the task + * + * @param task Task to map + * @param options Processor types for which the task has variants + * + * @return A target processor type + */ + virtual TaskTarget task_target(const Task& task, const std::vector& options) = 0; + /** + * @brief Chooses mapping policies for the task's stores. + * + * Store mappings can be underspecified; any store of the task that doesn't have a mapping policy + * will fall back to the default one. + * + * @param task Task to map + * @param options Types of memories to which the stores can be mapped + * + * @return A vector of store mappings + */ virtual std::vector store_mappings(const Task& task, const std::vector& options) = 0; - virtual Scalar tunable_value(TunableID tunable_id) = 0; + /** + * @brief Returns a tunable value + * + * @param tunable_id a tunable value id + * + * @return A tunable value in a `Scalar` object + */ + virtual Scalar tunable_value(TunableID tunable_id) = 0; }; } // namespace mapping diff --git a/src/core/mapping/operation.cc b/src/core/mapping/operation.cc index 03f34d5b2..fb6826ac6 100644 --- a/src/core/mapping/operation.cc +++ b/src/core/mapping/operation.cc @@ -20,13 +20,10 @@ namespace legate { namespace mapping { -using LegionTask = Legion::Task; -using LegionCopy = Legion::Copy; - -using namespace Legion; -using namespace Legion::Mapping; - -RegionField::RegionField(const RegionRequirement* req, int32_t dim, uint32_t idx, FieldID fid) +RegionField::RegionField(const Legion::RegionRequirement* req, + int32_t dim, + uint32_t idx, + Legion::FieldID fid) : req_(req), dim_(dim), idx_(idx), fid_(fid) { } @@ -38,12 +35,13 @@ bool RegionField::can_colocate_with(const RegionField& other) const return my_req->region.get_tree_id() == 
other_req->region.get_tree_id(); } -Domain RegionField::domain(MapperRuntime* runtime, const MapperContext context) const +Domain RegionField::domain(Legion::Mapping::MapperRuntime* runtime, + const Legion::Mapping::MapperContext context) const { return runtime->get_index_space_domain(context, get_index_space()); } -IndexSpace RegionField::get_index_space() const { return req_->region.get_index_space(); } +Legion::IndexSpace RegionField::get_index_space() const { return req_->region.get_index_space(); } FutureWrapper::FutureWrapper(uint32_t idx, const Domain& domain) : idx_(idx), domain_(domain) {} @@ -139,10 +137,10 @@ Domain Store::domain() const return result; } -Task::Task(const LegionTask* task, +Task::Task(const Legion::Task* task, const LibraryContext& library, - MapperRuntime* runtime, - const MapperContext context) + Legion::Mapping::MapperRuntime* runtime, + const Legion::Mapping::MapperContext context) : task_(task), library_(library) { TaskDeserializer dez(task, runtime, context); @@ -154,7 +152,9 @@ Task::Task(const LegionTask* task, int64_t Task::task_id() const { return library_.get_local_task_id(task_->task_id); } -Copy::Copy(const LegionCopy* copy, MapperRuntime* runtime, const MapperContext context) +Copy::Copy(const Legion::Copy* copy, + Legion::Mapping::MapperRuntime* runtime, + const Legion::Mapping::MapperContext context) : copy_(copy) { CopyDeserializer dez(copy->mapper_data, diff --git a/src/core/mapping/operation.h b/src/core/mapping/operation.h index 0cc5dc267..47ef686e3 100644 --- a/src/core/mapping/operation.h +++ b/src/core/mapping/operation.h @@ -23,6 +23,11 @@ #include "core/data/transform.h" #include "core/runtime/context.h" +/** + * @file + * @brief Class definitions for operations and stores used in mapping + */ + namespace legate { namespace mapping { @@ -43,12 +48,12 @@ class RegionField { public: template - Legion::Rect shape(Legion::Mapping::MapperRuntime* runtime, - const Legion::Mapping::MapperContext context) const; + Rect 
shape(Legion::Mapping::MapperRuntime* runtime, + const Legion::Mapping::MapperContext context) const; public: - Legion::Domain domain(Legion::Mapping::MapperRuntime* runtime, - const Legion::Mapping::MapperContext context) const; + Domain domain(Legion::Mapping::MapperRuntime* runtime, + const Legion::Mapping::MapperContext context) const; public: bool operator==(const RegionField& other) const; @@ -76,7 +81,7 @@ class RegionField { class FutureWrapper { public: FutureWrapper() {} - FutureWrapper(uint32_t idx, const Legion::Domain& domain); + FutureWrapper(uint32_t idx, const Domain& domain); public: FutureWrapper(const FutureWrapper& other) = default; @@ -88,14 +93,19 @@ class FutureWrapper { public: template - Legion::Rect shape() const; - Legion::Domain domain() const; + Rect shape() const; + Domain domain() const; private: uint32_t idx_{-1U}; - Legion::Domain domain_{}; + Domain domain_{}; }; +/** + * @ingroup mapping + * @brief A metadata class that mirrors the structure of legate::Store but contains + * only the data relevant to mapping + */ class Store { public: Store() {} @@ -125,15 +135,51 @@ class Store { Store& operator=(Store&& other) = default; public: + /** + * @brief Indicates whether the store is backed by a future + * + * @return true The store is backed by a future + * @return false The store is backed by a region field + */ bool is_future() const { return is_future_; } + /** + * @brief Indicates whether the store is unbound + * + * @return true The store is unbound + * @return false The store is a normal store + */ bool unbound() const { return is_output_store_; } + /** + * @brief Returns the store's dimension + * + * @return Store's dimension + */ int32_t dim() const { return dim_; } public: + /** + * @brief Indicates whether the store is a reduction store + * + * @return true The store is a reduction store + * @return false The store is either an input or output store + */ bool is_reduction() const { return redop_id_ > 0; } - 
Legion::ReductionOpID redop() const { return redop_id_; } + /** + * @brief Returns the reduction operator id for the store + * + * @return Reduction oeprator id + */ + int32_t redop() const { return redop_id_; } public: + /** + * @brief Indicates whether the store can colocate in an instance with a given store + * + * @param other Store against which the colocation is checked + * + * @return true The store can colocate with the input + * @return false The store cannot colocate with the input + */ bool can_colocate_with(const Store& other) const; const RegionField& region_field() const; const FutureWrapper& future() const; @@ -144,11 +190,19 @@ class Store { uint32_t future_index() const; public: + /** + * @brief Returns the store's domain + * + * @return Store's domain + */ template - Legion::Rect shape() const; - - public: - Legion::Domain domain() const; + Rect shape() const; + /** + * @brief Returns the store's domain in a dimension-erased domain type + * + * @return Store's domain in a dimension-erased domain type + */ + Domain domain() const; private: bool is_future_{false}; @@ -169,6 +223,10 @@ class Store { Legion::Mapping::MapperContext context_{nullptr}; }; +/** + * @ingroup mapping + * @brief A metadata class for tasks + */ class Task { public: Task(const Legion::Task* task, @@ -177,16 +235,48 @@ class Task { const Legion::Mapping::MapperContext context); public: + /** + * @brief Returns the task id + * + * @return Task id + */ int64_t task_id() const; public: + /** + * @brief Returns metadata for the task's input stores + * + * @return Vector of store metadata objects + */ const std::vector& inputs() const { return inputs_; } + /** + * @brief Returns metadata for the task's output stores + * + * @return Vector of store metadata objects + */ const std::vector& outputs() const { return outputs_; } + /** + * @brief Returns metadata for the task's reduction stores + * + * @return Vector of store metadata objects + */ const std::vector& reductions() const { 
return reductions_; } + /** + * @brief Returns the vector of the task's by-value arguments. Unlike `mapping::Store` + * objects that have no access to data in the stores, the returned `Scalar` objects + * contain valid arguments to the task + * + * @return Vector of `Scalar` objects + */ const std::vector& scalars() const { return scalars_; } public: - Legion::DomainPoint point() const { return task_->index_point; } + /** + * @brief Returns the point of the task + * + * @return The point of the task + */ + DomainPoint point() const { return task_->index_point; } private: const LibraryContext& library_; @@ -210,7 +300,7 @@ class Copy { const std::vector& output_indirections() const { return output_indirections_; } public: - Legion::DomainPoint point() const { return copy_->index_point; } + DomainPoint point() const { return copy_->index_point; } private: const Legion::Copy* copy_; diff --git a/src/core/runtime/context.cc b/src/core/runtime/context.cc index c482ae3aa..f696c101b 100644 --- a/src/core/runtime/context.cc +++ b/src/core/runtime/context.cc @@ -32,24 +32,22 @@ namespace legate { -LibraryContext::LibraryContext(Legion::Runtime* runtime, - const std::string& library_name, - const ResourceConfig& config) - : runtime_(runtime), library_name_(library_name) +LibraryContext::LibraryContext(const std::string& library_name, const ResourceConfig& config) + : runtime_(Legion::Runtime::get_runtime()), library_name_(library_name) { task_scope_ = ResourceScope( - runtime->generate_library_task_ids(library_name.c_str(), config.max_tasks), config.max_tasks); + runtime_->generate_library_task_ids(library_name.c_str(), config.max_tasks), config.max_tasks); mapper_scope_ = - ResourceScope(runtime->generate_library_mapper_ids(library_name.c_str(), config.max_mappers), + ResourceScope(runtime_->generate_library_mapper_ids(library_name.c_str(), config.max_mappers), config.max_mappers); redop_scope_ = ResourceScope( - runtime->generate_library_reduction_ids(library_name.c_str(), 
config.max_reduction_ops), + runtime_->generate_library_reduction_ids(library_name.c_str(), config.max_reduction_ops), config.max_reduction_ops); proj_scope_ = ResourceScope( - runtime->generate_library_projection_ids(library_name.c_str(), config.max_projections), + runtime_->generate_library_projection_ids(library_name.c_str(), config.max_projections), config.max_projections); shard_scope_ = ResourceScope( - runtime->generate_library_sharding_ids(library_name.c_str(), config.max_shardings), + runtime_->generate_library_sharding_ids(library_name.c_str(), config.max_shardings), config.max_shardings); } @@ -148,13 +146,15 @@ bool LibraryContext::valid_sharding_id(Legion::ShardingID shard_id) const return shard_scope_.in_scope(shard_id); } -void LibraryContext::register_mapper(mapping::BaseMapper* mapper, int64_t local_mapper_id) const +void LibraryContext::register_mapper(std::unique_ptr mapper, + int64_t local_mapper_id) const { - auto mapper_id = get_mapper_id(local_mapper_id); + auto base_mapper = new legate::mapping::BaseMapper( + std::move(mapper), runtime_, Realm::Machine::get_machine(), *this); + Legion::Mapping::Mapper* legion_mapper = base_mapper; if (Core::log_mapping_decisions) - runtime_->add_mapper(mapper_id, new Legion::Mapping::LoggingWrapper(mapper, &mapper->logger)); - else - runtime_->add_mapper(mapper_id, mapper); + legion_mapper = new Legion::Mapping::LoggingWrapper(base_mapper, &base_mapper->logger); + runtime_->add_mapper(get_mapper_id(local_mapper_id), legion_mapper); } TaskContext::TaskContext(const Legion::Task* task, @@ -203,7 +203,7 @@ TaskContext::TaskContext(const Legion::Task* task, #ifdef LEGATE_USE_CUDA // If the task is running on a GPU and there is at least one scalar store for reduction, // we need to wait for all the host-to-device copies for initialization to finish - if (Legion::Processor::get_executing_processor().kind() == Legion::Processor::Kind::TOC_PROC) + if (Processor::get_executing_processor().kind() == 
Processor::Kind::TOC_PROC) for (auto& reduction : reductions_) if (reduction.is_future()) { CHECK_CUDA(cudaDeviceSynchronize()); @@ -214,9 +214,9 @@ TaskContext::TaskContext(const Legion::Task* task, bool TaskContext::is_single_task() const { return !task_->is_index_space; } -Legion::DomainPoint TaskContext::get_task_index() const { return task_->index_point; } +DomainPoint TaskContext::get_task_index() const { return task_->index_point; } -Legion::Domain TaskContext::get_launch_domain() const { return task_->index_domain; } +Domain TaskContext::get_launch_domain() const { return task_->index_domain; } void TaskContext::make_all_unbound_stores_empty() { diff --git a/src/core/runtime/context.h b/src/core/runtime/context.h index b387b4979..dac640a3f 100644 --- a/src/core/runtime/context.h +++ b/src/core/runtime/context.h @@ -16,25 +16,48 @@ #pragma once +#include + #include "legion.h" +// Must be included after legion.h +#include "legate_defines.h" #include "core/comm/communicator.h" #include "core/task/return.h" +#include "core/utilities/typedefs.h" + +/** + * @file + * @brief Class definitions for legate::LibraryContext and legate::TaskContext + */ namespace legate { namespace mapping { -class BaseMapper; +class LegateMapper; } // namespace mapping class Store; class Scalar; +/** + * @ingroup runtime + * @brief POD for library configuration. 
+ */ struct ResourceConfig { + /** + * @brief Maximum number of tasks that the library can register + */ int64_t max_tasks{1000000}; + /** + * @brief Maximum number of mappers that the library can register + */ int64_t max_mappers{1}; + /** + * @brief Maximum number of custom reduction operators that the library can register + */ int64_t max_reduction_ops{0}; int64_t max_projections{0}; int64_t max_shardings{0}; @@ -68,16 +91,35 @@ class ResourceScope { int64_t max_{-1}; }; +/** + * @ingroup runtime + * @brief A library context that provides APIs for registering components + */ class LibraryContext { public: - LibraryContext(Legion::Runtime* runtime, - const std::string& library_name, - const ResourceConfig& config); + /** + * @brief Creates a library context from a library name and a configuration. + * + * A library is registered to the runtime only upon the first construction + * and the `config` object is referred to only when the registration happens. + * All the following constructions of `LibraryContext` only retrieve the + * metadata from the runtime without registration and ignore the `config`. + * + * @param library_name Library name + * @param config Resource configuration for the library. If the library is already + * registered, the value will be ignored. + */ + LibraryContext(const std::string& library_name, const ResourceConfig& config); public: LibraryContext(const LibraryContext&) = default; public: + /** + * @brief Returns the name of the library + * + * @return Library name + */ const std::string& get_library_name() const; public: @@ -102,7 +144,68 @@ class LibraryContext { bool valid_sharding_id(Legion::ShardingID shard_id) const; public: - void register_mapper(mapping::BaseMapper* mapper, int64_t local_mapper_id = 0) const; + /** + * @brief Registers a library specific reduction operator. + * + * The type parameter `REDOP` points to a class that implements a reduction operator. 
+ * Each reduction operator class has the following structure: + * + * @code{.cpp} + * struct RedOp { + * using LHS = ...; // Type of the LHS values + * using RHS = ...; // Type of the RHS values + * + * static const RHS identity = ...; // Identity of the reduction operator + * static const int32_t REDOP_ID = ... // Reduction operator id + * + * template + * __CUDA_HD__ inline static void apply(LHS& lhs, RHS rhs) + * { + * ... + * } + * template + * __CUDA_HD__ inline static void fold(RHS& rhs1, RHS rhs2) + * { + * ... + * } + * }; + * @endcode + * + * Semantically, Legate performs reductions of values `V0`, ..., `Vn` to element `E` in the + * following way: + * + * @code{.cpp} + * RHS T = RedOp::identity; + * RedOp::fold(T, V0) + * ... + * RedOp::fold(T, Vn) + * RedOp::apply(E, T) + * @endcode + * I.e., Legate gathers all reduction contributions using `fold` and applies the accumulator + * to the element using `apply`. + * + * Oftentimes, the LHS and RHS of a reduction operator are the same type and `fold` and `apply` + * perform the same computation, but that's not mandatory. For example, one may implement + * a reduction operator for subtraction, where the `fold` would sum up all RHS values whereas + * the `apply` would subtract the aggregate value from the LHS. + * + * The reduction operator id (`REDOP_ID`) can be local to the library but should be unique + * for each opeartor within the library. + * + * Finally, the contract for `apply` and `fold` is that they must update the + * reference atomically when the `EXCLUSIVE` is `false`. + */ + template + void register_reduction_operator(); + /** + * @brief Registers a library specific mapper. Transfers the ownership of the mapper to + * the runtime. + * + * @param mapper Mapper object + * @param local_mapper_id Id for the mapper. Used only when there is more than one mapper. 
+ */ + void register_mapper(std::unique_ptr mapper, + int64_t local_mapper_id = 0) const; private: Legion::Runtime* runtime_; @@ -114,8 +217,10 @@ class LibraryContext { ResourceScope shard_scope_; }; -// A thin context layer on top of the Legion runtime, primarily designed to hide verbosity -// of the Legion API. +/** + * @ingroup task + * @brief A task context that contains task arguments and communicators + */ class TaskContext { public: TaskContext(const Legion::Task* task, @@ -124,19 +229,69 @@ class TaskContext { Legion::Runtime* runtime); public: + /** + * @brief Returns input stores of the task + * + * @return Vector of input stores + */ std::vector& inputs() { return inputs_; } + /** + * @brief Returns output stores of the task + * + * @return Vector of output stores + */ std::vector& outputs() { return outputs_; } + /** + * @brief Returns reduction stores of the task + * + * @return Vector of reduction stores + */ std::vector& reductions() { return reductions_; } + /** + * @brief Returns by-value arguments of the task + * + * @return Vector of scalar objects + */ std::vector& scalars() { return scalars_; } + /** + * @brief Returns communicators of the task + * + * @return Vector of communicator objects + */ std::vector& communicators() { return comms_; } public: + /** + * @brief Indicates whether the task is parallelized + * + * @return true The task is a single task + * @return false The task is one in a set of multiple parallel tasks + */ bool is_single_task() const; + /** + * @brief Indicates whether the task is allowed to raise an exception + * + * @return true The task can raise an exception + * @return false The task must not raise an exception + */ bool can_raise_exception() const { return can_raise_exception_; } - Legion::DomainPoint get_task_index() const; - Legion::Domain get_launch_domain() const; + /** + * @brief Returns the point of the task. A 0D point will be returned for a single task. 
+ * + * @return The point of the task + */ + DomainPoint get_task_index() const; + /** + * @brief Returns the task group's launch domain. A single task returns an empty domain + * + * @return The task group's launch domain + */ + Domain get_launch_domain() const; public: + /** + * @brief Makes all of unbound output stores of this task empty + */ void make_all_unbound_stores_empty(); ReturnValues pack_return_values() const; ReturnValues pack_return_values_with_exception(int32_t index, @@ -159,3 +314,5 @@ class TaskContext { }; } // namespace legate + +#include "core/runtime/context.inl" diff --git a/src/core/runtime/context.inl b/src/core/runtime/context.inl new file mode 100644 index 000000000..e847b5bf1 --- /dev/null +++ b/src/core/runtime/context.inl @@ -0,0 +1,76 @@ +/* Copyright 2023 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ + +#pragma once + +#include "core/runtime/context.h" + +namespace legate { + +#ifndef REALM_COMPILER_IS_NVCC + +#ifdef LEGATE_USE_CUDA +extern Legion::Logger log_legate; +#endif + +template +void LibraryContext::register_reduction_operator() +{ +#ifdef LEGATE_USE_CUDA + log_legate.error("Reduction operators must be registered in a .cu file when CUDA is enabled"); + LEGATE_ABORT; +#endif + Legion::Runtime::register_reduction_op(get_reduction_op_id(REDOP::REDOP_ID)); +} + +#else // ifndef REALM_COMPILER_IS_NVCC + +namespace detail { + +template +class CUDAReductionOpWrapper : public T { + public: + static const bool has_cuda_reductions = true; + + template + __device__ static void apply_cuda(typename T::LHS& lhs, typename T::RHS rhs) + { + T::template apply(lhs, rhs); + } + + template + __device__ static void fold_cuda(typename T::LHS& lhs, typename T::RHS rhs) + { + T::template fold(lhs, rhs); + } +}; + +} // namespace detail + +template +void LibraryContext::register_reduction_operator() +{ + Legion::Runtime::register_reduction_op( + get_reduction_op_id(REDOP::REDOP_ID), + Realm::ReductionOpUntyped::create_reduction_op>(), + nullptr, + nullptr, + false); +} + +#endif // ifndef REALM_COMPILER_IS_NVCC + +} // namespace legate diff --git a/src/core/runtime/projection.cc b/src/core/runtime/projection.cc index 9392cd051..c45794d68 100644 --- a/src/core/runtime/projection.cc +++ b/src/core/runtime/projection.cc @@ -22,36 +22,33 @@ #include "core/runtime/projection.h" #include "core/utilities/dispatch.h" +#include "core/utilities/typedefs.h" #include "legate_defines.h" -using namespace Legion; - namespace legate { -extern Logger log_legate; - // This special functor overrides the default projection implementation because it needs // to know the the target color space for delinearization. Also note that this functor's // project_point passes through input points, as we already know they are always 1D points // and the output will be linearized back to integers. 
class DelinearizationFunctor : public LegateProjectionFunctor { public: - DelinearizationFunctor(Runtime* runtime); + DelinearizationFunctor(Legion::Runtime* runtime); public: virtual Legion::LogicalRegion project(Legion::LogicalPartition upper_bound, - const Legion::DomainPoint& point, - const Legion::Domain& launch_domain) override; + const DomainPoint& point, + const Domain& launch_domain) override; public: - virtual Legion::DomainPoint project_point(const Legion::DomainPoint& point, - const Legion::Domain& launch_domain) const override; + virtual DomainPoint project_point(const DomainPoint& point, + const Domain& launch_domain) const override; }; template class AffineFunctor : public LegateProjectionFunctor { public: - AffineFunctor(Runtime* runtime, int32_t* dims, int32_t* weights, int32_t* offsets); + AffineFunctor(Legion::Runtime* runtime, int32_t* dims, int32_t* weights, int32_t* offsets); public: DomainPoint project_point(const DomainPoint& point, const Domain& launch_domain) const override; @@ -64,26 +61,27 @@ class AffineFunctor : public LegateProjectionFunctor { Point offsets_; }; -LegateProjectionFunctor::LegateProjectionFunctor(Runtime* rt) : ProjectionFunctor(rt) {} +LegateProjectionFunctor::LegateProjectionFunctor(Legion::Runtime* rt) : ProjectionFunctor(rt) {} -LogicalRegion LegateProjectionFunctor::project(LogicalPartition upper_bound, - const DomainPoint& point, - const Domain& launch_domain) +Legion::LogicalRegion LegateProjectionFunctor::project(Legion::LogicalPartition upper_bound, + const DomainPoint& point, + const Domain& launch_domain) { const DomainPoint dp = project_point(point, launch_domain); if (runtime->has_logical_subregion_by_color(upper_bound, dp)) return runtime->get_logical_subregion_by_color(upper_bound, dp); else - return LogicalRegion::NO_REGION; + return Legion::LogicalRegion::NO_REGION; } -DelinearizationFunctor::DelinearizationFunctor(Runtime* runtime) : LegateProjectionFunctor(runtime) 
+DelinearizationFunctor::DelinearizationFunctor(Legion::Runtime* runtime) + : LegateProjectionFunctor(runtime) { } -LogicalRegion DelinearizationFunctor::project(LogicalPartition upper_bound, - const DomainPoint& point, - const Domain& launch_domain) +Legion::LogicalRegion DelinearizationFunctor::project(Legion::LogicalPartition upper_bound, + const DomainPoint& point, + const Domain& launch_domain) { const auto color_space = runtime->get_index_partition_color_space(upper_bound.get_index_partition()); @@ -108,17 +106,17 @@ LogicalRegion DelinearizationFunctor::project(LogicalPartition upper_bound, if (runtime->has_logical_subregion_by_color(upper_bound, delinearized)) return runtime->get_logical_subregion_by_color(upper_bound, delinearized); else - return LogicalRegion::NO_REGION; + return Legion::LogicalRegion::NO_REGION; } -Legion::DomainPoint DelinearizationFunctor::project_point(const Legion::DomainPoint& point, - const Legion::Domain& launch_domain) const +DomainPoint DelinearizationFunctor::project_point(const DomainPoint& point, + const Domain& launch_domain) const { return point; } template -AffineFunctor::AffineFunctor(Runtime* runtime, +AffineFunctor::AffineFunctor(Legion::Runtime* runtime, int32_t* dims, int32_t* weights, int32_t* offsets) @@ -170,7 +168,7 @@ template } struct IdentityFunctor : public LegateProjectionFunctor { - IdentityFunctor(Runtime* runtime) : LegateProjectionFunctor(runtime) {} + IdentityFunctor(Legion::Runtime* runtime) : LegateProjectionFunctor(runtime) {} DomainPoint project_point(const DomainPoint& point, const Domain&) const override { return point; @@ -178,7 +176,7 @@ struct IdentityFunctor : public LegateProjectionFunctor { }; static LegateProjectionFunctor* identity_functor{nullptr}; -static std::unordered_map functor_table{}; +static std::unordered_map functor_table{}; static std::mutex functor_table_lock{}; struct create_affine_functor_fn { @@ -218,8 +216,11 @@ struct create_affine_functor_fn { } template - void 
operator()( - Runtime* runtime, int32_t* dims, int32_t* weights, int32_t* offsets, ProjectionID proj_id) + void operator()(Legion::Runtime* runtime, + int32_t* dims, + int32_t* weights, + int32_t* offsets, + Legion::ProjectionID proj_id) { auto functor = new AffineFunctor(runtime, dims, weights, offsets); #ifdef DEBUG_LEGATE @@ -251,7 +252,8 @@ void register_legate_core_projection_functors(Legion::Runtime* runtime, identity_functor = new IdentityFunctor(runtime); } -LegateProjectionFunctor* find_legate_projection_functor(ProjectionID proj_id, bool allow_missing) +LegateProjectionFunctor* find_legate_projection_functor(Legion::ProjectionID proj_id, + bool allow_missing) { if (0 == proj_id) return identity_functor; const std::lock_guard lock(functor_table_lock); @@ -264,7 +266,7 @@ LegateProjectionFunctor* find_legate_projection_functor(ProjectionID proj_id, bo return result; } -struct LinearizingPointTransformFunctor : public PointTransformFunctor { +struct LinearizingPointTransformFunctor : public Legion::PointTransformFunctor { // This is actually an invertible functor, but we will not use this for inversion virtual bool is_invertible(void) const { return false; } @@ -300,7 +302,7 @@ void legate_register_affine_projection_functor(int32_t src_ndim, int32_t* offsets, legion_projection_id_t proj_id) { - auto runtime = Runtime::get_runtime(); + auto runtime = Legion::Runtime::get_runtime(); legate::double_dispatch(src_ndim, tgt_ndim, legate::create_affine_functor_fn{}, diff --git a/src/core/runtime/projection.h b/src/core/runtime/projection.h index cf74d1689..4eb2bb1ad 100644 --- a/src/core/runtime/projection.h +++ b/src/core/runtime/projection.h @@ -19,6 +19,7 @@ #include "legion.h" #include "core/runtime/context.h" +#include "core/utilities/typedefs.h" namespace legate { @@ -30,8 +31,8 @@ class LegateProjectionFunctor : public Legion::ProjectionFunctor { public: using Legion::ProjectionFunctor::project; virtual Legion::LogicalRegion project(Legion::LogicalPartition 
upper_bound, - const Legion::DomainPoint& point, - const Legion::Domain& launch_domain); + const DomainPoint& point, + const Domain& launch_domain); public: // legate projection functors are almost always functional and don't traverse the region tree @@ -42,8 +43,8 @@ class LegateProjectionFunctor : public Legion::ProjectionFunctor { void set_collective() { is_collective_ = true; } public: - virtual Legion::DomainPoint project_point(const Legion::DomainPoint& point, - const Legion::Domain& launch_domain) const = 0; + virtual DomainPoint project_point(const DomainPoint& point, + const Domain& launch_domain) const = 0; private: bool is_collective_ = false; diff --git a/src/core/runtime/runtime.cc b/src/core/runtime/runtime.cc index 27e14a7da..2b17a830b 100644 --- a/src/core/runtime/runtime.cc +++ b/src/core/runtime/runtime.cc @@ -23,12 +23,11 @@ #include "core/task/task.h" #include "core/utilities/deserializer.h" #include "core/utilities/machine.h" +#include "core/utilities/nvtx_help.h" #include "legate.h" namespace legate { -using namespace Legion; - Logger log_legate("legate"); // This is the unique string name for our library which can be used @@ -95,7 +94,7 @@ static void extract_scalar_task( Legion::Runtime* runtime; Legion::Runtime::legion_task_preamble(args, arglen, p, task, regions, legion_context, runtime); - Core::show_progress(task, legion_context, runtime, task->get_task_name()); + Core::show_progress(task, legion_context, runtime); TaskContext context(task, *regions, legion_context, runtime); auto idx = context.scalars()[0].value(); @@ -112,50 +111,51 @@ static void extract_scalar_task( /*static*/ void Core::show_progress(const Legion::Task* task, Legion::Context ctx, - Legion::Runtime* runtime, - const char* task_name) + Legion::Runtime* runtime) { if (!Core::show_progress_requested) return; const auto exec_proc = runtime->get_executing_processor(ctx); - const auto proc_kind_str = (exec_proc.kind() == Legion::Processor::LOC_PROC) ? 
"CPU" - : (exec_proc.kind() == Legion::Processor::TOC_PROC) ? "GPU" - : "OpenMP"; + const auto proc_kind_str = (exec_proc.kind() == Processor::LOC_PROC) ? "CPU" + : (exec_proc.kind() == Processor::TOC_PROC) ? "GPU" + : "OpenMP"; std::stringstream point_str; const auto& point = task->index_point; point_str << point[0]; - for (int32_t dim = 1; dim < task->index_point.dim; ++dim) point_str << "," << point[dim]; + for (int32_t dim = 1; dim < point.dim; ++dim) point_str << "," << point[dim]; log_legate.print("%s %s task [%s], pt = (%s), proc = " IDFMT, - task_name, + task->get_task_name(), proc_kind_str, task->get_provenance_string().c_str(), point_str.str().c_str(), exec_proc.id); } -/*static*/ void Core::report_unexpected_exception(const char* task_name, +/*static*/ void Core::report_unexpected_exception(const Legion::Task* task, const legate::TaskException& e) { log_legate.error( "Task %s threw an exception \"%s\", but the task did not declare any exception. " "Please specify a Python exception that you want this exception to be re-thrown with " "using 'throws_exception'.", - task_name, + task->get_task_name(), e.error_message().c_str()); LEGATE_ABORT; } -void register_legate_core_tasks(Machine machine, Runtime* runtime, const LibraryContext& context) +void register_legate_core_tasks(Legion::Machine machine, + Legion::Runtime* runtime, + const LibraryContext& context) { - const TaskID extract_scalar_task_id = context.get_task_id(LEGATE_CORE_EXTRACT_SCALAR_TASK_ID); + auto extract_scalar_task_id = context.get_task_id(LEGATE_CORE_EXTRACT_SCALAR_TASK_ID); const char* extract_scalar_task_name = "core::extract_scalar"; runtime->attach_name( extract_scalar_task_id, extract_scalar_task_name, false /*mutable*/, true /*local only*/); auto make_registrar = [&](auto task_id, auto* task_name, auto proc_kind) { - TaskVariantRegistrar registrar(task_id, task_name); - registrar.add_constraint(ProcessorConstraint(proc_kind)); + Legion::TaskVariantRegistrar registrar(task_id, 
task_name); + registrar.add_constraint(Legion::ProcessorConstraint(proc_kind)); registrar.set_leaf(true); registrar.global_registration = false; return registrar; @@ -178,10 +178,11 @@ void register_legate_core_tasks(Machine machine, Runtime* runtime, const Library comm::register_tasks(machine, runtime, context); } -extern void register_exception_reduction_op(Runtime* runtime, const LibraryContext& context); +extern void register_exception_reduction_op(Legion::Runtime* runtime, + const LibraryContext& context); -/*static*/ void core_registration_callback(Machine machine, - Runtime* runtime, +/*static*/ void core_registration_callback(Legion::Machine machine, + Legion::Runtime* runtime, const std::set& local_procs) { ResourceConfig config; @@ -190,7 +191,7 @@ extern void register_exception_reduction_op(Runtime* runtime, const LibraryConte // We register one sharding functor for each new projection functor config.max_shardings = LEGATE_CORE_MAX_FUNCTOR_ID; config.max_reduction_ops = LEGATE_CORE_MAX_REDUCTION_OP_ID; - LibraryContext context(runtime, core_library_name, config); + LibraryContext context(core_library_name, config); register_legate_core_tasks(machine, runtime, context); @@ -203,7 +204,7 @@ extern void register_exception_reduction_op(Runtime* runtime, const LibraryConte register_legate_core_sharding_functors(runtime, context); auto fut = runtime->select_tunable_value( - Runtime::get_context(), LEGATE_CORE_TUNABLE_HAS_SOCKET_MEM, context.get_mapper_id(0)); + Legion::Runtime::get_context(), LEGATE_CORE_TUNABLE_HAS_SOCKET_MEM, context.get_mapper_id(0)); Core::has_socket_mem = fut.get_result(); } diff --git a/src/core/runtime/runtime.h b/src/core/runtime/runtime.h index b7b86c836..a6143f44e 100644 --- a/src/core/runtime/runtime.h +++ b/src/core/runtime/runtime.h @@ -21,21 +21,42 @@ #include "core/task/exception.h" #include "core/utilities/typedefs.h" +/** @defgroup runtime Runtime and library contexts + */ + namespace legate { extern uint32_t extract_env(const 
char* env_name, const uint32_t default_value, const uint32_t test_value); -class Core { +/** + * @ingroup runtime + * @brief A utility class that collects static members shared by all Legate libraries + */ +struct Core { public: static void parse_config(void); static void shutdown(void); static void show_progress(const Legion::Task* task, Legion::Context ctx, - Legion::Runtime* runtime, - const char* task_name); - static void report_unexpected_exception(const char* task_name, const legate::TaskException& e); + Legion::Runtime* runtime); + static void report_unexpected_exception(const Legion::Task* task, const legate::TaskException& e); + + public: + /** + * @brief Type signature for registration callbacks + */ + using RegistrationCallback = void (*)(); + + /** + * @brief Performs a registration callback. Libraries must perform + * registration of tasks and other components through this function. + * + * @tparam CALLBACK Registration callback to perform + */ + template + static void perform_registration(); public: // Configuration settings @@ -47,3 +68,5 @@ class Core { }; } // namespace legate + +#include "core/runtime/runtime.inl" diff --git a/src/core/runtime/runtime.inl b/src/core/runtime/runtime.inl new file mode 100644 index 000000000..0c98e9c7f --- /dev/null +++ b/src/core/runtime/runtime.inl @@ -0,0 +1,42 @@ +/* Copyright 2023 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ + +#pragma once + +#include "core/runtime/runtime.h" + +namespace legate { + +namespace detail { + +template +void invoke_legate_registration_callback(Legion::Machine, + Legion::Runtime*, + const std::set&) +{ + CALLBACK(); +}; + +} // namespace detail + +template +/*static*/ void Core::perform_registration() +{ + Legion::Runtime::perform_registration_callback( + detail::invoke_legate_registration_callback, true /*global*/); +} + +} // namespace legate diff --git a/src/core/runtime/shard.cc b/src/core/runtime/shard.cc index eca6a7775..db2e74434 100644 --- a/src/core/runtime/shard.cc +++ b/src/core/runtime/shard.cc @@ -23,16 +23,16 @@ #include "core/runtime/shard.h" #include "core/utilities/linearize.h" -using namespace Legion; - namespace legate { -static std::unordered_map functor_id_table; +static std::unordered_map functor_id_table; static std::mutex functor_table_lock; -class ToplevelTaskShardingFunctor : public ShardingFunctor { +class ToplevelTaskShardingFunctor : public Legion::ShardingFunctor { public: - virtual ShardID shard(const DomainPoint& p, const Domain& launch_space, const size_t total_shards) + virtual Legion::ShardID shard(const DomainPoint& p, + const Domain& launch_space, + const size_t total_shards) { // Just tile this space in 1D const Point<1> point = p; @@ -43,9 +43,11 @@ class ToplevelTaskShardingFunctor : public ShardingFunctor { } }; -class LinearizingShardingFunctor : public ShardingFunctor { +class LinearizingShardingFunctor : public Legion::ShardingFunctor { public: - virtual ShardID shard(const DomainPoint& p, const Domain& launch_space, const size_t total_shards) + virtual Legion::ShardID shard(const DomainPoint& p, + const Domain& launch_space, + const size_t total_shards) { const size_t size = launch_space.get_volume(); const size_t chunk = (size + total_shards - 1) / total_shards; @@ -54,7 +56,7 @@ class LinearizingShardingFunctor : public ShardingFunctor { virtual bool is_invertible(void) const { return true; } - virtual 
void invert(ShardID shard, + virtual void invert(Legion::ShardID shard, const Domain& shard_domain, const Domain& full_domain, const size_t total_shards, @@ -95,14 +97,14 @@ void register_legate_core_sharding_functors(Legion::Runtime* runtime, const Libr functor_id_table[context.get_projection_id(LEGATE_CORE_DELINEARIZE_PROJ_ID)] = sharding_id; } -class LegateShardingFunctor : public ShardingFunctor { +class LegateShardingFunctor : public Legion::ShardingFunctor { public: LegateShardingFunctor(LegateProjectionFunctor* proj_functor) : proj_functor_(proj_functor) {} public: - virtual ShardID shard(const DomainPoint& p, - const Domain& launch_space, - const size_t total_shards) override + virtual Legion::ShardID shard(const DomainPoint& p, + const Domain& launch_space, + const size_t total_shards) override { auto lo = proj_functor_->project_point(launch_space.lo(), launch_space); auto hi = proj_functor_->project_point(launch_space.hi(), launch_space); @@ -117,25 +119,25 @@ class LegateShardingFunctor : public ShardingFunctor { LegateProjectionFunctor* proj_functor_; }; -ShardingID find_sharding_functor_by_projection_functor(Legion::ProjectionID proj_id) +Legion::ShardingID find_sharding_functor_by_projection_functor(Legion::ProjectionID proj_id) { const std::lock_guard lock(legate::functor_table_lock); assert(functor_id_table.find(proj_id) != functor_id_table.end()); return functor_id_table[proj_id]; } -struct callback_args_t { +struct ShardingCallbackArgs { Legion::ShardID shard_id; Legion::ProjectionID proj_id; }; static void sharding_functor_registration_callback(const Legion::RegistrationCallbackArgs& args) { - auto p_args = static_cast(args.buffer.get_ptr()); + auto p_args = static_cast(args.buffer.get_ptr()); auto shard_id = p_args->shard_id; auto proj_id = p_args->proj_id; - auto runtime = Runtime::get_runtime(); + auto runtime = Legion::Runtime::get_runtime(); auto sharding_functor = new 
legate::LegateShardingFunctor(legate::find_legate_projection_functor(proj_id)); runtime->register_sharding_functor(shard_id, sharding_functor, true /*silence warnings*/); @@ -148,13 +150,13 @@ extern "C" { void legate_create_sharding_functor_using_projection(Legion::ShardID shard_id, Legion::ProjectionID proj_id) { - auto runtime = Runtime::get_runtime(); - legate::callback_args_t args{shard_id, proj_id}; + auto runtime = Legion::Runtime::get_runtime(); + legate::ShardingCallbackArgs args{shard_id, proj_id}; { const std::lock_guard lock(legate::functor_table_lock); legate::functor_id_table[proj_id] = shard_id; } - UntypedBuffer buffer(&args, sizeof(args)); + Legion::UntypedBuffer buffer(&args, sizeof(args)); Legion::Runtime::perform_registration_callback( legate::sharding_functor_registration_callback, buffer, false /*global*/, false /*dedup*/); } diff --git a/src/core/task/exception.h b/src/core/task/exception.h index ec0c147ab..9f34f8b76 100644 --- a/src/core/task/exception.h +++ b/src/core/task/exception.h @@ -18,22 +18,60 @@ #include +/** + * @file + * @brief Class definition for legate::TaskException + */ + namespace legate { +/** + * @ingroup task + * @brief An exception class used in cross language exception handling + * + * Any client that needs to catch a C++ exception during task execution and have it rethrown + * on the launcher side should wrap that C++ exception with a `TaskException`. In case the + * task can raise more than one type of exception, they are distinguished by integer ids; + * the launcher is responsible for enumerating a list of all exceptions that can be raised + * and the integer ids are positions in that list. + */ class TaskException : public std::exception { public: - TaskException(const std::string& error_message) : index_(0), error_message_(error_message) {} - + /** + * @brief Constructs a `TaskException` object with an exception id and an error message. 
+ * The id must be a valid index for the list of exceptions declared by the launcher. + * + * @param index Exception id + * @param error_message Error message + */ TaskException(int32_t index, const std::string& error_message) : index_(index), error_message_(error_message) { } + /** + * @brief Constructs a `TaskException` object with an error message. The exception id + * is set to 0. + * + * @param error_message Error message + */ + TaskException(const std::string& error_message) : index_(0), error_message_(error_message) {} + public: virtual const char* what() const throw() { return error_message_.c_str(); } public: + /** + * @brief Returns the exception id + * + * @return The exception id + */ int32_t index() const { return index_; } + /** + * @brief Returns the error message + * + * @return The error message + */ const std::string& error_message() const { return error_message_; } private: diff --git a/src/core/task/registrar.cc b/src/core/task/registrar.cc new file mode 100644 index 000000000..3386d2010 --- /dev/null +++ b/src/core/task/registrar.cc @@ -0,0 +1,99 @@ +/* Copyright 2023 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ + +#include "core/task/registrar.h" + +#include "core/runtime/context.h" +#include "core/utilities/typedefs.h" + +namespace legate { + +struct PendingTaskVariant : public Legion::TaskVariantRegistrar { + PendingTaskVariant(void) + : Legion::TaskVariantRegistrar(), task_name(nullptr), var(LEGATE_NO_VARIANT) + { + } + PendingTaskVariant(Legion::TaskID tid, + bool global, + const char* var_name, + const char* t_name, + const Legion::CodeDescriptor& desc, + LegateVariantCode v, + size_t ret) + : Legion::TaskVariantRegistrar(tid, global, var_name), + task_name(t_name), + descriptor(desc), + var(v), + ret_size(ret) + { + } + + const char* task_name; + Legion::CodeDescriptor descriptor; + LegateVariantCode var; + size_t ret_size; +}; + +void TaskRegistrar::record_variant(Legion::TaskID tid, + const char* task_name, + const Legion::CodeDescriptor& desc, + Legion::ExecutionConstraintSet& execution_constraints, + Legion::TaskLayoutConstraintSet& layout_constraints, + LegateVariantCode var, + Processor::Kind kind, + const VariantOptions& options) +{ + assert((kind == Processor::LOC_PROC) || (kind == Processor::TOC_PROC) || + (kind == Processor::OMP_PROC)); + + // Buffer these up until we can do our actual registration with the runtime + auto registrar = new PendingTaskVariant(tid, + false /*global*/, + (kind == Processor::LOC_PROC) ? "CPU" + : (kind == Processor::TOC_PROC) ? 
"GPU" + : "OpenMP", + task_name, + desc, + var, + options.return_size); + + registrar->execution_constraints.swap(execution_constraints); + registrar->layout_constraints.swap(layout_constraints); + registrar->add_constraint(Legion::ProcessorConstraint(kind)); + registrar->set_leaf(options.leaf); + registrar->set_inner(options.inner); + registrar->set_idempotent(options.idempotent); + registrar->set_concurrent(options.concurrent); + + pending_task_variants_.push_back(registrar); +} + +void TaskRegistrar::register_all_tasks(const LibraryContext& context) +{ + auto runtime = Legion::Runtime::get_runtime(); + // Do all our registrations + for (auto& task : pending_task_variants_) { + task->task_id = + context.get_task_id(task->task_id); // Convert a task local task id to a global id + // Attach the task name too for debugging + runtime->attach_name(task->task_id, task->task_name, false /*mutable*/, true /*local only*/); + runtime->register_task_variant(*task, task->descriptor, nullptr, 0, task->ret_size, task->var); + delete task; + } + pending_task_variants_.clear(); +} + +} // namespace legate diff --git a/src/core/task/registrar.h b/src/core/task/registrar.h new file mode 100644 index 000000000..67c8c0d81 --- /dev/null +++ b/src/core/task/registrar.h @@ -0,0 +1,108 @@ +/* Copyright 2023 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ + +#pragma once + +#include + +#include "legion.h" + +#include "core/task/variant.h" +#include "core/utilities/typedefs.h" + +/** + * @file + * @brief Class definition fo legate::TaskRegistrar + */ + +namespace legate { + +class LibraryContext; +class PendingTaskVariant; + +/** + * @ingroup task + * @brief A helper class for task variant registration. + * + * The `legate::TaskRegistrar` class is designed to simplify the boilerplate that client libraries + * need to register all its task variants. The following is a boilerplate that each library + * needs to write: + * + * @code{.cpp} + * struct MyLibrary { + * public: + * template + * static void record_variant(Args&&... args) + * { + * get_registrar().record_variant(std::forward(args)...); + * } + * static legate::TaskRegistrar& get_registrar(); + * }; + * + * template + * struct MyLibraryTaskBase : public legate::LegateTask { + * using Registrar = MyLibrary; + * + * ... + * }; + * @endcode + * + * In the code above, the `MyLibrary` has a static member that returns a singleton + * `legate::TaskRegistrar` object, and another member `record_variant` that simply forwards all + * arguments to the registrar. Then, the `MyLibraryTaskBase` points to the class so Legate can find + * where task variants are registered. + * + * Once this registrar is set up in a library, each library task can simply register itself + * with the `LegateTask::register_variants` method like the following: + * + * @code{.cpp} + * // In a header + * struct MyLibraryTask : public MyLibraryTaskBase { + * ... 
+ * }; + * + * // In a C++ file + * static void __attribute__((constructor)) register_tasks() + * { + * MyLibraryTask::register_variants(); + * } + * @endcode + */ +class TaskRegistrar { + public: + void record_variant(Legion::TaskID tid, + const char* task_name, + const Legion::CodeDescriptor& desc, + Legion::ExecutionConstraintSet& execution_constraints, + Legion::TaskLayoutConstraintSet& layout_constraints, + LegateVariantCode var, + Processor::Kind kind, + const VariantOptions& options); + + public: + /** + * @brief Registers all tasks recorded in this registrar. Typically invoked in a registration + * callback of a library. + * + * @param context Context of the library that owns this registrar + */ + void register_all_tasks(const LibraryContext& context); + + private: + std::vector pending_task_variants_; +}; + +} // namespace legate diff --git a/src/core/task/return.cc b/src/core/task/return.cc index 98c70419a..10d69d079 100644 --- a/src/core/task/return.cc +++ b/src/core/task/return.cc @@ -31,8 +31,6 @@ #include "core/cuda/stream_pool.h" #endif -using namespace Legion; - namespace legate { ReturnValue::ReturnValue(Legion::UntypedDeferredValue value, size_t size) @@ -43,7 +41,7 @@ ReturnValue::ReturnValue(Legion::UntypedDeferredValue value, size_t size) /*static*/ ReturnValue ReturnValue::unpack(const void* ptr, size_t size, Memory::Kind memory_kind) { - ReturnValue result(UntypedDeferredValue(size, memory_kind), size); + ReturnValue result(Legion::UntypedDeferredValue(size, memory_kind), size); #ifdef DEBUG_LEGATE assert(!result.is_device_value()); #endif @@ -108,12 +106,14 @@ static void pack_returned_exception(const ReturnedException& value, void*& ptr, value.legion_serialize(ptr); } -static void returned_exception_init(const ReductionOp* reduction_op, void*& ptr, size_t& size) +static void returned_exception_init(const Legion::ReductionOp* reduction_op, + void*& ptr, + size_t& size) { pack_returned_exception(JoinReturnedException::identity, ptr, size); } 
-static void returned_exception_fold(const ReductionOp* reduction_op, +static void returned_exception_fold(const Legion::ReductionOp* reduction_op, void*& lhs_ptr, size_t& lhs_size, const void* rhs_ptr) @@ -171,7 +171,7 @@ ReturnValue ReturnedException::pack() const { auto buffer_size = legion_buffer_size(); auto mem_kind = find_memory_kind_for_executing_processor(); - auto buffer = UntypedDeferredValue(buffer_size, mem_kind); + auto buffer = Legion::UntypedDeferredValue(buffer_size, mem_kind); AccessorWO acc(buffer, buffer_size, false); legion_serialize(acc.ptr(0)); @@ -296,10 +296,10 @@ void ReturnValues::legion_deserialize(const void* buffer) return ReturnValue::unpack(values + offset, size, kind); } -void ReturnValues::finalize(Context legion_context) const +void ReturnValues::finalize(Legion::Context legion_context) const { if (return_values_.empty()) { - Runtime::legion_task_postamble(legion_context); + Legion::Runtime::legion_task_postamble(legion_context); return; } else if (return_values_.size() == 1) { return_values_.front().finalize(legion_context); @@ -317,17 +317,18 @@ void ReturnValues::finalize(Context legion_context) const size_t return_size = legion_buffer_size(); auto return_buffer = - UntypedDeferredValue(return_size, find_memory_kind_for_executing_processor()); + Legion::UntypedDeferredValue(return_size, find_memory_kind_for_executing_processor()); AccessorWO acc(return_buffer, return_size, false); legion_serialize(acc.ptr(0)); return_buffer.finalize(legion_context); } -void register_exception_reduction_op(Runtime* runtime, const LibraryContext& context) +void register_exception_reduction_op(Legion::Runtime* runtime, const LibraryContext& context) { auto redop_id = context.get_reduction_op_id(LEGATE_CORE_JOIN_EXCEPTION_OP); auto* redop = Realm::ReductionOpUntyped::create_reduction_op(); - Runtime::register_reduction_op(redop_id, redop, returned_exception_init, returned_exception_fold); + Legion::Runtime::register_reduction_op( + redop_id, redop, 
returned_exception_init, returned_exception_fold); } } // namespace legate diff --git a/src/core/task/return.h b/src/core/task/return.h index 031bb71f2..6b7b9935f 100644 --- a/src/core/task/return.h +++ b/src/core/task/return.h @@ -17,6 +17,7 @@ #pragma once #include +#include "core/utilities/typedefs.h" namespace legate { @@ -29,7 +30,7 @@ struct ReturnValue { ReturnValue& operator=(const ReturnValue&) = default; public: - static ReturnValue unpack(const void* ptr, size_t size, Legion::Memory::Kind memory_kind); + static ReturnValue unpack(const void* ptr, size_t size, Memory::Kind memory_kind); public: void* ptr(); diff --git a/src/core/task/task.cc b/src/core/task/task.cc index 51ac3e1c1..26366c64b 100644 --- a/src/core/task/task.cc +++ b/src/core/task/task.cc @@ -16,54 +16,73 @@ #include "core/task/task.h" -namespace legate { +#include -using namespace Legion; +#include "realm/faults.h" -void LegateTaskRegistrar::record_variant(TaskID tid, - const char* task_name, - const CodeDescriptor& descriptor, - ExecutionConstraintSet& execution_constraints, - TaskLayoutConstraintSet& layout_constraints, - LegateVariantCode var, - Processor::Kind kind, - const VariantOptions& options) -{ - assert((kind == Processor::LOC_PROC) || (kind == Processor::TOC_PROC) || - (kind == Processor::OMP_PROC)); +#include "core/runtime/context.h" +#include "core/runtime/runtime.h" +#include "core/task/exception.h" +#include "core/task/registrar.h" +#include "core/task/return.h" +#include "core/utilities/deserializer.h" +#include "core/utilities/nvtx_help.h" +#include "core/utilities/typedefs.h" - // Buffer these up until we can do our actual registration with the runtime - pending_task_variants_.push_back(PendingTaskVariant(tid, - false /*global*/, - (kind == Processor::LOC_PROC) ? "CPU" - : (kind == Processor::TOC_PROC) ? 
"GPU" - : "OpenMP", - task_name, - descriptor, - var, - options.return_size)); +namespace legate { +namespace detail { - auto& registrar = pending_task_variants_.back(); - registrar.execution_constraints.swap(execution_constraints); - registrar.layout_constraints.swap(layout_constraints); - registrar.add_constraint(ProcessorConstraint(kind)); - registrar.set_leaf(options.leaf); - registrar.set_inner(options.inner); - registrar.set_idempotent(options.idempotent); - registrar.set_concurrent(options.concurrent); +std::string generate_task_name(const std::type_info& ti) +{ + std::string result; + int status = 0; + char* demangled = abi::__cxa_demangle(ti.name(), 0, 0, &status); + result = demangled; + free(demangled); + return std::move(result); } -void LegateTaskRegistrar::register_all_tasks(Runtime* runtime, LibraryContext& context) +void task_wrapper(VariantImpl variant_impl, + const char* task_name, + const void* args, + size_t arglen, + const void* userdata, + size_t userlen, + Processor p) + { - // Do all our registrations - for (auto& task : pending_task_variants_) { - task.task_id = - context.get_task_id(task.task_id); // Convert a task local task id to a global id - // Attach the task name too for debugging - runtime->attach_name(task.task_id, task.task_name, false /*mutable*/, true /*local only*/); - runtime->register_task_variant(task, task.descriptor, nullptr, 0, task.ret_size, task.var); + // Legion preamble + const Legion::Task* task; + const std::vector* regions; + Legion::Context legion_context; + Legion::Runtime* runtime; + Legion::Runtime::legion_task_preamble(args, arglen, p, task, regions, legion_context, runtime); + +#ifdef LEGATE_USE_CUDA + nvtx::Range auto_range(task_name); +#endif + + Core::show_progress(task, legion_context, runtime); + + TaskContext context(task, *regions, legion_context, runtime); + + ReturnValues return_values{}; + try { + if (!Core::use_empty_task) (*variant_impl)(context); + return_values = context.pack_return_values(); + 
} catch (legate::TaskException& e) { + if (context.can_raise_exception()) { + context.make_all_unbound_stores_empty(); + return_values = context.pack_return_values_with_exception(e.index(), e.error_message()); + } else + // If a Legate exception is thrown by a task that does not declare any exception, + // this is a bug in the library that needs to be reported to the developer + Core::report_unexpected_exception(task, e); } - pending_task_variants_.clear(); + + // Legion postamble + return_values.finalize(legion_context); } +} // namespace detail } // namespace legate diff --git a/src/core/task/task.h b/src/core/task/task.h index b90a4e86b..50577bce2 100644 --- a/src/core/task/task.h +++ b/src/core/task/task.h @@ -16,293 +16,81 @@ #pragma once -#include -#include - -#include "legion.h" -#include "realm/faults.h" - -#include "core/runtime/context.h" -#include "core/runtime/runtime.h" -#include "core/task/exception.h" -#include "core/task/return.h" -#include "core/utilities/deserializer.h" -#include "core/utilities/nvtx_help.h" +#include "core/task/variant.h" #include "core/utilities/typedefs.h" -namespace legate { +/** @defgroup task Task + */ -// We're going to allow for each task to use only up to 341 scalar output stores -constexpr size_t LEGATE_MAX_SIZE_SCALAR_RETURN = 4096; +/** + * @file + * @brief Class definition fo legate::LegateTask + */ +namespace legate { -struct VariantOptions { - bool leaf{true}; - bool inner{false}; - bool idempotent{false}; - bool concurrent{false}; - size_t return_size{LEGATE_MAX_SIZE_SCALAR_RETURN}; +class TaskContext; - VariantOptions& with_leaf(bool _leaf) - { - leaf = _leaf; - return *this; - } - VariantOptions& with_inner(bool _inner) - { - inner = _inner; - return *this; - } - VariantOptions& with_idempotent(bool _idempotent) - { - idempotent = _idempotent; - return *this; - } - VariantOptions& with_concurrent(bool _concurrent) - { - concurrent = _concurrent; - return *this; - } - VariantOptions& with_return_size(size_t 
_return_size) - { - return_size = _return_size; - return *this; - } -}; - -using LegateVariantImpl = void (*)(TaskContext&); +/** + * @brief Function signature for task variants. Each task variant must be a function of this type. + */ +using VariantImpl = void (*)(TaskContext&); +/** + * @ingroup task + * @brief A base class template for Legate task implementations. + * + * Any Legate task class must inherit legate::LegateTask directly or transitively. The type + * parameter `T` needs to be bound to a child Legate task class that inherits legate::LegateTask. + * + * Curently, each task can have up to three variants and the variants need to be static member + * functions of the class under the following names: + * + * - `cpu_variant`: CPU implementation of the task + * - `gpu_variant`: GPU implementation of the task + * - `omp_variant`: OpenMP implementation of the task + * + * Tasks must have at least one variant, and all task variants must be semantically equivalent + * (modulo some minor rounding errors due to floating point imprecision). + * + * Each task class must also have a type alias `Registrar` that points to a library specific + * registrar class. (See legate::TaskRegistrar for details.) 
+ */ template -class LegateTask { - protected: - // Helper class for checking for various kinds of variants - using __no = int8_t[1]; - using __yes = int8_t[2]; - struct HasCPUVariant { - template - static __yes& test(decltype(&U::cpu_variant)); - template - static __no& test(...); - static const bool value = (sizeof(test(0)) == sizeof(__yes)); - }; - struct HasOMPVariant { - template - static __yes& test(decltype(&U::omp_variant)); - template - static __no& test(...); - static const bool value = (sizeof(test(0)) == sizeof(__yes)); - }; - struct HasGPUVariant { - template - static __yes& test(decltype(&U::gpu_variant)); - template - static __no& test(...); - static const bool value = (sizeof(test(0)) == sizeof(__yes)); - }; - - public: - static const char* task_name() - { - static std::string result; - if (result.empty()) { - int status = 0; - char* demangled = abi::__cxa_demangle(typeid(T).name(), 0, 0, &status); - result = demangled; - free(demangled); - } +struct LegateTask { + // Exports the base class so we can access it via subclass T + using BASE = LegateTask; + + /** + * @brief Registers all task variants of the task. The client can optionally specify + * variant options. + * + * @param all_options Options for task variants. Variants with no entires in `all_options` will + * use the default set of options + */ + static void register_variants( + const std::map& all_options = {}); - return result.c_str(); - } + private: + template typename, bool> + friend struct detail::RegisterVariantImpl; - // Task wrappers so we can instrument all Legate tasks if we want - template + // A wrapper that wraps all Legate task variant implementations. 
Provides + // common functionalities and instrumentations + template static void legate_task_wrapper( - const void* args, size_t arglen, const void* userdata, size_t userlen, Legion::Processor p) - { - // Legion preamble - const Legion::Task* task; - const std::vector* regions; - Legion::Context legion_context; - Legion::Runtime* runtime; - Legion::Runtime::legion_task_preamble(args, arglen, p, task, regions, legion_context, runtime); + const void* args, size_t arglen, const void* userdata, size_t userlen, Processor p); -#ifdef LEGATE_USE_CUDA - nvtx::Range auto_range(task_name()); -#endif - - Core::show_progress(task, legion_context, runtime, task_name()); - - TaskContext context(task, *regions, legion_context, runtime); - - ReturnValues return_values{}; - try { - if (!Core::use_empty_task) (*TASK_PTR)(context); - return_values = context.pack_return_values(); - } catch (legate::TaskException& e) { - if (context.can_raise_exception()) { - context.make_all_unbound_stores_empty(); - return_values = context.pack_return_values_with_exception(e.index(), e.error_message()); - } else - // If a Legate exception is thrown by a task that does not declare any exception, - // this is a bug in the library that needs to be reported to the developer - Core::report_unexpected_exception(task_name(), e); - } - - // Legion postamble - return_values.finalize(legion_context); - } - - public: - // Methods for registering variants - template + // A helper to register a single task variant + template static void register_variant(Legion::ExecutionConstraintSet& execution_constraints, Legion::TaskLayoutConstraintSet& layout_constraints, LegateVariantCode var, - Legion::Processor::Kind kind, - const VariantOptions& options) - { - // Construct the code descriptor for this task so that the library - // can register it later when it is ready - Legion::CodeDescriptor desc(legate_task_wrapper); - auto task_id = T::TASK_ID; + Processor::Kind kind, + const VariantOptions& options); - 
T::Registrar::record_variant( - task_id, T::task_name(), desc, execution_constraints, layout_constraints, var, kind, options); - } - static void register_variants( - const std::map& all_options = {}); -}; - -template -class RegisterCPUVariant { - public: - static void register_variant(const VariantOptions& options) - { - Legion::ExecutionConstraintSet execution_constraints; - Legion::TaskLayoutConstraintSet layout_constraints; - BASE::template register_variant(execution_constraints, - layout_constraints, - LEGATE_CPU_VARIANT, - Legion::Processor::LOC_PROC, - options); - } -}; - -template -class RegisterCPUVariant { - public: - static void register_variant(const VariantOptions& options) - { - // Do nothing - } -}; - -template -class RegisterOMPVariant { - public: - static void register_variant(const VariantOptions& options) - { - Legion::ExecutionConstraintSet execution_constraints; - Legion::TaskLayoutConstraintSet layout_constraints; - BASE::template register_variant(execution_constraints, - layout_constraints, - LEGATE_OMP_VARIANT, - Legion::Processor::OMP_PROC, - options); - } -}; - -template -class RegisterOMPVariant { - public: - static void register_variant(const VariantOptions& options) - { - // Do nothing - } -}; - -template -class RegisterGPUVariant { - public: - static void register_variant(const VariantOptions& options) - { - Legion::ExecutionConstraintSet execution_constraints; - Legion::TaskLayoutConstraintSet layout_constraints; - BASE::template register_variant(execution_constraints, - layout_constraints, - LEGATE_GPU_VARIANT, - Legion::Processor::TOC_PROC, - options); - } -}; - -template -class RegisterGPUVariant { - public: - static void register_variant(const VariantOptions& options) - { - // Do nothing - } -}; - -template -/*static*/ void LegateTask::register_variants( - const std::map& all_options) -{ - // Make a copy of the map of options so that we can do find-or-create on it - auto all_options_copy = all_options; - RegisterCPUVariant, 
HasCPUVariant::value>::register_variant( - all_options_copy[LEGATE_CPU_VARIANT]); - RegisterOMPVariant, HasOMPVariant::value>::register_variant( - all_options_copy[LEGATE_OMP_VARIANT]); - RegisterGPUVariant, HasGPUVariant::value>::register_variant( - all_options_copy[LEGATE_GPU_VARIANT]); -} - -class LegateTaskRegistrar { - public: - void record_variant(Legion::TaskID tid, - const char* task_name, - const Legion::CodeDescriptor& desc, - Legion::ExecutionConstraintSet& execution_constraints, - Legion::TaskLayoutConstraintSet& layout_constraints, - LegateVariantCode var, - Legion::Processor::Kind kind, - const VariantOptions& options); - - public: - void register_all_tasks(Legion::Runtime* runtime, LibraryContext& context); - - private: - struct PendingTaskVariant : public Legion::TaskVariantRegistrar { - public: - PendingTaskVariant(void) - : Legion::TaskVariantRegistrar(), task_name(nullptr), var(LEGATE_NO_VARIANT) - { - } - PendingTaskVariant(Legion::TaskID tid, - bool global, - const char* var_name, - const char* t_name, - const Legion::CodeDescriptor& desc, - LegateVariantCode v, - size_t ret) - : Legion::TaskVariantRegistrar(tid, global, var_name), - task_name(t_name), - descriptor(desc), - var(v), - ret_size(ret) - { - } - - public: - const char* task_name; - Legion::CodeDescriptor descriptor; - LegateVariantCode var; - size_t ret_size; - }; - - private: - std::vector pending_task_variants_; + static const char* task_name(); }; } // namespace legate + +#include "core/task/task.inl" diff --git a/src/core/task/task.inl b/src/core/task/task.inl new file mode 100644 index 000000000..713cc3efd --- /dev/null +++ b/src/core/task/task.inl @@ -0,0 +1,79 @@ +/* Copyright 2023 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +#pragma once + +#include "core/task/task.h" + +namespace legate { + +namespace detail { + +std::string generate_task_name(const std::type_info&); + +void task_wrapper( + VariantImpl, const char*, const void*, size_t, const void*, size_t, Legion::Processor); + +}; // namespace detail + +template +template +/*static*/ void LegateTask::legate_task_wrapper( + const void* args, size_t arglen, const void* userdata, size_t userlen, Legion::Processor p) +{ + detail::task_wrapper(VARIANT_IMPL, task_name(), args, arglen, userdata, userlen, p); +} + +template +template +/*static*/ void LegateTask::register_variant( + Legion::ExecutionConstraintSet& execution_constraints, + Legion::TaskLayoutConstraintSet& layout_constraints, + LegateVariantCode var, + Legion::Processor::Kind kind, + const VariantOptions& options) +{ + // Construct the code descriptor for this task so that the library + // can register it later when it is ready + Legion::CodeDescriptor desc(legate_task_wrapper); + auto task_id = T::TASK_ID; + + T::Registrar::record_variant( + task_id, task_name(), desc, execution_constraints, layout_constraints, var, kind, options); +} + +template +/*static*/ void LegateTask::register_variants( + const std::map& all_options) +{ + // Make a copy of the map of options so that we can do find-or-create on it + auto all_options_copy = all_options; + detail::RegisterVariant::register_variant( + all_options_copy[LEGATE_CPU_VARIANT]); + detail::RegisterVariant::register_variant( + all_options_copy[LEGATE_OMP_VARIANT]); + 
detail::RegisterVariant::register_variant( + all_options_copy[LEGATE_GPU_VARIANT]); +} + +template +/*static*/ const char* LegateTask::task_name() +{ + static std::string result = detail::generate_task_name(typeid(T)); + return result.c_str(); +} + +} // namespace legate diff --git a/src/core/task/variant.cc b/src/core/task/variant.cc new file mode 100644 index 000000000..ea54b414e --- /dev/null +++ b/src/core/task/variant.cc @@ -0,0 +1,51 @@ +/* Copyright 2023 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ + +#include "core/task/variant.h" + +namespace legate { + +VariantOptions& VariantOptions::with_leaf(bool _leaf) +{ + leaf = _leaf; + return *this; +} + +VariantOptions& VariantOptions::with_inner(bool _inner) +{ + inner = _inner; + return *this; +} + +VariantOptions& VariantOptions::with_idempotent(bool _idempotent) +{ + idempotent = _idempotent; + return *this; +} + +VariantOptions& VariantOptions::with_concurrent(bool _concurrent) +{ + concurrent = _concurrent; + return *this; +} + +VariantOptions& VariantOptions::with_return_size(size_t _return_size) +{ + return_size = _return_size; + return *this; +} + +} // namespace legate diff --git a/src/core/task/variant.h b/src/core/task/variant.h new file mode 100644 index 000000000..1ec6007ec --- /dev/null +++ b/src/core/task/variant.h @@ -0,0 +1,139 @@ +/* Copyright 2023 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +#pragma once + +#include "legion.h" + +#include "core/utilities/typedefs.h" + +/** + * @file + * @brief Class definition fo legate::VariantOptions + */ +namespace legate { + +// Each scalar output store can take up to 12 bytes, so in the worst case there can be only up to +// 341 scalar output stores. +constexpr size_t LEGATE_MAX_SIZE_SCALAR_RETURN = 4096; + +/** + * @ingroup task + * @brief A helper class for specifying variant options + */ +struct VariantOptions { + /** + * @brief If the flag is `true`, the variant launches no subtasks. `true` by default. 
+ */ + bool leaf{true}; + bool inner{false}; + bool idempotent{false}; + /** + * @brief If the flag is `true`, the variant needs a concurrent task launch. `false` by default. + */ + bool concurrent{false}; + /** + * @brief Maximum aggregate size for scalar output values. 4096 by default. + */ + size_t return_size{LEGATE_MAX_SIZE_SCALAR_RETURN}; + + /** + * @brief Changes the value of the `leaf` flag + * + * @param `leaf` A new value for the `leaf` flag + */ + VariantOptions& with_leaf(bool leaf); + VariantOptions& with_inner(bool inner); + VariantOptions& with_idempotent(bool idempotent); + /** + * @brief Changes the value of the `concurrent` flag + * + * @param `concurrent` A new value for the `concurrent` flag + */ + VariantOptions& with_concurrent(bool concurrent); + /** + * @brief Sets a maximum aggregate size for scalar output values + * + * @param `return_size` A new maximum aggregate size for scalar output values + */ + VariantOptions& with_return_size(size_t return_size); +}; + +namespace detail { + +template +using void_t = void; + +template +struct CPUVariant : std::false_type {}; + +template +struct OMPVariant : std::false_type {}; + +template +struct GPUVariant : std::false_type {}; + +template +struct CPUVariant> : std::true_type { + static constexpr auto variant = T::cpu_variant; + static constexpr auto id = LEGATE_CPU_VARIANT; + static constexpr auto proc_kind = Processor::LOC_PROC; +}; + +template +struct OMPVariant> : std::true_type { + static constexpr auto variant = T::omp_variant; + static constexpr auto id = LEGATE_OMP_VARIANT; + static constexpr auto proc_kind = Processor::OMP_PROC; +}; + +template +struct GPUVariant> : std::true_type { + static constexpr auto variant = T::gpu_variant; + static constexpr auto id = LEGATE_GPU_VARIANT; + static constexpr auto proc_kind = Processor::TOC_PROC; +}; + +template typename SELECTOR, bool HAS_VARIANT> +struct RegisterVariantImpl { + static void register_variant(const VariantOptions& options) + { + 
Legion::ExecutionConstraintSet execution_constraints; + Legion::TaskLayoutConstraintSet layout_constraints; + T::BASE::template register_variant::variant>( + execution_constraints, layout_constraints, SELECTOR::id, SELECTOR::proc_kind, options); + } +}; + +template typename SELECTOR> +struct RegisterVariantImpl { + static void register_variant(const VariantOptions& options) + { + // Do nothing + } +}; + +template typename SELECTOR> +struct RegisterVariant { + static void register_variant(const VariantOptions& options) + { + RegisterVariantImpl::value>::register_variant(options); + } +}; + +} // namespace detail + +} // namespace legate diff --git a/src/core/utilities/debug.h b/src/core/utilities/debug.h index e6f52897f..d5e86cac4 100644 --- a/src/core/utilities/debug.h +++ b/src/core/utilities/debug.h @@ -17,6 +17,7 @@ #pragma once #include "core/data/store.h" +#include "core/utilities/typedefs.h" #ifdef LEGATE_USE_CUDA #include @@ -24,9 +25,14 @@ #include -namespace legate { +/** @defgroup util Utilities + */ -using namespace Legion; +/** + * @file + * @brief Debugging utilities + */ +namespace legate { #ifdef LEGATE_USE_CUDA @@ -43,6 +49,16 @@ inline bool is_device_only_ptr(const void* ptr) #endif // LEGATE_USE_CUDA +/** + * @ingroup util + * @brief Converts the dense array into a string + * + * @param base Array to convert + * @param extents Extents of the array + * @param strides Strides for dimensions + * + * @return A string expressing the contents of the array + */ template std::string print_dense_array(const T* base, const Point& extents, size_t strides[DIM]) { @@ -94,6 +110,15 @@ std::string print_dense_array(const T* base, const Point& extents, size_t s return ss.str(); } +/** + * @ingroup util + * @brief Converts the dense array into a string using an accessor + * + * @param accessor Accessor to an array + * @param rect Sub-rectangle within which the elements should be retrieved + * + * @return A string expressing the contents of the array + */ template 
std::string print_dense_array(ACC accessor, const Rect& rect) { @@ -103,6 +128,14 @@ std::string print_dense_array(ACC accessor, const Rect& rect) return print_dense_array(base, extents, strides); } +/** + * @ingroup util + * @brief Converts the store to a string + * + * @param store Store to convert + * + * @return A string expressing the contents of the store + */ std::string print_dense_array(const Store& store); } // namespace legate diff --git a/src/core/utilities/deserializer.cc b/src/core/utilities/deserializer.cc index f62f5b50a..325b0d2d4 100644 --- a/src/core/utilities/deserializer.cc +++ b/src/core/utilities/deserializer.cc @@ -18,26 +18,22 @@ #include "core/data/scalar.h" #include "core/data/store.h" #include "core/utilities/machine.h" +#include "core/utilities/typedefs.h" #include "legion/legion_c.h" #include "legion/legion_c_util.h" -using LegionTask = Legion::Task; - -using namespace Legion; -using namespace Legion::Mapping; - namespace legate { -TaskDeserializer::TaskDeserializer(const LegionTask* task, - const std::vector& regions) +TaskDeserializer::TaskDeserializer(const Legion::Task* task, + const std::vector& regions) : BaseDeserializer(static_cast(task->args), task->arglen), futures_{task->futures.data(), task->futures.size()}, regions_{regions.data(), regions.size()}, outputs_() { - auto runtime = Runtime::get_runtime(); - auto ctx = Runtime::get_context(); + auto runtime = Legion::Runtime::get_runtime(); + auto ctx = Legion::Runtime::get_context(); runtime->get_output_regions(ctx, outputs_); first_task_ = !task->is_index_space || (task->index_point == task->index_domain.lo()); @@ -76,7 +72,7 @@ void TaskDeserializer::_unpack(FutureWrapper& value) auto field_size = unpack(); auto point = unpack>(); - Legion::Domain domain; + Domain domain; domain.dim = static_cast(point.size()); for (int32_t idx = 0; idx < domain.dim; ++idx) { domain.rect_data[idx] = 0; @@ -122,14 +118,14 @@ void TaskDeserializer::_unpack(Legion::PhaseBarrier& barrier) auto 
future = futures_[0]; futures_ = futures_.subspan(1); auto barrier_ = future.get_result(); - barrier = CObjectWrapper::unwrap(barrier_); + barrier = Legion::CObjectWrapper::unwrap(barrier_); } namespace mapping { TaskDeserializer::TaskDeserializer(const Legion::Task* task, - MapperRuntime* runtime, - MapperContext context) + Legion::Mapping::MapperRuntime* runtime, + Legion::Mapping::MapperContext context) : BaseDeserializer(static_cast(task->args), task->arglen), task_(task), runtime_(runtime), @@ -170,7 +166,7 @@ void TaskDeserializer::_unpack(FutureWrapper& value) unpack(); auto point = unpack>(); - Legion::Domain domain; + Domain domain; domain.dim = static_cast(point.size()); for (int32_t idx = 0; idx < domain.dim; ++idx) { domain.rect_data[idx] = 0; @@ -193,8 +189,8 @@ void TaskDeserializer::_unpack(RegionField& value, bool is_output_region) CopyDeserializer::CopyDeserializer(const void* args, size_t arglen, std::vector&& all_requirements, - MapperRuntime* runtime, - MapperContext context) + Legion::Mapping::MapperRuntime* runtime, + Legion::Mapping::MapperContext context) : BaseDeserializer(static_cast(args), arglen), all_reqs_(std::forward>(all_requirements)), curr_reqs_(all_reqs_.begin()), diff --git a/src/core/utilities/dispatch.h b/src/core/utilities/dispatch.h index c2e849f24..f7c808005 100644 --- a/src/core/utilities/dispatch.h +++ b/src/core/utilities/dispatch.h @@ -18,6 +18,10 @@ #include "core/utilities/typedefs.h" +/** + * @file + * @brief Definitions for dispatch routines + */ namespace legate { template @@ -86,42 +90,42 @@ struct inner_dim_dispatch_fn { case 1: { return f.template operator()(std::forward(args)...); } -#if LEGION_MAX_DIM >= 2 +#if LEGATE_MAX_DIM >= 2 case 2: { return f.template operator()(std::forward(args)...); } #endif -#if LEGION_MAX_DIM >= 3 +#if LEGATE_MAX_DIM >= 3 case 3: { return f.template operator()(std::forward(args)...); } #endif -#if LEGION_MAX_DIM >= 4 +#if LEGATE_MAX_DIM >= 4 case 4: { return f.template 
operator()(std::forward(args)...); } #endif -#if LEGION_MAX_DIM >= 5 +#if LEGATE_MAX_DIM >= 5 case 5: { return f.template operator()(std::forward(args)...); } #endif -#if LEGION_MAX_DIM >= 6 +#if LEGATE_MAX_DIM >= 6 case 6: { return f.template operator()(std::forward(args)...); } #endif -#if LEGION_MAX_DIM >= 7 +#if LEGATE_MAX_DIM >= 7 case 7: { return f.template operator()(std::forward(args)...); } #endif -#if LEGION_MAX_DIM >= 8 +#if LEGATE_MAX_DIM >= 8 case 8: { return f.template operator()(std::forward(args)...); } #endif -#if LEGION_MAX_DIM >= 9 +#if LEGATE_MAX_DIM >= 9 case 9: { return f.template operator()(std::forward(args)...); } @@ -132,51 +136,65 @@ struct inner_dim_dispatch_fn { } }; +/** + * @ingroup util + * @brief Converts the runtime dimension and type code into compile time constants and + * invokes the functor with them + * + * The functor's `operator()` should take a dimension and a type code as template parameters. + * + * @param dim Dimension + * @param code Type code + * @param f Functor to dispatch + * @param args Extra arguments to the functor + * + * @return The functor's return value + */ template constexpr decltype(auto) double_dispatch(int dim, LegateTypeCode code, Functor f, Fnargs&&... 
args) { switch (dim) { -#if LEGION_MAX_DIM >= 1 +#if LEGATE_MAX_DIM >= 1 case 1: { return inner_type_dispatch_fn<1>{}(code, f, std::forward(args)...); } #endif -#if LEGION_MAX_DIM >= 2 +#if LEGATE_MAX_DIM >= 2 case 2: { return inner_type_dispatch_fn<2>{}(code, f, std::forward(args)...); } #endif -#if LEGION_MAX_DIM >= 3 +#if LEGATE_MAX_DIM >= 3 case 3: { return inner_type_dispatch_fn<3>{}(code, f, std::forward(args)...); } #endif -#if LEGION_MAX_DIM >= 4 +#if LEGATE_MAX_DIM >= 4 case 4: { return inner_type_dispatch_fn<4>{}(code, f, std::forward(args)...); } #endif -#if LEGION_MAX_DIM >= 5 +#if LEGATE_MAX_DIM >= 5 case 5: { return inner_type_dispatch_fn<5>{}(code, f, std::forward(args)...); } #endif -#if LEGION_MAX_DIM >= 6 +#if LEGATE_MAX_DIM >= 6 case 6: { return inner_type_dispatch_fn<6>{}(code, f, std::forward(args)...); } #endif -#if LEGION_MAX_DIM >= 7 +#if LEGATE_MAX_DIM >= 7 case 7: { return inner_type_dispatch_fn<7>{}(code, f, std::forward(args)...); } #endif -#if LEGION_MAX_DIM >= 8 +#if LEGATE_MAX_DIM >= 8 case 8: { return inner_type_dispatch_fn<8>{}(code, f, std::forward(args)...); } #endif -#if LEGION_MAX_DIM >= 9 +#if LEGATE_MAX_DIM >= 9 case 9: { return inner_type_dispatch_fn<9>{}(code, f, std::forward(args)...); } @@ -186,51 +204,65 @@ constexpr decltype(auto) double_dispatch(int dim, LegateTypeCode code, Functor f return inner_type_dispatch_fn<1>{}(code, f, std::forward(args)...); } +/** + * @ingroup util + * @brief Converts the runtime dimensions into compile time constants and invokes + * the functor with them + * + * The functor's `operator()` should take exactly two integers as template parameters. + * + * @param dim1 First dimension + * @param dim2 Second dimension + * @param f Functor to dispatch + * @param args Extra arguments to the functor + * + * @return The functor's return value + */ template constexpr decltype(auto) double_dispatch(int dim1, int dim2, Functor f, Fnargs&&... 
args) { switch (dim1) { -#if LEGION_MAX_DIM >= 1 +#if LEGATE_MAX_DIM >= 1 case 1: { return inner_dim_dispatch_fn<1>{}(dim2, f, std::forward(args)...); } #endif -#if LEGION_MAX_DIM >= 2 +#if LEGATE_MAX_DIM >= 2 case 2: { return inner_dim_dispatch_fn<2>{}(dim2, f, std::forward(args)...); } #endif -#if LEGION_MAX_DIM >= 3 +#if LEGATE_MAX_DIM >= 3 case 3: { return inner_dim_dispatch_fn<3>{}(dim2, f, std::forward(args)...); } #endif -#if LEGION_MAX_DIM >= 4 +#if LEGATE_MAX_DIM >= 4 case 4: { return inner_dim_dispatch_fn<4>{}(dim2, f, std::forward(args)...); } #endif -#if LEGION_MAX_DIM >= 5 +#if LEGATE_MAX_DIM >= 5 case 5: { return inner_dim_dispatch_fn<5>{}(dim2, f, std::forward(args)...); } #endif -#if LEGION_MAX_DIM >= 6 +#if LEGATE_MAX_DIM >= 6 case 6: { return inner_dim_dispatch_fn<6>{}(dim2, f, std::forward(args)...); } #endif -#if LEGION_MAX_DIM >= 7 +#if LEGATE_MAX_DIM >= 7 case 7: { return inner_dim_dispatch_fn<7>{}(dim2, f, std::forward(args)...); } #endif -#if LEGION_MAX_DIM >= 8 +#if LEGATE_MAX_DIM >= 8 case 8: { return inner_dim_dispatch_fn<8>{}(dim2, f, std::forward(args)...); } #endif -#if LEGION_MAX_DIM >= 9 +#if LEGATE_MAX_DIM >= 9 case 9: { return inner_dim_dispatch_fn<9>{}(dim2, f, std::forward(args)...); } @@ -240,51 +272,64 @@ constexpr decltype(auto) double_dispatch(int dim1, int dim2, Functor f, Fnargs&& return inner_dim_dispatch_fn<1>{}(dim2, f, std::forward(args)...); } +/** + * @ingroup util + * @brief Converts the runtime dimension into a compile time constant and invokes + * the functor with it + * + * The functor's `operator()` should take an integer as its sole template parameter. + * + * @param dim Dimension + * @param f Functor to dispatch + * @param args Extra arguments to the functor + * + * @return The functor's return value + */ template constexpr decltype(auto) dim_dispatch(int dim, Functor f, Fnargs&&... 
args) { switch (dim) { -#if LEGION_MAX_DIM >= 1 +#if LEGATE_MAX_DIM >= 1 case 1: { return f.template operator()<1>(std::forward(args)...); } #endif -#if LEGION_MAX_DIM >= 2 +#if LEGATE_MAX_DIM >= 2 case 2: { return f.template operator()<2>(std::forward(args)...); } #endif -#if LEGION_MAX_DIM >= 3 +#if LEGATE_MAX_DIM >= 3 case 3: { return f.template operator()<3>(std::forward(args)...); } #endif -#if LEGION_MAX_DIM >= 4 +#if LEGATE_MAX_DIM >= 4 case 4: { return f.template operator()<4>(std::forward(args)...); } #endif -#if LEGION_MAX_DIM >= 5 +#if LEGATE_MAX_DIM >= 5 case 5: { return f.template operator()<5>(std::forward(args)...); } #endif -#if LEGION_MAX_DIM >= 6 +#if LEGATE_MAX_DIM >= 6 case 6: { return f.template operator()<6>(std::forward(args)...); } #endif -#if LEGION_MAX_DIM >= 7 +#if LEGATE_MAX_DIM >= 7 case 7: { return f.template operator()<7>(std::forward(args)...); } #endif -#if LEGION_MAX_DIM >= 8 +#if LEGATE_MAX_DIM >= 8 case 8: { return f.template operator()<8>(std::forward(args)...); } #endif -#if LEGION_MAX_DIM >= 9 +#if LEGATE_MAX_DIM >= 9 case 9: { return f.template operator()<9>(std::forward(args)...); } @@ -294,6 +339,19 @@ constexpr decltype(auto) dim_dispatch(int dim, Functor f, Fnargs&&... args) return f.template operator()<1>(std::forward(args)...); } +/** + * @ingroup util + * @brief Converts the runtime type code into a compile time constant and invokes + * the functor with it + * + * The functor's `operator()` should take a type code as its sole template parameter. + * + * @param code Type code + * @param f Functor to dispatch + * @param args Extra arguments to the functor + * + * @return The functor's return value + */ template constexpr decltype(auto) type_dispatch(LegateTypeCode code, Functor f, Fnargs&&... 
args) { diff --git a/src/core/utilities/linearize.cc b/src/core/utilities/linearize.cc index 5223fd2fb..02ed39ac2 100644 --- a/src/core/utilities/linearize.cc +++ b/src/core/utilities/linearize.cc @@ -19,8 +19,6 @@ namespace legate { -using namespace Legion; - struct linearize_fn { template size_t operator()(const DomainPoint& lo_dp, const DomainPoint& hi_dp, const DomainPoint& point_dp) diff --git a/src/core/utilities/linearize.h b/src/core/utilities/linearize.h index 70a016521..a8d1720ac 100644 --- a/src/core/utilities/linearize.h +++ b/src/core/utilities/linearize.h @@ -16,16 +16,12 @@ #pragma once -#include "legion.h" +#include "core/utilities/typedefs.h" namespace legate { -size_t linearize(const Legion::DomainPoint& lo, - const Legion::DomainPoint& hi, - const Legion::DomainPoint& point); +size_t linearize(const DomainPoint& lo, const DomainPoint& hi, const DomainPoint& point); -Legion::DomainPoint delinearize(const Legion::DomainPoint& lo, - const Legion::DomainPoint& hi, - size_t idx); +DomainPoint delinearize(const DomainPoint& lo, const DomainPoint& hi, size_t idx); } // namespace legate diff --git a/src/core/utilities/machine.cc b/src/core/utilities/machine.cc index 9d7e31cdd..ca79637e9 100644 --- a/src/core/utilities/machine.cc +++ b/src/core/utilities/machine.cc @@ -19,8 +19,6 @@ #include "core/runtime/runtime.h" #include "legate_defines.h" -using namespace Legion; - namespace legate { Memory::Kind find_memory_kind_for_executing_processor(bool host_accessible) @@ -36,6 +34,7 @@ Memory::Kind find_memory_kind_for_executing_processor(bool host_accessible) case Processor::Kind::OMP_PROC: { return Core::has_socket_mem ? 
Memory::Kind::SOCKET_MEM : Memory::Kind::SYSTEM_MEM; } + default: break; } LEGATE_ABORT; return Memory::Kind::SYSTEM_MEM; diff --git a/src/core/utilities/machine.h b/src/core/utilities/machine.h index b61824542..5e0cd9d60 100644 --- a/src/core/utilities/machine.h +++ b/src/core/utilities/machine.h @@ -18,8 +18,10 @@ #include "legion.h" +#include "core/utilities/typedefs.h" + namespace legate { -Legion::Memory::Kind find_memory_kind_for_executing_processor(bool host_accessible = true); +Memory::Kind find_memory_kind_for_executing_processor(bool host_accessible = true); } // namespace legate diff --git a/src/core/utilities/span.h b/src/core/utilities/span.h index a4fd12a8c..4f34c172d 100644 --- a/src/core/utilities/span.h +++ b/src/core/utilities/span.h @@ -19,8 +19,18 @@ #include #include +/** + * @file + * @brief Class definition for legate::Span + */ + namespace legate { +/** + * @ingroup data + * @brief A simple span implementation used in Legate. Should eventually be replaced with + * std::span once we bump up the C++ standard version to C++20 + */ template struct Span { public: @@ -28,9 +38,23 @@ struct Span { Span(const Span&) = default; public: + /** + * @brief Creates a span with an existing pointer and a size. + * + * The caller must guarantee that the allocation is big enough (i.e., bigger than or + * equal to `sizeof(T) * size`) and that the allocation is alive while the span is alive. 
+ * + * @param data Pointer to the data + * @param size Number of elements + */ Span(T* data, size_t size) : data_(data), size_(size) {} public: + /** + * @brief Returns the number of elements + * + * @return The number of elements + */ size_t size() const { return size_; } public: @@ -39,10 +63,28 @@ struct Span { assert(pos < size_); return data_[pos]; } + /** + * @brief Returns the pointer to the first element + * + * @return Pointer to the first element + */ const T* begin() const { return &data_[0]; } + /** + * @brief Returns the pointer to the end of allocation + * + * @return Pointer to the end of allocation + */ const T* end() const { return &data_[size_]; } public: + /** + * @brief Slices off the first `off` elements. Passing an `off` greater than + * the size will fail with an assertion failure. + * + * @param off Number of elements to skip + * + * @return A span for range `[off, size())` + */ decltype(auto) subspan(size_t off) { assert(off <= size_); @@ -50,6 +92,11 @@ struct Span { } public: + /** + * @brief Returns a `const` pointer to the data + * + * @return Pointer to the data + */ const T* ptr() const { return data_; } private: diff --git a/src/core/utilities/type_traits.h b/src/core/utilities/type_traits.h index 84197f528..9ac4d8929 100644 --- a/src/core/utilities/type_traits.h +++ b/src/core/utilities/type_traits.h @@ -18,6 +18,11 @@ #include "core/utilities/typedefs.h" +/** + * @file + * @brief Definitions for type traits in Legate + */ + namespace legate { // This maps a type to its LegateTypeCode @@ -54,6 +59,10 @@ static constexpr LegateTypeCode legate_type_code_of> = COMPLEX64_ template <> static constexpr LegateTypeCode legate_type_code_of> = COMPLEX128_LT; #else // not clang +/** + * @ingroup util + * @brief A template constexpr that converts types to type codes + */ template constexpr LegateTypeCode legate_type_code_of = MAX_TYPE_NUMBER; @@ -148,29 +157,53 @@ struct LegateTypeOf { using type = complex; }; +/** + * @ingroup util + * @brief 
A template that converts type codes to types + */ template using legate_type_of = typename LegateTypeOf::type; +/** + * @ingroup util + * @brief A predicate that holds if the type code is of an integral type + */ template struct is_integral { static constexpr bool value = std::is_integral>::value; }; +/** + * @ingroup util + * @brief A predicate that holds if the type code is of a signed integral type + */ template struct is_signed { static constexpr bool value = std::is_signed>::value; }; +/** + * @ingroup util + * @brief A predicate that holds if the type code is of an unsigned integral type + */ template struct is_unsigned { static constexpr bool value = std::is_unsigned>::value; }; +/** + * @ingroup util + * @brief A predicate that holds if the type code is of a floating point type + */ template struct is_floating_point { static constexpr bool value = std::is_floating_point>::value; }; +/** + * @ingroup util + * @brief A predicate that holds if the type code is of a complex type + */ template struct is_complex : std::false_type {}; @@ -180,6 +213,10 @@ struct is_complex : std::true_type {}; template <> struct is_complex : std::true_type {}; +/** + * @ingroup util + * @brief A predicate that holds if the type is one of the supported complex types + */ template struct is_complex_type : std::false_type {}; diff --git a/src/core/utilities/typedefs.h b/src/core/utilities/typedefs.h index f33bcff79..b9ca9b38e 100644 --- a/src/core/utilities/typedefs.h +++ b/src/core/utilities/typedefs.h @@ -19,32 +19,316 @@ #include "legion.h" #include "core/legate_c.h" +#include "legate_defines.h" + +/** + * @file + * @brief Type aliases to Legion components + */ namespace legate { -extern Legion::Logger log_legate; +// C enum typedefs +using LegateVariantCode = legate_core_variant_t; +using LegateTypeCode = legate_core_type_code_t; +using LegateMappingTag = legate_core_mapping_tag_t; + +using Logger = Legion::Logger; + +extern Logger log_legate; + +// Re-export Legion types + 
+using TunableID = Legion::TunableID; + +// Geometry types + +/** @defgroup geometry Geometry types + * + * @{ + */ -template +/** + * @brief Coordinate type. + */ +using coord_t = Legion::coord_t; + +/** + * @brief Type for multi-dimensional points. + * + * Point objects support index expressions; they can be accessed like a statically-sized array. + * Point objects also support usual arithmetic operators and a dot opreator. + * + * For a complete definition, see + * [`Realm::Point`](https://github.com/StanfordLegion/legion/blob/9ed6f4d6b579c4f17e0298462e89548a4f0ed6e5/runtime/realm/point.h#L46-L124). + */ +template +using Point = Legion::Point; + +/** + * @brief Type for multi-dimensional rectangles. + * + * Each rectangle consists of two legate::Point objects, one for the lower + * bounds (`.lo`) and one for the upper bounds (`.hi`). + * + * For a complete definition, see + * [`Realm::Rect`](https://github.com/StanfordLegion/legion/blob/9ed6f4d6b579c4f17e0298462e89548a4f0ed6e5/runtime/realm/point.h#L126-L212). + */ +template +using Rect = Legion::Rect; + +/** + * @brief Dimension-erased type for multi-dimensional points. + * + * For a complete definition, see + * [`Legion::DomainPoint`](https://github.com/StanfordLegion/legion/blob/9ed6f4d6b579c4f17e0298462e89548a4f0ed6e5/runtime/legion/legion_domain.h#L127-L253). + */ +using DomainPoint = Legion::DomainPoint; + +/** + * @brief Dimension-erased type for multi-dimensional rectangles. + * + * For a complete definition, see + * [`Legion::Domain`](https://github.com/StanfordLegion/legion/blob/9ed6f4d6b579c4f17e0298462e89548a4f0ed6e5/runtime/legion/legion_domain.h#L255-L543). + */ +using Domain = Legion::Domain; + +/** @} */ // end of geometry + +// Accessor types + +/** @defgroup accessor Accessor types + * + * Accessors provide an interface to access values in stores. Access modes are encoded + * in the accessor types so that the compiler can catch invalid accesses. 
Accessors also + * provide bounds checks (which can be turned on with a compile flag). + * + * All accessors have a `ptr` method that returns a raw pointer to the underlying allocation. + * The caller can optionally pass an array to query strides of dimensions, necessary for correct + * accesse. Unlike the accesses mediated by accessors, raw pointer accesses are not protected by + * Legate, and thus the developer should make sure of safety of the accesses. + * + * The most common mistake with raw pointers from reduction accessors are that the code overwrites + * values to the elements, instead of reducing them. The key contract with reduction is that + * the values must be reduced to the elements in the store. So, any client code that uses a raw + * pointer to a reduction store should make sure that it makes updates to the effect of reducing + * its contributions to the original elements. Not abiding by this contract can lead to + * non-deterministic conrrectness issues. + * + * @{ + */ + +/** + * @brief Read-only accessor + * + * See + * [legion.h](https://github.com/StanfordLegion/legion/blob/9ed6f4d6b579c4f17e0298462e89548a4f0ed6e5/runtime/legion.h#L2555-L2562) + * for a complete list of supported operators. + */ +template using AccessorRO = Legion::FieldAccessor>; -template + +/** + * @brief Write-only accessor + * + * See + * [legion.h](https://github.com/StanfordLegion/legion/blob/9ed6f4d6b579c4f17e0298462e89548a4f0ed6e5/runtime/legion.h#L2575-L2581) + * for a complete list of supported operators. + */ +template using AccessorWO = Legion::FieldAccessor>; -template + +/** + * @brief Read-write accessor + * + * See + * [legion.h](https://github.com/StanfordLegion/legion/blob/9ed6f4d6b579c4f17e0298462e89548a4f0ed6e5/runtime/legion.h#L2564-L2573) + * for a complete list of supported operators. 
+ */ +template using AccessorRW = Legion::FieldAccessor>; -template + +/** + * @brief Reduction accessor + * + * Unlike the other accessors, an index expression on a reduction accessor allows the client to + * perform only two operations, `<<=` and `reduce`, both of which reduce a value to the chosen + * element. + * + * See + * [legion.h](https://github.com/StanfordLegion/legion/blob/9ed6f4d6b579c4f17e0298462e89548a4f0ed6e5/runtime/legion.h#L2837-L2848) + * for details about the reduction accessor. + */ +template using AccessorRD = Legion:: ReductionAccessor>; -template -using GenericAccessorRO = Legion::FieldAccessor; -template -using GenericAccessorWO = Legion::FieldAccessor; -template -using GenericAccessorRW = Legion::FieldAccessor; -using TunableID = Legion::TunableID; +/** @} */ // end of accessor -// C enum typedefs -using LegateVariantCode = legate_core_variant_t; -using LegateTypeCode = legate_core_type_code_t; -using LegateMappingTag = legate_core_mapping_tag_t; +// Iterators + +/** @defgroup iterator Iterator types + * + * @{ + */ + +/** + * @brief Iterator that iterates all points in a given `legate::Rect`. + * + * See + * [Realm::PointInRectIterator](https://github.com/StanfordLegion/legion/blob/9ed6f4d6b579c4f17e0298462e89548a4f0ed6e5/runtime/realm/point.h#L239-L255) + * for a complete definition. + */ +template +using PointInRectIterator = Legion::PointInRectIterator; + +/** + * @brief Iterator that iterates all points in a given `legate::Domain`. + * + * See + * [Legion::PointInDomainIterator](https://github.com/StanfordLegion/legion/blob/9ed6f4d6b579c4f17e0298462e89548a4f0ed6e5/runtime/legion/legion_domain.h#L599-L622) + * for a complete definition. 
+ */ +template +using PointInDomainIterator = Legion::PointInDomainIterator; + +/** @} */ // end of iterator + +// Machine + +/** @defgroup machine Machine objects + * + * @{ + */ + +/** + * @brief Logical processor handle + * + * Legate libraries rarely use processor handles directly and there are no Legate APIs that take + * a processor handle. However, the libraries may want to query the processor that runs the + * current task to perform some processor- or processor kind-specific operations. In that case, + * `legate::Processor::get_executing_processor` can be used. Other useful memobers of + * `legate::Processor` are the `kind` method, which returns the processor kind, and + * `legate::Processor::Kind`, an enum for all processor types. + * + * See + * [`Realm::Processor`](https://github.com/StanfordLegion/legion/blob/9ed6f4d6b579c4f17e0298462e89548a4f0ed6e5/runtime/realm/processor.h#L35-L141) + * for a complete definition. The list of processor types can be found + * [here](https://github.com/StanfordLegion/legion/blob/9ed6f4d6b579c4f17e0298462e89548a4f0ed6e5/runtime/realm/realm_c.h#L45-L54). + * + */ +using Processor = Legion::Processor; + +/** + * @brief Logical memory handle + * + * In Legate, libraries will never have to use memory handles directly. However, some Legate + * APIs (e.g., legate::create_buffer) take a memory kind as an argument; `legate::Memory::Kind` + * is an enum for all memory types. + * + * See + * [`Realm::Memory`](https://github.com/StanfordLegion/legion/blob/9ed6f4d6b579c4f17e0298462e89548a4f0ed6e5/runtime/realm/memory.h#L30-L65) + * for a complete definition. The list of memory types can be found + * [here](https://github.com/StanfordLegion/legion/blob/9ed6f4d6b579c4f17e0298462e89548a4f0ed6e5/runtime/realm/realm_c.h#L63-L78). 
+ */ +using Memory = Legion::Memory; + +/** @} */ // end of machine + +// Reduction operators + +/** @defgroup reduction Built-in reduction operators + * + * All built-in operators are defined for signed and unsigned integer types. Floating point + * types (`__half`, `float`, and `double`) are supported by all but bitwise operators. Arithmetic + * operators also cover complex types `complex<__half>` and `complex`. + * + * For details about reduction operators, See LibraryContext::register_reduction_operator. + * + * @{ + */ + +/** + * @brief Reduction with addition + * + * See + * [`Legion::SumReduction`](http://github.com/StanfordLegion/legion/blob/9ed6f4d6b579c4f17e0298462e89548a4f0ed6e5/runtime/legion/legion_redop.h#L46-L285). + */ +template +using SumReduction = Legion::SumReduction; + +/** + * @brief Reduction with subtraction + * + * See + * [`Legion::DiffReduction`](https://github.com/StanfordLegion/legion/blob/9ed6f4d6b579c4f17e0298462e89548a4f0ed6e5/runtime/legion/legion_redop.h#L287-L492). + */ +template +using DiffReduction = Legion::DiffReduction; + +/** + * @brief Reduction with multiplication + * + * See + * [`Legion::ProdReduction`](http://github.com/StanfordLegion/legion/blob/9ed6f4d6b579c4f17e0298462e89548a4f0ed6e5/runtime/legion/legion_redop.h#L494-L714). + */ +template +using ProdReduction = Legion::ProdReduction; + +/** + * @brief Reduction with division + * + * See + * [`Legion::DivReduction`](http://github.com/StanfordLegion/legion/blob/9ed6f4d6b579c4f17e0298462e89548a4f0ed6e5/runtime/legion/legion_redop.h#L716-L921). + */ +template +using DivReduction = Legion::DivReduction; + +/** + * @brief Reduction with the binary max operator + * + * See + * [`Legion::MaxReduction`](http://github.com/StanfordLegion/legion/blob/9ed6f4d6b579c4f17e0298462e89548a4f0ed6e5/runtime/legion/legion_redop.h#L923-L1109). 
+ */ +template +using MaxReduction = Legion::MaxReduction; + +/** + * @brief Reduction with the binary min operator + * + * See + * [`Legion::MinReduction`](http://github.com/StanfordLegion/legion/blob/9ed6f4d6b579c4f17e0298462e89548a4f0ed6e5/runtime/legion/legion_redop.h#L1111-L1297). + */ +template +using MinReduction = Legion::MinReduction; + +/** + * @brief Reduction with bitwise or + * + * See + * [`Legion::OrReduction`](http://github.com/StanfordLegion/legion/blob/9ed6f4d6b579c4f17e0298462e89548a4f0ed6e5/runtime/legion/legion_redop.h#L1299-L1423). + */ +template +using OrReduction = Legion::OrReduction; + +/** + * @brief Reduction with bitwise and + * + * See + * [`Legion::AndReduction`](http://github.com/StanfordLegion/legion/blob/9ed6f4d6b579c4f17e0298462e89548a4f0ed6e5/runtime/legion/legion_redop.h#L1425-L1549). + */ +template +using AndReduction = Legion::AndReduction; + +/** + * @brief Reduction with bitwise xor + * + * See + * [`Legion::XorReduction`](http://github.com/StanfordLegion/legion/blob/9ed6f4d6b579c4f17e0298462e89548a4f0ed6e5/runtime/legion/legion_redop.h#L1551-L1690). + */ +template +using XorReduction = Legion::XorReduction; + +/** @} */ // end of reduction } // namespace legate diff --git a/src/legate.h b/src/legate.h index 14171e1c1..76e4b6d36 100644 --- a/src/legate.h +++ b/src/legate.h @@ -16,6 +16,12 @@ #pragma once +/** + * @mainpage Legate C++ API reference + * + * This is an API reference for Legate's C++ components. 
+ */ + #include "legion.h" // legion.h has to go before these #include "core/data/allocator.h" @@ -23,6 +29,7 @@ #include "core/data/store.h" #include "core/legate_c.h" #include "core/runtime/runtime.h" +#include "core/task/registrar.h" #include "core/task/task.h" #include "core/utilities/deserializer.h" #include "core/utilities/dispatch.h" diff --git a/src/legate_defines.h b/src/legate_defines.h index de272dde0..7c08bc3b1 100644 --- a/src/legate_defines.h +++ b/src/legate_defines.h @@ -52,5 +52,11 @@ #endif #endif +#ifdef LEGION_BOUNDS_CHECKS +#define LEGATE_BOUNDS_CHECKS +#endif + +#define LEGATE_MAX_DIM LEGION_MAX_DIM + // TODO: 2022-10-04: Work around a Legion bug, by not instantiating futures on framebuffer. #define LEGATE_NO_FUTURES_ON_FB diff --git a/tests/unit/legate/driver/test_command.py b/tests/unit/legate/driver/test_command.py index 436d53bbf..2ea0b7c53 100644 --- a/tests/unit/legate/driver/test_command.py +++ b/tests/unit/legate/driver/test_command.py @@ -20,6 +20,7 @@ import pytest import legate.driver.command as m +from legate import install_info from legate.driver.launcher import RANK_ENV_VARS from legate.util.colors import scrub from legate.util.types import LauncherType @@ -29,7 +30,10 @@ def test___all__() -> None: - assert m.__all__ == ("CMD_PARTS",) + assert m.__all__ == ( + "CMD_PARTS_LEGION", + "CMD_PARTS_CANONICAL", + ) def test_LEGATE_GLOBAL_RANK_SUBSTITUTION() -> None: @@ -37,7 +41,7 @@ def test_LEGATE_GLOBAL_RANK_SUBSTITUTION() -> None: def test_CMD_PARTS() -> None: - assert m.CMD_PARTS == ( + assert m.CMD_PARTS_LEGION == ( m.cmd_bind, m.cmd_rlwrap, m.cmd_gdb, @@ -46,14 +50,16 @@ def test_CMD_PARTS() -> None: m.cmd_nsys, m.cmd_memcheck, m.cmd_legion, - m.cmd_nocr, + m.cmd_python_processor, m.cmd_module, - m.cmd_processor, + m.cmd_nocr, + m.cmd_local_field, m.cmd_kthreads, m.cmd_cpus, m.cmd_gpus, m.cmd_openmp, m.cmd_utility, + m.cmd_bgwork, m.cmd_mem, m.cmd_numamem, m.cmd_fbmem, @@ -63,6 +69,7 @@ def test_CMD_PARTS() -> None: 
m.cmd_log_file, m.cmd_eager_alloc, m.cmd_ucx, + m.cmd_user_script, m.cmd_user_opts, ) @@ -188,7 +195,6 @@ def test_ranks_bad( class Test_cmd_gdb: - MULTI_RANK_WARN = ( "WARNING: Legate does not support gdb for multi-rank runs" ) @@ -226,7 +232,6 @@ def test_with_option_multi_rank( class Test_cmd_cuda_gdb: - MULTI_RANK_WARN = ( "WARNING: Legate does not support cuda-gdb for multi-rank runs" ) @@ -629,17 +634,25 @@ def test_default(self, genobjs: GenObjs) -> None: assert result == (str(system.legion_paths.legion_python),) -class Test_cmd_processor: +class Test_cmd_python_processor: def test_default(self, genobjs: GenObjs) -> None: config, system, launcher = genobjs([]) - result = m.cmd_processor(config, system, launcher) + result = m.cmd_python_processor(config, system, launcher) - assert result == ("-ll:py", "1", "-lg:local", "0") + assert result == ("-ll:py", "1") -class Test_cmd_kthreads: +class Test_cmd_local_field: + def test_default(self, genobjs: GenObjs) -> None: + config, system, launcher = genobjs([]) + + result = m.cmd_local_field(config, system, launcher) + + assert result == ("-lg:local", "0") + +class Test_cmd_kthreads: DBG_OPTS = ("--gdb", "--cuda-gdb", "--freeze-on-error") def test_default(self, genobjs: GenObjs) -> None: @@ -815,7 +828,7 @@ def test_default_multi_rank( result = m.cmd_utility(config, system, launcher) - assert result == ("-ll:util", "2", "-ll:bgwork", "2") + assert result == ("-ll:util", "2") @pytest.mark.parametrize("rank_var", RANK_ENV_VARS) @pytest.mark.parametrize("rank", ("0", "1", "2")) @@ -854,7 +867,7 @@ def test_utility_n_multi_rank_no_launcher( result = m.cmd_utility(config, system, launcher) - assert result == ("-ll:util", value, "-ll:bgwork", value) + assert result == ("-ll:util", value) @pytest.mark.parametrize("launch", ("mpirun", "jsrun", "srun")) @pytest.mark.parametrize("value", ("2", "3", "10")) @@ -867,7 +880,201 @@ def test_utility_n_multi_rank_with_launcher( result = m.cmd_utility(config, system, launcher) - 
assert result == ("-ll:util", value, "-ll:bgwork", value) + assert result == ("-ll:util", value) + + +class Test_cmd_bgwork: + def test_default_single_rank(self, genobjs: GenObjs) -> None: + config, system, launcher = genobjs([]) + + result = m.cmd_bgwork(config, system, launcher) + + assert result == () + + def test_utility_1_single_rank(self, genobjs: GenObjs) -> None: + config, system, launcher = genobjs(["--utility", "1"]) + + result = m.cmd_bgwork(config, system, launcher) + + assert result == () + + def test_utility_1_single_rank_and_ucx(self, genobjs: GenObjs) -> None: + config, system, launcher = genobjs(["--utility", "1"]) + + networks_orig = list(install_info.networks) + install_info.networks.append("ucx") + result = m.cmd_bgwork(config, system, launcher) + install_info.networks[:] = networks_orig[:] + + assert result == () + + @pytest.mark.parametrize("value", ("2", "3", "10")) + def test_utiltity_n_single_rank( + self, genobjs: GenObjs, value: str + ) -> None: + config, system, launcher = genobjs(["--utility", value]) + + result = m.cmd_bgwork(config, system, launcher) + + assert result == () + + @pytest.mark.parametrize("value", ("2", "3", "10")) + def test_utiltity_n_single_rank_and_ucx( + self, genobjs: GenObjs, value: str + ) -> None: + config, system, launcher = genobjs(["--utility", value]) + + networks_orig = list(install_info.networks) + install_info.networks.append("ucx") + result = m.cmd_bgwork(config, system, launcher) + install_info.networks[:] = networks_orig[:] + + assert result == () + + @pytest.mark.parametrize("rank_var", RANK_ENV_VARS) + @pytest.mark.parametrize("rank", ("0", "1", "2")) + def test_default_multi_rank( + self, genobjs: GenObjs, rank: str, rank_var: dict[str, str] + ) -> None: + config, system, launcher = genobjs( + [], multi_rank=(2, 2), rank_env={rank_var: rank} + ) + + result = m.cmd_bgwork(config, system, launcher) + + assert result == ("-ll:bgwork", "2") + + @pytest.mark.parametrize("rank_var", RANK_ENV_VARS) + 
@pytest.mark.parametrize("rank", ("0", "1", "2")) + def test_default_multi_rank_and_ucx( + self, genobjs: GenObjs, rank: str, rank_var: dict[str, str] + ) -> None: + config, system, launcher = genobjs( + [], multi_rank=(2, 2), rank_env={rank_var: rank} + ) + + networks_orig = list(install_info.networks) + install_info.networks.append("ucx") + result = m.cmd_bgwork(config, system, launcher) + install_info.networks[:] = networks_orig[:] + + assert result == ("-ll:bgwork", "2", "-ll:bgworkpin", "1") + + @pytest.mark.parametrize("rank_var", RANK_ENV_VARS) + @pytest.mark.parametrize("rank", ("0", "1", "2")) + def test_utility_1_multi_rank_no_launcher( + self, genobjs: GenObjs, rank: str, rank_var: dict[str, str] + ) -> None: + config, system, launcher = genobjs( + ["--utility", "1"], multi_rank=(2, 2), rank_env={rank_var: rank} + ) + + result = m.cmd_bgwork(config, system, launcher) + + assert result == ("-ll:bgwork", "2") + + @pytest.mark.parametrize("rank_var", RANK_ENV_VARS) + @pytest.mark.parametrize("rank", ("0", "1", "2")) + def test_utility_1_multi_rank_no_launcher_and_ucx( + self, genobjs: GenObjs, rank: str, rank_var: dict[str, str] + ) -> None: + config, system, launcher = genobjs( + ["--utility", "1"], multi_rank=(2, 2), rank_env={rank_var: rank} + ) + + networks_orig = list(install_info.networks) + install_info.networks.append("ucx") + result = m.cmd_bgwork(config, system, launcher) + install_info.networks[:] = networks_orig[:] + + assert result == ("-ll:bgwork", "2", "-ll:bgworkpin", "1") + + @pytest.mark.parametrize("launch", ("mpirun", "jsrun", "srun")) + def test_utility_1_multi_rank_with_launcher( + self, genobjs: GenObjs, launch: str + ) -> None: + config, system, launcher = genobjs( + ["--utility", "1", "--launcher", launch], multi_rank=(2, 2) + ) + + result = m.cmd_bgwork(config, system, launcher) + + assert result == ("-ll:bgwork", "2") + + @pytest.mark.parametrize("launch", ("mpirun", "jsrun", "srun")) + def 
test_utility_1_multi_rank_with_launcher_and_ucx( + self, genobjs: GenObjs, launch: str + ) -> None: + config, system, launcher = genobjs( + ["--utility", "1", "--launcher", launch], multi_rank=(2, 2) + ) + + networks_orig = list(install_info.networks) + install_info.networks.append("ucx") + result = m.cmd_bgwork(config, system, launcher) + install_info.networks[:] = networks_orig[:] + + assert result == ("-ll:bgwork", "2", "-ll:bgworkpin", "1") + + @pytest.mark.parametrize("rank_var", RANK_ENV_VARS) + @pytest.mark.parametrize("rank", ("0", "1", "2")) + @pytest.mark.parametrize("value", ("2", "3", "10")) + def test_utility_n_multi_rank_no_launcher( + self, genobjs: GenObjs, value: str, rank: str, rank_var: dict[str, str] + ) -> None: + config, system, launcher = genobjs( + ["--utility", value], multi_rank=(2, 2), rank_env={rank_var: rank} + ) + + result = m.cmd_bgwork(config, system, launcher) + + assert result == ("-ll:bgwork", value) + + @pytest.mark.parametrize("rank_var", RANK_ENV_VARS) + @pytest.mark.parametrize("rank", ("0", "1", "2")) + @pytest.mark.parametrize("value", ("2", "3", "10")) + def test_utility_n_multi_rank_no_launcher_and_ucx( + self, genobjs: GenObjs, value: str, rank: str, rank_var: dict[str, str] + ) -> None: + config, system, launcher = genobjs( + ["--utility", value], multi_rank=(2, 2), rank_env={rank_var: rank} + ) + + networks_orig = list(install_info.networks) + install_info.networks.append("ucx") + result = m.cmd_bgwork(config, system, launcher) + install_info.networks[:] = networks_orig[:] + + assert result == ("-ll:bgwork", value, "-ll:bgworkpin", "1") + + @pytest.mark.parametrize("launch", ("mpirun", "jsrun", "srun")) + @pytest.mark.parametrize("value", ("2", "3", "10")) + def test_utility_n_multi_rank_with_launcher( + self, genobjs: GenObjs, value: str, launch: str + ) -> None: + config, system, launcher = genobjs( + ["--utility", value, "--launcher", launch], multi_rank=(2, 2) + ) + + result = m.cmd_bgwork(config, system, launcher) 
+ + assert result == ("-ll:bgwork", value) + + @pytest.mark.parametrize("launch", ("mpirun", "jsrun", "srun")) + @pytest.mark.parametrize("value", ("2", "3", "10")) + def test_utility_n_multi_rank_with_launcher_and_ucx( + self, genobjs: GenObjs, value: str, launch: str + ) -> None: + config, system, launcher = genobjs( + ["--utility", value, "--launcher", launch], multi_rank=(2, 2) + ) + + networks_orig = list(install_info.networks) + install_info.networks.append("ucx") + result = m.cmd_bgwork(config, system, launcher) + install_info.networks[:] = networks_orig[:] + + assert result == ("-ll:bgwork", value, "-ll:bgworkpin", "1") class Test_cmd_sysmem: @@ -1173,7 +1380,7 @@ def test_flag_without_dir(self, genobjs: GenObjs) -> None: result = m.cmd_log_file(config, system, launcher) logfile = str(config.logging.logdir / "legate_%.log") - assert result == ("-logfile", logfile) + assert result == ("-logfile", logfile, "-errlevel", "4") def test_flag_with_dir(self, genobjs: GenObjs) -> None: config, system, launcher = genobjs( @@ -1183,7 +1390,7 @@ def test_flag_with_dir(self, genobjs: GenObjs) -> None: result = m.cmd_log_file(config, system, launcher) logfile = str(Path("foo") / "legate_%.log") - assert result == ("-logfile", logfile) + assert result == ("-logfile", logfile, "-errlevel", "4") class Test_cmd_eager_alloc: @@ -1197,7 +1404,6 @@ def test_basic(self, genobjs: GenObjs, value: str) -> None: class Test_cmd_user_opts: - USER_OPTS: tuple[list[str], ...] 
= ( [], ["foo"], @@ -1210,7 +1416,9 @@ class Test_cmd_user_opts: def test_basic(self, genobjs: GenObjs, opts: list[str]) -> None: config, system, launcher = genobjs(opts, fake_module=None) - result = m.cmd_user_opts(config, system, launcher) + user_opts = m.cmd_user_opts(config, system, launcher) + user_script = m.cmd_user_script(config, system, launcher) + result = user_script + user_opts assert result == tuple(opts) @@ -1219,7 +1427,9 @@ def test_with_legate_opts(self, genobjs: GenObjs, opts: list[str]) -> None: args = ["--verbose", "--rlwrap", "--gpus", "2"] + opts config, system, launcher = genobjs(args, fake_module=None) - result = m.cmd_user_opts(config, system, launcher) + user_opts = m.cmd_user_opts(config, system, launcher) + user_script = m.cmd_user_script(config, system, launcher) + result = user_script + user_opts assert result == tuple(opts) diff --git a/tests/unit/legate/driver/test_config.py b/tests/unit/legate/driver/test_config.py index 91b6f5056..ec0f7d48d 100644 --- a/tests/unit/legate/driver/test_config.py +++ b/tests/unit/legate/driver/test_config.py @@ -272,7 +272,6 @@ def test_mixin(self) -> None: class TestConfig: def test_default_init(self) -> None: - # Note this test does not clear the environment. Default values from # the defaults module can depend on the environment, but what matters # is that the generated config matches those values, whatever they are. 
@@ -349,7 +348,6 @@ def test_color_arg(self) -> None: assert colors.ENABLED is True def test_arg_conversions(self, mocker: MockerFixture) -> None: - # This is kind of a dumb short-cut test, but if we believe that # object_to_dataclass works as advertised, then this test ensures that # it is being used for all the sub-configs that it should be used for @@ -447,7 +445,8 @@ def test_log_to_file_fixup( def test_user_opts(self, args: tuple[str, ...]) -> None: c = m.Config(["legate"] + list(args) + ["foo.py", "-a", "1"]) - assert c.user_opts == ("foo.py", "-a", "1") + assert c.user_opts == ("-a", "1") + assert c.user_script == "foo.py" def test_console_true(self) -> None: c = m.Config(["legate"]) @@ -458,5 +457,6 @@ def test_console_true(self) -> None: def test_console_false(self) -> None: c = m.Config(["legate", "--rlwrap", "--gpus", "2", "foo.py", "-a"]) - assert c.user_opts == ("foo.py", "-a") + assert c.user_opts == ("-a",) + assert c.user_script == "foo.py" assert not c.console diff --git a/tests/unit/legate/driver/test_driver.py b/tests/unit/legate/driver/test_driver.py index 652f73627..3bb417ffb 100644 --- a/tests/unit/legate/driver/test_driver.py +++ b/tests/unit/legate/driver/test_driver.py @@ -21,7 +21,7 @@ from pytest_mock import MockerFixture import legate.driver.driver as m -from legate.driver.command import CMD_PARTS +from legate.driver.command import CMD_PARTS_LEGION from legate.driver.config import Config from legate.driver.launcher import RANK_ENV_VARS, Launcher from legate.util.colors import scrub @@ -48,7 +48,7 @@ class TestDriver: def test_init(self, genconfig: GenConfig, launch: LauncherType) -> None: config = genconfig(["--launcher", launch]) - driver = m.Driver(config, SYSTEM) + driver = m.LegateDriver(config, SYSTEM) assert driver.config is config assert driver.system is SYSTEM @@ -58,9 +58,11 @@ def test_init(self, genconfig: GenConfig, launch: LauncherType) -> None: def test_cmd(self, genconfig: GenConfig, launch: LauncherType) -> None: config = 
genconfig(["--launcher", launch]) - driver = m.Driver(config, SYSTEM) + driver = m.LegateDriver(config, SYSTEM) - parts = (part(config, SYSTEM, driver.launcher) for part in CMD_PARTS) + parts = ( + part(config, SYSTEM, driver.launcher) for part in CMD_PARTS_LEGION + ) expected_cmd = driver.launcher.cmd + sum(parts, ()) assert driver.cmd == expected_cmd @@ -69,7 +71,7 @@ def test_cmd(self, genconfig: GenConfig, launch: LauncherType) -> None: def test_env(self, genconfig: GenConfig, launch: LauncherType) -> None: config = genconfig(["--launcher", launch]) - driver = m.Driver(config, SYSTEM) + driver = m.LegateDriver(config, SYSTEM) assert driver.env == driver.launcher.env @@ -79,7 +81,7 @@ def test_custom_env_vars( ) -> None: config = genconfig(["--launcher", launch]) - driver = m.Driver(config, SYSTEM) + driver = m.LegateDriver(config, SYSTEM) assert driver.custom_env_vars == driver.launcher.custom_env_vars @@ -88,7 +90,7 @@ def test_dry_run( self, genconfig: GenConfig, mocker: MockerFixture, launch: LauncherType ) -> None: config = genconfig(["--launcher", launch, "--dry-run"]) - driver = m.Driver(config, SYSTEM) + driver = m.LegateDriver(config, SYSTEM) mocker.patch.object(m, "process_logs") mock_run = mocker.patch.object(m, "run") @@ -102,7 +104,7 @@ def test_run( self, genconfig: GenConfig, mocker: MockerFixture, launch: LauncherType ) -> None: config = genconfig(["--launcher", launch]) - driver = m.Driver(config, SYSTEM) + driver = m.LegateDriver(config, SYSTEM) mocker.patch.object(m, "process_logs") mock_run = mocker.patch.object(m, "run") @@ -120,7 +122,7 @@ def test_verbose( ) -> None: # set --dry-run to avoid needing to mock anything config = genconfig(["--launcher", launch, "--verbose", "--dry-run"]) - driver = m.Driver(config, SYSTEM) + driver = m.LegateDriver(config, SYSTEM) driver.run() @@ -149,7 +151,7 @@ def test_verbose_nonero_rank_id( ["--launcher", "none", "--verbose", "--dry-run"], multi_rank=(2, 2) ) system = System() - driver = m.Driver(config, 
system) + driver = m.LegateDriver(config, system) driver.run() @@ -176,7 +178,7 @@ def test_darwin_gdb_warning( # set --dry-run to avoid needing to mock anything config = genconfig(["--launcher", launch, "--gdb", "--dry-run"]) - driver = m.Driver(config, system) + driver = m.LegateDriver(config, system) driver.run() @@ -205,7 +207,7 @@ def test_system_only(self, capsys: Capsys) -> None: def test_system_and_driver(self, capsys: Capsys) -> None: config = Config(["legate", "--no-replicate"]) system = System() - driver = m.Driver(config, system) + driver = m.LegateDriver(config, system) m.print_verbose(system, driver) diff --git a/tests/unit/legate/driver/test_launcher.py b/tests/unit/legate/driver/test_launcher.py index f9a1b9d1a..d2a183d27 100644 --- a/tests/unit/legate/driver/test_launcher.py +++ b/tests/unit/legate/driver/test_launcher.py @@ -421,7 +421,6 @@ def test_multi_rank_launcher_extra_ignored( class TestMPILauncher: - XARGS1 = ( () + ("-x", "DEFAULTS_PATH") @@ -536,7 +535,6 @@ def test_multi_rank_launcher_extra( genconfig: GenConfig, rank_var: str, ) -> None: - for name in m.RANK_ENV_VARS: monkeypatch.delenv(name, raising=False) monkeypatch.setenv(name, "123") @@ -633,7 +631,6 @@ def test_multi_rank_launcher_extra( genconfig: GenConfig, rank_var: str, ) -> None: - for name in m.RANK_ENV_VARS: monkeypatch.delenv(name, raising=False) monkeypatch.setenv(name, "123") diff --git a/tests/unit/legate/driver/test_main.py b/tests/unit/legate/driver/test_main.py index c0dfd07d5..77d29953c 100644 --- a/tests/unit/legate/driver/test_main.py +++ b/tests/unit/legate/driver/test_main.py @@ -38,8 +38,8 @@ def test_main(mocker: MockerFixture) -> None: config_spy = mocker.spy(legate.driver.config.Config, "__init__") system_spy = mocker.spy(legate.util.system.System, "__init__") - driver_spy = mocker.spy(legate.driver.driver.Driver, "__init__") - mocker.patch("legate.driver.driver.Driver.run", return_value=123) + driver_spy = mocker.spy(legate.driver.driver.LegateDriver, 
"__init__") + mocker.patch("legate.driver.driver.LegateDriver.run", return_value=123) mocker.patch.object(sys, "argv", ["/some/path/foo", "bar"]) result = m.main() diff --git a/tests/unit/legate/jupyter/test_config.py b/tests/unit/legate/jupyter/test_config.py index d1c425237..d0e44aa33 100644 --- a/tests/unit/legate/jupyter/test_config.py +++ b/tests/unit/legate/jupyter/test_config.py @@ -41,7 +41,6 @@ def test_mixin(self) -> None: class TestConfig: def test_default_init(self) -> None: - # Note this test does not clear the environment. Default values from # the defaults module can depend on the environment, but what matters # is that the generated config matches those values, whatever they are. @@ -120,7 +119,6 @@ def test_color_arg(self) -> None: assert colors.ENABLED is True def test_arg_conversions(self, mocker: MockerFixture) -> None: - # This is kind of a dumb short-cut test, but if we believe that # object_to_dataclass works as advertised, then this test ensures that # it is being used for all the sub-configs that it should be used for diff --git a/tests/unit/legate/jupyter/test_kernel.py b/tests/unit/legate/jupyter/test_kernel.py index d176ba23d..105451b93 100644 --- a/tests/unit/legate/jupyter/test_kernel.py +++ b/tests/unit/legate/jupyter/test_kernel.py @@ -20,7 +20,7 @@ from pytest_mock import MockerFixture import legate.jupyter.kernel as m -from legate.driver import Driver +from legate.driver import LegateDriver from legate.jupyter.config import Config from legate.util.system import System @@ -41,7 +41,7 @@ def test_LEGATE_JUPYTER_METADATA_KEY() -> None: class Test_generate_kernel_spec: def test_defatul(self) -> None: config = Config([]) - driver = Driver(config, system) + driver = LegateDriver(config, system) spec = m.generate_kernel_spec(driver, config) @@ -77,7 +77,7 @@ def test_install(self, mocker: MockerFixture, capsys: Capsys) -> None: config = Config( ["legate-jupyter", "--name", "____fake_test_kernel_123abc_____"] ) - driver = Driver(config, 
system) + driver = LegateDriver(config, system) spec = m.generate_kernel_spec(driver, config) @@ -112,7 +112,7 @@ def test_install_verbose( "____fake_test_kernel_123abc_____", ] ) - driver = Driver(config, system) + driver = LegateDriver(config, system) spec = m.generate_kernel_spec(driver, config) @@ -148,7 +148,7 @@ def test_install_verbose2( "____fake_test_kernel_123abc_____", ] ) - driver = Driver(config, system) + driver = LegateDriver(config, system) spec = m.generate_kernel_spec(driver, config) diff --git a/tests/unit/legate/jupyter/test_main.py b/tests/unit/legate/jupyter/test_main.py index a92b071c7..c25c2c5b3 100644 --- a/tests/unit/legate/jupyter/test_main.py +++ b/tests/unit/legate/jupyter/test_main.py @@ -38,7 +38,7 @@ def test_main(mocker: MockerFixture) -> None: config_spy = mocker.spy(legate.jupyter.config.Config, "__init__") system_spy = mocker.spy(legate.util.system.System, "__init__") - driver_spy = mocker.spy(legate.driver.driver.Driver, "__init__") + driver_spy = mocker.spy(legate.driver.driver.LegateDriver, "__init__") generate_spy = mocker.spy(legate.jupyter.kernel, "generate_kernel_spec") install_mock = mocker.patch("legate.jupyter.kernel.install_kernel_spec") mocker.patch.object( @@ -64,7 +64,7 @@ def test_main(mocker: MockerFixture) -> None: assert generate_spy.call_count == 1 assert len(generate_spy.call_args[0]) == 2 assert isinstance( - generate_spy.call_args[0][0], legate.driver.driver.Driver + generate_spy.call_args[0][0], legate.driver.driver.LegateDriver ) assert isinstance( generate_spy.call_args[0][1], legate.jupyter.config.Config diff --git a/tests/unit/legate/test_cycle_check.py b/tests/unit/legate/test_cycle_check.py index 5839bd917..5f5961b49 100644 --- a/tests/unit/legate/test_cycle_check.py +++ b/tests/unit/legate/test_cycle_check.py @@ -1,4 +1,4 @@ -# Copyright 2022 NVIDIA Corporation +# Copyright 2022-2023 NVIDIA Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file 
except in compliance with the License. @@ -13,6 +13,7 @@ # limitations under the License. # +import os import subprocess from pathlib import Path @@ -35,8 +36,16 @@ def test_cycle_check(tmp_path: Path) -> None: prog_file = tmp_path / "prog.py" prog_file.write_text(PROG_TEXT) + env = os.environ.copy() + env["LEGATE_CYCLE_CHECK"] = "yes" output = subprocess.check_output( - ["legate", prog_file, "--cpus", "1", "-legate:cycle-check"] + [ + "legate", + prog_file, + "--cpus", + "1", + ], + env=env, ) assert "found cycle!" in output.decode("utf-8") diff --git a/tests/unit/legate/test_rc.py b/tests/unit/legate/test_rc.py deleted file mode 100644 index 74cea3092..000000000 --- a/tests/unit/legate/test_rc.py +++ /dev/null @@ -1,66 +0,0 @@ -# Copyright 2021-2022 NVIDIA Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -import sys -from unittest.mock import MagicMock - -import pytest - -import legate.rc as m - - -@pytest.fixture -def mock_has_legion_context(monkeypatch: pytest.MonkeyPatch) -> MagicMock: - stub = MagicMock() - monkeypatch.setattr("legate.rc.has_legion_context", stub) - return stub - - -class Test_check_legion: - def test_True(self, mock_has_legion_context: MagicMock) -> None: - mock_has_legion_context.return_value = True - assert m.check_legion() is None # type: ignore[func-returns-value] - - def test_True_with_msg(self, mock_has_legion_context: MagicMock) -> None: - mock_has_legion_context.return_value = True - assert m.check_legion(msg="custom") is None # type: ignore[func-returns-value] # noqa - - def test_False(self, mock_has_legion_context: MagicMock) -> None: - mock_has_legion_context.return_value = False - with pytest.raises(RuntimeError) as e: - m.check_legion() - assert str(e) == m.LEGION_WARNING - - def test_False_with_msg(self, mock_has_legion_context: MagicMock) -> None: - mock_has_legion_context.return_value = False - with pytest.raises(RuntimeError) as e: - m.check_legion(msg="custom") - assert str(e) == "custom" - - -@pytest.mark.skip -class Test_has_legion_context: - def test_True(self) -> None: - assert m.has_legion_context() is True - - # It does not seem possible to patch CFFI libs, so testing - # the "False" branch is not really feasible - @pytest.mark.skip - def test_False(self) -> None: - pass - - -if __name__ == "__main__": - sys.exit(pytest.main(sys.argv)) diff --git a/tests/unit/legate/test_settings.py b/tests/unit/legate/test_settings.py new file mode 100644 index 000000000..092229e28 --- /dev/null +++ b/tests/unit/legate/test_settings.py @@ -0,0 +1,62 @@ +# Copyright 2023 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +from __future__ import annotations + +import pytest + +import legate.settings as m +from legate.util.settings import PrioritizedSetting + +_expected_settings = ( + "consensus", + "cycle_check", + "future_leak_check", +) + + +class TestSettings: + def test_standard_settings(self) -> None: + settings = [ + k + for k, v in m.settings.__class__.__dict__.items() + if isinstance(v, PrioritizedSetting) + ] + assert set(settings) == set(_expected_settings) + + @pytest.mark.parametrize("name", _expected_settings) + def test_prefix(self, name: str) -> None: + ps = getattr(m.settings, name) + assert ps.env_var.startswith("LEGATE_") + + @pytest.mark.parametrize("name", _expected_settings) + def test_parent(self, name: str) -> None: + ps = getattr(m.settings, name) + assert ps._parent == m.settings + + def test_types(self) -> None: + assert m.settings.consensus.convert_type == "bool" + assert m.settings.cycle_check.convert_type == "bool" + assert m.settings.future_leak_check.convert_type == "bool" + + +class TestDefaults: + def test_consensus(self) -> None: + assert m.settings.consensus.default is False + + def test_cycle_check(self) -> None: + assert m.settings.cycle_check.default is False + + def test_future_leak_check(self) -> None: + assert m.settings.future_leak_check.default is False diff --git a/tests/unit/legate/tester/stages/_linux/test_cpu.py b/tests/unit/legate/tester/stages/_linux/test_cpu.py index 24a4eef3d..c4394af14 100644 --- a/tests/unit/legate/tester/stages/_linux/test_cpu.py +++ b/tests/unit/legate/tester/stages/_linux/test_cpu.py @@ -21,18 +21,21 
@@ from legate.tester.config import Config from legate.tester.stages._linux import cpu as m -from legate.tester.stages.util import UNPIN_ENV +from legate.tester.stages.util import CUNUMERIC_TEST_ENV, UNPIN_ENV from .. import FakeSystem +unpin_and_test = dict(UNPIN_ENV) +unpin_and_test.update(CUNUMERIC_TEST_ENV) + def test_default() -> None: c = Config([]) s = FakeSystem(cpus=12) stage = m.CPU(c, s) assert stage.kind == "cpus" - assert stage.args == ["-cunumeric:test"] - assert stage.env(c, s) == UNPIN_ENV + assert stage.args == [] + assert stage.env(c, s) == unpin_and_test assert stage.spec.workers > 0 shard = (1, 2, 3) @@ -44,8 +47,8 @@ def test_cpu_pin_strict() -> None: s = FakeSystem(cpus=12) stage = m.CPU(c, s) assert stage.kind == "cpus" - assert stage.args == ["-cunumeric:test"] - assert stage.env(c, s) == {} + assert stage.args == [] + assert stage.env(c, s) == CUNUMERIC_TEST_ENV assert stage.spec.workers > 0 shard = (1, 2, 3) @@ -57,8 +60,8 @@ def test_cpu_pin_none() -> None: s = FakeSystem(cpus=12) stage = m.CPU(c, s) assert stage.kind == "cpus" - assert stage.args == ["-cunumeric:test"] - assert stage.env(c, s) == UNPIN_ENV + assert stage.args == [] + assert stage.env(c, s) == unpin_and_test assert stage.spec.workers > 0 shard = (1, 2, 3) diff --git a/tests/unit/legate/tester/stages/_linux/test_eager.py b/tests/unit/legate/tester/stages/_linux/test_eager.py index eb8c48629..477a4e54d 100644 --- a/tests/unit/legate/tester/stages/_linux/test_eager.py +++ b/tests/unit/legate/tester/stages/_linux/test_eager.py @@ -32,6 +32,7 @@ def test_default() -> None: assert stage.kind == "eager" assert stage.args == [] assert stage.env(c, s) == { + "CUNUMERIC_TEST": "0", "CUNUMERIC_MIN_CPU_CHUNK": "2000000000", "CUNUMERIC_MIN_OMP_CHUNK": "2000000000", "CUNUMERIC_MIN_GPU_CHUNK": "2000000000", diff --git a/tests/unit/legate/tester/stages/_linux/test_gpu.py b/tests/unit/legate/tester/stages/_linux/test_gpu.py index 8d792b7b3..8e7d7e234 100644 --- 
a/tests/unit/legate/tester/stages/_linux/test_gpu.py +++ b/tests/unit/legate/tester/stages/_linux/test_gpu.py @@ -21,6 +21,7 @@ from legate.tester.config import Config from legate.tester.stages._linux import gpu as m +from legate.tester.stages.util import CUNUMERIC_TEST_ENV from .. import FakeSystem @@ -30,8 +31,8 @@ def test_default() -> None: s = FakeSystem() stage = m.GPU(c, s) assert stage.kind == "cuda" - assert stage.args == ["-cunumeric:test"] - assert stage.env(c, s) == {} + assert stage.args == [] + assert stage.env(c, s) == CUNUMERIC_TEST_ENV assert stage.spec.workers > 0 diff --git a/tests/unit/legate/tester/stages/_linux/test_omp.py b/tests/unit/legate/tester/stages/_linux/test_omp.py index a4d319fc0..cb926d295 100644 --- a/tests/unit/legate/tester/stages/_linux/test_omp.py +++ b/tests/unit/legate/tester/stages/_linux/test_omp.py @@ -21,18 +21,21 @@ from legate.tester.config import Config from legate.tester.stages._linux import omp as m -from legate.tester.stages.util import UNPIN_ENV +from legate.tester.stages.util import CUNUMERIC_TEST_ENV, UNPIN_ENV from .. 
import FakeSystem +unpin_and_test = dict(UNPIN_ENV) +unpin_and_test.update(CUNUMERIC_TEST_ENV) + def test_default() -> None: c = Config([]) s = FakeSystem(cpus=12) stage = m.OMP(c, s) assert stage.kind == "openmp" - assert stage.args == ["-cunumeric:test"] - assert stage.env(c, s) == UNPIN_ENV + assert stage.args == [] + assert stage.env(c, s) == unpin_and_test assert stage.spec.workers > 0 shard = (1, 2, 3) @@ -44,8 +47,8 @@ def test_cpu_pin_strict() -> None: s = FakeSystem(cpus=12) stage = m.OMP(c, s) assert stage.kind == "openmp" - assert stage.args == ["-cunumeric:test"] - assert stage.env(c, s) == {} + assert stage.args == [] + assert stage.env(c, s) == CUNUMERIC_TEST_ENV assert stage.spec.workers > 0 shard = (1, 2, 3) @@ -57,8 +60,8 @@ def test_cpu_pin_none() -> None: s = FakeSystem(cpus=12) stage = m.OMP(c, s) assert stage.kind == "openmp" - assert stage.args == ["-cunumeric:test"] - assert stage.env(c, s) == UNPIN_ENV + assert stage.args == [] + assert stage.env(c, s) == unpin_and_test assert stage.spec.workers > 0 shard = (1, 2, 3) @@ -76,6 +79,8 @@ def test_shard_args(shard: tuple[int, ...], expected: str) -> None: f"{c.omps}", "--ompthreads", f"{c.ompthreads}", + "--numamem", + f"{c.numamem}", "--cpu-bind", expected, ] diff --git a/tests/unit/legate/tester/stages/test_test_stage.py b/tests/unit/legate/tester/stages/test_test_stage.py index fcdc7a934..d37363abb 100644 --- a/tests/unit/legate/tester/stages/test_test_stage.py +++ b/tests/unit/legate/tester/stages/test_test_stage.py @@ -33,7 +33,6 @@ class MockTestStage(m.TestStage): - kind: FeatureType = "eager" name = "mock" diff --git a/tests/unit/legate/tester/test___init__.py b/tests/unit/legate/tester/test___init__.py index 6431469ff..e28217082 100644 --- a/tests/unit/legate/tester/test___init__.py +++ b/tests/unit/legate/tester/test___init__.py @@ -22,6 +22,7 @@ DEFAULT_GPU_DELAY, DEFAULT_GPU_MEMORY_BUDGET, DEFAULT_GPUS_PER_NODE, + DEFAULT_NUMAMEM, DEFAULT_OMPS_PER_NODE, DEFAULT_OMPTHREADS, 
DEFAULT_PROCESS_ENV, @@ -33,7 +34,7 @@ class TestConsts: def test_DEFAULT_CPUS_PER_NODE(self) -> None: - assert DEFAULT_CPUS_PER_NODE == 4 + assert DEFAULT_CPUS_PER_NODE == 2 def test_DEFAULT_GPUS_PER_NODE(self) -> None: assert DEFAULT_GPUS_PER_NODE == 1 @@ -50,6 +51,9 @@ def test_DEFAULT_OMPS_PER_NODE(self) -> None: def test_DEFAULT_OMPTHREADS(self) -> None: assert DEFAULT_OMPTHREADS == 4 + def test_DEFAULT_NUMAMEM(self) -> None: + assert DEFAULT_NUMAMEM == 0 + def test_DEFAULT_PROCESS_ENV(self) -> None: assert DEFAULT_PROCESS_ENV == { "LEGATE_TEST": "1", diff --git a/tests/unit/legate/tester/test_args.py b/tests/unit/legate/tester/test_args.py index c307a7080..ad87c536f 100644 --- a/tests/unit/legate/tester/test_args.py +++ b/tests/unit/legate/tester/test_args.py @@ -22,6 +22,7 @@ DEFAULT_GPU_DELAY, DEFAULT_GPU_MEMORY_BUDGET, DEFAULT_GPUS_PER_NODE, + DEFAULT_NUMAMEM, DEFAULT_OMPS_PER_NODE, DEFAULT_OMPTHREADS, args as m, @@ -59,6 +60,9 @@ def test_omps(self) -> None: def test_ompthreads(self) -> None: assert m.parser.get_default("ompthreads") == DEFAULT_OMPTHREADS + def test_numamem(self) -> None: + assert m.parser.get_default("numamem") == DEFAULT_NUMAMEM + def test_legate_dir(self) -> None: assert m.parser.get_default("legate_dir") is None diff --git a/tests/unit/legate/tester/test_config.py b/tests/unit/legate/tester/test_config.py index ac7b30f07..e4016f5d8 100644 --- a/tests/unit/legate/tester/test_config.py +++ b/tests/unit/legate/tester/test_config.py @@ -100,7 +100,6 @@ def test_env_features( @pytest.mark.parametrize("feature", FEATURES) def test_cmd_features(self, feature: str) -> None: - # test a single value c = m.Config(["test.py", "--use", feature]) assert set(c.features) == {feature} diff --git a/tests/unit/legate/util/test_args.py b/tests/unit/legate/util/test_args.py index 190ff3c6b..77de6ed11 100644 --- a/tests/unit/legate/util/test_args.py +++ b/tests/unit/legate/util/test_args.py @@ -22,7 +22,7 @@ import legate.util.args as m -from ...util 
import Capsys, powerset +from ...util import powerset T = TypeVar("T") @@ -101,121 +101,5 @@ def test_entries() -> None: assert set(m.entries(_TestObj())) == {("a", 10), ("c", "foo")} -class Test_parse_library_command_args: - @pytest.mark.parametrize("name", ("1foo", "a.b", "a/b", "a[", "a(")) - def test_bad_libname(self, name: str) -> None: - with pytest.raises(ValueError): - m.parse_library_command_args(name, []) - - def test_default_help( - self, monkeypatch: pytest.MonkeyPatch, capsys: Capsys - ) -> None: - monkeypatch.setattr("sys.argv", ["app", "-foo:help"]) - with pytest.raises(SystemExit) as e: - m.parse_library_command_args("foo", []) - assert e.value.code is None - out, err = capsys.readouterr() - assert out.startswith("usage: ") - - def test_default_help_precedence( - self, monkeypatch: pytest.MonkeyPatch, capsys: Capsys - ) -> None: - monkeypatch.setattr("sys.argv", ["app", "-foo:help", "-foo:bar"]) - args = [m.Argument("bar", m.ArgSpec(dest="bar"))] - with pytest.raises(SystemExit) as e: - m.parse_library_command_args("foo", args) - assert e.value.code is None - out, err = capsys.readouterr() - assert out.startswith("usage: ") - - def test_default_help_patches_short_args( - self, monkeypatch: pytest.MonkeyPatch, capsys: Capsys - ) -> None: - monkeypatch.setattr("sys.argv", ["app", "-foo:help", "-foo:bar"]) - args = [m.Argument("bar", m.ArgSpec(dest="bar"))] - with pytest.raises(SystemExit) as e: - m.parse_library_command_args("foo", args) - assert e.value.code is None - out, err = capsys.readouterr() - assert out.startswith("usage: ") - assert "-foo:bar" in out - assert "--foo:bar" not in out - - def test_help_override( - self, monkeypatch: pytest.MonkeyPatch, capsys: Capsys - ) -> None: - monkeypatch.setattr("sys.argv", ["app", "-foo:help"]) - args = [ - m.Argument("help", m.ArgSpec(action="store_true", dest="help")) - ] - ns = m.parse_library_command_args("foo", args) - out, err = capsys.readouterr() - assert out == "" - assert vars(ns) == {"help": 
True} - assert sys.argv == ["app"] - - def test_basic( - self, monkeypatch: pytest.MonkeyPatch, capsys: Capsys - ) -> None: - monkeypatch.setattr("sys.argv", ["app", "-foo:bar", "-foo:quux", "1"]) - args = [ - m.Argument("bar", m.ArgSpec(action="store_true", dest="bar")), - m.Argument( - "quux", m.ArgSpec(dest="quux", action="store", type=int) - ), - ] - ns = m.parse_library_command_args("foo", args) - out, err = capsys.readouterr() - assert out == "" - assert vars(ns) == {"bar": True, "quux": 1} - assert sys.argv == ["app"] - - def test_extra_args_passed_on( - self, monkeypatch: pytest.MonkeyPatch, capsys: Capsys - ) -> None: - monkeypatch.setattr("sys.argv", ["app", "-foo:bar", "--extra", "1"]) - args = [m.Argument("bar", m.ArgSpec(action="store_true", dest="bar"))] - ns = m.parse_library_command_args("foo", args) - out, err = capsys.readouterr() - assert out == "" - assert vars(ns) == {"bar": True} - assert sys.argv == ["app", "--extra", "1"] - - def test_unrecognized_libname_arg( - self, monkeypatch: pytest.MonkeyPatch, capsys: Capsys - ) -> None: - monkeypatch.setattr("sys.argv", ["app", "-foo:bar", "-foo:baz"]) - with pytest.warns(UserWarning) as record: - ns = m.parse_library_command_args("foo", []) - out, err = capsys.readouterr() - assert out == "" - assert vars(ns) == {} - assert sys.argv == ["app", "-foo:bar", "-foo:baz"] - - # issues one warning for the first encountered - assert len(record) == 1 - assert isinstance(record[0].message, Warning) - assert ( - record[0].message.args[0] - == "Unrecognized argument '-foo:bar' for foo (passed on as-is)" - ) - assert out == "" - assert vars(ns) == {} - assert sys.argv == ["app", "-foo:bar", "-foo:baz"] - - def test_no_prefix_conflict( - self, monkeypatch: pytest.MonkeyPatch, capsys: Capsys - ) -> None: - monkeypatch.setattr( - "sys.argv", ["app", "-foo:bar", "--foo", "-f", "1", "-ff"] - ) - args = [m.Argument("bar", m.ArgSpec(action="store_true", dest="bar"))] - ns = m.parse_library_command_args("foo", args) - 
out, err = capsys.readouterr() - assert out == "" - assert vars(ns) == {"bar": True} - assert sys.argv == ["app", "--foo", "-f", "1", "-ff"] - - if __name__ == "__main__": sys.exit(pytest.main(sys.argv)) diff --git a/tests/unit/legate/util/test_settings.py b/tests/unit/legate/util/test_settings.py new file mode 100644 index 000000000..071c4df2d --- /dev/null +++ b/tests/unit/legate/util/test_settings.py @@ -0,0 +1,182 @@ +# Copyright 2023 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +from __future__ import annotations + +import os +from contextlib import contextmanager +from typing import Any, Iterator, Mapping + +import pytest + +import legate.util.settings as m + + +@contextmanager +def envset( + value: Mapping[str, str] | None = None, **kw: Any +) -> Iterator[None]: + old = os.environ.copy() + if value: + os.environ.update(value) + os.environ.update(**kw) + yield + # take care to keep the same actual dict object + os.environ.clear() + os.environ.update(old) + + +class TestConverters: + @pytest.mark.parametrize( + "value", ["Yes", "YES", "yes", "1", "ON", "on", "true", "True", True] + ) + def test_convert_bool(self, value: str) -> None: + assert m.convert_bool(value) + + @pytest.mark.parametrize( + "value", ["No", "NO", "no", "0", "OFF", "off", "false", "False", False] + ) + def test_convert_bool_false(self, value: str) -> None: + assert not m.convert_bool(value) + + @pytest.mark.parametrize("value", [True, False]) + def test_convert_bool_identity(self, value: bool) -> None: + assert m.convert_bool(value) == value + + def test_convert_bool_bad(self) -> None: + with pytest.raises(ValueError): + m.convert_bool("junk") + + +class TestPrioritizedSetting: + def test_env_var_property(self) -> None: + ps: Any = m.PrioritizedSetting("foo", env_var="LEGATE_FOO") + assert ps.env_var == "LEGATE_FOO" + + def test_everything_unset_raises(self) -> None: + ps: Any = m.PrioritizedSetting("foo") + with pytest.raises(RuntimeError): + ps() + + def test_implict_default(self) -> None: + ps: Any = m.PrioritizedSetting("foo", default=10) + assert ps() == 10 + + def test_implict_default_converts(self) -> None: + ps: Any = m.PrioritizedSetting("foo", convert=int, default="10") + assert ps() == 10 + + def test_help(self) -> None: + ps: Any = m.PrioritizedSetting( + "foo", env_var="LEGATE_FOO", default=10, help="bar" + ) + assert ps.help == "bar" + + def test_name(self) -> None: + ps: Any = m.PrioritizedSetting("foo", env_var="LEGATE_FOO", default=10) + assert 
ps.name == "foo" + + def test_global_default(self) -> None: + ps: Any = m.PrioritizedSetting("foo", env_var="LEGATE_FOO", default=10) + assert ps.default == 10 + assert ps() == 10 + + def test_local_default(self) -> None: + ps: Any = m.PrioritizedSetting("foo", env_var="LEGATE_FOO", default=10) + assert ps.default == 10 + assert ps(default=20) == 20 + + def test_env_var(self) -> None: + with envset(LEGATE_FOO="30"): + ps: Any = m.PrioritizedSetting("foo", env_var="LEGATE_FOO") + assert ps.env_var == "LEGATE_FOO" + assert ps() == "30" + assert ps(default=20) == "30" + + def test_env_var_converts(self) -> None: + with envset(LEGATE_FOO="30"): + ps: Any = m.PrioritizedSetting( + "foo", convert=int, env_var="LEGATE_FOO" + ) + assert ps() == 30 + + def test_user_set(self) -> None: + ps: Any = m.PrioritizedSetting("foo") + ps.set_value(40) + assert ps() == 40 + assert ps(default=20) == 40 + + def test_user_unset(self) -> None: + ps: Any = m.PrioritizedSetting("foo", default=2) + ps.set_value(40) + assert ps() == 40 + ps.unset_value() + assert ps() == 2 + + def test_user_set_converts(self) -> None: + ps: Any = m.PrioritizedSetting("foo", convert=int) + ps.set_value("40") + assert ps() == 40 + + def test_immediate(self) -> None: + ps: Any = m.PrioritizedSetting("foo") + assert ps(50) == 50 + assert ps(50, default=20) == 50 + + def test_immediate_converts(self) -> None: + ps: Any = m.PrioritizedSetting("foo", convert=int) + assert ps("50") == 50 + + def test_precedence(self) -> None: + class FakeSettings: + pass + + ps: Any = m.PrioritizedSetting( + "foo", env_var="LEGATE_FOO", convert=int, default=0 + ) + ps._parent = FakeSettings + + # 0. global default + assert ps() == 0 + + # 1. local default + assert ps(default=10) == 10 + + # 2. environment variable + with envset(LEGATE_FOO="40"): + assert ps() == 40 + assert ps(default=10) == 40 + + # 3. previously user-set value + ps.set_value(60) + assert ps() == 60 + assert ps(default=10) == 60 + + # 4. 
immediate values + assert ps(70) == 70 + assert ps(70, default=10) == 70 + + def test_descriptors(self) -> None: + class FakeSettings: + foo: Any = m.PrioritizedSetting("foo", env_var="LEGATE_FOO") + bar: Any = m.PrioritizedSetting( + "bar", env_var="LEGATE_BAR", default=10 + ) + + s = FakeSettings() + assert s.foo is FakeSettings.foo + + assert s.bar() == 10 + s.bar = 20 + assert s.bar() == 20 # type: ignore diff --git a/typings/legion_cffi/__init__.pyi b/typings/legion_cffi/__init__.pyi index 66acac230..50d84df5b 100644 --- a/typings/legion_cffi/__init__.pyi +++ b/typings/legion_cffi/__init__.pyi @@ -42,3 +42,5 @@ class FFI: ) -> CData: ... ffi: FFI + +is_legion_python: bool diff --git a/typings/legion_top/__init__.pyi b/typings/legion_top/__init__.pyi index da8b37606..2ee59f46f 100644 --- a/typings/legion_top/__init__.pyi +++ b/typings/legion_top/__init__.pyi @@ -13,11 +13,13 @@ # limitations under the License. # -from typing import Any, Callable +from typing import Any, Callable, Tuple from legion_cffi.lib import legion_context_t, legion_runtime_t def add_cleanup_item(callback: Callable[[], None]) -> None: ... +def legion_canonical_python_main(sys_argv: Tuple[str, ...]) -> None: ... +def legion_canonical_python_cleanup() -> None: ... class top_level: runtime: list[legion_runtime_t] @@ -25,5 +27,7 @@ class top_level: __all__ = ( "add_cleanup_item", + "legion_canonical_python_main", + "legion_canonical_python_cleanup", "top_level", )