From 29006bddb9b4daa2eac1f9a658909fbe381c73ff Mon Sep 17 00:00:00 2001 From: Hugh Delaney Date: Tue, 12 Oct 2021 14:06:12 +0000 Subject: [PATCH 01/45] Added initial chapter skeleton for CUDA backend specification --- adoc/chapters/cuda_backend.adoc | 209 ++++++++++++++++++++++++++++++++ adoc/syclbase.adoc | 3 + 2 files changed, 212 insertions(+) create mode 100644 adoc/chapters/cuda_backend.adoc diff --git a/adoc/chapters/cuda_backend.adoc b/adoc/chapters/cuda_backend.adoc new file mode 100644 index 00000000..9f26cf09 --- /dev/null +++ b/adoc/chapters/cuda_backend.adoc @@ -0,0 +1,209 @@ +// %%%%%%%%%%%%%%%%%%%%%%%%%%%% begin cuda_backend %%%%%%%%%%%%%%%%%%%%%%%%%%%% + +[appendix] +[[chapter:cuda-backend]] += CUDA backend specification + +[[sec:cuda:introduction]] +== Introduction + +[[sec:cuda:mapping_of_sycl_programming_model]] +== Mapping of SYCL programming model + +[[sub:cuda:platform_model]] +=== Platform Model + +[[sub:cuda:memory_model]] +=== Memory model + +[[sub:cuda:execution_model]] +=== Execution Model + +[[sec::programming_interface]] +== Programming Interface + +[[sub:cuda:application_interoperability]] +=== Application Interoperability + +[[table.cuda.appinterop.nativeobjects]] +.Types of native backend objects application interoperability +[width="100%",options="header",cols="20%,20%,20%,40%"] +|==== +| [code]#SyclType# | [code]#backend_input_t# | [code]#backend_return_t# | Description +| [code]#buffer# | | | +| [code]#context# | | | +| [code]#device# | | | +| [code]#device_image# | | | +| [code]#event# | | | +| [code]#kernel# | | | +| [code]#kernel_bundle# | | | +| [code]#platform# | | | +| [code]#queue# | | | +| [code]#sampled_image# | | | +| [code]#unsampled_image# | | | +|==== + +[[table.cuda.appinterop.ownership]] +.Ownership behavior of native backend objects. +[width="100%",options="header",cols="40%,60%"] +|==== +| SYCL Object | Destructor behaviour +| [code]#buffer# | +| [code]#context# | +| [code]#device# | +| [code]#device_image# | +| [code]#event# | +| [code]#kernel# | +| [code]#kernel_bundle# | +| [code]#platform# | +| [code]#queue# | +| [code]#sampled_image# | +| [code]#unsampled_image# | +|==== + +[[table.cuda.appinterop.make_interop_APIs]] +.[code]#make_*# Interoperability APIs for native backend objects. 
[width="100%",options="header",cols="40%,60%"]
|====
| CUDA interoperability function | Description
| [code]#template <backend Backend> +
platform +
make_platform(const backend_input_t<Backend, platform> &backendObject);#
 |

| [code]#template <backend Backend> +
device +
make_device(const backend_input_t<Backend, device> &backendObject);#
 |

| [code]#template <backend Backend> +
context +
make_context(const backend_input_t<Backend, context> &backendObject,
             const async_handler asyncHandler = {});#
 |

| [code]#template <backend Backend> +
queue +
make_queue(const backend_input_t<Backend, queue> &backendObject,
           const context &targetContext,
           const async_handler asyncHandler = {});#
 |

| [code]#template <backend Backend> +
event +
make_event(const backend_input_t<Backend, event> &backendObject,
           const context &targetContext);#
 |

| [code]#template <backend Backend, typename T, int dimensions = 1, +
          typename AllocatorT = buffer_allocator<std::remove_const_t<T>>> +
buffer<T, dimensions, AllocatorT> +
make_buffer(const backend_input_t<Backend, buffer<T, dimensions, AllocatorT>>
                &backendObject,
            const context &targetContext, event availableEvent);#
 |

| [code]#template <backend Backend, typename T, int dimensions = 1, +
          typename AllocatorT = buffer_allocator<std::remove_const_t<T>>> +
buffer<T, dimensions, AllocatorT> +
make_buffer(const backend_input_t<Backend, buffer<T, dimensions, AllocatorT>>
                &backendObject,
            const context &targetContext);#
 |

| [code]#template <backend Backend, int dimensions = 1, +
          typename AllocatorT = image_allocator> +
sampled_image<dimensions, AllocatorT> +
make_sampled_image(
    const backend_input_t<Backend, sampled_image<dimensions, AllocatorT>>
        &backendObject,
    const context &targetContext, image_sampler imageSampler,
    event availableEvent);#
 |

| [code]#template <backend Backend, int dimensions = 1, +
          typename AllocatorT = image_allocator> +
sampled_image<dimensions, AllocatorT> +
make_sampled_image(
    const backend_input_t<Backend, sampled_image<dimensions, AllocatorT>>
        &backendObject,
    const context &targetContext, image_sampler imageSampler);#
 |

| [code]#template <backend Backend, int dimensions = 1, +
          typename AllocatorT = image_allocator> +
unsampled_image<dimensions, AllocatorT> +
make_unsampled_image(
    const backend_input_t<Backend, unsampled_image<dimensions, AllocatorT>>
        &backendObject,
    const context &targetContext, event availableEvent);#
 |

| [code]#template <backend Backend, int dimensions = 1, +
          typename AllocatorT = image_allocator> +
unsampled_image<dimensions, AllocatorT> +
make_unsampled_image(
    const backend_input_t<Backend, unsampled_image<dimensions, AllocatorT>>
        &backendObject,
    const context &targetContext);#
 |

| [code]#template <backend Backend, bundle_state State> +
kernel_bundle<State> +
make_kernel_bundle(
    const backend_input_t<Backend, kernel_bundle<State>> &backendObject,
    const context &targetContext);#
 |

| [code]#template <backend Backend> +
kernel +
make_kernel(const backend_input_t<Backend, kernel> &backendObject,
            const context &targetContext);#
 |
|====

[[table.cuda.appinterop.get_native_interop_APIs]]
.[code]#get_native# Interoperability APIs for native backend objects.
[width="100%",options="header",cols="40%,60%"]
|====
| CUDA interoperability function | Description
| [code]#template <backend Backend, class T> +
backend_return_t<Backend, T> +
get_native(const T &syclObject);#
 |
|====


[[sub:cuda:kernel_function_interoperability]]
=== Kernel Function Interoperability

[[table.cuda.kernelinterop.nativeobjects]]
.Types of native backend objects for kernel function interoperability
[width="100%",options="header",cols="20%,20%,20%,40%"]
|====
| [code]#SyclType# | [code]#backend_input_t# | [code]#backend_return_t# | Description
| [code]#accessor# | | |
| [code]#accessor# | | |
| [code]#accessor# | | |
| [code]#local_accessor# | | |
| [code]#sampled_image_accessor# | | |
| [code]#sampled_image_accessor# | | |
| [code]#sampled_image_accessor# | | |
| [code]#unsampled_image_accessor# | | |
| [code]#unsampled_image_accessor# | | |
| [code]#unsampled_image_accessor# | | |
| [code]#stream# | | |
| [code]#device_event# | | |
|====


[[sec:non_core_features_and_extensions]]
== Non-core features and extensions

[[sub:cuda:extensions]]
=== Extensions

[[sub:cuda:error_handling]]
=== Error Handling

// %%%%%%%%%%%%%%%%%%%%%%%%%%%% end cuda_backend %%%%%%%%%%%%%%%%%%%%%%%%%%%%
diff --git a/adoc/syclbase.adoc b/adoc/syclbase.adoc
index 56caaa62..6c5c027b 100644
--- a/adoc/syclbase.adoc
+++ b/adoc/syclbase.adoc
@@ -129,6 +129,9 @@ include::chapters/feature_sets.adoc[]
 // \input{opencl_backend}
 include::chapters/opencl_backend.adoc[]
 
+// \input{cuda_backend}
+include::chapters/cuda_backend.adoc[]
+
 // \input{what_changed}
 include::chapters/what_changed.adoc[]

From 9500d56ebf7d174f3b1e6cb86c8cc6fd1f459eaf Mon Sep 17 00:00:00 2001
From: Nicolas Miller
Date: Fri, 15 Oct 2021 14:08:34 +0000
Subject: [PATCH 02/45] Add introduction and mapping of context and queue for
 CUDA backend specification

---
 adoc/chapters/cuda_backend.adoc | 36 +++++++++++++++++++++++++++++++--
 1 file changed, 34 insertions(+), 2 deletions(-)

diff --git a/adoc/chapters/cuda_backend.adoc b/adoc/chapters/cuda_backend.adoc
index 9f26cf09..b7f37245 100644
--- a/adoc/chapters/cuda_backend.adoc
+++ b/adoc/chapters/cuda_backend.adoc
@@ -4,15 +4,47 @@
 [appendix]
 [[chapter:cuda-backend]]
 = CUDA backend specification

This chapter describes how the SYCL general programming model is mapped on top
of CUDA, and how the SYCL generic interoperability interface must be
implemented by vendors providing SYCL for CUDA implementations, to ensure that
SYCL applications written for the CUDA backend are interoperable.

The CUDA backend is enabled using the `sycl::backend::cuda` value of `enum
class backend`. That means that when the CUDA backend is active, the value of
`sycl::is_backend_active<backend::cuda>::value` will be `true`, and the
preprocessor macro `SYCL_BACKEND_CUDA` will be defined.

The CUDA backend requires an installation of the CUDA SDK as well as one or
more CUDA devices available in the system.

[[sec:cuda:mapping_of_sycl_programming_model]]
== Mapping of SYCL programming model

This section gives a general overview of how the SYCL programming model maps to
CUDA. These two programming models are broadly similar in essence; however, they do
have a few differences in terminology and architecture.

[[sub:cuda:platform_model]]
=== Platform Model

TODO: Platform

TODO: Device

A SYCL <<context>> maps to one or multiple CUDA contexts.
Indeed, while a CUDA context is tied to a single device, this is not the case
for a SYCL <<context>>, and the CUDA backend implementation may use multiple
CUDA contexts to emulate a SYCL <<context>> containing multiple devices.
Additionally, while SYCL contexts are simple objects passed around either
implicitly or explicitly, CUDA contexts must be activated on the current thread
before they can be used by other CUDA entry points. Therefore, any use of the
SYCL APIs with a CUDA backend may modify the currently active context on the
thread, and no guarantee is provided that any previously active CUDA context
will be restored by SYCL.

A SYCL <<queue>> maps to one or multiple CUDA streams. Indeed, while a CUDA
stream is in-order, a SYCL <<queue>> is not, so a CUDA backend implementation
may use multiple CUDA streams to implement an out-of-order SYCL <<queue>>.

[[sub:cuda:memory_model]]
=== Memory model

From 4bef59b683b2db4679a64954bf290a286c7c29f7 Mon Sep 17 00:00:00 2001
From: Tadej Ciglaric
Date: Mon, 18 Oct 2021 15:55:05 +0000
Subject: [PATCH 03/45] Add memory model section to CUDA backend specification

---
 adoc/chapters/cuda_backend.adoc | 122 ++++++++++++++++++++++++++++++++
 1 file changed, 122 insertions(+)

diff --git a/adoc/chapters/cuda_backend.adoc b/adoc/chapters/cuda_backend.adoc
index b7f37245..64f5ce1d 100644
--- a/adoc/chapters/cuda_backend.adoc
+++ b/adoc/chapters/cuda_backend.adoc
@@ -48,6 +48,128 @@ may use multiple CUDA streams to implement an out-of-order SYCL <<queue>>.
 [[sub:cuda:memory_model]]
 === Memory model

==== Memory Allocations

When non-host accessors to buffers are created without
[code]#target::host_buffer#, they need to allocate memory for their contents on
the device, for example using [code]#cudaMalloc3D()#, [code]#cudaMallocPitch()#
or [code]#cudaMalloc()#.

When accessors to images are created without [code]#target::host_buffer#, they
allocate memory, for example using [code]#cudaMalloc3DArray()# or
[code]#cudaMallocArray()#.

When non-host accessors are created with [code]#target::host_buffer#, they can,
for example, use [code]#cudaHostAlloc()# to allocate pinned memory on the host.

Table <<table.cuda.memmodel.USM>> specifies which underlying CUDA functions can
be used for USM allocations. For shared USM allocations this means that memory
is managed (moved between the host and different devices) by the CUDA runtime.
Alternatively, shared USM allocations can be managed by the SYCL runtime, using
a non-managed CUDA allocation on the device when needed, such as
[code]#cudaMalloc()#.

[[table.cuda.memmodel.USM]]
.CUDA functions that can be used for SYCL USM allocations
[width="100%",options="header",cols="50%,50%"]
|====
| SYCL USM type | CUDA function
| device | [code]#cudaMalloc()#
| host | [code]#cudaHostAlloc()#
| shared | [code]#cudaMallocManaged()#
|====

==== Samplers

In both SYCL and CUDA, samplers consist of an addressing mode, a filtering mode
and a coordinate normalization mode. The mapping between SYCL and CUDA values
is defined in tables <<table.cuda.memmodel.sampler_addressing>>,
<<table.cuda.memmodel.sampler_filtering>> and
<<table.cuda.memmodel.sampler_normalization>>. In CUDA, the addressing mode
will be set to the same value for all dimensions: CUDA allows different
addressing modes for different dimensions, while SYCL does not.
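As a non-normative illustration of this mapping, the sketch below shows how an
implementation might translate a SYCL [code]#image_sampler# into a CUDA texture
descriptor, following the tables that come after this example. The helper name
[code]#to_cuda_texture_desc# is hypothetical and not part of any API.

[source,c++]
----
#include <cuda_runtime.h>
#include <sycl/sycl.hpp>

// Hypothetical helper illustrating the sampler mapping defined in the
// tables below; it is not part of the SYCL or CUDA APIs.
cudaTextureDesc to_cuda_texture_desc(const sycl::image_sampler &sampler) {
  cudaTextureDesc desc = {};

  // Addressing mode: CUDA has one mode per dimension while SYCL has a single
  // mode, so the same value is replicated across all three dimensions.
  cudaTextureAddressMode mode = cudaAddressModeBorder;
  switch (sampler.addressing) {
  case sycl::addressing_mode::mirrored_repeat: mode = cudaAddressModeMirror; break;
  case sycl::addressing_mode::repeat:          mode = cudaAddressModeWrap;   break;
  case sycl::addressing_mode::clamp_to_edge:   mode = cudaAddressModeClamp;  break;
  case sycl::addressing_mode::clamp:           mode = cudaAddressModeClamp;  break;
  case sycl::addressing_mode::none:            mode = cudaAddressModeBorder; break;
  }
  desc.addressMode[0] = desc.addressMode[1] = desc.addressMode[2] = mode;

  // Filtering mode.
  desc.filterMode = (sampler.filtering == sycl::filtering_mode::linear)
                        ? cudaFilterModeLinear
                        : cudaFilterModePoint;

  // Coordinate normalization mode.
  desc.normalizedCoords =
      (sampler.coordinate == sycl::coordinate_normalization_mode::normalized);
  return desc;
}
----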
[[table.cuda.memmodel.sampler_addressing]]
.Mapping of SYCL sampler addressing modes to CUDA
[width="100%",options="header",cols="50%,50%"]
|====
| SYCL sampler addressing mode | CUDA sampler addressing mode
| [code]#sycl::addressing_mode::mirrored_repeat# | [code]#cudaAddressModeMirror#
| [code]#sycl::addressing_mode::repeat# | [code]#cudaAddressModeWrap#
| [code]#sycl::addressing_mode::clamp_to_edge# | [code]#cudaAddressModeClamp#
| [code]#sycl::addressing_mode::clamp# | [code]#cudaAddressModeClamp#
| [code]#sycl::addressing_mode::none# | [code]#cudaAddressModeBorder#
|====

SYCL allows [code]#sycl::addressing_mode::mirrored_repeat# and
[code]#sycl::addressing_mode::repeat# to be used together with unnormalized
coordinates; in this case the resulting coordinates are undefined. CUDA does
not allow this, so if [code]#sycl::addressing_mode::mirrored_repeat# or
[code]#sycl::addressing_mode::repeat# is specified together with unnormalized
coordinates, [code]#cudaAddressModeBorder# is used instead.

[[table.cuda.memmodel.sampler_filtering]]
.Mapping of SYCL sampler filtering modes to CUDA
[width="100%",options="header",cols="50%,50%"]
|====
| SYCL sampler filtering mode | CUDA sampler filtering mode
| [code]#sycl::filtering_mode::nearest# | [code]#cudaFilterModePoint#
| [code]#sycl::filtering_mode::linear# | [code]#cudaFilterModeLinear#
|====

[[table.cuda.memmodel.sampler_normalization]]
.Mapping of SYCL sampler coordinate normalization modes to CUDA
[width="100%",options="header",cols="50%,50%"]
|====
| SYCL sampler coordinate normalization mode | CUDA sampler coordinate normalization mode
| [code]#sycl::coordinate_normalization_mode::normalized# | [code]#normalizedCoords = true#
| [code]#sycl::coordinate_normalization_mode::unnormalized# | [code]#normalizedCoords = false#
|====

==== Address Spaces

Table <<table.cuda.memmodel.address_spaces>> maps SYCL address spaces to CUDA
address spaces.

[[table.cuda.memmodel.address_spaces]]
.Mapping from SYCL address spaces to CUDA address spaces
[width="100%",options="header",cols="50%,50%"]
|====
| SYCL Address Space | CUDA Address Space
| Global memory | global
| Local memory | shared
| Private memory | registers or local
| Generic memory | generic
| Constant memory | const
|====

==== Atomics

Not all CUDA devices support all memory orders. If a particular memory order is
unsupported by a CUDA device, it may be unsupported in the SYCL CUDA backend
for that device. Sequentially consistent atomics are currently not supported on
any CUDA device, so the SYCL CUDA backend is not required to implement them.
The mappings of the other memory orders (when supported by the device) are
defined in table <<table.cuda.memmodel.memory_orders>>.

[[table.cuda.memmodel.memory_orders]]
.Mapping from [code]#sycl::memory_order# to PTX ISA memory orders
[width="100%",options="header",cols="50%,50%"]
|====
| [code]#sycl::memory_order# | PTX ISA Memory Order
| [code]#memory_order::relaxed# | relaxed
| [code]#memory_order::acquire# | acquire
| [code]#memory_order::release# | release
| [code]#memory_order::acq_rel# | acq_rel
| [code]#memory_order::seq_cst# | undefined
|====

The mapping of memory scopes (when supported by the device) is defined in table
<<table.cuda.memmodel.memory_scopes>>. [code]#memory_scope::work_item# does not
require any consistency between different work-items, so it can be mapped to a
non-atomic operation.
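The following non-normative sketch shows how these scopes surface in SYCL
device code; the PTX mnemonics in the comments follow the mapping in the table
that comes after this example and are indicative only.

[source,c++]
----
#include <sycl/sycl.hpp>

// Device-code sketch: two relaxed atomic increments at different memory
// scopes. The PTX in the comments is indicative only.
void count(int *per_device, int *shared_with_host) {
  sycl::atomic_ref<int, sycl::memory_order::relaxed,
                   sycl::memory_scope::device,
                   sycl::access::address_space::global_space>
      dev_counter(*per_device);
  dev_counter.fetch_add(1); // may lower to: atom.relaxed.gpu.global.add.u32

  sycl::atomic_ref<int, sycl::memory_order::relaxed,
                   sycl::memory_scope::system,
                   sycl::access::address_space::global_space>
      sys_counter(*shared_with_host);
  sys_counter.fetch_add(1); // may lower to: atom.relaxed.sys.global.add.u32
}
----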
[[table.cuda.memmodel.memory_scopes]]
.Mapping from [code]#sycl::memory_scope# to PTX ISA memory scopes
[width="100%",options="header",cols="50%,50%"]
|====
| [code]#sycl::memory_scope# | PTX ISA Memory Scope
| [code]#memory_scope::work_item# | none (non-atomic operation)
| [code]#memory_scope::sub_group# | cta
| [code]#memory_scope::work_group# | cta
| [code]#memory_scope::device# | gpu
| [code]#memory_scope::system# | sys
|====

==== Fences

If a device supports the [code]#fence# PTX instruction, the mapping of memory
orders is defined in <<table.cuda.memmodel.fence_memory_orders>>. Otherwise,
all memory orders (except relaxed) are mapped to the [code]#membar#
instruction.

[[table.cuda.memmodel.fence_memory_orders]]
.Mapping from [code]#sycl::memory_order# to PTX ISA memory orders when used in fences
[width="100%",options="header",cols="50%,50%"]
|====
| [code]#sycl::memory_order# | PTX ISA Memory Order
| [code]#memory_order::relaxed# | none
| [code]#memory_order::acquire# | acq_rel
| [code]#memory_order::release# | acq_rel
| [code]#memory_order::acq_rel# | acq_rel
| [code]#memory_order::seq_cst# | sc
|====

If future versions of the PTX ISA define fence instructions with an
acquire-only or a release-only memory order, these can be used as well for
[code]#memory_order::acquire# and [code]#memory_order::release# on devices that
support them.

The mapping of SYCL memory scopes to the PTX ISA is the same as for atomics; it
is defined in <<table.cuda.memmodel.memory_scopes>>.

[[sub:cuda:execution_model]]
=== Execution Model

From 0a6915cd98ce5a734042a46c3c2d2d55c296ed9c Mon Sep 17 00:00:00 2001
From: Aidan Belton
Date: Tue, 19 Oct 2021 23:32:51 +0000
Subject: [PATCH 04/45] Add cuda execution model

---
 adoc/chapters/cuda_backend.adoc | 91 +++++++++++++++++++++++++++++++++
 1 file changed, 91 insertions(+)

diff --git a/adoc/chapters/cuda_backend.adoc b/adoc/chapters/cuda_backend.adoc
index 64f5ce1d..fa27a349 100644
--- a/adoc/chapters/cuda_backend.adoc
+++ b/adoc/chapters/cuda_backend.adoc
@@ -16,6 +16,9 @@ preprocessor macro `SYCL_BACKEND_CUDA` will be defined.
 The CUDA backend requires an installation of the CUDA SDK as well as one or
 more CUDA devices available in the system.

[[sec:cuda:mapping_of_sycl_programming_model]]
== Mapping of SYCL programming model

[[sub:cuda:execution_model]]
=== Execution Model

CUDA's execution model is similar to SYCL's. CUDA uses kernels to offload
computation, splitting the host and GPU into asynchronous computing devices. In
general, except for CUDA's dynamic parallelism extensions, kernels are launched
by the host. One difference between the CUDA and SYCL execution models is that
CUDA uses Single Instruction Multiple Thread (SIMT) kernels while SYCL uses
Single Instruction Multiple Data (SIMD) kernels. SIMT kernels use multiple
scalar instructions acting on non-contiguous data, whereas SIMD kernels use
vector instructions acting on contiguous data. SIMT can be used in place of
SIMD but not the other way around, as SIMD requires the data it operates on to
be contiguous in memory, while SIMT has no such requirement.

CUDA GPUs are built out of streaming multiprocessors (SMs), which perform the
actual computation. Each SM consists of a number of scalar cores (the exact
count depends on the architecture), shared memory, registers, a load/store
unit, and a scheduler unit. CUDA uses a hierarchy of threads to organize the
execution of kernels: the threads of a kernel launch are split up into thread
blocks.
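As a non-normative sketch of this launch hierarchy, the same two-dimensional
launch can be expressed in both models; the kernel body and names here are
placeholders.

[source,c++]
----
#include <sycl/sycl.hpp>

// Non-normative sketch: a CUDA launch of a (64 x 32) grid of (16 x 16)
// thread blocks, and a corresponding SYCL nd_range launch.
//
//   dim3 block(16, 16);
//   dim3 grid(64, 32);
//   kernel2d<<<grid, block>>>(...);   // CUDA; kernel2d is a placeholder
//
void launch(sycl::queue &q) {
  sycl::nd_range<2> ndr{{64 * 16, 32 * 16}, // global size = grid * block
                        {16, 16}};          // local size  = thread block
  q.parallel_for(ndr, [=](sycl::nd_item<2> item) {
    // item.get_group(...)    ~ blockIdx
    // item.get_local_id(...) ~ threadIdx
  });
}
----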
The thread blocks form a grid, and each thread block can identify its location
within the grid using its block ID. The grid is the concept used to index
thread blocks, and it can have one, two, or three dimensions. Each thread block
is tied to a single SM. Similar to a thread block's location within the grid,
each thread's position within its block can be identified with a one, two, or
three dimensional thread ID.

Pre-Volta GPU architectures break thread blocks into warps, which consist of 32
threads each. A warp is processed by the SM concurrently; on early
architectures, executing one warp instruction took four SM clock cycles. SMs
execute multiple warp instructions, and the warp instructions are prioritized
and scheduled to minimize overhead.

Volta and more recent GPU architectures use independent thread scheduling. In
addition, each thread can access memory within a unified virtual address space.
To make use of unified memory, threads must synchronize with other threads
using execution barriers, synchronization primitives and Cooperative Groups.

SYCL has a similar execution hierarchy for kernels. A kernel is broken down
into work-items, and each work-item concurrently executes an instance of the
kernel on a piece of memory. Work-items can be combined into work-groups that
have designated shared memory, and work-groups can synchronize their work-items
with work-group barriers.

There are several correspondences between the CUDA and SYCL execution models.
For example, CUDA's streaming multiprocessor corresponds to a SYCL compute
unit. CUDA's grid is similar to SYCL's nd_range, as it is the highest-level
grouping of threads short of the whole kernel; both nd_range and grid can
segment the groups of threads into one, two, or three dimensions. SYCL
sub-groups roughly map to the Cooperative Groups `thread_block_tile`, which
allows a work-group/thread block to be further subdivided into concurrent
threads. Likewise, thread blocks map directly to work-groups, and a single
thread is a SYCL work-item.

CUDA primarily synchronizes threads through two functions,
`cudaStreamSynchronize()` and `__syncthreads()`. `cudaStreamSynchronize()`
blocks the calling host thread until all work submitted to the stream has
completed, so it is similar to `queue.wait()`, buffer destruction, and other
host-device synchronization events within SYCL. `__syncthreads()` waits for all
threads within a thread block to reach the same point; it synchronizes the
threads within a thread block and is therefore analogous to the work-group
barrier.

CUDA's warp concept has no SYCL equivalent. If a user were to write warp-aware
code, it would be non-generic SYCL code specific to the CUDA backend.

CUDA allows for more detailed thread and memory management through Cooperative
Groups, which allow synchronizing at the grid level and organizing groups in
sizes smaller than a warp. Cooperative Groups do not have an equivalent within
SYCL 2020 and are not yet supported.
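As a concrete, non-normative sketch of the barrier correspondence described
above, the work-group barrier below plays the role that `__syncthreads()` plays
inside a CUDA thread block; the table that follows then summarizes the feature
mapping.

[source,c++]
----
#include <sycl/sycl.hpp>

// Non-normative sketch: staging data in work-group local memory and
// synchronizing with a work-group barrier, the SYCL analogue of
// __syncthreads(). Assumes n is a multiple of 256.
void stage_and_reverse(sycl::queue &q, const int *in, int *out, size_t n) {
  q.submit([&](sycl::handler &cgh) {
    sycl::local_accessor<int, 1> tile(sycl::range<1>(256), cgh);
    cgh.parallel_for(sycl::nd_range<1>{{n}, {256}},
                     [=](sycl::nd_item<1> item) {
      size_t lid = item.get_local_id(0);
      size_t gid = item.get_global_id(0);
      tile[lid] = in[gid];                   // stage into local memory
      sycl::group_barrier(item.get_group()); // ~ __syncthreads()
      // After the barrier, every work-item can safely read values staged
      // by other work-items in the same work-group.
      out[gid] = tile[255 - lid];
    });
  });
}
----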
[[table.cuda.CUDA_features_to_SYCL]]
.CUDA execution features with their corresponding SYCL features
[width="100%",options="header",cols="50%,50%"]
|====
| [code]#SYCL# | [code]#CUDA#
| [code]#Compute unit# | [code]#Streaming multiprocessor#
| [code]#nd_range# | [code]#grid#
| [code]#work-group# | [code]#Thread block#
| [code]#sub-group# | [code]#thread_block_tile#
| [code]#work-item# | [code]#Thread#
| [code]#queue.wait()# (host-device synchronization) | [code]#cudaStreamSynchronize()#
| [code]#work-group barrier# | [code]#__syncthreads()#
|====

[[sec::programming_interface]]
== Programming Interface

From ee341d3f225457cf23a1c1ebfdfbf13c3ab297e2 Mon Sep 17 00:00:00 2001
From: Aidan Belton
Date: Wed, 20 Oct 2021 13:16:31 +0000
Subject: [PATCH 05/45] cuda extension

---
 adoc/chapters/cuda_backend.adoc | 47 ++++++++++++++++++++++++++++++++-
 1 file changed, 46 insertions(+), 1 deletion(-)

diff --git a/adoc/chapters/cuda_backend.adoc b/adoc/chapters/cuda_backend.adoc
index fa27a349..d24edc99 100644
--- a/adoc/chapters/cuda_backend.adoc
+++ b/adoc/chapters/cuda_backend.adoc
@@ -18,7 +18,6 @@ The CUDA backend requires an installation of the CUDA SDK as well as one or
 more CUDA devices available in the system.

[[sec:cuda:mapping_of_sycl_programming_model]]
== Mapping of SYCL programming model

[[sec:non_core_features_and_extensions]]
== Non-core features and extensions

Additional CUDA features are available depending upon the device's compute
capability. SYCL can support these optional CUDA features with extensions.
Unlike OpenCL, CUDA needs to know whether an extension is available at compile
time; as a result there are no valid CUDA extensions which can be passed to
`has_extension`.

Because the extension must be known at compile time, CUDA extensions are best
implemented using feature test macros. The test macro format is
SYCL_EXT_<VENDORNAME>_<FEATURENAME>, which for CUDA extensions translates to
SYCL_EXT_NVIDIA_<FEATURENAME>. Similarly, the format for the naming of
extension classes and enumerations should be ext_<vendorname>_<featurename>,
which in this context becomes ext_NVIDIA_<featurename>. Given the necessity of
knowing the extension at compile time, the usage of extension macros should be
the primary method of determining whether or not the extension is available in
the SYCL implementation. A list of non-core CUDA features which have SYCL
support is below. Non-core CUDA features require a compute capability greater
than 5.0.

TODO: The table below shows a proposal for SYCL supported CUDA extensions.
The table should be developed with other members of the SYCL community.

[[table.extensionsupport]]
.SYCL support for CUDA 11.3 extensions
[width="100%",options="header",cols="35%,35%,15%,15%"]
|====
| SYCL Aspect | CUDA Extension | Core SYCL API | Required Compute Capability
| [code]#aspect::fp16# | [code]#16-bit floating point# | Yes | 5.3 or greater
| - | [code]#Tensor Cores# | No | 7 or greater
| - | [code]#Atomic floating-point operations# | No | 6 or greater
|====

=== Aspects
Aspects are used to query what features and attributes a device has. Some
aspects, such as `fp16`, are non-core CUDA features. Therefore, the runtime
must be able to determine which aspects CUDA devices have. This can be done by
querying `cudaDeviceProp::major` and `cudaDeviceProp::minor` to find out the
compute capability. The compute capability indicates which extensions are
available to the device, and therefore which aspects are available.
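A minimal, non-normative sketch of how an implementation might derive
`aspect::fp16` from these properties; the function name is hypothetical.

[source,c++]
----
#include <cuda_runtime.h>

// Hypothetical helper: derive support for aspect::fp16 from the compute
// capability (5.3 or greater, per the extension table above).
bool device_has_fp16_aspect(int cudaDeviceOrdinal) {
  cudaDeviceProp prop;
  cudaGetDeviceProperties(&prop, cudaDeviceOrdinal);
  return prop.major > 5 || (prop.major == 5 && prop.minor >= 3);
}
----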
[[sec:cuda:extension-fp16]]
=== Half precision floating-point

The half scalar data type [code]#half# and the half vector data types
[code]#half1#, [code]#half2#, [code]#half3#, [code]#half4#, [code]#half8# and
[code]#half16# must be available at compile-time. However, a kernel using these
types is only supported on devices that have [code]#aspect::fp16#, i.e. compute
capability 5.3 or greater.

[[sub:cuda:extensions]]
=== Extensions

From ece44e82342276f2683c62c0e6ff307204b84ba6 Mon Sep 17 00:00:00 2001
From: Jack Kirk
Date: Thu, 21 Oct 2021 09:20:35 +0000
Subject: [PATCH 06/45] Error handling

---
 adoc/chapters/cuda_backend.adoc | 40 +++++++++++++++++++++
 1 file changed, 40 insertions(+)

diff --git a/adoc/chapters/cuda_backend.adoc b/adoc/chapters/cuda_backend.adoc
index d24edc99..9cc8e1b2 100644
--- a/adoc/chapters/cuda_backend.adoc
+++ b/adoc/chapters/cuda_backend.adoc
@@ -496,4 +496,44 @@ supported on devices that have [code]#aspect::fp16#, i.e. compute capability

[[sub:cuda:error_handling]]
=== Error Handling

If there is a CUDA driver API error associated with a triggered exception, the
CUDA error code can be obtained via the free function
`CUresult sycl::cuda::get_error_code(sycl::exception&)`. In the case where
there is no CUDA error associated with the triggered exception, the returned
CUDA error code will be `CUDA_SUCCESS`.

Most of the SYCL error codes that form `sycl::errc` are specifically defined as
errors thrown during calls to the SYCL API or SYCL runtime. There are also some
cases of `sycl::errc` which cover errors thrown during the compilation or
execution of device code. It is suitable to map CUDA errors to such cases, so
that for an exception, "cuda_exception", created due to a CUDA error,
`cuda_exception.code()` returns a `std::error_code` corresponding to the
`sycl::errc` case that the CUDA error maps to, whilst
`sycl::cuda::get_error_code(cuda_exception)` returns the original CUDA error
code.

The relevant `sycl::errc` cases, and the CUDA errors that may be mapped to
them, are listed below.

==== build

`sycl::errc::build` is defined as:

_Error from an online compile or
link operation when compiling,
linking, or building a kernel bundle for a device._

which may be mapped from `CUDA_ERROR_NO_BINARY_FOR_GPU`,
`CUDA_ERROR_JIT_COMPILER_NOT_FOUND`, `CUDA_ERROR_INVALID_PTX`,
`CUDA_ERROR_UNSUPPORTED_PTX_VERSION`, `CUDA_ERROR_SHARED_OBJECT_INIT_FAILED`
and `CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND`.


==== memory_allocation

`sycl::errc::memory_allocation` is defined as:

_Error on memory allocation on the
SYCL device for a SYCL kernel._

which may be mapped from `CUDA_ERROR_OUT_OF_MEMORY`.

==== kernel_argument

`sycl::errc::kernel_argument` is defined as:

_The application has passed an invalid argument to a SYCL kernel
function. This includes captured
variables if the SYCL kernel function is a lambda function._

which may be mapped from `CUDA_ERROR_NOT_FOUND`.
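The following non-normative sketch shows the intended flow: catch a SYCL
exception, inspect the portable error code, then retrieve the underlying CUDA
driver API error via the free function introduced above. The queue `q` and the
submitted work are placeholders.

[source,c++]
----
#include <cuda.h>
#include <sycl/sycl.hpp>

// Non-normative sketch of the error-handling flow described above.
void run_and_report(sycl::queue &q) {
  try {
    // ... submit work to q ...
    q.wait_and_throw();
  } catch (sycl::exception &e) {
    if (e.code() == sycl::errc::memory_allocation) {
      CUresult cuErr = sycl::cuda::get_error_code(e);
      // Here cuErr would be CUDA_ERROR_OUT_OF_MEMORY, or CUDA_SUCCESS if
      // no CUDA error was associated with this exception.
      (void)cuErr;
    }
  }
}
----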
+ // %%%%%%%%%%%%%%%%%%%%%%%%%%%% end cuda_backend %%%%%%%%%%%%%%%%%%%%%%%%%%%% From a0c56440459d810ba4ce438a6315e023687c3377 Mon Sep 17 00:00:00 2001 From: Jack Kirk Date: Thu, 21 Oct 2021 09:22:18 +0000 Subject: [PATCH 07/45] Device and Platform --- adoc/chapters/cuda_backend.adoc | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/adoc/chapters/cuda_backend.adoc b/adoc/chapters/cuda_backend.adoc index 9cc8e1b2..5251b97c 100644 --- a/adoc/chapters/cuda_backend.adoc +++ b/adoc/chapters/cuda_backend.adoc @@ -29,9 +29,8 @@ have a few differences in terminology and architecture. [[sub:cuda:platform_model]] === Platform Model -TODO: Platform - -TODO: Device +All CUDA enabled devices which can be executed on are represented by a single `CUdevice`. A SYCL device maps to a single CUDA device. +As CUDA does not split into separate platforms there is no 'platform' concept in CUDA corresponding to the SYCL platform. Instead, a SYCL platform maps to a collection of CUDA devices represented by `std::vector`. A SYCL <> simply maps to one, or multiple CUDA contexts. Indeed while a CUDA context is tied to a single device, this is not the case for a SYCL From c7d304fa900db382df62535863be7f7e68efe8d7 Mon Sep 17 00:00:00 2001 From: Nicolas Miller Date: Fri, 22 Oct 2021 11:52:46 +0000 Subject: [PATCH 08/45] Add API interoperability section --- adoc/chapters/cuda_backend.adoc | 135 ++++++++------------------------ 1 file changed, 34 insertions(+), 101 deletions(-) diff --git a/adoc/chapters/cuda_backend.adoc b/adoc/chapters/cuda_backend.adoc index 5251b97c..8a5c3c85 100644 --- a/adoc/chapters/cuda_backend.adoc +++ b/adoc/chapters/cuda_backend.adoc @@ -268,40 +268,24 @@ not yet supported. [[sub:cuda:application_interoperability]] === Application Interoperability +This section describes the API level interoperability between SYCL and CUDA. + +The CUDA backend supports API interoperability for `platform`, `device`, +`context`, `queue`, `event` and `buffer`. Interoperability for `kernel`, +`kernel_bundle`, `device_image`, `sampled_image` and `unsampled_image` is not +supported. + [[table.cuda.appinterop.nativeobjects]] .Types of native backend objects application interoperability [width="100%",options="header",cols="20%,20%,20%,40%"] |==== -| [code]#SyclType# | [code]#backend_input_t# | [code]#backend_return_t# | Description -| [code]#buffer# | | | -| [code]#context# | | | -| [code]#device# | | | -| [code]#device_image# | | | -| [code]#event# | | | -| [code]#kernel# | | | -| [code]#kernel_bundle# | | | -| [code]#platform# | | | -| [code]#queue# | | | -| [code]#sampled_image# | | | -| [code]#unsampled_image# | | | -|==== - -[[table.cuda.appinterop.ownership]] -.Ownership behavior of native backend objects. -[width="100%",options="header",cols="40%,60%"] -|==== -| SYCL Object | Destructor behaviour -| [code]#buffer# | -| [code]#context# | -| [code]#device# | -| [code]#device_image# | -| [code]#event# | -| [code]#kernel# | -| [code]#kernel_bundle# | -| [code]#platform# | -| [code]#queue# | -| [code]#sampled_image# | -| [code]#unsampled_image# | +| [code]#SyclType# | [code]#backend_input_t# | [code]#backend_return_t# | Description +| [code]#platform# | `std::vector` | `std::vector` | A SYCL platform encapsulates a list of CUDA devices. +| [code]#device# | `CUdevice` | `CUdevice` | A SYCL device encapsulates a CUDA device. 
| [code]#context# | `CUcontext` | `std::vector<CUcontext>` | A SYCL context can encapsulate multiple CUDA contexts; however, it is not possible to create a SYCL context from multiple CUDA contexts.
| [code]#queue# | `CUstream` | `CUstream` | A SYCL queue encapsulates a CUDA stream.
| [code]#event# | `CUevent` | `CUevent` | A SYCL event encapsulates a CUDA event.
| [code]#buffer# | `struct { CUdeviceptr ptr; size_t size; }` | `CUdeviceptr` | A SYCL buffer encapsulates a CUDA device pointer.
|====

[[table.cuda.appinterop.make_interop_APIs]]
.[code]#make_*# Interoperability APIs for native backend objects.
[width="100%",options="header",cols="40%,60%"]
|====
| CUDA interoperability function | Description
| [code]#template <backend Backend> +
platform +
make_platform(const backend_input_t<Backend, platform> &backendObject);#
 | Create a SYCL `platform` from a list of CUDA devices; the list must contain at least one CUDA device.

| [code]#template <backend Backend> +
device +
make_device(const backend_input_t<Backend, device> &backendObject);#
 | Create a SYCL `device` from a CUDA device.

| [code]#template <backend Backend> +
context +
make_context(const backend_input_t<Backend, context> &backendObject,
             const async_handler asyncHandler = {});#
 | Create a SYCL `context` from a CUDA context.

| [code]#template <backend Backend> +
queue +
make_queue(const backend_input_t<Backend, queue> &backendObject,
           const context &targetContext,
           const async_handler asyncHandler = {});#
 | Create a SYCL `queue` from a CUDA stream. The provided `targetContext` must encapsulate the same CUDA context as the provided CUDA stream.

| [code]#template <backend Backend> +
event +
make_event(const backend_input_t<Backend, event> &backendObject,
           const context &targetContext);#
 | Create a SYCL `event` from a CUDA event.

| [code]#template <backend Backend, typename T, int dimensions = 1, +
          typename AllocatorT = buffer_allocator<std::remove_const_t<T>>> +
buffer<T, dimensions, AllocatorT> +
make_buffer(const backend_input_t<Backend, buffer<T, dimensions, AllocatorT>>
                &backendObject,
            const context &targetContext, event availableEvent);#
 | Create a SYCL `buffer` from a CUDA device pointer. The CUDA pointer must be within the provided `targetContext`. The `availableEvent` parameter can be used for synchronization and indicates when the CUDA pointer is ready to be used. Only `dimensions == 1` is supported.

| [code]#template <backend Backend, typename T, int dimensions = 1, +
          typename AllocatorT = buffer_allocator<std::remove_const_t<T>>> +
buffer<T, dimensions, AllocatorT> +
make_buffer(const backend_input_t<Backend, buffer<T, dimensions, AllocatorT>>
                &backendObject,
            const context &targetContext);#
 | Create a SYCL `buffer` from a CUDA device pointer. The CUDA pointer must be within the provided `targetContext`. Only `dimensions == 1` is supported.

|====

==== Ownership of native backend objects

The CUDA backend retains ownership of all native CUDA objects obtained through
the interoperability API; therefore, the associated SYCL objects must be kept
alive for the duration of any CUDA work using these native CUDA objects.

When creating a SYCL object from a native CUDA object, SYCL does not take
ownership of the native object, and it is up to the application to dispose of
it when appropriate.

[[sub:cuda:kernel_function_interoperability]]
=== Kernel Function Interoperability

From e094cace32945b661f05b8d0a38eb608f0f6daea Mon Sep 17 00:00:00 2001
From: Gordon
Date: Fri, 22 Oct 2021 14:31:59 +0000
Subject: [PATCH 09/45] Add CUDA backend specification kernel function interop
 definitions

---
 adoc/chapters/cuda_backend.adoc | 51 ++++++++++++++++++++++++---------
 1 file changed, 38 insertions(+), 13 deletions(-)

diff --git a/adoc/chapters/cuda_backend.adoc b/adoc/chapters/cuda_backend.adoc
--- a/adoc/chapters/cuda_backend.adoc
+++ b/adoc/chapters/cuda_backend.adoc

This section describes the kernel function interoperability for the CUDA
backend.

The CUDA backend supports kernel function interoperability for the `accessor`,
`local_accessor`, `sampled_image_accessor`, `unsampled_image_accessor` and
`stream` classes.

The CUDA backend does not support interoperability for the `device_event`
class, as there is no equivalent in CUDA.

Address spaces in CUDA are associated with variable declarations rather than
with types, so when a pointer is passed as a parameter to a function, the
parameter type does not need to be decorated with an address space; instead it
is simply a raw, undecorated pointer. For this reason the `accessor`,
`local_accessor` and `stream` classes map to a raw, undecorated pointer, which
can be implemented using the generic address space.

Other kernel function types in CUDA are represented by aliases provided in the
`sycl::cuda` namespace. These are provided for the `sampled_image_accessor` and
`unsampled_image_accessor` classes: `sycl::cuda::texture` and
`sycl::cuda::surface` respectively.

Below is a table of the `backend_input_t` and `backend_return_t`
specializations for the SYCL classes which support kernel function
interoperability.
[[table.cuda.kernelinterop.nativeobjects]]
.Types of native backend objects for kernel function interoperability
[width="100%",options="header",cols="20%,20%,20%,40%"]
|====
| [code]#SyclType# | [code]#backend_input_t# | [code]#backend_return_t# | Description
| [code]#accessor# | T * | T * | Convert a SYCL `accessor` to an undecorated raw pointer.
| [code]#accessor# | T * | T * | Convert a SYCL `accessor` to an undecorated raw pointer.
| [code]#accessor# | T * | T * | Convert a SYCL `accessor` to an undecorated raw pointer.
| [code]#local_accessor# | T * | T * | Convert a SYCL `local_accessor` to an undecorated raw pointer.
| [code]#sampled_image_accessor# | sycl::cuda::texture | sycl::cuda::texture | Convert a SYCL `sampled_image_accessor` to the `sycl::cuda::texture` interoperability type with the same type and dimensions.
| [code]#sampled_image_accessor# | sycl::cuda::texture | sycl::cuda::texture | Convert a SYCL `sampled_image_accessor` to the `sycl::cuda::texture` interoperability type with the same type and dimensions.
| [code]#sampled_image_accessor# | sycl::cuda::texture | sycl::cuda::texture | Convert a SYCL `sampled_image_accessor` to the `sycl::cuda::texture` interoperability type with the same type and dimensions.
| [code]#unsampled_image_accessor# | sycl::cuda::surface | sycl::cuda::surface | Convert a SYCL `unsampled_image_accessor` to the `sycl::cuda::surface` interoperability type with the same type and dimensions.
| [code]#unsampled_image_accessor# | sycl::cuda::surface | sycl::cuda::surface | Convert a SYCL `unsampled_image_accessor` to the `sycl::cuda::surface` interoperability type with the same type and dimensions.
| [code]#unsampled_image_accessor# | sycl::cuda::surface | sycl::cuda::surface | Convert a SYCL `unsampled_image_accessor` to the `sycl::cuda::surface` interoperability type with the same type and dimensions.
| [code]#stream# | signed char * | signed char * | Convert a SYCL `stream` to an undecorated raw signed char pointer.
|====

From 6976a1aa6f7a084a3d9c0d29f537d0fe4aedfd8d Mon Sep 17 00:00:00 2001
From: Nicolas Miller
Date: Tue, 11 Jan 2022 11:37:20 +0000
Subject: [PATCH 10/45] Fix editing typo and improve make_device docs

---
 adoc/chapters/cuda_backend.adoc | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/adoc/chapters/cuda_backend.adoc b/adoc/chapters/cuda_backend.adoc
index 053b43df..6a0d88a4 100644
--- a/adoc/chapters/cuda_backend.adoc
+++ b/adoc/chapters/cuda_backend.adoc
@@ -301,7 +301,7 @@ make_platform(const backend_input_t<Backend, platform> &backendObject);#
 | [code]#template <backend Backend> +
 device +
 make_device(const backend_input_t<Backend, device> &backendObject);#
  | Construct a SYCL `device` from a CUDA device. The SYCL execution environment for the CUDA backend contains a fixed number of devices that are enumerated via `sycl::device::get_devices()`; calling this function does not create a new device. Rather, it merely creates a `sycl::device` object that is a copy of one of the devices from that enumeration.
| [code]#template + context + @@ -345,7 +345,6 @@ make_buffer(const backend_input_t> The CUDA backend retains ownership of all native CUDA objects obtained through the interoperability API, therefore associated SYCL objects must be kept alive for the duration of the CUDA work using these native CUDA objects. -dd CUDA backend specification kernel function interop definitions When creating a SYCL object from a native CUDA object SYCL does not take ownership of the object and it is up to the application to dispose of them when From e585417fba5a32b2e69066c9725ee18fbacb2b61 Mon Sep 17 00:00:00 2001 From: Hugh Delaney Date: Tue, 12 Oct 2021 14:06:12 +0000 Subject: [PATCH 11/45] Added initial chapter skeleton for CUDA backend specification --- adoc/chapters/cuda_backend.adoc | 209 ++++++++++++++++++++++++++++++++ adoc/syclbase.adoc | 3 + 2 files changed, 212 insertions(+) create mode 100644 adoc/chapters/cuda_backend.adoc diff --git a/adoc/chapters/cuda_backend.adoc b/adoc/chapters/cuda_backend.adoc new file mode 100644 index 00000000..9f26cf09 --- /dev/null +++ b/adoc/chapters/cuda_backend.adoc @@ -0,0 +1,209 @@ +// %%%%%%%%%%%%%%%%%%%%%%%%%%%% begin cuda_backend %%%%%%%%%%%%%%%%%%%%%%%%%%%% + +[appendix] +[[chapter:cuda-backend]] += CUDA backend specification + +[[sec:cuda:introduction]] +== Introduction + +[[sec:cuda:mapping_of_sycl_programming_model]] +== Mapping of SYCL programming model + +[[sub:cuda:platform_model]] +=== Platform Model + +[[sub:cuda:memory_model]] +=== Memory model + +[[sub:cuda:execution_model]] +=== Execution Model + +[[sec::programming_interface]] +== Programming Interface + +[[sub:cuda:application_interoperability]] +=== Application Interoperability + +[[table.cuda.appinterop.nativeobjects]] +.Types of native backend objects application interoperability +[width="100%",options="header",cols="20%,20%,20%,40%"] +|==== +| [code]#SyclType# | [code]#backend_input_t# | [code]#backend_return_t# | Description +| [code]#buffer# | | | +| [code]#context# | | | +| [code]#device# | | | +| [code]#device_image# | | | +| [code]#event# | | | +| [code]#kernel# | | | +| [code]#kernel_bundle# | | | +| [code]#platform# | | | +| [code]#queue# | | | +| [code]#sampled_image# | | | +| [code]#unsampled_image# | | | +|==== + +[[table.cuda.appinterop.ownership]] +.Ownership behavior of native backend objects. +[width="100%",options="header",cols="40%,60%"] +|==== +| SYCL Object | Destructor behaviour +| [code]#buffer# | +| [code]#context# | +| [code]#device# | +| [code]#device_image# | +| [code]#event# | +| [code]#kernel# | +| [code]#kernel_bundle# | +| [code]#platform# | +| [code]#queue# | +| [code]#sampled_image# | +| [code]#unsampled_image# | +|==== + +[[table.cuda.appinterop.make_interop_APIs]] +.[code]#make_*# Interoperability APIs for native backend objects. 
+[width="100%",options="header",cols="40%,60%"] +|==== +| CUDA interoperability function | Description +| [code]#template + +platform + +make_platform(const backend_input_t &backendObject);# + | + +| [code]#template + +device + +make_device(const backend_input_t &backendObject);# + | + +| [code]#template + +context + +make_context(const backend_input_t &backendObject, + const async_handler asyncHandler = {});# + | + +| [code]#template + +queue + +make_queue(const backend_input_t &backendObject, + const context &targetContext, + const async_handler asyncHandler = {});# + | + +| [code]#template + +event + +make_event(const backend_input_t &backendObject, + const context &targetContext);# + | + +| [code]#template >> + +buffer + +make_buffer(const backend_input_t> + &backendObject, + const context &targetContext, event availableEvent);# + | + +| [code]#template >> + +buffer + +make_buffer(const backend_input_t> + &backendObject, + const context &targetContext);# + | + +| [code]#template + +sampled_image + +make_sampled_image( + const backend_input_t> + &backendObject, + const context &targetContext, image_sampler imageSampler, + event availableEvent);# + | + +| [code]#template + +sampled_image + +make_sampled_image( + const backend_input_t> + &backendObject, + const context &targetContext, image_sampler imageSampler);# + | + +| [code]#template + +unsampled_image + +make_unsampled_image( + const backend_input_t> + &backendObject, + const context &targetContext, event availableEvent);# + | + +| [code]#template + +unsampled_image + +make_unsampled_image( + const backend_input_t> + &backendObject, + const context &targetContext);# + | + +| [code]#template + +kernel_bundle + +make_kernel_bundle( + const backend_input_t> &backendObject, + const context &targetContext);# + | + +| [code]#template + +kernel + +make_kernel(const backend_input_t &backendObject, + const context &targetContext);# + | +|==== + +[[table.cuda.appinterop.make_interop_APIs]] +.[code]#get_native# Interoperability APIs for native backend objects. 
+[width="100%",options="header",cols="40%,60%"] +|==== +| CUDA interoperability function | Description +| [code]#template + +backend_return_t + +get_native(const T &syclObject);# + | +|==== + + +[[sub:cuda:kernel_function_interoperability]] +=== Kernel Function Interoperability + +[[table.cuda.appinterop.nativeobjects]] +.Types of native backend objects kernel function interoperability +[width="100%",options="header",cols="20%,20%,20%,40%"] +|==== +| [code]#SyclType# | [code]#backend_input_t# | [code]#backend_return_t# | Description +| [code]#accessor# | | | +| [code]#accessor# | | | +| [code]#accessor# | | | +| [code]#local_accessor# | | | +| [code]#sampled_image_accessor# | | | +| [code]#sampled_image_accessor# | | | +| [code]#sampled_image_accessor# | | | +| [code]#unsampled_image_accessor# | | | +| [code]#unsampled_image_accessor# | | | +| [code]#unsampled_image_accessor# | | | +| [code]#stream# | | | +| [code]#device_event# | | | +|==== + + +[[sec:non_core_features_and_extensions]] +== Non-core features and extensions + +[[sub:cuda:extensions]] +=== Extensions + +[[sub:cuda:error_handling]] +=== Error Handling + +// %%%%%%%%%%%%%%%%%%%%%%%%%%%% end cuda_backend %%%%%%%%%%%%%%%%%%%%%%%%%%%% diff --git a/adoc/syclbase.adoc b/adoc/syclbase.adoc index 56caaa62..6c5c027b 100644 --- a/adoc/syclbase.adoc +++ b/adoc/syclbase.adoc @@ -129,6 +129,9 @@ include::chapters/feature_sets.adoc[] // \input{opencl_backend} include::chapters/opencl_backend.adoc[] +// \input{cuda_backend} +include::chapters/cuda_backend.adoc[] + // \input{what_changed} include::chapters/what_changed.adoc[] From 007b6592368c495f6911ab317199f6d126190658 Mon Sep 17 00:00:00 2001 From: Nicolas Miller Date: Fri, 15 Oct 2021 14:08:34 +0000 Subject: [PATCH 12/45] Add introduction and mapping of context and queue for CUDA backend specification --- adoc/chapters/cuda_backend.adoc | 36 +++++++++++++++++++++++++++++++-- 1 file changed, 34 insertions(+), 2 deletions(-) diff --git a/adoc/chapters/cuda_backend.adoc b/adoc/chapters/cuda_backend.adoc index 9f26cf09..b7f37245 100644 --- a/adoc/chapters/cuda_backend.adoc +++ b/adoc/chapters/cuda_backend.adoc @@ -4,15 +4,47 @@ [[chapter:cuda-backend]] = CUDA backend specification -[[sec:cuda:introduction]] -== Introduction +This chapter describes how the SYCL general programming model is mapped on top +of CUDA, and how the SYCL generic interoperability interface must be +implemented by vendors providing SYCL for CUDA implementations to ensure SYCL +applications written for the CUDA backend are interoperable. + +The CUDA backend is enabled using the `sycl::backend::cuda` value of `enum +class backend`. That means that when the CUDA backend is active, the value of +`sycl::is_backend_active::value` will be `true`, and the +preprocessor macro `SYCL_BACKEND_CUDA` will be defined. + +The CUDA backend requires an installation of CUDA SDK as well as one or more +CUDA devices available in the system. [[sec:cuda:mapping_of_sycl_programming_model]] == Mapping of SYCL programming model +This section gives a general overview of how the SYCL programming model maps to +CUDA. These two programming models are pretty similar in essence however they do +have a few differences in terminology and architecture. + [[sub:cuda:platform_model]] === Platform Model +TODO: Platform + +TODO: Device + +A SYCL <> simply maps to one, or multiple CUDA contexts. 
Indeed while +a CUDA context is tied to a single device, this is not the case for a SYCL +<> and the CUDA backend implementation may use multiple CUDA contexts +to emulate a SYCL <> containing multiple devices. Additionally, while +SYCL contexts are simple objects passed around either implicitly or explicitly, +CUDA contexts require to be activated on the current thread to be used by other +CUDA entry points. Therefore any use of the SYCL APIs with a CUDA backend may +modify the current active context on the thread, and no guarantee is provided +that any existing active CUDA context would be restored by SYCL. + +A SYCL <> simply maps to one or multiple CUDA streams. Indeed while a +CUDA stream is in-order, a SYCL <> isn't, so a CUDA backend implementation +may use multiple CUDA streams to implement an out of order SYCL <>. + [[sub:cuda:memory_model]] === Memory model From f09d7a440ac703bab8de635963c18241dfd48497 Mon Sep 17 00:00:00 2001 From: Tadej Ciglaric Date: Mon, 18 Oct 2021 15:55:05 +0000 Subject: [PATCH 13/45] Add memory model section to CUDA backend specification --- adoc/chapters/cuda_backend.adoc | 122 ++++++++++++++++++++++++++++++++ 1 file changed, 122 insertions(+) diff --git a/adoc/chapters/cuda_backend.adoc b/adoc/chapters/cuda_backend.adoc index b7f37245..64f5ce1d 100644 --- a/adoc/chapters/cuda_backend.adoc +++ b/adoc/chapters/cuda_backend.adoc @@ -48,6 +48,128 @@ may use multiple CUDA streams to implement an out of order SYCL <>. [[sub:cuda:memory_model]] === Memory model +==== Memory Allocations + +When non-host accessors to buffers are created without [code]#target::host_buffer# they need to allocate memory for their contents on the device. For example using [code]#cudaMalloc3D()#, [code]#cudaMallocPitch()# or [code]#cudaMalloc()#. + +When accessors to images are created without [code]#target::host_buffer# they allocate memory, for example using [code]#cudaMalloc3DArray()# or [code]#cudaMallocArray()#. + +When non-host accessors are created with [code]#target::host_buffer# they can, for example use [code]#cudaHostAlloc()# to allocate pinned memory on host. + +Table <> specifies which underlying CUDA functions can be used for USM allocations. For shared USM allocations this would mean memory is managed (moved between host and different devices) by CUDA runtime. Alternatively shared USM allocations can be managed by SYCL runtime, using non-managed CUDA allocation on device when needed, such as [code]#cudaMalloc()#. + +[[table.cuda.memmodel.USM]] +.Cuda functions that could be used to allocate SYCL USM allocations +[width="100%",options="header",cols="50%,50%"] +|==== +| SYCL USM type | CUDA function +| device | [code]#cudaMalloc()# +| host | [code]#cudaHostAlloc()# +| shared | [code]#cudaMallocManaged()# +|==== + +==== Samplers + +In both SYCL and CUDA samplers consist of addressing mode, filtering mode and coordinate normalization mode. Mapping between SYCL and CUDA values is defined in tables <>, <> and <>. In CUDA addressing modes for all dimesnions will be the same, as CUDA allows different addressing modes for different dimesnions, while SYCL does not. 
+ +[[table.cuda.memmodel.sampler_addressing]] +.Mapping of SYCL sampler addressing modes to CUDA +[width="100%",options="header",cols="50%,50%"] +|==== +| SYCL sampler addressing mode | CUDA sampler addressing mode +| [code]#sycl::addressing_mode::mirrored_repeat# | [code]#cudaAddressModeMirror# +| [code]#sycl::addressing_mode::repeat# | [code]#cudaAddressModeWrap# +| [code]#sycl::addressing_mode::clamp_to_edge# | [code]#cudaAddressModeClamp# +| [code]#sycl::addressing_mode::clamp# | [code]#cudaAddressModeClamp# +| [code]#sycl::addressing_mode::none# | [code]#cudaAddressModeBorder# +|==== + +SYCL allows [code]#sycl::addressing_mode::mirrored_repeat# and [code]#sycl::addressing_mode::repeat# to be used together with unnormalized coordinates. In this case the resulting coordinates are undefined. CUDA does not allow this, so if [code]#sycl::addressing_mode::mirrored_repeat# or [code]#sycl::addressing_mode::repeat# is specified together with unnormalized coordinates, [code]#cudaAddressModeBorder# is used instead. + +[[table.cuda.memmodel.sampler_filtering]] +.Mapping of SYCL sampler filtering modes to CUDA +[width="100%",options="header",cols="50%,50%"] +|==== +| SYCL sampler filtering mode | CUDA sampler filtering mode +| [code]#sycl::filtering_mode::nearest# | [code]#cudaFilterModePoint# +| [code]#sycl::filtering_mode::linear# | [code]#cudaFilterModeLinear# +|==== + +[[table.cuda.memmodel.sampler_normalization]] +.Mapping of SYCL sampler coordinate normalization modes to CUDA +[width="100%",options="header",cols="50%,50%"] +|==== +| SYCL sampler coordinate normalization mode | CUDA sampler coordinate normalization mode +| [code]#sycl::coordinate_normalization_mode::normalized# | [code]#normalizedCoords = true# +| [code]#sycl::coordinate_normalization_mode::unnormalized# | [code]#normalizedCoords = false# +|==== + +==== Address Spaces + +Table <> maps SYCL address spaces to CUDA address spaces. + +[[table.cuda.memmodel.address_spaces]] +.Mapping from SYCL address spaces to CUDA address spaces +[width="100%",options="header",cols="50%,50%"] +|==== +| SYCL Address Space | CUDA Address Space +| Global memory | global +| Local memory | shared +| Private memory | registers or local +| Generic memory | generic +| Constant memory | const +|==== + +==== Atomics + +Not all CUDA devices support all memory orders. If a particular memory order is unsupported by a CUDA device, it can be unsupported in the SYCL CUDA backend for that device. Sequentially consistent atomics are currently not supported on any device, so the SYCL CUDA backend is not required to implement them. The mappings of other memory orders (when supported by the device) is defined in table <>. + +[[table.cuda.memmodel.memory_orders]] +.Mapping from [code]#sycl::memory_order# to PTX ISA memory orders +[width="100%",options="header",cols="50%,50%"] +|==== +| [code]#sycl::memory_order# | PTX ISA Memory Order +| [code]#memory_order::relaxed# | relaxed +| [code]#memory_order::acquire# | acquire +| [code]#memory_order::release# | release +| [code]#memory_order::acq_rel# | acq_rel +| [code]#memory_order::seq_cst# | undefined +|==== + +Mapping of memory scopes (when supported by the device) is defined in table [table.cuda.memmodel.memory_scopes]. [code]#memory_scope::work_item# does not require any consistency between different work items, so it can be mapped to non-atomic operation. 
+ +[[table.cuda.memmodel.memory_scopes]] +.Mapping from [code]#sycl::memory_scope# to PTX ISA memory scopes +[width="100%",options="header",cols="50%,50%"] +|==== +| [code]#sycl::memory_scope# | PTX ISA Memory Scope +| [code]#memory_scope::work_item# | +| [code]#memory_scope::sub_group# | cta +| [code]#memory_scope::work_group# | cta +| [code]#memory_scope::device# | gpu +| [code]#memory_scope::system# | system +|==== + +==== Fences + +If a device supports the [code]#fence# PTX instruction the mapping of memory orders is defined in <>. Otherwise all memory orders (except relaxed) are mapped to the [code]#membar# instruction. + +[[table.cuda.memmodel.fence_memory_orders]] +.Mapping from [code]#sycl::memory_order# to PTX ISA memory orders when used in fences +[width="100%",options="header",cols="50%,50%"] +|==== +| [code]#sycl::memory_order# | PTX ISA Memory Order +| [code]#memory_order::relaxed# | none +| [code]#memory_order::acquire# | acq_rel +| [code]#memory_order::release# | acq_rel +| [code]#memory_order::acq_rel# | acq_rel +| [code]#memory_order::seq_cst# | sc +|==== + +If future versions of PTX ISA define fence instructions with only acquire or only release memory order, these can be used as well for [code]#memory_order::acquire# and [code]#memory_order::release# on devices that support them. + +Mapping of SYCL memory scopes to PTX ISA is the same as for atomics. It is defined in <>. + [[sub:cuda:execution_model]] === Execution Model From 6301215581494becf2ccc56a77deb83af8d528ef Mon Sep 17 00:00:00 2001 From: Aidan Belton Date: Tue, 19 Oct 2021 23:32:51 +0000 Subject: [PATCH 14/45] Add cuda execution model --- adoc/chapters/cuda_backend.adoc | 91 +++++++++++++++++++++++++++++++++ 1 file changed, 91 insertions(+) diff --git a/adoc/chapters/cuda_backend.adoc b/adoc/chapters/cuda_backend.adoc index 64f5ce1d..fa27a349 100644 --- a/adoc/chapters/cuda_backend.adoc +++ b/adoc/chapters/cuda_backend.adoc @@ -16,6 +16,9 @@ preprocessor macro `SYCL_BACKEND_CUDA` will be defined. The CUDA backend requires an installation of CUDA SDK as well as one or more CUDA devices available in the system. +[[sec:cuda:introduction]] +== Introduction +>>>>>>> cuda execution model [[sec:cuda:mapping_of_sycl_programming_model]] == Mapping of SYCL programming model @@ -173,6 +176,94 @@ Mapping of SYCL memory scopes to PTX ISA is the same as for atomics. It is defin [[sub:cuda:execution_model]] === Execution Model +CUDA's execution model is similar to SYCL's. CUDA uses kernels to +offload computation, splitting the host and GPU into asynchronous +computing devices. In general, except for CUDA's dynamic +parallelism extensions, kernels are called by the host. One +difference between CUDA and SYCL execution models is that CUDA +uses Single Instruction Multiple Thread (SIMT) while SYCL uses +Single Instruction Multiple Data (SIMD) kernels. SIMT kernels use +multiple scalar instructions acting on non-contiguous data. SIMD +kernels use vector instructions acting on contiguous data. SIMT +can be used in place of SIMD but not the other way around, as SIMD +requires memory blocks to have no interruptions within the data, +while SIMT does not have this as a requirement. + +CUDA GPUs are constructed out of streaming multiprocessors (SM) +which perform the actual computation. Each SM consists of 8 scalar +cores, shared memory, registers, a load/store unit, and a scheduler +unit. CUDA uses a hierarchy of threads to organize the execution of +kernels. Kernels are split up into thread blocks. 
The thread blocks
form a grid; each thread can identify the location of its thread block
within the grid using a block ID. The grid is the concept used to index
thread blocks, and it can have one, two, or three dimensions. Each thread
block is tied to a single SM. Similar to a thread block's location within
the grid, each thread's position within the block can be identified with
a one, two, or three dimensional thread ID.

Pre-Volta GPU architectures break thread blocks into warps, which
consist of 32 threads. A warp is processed by the SM concurrently;
executing one warp instruction requires 4 SM clock cycles.
SMs execute multiple warp instructions, and the warp instructions are
prioritized and scheduled to minimize overhead.

Volta and more recent GPU architectures use independent thread
scheduling. In addition, each thread can access memory within a
unified virtual address space. Threads must synchronize with other
threads using execution barriers, synchronization primitives and
Cooperative Groups to utilize unified memory.

SYCL has a similar execution hierarchy consisting of kernels.
The kernel is broken down into work-items. Each work-item concurrently
executes an instance of the kernel on a piece of memory. Work-items
can be combined into work-groups that have designated shared memory.
Work-groups can synchronize their work-items with work-group barriers.

There are some equivalences between CUDA and SYCL execution models.
For example, a CUDA streaming multiprocessor corresponds to a SYCL compute
unit. CUDA's grid is similar to SYCL's nd_range, as it is the highest
level grouping of threads, not including the whole kernel. Both
nd_range and grid can segment the groups of threads into one, two, or
three dimensions. SYCL sub-groups roughly map to the
Cooperative Groups `thread_block_tile`, as both allow the
work-group/thread block to be further subdivided into concurrent threads.
Likewise, thread blocks map directly to work-groups, and a
single thread is a SYCL work-item.

CUDA primarily synchronizes threads through two functions,
`cudaStreamSynchronize()` and `__syncthreads()`.
`cudaStreamSynchronize()` blocks the host until all work submitted to the
stream has completed. `__syncthreads()` waits for
all threads within a thread block to reach the same point. So
`cudaStreamSynchronize()` is similar to `queue.wait()`, buffer
destruction, and other host-device synchronization events within SYCL.
`__syncthreads()` synchronizes the threads within a thread block, which
is analogous to the work-group barrier.

CUDA's warp concept has no direct SYCL equivalent. If a user were to write
warp-aware code it would be non-generic SYCL code, specific to the
CUDA backend.

CUDA allows for more detailed thread and memory management through
Cooperative Groups. Cooperative Groups allow for synchronizing at the
grid level and organizing subgroups in sizes smaller than a warp.
Cooperative Groups do not have an equivalent within SYCL 2020 and are
not yet supported.
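
To make the synchronization correspondence above concrete, the following minimal, non-normative SYCL sketch marks where the CUDA analogues of `__syncthreads()` and `cudaStreamSynchronize()` would appear. The buffer contents and sizes are illustrative only:

[source,c++]
----
#include <sycl/sycl.hpp>

int main() {
  sycl::queue q;
  sycl::buffer<int, 1> buf{sycl::range<1>{128}};

  q.submit([&](sycl::handler &cgh) {
    sycl::accessor acc{buf, cgh, sycl::write_only};
    cgh.parallel_for(
        sycl::nd_range<1>{sycl::range<1>{128}, sycl::range<1>{32}},
        [=](sycl::nd_item<1> it) {
          acc[it.get_global_id(0)] = static_cast<int>(it.get_local_id(0));
          // Work-group barrier: plays the role of __syncthreads() in CUDA.
          sycl::group_barrier(it.get_group());
        });
  });

  // Host-device synchronization: plays the role of cudaStreamSynchronize().
  q.wait();
}
----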
+

[[table.cuda.CUDA_features_to_SYCL]]
.CUDA execution features with their corresponding SYCL features
[width="100%",options="header",cols="50%,50%"]
|====
| [code]#SYCL# | [code]#CUDA#
| [code]#Compute unit# | [code]#Streaming multiprocessor#
| [code]#nd_range# | [code]#grid#
| [code]#work-group# | [code]#Thread block#
| [code]#sub-group# | [code]#thread_block_tile#
| [code]#work-item# | [code]#Thread#
| [code]#queue.wait()# (host-device synchronization) | [code]#cudaStreamSynchronize#
| [code]#work-group barrier# | [code]#__syncthreads#
|====

 [[sec::programming_interface]]
 == Programming Interface

From f22dac8ffa5c22f4b44ba10f5fb6e1e6d568233d Mon Sep 17 00:00:00 2001
From: Aidan Belton 
Date: Wed, 20 Oct 2021 13:16:31 +0000
Subject: [PATCH 15/45] cuda extension

---
 adoc/chapters/cuda_backend.adoc | 47 ++++++++++++++++++++++++++++++++-
 1 file changed, 46 insertions(+), 1 deletion(-)

diff --git a/adoc/chapters/cuda_backend.adoc b/adoc/chapters/cuda_backend.adoc
index fa27a349..d24edc99 100644
--- a/adoc/chapters/cuda_backend.adoc
+++ b/adoc/chapters/cuda_backend.adoc
@@ -18,7 +18,6 @@ The CUDA backend requires an installation of CUDA SDK as well as one or more
 CUDA devices available in the system.
 
[[sec:cuda:introduction]]
 == Introduction
->>>>>>> cuda execution model
 
[[sec:cuda:mapping_of_sycl_programming_model]]
 == Mapping of SYCL programming model
@@ -445,6 +444,52 @@ get_native(const T &syclObject);#
 [[sec:non_core_features_and_extensions]]
 == Non-core features and extensions
 
Additional CUDA features are available depending upon the devices compute capability.
SYCL can support these optional CUDA features with extensions.
Unlike OpenCL, CUDA needs to know if the extension is available at compile time.
As a result there are no valid CUDA extensions which can be passed to `has_extension`.

As the extension must be known at runtime CUDA extensions are best implemented
using feature test macros. The test macro format is
SYCL_EXT__. For CUDA extensions this format translates
to SYCL_EXT_NVIDIA_. Similarly, the format for the naming of extension
classes and enumerations should be ext__. Which in this context
becomes ext_NVIDIA_. Given the necessity to know the extension at
compile-time, the usage of extension macros should be the primary method of determining
if the extension is available in the SYCL implementation not.
A list of non-core CUDA features which have SYCL support is below.
Non-core CUDA features for require a compute capability of greater than 5.

TODO: The table below shows a proposal for SYCL supported CUDA extensions.
The table should be developed with other members of the SYCL community.

[[table.extensionsupport]]
.SYCL support for CUDA 11.3 extensions
[width="100%",options="header",cols="35%,35%,15%, 15"]
|====
| SYCL Aspect | CUDA Extension | Core SYCL API | Required Compute Capability
| [code]#aspect::fp16# | [code]#16-bit floating point# | Yes | 5.3 or greater
| - | [code]#Tensor Cores# | No | 7 or greater
| - | [code]#Atomic floating-point operations# | No | 6 or greater
|====

=== Aspects
Aspects are used to query what features and attributes a device has. Some aspects such as `fp16`
are non-core CUDA features. Therefore, the runtime must be able to determine what aspects CUDA
devices have. This can be performed by querying `cudaDeviceProp::major` and `cudaDeviceProp::minor`
to find out the compute capability. The compute capability indicates what extensions are
available to the device, and therefore what aspects are available.
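
For illustration only, the following sketch shows how a runtime might query `cudaDeviceProp::major` and `cudaDeviceProp::minor` to decide whether a device can report [code]#aspect::fp16#. The helper function name is hypothetical and not part of this specification:

[source,c++]
----
#include <cuda_runtime.h>

// Hypothetical helper: true if the CUDA device `ordinal` could support
// aspect::fp16, i.e. has compute capability 5.3 or greater.
bool device_could_support_fp16(int ordinal) {
  cudaDeviceProp prop;
  if (cudaGetDeviceProperties(&prop, ordinal) != cudaSuccess)
    return false;
  // Combine major/minor into a single comparable value, e.g. 5.3 -> 53.
  int computeCapability = prop.major * 10 + prop.minor;
  return computeCapability >= 53;
}
----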
+
[[sec:cuda:extension-fp16]]
=== Half precision floating-point

The half scalar data type: [code]#half# and the half vector data types:
[code]#half1#, [code]#half2#, [code]#half3#,
[code]#half4#, [code]#half8# and [code]#half16# must be
available at compile-time. However a kernel using these types is only
supported on devices that have [code]#aspect::fp16#, i.e. compute capability
5.3 or greater.

[[sub:cuda:extensions]]
=== Extensions

From caf9edc9122d82cc542b071e33710e00d7dc54e7 Mon Sep 17 00:00:00 2001
From: Jack Kirk 
Date: Thu, 21 Oct 2021 09:20:35 +0000
Subject: [PATCH 16/45] Error handling

---
 adoc/chapters/cuda_backend.adoc | 40 +++++++++++++++++++++++++++++++++
 1 file changed, 40 insertions(+)

diff --git a/adoc/chapters/cuda_backend.adoc b/adoc/chapters/cuda_backend.adoc
index d24edc99..9cc8e1b2 100644
--- a/adoc/chapters/cuda_backend.adoc
+++ b/adoc/chapters/cuda_backend.adoc
@@ -496,4 +496,44 @@ supported on devices that have [code]#aspect::fp16#, i.e. compute capability
 [[sub:cuda:error_handling]]
 === Error Handling
 
If there is a CUDA driver API error associated with an exception triggered, then the
CUDA error code can be obtained by the free function `CUresult sycl::cuda::get_error_code(sycl::exception&)`. In the case where there is
no CUDA error associated with the exception triggered, the CUDA error
code will be `CUDA_SUCCESS`.

Most of the SYCL error codes that form `sycl::errc` are specifically defined as errors thrown during calls to the SYCL API or SYCL runtime. There are also some cases of `sycl::errc` which cover errors thrown during the compilation or execution of device code.
It is suitable to map CUDA errors to such cases: for an exception `cuda_exception` that was created due to a CUDA error, `cuda_exception.code()` returns a `std::error_code` corresponding to the `sycl::errc` case that the CUDA error maps to, whilst `sycl::cuda::get_error_code(cuda_exception)` returns the original CUDA error code.

The relevant `sycl::errc` cases, and the CUDA errors that they may be mapped from, are listed below.

==== build

`sycl::errc::build` is defined as:

_Error from an online compile or
link operation when compiling,
linking, or building a kernel bundle for a device._

which may be mapped from `CUDA_ERROR_NO_BINARY_FOR_GPU`, `CUDA_ERROR_JIT_COMPILER_NOT_FOUND`, `CUDA_ERROR_INVALID_PTX`, `CUDA_ERROR_UNSUPPORTED_PTX_VERSION`, `CUDA_ERROR_SHARED_OBJECT_INIT_FAILED`, `CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND`.

==== memory_allocation

`sycl::errc::memory_allocation` is defined as:

_Error on memory allocation on the
SYCL device for a SYCL kernel._

which may be mapped from `CUDA_ERROR_OUT_OF_MEMORY`.

==== kernel_argument

`sycl::errc::kernel_argument` is defined as:

_The application has passed an invalid argument to a SYCL kernel
function. This includes captured
variables if the SYCL kernel function is a lambda function._

which may be mapped from `CUDA_ERROR_NOT_FOUND`.
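
As a usage illustration, here is a non-normative sketch, assuming the `sycl::cuda::get_error_code` interface described above, of how an application might retrieve both the SYCL and the underlying CUDA error codes:

[source,c++]
----
#include <sycl/sycl.hpp>
#include <cuda.h>
#include <system_error>

void submit_and_report(sycl::queue &q) {
  try {
    q.submit([&](sycl::handler &cgh) {
      cgh.single_task([=] { /* device code that may fail to build */ });
    });
    q.wait_and_throw();
  } catch (sycl::exception &e) {
    // The SYCL-level error code, e.g. sycl::errc::build.
    std::error_code syclErr = e.code();
    // The underlying CUDA driver error (CUDA_SUCCESS if none),
    // obtained via the free function defined in this section.
    CUresult cudaErr = sycl::cuda::get_error_code(e);
    (void)syclErr;
    (void)cudaErr;
  }
}
----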
+ // %%%%%%%%%%%%%%%%%%%%%%%%%%%% end cuda_backend %%%%%%%%%%%%%%%%%%%%%%%%%%%% From ba334e5a35fb7bb2899546bdba4b2547c1f22f29 Mon Sep 17 00:00:00 2001 From: Jack Kirk Date: Thu, 21 Oct 2021 09:22:18 +0000 Subject: [PATCH 17/45] Device and Platform --- adoc/chapters/cuda_backend.adoc | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/adoc/chapters/cuda_backend.adoc b/adoc/chapters/cuda_backend.adoc index 9cc8e1b2..5251b97c 100644 --- a/adoc/chapters/cuda_backend.adoc +++ b/adoc/chapters/cuda_backend.adoc @@ -29,9 +29,8 @@ have a few differences in terminology and architecture. [[sub:cuda:platform_model]] === Platform Model -TODO: Platform - -TODO: Device +All CUDA enabled devices which can be executed on are represented by a single `CUdevice`. A SYCL device maps to a single CUDA device. +As CUDA does not split into separate platforms there is no 'platform' concept in CUDA corresponding to the SYCL platform. Instead, a SYCL platform maps to a collection of CUDA devices represented by `std::vector`. A SYCL <> simply maps to one, or multiple CUDA contexts. Indeed while a CUDA context is tied to a single device, this is not the case for a SYCL From 295ac0ec5b34dbcb2a3936dc8da4efb3c6a7a2e6 Mon Sep 17 00:00:00 2001 From: Nicolas Miller Date: Fri, 22 Oct 2021 11:52:46 +0000 Subject: [PATCH 18/45] Add API interoperability section --- adoc/chapters/cuda_backend.adoc | 135 ++++++++------------------------ 1 file changed, 34 insertions(+), 101 deletions(-) diff --git a/adoc/chapters/cuda_backend.adoc b/adoc/chapters/cuda_backend.adoc index 5251b97c..8a5c3c85 100644 --- a/adoc/chapters/cuda_backend.adoc +++ b/adoc/chapters/cuda_backend.adoc @@ -268,40 +268,24 @@ not yet supported. [[sub:cuda:application_interoperability]] === Application Interoperability +This section describes the API level interoperability between SYCL and CUDA. + +The CUDA backend supports API interoperability for `platform`, `device`, +`context`, `queue`, `event` and `buffer`. Interoperability for `kernel`, +`kernel_bundle`, `device_image`, `sampled_image` and `unsampled_image` is not +supported. + [[table.cuda.appinterop.nativeobjects]] .Types of native backend objects application interoperability [width="100%",options="header",cols="20%,20%,20%,40%"] |==== -| [code]#SyclType# | [code]#backend_input_t# | [code]#backend_return_t# | Description -| [code]#buffer# | | | -| [code]#context# | | | -| [code]#device# | | | -| [code]#device_image# | | | -| [code]#event# | | | -| [code]#kernel# | | | -| [code]#kernel_bundle# | | | -| [code]#platform# | | | -| [code]#queue# | | | -| [code]#sampled_image# | | | -| [code]#unsampled_image# | | | -|==== - -[[table.cuda.appinterop.ownership]] -.Ownership behavior of native backend objects. -[width="100%",options="header",cols="40%,60%"] -|==== -| SYCL Object | Destructor behaviour -| [code]#buffer# | -| [code]#context# | -| [code]#device# | -| [code]#device_image# | -| [code]#event# | -| [code]#kernel# | -| [code]#kernel_bundle# | -| [code]#platform# | -| [code]#queue# | -| [code]#sampled_image# | -| [code]#unsampled_image# | +| [code]#SyclType# | [code]#backend_input_t# | [code]#backend_return_t# | Description +| [code]#platform# | `std::vector` | `std::vector` | A SYCL platform encapsulates a list of CUDA devices. +| [code]#device# | `CUdevice` | `CUdevice` | A SYCL device encapsulates a CUDA device. 
+| [code]#context# | `CUcontext` | `std::vector` | A SYCL context can encapsulate multiple CUDA contexts , however it is not possible to create a SYCL context from multiple CUDA contexts. +| [code]#queue# | `CUstream` | `CUstream` | A SYCL queue encapsulates a CUDA stream. +| [code]#event# | `CUevent` | `CUevent` | A SYCL event encapsulates a CUDA event. +| [code]#buffer# | `struct { CUdeviceptr ptr; size_t size; }` | `CUdeviceptr` | A SYCL buffer encapsulates a CUDA device pointer. |==== [[table.cuda.appinterop.make_interop_APIs]] @@ -312,110 +296,59 @@ not yet supported. | [code]#template + platform + make_platform(const backend_input_t &backendObject);# - | + | Create a SYCL `platform` from a list of CUDA device, the list must contain at least one CUDA device. | [code]#template + device + make_device(const backend_input_t &backendObject);# - | + | Create a SYCL `device` from a CUDA device. | [code]#template + context + make_context(const backend_input_t &backendObject, const async_handler asyncHandler = {});# - | + | Create a SYCL `context` from a CUDA context. | [code]#template + queue + make_queue(const backend_input_t &backendObject, const context &targetContext, const async_handler asyncHandler = {});# - | + | Create a SYCL `queue` from a CUDA stream. The provided `targetContext` must encapsulate the same CUDA context as the provided CUDA stream. | [code]#template + event + make_event(const backend_input_t &backendObject, const context &targetContext);# - | + | Create a SYCL `event` from a CUDA event. | [code]#template >> + buffer + make_buffer(const backend_input_t> - &backendObject, - const context &targetContext, event availableEvent);# - | - + &backendObject, + const context &targetContext, event availableEvent);# + | Create a SYCL `buffer` from a CUDA device pointer.` The CUDA pointer must be within the provided `targetContext`. The `availableEvent` parameter can be used for synchronization and indicates when the CUDA pointer is ready to be used. Only `dimensions == 1` is supported. + | [code]#template >> + + typename AllocatorT = buffer_allocator>> + buffer + make_buffer(const backend_input_t> &backendObject, - const context &targetContext);# - | - -| [code]#template + -sampled_image + -make_sampled_image( - const backend_input_t> - &backendObject, - const context &targetContext, image_sampler imageSampler, - event availableEvent);# - | - -| [code]#template + -sampled_image + -make_sampled_image( - const backend_input_t> - &backendObject, - const context &targetContext, image_sampler imageSampler);# - | - -| [code]#template + -unsampled_image + -make_unsampled_image( - const backend_input_t> - &backendObject, - const context &targetContext, event availableEvent);# - | - -| [code]#template + -unsampled_image + -make_unsampled_image( - const backend_input_t> - &backendObject, - const context &targetContext);# - | - -| [code]#template + -kernel_bundle + -make_kernel_bundle( - const backend_input_t> &backendObject, - const context &targetContext);# - | + const context &targetContext);# + | Create a SYCL `buffer` from a CUDA device pointer.` The CUDA pointer must be within the provided `targetContext`. Only `dimensions == 1` is supported. -| [code]#template + -kernel + -make_kernel(const backend_input_t &backendObject, - const context &targetContext);# - | |==== -[[table.cuda.appinterop.make_interop_APIs]] -.[code]#get_native# Interoperability APIs for native backend objects. 
-[width="100%",options="header",cols="40%,60%"] -|==== -| CUDA interoperability function | Description -| [code]#template + -backend_return_t + -get_native(const T &syclObject);# - | -|==== +==== Ownership of native backend objects + +The CUDA backend retains ownership of all native CUDA objects obtained through +the interoperability API, therefore associated SYCL objects must be kept alive +for the duration of the CUDA work using these native CUDA objects. +When creating a SYCL object from a native CUDA object SYCL does not take +ownership of the object and it is up to the application to dispose of them when +appropriate. [[sub:cuda:kernel_function_interoperability]] === Kernel Function Interoperability From b5bf458cebf8a5c783c4b1c86323f7e33d070437 Mon Sep 17 00:00:00 2001 From: Gordon Date: Fri, 22 Oct 2021 14:31:59 +0000 Subject: [PATCH 19/45] Add CUDA backend specification kernel function interop definitions --- adoc/chapters/cuda_backend.adoc | 51 ++++++++++++++++++++++++--------- 1 file changed, 38 insertions(+), 13 deletions(-) diff --git a/adoc/chapters/cuda_backend.adoc b/adoc/chapters/cuda_backend.adoc index 8a5c3c85..053b43df 100644 --- a/adoc/chapters/cuda_backend.adoc +++ b/adoc/chapters/cuda_backend.adoc @@ -345,6 +345,7 @@ make_buffer(const backend_input_t> The CUDA backend retains ownership of all native CUDA objects obtained through the interoperability API, therefore associated SYCL objects must be kept alive for the duration of the CUDA work using these native CUDA objects. +dd CUDA backend specification kernel function interop definitions When creating a SYCL object from a native CUDA object SYCL does not take ownership of the object and it is up to the application to dispose of them when @@ -353,23 +354,47 @@ appropriate. [[sub:cuda:kernel_function_interoperability]] === Kernel Function Interoperability -[[table.cuda.appinterop.nativeobjects]] +This section describes the kernel function interoperability for the CUDA +backend. + +The CUDA backend supports kernel function interoperability for the `accessor`, +`local_accessor`, `sampled_image_accessor`, `unsampled_image_accessor` and +`stream` classes. + +The CUDA backend does not support interoperability for the `device_event` class +as there's no equivalent in CUDA. + +Address spaces in CUDA are associated with variable decorations rather than the +type, so when pointers are passed as parameters to a function the parameter +types does not need to be decorated with an address space, instead it's simply a +raw un-decorated pointer. For this reason the `accessor`, `local_accessor` and +`stream` classes map to a raw undecorated pointer which can be implemented using +the generic address space. + +Other kernel function types in CUDA are represented by aliases provided in the +`sycl::cuda` namespace. These are provided for the `sampled_image_accessor`, +and `unsampled_image_accessor` classes; `sycl::cuda::texture` and +`sycl::cuda::surface` respectively. + +Below is a table of the `backend_input_t` and `backend_return_t` specializations +for the SYCL classes which support kernel function interoperability. 
+ +[[table.cuda.kernelinterop.nativeobjects]] .Types of native backend objects kernel function interoperability [width="100%",options="header",cols="20%,20%,20%,40%"] |==== | [code]#SyclType# | [code]#backend_input_t# | [code]#backend_return_t# | Description -| [code]#accessor# | | | -| [code]#accessor# | | | -| [code]#accessor# | | | -| [code]#local_accessor# | | | -| [code]#sampled_image_accessor# | | | -| [code]#sampled_image_accessor# | | | -| [code]#sampled_image_accessor# | | | -| [code]#unsampled_image_accessor# | | | -| [code]#unsampled_image_accessor# | | | -| [code]#unsampled_image_accessor# | | | -| [code]#stream# | | | -| [code]#device_event# | | | +| [code]#accessor# | T * | T * | Convert a SYCL `accessor` to an undecorated raw pointer. +| [code]#accessor# | T * | T * | Convert a SYCL `accessor` to an undecorated raw pointer. +| [code]#accessor# | T * | T * | Convert a SYCL `accessor` to an undecorated raw pointer. +| [code]#local_accessor# | T * | T * | Convert a SYCL `accessor` to an undecorated raw pointer. +| [code]#sampled_image_accessor# | sycl::cuda::texture | sycl::cuda::texture | Convert a SYCL `accessor` to the `sycl::cuda::texture` interoperability type with the same type and dimensions. +| [code]#sampled_image_accessor# | sycl::cuda::texture | sycl::cuda::texture | Convert a SYCL `accessor` to the `sycl::cuda::texture` interoperability type with the same type and dimensions. +| [code]#sampled_image_accessor# | sycl::cuda::texture | sycl::cuda::texture | Convert a SYCL `accessor` to the `sycl::cuda::texture` interoperability type with the same type and dimensions. +| [code]#unsampled_image_accessor# | sycl::cuda::surface | sycl::cuda::surface | Convert a SYCL `accessor` to the `sycl::cuda::surface` interoperability type with the same type and dimensions. +| [code]#unsampled_image_accessor# | sycl::cuda::surface | sycl::cuda::surface | Convert a SYCL `accessor` to the `sycl::cuda::surface` interoperability type with the same type and dimensions. +| [code]#unsampled_image_accessor# | sycl::cuda::surface | sycl::cuda::surface | Convert a SYCL `accessor` to the `sycl::cuda::surface` interoperability type with the same type and dimensions. +| [code]#stream# | signed char * | signed char * | Convert a SYCL `accessor` to an undecorated raw signed char pointer. |==== From a354cbd968e4679b6a8a83a895709b84a856b31b Mon Sep 17 00:00:00 2001 From: Nicolas Miller Date: Tue, 11 Jan 2022 11:37:20 +0000 Subject: [PATCH 20/45] Fix editing typo and improve make_device docs --- adoc/chapters/cuda_backend.adoc | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/adoc/chapters/cuda_backend.adoc b/adoc/chapters/cuda_backend.adoc index 053b43df..6a0d88a4 100644 --- a/adoc/chapters/cuda_backend.adoc +++ b/adoc/chapters/cuda_backend.adoc @@ -301,7 +301,7 @@ make_platform(const backend_input_t &backendObject);# | [code]#template + device + make_device(const backend_input_t &backendObject);# - | Create a SYCL `device` from a CUDA device. + | Construct a SYCL `device` from a CUDA device. As the SYCL execution environment for the CUDA backend contains a fixed number of devices that are enumerated via `sycl::device::get_devices()`. Calling this function does not create a new device. Rather it merely creates a `sycl::device` object that is a copy of one of the devices from that enumeration. 
| [code]#template +
context +
make_context(const backend_input_t &backendObject,
 const async_handler asyncHandler = {});#
 | Create a SYCL `context` from a CUDA context.

From cb3763a88e0f08d0c8537ef792d3ed1efe5e9900 Mon Sep 17 00:00:00 2001
From: Tadej Ciglaric 
Date: Mon, 31 Jan 2022 07:58:09 +0000
Subject: [PATCH 21/45] Update memory model

---
 adoc/chapters/cuda_backend.adoc | 50 +++++++++++++++++++++++++--------
 1 file changed, 38 insertions(+), 12 deletions(-)

diff --git a/adoc/chapters/cuda_backend.adoc b/adoc/chapters/cuda_backend.adoc
index 6a0d88a4..0b992fce 100644
--- a/adoc/chapters/cuda_backend.adoc
+++ b/adoc/chapters/cuda_backend.adoc
@@ -49,26 +49,52 @@ may use multiple CUDA streams to implement an out of order SYCL <>.
 [[sub:cuda:memory_model]]
 === Memory model
 
-==== Memory Allocations
+==== Accessing memory on a different GPU
 
-When non-host accessors to buffers are created without [code]#target::host_buffer# they need to allocate memory for their contents on the device. For example using [code]#cudaMalloc3D()#, [code]#cudaMallocPitch()# or [code]#cudaMalloc()#.
+Devices belonging to the same context must be able to access (directly or indirectly) each other's global memory. This is done in one of the following ways:
 
-When accessors to images are created without [code]#target::host_buffer# they allocate memory, for example using [code]#cudaMalloc3DArray()# or [code]#cudaMallocArray()#.
+- Device directly accesses memory on another device (peer-to-peer memory access).
+- CUDA-runtime-managed memory is used. The CUDA runtime copies the data from one device to another.
+- Peer copy (one of [code]#cuMemcpyPeerAsync# or [code]#cuMemcpy3DPeerAsync#) is used to directly copy the data from one device to another.
+- The data is copied from one device to the host, and then from the host to another device.
 
-When non-host accessors are created with [code]#target::host_buffer# they can, for example use [code]#cudaHostAlloc()# to allocate pinned memory on host.
+==== Shared USM memory advices
 
-Table <> specifies which underlying CUDA functions can be used for USM allocations. For shared USM allocations this would mean memory is managed (moved between host and different devices) by CUDA runtime. Alternatively shared USM allocations can be managed by SYCL runtime, using non-managed CUDA allocation on device when needed, such as [code]#cudaMalloc()#.
+Values for the `advice` parameter of `sycl::queue::mem_advise` and `sycl::handler::mem_advise`, and their mapping to CUDA equivalents, are defined in table <<table.cuda.memmodel.advices>>. 
-
-[[table.cuda.memmodel.USM]]
-.Cuda functions that could be used to allocate SYCL USM allocations
-[width="100%",options="header",cols="50%,50%"]
-|====
-| SYCL USM type | CUDA function
-| device | [code]#cudaMalloc()#
-| host | [code]#cudaHostAlloc()#
-| shared | [code]#cudaMallocManaged()#
-|====
-[[table.cuda.memmodel.USM]] -.Cuda functions that could be used to allocate SYCL USM allocations -[width="100%",options="header",cols="50%,50%"] +[[table.cuda.memmodel.advices]] +.Valid shared USM advices and their equivalents in CUDA +[width="100%",options="header",cols="40%,30%,30%"] |==== -| SYCL USM type | CUDA function -| device | [code]#cudaMalloc()# -| host | [code]#cudaHostAlloc()# -| shared | [code]#cudaMallocManaged()# +| SYCL shared USM advice | CUDA managed memory advice | processor, the advice is set for +| CUDA_MEM_ADVISE_SET_READ_MOSTLY | cudaMemAdviseSetReadMostly | device associated with the queue/handler +| CUDA_MEM_ADVISE_UNSET_READ_MOSTLY | cudaMemAdviceUnsetReadMostly | device associated with the queue/handler +| CUDA_MEM_ADVISE_SET_PREFERRED_LOCATION | cudaMemAdviseSetPreferredLocation | device associated with the queue/handler +| CUDA_MEM_ADVISE_UNSET_PREFERRED_LOCATION | cudaMemAdviseUnsetPreferredLocation | device associated with the queue/handler +| CUDA_MEM_ADVISE_SET_ACCESSED_BY | cudaMemAdviseSetAccessedBy | device associated with the queue/handler +| CUDA_MEM_ADVISE_UNSET_ACCESSED_BY | cudaMemAdviseUnsetAccessedBy | device associated with the queue/handler +| CUDA_MEM_ADVISE_SET_PREFERRED_LOCATION_HOST | cudaMemAdviseSetPreferredLocation | host +| CUDA_MEM_ADVISE_UNSET_PREFERRED_LOCATION_HOST | cudaMemAdviseUnsetPreferredLocation | host +| CUDA_MEM_ADVISE_SET_ACCESSED_BY_HOST | cudaMemAdviseSetAccessedBy | host +| CUDA_MEM_ADVISE_UNSET_ACCESSED_BY_HOST | cudaMemAdviseUnsetAccessedBy | host |==== +==== Supported image formats + +Supported image formats are: + +* r8g8b8a8_unorm +* r16g16b16a16_unorm +* r8g8b8a8_sint +* r16g16b16a16_sint +* r32b32g32a32_sint +* r8g8b8a8_uint +* r16g16b16a16_uint +* r32b32g32a32_uint +* r16b16g16a16_sfloat +* r32g32b32a32_sfloat +* b8g8r8a8_unorm + ==== Samplers In both SYCL and CUDA samplers consist of addressing mode, filtering mode and coordinate normalization mode. Mapping between SYCL and CUDA values is defined in tables <>, <> and <>. In CUDA addressing modes for all dimesnions will be the same, as CUDA allows different addressing modes for different dimesnions, while SYCL does not. From 3088646823cbcc4edbc0ff97d49e4bfca0db5252 Mon Sep 17 00:00:00 2001 From: Hugh Delaney Date: Mon, 31 Jan 2022 08:06:29 +0000 Subject: [PATCH 22/45] Updating cuda backend with notes on graphics APIs, builtin kernel functions,... --- adoc/chapters/cuda_backend.adoc | 87 +++++++++++++++++++++++++++++---- 1 file changed, 78 insertions(+), 9 deletions(-) diff --git a/adoc/chapters/cuda_backend.adoc b/adoc/chapters/cuda_backend.adoc index 6a0d88a4..2d2b91e0 100644 --- a/adoc/chapters/cuda_backend.adoc +++ b/adoc/chapters/cuda_backend.adoc @@ -177,15 +177,7 @@ Mapping of SYCL memory scopes to PTX ISA is the same as for atomics. It is defin CUDA's execution model is similar to SYCL's. CUDA uses kernels to offload computation, splitting the host and GPU into asynchronous computing devices. In general, except for CUDA's dynamic -parallelism extensions, kernels are called by the host. One -difference between CUDA and SYCL execution models is that CUDA -uses Single Instruction Multiple Thread (SIMT) while SYCL uses -Single Instruction Multiple Data (SIMD) kernels. SIMT kernels use -multiple scalar instructions acting on non-contiguous data. SIMD -kernels use vector instructions acting on contiguous data. 
SIMT
-can be used in place of SIMD but not the other way around, as SIMD
-requires memory blocks to have no interruptions within the data,
-while SIMT does not have this as a requirement.
+parallelism extensions, kernels are called by the host.
 
 CUDA GPUs are constructed out of streaming multiprocessors (SM)
 which perform the actual computation. Each SM consists of 8 scalar
@@ -211,6 +203,15 @@ unified virtual address space. Threads must synchronize with other
 threads using execution barriers, synchronization primitives and
 Cooperative Groups to utilize unified memory.
 
In SYCL, group functions and synchronizations are convergent, meaning
all work-items must reach them by the same control flow. A work-item
encountering a group function or synchronization point under divergent
conditions results in undefined behaviour. Therefore, any device-specific
capability of independent forward progress among work-items is not exposed
in SYCL, and will not be observable to users. Independent forward progress
of work-items may be achieved through the CUDA interop API, which gives
the same guarantees as native CUDA.

 SYCL has a similar execution hierarchy consisting of kernels.
 The kernel is broken down into work-items. Each work-item concurrently
@@ -248,6 +249,44 @@ grid level and organizing subgroups in sizes smaller than a warp.
 Cooperative Groups do not have an equivalent within SYCL 2020 and are
 not yet supported.
 
==== Work Item Mapping

A SYCL `nd_range` transposes its indices as it maps to hardware. In
general, this gives better memory access patterns.

SYCL uses row-major memory ordering, meaning that within a memory object
the rows are contiguous in memory. SYCL follows C++ convention in
this regard. Following the row-major paradigm, it is intuitive to imagine
each work-item in a `parallel_for` indexing through a contiguous block of
memory as it does its work. However, this gives poorly coalesced memory
accesses, as a given contiguous chunk of data being loaded may only pass
memory to a single work-item. More efficient memory access patterns are
achieved when each load of contiguous data can give data to as many
work-items as possible, meaning the data used by a given work-item
is non-contiguous.

SYCL makes this intuitive row-major C++ approach give good memory access
patterns by flipping the indices of the `nd_range` as it maps to hardware.

The linear id (whose use is not recommended) of a two-dimensional `nd_range`
can be calculated using:

[source,c++]
----
cgh.parallel_for(range<2>(64, 128), [=](item<2> it) {
  // For range<2>(r0, r1), the linear index of id (i0, i1) is
  // i1 + i0 * r1, so the stride is the range in dimension 1.
  size_t linearIdx = it.get_id(1) + (it.get_id(0) * it.get_range(1));
  ...
});
----

Notice that rows appear to be accessed in a column-major, rather than
row-major, format. This is only the case because the indices are flipped
by the SYCL implementation. All memory in SYCL is stored in row-major format.

It is best to avoid calculating the linear index manually; it is better
to use a multi-dimensional `sycl::id` to index into memory, as it doesn't
expose index-flipping to the user.

 [[table.cuda.CUDA_features_to_SYCL]]
 .CUDA execution features with their corresponding SYCL features
 [width="100%",options="header",cols="50%,50%"]
@@ -449,9 +488,21 @@ supported on devices that have [code]#aspect::fp16#, i.e. 
compute capability [[sub:cuda:extensions]] === Extensions +[[sub:cuda:builtin-kernel-functions]] +=== Built-in Kernel Functions +The CUDA backend specification currently does not define any built-in kernel +functions. + + [[sub:cuda:error_handling]] === Error Handling +SYCL uses `sycl::errc` as an enum class to hold error codes. These error +codes may originate in the SYCL runtime or be passed from other runtimes to +the SYCL runtime. When a `sycl::exception` is thrown, the `sycl::errc` can +be queried using the exception's `.code()` method. Possible values for +`sycl::errc` include: `success`, `runtime`, `memory_allocation`, and more. + If there is a CUDA driver API error associated with an exception triggered, then the CUDA error code can be obtained by the free function `CUresult sycl::cuda::get_error_code(sycl::exception&)`. In the case where there is no CUDA error associated with the exception triggered, the CUDA error @@ -492,4 +543,22 @@ variables if the SYCL kernel function is a lambda function._ which may be mapped from `CUDA_ERROR_NOT_FOUND`. +[[sub:cuda:non_core_properties]] +=== Non-Core Properties + +The constructors for most SYCL library objects, such as for `sycl::queue` or +`sycl::context`, accept the parameter `sycl::property_list`, which can affect +the semantics of the compilation or linking operation. + +There are currently no CUDA backend specific properties, meaning any properties +relating to the CUDA backend will be defined by a given implementation. + +[[sub:cuda:graphics_apis_interop]] +=== Interoperability with Graphics APIs + +Interoperability between SYCL and OpenGL or DirectX is not directly provided +by the SYCL interface. However, since the CUDA API provides interoperability +with these APIs, interoperability between SYCL and OpenGL or DirectX is best +done indirectly through interoperability with the CUDA API. + // %%%%%%%%%%%%%%%%%%%%%%%%%%%% end cuda_backend %%%%%%%%%%%%%%%%%%%%%%%%%%%% From c35bec446d8b6bf1fa7102bcff32410452cf7a88 Mon Sep 17 00:00:00 2001 From: Nicolas Miller Date: Thu, 3 Feb 2022 10:43:42 +0000 Subject: [PATCH 23/45] Remove extra sentence and fix formatting in platform section --- adoc/chapters/cuda_backend.adoc | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/adoc/chapters/cuda_backend.adoc b/adoc/chapters/cuda_backend.adoc index 91641a40..7d49b688 100644 --- a/adoc/chapters/cuda_backend.adoc +++ b/adoc/chapters/cuda_backend.adoc @@ -29,8 +29,10 @@ have a few differences in terminology and architecture. [[sub:cuda:platform_model]] === Platform Model -All CUDA enabled devices which can be executed on are represented by a single `CUdevice`. A SYCL device maps to a single CUDA device. -As CUDA does not split into separate platforms there is no 'platform' concept in CUDA corresponding to the SYCL platform. Instead, a SYCL platform maps to a collection of CUDA devices represented by `std::vector`. +A SYCL device maps to a single CUDA device. As CUDA does not split into +separate platforms there is no 'platform' concept in CUDA corresponding to the +SYCL platform. Instead, a SYCL platform maps to a collection of CUDA devices +represented by `std::vector`. A SYCL <> simply maps to one, or multiple CUDA contexts. 
Indeed while a CUDA context is tied to a single device, this is not the case for a SYCL From 6527c7d4b1334e101e2d988439d78f2b1a0310b2 Mon Sep 17 00:00:00 2001 From: Nicolas Miller Date: Thu, 3 Feb 2022 13:40:47 +0000 Subject: [PATCH 24/45] Update make_platform wording to reflect fixed number of platfoms --- adoc/chapters/cuda_backend.adoc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/adoc/chapters/cuda_backend.adoc b/adoc/chapters/cuda_backend.adoc index 7d49b688..286ffbc1 100644 --- a/adoc/chapters/cuda_backend.adoc +++ b/adoc/chapters/cuda_backend.adoc @@ -363,7 +363,7 @@ supported. | [code]#template + platform + make_platform(const backend_input_t &backendObject);# - | Create a SYCL `platform` from a list of CUDA device, the list must contain at least one CUDA device. + | Create a SYCL `platform` from a list of CUDA device, the list must contain at least one CUDA device. As the SYCL execution environment for the CUDA backend contains a fixed number of platforms that are enumerated via `sycl::platform::get_platforms()`. Calling this function does not create a new platform. Rather it merely creates a `sycl::platform` object that is a copy of one of the platforms from that enumeration. | [code]#template + device + From 15572b8ff08d8a993d6088d7491204a4bcda91f2 Mon Sep 17 00:00:00 2001 From: Nicolas Miller Date: Mon, 7 Feb 2022 18:02:50 +0000 Subject: [PATCH 25/45] Clarify interop return value for sub-buffers --- adoc/chapters/cuda_backend.adoc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/adoc/chapters/cuda_backend.adoc b/adoc/chapters/cuda_backend.adoc index 286ffbc1..7688efce 100644 --- a/adoc/chapters/cuda_backend.adoc +++ b/adoc/chapters/cuda_backend.adoc @@ -352,7 +352,7 @@ supported. | [code]#context# | `CUcontext` | `std::vector` | A SYCL context can encapsulate multiple CUDA contexts , however it is not possible to create a SYCL context from multiple CUDA contexts. | [code]#queue# | `CUstream` | `CUstream` | A SYCL queue encapsulates a CUDA stream. | [code]#event# | `CUevent` | `CUevent` | A SYCL event encapsulates a CUDA event. -| [code]#buffer# | `struct { CUdeviceptr ptr; size_t size; }` | `CUdeviceptr` | A SYCL buffer encapsulates a CUDA device pointer. +| [code]#buffer# | `struct { CUdeviceptr ptr; size_t size; }` | `CUdeviceptr` | A SYCL buffer encapsulates a CUDA device pointer. If the SYCL buffer is a sub-buffer, the returned `CUdeviceptr` is offset to the beginning of the sub-buffer. |==== [[table.cuda.appinterop.make_interop_APIs]] From 8c6b658cdfd3fc52f896d0527c0eac51f97c228d Mon Sep 17 00:00:00 2001 From: Nicolas Miller Date: Thu, 24 Feb 2022 17:46:45 +0000 Subject: [PATCH 26/45] Remove buffer interop for CUDA backend After discussions we've decided to go against buffer interop, host tasks and accessor interop should be used instead. --- adoc/chapters/cuda_backend.adoc | 19 +------------------ 1 file changed, 1 insertion(+), 18 deletions(-) diff --git a/adoc/chapters/cuda_backend.adoc b/adoc/chapters/cuda_backend.adoc index 7688efce..788b6fbe 100644 --- a/adoc/chapters/cuda_backend.adoc +++ b/adoc/chapters/cuda_backend.adoc @@ -338,7 +338,7 @@ expose index-flipping to the user. This section describes the API level interoperability between SYCL and CUDA. The CUDA backend supports API interoperability for `platform`, `device`, -`context`, `queue`, `event` and `buffer`. Interoperability for `kernel`, +`context`, `queue`, and `event`. 
Interoperability for `buffer`, `kernel`, `kernel_bundle`, `device_image`, `sampled_image` and `unsampled_image` is not supported. @@ -352,7 +352,6 @@ supported. | [code]#context# | `CUcontext` | `std::vector` | A SYCL context can encapsulate multiple CUDA contexts , however it is not possible to create a SYCL context from multiple CUDA contexts. | [code]#queue# | `CUstream` | `CUstream` | A SYCL queue encapsulates a CUDA stream. | [code]#event# | `CUevent` | `CUevent` | A SYCL event encapsulates a CUDA event. -| [code]#buffer# | `struct { CUdeviceptr ptr; size_t size; }` | `CUdeviceptr` | A SYCL buffer encapsulates a CUDA device pointer. If the SYCL buffer is a sub-buffer, the returned `CUdeviceptr` is offset to the beginning of the sub-buffer. |==== [[table.cuda.appinterop.make_interop_APIs]] @@ -389,22 +388,6 @@ make_event(const backend_input_t &backendObject, const context &targetContext);# | Create a SYCL `event` from a CUDA event. -| [code]#template >> + -buffer + -make_buffer(const backend_input_t> - &backendObject, - const context &targetContext, event availableEvent);# - | Create a SYCL `buffer` from a CUDA device pointer.` The CUDA pointer must be within the provided `targetContext`. The `availableEvent` parameter can be used for synchronization and indicates when the CUDA pointer is ready to be used. Only `dimensions == 1` is supported. - -| [code]#template >> + -buffer + -make_buffer(const backend_input_t> - &backendObject, - const context &targetContext);# - | Create a SYCL `buffer` from a CUDA device pointer.` The CUDA pointer must be within the provided `targetContext`. Only `dimensions == 1` is supported. - |==== ==== Ownership of native backend objects From 3ee367d1391d316ff2cdc5b09af45be3e9702094 Mon Sep 17 00:00:00 2001 From: Hugh Delaney Date: Wed, 30 Mar 2022 10:32:01 +0000 Subject: [PATCH 27/45] Responding to comments for non-core features --- adoc/chapters/cuda_backend.adoc | 56 ++++++++++++++++++--------------- 1 file changed, 30 insertions(+), 26 deletions(-) diff --git a/adoc/chapters/cuda_backend.adoc b/adoc/chapters/cuda_backend.adoc index 7688efce..b5fb33f6 100644 --- a/adoc/chapters/cuda_backend.adoc +++ b/adoc/chapters/cuda_backend.adoc @@ -463,25 +463,39 @@ for the SYCL classes which support kernel function interoperability. | [code]#stream# | signed char * | signed char * | Convert a SYCL `accessor` to an undecorated raw signed char pointer. |==== +[[sec:cuda_support_of_core_features]] +== CUDA Support of Core SYCL Features + +Some core SYCL features require a minimum compute capability for the CUDA +backend. + +[[table.extensionsupport]] +.CUDA support for Core SYCL API features +[width="100%",options="header",cols="33%,33%,33%"] +|==== +| Feature | SYCL Aspect | Required Compute Capability +| [code]#16-bit floating point# | [code]#aspect::fp16# | 5.3 or greater +|==== [[sec:non_core_features_and_extensions]] == Non-core features and extensions -Additional CUDA features are available depending upon the devices compute capability. -SYCL can support these optional CUDA features with extensions. -Unlike OpenCL, CUDA needs to know if the extension is available at compile time. -As a result there are no valid CUDA extensions which can be passed to `has_extension`. - -As the extension must be known at runtime CUDA extensions are best implemented -using feature test macros. The test macro format is -SYCL_EXT__. For CUDA extensions this format translates -to SYCL_EXT_NVIDIA_. 
Similarly, the format for the naming of extension
-classes and enumerations should be ext__. Which in this context
-becomes ext_NVIDIA_. Given the necessity to know the extension at
-compile-time, the usage of extension macros should be the primary method of determining
-if the extension is available in the SYCL implementation not.
-A list of non-core CUDA features which have SYCL support is below.
-Non-core CUDA features for require a compute capability of greater than 5.
+Additional CUDA features are available depending upon the device's compute
+capability. SYCL can support these optional CUDA features with extensions.
+
+Use of CUDA extensions requires that the API for a given extension is available
+to the SYCL implementation. This needs to be determined at compile time.
+Checking for the existence of feature test macros is the preferred method
+for checking whether an API exists. The feature test macro format
+is `SYCL_EXT_<VENDOR>_<FEATURE>`. The `<VENDOR>` string may also contain the
+word `CUDA` for features specific to CUDA. For example, the feature test macro
+for CUDA extensions in oneAPI may be either `SYCL_EXT_ONEAPI_CUDA_<FEATURE>`,
+or just `SYCL_EXT_ONEAPI_<FEATURE>`.
+
+Use of a given CUDA extension also requires that a chosen device has the
+required compute capability to use the CUDA extension. This can be determined
+using `sycl::aspect`s. Non-core SYCL aspects may be defined by an
+implementation which would allow this check to happen at runtime.
 
 TODO: The table below shows a proposal for SYCL supported CUDA extensions.
 The table should be developed with other members of the SYCL community.
 
 [[table.extensionsupport]]
 .SYCL support for CUDA 11.3 extensions
 [width="100%",options="header",cols="35%,35%,15%, 15"]
 |====
-| SYCL Aspect | CUDA Extension | Core SYCL API | Required Compute Capability
-| [code]#aspect::fp16# | [code]#16-bit floating point# | Yes | 5.3 or greater
-| - | [code]#Tensor Cores# | No | 7 or greater
-| - | [code]#Atomic floating-point operations# | No | 6 or greater
+| CUDA Extension | SYCL Aspect | Feature Test Macro | Required Compute Capability
 |====
 
-=== Aspects
-Aspects are used to query what features and attributes a device has. Some aspects such as `fp16`
-are non-core CUDA features. Therefore, the runtime must be able to determine what aspects CUDA
-devices have. This can be performed by querying `cudaDeviceProp::major` and `cudaDeviceProp::minor`
-to find out the compute capability. The compute capability indicates what extensions are
-available to the device, and therefore what aspects are available.

From 7c9b4500c43a62e9dc9038bef8f9a2ce734653eb Mon Sep 17 00:00:00 2001
From: Tadej Ciglaric 
Date: Thu, 31 Mar 2022 12:55:58 +0000
Subject: [PATCH 28/45] Update CUDA backend specification with the information
 on queries.

---
 adoc/chapters/cuda_backend.adoc | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/adoc/chapters/cuda_backend.adoc b/adoc/chapters/cuda_backend.adoc
index 417efa70..c3bce58a 100644
--- a/adoc/chapters/cuda_backend.adoc
+++ b/adoc/chapters/cuda_backend.adoc
@@ -332,6 +332,29 @@
 [[sec::programming_interface]]
 == Programming Interface
 
+[[sub:cuda:queries]]
+=== Queries
+
+For all event information profiling descriptors, the calls to
+[code]#sycl::event::get_profiling_info# return the time (in nanoseconds) since
+the creation of the context that the event is associated with. 
The "Resolution" +(timing error) of the returned value is the same as that provided by the CUDA +driver API call `cuEventElapsedTime`: +/- 0.5 microseconds. All event +information profiling descriptors, defined by the SYCL specification, are +supported by the CUDA backend. + +Currently no restrictions are defined for parameters of [code]#get_info# member +function in classes [code]#platform#, [code]#context#, [code]#device#, +[code]#queue#, [code]#event# and [code]#kernel#. All parameter values defined +in the SYCL specification are supported. + +Querying for [code]#info::device::backend_version# by calling +[code]#device::get_info# returns the CUDA compute capability of the device. + +Currently no parameters are defined for [code]#get_backend_info# member +functions of classes [code]#platform#, [code]#context#, [code]#device#, +[code]#queue#, [code]#event# and [code]#kernel#. + [[sub:cuda:application_interoperability]] === Application Interoperability From ec3aef1b35176a6da6a434b655c8b92eec2e02eb Mon Sep 17 00:00:00 2001 From: Aidan Belton Date: Fri, 1 Apr 2022 10:51:36 +0000 Subject: [PATCH 29/45] Update kernel interop based on feedback --- adoc/chapters/cuda_backend.adoc | 41 ++++++++++++++++++--------------- 1 file changed, 23 insertions(+), 18 deletions(-) diff --git a/adoc/chapters/cuda_backend.adoc b/adoc/chapters/cuda_backend.adoc index c3bce58a..b8b09acb 100644 --- a/adoc/chapters/cuda_backend.adoc +++ b/adoc/chapters/cuda_backend.adoc @@ -375,6 +375,8 @@ supported. | [code]#context# | `CUcontext` | `std::vector` | A SYCL context can encapsulate multiple CUDA contexts , however it is not possible to create a SYCL context from multiple CUDA contexts. | [code]#queue# | `CUstream` | `CUstream` | A SYCL queue encapsulates a CUDA stream. | [code]#event# | `CUevent` | `CUevent` | A SYCL event encapsulates a CUDA event. +| [code]#buffer# | `struct { void * ptr; size_t size; }` | `void *` | A SYCL buffer encapsulates a CUDA device pointer. If +the SYCL buffer is a sub-buffer, the returned `void *` is offset to the beginning of the sub-buffer. |==== [[table.cuda.appinterop.make_interop_APIs]] @@ -430,8 +432,9 @@ This section describes the kernel function interoperability for the CUDA backend. The CUDA backend supports kernel function interoperability for the `accessor`, -`local_accessor`, `sampled_image_accessor`, `unsampled_image_accessor` and -`stream` classes. +`local_accessor`, `sampled_image_accessor`, `unsampled_image_accessor`, `queue`, +`device` and `context` classes. These are exposed with `get_native` free template +function. The CUDA backend does not support interoperability for the `device_event` class as there's no equivalent in CUDA. @@ -439,34 +442,36 @@ as there's no equivalent in CUDA. Address spaces in CUDA are associated with variable decorations rather than the type, so when pointers are passed as parameters to a function the parameter types does not need to be decorated with an address space, instead it's simply a -raw un-decorated pointer. For this reason the `accessor`, `local_accessor` and -`stream` classes map to a raw undecorated pointer which can be implemented using -the generic address space. +raw un-decorated pointer. For this reason the `accessor` and `local_accessor` +classes map to a raw undecorated pointer which can be implemented using the +generic address space. Other kernel function types in CUDA are represented by aliases provided in the `sycl::cuda` namespace. 
These are provided for the `sampled_image_accessor`, and `unsampled_image_accessor` classes; `sycl::cuda::texture` and `sycl::cuda::surface` respectively. -Below is a table of the `backend_input_t` and `backend_return_t` specializations +Below is a table of the `backend_return_t` specializations for the SYCL classes which support kernel function interoperability. [[table.cuda.kernelinterop.nativeobjects]] .Types of native backend objects kernel function interoperability [width="100%",options="header",cols="20%,20%,20%,40%"] |==== -| [code]#SyclType# | [code]#backend_input_t# | [code]#backend_return_t# | Description -| [code]#accessor# | T * | T * | Convert a SYCL `accessor` to an undecorated raw pointer. -| [code]#accessor# | T * | T * | Convert a SYCL `accessor` to an undecorated raw pointer. -| [code]#accessor# | T * | T * | Convert a SYCL `accessor` to an undecorated raw pointer. -| [code]#local_accessor# | T * | T * | Convert a SYCL `accessor` to an undecorated raw pointer. -| [code]#sampled_image_accessor# | sycl::cuda::texture | sycl::cuda::texture | Convert a SYCL `accessor` to the `sycl::cuda::texture` interoperability type with the same type and dimensions. -| [code]#sampled_image_accessor# | sycl::cuda::texture | sycl::cuda::texture | Convert a SYCL `accessor` to the `sycl::cuda::texture` interoperability type with the same type and dimensions. -| [code]#sampled_image_accessor# | sycl::cuda::texture | sycl::cuda::texture | Convert a SYCL `accessor` to the `sycl::cuda::texture` interoperability type with the same type and dimensions. -| [code]#unsampled_image_accessor# | sycl::cuda::surface | sycl::cuda::surface | Convert a SYCL `accessor` to the `sycl::cuda::surface` interoperability type with the same type and dimensions. -| [code]#unsampled_image_accessor# | sycl::cuda::surface | sycl::cuda::surface | Convert a SYCL `accessor` to the `sycl::cuda::surface` interoperability type with the same type and dimensions. -| [code]#unsampled_image_accessor# | sycl::cuda::surface | sycl::cuda::surface | Convert a SYCL `accessor` to the `sycl::cuda::surface` interoperability type with the same type and dimensions. -| [code]#stream# | signed char * | signed char * | Convert a SYCL `accessor` to an undecorated raw signed char pointer. +| [code]#SyclType# | [code]#backend_return_t# | Description +| [code]#queue# | `CUstream` | Convert a SYCL `queue` to native cuda type `CUstream`. +| [code]#device# | `CUdevice` | Convert a SYCL `device` to native cuda type `CUdevice`. +| [code]#context# | `CUcontext` | Convert a SYCL `context` to a vector of `CUcontext`. +| [code]#accessor# | void * | Convert a SYCL `accessor` to an undecorated raw pointer. +| [code]#accessor# | void * | Convert a SYCL `accessor` to an undecorated raw pointer. +| [code]#accessor# | void * | Convert a SYCL `accessor` to an undecorated raw pointer. +| [code]#local_accessor# | void * | Convert a SYCL `accessor` to an undecorated raw pointer. +| [code]#sampled_image_accessor# | sycl::cuda::texture | Convert a SYCL `accessor` to the `sycl::cuda::texture` interoperability type with the same type and dimensions. +| [code]#sampled_image_accessor# | sycl::cuda::texture | Convert a SYCL `accessor` to the `sycl::cuda::texture` interoperability type with the same type and dimensions. +| [code]#sampled_image_accessor# | sycl::cuda::texture | Convert a SYCL `accessor` to the `sycl::cuda::texture` interoperability type with the same type and dimensions. 
| [code]#unsampled_image_accessor# | sycl::cuda::surface | Convert a SYCL `unsampled_image_accessor` to the `sycl::cuda::surface` interoperability type with the same type and dimensions.
| [code]#unsampled_image_accessor# | sycl::cuda::surface | Convert a SYCL `unsampled_image_accessor` to the `sycl::cuda::surface` interoperability type with the same type and dimensions.
|====

From e19a9bc80a087d9f6cbeac846b1a8f3ce33a52a6 Mon Sep 17 00:00:00 2001
From: Gordon 
Date: Thu, 7 Apr 2022 15:43:31 +0000
Subject: [PATCH 30/45] Make edits to backend spec based on feedback.

---
 adoc/chapters/cuda_backend.adoc | 63 ++++++++++++++------------------
 1 file changed, 27 insertions(+), 36 deletions(-)

diff --git a/adoc/chapters/cuda_backend.adoc b/adoc/chapters/cuda_backend.adoc
index b8b09acb..d457e454 100644
--- a/adoc/chapters/cuda_backend.adoc
+++ b/adoc/chapters/cuda_backend.adoc
@@ -362,7 +362,7 @@ This section describes the API level interoperability between SYCL and CUDA.
 
 The CUDA backend supports API interoperability for `platform`, `device`,
 `context`, `queue`, and `event`. Interoperability for `buffer`, `kernel`,
-`kernel_bundle`, `device_image`, `sampled_image` and `unsampled_image` is not
+`kernel_bundle`, `device_image`, `sampled_image` and `unsampled_image` are not
 supported.
 
@@ -372,11 +372,10 @@ supported.
 | [code]#SyclType# | [code]#backend_input_t# | [code]#backend_return_t# | Description
 | [code]#platform# | `std::vector` | `std::vector` | A SYCL platform encapsulates a list of CUDA devices.
 | [code]#device# | `CUdevice` | `CUdevice` | A SYCL device encapsulates a CUDA device.
-| [code]#context# | `CUcontext` | `std::vector` | A SYCL context can encapsulate multiple CUDA contexts , however it is not possible to create a SYCL context from multiple CUDA contexts.
-| [code]#queue# | `CUstream` | `CUstream` | A SYCL queue encapsulates a CUDA stream.
-| [code]#event# | `CUevent` | `CUevent` | A SYCL event encapsulates a CUDA event.
-| [code]#buffer# | `struct { void * ptr; size_t size; }` | `void *` | A SYCL buffer encapsulates a CUDA device pointer. If the SYCL buffer is a sub-buffer, the returned `void *` is offset to the beginning of the sub-buffer.
+| [code]#context# | `CUcontext` | `std::vector` | A SYCL context can encapsulate multiple CUDA contexts, however, it is not possible to create a SYCL context from multiple CUDA contexts.
+| [code]#queue# | `CUstream` | `CUstream` | A SYCL queue can encapsulate multiple CUDA streams, however, a SYCL queue can only be created from or produce a single CUDA stream, and any synchronization required between streams must be performed by the application.
+| [code]#event# | `CUevent` | `CUevent` | A SYCL event can encapsulate multiple CUDA events, however, a SYCL event can only be created from or produce a single CUDA event, and a `CUevent` produced from a SYCL event may or may not be valid; use `sycl::cuda::has_native_event` to query this.
+| [code]#buffer# | NA | `void *` | A SYCL buffer encapsulates a CUDA device pointer. If the SYCL buffer is a sub-buffer, the returned `void *` is offset to the beginning of the sub-buffer.
 |====
 
@@ -430,9 +429,8 @@
 This section describes the kernel function interoperability for the CUDA
 backend.
 The CUDA backend supports kernel function interoperability for the `accessor`,
-`local_accessor`, `sampled_image_accessor`, `unsampled_image_accessor`, `queue`,
-`device` and `context` classes. These are exposed with `get_native` free template
-function.
+`local_accessor`, `sampled_image_accessor` and `unsampled_image_accessor`
+classes. These are exposed via the `get_native` free function template.
 
 The CUDA backend does not support interoperability for the `device_event` class
 as there's no equivalent in CUDA.
 
 | [code]#SyclType# | [code]#backend_return_t# | Description
-| [code]#queue# | `CUstream` | Convert a SYCL `queue` to native cuda type `CUstream`.
-| [code]#device# | `CUdevice` | Convert a SYCL `device` to native cuda type `CUdevice`.
-| [code]#context# | `CUcontext` | Convert a SYCL `context` to a vector of `CUcontext`.
 | [code]#accessor# | void * | Convert a SYCL `accessor` to an undecorated raw pointer.
 | [code]#accessor# | void * | Convert a SYCL `accessor` to an undecorated raw pointer.
 | [code]#accessor# | void * | Convert a SYCL `accessor` to an undecorated raw pointer.
-| [code]#local_accessor# | void * | Convert a SYCL `accessor` to an undecorated raw pointer.
+| [code]#local_accessor# | void * | Convert a SYCL `local_accessor` to an undecorated raw pointer.
-| [code]#sampled_image_accessor# | sycl::cuda::texture | Convert a SYCL `accessor` to the `sycl::cuda::texture` interoperability type with the same type and dimensions.
-| [code]#sampled_image_accessor# | sycl::cuda::texture | Convert a SYCL `accessor` to the `sycl::cuda::texture` interoperability type with the same type and dimensions.
-| [code]#sampled_image_accessor# | sycl::cuda::texture | Convert a SYCL `accessor` to the `sycl::cuda::texture` interoperability type with the same type and dimensions.
+| [code]#sampled_image_accessor# | sycl::cuda::texture | Convert a SYCL `sampled_image_accessor` to the `sycl::cuda::texture` interoperability type with the same type and dimensions.
+| [code]#sampled_image_accessor# | sycl::cuda::texture | Convert a SYCL `sampled_image_accessor` to the `sycl::cuda::texture` interoperability type with the same type and dimensions.
+| [code]#sampled_image_accessor# | sycl::cuda::texture | Convert a SYCL `sampled_image_accessor` to the `sycl::cuda::texture` interoperability type with the same type and dimensions. 
+| [code]#unsampled_image_accessor# | sycl::cuda::surface | Convert a SYCL `unsampled_image_accessor` to the `sycl::cuda::surface` interoperability type with the same type and dimensions. |==== [[sec:cuda_support_of_core_features]] @@ -491,6 +486,17 @@ backend. [[sec:non_core_features_and_extensions]] == Non-core features and extensions +Some additional functions are provided for the CUDA backend in the +`sycl::cuda` namespace. + +[[table.noncorefeatures]] +.CUDA support for non-Core SYCL APIs +[width="100%",options="header",cols="33%,33%,33%"] +|==== +| API | Description +| [code]#bool sycl::cuda::has_native_event(sycl::event)# | Returns `true` if the SYCL event has a valid `CUevent` that can be queries via application interop. +|==== + Additional CUDA features are available depending upon the device's compute capability. SYCL can support these optional CUDA features with extensions. @@ -508,8 +514,8 @@ required compute capability to use the CUDA extension. This can be determined using `sycl::aspect`s. Non-core SYCL aspects may be defined by an implementation which would allow this check to happen at runtime. -TODO: The table below shows a proposal for SYCL supported CUDA extensions. -The table should be developed with other members of the SYCL community. +The table below shows a proposal for SYCL supported CUDA extensions. This should +be populated by other members of the SYCL community. [[table.extensionsupport]] .SYCL support for CUDA 11.3 extensions @@ -518,25 +524,11 @@ The table should be developed with other members of the SYCL community. | CUDA Extension | SYCL Aspect | Feature Test Macro | Required Compute Capability |==== -[[sec:cuda:extension-fp16]] -=== Half precision floating-point - -The half scalar data type: [code]#half# and the half vector data types: -[code]#half1#, [code]#half2#, [code]#half3#, -[code]#half4#, [code]#half8# and [code]#half16# must be -available at compile-time. However a kernel using these types is only -supported on devices that have [code]#aspect::fp16#, i.e. compute capability -5.3 or greater. - -[[sub:cuda:extensions]] -=== Extensions - [[sub:cuda:builtin-kernel-functions]] === Built-in Kernel Functions The CUDA backend specification currently does not define any built-in kernel functions. - [[sub:cuda:error_handling]] === Error Handling @@ -566,7 +558,6 @@ linking, or building a kernel bundle for a device._ which may be mapped from `CUDA_ERROR_NO_BINARY_FOR_GPU`, `CUDA_ERROR_JIT_COMPILER_NOT_FOUND`, `CUDA_ERROR_INVALID_PTX`, `CUDA_ERROR_UNSUPPORTED_PTX_VERSION`, `CUDA_ERROR_SHARED_OBJECT_INIT_FAILED`, `CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND`. - ==== memory_allocation `sycl::errc::memory_allocation` is defined as: From 2d72c97059d97254b67001117414d8aae949f55e Mon Sep 17 00:00:00 2001 From: Gordon Brown Date: Thu, 28 Apr 2022 11:10:12 +0100 Subject: [PATCH 31/45] Update adoc/chapters/cuda_backend.adoc Co-authored-by: Jakub Chlanda --- adoc/chapters/cuda_backend.adoc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/adoc/chapters/cuda_backend.adoc b/adoc/chapters/cuda_backend.adoc index d457e454..269d104f 100644 --- a/adoc/chapters/cuda_backend.adoc +++ b/adoc/chapters/cuda_backend.adoc @@ -462,7 +462,7 @@ for the SYCL classes which support kernel function interoperability. | [code]#accessor# | void * | Convert a SYCL `accessor` to an undecorated raw pointer. | [code]#local_accessor# | void * | Convert a SYCL `local_accessor` to an undecorated raw pointer. 
| [code]#sampled_image_accessor# | sycl::cuda::texture | Convert a SYCL `sampled_image_accessor` to the `sycl::cuda::texture` interoperability type with the same type and dimensions. -| [code]#sampled_image_accessor# | sycl::cuda::texture | Convert a SYCL `sampled_image_accessor` to the `sycl::cuda::texture` interoperability type with the same type and dimensions. +| [code]#sampled_image_accessor# | sycl::cuda::texture | Convert a SYCL `sampled_image_accessor` to the `sycl::cuda::texture` interoperability type with the same type and dimensions. | [code]#sampled_image_accessor# | sycl::cuda::texture | Convert a SYCL `sampled_image_accessor` to the `sycl::cuda::texture` interoperability type with the same type and dimensions. | [code]#unsampled_image_accessor# | sycl::cuda::surface | Convert a SYCL `unsampled_image_accessor` to the `sycl::cuda::surface` interoperability type with the same type and dimensions. | [code]#unsampled_image_accessor# | sycl::cuda::surface | Convert a SYCL `unsampled_image_accessor` to the `sycl::cuda::surface` interoperability type with the same type and dimensions. From b97476a161fa0115d6d966350279f4e9954fe6e2 Mon Sep 17 00:00:00 2001 From: Gordon Brown Date: Thu, 28 Apr 2022 11:10:21 +0100 Subject: [PATCH 32/45] Update adoc/chapters/cuda_backend.adoc Co-authored-by: Jakub Chlanda --- adoc/chapters/cuda_backend.adoc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/adoc/chapters/cuda_backend.adoc b/adoc/chapters/cuda_backend.adoc index 269d104f..6eba4e15 100644 --- a/adoc/chapters/cuda_backend.adoc +++ b/adoc/chapters/cuda_backend.adoc @@ -463,7 +463,7 @@ for the SYCL classes which support kernel function interoperability. | [code]#local_accessor# | void * | Convert a SYCL `local_accessor` to an undecorated raw pointer. | [code]#sampled_image_accessor# | sycl::cuda::texture | Convert a SYCL `sampled_image_accessor` to the `sycl::cuda::texture` interoperability type with the same type and dimensions. | [code]#sampled_image_accessor# | sycl::cuda::texture | Convert a SYCL `sampled_image_accessor` to the `sycl::cuda::texture` interoperability type with the same type and dimensions. -| [code]#sampled_image_accessor# | sycl::cuda::texture | Convert a SYCL `sampled_image_accessor` to the `sycl::cuda::texture` interoperability type with the same type and dimensions. +| [code]#sampled_image_accessor# | sycl::cuda::texture | Convert a SYCL `sampled_image_accessor` to the `sycl::cuda::texture` interoperability type with the same type and dimensions. | [code]#unsampled_image_accessor# | sycl::cuda::surface | Convert a SYCL `unsampled_image_accessor` to the `sycl::cuda::surface` interoperability type with the same type and dimensions. | [code]#unsampled_image_accessor# | sycl::cuda::surface | Convert a SYCL `unsampled_image_accessor` to the `sycl::cuda::surface` interoperability type with the same type and dimensions. | [code]#unsampled_image_accessor# | sycl::cuda::surface | Convert a SYCL `unsampled_image_accessor` to the `sycl::cuda::surface` interoperability type with the same type and dimensions. 
From e0dfdc73fd917b44b11521eeee6c2ba6d8c8e420 Mon Sep 17 00:00:00 2001 From: Gordon Brown Date: Thu, 28 Apr 2022 11:10:30 +0100 Subject: [PATCH 33/45] Update adoc/chapters/cuda_backend.adoc Co-authored-by: Jakub Chlanda --- adoc/chapters/cuda_backend.adoc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/adoc/chapters/cuda_backend.adoc b/adoc/chapters/cuda_backend.adoc index 6eba4e15..298acb6d 100644 --- a/adoc/chapters/cuda_backend.adoc +++ b/adoc/chapters/cuda_backend.adoc @@ -465,7 +465,7 @@ for the SYCL classes which support kernel function interoperability. | [code]#sampled_image_accessor# | sycl::cuda::texture | Convert a SYCL `sampled_image_accessor` to the `sycl::cuda::texture` interoperability type with the same type and dimensions. | [code]#sampled_image_accessor# | sycl::cuda::texture | Convert a SYCL `sampled_image_accessor` to the `sycl::cuda::texture` interoperability type with the same type and dimensions. | [code]#unsampled_image_accessor# | sycl::cuda::surface | Convert a SYCL `unsampled_image_accessor` to the `sycl::cuda::surface` interoperability type with the same type and dimensions. -| [code]#unsampled_image_accessor# | sycl::cuda::surface | Convert a SYCL `unsampled_image_accessor` to the `sycl::cuda::surface` interoperability type with the same type and dimensions. +| [code]#unsampled_image_accessor# | sycl::cuda::surface | Convert a SYCL `unsampled_image_accessor` to the `sycl::cuda::surface` interoperability type with the same type and dimensions. | [code]#unsampled_image_accessor# | sycl::cuda::surface | Convert a SYCL `unsampled_image_accessor` to the `sycl::cuda::surface` interoperability type with the same type and dimensions. |==== From 555dfcfc4ea2c9936f6fce9e9a0b0e43cecc8b30 Mon Sep 17 00:00:00 2001 From: Gordon Brown Date: Thu, 28 Apr 2022 11:10:41 +0100 Subject: [PATCH 34/45] Update adoc/chapters/cuda_backend.adoc Co-authored-by: Jakub Chlanda --- adoc/chapters/cuda_backend.adoc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/adoc/chapters/cuda_backend.adoc b/adoc/chapters/cuda_backend.adoc index 298acb6d..d150128f 100644 --- a/adoc/chapters/cuda_backend.adoc +++ b/adoc/chapters/cuda_backend.adoc @@ -466,7 +466,7 @@ for the SYCL classes which support kernel function interoperability. | [code]#sampled_image_accessor# | sycl::cuda::texture | Convert a SYCL `sampled_image_accessor` to the `sycl::cuda::texture` interoperability type with the same type and dimensions. | [code]#unsampled_image_accessor# | sycl::cuda::surface | Convert a SYCL `unsampled_image_accessor` to the `sycl::cuda::surface` interoperability type with the same type and dimensions. | [code]#unsampled_image_accessor# | sycl::cuda::surface | Convert a SYCL `unsampled_image_accessor` to the `sycl::cuda::surface` interoperability type with the same type and dimensions. -| [code]#unsampled_image_accessor# | sycl::cuda::surface | Convert a SYCL `unsampled_image_accessor` to the `sycl::cuda::surface` interoperability type with the same type and dimensions. +| [code]#unsampled_image_accessor# | sycl::cuda::surface | Convert a SYCL `unsampled_image_accessor` to the `sycl::cuda::surface` interoperability type with the same type and dimensions. 
|==== [[sec:cuda_support_of_core_features]] From 7ed5e9acd447c5dfa546b1fc0bb59499a94ffdd2 Mon Sep 17 00:00:00 2001 From: Nicolas Miller Date: Thu, 5 May 2022 10:43:41 +0100 Subject: [PATCH 35/45] Use device version to report compute capabilities --- adoc/chapters/cuda_backend.adoc | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/adoc/chapters/cuda_backend.adoc b/adoc/chapters/cuda_backend.adoc index d150128f..5e6ad117 100644 --- a/adoc/chapters/cuda_backend.adoc +++ b/adoc/chapters/cuda_backend.adoc @@ -348,8 +348,9 @@ function in classes [code]#platform#, [code]#context#, [code]#device#, [code]#queue#, [code]#event# and [code]#kernel#. All parameter values defined in the SYCL specification are supported. -Querying for [code]#info::device::backend_version# by calling -[code]#device::get_info# returns the CUDA compute capability of the device. +Querying for [code]#info::device::version# by calling [code]#device::get_info# +returns the CUDA compute capability of the device, in the format +[code]#.#. Currently no parameters are defined for [code]#get_backend_info# member functions of classes [code]#platform#, [code]#context#, [code]#device#, From 4d9435a21663397acb2e1902247fc83e654f4897 Mon Sep 17 00:00:00 2001 From: JackAKirk Date: Mon, 9 May 2022 09:31:02 +0100 Subject: [PATCH 36/45] Error Handling more concise Signed-off-by: JackAKirk --- adoc/chapters/cuda_backend.adoc | 56 +++++++++------------------------ 1 file changed, 14 insertions(+), 42 deletions(-) diff --git a/adoc/chapters/cuda_backend.adoc b/adoc/chapters/cuda_backend.adoc index d150128f..2725f2ce 100644 --- a/adoc/chapters/cuda_backend.adoc +++ b/adoc/chapters/cuda_backend.adoc @@ -532,50 +532,22 @@ functions. [[sub:cuda:error_handling]] === Error Handling -SYCL uses `sycl::errc` as an enum class to hold error codes. These error -codes may originate in the SYCL runtime or be passed from other runtimes to -the SYCL runtime. When a `sycl::exception` is thrown, the `sycl::errc` can -be queried using the exception's `.code()` method. Possible values for -`sycl::errc` include: `success`, `runtime`, `memory_allocation`, and more. +SYCL uses `sycl::errc` as an enum class to hold the Standard SYCL Error Codes. +These error codes may originate in the SYCL runtime or be created from an error +originating in a backend. When a `sycl::exception` is thrown, the `sycl::errc` can +be queried using the exception's `.code()` method. If there is a CUDA driver API error associated with an exception triggered, then the -CUDA error code can be obtained by the free function `CUresult sycl::cuda::get_error_code(sycl::exception&)`. In the case where there is -no CUDA error associated with the exception triggered, the CUDA error -code will be `CUDA_SUCCESS`. - -Most of the SYCL error codes that form sycl::errc are specifically defined as errors thrown during calls to the SYCL API or SYCL runtime. There are also some cases of sycl::errc which cover errors thrown during the compilation or execution of device code. -It is suitable to map CUDA errors to such cases, such that an exception, "cuda_exception", that was created due to a CUDA error, may, upon execution of `cuda_exception.code()`, return a `std::error_code` relating to the `sycl::errc` case that the CUDA error maps to; whilst `sycl::cuda::get_error_code(cuda_exception)` will return the original CUDA error code. - -The relevant `sycl::errc` cases and the CUDA errors that they may be mapped from are listed below. 
- -==== build - -`sycl::errc::build` is defined as: - -_Error from an online compile or -link operation when compiling, -linking, or building a kernel bundle for a device._ - -which may be mapped from `CUDA_ERROR_NO_BINARY_FOR_GPU`, `CUDA_ERROR_JIT_COMPILER_NOT_FOUND`, `CUDA_ERROR_INVALID_PTX`, `CUDA_ERROR_UNSUPPORTED_PTX_VERSION`, `CUDA_ERROR_SHARED_OBJECT_INIT_FAILED`, `CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND`. - -==== memory_allocation - -`sycl::errc::memory_allocation` is defined as: - -_Error on memory allocation on the -SYCL device for a SYCL kernel._ - -which may be mapped from `CUDA_ERROR_OUT_OF_MEMORY`. - -==== kernel_argument - -`sycl::errc::kernel_argument` is defined as: - -_The application has passed an invalid argument to a SYCL kernel -function. This includes captured -variables if the SYCL kernel function is a lambda function._ - -which may be mapped from `CUDA_ERROR_NOT_FOUND`. +A CUDA error code can be obtained by the free function +`CUresult sycl::cuda::get_error_code(const sycl::exception&)`. +In the case where there is no CUDA error associated with the exception triggered, +the CUDA error code will be `CUDA_SUCCESS`. + +The default `sycl::errc` that a CUDA error is mapped to is `sycl::errc::runtime`. +An exception, `cuda_exception`, that was created due to a CUDA error, will, +upon execution of `cuda_exception.code()`, return a `std::error_code` +relating to the `sycl::errc` case that the CUDA error maps to; whilst +`sycl::cuda::get_error_code(cuda_exception)` will return the original CUDA error code. [[sub:cuda:non_core_properties]] === Non-Core Properties From 3dde1630e7f9eb282a36059c1d0024549f000f24 Mon Sep 17 00:00:00 2001 From: JackAKirk Date: Mon, 9 May 2022 09:38:17 +0100 Subject: [PATCH 37/45] Clarified mem consist model and formating Signed-off-by: JackAKirk --- adoc/chapters/cuda_backend.adoc | 187 ++++++++++++++++---------------- 1 file changed, 96 insertions(+), 91 deletions(-) diff --git a/adoc/chapters/cuda_backend.adoc b/adoc/chapters/cuda_backend.adoc index 2725f2ce..45fed892 100644 --- a/adoc/chapters/cuda_backend.adoc +++ b/adoc/chapters/cuda_backend.adoc @@ -9,10 +9,10 @@ of CUDA, and how the SYCL generic interoperability interface must be implemented by vendors providing SYCL for CUDA implementations to ensure SYCL applications written for the CUDA backend are interoperable. -The CUDA backend is enabled using the `sycl::backend::cuda` value of `enum -class backend`. That means that when the CUDA backend is active, the value of -`sycl::is_backend_active::value` will be `true`, and the -preprocessor macro `SYCL_BACKEND_CUDA` will be defined. +The CUDA backend is enabled using the [code]#sycl::backend::cuda# value of [code]#enum +class backend#. That means that when the CUDA backend is active, the value of +[code]#sycl::is_backend_active::value# will be [code]#true#, and the +preprocessor macro [code]#SYCL_BACKEND_CUDA# will be defined. The CUDA backend requires an installation of CUDA SDK as well as one or more CUDA devices available in the system. @@ -32,7 +32,7 @@ have a few differences in terminology and architecture. A SYCL device maps to a single CUDA device. As CUDA does not split into separate platforms there is no 'platform' concept in CUDA corresponding to the SYCL platform. Instead, a SYCL platform maps to a collection of CUDA devices -represented by `std::vector`. +represented by [code]#std::vector#. A SYCL <> simply maps to one, or multiple CUDA contexts. 
Indeed while a CUDA context is tied to a single device, this is not the case for a SYCL @@ -62,23 +62,23 @@ Devices belonging to the same context must be able to access (directly or indire ==== Shared USM memory advices -Values for the `advice` parameter of `sycl::queue::mem_advise` and `sycl::handler::mem_advice` and their mapping to CUDA equivalent are defined in table <>. +Values for the [code]#advice# parameter of [code]#sycl::queue::mem_advise# and [code]#sycl::handler::mem_advise# and their mapping to CUDA equivalent are defined in table <>. [[table.cuda.memmodel.advices]] .Valid shared USM advices and their equivalents in CUDA [width="100%",options="header",cols="40%,30%,30%"] |==== | SYCL shared USM advice | CUDA managed memory advice | processor, the advice is set for -| CUDA_MEM_ADVISE_SET_READ_MOSTLY | cudaMemAdviseSetReadMostly | device associated with the queue/handler -| CUDA_MEM_ADVISE_UNSET_READ_MOSTLY | cudaMemAdviceUnsetReadMostly | device associated with the queue/handler -| CUDA_MEM_ADVISE_SET_PREFERRED_LOCATION | cudaMemAdviseSetPreferredLocation | device associated with the queue/handler -| CUDA_MEM_ADVISE_UNSET_PREFERRED_LOCATION | cudaMemAdviseUnsetPreferredLocation | device associated with the queue/handler -| CUDA_MEM_ADVISE_SET_ACCESSED_BY | cudaMemAdviseSetAccessedBy | device associated with the queue/handler -| CUDA_MEM_ADVISE_UNSET_ACCESSED_BY | cudaMemAdviseUnsetAccessedBy | device associated with the queue/handler -| CUDA_MEM_ADVISE_SET_PREFERRED_LOCATION_HOST | cudaMemAdviseSetPreferredLocation | host -| CUDA_MEM_ADVISE_UNSET_PREFERRED_LOCATION_HOST | cudaMemAdviseUnsetPreferredLocation | host -| CUDA_MEM_ADVISE_SET_ACCESSED_BY_HOST | cudaMemAdviseSetAccessedBy | host -| CUDA_MEM_ADVISE_UNSET_ACCESSED_BY_HOST | cudaMemAdviseUnsetAccessedBy | host +| [code]#sycl::cuda::advice::cuda_mem_advise_set_read_mostly# | [code]#cudaMemAdviseSetReadMostly# | device associated with the queue/handler +| [code]#sycl::cuda::advice::cuda_mem_advise_unset_read_mostly# | [code]#cudaMemAdviceUnsetReadMostly# | device associated with the queue/handler +| [code]#sycl::cuda::advice::cuda_mem_advise_set_preferred_location# | [code]#cudaMemAdviseSetPreferredLocation# | device associated with the queue/handler +| [code]#sycl::cuda::advice::cuda_mem_advise_unset_preferred_location# | [code]#cudaMemAdviseUnsetPreferredLocation# | device associated with the queue/handler +| [code]#sycl::cuda::advice::cuda_mem_advise_set_accessed_by# | [code]#cudaMemAdviseSetAccessedBy# | device associated with the queue/handler +| [code]#sycl::cuda::advice::cuda_mem_advise_unset_accessed_by# | [code]#cudaMemAdviseUnsetAccessedBy# | device associated with the queue/handler +| [code]#sycl::cuda::advice::cuda_mem_advise_set_preferred_location_host# | [code]#cudaMemAdviseSetPreferredLocation# | host +| [code]#sycl::cuda::advice::cuda_mem_advise_unset_preferred_location_host# | [code]#cudaMemAdviseUnsetPreferredLocation# | host +| [code]#sycl::cuda::advice::cuda_mem_advise_set_accessed_by_host# | [code]#cudaMemAdviseSetAccessedBy# | host +| [code]#sycl::cuda::advice::cuda_mem_advise_unset_accessed_by_host# | [code]#cudaMemAdviseUnsetAccessedBy# | host |==== ==== Supported image formats @@ -151,7 +151,12 @@ Table <> maps SYCL address spaces to CUDA ad ==== Atomics -Not all CUDA devices support all memory orders. If a particular memory order is unsupported by a CUDA device, it can be unsupported in the SYCL CUDA backend for that device. 
Sequentially consistent atomics are currently not supported on any device, so the SYCL CUDA backend is not required to implement them. The mappings of other memory orders (when supported by the device) is defined in table <>. +Prior to Volta (Compute Capability 7.0) the CUDA Parallel Thread eXecution model (PTX) used weak memory models that apparently lacked any published +definitions and corresponding formal proofs. PTX ISA 6.0 introduced a memory consistency model that provides scoped synchronization primitives supported by Volta and later devices. +A formal analysis of this memory consistency model has been published by Nvidia. + +Sequentially consistent atomics are currently not supported in the CUDA backend. The mappings of other memory orders is defined in table <>. +If a memory order is not specified then [code]#memory_order::relaxed# is assumed. A memory order can only be specified for Volta and later devices. [[table.cuda.memmodel.memory_orders]] .Mapping from [code]#sycl::memory_order# to PTX ISA memory orders @@ -165,7 +170,7 @@ Not all CUDA devices support all memory orders. If a particular memory order is | [code]#memory_order::seq_cst# | undefined |==== -Mapping of memory scopes (when supported by the device) is defined in table [table.cuda.memmodel.memory_scopes]. [code]#memory_scope::work_item# does not require any consistency between different work items, so it can be mapped to non-atomic operation. +In the CUDA backend memory scopes are defined for Pascal (Compute Capability 6.0) and later devices. Mapping of memory scopes is defined in table <>. [code]#memory_scope::work_item# does not require any consistency between different work items, so it can be mapped to non-atomic operations. [[table.cuda.memmodel.memory_scopes]] .Mapping from [code]#sycl::memory_scope# to PTX ISA memory scopes @@ -211,15 +216,15 @@ CUDA GPUs are constructed out of streaming multiprocessors (SM) which perform the actual computation. Each SM consists of 8 scalar cores, shared memory, registers, a load/store unit, and a scheduler unit. CUDA uses a hierarchy of threads to organize the execution of -kernels. Kernels are split up into thread blocks. The threadblocks -form a grid each thread can identify its location within the grid -using a block ID. The grid is a concept used to index threadblocks -the grid can be one, two, or three dimensions. Each thread block is +kernels. Kernels are split up into thread blocks. The thread blocks +form a grid and each thread can identify its location within the grid +using a block ID. The grid is a concept used to index thread blocks +and can be one, two, or three dimensional. Each thread block is tied to a single SM. Similar to a thread block's location within the grid, each thread's position within the block can be identified with a one, two, or three dimensional thread ID. -Pre-Volta GPU architectures breaks thread blocks into warps which +Pre-Volta GPU architectures break thread blocks into warps which consist of 32 threads. The warp is processed by the SM concurrently. For one warp instruction to be executed requires 4 SM clock cycles. SM's execute multiple warp instructions. The warps instructions are @@ -252,19 +257,19 @@ unit. CUDA's grid is similar to SYCL's nd_range as it is the highest level grouping of threads, not including the whole kernel. Both nd_range and grid can segment the groups of threads into one, two, or three dimensions. 
SYCL sub-groups roughly map to -cooperative groups `thread_block_tile` as it allows for the +cooperative groups [code]#thread_block_tile# as it allows for the work-group/thread block to be further subdivided into concurrent threads. Likewise, thread blocks map directly to work-groups, and a single thread is a SYCL work-item. CUDA primarily synchronizes the threads through two functions, -`cudaStreamSynchronize()` and `__syncthreads()`. -`cudaStreamSynchronize()` blocks work from being performed until all -threads on the device has been completed. `__syncthreads()` waits for +[code]#cudaStreamSynchronize()# and [code]#&lowbar&lowbarsyncthreads()#. +[code]#cudaStreamSynchronize()# blocks work from being performed until all +threads on the device has been completed. [code]#&lowbar&lowbarsyncthreads()# waits for all threads within a thread block to reach the same point. So -`cudaStreamSynchronize()` is similar to queue.wait(), buffer +[code]#cudaStreamSynchronize()# is similar to queue.wait(), buffer destruction, and other host-device synchronization events within SYCL. -`__syncthreads()` synchronizes the threads within a thread block which +[code]#&lowbar&lowbarsyncthreads()# synchronizes the threads within a thread block which is analogous to the work-group barrier. CUDA's warp concept has no SYCL equivalent. If a user were to write @@ -279,13 +284,13 @@ not yet supported. ==== Work Item Mapping -A SYCL `nd_range` will tranpose indices as it maps to hardware memory. +A SYCL [code]#nd_range# will tranpose indices as it maps to hardware memory. This gives better memory access patterns, in general. SYCL uses row major memory ordering, meaning in some memory object the rows will be contiguous in memory. SYCL follows C++ convention in this regard. Following the row-major paradigm, it is intuitive to imagine -each work-item in a `parallel_for` indexing through a contiguous block of +each work-item in a [code]#parallel_for# indexing through a contiguous block of memory as it does its work. However, this gives poorly coalesced memory accesses, as a given contiguous chunk of data being loaded may only pass memory to a single work-item. More efficient memory access patterns are @@ -294,9 +299,9 @@ work-items as possible. Meaning the data used by a given work-item is non-contiguous. SYCL makes this intuitive row-major C++ approach give good memory access -patterns by flipping the indices of the `nd_range`, as it maps to hardware. +patterns by flipping the indices of the [code]#nd_range#, as it maps to hardware. -The linear id (whose use is not recommended) of a two dimensional `nd_range` +The linear id (whose use is not recommended) of a two dimensional [code]#nd_range# can be calculated using: [source,c++] @@ -312,7 +317,7 @@ row-major, format. This is only the case because the indices are flipped by the SYCL implementation. All memory in SYCL is stored in row-major format. It is best to avoid calculating the linear index manually; it is better -to use a multi-dimensional `sycl::id` to index into memory, as it doesn't +to use a multi-dimensional [code]#sycl::id# to index into memory, as it doesn't expose index-flipping to the user. [[table.cuda.CUDA_features_to_SYCL]] @@ -326,7 +331,7 @@ expose index-flipping to the user. 
| [code]#sub-group# | [code]#thread_block_tile# | [code]#work-item# | [code]#Thread# | [code]#SYCL nd_item synchronization# | [code]#cudaStreamSynchronize# -| [code]#work-group barrier# | [code]#__syncthread# +| [code]#work-group barrier# | [code]#__syncthreads# |==== [[sec::programming_interface]] @@ -339,7 +344,7 @@ For all event information profiling descriptors, the calls to [code]#sycl::event::get_profiling_info# return the time (in nanoseconds) since the creation of the context that the event is associated with. The "Resolution" (timing error) of the returned value is the same as that provided by the CUDA -driver API call `cuEventElapsedTime`: +/- 0.5 microseconds. All event +driver API call [code]#cuEventElapsedTime#: +/- 0.5 microseconds. All event information profiling descriptors, defined by the SYCL specification, are supported by the CUDA backend. @@ -360,9 +365,9 @@ functions of classes [code]#platform#, [code]#context#, [code]#device#, This section describes the API level interoperability between SYCL and CUDA. -The CUDA backend supports API interoperability for `platform`, `device`, -`context`, `queue`, and `event`. Interoperability for `buffer`, `kernel`, -`kernel_bundle`, `device_image`, `sampled_image` and `unsampled_image` are not +The CUDA backend supports API interoperability for [code]#platform#, [code]#device#, +[code]#context#, [code]#queue#, and [code]#event#. Interoperability for [code]#buffer#, [code]#kernel#, +[code]#kernel_bundle#, [code]#device_image#, [code]#sampled_image# and [code]#unsampled_image# are not supported. [[table.cuda.appinterop.nativeobjects]] @@ -370,12 +375,12 @@ supported. [width="100%",options="header",cols="20%,20%,20%,40%"] |==== | [code]#SyclType# | [code]#backend_input_t# | [code]#backend_return_t# | Description -| [code]#platform# | `std::vector` | `std::vector` | A SYCL platform encapsulates a list of CUDA devices. -| [code]#device# | `CUdevice` | `CUdevice` | A SYCL device encapsulates a CUDA device. -| [code]#context# | `CUcontext` | `std::vector` | A SYCL context can encapsulate multiple CUDA contexts, however, it is not possible to create a SYCL context from multiple CUDA contexts. -| [code]#queue# | `CUstream` | `CUstream` | A SYCL queue can encapsulates multiple CUDA stream, however, a SYCL queue can only be created from or produce one, and any synchronization required should be performed. -| [code]#event# | `CUevent` | `CUevent` | A SYCL event can encapsulates multiple CUDA events, however, a SYCL event can only be created from or produce one, and a CUevent produced from a SYCL event may or may not be valid, use `sycl::cuda::has_native_event` to query this. -| [code]#buffer# | NA | `void *` | A SYCL buffer encapsulates a CUDA device pointer. If the SYCL buffer is a sub-buffer, the returned `void *` is offset to the beginning of the sub-buffer. +| [code]#platform# | [code]#std::vector# | [code]#std::vector# | A SYCL platform encapsulates a list of CUDA devices. +| [code]#device# | [code]#CUdevice# | [code]#CUdevice# | A SYCL device encapsulates a CUDA device. +| [code]#context# | [code]#CUcontext# | [code]#std::vector# | A SYCL context can encapsulate multiple CUDA contexts, however, it is not possible to create a SYCL context from multiple CUDA contexts. +| [code]#queue# | [code]#CUstream# | [code]#CUstream# | A SYCL queue can encapsulates multiple CUDA stream, however, a SYCL queue can only be created from or produce one, and any synchronization required should be performed. 
+| [code]#event# | [code]#CUevent# | [code]#CUevent# | A SYCL event can encapsulates multiple CUDA events, however, a SYCL event can only be created from or produce one, and a CUevent produced from a SYCL event may or may not be valid, use [code]#sycl::cuda::has_native_event# to query this. +| [code]#buffer# | NA | [code]#void *# | A SYCL buffer encapsulates a CUDA device pointer. If the SYCL buffer is a sub-buffer, the returned [code]#void *# is offset to the beginning of the sub-buffer. |==== [[table.cuda.appinterop.make_interop_APIs]] @@ -386,31 +391,31 @@ supported. | [code]#template + platform + make_platform(const backend_input_t &backendObject);# - | Create a SYCL `platform` from a list of CUDA device, the list must contain at least one CUDA device. As the SYCL execution environment for the CUDA backend contains a fixed number of platforms that are enumerated via `sycl::platform::get_platforms()`. Calling this function does not create a new platform. Rather it merely creates a `sycl::platform` object that is a copy of one of the platforms from that enumeration. + | Create a SYCL [code]#platform# from a list of CUDA device, the list must contain at least one CUDA device. As the SYCL execution environment for the CUDA backend contains a fixed number of platforms that are enumerated via [code]#sycl::platform::get_platforms()#. Calling this function does not create a new platform. Rather it merely creates a [code]#sycl::platform# object that is a copy of one of the platforms from that enumeration. | [code]#template + device + make_device(const backend_input_t &backendObject);# - | Construct a SYCL `device` from a CUDA device. As the SYCL execution environment for the CUDA backend contains a fixed number of devices that are enumerated via `sycl::device::get_devices()`. Calling this function does not create a new device. Rather it merely creates a `sycl::device` object that is a copy of one of the devices from that enumeration. + | Construct a SYCL [code]#device# from a CUDA device. As the SYCL execution environment for the CUDA backend contains a fixed number of devices that are enumerated via [code]#sycl::device::get_devices()#. Calling this function does not create a new device. Rather it merely creates a [code]#sycl::device# object that is a copy of one of the devices from that enumeration. | [code]#template + context + make_context(const backend_input_t &backendObject, const async_handler asyncHandler = {});# - | Create a SYCL `context` from a CUDA context. + | Create a SYCL [code]#context# from a CUDA context. | [code]#template + queue + make_queue(const backend_input_t &backendObject, const context &targetContext, const async_handler asyncHandler = {});# - | Create a SYCL `queue` from a CUDA stream. The provided `targetContext` must encapsulate the same CUDA context as the provided CUDA stream. + | Create a SYCL [code]#queue# from a CUDA stream. The provided [code]#targetContext# must encapsulate the same CUDA context as the provided CUDA stream. | [code]#template + event + make_event(const backend_input_t &backendObject, const context &targetContext);# - | Create a SYCL `event` from a CUDA event. + | Create a SYCL [code]#event# from a CUDA event. |==== @@ -430,43 +435,43 @@ appropriate. This section describes the kernel function interoperability for the CUDA backend. -The CUDA backend supports kernel function interoperability for the `accessor`, -`local_accessor`, `sampled_image_accessor` and `unsampled_image_accessor` -classes. 
These are exposed with `get_native` free template function. +The CUDA backend supports kernel function interoperability for the [code]#accessor#, +[code]#local_accessor#, [code]#sampled_image_accessor# and [code]#unsampled_image_accessor# +classes. These are exposed with [code]#get_native# free template function. -The CUDA backend does not support interoperability for the `device_event` class +The CUDA backend does not support interoperability for the [code]#device_event# class as there's no equivalent in CUDA. Address spaces in CUDA are associated with variable decorations rather than the type, so when pointers are passed as parameters to a function the parameter types does not need to be decorated with an address space, instead it's simply a -raw un-decorated pointer. For this reason the `accessor` and `local_accessor` +raw un-decorated pointer. For this reason the [code]#accessor# and [code]#local_accessor# classes map to a raw undecorated pointer which can be implemented using the generic address space. Other kernel function types in CUDA are represented by aliases provided in the -`sycl::cuda` namespace. These are provided for the `sampled_image_accessor`, -and `unsampled_image_accessor` classes; `sycl::cuda::texture` and -`sycl::cuda::surface` respectively. +[code]#sycl::cuda# namespace. These are provided for the [code]#sampled_image_accessor#, +and [code]#unsampled_image_accessor# classes; [code]#sycl::cuda::texture# and +[code]#sycl::cuda::surface# respectively. -Below is a table of the `backend_return_t` specializations +Below is a table of the [code]#backend_return_t# specializations for the SYCL classes which support kernel function interoperability. [[table.cuda.kernelinterop.nativeobjects]] .Types of native backend objects kernel function interoperability -[width="100%",options="header",cols="20%,20%,20%,40%"] +[width="100%",options="header",cols="30%,20%,50%"] |==== | [code]#SyclType# | [code]#backend_return_t# | Description -| [code]#accessor# | void * | Convert a SYCL `accessor` to an undecorated raw pointer. -| [code]#accessor# | void * | Convert a SYCL `accessor` to an undecorated raw pointer. -| [code]#accessor# | void * | Convert a SYCL `accessor` to an undecorated raw pointer. -| [code]#local_accessor# | void * | Convert a SYCL `local_accessor` to an undecorated raw pointer. -| [code]#sampled_image_accessor# | sycl::cuda::texture | Convert a SYCL `sampled_image_accessor` to the `sycl::cuda::texture` interoperability type with the same type and dimensions. -| [code]#sampled_image_accessor# | sycl::cuda::texture | Convert a SYCL `sampled_image_accessor` to the `sycl::cuda::texture` interoperability type with the same type and dimensions. -| [code]#sampled_image_accessor# | sycl::cuda::texture | Convert a SYCL `sampled_image_accessor` to the `sycl::cuda::texture` interoperability type with the same type and dimensions. -| [code]#unsampled_image_accessor# | sycl::cuda::surface | Convert a SYCL `unsampled_image_accessor` to the `sycl::cuda::surface` interoperability type with the same type and dimensions. -| [code]#unsampled_image_accessor# | sycl::cuda::surface | Convert a SYCL `unsampled_image_accessor` to the `sycl::cuda::surface` interoperability type with the same type and dimensions. -| [code]#unsampled_image_accessor# | sycl::cuda::surface | Convert a SYCL `unsampled_image_accessor` to the `sycl::cuda::surface` interoperability type with the same type and dimensions. +| [code]#accessor# | void * | Convert a SYCL [code]#accessor# to an undecorated raw pointer. 
+| [code]#accessor# | void * | Convert a SYCL [code]#accessor# to an undecorated raw pointer. +| [code]#accessor# | void * | Convert a SYCL [code]#accessor# to an undecorated raw pointer. +| [code]#local_accessor# | void * | Convert a SYCL [code]#local_accessor# to an undecorated raw pointer. +| [code]#sampled_image_accessor# | sycl::cuda::texture | Convert a SYCL [code]#sampled_image_accessor# to the [code]#sycl::cuda::texture# interoperability type with the same type and dimensions. +| [code]#sampled_image_accessor# | sycl::cuda::texture | Convert a SYCL [code]#sampled_image_accessor# to the [code]#sycl::cuda::texture# interoperability type with the same type and dimensions. +| [code]#sampled_image_accessor# | sycl::cuda::texture | Convert a SYCL [code]#sampled_image_accessor# to the [code]#sycl::cuda::texture# interoperability type with the same type and dimensions. +| [code]#unsampled_image_accessor# | sycl::cuda::surface | Convert a SYCL [code]#unsampled_image_accessor# to the [code]#sycl::cuda::surface# interoperability type with the same type and dimensions. +| [code]#unsampled_image_accessor# | sycl::cuda::surface | Convert a SYCL [code]#unsampled_image_accessor# to the [code]#sycl::cuda::surface# interoperability type with the same type and dimensions. +| [code]#unsampled_image_accessor# | sycl::cuda::surface | Convert a SYCL [code]#unsampled_image_accessor# to the [code]#sycl::cuda::surface# interoperability type with the same type and dimensions. |==== [[sec:cuda_support_of_core_features]] @@ -475,7 +480,7 @@ for the SYCL classes which support kernel function interoperability. Some core SYCL features require a minimum compute capability for the CUDA backend. -[[table.extensionsupport]] +[[table.coresupport]] .CUDA support for Core SYCL API features [width="100%",options="header",cols="33%,33%,33%"] |==== @@ -487,14 +492,14 @@ backend. == Non-core features and extensions Some additional functions are provided for the CUDA backend in the -`sycl::cuda` namespace. +[code]#sycl::cuda# namespace. [[table.noncorefeatures]] .CUDA support for non-Core SYCL APIs -[width="100%",options="header",cols="33%,33%,33%"] +[width="100%",options="header",cols="50%,50%"] |==== | API | Description -| [code]#bool sycl::cuda::has_native_event(sycl::event)# | Returns `true` if the SYCL event has a valid `CUevent` that can be queries via application interop. +| [code]#bool sycl::cuda::has_native_event(sycl::event)# | Returns [code]#true# if the SYCL event has a valid [code]#CUevent# that can be queries via application interop. |==== Additional CUDA features are available depending upon the device's compute @@ -504,14 +509,14 @@ Use of CUDA extensions requires that the API for a given extension is available to the SYCL implementation. This needs to be determined at compile time. Checking for the existence of feature test macros is the preferred method for checking whether an API exists. The feature test macro format -is `SYCL_EXT__`. The `` string may also contain the -word `CUDA` for features specific to CUDA. For example, the feature test macro -for CUDA extensions in oneAPI may be either `SYCL_EXT_ONEAPI_CUDA_`, -or just `SYCL_EXT_ONEAPI_`. +is [code]#SYCL_EXT__#. The [code]## string may also contain the +word [code]#CUDA# for features specific to CUDA. For example, the feature test macro +for CUDA extensions in oneAPI may be either [code]#SYCL_EXT_ONEAPI_CUDA_#, +or just [code]#SYCL_EXT_ONEAPI_#. 
Use of a given CUDA extension also requires that a chosen device has the required compute capability to use the CUDA extension. This can be determined -using `sycl::aspect`s. Non-core SYCL aspects may be defined by an +using [code]#sycl::aspect#s. Non-core SYCL aspects may be defined by an implementation which would allow this check to happen at runtime. The table below shows a proposal for SYCL supported CUDA extensions. This should @@ -532,28 +537,28 @@ functions. [[sub:cuda:error_handling]] === Error Handling -SYCL uses `sycl::errc` as an enum class to hold the Standard SYCL Error Codes. +SYCL uses [code]#sycl::errc# as an enum class to hold the Standard SYCL Error Codes. These error codes may originate in the SYCL runtime or be created from an error -originating in a backend. When a `sycl::exception` is thrown, the `sycl::errc` can -be queried using the exception's `.code()` method. +originating in a backend. When a [code]#sycl::exception# is thrown, the [code]#sycl::errc# can +be queried using the exception's [code]#.code()# method. If there is a CUDA driver API error associated with an exception triggered, then the A CUDA error code can be obtained by the free function -`CUresult sycl::cuda::get_error_code(const sycl::exception&)`. +[code]#CUresult sycl::cuda::get_error_code(const sycl::exception&)#. In the case where there is no CUDA error associated with the exception triggered, -the CUDA error code will be `CUDA_SUCCESS`. +the CUDA error code will be [code]#CUDA_SUCCESS#. -The default `sycl::errc` that a CUDA error is mapped to is `sycl::errc::runtime`. -An exception, `cuda_exception`, that was created due to a CUDA error, will, -upon execution of `cuda_exception.code()`, return a `std::error_code` -relating to the `sycl::errc` case that the CUDA error maps to; whilst -`sycl::cuda::get_error_code(cuda_exception)` will return the original CUDA error code. +The default [code]#sycl::errc# that a CUDA error is mapped to is [code]#sycl::errc::runtime#. +An exception, [code]#cuda_exception#, that was created due to a CUDA error, will, +upon execution of [code]#cuda_exception.code()#, return a [code]#std::error_code# +relating to the [code]#sycl::errc# case that the CUDA error maps to; whilst +[code]#sycl::cuda::get_error_code(cuda_exception)# will return the original CUDA error code. [[sub:cuda:non_core_properties]] === Non-Core Properties -The constructors for most SYCL library objects, such as for `sycl::queue` or -`sycl::context`, accept the parameter `sycl::property_list`, which can affect +The constructors for most SYCL library objects, such as for [code]#sycl::queue# or +[code]#sycl::context#, accept the parameter [code]#sycl::property_list#, which can affect the semantics of the compilation or linking operation. There are currently no CUDA backend specific properties, meaning any properties From 5c32ba469e57d0041d7fbd8bb97a8753116b2c9f Mon Sep 17 00:00:00 2001 From: Nicolas Miller Date: Tue, 10 May 2022 15:16:04 +0100 Subject: [PATCH 38/45] Remove platform interop Platform is not a concept that makes sense for CUDA and are simply a list of devices so specific interop for it doesn't really make sense. The user can already access the list of SYCL device then use interop on these if necessary. 
--- adoc/chapters/cuda_backend.adoc | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/adoc/chapters/cuda_backend.adoc b/adoc/chapters/cuda_backend.adoc index d150128f..93707112 100644 --- a/adoc/chapters/cuda_backend.adoc +++ b/adoc/chapters/cuda_backend.adoc @@ -31,8 +31,7 @@ have a few differences in terminology and architecture. A SYCL device maps to a single CUDA device. As CUDA does not split into separate platforms there is no 'platform' concept in CUDA corresponding to the -SYCL platform. Instead, a SYCL platform maps to a collection of CUDA devices -represented by `std::vector`. +SYCL platform. Instead, a SYCL platform maps to a collection of CUDA devices. A SYCL <> simply maps to one, or multiple CUDA contexts. Indeed while a CUDA context is tied to a single device, this is not the case for a SYCL @@ -360,8 +359,8 @@ functions of classes [code]#platform#, [code]#context#, [code]#device#, This section describes the API level interoperability between SYCL and CUDA. -The CUDA backend supports API interoperability for `platform`, `device`, -`context`, `queue`, and `event`. Interoperability for `buffer`, `kernel`, +The CUDA backend supports API interoperability for `device`, `context`, +`queue`, and `event`. Interoperability for `platform`, `buffer`, `kernel`, `kernel_bundle`, `device_image`, `sampled_image` and `unsampled_image` are not supported. @@ -370,7 +369,6 @@ supported. [width="100%",options="header",cols="20%,20%,20%,40%"] |==== | [code]#SyclType# | [code]#backend_input_t# | [code]#backend_return_t# | Description -| [code]#platform# | `std::vector` | `std::vector` | A SYCL platform encapsulates a list of CUDA devices. | [code]#device# | `CUdevice` | `CUdevice` | A SYCL device encapsulates a CUDA device. | [code]#context# | `CUcontext` | `std::vector` | A SYCL context can encapsulate multiple CUDA contexts, however, it is not possible to create a SYCL context from multiple CUDA contexts. | [code]#queue# | `CUstream` | `CUstream` | A SYCL queue can encapsulates multiple CUDA stream, however, a SYCL queue can only be created from or produce one, and any synchronization required should be performed. @@ -383,11 +381,6 @@ supported. [width="100%",options="header",cols="40%,60%"] |==== | CUDA interoperability function | Description -| [code]#template + -platform + -make_platform(const backend_input_t &backendObject);# - | Create a SYCL `platform` from a list of CUDA device, the list must contain at least one CUDA device. As the SYCL execution environment for the CUDA backend contains a fixed number of platforms that are enumerated via `sycl::platform::get_platforms()`. Calling this function does not create a new platform. Rather it merely creates a `sycl::platform` object that is a copy of one of the platforms from that enumeration. - | [code]#template + device + make_device(const backend_input_t &backendObject);# From 7dc8a21a4f62e3251290822d223edc12b15280b6 Mon Sep 17 00:00:00 2001 From: JackAKirk Date: Tue, 10 May 2022 16:15:36 +0100 Subject: [PATCH 39/45] fixed __ formatting in __syncthreads Signed-off-by: JackAKirk --- adoc/chapters/cuda_backend.adoc | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/adoc/chapters/cuda_backend.adoc b/adoc/chapters/cuda_backend.adoc index 45fed892..c99fd0ef 100644 --- a/adoc/chapters/cuda_backend.adoc +++ b/adoc/chapters/cuda_backend.adoc @@ -263,13 +263,14 @@ Likewise, thread blocks map directly to work-groups, and a single thread is a SYCL work-item. 
CUDA primarily synchronizes the threads through two functions, -[code]#cudaStreamSynchronize()# and [code]#&lowbar&lowbarsyncthreads()#. +[code]#cudaStreamSynchronize()# and [code]#\__syncthreads()#. [code]#cudaStreamSynchronize()# blocks work from being performed until all -threads on the device has been completed. [code]#&lowbar&lowbarsyncthreads()# waits for +threads on the device has been completed. +[code]#__syncthreads()# waits for all threads within a thread block to reach the same point. So [code]#cudaStreamSynchronize()# is similar to queue.wait(), buffer -destruction, and other host-device synchronization events within SYCL. -[code]#&lowbar&lowbarsyncthreads()# synchronizes the threads within a thread block which +destruction, and other host-device synchronization events within SYCL. +[code]#__syncthreads()# synchronizes the threads within a thread block which is analogous to the work-group barrier. CUDA's warp concept has no SYCL equivalent. If a user were to write From 8cb3ed4fac320cc459ea4e36fa0e0ad82024268a Mon Sep 17 00:00:00 2001 From: JackAKirk Date: Tue, 10 May 2022 16:35:25 +0100 Subject: [PATCH 40/45] clarify backend spec is for `backend::cuda` Signed-off-by: JackAKirk --- adoc/chapters/cuda_backend.adoc | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/adoc/chapters/cuda_backend.adoc b/adoc/chapters/cuda_backend.adoc index c99fd0ef..5bd32715 100644 --- a/adoc/chapters/cuda_backend.adoc +++ b/adoc/chapters/cuda_backend.adoc @@ -4,10 +4,12 @@ [[chapter:cuda-backend]] = CUDA backend specification -This chapter describes how the SYCL general programming model is mapped on top -of CUDA, and how the SYCL generic interoperability interface must be -implemented by vendors providing SYCL for CUDA implementations to ensure SYCL -applications written for the CUDA backend are interoperable. +This chapter describes the behavior of the [code]#sycl::backend::cuda# backend +and how it relates to the SYCL general programming model. +This backend is implemented on top of the CUDA SDK and exposes NVIDIA +devices to the SYCL general programming model. +This chapter also describes how the SYCL generic interoperability interface is +implemented for the [code]#sycl::backend::cuda# backend. The CUDA backend is enabled using the [code]#sycl::backend::cuda# value of [code]#enum class backend#. That means that when the CUDA backend is active, the value of From 977867f46506f03b3eddea787c33fa9ee0c9d4da Mon Sep 17 00:00:00 2001 From: Hugh Delaney Date: Fri, 20 May 2022 10:49:59 +0100 Subject: [PATCH 41/45] Updating section on index flipping --- adoc/chapters/cuda_backend.adoc | 45 ++++++++------------------------- 1 file changed, 10 insertions(+), 35 deletions(-) diff --git a/adoc/chapters/cuda_backend.adoc b/adoc/chapters/cuda_backend.adoc index 93707112..caeb022b 100644 --- a/adoc/chapters/cuda_backend.adoc +++ b/adoc/chapters/cuda_backend.adoc @@ -278,41 +278,16 @@ not yet supported. ==== Work Item Mapping -A SYCL `nd_range` will tranpose indices as it maps to hardware memory. -This gives better memory access patterns, in general. - -SYCL uses row major memory ordering, meaning in some memory object the -rows will be contiguous in memory. SYCL follows C++ convention in -this regard. Following the row-major paradigm, it is intuitive to imagine -each work-item in a `parallel_for` indexing through a contiguous block of -memory as it does its work. 
However, this gives poorly coalesced memory -accesses, as a given contiguous chunk of data being loaded may only pass -memory to a single work-item. More efficient memory access patterns are -achieved when each load of contiguous data can give data to as many -work-items as possible. Meaning the data used by a given work-item -is non-contiguous. - -SYCL makes this intuitive row-major C++ approach give good memory access -patterns by flipping the indices of the `nd_range`, as it maps to hardware. - -The linear id (whose use is not recommended) of a two dimensional `nd_range` -can be calculated using: - -[source,c++] ----- -cgh.parallel_for(range<2>(64, 128), [=](item<2> it) { - size_t linearIdx = it.get_id(1) + (it.get_id(0) * it.get_range(0)); - ... -}); ----- - -Notice that rows appear to be accessed in a column-major, rather than -row-major, format. This is only the case because the indices are flipped -by the SYCL implementation. All memory in SYCL is stored in row-major format. - -It is best to avoid calculating the linear index manually; it is better -to use a multi-dimensional `sycl::id` to index into memory, as it doesn't -expose index-flipping to the user. +The SYCL specification specifies that work-items must be arranged in a row major +fashion, making work-items with ids `(a, b, c)` and `(a, b, c+1)` adjacent. + +In native CUDA, work-items are arranged in a column major fashion, making +work-items with ids `(a, b, c)` and `(a+1, b, c)` adjacent. + +In order for a given SYCL implementation's CUDA backend to conform to the SYCL +specification, the implementation must map the row major ordering of SYCL to the +column major ordering specific to the CUDA backend. The underlying column major +ordering of work-items in CUDA is therefore not perceptible to the user. [[table.cuda.CUDA_features_to_SYCL]] .CUDA execution features with their corresponding SYCL features From aaffd99628c07a7d7f476495b55786e025a8fc66 Mon Sep 17 00:00:00 2001 From: JackAKirk Date: Mon, 23 May 2022 14:06:54 +0100 Subject: [PATCH 42/45] Removed platform interop. Signed-off-by: JackAKirk --- adoc/chapters/cuda_backend.adoc | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/adoc/chapters/cuda_backend.adoc b/adoc/chapters/cuda_backend.adoc index 5bd32715..7ff10b8f 100644 --- a/adoc/chapters/cuda_backend.adoc +++ b/adoc/chapters/cuda_backend.adoc @@ -368,7 +368,7 @@ functions of classes [code]#platform#, [code]#context#, [code]#device#, This section describes the API level interoperability between SYCL and CUDA. -The CUDA backend supports API interoperability for [code]#platform#, [code]#device#, +The CUDA backend supports API interoperability for [code]#device#, [code]#context#, [code]#queue#, and [code]#event#. Interoperability for [code]#buffer#, [code]#kernel#, [code]#kernel_bundle#, [code]#device_image#, [code]#sampled_image# and [code]#unsampled_image# are not supported. @@ -378,7 +378,6 @@ supported. [width="100%",options="header",cols="20%,20%,20%,40%"] |==== | [code]#SyclType# | [code]#backend_input_t# | [code]#backend_return_t# | Description -| [code]#platform# | [code]#std::vector# | [code]#std::vector# | A SYCL platform encapsulates a list of CUDA devices. | [code]#device# | [code]#CUdevice# | [code]#CUdevice# | A SYCL device encapsulates a CUDA device. | [code]#context# | [code]#CUcontext# | [code]#std::vector# | A SYCL context can encapsulate multiple CUDA contexts, however, it is not possible to create a SYCL context from multiple CUDA contexts. 
| [code]#queue# | [code]#CUstream# | [code]#CUstream# | A SYCL queue can encapsulates multiple CUDA stream, however, a SYCL queue can only be created from or produce one, and any synchronization required should be performed. @@ -391,11 +390,6 @@ supported. [width="100%",options="header",cols="40%,60%"] |==== | CUDA interoperability function | Description -| [code]#template + -platform + -make_platform(const backend_input_t &backendObject);# - | Create a SYCL [code]#platform# from a list of CUDA device, the list must contain at least one CUDA device. As the SYCL execution environment for the CUDA backend contains a fixed number of platforms that are enumerated via [code]#sycl::platform::get_platforms()#. Calling this function does not create a new platform. Rather it merely creates a [code]#sycl::platform# object that is a copy of one of the platforms from that enumeration. - | [code]#template + device + make_device(const backend_input_t &backendObject);# From 3915d34609edf7bdc66a2b7b31e7456998e4c116 Mon Sep 17 00:00:00 2001 From: JackAKirk Date: Mon, 23 May 2022 15:51:01 +0100 Subject: [PATCH 43/45] Updated timing descriptor ref to platform creation time. Signed-off-by: JackAKirk --- adoc/chapters/cuda_backend.adoc | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/adoc/chapters/cuda_backend.adoc b/adoc/chapters/cuda_backend.adoc index 7ff10b8f..c34ca939 100644 --- a/adoc/chapters/cuda_backend.adoc +++ b/adoc/chapters/cuda_backend.adoc @@ -344,12 +344,12 @@ expose index-flipping to the user. === Queries For all event information profiling descriptors, the calls to -[code]#sycl::event::get_profiling_info# return the time (in nanoseconds) since -the creation of the context that the event is associated with. The "Resolution" -(timing error) of the returned value is the same as that provided by the CUDA -driver API call [code]#cuEventElapsedTime#: +/- 0.5 microseconds. All event -information profiling descriptors, defined by the SYCL specification, are -supported by the CUDA backend. +[code]#sycl::event::get_profiling_info# return the time difference (in nanoseconds) +between the creation of the platform (which happens when the application is started) +and the descriptor time for the associated event. The "Resolution" (timing error) +of the returned value is the same as that provided by the CUDA driver API call, +[code]#cuEventElapsedTime#: +/- 0.5 microseconds. All event information profiling +descriptors, defined by the SYCL specification, are supported by the CUDA backend. Currently no restrictions are defined for parameters of [code]#get_info# member function in classes [code]#platform#, [code]#context#, [code]#device#, From d251dc3359f6c30e5533074424c3c6519035f878 Mon Sep 17 00:00:00 2001 From: Nicolas Miller Date: Tue, 24 May 2022 16:05:07 +0100 Subject: [PATCH 44/45] Clarify active contexts and devices --- adoc/chapters/cuda_backend.adoc | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/adoc/chapters/cuda_backend.adoc b/adoc/chapters/cuda_backend.adoc index 93707112..5ccd21e3 100644 --- a/adoc/chapters/cuda_backend.adoc +++ b/adoc/chapters/cuda_backend.adoc @@ -36,12 +36,14 @@ SYCL platform. Instead, a SYCL platform maps to a collection of CUDA devices. A SYCL <> simply maps to one, or multiple CUDA contexts. Indeed while a CUDA context is tied to a single device, this is not the case for a SYCL <> and the CUDA backend implementation may use multiple CUDA contexts -to emulate a SYCL <> containing multiple devices. 
Additionally, while -SYCL contexts are simple objects passed around either implicitly or explicitly, -CUDA contexts require to be activated on the current thread to be used by other -CUDA entry points. Therefore any use of the SYCL APIs with a CUDA backend may -modify the current active context on the thread, and no guarantee is provided -that any existing active CUDA context would be restored by SYCL. +to emulate a SYCL <> containing multiple devices. + +In CUDA, contexts and devices may need to be set as active on a thread for the +CUDA Driver or CUDA Runtime APIs to use. Therefore the SYCL API entry points +may change the active context or device whenever required for proper execution +of SYCL operations, and no guarantee is provided that any prior active context +or active device will be restored by the SYCL API entry points before returning +to the user application. A SYCL <> simply maps to one or multiple CUDA streams. Indeed while a CUDA stream is in-order, a SYCL <> isn't, so a CUDA backend implementation From f9f309d7909e4c5762cc9c3628edbd8c13e3e8d9 Mon Sep 17 00:00:00 2001 From: "aidan.belton" Date: Thu, 9 Jun 2022 09:51:04 +0100 Subject: [PATCH 45/45] Remove is_backend_active --- adoc/chapters/cuda_backend.adoc | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/adoc/chapters/cuda_backend.adoc b/adoc/chapters/cuda_backend.adoc index 93707112..63dcefd3 100644 --- a/adoc/chapters/cuda_backend.adoc +++ b/adoc/chapters/cuda_backend.adoc @@ -10,8 +10,7 @@ implemented by vendors providing SYCL for CUDA implementations to ensure SYCL applications written for the CUDA backend are interoperable. The CUDA backend is enabled using the `sycl::backend::cuda` value of `enum -class backend`. That means that when the CUDA backend is active, the value of -`sycl::is_backend_active::value` will be `true`, and the +class backend`. That means that when the CUDA backend is active the preprocessor macro `SYCL_BACKEND_CUDA` will be defined. The CUDA backend requires an installation of CUDA SDK as well as one or more