diff --git a/pyclesperanto_prototype/_tier0/_cuda_backend.py b/pyclesperanto_prototype/_tier0/_cuda_backend.py
index f44142e9..5ed6dbce 100644
--- a/pyclesperanto_prototype/_tier0/_cuda_backend.py
+++ b/pyclesperanto_prototype/_tier0/_cuda_backend.py
@@ -83,6 +83,14 @@ def __array__(self, dtype=None):
     def __repr__(self):
         return "experimental clesperanto CUDAArray(" + str(self.array.get()) + ", dtype=" + str(self.array.dtype) + ")"
 
+    def astype(self, dtype, copy=None):
+        from ._create import create
+        if dtype == float or dtype == np.float64:
+            dtype = np.float32
+        copied = create(self.shape, dtype=dtype)
+        from .._tier1 import copy
+        return copy(self, copied)
+
     def min(self, axis=None, out=None):
         from .._tier2 import minimum_of_all_pixels
         from .._tier1 import minimum_x_projection
diff --git a/pyclesperanto_prototype/_tier0/_cuda_execute.py b/pyclesperanto_prototype/_tier0/_cuda_execute.py
index cf4c4b3f..82b603da 100644
--- a/pyclesperanto_prototype/_tier0/_cuda_execute.py
+++ b/pyclesperanto_prototype/_tier0/_cuda_execute.py
@@ -6,6 +6,7 @@
 #define sampler_t int
 
 #define FLT_MIN 1.19209e-07
+#define FLT_MAX 1e+37
 
 #define MAX_ARRAY_SIZE 1000
 
@@ -166,6 +167,10 @@
     }
 }
 
+__device__ inline unsigned int atomic_add(unsigned int* address, unsigned int value) {
+    return atomicAdd(address, value);
+}
+
 #define get_global_size(dim) global_size_ ## dim ## _size
 
 #define READ_IMAGE(a,b,c) READ_ ## a ## _IMAGE(a,b,c)
@@ -386,10 +391,24 @@ def execute(anchor, opencl_kernel_filename, kernel_name, global_size, parameters
             )
             size_params = ""
             arguments.append(value)
-        elif isinstance(value, int):
+        elif isinstance(value, np.int8):
+            arguments.append(cp.int8(value))
+        elif isinstance(value, np.uint8):
+            arguments.append(cp.uint8(value))
+        elif isinstance(value, np.int16):
+            arguments.append(cp.int16(value))
+        elif isinstance(value, np.uint16):
+            arguments.append(cp.uint16(value))
+        elif isinstance(value, int) or isinstance(value, np.int32):
             arguments.append(cp.int32(value))
-        elif isinstance(value, float):
+        elif isinstance(value, float) or isinstance(value, np.float32):
             arguments.append(cp.float32(value))
+        elif isinstance(value, np.int64):
+            arguments.append(cp.int64(value))
+        elif isinstance(value, np.uint64):
+            arguments.append(cp.uint64(value))
+        elif isinstance(value, np.float64):
+            arguments.append(cp.float64(value))
         else:
             var_type = str(type(value))
             raise TypeError(
@@ -435,12 +454,19 @@ def execute(anchor, opencl_kernel_filename, kernel_name, global_size, parameters
     #print("Grid", grid)
     #print("Block", block)
 
-    # load and compile
-    a_kernel = cp.RawKernel(cuda_kernel, kernel_name)
-
-    # run
-    a_kernel(grid, block, tuple(arguments))
-
+    try:
+        # load and compile
+        a_kernel = cp.RawKernel(cuda_kernel, kernel_name)
+
+        # run
+        a_kernel(grid, block, tuple(arguments))
+    except cp.cuda.compiler.CompileException as ce:
+        error = []
+        for i, k in enumerate(cuda_kernel.split("\n")):
+            error.append(str(i) + ":" + k)
+        error.append(ce.get_message())
+        error.append("CUDA compilation failed")
+        raise RuntimeError("\n".join(error))
     #for i, a in enumerate(arguments):
     #    print(i, type(a), a)
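
For reference, the extended scalar handling in execute() can be read as a standalone dispatch. The helper below, to_cupy_scalar, is hypothetical and exists only to illustrate the behaviour; it is not part of the diff, but the check order mirrors the elif chain above.

import numpy as np
import cupy as cp

def to_cupy_scalar(value):
    # Hypothetical helper (not in the diff) restating the scalar dispatch
    # that execute() now performs, in the same check order: NumPy scalars
    # are forwarded as CuPy scalars of matching width so the RawKernel
    # receives correctly sized arguments.
    if isinstance(value, np.int8):
        return cp.int8(value)
    elif isinstance(value, np.uint8):
        return cp.uint8(value)
    elif isinstance(value, np.int16):
        return cp.int16(value)
    elif isinstance(value, np.uint16):
        return cp.uint16(value)
    elif isinstance(value, int) or isinstance(value, np.int32):
        # plain Python ints keep their previous 32-bit mapping
        return cp.int32(value)
    elif isinstance(value, float) or isinstance(value, np.float32):
        # note: numpy.float64 subclasses Python float, so float64 scalars
        # are caught here and passed to the kernel as 32-bit floats
        return cp.float32(value)
    elif isinstance(value, np.int64):
        return cp.int64(value)
    elif isinstance(value, np.uint64):
        return cp.uint64(value)
    elif isinstance(value, np.float64):
        return cp.float64(value)
    raise TypeError("parameter type not supported: " + str(type(value)))

# example: a 16-bit radius parameter stays 16-bit on the device side
arg = to_cupy_scalar(np.uint16(42))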