fixstars · vorj · Feb 5, 2020 · Feb 5, 2020 · Apr 16, 2019 · Feb 5, 2020
diff --git a/clpy/backend/compiler.pyx b/clpy/backend/compiler.pyx
@@ -9,6 +9,8 @@ cpdef function.Module compile_with_cache(
         str source, tuple options=(), arch=None, cache_dir=None,
         extra_source=None):
     options += (' -cl-fp32-correctly-rounded-divide-sqrt', )
+    if clpy.backend.opencl.env.supports_cl_khr_fp16():
+        options += (' -D__CLPY_ENABLE_CL_KHR_FP16', )
     optionStr = functools.reduce(operator.add, options)
 
     device = clpy.backend.opencl.env.get_device()

diff --git a/clpy/backend/opencl/env.pxd b/clpy/backend/opencl/env.pxd
@@ -1,8 +1,10 @@
 include "common_decl.pxi"
+from libcpp cimport bool
 
 cdef cl_context get_context()
 cdef cl_command_queue get_command_queue()
 cpdef int get_device_id()
 cpdef set_device_id(int id)
 cdef cl_device_id* get_devices()
 cdef cl_device_id get_device()
+cpdef bool supports_cl_khr_fp16() except *
diff --git a/clpy/backend/opencl/env.pyx b/clpy/backend/opencl/env.pyx
@@ -8,6 +8,7 @@ from clpy.backend.opencl cimport api
 from cython.view cimport array as cython_array
 
 from libc.stdlib cimport malloc
+from libcpp cimport bool
 
 cdef interpret_versionstr(versionstr):
     version_detector = re.compile('''OpenCL (\d+)\.(\d+)''')
@@ -72,6 +73,33 @@ cdef void check_device_version(cl_device_id device, required_version) except *:
     if not interpret_versionstr(versionstr) >= required_version:
         raise RuntimeError("Device's OpenCL version must be >= 1.2")
 
+cdef cl_bool check_device_extension(cl_device_id device,
+                                    str extension) except *:
+    cdef size_t param_value_size
+    api.GetDeviceInfo(
+        device,
+        CL_DEVICE_EXTENSIONS,
+        0,
+        NULL,
+        &param_value_size)
+
+    cdef cython_array extensions_buffer =\
+        cython_array(shape=(param_value_size,),
+                     itemsize=sizeof(char),
+                     format='b')
+    api.GetDeviceInfo(
+        device,
+        CL_DEVICE_EXTENSIONS,
+        param_value_size,
+        <void*>extensions_buffer.data,
+        &param_value_size)
+
+    extensions =\
+        extensions_buffer.data[:param_value_size]\
+        .decode(locale.getpreferredencoding())
+
+    return CL_TRUE if extension in extensions else CL_FALSE
+
 
 ##########################################
 # Initialization
@@ -164,6 +192,10 @@ cdef cl_device_id get_device():
     global __current_device_id
     return __devices[__current_device_id]
 
+cpdef bool supports_cl_khr_fp16() except *:
+    global __current_device_id
+    return check_device_extension(get_device(), 'cl_khr_fp16') == CL_TRUE
+
 
 def release():
     """Release command_queue and context automatically."""

diff --git a/clpy/backend/ultima/compiler.pxd b/clpy/backend/ultima/compiler.pxd
@@ -1 +1,3 @@
-cpdef str exec_ultima(str source, str _clpy_header_include=*)
+cpdef str exec_ultima(str source,
+                      str _clpy_header_include=*,
+                      tuple _options=*)
diff --git a/clpy/backend/ultima/compiler.pyx b/clpy/backend/ultima/compiler.pyx
@@ -1,3 +1,5 @@
+import functools
+import operator
 import os
 import subprocess
 import tempfile
@@ -21,7 +23,9 @@ class TempFile(object):
             os.remove(self.fn)
 
 
-cpdef str exec_ultima(str source, str _clpy_header_include=''):
+cpdef str exec_ultima(str source,
+                      str _clpy_header_include='',
+                      tuple _options=('',)):
     kernel_arg_size_t_code = 'typedef ' \
         + clpy.backend.opencl.utility.typeof_size() + ' __kernel_arg_size_t;\n'
     source = kernel_arg_size_t_code + _clpy_header_include + '\n' \
@@ -35,15 +39,17 @@ cpdef str exec_ultima(str source, str _clpy_header_include=''):
 
     with TempFile(filename, source) as tf:
         root_dir = os.path.join(clpy.__path__[0], "..")
-        proc = subprocess.Popen('{} {} -- -I {}'
+        proc = subprocess.Popen('{} {} -- -I {} {}'
                                 .format(os.path.join(root_dir,
                                                      "ultima",
                                                      "ultima"),
                                         filename,
                                         os.path.join(root_dir,
                                                      "clpy",
                                                      "core",
-                                                     "include"))
+                                                     "include"),
+                                        functools.reduce(operator.add,
+                                                         _options))
                                 .strip().split(" "),
                                 stdout=subprocess.PIPE,
                                 stderr=subprocess.PIPE,

diff --git a/clpy/core/carray.pxi b/clpy/core/carray.pxi
@@ -130,7 +130,13 @@ cpdef function.Module compile_with_cache(
         str source, tuple options=(), arch=None, cache_dir=None):
     kernel_arg_size_t_code = 'typedef ' \
         + clpy.backend.opencl.utility.typeof_size() + ' __kernel_arg_size_t;\n'
-    source = clpy.backend.ultima.exec_ultima(source, _clpy_header)
+    if clpy.backend.opencl.env.supports_cl_khr_fp16():
+        ultima_options = (' -D__CLPY_ENABLE_CL_KHR_FP16', )
+    else:
+        ultima_options = ('', )
+    source = clpy.backend.ultima.exec_ultima(source,
+                                             _clpy_header,
+                                             ultima_options)
 
     extra_source = _get_header_source()
     options += ('-I%s' % _get_header_dir_path(),)

diff --git a/clpy/core/include/cl_stub.hpp b/clpy/core/include/cl_stub.hpp
@@ -11,8 +11,10 @@ typedef unsigned long ulong;
 typedef unsigned long size_t;
 typedef long ptrdiff_t;
 
+#ifdef __CLPY_ENABLE_CL_KHR_FP16
 typedef float half;
 typedef half __clpy__half;
+#endif
 #define half __clpy__half
 
 __attribute__((annotate("clpy_no_mangle"))) static unsigned int atomic_cmpxchg(volatile __global unsigned int*, unsigned int, unsigned int);

diff --git a/clpy/core/include/clpy/carray.clh b/clpy/core/include/clpy/carray.clh
@@ -1,5 +1,7 @@
 #pragma once
+#ifdef __CLPY_ENABLE_CL_KHR_FP16
 #pragma OPENCL EXTENSION cl_khr_fp16: enable
+#endif
 
 // TODO: Implement common functions in OpenCL C
 #if 0
@@ -530,54 +532,83 @@ static void __clpy_end_print_out() __attribute__((annotate("clpy_end_print_out")
 
 
 #ifdef __ULTIMA
-__attribute__((annotate("clpy_no_mangle"))) static half convert_float_to_half(float x);
+__attribute__((annotate("clpy_no_mangle"))) static ushort convert_float_to_half_ushort(float x);
+__attribute__((annotate("clpy_no_mangle"))) static float convert_half_ushort_to_float(ushort x);
 #else
 #include "fp16.clh"
+#endif
+
+#ifdef __CLPY_ENABLE_CL_KHR_FP16
+
+#ifndef __ULTIMA
+
 typedef half __clpy__half;
+
 #endif
 
-#ifdef __ULTIMA
-__attribute__((annotate("clpy_no_mangle"))) static half clpy_nextafter_fp16(half x1, half x2);
+__attribute__((annotate("clpy_no_mangle"))) static __clpy__half clpy_nextafter_fp16(__clpy__half x1, __clpy__half x2){
+  return nextafter(x1, x2);
+}
+
+__attribute__((annotate("clpy_no_mangle"))) static __clpy__half convert_float_to_half(float x){
+  const ushort h = convert_float_to_half_ushort(x);
+  return *(const half*)&h;
+}
+
 #else
-static int isnan_fp16(half x){
-  unsigned short const* x_raw = (unsigned short const*)&x;
-  return (*x_raw & 0x7c00u) == 0x7c00u && (*x_raw & 0x03ffu) != 0x0000u;
-}
-static int isfinite_fp16(half x){
-  unsigned short const* x_raw = (unsigned short const*)&x;
-  return (*x_raw & 0x7c00u) != 0x7c00u;
-}
-static int iszero_fp16(half x){
-  unsigned short const* x_raw = (unsigned short const*)&x;
-  return (*x_raw & 0x7fffu) == 0;
-}
-static int eq_nonan_fp16(half x1, half x2){
-  unsigned short const* x1_raw = (unsigned short const*)&x1;
-  unsigned short const* x2_raw = (unsigned short const*)&x2;
-  return (*x1_raw == *x2_raw || ((*x1_raw | *x2_raw) & 0x7fff) == 0);
-}
-static half clpy_nextafter_fp16(half x1, half x2){
-  unsigned short const* x1_raw = (unsigned short const*)&x1;
-  unsigned short const* x2_raw = (unsigned short const*)&x2;
+
+#ifdef __ULTIMA
+static void __clpy_begin_print_out() __attribute__((annotate("clpy_begin_print_out")));
+
+typedef struct __attribute__((packed)) __attribute__((aligned(2))) __clpy__half_{
+  ushort v;
+  __clpy__half_() = default;
+  __clpy__half_(float f):v{convert_float_to_half_ushort(f)}{}
+  operator float()const{return convert_half_ushort_to_float(v);}
+}__clpy__half;
+
+static int isnan_fp16(__clpy__half x){
+  return (x.v & 0x7c00u) == 0x7c00u && (x.v & 0x03ffu) != 0x0000u;
+}
+static int isfinite_fp16(__clpy__half x){
+  return (x.v & 0x7c00u) != 0x7c00u;
+}
+static int iszero_fp16(__clpy__half x){
+  return (x.v & 0x7fffu) == 0;
+}
+static int eq_nonan_fp16(__clpy__half x1, __clpy__half x2){
+  return (x1.v == x2.v || ((x1.v | x2.v) & 0x7fff) == 0);
+}
+__attribute__((annotate("clpy_no_mangle"))) static __clpy__half clpy_nextafter_fp16(__clpy__half x1, __clpy__half x2){
   unsigned short ret_raw_;
 
   if (!isfinite_fp16(x1) || isnan_fp16(x2)){
     ret_raw_ = 0x7e00u; // NaN in fp16
   }else if(eq_nonan_fp16(x1, x2)){
-    ret_raw_ = *x1_raw;
+    ret_raw_ = x1.v;
   }else if(iszero_fp16(x1)){
-    ret_raw_ = (*x2_raw & 0x8000u) + 1;
-  }else if(!(*x1_raw & 0x8000u)){
-    if (*(short const*)x1_raw > *(short const*)x2_raw){
-      ret_raw_ = *x1_raw - 1;
+    ret_raw_ = (x2.v & 0x8000u) + 1;
+  }else if(!(x1.v & 0x8000u)){
+    if (*(short const*)&x1.v > *(short const*)&x2.v){
+      ret_raw_ = x1.v - 1;
     }else{
-      ret_raw_ = *x1_raw + 1;
+      ret_raw_ = x1.v + 1;
     }
-  }else if(!(*x2_raw & 0x8000u) || (*x1_raw & 0x7fffu) > (*x2_raw & 0x7fffu)) {
-    ret_raw_ = *x1_raw - 1;
+  }else if(!(x2.v & 0x8000u) || (x1.v & 0x7fffu) > (x2.v & 0x7fffu)) {
+    ret_raw_ = x1.v - 1;
   } else {
-    ret_raw_ = *x1_raw + 1;
+    ret_raw_ = x1.v + 1;
   }
-  return *(half*)&ret_raw_;
+  __clpy__half ret;
+  ret.v = ret_raw_;
+  return ret;
+}
+
+__attribute__((annotate("clpy_no_mangle"))) static __clpy__half convert_float_to_half(float x){
+  __clpy__half ret(x);
+  return ret;
 }
+
+static void __clpy_end_print_out() __attribute__((annotate("clpy_end_print_out")));
+#endif
 #endif
diff --git a/clpy/core/include/clpy/fp16.clh b/clpy/core/include/clpy/fp16.clh
@@ -12,7 +12,7 @@
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  */
 
-static half convert_float_to_half(float f) {
+static ushort convert_float_to_half_ushort(float f) {
   const float scale_to_inf = 0x1.0p+112f;
   const float scale_to_zero = 0x1.0p-110f;
   float base = (fabs(f) * scale_to_inf) * scale_to_zero;
@@ -32,5 +32,26 @@ static half convert_float_to_half(float f) {
   const uint mantissa_bits = bits & (uint)(0x00000FFF);
   const uint nonsign = exp_bits + mantissa_bits;
   const ushort ret = ((sign >> 16) | (shl1_w > (uint)(0xFF000000) ? (ushort)(0x7E00) : nonsign));
-  return *(const half*)&ret;
+  return ret;
+}
+
+static inline float convert_half_ushort_to_float(ushort h) {
+  const uint w = (uint) h << 16;
+  const uint sign = w & (uint)0x80000000;
+  const uint two_w = w + w;
+
+  const uint exp_offset = (uint)0xE0 << 23;
+  const float exp_scale = 0x1.0p-112f;
+  const uint normalized_value_ = (two_w >> 4) + exp_offset;
+  const float normalized_value = *(const float*)&normalized_value_ * exp_scale;
+
+  const uint magic_mask = (uint)(126u) << 23;
+  const float magic_bias = .5f;
+  const uint denormalized_value_ = (two_w >> 17) | magic_mask;
+  const float denormalized_value = *(const float*)&denormalized_value_ - magic_bias;
+
+  const uint denormalized_cutoff = (uint)1u << 27;
+  const uint result = sign |
+    *(const uint*)(two_w < denormalized_cutoff ? &denormalized_value : &normalized_value);
+  return *(const float*)&result;
 }
diff --git a/tests/clpy_tests/opencl_tests/ultima_tests/test_half.py b/tests/clpy_tests/opencl_tests/ultima_tests/test_half.py
@@ -10,21 +10,35 @@
 class TestUltimaHalfTrick(unittest.TestCase):
 
     def test_type_half(self):
-        x = '''
+        supports_cl_khr_fp16 = clpy.backend.opencl.env.supports_cl_khr_fp16()
+        options = ('-D__CLPY_ENABLE_CL_KHR_FP16'
+                   if supports_cl_khr_fp16
+                   else '', )
+        x = clpy.backend.ultima.exec_ultima('',
+                                            '#include <cupy/carray.hpp>',
+                                            _options=options) + ('''
 __clpy__half f() 
 {
-    __clpy__half a;
-    return (__clpy__half)(42.F);
+    __clpy__half a = 42.F;
+    return a;
 }
-'''
+''' if supports_cl_khr_fp16 else '''
+__clpy__half f() 
+{
+    __clpy__half a;constructor___clpy__half___left_paren____clpy__half_float__right_paren__(&a, 42.F);
+    return a;
+}
+''')[1:]
         y = clpy.backend.ultima.exec_ultima(
             '''
             half f(){
-              half a;
-              return static_cast<half>(42.f);
+              half a = 42.f;
+              return a;
             }
-            ''')
-        self.assertEqual(x[1:], y)
+            ''',
+            '#include <cupy/carray.hpp>',
+            _options=options)
+        self.assertEqual(x, y)
 
     def test_variable_named_half(self):
         x = '''
@@ -54,21 +68,35 @@ def test_argument_named_half(self):
         self.assertEqual(x[1:], y)
 
     def test_clpy_half(self):
-        x = '''
+        supports_cl_khr_fp16 = clpy.backend.opencl.env.supports_cl_khr_fp16()
+        options = ('-D__CLPY_ENABLE_CL_KHR_FP16'
+                   if supports_cl_khr_fp16
+                   else '', )
+        x = clpy.backend.ultima.exec_ultima('',
+                                            '#include <cupy/carray.hpp>',
+                                            _options=options) + ('''
 void f() 
 {
     __clpy__half half_ = 42.F;
     int __clpy__half = half_;
 }
-'''
+''' if supports_cl_khr_fp16 else '''
+void f() 
+{
+    __clpy__half half_;constructor___clpy__half___left_paren____clpy__half_float__right_paren__(&half_, 42.F);
+    int __clpy__half = operatorfloat___clpy__half_(&half_);
+}
+''')[1:]
         y = clpy.backend.ultima.exec_ultima(
             '''
             void f(){
               __clpy__half half_ = 42.f;
               int __clpy__half = half_;
             }
-            ''')
-        self.assertEqual(x[1:], y)
+            ''',
+            '#include <cupy/carray.hpp>',
+            _options=options)
+        self.assertEqual(x, y)
 
 
 if __name__ == "__main__":