Add tests for sin #1

Open · wants to merge 12 commits into base: master

Changes from 6 commits
17 changes: 17 additions & 0 deletions .github/workflows/run_tests.yml
@@ -0,0 +1,17 @@
name: RunUnitTest

on: [push, pull_request]

jobs:
  run-tests:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Build and Run Tests
        working-directory: test
        run: |
          make all
          make runAll
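
For reference, the same targets the workflow invokes can also be run locally from a checkout (a minimal sketch; it assumes the toolchains probed by test/Makefile are either installed or skipped):

cd test
make all      # build <subdir>Test for every test subdirectory (e.g. sinTest)
make runAll   # run every built executable under <subdir>/bin
make clean    # remove the generated <subdir>/bin directories
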
3 changes: 3 additions & 0 deletions .gitignore
@@ -30,3 +30,6 @@
*.exe
*.out
*.app

# Tests
test/*/bin
8 changes: 6 additions & 2 deletions include/internal/defines.h
@@ -7,9 +7,9 @@
#pragma once

// XTD_DEVICE_FUNCTION
#if defined(__CUDACC__) || defined(__HIPCC__)
#if defined(__CUDACC__) || defined(__HIPCC__)
// CUDA or HIP/ROCm compiler
#define XTD_DEVICE_FUNCTION __host__ __define__
#define XTD_DEVICE_FUNCTION __host__ __device__
#else
// SYCL or standard C++ code
#define XTD_DEVICE_FUNCTION
@@ -29,3 +29,7 @@
// standard C++ code
#define XTD_TARGET_CPU
#endif

#if defined(__SYCL_DEVICE_ONLY__)
#include <sycl/sycl.hpp>
#endif
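
As an illustration of how the macro is consumed by the headers below (the function here is hypothetical, not part of the diff): with nvcc or hipcc the annotation expands to __host__ __device__, and to nothing for standard C++ or SYCL.

#include "internal/defines.h"

// Hypothetical example: callable from host code and, when built with
// nvcc or hipcc, from device code as well.
XTD_DEVICE_FUNCTION
inline constexpr float square(float x) { return x * x; }
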
18 changes: 8 additions & 10 deletions include/math/sin.h
@@ -7,6 +7,8 @@
#pragma once

#include "internal/defines.h"
#include <cmath>
#include <type_traits>

namespace xtd {

@@ -53,27 +55,23 @@ namespace xtd {
/* Computes the sine of arg (measured in radians),
* in double precision.
*/
template <typename T, typename = std::is_integer_v<T>>
XTD_DEVICE_FUNCTION
inline constexpr double sin(T arg) {
template <typename T, typename = std::enable_if_t<std::is_integral_v<T>>>
XTD_DEVICE_FUNCTION inline constexpr double sin(T arg) {
return sin(static_cast<double>(arg));
}

/* Computes the sine of arg (measured in radians),
* in single precision.
*/
XTD_DEVICE_FUNCTION
inline constexpr float sinf(float arg) {
return sin(arg);
}
inline constexpr float sinf(float arg) { return sin(arg); }

/* Computes the sine of arg (measured in radians),
* in single precision.
*/
template <typename T, typename = std::is_integer_v<T>>
XTD_DEVICE_FUNCTION
inline constexpr double sinf(T arg) {
template <typename T, typename = std::enable_if_t<std::is_integral_v<T>>>
XTD_DEVICE_FUNCTION inline constexpr double sinf(T arg) {
return sin(static_cast<float>(arg));
}

} // namespace xtd
} // namespace xtd
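
A minimal host-side usage sketch of the overloads touched above (assuming the header is included directly, as in this diff):

#include "math/sin.h"

int main() {
  // Integral arguments go through the enable_if overload, which forwards to sin(double).
  double a = xtd::sin(1);      // same result as xtd::sin(1.0)
  double b = xtd::sin(3.14);   // double-precision overload
  float  c = xtd::sinf(0.5f);  // single-precision overload
  (void)a; (void)b; (void)c;
  return 0;
}
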
178 changes: 178 additions & 0 deletions test/Makefile
@@ -0,0 +1,178 @@
# gcc
CXX := g++
GCC_TOOLCHAIN := $(abspath $(dir $(shell which $(CXX)))/..)
GCC_TARGET := $(shell $(CXX) -dumpmachine)
HOST_CXXFLAGS := -O2 -fPIC -fdiagnostics-show-option -felide-constructors -fmessage-length=0 -fno-math-errno -ftree-vectorize -fvisibility-inlines-hidden --param vect-max-version-for-alias-checks=50 -msse3 -pipe -pthread -Werror=address -Wall -Werror=array-bounds -Wno-attributes -Werror=conversion-null -Werror=delete-non-virtual-dtor -Wno-deprecated -Werror=format-contains-nul -Werror=format -Wno-long-long -Werror=main -Werror=missing-braces -Werror=narrowing -Wno-non-template-friend -Wnon-virtual-dtor -Werror=overflow -Werror=overlength-strings -Wparentheses -Werror=pointer-arith -Wno-psabi -Werror=reorder -Werror=return-local-addr -Wreturn-type -Werror=return-type -Werror=sign-compare -Werror=strict-aliasing -Wstrict-overflow -Werror=switch -Werror=type-limits -Wunused -Werror=unused-but-set-variable -Wno-unused-local-typedefs -Werror=unused-value -Wno-error=unused-variable -Wno-vla -Werror=write-strings -Wfatal-errors

# Compiler flags supported by GCC but not by the LLVM-based compilers (clang, hipcc, icpx, etc.)
LLVM_UNSUPPORTED_CXXFLAGS := --param vect-max-version-for-alias-checks=50 -Werror=format-contains-nul -Wno-non-template-friend -Werror=return-local-addr -Werror=unused-but-set-variable

CXXFLAGS := -std=c++17 $(HOST_CXXFLAGS) -g
NVCXX_CXXFLAGS := -std=c++20 -O0 -cuda -gpu=managed -stdpar -fpic -gopt
LDFLAGS := -O2 -fPIC -pthread -Wl,-E -lstdc++fs -ldl
LDFLAGS_NVCC := -ccbin $(CXX) --linker-options '-E' --linker-options '-lstdc++fs'
LDFLAGS_NVCXX := -cuda -Wl,-E -ldl -gpu=managed -stdpar

# CUDA
CUDA_BASE := /usr/local/cuda
ifeq ($(wildcard $(CUDA_BASE)),)
# CUDA platform not found
CUDA_BASE :=
else
# CUDA platform at $(CUDA_BASE)
CUDA_LIBDIR := $(CUDA_BASE)/lib64
CUDA_DEPS := $(CUDA_LIBDIR)/libcudart.so
CUDA_ARCH := 60
CUDA_CXXFLAGS := -I$(CUDA_BASE)/include
CUDA_LDFLAGS := -L$(CUDA_LIBDIR) -lcudart -lcudadevrt
CUDA_NVCC := $(CUDA_BASE)/bin/nvcc
define CUFLAGS_template
$(2)NVCC_FLAGS := $$(foreach ARCH,$(1),-gencode arch=compute_$$(ARCH),code=[sm_$$(ARCH),compute_$$(ARCH)]) -Wno-deprecated-gpu-targets -Xcudafe --diag_suppress=esa_on_defaulted_function_ignored --expt-relaxed-constexpr --expt-extended-lambda --generate-line-info --source-in-ptx --display-error-number --threads $$(words $(1)) --cudart=shared
$(2)NVCC_COMMON := -std=c++17 -O3 -g $$($(2)NVCC_FLAGS) -ccbin $(CXX) --compiler-options '$(HOST_CXXFLAGS)'
$(2)CUDA_CUFLAGS := $$($(2)NVCC_COMMON)
endef
$(eval $(call CUFLAGS_template,$(CUDA_ARCH),))
NVCC_COMMON := -std=c++17 -O3 -g $(NVCC_FLAGS) -ccbin $(CXX) --compiler-options '$(HOST_CXXFLAGS)'
CUDA_CUFLAGS := $(NVCC_COMMON)
endif

# ROCm
ROCM_BASE := /opt/rocm
ifeq ($(wildcard $(ROCM_BASE)),)
# ROCm platform not found
ROCM_BASE :=
else
# ROCm platform at $(ROCM_BASE)
ROCM_LIBDIR := $(ROCM_BASE)/lib
ROCM_DEPS := $(ROCM_LIBDIR)/libamdhip64.so
ROCM_ARCH := gfx900 gfx90a gfx1030
ROCM_HIPCC := $(ROCM_BASE)/bin/hipcc
HIPCC_CXXFLAGS := -fno-gpu-rdc $(foreach ARCH,$(ROCM_ARCH),--offload-arch=$(ARCH)) $(filter-out $(LLVM_UNSUPPORTED_CXXFLAGS),$(CXXFLAGS)) --target=$(GCC_TARGET) --gcc-toolchain=$(GCC_TOOLCHAIN) -I$(ROCM_BASE)/include/hip -Wno-unused-result
HIPCC_LDFLAGS := $(LDFLAGS) --target=$(GCC_TARGET) --gcc-toolchain=$(GCC_TOOLCHAIN)
endif

# oneAPI
ONEAPI_BASE := /opt/intel/oneapi
ifeq ($(wildcard $(ONEAPI_BASE)),)
# Intel oneAPI not available
$(warning Cannot find an Intel oneAPI installation at $(ONEAPI_BASE))
ONEAPI_BASE :=
else
SYCL_BASE := $(ONEAPI_BASE)/compiler/latest
SYCL_LIBDIR := $(SYCL_BASE)/lib
SYCL_CXX := $(SYCL_BASE)/bin/icpx
AOT_CPU_TARGETS := -fsycl-targets=x86_64
SYCL_FLAGS := -fsycl -fp-model=precise
SYCL_CXXFLAGS := $(filter-out $(LLVM_UNSUPPORTED_CXXFLAGS),$(CXXFLAGS)) $(SYCL_FLAGS) -Wno-unused-variable
SYCL_LDFLAGS :=
# Check for Intel GPU existence
INTEL_GPU_EXISTS := $(shell sycl-ls | grep -c ext_oneapi_level_zero)
INTEL_GPU_FLEX := $(shell sycl-ls | grep -c 'GPU Flex')
# Set AOT_INTEL_TARGETS based on the conditions
ifneq ($(INTEL_GPU_EXISTS),0)
ifneq ($(shell sycl-ls | grep -c 'GPU Flex'),0)
AOT_INTEL_TARGETS := -fsycl-targets=intel_gpu_acm_g10
else ifneq ($(shell sycl-ls | grep -c 'GPU Max'),0)
AOT_INTEL_TARGETS := -fsycl-targets=intel_gpu_pvc
else
# rely on JIT
AOT_INTEL_TARGETS :=
endif
else
AOT_INTEL_TARGETS :=
endif
endif

XTD_BASE := $(shell dirname $(realpath $(lastword $(MAKEFILE_LIST))))

CATCH2_BASE := /cvmfs/cms.cern.ch/el8_amd64_gcc12/external/catch2/2.13.6-84f81620c3580a9689570b04155f541c/include
ifeq ($(wildcard $(CATCH2_BASE)/catch.hpp),)
CATCH2_BASE := $(XTD_BASE)
endif

CATCH2_INCLUDE := $(CATCH2_BASE)/catch.hpp

external_catch2: $(CATCH2_INCLUDE)

$(CATCH2_INCLUDE):
	wget https://github.com/catchorg/Catch2/releases/download/v2.13.6/catch.hpp -O $(XTD_BASE)/catch.hpp

LIB_INCLUDE := -I$(XTD_BASE)/../include -I$(CATCH2_BASE)

SUBDIRS := $(wildcard $(XTD_BASE)/*/)
TARGETS_ALL := $(notdir $(patsubst %/,%,$(SUBDIRS)))

define TEST_template
$(1)/bin:
	mkdir -p $(1)/bin

$(1)Test: external_catch2 $(1)/bin/$(1)_t_cc $(1)/bin/$(1)_t_cuda $(1)/bin/$(1)_t_hip $(1)/bin/$(1)_t_cpusycl $(1)/bin/$(1)_t_gpusycl

$(1)/bin/$(1)_t_cc: $(1)/$(1)_t.cc | $(1)/bin
	$(CXX) $(CXXFLAGS) $(LIB_INCLUDE) $$< -o $$@

$(1)/bin/$(1)_t_cuda: $(1)/$(1)_t.cu | $(1)/bin
	@if [ -z "$(CUDA_BASE)" ]; then \
	  echo "Error: CUDA_BASE not set. Skipping $$@"; \
	else \
	  echo "$(CUDA_NVCC) $(CUDA_CXXFLAGS) $(CUDA_LDFLAGS) $(CUDA_CUFLAGS) $(LIB_INCLUDE) $$< -o $$@"; \
	  $(CUDA_NVCC) $(CUDA_CXXFLAGS) $(CUDA_LDFLAGS) $(CUDA_CUFLAGS) $(LIB_INCLUDE) $$< -o $$@; \
	fi

$(1)/bin/$(1)_t_hip: $(1)/$(1)_t.hip.cc | $(1)/bin
	@if [ -z "$(ROCM_BASE)" ]; then \
	  echo "Error: ROCM_BASE not set. Skipping $$@"; \
	else \
	  echo "$(ROCM_HIPCC) $(HIPCC_CXXFLAGS) $(HIPCC_LDFLAGS) $(LIB_INCLUDE) $$< -o $$@"; \
	  $(ROCM_HIPCC) $(HIPCC_CXXFLAGS) $(HIPCC_LDFLAGS) $(LIB_INCLUDE) $$< -o $$@; \
	fi

$(1)/bin/$(1)_t_cpusycl: $(1)/$(1)_t.sycl.cc | $(1)/bin
	@if [ -z "$(ONEAPI_BASE)" ]; then \
	  echo "Error: ONEAPI_BASE not set. Skipping $$@"; \
	else \
	  echo "$(SYCL_CXX) $(SYCL_CXXFLAGS) $(SYCL_LDFLAGS) $(AOT_CPU_TARGETS) -DONEAPI_CPU $(LIB_INCLUDE) $$< -o $$@"; \
	  $(SYCL_CXX) $(SYCL_CXXFLAGS) $(SYCL_LDFLAGS) $(AOT_CPU_TARGETS) -DONEAPI_CPU $(LIB_INCLUDE) $$< -o $$@; \
	fi

$(1)/bin/$(1)_t_gpusycl: $(1)/$(1)_t.sycl.cc | $(1)/bin
	@if [ -z "$(ONEAPI_BASE)" ]; then \
	  echo "Error: ONEAPI_BASE not set. Skipping $$@"; \
	else \
	  if [ "$(INTEL_GPU_EXISTS)" -ne 0 ]; then \
	    echo "$(SYCL_CXX) $(SYCL_CXXFLAGS) $(SYCL_LDFLAGS) $(AOT_INTEL_TARGETS) $(LIB_INCLUDE) $$< -o $$@"; \
	    $(SYCL_CXX) $(SYCL_CXXFLAGS) $(SYCL_LDFLAGS) $(AOT_INTEL_TARGETS) $(LIB_INCLUDE) $$< -o $$@; \
	  else \
	    echo "Error: Intel GPU not found. Skipping $$@"; \
	  fi \
	fi

# List of test executables
$(1)_BIN := $(XTD_BASE)/$(1)/bin

# Add targets
TEST_EXECUTABLES := $(1)/bin/$(1)_t_cc
ifneq ($(CUDA_BASE),)
TEST_EXECUTABLES += $(1)/bin/$(1)_t_cuda
endif
ifneq ($(ROCM_BASE),)
TEST_EXECUTABLES += $(1)/bin/$(1)_t_hip
endif
ifneq ($(ONEAPI_BASE),)
TEST_EXECUTABLES += $(1)/bin/$(1)_t_cpusycl
ifneq ($(INTEL_GPU_EXISTS),0)
TEST_EXECUTABLES += $(1)/bin/$(1)_t_gpusycl
endif
endif

run$(1)Test: $$(TEST_EXECUTABLES)
	@find $$($(1)_BIN) -maxdepth 1 -type f -exec echo "Running {}" \; -exec {} \; -exec echo \;

clean_$(1)Test:
	rm -rf $(1)/bin
endef
$(foreach target,$(TARGETS_ALL),$(eval $(call TEST_template,$(target))))

clean: $(foreach target,$(TARGETS_ALL), clean_$(target)Test)
all: $(foreach target,$(TARGETS_ALL), $(target)Test)
runAll: $(foreach target,$(TARGETS_ALL), run$(target)Test)
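
As an illustration of what TEST_template generates, for the test/sin subdirectory the call above expands to targets roughly like the following (a simplified sketch; the actual prerequisites depend on which toolchains were detected):

sin/bin:
	mkdir -p sin/bin

sinTest: external_catch2 sin/bin/sin_t_cc sin/bin/sin_t_cuda sin/bin/sin_t_hip sin/bin/sin_t_cpusycl sin/bin/sin_t_gpusycl

sin/bin/sin_t_cc: sin/sin_t.cc | sin/bin
	$(CXX) $(CXXFLAGS) $(LIB_INCLUDE) sin/sin_t.cc -o sin/bin/sin_t_cc

runsinTest: sin/bin/sin_t_cc
	@find $(sin_BIN) -maxdepth 1 -type f -exec echo "Running {}" \; -exec {} \; -exec echo \;

clean_sinTest:
	rm -rf sin/bin
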
30 changes: 30 additions & 0 deletions test/sin/sin_t.cc
@@ -0,0 +1,30 @@
#define CATCH_CONFIG_MAIN
#include <catch.hpp>

#include "math.h"
#include <cmath>
#include <limits>
#include <vector>

TEST_CASE("sinSerial", "[sin]") {
auto const epsilon = std::numeric_limits<double>::epsilon();
auto const epsilon_f = std::numeric_limits<float>::epsilon();

std::vector<double> values{-1., 0., M_PI / 2, M_PI, 42.};

for (auto &v : values) {
REQUIRE_THAT(
xtd::sin(static_cast<int>(v)),
Catch::Matchers::WithinAbs(std::sin(static_cast<int>(v)), epsilon));
REQUIRE_THAT(xtd::sin(static_cast<float>(v)),
Catch::Matchers::WithinAbs(std::sin(v), epsilon_f));
REQUIRE_THAT(xtd::sin(static_cast<double>(v)),
Catch::Matchers::WithinAbs(std::sin(v), epsilon));
REQUIRE_THAT(
xtd::sinf(static_cast<int>(v)),
Catch::Matchers::WithinAbs(sinf(static_cast<int>(v)), epsilon_f));
REQUIRE_THAT(xtd::sinf(static_cast<float>(v)),
Catch::Matchers::WithinAbs(sinf(v), epsilon_f));
REQUIRE_THAT(xtd::sinf(static_cast<double>(v)),
Catch::Matchers::WithinAbs(sinf(v), epsilon_f));
}
}
67 changes: 67 additions & 0 deletions test/sin/sin_t.cu
@@ -0,0 +1,67 @@
#define CATCH_CONFIG_MAIN
#include <catch.hpp>

#include "math.h"
#include <cuda_runtime.h>
#include <limits>
#include <vector>

template <typename T> __global__ void sinKernel(double *result, T input) {
result[0] = static_cast<double>(xtd::sin(input));
}

template <typename T> __global__ void sinfKernel(double *result, T input) {
result[0] = static_cast<double>(xtd::sinf(input));
}

TEST_CASE("sinCuda", "[sin]") {
int deviceCount;
cudaError_t cudaStatus = cudaGetDeviceCount(&deviceCount);

if (cudaStatus != cudaSuccess || deviceCount == 0) {
exit(EXIT_SUCCESS);
}

cudaSetDevice(0);
cudaStream_t q;
cudaStreamCreate(&q);

std::vector<double> values{-1., 0., M_PI / 2, M_PI, 42.};

double *result;
int constexpr N = 6;
cudaMallocAsync(&result, N * sizeof(double), q);

for (auto v : values) {

cudaMemsetAsync(&result, 0x00, N * sizeof(double), q);

sinKernel<<<1, 1, 0, q>>>(&result[0], static_cast<int>(v));
sinKernel<<<1, 1, 0, q>>>(&result[1], static_cast<float>(v));
sinKernel<<<1, 1, 0, q>>>(&result[2], static_cast<double>(v));
sinfKernel<<<1, 1, 0, q>>>(&result[3], static_cast<int>(v));
sinfKernel<<<1, 1, 0, q>>>(&result[4], static_cast<float>(v));
sinfKernel<<<1, 1, 0, q>>>(&result[5], static_cast<double>(v));

double resultHost[N];
cudaMemcpyAsync(resultHost, result, N * sizeof(double),
cudaMemcpyDeviceToHost, q);

cudaStreamSynchronize(q);

auto const epsilon = std::numeric_limits<double>::epsilon();
auto const epsilon_f = std::numeric_limits<float>::epsilon();
REQUIRE_THAT(resultHost[0], Catch::Matchers::WithinAbs(
std::sin(static_cast<int>(v)), epsilon));
REQUIRE_THAT(resultHost[1],
Catch::Matchers::WithinAbs(std::sin(v), epsilon_f));
REQUIRE_THAT(resultHost[2],
Catch::Matchers::WithinAbs(std::sin(v), epsilon));
REQUIRE_THAT(resultHost[3], Catch::Matchers::WithinAbs(
sinf(static_cast<int>(v)), epsilon_f));
REQUIRE_THAT(resultHost[4], Catch::Matchers::WithinAbs(sinf(v), epsilon_f));
REQUIRE_THAT(resultHost[5], Catch::Matchers::WithinAbs(sinf(v), epsilon_f));
}

cudaFreeAsync(result, q);
}
Contributor

Suggested change
#include <limits>
#include <vector>
#include <cuda_runtime.h>
#define CATCH_CONFIG_MAIN
#include <catch.hpp>
#include "math.h"
#include "cuda_check.h"
template <typename T> __global__ void sinKernel(double *result, T input) {
*result = static_cast<double>(xtd::sin(input));
}
template <typename T> __global__ void sinfKernel(double *result, T input) {
*result = static_cast<double>(xtd::sinf(input));
}
TEST_CASE("sinCuda", "[sin]") {
int deviceCount;
cudaError_t cudaStatus = cudaGetDeviceCount(&deviceCount);
if (cudaStatus != cudaSuccess || deviceCount == 0) {
exit(EXIT_SUCCESS);
}
CUDA_CHECK(cudaSetDevice(0));
cudaStream_t q;
CUDA_CHECK(cudaStreamCreate(&q));
std::vector<double> values{-1., 0., M_PI / 2, M_PI, 42.};
double *result;
int constexpr N = 6;
CUDA_CHECK(cudaMallocAsync(&result, N * sizeof(double), q));
for (auto v : values) {
CUDA_CHECK(cudaMemsetAsync(result, 0x00, N * sizeof(double), q));
sinKernel<<<1, 1, 0, q>>>(result + 0, static_cast<int>(v));
CUDA_CHECK(cudaGetLastError());
sinKernel<<<1, 1, 0, q>>>(result + 1, static_cast<float>(v));
CUDA_CHECK(cudaGetLastError());
sinKernel<<<1, 1, 0, q>>>(result + 2, static_cast<double>(v));
CUDA_CHECK(cudaGetLastError());
sinfKernel<<<1, 1, 0, q>>>(result + 3, static_cast<int>(v));
CUDA_CHECK(cudaGetLastError());
sinfKernel<<<1, 1, 0, q>>>(result + 4, static_cast<float>(v));
CUDA_CHECK(cudaGetLastError());
sinfKernel<<<1, 1, 0, q>>>(result + 5, static_cast<double>(v));
CUDA_CHECK(cudaGetLastError());
double resultHost[N];
CUDA_CHECK(cudaMemcpyAsync(resultHost, result, N * sizeof(double), cudaMemcpyDeviceToHost, q));
CUDA_CHECK(cudaStreamSynchronize(q));
auto const epsilon = std::numeric_limits<double>::epsilon();
auto const epsilon_f = std::numeric_limits<float>::epsilon();
REQUIRE_THAT(resultHost[0], Catch::Matchers::WithinAbs(std::sin(static_cast<int>(v)), epsilon));
REQUIRE_THAT(resultHost[1], Catch::Matchers::WithinAbs(std::sin(v), epsilon_f));
REQUIRE_THAT(resultHost[2], Catch::Matchers::WithinAbs(std::sin(v), epsilon));
REQUIRE_THAT(resultHost[3], Catch::Matchers::WithinAbs(sinf(static_cast<int>(v)), epsilon_f));
REQUIRE_THAT(resultHost[4], Catch::Matchers::WithinAbs(sinf(v), epsilon_f));
REQUIRE_THAT(resultHost[5], Catch::Matchers::WithinAbs(sinf(v), epsilon_f));
}
CUDA_CHECK(cudaFreeAsync(result, q));
CUDA_CHECK(cudaStreamDestroy(q));
}
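
The suggestion includes a "cuda_check.h" header that is not part of this pull request; a CUDA_CHECK macro of roughly this shape is presumably intended (a hypothetical sketch, not the reviewer's actual header):

// Hypothetical cuda_check.h sketch: abort with a message if a CUDA runtime call fails.
#pragma once
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

#define CUDA_CHECK(call)                                                \
  do {                                                                  \
    cudaError_t err_ = (call);                                          \
    if (err_ != cudaSuccess) {                                          \
      std::fprintf(stderr, "CUDA error %s at %s:%d\n",                  \
                   cudaGetErrorString(err_), __FILE__, __LINE__);       \
      std::exit(EXIT_FAILURE);                                          \
    }                                                                   \
  } while (0)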
