diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 80598d24..1bf88004 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -253,7 +253,7 @@ if(FILE_REORG_BACKWARD_COMPATIBILITY) set(ROCM_HEADER_WRAPPER_WERROR "$ENV{ROCM_HEADER_WRAPPER_WERROR}" CACHE STRING "Header wrapper warnings as errors.") else() - set(ROCM_HEADER_WRAPPER_WERROR "ON" CACHE STRING "Header wrapper warnings as errors.") + set(ROCM_HEADER_WRAPPER_WERROR "OFF" CACHE STRING "Header wrapper warnings as errors.") endif() endif() if(ROCM_HEADER_WRAPPER_WERROR) diff --git a/src/roctracer/hsa_support.cpp b/src/roctracer/hsa_support.cpp index 31153987..c5ca4640 100644 --- a/src/roctracer/hsa_support.cpp +++ b/src/roctracer/hsa_support.cpp @@ -27,6 +27,7 @@ #include "roctracer.h" #include "roctracer_hsa.h" +#include <atomic> #include #include #include @@ -415,11 +416,13 @@ hsa_status_t ExecutableDestroyIntercept(hsa_executable_t executable) { return saved_core_api.hsa_executable_destroy_fn(executable); } -bool profiling_async_copy_enable = false; +std::atomic<bool> profiling_async_copy_enable{false}; hsa_status_t ProfilingAsyncCopyEnableIntercept(bool enable) { hsa_status_t status = saved_amd_ext_api.hsa_amd_profiling_async_copy_enable_fn(enable); - if (status == HSA_STATUS_SUCCESS) profiling_async_copy_enable = enable; + if (status == HSA_STATUS_SUCCESS) { + profiling_async_copy_enable.store(enable, std::memory_order_release); + } return status; } @@ -434,6 +437,36 @@ void MemoryASyncCopyHandler(const Tracker::entry_t* entry) { ReportActivity(ACTIVITY_DOMAIN_HSA_OPS, HSA_OP_ID_COPY, &record); } +hsa_status_t MemoryASyncCopyOnEngineIntercept( + void* dst, hsa_agent_t dst_agent, const void* src, hsa_agent_t src_agent, size_t size, + uint32_t num_dep_signals, const hsa_signal_t* dep_signals, hsa_signal_t completion_signal, + hsa_amd_sdma_engine_id_t engine_id, bool force_copy_on_sdma) { + bool is_enabled = IsEnabled(ACTIVITY_DOMAIN_HSA_OPS, HSA_OP_ID_COPY); + + // FIXME: what happens if the state changes 
before returning? + [[maybe_unused]] hsa_status_t status = saved_amd_ext_api.hsa_amd_profiling_async_copy_enable_fn( + profiling_async_copy_enable.load(std::memory_order_relaxed) || is_enabled); + assert(status == HSA_STATUS_SUCCESS && "hsa_amd_profiling_async_copy_enable failed"); + + if (!is_enabled) { + return saved_amd_ext_api.hsa_amd_memory_async_copy_on_engine_fn( + dst, dst_agent, src, src_agent, size, num_dep_signals, dep_signals, completion_signal, + engine_id, force_copy_on_sdma); + } + + Tracker::entry_t* entry = new Tracker::entry_t(); + entry->handler = MemoryASyncCopyHandler; + entry->correlation_id = CorrelationId(); + Tracker::Enable(Tracker::COPY_ENTRY_TYPE, hsa_agent_t{}, completion_signal, entry); + + status = saved_amd_ext_api.hsa_amd_memory_async_copy_on_engine_fn( + dst, dst_agent, src, src_agent, size, num_dep_signals, dep_signals, entry->signal, engine_id, + force_copy_on_sdma); + if (status != HSA_STATUS_SUCCESS) Tracker::Disable(entry); + + return status; +} + hsa_status_t MemoryASyncCopyIntercept(void* dst, hsa_agent_t dst_agent, const void* src, hsa_agent_t src_agent, size_t size, uint32_t num_dep_signals, const hsa_signal_t* dep_signals, @@ -442,7 +475,7 @@ hsa_status_t MemoryASyncCopyIntercept(void* dst, hsa_agent_t dst_agent, const vo // FIXME: what happens if the state changes before returning? [[maybe_unused]] hsa_status_t status = saved_amd_ext_api.hsa_amd_profiling_async_copy_enable_fn( - profiling_async_copy_enable | is_enabled); + profiling_async_copy_enable.load(std::memory_order_relaxed) || is_enabled); assert(status == HSA_STATUS_SUCCESS && "hsa_amd_profiling_async_copy_enable failed"); if (!is_enabled) { @@ -473,7 +506,7 @@ hsa_status_t MemoryASyncCopyRectIntercept(const hsa_pitched_ptr_t* dst, // FIXME: what happens if the state changes before returning? 
[[maybe_unused]] hsa_status_t status = saved_amd_ext_api.hsa_amd_profiling_async_copy_enable_fn( - profiling_async_copy_enable | is_enabled); + profiling_async_copy_enable.load(std::memory_order_relaxed) || is_enabled); assert(status == HSA_STATUS_SUCCESS && "hsa_amd_profiling_async_copy_enable failed"); if (!is_enabled) { @@ -570,6 +603,7 @@ void Initialize(HsaApiTable* table) { // Install the HSA_OPS intercept table->amd_ext_->hsa_amd_memory_async_copy_fn = MemoryASyncCopyIntercept; table->amd_ext_->hsa_amd_memory_async_copy_rect_fn = MemoryASyncCopyRectIntercept; + table->amd_ext_->hsa_amd_memory_async_copy_on_engine_fn = MemoryASyncCopyOnEngineIntercept; table->amd_ext_->hsa_amd_profiling_async_copy_enable_fn = ProfilingAsyncCopyEnableIntercept; // Install the HSA_EVT intercept @@ -590,7 +624,7 @@ void Initialize(HsaApiTable* table) { void Finalize() { if (hsa_status_t status = - saved_amd_ext_api.hsa_amd_profiling_async_copy_enable_fn(profiling_async_copy_enable); + saved_amd_ext_api.hsa_amd_profiling_async_copy_enable_fn(profiling_async_copy_enable.load(std::memory_order_relaxed)); status != HSA_STATUS_SUCCESS) assert(!"hsa_amd_profiling_async_copy_enable failed"); diff --git a/src/roctracer/roctracer.cpp b/src/roctracer/roctracer.cpp index 66554f82..945ca1bb 100644 --- a/src/roctracer/roctracer.cpp +++ b/src/roctracer/roctracer.cpp @@ -891,4 +891,4 @@ ROCTRACER_EXPORT bool OnLoad(HsaApiTable* table, uint64_t runtime_version, ROCTRACER_EXPORT void OnUnload() { hsa_support::Finalize(); } -} // extern "C" \ No newline at end of file +} // extern "C" diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index c7c5903b..3b5d14ce 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -29,7 +29,7 @@ set(CMAKE_EXECUTABLE_RPATH_LINK_HIP_FLAG ${CMAKE_SHARED_LIBRARY_RPATH_LINK_CXX_F set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${ROCM_PATH}/lib/cmake/hip") set(CMAKE_HIP_ARCHITECTURES OFF) -find_package(HIP REQUIRED MODULE) +find_package(HIP REQUIRED) 
find_package(Clang REQUIRED CONFIG PATHS "${ROCM_PATH}"