From 6fffa50442544cbf13367a7877698ded511aad80 Mon Sep 17 00:00:00 2001 From: Evgeny Date: Tue, 11 Aug 2020 04:44:08 -0500 Subject: [PATCH 01/47] optimized tool stats; Change-Id: I1baab986d36207b87f6f9ad5e0a45a9cffbea0c8 --- src/core/roctracer.cpp | 12 +-- src/core/trace_buffer.h | 21 ++++- src/proxy/intercept_queue.h | 4 +- test/CMakeLists.txt | 2 +- test/tool/tracer_tool.cpp | 163 ++++++++++++++++++++++++++++++------ 5 files changed, 166 insertions(+), 36 deletions(-) diff --git a/src/core/roctracer.cpp b/src/core/roctracer.cpp index 2d15bbba..dd47bb0f 100644 --- a/src/core/roctracer.cpp +++ b/src/core/roctracer.cpp @@ -218,11 +218,12 @@ template<> bool act_en_functor_t::fun(const act_en_functor_t::record_t& record) void hsa_async_copy_handler(::proxy::Tracker::entry_t* entry); void hsa_kernel_handler(::proxy::Tracker::entry_t* entry); -TraceBuffer::flush_prm_t trace_buffer_prm[] = { +constexpr TraceBuffer::flush_prm_t trace_buffer_prm[] = { {COPY_ENTRY_TYPE, hsa_async_copy_handler}, {KERNEL_ENTRY_TYPE, hsa_kernel_handler} }; -TraceBuffer trace_buffer("HSA GPU", 0x200000, trace_buffer_prm, 2); +TraceBuffer* trace_buffer = NULL; +//TraceBuffer trace_buffer("HSA GPU", 0x200000, trace_buffer_prm, 2); namespace hsa_support { // callbacks table @@ -567,7 +568,7 @@ hsa_status_t hsa_amd_memory_async_copy_interceptor( { hsa_status_t status = HSA_STATUS_SUCCESS; if (hsa_support::async_copy_callback_enabled) { - trace_entry_t* entry = trace_buffer.GetEntry(); + trace_entry_t* entry = trace_buffer->GetEntry(); ::proxy::Tracker::Enable(COPY_ENTRY_TYPE, hsa_agent_t{}, completion_signal, entry); status = hsa_amd_memory_async_copy_fn(dst, dst_agent, src, src_agent, size, num_dep_signals, @@ -591,7 +592,7 @@ hsa_status_t hsa_amd_memory_async_copy_rect_interceptor( { hsa_status_t status = HSA_STATUS_SUCCESS; if (hsa_support::async_copy_callback_enabled) { - trace_entry_t* entry = trace_buffer.GetEntry(); + trace_entry_t* entry = trace_buffer->GetEntry(); 
::proxy::Tracker::Enable(COPY_ENTRY_TYPE, hsa_agent_t{}, completion_signal, entry); status = hsa_amd_memory_async_copy_rect_fn(dst, dst_offset, src, src_offset, range, copy_agent, @@ -1289,13 +1290,14 @@ PUBLIC_API void roctracer_unload() { PUBLIC_API void roctracer_flush_buf() { ONLOAD_TRACE_BEG(); - roctracer::trace_buffer.Flush(); + roctracer::trace_buffer->Flush(); ONLOAD_TRACE_END(); } CONSTRUCTOR_API void constructor() { ONLOAD_TRACE_BEG(); roctracer::util::Logger::Create(); + roctracer::trace_buffer = new roctracer::TraceBuffer("HSA GPU", 0x200000, roctracer::trace_buffer_prm, 2); roctracer_load(); ONLOAD_TRACE_END(); } diff --git a/src/core/trace_buffer.h b/src/core/trace_buffer.h index cd62dda7..8d994046 100644 --- a/src/core/trace_buffer.h +++ b/src/core/trace_buffer.h @@ -124,7 +124,7 @@ class TraceBuffer : protected TraceBufferBase { callback_t fun; }; - TraceBuffer(const char* name, uint32_t size, flush_prm_t* flush_prm_arr, uint32_t flush_prm_count) : + TraceBuffer(const char* name, uint32_t size, const flush_prm_t* flush_prm_arr, uint32_t flush_prm_count, uint32_t prior = 0) : is_flushed_(false), work_thread_started_(false) { @@ -139,12 +139,14 @@ class TraceBuffer : protected TraceBufferBase { flush_prm_arr_ = flush_prm_arr; flush_prm_count_ = flush_prm_count; + priority_ = prior; + TraceBufferBase::Push(this); } ~TraceBuffer() { StopWorkerThread(); - Flush(); + FlushAll(); } void StartWorkerThread() { @@ -176,14 +178,24 @@ class TraceBuffer : protected TraceBufferBase { } void Flush() { flush_buf(); } + void Flush(const bool& b) { + DisableFlushing(!b); + flush_buf(); + } + void DisableFlushing(const bool& b) { is_flushed_.exchange(b, std::memory_order_acquire); } private: void flush_buf() { std::lock_guard lck(mutex_); const bool is_flushed = is_flushed_.exchange(true, std::memory_order_acquire); + if (priority_ != 0) { + priority_ -= 1; + return; + } + if (is_flushed == false) { - for (flush_prm_t* prm = flush_prm_arr_; prm < flush_prm_arr_ + 
flush_prm_count_; prm++) { + for (const flush_prm_t* prm = flush_prm_arr_; prm < flush_prm_arr_ + flush_prm_count_; prm++) { // Flushed entries type uint32_t type = prm->type; // Flushing function @@ -253,8 +265,9 @@ class TraceBuffer : protected TraceBufferBase { volatile std::atomic end_pointer_; std::list buf_list_; - flush_prm_t* flush_prm_arr_; + const flush_prm_t* flush_prm_arr_; uint32_t flush_prm_count_; + uint32_t priority_; volatile std::atomic is_flushed_; pthread_t work_thread_; diff --git a/src/proxy/intercept_queue.h b/src/proxy/intercept_queue.h index f92f1ce6..000c7e88 100644 --- a/src/proxy/intercept_queue.h +++ b/src/proxy/intercept_queue.h @@ -39,7 +39,7 @@ THE SOFTWARE. #include "util/hsa_rsrc_factory.h" #include "util/exception.h" -namespace roctracer { extern TraceBuffer trace_buffer; } +namespace roctracer { extern TraceBuffer* trace_buffer; } namespace rocprofiler { extern decltype(hsa_queue_create)* hsa_queue_create_fn; @@ -160,7 +160,7 @@ class InterceptQueue { const char* kernel_name = GetKernelName(kernel_symbol); // Adding kernel timing tracker - ::proxy::Tracker::entry_t* entry = roctracer::trace_buffer.GetEntry(); + ::proxy::Tracker::entry_t* entry = roctracer::trace_buffer->GetEntry(); entry->kernel.tid = syscall(__NR_gettid); entry->kernel.name = kernel_name; ::proxy::Tracker::Enable(roctracer::KERNEL_ENTRY_TYPE, obj->agent_info_->dev_id, completion_signal, entry); diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 3e3b9654..6a6d7d17 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -44,7 +44,7 @@ endif () ## Path to HSA test set ( HSA_TEST_DIR "${TEST_DIR}/hsa/test" ) -set ( HSA_REV "5b47aae" ) +set ( HSA_REV "a657002" ) ## test run script set ( RUN_SCRIPT "${TEST_DIR}/run.sh" ) diff --git a/test/tool/tracer_tool.cpp b/test/tool/tracer_tool.cpp index df2530a5..db87d04d 100644 --- a/test/tool/tracer_tool.cpp +++ b/test/tool/tracer_tool.cpp @@ -42,6 +42,7 @@ THE SOFTWARE. 
#include "src/core/loader.h" #include "src/core/trace_buffer.h" +#include "util/evt_stats.h" #include "util/hsa_rsrc_factory.h" #include "util/xml.h" @@ -96,6 +97,12 @@ std::vector kfd_api_vec; LOADER_INSTANTIATE(); TRACE_BUFFER_INSTANTIATE(); +typedef EvtStatsT EvtStatsA; +// HIP stats +EvtStats* hip_api_stats = NULL; +EvtStatsA* hip_kernel_stats = NULL; +EvtStatsA* hip_memcpy_stats = NULL; + // Global output file handle FILE* begin_ts_file_handle = NULL; FILE* roctx_file_handle = NULL; @@ -137,7 +144,7 @@ static inline const char* cxx_demangle(const char* symbol) { size_t funcnamesize; int status; const char* ret = (symbol != NULL) ? abi::__cxa_demangle(symbol, NULL, &funcnamesize, &status) : symbol; - return (ret != NULL) ? ret : symbol; + return (ret != NULL) ? ret : strdup(symbol); } // Tracing control thread @@ -208,8 +215,9 @@ struct roctx_trace_entry_t { }; void roctx_flush_cb(roctx_trace_entry_t* entry); -roctracer::TraceBuffer::flush_prm_t roctx_flush_prm[1] = {{0, roctx_flush_cb}}; -roctracer::TraceBuffer roctx_trace_buffer("rocTX API", 0x200000, roctx_flush_prm, 1); +constexpr roctracer::TraceBuffer::flush_prm_t roctx_flush_prm[1] = {{0, roctx_flush_cb}}; +roctracer::TraceBuffer* roctx_trace_buffer = NULL; +//roctracer::TraceBuffer roctx_trace_buffer("rocTX API", 0x200000, roctx_flush_prm, 1); // rocTX callback function static inline void roctx_callback_fun( @@ -224,7 +232,7 @@ static inline void roctx_callback_fun( #else const timestamp_t time = timer->timestamp_fn_ns(); #endif - roctx_trace_entry_t* entry = roctx_trace_buffer.GetEntry(); + roctx_trace_entry_t* entry = roctx_trace_buffer->GetEntry(); entry->valid = roctracer::TRACE_ENTRY_COMPL; entry->type = 0; entry->cid = cid; @@ -286,8 +294,9 @@ struct hsa_api_trace_entry_t { }; void hsa_api_flush_cb(hsa_api_trace_entry_t* entry); -roctracer::TraceBuffer::flush_prm_t hsa_flush_prm[1] = {{0, hsa_api_flush_cb}}; -roctracer::TraceBuffer hsa_api_trace_buffer("HSA API", 0x200000, hsa_flush_prm, 1); 
+constexpr roctracer::TraceBuffer::flush_prm_t hsa_flush_prm[1] = {{0, hsa_api_flush_cb}}; +roctracer::TraceBuffer* hsa_api_trace_buffer = NULL; +//roctracer::TraceBuffer hsa_api_trace_buffer("HSA API", 0x200000, hsa_flush_prm, 1); // HSA API callback function void hsa_api_callback( @@ -302,7 +311,7 @@ void hsa_api_callback( hsa_begin_timestamp = timer->timestamp_fn_ns(); } else { const timestamp_t end_timestamp = (cid == HSA_API_ID_hsa_shut_down) ? hsa_begin_timestamp : timer->timestamp_fn_ns(); - hsa_api_trace_entry_t* entry = hsa_api_trace_buffer.GetEntry(); + hsa_api_trace_entry_t* entry = hsa_api_trace_buffer->GetEntry(); entry->valid = roctracer::TRACE_ENTRY_COMPL; entry->type = 0; entry->cid = cid; @@ -348,8 +357,9 @@ struct hip_api_trace_entry_t { }; void hip_api_flush_cb(hip_api_trace_entry_t* entry); -roctracer::TraceBuffer::flush_prm_t hip_flush_prm[1] = {{0, hip_api_flush_cb}}; -roctracer::TraceBuffer hip_api_trace_buffer("HIP", 0x200000, hip_flush_prm, 1); +constexpr roctracer::TraceBuffer::flush_prm_t hip_api_flush_prm[1] = {{0, hip_api_flush_cb}}; +roctracer::TraceBuffer* hip_api_trace_buffer = NULL; +//roctracer::TraceBuffer hip_api_trace_buffer("HIP API", 0x200000, hip_api_flush_prm, 1); static inline bool is_hip_kernel_launch_api(const uint32_t& cid) { bool ret = @@ -379,7 +389,7 @@ void hip_api_callback( hipApiArgsInit((hip_api_id_t)cid, const_cast(data)); const timestamp_t end_timestamp = timer->timestamp_fn_ns(); - hip_api_trace_entry_t* entry = hip_api_trace_buffer.GetEntry(); + hip_api_trace_entry_t* entry = hip_api_trace_buffer->GetEntry(); entry->valid = roctracer::TRACE_ENTRY_COMPL; entry->type = 0; entry->cid = cid; @@ -440,7 +450,7 @@ void mark_api_callback( const char* name = reinterpret_cast(callback_data); const timestamp_t timestamp = timer->timestamp_fn_ns(); - hip_api_trace_entry_t* entry = hip_api_trace_buffer.GetEntry(); + hip_api_trace_entry_t* entry = hip_api_trace_buffer->GetEntry(); entry->valid = 
roctracer::TRACE_ENTRY_COMPL; entry->type = 0; entry->cid = 0; @@ -454,7 +464,14 @@ void mark_api_callback( entry->ptr = NULL; } +typedef std::map hip_kernel_map_t; +hip_kernel_map_t* hip_kernel_map = NULL; +std::mutex hip_kernel_mutex; + void hip_api_flush_cb(hip_api_trace_entry_t* entry) { + static uint64_t correlation_id = 0; + correlation_id += 1; + const uint32_t domain = entry->domain; const uint32_t cid = entry->cid; const hip_api_data_t* data = &(entry->data); @@ -469,12 +486,22 @@ void hip_api_flush_cb(hip_api_trace_entry_t* entry) { if (domain == ACTIVITY_DOMAIN_HIP_API) { #if HIP_PROF_HIP_API_STRING - const char* str = hipApiString((hip_api_id_t)cid, data); - rec_ss << " " << str; - if (is_hip_kernel_launch_api(cid)) { - if (entry->name) rec_ss << " kernel=" << cxx_demangle(entry->name); + if (hip_api_stats != NULL) { + hip_api_stats->add_event(cid, end_timestamp - begin_timestamp); + if (is_hip_kernel_launch_api(cid)) { + hip_kernel_mutex.lock(); + (*hip_kernel_map)[correlation_id] = entry->name; + hip_kernel_mutex.unlock(); + } + } else { + const char* str = hipApiString((hip_api_id_t)cid, data); + rec_ss << " " << str; + if (is_hip_kernel_launch_api(cid) && entry->name) { + const char* kernel_name = cxx_demangle(entry->name); + rec_ss << " kernel=" << kernel_name; + } + fprintf(hip_api_file_handle, "%s\n", rec_ss.str().c_str()); } - fprintf(hip_api_file_handle, "%s\n", rec_ss.str().c_str()); #else // !HIP_PROF_HIP_API_STRING switch (cid) { case HIP_API_ID_hipMemcpy: @@ -536,6 +563,47 @@ void hip_api_flush_cb(hip_api_trace_entry_t* entry) { fflush(hip_api_file_handle); } +/////////////////////////////////////////////////////////////////////////////////////////////////////// +// HSA API tracing + +struct hip_act_trace_entry_t { + uint32_t valid; + uint32_t type; + uint32_t kind; + timestamp_t dur; + uint64_t correlation_id; +}; + +void hip_act_flush_cb(hip_act_trace_entry_t* entry); +constexpr roctracer::TraceBuffer::flush_prm_t hip_act_flush_prm[1] = 
{{0, hip_act_flush_cb}}; +roctracer::TraceBuffer* hip_act_trace_buffer = NULL; +//roctracer::TraceBuffer hip_act_trace_buffer("HIP ACT", 0x200000, hip_act_flush_prm, 1); + +// HIP ACT trace buffer flush callback +void hip_act_flush_cb(hip_act_trace_entry_t* entry) { + const uint32_t domain = ACTIVITY_DOMAIN_HCC_OPS; + const uint32_t op = 0; + const char * name = roctracer_op_string(domain, op, entry->kind); + if (name == NULL) { + printf("hip_act_flush_cb name is NULL\n"); fflush(stdout); + abort(); + } + + if (strncmp("Kernel", name, 6) == 0) { + hip_kernel_mutex.lock(); + if (hip_kernel_stats == NULL) { + printf("hip_act_flush_cb hip_kernel_stats is NULL\n"); fflush(stdout); + abort(); + } + name = (*hip_kernel_map)[entry->correlation_id]; + hip_kernel_mutex.unlock(); + const char* kernel_name = cxx_demangle(name); + hip_kernel_stats->add_event(kernel_name, entry->dur); + } else { + hip_memcpy_stats->add_event(name, entry->dur); + } +} + // Activity tracing callback // hipMalloc id(3) correlation_id(1): begin_ns(1525888652762640464) end_ns(1525888652762877067) void pool_activity_callback(const char* begin, const char* end, void* arg) { @@ -546,11 +614,20 @@ void pool_activity_callback(const char* begin, const char* end, void* arg) { const char * name = roctracer_op_string(record->domain, record->op, record->kind); switch(record->domain) { case ACTIVITY_DOMAIN_HCC_OPS: - fprintf(hcc_activity_file_handle, "%lu:%lu %d:%lu %s:%lu:%u\n", - record->begin_ns, record->end_ns, - record->device_id, record->queue_id, - name, record->correlation_id, my_pid); - fflush(hcc_activity_file_handle); + if (hip_memcpy_stats != NULL) { + hip_act_trace_entry_t* entry = hip_act_trace_buffer->GetEntry(); + entry->valid = roctracer::TRACE_ENTRY_COMPL; + entry->type = 0; + entry->kind = record->kind; + entry->dur = record->end_ns - record->begin_ns; + entry->correlation_id = record->correlation_id; + } else { + fprintf(hcc_activity_file_handle, "%lu:%lu %d:%lu %s:%lu:%u\n", + 
record->begin_ns, record->end_ns, + record->device_id, record->queue_id, + name, record->correlation_id, my_pid); + fflush(hcc_activity_file_handle); + } break; case ACTIVITY_DOMAIN_HSA_OPS: if (record->op == HSA_OP_ID_RESERVED1) { @@ -639,8 +716,10 @@ int get_xml_array(const xml::Xml::level_t* node, const std::string& field, const } // Open output file -FILE* open_output_file(const char* prefix, const char* name) { +FILE* open_output_file(const char* prefix, const char* name, const char** path = NULL) { FILE* file_handle = NULL; + if (path != NULL) *path = NULL; + if (prefix != NULL) { std::ostringstream oss; oss << prefix << "/" << GetPid() << "_" << name; @@ -651,6 +730,8 @@ FILE* open_output_file(const char* prefix, const char* name) { perror(errmsg.str().c_str()); abort(); } + + if (path != NULL) *path = strdup(oss.str().c_str()); } else file_handle = stdout; return file_handle; } @@ -720,6 +801,7 @@ void tool_unload() { // Flush tracing pool close_tracing_pool(); roctracer::TraceBufferBase::FlushAll(); + hip_act_trace_buffer->Flush(true); close_file_handles(); ONLOAD_TRACE_END(); @@ -979,15 +1061,40 @@ extern "C" PUBLIC_API bool OnLoad(HsaApiTable* table, uint64_t runtime_version, roctracer_set_properties(ACTIVITY_DOMAIN_HIP_API, (void*)mark_api_callback); // Allocating tracing pool open_tracing_pool(); + + // Check for optimized stats + const bool is_stats_opt = (getenv("ROCP_STATS_OPT") != NULL); + + // HIP kernel ma pinstantiation + if (is_stats_opt) hip_kernel_map = new hip_kernel_map_t; + // Enable tracing if (trace_hip_api) { hip_api_file_handle = open_output_file(output_prefix, "hip_api_trace.txt"); ROCTRACER_CALL(roctracer_enable_domain_callback(ACTIVITY_DOMAIN_HIP_API, hip_api_callback, NULL)); - ROCTRACER_CALL(roctracer_enable_domain_activity(ACTIVITY_DOMAIN_HIP_API)); + + if (is_stats_opt) { + const char* path = NULL; + FILE* f = open_output_file(output_prefix, "hip_api_stats.csv", &path); + hip_api_stats = new EvtStats(f, path); + for (uint32_t id 
= 0; id < HIP_API_ID_NUMBER; id += 1) { + const char* label = roctracer_op_string(ACTIVITY_DOMAIN_HIP_API, id, 0); + hip_api_stats->set_label(id, label); + } + } } if (trace_hip_activity) { hcc_activity_file_handle = open_output_file(output_prefix, "hcc_ops_trace.txt"); ROCTRACER_CALL(roctracer_enable_domain_activity(ACTIVITY_DOMAIN_HCC_OPS)); + + if (is_stats_opt) { + FILE* f = NULL; + const char* path = NULL; + f = open_output_file(output_prefix, "hip_kernel_stats.csv", &path); + hip_kernel_stats = new EvtStatsA(f, path); + f = open_output_file(output_prefix, "hip_memcpy_stats.csv", &path); + hip_memcpy_stats = new EvtStatsA(f, path); + } } } @@ -1010,6 +1117,10 @@ extern "C" PUBLIC_API void OnUnload() { extern "C" CONSTRUCTOR_API void constructor() { ONLOAD_TRACE_BEG(); + roctx_trace_buffer = new roctracer::TraceBuffer("rocTX API", 0x200000, roctx_flush_prm, 1); + hip_api_trace_buffer = new roctracer::TraceBuffer("HIP API", 0x200000, hip_api_flush_prm, 1); + hip_act_trace_buffer = new roctracer::TraceBuffer("HIP ACT", 0x200000, hip_act_flush_prm, 1, 1); + hsa_api_trace_buffer = new roctracer::TraceBuffer("HSA API", 0x200000, hsa_flush_prm, 1); roctracer_load(); tool_load(); ONLOAD_TRACE_END(); @@ -1018,7 +1129,11 @@ extern "C" DESTRUCTOR_API void destructor() { ONLOAD_TRACE_BEG(); roctracer_flush_buf(); tool_unload(); + + if (hip_api_stats) hip_api_stats->dump(); + if (hip_kernel_stats) hip_kernel_stats->dump(); + if (hip_memcpy_stats) hip_memcpy_stats->dump(); + roctracer_unload(); ONLOAD_TRACE_END(); } - From db1ccb0619d20489ed7455539ae3c04e5608950e Mon Sep 17 00:00:00 2001 From: Evgeny Date: Sat, 15 Aug 2020 02:23:43 -0500 Subject: [PATCH 02/47] flush-rate option fixed; Change-Id: I50473f8008672772dd4aaf37cbc64472cb50b4a3 --- src/core/roctracer.cpp | 3 +- src/core/trace_buffer.h | 142 ++++++++++++++++++++++---------------- src/proxy/tracker.h | 13 ++-- test/run.sh | 5 +- test/tool/tracer_tool.cpp | 55 +++++++-------- 5 files changed, 119 insertions(+), 99 
deletions(-) diff --git a/src/core/roctracer.cpp b/src/core/roctracer.cpp index dd47bb0f..52f1e28b 100644 --- a/src/core/roctracer.cpp +++ b/src/core/roctracer.cpp @@ -223,7 +223,6 @@ constexpr TraceBuffer::flush_prm_t trace_buffer_prm[] = { {KERNEL_ENTRY_TYPE, hsa_kernel_handler} }; TraceBuffer* trace_buffer = NULL; -//TraceBuffer trace_buffer("HSA GPU", 0x200000, trace_buffer_prm, 2); namespace hsa_support { // callbacks table @@ -1127,7 +1126,7 @@ PUBLIC_API roctracer_status_t roctracer_flush_activity_expl(roctracer_pool_t* po API_METHOD_PREFIX if (pool == NULL) pool = roctracer_default_pool(); roctracer::MemoryPool* memory_pool = reinterpret_cast(pool); - memory_pool->Flush(); + if (memory_pool != NULL) memory_pool->Flush(); roctracer::TraceBufferBase::FlushAll(); API_METHOD_SUFFIX } diff --git a/src/core/trace_buffer.h b/src/core/trace_buffer.h index 8d994046..cb6767f2 100644 --- a/src/core/trace_buffer.h +++ b/src/core/trace_buffer.h @@ -36,15 +36,17 @@ enum { TRACE_ENTRY_COMPL = 2 }; -enum { - API_ENTRY_TYPE, - COPY_ENTRY_TYPE, - KERNEL_ENTRY_TYPE +enum entry_type_t { + DFLT_ENTRY_TYPE = 0, + API_ENTRY_TYPE = 1, + COPY_ENTRY_TYPE = 2, + KERNEL_ENTRY_TYPE = 3, + NUM_ENTRY_TYPE = 4 }; struct trace_entry_t { std::atomic valid; - uint32_t type; + entry_type_t type; uint64_t dispatch; uint64_t begin; // kernel begin timestamp, ns uint64_t end; // kernel end timestamp, ns @@ -67,14 +69,26 @@ struct trace_entry_t { template struct push_element_fun { T* const elem_; - void fun(T* node) { if (node->next_elem_ == NULL) node->next_elem_ = elem_; } - push_element_fun(T* elem) : elem_(elem) {} + T** prev_; + bool fun(T* node) { + if (node->priority_ > elem_->priority_) { + *prev_ = elem_; + elem_->next_elem_ = node; + } else if (node->next_elem_ == NULL) { + node->next_elem_ = elem_; + } else { + prev_ = &(node->next_elem_); + return false; + } + return true; + } + push_element_fun(T* elem, T** prev) : elem_(elem), prev_(prev) {} }; template struct call_element_fun { 
void (T::*fptr_)(); - void fun(T* node) { (node->*fptr_)(); } + bool fun(T* node) const { (node->*fptr_)(); return false; } call_element_fun(void (T::*f)()) : fptr_(f) {} }; @@ -89,10 +103,10 @@ struct TraceBufferBase { static void Push(TraceBufferBase* elem) { if (head_elem_ == NULL) head_elem_ = elem; - else foreach(push_element_fun(elem)); + else foreach(push_element_fun(elem, &head_elem_)); } - TraceBufferBase() : next_elem_(NULL) {} + TraceBufferBase(const uint32_t& prior) : priority_(prior), next_elem_(NULL) {} template static void foreach(const F& f_in) { @@ -101,11 +115,12 @@ struct TraceBufferBase { TraceBufferBase* p = head_elem_; while (p != NULL) { TraceBufferBase* next = p->next_elem_; - f.fun(p); + if (f.fun(p) == true) break; p = next; } } + const uint32_t priority_; TraceBufferBase* next_elem_; static TraceBufferBase* head_elem_; static mutex_t mutex_; @@ -118,35 +133,41 @@ class TraceBuffer : protected TraceBufferBase { typedef TraceBuffer Obj; typedef uint64_t pointer_t; typedef std::recursive_mutex mutex_t; + typedef typename std::list buf_list_t; + typedef typename buf_list_t::iterator buf_list_it_t; struct flush_prm_t { - uint32_t type; + entry_type_t type; callback_t fun; }; TraceBuffer(const char* name, uint32_t size, const flush_prm_t* flush_prm_arr, uint32_t flush_prm_count, uint32_t prior = 0) : - is_flushed_(false), + TraceBufferBase(prior), + size_(size), work_thread_started_(false) { name_ = strdup(name); - size_ = size; data_ = allocate_fun(); next_ = allocate_fun(); read_pointer_ = 0; + write_pointer_ = 0; end_pointer_ = size; buf_list_.push_back(data_); - flush_prm_arr_ = flush_prm_arr; - flush_prm_count_ = flush_prm_count; - - priority_ = prior; + memset(f_array_, 0, sizeof(f_array_)); + for (const flush_prm_t* prm = flush_prm_arr; prm < flush_prm_arr + flush_prm_count; prm++) { + const entry_type_t type = prm->type; + if (type >= NUM_ENTRY_TYPE) FATAL("out of f_array bounds (" << type << ")"); + if (f_array_[type] != NULL) 
FATAL("handler function ptr redefinition (" << type << ")"); + f_array_[type] = prm->fun; + } TraceBufferBase::Push(this); } ~TraceBuffer() { StopWorkerThread(); - FlushAll(); + Flush(); } void StartWorkerThread() { @@ -171,52 +192,52 @@ class TraceBuffer : protected TraceBufferBase { } Entry* GetEntry() { - const pointer_t pointer = read_pointer_.fetch_add(1); + const pointer_t pointer = write_pointer_.fetch_add(1); if (pointer >= end_pointer_) wrap_buffer(pointer); if (pointer >= end_pointer_) FATAL("pointer >= end_pointer_ after buffer wrap"); - return data_ + (pointer + size_ - end_pointer_); + Entry* entry = data_ + (size_ + pointer - end_pointer_); + entry->valid = TRACE_ENTRY_INV; + entry->type = DFLT_ENTRY_TYPE; + return entry; } void Flush() { flush_buf(); } - void Flush(const bool& b) { - DisableFlushing(!b); - flush_buf(); - } - void DisableFlushing(const bool& b) { is_flushed_.exchange(b, std::memory_order_acquire); } private: void flush_buf() { std::lock_guard lck(mutex_); - const bool is_flushed = is_flushed_.exchange(true, std::memory_order_acquire); - if (priority_ != 0) { - priority_ -= 1; - return; - } + pointer_t pointer = read_pointer_; + pointer_t curr_pointer = write_pointer_.load(std::memory_order_relaxed); + buf_list_it_t it = buf_list_.begin(); + buf_list_it_t end_it = buf_list_.end(); + while(it != end_it) { + Entry* buf = *it; + Entry* ptr = buf + (pointer % size_); + Entry* end_ptr = buf + size_; + while ((ptr < end_ptr) && (pointer < curr_pointer)) { + if (ptr->valid != TRACE_ENTRY_COMPL) break; + + entry_type_t type = ptr->type; + if (type >= NUM_ENTRY_TYPE) FATAL("out of f_array bounds (" << type << ")"); + callback_t f_ptr = f_array_[type]; + if (f_ptr == NULL) FATAL("f_ptr == NULL"); + (*f_ptr)(ptr); + + ptr++; + pointer++; + } - if (is_flushed == false) { - for (const flush_prm_t* prm = flush_prm_arr_; prm < flush_prm_arr_ + flush_prm_count_; prm++) { - // Flushed entries type - uint32_t type = prm->type; - // Flushing function - 
callback_t fun = prm->fun; - if (fun == NULL) FATAL("flush function is not set"); - - pointer_t pointer = 0; - for (Entry* ptr : buf_list_) { - Entry* end = ptr + size_; - while ((ptr < end) && (pointer < read_pointer_)) { - if (ptr->type == type) { - if (ptr->valid == TRACE_ENTRY_COMPL) { - fun(ptr); - } - } - ptr++; - pointer++; - } - } + buf_list_it_t prev = it; + it++; + if (ptr == end_ptr) { + free_fun(*prev); + buf_list_.erase(prev); } + if (pointer == curr_pointer) break; } + + read_pointer_ = pointer; } inline Entry* allocate_fun() { @@ -226,6 +247,10 @@ class TraceBuffer : protected TraceBufferBase { return ptr; } + inline void free_fun(void* ptr) { + free(ptr); + } + static void* allocate_worker(void* arg) { Obj* obj = (Obj*)arg; @@ -258,17 +283,14 @@ class TraceBuffer : protected TraceBufferBase { } const char* name_; - uint32_t size_; + const uint32_t size_; Entry* data_; Entry* next_; - volatile std::atomic read_pointer_; + pointer_t read_pointer_; + volatile std::atomic write_pointer_; volatile std::atomic end_pointer_; - std::list buf_list_; - - const flush_prm_t* flush_prm_arr_; - uint32_t flush_prm_count_; - uint32_t priority_; - volatile std::atomic is_flushed_; + buf_list_t buf_list_; + callback_t f_array_[NUM_ENTRY_TYPE]; pthread_t work_thread_; pthread_mutex_t work_mutex_; diff --git a/src/proxy/tracker.h b/src/proxy/tracker.h index edb223b0..dc0322bd 100644 --- a/src/proxy/tracker.h +++ b/src/proxy/tracker.h @@ -40,9 +40,10 @@ class Tracker { public: typedef util::HsaRsrcFactory::timestamp_t timestamp_t; typedef roctracer::trace_entry_t entry_t; + typedef roctracer::entry_type_t entry_type_t; // Add tracker entry - inline static void Enable(uint32_t type, const hsa_agent_t& agent, const hsa_signal_t& signal, entry_t* entry) { + inline static void Enable(entry_type_t type, const hsa_agent_t& agent, const hsa_signal_t& signal, entry_t* entry) { hsa_status_t status = HSA_STATUS_ERROR; util::HsaRsrcFactory* hsa_rsrc = 
&(util::HsaRsrcFactory::Instance()); @@ -88,13 +89,16 @@ class Tracker { } entry->complete = hsa_rsrc->TimestampNs(); + hsa_signal_t orig = entry->orig; + hsa_signal_t signal = entry->signal; + + // Releasing completed entry entry->valid.store(roctracer::TRACE_ENTRY_COMPL, std::memory_order_release); // Original intercepted signal completion - hsa_signal_t orig = entry->orig; if (orig.handle) { amd_signal_t* orig_signal_ptr = reinterpret_cast(orig.handle); - amd_signal_t* prof_signal_ptr = reinterpret_cast(entry->signal.handle); + amd_signal_t* prof_signal_ptr = reinterpret_cast(signal.handle); orig_signal_ptr->start_ts = prof_signal_ptr->start_ts; orig_signal_ptr->end_ts = prof_signal_ptr->end_ts; @@ -102,7 +106,7 @@ class Tracker { if (signal_value != new_value) EXC_ABORT(HSA_STATUS_ERROR, "Tracker::Complete bad signal value"); hsa_signal_store_screlease(orig, signal_value); } - hsa_signal_destroy(entry->signal); + hsa_signal_destroy(signal); } // Handler for packet completion @@ -113,7 +117,6 @@ class Tracker { // Complete entry Tracker::Complete(signal_value, entry); - return false; } }; diff --git a/test/run.sh b/test/run.sh index 962033f6..c5c8aa45 100755 --- a/test/run.sh +++ b/test/run.sh @@ -79,7 +79,9 @@ eval_test() { test_runnum=$((test_runnum + 1)) eval "$cmdline" >$test_trace 2>&1 is_failed=$? - cat $test_trace + if [ $is_failed != 0 ] ; then + cat $test_trace + fi if [ $IS_CI = 1 ] ; then is_failed=0; else @@ -87,6 +89,7 @@ eval_test() { python ./test/check_trace.py -in $test_name -ck $check_trace_flag is_failed=$? 
if [ $is_failed != 0 ] ; then + echo "Trace checker error:" python ./test/check_trace.py -v -in $test_name -ck $check_trace_flag fi fi diff --git a/test/tool/tracer_tool.cpp b/test/tool/tracer_tool.cpp index db87d04d..a075bc29 100644 --- a/test/tool/tracer_tool.cpp +++ b/test/tool/tracer_tool.cpp @@ -176,6 +176,8 @@ void* control_thr_fun(void*) { usleep(dist_us); } } + + return NULL; } // Flushing control thread @@ -204,8 +206,8 @@ void* flush_thr_fun(void*) { // rocTX annotation tracing struct roctx_trace_entry_t { - uint32_t valid; - uint32_t type; + std::atomic valid; + roctracer::entry_type_t type; uint32_t cid; timestamp_t time; uint32_t pid; @@ -215,9 +217,8 @@ struct roctx_trace_entry_t { }; void roctx_flush_cb(roctx_trace_entry_t* entry); -constexpr roctracer::TraceBuffer::flush_prm_t roctx_flush_prm[1] = {{0, roctx_flush_cb}}; +constexpr roctracer::TraceBuffer::flush_prm_t roctx_flush_prm = {roctracer::DFLT_ENTRY_TYPE, roctx_flush_cb}; roctracer::TraceBuffer* roctx_trace_buffer = NULL; -//roctracer::TraceBuffer roctx_trace_buffer("rocTX API", 0x200000, roctx_flush_prm, 1); // rocTX callback function static inline void roctx_callback_fun( @@ -233,14 +234,13 @@ static inline void roctx_callback_fun( const timestamp_t time = timer->timestamp_fn_ns(); #endif roctx_trace_entry_t* entry = roctx_trace_buffer->GetEntry(); - entry->valid = roctracer::TRACE_ENTRY_COMPL; - entry->type = 0; entry->cid = cid; entry->time = time; entry->pid = GetPid(); entry->tid = tid; entry->rid = rid; entry->message = (message != NULL) ? 
strdup(message) : NULL; + entry->valid.store(roctracer::TRACE_ENTRY_COMPL, std::memory_order_release); } void roctx_api_callback( @@ -283,8 +283,8 @@ void roctx_flush_cb(roctx_trace_entry_t* entry) { // HSA API tracing struct hsa_api_trace_entry_t { - uint32_t valid; - uint32_t type; + std::atomic valid; + roctracer::entry_type_t type; uint32_t cid; timestamp_t begin; timestamp_t end; @@ -294,9 +294,8 @@ struct hsa_api_trace_entry_t { }; void hsa_api_flush_cb(hsa_api_trace_entry_t* entry); -constexpr roctracer::TraceBuffer::flush_prm_t hsa_flush_prm[1] = {{0, hsa_api_flush_cb}}; +constexpr roctracer::TraceBuffer::flush_prm_t hsa_flush_prm = {roctracer::DFLT_ENTRY_TYPE, hsa_api_flush_cb}; roctracer::TraceBuffer* hsa_api_trace_buffer = NULL; -//roctracer::TraceBuffer hsa_api_trace_buffer("HSA API", 0x200000, hsa_flush_prm, 1); // HSA API callback function void hsa_api_callback( @@ -312,14 +311,13 @@ void hsa_api_callback( } else { const timestamp_t end_timestamp = (cid == HSA_API_ID_hsa_shut_down) ? 
hsa_begin_timestamp : timer->timestamp_fn_ns(); hsa_api_trace_entry_t* entry = hsa_api_trace_buffer->GetEntry(); - entry->valid = roctracer::TRACE_ENTRY_COMPL; - entry->type = 0; entry->cid = cid; entry->begin = hsa_begin_timestamp; entry->end = end_timestamp; entry->pid = GetPid(); entry->tid = GetTid(); entry->data = *data; + entry->valid.store(roctracer::TRACE_ENTRY_COMPL, std::memory_order_release); } } @@ -343,8 +341,8 @@ void hsa_activity_callback( // HIP API tracing struct hip_api_trace_entry_t { - uint32_t valid; - uint32_t type; + std::atomic valid; + roctracer::entry_type_t type; uint32_t domain; uint32_t cid; timestamp_t begin; @@ -357,9 +355,8 @@ struct hip_api_trace_entry_t { }; void hip_api_flush_cb(hip_api_trace_entry_t* entry); -constexpr roctracer::TraceBuffer::flush_prm_t hip_api_flush_prm[1] = {{0, hip_api_flush_cb}}; +constexpr roctracer::TraceBuffer::flush_prm_t hip_api_flush_prm = {roctracer::DFLT_ENTRY_TYPE, hip_api_flush_cb}; roctracer::TraceBuffer* hip_api_trace_buffer = NULL; -//roctracer::TraceBuffer hip_api_trace_buffer("HIP API", 0x200000, hip_api_flush_prm, 1); static inline bool is_hip_kernel_launch_api(const uint32_t& cid) { bool ret = @@ -390,8 +387,6 @@ void hip_api_callback( const timestamp_t end_timestamp = timer->timestamp_fn_ns(); hip_api_trace_entry_t* entry = hip_api_trace_buffer->GetEntry(); - entry->valid = roctracer::TRACE_ENTRY_COMPL; - entry->type = 0; entry->cid = cid; entry->domain = domain; entry->begin = hip_begin_timestamp; @@ -437,6 +432,8 @@ void hip_api_callback( } } } + + entry->valid.store(roctracer::TRACE_ENTRY_COMPL, std::memory_order_release); } } @@ -451,8 +448,6 @@ void mark_api_callback( const timestamp_t timestamp = timer->timestamp_fn_ns(); hip_api_trace_entry_t* entry = hip_api_trace_buffer->GetEntry(); - entry->valid = roctracer::TRACE_ENTRY_COMPL; - entry->type = 0; entry->cid = 0; entry->domain = domain; entry->begin = timestamp; @@ -462,6 +457,7 @@ void mark_api_callback( entry->data = {}; 
entry->name = strdup(name); entry->ptr = NULL; + entry->valid.store(roctracer::TRACE_ENTRY_COMPL, std::memory_order_release); } typedef std::map hip_kernel_map_t; @@ -567,17 +563,16 @@ void hip_api_flush_cb(hip_api_trace_entry_t* entry) { // HSA API tracing struct hip_act_trace_entry_t { - uint32_t valid; - uint32_t type; + std::atomic valid; + roctracer::entry_type_t type; uint32_t kind; timestamp_t dur; uint64_t correlation_id; }; void hip_act_flush_cb(hip_act_trace_entry_t* entry); -constexpr roctracer::TraceBuffer::flush_prm_t hip_act_flush_prm[1] = {{0, hip_act_flush_cb}}; +constexpr roctracer::TraceBuffer::flush_prm_t hip_act_flush_prm = {roctracer::DFLT_ENTRY_TYPE, hip_act_flush_cb}; roctracer::TraceBuffer* hip_act_trace_buffer = NULL; -//roctracer::TraceBuffer hip_act_trace_buffer("HIP ACT", 0x200000, hip_act_flush_prm, 1); // HIP ACT trace buffer flush callback void hip_act_flush_cb(hip_act_trace_entry_t* entry) { @@ -616,11 +611,10 @@ void pool_activity_callback(const char* begin, const char* end, void* arg) { case ACTIVITY_DOMAIN_HCC_OPS: if (hip_memcpy_stats != NULL) { hip_act_trace_entry_t* entry = hip_act_trace_buffer->GetEntry(); - entry->valid = roctracer::TRACE_ENTRY_COMPL; - entry->type = 0; entry->kind = record->kind; entry->dur = record->end_ns - record->begin_ns; entry->correlation_id = record->correlation_id; + entry->valid.store(roctracer::TRACE_ENTRY_COMPL, std::memory_order_release); } else { fprintf(hcc_activity_file_handle, "%lu:%lu %d:%lu %s:%lu:%u\n", record->begin_ns, record->end_ns, @@ -801,7 +795,6 @@ void tool_unload() { // Flush tracing pool close_tracing_pool(); roctracer::TraceBufferBase::FlushAll(); - hip_act_trace_buffer->Flush(true); close_file_handles(); ONLOAD_TRACE_END(); @@ -1117,10 +1110,10 @@ extern "C" PUBLIC_API void OnUnload() { extern "C" CONSTRUCTOR_API void constructor() { ONLOAD_TRACE_BEG(); - roctx_trace_buffer = new roctracer::TraceBuffer("rocTX API", 0x200000, roctx_flush_prm, 1); - hip_api_trace_buffer = new 
roctracer::TraceBuffer("HIP API", 0x200000, hip_api_flush_prm, 1); - hip_act_trace_buffer = new roctracer::TraceBuffer("HIP ACT", 0x200000, hip_act_flush_prm, 1, 1); - hsa_api_trace_buffer = new roctracer::TraceBuffer("HSA API", 0x200000, hsa_flush_prm, 1); + roctx_trace_buffer = new roctracer::TraceBuffer("rocTX API", 0x200000, &roctx_flush_prm, 1); + hip_api_trace_buffer = new roctracer::TraceBuffer("HIP API", 0x200000, &hip_api_flush_prm, 1); + hip_act_trace_buffer = new roctracer::TraceBuffer("HIP ACT", 0x200000, &hip_act_flush_prm, 1, 1); + hsa_api_trace_buffer = new roctracer::TraceBuffer("HSA API", 0x200000, &hsa_flush_prm, 1); roctracer_load(); tool_load(); ONLOAD_TRACE_END(); From dae98a346cebf58a935d0bfa79c3c51f6db7f58b Mon Sep 17 00:00:00 2001 From: Evgeny Date: Fri, 28 Aug 2020 06:31:24 -0500 Subject: [PATCH 03/47] Format ostream ops; Change-Id: I33d01cde41d9a762a8a955a1faccfdef02d8c0ac --- inc/roctracer_hip.h | 14 ++++++++ script/gen_ostream_ops.py | 72 +++++++++++++++++++++++++++++++-------- test/tool/tracer_tool.cpp | 1 + 3 files changed, 72 insertions(+), 15 deletions(-) diff --git a/inc/roctracer_hip.h b/inc/roctracer_hip.h index 86ffc1ae..091f3279 100644 --- a/inc/roctracer_hip.h +++ b/inc/roctracer_hip.h @@ -23,6 +23,20 @@ THE SOFTWARE. 
#ifndef INC_ROCTRACER_HIP_H_ #define INC_ROCTRACER_HIP_H_ +#ifdef __cplusplus +#include + +inline static std::ostream& operator<<(std::ostream& out, const unsigned char& v) { + out << (unsigned int)v; + return out; +} + +inline static std::ostream& operator<<(std::ostream& out, const char& v) { + out << (unsigned char)v; + return out; +} +#endif // __cplusplus + #include #include #include diff --git a/script/gen_ostream_ops.py b/script/gen_ostream_ops.py index 142ec98e..900c3677 100755 --- a/script/gen_ostream_ops.py +++ b/script/gen_ostream_ops.py @@ -3,6 +3,7 @@ import os, sys, re import CppHeaderParser import argparse +import string LICENSE = \ '/*\n' + \ @@ -33,7 +34,7 @@ ' inline static std::ostream& put(std::ostream& out, const T& v) { return out; }\n' + \ '};\n\n' -header_hip = \ +header_basic = \ 'template \n' + \ ' inline static std::ostream& operator<<(std::ostream& out, const T& v) {\n' + \ ' using std::operator<<;\n' + \ @@ -43,6 +44,7 @@ structs_analyzed = {} global_ops_hip = '' +global_str = '' # process_struct traverses recursively all structs to extract all fields def process_struct(file_handle, cppHeader_struct, cppHeader, parent_hier_name, apiname): @@ -51,6 +53,7 @@ def process_struct(file_handle, cppHeader_struct, cppHeader, parent_hier_name, a # cppHeader: cppHeader object created by CppHeaderParser.CppHeader(...) # parent_hier_name: parent hierarchical name used for nested structs/enums # apiname: for example hip, kfd. 
+ global global_str if cppHeader_struct == 'max_align_t': #function pointers not working in cppheaderparser return @@ -59,7 +62,7 @@ def process_struct(file_handle, cppHeader_struct, cppHeader, parent_hier_name, a if cppHeader_struct in structs_analyzed: return - structs_analyzed[cppHeader_struct] = 1; + structs_analyzed[cppHeader_struct] = 1 for l in reversed(range(len(cppHeader.classes[cppHeader_struct]["properties"]["public"]))): key = 'name' name = "" @@ -85,16 +88,21 @@ def process_struct(file_handle, cppHeader_struct, cppHeader, parent_hier_name, a if key4 in cppHeader.classes[cppHeader_struct]["properties"]["public"][l]: prop = cppHeader.classes[cppHeader_struct]["properties"]["public"][l][key4] + str = '' if "union" not in mtype: if apiname.lower() == 'hip' or apiname.lower() == 'hsa': - str = " roctracer::" + apiname.lower() + "_support::operator<<(out, v."+name+");\n" + str += " roctracer::" + apiname.lower() + "_support::operator<<(out, \"" + name + " = \");\n" + str += " roctracer::" + apiname.lower() + "_support::operator<<(out, v."+name+");\n" + str += " roctracer::" + apiname.lower() + "_support::operator<<(out, \", \");\n" else: + str += " roctracer::" + apiname.lower() + "_support::output_streamer::put(out, \"" + name + " = \");\n" if array_size == "": - str = " roctracer::" + apiname.lower() + "_support::output_streamer<"+mtype+">::put(out,v."+name+");\n" + str += " roctracer::" + apiname.lower() + "_support::output_streamer<" + mtype + ">::put(out, v." + name + ");\n" else: - str = " roctracer::" + apiname.lower() + "_support::output_streamer<"+mtype+"["+array_size+"]>::put(out,v."+name+");\n" + str += " roctracer::" + apiname.lower() + "_support::output_streamer<" + mtype + "[" + array_size + "]>::put(out, v." 
+ name + ");\n" + str += " roctracer::" + apiname.lower() + "_support::output_streamer::put(out, \", \");\n" if "void" not in mtype: - file_handle.write(str) + global_str += str else: if prop != '': next_cppHeader_struct = prop + "::" @@ -105,11 +113,12 @@ def process_struct(file_handle, cppHeader_struct, cppHeader, parent_hier_name, a process_struct(file_handle, next_cppHeader_struct, cppHeader, name, apiname) # Parses API header file and generates ostream ops files ostream_ops.h -def gen_cppheader(infilepath, outfilepath): +def gen_cppheader(infilepath, outfilepath, structs_depth): # infilepath: API Header file to be parsed # outfilepath: Output file where ostream operators are written global_ops_hip = '' global_ops_hsa = '' + global global_str try: cppHeader = CppHeaderParser.CppHeader(infilepath) except CppHeaderParser.CppParseError as e: @@ -140,10 +149,12 @@ def gen_cppheader(infilepath, outfilepath): f.write('\n') f.write('namespace roctracer {\n') f.write('namespace ' + apiname.lower() + '_support {\n') + if structs_depth != -1: + f.write('static int ' + apiname.upper() + '_depth_max = ' + str(structs_depth) + ';\n') f.write('// begin ostream ops for '+ apiname + ' \n') if apiname.lower() == "hip" or apiname.lower() == "hsa": f.write("// basic ostream ops\n") - f.write(header_hip) + f.write(header_basic) f.write("// End of basic ostream ops\n\n") else: f.write(header) @@ -154,28 +165,56 @@ def gen_cppheader(infilepath, outfilepath): if apiname.lower() == 'hsa': if c == 'max_align_t' or c == '__fsid_t': #already defined for hip continue - #if apiname.lower() == 'hip' and c == 'hipIpcEventHandle_t': #feature is TBD - # continue if len(cppHeader.classes[c]["properties"]["public"])!=0: if apiname.lower() == 'hip' or apiname.lower() == 'hsa': f.write("std::ostream& operator<<(std::ostream& out, const " + c + "& v)\n") f.write("{\n") + f.write(" roctracer::" + apiname.lower() + "_support::operator<<(out, '{');\n") + if structs_depth != -1: + f.write(" " + 
apiname.upper() + "_depth_max++;\n") + f.write(" if (" + apiname.upper() + "_depth_max <= " + str(structs_depth) + ") {\n" ) process_struct(f, c, cppHeader, "", apiname) - f.write(" return out;\n") + global_str = "\n".join(global_str.split("\n")[0:-2]) + if structs_depth != -1: #reindent + global_str = string.split(global_str, '\n') + global_str = [' ' + string.lstrip(line) for line in global_str] + global_str = string.join(global_str, '\n') + f.write(global_str+"\n") + if structs_depth != -1: + f.write(" };\n") + f.write(" " + apiname.upper() + "_depth_max--;\n") + f.write(" roctracer::" + apiname.lower() + "_support::operator<<(out, '}');\n") + f.write(" return out;\n") f.write("}\n") + global_str = '' else: f.write("\ntemplate<>\n") f.write("struct output_streamer<" + c + "&> {\n") f.write(" inline static std::ostream& put(std::ostream& out, "+c+"& v)\n") f.write("{\n") + f.write(" roctracer::" + apiname.lower() + "_support::output_streamer::put(out, '{');\n") + if structs_depth != -1: + f.write(apiname.upper() + "_depth_max++;\n") + f.write(" if (" + apiname.upper() + "_depth_max <= " + str(structs_depth) + ") {\n" ) process_struct(f, c, cppHeader, "", apiname) - f.write(" return out;\n") + global_str = "\n".join(global_str.split("\n")[0:-2]) + if structs_depth != -1: #reindent + global_str = string.split(global_str, '\n') + global_str = [' ' + string.lstrip(line) for line in global_str] + global_str = string.join(global_str, '\n') + f.write(global_str+"\n") + if structs_depth != -1: + f.write(" };\n") + f.write(" " + apiname.upper() + "_depth_max--;\n") + f.write(" roctracer::" + apiname.lower() + "_support::output_streamer::put(out, '}');\n") + f.write(" return out;\n") f.write("}\n") f.write("};\n") + global_str = '' if apiname.lower() == 'hip': - global_ops_hip += "inline static std::ostream& operator<<(std::ostream& out, const " + c + "& v)\n" + "{\n" + " roctracer::hip_support::operator<<(out, v);\n" + " return out;\n" + "}\n\n" + global_ops_hip += 
"inline static std::ostream& operator<<(std::ostream& out, const " + c + "& v)\n" + "{\n" + " roctracer::hip_support::operator<<(out, v);\n" + " return out;\n" + "}\n\n" if apiname.lower() == 'hsa': - global_ops_hsa += "inline static std::ostream& operator<<(std::ostream& out, const " + c + "& v)\n" + "{\n" + " roctracer::hsa_support::operator<<(out, v);\n" + " return out;\n" + "}\n\n" + global_ops_hsa += "inline static std::ostream& operator<<(std::ostream& out, const " + c + "& v)\n" + "{\n" + " roctracer::hsa_support::operator<<(out, v);\n" + " return out;\n" + "}\n\n" footer = \ '// end ostream ops for '+ apiname + ' \n' @@ -196,8 +235,11 @@ def gen_cppheader(infilepath, outfilepath): requiredNamed = parser.add_argument_group('Required arguments') requiredNamed.add_argument('-in', metavar='file', help='Header file to be parsed', required=True) requiredNamed.add_argument('-out', metavar='file', help='Output file with ostream operators', required=True) +requiredNamed.add_argument('-depth', metavar='N', type=int, help='Depth for nested structs', required=False) +structs_depth = 0 args = vars(parser.parse_args()) if __name__ == '__main__': - gen_cppheader(args['in'], args['out']) + if args['depth'] != None: structs_depth = args['depth'] + gen_cppheader(args['in'], args['out'], structs_depth) diff --git a/test/tool/tracer_tool.cpp b/test/tool/tracer_tool.cpp index a075bc29..ad866012 100644 --- a/test/tool/tracer_tool.cpp +++ b/test/tool/tracer_tool.cpp @@ -1110,6 +1110,7 @@ extern "C" PUBLIC_API void OnUnload() { extern "C" CONSTRUCTOR_API void constructor() { ONLOAD_TRACE_BEG(); + roctracer::hip_support::HIP_depth_max = 0; roctx_trace_buffer = new roctracer::TraceBuffer("rocTX API", 0x200000, &roctx_flush_prm, 1); hip_api_trace_buffer = new roctracer::TraceBuffer("HIP API", 0x200000, &hip_api_flush_prm, 1); hip_act_trace_buffer = new roctracer::TraceBuffer("HIP ACT", 0x200000, &hip_act_flush_prm, 1, 1); From b730da09041d52419bfa7a27fbed9c4b6afaa8d0 Mon Sep 17 
00:00:00 2001 From: Rachida Kebichi Date: Fri, 11 Sep 2020 14:25:21 -0400 Subject: [PATCH 04/47] Fix for trace checker Change-Id: Ib8a0df7b7bb0da2e68b5b4d99ce8025de169f317 (cherry picked from commit 29da9a744d7da29fdb691d28ada3212647bb8379) --- script/check_trace.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/script/check_trace.py b/script/check_trace.py index 29baff8a..a4506a12 100644 --- a/script/check_trace.py +++ b/script/check_trace.py @@ -120,14 +120,14 @@ def diff_strings(cnt_r, cnt, metric): for evt2 in events_order[tid]: if diff_cnt == diff_cnt_r: if evt != evt2: - print (">I< Difference starts at index: " + str(diff_cnt_r) + ", tid_r " + str(tid_r) + ", tid " + str(tid) + ", with evts " + evt + " and " + evt2 + "\n") + print (">I< Difference starts at tid rank: " + str(cnt_tid) + " event index: " + str(diff_cnt_r) + ", tid_r " + str(tid_r) + ", tid " + str(tid) + ", with evts " + evt + " and " + evt2 + "\n") found_diff_evt = 1 break diff_cnt += 1 diff_cnt_r += 1 if found_diff_evt: break if len(events_order_r[tid_r]) != len(events_order[tid]) and found_diff_evt == 0: - print (">I< Difference starts at index: " + str(min(len(events_order_r[tid_r]), len(events_order[tid]))) + ", with missing evts\n") + print (">I< Difference starts at tid rank: " + str(cnt_tid) + " event index: " + str(min(len(events_order_r[tid_r]), len(events_order[tid]))) + ", with missing evts\n") break cnt_tid += 1 cnt_tid_r += 1 @@ -292,6 +292,10 @@ def gen_events_info(tracefile, trace_level, no_events_cnt, events2ignore, events if metric == 'or': for tid in sorted (events_order.keys()) : res = res + str(events_order[tid]) + if metric == 'cnt': + newres = res.split('\n') + newres = sorted(newres) + res = str(newres) return res parser = argparse.ArgumentParser(description='check_trace.py: check a trace aainst golden ref. 
Returns 0 for success, 1 for failure') From 4c2b6cbba493f34e33623c13b0418ec174c8e5d6 Mon Sep 17 00:00:00 2001 From: Evgeny Date: Sat, 19 Sep 2020 16:49:46 -0500 Subject: [PATCH 05/47] codeobj tracing prof protocol Change-Id: Ib49c8ee034fb7481b21f950490e10b350f2a1b79 (cherry picked from commit 6567c48e98b0cf3a10ff46bf411057642c307990) --- inc/ext/prof_protocol.h | 1 + 1 file changed, 1 insertion(+) diff --git a/inc/ext/prof_protocol.h b/inc/ext/prof_protocol.h index c29ff0e6..1c00e972 100644 --- a/inc/ext/prof_protocol.h +++ b/inc/ext/prof_protocol.h @@ -36,6 +36,7 @@ typedef enum { ACTIVITY_DOMAIN_KFD_API = 4, // KFD API domain ACTIVITY_DOMAIN_EXT_API = 5, // External ID domain ACTIVITY_DOMAIN_ROCTX = 6, // ROCTX domain + ACTIVITY_DOMAIN_HSA_EVT = 7, // HSA events ACTIVITY_DOMAIN_NUMBER } activity_domain_t; From 35bc1e93f84b9ed74de1f1fd39a2665d85d2cebb Mon Sep 17 00:00:00 2001 From: Evgeny Date: Wed, 9 Sep 2020 01:44:51 -0500 Subject: [PATCH 06/47] gen_ostream_ops.py fix - ostream operators as inline static Change-Id: I9688236b06dd167960662b8eecf1a07c93b43fff (cherry picked from commit c9ed0f067d779e89f94bb2d0a2d25618a69f4623) --- script/gen_ostream_ops.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/script/gen_ostream_ops.py b/script/gen_ostream_ops.py index 900c3677..73585ce8 100755 --- a/script/gen_ostream_ops.py +++ b/script/gen_ostream_ops.py @@ -167,7 +167,7 @@ def gen_cppheader(infilepath, outfilepath, structs_depth): continue if len(cppHeader.classes[c]["properties"]["public"])!=0: if apiname.lower() == 'hip' or apiname.lower() == 'hsa': - f.write("std::ostream& operator<<(std::ostream& out, const " + c + "& v)\n") + f.write("inline static std::ostream& operator<<(std::ostream& out, const " + c + "& v)\n") f.write("{\n") f.write(" roctracer::" + apiname.lower() + "_support::operator<<(out, '{');\n") if structs_depth != -1: From 5bf3efa8aa72817200410483bdfd554ac5f46f58 Mon Sep 17 00:00:00 2001 From: Evgeny Date: Mon, 7 Sep 2020 13:50:10 
-0500 Subject: [PATCH 07/47] build normalizing - generating under build directory Change-Id: Id9203aec7800024bd749059a415fb29b8051005a --- CMakeLists.txt | 11 +++++- build.sh | 2 +- script/hsaap.py | 6 +-- script/kfdap.py | 60 +++++++++++++++--------------- src/CMakeLists.txt | 41 ++++++++++---------- src/kfd/.gitignore | 1 - test/CMakeLists.txt | 3 +- test/MatrixTranspose/Makefile | 2 +- test/MatrixTranspose_test/Makefile | 2 +- test/run.sh | 2 +- 10 files changed, 69 insertions(+), 61 deletions(-) delete mode 100644 src/kfd/.gitignore diff --git a/CMakeLists.txt b/CMakeLists.txt index f1ad1982..97c06cf9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -100,12 +100,14 @@ set ( PUBLIC_HEADERS roctracer_kfd.h roctracer_roctx.h roctracer_cb_table.h + ext/prof_protocol.h + ext/hsa_rt_utils.hpp +) +set ( GEN_HEADERS hip_ostream_ops.h hsa_prof_str.h kfd_ostream_ops.h kfd_prof_str.h - ext/prof_protocol.h - ext/hsa_rt_utils.hpp ) if ( ${LIBRARY_TYPE} STREQUAL SHARED ) @@ -137,6 +139,11 @@ foreach ( header ${PUBLIC_HEADERS} ) install ( FILES ${CMAKE_CURRENT_SOURCE_DIR}/inc/${header} DESTINATION ${DEST_NAME}/include/${header_subdir} ) install ( FILES ${CMAKE_CURRENT_SOURCE_DIR}/inc/${header} DESTINATION include/${DEST_NAME}/${header_subdir} ) endforeach () +foreach ( header ${GEN_HEADERS} ) + get_filename_component ( header_subdir ${header} DIRECTORY ) + install ( FILES ${PROJECT_BINARY_DIR}/inc/${header} DESTINATION ${DEST_NAME}/include/${header_subdir} ) + install ( FILES ${PROJECT_BINARY_DIR}/inc/${header} DESTINATION include/${DEST_NAME}/${header_subdir} ) +endforeach () #install ( FILES ${PROJECT_BINARY_DIR}/inc-link DESTINATION include RENAME ${DEST_NAME} ) install ( FILES ${PROJECT_BINARY_DIR}/so-link DESTINATION lib RENAME ${ROCTRACER_LIBRARY}.so ) install ( FILES ${PROJECT_BINARY_DIR}/so-major-link DESTINATION lib RENAME ${ROCTRACER_LIBRARY}.so.${LIB_VERSION_MAJOR} ) diff --git a/build.sh b/build.sh index a8515b69..cae5a59a 100755 --- a/build.sh +++ b/build.sh 
@@ -1,4 +1,4 @@ -#!/bin/bash -x +#!/bin/bash -e SRC_DIR=`dirname $0` COMPONENT="roctracer" ROCM_PATH="${ROCM_PATH:=/opt/rocm}" diff --git a/script/hsaap.py b/script/hsaap.py index a50b3d4d..84ee9bbf 100755 --- a/script/hsaap.py +++ b/script/hsaap.py @@ -490,15 +490,15 @@ def gen_out_stream(self, n, name, call, struct): # main # Usage if len(sys.argv) != 3: - print ("Usage:", sys.argv[0], " ", file=sys.stderr) + print ("Usage:", sys.argv[0], " ", file=sys.stderr) sys.exit(1) else: - ROOT = sys.argv[1] + '/' + PREFIX = sys.argv[1] + '/' HSA_DIR = sys.argv[2] + '/' descr = API_DescrParser(OUT, HSA_DIR, API_TABLES_H, API_HEADERS_H, LICENSE) -out_file = ROOT + OUT +out_file = PREFIX + OUT print ('Generating "' + out_file + '"') f = open(out_file, 'w') f.write(descr.content[:-1]) diff --git a/script/kfdap.py b/script/kfdap.py index e920bbf6..9f560a35 100755 --- a/script/kfdap.py +++ b/script/kfdap.py @@ -2,10 +2,10 @@ from __future__ import print_function import os, sys, re -OUT_H = 'inc/kfd_prof_str.h' -OUT_CPP = 'src/kfd/kfd_wrapper.cpp' -API_HEADERS_H = ( - ('HSAKMTAPI', 'hsakmt.h'), +OUT_H = 'inc/kfd_prof_str.h' +OUT_CPP = 'src/kfd_wrapper.cpp' +API_HEADERS_H = ( + ('HSAKMTAPI', 'hsakmt.h'), ) LICENSE = \ @@ -38,7 +38,7 @@ def fatal(module, msg): sys.exit(1) # Get next text block -def NextBlock(pos, record): +def NextBlock(pos, record): if len(record) == 0: return pos space_pattern = re.compile(r'(\s+)') @@ -82,8 +82,8 @@ def __init__(self, header, name, full_fct): self.inp = open(header, 'r') - self.beg_pattern = re.compile(name) - self.end_pattern = re.compile('.*\)\s*;\s*$'); + self.beg_pattern = re.compile(name) + self.end_pattern = re.compile('.*\)\s*;\s*$'); self.array = [] self.parse() @@ -92,10 +92,10 @@ def norm_line(self, line): return re.sub(r'^\s+', r' ', line) def fix_comment_line(self, line): - return re.sub(r'\/\/.*', r'', line) + return re.sub(r'\/\/.*', r'', line) def remove_ret_line(self, line): - return re.sub(r'\n', r'', line) + return 
re.sub(r'\n', r'', line) # check for start record def is_start(self, record): @@ -107,7 +107,7 @@ def is_end(self, record): # check for declaration entry record def is_entry(self, record): - return re.match(r'^\s*HSAKMTAPI\s*(.*)\s*\((.*)\)', record) + return re.match(r'^\s*HSAKMTAPI\s*(.*)\s*\((.*)\)', record) # parse method def parse(self): @@ -121,7 +121,7 @@ def parse(self): line = self.norm_line(line) line = self.fix_comment_line(line) - if cumulate == 1: record += " " + line; + if cumulate == 1: record += " " + line; else: record = line; if self.is_start(line): rettype = prev_line.strip(); cumulate = 1; prev_line = line; continue; if self.is_end(line): record = self.remove_ret_line(record); cumulate = 0; active = 1; @@ -132,7 +132,7 @@ def parse(self): mycall_full = rettype + " " + m.group(1) + ' (' + m.group(2) + ')' mycall = m.group(1) self.full_fct[mycall] = mycall_full - self.array.append(mycall) + self.array.append(mycall) rettype = ""; prev_line = line @@ -173,7 +173,7 @@ def get_args(self, record): struct = {'ret': '', 'args': '', 'astr': {}, 'alst': [], 'tlst': []} record = re.sub(r'^\s+', r'', record) record = re.sub(r'\s*(\*+)\s*', r'\1 ', record) - rind = NextBlock(0, record) + rind = NextBlock(0, record) struct['ret'] = record[0:rind] pos = record.find('(') end = NextBlock(pos, record); @@ -184,7 +184,7 @@ def get_args(self, record): struct['args'] = re.sub(r',', r', ', args) if args == "void": return struct - + if len(args) == 0: return struct pos = 0 @@ -217,7 +217,7 @@ def get_args(self, record): # parse given api def parse(self, call, full_fct): - if call in full_fct: + if call in full_fct: self.data[call] = self.get_args(full_fct[call]) else: self.data[call] = self.get_args(call) @@ -238,7 +238,7 @@ def __init__(self, out_file, kfd_dir, api_headers, license): self.api_calls = {} self.api_rettypes = set() self.api_id = {} - + api_data = {} full_fct = {} api_list = [] @@ -271,7 +271,7 @@ def __init__(self, out_file, kfd_dir, api_headers, 
license): self.ns_calls = ns_calls self.content_h += "// automatically generated\n\n" + license + '\n' - + self.content_h += "/////////////////////////////////////////////////////////////////////////////\n" for call in self.ns_calls: self.content_h += '// ' + call + ' was not parsed\n' @@ -298,7 +298,7 @@ def __init__(self, out_file, kfd_dir, api_headers, license): self.content_h += 'namespace kfd_support {\n' self.add_section('API get_name function', ' ', self.gen_get_name) - self.add_section('API get_code function', ' ', self.gen_get_code) + self.add_section('API get_code function', ' ', self.gen_get_code) self.add_section('API intercepting code', '', self.gen_intercept_decl) self.add_section('API intercepting code', '', self.gen_intercept) @@ -369,7 +369,7 @@ def gen_id_enum(self, n, name, call, data): self.content_h += ' KFD_API_ID_NUMBER = ' + str(n) + ',\n' self.content_h += ' KFD_API_ID_ANY = ' + str(n + 1) + ',\n' self.content_h += '};\n' - + # generate API args structure def gen_arg_struct(self, n, name, call, struct): if n == -1: @@ -396,7 +396,7 @@ def gen_arg_struct(self, n, name, call, struct): else: self.content_h += ' } args;\n' self.content_h += '} kfd_api_data_t;\n' - + # generate API callbacks def gen_callbacks(self, n, name, call, struct): if n == -1: @@ -406,7 +406,7 @@ def gen_callbacks(self, n, name, call, struct): if call != '-': call_id = self.api_id[call]; ret_type = struct['ret'] - self.content_h += ret_type + ' ' + call + '_callback(' + struct['args'] + ') {\n' # 'static ' + + self.content_h += ret_type + ' ' + call + '_callback(' + struct['args'] + ') {\n' # 'static ' + self.content_h += ' if (' + name + '_table == NULL) intercept_KFDApiTable();\n' self.content_h += ' kfd_api_data_t api_data{};\n' for var in struct['alst']: @@ -448,7 +448,7 @@ def gen_intercept(self, n, name, call, struct): if call != '-': self.content_h += ' typedef decltype(' + name + '_table_t::' + call + '_fn) ' + call + '_t;\n' - self.content_h += ' ' + name + 
'_table->' + call + '_fn = (' + call + '_t)' + 'dlsym(RTLD_NEXT,\"' + call + '\");\n' + self.content_h += ' ' + name + '_table->' + call + '_fn = (' + call + '_t)' + 'dlsym(RTLD_NEXT,\"' + call + '\");\n' # generate API name function def gen_get_name(self, n, name, call, struct): @@ -493,7 +493,7 @@ def gen_out_stream(self, n, name, call, struct): arg_var = arg_list[ind] arg_val = 'api_data.args.' + call + '.' + arg_var if re.search(r'MemFlags',arg_var): - continue + continue self.content_h += ' typedef decltype(' + arg_val.replace("[]","") + ') arg_val_type_t' + str(ind) + ';\n' self.content_h += ' roctracer::kfd_support::output_streamer::put(out, ' + arg_val.replace("[]","") + ')' if ind < len(arg_list)-1: self.content_h += ' << ", ";\n' @@ -510,11 +510,11 @@ def gen_out_stream(self, n, name, call, struct): self.content_h += ' abort();\n' self.content_h += ' }\n' self.content_h += ' return out;\n' - self.content_h += '}\n' + self.content_h += '}\n' self.content_h += '#endif\n' - self.content_cpp += 'inline std::ostream& operator<< (std::ostream& out, const HsaMemFlags& v) { out << "HsaMemFlags"; return out; }\n' + self.content_cpp += 'inline std::ostream& operator<< (std::ostream& out, const HsaMemFlags& v) { out << "HsaMemFlags"; return out; }\n' - # generate PUBLIC_API for all API fcts + # generate PUBLIC_API for all API fcts def gen_public_api(self, n, name, call, struct): if n == -1: self.content_cpp += 'extern "C" {\n' @@ -540,21 +540,21 @@ def gen_public_api(self, n, name, call, struct): # main # Usage if len(sys.argv) != 3: - print ("Usage:", sys.argv[0], " ", file = sys.stderr) + print ("Usage:", sys.argv[0], " ", file = sys.stderr) sys.exit(1) else: - ROOT = sys.argv[1] + '/' + PREFIX = sys.argv[1] + '/' KFD_DIR = sys.argv[2] + '/' descr = API_DescrParser(OUT_H, KFD_DIR, API_HEADERS_H, LICENSE) -out_file = ROOT + OUT_H +out_file = PREFIX + OUT_H print ('Generating "' + out_file + '"') f = open(out_file, 'w') f.write(descr.content_h[:-1]) f.close() 
-out_file = ROOT + OUT_CPP +out_file = PREFIX + OUT_CPP print ('Generating "' + out_file + '"') f = open(out_file, 'w') f.write(descr.content_cpp[:-1]) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index ceb33c74..c794c491 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -1,7 +1,19 @@ -# +# Generating tracing primitives +set ( GEN_INC_DIR ${PROJECT_BINARY_DIR}/inc ) +set ( GEN_SRC_DIR ${PROJECT_BINARY_DIR}/src ) +execute_process ( COMMAND sh -xc "mkdir -p ${GEN_INC_DIR}" ) +execute_process ( COMMAND sh -xc "mkdir -p ${GEN_SRC_DIR}" ) +execute_process ( COMMAND sh -xc "${ROOT_DIR}/script/hsaap.py ${PROJECT_BINARY_DIR} ${HSA_RUNTIME_INC_PATH}" ) +execute_process ( COMMAND sh -xc "${ROOT_DIR}/script/kfdap.py ${PROJECT_BINARY_DIR} ${HSA_KMT_INC_PATH}" ) +execute_process ( COMMAND sh -xc "${CMAKE_C_COMPILER} -E ${HSA_KMT_INC_PATH}/hsakmttypes.h > ${GEN_INC_DIR}/hsakmttypes_pp.h" ) +execute_process ( COMMAND sh -xc "${ROOT_DIR}/script/gen_ostream_ops.py -in ${GEN_INC_DIR}/hsakmttypes_pp.h -out ${GEN_INC_DIR}/kfd_ostream_ops.h" ) +execute_process ( COMMAND sh -xc "${CMAKE_C_COMPILER} -E ${HIP_PATH}/include/hip/hip_runtime_api.h ${HIP_DEFINES} -I${HIP_PATH}/include -I${ROCM_ROOT_DIR}/hsa/include > ${GEN_INC_DIR}/hip_runtime_api_pp.h" ) +execute_process ( COMMAND sh -xc "${ROOT_DIR}/script/gen_ostream_ops.py -in ${GEN_INC_DIR}/hip_runtime_api_pp.h -out ${GEN_INC_DIR}/hip_ostream_ops.h" ) +execute_process ( COMMAND sh -xc "${CMAKE_C_COMPILER} -E ${HSA_RUNTIME_INC_PATH}/hsa.h > ${GEN_INC_DIR}/hsa_pp.h" ) +execute_process ( COMMAND sh -xc "${ROOT_DIR}/script/gen_ostream_ops.py -in ${GEN_INC_DIR}/hsa_pp.h -out ${GEN_INC_DIR}/hsa_ostream_ops.h" ) + # Build dynamic Library object -# -set ( TARGET_LIB "${TARGET_NAME}" ) +set ( TARGET_LIB ${TARGET_NAME} ) set ( LIB_SRC ${LIB_DIR}/core/roctracer.cpp ${LIB_DIR}/proxy/proxy_queue.cpp @@ -10,33 +22,22 @@ set ( LIB_SRC ${LIB_DIR}/util/hsa_rsrc_factory.cpp ) add_library ( ${TARGET_LIB} ${LIBRARY_TYPE} ${LIB_SRC} ) 
-target_include_directories ( ${TARGET_LIB} PRIVATE ${LIB_DIR} ${ROOT_DIR} ${ROOT_DIR}/inc ${HSA_RUNTIME_INC_PATH} ${HSA_RUNTIME_HSA_INC_PATH} ${HIP_INC_DIR} ${HCC_INC_DIR} ${HSA_KMT_INC_PATH} ) +target_include_directories ( ${TARGET_LIB} PRIVATE ${LIB_DIR} ${ROOT_DIR} ${ROOT_DIR}/inc ${HSA_RUNTIME_INC_PATH} ${HSA_RUNTIME_HSA_INC_PATH} ${HIP_INC_DIR} ${HCC_INC_DIR} ${HSA_KMT_INC_PATH} ${GEN_INC_DIR} ) target_link_libraries( ${TARGET_LIB} PRIVATE ${HSA_RUNTIME_LIB} c stdc++ ) -# Generating HSA tracing primitives -execute_process ( COMMAND sh -xc "${ROOT_DIR}/script/hsaap.py ${ROOT_DIR} ${HSA_RUNTIME_INC_PATH}" ) - -# Generating KFD/Thunk tracing primitives +# Build KFD/Thunk tracing library set ( KFD_LIB "kfdwrapper64" ) -set ( KFD_LIB_SRC - ${LIB_DIR}/kfd/kfd_wrapper.cpp -) -execute_process ( COMMAND sh -xc "${CMAKE_CXX_COMPILER} -E ${HSA_KMT_INC_PATH}/hsakmttypes.h > ${PROJECT_BINARY_DIR}/hsakmttypes_pp.h" ) -execute_process ( COMMAND sh -xc "${ROOT_DIR}/script/gen_ostream_ops.py -in ${PROJECT_BINARY_DIR}/hsakmttypes_pp.h -out ${ROOT_DIR}/inc/kfd_ostream_ops.h" ) -execute_process ( COMMAND sh -xc "${CMAKE_C_COMPILER} ${HIP_DEFINES} -I${HIP_PATH}/include -I${ROCM_ROOT_DIR}/hsa/include -E ${HIP_PATH}/include/hip/hip_runtime_api.h > ${PROJECT_BINARY_DIR}/hip_runtime_api_pp.h" ) -execute_process ( COMMAND sh -xc "${ROOT_DIR}/script/gen_ostream_ops.py -in ${PROJECT_BINARY_DIR}/hip_runtime_api_pp.h -out ${ROOT_DIR}/inc/hip_ostream_ops.h" ) -execute_process ( COMMAND sh -xc "${CMAKE_C_COMPILER} -E ${HSA_RUNTIME_INC_PATH}/hsa.h > ${PROJECT_BINARY_DIR}/hsa_pp.h" ) -execute_process ( COMMAND sh -xc "${ROOT_DIR}/script/gen_ostream_ops.py -in ${PROJECT_BINARY_DIR}/hsa_pp.h -out ${ROOT_DIR}/inc/hsa_ostream_ops.h" ) +set ( KFD_LIB_SRC ${GEN_SRC_DIR}/kfd_wrapper.cpp) add_library ( ${KFD_LIB} SHARED ${KFD_LIB_SRC} ) -target_include_directories ( ${KFD_LIB} PRIVATE ${LIB_DIR} ${ROOT_DIR} ${ROOT_DIR}/inc ${HSA_RUNTIME_INC_PATH} ${HSA_RUNTIME_HSA_INC_PATH} ${HSA_KMT_INC_PATH} ) 
+target_include_directories ( ${KFD_LIB} PRIVATE ${LIB_DIR} ${ROOT_DIR} ${ROOT_DIR}/inc ${HSA_RUNTIME_INC_PATH} ${HSA_RUNTIME_HSA_INC_PATH} ${HSA_KMT_INC_PATH} ${GEN_INC_DIR} ) target_link_libraries( ${KFD_LIB} PRIVATE c stdc++ ) -execute_process ( COMMAND sh -xc "${ROOT_DIR}/script/kfdap.py ${ROOT_DIR} ${HSA_KMT_INC_PATH}" ) +# Build ROCTX tracing library set ( ROCTX_LIB "roctx64" ) set ( ROCTX_LIB_SRC ${LIB_DIR}/roctx/roctx.cpp ${LIB_DIR}/roctx/roctx_intercept.cpp ) add_library ( ${ROCTX_LIB} SHARED ${ROCTX_LIB_SRC} ) -target_include_directories ( ${ROCTX_LIB} PRIVATE ${LIB_DIR} ${ROOT_DIR} ${ROOT_DIR}/inc ${HSA_RUNTIME_INC_PATH} ${HSA_RUNTIME_HSA_INC_PATH} ) +target_include_directories ( ${ROCTX_LIB} PRIVATE ${LIB_DIR} ${ROOT_DIR} ${ROOT_DIR}/inc ${HSA_RUNTIME_INC_PATH} ${HSA_RUNTIME_HSA_INC_PATH} ${GEN_INC_DIR} ) target_link_libraries( ${ROCTX_LIB} PRIVATE c stdc++ ) diff --git a/src/kfd/.gitignore b/src/kfd/.gitignore deleted file mode 100644 index 0c2acea7..00000000 --- a/src/kfd/.gitignore +++ /dev/null @@ -1 +0,0 @@ -kfd_wrapper.cpp diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 6a6d7d17..c1b56c0c 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -50,6 +50,7 @@ set ( HSA_REV "a657002" ) set ( RUN_SCRIPT "${TEST_DIR}/run.sh" ) ## build HIP tests +set ( INC_PATH "${INC_PATH} ${PROJECT_BINARY_DIR}/inc" ) set ( TEST_ENV HIP_VDI=${HIP_VDI} ROCM_PATH=${ROCM_ROOT_DIR} HSA_PATH=${ROCM_ROOT_DIR}/hsa INC_PATH=${INC_PATH} LIB_PATH=${LIB_PATH} HIPCC_VERBOSE=3 ) add_custom_target( mytest COMMAND ${TEST_ENV} make -C "${TEST_DIR}/MatrixTranspose" @@ -75,7 +76,7 @@ if ( DEFINED ROCTRACER_TARGET ) set ( TEST_LIB "tracer_tool" ) set ( TEST_LIB_SRC ${TEST_DIR}/tool/tracer_tool.cpp ${UTIL_SRC} ) add_library ( ${TEST_LIB} SHARED ${TEST_LIB_SRC} ) - target_include_directories ( ${TEST_LIB} PRIVATE ${HSA_TEST_DIR} ${ROOT_DIR} ${ROOT_DIR}/inc ${HSA_RUNTIME_INC_PATH} ${HSA_RUNTIME_HSA_INC_PATH} ${HIP_INC_DIR} ${HCC_INC_DIR} ${HSA_KMT_INC_PATH} ) + 
target_include_directories ( ${TEST_LIB} PRIVATE ${HSA_TEST_DIR} ${ROOT_DIR} ${ROOT_DIR}/inc ${HSA_RUNTIME_INC_PATH} ${HSA_RUNTIME_HSA_INC_PATH} ${HIP_INC_DIR} ${HCC_INC_DIR} ${HSA_KMT_INC_PATH} ${PROJECT_BINARY_DIR}/inc ) target_link_libraries ( ${TEST_LIB} ${ROCTRACER_TARGET} ${HSA_RUNTIME_LIB} c stdc++ dl pthread rt ) endif () diff --git a/test/MatrixTranspose/Makefile b/test/MatrixTranspose/Makefile index 647067dd..9a805fb1 100644 --- a/test/MatrixTranspose/Makefile +++ b/test/MatrixTranspose/Makefile @@ -23,7 +23,7 @@ EXECUTABLE=./MatrixTranspose all: clean $(EXECUTABLE) -CXXFLAGS =-g -I$(INC_PATH) -DLOCAL_BUILD=1 --rocm-path=$(ROCM_PATH) +CXXFLAGS =-g $(INC_PATH:%=-I%) -DLOCAL_BUILD=1 --rocm-path=$(ROCM_PATH) CXX=$(HIPCC) $(EXECUTABLE): $(OBJECTS) diff --git a/test/MatrixTranspose_test/Makefile b/test/MatrixTranspose_test/Makefile index 3e879ee8..758f8d94 100644 --- a/test/MatrixTranspose_test/Makefile +++ b/test/MatrixTranspose_test/Makefile @@ -17,7 +17,7 @@ TARGET=hcc EXECUTABLE=./MatrixTranspose OBJECTS = MatrixTranspose.o -FLAGS =-g -I$(INC_PATH) -I$(ROCM_PATH)/hsa/include/hsa -I$(ROCM_PATH)/hsa/include -I$(ROCM_PATH)/hip/include -I$(ROCM_PATH)/include -DLOCAL_BUILD=1 -DHIP_VDI=${HIP_VDI} -DITERATIONS=$(ITERATIONS) -DAMD_INTERNAL_BUILD=1 +FLAGS =-g $(INC_PATH:%=-I%) -I$(ROCM_PATH)/hsa/include/hsa -I$(ROCM_PATH)/hsa/include -I$(ROCM_PATH)/hip/include -I$(ROCM_PATH)/include -DLOCAL_BUILD=1 -DHIP_VDI=${HIP_VDI} -DITERATIONS=$(ITERATIONS) -DAMD_INTERNAL_BUILD=1 ifeq ($(C_TEST), 1) COMP=${CC} diff --git a/test/run.sh b/test/run.sh index c5c8aa45..c5931061 100755 --- a/test/run.sh +++ b/test/run.sh @@ -1,4 +1,4 @@ -#!/bin/sh -x +#!/bin/sh ################################################################################ # Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. 
From 401c48b8b79f93b882e0bc863e6847544962a45f Mon Sep 17 00:00:00 2001 From: Evgeny Date: Wed, 16 Sep 2020 20:32:18 -0400 Subject: [PATCH 08/47] testing using v3 object Change-Id: Ifca31d632726ab83f4c672b46cd9b97f817e757d --- test/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index c1b56c0c..ce003b7c 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -44,7 +44,7 @@ endif () ## Path to HSA test set ( HSA_TEST_DIR "${TEST_DIR}/hsa/test" ) -set ( HSA_REV "a657002" ) +set ( HSA_REV "19b1191" ) ## test run script set ( RUN_SCRIPT "${TEST_DIR}/run.sh" ) From 96ea2d613b07a7e0824741e6a104986130bc77e7 Mon Sep 17 00:00:00 2001 From: Evgeny Date: Fri, 11 Sep 2020 10:29:06 -0500 Subject: [PATCH 09/47] HCC_HOME env cleanup Change-Id: I2b00e5d310e6349fc52d5df60aae85f4c06adebe --- README.md | 3 +-- build.sh | 1 - build_static.sh | 1 - cmake_modules/env.cmake | 8 +------- run_test.sh | 1 - 5 files changed, 2 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index 8f3e8481..e700ee40 100644 --- a/README.md +++ b/README.md @@ -57,9 +57,8 @@ rocTX API: - Set environment: export CMAKE_PREFIX_PATH=/opt/rocm - - To use custom HIP/HCC versions: + - To use custom HIP version: export HIP_PATH=/opt/rocm/hip - export HCC_HOME=/opt/rocm/hcc - To build roctracer library: export CMAKE_BUILD_TYPE= # release by default diff --git a/build.sh b/build.sh index cae5a59a..a5201275 100755 --- a/build.sh +++ b/build.sh @@ -17,7 +17,6 @@ if [ -e "$DEFAULTS" ] ; then source "$DEFAULTS"; fi if [ -z "$ROCTRACER_ROOT" ]; then ROCTRACER_ROOT=$SRC_DIR; fi if [ -z "$BUILD_DIR" ] ; then BUILD_DIR=$PWD; fi if [ -z "$HIP_PATH" ] ; then export HIP_PATH="$ROCM_PATH/hip"; fi -if [ -z "$HCC_HOME" ] ; then export HCC_HOME="$ROCM_PATH/hcc"; fi if [ -z "$BUILD_TYPE" ] ; then BUILD_TYPE="release"; fi if [ -z "$PACKAGE_ROOT" ] ; then PACKAGE_ROOT=$ROCM_PATH; fi if [ -z "$PACKAGE_PREFIX" ] ; then 
PACKAGE_PREFIX="$ROCM_PATH/$COMPONENT"; fi diff --git a/build_static.sh b/build_static.sh index 938df3ce..bb6ecf29 100755 --- a/build_static.sh +++ b/build_static.sh @@ -17,7 +17,6 @@ if [ -e "$DEFAULTS" ] ; then source "$DEFAULTS"; fi if [ -z "$ROCTRACER_ROOT" ]; then ROCTRACER_ROOT=$SRC_DIR; fi if [ -z "$BUILD_DIR" ] ; then BUILD_DIR=$PWD; fi if [ -z "$HIP_PATH" ] ; then export HIP_PATH="$ROCM_PATH/hip"; fi -if [ -z "$HCC_HOME" ] ; then export HCC_HOME="$ROCM_PATH/hcc"; fi if [ -z "$BUILD_TYPE" ] ; then BUILD_TYPE="release"; fi if [ -z "$PACKAGE_ROOT" ] ; then PACKAGE_ROOT=$ROCM_PATH; fi if [ -z "$PACKAGE_PREFIX" ] ; then PACKAGE_PREFIX="$ROCM_PATH/$COMPONENT"; fi diff --git a/cmake_modules/env.cmake b/cmake_modules/env.cmake index 8dbf2c9c..41f6253a 100644 --- a/cmake_modules/env.cmake +++ b/cmake_modules/env.cmake @@ -89,7 +89,7 @@ else() set ( HIP_DEFINES "-D__HIP_PLATFORM_HCC__=1") endif() -## Enable HIP/HCC local build +## Enable HIP local build if ( DEFINED LOCAL_BUILD ) add_definitions ( -DLOCAL_BUILD=${LOCAL_BUILD} ) else() @@ -114,15 +114,10 @@ if ( NOT DEFINED CMAKE_PREFIX_PATH AND DEFINED ENV{CMAKE_PREFIX_PATH} ) endif() set ( ENV{CMAKE_PREFIX_PATH} ${CMAKE_PREFIX_PATH} ) -set ( HCC_HOME "/opt/rocm/hcc" ) set ( HIP_PATH "/opt/rocm/hip" ) -if ( DEFINED ENV{HCC_HOME} ) - set ( HCC_HOME $ENV{HCC_HOME} ) -endif() if ( DEFINED ENV{HIP_PATH} ) set ( HIP_PATH $ENV{HIP_PATH} ) endif() -set ( HCC_INC_DIR "${HCC_HOME}/include" ) set ( HIP_INC_DIR "${HIP_PATH}/include" ) ## Extend Compiler flags based on build type @@ -170,7 +165,6 @@ message ( "-----HSA-Runtime-Lib: ${HSA_RUNTIME_LIB_PATH}" ) message ( "----HSA_KMT_LIB_PATH: ${HSA_KMT_LIB_PATH}" ) message ( "-------ROCM_ROOT_DIR: ${ROCM_ROOT_DIR}" ) message ( "-------------KFD-Inc: ${HSA_KMT_INC_PATH}" ) -message ( "-------------HCC-Inc: ${HCC_INC_DIR}" ) message ( "-------------HIP-Inc: ${HIP_INC_DIR}" ) message ( "-------------HIP-VDI: ${HIP_VDI}" ) message ( "-----CMAKE_CXX_FLAGS: ${CMAKE_CXX_FLAGS}" ) diff 
--git a/run_test.sh b/run_test.sh index c2ea74a6..61f1b301 100755 --- a/run_test.sh +++ b/run_test.sh @@ -7,7 +7,6 @@ fatal() { } if [ -z "$BUILD_DIR" ] ; then export BUILD_DIR=$PWD; fi -if [ -z "$HCC_HOME" ] ; then export HCC_HOME="$ROCM_PATH/hcc"; fi cd $BUILD_DIR ./run.sh From 28e4b8e014517dde5c12a98c4a08501dda964e87 Mon Sep 17 00:00:00 2001 From: Evgeny Date: Fri, 25 Sep 2020 09:00:53 -0500 Subject: [PATCH 10/47] SWDEV-253997 : packaging fix: installing hsa_ostream_ops.h Change-Id: Ib739cbb7538473afc9744e12d2bd568635e78616 (cherry picked from commit 1d975e5ba587fdcb24e3ee0ae4b3ae9202a756de) --- CMakeLists.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 97c06cf9..fe3cecde 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -106,8 +106,9 @@ set ( PUBLIC_HEADERS set ( GEN_HEADERS hip_ostream_ops.h hsa_prof_str.h - kfd_ostream_ops.h + hsa_ostream_ops.h kfd_prof_str.h + kfd_ostream_ops.h ) if ( ${LIBRARY_TYPE} STREQUAL SHARED ) From 367e2c496dc0fedd156b73af8d64a5e64d5af778 Mon Sep 17 00:00:00 2001 From: Evgeny Date: Thu, 1 Oct 2020 01:28:43 -0400 Subject: [PATCH 11/47] SWDEV-251491 : gen_ostream_ops.py porting to python3 Change-Id: I7081b6ad21b038040267067bd73d8a44df46e4ff --- script/gen_ostream_ops.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/script/gen_ostream_ops.py b/script/gen_ostream_ops.py index 73585ce8..c8f23629 100755 --- a/script/gen_ostream_ops.py +++ b/script/gen_ostream_ops.py @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/python3 import os, sys, re import CppHeaderParser @@ -176,9 +176,9 @@ def gen_cppheader(infilepath, outfilepath, structs_depth): process_struct(f, c, cppHeader, "", apiname) global_str = "\n".join(global_str.split("\n")[0:-2]) if structs_depth != -1: #reindent - global_str = string.split(global_str, '\n') - global_str = [' ' + string.lstrip(line) for line in global_str] - global_str = string.join(global_str, '\n') + global_str = 
global_str.split('\n') + global_str = [' ' + line.lstrip() for line in global_str] + global_str = "\n".join(global_str) f.write(global_str+"\n") if structs_depth != -1: f.write(" };\n") @@ -199,9 +199,9 @@ def gen_cppheader(infilepath, outfilepath, structs_depth): process_struct(f, c, cppHeader, "", apiname) global_str = "\n".join(global_str.split("\n")[0:-2]) if structs_depth != -1: #reindent - global_str = string.split(global_str, '\n') - global_str = [' ' + string.lstrip(line) for line in global_str] - global_str = string.join(global_str, '\n') + global_str = global_str.split('\n') + global_str = [' ' + line.lstrip() for line in global_str] + global_str = "\n".join(global_str) f.write(global_str+"\n") if structs_depth != -1: f.write(" };\n") From fb0c230b0bf4fd822d01aa9c2633d12f0e15029c Mon Sep 17 00:00:00 2001 From: Evgeny Date: Tue, 22 Sep 2020 06:20:08 -0400 Subject: [PATCH 12/47] SWDEV-249924 : hip correlation id explicit propagation; tracer debug instrumentation; Change-Id: Ibbc411541f5610ce739f3fc1efa1ab7f605220f5 initial commit Change-Id: I34b360be62c2083819dc5c3acc8268bd69f2f58a --- cmake_modules/env.cmake | 5 ++++- src/core/roctracer.cpp | 37 ++++++++++++++++++++++++------ src/util/logger.h | 18 ++++++++++++++- test/tool/tracer_tool.cpp | 47 ++++++++++++++++++++++++++++++--------- 4 files changed, 88 insertions(+), 19 deletions(-) diff --git a/cmake_modules/env.cmake b/cmake_modules/env.cmake index 41f6253a..405f2665 100644 --- a/cmake_modules/env.cmake +++ b/cmake_modules/env.cmake @@ -61,8 +61,11 @@ if ( "$ENV{CXX}" STREQUAL "/usr/bin/clang++" ) endif() ## Enable debug trace +if ( DEFINED CMAKE_DEBUG_TRACE ) + add_definitions ( -DDEBUG_TRACE_ON=1 ) +endif() if ( DEFINED ENV{CMAKE_DEBUG_TRACE} ) - add_definitions ( -DDEBUG_TRACE=1 ) + add_definitions ( -DDEBUG_TRACE_ON=1 ) endif() if ( NOT DEFINED LIBRARY_TYPE ) diff --git a/src/core/roctracer.cpp b/src/core/roctracer.cpp index 52f1e28b..21f4b667 100644 --- a/src/core/roctracer.cpp +++
b/src/core/roctracer.cpp @@ -95,6 +95,7 @@ THE SOFTWARE. static inline uint32_t GetPid() { return syscall(__NR_getpid); } +static inline uint32_t GetTid() { return syscall(__NR_gettid); } /////////////////////////////////////////////////////////////////////////////////////////////////// // Mark callback @@ -294,13 +295,19 @@ static inline void CorrelationIdRegistr(const activity_correlation_id_t& correla if (correlation_id_map == NULL) correlation_id_map = new correlation_id_map_t; const auto ret = correlation_id_map->insert({correlation_id, correlation_id_tls}); if (ret.second == false) EXC_ABORT(ROCTRACER_STATUS_ERROR, "HCC activity id is not unique(" << correlation_id << ")"); + + DEBUG_TRACE("CorrelationIdRegistr id(%lu) id_tls(%lu)\n", correlation_id, correlation_id_tls); } static inline activity_correlation_id_t CorrelationIdLookup(const activity_correlation_id_t& correlation_id) { auto it = correlation_id_map->find(correlation_id); if (correlation_id_wait) while (it == correlation_id_map->end()) it = correlation_id_map->find(correlation_id); if (it == correlation_id_map->end()) EXC_ABORT(ROCTRACER_STATUS_ERROR, "HCC activity id lookup failed(" << correlation_id << ")"); - return it->second; + const activity_correlation_id_t ret_val = it->second; + + DEBUG_TRACE("CorrelationIdLookup id(%lu) ret(%lu)\n", correlation_id, ret_val); + + return ret_val; } typedef std::mutex hip_activity_mutex_t; @@ -341,6 +348,7 @@ void* HIP_SyncApiDataCallback( const void* callback_data, void* arg) { + void* ret = NULL; const hip_api_data_t* data = reinterpret_cast(callback_data); hip_api_data_t* data_ptr = const_cast(data); MemoryPool* pool = reinterpret_cast(arg); @@ -375,16 +383,20 @@ void* HIP_SyncApiDataCallback( // Passing correlatin ID correlation_id_tls = correlation_id; - return data_ptr; + ret = data_ptr; } else { // popping the record entry if (!record_pair_stack.empty()) record_pair_stack.pop(); // Clearing correlatin ID correlation_id_tls = 0; - - return NULL; } + + 
const char * name = roctracer_op_string(ACTIVITY_DOMAIN_HIP_API, op_id, 0); + DEBUG_TRACE("HIP_SyncApiDataCallback(\"%s\") phase(%d): op(%u) record(%p) data(%p) pool(%p) depth(%d) correlation_id(%lu)\n", + name, phase, op_id, record, data, pool, (int)(record_pair_stack.size()), (data_ptr) ? data_ptr->correlation_id : 0); + + return ret; } void* HIP_SyncActivityCallback( @@ -395,6 +407,7 @@ void* HIP_SyncActivityCallback( { static hsa_rt_utils::Timer timer; + void* ret = NULL; const hip_api_data_t* data = reinterpret_cast(callback_data); hip_api_data_t* data_ptr = const_cast(data); MemoryPool* pool = reinterpret_cast(arg); @@ -436,7 +449,7 @@ void* HIP_SyncActivityCallback( // Passing correlatin ID correlation_id_tls = correlation_id; - return data_ptr; + ret = data_ptr; } else { if (pool == NULL) EXC_ABORT(ROCTRACER_STATUS_ERROR, "ActivityCallback exit: pool is NULL"); @@ -469,9 +482,13 @@ void* HIP_SyncActivityCallback( // Clearing correlatin ID correlation_id_tls = 0; - - return NULL; } + + const char * name = roctracer_op_string(ACTIVITY_DOMAIN_HIP_API, op_id, 0); + DEBUG_TRACE("HIP_SyncActivityCallback(\"%s\") phase(%d): op(%u) record(%p) data(%p) pool(%p) depth(%d) correlation_id(%lu)\n", + name, phase, op_id, record, data, pool, (int)(record_pair_stack.size()), (data_ptr) ? 
data_ptr->correlation_id : 0); + + return ret; } void HCC_ActivityIdCallback(activity_correlation_id_t correlation_id) { @@ -484,6 +501,10 @@ void HCC_AsyncActivityCallback(uint32_t op_id, void* record, void* arg) { record_ptr->domain = ACTIVITY_DOMAIN_HCC_OPS; record_ptr->correlation_id = CorrelationIdLookup(record_ptr->correlation_id); pool->Write(*record_ptr); + + const char * name = roctracer_op_string(ACTIVITY_DOMAIN_HCC_OPS, record_ptr->op, record_ptr->kind); + DEBUG_TRACE("HCC_AsyncActivityCallback(\"%s\"): op(%u) kind(%u) record(%p) pool(%p) correlation_id(%d)\n", + name, record_ptr->op, record_ptr->kind, record, pool, record_ptr->correlation_id); } // Open output file @@ -673,6 +694,8 @@ PUBLIC_API const char* roctracer_op_string( return roctracer::HipLoader::Instance().ApiName(op); case ACTIVITY_DOMAIN_KFD_API: return roctracer::kfd_support::GetApiName(op); + case ACTIVITY_DOMAIN_EXT_API: + return "EXT_API"; default: EXC_RAISING(ROCTRACER_STATUS_BAD_DOMAIN, "invalid domain ID(" << domain << ")"); } diff --git a/src/util/logger.h b/src/util/logger.h index cd8dd470..8e525f68 100644 --- a/src/util/logger.h +++ b/src/util/logger.h @@ -100,10 +100,10 @@ class Logger { return *obj; } - private: static uint32_t GetPid() { return syscall(__NR_getpid); } static uint32_t GetTid() { return syscall(__NR_gettid); } + private: Logger() : file_(NULL), dirty_(false), streaming_(false), messaging_(false) { const char* path = getenv("ROCTRACER_LOG"); if (path != NULL) { @@ -198,4 +198,20 @@ class Logger { } while(0) #endif +#if DEBUG_TRACE_ON +inline static void DEBUG_TRACE(const char* fmt, ...) { + constexpr int size = 256; + char buf[size]; + + va_list valist; + va_start(valist, fmt); + vsnprintf(buf, size, fmt, valist); + printf("%u:%u %s", + roctracer::util::Logger::GetPid(), roctracer::util::Logger::GetTid(), buf); fflush(stdout); + va_end(valist); +} +#else +inline static void DEBUG_TRACE(const char* fmt, ...) 
{} +#endif + #endif // SRC_UTIL_LOGGER_H_ diff --git a/test/tool/tracer_tool.cpp b/test/tool/tracer_tool.cpp index ad866012..05855fd0 100644 --- a/test/tool/tracer_tool.cpp +++ b/test/tool/tracer_tool.cpp @@ -26,6 +26,7 @@ THE SOFTWARE. #include /* names denangle */ #include #include +#include #include #include #include /* SYS_xxx definitions */ @@ -78,6 +79,24 @@ THE SOFTWARE. #define ONLOAD_TRACE_BEG() ONLOAD_TRACE("begin") #define ONLOAD_TRACE_END() ONLOAD_TRACE("end") +static inline uint32_t GetPid() { return syscall(__NR_getpid); } +static inline uint32_t GetTid() { return syscall(__NR_gettid); } + +#if DEBUG_TRACE_ON +inline static void DEBUG_TRACE(const char* fmt, ...) { + constexpr int size = 256; + char buf[size]; + + va_list valist; + va_start(valist, fmt); + vsnprintf(buf, size, fmt, valist); + printf("%u:%u %s", GetPid(), GetTid(), buf); fflush(stdout); + va_end(valist); +} +#else +inline static void DEBUG_TRACE(const char* fmt, ...) {} +#endif + typedef hsa_rt_utils::Timer::timestamp_t timestamp_t; hsa_rt_utils::Timer* timer = NULL; thread_local timestamp_t hsa_begin_timestamp = 0; @@ -125,9 +144,6 @@ void close_file_handles() { if (pc_sample_file_handle) close_output_file(pc_sample_file_handle); } -static inline uint32_t GetPid() { return syscall(__NR_getpid); } -static inline uint32_t GetTid() { return syscall(__NR_gettid); } - static const uint32_t my_pid = GetPid(); // Error handler @@ -378,19 +394,20 @@ void hip_api_callback( { (void)arg; const hip_api_data_t* data = reinterpret_cast(callback_data); + const timestamp_t timestamp = timer->timestamp_fn_ns(); + hip_api_trace_entry_t* entry = NULL; if (data->phase == ACTIVITY_API_PHASE_ENTER) { - hip_begin_timestamp = timer->timestamp_fn_ns(); + hip_begin_timestamp = timestamp; } else { // Post onit of HIP APU args hipApiArgsInit((hip_api_id_t)cid, const_cast(data)); - const timestamp_t end_timestamp = timer->timestamp_fn_ns(); - hip_api_trace_entry_t* entry = hip_api_trace_buffer->GetEntry(); + entry 
= hip_api_trace_buffer->GetEntry(); entry->cid = cid; entry->domain = domain; entry->begin = hip_begin_timestamp; - entry->end = end_timestamp; + entry->end = timestamp; entry->pid = GetPid(); entry->tid = GetTid(); entry->data = *data; @@ -435,6 +452,10 @@ void hip_api_callback( entry->valid.store(roctracer::TRACE_ENTRY_COMPL, std::memory_order_release); } + + const char * name = roctracer_op_string(domain, cid, 0); + DEBUG_TRACE("hip_api_callback(\"%s\") phase(%d): cid(%u) data(%p) entry(%p) name(\"%s\") correlation_id(%lu)\n", + name, data->phase, cid, data, entry, (entry) ? entry->name : NULL, data->correlation_id); } void mark_api_callback( @@ -465,12 +486,10 @@ hip_kernel_map_t* hip_kernel_map = NULL; std::mutex hip_kernel_mutex; void hip_api_flush_cb(hip_api_trace_entry_t* entry) { - static uint64_t correlation_id = 0; - correlation_id += 1; - const uint32_t domain = entry->domain; const uint32_t cid = entry->cid; const hip_api_data_t* data = &(entry->data); + const uint64_t correlation_id = data->correlation_id; const timestamp_t begin_timestamp = entry->begin; const timestamp_t end_timestamp = entry->end; std::ostringstream rec_ss; @@ -480,6 +499,10 @@ void hip_api_flush_cb(hip_api_trace_entry_t* entry) { rec_ss << std::dec << begin_timestamp << ":" << end_timestamp << " " << entry->pid << ":" << entry->tid; oss << std::dec << rec_ss.str() << " " << str; + const char * name = roctracer_op_string(entry->domain, entry->cid, 0); + DEBUG_TRACE("hip_api_flush_cb(\"%s\"): domain(%u) cid(%u) entry(%p) name(\"%s\" correlation_id(%lu))\n", + name, entry->domain, entry->cid, entry, entry->name, correlation_id); + if (domain == ACTIVITY_DOMAIN_HIP_API) { #if HIP_PROF_HIP_API_STRING if (hip_api_stats != NULL) { @@ -496,6 +519,7 @@ void hip_api_flush_cb(hip_api_trace_entry_t* entry) { const char* kernel_name = cxx_demangle(entry->name); rec_ss << " kernel=" << kernel_name; } + rec_ss<< " :" << correlation_id; fprintf(hip_api_file_handle, "%s\n", rec_ss.str().c_str()); 
} #else // !HIP_PROF_HIP_API_STRING @@ -607,6 +631,9 @@ void pool_activity_callback(const char* begin, const char* end, void* arg) { while (record < end_record) { const char * name = roctracer_op_string(record->domain, record->op, record->kind); + DEBUG_TRACE("pool_activity_callback(\"%s\"): domain(%u) op(%u) kind(%u) record(%p) correlation_id(%lu)\n", + name, record->domain, record->op, record->kind, record, record->correlation_id); + switch(record->domain) { case ACTIVITY_DOMAIN_HCC_OPS: if (hip_memcpy_stats != NULL) { From 8bb2d3095ea86edb5b306685a38f5cc6654c1ddb Mon Sep 17 00:00:00 2001 From: Evgeny Date: Fri, 30 Oct 2020 02:09:09 -0500 Subject: [PATCH 13/47] SWDEV-258731 : cleanup Change-Id: I0bc4ca977ce44f864178e78ec339888f86cbed8a --- src/core/roctracer.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/core/roctracer.cpp b/src/core/roctracer.cpp index 21f4b667..a50f2d3d 100644 --- a/src/core/roctracer.cpp +++ b/src/core/roctracer.cpp @@ -95,7 +95,6 @@ THE SOFTWARE. static inline uint32_t GetPid() { return syscall(__NR_getpid); } -static inline uint32_t GetTid() { return syscall(__NR_gettid); } /////////////////////////////////////////////////////////////////////////////////////////////////// // Mark callback From 9f02bb977f464e4edee44f2828b31a39073d1a01 Mon Sep 17 00:00:00 2001 From: Pruthvi Madugundu Date: Fri, 5 Jun 2020 00:56:19 -0700 Subject: [PATCH 14/47] Add RUNPATH to libtracer_tool.so - All libs will have RUNPATH - libtracer_tool.so is added with RUNPATH based on ROCM_RPATH when defined else not set. 
Signed-off-by: Pruthvi Madugundu Change-Id: I6515e603c82e1360e03eca2967f6a85e5faadc9a --- cmake_modules/env.cmake | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/cmake_modules/env.cmake b/cmake_modules/env.cmake index 405f2665..f7824148 100644 --- a/cmake_modules/env.cmake +++ b/cmake_modules/env.cmake @@ -53,6 +53,11 @@ set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC" ) set ( CMAKE_SHARED_LINKER_FLAGS "-Wl,-Bdynamic -Wl,-z,noexecstack" ) +## Set RUNPATH if ROCM_RPATH is defined and passed by the environment +if ( DEFINED ROCM_RPATH ) + set ( CMAKE_SHARED_LINKER_FLAGS " -Wl,--enable-new-dtags -Wl,--rpath,${ROCM_RPATH} ${CMAKE_SHARED_LINKER_FLAGS}" ) +endif () + set ( CMAKE_SKIP_BUILD_RPATH TRUE ) ## CLANG options From 73fb6ea9a6ce06f16966e55ea3bf5b03fc0f6981 Mon Sep 17 00:00:00 2001 From: Cole Nelson Date: Fri, 16 Oct 2020 12:20:40 -0700 Subject: [PATCH 15/47] CMakeList.txt: conformant package names Still needs valid email ID in the form of package_name.support@amd.com. SWDEV-257322 Names complete as built (internal) : roctracer-dev_1.0.0.40000-crdnnv.444_amd64.deb roctracer-dev-1.0.0.40000-crdnnv.444.el7.x86_64.rpm These changes are to satisfy: http://confluence.amd.com/display/GPUCPT/Package+File+Naming Change-Id: I5991326eb87d7dfa1304e3b2c5afb78f5a0c0361 Signed-off-by: Cole Nelson (cherry picked from commit 16ad4e9de702f8eeb2d7eb2cbb0db493d21af1bc) --- CMakeLists.txt | 35 +++++++++++++++++++++++++++++++++-- 1 file changed, 33 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index fe3cecde..f5c1d73c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -177,18 +177,49 @@ install ( TARGETS "kfdwrapper64" LIBRARY DESTINATION ${DEST_NAME}/lib ) ## Packaging directives set ( CPACK_GENERATOR "DEB" "RPM" "TGZ" ) set ( CPACK_PACKAGE_NAME "${ROCTRACER_NAME}-dev" ) -set ( CPACK_PACKAGE_VENDOR "AMD" ) +set ( CPACK_PACKAGE_VENDOR "Advanced Micro Devices, Inc." 
) set ( CPACK_PACKAGE_VERSION_MAJOR ${BUILD_VERSION_MAJOR} ) set ( CPACK_PACKAGE_VERSION_MINOR ${BUILD_VERSION_MINOR} ) set ( CPACK_PACKAGE_VERSION_PATCH ${BUILD_VERSION_PATCH} ) -set ( CPACK_PACKAGE_CONTACT "Advanced Micro Devices Inc." ) +set ( CPACK_PACKAGE_VERSION "${CPACK_PACKAGE_VERSION_MAJOR}.${CPACK_PACKAGE_VERSION_MINOR}.${CPACK_PACKAGE_VERSION_PATCH}" ) +if ( DEFINED ENV{ROCM_LIBPATCH_VERSION} ) + set ( CPACK_PACKAGE_VERSION "${CPACK_PACKAGE_VERSION}.$ENV{ROCM_LIBPATCH_VERSION}" ) +endif() +message ( "-- CPACK_PACKAGE_VERSION: ${CPACK_PACKAGE_VERSION}" ) +set ( CPACK_PACKAGE_CONTACT "TODO " ) set ( CPACK_PACKAGE_DESCRIPTION_SUMMARY "AMD ROCTRACER library" ) set ( CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE" ) ## Debian package specific variables +if ( DEFINED ENV{CPACK_DEBIAN_PACKAGE_RELEASE} ) + set ( CPACK_DEBIAN_PACKAGE_RELEASE $ENV{CPACK_DEBIAN_PACKAGE_RELEASE} ) +else() + set ( CPACK_DEBIAN_PACKAGE_RELEASE "local" ) +endif() +message ( "Using CPACK_DEBIAN_PACKAGE_RELEASE ${CPACK_DEBIAN_PACKAGE_RELEASE}" ) +set ( CPACK_DEBIAN_FILE_NAME "DEB-DEFAULT" ) set ( CPACK_DEBIAN_PACKAGE_CONTROL_EXTRA "${CMAKE_CURRENT_SOURCE_DIR}/DEBIAN/postinst;${CMAKE_CURRENT_SOURCE_DIR}/DEBIAN/prerm" ) ## RPM package specific variables +if ( DEFINED ENV{CPACK_RPM_PACKAGE_RELEASE} ) + set ( CPACK_RPM_PACKAGE_RELEASE $ENV{CPACK_RPM_PACKAGE_RELEASE} ) +else() + set ( CPACK_RPM_PACKAGE_RELEASE "local" ) +endif() +message ( "Using CPACK_RPM_PACKAGE_RELEASE ${CPACK_RPM_PACKAGE_RELEASE}" ) + +## 'dist' breaks manual builds on debian systems due to empty Provides +execute_process( COMMAND rpm --eval %{?dist} + RESULT_VARIABLE PROC_RESULT + OUTPUT_VARIABLE EVAL_RESULT + OUTPUT_STRIP_TRAILING_WHITESPACE ) +message("RESULT_VARIABLE ${PROC_RESULT} OUTPUT_VARIABLE: ${EVAL_RESULT}") + +if ( PROC_RESULT EQUAL "0" AND NOT EVAL_RESULT STREQUAL "" ) + string ( APPEND CPACK_RPM_PACKAGE_RELEASE "%{?dist}" ) +endif() +set ( CPACK_RPM_FILE_NAME "RPM-DEFAULT" ) 
+message("CPACK_RPM_PACKAGE_RELEASE: ${CPACK_RPM_PACKAGE_RELEASE}") set ( CPACK_RPM_POST_INSTALL_SCRIPT_FILE "${CMAKE_CURRENT_SOURCE_DIR}/RPM/rpm_post" ) set ( CPACK_RPM_POST_UNINSTALL_SCRIPT_FILE "${CMAKE_CURRENT_SOURCE_DIR}/RPM/rpm_postun" ) From 0876c253d8b6380a1d6bd1f2b34c5b4c1e8c94db Mon Sep 17 00:00:00 2001 From: Evgeny Date: Thu, 1 Oct 2020 05:58:50 -0400 Subject: [PATCH 16/47] SWDEV-251491 : disabling hipModuleUnload tracing which is called on exit Change-Id: I99c22eec3fea6ac8820d574c44df099febdd27c4 (cherry picked from commit bb8f2f67858d68b13b00696b2798857676201210) --- test/tool/tracer_tool.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/test/tool/tracer_tool.cpp b/test/tool/tracer_tool.cpp index 05855fd0..d6be6f4f 100644 --- a/test/tool/tracer_tool.cpp +++ b/test/tool/tracer_tool.cpp @@ -1092,6 +1092,7 @@ extern "C" PUBLIC_API bool OnLoad(HsaApiTable* table, uint64_t runtime_version, if (trace_hip_api) { hip_api_file_handle = open_output_file(output_prefix, "hip_api_trace.txt"); ROCTRACER_CALL(roctracer_enable_domain_callback(ACTIVITY_DOMAIN_HIP_API, hip_api_callback, NULL)); + ROCTRACER_CALL(roctracer_disable_op_callback(ACTIVITY_DOMAIN_HIP_API, HIP_API_ID_hipModuleUnload)); if (is_stats_opt) { const char* path = NULL; From 18c83ea763439f9c9ab9a39ce296920c4fb99a56 Mon Sep 17 00:00:00 2001 From: Evgeny Date: Wed, 23 Sep 2020 02:52:06 -0400 Subject: [PATCH 17/47] hip library loader check Change-Id: I34957db88932e1ed725a0a0d8ca9a66fecc92e38 (cherry picked from commit 9061c4ea414b78b71fa1dc0d0869b7519b366e73) --- src/core/loader.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/core/loader.h b/src/core/loader.h index 5d6e0d6c..d1f2ef01 100644 --- a/src/core/loader.h +++ b/src/core/loader.h @@ -291,7 +291,6 @@ typedef HipLoaderShared HipLoader; template bool roctracer::BaseLoader::to_check_symb_ = true; \ template<> const char* roctracer::RocpLoader::lib_name_ = "librocprofiler64.so"; \ template<> const char* 
roctracer::HccLoader::lib_name_ = "libamdhip64.so"; \ - template<> bool roctracer::HccLoader::to_check_open_ = false; \ template<> const char* roctracer::KfdLoader::lib_name_ = "libkfdwrapper64.so"; \ template<> const char* roctracer::RocTxLoader::lib_name_ = "libroctx64.so"; \ template<> bool roctracer::RocTxLoader::to_load_ = true; @@ -302,8 +301,7 @@ typedef HipLoaderShared HipLoader; roctracer::HipLoaderStatic::instance_t roctracer::HipLoaderStatic::instance_{}; #else #define LOADER_INSTANTIATE_HIP() \ - template<> const char* roctracer::HipLoaderShared::lib_name_ = "libamdhip64.so"; \ - template<> bool roctracer::HipLoaderShared::to_check_open_ = false; + template<> const char* roctracer::HipLoaderShared::lib_name_ = "libamdhip64.so"; #endif #if HIP_VDI From d8d7137e185bc5da67514b3c88e607418aa582bf Mon Sep 17 00:00:00 2001 From: Evgeny Date: Thu, 3 Sep 2020 04:01:28 -0500 Subject: [PATCH 18/47] SWDEV-213367 : codeobj event implementation Change-Id: Ibcaca6869ce96d8802c5fa8ba241f43834d6f2a7 update - codeobj event implementation Change-Id: I4c12f26a19f2b31d9ac2211c3426a0e587a332b3 update2 - codeobj event implementation Change-Id: Ic877549a83542ae00352503471d881e847ebac9c test - codeobj event implementation Change-Id: I0618d3a93de94c3d7467372ba4a3d4ea5520bfc7 URI reference test - codeobj event implementation Change-Id: I6cf7e8a648cf012cb0708058b118a75e58f992b9 adding test/app - codeobj event implementation Change-Id: Idf4c197c7b9116ccde5ec50ff47a26a858bfab32 uri test fix - codeobj event implementation Change-Id: I7c385f82f516d9d8f2cd726366f00be3664006e3 uri test cleanup - codeobj event implementation Change-Id: I542d5baf88c048c8b4717af843b803cd93e8f3bc URI buffer fix - codeobj event implementation Change-Id: Iac65e04c03a0939935c10f53c6b580a2e33878f5 HSA events tests trace-check disabled Change-Id: I0f4d13aeeceb1d1a6e2191673eacbf9c7ae2ae52 --- cmake_modules/env.cmake | 7 +- inc/roctracer_hsa.h | 3 +- src/CMakeLists.txt | 9 +- src/core/loader.h | 9 + 
src/core/roctracer.cpp | 20 +- test/CMakeLists.txt | 19 +- test/app/codeobj_test.cpp | 89 ++ test/app/hsaco_test.cpp | 134 ++ .../MatrixTranspose_hip_flush_trace.txt | 1315 +++++++++++++---- test/golden_traces/tests_trace_cmp_levels.txt | 2 + test/run.sh | 7 + 11 files changed, 1297 insertions(+), 317 deletions(-) create mode 100644 test/app/codeobj_test.cpp create mode 100644 test/app/hsaco_test.cpp diff --git a/cmake_modules/env.cmake b/cmake_modules/env.cmake index f7824148..3f5dec60 100644 --- a/cmake_modules/env.cmake +++ b/cmake_modules/env.cmake @@ -156,10 +156,10 @@ get_filename_component ( HSA_RUNTIME_LIB_PATH "${HSA_RUNTIME_LIB}" DIRECTORY ) find_library ( HSA_KMT_LIB "libhsakmt.so" ) get_filename_component ( HSA_KMT_LIB_PATH "${HSA_KMT_LIB}" DIRECTORY ) -get_filename_component ( ROCM_ROOT_DIR "${HSA_KMT_LIB_PATH}" DIRECTORY ) - set ( HSA_KMT_INC_PATH "${HSA_KMT_LIB_PATH}/../include" ) -set ( ROCM_INC_PATH "${HSA_KMT_INC_PATH}" ) + +get_filename_component ( ROCM_ROOT_DIR "${HSA_KMT_LIB_PATH}" DIRECTORY ) +set ( ROCM_INC_PATH "${ROCM_ROOT_DIR}/include" ) ## Basic Tool Chain Information message ( "----------------NBit: ${NBIT}" ) @@ -172,6 +172,7 @@ message ( "-----HSA-Runtime-Inc: ${HSA_RUNTIME_INC_PATH}" ) message ( "-----HSA-Runtime-Lib: ${HSA_RUNTIME_LIB_PATH}" ) message ( "----HSA_KMT_LIB_PATH: ${HSA_KMT_LIB_PATH}" ) message ( "-------ROCM_ROOT_DIR: ${ROCM_ROOT_DIR}" ) +message ( "-------ROCM_INC_PATH: ${ROCM_INC_PATH}" ) message ( "-------------KFD-Inc: ${HSA_KMT_INC_PATH}" ) message ( "-------------HIP-Inc: ${HIP_INC_DIR}" ) message ( "-------------HIP-VDI: ${HIP_VDI}" ) diff --git a/inc/roctracer_hsa.h b/inc/roctracer_hsa.h index b9b0cf98..d9daa5e5 100644 --- a/inc/roctracer_hsa.h +++ b/inc/roctracer_hsa.h @@ -27,6 +27,7 @@ THE SOFTWARE. 
#include #include +#include // HSA OP ID enumeration enum hsa_op_id_t { @@ -34,7 +35,7 @@ enum hsa_op_id_t { HSA_OP_ID_COPY = 1, HSA_OP_ID_BARRIER = 2, HSA_OP_ID_RESERVED1 = 3, - HSA_OP_ID_NUMBER = 4 + HSA_OP_ID_NUMBER }; #ifdef __cplusplus diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index c794c491..e9c72f84 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -11,6 +11,9 @@ execute_process ( COMMAND sh -xc "${CMAKE_C_COMPILER} -E ${HIP_PATH}/include/hip execute_process ( COMMAND sh -xc "${ROOT_DIR}/script/gen_ostream_ops.py -in ${GEN_INC_DIR}/hip_runtime_api_pp.h -out ${GEN_INC_DIR}/hip_ostream_ops.h" ) execute_process ( COMMAND sh -xc "${CMAKE_C_COMPILER} -E ${HSA_RUNTIME_INC_PATH}/hsa.h > ${GEN_INC_DIR}/hsa_pp.h" ) execute_process ( COMMAND sh -xc "${ROOT_DIR}/script/gen_ostream_ops.py -in ${GEN_INC_DIR}/hsa_pp.h -out ${GEN_INC_DIR}/hsa_ostream_ops.h" ) +execute_process ( COMMAND sh -xc "mkdir ${GEN_INC_DIR}/rocprofiler" ) +execute_process ( COMMAND sh -xc "ln -s ${ROOT_DIR}/../rocprofiler/inc/rocprofiler.h ${GEN_INC_DIR}/rocprofiler/rocprofiler.h" ) +execute_process ( COMMAND sh -xc "ln -s ${ROOT_DIR}/../rocprofiler/src/core/activity.h ${GEN_INC_DIR}/rocprofiler/activity.h" ) # Build dynamic Library object set ( TARGET_LIB ${TARGET_NAME} ) @@ -22,14 +25,14 @@ set ( LIB_SRC ${LIB_DIR}/util/hsa_rsrc_factory.cpp ) add_library ( ${TARGET_LIB} ${LIBRARY_TYPE} ${LIB_SRC} ) -target_include_directories ( ${TARGET_LIB} PRIVATE ${LIB_DIR} ${ROOT_DIR} ${ROOT_DIR}/inc ${HSA_RUNTIME_INC_PATH} ${HSA_RUNTIME_HSA_INC_PATH} ${HIP_INC_DIR} ${HCC_INC_DIR} ${HSA_KMT_INC_PATH} ${GEN_INC_DIR} ) +target_include_directories ( ${TARGET_LIB} PRIVATE ${LIB_DIR} ${ROOT_DIR} ${ROOT_DIR}/inc ${HSA_RUNTIME_INC_PATH} ${HIP_INC_DIR} ${HSA_KMT_INC_PATH} ${ROCM_INC_PATH} ${GEN_INC_DIR} ) target_link_libraries( ${TARGET_LIB} PRIVATE ${HSA_RUNTIME_LIB} c stdc++ ) # Build KFD/Thunk tracing library set ( KFD_LIB "kfdwrapper64" ) set ( KFD_LIB_SRC ${GEN_SRC_DIR}/kfd_wrapper.cpp) 
add_library ( ${KFD_LIB} SHARED ${KFD_LIB_SRC} ) -target_include_directories ( ${KFD_LIB} PRIVATE ${LIB_DIR} ${ROOT_DIR} ${ROOT_DIR}/inc ${HSA_RUNTIME_INC_PATH} ${HSA_RUNTIME_HSA_INC_PATH} ${HSA_KMT_INC_PATH} ${GEN_INC_DIR} ) +target_include_directories ( ${KFD_LIB} PRIVATE ${LIB_DIR} ${ROOT_DIR} ${ROOT_DIR}/inc ${HSA_RUNTIME_INC_PATH} ${HSA_KMT_INC_PATH} ${GEN_INC_DIR} ) target_link_libraries( ${KFD_LIB} PRIVATE c stdc++ ) # Build ROCTX tracing library @@ -39,5 +42,5 @@ set ( ROCTX_LIB_SRC ${LIB_DIR}/roctx/roctx_intercept.cpp ) add_library ( ${ROCTX_LIB} SHARED ${ROCTX_LIB_SRC} ) -target_include_directories ( ${ROCTX_LIB} PRIVATE ${LIB_DIR} ${ROOT_DIR} ${ROOT_DIR}/inc ${HSA_RUNTIME_INC_PATH} ${HSA_RUNTIME_HSA_INC_PATH} ${GEN_INC_DIR} ) +target_include_directories ( ${ROCTX_LIB} PRIVATE ${LIB_DIR} ${ROOT_DIR} ${ROOT_DIR}/inc ${HSA_RUNTIME_INC_PATH} ${GEN_INC_DIR} ) target_link_libraries( ${ROCTX_LIB} PRIVATE c stdc++ ) diff --git a/src/core/loader.h b/src/core/loader.h index d1f2ef01..946521d1 100644 --- a/src/core/loader.h +++ b/src/core/loader.h @@ -94,6 +94,10 @@ class RocpApi { EnableCallback_t* EnableActivityCallback; NameCallback_t* GetOpName; + RegisterCallback_t* RegisterEvtCallback; + OperateCallback_t* RemoveEvtCallback; + NameCallback_t* GetEvtName; + protected: void init(Loader* loader) { RegisterApiCallback = loader->GetFun("RegisterApiCallback"); @@ -101,6 +105,10 @@ class RocpApi { InitActivityCallback = loader->GetFun("InitActivityCallback"); EnableActivityCallback = loader->GetFun("EnableActivityCallback"); GetOpName = loader->GetFun("GetOpName"); + + RegisterEvtCallback = loader->GetFun("RegisterEvtCallback"); + RemoveEvtCallback = loader->GetFun("RemoveEvtCallback"); + GetEvtName = loader->GetFun("GetEvtName"); } }; @@ -290,6 +298,7 @@ typedef HipLoaderShared HipLoader; template bool roctracer::BaseLoader::to_check_open_ = true; \ template bool roctracer::BaseLoader::to_check_symb_ = true; \ template<> const char* roctracer::RocpLoader::lib_name_ 
= "librocprofiler64.so"; \ + template<> bool roctracer::RocpLoader::to_load_ = true; \ template<> const char* roctracer::HccLoader::lib_name_ = "libamdhip64.so"; \ template<> const char* roctracer::KfdLoader::lib_name_ = "libkfdwrapper64.so"; \ template<> const char* roctracer::RocTxLoader::lib_name_ = "libroctx64.so"; \ diff --git a/src/core/roctracer.cpp b/src/core/roctracer.cpp index a50f2d3d..21203f91 100644 --- a/src/core/roctracer.cpp +++ b/src/core/roctracer.cpp @@ -685,6 +685,8 @@ PUBLIC_API const char* roctracer_op_string( switch (domain) { case ACTIVITY_DOMAIN_HSA_API: return roctracer::hsa_support::GetApiName(op); + case ACTIVITY_DOMAIN_HSA_EVT: + return roctracer::RocpLoader::Instance().GetEvtName(op); case ACTIVITY_DOMAIN_HSA_OPS: return roctracer::RocpLoader::Instance().GetOpName(op); case ACTIVITY_DOMAIN_HCC_OPS: @@ -730,6 +732,7 @@ static inline uint32_t get_op_num(const uint32_t& domain) { switch (domain) { case ACTIVITY_DOMAIN_HSA_OPS: return HSA_OP_ID_NUMBER; case ACTIVITY_DOMAIN_HSA_API: return HSA_API_ID_NUMBER; + case ACTIVITY_DOMAIN_HSA_EVT: return HSA_EVT_ID_NUMBER; case ACTIVITY_DOMAIN_HCC_OPS: return HIP_OP_ID_NUMBER; case ACTIVITY_DOMAIN_HIP_API: return HIP_API_ID_NUMBER; case ACTIVITY_DOMAIN_KFD_API: return KFD_API_ID_NUMBER; @@ -759,13 +762,18 @@ static roctracer_status_t roctracer_enable_callback_fun( #if 0 if (op == HSA_API_ID_DISPATCH) { const bool succ = roctracer::RocpLoader::Instance().RegisterApiCallback(op, (void*)callback, user_data); - if (succ == false) HCC_EXC_RAISING(ROCTRACER_STATUS_HSA_ERR, "HSA::EnableActivityCallback error(" << op << ") failed"); + if (succ == false) HCC_EXC_RAISING(ROCTRACER_STATUS_HSA_ERR, "HSA::RegisterApiCallback error(" << op << ") failed"); break; } #endif roctracer::hsa_support::cb_table.set(op, callback, user_data); break; } + case ACTIVITY_DOMAIN_HSA_EVT: { + const bool succ = roctracer::RocpLoader::Instance().RegisterEvtCallback(op, (void*)callback, user_data); + if (succ == false) 
HCC_EXC_RAISING(ROCTRACER_STATUS_HSA_ERR, "HSA::RegisterEvtCallback error(" << op << ") failed"); + break; + } case ACTIVITY_DOMAIN_HCC_OPS: break; case ACTIVITY_DOMAIN_HIP_API: { if (roctracer::HipLoader::Instance().Enabled() == false) break; @@ -874,6 +882,11 @@ static roctracer_status_t roctracer_disable_callback_fun( } break; } + case ACTIVITY_DOMAIN_HSA_EVT: { + const bool succ = roctracer::RocpLoader::Instance().RemoveEvtCallback(op); + if (succ == false) HCC_EXC_RAISING(ROCTRACER_STATUS_HSA_ERR, "HSA::RemoveEvtCallback error(" << op << ") failed"); + break; + } case ACTIVITY_DOMAIN_ROCTX: { if (roctracer::RocTxLoader::Instance().Enabled()) { const bool suc = roctracer::RocTxLoader::Instance().RemoveApiCallback(op); @@ -983,6 +996,7 @@ static roctracer_status_t roctracer_enable_activity_fun( break; } case ACTIVITY_DOMAIN_HSA_API: break; + case ACTIVITY_DOMAIN_HSA_EVT: break; case ACTIVITY_DOMAIN_KFD_API: break; case ACTIVITY_DOMAIN_HCC_OPS: { const bool init_phase = (roctracer::HccLoader::GetRef() == NULL); @@ -1079,6 +1093,7 @@ static roctracer_status_t roctracer_disable_activity_fun( break; } case ACTIVITY_DOMAIN_HSA_API: break; + case ACTIVITY_DOMAIN_HSA_EVT: break; case ACTIVITY_DOMAIN_KFD_API: break; case ACTIVITY_DOMAIN_HCC_OPS: { if (roctracer::HccLoader::Instance().Enabled() == false) break; @@ -1249,6 +1264,9 @@ PUBLIC_API roctracer_status_t roctracer_set_properties( roctracer::kfd_support::intercept_KFDApiTable(); break; } + case ACTIVITY_DOMAIN_HSA_EVT: { + break; + } case ACTIVITY_DOMAIN_HSA_API: { // HSA API properties HsaApiTable* table = reinterpret_cast(properties); diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index ce003b7c..148c60b0 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -50,7 +50,8 @@ set ( HSA_REV "19b1191" ) set ( RUN_SCRIPT "${TEST_DIR}/run.sh" ) ## build HIP tests -set ( INC_PATH "${INC_PATH} ${PROJECT_BINARY_DIR}/inc" ) +set ( GEN_INC_DIR ${PROJECT_BINARY_DIR}/inc ) +set ( INC_PATH "${INC_PATH} 
${GEN_INC_DIR}" ) set ( TEST_ENV HIP_VDI=${HIP_VDI} ROCM_PATH=${ROCM_ROOT_DIR} HSA_PATH=${ROCM_ROOT_DIR}/hsa INC_PATH=${INC_PATH} LIB_PATH=${LIB_PATH} HIPCC_VERBOSE=3 ) add_custom_target( mytest COMMAND ${TEST_ENV} make -C "${TEST_DIR}/MatrixTranspose" @@ -76,10 +77,24 @@ if ( DEFINED ROCTRACER_TARGET ) set ( TEST_LIB "tracer_tool" ) set ( TEST_LIB_SRC ${TEST_DIR}/tool/tracer_tool.cpp ${UTIL_SRC} ) add_library ( ${TEST_LIB} SHARED ${TEST_LIB_SRC} ) - target_include_directories ( ${TEST_LIB} PRIVATE ${HSA_TEST_DIR} ${ROOT_DIR} ${ROOT_DIR}/inc ${HSA_RUNTIME_INC_PATH} ${HSA_RUNTIME_HSA_INC_PATH} ${HIP_INC_DIR} ${HCC_INC_DIR} ${HSA_KMT_INC_PATH} ${PROJECT_BINARY_DIR}/inc ) + target_include_directories ( ${TEST_LIB} PRIVATE ${HSA_TEST_DIR} ${ROOT_DIR} ${ROOT_DIR}/inc ${HSA_RUNTIME_INC_PATH} ${HIP_INC_DIR} ${HSA_KMT_INC_PATH} ${GEN_INC_DIR} ) target_link_libraries ( ${TEST_LIB} ${ROCTRACER_TARGET} ${HSA_RUNTIME_LIB} c stdc++ dl pthread rt ) endif () +## Build hsaco_test.cpp reference test +set ( CO_LIB_NAME "hsaco_test" ) +set ( CO_LIB_SRC ${TEST_DIR}/app/hsaco_test.cpp ) +add_library ( ${CO_LIB_NAME} SHARED ${CO_LIB_SRC} ) +target_include_directories ( ${CO_LIB_NAME} PRIVATE ${HSA_RUNTIME_INC_PATH} ) +target_link_libraries ( ${CO_LIB_NAME} ${HSA_RUNTIME_LIB} c stdc++ ) + +## Build codeobj event test +set ( CO_LIB_NAME "codeobj_test" ) +set ( CO_LIB_SRC ${TEST_DIR}/app/codeobj_test.cpp ) +add_library ( ${CO_LIB_NAME} SHARED ${CO_LIB_SRC} ) +target_include_directories ( ${CO_LIB_NAME} PRIVATE ${TEST_DIR} ${ROOT_DIR} ${ROOT_DIR}/inc ${GEN_INC_DIR} ${HSA_RUNTIME_INC_PATH} ${ROCM_INC_PATH} ) +target_link_libraries ( ${CO_LIB_NAME} ${ROCTRACER_TARGET} c stdc++ ) + ## Build HSA test execute_process ( COMMAND sh -xc "if [ !
-e ${TEST_DIR}/hsa ] ; then git clone https://github.com/ROCmSoftwarePlatform/hsa-class.git ${TEST_DIR}/hsa; fi" ) execute_process ( COMMAND sh -xc "if [ -e ${TEST_DIR}/hsa ] ; then cd ${TEST_DIR}/hsa && git fetch origin && git checkout ${HSA_REV}; fi" ) diff --git a/test/app/codeobj_test.cpp b/test/app/codeobj_test.cpp new file mode 100644 index 00000000..086bcfb6 --- /dev/null +++ b/test/app/codeobj_test.cpp @@ -0,0 +1,89 @@ +/****************************************************************************** +Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*******************************************************************************/ + +#include +#include +#include + +#include "inc/roctracer.h" +#include "inc/roctracer_hsa.h" +#include + +#define PUBLIC_API __attribute__((visibility("default"))) +#define CONSTRUCTOR_API __attribute__((constructor)) +#define DESTRUCTOR_API __attribute__((destructor)) + +// Check returned roctracer API status: on failure print roctracer_error_string() to stderr and abort +void check_status(roctracer_status_t status) { + if (status != ROCTRACER_STATUS_SUCCESS) { + const char* error_string = roctracer_error_string(); + fprintf(stderr, "ERROR: %s\n", error_string); + abort(); + } +} + +// codeobj callback: prints domain, cid and the code-object load_delta, load_size, uri_length and uri carried by the event data, then flushes stdout +void codeobj_callback(uint32_t domain, uint32_t cid, const void* data, void* arg) { + const hsa_evt_data_t* evt_data = reinterpret_cast(data); + const uint32_t uri_length = evt_data->codeobj.uri_length; + const char* uri = evt_data->codeobj.uri; + printf("codeobj_callback domain(%u) cid(%u): load_delta(0x%lx) load_size(0x%lx) uri_length(%u) uri(\"%s\")\n", + domain, + cid, + evt_data->codeobj.load_delta, + evt_data->codeobj.load_size, + uri_length, + uri); + fflush(stdout); +} + +void initialize() { /* register codeobj_callback for HSA_EVT_ID_CODEOBJ in the ACTIVITY_DOMAIN_HSA_EVT domain */ + roctracer_status_t status = roctracer_enable_op_callback(ACTIVITY_DOMAIN_HSA_EVT, HSA_EVT_ID_CODEOBJ, codeobj_callback, NULL); + check_status(status); +} + +void cleanup() { /* disable the callbacks of the ACTIVITY_DOMAIN_HSA_EVT domain */ + roctracer_status_t status = roctracer_disable_domain_callback(ACTIVITY_DOMAIN_HSA_EVT); + check_status(status); +} + +// Tool constructor: on-load entry point, receives the rocprofiler settings structure +extern "C" PUBLIC_API void OnLoadToolProp(rocprofiler_settings_t* settings) { + // Enable HSA events intercepting + settings->hsa_intercepting = 1; + // Initialize tracing: register the codeobj event callback + initialize(); +} + +// Tool destructor +extern "C" PUBLIC_API void OnUnloadTool() { + // Final resources cleanup + cleanup(); +} + +extern "C" CONSTRUCTOR_API void constructor() { + printf("constructor\n"); fflush(stdout); +} + +extern "C" DESTRUCTOR_API void destructor() { + OnUnloadTool(); /* NOTE(review): cleanup() runs a second time if the host already called OnUnloadTool; confirm a repeated disable is harmless */ +} diff --git a/test/app/hsaco_test.cpp b/test/app/hsaco_test.cpp new file mode
100644 index 00000000..0f2e42ad --- /dev/null +++ b/test/app/hsaco_test.cpp @@ -0,0 +1,134 @@ +/****************************************************************************** +Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*******************************************************************************/ + +#include +#include +#include +#include +#include + +#define PUBLIC_API __attribute__((visibility("default"))) +#define CONSTRUCTOR_API __attribute__((constructor)) +#define DESTRUCTOR_API __attribute__((destructor)) + +#define HSA_RT(call) \ + do { \ + const hsa_status_t status = call; \ + if (status != HSA_STATUS_SUCCESS) { \ + printf("error \"%s\"\n", #call); fflush(stdout); \ + abort(); \ + } \ + } while(0) + +// HSA API intercepting primitives +decltype(hsa_executable_freeze)* hsa_executable_freeze_fn; +hsa_ven_amd_loader_1_01_pfn_t loader_api_table{}; + +hsa_status_t code_object_callback( + hsa_executable_t executable, + hsa_loaded_code_object_t loaded_code_object, + void* arg) +{ + printf("code_object_callback\n"); fflush(stdout); + + uint64_t load_size = 0; + uint64_t load_delta = 0; + uint32_t uri_len = 0; + char* uri_str = NULL; + + HSA_RT(loader_api_table.hsa_ven_amd_loader_loaded_code_object_get_info( + loaded_code_object, + HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_LOAD_SIZE, + &load_size)); + HSA_RT(loader_api_table.hsa_ven_amd_loader_loaded_code_object_get_info( + loaded_code_object, + HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_LOAD_DELTA, + &load_delta)); + HSA_RT(loader_api_table.hsa_ven_amd_loader_loaded_code_object_get_info( + loaded_code_object, + HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_URI_LENGTH, + &uri_len)); + + uri_str = (char*)calloc(uri_len + 1, sizeof(char)); + if (!uri_str) { + perror("calloc"); + abort(); + } + + HSA_RT(loader_api_table.hsa_ven_amd_loader_loaded_code_object_get_info( + loaded_code_object, + HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_URI, + uri_str)); + + printf("load_size(0x%lx)\n", load_size); fflush(stdout); + printf("load_delta(0x%lx)\n", load_delta); fflush(stdout); + printf("uri_len(%u)\n", uri_len); fflush(stdout); + printf("uri_str(\"%s\")\n", uri_str); fflush(stdout); + + return HSA_STATUS_SUCCESS; +} + +hsa_status_t 
hsa_executable_freeze_interceptor( + hsa_executable_t executable, + const char *options) +{ + HSA_RT(loader_api_table.hsa_ven_amd_loader_executable_iterate_loaded_code_objects( /* run code_object_callback on every loaded code object before forwarding the freeze */ + executable, + code_object_callback, + NULL)); + HSA_RT(hsa_executable_freeze_fn( /* forward to the original hsa_executable_freeze saved in OnLoad */ + executable, + options)); + return HSA_STATUS_SUCCESS; /* reached only on success: HSA_RT aborts on any failed call */ +} + +// HSA-runtime tool on-load method: hooks hsa_executable_freeze in the API table and fetches the AMD loader extension table +extern "C" PUBLIC_API bool OnLoad(HsaApiTable* table, + uint64_t runtime_version, + uint64_t failed_tool_count, + const char* const* failed_tool_names) +{ + printf("OnLoad: begin\n"); fflush(stdout); + // intercepting hsa_executable_freeze API: save the original entry, then install the interceptor + hsa_executable_freeze_fn = table->core_->hsa_executable_freeze_fn; + table->core_->hsa_executable_freeze_fn = hsa_executable_freeze_interceptor; + // Fetching AMD Loader HSA extension API (major version 1) into loader_api_table + HSA_RT(hsa_system_get_major_extension_table( + HSA_EXTENSION_AMD_LOADER, + 1, + sizeof(hsa_ven_amd_loader_1_01_pfn_t), + &loader_api_table)); + printf("OnLoad: end\n"); fflush(stdout); + return true; +} + +extern "C" PUBLIC_API void OnUnload() { + printf("OnUnload\n"); fflush(stdout); +} + +extern "C" CONSTRUCTOR_API void constructor() { + printf("constructor\n"); fflush(stdout); +} + +extern "C" DESTRUCTOR_API void destructor() { + printf("destructor\n"); fflush(stdout); +} diff --git a/test/golden_traces/MatrixTranspose_hip_flush_trace.txt b/test/golden_traces/MatrixTranspose_hip_flush_trace.txt index 27ef8e95..6f0c4d17 100644 --- a/test/golden_traces/MatrixTranspose_hip_flush_trace.txt +++ b/test/golden_traces/MatrixTranspose_hip_flush_trace.txt @@ -1,25 +1,58 @@ -+ ROCP_FLUSH_RATE=100000 ./test/MatrixTranspose -ROCTracer (pid=1991): +ROCTracer (pid=14696): ROCTracer: trace control flush rate(100000us) -3802701299772587 +129855595266140 HIP-trace() -Device name Device 687f +Device name Device 738c ## Iteration (99) ################# -3802701304199730:3802701304207180 1991:1991 hipGetDeviceProperties(props=, device=0) -3802701305255618:3802701305368889 1991:1991
hipMalloc(ptr=0x7fce16e0dec3, size=4194304) -3802701305370969:3802701305429809 1991:1991 hipMalloc(ptr=0x7fffc1295178, size=4194304) +129855603476896:129855603483734 14696:14696 hipGetDeviceProperties(props={}, device=0) :1 +129855604686134:129855605152950 14696:14696 hipMalloc(ptr=0x7fd65ce00000, size=4194304) :2 +129855605160451:129855605528247 14696:14696 hipMalloc(ptr=0x7fd65c800000, size=4194304) :3 PASSED! ## Iteration (98) ################# -3802701580515709:3802701582582904 0:0 CopyHostToDevice:4:1991 -3802701583225872:3802701584425191 0:0 KernelExecution:8:1991 -3802701583217109:3802701586447303 0:0 CopyDeviceToHost:10:1991 -3802701594795564:3802701596533727 0:0 CopyHostToDevice:11:1991 -3802701596646592:3802701597848875 0:0 KernelExecution:15:1991 -3802701596604988:3802701599522360 0:0 CopyDeviceToHost:17:1991 PASSED! ## Iteration (97) ################# PASSED! ## Iteration (96) ################# +129855955913848:129855957428192 0:0 CopyHostToDevice:4:14696 +129855958763342:129855959991823 0:0 KernelExecution:8:14696 +129855958734601:129855961705377 0:0 CopyDeviceToHost:10:14696 +129855971471522:129855972254607 0:0 CopyHostToDevice:11:14696 +129855972381516:129855973633356 0:0 KernelExecution:15:14696 +129855972673800:129855974135421 0:0 CopyDeviceToHost:17:14696 +129855980290261:129855981019714 0:0 CopyHostToDevice:18:14696 +129855981112002:129855982336482 0:0 KernelExecution:22:14696 +129855981076333:129855982783351 0:0 CopyDeviceToHost:24:14696 +129855988849671:129855989612220 0:0 CopyHostToDevice:25:14696 +129855989696159:129855990920319 0:0 KernelExecution:29:14696 +129855989668256:129855991384209 0:0 CopyDeviceToHost:31:14696 +129855605540988:129855957443403 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :4 +129855957456260:129855957456261 14696:14696 MARK(name(before HIP LaunchKernel)) +129855957507034:129855957514510 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :6 
+129855957521000:129855957523014 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :7 +129855957529950:129855958671150 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :8 +129855958701410:129855958701411 14696:14696 MARK(name(after HIP LaunchKernel)) +129855958708321:129855961719221 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :10 +129855971408776:129855972257972 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :11 +129855972261515:129855972261516 14696:14696 MARK(name(before HIP LaunchKernel)) +129855972266736:129855972268234 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :13 +129855972271629:129855972272780 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :14 +129855972276181:129855972282118 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :15 +129855972663504:129855972663505 14696:14696 MARK(name(after HIP LaunchKernel)) +129855972666015:129855974143463 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :17 +129855980222888:129855981023250 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :18 +129855981025473:129855981025474 14696:14696 MARK(name(before HIP LaunchKernel)) +129855981028834:129855981029831 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :20 +129855981032043:129855981032913 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :21 +129855981035237:129855981038997 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, 
args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :22 +129855981041265:129855981041266 14696:14696 MARK(name(after HIP LaunchKernel)) +129855981043695:129855982796928 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :24 +129855988764565:129855989615901 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :25 +129855989618073:129855989618074 14696:14696 MARK(name(before HIP LaunchKernel)) +129855989621096:129855989622129 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :27 +129855989624243:129855989625087 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :28 +129855989627271:129855989630934 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :29 +129855989632959:129855989632960 14696:14696 MARK(name(after HIP LaunchKernel)) +129855989635351:129855991396402 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :31 PASSED! ## Iteration (95) ################# PASSED! @@ -32,30 +65,6 @@ PASSED! ## Iteration (91) ################# PASSED! 
## Iteration (90) ################# -3802701606826614:3802701608688328 0:0 CopyHostToDevice:18:1991 -3802701608781496:3802701609988668 0:0 KernelExecution:22:1991 -3802701608758548:3802701611510159 0:0 CopyDeviceToHost:24:1991 -3802701618702082:3802701620571865 0:0 CopyHostToDevice:25:1991 -3802701620675087:3802701621878110 0:0 KernelExecution:29:1991 -3802701620650876:3802701623502597 0:0 CopyDeviceToHost:31:1991 -3802701630690881:3802701632557164 0:0 CopyHostToDevice:32:1991 -3802701632661061:3802701633864973 0:0 KernelExecution:36:1991 -3802701632637885:3802701635182424 0:0 CopyDeviceToHost:38:1991 -3802701642392578:3802701644307152 0:0 CopyHostToDevice:39:1991 -3802701644410516:3802701645608650 0:0 KernelExecution:43:1991 -3802701644387082:3802701647064112 0:0 CopyDeviceToHost:45:1991 -3802701654288485:3802701656163049 0:0 CopyHostToDevice:46:1991 -3802701656267334:3802701657467098 0:0 KernelExecution:50:1991 -3802701656244070:3802701658916870 0:0 CopyDeviceToHost:52:1991 -3802701666450396:3802701668378780 0:0 CopyHostToDevice:53:1991 -3802701668482438:3802701669683832 0:0 KernelExecution:57:1991 -3802701668458481:3802701671148361 0:0 CopyDeviceToHost:59:1991 -3802701678631556:3802701680505490 0:0 CopyHostToDevice:60:1991 -3802701680609945:3802701681806894 0:0 KernelExecution:64:1991 -3802701680586811:3802701683591443 0:0 CopyDeviceToHost:66:1991 -3802701691032768:3802701692918102 0:0 CopyHostToDevice:67:1991 -3802701693021896:3802701694223438 0:0 KernelExecution:71:1991 -3802701692999202:3802701695886464 0:0 CopyDeviceToHost:73:1991 PASSED! ## Iteration (89) ################# PASSED! @@ -68,36 +77,132 @@ PASSED! ## Iteration (85) ################# PASSED! 
## Iteration (84) ################# +129855997366746:129855998130772 0:0 CopyHostToDevice:32:14696 +129855998225065:129855999449385 0:0 KernelExecution:36:14696 +129855998197249:129855999925825 0:0 CopyDeviceToHost:38:14696 +129856005895171:129856006661973 0:0 CopyHostToDevice:39:14696 +129856006745770:129856007968491 0:0 KernelExecution:43:14696 +129856006717709:129856008455141 0:0 CopyDeviceToHost:45:14696 +129856014425283:129856015187951 0:0 CopyHostToDevice:46:14696 +129856015270363:129856016493884 0:0 KernelExecution:50:14696 +129856015242633:129856016989490 0:0 CopyDeviceToHost:52:14696 +129856022971470:129856023730704 0:0 CopyHostToDevice:53:14696 +129856023813883:129856025033244 0:0 KernelExecution:57:14696 +129856023785712:129856025544334 0:0 CopyDeviceToHost:59:14696 +129856031596064:129856032498907 0:0 CopyHostToDevice:60:14696 +129856032586758:129856033809639 0:0 KernelExecution:64:14696 +129856032558443:129856034354036 0:0 CopyDeviceToHost:66:14696 +129856040416553:129856041127473 0:0 CopyHostToDevice:67:14696 +129856041212287:129856042435488 0:0 KernelExecution:71:14696 +129856041184491:129856042941958 0:0 CopyDeviceToHost:73:14696 +129856049061163:129856049826011 0:0 CopyHostToDevice:74:14696 +129856049910719:129856051134400 0:0 KernelExecution:78:14696 +129856049882831:129856051651620 0:0 CopyDeviceToHost:80:14696 +129856057864499:129856058629610 0:0 CopyHostToDevice:81:14696 +129856058712855:129856059935896 0:0 KernelExecution:85:14696 +129856058684894:129856060452569 0:0 CopyDeviceToHost:87:14696 +129856066769721:129856067537899 0:0 CopyHostToDevice:88:14696 +129856067621801:129856068845321 0:0 KernelExecution:92:14696 +129856067594217:129856069423348 0:0 CopyDeviceToHost:94:14696 +129856075784739:129856076568384 0:0 CopyHostToDevice:95:14696 +129856076658166:129856077880567 0:0 KernelExecution:99:14696 +129856076630540:129856078394130 0:0 CopyDeviceToHost:101:14696 +129856084835135:129856085603333 0:0 CopyHostToDevice:102:14696 
+129856085689351:129856086911912 0:0 KernelExecution:106:14696 +129856085661614:129856087438495 0:0 CopyDeviceToHost:108:14696 +129856093911070:129856094682948 0:0 CopyHostToDevice:109:14696 +129856094767987:129856095991348 0:0 KernelExecution:113:14696 +129856094739044:129856096520182 0:0 CopyDeviceToHost:115:14696 +129855997303698:129855998134058 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :32 +129855998136242:129855998136243 14696:14696 MARK(name(before HIP LaunchKernel)) +129855998138933:129855998139817 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :34 +129855998141918:129855998142773 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :35 +129855998144935:129855998149221 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :36 +129855998151431:129855998151432 14696:14696 MARK(name(after HIP LaunchKernel)) +129855998153828:129855999937506 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :38 +129856005829520:129856006665192 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :39 +129856006667396:129856006667397 14696:14696 MARK(name(before HIP LaunchKernel)) +129856006670307:129856006671160 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :41 +129856006673376:129856006674209 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :42 +129856006676323:129856006679651 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :43 +129856006681635:129856006681636 14696:14696 MARK(name(after HIP LaunchKernel)) +129856006683967:129856008469471 14696:14696 
hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :45 +129856014360174:129856015191285 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :46 +129856015193489:129856015193490 14696:14696 MARK(name(before HIP LaunchKernel)) +129856015196342:129856015197217 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :48 +129856015199400:129856015200221 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :49 +129856015202314:129856015205930 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :50 +129856015208058:129856015208059 14696:14696 MARK(name(after HIP LaunchKernel)) +129856015210764:129856017001555 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :52 +129856022908053:129856023733985 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :53 +129856023736320:129856023736321 14696:14696 MARK(name(before HIP LaunchKernel)) +129856023739178:129856023740063 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :55 +129856023742240:129856023743090 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :56 +129856023745309:129856023748845 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :57 +129856023750891:129856023750892 14696:14696 MARK(name(after HIP LaunchKernel)) +129856023753396:129856025556257 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :59 +129856031530409:129856032503170 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :60 +129856032505392:129856032505393 14696:14696 
MARK(name(before HIP LaunchKernel)) +129856032508345:129856032509226 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :62 +129856032511486:129856032512316 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :63 +129856032514599:129856032518036 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :64 +129856032520150:129856032520151 14696:14696 MARK(name(after HIP LaunchKernel)) +129856032522410:129856034373111 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :66 +129856040397979:129856041130687 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :67 +129856041132973:129856041132974 14696:14696 MARK(name(before HIP LaunchKernel)) +129856041136399:129856041137389 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :69 +129856041139653:129856041140500 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :70 +129856041142893:129856041146663 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :71 +129856041148645:129856041148646 14696:14696 MARK(name(after HIP LaunchKernel)) +129856041151128:129856042953843 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :73 +129856048994841:129856049829566 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :74 +129856049831724:129856049831725 14696:14696 MARK(name(before HIP LaunchKernel)) +129856049834527:129856049835413 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :76 +129856049837759:129856049838585 14696:14696 __hipPopCallConfiguration(gridDim={}, 
blockDim={}, sharedMem=0, stream=0) :77 +129856049840796:129856049844487 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :78 +129856049846529:129856049846530 14696:14696 MARK(name(after HIP LaunchKernel)) +129856049848934:129856051663797 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :80 +129856057798518:129856058633464 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :81 +129856058635650:129856058635651 14696:14696 MARK(name(before HIP LaunchKernel)) +129856058638530:129856058639560 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :83 +129856058641994:129856058642826 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :84 +129856058645125:129856058648721 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :85 +129856058650749:129856058650750 14696:14696 MARK(name(after HIP LaunchKernel)) +129856058653478:129856060466863 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :87 +129856066704603:129856067541502 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :88 +129856067543802:129856067543803 14696:14696 MARK(name(before HIP LaunchKernel)) +129856067546791:129856067547681 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :90 +129856067550027:129856067550854 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :91 +129856067553125:129856067556952 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :92 
+129856067559149:129856067559150 14696:14696 MARK(name(after HIP LaunchKernel)) +129856067561903:129856069442958 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :94 +129856075719215:129856076572398 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :95 +129856076574828:129856076574829 14696:14696 MARK(name(before HIP LaunchKernel)) +129856076578071:129856076578997 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :97 +129856076581286:129856076582119 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :98 +129856076584498:129856076588395 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :99 +129856076590554:129856076590555 14696:14696 MARK(name(after HIP LaunchKernel)) +129856076592857:129856078406672 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :101 +129856084768530:129856085607081 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :102 +129856085609437:129856085609438 14696:14696 MARK(name(before HIP LaunchKernel)) +129856085612528:129856085613498 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :104 +129856085615751:129856085616602 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :105 +129856085618831:129856085623039 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :106 +129856085625178:129856085625179 14696:14696 MARK(name(after HIP LaunchKernel)) +129856085627731:129856087451206 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :108 +129856093846767:129856094686797 
14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :109 +129856094689153:129856094689154 14696:14696 MARK(name(before HIP LaunchKernel)) +129856094692497:129856094693485 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :111 +129856094695727:129856094696598 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :112 +129856094698884:129856094702856 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :113 +129856094705178:129856094705179 14696:14696 MARK(name(after HIP LaunchKernel)) +129856094707931:129856096534639 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :115 PASSED! ## Iteration (83) ################# PASSED! ## Iteration (82) ################# PASSED! ## Iteration (81) ################# -3802701703288299:3802701705170783 0:0 CopyHostToDevice:74:1991 -3802701705274243:3802701706486156 0:0 KernelExecution:78:1991 -3802701705250604:3802701707936074 0:0 CopyDeviceToHost:80:1991 -3802701715184407:3802701716946440 0:0 CopyHostToDevice:81:1991 -3802701717062173:3802701718258234 0:0 KernelExecution:85:1991 -3802701717027281:3802701719895352 0:0 CopyDeviceToHost:87:1991 -3802701727144976:3802701729139460 0:0 CopyHostToDevice:88:1991 -3802701729244175:3802701730445125 0:0 KernelExecution:92:1991 -3802701729220511:3802701732165583 0:0 CopyDeviceToHost:94:1991 -3802701739387037:3802701741142680 0:0 CopyHostToDevice:95:1991 -3802701741249310:3802701742453815 0:0 KernelExecution:99:1991 -3802701741225710:3802701744149042 0:0 CopyDeviceToHost:101:1991 -3802701751388465:3802701753137668 0:0 CopyHostToDevice:102:1991 -3802701753243075:3802701754440321 0:0 KernelExecution:106:1991 -3802701753219589:3802701756153951 0:0 CopyDeviceToHost:108:1991 -3802701763443335:3802701765498080 0:0 
CopyHostToDevice:109:1991 -3802701765603802:3802701766820456 0:0 KernelExecution:113:1991 -3802701765580171:3802701768590463 0:0 CopyDeviceToHost:115:1991 -3802701775866137:3802701777758951 0:0 CopyHostToDevice:116:1991 -3802701777862528:3802701779073255 0:0 KernelExecution:120:1991 -3802701777839322:3802701780544442 0:0 CopyDeviceToHost:122:1991 -3802701787979987:3802701790138553 0:0 CopyHostToDevice:123:1991 -3802701790243940:3802701791446371 0:0 KernelExecution:127:1991 -3802701790220103:3802701792896973 0:0 CopyDeviceToHost:129:1991 PASSED! ## Iteration (80) ################# PASSED! @@ -114,35 +219,118 @@ PASSED! ## Iteration (74) ################# PASSED! ## Iteration (73) ################# -3802701800291738:3802701802179392 0:0 CopyHostToDevice:130:1991 -3802701802285163:3802701803481223 0:0 KernelExecution:134:1991 -3802701802261733:3802701804931343 0:0 CopyDeviceToHost:136:1991 -3802701812337128:3802701814252581 0:0 CopyHostToDevice:137:1991 -3802701814356366:3802701815565464 0:0 KernelExecution:141:1991 -3802701814332902:3802701817015292 0:0 CopyDeviceToHost:143:1991 -3802701824392847:3802701826310401 0:0 CopyHostToDevice:144:1991 -3802701826415256:3802701827613539 0:0 KernelExecution:148:1991 -3802701826391761:3802701829071431 0:0 CopyDeviceToHost:150:1991 -3802701836291435:3802701838179779 0:0 CopyHostToDevice:151:1991 -3802701838283081:3802701839480623 0:0 KernelExecution:155:1991 -3802701838259290:3802701840931690 0:0 CopyDeviceToHost:157:1991 -3802701848294054:3802701850186618 0:0 CopyHostToDevice:158:1991 -3802701850293201:3802701851487632 0:0 KernelExecution:162:1991 -3802701850269869:3802701852937908 0:0 CopyDeviceToHost:164:1991 -3802701860182332:3802701862143417 0:0 CopyHostToDevice:165:1991 -3802701862248805:3802701863444865 0:0 KernelExecution:169:1991 -3802701862224967:3802701865141909 0:0 CopyDeviceToHost:171:1991 -3802701872353003:3802701874265587 0:0 CopyHostToDevice:172:1991 -3802701874371291:3802701875572092 0:0 KernelExecution:176:1991 
-3802701874348307:3802701877019147 0:0 CopyDeviceToHost:178:1991 -3802701884267750:3802701886153054 0:0 CopyHostToDevice:179:1991 -3802701886259179:3802701887463536 0:0 KernelExecution:183:1991 -3802701886235615:3802701888914085 0:0 CopyDeviceToHost:185:1991 -3802701896155929:3802701898142244 0:0 CopyHostToDevice:186:1991 -3802701898246687:3802701899454155 0:0 KernelExecution:190:1991 -3802701898223504:3802701901145246 0:0 CopyDeviceToHost:192:1991 +129856103067958:129856103841032 0:0 CopyHostToDevice:116:14696 +129856103927769:129856105150970 0:0 KernelExecution:120:14696 +129856103899316:129856105721054 0:0 CopyDeviceToHost:122:14696 +129856112245852:129856113015798 0:0 CopyHostToDevice:123:14696 +129856113100485:129856114323526 0:0 KernelExecution:127:14696 +129856113072690:129856114900649 0:0 CopyDeviceToHost:129:14696 +129856121600998:129856122374148 0:0 CopyHostToDevice:130:14696 +129856122460856:129856123685017 0:0 KernelExecution:134:14696 +129856122432406:129856124221503 0:0 CopyDeviceToHost:136:14696 +129856130996154:129856131718339 0:0 CopyHostToDevice:137:14696 +129856131803770:129856133026171 0:0 KernelExecution:141:14696 +129856131775718:129856133613724 0:0 CopyDeviceToHost:143:14696 +129856140505813:129856141285491 0:0 CopyHostToDevice:144:14696 +129856141371337:129856142594218 0:0 KernelExecution:148:14696 +129856141343575:129856143188801 0:0 CopyDeviceToHost:150:14696 +129856150234971:129856151016053 0:0 CopyHostToDevice:151:14696 +129856151102892:129856152327053 0:0 KernelExecution:155:14696 +129856151074919:129856152872907 0:0 CopyDeviceToHost:157:14696 +129856159481376:129856160253347 0:0 CopyHostToDevice:158:14696 +129856160343525:129856161566086 0:0 KernelExecution:162:14696 +129856160315355:129856162137295 0:0 CopyDeviceToHost:164:14696 +129856168059715:129856168791250 0:0 CopyHostToDevice:165:14696 +129856168876828:129856170099709 0:0 KernelExecution:169:14696 +129856168849139:129856170629902 0:0 CopyDeviceToHost:171:14696 
+129856176005269:129856176724156 0:0 CopyHostToDevice:172:14696 +129856176811979:129856178033100 0:0 KernelExecution:176:14696 +129856176783784:129856178564862 0:0 CopyDeviceToHost:178:14696 +129856183804454:129856184516916 0:0 CopyHostToDevice:179:14696 +129856184609470:129856185832511 0:0 KernelExecution:183:14696 +129856184581802:129856186368858 0:0 CopyDeviceToHost:185:14696 +129856191541921:129856192254454 0:0 CopyHostToDevice:186:14696 +129856192345329:129856193569809 0:0 KernelExecution:190:14696 +129856192317767:129856194105080 0:0 CopyDeviceToHost:192:14696 +129856103003811:129856103844379 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :116 +129856103846787:129856103846788 14696:14696 MARK(name(before HIP LaunchKernel)) +129856103849922:129856103850838 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :118 +129856103853240:129856103854136 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :119 +129856103856444:129856103860149 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :120 +129856103862386:129856103862387 14696:14696 MARK(name(after HIP LaunchKernel)) +129856103864691:129856105741098 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :122 +129856112200226:129856113019342 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :123 +129856113021598:129856113021599 14696:14696 MARK(name(before HIP LaunchKernel)) +129856113024595:129856113025504 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :125 +129856113027902:129856113028756 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :126 +129856113031010:129856113034968 14696:14696 hipLaunchKernel(function_address=0x4010c0, 
numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :127 +129856113037098:129856113037099 14696:14696 MARK(name(after HIP LaunchKernel)) +129856113039452:129856114918382 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :129 +129856121536590:129856122377686 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :130 +129856122380177:129856122380178 14696:14696 MARK(name(before HIP LaunchKernel)) +129856122383242:129856122384157 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :132 +129856122386562:129856122387438 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :133 +129856122389743:129856122393887 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :134 +129856122395917:129856122395918 14696:14696 MARK(name(after HIP LaunchKernel)) +129856122398705:129856124236553 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :136 +129856130930250:129856131721919 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :137 +129856131724534:129856131724535 14696:14696 MARK(name(before HIP LaunchKernel)) +129856131727544:129856131728453 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :139 +129856131730840:129856131731718 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :140 +129856131734248:129856131738338 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :141 +129856131740508:129856131740509 14696:14696 MARK(name(after HIP LaunchKernel)) +129856131742956:129856133633762 
14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :143 +129856140484642:129856141289559 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :144 +129856141292040:129856141292041 14696:14696 MARK(name(before HIP LaunchKernel)) +129856141295360:129856141296366 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :146 +129856141298705:129856141299584 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :147 +129856141301885:129856141305904 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :148 +129856141308287:129856141308288 14696:14696 MARK(name(after HIP LaunchKernel)) +129856141310745:129856143207185 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :150 +129856150167842:129856151019519 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :151 +129856151021903:129856151021904 14696:14696 MARK(name(before HIP LaunchKernel)) +129856151025430:129856151026339 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :153 +129856151028846:129856151029731 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :154 +129856151032070:129856151036399 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :155 +129856151038525:129856151038526 14696:14696 MARK(name(after HIP LaunchKernel)) +129856151041204:129856152887054 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :157 +129856159416500:129856160257922 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :158 
+129856160260251:129856160260252 14696:14696 MARK(name(before HIP LaunchKernel)) +129856160263327:129856160264253 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :160 +129856160266588:129856160267551 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :161 +129856160269815:129856160273583 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :162 +129856160275639:129856160275640 14696:14696 MARK(name(after HIP LaunchKernel)) +129856160277873:129856162154856 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :164 +129856167989129:129856168794954 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :165 +129856168796817:129856168796818 14696:14696 MARK(name(before HIP LaunchKernel)) +129856168799680:129856168800356 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :167 +129856168802336:129856168803043 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :168 +129856168804923:129856168808196 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :169 +129856168810026:129856168810027 14696:14696 MARK(name(after HIP LaunchKernel)) +129856168811889:129856170642148 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :171 +129856175935119:129856176727698 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :172 +129856176729573:129856176729574 14696:14696 MARK(name(before HIP LaunchKernel)) +129856176732312:129856176733001 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :174 +129856176734764:129856176735517 
14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :175 +129856176737306:129856176740961 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :176 +129856176742551:129856176742552 14696:14696 MARK(name(after HIP LaunchKernel)) +129856176744384:129856178576608 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :178 +129856183733862:129856184521359 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :179 PASSED! ## Iteration (72) ################# +129856184523202:129856184523203 14696:14696 MARK(name(before HIP LaunchKernel)) +129856184526239:129856184526918 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :181 +129856184528695:129856184529339 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :182 +129856184531203:129856184534819 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :183 +129856184536444:129856184536445 14696:14696 MARK(name(after HIP LaunchKernel)) +129856184538159:129856186381152 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :185 +129856191471466:129856192258965 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :186 +129856192260887:129856192260888 14696:14696 MARK(name(before HIP LaunchKernel)) +129856192264565:129856192265231 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :188 +129856192266936:129856192267582 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :189 +129856192269493:129856192272647 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, 
dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :190 +129856192274238:129856192274239 14696:14696 MARK(name(after HIP LaunchKernel)) +129856192276014:129856194117333 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :192 PASSED! ## Iteration (71) ################# PASSED! @@ -157,30 +345,6 @@ PASSED! ## Iteration (66) ################# PASSED! ## Iteration (65) ################# -3802701908363640:3802701910282004 0:0 CopyHostToDevice:193:1991 -3802701910388686:3802701911593636 0:0 KernelExecution:197:1991 -3802701910364944:3802701913041924 0:0 CopyDeviceToHost:199:1991 -3802701920274197:3802701922171761 0:0 CopyHostToDevice:200:1991 -3802701922278125:3802701923475222 0:0 KernelExecution:204:1991 -3802701922254592:3802701924925132 0:0 CopyDeviceToHost:206:1991 -3802701932168496:3802701934142771 0:0 CopyHostToDevice:207:1991 -3802701934246976:3802701935438295 0:0 KernelExecution:211:1991 -3802701934223551:3802701937141613 0:0 CopyDeviceToHost:213:1991 -3802701944352056:3802701946257570 0:0 CopyHostToDevice:214:1991 -3802701946362997:3802701947574317 0:0 KernelExecution:218:1991 -3802701946339571:3802701949023790 0:0 CopyDeviceToHost:220:1991 -3802701956400665:3802701958316110 0:0 CopyHostToDevice:221:1991 -3802701958422590:3802701959641615 0:0 KernelExecution:225:1991 -3802701958399130:3802701961106280 0:0 CopyDeviceToHost:227:1991 -3802701968320724:3802701970208178 0:0 CopyHostToDevice:228:1991 -3802701970318670:3802701971521693 0:0 KernelExecution:232:1991 -3802701970295529:3802701972971609 0:0 CopyDeviceToHost:234:1991 -3802701980199792:3802701982142436 0:0 CopyHostToDevice:235:1991 -3802701982245928:3802701983440062 0:0 KernelExecution:239:1991 -3802701982222487:3802701985143188 0:0 CopyDeviceToHost:241:1991 -3802701992355642:3802701994267646 0:0 CopyHostToDevice:242:1991 -3802701994371730:3802701995578753 0:0 KernelExecution:246:1991 
-3802701994348667:3802701997026937 0:0 CopyDeviceToHost:248:1991 PASSED! ## Iteration (64) ################# PASSED! @@ -193,39 +357,152 @@ PASSED! ## Iteration (60) ################# PASSED! ## Iteration (59) ################# +129856199280943:129856199989681 0:0 CopyHostToDevice:193:14696 +129856200075190:129856201299831 0:0 KernelExecution:197:14696 +129856200047538:129856201850341 0:0 CopyDeviceToHost:199:14696 +129856206897412:129856207614253 0:0 CopyHostToDevice:200:14696 +129856207705498:129856208928859 0:0 KernelExecution:204:14696 +129856207676917:129856209473592 0:0 CopyDeviceToHost:206:14696 +129856214432984:129856215196409 0:0 CopyHostToDevice:207:14696 +129856215281304:129856216504825 0:0 KernelExecution:211:14696 +129856215253529:129856217050195 0:0 CopyDeviceToHost:213:14696 +129856221931666:129856222699124 0:0 CopyHostToDevice:214:14696 +129856222785050:129856224007611 0:0 KernelExecution:218:14696 +129856222756874:129856224558196 0:0 CopyDeviceToHost:220:14696 +129856229435728:129856230202586 0:0 CopyHostToDevice:221:14696 +129856230289822:129856231510942 0:0 KernelExecution:225:14696 +129856230262176:129856232049379 0:0 CopyDeviceToHost:227:14696 +129856236838217:129856237549415 0:0 CopyHostToDevice:228:14696 +129856237635376:129856238857136 0:0 KernelExecution:232:14696 +129856237607782:129856239407224 0:0 CopyDeviceToHost:234:14696 +129856244299394:129856245007567 0:0 CopyHostToDevice:235:14696 +129856245099279:129856246322159 0:0 KernelExecution:239:14696 +129856245071193:129856246864706 0:0 CopyDeviceToHost:241:14696 +129856251723187:129856252431603 0:0 CopyHostToDevice:242:14696 +129856252521404:129856253744124 0:0 KernelExecution:246:14696 +129856252493576:129856254289474 0:0 CopyDeviceToHost:248:14696 +129856259171693:129856259879626 0:0 CopyHostToDevice:249:14696 +129856259964936:129856261188937 0:0 KernelExecution:253:14696 +129856259937195:129856261731637 0:0 CopyDeviceToHost:255:14696 +129856266605795:129856267371070 0:0 
CopyHostToDevice:256:14696 +129856267455912:129856268680233 0:0 KernelExecution:260:14696 +129856267428297:129856269227260 0:0 CopyDeviceToHost:262:14696 +129856274075448:129856274840296 0:0 CopyHostToDevice:263:14696 +129856274927804:129856276150525 0:0 KernelExecution:267:14696 +129856274899679:129856276695018 0:0 CopyDeviceToHost:269:14696 +129856281565009:129856282326831 0:0 CopyHostToDevice:270:14696 +129856282411157:129856283637077 0:0 KernelExecution:274:14696 +129856282383503:129856284175523 0:0 CopyDeviceToHost:276:14696 +129856288995752:129856289705630 0:0 CopyHostToDevice:277:14696 +129856289793308:129856291014269 0:0 KernelExecution:281:14696 +129856289765547:129856291559219 0:0 CopyDeviceToHost:283:14696 +129856296360197:129856297069117 0:0 CopyHostToDevice:284:14696 +129856297157310:129856298378111 0:0 KernelExecution:288:14696 +129856297129589:129856298914568 0:0 CopyDeviceToHost:290:14696 +129856199220209:129856199993256 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :193 +129856199995165:129856199995166 14696:14696 MARK(name(before HIP LaunchKernel)) +129856199998331:129856199999016 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :195 +129856200000971:129856200001630 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :196 +129856200003348:129856200006409 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :197 +129856200007997:129856200007998 14696:14696 MARK(name(after HIP LaunchKernel)) +129856200009781:129856201864796 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :199 +129856206828954:129856207617612 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :200 +129856207619342:129856207619343 14696:14696 MARK(name(before HIP 
LaunchKernel)) +129856207633427:129856207634203 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :202 +129856207635929:129856207636565 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :203 +129856207638289:129856207641619 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :204 +129856207643379:129856207643380 14696:14696 MARK(name(after HIP LaunchKernel)) +129856207645338:129856209486625 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :206 +129856214367871:129856215199634 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :207 +129856215201421:129856215201422 14696:14696 MARK(name(before HIP LaunchKernel)) +129856215205034:129856215205701 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :209 +129856215207421:129856215208068 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :210 +129856215209926:129856215213001 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :211 +129856215214576:129856215214577 14696:14696 MARK(name(after HIP LaunchKernel)) +129856215216591:129856217062762 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :213 +129856221865656:129856222702390 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :214 +129856222704143:129856222704144 14696:14696 MARK(name(before HIP LaunchKernel)) +129856222707593:129856222708263 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :216 +129856222709907:129856222710533 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, 
sharedMem=0, stream=0) :217 +129856222712408:129856222715305 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :218 +129856222716820:129856222716821 14696:14696 MARK(name(after HIP LaunchKernel)) +129856222718703:129856224572291 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :220 +129856229369321:129856230206171 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :221 +129856230207933:129856230207934 14696:14696 MARK(name(before HIP LaunchKernel)) +129856230211408:129856230212070 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :223 +129856230213729:129856230214356 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :224 +129856230216306:129856230219552 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :225 +129856230221084:129856230221085 14696:14696 MARK(name(after HIP LaunchKernel)) +129856230222856:129856232061167 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :227 +129856236820359:129856237552651 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :228 +129856237554349:129856237554350 14696:14696 MARK(name(before HIP LaunchKernel)) +129856237557958:129856237558615 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :230 +129856237560382:129856237561016 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :231 +129856237562876:129856237566063 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :232 
+129856237567608:129856237567609 14696:14696 MARK(name(after HIP LaunchKernel)) +129856237569296:129856239419101 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :234 +129856244174381:129856245010977 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :235 +129856245012718:129856245012719 14696:14696 MARK(name(before HIP LaunchKernel)) +129856245025693:129856245026451 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :237 +129856245028210:129856245028855 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :238 +129856245030730:129856245034177 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :239 +129856245035805:129856245035806 14696:14696 MARK(name(after HIP LaunchKernel)) +129856245038122:129856246876538 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :241 +129856251653109:129856252435896 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :242 +129856252437833:129856252437834 14696:14696 MARK(name(before HIP LaunchKernel)) +129856252441362:129856252442017 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :244 +129856252443660:129856252444296 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :245 +129856252446165:129856252449155 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :246 +129856252450809:129856252450810 14696:14696 MARK(name(after HIP LaunchKernel)) +129856252452579:129856254303055 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :248 +129856259101952:129856259882749 
14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :249 +129856259884515:129856259884516 14696:14696 MARK(name(before HIP LaunchKernel)) +129856259886742:129856259887392 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :251 +129856259889040:129856259889671 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :252 +129856259891415:129856259894919 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :253 +129856259896631:129856259896632 14696:14696 MARK(name(after HIP LaunchKernel)) +129856259898324:129856261743974 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :255 +129856266541050:129856267374498 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :256 +129856267376266:129856267376267 14696:14696 MARK(name(before HIP LaunchKernel)) +129856267379647:129856267380320 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :258 +129856267381929:129856267382540 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :259 +129856267384409:129856267387474 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :260 +129856267389033:129856267389034 14696:14696 MARK(name(after HIP LaunchKernel)) +129856267390764:129856269239563 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :262 +129856274008890:129856274843415 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :263 +129856274845095:129856274845096 14696:14696 MARK(name(before HIP LaunchKernel)) +129856274847806:129856274848470 14696:14696 
__hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :265 +129856274850117:129856274850733 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :266 +129856274852427:129856274855749 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :267 +129856274857358:129856274857359 14696:14696 MARK(name(after HIP LaunchKernel)) +129856274859228:129856276707873 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :269 +129856281498759:129856282330118 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :270 +129856282332044:129856282332045 14696:14696 MARK(name(before HIP LaunchKernel)) +129856282335358:129856282336015 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :272 +129856282338029:129856282338668 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :273 +129856282340644:129856282343485 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :274 +129856282345028:129856282345029 14696:14696 MARK(name(after HIP LaunchKernel)) +129856282347024:129856284203838 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :276 +129856288978096:129856289708673 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :277 +129856289710414:129856289710415 14696:14696 MARK(name(before HIP LaunchKernel)) +129856289714250:129856289714924 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :279 +129856289716689:129856289717305 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :280 +129856289719150:129856289722057 
14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :281 +129856289723677:129856289723678 14696:14696 MARK(name(after HIP LaunchKernel)) +129856289725380:129856291571314 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :283 +129856296341271:129856297072486 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :284 +129856297074313:129856297074314 14696:14696 MARK(name(before HIP LaunchKernel)) +129856297077733:129856297078380 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :286 +129856297080109:129856297080733 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :287 +129856297082729:129856297085646 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :288 +129856297087184:129856297087185 14696:14696 MARK(name(after HIP LaunchKernel)) +129856297089004:129856298926004 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :290 PASSED! ## Iteration (58) ################# PASSED! ## Iteration (57) ################# PASSED! 
## Iteration (56) ################# -3802702004315971:3802702006430907 0:0 CopyHostToDevice:249:1991 -3802702006513343:3802702007718885 0:0 KernelExecution:253:1991 -3802702006490217:3802702009041896 0:0 CopyDeviceToHost:255:1991 -3802702018262184:3802702019943876 0:0 CopyHostToDevice:256:1991 -3802702020050568:3802702021249295 0:0 KernelExecution:260:1991 -3802702020026907:3802702022584386 0:0 CopyDeviceToHost:262:1991 -3802702029541468:3802702031219270 0:0 CopyHostToDevice:263:1991 -3802702031312763:3802702032510305 0:0 KernelExecution:267:1991 -3802702031289161:3802702033843490 0:0 CopyDeviceToHost:269:1991 -3802702040805082:3802702042480244 0:0 CopyHostToDevice:270:1991 -3802702042572785:3802702043776105 0:0 KernelExecution:274:1991 -3802702042549004:3802702045110673 0:0 CopyDeviceToHost:276:1991 -3802702052065204:3802702053741167 0:0 CopyHostToDevice:277:1991 -3802702053835958:3802702055052463 0:0 KernelExecution:281:1991 -3802702053813487:3802702056374447 0:0 CopyDeviceToHost:283:1991 -3802702063333568:3802702065014061 0:0 CopyHostToDevice:284:1991 -3802702065111999:3802702066319615 0:0 KernelExecution:288:1991 -3802702065088771:3802702067654340 0:0 CopyDeviceToHost:290:1991 -3802702074618962:3802702076284625 0:0 CopyHostToDevice:291:1991 -3802702076384443:3802702077569835 0:0 KernelExecution:295:1991 -3802702076360685:3802702078904404 0:0 CopyDeviceToHost:297:1991 -3802702085881125:3802702087555758 0:0 CopyHostToDevice:298:1991 -3802702087649675:3802702088847958 0:0 KernelExecution:302:1991 -3802702087626608:3802702090183277 0:0 CopyDeviceToHost:304:1991 -3802702097151929:3802702098830722 0:0 CopyHostToDevice:305:1991 -3802702098924116:3802702100140473 0:0 KernelExecution:309:1991 -3802702098901192:3802702101472621 0:0 CopyDeviceToHost:311:1991 PASSED! ## Iteration (55) ################# PASSED! @@ -244,35 +521,140 @@ PASSED! ## Iteration (48) ################# PASSED! 
## Iteration (47) ################# -3802702108615424:3802702110296796 0:0 CopyHostToDevice:312:1991 -3802702110392443:3802702111600207 0:0 KernelExecution:316:1991 -3802702110368957:3802702112934696 0:0 CopyDeviceToHost:318:1991 -3802702119898217:3802702121579670 0:0 CopyHostToDevice:319:1991 -3802702121673899:3802702122873960 0:0 KernelExecution:323:1991 -3802702121650880:3802702124193909 0:0 CopyDeviceToHost:325:1991 -3802702131156331:3802702132834494 0:0 CopyHostToDevice:326:1991 -3802702132927702:3802702134121984 0:0 KernelExecution:330:1991 -3802702132904324:3802702135456513 0:0 CopyDeviceToHost:332:1991 -3802702142434925:3802702144099207 0:0 CopyHostToDevice:333:1991 -3802702144200141:3802702145401090 0:0 KernelExecution:337:1991 -3802702144175248:3802702146735777 0:0 CopyDeviceToHost:339:1991 -3802702153706898:3802702155385711 0:0 CopyHostToDevice:340:1991 -3802702155488005:3802702156685843 0:0 KernelExecution:344:1991 -3802702155464581:3802702158018890 0:0 CopyDeviceToHost:346:1991 -3802702164987312:3802702166668385 0:0 CopyHostToDevice:347:1991 -3802702166762069:3802702167965537 0:0 KernelExecution:351:1991 -3802702166739105:3802702169298644 0:0 CopyDeviceToHost:353:1991 -3802702176260016:3802702177933188 0:0 CopyHostToDevice:354:1991 -3802702178026430:3802702179223971 0:0 KernelExecution:358:1991 -3802702178002518:3802702180540757 0:0 CopyDeviceToHost:360:1991 -3802702187490789:3802702189167931 0:0 CopyHostToDevice:361:1991 -3802702189262737:3802702190474501 0:0 KernelExecution:365:1991 -3802702189239082:3802702191808141 0:0 CopyDeviceToHost:367:1991 -3802702198761922:3802702200425845 0:0 CopyHostToDevice:368:1991 PASSED! ## Iteration (46) ################# PASSED! 
## Iteration (45) ################# +129856303845436:129856304622018 0:0 CopyHostToDevice:291:14696 +129856304714456:129856305941176 0:0 KernelExecution:295:14696 +129856304686879:129856306490313 0:0 CopyDeviceToHost:297:14696 +129856311333818:129856312045157 0:0 CopyHostToDevice:298:14696 +129856312128568:129856313351929 0:0 KernelExecution:302:14696 +129856312100713:129856313892452 0:0 CopyDeviceToHost:304:14696 +129856318773490:129856319480599 0:0 CopyHostToDevice:305:14696 +129856319573103:129856320793904 0:0 KernelExecution:309:14696 +129856319544959:129856321343459 0:0 CopyDeviceToHost:311:14696 +129856326211019:129856326977511 0:0 CopyHostToDevice:312:14696 +129856327061875:129856328282996 0:0 KernelExecution:316:14696 +129856327034134:129856328825473 0:0 CopyDeviceToHost:318:14696 +129856333673698:129856334437330 0:0 CopyHostToDevice:319:14696 +129856334523567:129856335745168 0:0 KernelExecution:323:14696 +129856334495713:129856336293262 0:0 CopyDeviceToHost:325:14696 +129856341101442:129856341984561 0:0 CopyHostToDevice:326:14696 +129856342071670:129856343294870 0:0 KernelExecution:330:14696 +129856342043988:129856343840850 0:0 CopyDeviceToHost:332:14696 +129856348646308:129856349354803 0:0 CopyHostToDevice:333:14696 +129856349441279:129856350662399 0:0 KernelExecution:337:14696 +129856349413003:129856351203503 0:0 CopyDeviceToHost:339:14696 +129856356094471:129856356820623 0:0 CopyHostToDevice:340:14696 +129856356907355:129856358130235 0:0 KernelExecution:344:14696 +129856356879789:129856358671945 0:0 CopyDeviceToHost:346:14696 +129856363528023:129856364288036 0:0 CopyHostToDevice:347:14696 +129856364405580:129856365626380 0:0 KernelExecution:351:14696 +129856364377906:129856366172703 0:0 CopyDeviceToHost:353:14696 +129856371087592:129856371798847 0:0 CopyHostToDevice:354:14696 +129856371883929:129856373108889 0:0 KernelExecution:358:14696 +129856371855593:129856373655534 0:0 CopyDeviceToHost:360:14696 +129856378493711:129856379257336 0:0 
CopyHostToDevice:361:14696 +129856379342581:129856380565301 0:0 KernelExecution:365:14696 +129856379314699:129856381113012 0:0 CopyDeviceToHost:367:14696 +129856385977586:129856386744228 0:0 CopyHostToDevice:368:14696 +129856386831442:129856388055123 0:0 KernelExecution:372:14696 +129856386803378:129856388598263 0:0 CopyDeviceToHost:374:14696 +129856393484361:129856394251866 0:0 CopyHostToDevice:375:14696 +129856394339138:129856395561058 0:0 KernelExecution:379:14696 +129856394311639:129856396103600 0:0 CopyDeviceToHost:381:14696 +129856303774990:129856304626161 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :291 +129856304627884:129856304627885 14696:14696 MARK(name(before HIP LaunchKernel)) +129856304631072:129856304631723 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :293 +129856304633373:129856304634007 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :294 +129856304635811:129856304639104 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :295 +129856304640848:129856304640849 14696:14696 MARK(name(after HIP LaunchKernel)) +129856304642651:129856306501959 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :297 +129856311264292:129856312048766 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :298 +129856312050539:129856312050540 14696:14696 MARK(name(before HIP LaunchKernel)) +129856312053498:129856312054174 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :300 +129856312055946:129856312056653 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :301 +129856312058397:129856312061589 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, 
args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :302 +129856312063201:129856312063202 14696:14696 MARK(name(after HIP LaunchKernel)) +129856312065053:129856313904746 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :304 +129856318704110:129856319483869 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :305 +129856319485543:129856319485544 14696:14696 MARK(name(before HIP LaunchKernel)) +129856319499258:129856319500048 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :307 +129856319501759:129856319502401 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :308 +129856319504307:129856319507787 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :309 +129856319509535:129856319509536 14696:14696 MARK(name(after HIP LaunchKernel)) +129856319511552:129856321356021 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :311 +129856326144210:129856326980680 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :312 +129856326982483:129856326982484 14696:14696 MARK(name(before HIP LaunchKernel)) +129856326986163:129856326986815 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :314 +129856326988581:129856326989210 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :315 +129856326991095:129856326994082 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :316 +129856326995650:129856326995651 14696:14696 MARK(name(after HIP LaunchKernel)) +129856326997461:129856328838450 14696:14696 
hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :318 +129856333608209:129856334440902 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :319 +129856334442697:129856334442698 14696:14696 MARK(name(before HIP LaunchKernel)) +129856334446427:129856334447095 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :321 +129856334448793:129856334449426 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :322 +129856334451308:129856334454120 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :323 +129856334455718:129856334455719 14696:14696 MARK(name(after HIP LaunchKernel)) +129856334457508:129856336307654 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :325 +129856341084552:129856341987761 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :326 +129856341989501:129856341989502 14696:14696 MARK(name(before HIP LaunchKernel)) +129856341992961:129856341993616 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :328 +129856341995311:129856341995915 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :329 +129856341997784:129856342000844 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :330 +129856342002457:129856342002458 14696:14696 MARK(name(after HIP LaunchKernel)) +129856342004209:129856343852827 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :332 +129856348628207:129856349358297 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :333 +129856349360014:129856349360015 
14696:14696 MARK(name(before HIP LaunchKernel)) +129856349363641:129856349364301 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :335 +129856349365955:129856349366590 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :336 +129856349368410:129856349371392 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :337 +129856349373001:129856349373002 14696:14696 MARK(name(after HIP LaunchKernel)) +129856349374736:129856351215163 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :339 +129856356026231:129856356823939 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :340 +129856356825939:129856356825940 14696:14696 MARK(name(before HIP LaunchKernel)) +129856356829316:129856356829967 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :342 +129856356831607:129856356832235 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :343 +129856356834103:129856356837300 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :344 +129856356838880:129856356838881 14696:14696 MARK(name(after HIP LaunchKernel)) +129856356840997:129856358683474 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :346 +129856363457621:129856364292098 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :347 +129856364293909:129856364293910 14696:14696 MARK(name(before HIP LaunchKernel)) +129856364296242:129856364296921 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :349 +129856364298665:129856364299325 14696:14696 
__hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :350 +129856364301137:129856364304805 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :351 +129856364306614:129856364306615 14696:14696 MARK(name(after HIP LaunchKernel)) +129856364308432:129856366185192 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :353 +129856371019019:129856371802348 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :354 +129856371804072:129856371804073 14696:14696 MARK(name(before HIP LaunchKernel)) +129856371807407:129856371808089 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :356 +129856371809769:129856371810408 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :357 +129856371812409:129856371815399 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :358 +129856371816938:129856371816939 14696:14696 MARK(name(after HIP LaunchKernel)) +129856371818730:129856373668223 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :360 +129856378427685:129856379260530 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :361 +129856379262413:129856379262414 14696:14696 MARK(name(before HIP LaunchKernel)) +129856379266028:129856379266680 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :363 +129856379268334:129856379268974 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :364 +129856379270951:129856379274011 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) 
kernel=matrixTranspose(float*, float*, int) :365 +129856379275576:129856379275577 14696:14696 MARK(name(after HIP LaunchKernel)) +129856379277516:129856381125442 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :367 +129856385912709:129856386747747 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :368 +129856386749617:129856386749618 14696:14696 MARK(name(before HIP LaunchKernel)) +129856386753015:129856386753700 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :370 +129856386755603:129856386756230 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :371 +129856386758107:129856386761145 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :372 +129856386762828:129856386762829 14696:14696 MARK(name(after HIP LaunchKernel)) +129856386764527:129856388613300 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :374 +129856393418103:129856394255127 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :375 +129856394257084:129856394257085 14696:14696 MARK(name(before HIP LaunchKernel)) +129856394260727:129856394261393 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :377 +129856394263117:129856394263752 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :378 +129856394266100:129856394269007 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :379 +129856394270594:129856394270595 14696:14696 MARK(name(after HIP LaunchKernel)) +129856394272528:129856396115719 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, 
kind=2) :381 PASSED! ## Iteration (44) ################# PASSED! @@ -287,33 +669,6 @@ PASSED! ## Iteration (39) ################# PASSED! ## Iteration (38) ################# -3802702200526879:3802702201738792 0:0 KernelExecution:372:1991 -3802702200502735:3802702203073505 0:0 CopyDeviceToHost:374:1991 -3802702210101096:3802702211781099 0:0 CopyHostToDevice:375:1991 -3802702211874278:3802702213070339 0:0 KernelExecution:379:1991 -3802702211851149:3802702214405528 0:0 CopyDeviceToHost:381:1991 -3802702221371170:3802702223046872 0:0 CopyHostToDevice:382:1991 -3802702223141537:3802702224348264 0:0 KernelExecution:386:1991 -3802702223118273:3802702225680452 0:0 CopyDeviceToHost:388:1991 -3802702232644404:3802702234313936 0:0 CopyHostToDevice:389:1991 -3802702234409358:3802702235607788 0:0 KernelExecution:393:1991 -3802702234385867:3802702236943196 0:0 CopyDeviceToHost:395:1991 -3802702243900787:3802702245580279 0:0 CopyHostToDevice:396:1991 -3802702245674663:3802702246878279 0:0 KernelExecution:400:1991 -3802702245651760:3802702248198969 0:0 CopyDeviceToHost:402:1991 -3802702255168930:3802702256847073 0:0 CopyHostToDevice:403:1991 -3802702256941454:3802702258155589 0:0 KernelExecution:407:1991 -3802702256918733:3802702259489683 0:0 CopyDeviceToHost:409:1991 -3802702266456174:3802702268121957 0:0 CopyHostToDevice:410:1991 -3802702268222984:3802702269419637 0:0 KernelExecution:414:1991 -3802702268198287:3802702270718936 0:0 CopyDeviceToHost:416:1991 -3802702277684438:3802702279355020 0:0 CopyHostToDevice:417:1991 -3802702279449065:3802702280656977 0:0 KernelExecution:421:1991 -3802702279425380:3802702281990519 0:0 CopyDeviceToHost:423:1991 -3802702288963001:3802702290626813 0:0 CopyHostToDevice:424:1991 -3802702290725647:3802702291916077 0:0 KernelExecution:428:1991 -3802702290702274:3802702293249973 0:0 CopyDeviceToHost:430:1991 -3802702300213905:3802702301888607 0:0 CopyHostToDevice:431:1991 PASSED! ## Iteration (37) ################# PASSED! 
@@ -326,38 +681,152 @@ PASSED! ## Iteration (33) ################# PASSED! ## Iteration (32) ################# +129856400949298:129856401690102 0:0 CopyHostToDevice:382:14696 +129856401774737:129856402998097 0:0 KernelExecution:386:14696 +129856401746598:129856403538591 0:0 CopyDeviceToHost:388:14696 +129856408364229:129856409075649 0:0 CopyHostToDevice:389:14696 +129856409181579:129856410405739 0:0 KernelExecution:393:14696 +129856409154049:129856410946890 0:0 CopyDeviceToHost:395:14696 +129856415833858:129856416545026 0:0 CopyHostToDevice:396:14696 +129856416634688:129856417856288 0:0 KernelExecution:400:14696 +129856416607076:129856418397645 0:0 CopyDeviceToHost:402:14696 +129856423255064:129856423962733 0:0 CopyHostToDevice:403:14696 +129856424049344:129856425272224 0:0 KernelExecution:407:14696 +129856424021555:129856425837337 0:0 CopyDeviceToHost:409:14696 +129856430719717:129856431443207 0:0 CopyHostToDevice:410:14696 +129856431530370:129856432753411 0:0 KernelExecution:414:14696 +129856431502760:129856433290891 0:0 CopyDeviceToHost:416:14696 +129856438127461:129856438893077 0:0 CopyHostToDevice:417:14696 +129856438981153:129856440204834 0:0 KernelExecution:421:14696 +129856438953062:129856440755527 0:0 CopyDeviceToHost:423:14696 +129856445658301:129856446425541 0:0 CopyHostToDevice:424:14696 +129856446512512:129856447734433 0:0 KernelExecution:428:14696 +129856446484748:129856448303143 0:0 CopyDeviceToHost:430:14696 +129856453131279:129856453895371 0:0 CopyHostToDevice:431:14696 +129856453982502:129856455205222 0:0 KernelExecution:435:14696 +129856453954390:129856455747092 0:0 CopyDeviceToHost:437:14696 +129856460549446:129856461267384 0:0 CopyHostToDevice:438:14696 +129856461354488:129856462578648 0:0 KernelExecution:442:14696 +129856461327009:129856463119514 0:0 CopyDeviceToHost:444:14696 +129856467954463:129856468665082 0:0 CopyHostToDevice:445:14696 +129856468756966:129856469978566 0:0 KernelExecution:449:14696 +129856468728958:129856470519550 0:0 
CopyDeviceToHost:451:14696 +129856475396016:129856476106990 0:0 CopyHostToDevice:452:14696 +129856476191506:129856477415026 0:0 KernelExecution:456:14696 +129856476164143:129856477979522 0:0 CopyDeviceToHost:458:14696 +129856482841902:129856483550322 0:0 CopyHostToDevice:459:14696 +129856483636804:129856484858245 0:0 KernelExecution:463:14696 +129856483608842:129856485404598 0:0 CopyDeviceToHost:465:14696 +129856490264533:129856491036044 0:0 CopyHostToDevice:466:14696 +129856491121979:129856492348219 0:0 KernelExecution:470:14696 +129856491094217:129856492893929 0:0 CopyDeviceToHost:472:14696 +129856497730065:129856498496809 0:0 CopyHostToDevice:473:14696 +129856498583201:129856499806882 0:0 KernelExecution:477:14696 +129856498555486:129856500349740 0:0 CopyDeviceToHost:479:14696 +129856400931528:129856401693841 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :382 +129856401695697:129856401695698 14696:14696 MARK(name(before HIP LaunchKernel)) +129856401698086:129856401698763 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :384 +129856401700644:129856401701356 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :385 +129856401703387:129856401706670 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :386 +129856401708283:129856401708284 14696:14696 MARK(name(after HIP LaunchKernel)) +129856401710202:129856403550731 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :388 +129856408346178:129856409079144 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :389 +129856409080946:129856409080947 14696:14696 MARK(name(before HIP LaunchKernel)) +129856409119575:129856409120361 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) 
:391 +129856409122350:129856409122982 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :392 +129856409124716:129856409127974 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :393 +129856409129722:129856409129723 14696:14696 MARK(name(after HIP LaunchKernel)) +129856409131595:129856410958682 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :395 +129856415764088:129856416549283 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :396 +129856416551147:129856416551148 14696:14696 MARK(name(before HIP LaunchKernel)) +129856416554753:129856416555457 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :398 +129856416557440:129856416558065 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :399 +129856416560077:129856416563543 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :400 +129856416565220:129856416565221 14696:14696 MARK(name(after HIP LaunchKernel)) +129856416567086:129856418410890 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :402 +129856423185992:129856423965984 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :403 +129856423967686:129856423967687 14696:14696 MARK(name(before HIP LaunchKernel)) +129856423971156:129856423971813 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :405 +129856423973453:129856423974058 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :406 +129856423975959:129856423979023 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, 
dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :407 +129856423980620:129856423980621 14696:14696 MARK(name(after HIP LaunchKernel)) +129856423982481:129856425851437 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :409 +129856430649566:129856431446819 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :410 +129856431448647:129856431448648 14696:14696 MARK(name(before HIP LaunchKernel)) +129856431451980:129856431452627 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :412 +129856431454467:129856431455103 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :413 +129856431457061:129856431460021 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :414 +129856431461633:129856431461634 14696:14696 MARK(name(after HIP LaunchKernel)) +129856431463427:129856433305223 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :416 +129856438060199:129856438896337 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :417 +129856438898056:129856438898057 14696:14696 MARK(name(before HIP LaunchKernel)) +129856438901614:129856438902293 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :419 +129856438903944:129856438904582 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :420 +129856438906471:129856438909460 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :421 +129856438910995:129856438910996 14696:14696 MARK(name(after HIP LaunchKernel)) +129856438913099:129856440770029 14696:14696 
hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :423 +129856445589904:129856446428787 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :424 +129856446430525:129856446430526 14696:14696 MARK(name(before HIP LaunchKernel)) +129856446434097:129856446434755 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :426 +129856446436446:129856446437074 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :427 +129856446438958:129856446442103 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :428 +129856446443705:129856446443706 14696:14696 MARK(name(after HIP LaunchKernel)) +129856446445611:129856448319675 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :430 +129856453113306:129856453898651 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :431 +129856453900443:129856453900444 14696:14696 MARK(name(before HIP LaunchKernel)) +129856453903924:129856453904588 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :433 +129856453906239:129856453906854 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :434 +129856453908740:129856453911874 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :435 +129856453913486:129856453913487 14696:14696 MARK(name(after HIP LaunchKernel)) +129856453915356:129856455761272 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :437 +129856460531599:129856461270590 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :438 +129856461272368:129856461272369 
14696:14696 MARK(name(before HIP LaunchKernel)) +129856461275845:129856461276515 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :440 +129856461278198:129856461278850 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :441 +129856461280791:129856461283899 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :442 +129856461285595:129856461285596 14696:14696 MARK(name(after HIP LaunchKernel)) +129856461287388:129856463133280 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :444 +129856467884995:129856468668564 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :445 +129856468670291:129856468670292 14696:14696 MARK(name(before HIP LaunchKernel)) +129856468673055:129856468673710 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :447 +129856468675408:129856468676048 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :448 +129856468677942:129856468681455 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :449 +129856468683148:129856468683149 14696:14696 MARK(name(after HIP LaunchKernel)) +129856468685101:129856470532724 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :451 +129856475326269:129856476110399 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :452 +129856476112220:129856476112221 14696:14696 MARK(name(before HIP LaunchKernel)) +129856476115691:129856476116355 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :454 +129856476118083:129856476118692 14696:14696 
__hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :455 +129856476120553:129856476123478 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :456 +129856476125144:129856476125145 14696:14696 MARK(name(after HIP LaunchKernel)) +129856476126929:129856477993159 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :458 +129856482771986:129856483553655 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :459 +129856483555435:129856483555436 14696:14696 MARK(name(before HIP LaunchKernel)) +129856483559048:129856483559715 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :461 +129856483561368:129856483561995 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :462 +129856483563875:129856483567045 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :463 +129856483569037:129856483569038 14696:14696 MARK(name(after HIP LaunchKernel)) +129856483570875:129856485418803 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :465 +129856490199703:129856491039451 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :466 +129856491041225:129856491041226 14696:14696 MARK(name(before HIP LaunchKernel)) +129856491044551:129856491045204 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :468 +129856491046844:129856491047481 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :469 +129856491049291:129856491052245 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) 
kernel=matrixTranspose(float*, float*, int) :470 +129856491053805:129856491053806 14696:14696 MARK(name(after HIP LaunchKernel)) +129856491055528:129856492907612 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :472 +129856497665310:129856498500405 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :473 +129856498502066:129856498502067 14696:14696 MARK(name(before HIP LaunchKernel)) +129856498505506:129856498506141 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :475 +129856498507858:129856498508491 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :476 +129856498510523:129856498513554 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :477 +129856498515137:129856498515138 14696:14696 MARK(name(after HIP LaunchKernel)) +129856498517011:129856500365762 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :479 PASSED! ## Iteration (31) ################# PASSED! ## Iteration (30) ################# PASSED! 
## Iteration (29) ################# -3802702301982442:3802702303186799 0:0 KernelExecution:435:1991 -3802702301959178:3802702304503997 0:0 CopyDeviceToHost:437:1991 -3802702311466108:3802702313146390 0:0 CopyHostToDevice:438:1991 -3802702313238825:3802702314439626 0:0 KernelExecution:442:1991 -3802702313215791:3802702315773720 0:0 CopyDeviceToHost:444:1991 -3802702322736361:3802702324399864 0:0 CopyHostToDevice:445:1991 -3802702324503098:3802702325721085 0:0 KernelExecution:449:1991 -3802702324478794:3802702327055594 0:0 CopyDeviceToHost:451:1991 -3802702334030715:3802702335709388 0:0 CopyHostToDevice:452:1991 -3802702335806620:3802702337014532 0:0 KernelExecution:456:1991 -3802702335783968:3802702338348468 0:0 CopyDeviceToHost:458:1991 -3802702345327399:3802702347004601 0:0 CopyHostToDevice:459:1991 -3802702347117082:3802702348318476 0:0 KernelExecution:463:1991 -3802702347074012:3802702349651691 0:0 CopyDeviceToHost:465:1991 -3802702356616483:3802702358289405 0:0 CopyHostToDevice:466:1991 -3802702358382881:3802702359585164 0:0 KernelExecution:470:1991 -3802702358359406:3802702360920335 0:0 CopyDeviceToHost:472:1991 -3802702367904117:3802702369585909 0:0 CopyHostToDevice:473:1991 -3802702369679903:3802702370875371 0:0 KernelExecution:477:1991 -3802702369656630:3802702372196308 0:0 CopyDeviceToHost:479:1991 -3802702379156600:3802702380837792 0:0 CopyHostToDevice:480:1991 -3802702380930326:3802702382134830 0:0 KernelExecution:484:1991 -3802702380907402:3802702383472292 0:0 CopyDeviceToHost:486:1991 -3802702390441713:3802702392106996 0:0 CopyHostToDevice:487:1991 -3802702392207713:3802702393417847 0:0 KernelExecution:491:1991 -3802702392183556:3802702394752325 0:0 CopyDeviceToHost:493:1991 PASSED! ## Iteration (28) ################# PASSED! @@ -376,37 +845,147 @@ PASSED! ## Iteration (21) ################# PASSED! 
## Iteration (20) ################# -3802702401709737:3802702403387670 0:0 CopyHostToDevice:494:1991 -3802702403489293:3802702404695872 0:0 KernelExecution:498:1991 -3802702403465280:3802702406030229 0:0 CopyDeviceToHost:500:1991 -3802702412982171:3802702414646163 0:0 CopyHostToDevice:501:1991 -3802702414739104:3802702415951461 0:0 KernelExecution:505:1991 -3802702414715973:3802702417282642 0:0 CopyDeviceToHost:507:1991 -3802702424250984:3802702425925207 0:0 CopyHostToDevice:508:1991 -3802702426022614:3802702427240009 0:0 KernelExecution:512:1991 -3802702425999277:3802702428556726 0:0 CopyDeviceToHost:514:1991 -3802702435521608:3802702437497583 0:0 CopyHostToDevice:515:1991 -3802702437591756:3802702438798483 0:0 KernelExecution:519:1991 -3802702437567843:3802702440117692 0:0 CopyDeviceToHost:521:1991 -3802702447076184:3802702448752496 0:0 CopyHostToDevice:522:1991 -3802702448844326:3802702450040979 0:0 KernelExecution:526:1991 -3802702448821457:3802702451374905 0:0 CopyDeviceToHost:528:1991 -3802702458338087:3802702460115460 0:0 CopyHostToDevice:529:1991 -3802702460219046:3802702461421625 0:0 KernelExecution:533:1991 -3802702460192041:3802702462758090 0:0 CopyDeviceToHost:535:1991 -3802702469730872:3802702471408304 0:0 CopyHostToDevice:536:1991 -3802702471502923:3802702472699724 0:0 KernelExecution:540:1991 -3802702471478905:3802702474035724 0:0 CopyDeviceToHost:542:1991 -3802702481000815:3802702482659947 0:0 CopyHostToDevice:543:1991 -3802702482757759:3802702483952190 0:0 KernelExecution:547:1991 -3802702482734898:3802702485283566 0:0 CopyDeviceToHost:549:1991 -3802702492244298:3802702493917401 0:0 CopyHostToDevice:550:1991 -3802702494011385:3802702495222705 0:0 KernelExecution:554:1991 -3802702493988441:3802702496538570 0:0 CopyDeviceToHost:556:1991 PASSED! ## Iteration (19) ################# PASSED! 
## Iteration (18) ################# +129856505198157:129856505972108 0:0 CopyHostToDevice:480:14696 +129856506058878:129856507279678 0:0 KernelExecution:484:14696 +129856506031181:129856507818608 0:0 CopyDeviceToHost:486:14696 +129856512668452:129856513378344 0:0 CopyHostToDevice:487:14696 +129856513463906:129856514683906 0:0 KernelExecution:491:14696 +129856513435880:129856515225665 0:0 CopyDeviceToHost:493:14696 +129856520057898:129856520789533 0:0 CopyHostToDevice:494:14696 +129856520877018:129856522100858 0:0 KernelExecution:498:14696 +129856520849406:129856522643928 0:0 CopyDeviceToHost:500:14696 +129856527495540:129856528214422 0:0 CopyHostToDevice:501:14696 +129856528300948:129856529522228 0:0 KernelExecution:505:14696 +129856528273469:129856530060374 0:0 CopyDeviceToHost:507:14696 +129856534970413:129856535678341 0:0 CopyHostToDevice:508:14696 +129856535767312:129856536986193 0:0 KernelExecution:512:14696 +129856535739484:129856537527830 0:0 CopyDeviceToHost:514:14696 +129856542452848:129856543222239 0:0 CopyHostToDevice:515:14696 +129856543308707:129856544531907 0:0 KernelExecution:519:14696 +129856543281047:129856545069937 0:0 CopyDeviceToHost:521:14696 +129856549924160:129856550693828 0:0 CopyHostToDevice:522:14696 +129856550779510:129856552004150 0:0 KernelExecution:526:14696 +129856550751409:129856552552270 0:0 CopyDeviceToHost:528:14696 +129856557413139:129856558179223 0:0 CopyHostToDevice:529:14696 +129856558266309:129856559487269 0:0 KernelExecution:533:14696 +129856558237736:129856560027323 0:0 CopyDeviceToHost:535:14696 +129856564827841:129856565542599 0:0 CopyHostToDevice:536:14696 +129856565630041:129856566854841 0:0 KernelExecution:540:14696 +129856565602389:129856567397324 0:0 CopyDeviceToHost:542:14696 +129856572247710:129856572954375 0:0 CopyHostToDevice:543:14696 +129856573041963:129856574264203 0:0 KernelExecution:547:14696 +129856573013452:129856574809983 0:0 CopyDeviceToHost:549:14696 +129856579656436:129856580368439 0:0 
CopyHostToDevice:550:14696 +129856580456039:129856581680039 0:0 KernelExecution:554:14696 +129856580428344:129856582226693 0:0 CopyDeviceToHost:556:14696 +129856587092681:129856587802199 0:0 CopyHostToDevice:557:14696 +129856587888587:129856589111627 0:0 KernelExecution:561:14696 +129856587861029:129856589654526 0:0 CopyDeviceToHost:563:14696 +129856594498640:129856595270698 0:0 CopyHostToDevice:564:14696 +129856595356053:129856596579733 0:0 KernelExecution:568:14696 +129856595328424:129856597128257 0:0 CopyDeviceToHost:570:14696 +129856601984341:129856602751266 0:0 CopyHostToDevice:571:14696 +129856505180003:129856505975222 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :480 +129856505976980:129856505976981 14696:14696 MARK(name(before HIP LaunchKernel)) +129856505980587:129856505981234 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :482 +129856505982935:129856505983566 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :483 +129856505985434:129856505988514 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :484 +129856505990096:129856505990097 14696:14696 MARK(name(after HIP LaunchKernel)) +129856505991997:129856507832334 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :486 +129856512649603:129856513382084 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :487 +129856513384599:129856513384600 14696:14696 MARK(name(before HIP LaunchKernel)) +129856513388119:129856513389080 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :489 +129856513391435:129856513392275 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :490 +129856513394697:129856513399367 14696:14696 
hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :491 +129856513401523:129856513401524 14696:14696 MARK(name(after HIP LaunchKernel)) +129856513404257:129856515239416 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :493 +129856519992571:129856520793180 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :494 +129856520794974:129856520794975 14696:14696 MARK(name(before HIP LaunchKernel)) +129856520798420:129856520799070 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :496 +129856520800911:129856520801530 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :497 +129856520803611:129856520806841 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :498 +129856520808737:129856520808738 14696:14696 MARK(name(after HIP LaunchKernel)) +129856520810545:129856522657358 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :500 +129856527425346:129856528218117 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :501 +129856528219874:129856528219875 14696:14696 MARK(name(before HIP LaunchKernel)) +129856528221975:129856528222627 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :503 +129856528224439:129856528225291 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :504 +129856528227108:129856528230172 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :505 +129856528231752:129856528231753 14696:14696 MARK(name(after HIP 
LaunchKernel)) +129856528233473:129856530074548 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :507 +129856534899214:129856535681957 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :508 +129856535683676:129856535683677 14696:14696 MARK(name(before HIP LaunchKernel)) +129856535686401:129856535687061 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :510 +129856535688790:129856535689423 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :511 +129856535691153:129856535694294 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :512 +129856535695868:129856535695869 14696:14696 MARK(name(after HIP LaunchKernel)) +129856535697671:129856537541753 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :514 +129856542387175:129856543225418 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :515 +129856543227192:129856543227193 14696:14696 MARK(name(before HIP LaunchKernel)) +129856543230911:129856543231570 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :517 +129856543233243:129856543233871 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :518 +129856543235930:129856543238762 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :519 +129856543240359:129856543240360 14696:14696 MARK(name(after HIP LaunchKernel)) +129856543242179:129856545084137 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :521 +129856549857104:129856550696919 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, 
sizeBytes=4194304, kind=1) :522 +129856550698874:129856550698875 14696:14696 MARK(name(before HIP LaunchKernel)) +129856550702196:129856550702852 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :524 +129856550704612:129856550705254 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :525 +129856550707079:129856550709869 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :526 +129856550711442:129856550711443 14696:14696 MARK(name(after HIP LaunchKernel)) +129856550713182:129856552568840 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :528 +129856557336788:129856558182426 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :529 +129856558184195:129856558184196 14696:14696 MARK(name(before HIP LaunchKernel)) +129856558187727:129856558188380 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :531 +129856558190122:129856558190752 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :532 +129856558192774:129856558195554 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :533 +129856558197324:129856558197325 14696:14696 MARK(name(after HIP LaunchKernel)) +129856558199234:129856560041419 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :535 +129856564809360:129856565545640 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :536 +129856565547393:129856565547394 14696:14696 MARK(name(before HIP LaunchKernel)) +129856565549636:129856565550299 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :538 
+129856565551969:129856565552581 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :539 +129856565554301:129856565557438 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :540 +129856565559047:129856565559048 14696:14696 MARK(name(after HIP LaunchKernel)) +129856565560847:129856567411065 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :542 +129856572215770:129856572957492 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :543 +129856572959234:129856572959235 14696:14696 MARK(name(before HIP LaunchKernel)) +129856572962526:129856572963184 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :545 +129856572964912:129856572965546 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :546 +129856572967421:129856572970453 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :547 +129856572972097:129856572972098 14696:14696 MARK(name(after HIP LaunchKernel)) +129856572974076:129856574823083 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :549 +129856579588261:129856580372449 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :550 +129856580374262:129856580374263 14696:14696 MARK(name(before HIP LaunchKernel)) +129856580376547:129856580377227 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :552 +129856580378975:129856580379619 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :553 +129856580381546:129856580384467 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, 
args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :554 +129856580386225:129856580386226 14696:14696 MARK(name(after HIP LaunchKernel)) +129856580388205:129856582240020 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :556 +129856587022783:129856587805709 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :557 +129856587807440:129856587807441 14696:14696 MARK(name(before HIP LaunchKernel)) +129856587811171:129856587811825 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :559 +129856587813530:129856587814170 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :560 +129856587816040:129856587819243 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :561 +129856587820912:129856587820913 14696:14696 MARK(name(after HIP LaunchKernel)) +129856587822927:129856589666874 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :563 +129856594433516:129856595273993 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :564 +129856595275800:129856595275801 14696:14696 MARK(name(before HIP LaunchKernel)) +129856595278990:129856595279652 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :566 +129856595281384:129856595282018 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :567 +129856595283991:129856595287449 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :568 +129856595289101:129856595289102 14696:14696 MARK(name(after HIP LaunchKernel)) +129856595291045:129856597140491 14696:14696 
hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :570 +129856601919460:129856602754655 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :571 +129856602756445:129856602756446 14696:14696 MARK(name(before HIP LaunchKernel)) +129856602769740:129856602770661 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :573 +129856602772396:129856602773016 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :574 +129856602775079:129856602778192 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :575 +129856602779755:129856602779756 14696:14696 MARK(name(after HIP LaunchKernel)) PASSED! ## Iteration (17) ################# PASSED! @@ -419,33 +998,6 @@ PASSED! ## Iteration (13) ################# PASSED! ## Iteration (12) ################# -3802702503514682:3802702505171794 0:0 CopyHostToDevice:557:1991 -3802702505267652:3802702506468157 0:0 KernelExecution:561:1991 -3802702505243905:3802702507802814 0:0 CopyDeviceToHost:563:1991 -3802702514761386:3802702516425008 0:0 CopyHostToDevice:564:1991 -3802702516524080:3802702517731252 0:0 KernelExecution:568:1991 -3802702516500289:3802702519068477 0:0 CopyDeviceToHost:570:1991 -3802702526022439:3802702527700141 0:0 CopyHostToDevice:571:1991 -3802702527794104:3802702528982164 0:0 KernelExecution:575:1991 -3802702527771042:3802702530315231 0:0 CopyDeviceToHost:577:1991 -3802702537274272:3802702538953635 0:0 CopyHostToDevice:578:1991 -3802702539050334:3802702540254987 0:0 KernelExecution:582:1991 -3802702539025425:3802702541587134 0:0 CopyDeviceToHost:584:1991 -3802702548553016:3802702550225609 0:0 CopyHostToDevice:585:1991 -3802702550319337:3802702551518805 0:0 KernelExecution:589:1991 -3802702550295569:3802702552853758 0:0 CopyDeviceToHost:591:1991 -3802702559816479:3802702561495482 
0:0 CopyHostToDevice:592:1991 -3802702561589855:3802702562788137 0:0 KernelExecution:596:1991 -3802702561565542:3802702564108601 0:0 CopyDeviceToHost:598:1991 -3802702571069693:3802702572746995 0:0 CopyHostToDevice:599:1991 -3802702572840650:3802702574041747 0:0 KernelExecution:603:1991 -3802702572817856:3802702575375565 0:0 CopyDeviceToHost:605:1991 -3802702582343137:3802702584028249 0:0 CopyHostToDevice:606:1991 -3802702584131111:3802702585314874 0:0 KernelExecution:610:1991 -3802702584098390:3802702586648988 0:0 CopyDeviceToHost:612:1991 -3802702593620890:3802702595300582 0:0 CopyHostToDevice:613:1991 -3802702595394737:3802702596603391 0:0 KernelExecution:617:1991 -3802702595371233:3802702597936882 0:0 CopyDeviceToHost:619:1991 PASSED! ## Iteration (11) ################# PASSED! @@ -460,37 +1012,143 @@ PASSED! ## Iteration (6) ################# PASSED! ## Iteration (5) ################# +129856602843500:129856604064780 0:0 KernelExecution:575:14696 +129856602815760:129856604621212 0:0 CopyDeviceToHost:577:14696 +129856609545997:129856610317997 0:0 CopyHostToDevice:578:14696 +129856610406851:129856611631491 0:0 KernelExecution:582:14696 +129856610379025:129856612168754 0:0 CopyDeviceToHost:584:14696 +129856616987475:129856617701731 0:0 CopyHostToDevice:585:14696 +129856617790176:129856619014496 0:0 KernelExecution:589:14696 +129856617761809:129856619559063 0:0 CopyDeviceToHost:591:14696 +129856624349579:129856625289209 0:0 CopyHostToDevice:592:14696 +129856625377836:129856626603916 0:0 KernelExecution:596:14696 +129856625350001:129856627147692 0:0 CopyDeviceToHost:598:14696 +129856632033149:129856632742303 0:0 CopyHostToDevice:599:14696 +129856632836527:129856634057647 0:0 KernelExecution:603:14696 +129856632808948:129856634598487 0:0 CopyDeviceToHost:605:14696 +129856639443412:129856640151030 0:0 CopyHostToDevice:606:14696 +129856640260250:129856641484890 0:0 KernelExecution:610:14696 +129856640232509:129856642041965 0:0 CopyDeviceToHost:612:14696 
+129856646912100:129856647619752 0:0 CopyHostToDevice:613:14696 +129856647705914:129856648930874 0:0 KernelExecution:617:14696 +129856647678197:129856649476287 0:0 CopyDeviceToHost:619:14696 +129856654338593:129856655101879 0:0 CopyHostToDevice:620:14696 +129856655189659:129856656412699 0:0 KernelExecution:624:14696 +129856655161891:129856656960409 0:0 CopyDeviceToHost:626:14696 +129856661822483:129856662586330 0:0 CopyHostToDevice:627:14696 +129856662679432:129856663900712 0:0 KernelExecution:631:14696 +129856662650940:129856664447428 0:0 CopyDeviceToHost:633:14696 +129856669274444:129856670036595 0:0 CopyHostToDevice:634:14696 +129856670129015:129856671350615 0:0 KernelExecution:638:14696 +129856670101388:129856671895354 0:0 CopyDeviceToHost:640:14696 +129856676687339:129856677401038 0:0 CopyHostToDevice:641:14696 +129856677491350:129856678712950 0:0 KernelExecution:645:14696 +129856677463387:129856679258027 0:0 CopyDeviceToHost:647:14696 +129856684088485:129856684823542 0:0 CopyHostToDevice:648:14696 +129856684910895:129856686132975 0:0 KernelExecution:652:14696 +129856684882539:129856686675228 0:0 CopyDeviceToHost:654:14696 +129856691574066:129856692284982 0:0 CopyHostToDevice:655:14696 +129856692371897:129856693594617 0:0 KernelExecution:659:14696 +129856692344278:129856694142257 0:0 CopyDeviceToHost:661:14696 +129856699000899:129856699713058 0:0 CopyHostToDevice:662:14696 +129856699797526:129856701023446 0:0 KernelExecution:666:14696 +129856699769937:129856701569372 0:0 CopyDeviceToHost:668:14696 +129856602781709:129856604636152 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :577 +129856609479851:129856610321075 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :578 +129856610323078:129856610323079 14696:14696 MARK(name(before HIP LaunchKernel)) +129856610326500:129856610327162 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :580 
+129856610328857:129856610329498 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :581 +129856610331492:129856610334664 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :582 +129856610336290:129856610336291 14696:14696 MARK(name(after HIP LaunchKernel)) +129856610338048:129856612222255 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :584 +129856616969217:129856617705105 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :585 +129856617706989:129856617706990 14696:14696 MARK(name(before HIP LaunchKernel)) +129856617710485:129856617711142 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :587 +129856617712846:129856617713491 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :588 +129856617715518:129856617718644 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :589 +129856617720274:129856617720275 14696:14696 MARK(name(after HIP LaunchKernel)) +129856617722118:129856619570993 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :591 +129856624331436:129856625292310 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :592 +129856625294207:129856625294208 14696:14696 MARK(name(before HIP LaunchKernel)) +129856625297113:129856625297761 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :594 +129856625299459:129856625300093 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :595 +129856625301835:129856625305409 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, 
args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :596 +129856625307116:129856625307117 14696:14696 MARK(name(after HIP LaunchKernel)) +129856625309051:129856627159676 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :598 +129856631962417:129856632745795 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :599 +129856632747622:129856632747623 14696:14696 MARK(name(before HIP LaunchKernel)) +129856632761013:129856632761762 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :601 +129856632763565:129856632764219 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :602 +129856632766094:129856632769110 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :603 +129856632770707:129856632770708 14696:14696 MARK(name(after HIP LaunchKernel)) +129856632772662:129856634610068 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :605 +129856639375744:129856640154106 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :606 +129856640155933:129856640155934 14696:14696 MARK(name(before HIP LaunchKernel)) +129856640159565:129856640160216 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :608 +129856640161841:129856640162476 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :609 +129856640164410:129856640167293 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :610 +129856640168886:129856640168887 14696:14696 MARK(name(after HIP LaunchKernel)) +129856640170703:129856642054780 14696:14696 
hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :612 +129856646841774:129856647623131 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :613 +129856647624849:129856647624850 14696:14696 MARK(name(before HIP LaunchKernel)) +129856647628076:129856647628742 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :615 +129856647630426:129856647631050 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :616 +129856647632957:129856647636281 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :617 +129856647637872:129856647637873 14696:14696 MARK(name(after HIP LaunchKernel)) +129856647639599:129856649488719 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :619 +129856654273909:129856655105030 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :620 +129856655106878:129856655106879 14696:14696 MARK(name(before HIP LaunchKernel)) +129856655109847:129856655110497 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :622 +129856655112292:129856655112914 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :623 +129856655114757:129856655118162 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :624 +129856655119835:129856655119836 14696:14696 MARK(name(after HIP LaunchKernel)) +129856655121792:129856656973292 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :626 PASSED! 
## Iteration (4) ################# +129856661755424:129856662589447 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :627 +129856662591236:129856662591237 14696:14696 MARK(name(before HIP LaunchKernel)) +129856662604066:129856662604831 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :629 +129856662606611:129856662607261 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :630 +129856662608995:129856662611988 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :631 +129856662613644:129856662613645 14696:14696 MARK(name(after HIP LaunchKernel)) +129856662615584:129856664462467 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :633 +129856669256336:129856670039683 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :634 +129856670041634:129856670041635 14696:14696 MARK(name(before HIP LaunchKernel)) +129856670054499:129856670055254 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :636 +129856670056982:129856670057615 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :637 +129856670059351:129856670062513 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :638 +129856670064113:129856670064114 14696:14696 MARK(name(after HIP LaunchKernel)) +129856670066200:129856671906923 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :640 +129856676668791:129856677404223 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :641 +129856677406068:129856677406069 14696:14696 MARK(name(before HIP LaunchKernel)) 
+129856677408812:129856677409484 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :643 +129856677411095:129856677411722 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :644 +129856677413461:129856677416941 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :645 +129856677418503:129856677418504 14696:14696 MARK(name(after HIP LaunchKernel)) +129856677420242:129856679269939 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :647 +129856684019418:129856684826552 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :648 +129856684828363:129856684828364 14696:14696 MARK(name(before HIP LaunchKernel)) +129856684832034:129856684832695 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :650 +129856684834368:129856684834970 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :651 +129856684836877:129856684839963 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :652 +129856684841560:129856684841561 14696:14696 MARK(name(after HIP LaunchKernel)) +129856684843320:129856686688518 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :654 +129856691504696:129856692288950 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :655 +129856692290798:129856692290799 14696:14696 MARK(name(before HIP LaunchKernel)) +129856692292859:129856692293513 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :657 +129856692295227:129856692295860 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) 
:658 +129856692297819:129856692300821 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :659 +129856692302355:129856692302356 14696:14696 MARK(name(after HIP LaunchKernel)) +129856692304530:129856694153679 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :661 +129856698928289:129856699716162 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :662 +129856699717890:129856699717891 14696:14696 MARK(name(before HIP LaunchKernel)) +129856699720061:129856699720715 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :664 +129856699722330:129856699722941 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :665 +129856699724836:129856699728198 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :666 +129856699729953:129856699729954 14696:14696 MARK(name(after HIP LaunchKernel)) +129856699731887:129856701581422 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :668 PASSED! 
## Iteration (3) ################# -3802702605023015:3802702606699537 0:0 CopyHostToDevice:620:1991 -3802702606793386:3802702607994484 0:0 KernelExecution:624:1991 -3802702606770418:3802702609331847 0:0 CopyDeviceToHost:626:1991 -3802702616295619:3802702617971351 0:0 CopyHostToDevice:627:1991 -3802702618064582:3802702619276198 0:0 KernelExecution:631:1991 -3802702618041252:3802702620593170 0:0 CopyDeviceToHost:633:1991 -3802702627572022:3802702629249514 0:0 CopyHostToDevice:634:1991 -3802702629343204:3802702630550228 0:0 KernelExecution:638:1991 -3802702629319715:3802702631886524 0:0 CopyDeviceToHost:640:1991 -3802702638854896:3802702640514568 0:0 CopyHostToDevice:641:1991 -3802702640601153:3802702641794250 0:0 KernelExecution:645:1991 -3802702640583338:3802702643131137 0:0 CopyDeviceToHost:647:1991 -3802702650106259:3802702651784942 0:0 CopyHostToDevice:648:1991 -3802702651876671:3802702653079250 0:0 KernelExecution:652:1991 -3802702651853582:3802702654414351 0:0 CopyDeviceToHost:654:1991 -3802702661383522:3802702663061155 0:0 CopyHostToDevice:655:1991 -3802702663154356:3802702664347453 0:0 KernelExecution:659:1991 -3802702663130645:3802702665680984 0:0 CopyDeviceToHost:661:1991 -3802702672630496:3802702674303238 0:0 CopyHostToDevice:662:1991 -3802702674398093:3802702675599190 0:0 KernelExecution:666:1991 -3802702674374489:3802702676932868 0:0 CopyDeviceToHost:668:1991 -3802702683898880:3802702685606503 0:0 CopyHostToDevice:669:1991 -3802702685701165:3802702686898410 0:0 KernelExecution:673:1991 -3802702685678193:3802702688219002 0:0 CopyDeviceToHost:675:1991 -3802702695162453:3802702696838515 0:0 CopyHostToDevice:676:1991 -3802702696932444:3802702698137097 0:0 KernelExecution:680:1991 -3802702696909796:3802702699473165 0:0 CopyDeviceToHost:682:1991 PASSED! ## Iteration (2) ################# PASSED! @@ -498,12 +1156,55 @@ PASSED! PASSED! ## Iteration (0) ################# PASSED! 
-3802702706580728:3802702708245350 0:0 CopyHostToDevice:683:1991 -3802702708346791:3802702709549370 0:0 KernelExecution:687:1991 -3802702708322181:3802702710885410 0:0 CopyDeviceToHost:689:1991 -3802702717849822:3802702719525044 0:0 CopyHostToDevice:690:1991 -3802702719618857:3802702720813139 0:0 KernelExecution:694:1991 -3802702719594825:3802702722149644 0:0 CopyDeviceToHost:696:1991 -3802702729111215:3802702730788167 0:0 CopyHostToDevice:697:1991 -3802702730881622:3802702732076497 0:0 KernelExecution:701:1991 -3802702730858498:3802702733412517 0:0 CopyDeviceToHost:703:1991 +129856706468741:129856707235310 0:0 CopyHostToDevice:669:14696 +129856707327230:129856708548510 0:0 KernelExecution:673:14696 +129856707299810:129856709098218 0:0 CopyDeviceToHost:675:14696 +129856713958124:129856714730788 0:0 CopyHostToDevice:676:14696 +129856714818472:129856716040872 0:0 KernelExecution:680:14696 +129856714790211:129856716592662 0:0 CopyDeviceToHost:682:14696 +129856721429109:129856722193080 0:0 CopyHostToDevice:683:14696 +129856722282194:129856723505714 0:0 KernelExecution:687:14696 +129856722254384:129856724056420 0:0 CopyDeviceToHost:689:14696 +129856728891611:129856729607012 0:0 CopyHostToDevice:690:14696 +129856729693911:129856730917431 0:0 KernelExecution:694:14696 +129856729665766:129856731460761 0:0 CopyDeviceToHost:696:14696 +129856736249266:129856736963101 0:0 CopyHostToDevice:697:14696 +129856737053267:129856738276147 0:0 KernelExecution:701:14696 +129856737025461:129856738822547 0:0 CopyDeviceToHost:703:14696 +129856706409352:129856707238410 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :669 +129856707240341:129856707240342 14696:14696 MARK(name(before HIP LaunchKernel)) +129856707253495:129856707254390 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :671 +129856707256214:129856707256878 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :672 
+129856707258659:129856707261885 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :673 +129856707263518:129856707263519 14696:14696 MARK(name(after HIP LaunchKernel)) +129856707265698:129856709110388 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :675 +129856713891418:129856714734007 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :676 +129856714735794:129856714735795 14696:14696 MARK(name(before HIP LaunchKernel)) +129856714739058:129856714739715 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :678 +129856714741339:129856714741972 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :679 +129856714743986:129856714747316 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :680 +129856714748993:129856714748994 14696:14696 MARK(name(after HIP LaunchKernel)) +129856714750976:129856716607126 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :682 +129856721364192:129856722196489 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :683 +129856722198322:129856722198323 14696:14696 MARK(name(before HIP LaunchKernel)) +129856722202102:129856722202759 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :685 +129856722204452:129856722205080 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :686 +129856722207098:129856722210100 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :687 
+129856722211652:129856722211653 14696:14696 MARK(name(after HIP LaunchKernel)) +129856722213452:129856724068250 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :689 +129856728873958:129856729610520 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :690 +129856729612474:129856729612475 14696:14696 MARK(name(before HIP LaunchKernel)) +129856729615953:129856729616618 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :692 +129856729618275:129856729618880 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :693 +129856729620844:129856729623983 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :694 +129856729625525:129856729625526 14696:14696 MARK(name(after HIP LaunchKernel)) +129856729627363:129856731472859 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :696 +129856736212718:129856736966611 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :697 +129856736968384:129856736968385 14696:14696 MARK(name(before HIP LaunchKernel)) +129856736971498:129856736972186 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :699 +129856736973934:129856736974581 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :700 +129856736976433:129856736979849 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :701 +129856736981559:129856736981560 14696:14696 MARK(name(after HIP LaunchKernel)) +129856736983603:129856738834349 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :703 +129856743571751:129856743607276 
14696:14696 hipFree(ptr=0x7fd65ce00000) :704 +129856743609591:129856743621235 14696:14696 hipFree(ptr=0x7fd65c800000) :705 diff --git a/test/golden_traces/tests_trace_cmp_levels.txt b/test/golden_traces/tests_trace_cmp_levels.txt index 5e6dbaa7..5311d813 100644 --- a/test/golden_traces/tests_trace_cmp_levels.txt +++ b/test/golden_traces/tests_trace_cmp_levels.txt @@ -10,3 +10,5 @@ MatrixTranspose_hip_flush_trace --check-order .* MatrixTranspose_kfd_trace --check-events .* ctrl_hsa_trace --check-event .* ctrl_hsa_input_trace --check-event .* +hsa_co_trace --check-none +code_obj_trace --check-none diff --git a/test/run.sh b/test/run.sh index c5931061..577bdd30 100755 --- a/test/run.sh +++ b/test/run.sh @@ -151,6 +151,13 @@ echo " From c1632440e704aa3f62ede3d7a979ee247beabd1d Mon Sep 17 00:00:00 2001 From: Evgeny Date: Tue, 13 Oct 2020 16:43:18 -0500 Subject: [PATCH 19/47] SWDEV-254329 : extending debug trace with timestamps, cmake option '-DCMAKE_DEBUG_TRACE=1' Change-Id: Id16c01a6c00f6384c37fa9b5a9709a5e98e1fb57 --- src/core/roctracer.cpp | 23 ++++++++++++++--------- test/tool/tracer_tool.cpp | 12 ++++++------ 2 files changed, 20 insertions(+), 15 deletions(-) diff --git a/src/core/roctracer.cpp b/src/core/roctracer.cpp index 21203f91..891b005e 100644 --- a/src/core/roctracer.cpp +++ b/src/core/roctracer.cpp @@ -93,7 +93,6 @@ THE SOFTWARE. 
#define ONLOAD_TRACE_BEG() ONLOAD_TRACE("begin") #define ONLOAD_TRACE_END() ONLOAD_TRACE("end") - static inline uint32_t GetPid() { return syscall(__NR_getpid); } /////////////////////////////////////////////////////////////////////////////////////////////////// @@ -172,6 +171,9 @@ void RestoreHsaApi() { } namespace roctracer { +// timestamp definition +typedef hsa_rt_utils::Timer::timestamp_t timestamp_t; + decltype(hsa_amd_memory_async_copy)* hsa_amd_memory_async_copy_fn; decltype(hsa_amd_memory_async_copy_rect)* hsa_amd_memory_async_copy_rect_fn; @@ -347,6 +349,8 @@ void* HIP_SyncApiDataCallback( const void* callback_data, void* arg) { + static hsa_rt_utils::Timer timer; + void* ret = NULL; const hip_api_data_t* data = reinterpret_cast(callback_data); hip_api_data_t* data_ptr = const_cast(data); @@ -392,8 +396,8 @@ void* HIP_SyncApiDataCallback( } const char * name = roctracer_op_string(ACTIVITY_DOMAIN_HIP_API, op_id, 0); - DEBUG_TRACE("HIP_SyncApiDataCallback(\"%s\") phase(%d): op(%u) record(%p) data(%p) pool(%p) depth(%d) correlation_id(%lu)\n", - name, phase, op_id, record, data, pool, (int)(record_pair_stack.size()), (data_ptr) ? data_ptr->correlation_id : 0); + DEBUG_TRACE("HIP_SyncApiDataCallback(\"%s\") phase(%d): op(%u) record(%p) data(%p) pool(%p) depth(%d) correlation_id(%lu) time_ns(%lu)\n", + name, phase, op_id, record, data, pool, (int)(record_pair_stack.size()), (data_ptr) ? 
data_ptr->correlation_id : 0, timer.timestamp_ns()); return ret; } @@ -405,6 +409,7 @@ void* HIP_SyncActivityCallback( void* arg) { static hsa_rt_utils::Timer timer; + const timestamp_t timestamp_ns = timer.timestamp_ns(); void* ret = NULL; const hip_api_data_t* data = reinterpret_cast(callback_data); @@ -435,7 +440,7 @@ void* HIP_SyncActivityCallback( // Filing record info record->domain = ACTIVITY_DOMAIN_HIP_API; record->op = op_id; - record->begin_ns = timer.timestamp_ns(); + record->begin_ns = timestamp_ns; // Correlation ID generating uint64_t correlation_id = data->correlation_id; @@ -460,7 +465,7 @@ void* HIP_SyncActivityCallback( } // Filing record info - record->end_ns = timer.timestamp_ns(); + record->end_ns = timestamp_ns; record->process_id = syscall(__NR_getpid); record->thread_id = syscall(__NR_gettid); @@ -484,8 +489,8 @@ void* HIP_SyncActivityCallback( } const char * name = roctracer_op_string(ACTIVITY_DOMAIN_HIP_API, op_id, 0); - DEBUG_TRACE("HIP_SyncActivityCallback(\"%s\") phase(%d): op(%u) record(%p) data(%p) pool(%p) depth(%d) correlation_id(%lu)\n", - name, phase, op_id, record, data, pool, (int)(record_pair_stack.size()), (data_ptr) ? data_ptr->correlation_id : 0); + DEBUG_TRACE("HIP_SyncActivityCallback(\"%s\") phase(%d): op(%u) record(%p) data(%p) pool(%p) depth(%d) correlation_id(%lu) beg_ns(%lu) end_ns(%lu)\n", + name, phase, op_id, record, data, pool, (int)(record_pair_stack.size()), (data_ptr) ? 
data_ptr->correlation_id : 0, timestamp_ns); return ret; } @@ -502,8 +507,8 @@ void HCC_AsyncActivityCallback(uint32_t op_id, void* record, void* arg) { pool->Write(*record_ptr); const char * name = roctracer_op_string(ACTIVITY_DOMAIN_HCC_OPS, record_ptr->op, record_ptr->kind); - DEBUG_TRACE("HCC_AsyncActivityCallback(\"%s\"): op(%u) kind(%u) record(%p) pool(%p) correlation_id(%d)\n", - name, record_ptr->op, record_ptr->kind, record, pool, record_ptr->correlation_id); + DEBUG_TRACE("HCC_AsyncActivityCallback(\"%s\"): op(%u) kind(%u) record(%p) pool(%p) correlation_id(%d) beg_ns(%lu) end_ns(%lu)\n", + name, record_ptr->op, record_ptr->kind, record, pool, record_ptr->correlation_id, record_ptr->begin_ns, record_ptr->end_ns); } // Open output file diff --git a/test/tool/tracer_tool.cpp b/test/tool/tracer_tool.cpp index d6be6f4f..1aeb75a3 100644 --- a/test/tool/tracer_tool.cpp +++ b/test/tool/tracer_tool.cpp @@ -454,8 +454,8 @@ void hip_api_callback( } const char * name = roctracer_op_string(domain, cid, 0); - DEBUG_TRACE("hip_api_callback(\"%s\") phase(%d): cid(%u) data(%p) entry(%p) name(\"%s\") correlation_id(%lu)\n", - name, data->phase, cid, data, entry, (entry) ? entry->name : NULL, data->correlation_id); + DEBUG_TRACE("hip_api_callback(\"%s\") phase(%d): cid(%u) data(%p) entry(%p) name(\"%s\") correlation_id(%lu) timestamp(%lu)\n", + name, data->phase, cid, data, entry, (entry) ? 
entry->name : NULL, data->correlation_id, timestamp); } void mark_api_callback( @@ -500,8 +500,8 @@ void hip_api_flush_cb(hip_api_trace_entry_t* entry) { oss << std::dec << rec_ss.str() << " " << str; const char * name = roctracer_op_string(entry->domain, entry->cid, 0); - DEBUG_TRACE("hip_api_flush_cb(\"%s\"): domain(%u) cid(%u) entry(%p) name(\"%s\" correlation_id(%lu))\n", - name, entry->domain, entry->cid, entry, entry->name, correlation_id); + DEBUG_TRACE("hip_api_flush_cb(\"%s\"): domain(%u) cid(%u) entry(%p) name(\"%s\" correlation_id(%lu) beg(%lu) end(%lu))\n", + name, entry->domain, entry->cid, entry, entry->name, correlation_id, begin_timestamp, end_timestamp); if (domain == ACTIVITY_DOMAIN_HIP_API) { #if HIP_PROF_HIP_API_STRING @@ -631,8 +631,8 @@ void pool_activity_callback(const char* begin, const char* end, void* arg) { while (record < end_record) { const char * name = roctracer_op_string(record->domain, record->op, record->kind); - DEBUG_TRACE("pool_activity_callback(\"%s\"): domain(%u) op(%u) kind(%u) record(%p) correlation_id(%lu)\n", - name, record->domain, record->op, record->kind, record, record->correlation_id); + DEBUG_TRACE("pool_activity_callback(\"%s\"): domain(%u) op(%u) kind(%u) record(%p) correlation_id(%lu) beg(%lu) end(%lu)\n", + name, record->domain, record->op, record->kind, record, record->correlation_id, record->begin_ns, record->end_ns); switch(record->domain) { case ACTIVITY_DOMAIN_HCC_OPS: From 2293663e097cfe806412b7cbb8a464b492fdb97a Mon Sep 17 00:00:00 2001 From: Evgeny Date: Thu, 15 Oct 2020 19:25:09 -0400 Subject: [PATCH 20/47] code obj tests: adding load base Change-Id: I5fdb25b67eaae43b3c01cd8de3824f9343c37794 --- test/app/codeobj_test.cpp | 8 ++++---- test/app/hsaco_test.cpp | 8 ++++++++ 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/test/app/codeobj_test.cpp b/test/app/codeobj_test.cpp index 086bcfb6..124715cd 100644 --- a/test/app/codeobj_test.cpp +++ b/test/app/codeobj_test.cpp @@ -44,15 +44,15 @@ void 
check_status(roctracer_status_t status) { // codeobj callback void codeobj_callback(uint32_t domain, uint32_t cid, const void* data, void* arg) { const hsa_evt_data_t* evt_data = reinterpret_cast(data); - const uint32_t uri_length = evt_data->codeobj.uri_length; const char* uri = evt_data->codeobj.uri; - printf("codeobj_callback domain(%u) cid(%u): load_delta(0x%lx) load_size(0x%lx) uri_length(%u) uri(\"%s\")\n", + printf("codeobj_callback domain(%u) cid(%u): load_base(0x%lx) load_size(0x%lx) load_delta(0x%lx) uri(\"%s\")\n", domain, cid, - evt_data->codeobj.load_delta, + evt_data->codeobj.load_base, evt_data->codeobj.load_size, - uri_length, + evt_data->codeobj.load_delta, uri); + free((void*)uri); fflush(stdout); } diff --git a/test/app/hsaco_test.cpp b/test/app/hsaco_test.cpp index 0f2e42ad..23200137 100644 --- a/test/app/hsaco_test.cpp +++ b/test/app/hsaco_test.cpp @@ -50,11 +50,16 @@ hsa_status_t code_object_callback( { printf("code_object_callback\n"); fflush(stdout); + uint64_t load_base = 0; uint64_t load_size = 0; uint64_t load_delta = 0; uint32_t uri_len = 0; char* uri_str = NULL; + HSA_RT(loader_api_table.hsa_ven_amd_loader_loaded_code_object_get_info( + loaded_code_object, + HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_LOAD_BASE, + &load_base)); HSA_RT(loader_api_table.hsa_ven_amd_loader_loaded_code_object_get_info( loaded_code_object, HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_LOAD_SIZE, @@ -79,11 +84,14 @@ hsa_status_t code_object_callback( HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_URI, uri_str)); + printf("load_base(0x%lx)\n", load_base); fflush(stdout); printf("load_size(0x%lx)\n", load_size); fflush(stdout); printf("load_delta(0x%lx)\n", load_delta); fflush(stdout); printf("uri_len(%u)\n", uri_len); fflush(stdout); printf("uri_str(\"%s\")\n", uri_str); fflush(stdout); + free(uri_str); + return HSA_STATUS_SUCCESS; } From 68b1850c9267368719d63ab7a903df830641fd0a Mon Sep 17 00:00:00 2001 From: Rachida Kebichi Date: Wed, 18 Nov 2020 16:47:45 -0500 
Subject: [PATCH 21/47] SWDEV-255938 NEW - added regex and depth support to ostream ops Change-Id: I292255adab3a70fa00a1dd5685b788521687f35b --- inc/roctracer_hip.h | 2 +- inc/roctracer_hsa.h | 17 --- script/gen_ostream_ops.py | 215 ++++++++++++++++---------------------- script/hsaap.py | 5 +- script/kfdap.py | 4 +- src/CMakeLists.txt | 5 +- 6 files changed, 98 insertions(+), 150 deletions(-) diff --git a/inc/roctracer_hip.h b/inc/roctracer_hip.h index 091f3279..553ec1b8 100644 --- a/inc/roctracer_hip.h +++ b/inc/roctracer_hip.h @@ -37,8 +37,8 @@ inline static std::ostream& operator<<(std::ostream& out, const char& v) { } #endif // __cplusplus -#include #include +#include #include #include diff --git a/inc/roctracer_hsa.h b/inc/roctracer_hsa.h index d9daa5e5..1e50c3ab 100644 --- a/inc/roctracer_hsa.h +++ b/inc/roctracer_hsa.h @@ -66,23 +66,6 @@ typedef hsa_support::ops_properties_t hsa_ops_properties_t; #include "hsa_ostream_ops.h" -std::ostream& operator<<(std::ostream& out, const hsa_amd_memory_pool_t& v) -{ - roctracer::hsa_support::operator<<(out, v); - return out; -} - -std::ostream& operator<<(std::ostream& out, const hsa_ext_image_t& v) -{ - roctracer::hsa_support::operator<<(out, v); - return out; -} - -std::ostream& operator<<(std::ostream& out, const hsa_ext_sampler_t& v) -{ - roctracer::hsa_support::operator<<(out, v); - return out; -} #else // !__cplusplus typedef void* hsa_amd_queue_intercept_handler; diff --git a/script/gen_ostream_ops.py b/script/gen_ostream_ops.py index c8f23629..fcc379a6 100755 --- a/script/gen_ostream_ops.py +++ b/script/gen_ostream_ops.py @@ -1,4 +1,4 @@ -#!/usr/bin/python3 +#!/usr/bin/python import os, sys, re import CppHeaderParser @@ -29,11 +29,6 @@ '*/\n' -header = 'template \n' + \ -'struct output_streamer {\n' + \ -' inline static std::ostream& put(std::ostream& out, const T& v) { return out; }\n' + \ -'};\n\n' - header_basic = \ 'template \n' + \ ' inline static std::ostream& operator<<(std::ostream& out, const T& v) {\n' + 
\ @@ -43,8 +38,10 @@ ' return out; }\n' structs_analyzed = {} -global_ops_hip = '' +global_ops = '' global_str = '' +output_filename_h = None +apiname = "" # process_struct traverses recursively all structs to extract all fields def process_struct(file_handle, cppHeader_struct, cppHeader, parent_hier_name, apiname): @@ -61,7 +58,6 @@ def process_struct(file_handle, cppHeader_struct, cppHeader, parent_hier_name, a return if cppHeader_struct in structs_analyzed: return - structs_analyzed[cppHeader_struct] = 1 for l in reversed(range(len(cppHeader.classes[cppHeader_struct]["properties"]["public"]))): key = 'name' @@ -90,17 +86,13 @@ def process_struct(file_handle, cppHeader_struct, cppHeader, parent_hier_name, a str = '' if "union" not in mtype: - if apiname.lower() == 'hip' or apiname.lower() == 'hsa': - str += " roctracer::" + apiname.lower() + "_support::operator<<(out, \"" + name + " = \");\n" - str += " roctracer::" + apiname.lower() + "_support::operator<<(out, v."+name+");\n" - str += " roctracer::" + apiname.lower() + "_support::operator<<(out, \", \");\n" - else: - str += " roctracer::" + apiname.lower() + "_support::output_streamer::put(out, \"" + name + " = \");\n" - if array_size == "": - str += " roctracer::" + apiname.lower() + "_support::output_streamer<" + mtype + ">::put(out, v." + name + ");\n" - else: - str += " roctracer::" + apiname.lower() + "_support::output_streamer<" + mtype + "[" + array_size + "]>::put(out, v." + name + ");\n" - str += " roctracer::" + apiname.lower() + "_support::output_streamer::put(out, \", \");\n" + indent = "" + str += " if (regex_match (\"" + cppHeader_struct + "::" + name + "\", std::regex(" + apiname.upper() + "_structs_regex))) {\n" + indent = " " + str += indent + " roctracer::" + apiname.lower() + "_support::operator<<(out, \"" + name + "=\");\n" + str += indent + " roctracer::" + apiname.lower() + "_support::operator<<(out, v." 
+ name + ");\n" + str += indent + " roctracer::" + apiname.lower() + "_support::operator<<(out, \", \");\n" + str += " }\n" if "void" not in mtype: global_str += str else: @@ -113,133 +105,104 @@ def process_struct(file_handle, cppHeader_struct, cppHeader, parent_hier_name, a process_struct(file_handle, next_cppHeader_struct, cppHeader, name, apiname) # Parses API header file and generates ostream ops files ostream_ops.h -def gen_cppheader(infilepath, outfilepath, structs_depth): +def gen_cppheader(infilepath, outfilepath, rank): # infilepath: API Header file to be parsed # outfilepath: Output file where ostream operators are written - global_ops_hip = '' - global_ops_hsa = '' + global global_ops + global output_filename_h + global apiname global global_str try: cppHeader = CppHeaderParser.CppHeader(infilepath) except CppHeaderParser.CppParseError as e: print(e) sys.exit(1) - mpath = os.path.dirname(outfilepath) - if mpath == "": - mpath = os.getcwd() - apiname = outfilepath.replace(mpath+"/","") - apiname = apiname.replace("_ostream_ops.h","") - apiname = apiname.upper() - f = open(outfilepath,"w+") - f.write("// automatically generated\n") - f.write(LICENSE + '\n') - header_s = \ - '#ifndef INC_' + apiname + '_OSTREAM_OPS_H_\n' + \ - '#define INC_' + apiname + '_OSTREAM_OPS_H_\n' + \ - '#ifdef __cplusplus\n' + \ - '#include \n' + \ - '\n' + \ - '#include "roctracer.h"\n' - if apiname.lower() == 'hip': - header_s = header_s + '\n' + \ - '#include "hip/hip_runtime_api.h"\n' + \ - '#include "hip/hcc_detail/hip_vector_types.h"\n\n' - - f.write(header_s) - f.write('\n') - f.write('namespace roctracer {\n') - f.write('namespace ' + apiname.lower() + '_support {\n') - if structs_depth != -1: - f.write('static int ' + apiname.upper() + '_depth_max = ' + str(structs_depth) + ';\n') - f.write('// begin ostream ops for '+ apiname + ' \n') - if apiname.lower() == "hip" or apiname.lower() == "hsa": - f.write("// basic ostream ops\n") - f.write(header_basic) - f.write("// End 
of basic ostream ops\n\n") - else: - f.write(header) + if rank == 0 or rank == 2: + mpath = os.path.dirname(outfilepath) + if mpath == "": + mpath = os.getcwd() + apiname = outfilepath.replace(mpath + "/","") + output_filename_h = open(outfilepath,"w+") + apiname = apiname.replace("_ostream_ops.h","") + apiname = apiname.upper() + output_filename_h.write("// automatically generated\n") + output_filename_h.write(LICENSE + '\n') + header_s = \ + '#ifndef INC_' + apiname + '_OSTREAM_OPS_H_\n' + \ + '#define INC_' + apiname + '_OSTREAM_OPS_H_\n' + \ + '#ifdef __cplusplus\n' + \ + '#include \n' + \ + '\n' + \ + '#include "roctracer.h"\n' + header_s += '#include \n#include \n' + + output_filename_h.write(header_s) + output_filename_h.write('\n') + output_filename_h.write('namespace roctracer {\n') + output_filename_h.write('namespace ' + apiname.lower() + '_support {\n') + output_filename_h.write('static int ' + apiname.upper() + '_depth_max = 1;\n') + output_filename_h.write('static int ' + apiname.upper() + '_depth_max_cnt = 0;\n') + output_filename_h.write('static std::string ' + apiname.upper() + '_structs_regex = \".*\";\n') + output_filename_h.write('// begin ostream ops for '+ apiname + ' \n') + output_filename_h.write("// basic ostream ops\n") + output_filename_h.write(header_basic) + output_filename_h.write("// End of basic ostream ops\n\n") for c in cppHeader.classes: if "union" in c: continue - if apiname.lower() == 'hsa': - if c == 'max_align_t' or c == '__fsid_t': #already defined for hip + if c in structs_analyzed: continue - if len(cppHeader.classes[c]["properties"]["public"])!=0: - if apiname.lower() == 'hip' or apiname.lower() == 'hsa': - f.write("inline static std::ostream& operator<<(std::ostream& out, const " + c + "& v)\n") - f.write("{\n") - f.write(" roctracer::" + apiname.lower() + "_support::operator<<(out, '{');\n") - if structs_depth != -1: - f.write(" " + apiname.upper() + "_depth_max++;\n") - f.write(" if (" + apiname.upper() + "_depth_max <= 
" + str(structs_depth) + ") {\n" ) - process_struct(f, c, cppHeader, "", apiname) - global_str = "\n".join(global_str.split("\n")[0:-2]) - if structs_depth != -1: #reindent - global_str = global_str.split('\n') - global_str = [' ' + line.lstrip() for line in global_str] - global_str = "\n".join(global_str) - f.write(global_str+"\n") - if structs_depth != -1: - f.write(" };\n") - f.write(" " + apiname.upper() + "_depth_max--;\n") - f.write(" roctracer::" + apiname.lower() + "_support::operator<<(out, '}');\n") - f.write(" return out;\n") - f.write("}\n") - global_str = '' - else: - f.write("\ntemplate<>\n") - f.write("struct output_streamer<" + c + "&> {\n") - f.write(" inline static std::ostream& put(std::ostream& out, "+c+"& v)\n") - f.write("{\n") - f.write(" roctracer::" + apiname.lower() + "_support::output_streamer::put(out, '{');\n") - if structs_depth != -1: - f.write(apiname.upper() + "_depth_max++;\n") - f.write(" if (" + apiname.upper() + "_depth_max <= " + str(structs_depth) + ") {\n" ) - process_struct(f, c, cppHeader, "", apiname) - global_str = "\n".join(global_str.split("\n")[0:-2]) - if structs_depth != -1: #reindent - global_str = global_str.split('\n') - global_str = [' ' + line.lstrip() for line in global_str] - global_str = "\n".join(global_str) - f.write(global_str+"\n") - if structs_depth != -1: - f.write(" };\n") - f.write(" " + apiname.upper() + "_depth_max--;\n") - f.write(" roctracer::" + apiname.lower() + "_support::output_streamer::put(out, '}');\n") - f.write(" return out;\n") - f.write("}\n") - f.write("};\n") - global_str = '' - if apiname.lower() == 'hip': - global_ops_hip += "inline static std::ostream& operator<<(std::ostream& out, const " + c + "& v)\n" + "{\n" + " roctracer::hip_support::operator<<(out, v);\n" + " return out;\n" + "}\n\n" - if apiname.lower() == 'hsa': - global_ops_hsa += "inline static std::ostream& operator<<(std::ostream& out, const " + c + "& v)\n" + "{\n" + " roctracer::hsa_support::operator<<(out, v);\n" + 
" return out;\n" + "}\n\n" - - footer = \ - '// end ostream ops for '+ apiname + ' \n' - footer += '};};\n\n' - f.write(footer) - f.write(global_ops_hip) - f.write(global_ops_hsa) - footer = '#endif //__cplusplus\n' + \ - '#endif // INC_' + apiname + '_OSTREAM_OPS_H_\n' + \ - ' \n' - f.write(footer) - f.close() - print('File ' + outfilepath + ' generated') + if c == 'max_align_t' or c == '__fsid_t': # Skipping as it is defined in multiple domains + continue + if len(cppHeader.classes[c]["properties"]["public"]) != 0: + output_filename_h.write("inline static std::ostream& operator<<(std::ostream& out, const " + c + "& v)\n") + output_filename_h.write("{\n") + output_filename_h.write(" roctracer::" + apiname.lower() + "_support::operator<<(out, '{');\n") + output_filename_h.write(" " + apiname.upper() + "_depth_max_cnt++;\n") + output_filename_h.write(" if (" + apiname.upper() + "_depth_max == -1 || " + apiname.upper() + "_depth_max_cnt <= " + apiname.upper() + "_depth_max" + ") {\n" ) + process_struct(output_filename_h, c, cppHeader, "", apiname) + global_str = "\n".join(global_str.split("\n")[0:-3]) + if global_str != '': global_str += "\n }\n" + output_filename_h.write(global_str) + output_filename_h.write(" };\n") + output_filename_h.write(" " + apiname.upper() + "_depth_max_cnt--;\n") + output_filename_h.write(" roctracer::" + apiname.lower() + "_support::operator<<(out, '}');\n") + output_filename_h.write(" return out;\n") + output_filename_h.write("}\n") + global_str = '' + global_ops += "inline static std::ostream& operator<<(std::ostream& out, const " + c + "& v)\n" + "{\n" + " roctracer::" + apiname.lower() + "_support::operator<<(out, v);\n" + " return out;\n" + "}\n\n" + + if rank == 1 or rank == 2: + footer = '// end ostream ops for '+ apiname + ' \n' + footer += '};};\n\n' + output_filename_h.write(footer) + output_filename_h.write(global_ops) + footer = '#endif //__cplusplus\n' + \ + '#endif // INC_' + apiname + '_OSTREAM_OPS_H_\n' + \ + ' \n' + 
output_filename_h.write(footer) + output_filename_h.close() + print('File ' + outfilepath + ' generated') return parser = argparse.ArgumentParser(description='genOstreamOps.py: generates ostream operators for all typedefs in provided input file.') requiredNamed = parser.add_argument_group('Required arguments') -requiredNamed.add_argument('-in', metavar='file', help='Header file to be parsed', required=True) +requiredNamed.add_argument('-in', metavar='fileList', help='Comma separated list of header files to be parsed', required=True) requiredNamed.add_argument('-out', metavar='file', help='Output file with ostream operators', required=True) -requiredNamed.add_argument('-depth', metavar='N', type=int, help='Depth for nested structs', required=False) -structs_depth = 0 args = vars(parser.parse_args()) if __name__ == '__main__': - if args['depth'] != None: structs_depth = args['depth'] - gen_cppheader(args['in'], args['out'], structs_depth) + flist = args['in'].split(',') + if len(flist) == 1: + gen_cppheader(flist[0], args['out'],2) + else: + for i in range(len(flist)): + if i == 0: + gen_cppheader(flist[i], args['out'],0) + elif i == len(flist)-1: + gen_cppheader(flist[i], args['out'],1) + else: + gen_cppheader(flist[i], args['out'],-1) diff --git a/script/hsaap.py b/script/hsaap.py index 84ee9bbf..f9458ee9 100755 --- a/script/hsaap.py +++ b/script/hsaap.py @@ -456,7 +456,10 @@ def gen_out_stream(self, n, name, call, struct): for ind in range(len(arg_list)): arg_var = arg_list[ind] arg_val = 'api_data.args.' + call + '.' 
+ arg_var - self.content += ' out << ' + arg_val + if re.search(r'char\* ', struct['astr'][arg_var]): + self.content += ' out << "0x" << std::hex << (uint64_t)' + arg_val + else: + self.content += ' out << ' + arg_val ''' arg_item = struct['tlst'][ind] if re.search(r'\(\* ', arg_item): arg_pref = '' diff --git a/script/kfdap.py b/script/kfdap.py index 9f560a35..378ea126 100755 --- a/script/kfdap.py +++ b/script/kfdap.py @@ -494,8 +494,7 @@ def gen_out_stream(self, n, name, call, struct): arg_val = 'api_data.args.' + call + '.' + arg_var if re.search(r'MemFlags',arg_var): continue - self.content_h += ' typedef decltype(' + arg_val.replace("[]","") + ') arg_val_type_t' + str(ind) + ';\n' - self.content_h += ' roctracer::kfd_support::output_streamer::put(out, ' + arg_val.replace("[]","") + ')' + self.content_h += ' out << ' + arg_val.replace("[]","") if ind < len(arg_list)-1: self.content_h += ' << ", ";\n' else: self.content_h += ';\n' if struct['ret'] != 'void': @@ -512,7 +511,6 @@ def gen_out_stream(self, n, name, call, struct): self.content_h += ' return out;\n' self.content_h += '}\n' self.content_h += '#endif\n' - self.content_cpp += 'inline std::ostream& operator<< (std::ostream& out, const HsaMemFlags& v) { out << "HsaMemFlags"; return out; }\n' # generate PUBLIC_API for all API fcts def gen_public_api(self, n, name, call, struct): diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index e9c72f84..689ed637 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -3,14 +3,15 @@ set ( GEN_INC_DIR ${PROJECT_BINARY_DIR}/inc ) set ( GEN_SRC_DIR ${PROJECT_BINARY_DIR}/src ) execute_process ( COMMAND sh -xc "mkdir -p ${GEN_INC_DIR}" ) execute_process ( COMMAND sh -xc "mkdir -p ${GEN_SRC_DIR}" ) +execute_process ( COMMAND sh -xc "${CMAKE_C_COMPILER} -E ${HSA_RUNTIME_INC_PATH}/hsa.h > ${GEN_INC_DIR}/hsa_pp.h" ) +execute_process ( COMMAND sh -xc "${CMAKE_C_COMPILER} -E ${HSA_RUNTIME_INC_PATH}/hsa_ext_amd.h > ${GEN_INC_DIR}/hsa_ext_amd_pp.h" ) +execute_process ( 
COMMAND sh -xc "${ROOT_DIR}/script/gen_ostream_ops.py -in ${GEN_INC_DIR}/hsa_pp.h,${GEN_INC_DIR}/hsa_ext_amd_pp.h -out ${GEN_INC_DIR}/hsa_ostream_ops.h" ) execute_process ( COMMAND sh -xc "${ROOT_DIR}/script/hsaap.py ${PROJECT_BINARY_DIR} ${HSA_RUNTIME_INC_PATH}" ) execute_process ( COMMAND sh -xc "${ROOT_DIR}/script/kfdap.py ${PROJECT_BINARY_DIR} ${HSA_KMT_INC_PATH}" ) execute_process ( COMMAND sh -xc "${CMAKE_C_COMPILER} -E ${HSA_KMT_INC_PATH}/hsakmttypes.h > ${GEN_INC_DIR}/hsakmttypes_pp.h" ) execute_process ( COMMAND sh -xc "${ROOT_DIR}/script/gen_ostream_ops.py -in ${GEN_INC_DIR}/hsakmttypes_pp.h -out ${GEN_INC_DIR}/kfd_ostream_ops.h" ) execute_process ( COMMAND sh -xc "${CMAKE_C_COMPILER} -E ${HIP_PATH}/include/hip/hip_runtime_api.h ${HIP_DEFINES} -I${HIP_PATH}/include -I${ROCM_ROOT_DIR}/hsa/include > ${GEN_INC_DIR}/hip_runtime_api_pp.h" ) execute_process ( COMMAND sh -xc "${ROOT_DIR}/script/gen_ostream_ops.py -in ${GEN_INC_DIR}/hip_runtime_api_pp.h -out ${GEN_INC_DIR}/hip_ostream_ops.h" ) -execute_process ( COMMAND sh -xc "${CMAKE_C_COMPILER} -E ${HSA_RUNTIME_INC_PATH}/hsa.h > ${GEN_INC_DIR}/hsa_pp.h" ) -execute_process ( COMMAND sh -xc "${ROOT_DIR}/script/gen_ostream_ops.py -in ${GEN_INC_DIR}/hsa_pp.h -out ${GEN_INC_DIR}/hsa_ostream_ops.h" ) execute_process ( COMMAND sh -xc "mkdir ${GEN_INC_DIR}/rocprofiler" ) execute_process ( COMMAND sh -xc "ln -s ${ROOT_DIR}/../rocprofiler/inc/rocprofiler.h ${GEN_INC_DIR}/rocprofiler/rocprofiler.h" ) execute_process ( COMMAND sh -xc "ln -s ${ROOT_DIR}/../rocprofiler/src/core/activity.h ${GEN_INC_DIR}/rocprofiler/activity.h" ) From ff8ac19b2c8d4414b35576d56e793bf43b1b01ac Mon Sep 17 00:00:00 2001 From: Evgeny Date: Mon, 23 Nov 2020 13:09:46 -0600 Subject: [PATCH 22/47] adding tests dry run to check if platform is in working state Change-Id: Ic430e3f959119983a65929fc70332e293cc3448d --- test/golden_traces/MatrixTranspose_dryrun_trace.txt | 0 test/golden_traces/ctrl_dryrun_trace.txt | 0 
test/golden_traces/tests_trace_cmp_levels.txt | 2 ++ test/run.sh | 4 ++++ 4 files changed, 6 insertions(+) create mode 100644 test/golden_traces/MatrixTranspose_dryrun_trace.txt create mode 100644 test/golden_traces/ctrl_dryrun_trace.txt diff --git a/test/golden_traces/MatrixTranspose_dryrun_trace.txt b/test/golden_traces/MatrixTranspose_dryrun_trace.txt new file mode 100644 index 00000000..e69de29b diff --git a/test/golden_traces/ctrl_dryrun_trace.txt b/test/golden_traces/ctrl_dryrun_trace.txt new file mode 100644 index 00000000..e69de29b diff --git a/test/golden_traces/tests_trace_cmp_levels.txt b/test/golden_traces/tests_trace_cmp_levels.txt index 5311d813..ed27e868 100644 --- a/test/golden_traces/tests_trace_cmp_levels.txt +++ b/test/golden_traces/tests_trace_cmp_levels.txt @@ -1,4 +1,6 @@ # dummy +MatrixTranspose_dryrun_trace --check-none +ctrl_dryrun_trace --check-none MatrixTranspose_ctest_trace --check-count .* MatrixTranspose_test_trace --check-count .* --ignore-count hsaKmt.* MatrixTranspose_hipaact_test_trace --check-count .* --ignore-count hsaKmt.*|hipMemcpy|__hipPushCallConfiguration|hipLaunchKernel|__hipPopCallConfiguration diff --git a/test/run.sh b/test/run.sh index 577bdd30..75a5a680 100755 --- a/test/run.sh +++ b/test/run.sh @@ -106,6 +106,10 @@ eval_test() { test_number=$((test_number + 1)) } +# Tests dry run +eval_test "MatrixTranspose dry run" ./test/MatrixTranspose MatrixTranspose_dryrun_trace +eval_test "ctrl dry run" ./test/hsa/ctrl ctrl_dryrun_trace + # Standalone test # rocTrecer is used explicitely by test eval_test "standalone C test" "LD_PRELOAD=libkfdwrapper64.so ./test/MatrixTranspose_ctest" MatrixTranspose_ctest_trace From d4cd09139b3ec9e38cc18e83627dc9ab6e793ca1 Mon Sep 17 00:00:00 2001 From: Evgeny Date: Tue, 3 Nov 2020 10:39:33 -0600 Subject: [PATCH 23/47] Adding dumping of HSA handles Change-Id: I18e2cfdf2574110bffa09d30c7ac1d3941252939 --- src/util/hsa_rsrc_factory.cpp | 13 +++++++++++++ src/util/hsa_rsrc_factory.h | 2 ++ 
test/CMakeLists.txt | 2 +- test/tool/tracer_tool.cpp | 5 +++++ 4 files changed, 21 insertions(+), 1 deletion(-) diff --git a/src/util/hsa_rsrc_factory.cpp b/src/util/hsa_rsrc_factory.cpp index e1ef9268..cf172cae 100644 --- a/src/util/hsa_rsrc_factory.cpp +++ b/src/util/hsa_rsrc_factory.cpp @@ -742,6 +742,19 @@ hsa_status_t HsaRsrcFactory::hsa_executable_freeze_interceptor(hsa_executable_t return hsa_api_.hsa_executable_freeze(executable, options);; } +void HsaRsrcFactory::DumpHandles(FILE* file) { + auto beg = agent_map_.begin(); + auto end = agent_map_.end(); + for (auto it = beg; it != end; ++it) { + const AgentInfo* agent_info = it->second; + fprintf(file, "0x%lx agent %s\n", agent_info->dev_id.handle, (agent_info->dev_type == HSA_DEVICE_TYPE_CPU) ? "cpu" : "gpu"); + if (agent_info->cpu_pool.handle != 0) fprintf(file, "0x%lx pool cpu\n", agent_info->cpu_pool.handle); + if (agent_info->kern_arg_pool.handle != 0) fprintf(file, "0x%lx pool cpu kernarg\n", agent_info->kern_arg_pool.handle); + if (agent_info->gpu_pool.handle != 0) fprintf(file, "0x%lx pool gpu\n", agent_info->gpu_pool.handle); + } + fflush(file); +} + std::atomic HsaRsrcFactory::instance_{}; HsaRsrcFactory::mutex_t HsaRsrcFactory::mutex_; HsaRsrcFactory::timestamp_t HsaRsrcFactory::timeout_ns_ = HsaTimer::TIMESTAMP_MAX; diff --git a/src/util/hsa_rsrc_factory.h b/src/util/hsa_rsrc_factory.h index 466ccf1f..3bfeda68 100644 --- a/src/util/hsa_rsrc_factory.h +++ b/src/util/hsa_rsrc_factory.h @@ -439,6 +439,8 @@ class HsaRsrcFactory { return HSA_STATUS_SUCCESS; } + void DumpHandles(FILE* output_file); + private: // System agents iterating callback static hsa_status_t GetHsaAgentsCallback(hsa_agent_t agent, void* data); diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 148c60b0..a7511789 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -44,7 +44,7 @@ endif () ## Path to HSA test set ( HSA_TEST_DIR "${TEST_DIR}/hsa/test" ) -set ( HSA_REV "19b1191" ) +set ( HSA_REV "a4fcdae" ) ## test 
run script set ( RUN_SCRIPT "${TEST_DIR}/run.sh" ) diff --git a/test/tool/tracer_tool.cpp b/test/tool/tracer_tool.cpp index 1aeb75a3..218652e9 100644 --- a/test/tool/tracer_tool.cpp +++ b/test/tool/tracer_tool.cpp @@ -1127,6 +1127,11 @@ extern "C" PUBLIC_API bool OnLoad(HsaApiTable* table, uint64_t runtime_version, ROCTRACER_CALL(roctracer_enable_op_activity(ACTIVITY_DOMAIN_HSA_OPS, HSA_OP_ID_RESERVED1)); } + // Dumping HSA handles for agents and pools + FILE* handles_file_handle = open_output_file(output_prefix, "hsa_handles.txt"); + HsaRsrcFactory::Instance().DumpHandles(handles_file_handle); + close_output_file(handles_file_handle); + ONLOAD_TRACE_END(); return true; } From bf70a1a64555287eaef1755fb014297157764345 Mon Sep 17 00:00:00 2001 From: Evgeny Date: Wed, 9 Dec 2020 22:16:43 -0500 Subject: [PATCH 24/47] SWDEV-264282 : fixing tracer_tool linking Change-Id: I0fd78c01595bbd506f42cf9dfb45f62b2124f704 --- test/CMakeLists.txt | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index a7511789..e07e7d8b 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -69,6 +69,14 @@ add_custom_target( mytest COMMAND sh -xc "cp ${TEST_DIR}/golden_traces/tests_trace_cmp_levels.txt ${PROJECT_BINARY_DIR}/test/" ) +## Build HSA test +execute_process ( COMMAND sh -xc "if [ ! 
-e ${TEST_DIR}/hsa ] ; then git clone https://github.com/ROCmSoftwarePlatform/hsa-class.git ${TEST_DIR}/hsa; fi" ) +execute_process ( COMMAND sh -xc "if [ -e ${TEST_DIR}/hsa ] ; then cd ${TEST_DIR}/hsa && git fetch origin && git checkout ${HSA_REV}; fi" ) +set ( TMP ${TEST_DIR} ) +set ( TEST_DIR ${HSA_TEST_DIR} ) +add_subdirectory ( ${HSA_TEST_DIR} ${PROJECT_BINARY_DIR}/test/hsa ) +set ( TEST_DIR ${TMP} ) + ## Util sources file( GLOB UTIL_SRC "${HSA_TEST_DIR}/util/*.cpp" ) @@ -95,12 +103,6 @@ add_library ( ${CO_LIB_NAME} SHARED ${CO_LIB_SRC} ) target_include_directories ( ${CO_LIB_NAME} PRIVATE ${TEST_DIR} ${ROOT_DIR} ${ROOT_DIR}/inc ${GEN_INC_DIR} ${HSA_RUNTIME_INC_PATH} ${ROCM_INC_PATH} ) target_link_libraries ( ${CO_LIB_NAME} ${ROCTRACER_TARGET} c stdc++ ) -## Build HSA test -execute_process ( COMMAND sh -xc "if [ ! -e ${TEST_DIR}/hsa ] ; then git clone https://github.com/ROCmSoftwarePlatform/hsa-class.git ${TEST_DIR}/hsa; fi" ) -execute_process ( COMMAND sh -xc "if [ -e ${TEST_DIR}/hsa ] ; then cd ${TEST_DIR}/hsa && git fetch origin && git checkout ${HSA_REV}; fi" ) -set ( TEST_DIR ${HSA_TEST_DIR} ) -add_subdirectory ( ${TEST_DIR} ${PROJECT_BINARY_DIR}/test/hsa ) - ## copying run script execute_process ( COMMAND sh -xc "cp ${RUN_SCRIPT} ${PROJECT_BINARY_DIR}" ) execute_process ( COMMAND sh -xc "ln -s run.sh ${PROJECT_BINARY_DIR}/run_ci.sh" ) From bfb90954b7e2f957d996489a087599dd124ec383 Mon Sep 17 00:00:00 2001 From: Evgeny Date: Wed, 9 Dec 2020 23:07:41 -0500 Subject: [PATCH 25/47] calling python3 explicitly Change-Id: I3dda55865bafa41cc6670e414b213f13a2a2a7ac --- script/check_trace.py | 10 ++++------ script/gen_ostream_ops.py | 2 -- script/hsaap.py | 1 - script/kfdap.py | 5 ++--- src/CMakeLists.txt | 10 +++++----- test/run.sh | 4 ++-- 6 files changed, 13 insertions(+), 19 deletions(-) diff --git a/script/check_trace.py b/script/check_trace.py index a4506a12..c10eb3c5 100644 --- a/script/check_trace.py +++ b/script/check_trace.py @@ -1,5 +1,3 @@
-#!/usr/bin/python - #Copyright (c) 2015-present Advanced Micro Devices, Inc. All rights reserved. # #Permission is hereby granted, free of charge, to any person obtaining a copy @@ -200,10 +198,10 @@ def check_trace_status(tracename, verbose, check_trace_flag): events_order_r[tid] = events_order[tid] cnt = gen_events_info(trace,trace_level,no_events_cnt,events2ignore,events2chkcnt,events2chkord,verbose) if verbose: - print '\n' + rtrace + ':\n' - print cnt_r - print '\n' + trace + ':\n' - print cnt + print ('\n' + rtrace + ':\n') + print (cnt_r) + print ('\n' + trace + ':\n') + print (cnt) diff_strings(cnt_r, cnt, metric) if cnt_r == cnt: diff --git a/script/gen_ostream_ops.py b/script/gen_ostream_ops.py index fcc379a6..ae60be84 100755 --- a/script/gen_ostream_ops.py +++ b/script/gen_ostream_ops.py @@ -1,5 +1,3 @@ -#!/usr/bin/python - import os, sys, re import CppHeaderParser import argparse diff --git a/script/hsaap.py b/script/hsaap.py index f9458ee9..e1a3d717 100755 --- a/script/hsaap.py +++ b/script/hsaap.py @@ -1,4 +1,3 @@ -#!/usr/bin/python from __future__ import print_function import os, sys, re diff --git a/script/kfdap.py b/script/kfdap.py index 378ea126..8de1d19e 100755 --- a/script/kfdap.py +++ b/script/kfdap.py @@ -1,4 +1,3 @@ -#!/usr/bin/python from __future__ import print_function import os, sys, re @@ -80,7 +79,7 @@ def __init__(self, header, name, full_fct): if not os.path.isfile(header): self.fatal("file '" + header + "' not found") - self.inp = open(header, 'r') + self.inp = open(header, 'r', encoding='utf-8') self.beg_pattern = re.compile(name) self.end_pattern = re.compile('.*\)\s*;\s*$'); @@ -146,7 +145,7 @@ def __init__(self, header, array, data, full_fct): if not os.path.isfile(header): self.fatal("file '" + header + "' not found") - self.inp = open(header, 'r') + self.inp = open(header, 'r', encoding='utf-8') self.end_pattern = re.compile('\)\s*;\s*$') self.data = data diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 
689ed637..556ea16d 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -5,13 +5,13 @@ execute_process ( COMMAND sh -xc "mkdir -p ${GEN_INC_DIR}" ) execute_process ( COMMAND sh -xc "mkdir -p ${GEN_SRC_DIR}" ) execute_process ( COMMAND sh -xc "${CMAKE_C_COMPILER} -E ${HSA_RUNTIME_INC_PATH}/hsa.h > ${GEN_INC_DIR}/hsa_pp.h" ) execute_process ( COMMAND sh -xc "${CMAKE_C_COMPILER} -E ${HSA_RUNTIME_INC_PATH}/hsa_ext_amd.h > ${GEN_INC_DIR}/hsa_ext_amd_pp.h" ) -execute_process ( COMMAND sh -xc "${ROOT_DIR}/script/gen_ostream_ops.py -in ${GEN_INC_DIR}/hsa_pp.h,${GEN_INC_DIR}/hsa_ext_amd_pp.h -out ${GEN_INC_DIR}/hsa_ostream_ops.h" ) -execute_process ( COMMAND sh -xc "${ROOT_DIR}/script/hsaap.py ${PROJECT_BINARY_DIR} ${HSA_RUNTIME_INC_PATH}" ) -execute_process ( COMMAND sh -xc "${ROOT_DIR}/script/kfdap.py ${PROJECT_BINARY_DIR} ${HSA_KMT_INC_PATH}" ) +execute_process ( COMMAND sh -xc "python3 ${ROOT_DIR}/script/gen_ostream_ops.py -in ${GEN_INC_DIR}/hsa_pp.h,${GEN_INC_DIR}/hsa_ext_amd_pp.h -out ${GEN_INC_DIR}/hsa_ostream_ops.h" ) +execute_process ( COMMAND sh -xc "python3 ${ROOT_DIR}/script/hsaap.py ${PROJECT_BINARY_DIR} ${HSA_RUNTIME_INC_PATH}" ) +execute_process ( COMMAND sh -xc "python3 ${ROOT_DIR}/script/kfdap.py ${PROJECT_BINARY_DIR} ${HSA_KMT_INC_PATH}" ) execute_process ( COMMAND sh -xc "${CMAKE_C_COMPILER} -E ${HSA_KMT_INC_PATH}/hsakmttypes.h > ${GEN_INC_DIR}/hsakmttypes_pp.h" ) -execute_process ( COMMAND sh -xc "${ROOT_DIR}/script/gen_ostream_ops.py -in ${GEN_INC_DIR}/hsakmttypes_pp.h -out ${GEN_INC_DIR}/kfd_ostream_ops.h" ) +execute_process ( COMMAND sh -xc "python3 ${ROOT_DIR}/script/gen_ostream_ops.py -in ${GEN_INC_DIR}/hsakmttypes_pp.h -out ${GEN_INC_DIR}/kfd_ostream_ops.h" ) execute_process ( COMMAND sh -xc "${CMAKE_C_COMPILER} -E ${HIP_PATH}/include/hip/hip_runtime_api.h ${HIP_DEFINES} -I${HIP_PATH}/include -I${ROCM_ROOT_DIR}/hsa/include > ${GEN_INC_DIR}/hip_runtime_api_pp.h" ) -execute_process ( COMMAND sh -xc "${ROOT_DIR}/script/gen_ostream_ops.py -in 
${GEN_INC_DIR}/hip_runtime_api_pp.h -out ${GEN_INC_DIR}/hip_ostream_ops.h" ) +execute_process ( COMMAND sh -xc "python3 ${ROOT_DIR}/script/gen_ostream_ops.py -in ${GEN_INC_DIR}/hip_runtime_api_pp.h -out ${GEN_INC_DIR}/hip_ostream_ops.h" ) execute_process ( COMMAND sh -xc "mkdir ${GEN_INC_DIR}/rocprofiler" ) execute_process ( COMMAND sh -xc "ln -s ${ROOT_DIR}/../rocprofiler/inc/rocprofiler.h ${GEN_INC_DIR}/rocprofiler/rocprofiler.h" ) execute_process ( COMMAND sh -xc "ln -s ${ROOT_DIR}/../rocprofiler/src/core/activity.h ${GEN_INC_DIR}/rocprofiler/activity.h" ) diff --git a/test/run.sh b/test/run.sh index 75a5a680..e8d89095 100755 --- a/test/run.sh +++ b/test/run.sh @@ -86,11 +86,11 @@ eval_test() { is_failed=0; else if [ $is_failed = 0 ] ; then - python ./test/check_trace.py -in $test_name -ck $check_trace_flag + python3 ./test/check_trace.py -in $test_name -ck $check_trace_flag is_failed=$? if [ $is_failed != 0 ] ; then echo "Trace checker error:" - python ./test/check_trace.py -v -in $test_name -ck $check_trace_flag + python3 ./test/check_trace.py -v -in $test_name -ck $check_trace_flag fi fi fi From 3c1a4b38385dd944257c279d5b689cddd599dc48 Mon Sep 17 00:00:00 2001 From: Rachida Kebichi Date: Wed, 18 Nov 2020 10:39:33 -0500 Subject: [PATCH 26/47] SWDEV-259683 HIP API records filtering Change-Id: I43ca5e022d2c055b6a9bc2c09b4276b490a4b986 --- src/core/roctracer.cpp | 14 ++++++++++++++ test/run.sh | 5 +++++ test/tool/tracer_tool.cpp | 16 ++++++++++++++-- 3 files changed, 33 insertions(+), 2 deletions(-) diff --git a/src/core/roctracer.cpp b/src/core/roctracer.cpp index 891b005e..8bc5ea29 100644 --- a/src/core/roctracer.cpp +++ b/src/core/roctracer.cpp @@ -719,11 +719,25 @@ PUBLIC_API roctracer_status_t roctracer_op_code( switch (domain) { case ACTIVITY_DOMAIN_HSA_API: { *op = roctracer::hsa_support::GetApiCode(str); + if (*op == HSA_API_ID_NUMBER) { + EXC_RAISING(ROCTRACER_STATUS_BAD_PARAMETER, "Invalid API name \"" << str << "\", domain ID(" << domain << ")"); + } 
if (kind != NULL) *kind = 0; break; } case ACTIVITY_DOMAIN_KFD_API: { *op = roctracer::kfd_support::GetApiCode(str); + if (*op == KFD_API_ID_NUMBER) { + EXC_RAISING(ROCTRACER_STATUS_BAD_PARAMETER, "Invalid API name \"" << str << "\", domain ID(" << domain << ")"); + } + if (kind != NULL) *kind = 0; + break; + } + case ACTIVITY_DOMAIN_HIP_API: { + *op = hipApiIdByName(str); + if (*op == HIP_API_ID_NUMBER) { + EXC_RAISING(ROCTRACER_STATUS_BAD_PARAMETER, "Invalid API name \"" << str << "\", domain ID(" << domain << ")"); + } if (kind != NULL) *kind = 0; break; } diff --git a/test/run.sh b/test/run.sh index e8d89095..9a7ffc74 100755 --- a/test/run.sh +++ b/test/run.sh @@ -134,6 +134,11 @@ export ROCTRACER_DOMAIN="hip" eval_test "tool period test" "ROCP_CTRL_RATE=10:100000:1000000 ./test/MatrixTranspose" MatrixTranspose_hip_period_trace eval_test "tool flushing test" "ROCP_FLUSH_RATE=100000 ./test/MatrixTranspose" MatrixTranspose_hip_flush_trace +#API records filtering +echo "" > input.xml +export ROCP_INPUT=input.xml +eval_test "tool HIP test input" ./test/MatrixTranspose hip_input_trace + # HSA test export ROCTRACER_DOMAIN="hsa" # test trace diff --git a/test/tool/tracer_tool.cpp b/test/tool/tracer_tool.cpp index 218652e9..cc5367d1 100644 --- a/test/tool/tracer_tool.cpp +++ b/test/tool/tracer_tool.cpp @@ -112,6 +112,7 @@ bool trace_pcs = false; // API trace vector std::vector hsa_api_vec; std::vector kfd_api_vec; +std::vector hip_api_vec; LOADER_INSTANTIATE(); TRACE_BUFFER_INSTANTIATE(); @@ -921,6 +922,7 @@ void tool_load() { found = true; trace_hip_api = true; trace_hip_activity = true; + hip_api_vec = api_vec; } if (name == "KFD") { found = true; @@ -1091,9 +1093,19 @@ extern "C" PUBLIC_API bool OnLoad(HsaApiTable* table, uint64_t runtime_version, // Enable tracing if (trace_hip_api) { hip_api_file_handle = open_output_file(output_prefix, "hip_api_trace.txt"); - ROCTRACER_CALL(roctracer_enable_domain_callback(ACTIVITY_DOMAIN_HIP_API, hip_api_callback, NULL)); + if 
(hip_api_vec.size() != 0) { + for (unsigned i = 0; i < hip_api_vec.size(); ++i) { + uint32_t cid = HIP_API_ID_NUMBER; + const char* api = hip_api_vec[i].c_str(); + ROCTRACER_CALL(roctracer_op_code(ACTIVITY_DOMAIN_HIP_API, api, &cid, NULL)); + ROCTRACER_CALL(roctracer_enable_op_callback(ACTIVITY_DOMAIN_HIP_API, cid, hip_api_callback, NULL)); + printf(" %s", api); + } + } + else { + ROCTRACER_CALL(roctracer_enable_domain_callback(ACTIVITY_DOMAIN_HIP_API, hip_api_callback, NULL)); + } ROCTRACER_CALL(roctracer_disable_op_callback(ACTIVITY_DOMAIN_HIP_API, HIP_API_ID_hipModuleUnload)); - if (is_stats_opt) { const char* path = NULL; FILE* f = open_output_file(output_prefix, "hip_api_stats.csv", &path); From 36b1b2fad306968a2463d5b1f6526fa3b905968b Mon Sep 17 00:00:00 2001 From: Evgeny Date: Tue, 22 Dec 2020 04:42:30 -0500 Subject: [PATCH 27/47] SWDEV-251491 : disabling tracing on exit Change-Id: Ifd5f0fbad70afa1e79da8b4b9aa639d899cbea76 --- test/tool/tracer_tool.cpp | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/test/tool/tracer_tool.cpp b/test/tool/tracer_tool.cpp index cc5367d1..e9522f0b 100644 --- a/test/tool/tracer_tool.cpp +++ b/test/tool/tracer_tool.cpp @@ -1024,10 +1024,18 @@ void tool_load() { ONLOAD_TRACE_END(); } +void exit_handler(int status, void* arg) { + ONLOAD_TRACE("status(" << status << ") arg(" << arg << ")"); + tool_unload(); + ONLOAD_TRACE_END(); +} + // HSA-runtime tool on-load method extern "C" PUBLIC_API bool OnLoad(HsaApiTable* table, uint64_t runtime_version, uint64_t failed_tool_count, const char* const* failed_tool_names) { ONLOAD_TRACE_BEG(); + on_exit(exit_handler, NULL); + timer = new hsa_rt_utils::Timer(table->core_->hsa_system_get_info_fn); const char* output_prefix = getenv("ROCP_OUTPUT_DIR"); @@ -1101,11 +1109,10 @@ extern "C" PUBLIC_API bool OnLoad(HsaApiTable* table, uint64_t runtime_version, ROCTRACER_CALL(roctracer_enable_op_callback(ACTIVITY_DOMAIN_HIP_API, cid, hip_api_callback, NULL)); printf(" 
%s", api); } - } - else { + } else { ROCTRACER_CALL(roctracer_enable_domain_callback(ACTIVITY_DOMAIN_HIP_API, hip_api_callback, NULL)); } - ROCTRACER_CALL(roctracer_disable_op_callback(ACTIVITY_DOMAIN_HIP_API, HIP_API_ID_hipModuleUnload)); + if (is_stats_opt) { const char* path = NULL; FILE* f = open_output_file(output_prefix, "hip_api_stats.csv", &path); @@ -1116,6 +1123,7 @@ extern "C" PUBLIC_API bool OnLoad(HsaApiTable* table, uint64_t runtime_version, } } } + if (trace_hip_activity) { hcc_activity_file_handle = open_output_file(output_prefix, "hcc_ops_trace.txt"); ROCTRACER_CALL(roctracer_enable_domain_activity(ACTIVITY_DOMAIN_HCC_OPS)); @@ -1166,8 +1174,8 @@ extern "C" CONSTRUCTOR_API void constructor() { } extern "C" DESTRUCTOR_API void destructor() { ONLOAD_TRACE_BEG(); - roctracer_flush_buf(); tool_unload(); + roctracer_flush_buf(); if (hip_api_stats) hip_api_stats->dump(); if (hip_kernel_stats) hip_kernel_stats->dump(); From 7ea1e9a369782bbc4f4d1434d7b38624e27d6e0b Mon Sep 17 00:00:00 2001 From: Evgeny Date: Fri, 25 Dec 2020 01:49:44 -0500 Subject: [PATCH 28/47] SWDEV-251491 : fixing tracing on exit Change-Id: I1bf2a6093331e7a08179b9f64394c5c49206ef0e --- src/core/roctracer.cpp | 25 ++++++++++++++----------- test/tool/tracer_tool.cpp | 2 ++ 2 files changed, 16 insertions(+), 11 deletions(-) diff --git a/src/core/roctracer.cpp b/src/core/roctracer.cpp index 8bc5ea29..272bad26 100644 --- a/src/core/roctracer.cpp +++ b/src/core/roctracer.cpp @@ -279,7 +279,8 @@ struct record_pair_t { roctracer_api_data_t data; record_pair_t() {}; }; -static thread_local std::stack record_pair_stack; +typedef std::stack record_pair_stack_t; +static thread_local record_pair_stack_t* record_pair_stack = NULL; // Correlation id storage static thread_local activity_correlation_id_t correlation_id_tls = 0; @@ -350,6 +351,7 @@ void* HIP_SyncApiDataCallback( void* arg) { static hsa_rt_utils::Timer timer; + if (record_pair_stack == NULL) record_pair_stack = new record_pair_stack_t; 
void* ret = NULL; const hip_api_data_t* data = reinterpret_cast(callback_data); @@ -368,8 +370,8 @@ void* HIP_SyncApiDataCallback( // Allocating a record if NULL passed if (record == NULL) { if (data != NULL) EXC_ABORT(ROCTRACER_STATUS_ERROR, "ActivityCallback enter: record is NULL"); - record_pair_stack.push({}); - auto& top = record_pair_stack.top(); + record_pair_stack->push({}); + auto& top = record_pair_stack->top(); data = &(top.data.hip); data_ptr = const_cast(data); data_ptr->phase = phase; @@ -389,7 +391,7 @@ void* HIP_SyncApiDataCallback( ret = data_ptr; } else { // popping the record entry - if (!record_pair_stack.empty()) record_pair_stack.pop(); + if (!record_pair_stack->empty()) record_pair_stack->pop(); // Clearing correlatin ID correlation_id_tls = 0; @@ -397,7 +399,7 @@ void* HIP_SyncApiDataCallback( const char * name = roctracer_op_string(ACTIVITY_DOMAIN_HIP_API, op_id, 0); DEBUG_TRACE("HIP_SyncApiDataCallback(\"%s\") phase(%d): op(%u) record(%p) data(%p) pool(%p) depth(%d) correlation_id(%lu) time_ns(%lu)\n", - name, phase, op_id, record, data, pool, (int)(record_pair_stack.size()), (data_ptr) ? data_ptr->correlation_id : 0, timer.timestamp_ns()); + name, phase, op_id, record, data, pool, (int)(record_pair_stack->size()), (data_ptr) ? 
data_ptr->correlation_id : 0, timer.timestamp_ns()); return ret; } @@ -410,6 +412,7 @@ void* HIP_SyncActivityCallback( { static hsa_rt_utils::Timer timer; const timestamp_t timestamp_ns = timer.timestamp_ns(); + if (record_pair_stack == NULL) record_pair_stack = new record_pair_stack_t; void* ret = NULL; const hip_api_data_t* data = reinterpret_cast(callback_data); @@ -428,8 +431,8 @@ void* HIP_SyncActivityCallback( // Allocating a record if NULL passed if (record == NULL) { if (data != NULL) EXC_ABORT(ROCTRACER_STATUS_ERROR, "ActivityCallback enter: record is NULL"); - record_pair_stack.push({}); - auto& top = record_pair_stack.top(); + record_pair_stack->push({}); + auto& top = record_pair_stack->top(); record = &(top.record); data = &(top.data.hip); data_ptr = const_cast(data); @@ -459,8 +462,8 @@ void* HIP_SyncActivityCallback( // Getting record of stacked if (record == NULL) { - if (record_pair_stack.empty()) EXC_ABORT(ROCTRACER_STATUS_ERROR, "ActivityCallback exit: record stack is empty"); - auto& top = record_pair_stack.top(); + if (record_pair_stack->empty()) EXC_ABORT(ROCTRACER_STATUS_ERROR, "ActivityCallback exit: record stack is empty"); + auto& top = record_pair_stack->top(); record = &(top.record); } @@ -482,7 +485,7 @@ void* HIP_SyncActivityCallback( pool->Write(*record); // popping the record entry - if (!record_pair_stack.empty()) record_pair_stack.pop(); + if (!record_pair_stack->empty()) record_pair_stack->pop(); // Clearing correlatin ID correlation_id_tls = 0; @@ -490,7 +493,7 @@ void* HIP_SyncActivityCallback( const char * name = roctracer_op_string(ACTIVITY_DOMAIN_HIP_API, op_id, 0); DEBUG_TRACE("HIP_SyncActivityCallback(\"%s\") phase(%d): op(%u) record(%p) data(%p) pool(%p) depth(%d) correlation_id(%lu) beg_ns(%lu) end_ns(%lu)\n", - name, phase, op_id, record, data, pool, (int)(record_pair_stack.size()), (data_ptr) ? 
data_ptr->correlation_id : 0, timestamp_ns); + name, phase, op_id, record, data, pool, (int)(record_pair_stack->size()), (data_ptr) ? data_ptr->correlation_id : 0, timestamp_ns); return ret; } diff --git a/test/tool/tracer_tool.cpp b/test/tool/tracer_tool.cpp index e9522f0b..c7e063ff 100644 --- a/test/tool/tracer_tool.cpp +++ b/test/tool/tracer_tool.cpp @@ -1026,8 +1026,10 @@ void tool_load() { void exit_handler(int status, void* arg) { ONLOAD_TRACE("status(" << status << ") arg(" << arg << ")"); +#if 0 tool_unload(); ONLOAD_TRACE_END(); +#endif } // HSA-runtime tool on-load method From 2a64bd062befab2e22fd0416a82c32873b241891 Mon Sep 17 00:00:00 2001 From: Rachida Kebichi Date: Thu, 25 Mar 2021 09:32:28 -0400 Subject: [PATCH 29/47] SWDEV-271503 Fixed core dump Change-Id: Ia582a27482581c3b81c42da0add9f6743898da6c --- script/gen_ostream_ops.py | 6 +++--- test/tool/tracer_tool.cpp | 7 ++----- 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/script/gen_ostream_ops.py b/script/gen_ostream_ops.py index ae60be84..180103a5 100755 --- a/script/gen_ostream_ops.py +++ b/script/gen_ostream_ops.py @@ -85,7 +85,7 @@ def process_struct(file_handle, cppHeader_struct, cppHeader, parent_hier_name, a str = '' if "union" not in mtype: indent = "" - str += " if (regex_match (\"" + cppHeader_struct + "::" + name + "\", std::regex(" + apiname.upper() + "_structs_regex))) {\n" + str += " if (std::string(\"" + cppHeader_struct + "::" + name + "\").find(" + apiname.upper() + "_structs_regex" + ")) {\n" indent = " " str += indent + " roctracer::" + apiname.lower() + "_support::operator<<(out, \"" + name + "=\");\n" str += indent + " roctracer::" + apiname.lower() + "_support::operator<<(out, v." 
+ name + ");\n" @@ -132,7 +132,7 @@ def gen_cppheader(infilepath, outfilepath, rank): '#include \n' + \ '\n' + \ '#include "roctracer.h"\n' - header_s += '#include \n#include \n' + header_s += '#include \n' output_filename_h.write(header_s) output_filename_h.write('\n') @@ -140,7 +140,7 @@ def gen_cppheader(infilepath, outfilepath, rank): output_filename_h.write('namespace ' + apiname.lower() + '_support {\n') output_filename_h.write('static int ' + apiname.upper() + '_depth_max = 1;\n') output_filename_h.write('static int ' + apiname.upper() + '_depth_max_cnt = 0;\n') - output_filename_h.write('static std::string ' + apiname.upper() + '_structs_regex = \".*\";\n') + output_filename_h.write('static std::string ' + apiname.upper() + '_structs_regex = \"\";\n') output_filename_h.write('// begin ostream ops for '+ apiname + ' \n') output_filename_h.write("// basic ostream ops\n") output_filename_h.write(header_basic) diff --git a/test/tool/tracer_tool.cpp b/test/tool/tracer_tool.cpp index c7e063ff..cf555747 100644 --- a/test/tool/tracer_tool.cpp +++ b/test/tool/tracer_tool.cpp @@ -823,7 +823,6 @@ void tool_unload() { // Flush tracing pool close_tracing_pool(); roctracer::TraceBufferBase::FlushAll(); - close_file_handles(); ONLOAD_TRACE_END(); } @@ -1026,10 +1025,6 @@ void tool_load() { void exit_handler(int status, void* arg) { ONLOAD_TRACE("status(" << status << ") arg(" << arg << ")"); -#if 0 - tool_unload(); - ONLOAD_TRACE_END(); -#endif } // HSA-runtime tool on-load method @@ -1178,6 +1173,8 @@ extern "C" DESTRUCTOR_API void destructor() { ONLOAD_TRACE_BEG(); tool_unload(); roctracer_flush_buf(); + close_file_handles(); + if (hip_api_stats) hip_api_stats->dump(); if (hip_kernel_stats) hip_kernel_stats->dump(); From 89ab109f024c51674a309e94ec95866d1a56b97a Mon Sep 17 00:00:00 2001 From: Rachida Kebichi Date: Wed, 14 Apr 2021 11:17:53 -0400 Subject: [PATCH 30/47] SWDEV-281008 replace hcc_detail by amd_detail Change-Id: I180b18f9e1fae40c923d6210901f06cba14e8f13 --- 
inc/roctracer_hip.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inc/roctracer_hip.h b/inc/roctracer_hip.h index 553ec1b8..391831bf 100644 --- a/inc/roctracer_hip.h +++ b/inc/roctracer_hip.h @@ -39,7 +39,7 @@ inline static std::ostream& operator<<(std::ostream& out, const char& v) { #include #include -#include +#include #include From e723f8ca67ce9eae793ba72478a4d1e20b4bf7d9 Mon Sep 17 00:00:00 2001 From: Icarus Sparry Date: Wed, 7 Jul 2021 16:40:24 +0000 Subject: [PATCH 31/47] Add dependency on rocm-core The intention is to make all rocm-packages depend on a tiny rocm-core package so that all of rocm can be removed by removing rocm-core. Obviously it is less than ideal that you install by using some variant of "apt install rocm" and remove everything by "apt remove rocm-core", but this is easy to document. The alternative "apt autoremove rocm" may remove unrelated packages. Signed-off-by: Icarus Sparry Change-Id: I74351c7be3c2d3dfec577d36ae78222b3fd22ef3 Signed-off-by: Icarus Sparry --- CMakeLists.txt | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index f8bff20c..7f678dec 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -198,6 +198,7 @@ else() endif() message ( "Using CPACK_DEBIAN_PACKAGE_RELEASE ${CPACK_DEBIAN_PACKAGE_RELEASE}" ) set ( CPACK_DEBIAN_FILE_NAME "DEB-DEFAULT" ) +set ( CPACK_DEBIAN_PACKAGE_DEPENDS "rocm-core" ) ## Process the Debian install/remove scripts to update the CPACK variables configure_file ( ${CMAKE_CURRENT_SOURCE_DIR}/DEBIAN/postinst.in DEBIAN/postinst @ONLY ) configure_file ( ${CMAKE_CURRENT_SOURCE_DIR}/DEBIAN/prerm.in DEBIAN/prerm @ONLY ) @@ -222,7 +223,14 @@ if ( PROC_RESULT EQUAL "0" AND NOT EVAL_RESULT STREQUAL "" ) string ( APPEND CPACK_RPM_PACKAGE_RELEASE "%{?dist}" ) endif() set ( CPACK_RPM_FILE_NAME "RPM-DEFAULT" ) +set ( CPACK_RPM_PACKAGE_REQUIRES "rocm-core" ) message("CPACK_RPM_PACKAGE_RELEASE: ${CPACK_RPM_PACKAGE_RELEASE}") + +if(NOT ROCM_DEP_ROCMCORE) + 
string(REGEX REPLACE ",? ?rocm-core" "" CPACK_RPM_PACKAGE_REQUIRES ${CPACK_RPM_PACKAGE_REQUIRES}) + string(REGEX REPLACE ",? ?rocm-core" "" CPACK_DEBIAN_PACKAGE_DEPENDS ${CPACK_DEBIAN_PACKAGE_DEPENDS}) +endif() + ## Process the Rpm install/remove scripts to update the CPACK variables configure_file ( "${CMAKE_CURRENT_SOURCE_DIR}/RPM/post.in" RPM/post @ONLY ) configure_file ( "${CMAKE_CURRENT_SOURCE_DIR}/RPM/postun.in" RPM/postun @ONLY ) From f7adedb800491b01137c56c2b428e6811aa71665 Mon Sep 17 00:00:00 2001 From: Christophe Paquot Date: Mon, 21 Jun 2021 10:20:25 -0700 Subject: [PATCH 32/47] SWDEV-281658 - Preserve the callback IDs enum ordering Use HIP_API_ID_NONE to detect unsupported API instead of HIP_API_ID_NUMBER which can grow with a new version of the API. This HIP_API_ID_NONE enum has a fixed value of 0 so the HIP_API_IDs really start at FIRST. Change-Id: I760aa50ddf6fa6d46bf20555ad7d429335a53f97 --- src/core/roctracer.cpp | 70 ++++++++++++++++++++++++++------------- test/tool/tracer_tool.cpp | 18 +++++----- 2 files changed, 56 insertions(+), 32 deletions(-) diff --git a/src/core/roctracer.cpp b/src/core/roctracer.cpp index 272bad26..000ba7e3 100644 --- a/src/core/roctracer.cpp +++ b/src/core/roctracer.cpp @@ -738,7 +738,7 @@ PUBLIC_API roctracer_status_t roctracer_op_code( } case ACTIVITY_DOMAIN_HIP_API: { *op = hipApiIdByName(str); - if (*op == HIP_API_ID_NUMBER) { + if (*op == HIP_API_ID_NONE) { EXC_RAISING(ROCTRACER_STATUS_BAD_PARAMETER, "Invalid API name \"" << str << "\", domain ID(" << domain << ")"); } if (kind != NULL) *kind = 0; @@ -750,13 +750,29 @@ PUBLIC_API roctracer_status_t roctracer_op_code( API_METHOD_SUFFIX } -static inline uint32_t get_op_num(const uint32_t& domain) { +static inline uint32_t get_op_begin(uint32_t domain) { + switch (domain) { + case ACTIVITY_DOMAIN_HSA_OPS: return 0; + case ACTIVITY_DOMAIN_HSA_API: return 0; + case ACTIVITY_DOMAIN_HSA_EVT: return 0; + case ACTIVITY_DOMAIN_HCC_OPS: return 0; + case ACTIVITY_DOMAIN_HIP_API: 
return HIP_API_ID_FIRST; + case ACTIVITY_DOMAIN_KFD_API: return 0; + case ACTIVITY_DOMAIN_EXT_API: return 0; + case ACTIVITY_DOMAIN_ROCTX: return 0; + default: + EXC_RAISING(ROCTRACER_STATUS_BAD_DOMAIN, "invalid domain ID(" << domain << ")"); + } + return 0; +} + +static inline uint32_t get_op_end(uint32_t domain) { switch (domain) { case ACTIVITY_DOMAIN_HSA_OPS: return HSA_OP_ID_NUMBER; case ACTIVITY_DOMAIN_HSA_API: return HSA_API_ID_NUMBER; case ACTIVITY_DOMAIN_HSA_EVT: return HSA_EVT_ID_NUMBER; case ACTIVITY_DOMAIN_HCC_OPS: return HIP_OP_ID_NUMBER; - case ACTIVITY_DOMAIN_HIP_API: return HIP_API_ID_NUMBER; + case ACTIVITY_DOMAIN_HIP_API: return HIP_API_ID_LAST + 1;; case ACTIVITY_DOMAIN_KFD_API: return KFD_API_ID_NUMBER; case ACTIVITY_DOMAIN_EXT_API: return 0; case ACTIVITY_DOMAIN_ROCTX: return ROCTX_API_ID_NUMBER; @@ -850,8 +866,9 @@ PUBLIC_API roctracer_status_t roctracer_enable_domain_callback( void* user_data) { API_METHOD_PREFIX - const uint32_t op_num = get_op_num(domain); - for (uint32_t op = 0; op < op_num; op++) roctracer_enable_callback_impl(domain, op, callback, user_data); + const uint32_t op_end = get_op_end(domain); + for (uint32_t op = get_op_begin(domain); op < op_end; ++op) + roctracer_enable_callback_impl(domain, op, callback, user_data); API_METHOD_SUFFIX } @@ -860,9 +877,10 @@ PUBLIC_API roctracer_status_t roctracer_enable_callback( void* user_data) { API_METHOD_PREFIX - for (uint32_t domain = 0; domain < ACTIVITY_DOMAIN_NUMBER; domain++) { - const uint32_t op_num = get_op_num(domain); - for (uint32_t op = 0; op < op_num; op++) roctracer_enable_callback_impl(domain, op, callback, user_data); + for (uint32_t domain = 0; domain < ACTIVITY_DOMAIN_NUMBER; ++domain) { + const uint32_t op_end = get_op_end(domain); + for (uint32_t op = get_op_begin(domain); op < op_end; ++op) + roctracer_enable_callback_impl(domain, op, callback, user_data); } API_METHOD_SUFFIX } @@ -943,17 +961,19 @@ PUBLIC_API roctracer_status_t roctracer_disable_domain_callback( 
roctracer_domain_t domain) { API_METHOD_PREFIX - const uint32_t op_num = get_op_num(domain); - for (uint32_t op = 0; op < op_num; op++) roctracer_disable_callback_impl(domain, op); + const uint32_t op_end = get_op_end(domain); + for (uint32_t op = get_op_begin(domain); op < op_end; ++op) + roctracer_disable_callback_impl(domain, op); API_METHOD_SUFFIX } PUBLIC_API roctracer_status_t roctracer_disable_callback() { API_METHOD_PREFIX - for (uint32_t domain = 0; domain < ACTIVITY_DOMAIN_NUMBER; domain++) { - const uint32_t op_num = get_op_num(domain); - for (uint32_t op = 0; op < op_num; op++) roctracer_disable_callback_impl(domain, op); + for (uint32_t domain = 0; domain < ACTIVITY_DOMAIN_NUMBER; ++domain) { + const uint32_t op_end = get_op_end(domain); + for (uint32_t op = get_op_begin(domain); op < op_end; ++op) + roctracer_disable_callback_impl(domain, op); } API_METHOD_SUFFIX } @@ -1082,8 +1102,9 @@ PUBLIC_API roctracer_status_t roctracer_enable_domain_activity_expl( roctracer_pool_t* pool) { API_METHOD_PREFIX - const uint32_t op_num = get_op_num(domain); - for (uint32_t op = 0; op < op_num; op++) roctracer_enable_activity_impl(domain, op, pool); + const uint32_t op_end = get_op_end(domain); + for (uint32_t op = get_op_begin(domain); op < op_end; ++op) + roctracer_enable_activity_impl(domain, op, pool); API_METHOD_SUFFIX } @@ -1091,9 +1112,10 @@ PUBLIC_API roctracer_status_t roctracer_enable_activity_expl( roctracer_pool_t* pool) { API_METHOD_PREFIX - for (uint32_t domain = 0; domain < ACTIVITY_DOMAIN_NUMBER; domain++) { - const uint32_t op_num = get_op_num(domain); - for (uint32_t op = 0; op < op_num; op++) roctracer_enable_activity_impl(domain, op, pool); + for (uint32_t domain = 0; domain < ACTIVITY_DOMAIN_NUMBER; ++domain) { + const uint32_t op_end = get_op_end(domain); + for (uint32_t op = get_op_begin(domain); op < op_end; ++op) + roctracer_enable_activity_impl(domain, op, pool); } API_METHOD_SUFFIX } @@ -1165,17 +1187,19 @@ PUBLIC_API roctracer_status_t 
roctracer_disable_domain_activity( roctracer_domain_t domain) { API_METHOD_PREFIX - const uint32_t op_num = get_op_num(domain); - for (uint32_t op = 0; op < op_num; op++) roctracer_disable_activity_impl(domain, op); + const uint32_t op_end = get_op_end(domain); + for (uint32_t op = get_op_begin(domain); op < op_end; ++op) + roctracer_disable_activity_impl(domain, op); API_METHOD_SUFFIX } PUBLIC_API roctracer_status_t roctracer_disable_activity() { API_METHOD_PREFIX - for (uint32_t domain = 0; domain < ACTIVITY_DOMAIN_NUMBER; domain++) { - const uint32_t op_num = get_op_num(domain); - for (uint32_t op = 0; op < op_num; op++) roctracer_disable_activity_impl(domain, op); + for (uint32_t domain = 0; domain < ACTIVITY_DOMAIN_NUMBER; ++domain) { + const uint32_t op_end = get_op_end(domain); + for (uint32_t op = get_op_begin(domain); op < op_end; ++op) + roctracer_disable_activity_impl(domain, op); } API_METHOD_SUFFIX } diff --git a/test/tool/tracer_tool.cpp b/test/tool/tracer_tool.cpp index 42cb6ce4..608b40cf 100644 --- a/test/tool/tracer_tool.cpp +++ b/test/tool/tracer_tool.cpp @@ -1094,7 +1094,7 @@ extern "C" PUBLIC_API bool OnLoad(HsaApiTable* table, uint64_t runtime_version, hip_api_file_handle = open_output_file(output_prefix, "hip_api_trace.txt"); if (hip_api_vec.size() != 0) { for (unsigned i = 0; i < hip_api_vec.size(); ++i) { - uint32_t cid = HIP_API_ID_NUMBER; + uint32_t cid = HIP_API_ID_NONE; const char* api = hip_api_vec[i].c_str(); ROCTRACER_CALL(roctracer_op_code(ACTIVITY_DOMAIN_HIP_API, api, &cid, NULL)); ROCTRACER_CALL(roctracer_enable_op_callback(ACTIVITY_DOMAIN_HIP_API, cid, hip_api_callback, NULL)); @@ -1105,13 +1105,13 @@ extern "C" PUBLIC_API bool OnLoad(HsaApiTable* table, uint64_t runtime_version, } if (is_stats_opt) { - const char* path = NULL; - FILE* f = open_output_file(output_prefix, "hip_api_stats.csv", &path); + const char* path = NULL; + FILE* f = open_output_file(output_prefix, "hip_api_stats.csv", &path); hip_api_stats = new EvtStats(f, 
path); - for (uint32_t id = 0; id < HIP_API_ID_NUMBER; id += 1) { + for (uint32_t id = HIP_API_ID_FIRST; id <= HIP_API_ID_LAST; id += 1) { const char* label = roctracer_op_string(ACTIVITY_DOMAIN_HIP_API, id, 0); hip_api_stats->set_label(id, label); - } + } } } @@ -1120,11 +1120,11 @@ extern "C" PUBLIC_API bool OnLoad(HsaApiTable* table, uint64_t runtime_version, ROCTRACER_CALL(roctracer_enable_domain_activity(ACTIVITY_DOMAIN_HCC_OPS)); if (is_stats_opt) { - FILE* f = NULL; - const char* path = NULL; - f = open_output_file(output_prefix, "hip_kernel_stats.csv", &path); + FILE* f = NULL; + const char* path = NULL; + f = open_output_file(output_prefix, "hip_kernel_stats.csv", &path); hip_kernel_stats = new EvtStatsA(f, path); - f = open_output_file(output_prefix, "hip_memcpy_stats.csv", &path); + f = open_output_file(output_prefix, "hip_memcpy_stats.csv", &path); hip_memcpy_stats = new EvtStatsA(f, path); } } From f32619b8f109d48281bc1d684f93951e0589bd2e Mon Sep 17 00:00:00 2001 From: Ammar ELWazir Date: Wed, 27 Oct 2021 19:08:41 -0600 Subject: [PATCH 33/47] updating known issues --- README.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/README.md b/README.md index e700ee40..5dce1e8d 100644 --- a/README.md +++ b/README.md @@ -73,3 +73,8 @@ rocTX API: or make package && dpkg -i *.deb ``` + +## Known Issues: +- For workloads where the hip application might make more than 10 million HIP API calls, the application might crash with the error - "Profiling data corrupted" + - Suggested Workaround - Instead of profiling for the complete run, it is suggested to run profiling in parts by using the --trace-period option. +- OpenMP applications are not fully supported by the roctracer. 
From 38a1972edde07cc1be5e2cc37ea124287aaa46d3 Mon Sep 17 00:00:00 2001 From: Ammar ELWazir Date: Fri, 21 Jan 2022 12:10:53 -0600 Subject: [PATCH 34/47] SWDEV-318551: Adding License file for tracer Making the new License file, Adding support in the CMakeLists.txt Change-Id: I43862b8b7f3025ae6200aeb442ea70c7993a7349 --- CMakeLists.txt | 5 +++++ LICENSE | 4 ++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 128448c8..76be91d4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -184,6 +184,9 @@ set ( CPACK_PACKAGE_CONTACT "ROCm Profiler Support Date: Wed, 9 Feb 2022 14:21:12 -0600 Subject: [PATCH 35/47] Update README.md --- README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/README.md b/README.md index e700ee40..28838c19 100644 --- a/README.md +++ b/README.md @@ -62,7 +62,10 @@ rocTX API: - To build roctracer library: export CMAKE_BUILD_TYPE= # release by default + cd /roctracer && mkdir build && cd build && cmake -DCMAKE_INSTALL_PREFIX=/opt/rocm .. 
&& make -j + or + ./build.sh - To build and run test: make mytest From a19a826cd1c4b7c374cd7d6919835a54de47dd44 Mon Sep 17 00:00:00 2001 From: Ranjith Ramakrishnan Date: Mon, 14 Feb 2022 12:40:18 -0800 Subject: [PATCH 36/47] SWDEV-291455: Prefer rocm include path to hip include path Change-Id: I1fa96e72169fac689a3a2ed38e988d7f5d18bf04 (cherry picked from commit ebda880c4a11ef986134e46591120b1a695ad254) --- src/CMakeLists.txt | 2 +- test/CMakeLists.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 5bf02101..82250d87 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -24,7 +24,7 @@ set ( LIB_SRC ${LIB_DIR}/util/hsa_rsrc_factory.cpp ) add_library ( ${TARGET_LIB} ${LIBRARY_TYPE} ${LIB_SRC} ) -target_include_directories ( ${TARGET_LIB} PRIVATE ${LIB_DIR} ${ROOT_DIR} ${ROOT_DIR}/inc ${HSA_RUNTIME_INC_PATH} ${HIP_INC_DIR} ${HSA_KMT_INC_PATH} ${ROCM_INC_PATH} ${GEN_INC_DIR} ) +target_include_directories ( ${TARGET_LIB} PRIVATE ${LIB_DIR} ${ROOT_DIR} ${ROOT_DIR}/inc ${HSA_RUNTIME_INC_PATH} ${ROCM_INC_PATH} ${HIP_INC_DIR} ${HSA_KMT_INC_PATH} ${GEN_INC_DIR} ) target_link_libraries( ${TARGET_LIB} PRIVATE ${HSA_RUNTIME_LIB} c stdc++ ) # Build ROCTX tracing library diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 55a7edea..a14ee2ac 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -85,7 +85,7 @@ if ( DEFINED ROCTRACER_TARGET ) set ( TEST_LIB "tracer_tool" ) set ( TEST_LIB_SRC ${TEST_DIR}/tool/tracer_tool.cpp ${UTIL_SRC} ) add_library ( ${TEST_LIB} SHARED ${TEST_LIB_SRC} ) - target_include_directories ( ${TEST_LIB} PRIVATE ${HSA_TEST_DIR} ${ROOT_DIR} ${ROOT_DIR}/inc ${HSA_RUNTIME_INC_PATH} ${HIP_INC_DIR} ${HSA_KMT_INC_PATH} ${GEN_INC_DIR} ) + target_include_directories ( ${TEST_LIB} PRIVATE ${HSA_TEST_DIR} ${ROOT_DIR} ${ROOT_DIR}/inc ${HSA_RUNTIME_INC_PATH} ${ROCM_INC_PATH} ${HIP_INC_DIR} ${HSA_KMT_INC_PATH} ${GEN_INC_DIR} ) target_link_libraries ( ${TEST_LIB} ${ROCTRACER_TARGET} 
${HSA_RUNTIME_LIB} c stdc++ dl pthread rt numa ) install ( TARGETS ${TEST_LIB} LIBRARY DESTINATION ${DEST_NAME}/tool ) endif () From 75f74bb3bf0d40db2acc054540da7babddcc4642 Mon Sep 17 00:00:00 2001 From: Ammar ELWazir Date: Tue, 29 Mar 2022 22:27:25 +0000 Subject: [PATCH 37/47] SWDEV-307394: Fixing Download HSA-Class Issue Exchanging the git clone of the hsa-class to a local downloaded version pushed to the roctracer repo Change-Id: Id45a38b2d355102c2e0dee1e4bfde50398369047 (cherry picked from commit 7ee4f87b73c13cf8404ac11c7628661a01ef31e6) --- .gitignore | 1 - test/CMakeLists.txt | 3 - test/hsa/LICENSE | 20 + test/hsa/README.md | 4 + test/hsa/script/build_kernel.sh | 80 ++ test/hsa/src/hsa_rsrc_factory.cpp | 761 ++++++++++++++++++ test/hsa/src/hsa_rsrc_factory.h | 516 ++++++++++++ test/hsa/test/CMakeLists.txt | 64 ++ test/hsa/test/app/test.cpp | 86 ++ test/hsa/test/ctrl/run_kernel.h | 90 +++ test/hsa/test/ctrl/test_aql.h | 77 ++ test/hsa/test/ctrl/test_hsa.cpp | 279 +++++++ test/hsa/test/ctrl/test_hsa.h | 129 +++ test/hsa/test/ctrl/test_kernel.h | 138 ++++ test/hsa/test/dummy_kernel/dummy_kernel.cl | 28 + test/hsa/test/dummy_kernel/dummy_kernel.h | 71 ++ test/hsa/test/run.sh | 45 ++ .../simple_convolution/simple_convolution.cl | 76 ++ .../simple_convolution/simple_convolution.cpp | 388 +++++++++ .../simple_convolution/simple_convolution.h | 94 +++ test/hsa/test/util/evt_stats.h | 98 +++ test/hsa/test/util/helper_funcs.h | 86 ++ test/hsa/test/util/hsa_rsrc_factory.cpp | 1 + test/hsa/test/util/hsa_rsrc_factory.h | 1 + test/hsa/test/util/perf_timer.cpp | 179 ++++ test/hsa/test/util/perf_timer.h | 83 ++ test/hsa/test/util/test_assert.h | 35 + test/hsa/test/util/xml.h | 457 +++++++++++ 28 files changed, 3886 insertions(+), 4 deletions(-) create mode 100644 test/hsa/LICENSE create mode 100644 test/hsa/README.md create mode 100755 test/hsa/script/build_kernel.sh create mode 100644 test/hsa/src/hsa_rsrc_factory.cpp create mode 100644 test/hsa/src/hsa_rsrc_factory.h 
create mode 100644 test/hsa/test/CMakeLists.txt create mode 100644 test/hsa/test/app/test.cpp create mode 100644 test/hsa/test/ctrl/run_kernel.h create mode 100644 test/hsa/test/ctrl/test_aql.h create mode 100644 test/hsa/test/ctrl/test_hsa.cpp create mode 100644 test/hsa/test/ctrl/test_hsa.h create mode 100644 test/hsa/test/ctrl/test_kernel.h create mode 100644 test/hsa/test/dummy_kernel/dummy_kernel.cl create mode 100644 test/hsa/test/dummy_kernel/dummy_kernel.h create mode 100755 test/hsa/test/run.sh create mode 100644 test/hsa/test/simple_convolution/simple_convolution.cl create mode 100644 test/hsa/test/simple_convolution/simple_convolution.cpp create mode 100644 test/hsa/test/simple_convolution/simple_convolution.h create mode 100644 test/hsa/test/util/evt_stats.h create mode 100644 test/hsa/test/util/helper_funcs.h create mode 120000 test/hsa/test/util/hsa_rsrc_factory.cpp create mode 120000 test/hsa/test/util/hsa_rsrc_factory.h create mode 100644 test/hsa/test/util/perf_timer.cpp create mode 100644 test/hsa/test/util/perf_timer.h create mode 100644 test/hsa/test/util/test_assert.h create mode 100644 test/hsa/test/util/xml.h diff --git a/.gitignore b/.gitignore index 331d63fd..ef6bb1de 100644 --- a/.gitignore +++ b/.gitignore @@ -5,6 +5,5 @@ *.swp *.Po build -test/hsa test/MatrixTranspose/MatrixTranspose test/MatrixTranspose_test/MatrixTranspose diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index a14ee2ac..17a54c80 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -44,7 +44,6 @@ endif () ## Path to HSA test set ( HSA_TEST_DIR "${TEST_DIR}/hsa/test" ) -set ( HSA_REV "f8b3870" ) ## test run script set ( RUN_SCRIPT "${TEST_DIR}/run.sh" ) @@ -70,8 +69,6 @@ add_custom_target( mytest ) ## Build HSA test -execute_process ( COMMAND sh -xc "if [ ! 
-e ${TEST_DIR}/hsa ] ; then git clone https://github.com/ROCmSoftwarePlatform/hsa-class.git ${TEST_DIR}/hsa; fi" ) -execute_process ( COMMAND sh -xc "if [ -e ${TEST_DIR}/hsa ] ; then cd ${TEST_DIR}/hsa && git fetch origin && git checkout ${HSA_REV}; fi" ) set ( TMP ${TEST_DIR} ) set ( TEST_DIR ${HSA_TEST_DIR} ) add_subdirectory ( ${HSA_TEST_DIR} ${PROJECT_BINARY_DIR}/test/hsa ) diff --git a/test/hsa/LICENSE b/test/hsa/LICENSE new file mode 100644 index 00000000..597d1b16 --- /dev/null +++ b/test/hsa/LICENSE @@ -0,0 +1,20 @@ +Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. +[MITx11 license] + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
diff --git a/test/hsa/README.md b/test/hsa/README.md new file mode 100644 index 00000000..20e09157 --- /dev/null +++ b/test/hsa/README.md @@ -0,0 +1,4 @@ +# HSA-class +``` +HSA high level C++ API +``` diff --git a/test/hsa/script/build_kernel.sh b/test/hsa/script/build_kernel.sh new file mode 100755 index 00000000..2007e7ab --- /dev/null +++ b/test/hsa/script/build_kernel.sh @@ -0,0 +1,80 @@ +#!/bin/sh -x +SO_EXT="hsaco" + +TEST_NAME=$1 +DST_DIR=$2 +ROCM_DIR=$3 +TGT_LIST=$4 + +if [ -z "$TEST_NAME" ] ; then + echo "Usage: $0 " + echo " Will look for .cl and will build .$SO_EXT dynamic code object library" + exit 1 +fi +OBJ_NAME=$(echo "_$(basename $TEST_NAME)" | sed -e 's/_./\U&\E/g' -e 's/_//g') + +if [ -z "$DST_DIR" ] ; then + DST_DIR=$(dirname TEST_NAME) +fi + +if [ -z "$ROCM_DIR" ] ; then + ROCM_DIR=/opt/rocm +fi + +if [ -z "$TGT_LIST" ] ; then + TGT_LIST=`$ROCM_DIR/bin/rocminfo | grep "amdgcn-amd-amdhsa--" | head -n 1 | sed -n "s/^.*amdgcn-amd-amdhsa--\(\w*\).*$/\1/p"` +fi + +if [ -z "$TGT_LIST" ] ; then + echo "Error: GPU targets not found" + exit 1 +fi + +OCL_VER="2.0" + +if [ -e $ROCM_DIR/llvm ] ; then + LLVM_DIR=$ROCM_DIR/llvm + LIB_DIR=$ROCM_DIR/lib +else + LLVM_DIR=$ROCM_DIR/hcc + LIB_DIR=$LLVM_DIR/lib +fi + +# Determine whether using new or old device-libs layout +if [ -e $LIB_DIR/bitcode/opencl.amdgcn.bc ]; then + BC_DIR=$LIB_DIR/bitcode +elif [ -e $LIB_DIR/opencl.amdgcn.bc ]; then + BC_DIR=$LIB_DIR +elif [ -e $ROCM_DIR/amdgcn/bitcode/opencl.bc ]; then + BC_DIR=$ROCM_DIR/amdgcn/bitcode +else + echo "Error: Cannot find amdgcn bitcode directory" + exit 1 +fi + +CLANG_ROOT=$LLVM_DIR/lib/clang +CLANG_DIR=`ls -d $CLANG_ROOT/* | head -n 1` +if [ "$CLANG_DIR" = "" ] ; then + echo "Error: LLVM clang library was not found" + exit 1 +fi + +BIN_DIR=$LLVM_DIR/bin +INC_DIR=$CLANG_DIR/include +if [ -e $BC_DIR/opencl.amdgcn.bc ]; then + BITCODE_OPTS="-nogpulib \ + -Xclang -mlink-bitcode-file -Xclang $BC_DIR/opencl.amdgcn.bc \ + -Xclang -mlink-bitcode-file -Xclang 
$BC_DIR/ockl.amdgcn.bc \ + -Xclang -mlink-bitcode-file -Xclang $BC_DIR/ocml.amdgcn.bc" +else + BITCODE_OPTS="--hip-device-lib-path=$BC_DIR" +fi + +for GFXIP in $TGT_LIST ; do + OBJ_PREF=$GFXIP + OBJ_FILE="${OBJ_PREF}_${OBJ_NAME}.$SO_EXT" + $BIN_DIR/clang -cl-std=CL$OCL_VER -include $INC_DIR/opencl-c.h $BITCODE_OPTS -target amdgcn-amd-amdhsa -mcpu=$GFXIP $TEST_NAME.cl -o $DST_DIR/$OBJ_FILE + echo "'$OBJ_FILE' generated" +done + +exit 0 diff --git a/test/hsa/src/hsa_rsrc_factory.cpp b/test/hsa/src/hsa_rsrc_factory.cpp new file mode 100644 index 00000000..d2d8e79e --- /dev/null +++ b/test/hsa/src/hsa_rsrc_factory.cpp @@ -0,0 +1,761 @@ +/********************************************************************** +Copyright ©2013 Advanced Micro Devices, Inc. All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, are permitted +provided that the following conditions are met: + +• Redistributions of source code must retain the above copyright notice, this list of +conditions and the following disclaimer. +• Redistributions in binary form must reproduce the above copyright notice, this list of +conditions and the following disclaimer in the documentation and/or + other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR +IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT +SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY + DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +********************************************************************/ + +#include "util/hsa_rsrc_factory.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +// Callback function to get available in the system agents +hsa_status_t HsaRsrcFactory::GetHsaAgentsCallback(hsa_agent_t agent, void* data) { + hsa_status_t status = HSA_STATUS_ERROR; + HsaRsrcFactory* hsa_rsrc = reinterpret_cast(data); + const AgentInfo* agent_info = hsa_rsrc->AddAgentInfo(agent); + if (agent_info != NULL) status = HSA_STATUS_SUCCESS; + return status; +} + +// This function checks to see if the provided +// pool has the HSA_AMD_SEGMENT_GLOBAL property. If the kern_arg flag is true, +// the function adds an additional requirement that the pool have the +// HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT property. If kern_arg is false, +// pools must NOT have this property. +// Upon finding a pool that meets these conditions, HSA_STATUS_INFO_BREAK is +// returned. HSA_STATUS_SUCCESS is returned if no errors were encountered, but +// no pool was found meeting the requirements. If an error is encountered, we +// return that error. 
+static hsa_status_t FindGlobalPool(hsa_amd_memory_pool_t pool, void* data, bool kern_arg) { + hsa_status_t err; + hsa_amd_segment_t segment; + uint32_t flag; + + if (nullptr == data) { + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } + + err = HsaRsrcFactory::HsaApi()->hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &segment); + CHECK_STATUS("hsa_amd_memory_pool_get_info", err); + if (HSA_AMD_SEGMENT_GLOBAL != segment) { + return HSA_STATUS_SUCCESS; + } + + err = HsaRsrcFactory::HsaApi()->hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &flag); + CHECK_STATUS("hsa_amd_memory_pool_get_info", err); + + uint32_t karg_st = flag & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT; + + if ((karg_st == 0 && kern_arg) || (karg_st != 0 && !kern_arg)) { + return HSA_STATUS_SUCCESS; + } + + *(reinterpret_cast(data)) = pool; + return HSA_STATUS_INFO_BREAK; +} + +// This is the call-back function for hsa_amd_agent_iterate_memory_pools() that +// finds a pool with the properties of HSA_AMD_SEGMENT_GLOBAL and that is NOT +// HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT +hsa_status_t FindStandardPool(hsa_amd_memory_pool_t pool, void* data) { + return FindGlobalPool(pool, data, false); +} + +// This is the call-back function for hsa_amd_agent_iterate_memory_pools() that +// finds a pool with the properties of HSA_AMD_SEGMENT_GLOBAL and that IS +// HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT +hsa_status_t FindKernArgPool(hsa_amd_memory_pool_t pool, void* data) { + return FindGlobalPool(pool, data, true); +} + +// Constructor of the class +HsaRsrcFactory::HsaRsrcFactory(bool initialize_hsa) : initialize_hsa_(initialize_hsa) { + hsa_status_t status; + + cpu_pool_ = NULL; + kern_arg_pool_ = NULL; + + InitHsaApiTable(NULL); + + // Initialize the Hsa Runtime + if (initialize_hsa_) { + status = hsa_api_.hsa_init(); + CHECK_STATUS("Error in hsa_init", status); + } + + // Discover the set of Gpu devices available on the platform + status = 
hsa_api_.hsa_iterate_agents(GetHsaAgentsCallback, this); + CHECK_STATUS("Error Calling hsa_iterate_agents", status); + if (cpu_pool_ == NULL) CHECK_STATUS("CPU memory pool is not found", HSA_STATUS_ERROR); + if (kern_arg_pool_ == NULL) CHECK_STATUS("Kern-arg memory pool is not found", HSA_STATUS_ERROR); + + // Get AqlProfile API table + aqlprofile_api_ = {0}; +#ifdef ROCP_LD_AQLPROFILE + status = LoadAqlProfileLib(&aqlprofile_api_); +#else + status = hsa_api_.hsa_system_get_major_extension_table(HSA_EXTENSION_AMD_AQLPROFILE, hsa_ven_amd_aqlprofile_VERSION_MAJOR, sizeof(aqlprofile_api_), &aqlprofile_api_); +#endif + CHECK_STATUS("aqlprofile API table load failed", status); + + // Get Loader API table + loader_api_ = {0}; + status = hsa_api_.hsa_system_get_major_extension_table(HSA_EXTENSION_AMD_LOADER, 1, sizeof(loader_api_), &loader_api_); + CHECK_STATUS("loader API table query failed", status); + + // Instantiate HSA timer + timer_ = new HsaTimer(&hsa_api_); + CHECK_STATUS("HSA timer allocation failed", + (timer_ == NULL) ? HSA_STATUS_ERROR : HSA_STATUS_SUCCESS); + + // Time correlation + const uint32_t corr_iters = 1000; + CorrelateTime(HsaTimer::TIME_ID_CLOCK_REALTIME, corr_iters); + CorrelateTime(HsaTimer::TIME_ID_CLOCK_MONOTONIC, corr_iters); + + // System timeout + timeout_ = (timeout_ns_ == HsaTimer::TIMESTAMP_MAX) ? 
timeout_ns_ : timer_->ns_to_sysclock(timeout_ns_); +} + +// Destructor of the class +HsaRsrcFactory::~HsaRsrcFactory() { + delete timer_; + for (auto p : cpu_list_) delete p; + for (auto p : gpu_list_) delete p; + if (initialize_hsa_) { + hsa_status_t status = hsa_api_.hsa_shut_down(); + CHECK_STATUS("Error in hsa_shut_down", status); + } +} + +void HsaRsrcFactory::InitHsaApiTable(HsaApiTable* table) { + std::lock_guard lck(mutex_); + + if (hsa_api_.hsa_init == NULL) { + if (table != NULL) { + hsa_api_.hsa_init = table->core_->hsa_init_fn; + hsa_api_.hsa_shut_down = table->core_->hsa_shut_down_fn; + hsa_api_.hsa_agent_get_info = table->core_->hsa_agent_get_info_fn; + hsa_api_.hsa_iterate_agents = table->core_->hsa_iterate_agents_fn; + + hsa_api_.hsa_queue_create = table->core_->hsa_queue_create_fn; + hsa_api_.hsa_queue_destroy = table->core_->hsa_queue_destroy_fn; + hsa_api_.hsa_queue_load_write_index_relaxed = table->core_->hsa_queue_load_write_index_relaxed_fn; + hsa_api_.hsa_queue_store_write_index_relaxed = table->core_->hsa_queue_store_write_index_relaxed_fn; + hsa_api_.hsa_queue_load_read_index_relaxed = table->core_->hsa_queue_load_read_index_relaxed_fn; + + hsa_api_.hsa_signal_create = table->core_->hsa_signal_create_fn; + hsa_api_.hsa_signal_destroy = table->core_->hsa_signal_destroy_fn; + hsa_api_.hsa_signal_load_relaxed = table->core_->hsa_signal_load_relaxed_fn; + hsa_api_.hsa_signal_store_relaxed = table->core_->hsa_signal_store_relaxed_fn; + hsa_api_.hsa_signal_wait_scacquire = table->core_->hsa_signal_wait_scacquire_fn; + hsa_api_.hsa_signal_store_screlease = table->core_->hsa_signal_store_screlease_fn; + + hsa_api_.hsa_code_object_reader_create_from_file = table->core_->hsa_code_object_reader_create_from_file_fn; + hsa_api_.hsa_executable_create_alt = table->core_->hsa_executable_create_alt_fn; + hsa_api_.hsa_executable_load_agent_code_object = table->core_->hsa_executable_load_agent_code_object_fn; + hsa_api_.hsa_executable_freeze = 
table->core_->hsa_executable_freeze_fn; + hsa_api_.hsa_executable_get_symbol = table->core_->hsa_executable_get_symbol_fn; + hsa_api_.hsa_executable_symbol_get_info = table->core_->hsa_executable_symbol_get_info_fn; + hsa_api_.hsa_executable_iterate_symbols = table->core_->hsa_executable_iterate_symbols_fn; + + hsa_api_.hsa_system_get_info = table->core_->hsa_system_get_info_fn; + hsa_api_.hsa_system_get_major_extension_table = table->core_->hsa_system_get_major_extension_table_fn; + + hsa_api_.hsa_amd_agent_iterate_memory_pools = table->amd_ext_->hsa_amd_agent_iterate_memory_pools_fn; + hsa_api_.hsa_amd_memory_pool_get_info = table->amd_ext_->hsa_amd_memory_pool_get_info_fn; + hsa_api_.hsa_amd_memory_pool_allocate = table->amd_ext_->hsa_amd_memory_pool_allocate_fn; + hsa_api_.hsa_amd_agents_allow_access = table->amd_ext_->hsa_amd_agents_allow_access_fn; + hsa_api_.hsa_amd_memory_async_copy = table->amd_ext_->hsa_amd_memory_async_copy_fn; + + hsa_api_.hsa_amd_signal_async_handler = table->amd_ext_->hsa_amd_signal_async_handler_fn; + hsa_api_.hsa_amd_profiling_set_profiler_enabled = table->amd_ext_->hsa_amd_profiling_set_profiler_enabled_fn; + hsa_api_.hsa_amd_profiling_get_async_copy_time = table->amd_ext_->hsa_amd_profiling_get_async_copy_time_fn; + hsa_api_.hsa_amd_profiling_get_dispatch_time = table->amd_ext_->hsa_amd_profiling_get_dispatch_time_fn; + } else { + hsa_api_.hsa_init = hsa_init; + hsa_api_.hsa_shut_down = hsa_shut_down; + hsa_api_.hsa_agent_get_info = hsa_agent_get_info; + hsa_api_.hsa_iterate_agents = hsa_iterate_agents; + + hsa_api_.hsa_queue_create = hsa_queue_create; + hsa_api_.hsa_queue_destroy = hsa_queue_destroy; + hsa_api_.hsa_queue_load_write_index_relaxed = hsa_queue_load_write_index_relaxed; + hsa_api_.hsa_queue_store_write_index_relaxed = hsa_queue_store_write_index_relaxed; + hsa_api_.hsa_queue_load_read_index_relaxed = hsa_queue_load_read_index_relaxed; + + hsa_api_.hsa_signal_create = hsa_signal_create; + hsa_api_.hsa_signal_destroy = 
hsa_signal_destroy; + hsa_api_.hsa_signal_load_relaxed = hsa_signal_load_relaxed; + hsa_api_.hsa_signal_store_relaxed = hsa_signal_store_relaxed; + hsa_api_.hsa_signal_wait_scacquire = hsa_signal_wait_scacquire; + hsa_api_.hsa_signal_store_screlease = hsa_signal_store_screlease; + + hsa_api_.hsa_code_object_reader_create_from_file = hsa_code_object_reader_create_from_file; + hsa_api_.hsa_executable_create_alt = hsa_executable_create_alt; + hsa_api_.hsa_executable_load_agent_code_object = hsa_executable_load_agent_code_object; + hsa_api_.hsa_executable_freeze = hsa_executable_freeze; + hsa_api_.hsa_executable_get_symbol = hsa_executable_get_symbol; + hsa_api_.hsa_executable_symbol_get_info = hsa_executable_symbol_get_info; + hsa_api_.hsa_executable_iterate_symbols = hsa_executable_iterate_symbols; + + hsa_api_.hsa_system_get_info = hsa_system_get_info; + hsa_api_.hsa_system_get_major_extension_table = hsa_system_get_major_extension_table; + + hsa_api_.hsa_amd_agent_iterate_memory_pools = hsa_amd_agent_iterate_memory_pools; + hsa_api_.hsa_amd_memory_pool_get_info = hsa_amd_memory_pool_get_info; + hsa_api_.hsa_amd_memory_pool_allocate = hsa_amd_memory_pool_allocate; + hsa_api_.hsa_amd_agents_allow_access = hsa_amd_agents_allow_access; + hsa_api_.hsa_amd_memory_async_copy = hsa_amd_memory_async_copy; + + hsa_api_.hsa_amd_signal_async_handler = hsa_amd_signal_async_handler; + hsa_api_.hsa_amd_profiling_set_profiler_enabled = hsa_amd_profiling_set_profiler_enabled; + hsa_api_.hsa_amd_profiling_get_async_copy_time = hsa_amd_profiling_get_async_copy_time; + hsa_api_.hsa_amd_profiling_get_dispatch_time = hsa_amd_profiling_get_dispatch_time; + } + } +} + +hsa_status_t HsaRsrcFactory::LoadAqlProfileLib(aqlprofile_pfn_t* api) { + void* handle = dlopen(kAqlProfileLib, RTLD_NOW); + if (handle == NULL) { + fprintf(stderr, "Loading '%s' failed, %s\n", kAqlProfileLib, dlerror()); + return HSA_STATUS_ERROR; + } + dlerror(); /* Clear any existing error */ + + 
api->hsa_ven_amd_aqlprofile_error_string = + (decltype(::hsa_ven_amd_aqlprofile_error_string)*)dlsym( + handle, "hsa_ven_amd_aqlprofile_error_string"); + api->hsa_ven_amd_aqlprofile_validate_event = + (decltype(::hsa_ven_amd_aqlprofile_validate_event)*)dlsym( + handle, "hsa_ven_amd_aqlprofile_validate_event"); + api->hsa_ven_amd_aqlprofile_start = + (decltype(::hsa_ven_amd_aqlprofile_start)*)dlsym(handle, "hsa_ven_amd_aqlprofile_start"); + api->hsa_ven_amd_aqlprofile_stop = + (decltype(::hsa_ven_amd_aqlprofile_stop)*)dlsym(handle, "hsa_ven_amd_aqlprofile_stop"); +#ifdef AQLPROF_NEW_API + api->hsa_ven_amd_aqlprofile_read = + (decltype(::hsa_ven_amd_aqlprofile_read)*)dlsym(handle, "hsa_ven_amd_aqlprofile_read"); +#endif + api->hsa_ven_amd_aqlprofile_legacy_get_pm4 = + (decltype(::hsa_ven_amd_aqlprofile_legacy_get_pm4)*)dlsym( + handle, "hsa_ven_amd_aqlprofile_legacy_get_pm4"); + api->hsa_ven_amd_aqlprofile_get_info = (decltype(::hsa_ven_amd_aqlprofile_get_info)*)dlsym( + handle, "hsa_ven_amd_aqlprofile_get_info"); + api->hsa_ven_amd_aqlprofile_iterate_data = + (decltype(::hsa_ven_amd_aqlprofile_iterate_data)*)dlsym( + handle, "hsa_ven_amd_aqlprofile_iterate_data"); + + return HSA_STATUS_SUCCESS; +} + +// Add system agent info +const AgentInfo* HsaRsrcFactory::AddAgentInfo(const hsa_agent_t agent) { + // Determine if device is a Gpu agent + hsa_status_t status; + AgentInfo* agent_info = NULL; + + hsa_device_type_t type; + status = hsa_api_.hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &type); + CHECK_STATUS("Error Calling hsa_agent_get_info", status); + + if (type == HSA_DEVICE_TYPE_CPU) { + agent_info = new AgentInfo{}; + agent_info->dev_id = agent; + agent_info->dev_type = HSA_DEVICE_TYPE_CPU; + agent_info->dev_index = cpu_list_.size(); + + status = hsa_api_.hsa_amd_agent_iterate_memory_pools(agent, FindStandardPool, &agent_info->cpu_pool); + if ((status == HSA_STATUS_INFO_BREAK) && (cpu_pool_ == NULL)) cpu_pool_ = &agent_info->cpu_pool; + status = 
hsa_api_.hsa_amd_agent_iterate_memory_pools(agent, FindKernArgPool, &agent_info->kern_arg_pool); + if ((status == HSA_STATUS_INFO_BREAK) && (kern_arg_pool_ == NULL)) kern_arg_pool_ = &agent_info->kern_arg_pool; + agent_info->gpu_pool = {}; + + cpu_list_.push_back(agent_info); + cpu_agents_.push_back(agent); + } + + if (type == HSA_DEVICE_TYPE_GPU) { + agent_info = new AgentInfo{}; + agent_info->dev_id = agent; + agent_info->dev_type = HSA_DEVICE_TYPE_GPU; + hsa_api_.hsa_agent_get_info(agent, HSA_AGENT_INFO_NAME, agent_info->name); + strncpy(agent_info->gfxip, agent_info->name, 4); + agent_info->gfxip[4] = '\0'; + hsa_api_.hsa_agent_get_info(agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &agent_info->max_wave_size); + hsa_api_.hsa_agent_get_info(agent, HSA_AGENT_INFO_QUEUE_MAX_SIZE, &agent_info->max_queue_size); + hsa_api_.hsa_agent_get_info(agent, HSA_AGENT_INFO_PROFILE, &agent_info->profile); + agent_info->is_apu = (agent_info->profile == HSA_PROFILE_FULL) ? true : false; + hsa_api_.hsa_agent_get_info(agent, static_cast(HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT), + &agent_info->cu_num); + hsa_api_.hsa_agent_get_info(agent, static_cast(HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU), + &agent_info->waves_per_cu); + hsa_api_.hsa_agent_get_info(agent, static_cast(HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU), + &agent_info->simds_per_cu); + hsa_api_.hsa_agent_get_info(agent, static_cast(HSA_AMD_AGENT_INFO_NUM_SHADER_ENGINES), + &agent_info->se_num); + hsa_api_.hsa_agent_get_info(agent, + static_cast(HSA_AMD_AGENT_INFO_NUM_SHADER_ARRAYS_PER_SE), + &agent_info->shader_arrays_per_se); + + agent_info->cpu_pool = {}; + agent_info->kern_arg_pool = {}; + status = hsa_api_.hsa_amd_agent_iterate_memory_pools(agent, FindStandardPool, &agent_info->gpu_pool); + CHECK_ITER_STATUS("hsa_amd_agent_iterate_memory_pools(gpu pool)", status); + + // GFX8 and GFX9 SGPR/VGPR block sizes + agent_info->sgpr_block_dflt = (strcmp(agent_info->gfxip, "gfx8") == 0) ? 
1 : 2; + agent_info->sgpr_block_size = 8; + agent_info->vgpr_block_size = 4; + + // Set GPU index + agent_info->dev_index = gpu_list_.size(); + gpu_list_.push_back(agent_info); + gpu_agents_.push_back(agent); + } + + if (agent_info) agent_map_[agent.handle] = agent_info; + + return agent_info; +} + +// Return systen agent info +const AgentInfo* HsaRsrcFactory::GetAgentInfo(const hsa_agent_t agent) { + const AgentInfo* agent_info = NULL; + auto it = agent_map_.find(agent.handle); + if (it != agent_map_.end()) { + agent_info = it->second; + } + return agent_info; +} + +// Get the count of Hsa Gpu Agents available on the platform +// +// @return uint32_t Number of Gpu agents on platform +// +uint32_t HsaRsrcFactory::GetCountOfGpuAgents() { return uint32_t(gpu_list_.size()); } + +// Get the count of Hsa Cpu Agents available on the platform +// +// @return uint32_t Number of Cpu agents on platform +// +uint32_t HsaRsrcFactory::GetCountOfCpuAgents() { return uint32_t(cpu_list_.size()); } + +// Get the AgentInfo handle of a Gpu device +// +// @param idx Gpu Agent at specified index +// +// @param agent_info Output parameter updated with AgentInfo +// +// @return bool true if successful, false otherwise +// +bool HsaRsrcFactory::GetGpuAgentInfo(uint32_t idx, const AgentInfo** agent_info) { + // Determine if request is valid + uint32_t size = uint32_t(gpu_list_.size()); + if (idx >= size) { + return false; + } + + // Copy AgentInfo from specified index + *agent_info = gpu_list_[idx]; + + return true; +} + +// Get the AgentInfo handle of a Cpu device +// +// @param idx Cpu Agent at specified index +// +// @param agent_info Output parameter updated with AgentInfo +// +// @return bool true if successful, false otherwise +// +bool HsaRsrcFactory::GetCpuAgentInfo(uint32_t idx, const AgentInfo** agent_info) { + // Determine if request is valid + uint32_t size = uint32_t(cpu_list_.size()); + if (idx >= size) { + return false; + } + + // Copy AgentInfo from specified index + 
*agent_info = cpu_list_[idx]; + return true; +} + +// Create a Queue object and return its handle. The queue object is expected +// to support user requested number of Aql dispatch packets. +// +// @param agent_info Gpu Agent on which to create a queue object +// +// @param num_Pkts Number of packets to be held by queue +// +// @param queue Output parameter updated with handle of queue object +// +// @return bool true if successful, false otherwise +// +bool HsaRsrcFactory::CreateQueue(const AgentInfo* agent_info, uint32_t num_pkts, + hsa_queue_t** queue) { + hsa_status_t status; + status = hsa_api_.hsa_queue_create(agent_info->dev_id, num_pkts, HSA_QUEUE_TYPE_MULTI, NULL, NULL, + UINT32_MAX, UINT32_MAX, queue); + return (status == HSA_STATUS_SUCCESS); +} + +// Create a Signal object and return its handle. +// @param value Initial value of signal object +// @param signal Output parameter updated with handle of signal object +// @return bool true if successful, false otherwise +bool HsaRsrcFactory::CreateSignal(uint32_t value, hsa_signal_t* signal) { + hsa_status_t status; + status = hsa_api_.hsa_signal_create(value, 0, NULL, signal); + return (status == HSA_STATUS_SUCCESS); +} + +// Allocate memory for use by a kernel of specified size in specified +// agent's memory region. +// @param agent_info Agent from whose memory region to allocate +// @param size Size of memory in terms of bytes +// @return uint8_t* Pointer to buffer, null if allocation fails. +uint8_t* HsaRsrcFactory::AllocateLocalMemory(const AgentInfo* agent_info, size_t size) { + hsa_status_t status = HSA_STATUS_ERROR; + uint8_t* buffer = NULL; + size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK; + status = hsa_api_.hsa_amd_memory_pool_allocate(agent_info->gpu_pool, size, 0, reinterpret_cast(&buffer)); + uint8_t* ptr = (status == HSA_STATUS_SUCCESS) ? buffer : NULL; + return ptr; +} + +// Allocate memory to pass kernel parameters. 
+// Memory is alocated accessible for all CPU agents and for GPU given by AgentInfo parameter. +// @param agent_info Agent from whose memory region to allocate +// @param size Size of memory in terms of bytes +// @return uint8_t* Pointer to buffer, null if allocation fails. +uint8_t* HsaRsrcFactory::AllocateKernArgMemory(const AgentInfo* agent_info, size_t size) { + hsa_status_t status = HSA_STATUS_ERROR; + uint8_t* buffer = NULL; + if (!cpu_agents_.empty()) { + size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK; + status = hsa_api_.hsa_amd_memory_pool_allocate(*kern_arg_pool_, size, 0, reinterpret_cast(&buffer)); + // Both the CPU and GPU can access the kernel arguments + if (status == HSA_STATUS_SUCCESS) { + hsa_agent_t ag_list[1] = {agent_info->dev_id}; + status = hsa_api_.hsa_amd_agents_allow_access(1, ag_list, NULL, buffer); + } + } + uint8_t* ptr = (status == HSA_STATUS_SUCCESS) ? buffer : NULL; + return ptr; +} + +// Allocate system memory accessible by both CPU and GPU +// @param agent_info Agent from whose memory region to allocate +// @param size Size of memory in terms of bytes +// @return uint8_t* Pointer to buffer, null if allocation fails. +uint8_t* HsaRsrcFactory::AllocateSysMemory(const AgentInfo* agent_info, size_t size) { + hsa_status_t status = HSA_STATUS_ERROR; + uint8_t* buffer = NULL; + size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK; + if (!cpu_agents_.empty()) { + status = hsa_api_.hsa_amd_memory_pool_allocate(*cpu_pool_, size, 0, reinterpret_cast(&buffer)); + // Both the CPU and GPU can access the memory + if (status == HSA_STATUS_SUCCESS) { + hsa_agent_t ag_list[1] = {agent_info->dev_id}; + status = hsa_api_.hsa_amd_agents_allow_access(1, ag_list, NULL, buffer); + } + } + uint8_t* ptr = (status == HSA_STATUS_SUCCESS) ? buffer : NULL; + return ptr; +} + +// Allocate memory for command buffer. 
+// @param agent_info Agent from whose memory region to allocate
+// @param size Size of memory in terms of bytes
+// @return uint8_t* Pointer to buffer, null if allocation fails.
+uint8_t* HsaRsrcFactory::AllocateCmdMemory(const AgentInfo* agent_info, size_t size) {
+  // Round the request up to a whole number of pages.
+  size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK;
+  // Every agent other than an APU with CMD_MEMORY_MMAP enabled uses the system-memory pool.
+  if (!(agent_info->is_apu && CMD_MEMORY_MMAP)) return AllocateSysMemory(agent_info, size);
+
+  // Anonymous mappings take fd == -1 for portability, and mmap() reports failure as
+  // MAP_FAILED rather than NULL, so translate it to the documented null return.
+  void* map = mmap(NULL, size, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_SHARED | MAP_ANONYMOUS, -1, 0);
+  return (map == MAP_FAILED) ? NULL : reinterpret_cast<uint8_t*>(map);
+}
+
+// Wait signal
+// Blocks until the wait returns 'signal_value - 1', which is then returned.
+// A return equal to 'signal_value' restarts the wait; any other value is reported and aborts.
+hsa_signal_value_t HsaRsrcFactory::SignalWait(const hsa_signal_t& signal, const hsa_signal_value_t& signal_value) const {
+  const hsa_signal_value_t exp_value = signal_value - 1;
+  hsa_signal_value_t ret_value = signal_value;
+  while (1) {
+    ret_value =
+      hsa_api_.hsa_signal_wait_scacquire(signal, HSA_SIGNAL_CONDITION_LT, signal_value, timeout_, HSA_WAIT_STATE_BLOCKED);
+    if (ret_value == exp_value) break;
+    if (ret_value != signal_value) {
+      std::cerr << "Error: HsaRsrcFactory::SignalWait: signal_value(" << signal_value
+                << "), ret_value(" << ret_value << ")" << std::endl << std::flush;
+      abort();
+    }
+  }
+  return ret_value;
+}
+
+// Wait signal with signal value restore
+// After the wait completes the signal is stored back to 'signal_value'.
+void HsaRsrcFactory::SignalWaitRestore(const hsa_signal_t& signal, const hsa_signal_value_t& signal_value) const {
+  SignalWait(signal, signal_value);
+  hsa_api_.hsa_signal_store_relaxed(const_cast<hsa_signal_t&>(signal), signal_value);
+}
+
+// Copy data from GPU to host memory
+bool HsaRsrcFactory::Memcpy(const hsa_agent_t& agent, void* dst, const void* src, size_t size) {
+  hsa_status_t status = HSA_STATUS_ERROR;
+  if (!cpu_agents_.empty()) {
+    hsa_signal_t s = {};
+    status = hsa_api_.hsa_signal_create(1, 0, NULL, &s);
+    CHECK_STATUS("hsa_signal_create()", status);
+    status = hsa_api_.hsa_amd_memory_async_copy(dst, cpu_agents_[0], src, agent, size, 0, NULL, s);
+    CHECK_STATUS("hsa_amd_memory_async_copy()", status);
+    SignalWait(s, 1);
+    status =
hsa_api_.hsa_signal_destroy(s); + CHECK_STATUS("hsa_signal_destroy()", status); + } + return (status == HSA_STATUS_SUCCESS); +} +bool HsaRsrcFactory::Memcpy(const AgentInfo* agent_info, void* dst, const void* src, size_t size) { + return Memcpy(agent_info->dev_id, dst, src, size); +} + +// Memory free method +bool HsaRsrcFactory::FreeMemory(void* ptr) { + const hsa_status_t status = hsa_memory_free(ptr); + CHECK_STATUS("hsa_memory_free", status); + return (status == HSA_STATUS_SUCCESS); +} + +// Loads an Assembled Brig file and Finalizes it into Device Isa +// @param agent_info Gpu device for which to finalize +// @param brig_path File path of the Assembled Brig file +// @param kernel_name Name of the kernel to finalize +// @param code_desc Handle of finalized Code Descriptor that could +// be used to submit for execution +// @return bool true if successful, false otherwise +bool HsaRsrcFactory::LoadAndFinalize(const AgentInfo* agent_info, const char* brig_path, + const char* kernel_name, hsa_executable_t* executable, + hsa_executable_symbol_t* code_desc) { + hsa_status_t status = HSA_STATUS_ERROR; + + // Build the code object filename + std::string filename(brig_path); + std::clog << "Code object filename: " << filename << std::endl; + + // Open the file containing code object + hsa_file_t file_handle = open(filename.c_str(), O_RDONLY); + if (file_handle == -1) { + std::cerr << "Error: failed to load '" << filename << "'" << std::endl; + assert(false); + return false; + } + + // Create code object reader + hsa_code_object_reader_t code_obj_rdr = {0}; + status = hsa_api_.hsa_code_object_reader_create_from_file(file_handle, &code_obj_rdr); + if (status != HSA_STATUS_SUCCESS) { + std::cerr << "Failed to create code object reader '" << filename << "'" << std::endl; + return false; + } + + // Create executable. 
+ status = hsa_api_.hsa_executable_create_alt(HSA_PROFILE_FULL, HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT, + NULL, executable); + CHECK_STATUS("Error in creating executable object", status); + + // Load code object. + status = hsa_api_.hsa_executable_load_agent_code_object(*executable, agent_info->dev_id, code_obj_rdr, + NULL, NULL); + CHECK_STATUS("Error in loading executable object", status); + + // Freeze executable. + status = hsa_api_.hsa_executable_freeze(*executable, ""); + CHECK_STATUS("Error in freezing executable object", status); + + // Get symbol handle. + hsa_executable_symbol_t kernelSymbol; + status = hsa_api_.hsa_executable_get_symbol(*executable, NULL, kernel_name, agent_info->dev_id, 0, + &kernelSymbol); + CHECK_STATUS("Error in looking up kernel symbol", status); + + // Update output parameter + *code_desc = kernelSymbol; + return true; +} + +// Print the various fields of Hsa Gpu Agents +bool HsaRsrcFactory::PrintGpuAgents(const std::string& header) { + std::cout << std::flush; + std::clog << header << " :" << std::endl; + + const AgentInfo* agent_info; + int size = uint32_t(gpu_list_.size()); + for (int idx = 0; idx < size; idx++) { + agent_info = gpu_list_[idx]; + + std::clog << "> agent[" << idx << "] :" << std::endl; + std::clog << ">> Name : " << agent_info->name << std::endl; + std::clog << ">> APU : " << agent_info->is_apu << std::endl; + std::clog << ">> HSAIL profile : " << agent_info->profile << std::endl; + std::clog << ">> Max Wave Size : " << agent_info->max_wave_size << std::endl; + std::clog << ">> Max Queue Size : " << agent_info->max_queue_size << std::endl; + std::clog << ">> CU number : " << agent_info->cu_num << std::endl; + std::clog << ">> Waves per CU : " << agent_info->waves_per_cu << std::endl; + std::clog << ">> SIMDs per CU : " << agent_info->simds_per_cu << std::endl; + std::clog << ">> SE number : " << agent_info->se_num << std::endl; + std::clog << ">> Shader Arrays per SE : " << agent_info->shader_arrays_per_se << 
std::endl;
+  }
+  return true;
+}
+
+// Write one CMD_SLOT_SIZE_B-byte packet into the next slot of 'queue' and ring the doorbell.
+// @param queue Queue to submit to
+// @param packet Pointer to the packet data, CMD_SLOT_SIZE_B bytes are read
+// @return uint64_t Write index the packet was placed at
+uint64_t HsaRsrcFactory::Submit(hsa_queue_t* queue, const void* packet) {
+  const uint32_t slot_size_b = CMD_SLOT_SIZE_B;
+
+  // advance command queue
+  // NOTE(review): the relaxed load/store pair is not an atomic increment; concurrent
+  // submitters to the same queue presumably need external serialization - confirm.
+  const uint64_t write_idx = hsa_api_.hsa_queue_load_write_index_relaxed(queue);
+  hsa_api_.hsa_queue_store_write_index_relaxed(queue, write_idx + 1);
+  // Yield until the reserved slot is less than one queue length ahead of the read index.
+  while ((write_idx - hsa_api_.hsa_queue_load_read_index_relaxed(queue)) >= queue->size) {
+    sched_yield();
+  }
+
+  uint32_t slot_idx = (uint32_t)(write_idx % queue->size);
+  uint32_t* queue_slot = reinterpret_cast<uint32_t*>((uintptr_t)(queue->base_address) + (slot_idx * slot_size_b));
+  const uint32_t* slot_data = reinterpret_cast<const uint32_t*>(packet);
+
+  // Copy buffered commands into the queue slot.
+  // Overwrite the AQL invalid header (first dword) last.
+  // This prevents the slot from being read until it's fully written.
+  memcpy(&queue_slot[1], &slot_data[1], slot_size_b - sizeof(uint32_t));
+  std::atomic<uint32_t>* header_atomic_ptr =
+      reinterpret_cast<std::atomic<uint32_t>*>(&queue_slot[0]);
+  header_atomic_ptr->store(slot_data[0], std::memory_order_release);
+
+  // ring the doorbell
+  hsa_api_.hsa_signal_store_relaxed(queue->doorbell_signal, write_idx);
+
+  return write_idx;
+}
+
+// Submit a buffer holding one or more packets, one queue slot at a time.
+// Aborts if 'size_bytes' is not a multiple of CMD_SLOT_SIZE_B.
+// @return uint64_t Write index of the last packet submitted, 0 if 'size_bytes' is 0
+uint64_t HsaRsrcFactory::Submit(hsa_queue_t* queue, const void* packet, size_t size_bytes) {
+  const uint32_t slot_size_b = CMD_SLOT_SIZE_B;
+  if ((size_bytes & (slot_size_b - 1)) != 0) {
+    fprintf(stderr, "HsaRsrcFactory::Submit: Bad packet size %zx\n", size_bytes);
+    abort();
+  }
+
+  const char* begin = reinterpret_cast<const char*>(packet);
+  const char* end = begin + size_bytes;
+  uint64_t write_idx = 0;
+  for (const char* ptr = begin; ptr < end; ptr += slot_size_b) {
+    write_idx = Submit(queue, ptr);
+  }
+
+  return write_idx;
+}
+
+// Look up the kernel name recorded for a kernel object address.
+// Aborts with a diagnostic if the address is unknown.
+// @param addr Kernel object address
+// @return const char* strdup() copy of the name; the caller owns it and releases it with free()
+const char* HsaRsrcFactory::GetKernelName(uint64_t addr) {
+  std::lock_guard<mutex_t> lck(mutex_);
+  // The map is created lazily by the freeze interceptor and is still NULL before the
+  // first freeze is seen; treat that the same as an unknown address.
+  const char* name = NULL;
+  if (symbols_map_ != NULL) {
+    const auto it = symbols_map_->find(addr);
+    if (it != symbols_map_->end()) name = it->second;
+  }
+  if (name == NULL) {
+    fprintf(stderr, "HsaRsrcFactory::kernel addr (0x%lx) is not found\n", addr);
+    abort();
+  }
+  // Copy while the lock is held: the callback may replace and delete[] the stored string.
+  return strdup(name);
+}
+
+// Turn on executable tracking by routing hsa_executable_freeze in 'table' to the interceptor.
+void HsaRsrcFactory::EnableExecutableTracking(HsaApiTable* table) {
+  std::lock_guard<mutex_t> lck(mutex_);
+  executable_tracking_on_ = true;
+  table->core_->hsa_executable_freeze_fn = hsa_executable_freeze_interceptor;
+}
+
+// Symbol iteration callback: records 'kernel object address -> name' for kernel symbols.
+// An entry already present for the same address has its old name released and replaced.
+hsa_status_t HsaRsrcFactory::executable_symbols_cb(hsa_executable_t exec, hsa_executable_symbol_t symbol, void *data) {
+  hsa_symbol_kind_t value = (hsa_symbol_kind_t)0;
+  hsa_status_t status = hsa_api_.hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_TYPE, &value);
+  CHECK_STATUS("Error in getting symbol info", status);
+  if (value == HSA_SYMBOL_KIND_KERNEL) {
+    uint64_t addr = 0;
+    uint32_t len = 0;
+    status = hsa_api_.hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &addr);
+    CHECK_STATUS("Error in getting kernel object", status);
+    status = hsa_api_.hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH, &len);
+    CHECK_STATUS("Error in getting name len", status);
+    // One extra byte for the terminator written below.
+    char *name = new char[len + 1];
+    status = hsa_api_.hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_NAME, name);
+    CHECK_STATUS("Error in getting kernel name", status);
+    name[len] = 0;
+    auto ret = symbols_map_->insert({addr, name});
+    if (ret.second == false) {
+      delete[] ret.first->second;
+      ret.first->second = name;
+    }
+  }
+  return HSA_STATUS_SUCCESS;
+}
+
+// Replacement for hsa_executable_freeze: records the executable's kernel symbols,
+// then forwards to the saved hsa_executable_freeze entry point.
+hsa_status_t HsaRsrcFactory::hsa_executable_freeze_interceptor(hsa_executable_t executable, const char *options) {
+  std::lock_guard<mutex_t> lck(mutex_);
+  if (symbols_map_ == NULL) symbols_map_ = new symbols_map_t;
+  hsa_status_t status = hsa_api_.hsa_executable_iterate_symbols(executable, executable_symbols_cb, NULL);
+  CHECK_STATUS("Error in iterating executable symbols", status);
+  return hsa_api_.hsa_executable_freeze(executable, options);
+}
+
+void HsaRsrcFactory::DumpHandles(FILE* file) {
+  auto beg = agent_map_.begin();
+  auto end = agent_map_.end();
+  for (auto it = beg; it != end; ++it) {
+    const AgentInfo*
agent_info = it->second; + fprintf(file, "0x%lx agent %s\n", agent_info->dev_id.handle, (agent_info->dev_type == HSA_DEVICE_TYPE_CPU) ? "cpu" : "gpu"); + if (agent_info->cpu_pool.handle != 0) fprintf(file, "0x%lx pool cpu\n", agent_info->cpu_pool.handle); + if (agent_info->kern_arg_pool.handle != 0) fprintf(file, "0x%lx pool cpu kernarg\n", agent_info->kern_arg_pool.handle); + if (agent_info->gpu_pool.handle != 0) fprintf(file, "0x%lx pool gpu\n", agent_info->gpu_pool.handle); + } + fflush(file); +} + +std::atomic HsaRsrcFactory::instance_{}; +HsaRsrcFactory::mutex_t HsaRsrcFactory::mutex_; +HsaRsrcFactory::timestamp_t HsaRsrcFactory::timeout_ns_ = HsaTimer::TIMESTAMP_MAX; +hsa_pfn_t HsaRsrcFactory::hsa_api_{}; +bool HsaRsrcFactory::executable_tracking_on_ = false; +HsaRsrcFactory::symbols_map_t* HsaRsrcFactory::symbols_map_ = NULL; diff --git a/test/hsa/src/hsa_rsrc_factory.h b/test/hsa/src/hsa_rsrc_factory.h new file mode 100644 index 00000000..8383aa66 --- /dev/null +++ b/test/hsa/src/hsa_rsrc_factory.h @@ -0,0 +1,516 @@ +/********************************************************************** +Copyright ©2013 Advanced Micro Devices, Inc. All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, are permitted +provided that the following conditions are met: + +<95> Redistributions of source code must retain the above copyright notice, this list of +conditions and the following disclaimer. +<95> Redistributions in binary form must reproduce the above copyright notice, this list of +conditions and the following disclaimer in the documentation and/or + other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR +IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT +SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY + DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +********************************************************************/ + +#ifndef _HSA_RSRC_FACTORY_H_ +#define _HSA_RSRC_FACTORY_H_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#define HSA_ARGUMENT_ALIGN_BYTES 16 +#define HSA_QUEUE_ALIGN_BYTES 64 +#define HSA_PACKET_ALIGN_BYTES 64 + +#define CHECK_STATUS(msg, status) do { \ + if ((status) != HSA_STATUS_SUCCESS) { \ + const char* emsg = 0; \ + hsa_status_string(status, &emsg); \ + printf("%s: %s\n", msg, emsg ? emsg : ""); \ + abort(); \ + } \ +} while (0) + +#define CHECK_ITER_STATUS(msg, status) do { \ + if ((status) != HSA_STATUS_INFO_BREAK) { \ + const char* emsg = 0; \ + hsa_status_string(status, &emsg); \ + printf("%s: %s\n", msg, emsg ? 
emsg : ""); \ + abort(); \ + } \ +} while (0) + +static const size_t MEM_PAGE_BYTES = 0x1000; +static const size_t MEM_PAGE_MASK = MEM_PAGE_BYTES - 1; +typedef decltype(hsa_agent_t::handle) hsa_agent_handle_t; + +struct hsa_pfn_t { + decltype(hsa_init)* hsa_init; + decltype(hsa_shut_down)* hsa_shut_down; + decltype(hsa_agent_get_info)* hsa_agent_get_info; + decltype(hsa_iterate_agents)* hsa_iterate_agents; + + decltype(hsa_queue_create)* hsa_queue_create; + decltype(hsa_queue_destroy)* hsa_queue_destroy; + decltype(hsa_queue_load_write_index_relaxed)* hsa_queue_load_write_index_relaxed; + decltype(hsa_queue_store_write_index_relaxed)* hsa_queue_store_write_index_relaxed; + decltype(hsa_queue_load_read_index_relaxed)* hsa_queue_load_read_index_relaxed; + + decltype(hsa_signal_create)* hsa_signal_create; + decltype(hsa_signal_destroy)* hsa_signal_destroy; + decltype(hsa_signal_load_relaxed)* hsa_signal_load_relaxed; + decltype(hsa_signal_store_relaxed)* hsa_signal_store_relaxed; + decltype(hsa_signal_wait_scacquire)* hsa_signal_wait_scacquire; + decltype(hsa_signal_store_screlease)* hsa_signal_store_screlease; + + decltype(hsa_code_object_reader_create_from_file)* hsa_code_object_reader_create_from_file; + decltype(hsa_executable_create_alt)* hsa_executable_create_alt; + decltype(hsa_executable_load_agent_code_object)* hsa_executable_load_agent_code_object; + decltype(hsa_executable_freeze)* hsa_executable_freeze; + decltype(hsa_executable_get_symbol)* hsa_executable_get_symbol; + decltype(hsa_executable_symbol_get_info)* hsa_executable_symbol_get_info; + decltype(hsa_executable_iterate_symbols)* hsa_executable_iterate_symbols; + + decltype(hsa_system_get_info)* hsa_system_get_info; + decltype(hsa_system_get_major_extension_table)* hsa_system_get_major_extension_table; + + decltype(hsa_amd_agent_iterate_memory_pools)* hsa_amd_agent_iterate_memory_pools; + decltype(hsa_amd_memory_pool_get_info)* hsa_amd_memory_pool_get_info; + decltype(hsa_amd_memory_pool_allocate)* 
hsa_amd_memory_pool_allocate; + decltype(hsa_amd_agents_allow_access)* hsa_amd_agents_allow_access; + decltype(hsa_amd_memory_async_copy)* hsa_amd_memory_async_copy; + + decltype(hsa_amd_signal_async_handler)* hsa_amd_signal_async_handler; + decltype(hsa_amd_profiling_set_profiler_enabled)* hsa_amd_profiling_set_profiler_enabled; + decltype(hsa_amd_profiling_get_async_copy_time)* hsa_amd_profiling_get_async_copy_time; + decltype(hsa_amd_profiling_get_dispatch_time)* hsa_amd_profiling_get_dispatch_time; +}; + +// Encapsulates information about a Hsa Agent such as its +// handle, name, max queue size, max wavefront size, etc. +struct AgentInfo { + // Handle of Agent + hsa_agent_t dev_id; + + // Agent type - Cpu = 0, Gpu = 1 or Dsp = 2 + uint32_t dev_type; + + // APU flag + bool is_apu; + + // Agent system index + uint32_t dev_index; + + // GFXIP name + char gfxip[64]; + + // Name of Agent whose length is less than 64 + char name[64]; + + // Max size of Wavefront size + uint32_t max_wave_size; + + // Max size of Queue buffer + uint32_t max_queue_size; + + // Hsail profile supported by agent + hsa_profile_t profile; + + // CPU/GPU/kern-arg memory pools + hsa_amd_memory_pool_t cpu_pool; + hsa_amd_memory_pool_t gpu_pool; + hsa_amd_memory_pool_t kern_arg_pool; + + // The number of compute unit available in the agent. + uint32_t cu_num; + + // Maximum number of waves possible in a Compute Unit. 
+ uint32_t waves_per_cu; + + // Number of SIMD's per compute unit CU + uint32_t simds_per_cu; + + // Number of Shader Engines (SE) in Gpu + uint32_t se_num; + + // Number of Shader Arrays Per Shader Engines in Gpu + uint32_t shader_arrays_per_se; + + // SGPR/VGPR block sizes + uint32_t sgpr_block_dflt; + uint32_t sgpr_block_size; + uint32_t vgpr_block_size; +}; + +// HSA timer class +// Provides current HSA timestampa and system-clock/ns conversion API +class HsaTimer { + public: + typedef uint64_t timestamp_t; + static const timestamp_t TIMESTAMP_MAX = UINT64_MAX; + typedef long double freq_t; + + enum time_id_t { + TIME_ID_CLOCK_REALTIME = 0, + TIME_ID_CLOCK_MONOTONIC = 1, + TIME_ID_NUMBER + }; + + HsaTimer(const hsa_pfn_t* hsa_api) : hsa_api_(hsa_api) { + timestamp_t sysclock_hz = 0; + hsa_status_t status = hsa_api_->hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY, &sysclock_hz); + CHECK_STATUS("hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY)", status); + sysclock_factor_ = (freq_t)1000000000 / (freq_t)sysclock_hz; + } + + // Methods for system-clock/ns conversion + timestamp_t sysclock_to_ns(const timestamp_t& sysclock) const { + return timestamp_t((freq_t)sysclock * sysclock_factor_); + } + timestamp_t ns_to_sysclock(const timestamp_t& time) const { + return timestamp_t((freq_t)time / sysclock_factor_); + } + + // Method for timespec/ns conversion + static timestamp_t timespec_to_ns(const timespec& time) { + return ((timestamp_t)time.tv_sec * 1000000000) + time.tv_nsec; + } + + // Return timestamp in 'ns' + timestamp_t timestamp_ns() const { + timestamp_t sysclock; + hsa_status_t status = hsa_api_->hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP, &sysclock); + CHECK_STATUS("hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP)", status); + return sysclock_to_ns(sysclock); + } + + // Return time in 'ns' + static timestamp_t clocktime_ns(clockid_t clock_id) { + timespec time; + clock_gettime(clock_id, &time); + return timespec_to_ns(time); + } + + // 
Return pair of correlated values of profiling timestamp and time with + // correlation error for a given time ID and number of iterations + void correlated_pair_ns(time_id_t time_id, uint32_t iters, + timestamp_t* timestamp_v, timestamp_t* time_v, timestamp_t* error_v) const { + clockid_t clock_id = 0; + switch (clock_id) { + case TIME_ID_CLOCK_REALTIME: + clock_id = CLOCK_REALTIME; + break; + case TIME_ID_CLOCK_MONOTONIC: + clock_id = CLOCK_MONOTONIC; + break; + default: + CHECK_STATUS("internal error: invalid time_id", HSA_STATUS_ERROR); + } + + std::vector ts_vec(iters); + std::vector tm_vec(iters); + const uint32_t steps = iters - 1; + + for (uint32_t i = 0; i < iters; ++i) { + hsa_api_->hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP, &ts_vec[i]); + clock_gettime(clock_id, &tm_vec[i]); + } + + const timestamp_t ts_base = sysclock_to_ns(ts_vec.front()); + const timestamp_t tm_base = timespec_to_ns(tm_vec.front()); + const timestamp_t error = (ts_vec.back() - ts_vec.front()) / (2 * steps); + + timestamp_t ts_accum = 0; + timestamp_t tm_accum = 0; + for (uint32_t i = 0; i < iters; ++i) { + ts_accum += (ts_vec[i] - ts_base); + tm_accum += (timespec_to_ns(tm_vec[i]) - tm_base); + } + + *timestamp_v = (ts_accum / iters) + ts_base + error; + *time_v = (tm_accum / iters) + tm_base; + *error_v = error; + } + + private: + // Timestamp frequency factor + freq_t sysclock_factor_; + // HSA API table + const hsa_pfn_t* const hsa_api_; +}; + +class HsaRsrcFactory { + public: + static const size_t CMD_SLOT_SIZE_B = 0x40; + typedef std::recursive_mutex mutex_t; + typedef HsaTimer::timestamp_t timestamp_t; + + static HsaRsrcFactory* Create(bool initialize_hsa = true) { + std::lock_guard lck(mutex_); + HsaRsrcFactory* obj = instance_.load(std::memory_order_relaxed); + if (obj == NULL) { + obj = new HsaRsrcFactory(initialize_hsa); + instance_.store(obj, std::memory_order_release); + } + return obj; + } + + static HsaRsrcFactory& Instance() { + HsaRsrcFactory* obj = 
instance_.load(std::memory_order_acquire); + if (obj == NULL) obj = Create(false); + hsa_status_t status = (obj != NULL) ? HSA_STATUS_SUCCESS : HSA_STATUS_ERROR; + CHECK_STATUS("HsaRsrcFactory::Instance() failed", status); + return *obj; + } + + static void Destroy() { + std::lock_guard lck(mutex_); + if (instance_) delete instance_.load(); + instance_ = NULL; + } + + // Return system agent info + const AgentInfo* GetAgentInfo(const hsa_agent_t agent); + + // Get the count of Hsa Gpu Agents available on the platform + // @return uint32_t Number of Gpu agents on platform + uint32_t GetCountOfGpuAgents(); + + // Get the count of Hsa Cpu Agents available on the platform + // @return uint32_t Number of Cpu agents on platform + uint32_t GetCountOfCpuAgents(); + + // Get the AgentInfo handle of a Gpu device + // @param idx Gpu Agent at specified index + // @param agent_info Output parameter updated with AgentInfo + // @return bool true if successful, false otherwise + bool GetGpuAgentInfo(uint32_t idx, const AgentInfo** agent_info); + + // Get the AgentInfo handle of a Cpu device + // @param idx Cpu Agent at specified index + // @param agent_info Output parameter updated with AgentInfo + // @return bool true if successful, false otherwise + bool GetCpuAgentInfo(uint32_t idx, const AgentInfo** agent_info); + + // Create a Queue object and return its handle. The queue object is expected + // to support user requested number of Aql dispatch packets. + // @param agent_info Gpu Agent on which to create a queue object + // @param num_Pkts Number of packets to be held by queue + // @param queue Output parameter updated with handle of queue object + // @return bool true if successful, false otherwise + bool CreateQueue(const AgentInfo* agent_info, uint32_t num_pkts, hsa_queue_t** queue); + + // Create a Signal object and return its handle. 
+ // @param value Initial value of signal object + // @param signal Output parameter updated with handle of signal object + // @return bool true if successful, false otherwise + bool CreateSignal(uint32_t value, hsa_signal_t* signal); + + // Allocate local GPU memory + // @param agent_info Agent from whose memory region to allocate + // @param size Size of memory in terms of bytes + // @return uint8_t* Pointer to buffer, null if allocation fails. + uint8_t* AllocateLocalMemory(const AgentInfo* agent_info, size_t size); + + // Allocate memory tp pass kernel parameters + // Memory is alocated accessible for all CPU agents and for GPU given by AgentInfo parameter. + // @param agent_info Agent from whose memory region to allocate + // @param size Size of memory in terms of bytes + // @return uint8_t* Pointer to buffer, null if allocation fails. + uint8_t* AllocateKernArgMemory(const AgentInfo* agent_info, size_t size); + + // Allocate system memory accessible from both CPU and GPU + // Memory is alocated accessible to all CPU agents and AgentInfo parameter is ignored. + // @param agent_info Agent from whose memory region to allocate + // @param size Size of memory in terms of bytes + // @return uint8_t* Pointer to buffer, null if allocation fails. + uint8_t* AllocateSysMemory(const AgentInfo* agent_info, size_t size); + + // Allocate memory for command buffer. + // @param agent_info Agent from whose memory region to allocate + // @param size Size of memory in terms of bytes + // @return uint8_t* Pointer to buffer, null if allocation fails. 
+ uint8_t* AllocateCmdMemory(const AgentInfo* agent_info, size_t size); + + // Wait signal + hsa_signal_value_t SignalWait(const hsa_signal_t& signal, const hsa_signal_value_t& signal_value) const; + + // Wait signal with signal value restore + void SignalWaitRestore(const hsa_signal_t& signal, const hsa_signal_value_t& signal_value) const; + + // Copy data from GPU to host memory + bool Memcpy(const hsa_agent_t& agent, void* dst, const void* src, size_t size); + bool Memcpy(const AgentInfo* agent_info, void* dst, const void* src, size_t size); + + // Memory free method + static bool FreeMemory(void* ptr); + + // Loads an Assembled Brig file and Finalizes it into Device Isa + // @param agent_info Gpu device for which to finalize + // @param brig_path File path of the Assembled Brig file + // @param kernel_name Name of the kernel to finalize + // @param code_desc Handle of finalized Code Descriptor that could + // be used to submit for execution + // @return true if successful, false otherwise + bool LoadAndFinalize(const AgentInfo* agent_info, const char* brig_path, const char* kernel_name, + hsa_executable_t* hsa_exec, hsa_executable_symbol_t* code_desc); + + // Print the various fields of Hsa Gpu Agents + bool PrintGpuAgents(const std::string& header); + + // Submit AQL packet to given queue + static uint64_t Submit(hsa_queue_t* queue, const void* packet); + static uint64_t Submit(hsa_queue_t* queue, const void* packet, size_t size_bytes); + + // Enable executables loading tracking + static bool IsExecutableTracking() { return executable_tracking_on_; } + static void EnableExecutableTracking(HsaApiTable* table); + static const char* GetKernelName(uint64_t addr); + + // Initialize HSA API table + void static InitHsaApiTable(HsaApiTable* table); + static const hsa_pfn_t* HsaApi() { return &hsa_api_; } + + // Return AqlProfile API table + typedef hsa_ven_amd_aqlprofile_pfn_t aqlprofile_pfn_t; + const aqlprofile_pfn_t* AqlProfileApi() const { return &aqlprofile_api_; 
} + + // Return Loader API table + const hsa_ven_amd_loader_1_00_pfn_t* LoaderApi() const { return &loader_api_; } + + // Methods for system-clock/ns conversion and timestamp in 'ns' + timestamp_t SysclockToNs(const timestamp_t& sysclock) const { return timer_->sysclock_to_ns(sysclock); } + timestamp_t NsToSysclock(const timestamp_t& time) const { return timer_->ns_to_sysclock(time); } + timestamp_t TimestampNs() const { return timer_->timestamp_ns(); } + + timestamp_t GetSysTimeout() const { return timeout_; } + static timestamp_t GetTimeoutNs() { return timeout_ns_; } + static void SetTimeoutNs(const timestamp_t& time) { + std::lock_guard lck(mutex_); + timeout_ns_ = time; + if (instance_ != NULL) Instance().timeout_ = Instance().timer_->ns_to_sysclock(time); + } + + void CorrelateTime(HsaTimer::time_id_t time_id, uint32_t iters) { + timestamp_t timestamp_v = 0; + timestamp_t time_v = 0; + timestamp_t error_v = 0; + timer_->correlated_pair_ns(time_id, iters, ×tamp_v, &time_v, &error_v); + time_shift_[time_id] = time_v - timestamp_v; + time_error_[time_id] = error_v; + } + + hsa_status_t GetTime(uint32_t time_id, timestamp_t value, uint64_t* time) { + if (time_id >= HsaTimer::TIME_ID_NUMBER) return HSA_STATUS_ERROR; + *time = value + time_shift_[time_id]; + return HSA_STATUS_SUCCESS; + } + + hsa_status_t GetTimestamp(uint32_t time_id, uint64_t value, timestamp_t* timestamp) { + if (time_id >= HsaTimer::TIME_ID_NUMBER) return HSA_STATUS_ERROR; + *timestamp = value - time_shift_[time_id]; + return HSA_STATUS_SUCCESS; + } + + void DumpHandles(FILE* output_file); + + private: + // System agents iterating callback + static hsa_status_t GetHsaAgentsCallback(hsa_agent_t agent, void* data); + + // Callback function to find and bind kernarg region of an agent + static hsa_status_t FindMemRegionsCallback(hsa_region_t region, void* data); + + // Load AQL profile HSA extension library directly + static hsa_status_t LoadAqlProfileLib(aqlprofile_pfn_t* api); + + // Constructor 
of the class. Will initialize the Hsa Runtime and + // query the system topology to get the list of Cpu and Gpu devices + explicit HsaRsrcFactory(bool initialize_hsa); + + // Destructor of the class + ~HsaRsrcFactory(); + + // Add an instance of AgentInfo representing a Hsa Gpu agent + const AgentInfo* AddAgentInfo(const hsa_agent_t agent); + + // To mmap command buffer memory + static const bool CMD_MEMORY_MMAP = false; + + // HSA was initialized + const bool initialize_hsa_; + + static std::atomic instance_; + static mutex_t mutex_; + + // Used to maintain a list of Hsa Gpu Agent Info + std::vector gpu_list_; + std::vector gpu_agents_; + + // Used to maintain a list of Hsa Cpu Agent Info + std::vector cpu_list_; + std::vector cpu_agents_; + + // System agents map + std::map agent_map_; + + // Executables loading tracking + typedef std::map symbols_map_t; + static symbols_map_t* symbols_map_; + static bool executable_tracking_on_; + static hsa_status_t hsa_executable_freeze_interceptor(hsa_executable_t executable, const char *options); + static hsa_status_t executable_symbols_cb(hsa_executable_t exec, hsa_executable_symbol_t symbol, void *data); + + // HSA runtime API table + static hsa_pfn_t hsa_api_; + + // AqlProfile API table + aqlprofile_pfn_t aqlprofile_api_; + + // Loader API table + hsa_ven_amd_loader_1_00_pfn_t loader_api_; + + // System timeout, ns + static timestamp_t timeout_ns_; + // System timeout, sysclock + timestamp_t timeout_; + + // HSA timer + HsaTimer* timer_; + + // Time shift array to support time conversion + timestamp_t time_shift_[HsaTimer::TIME_ID_NUMBER]; + timestamp_t time_error_[HsaTimer::TIME_ID_NUMBER]; + + // CPU/kern-arg memory pools + hsa_amd_memory_pool_t *cpu_pool_; + hsa_amd_memory_pool_t *kern_arg_pool_; +}; + +#endif // _HSA_RSRC_FACTORY_H_ diff --git a/test/hsa/test/CMakeLists.txt b/test/hsa/test/CMakeLists.txt new file mode 100644 index 00000000..77727b23 --- /dev/null +++ b/test/hsa/test/CMakeLists.txt @@ -0,0 +1,64 @@ 
+################################################################################ +# Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. 
## Build script of the "ctrl" HSA test executable.
## It can be processed standalone (TEST_DIR not defined) or be included from a
## parent script that already defines TEST_DIR.

cmake_minimum_required ( VERSION 2.8.12 )
set ( CMAKE_VERBOSE_MAKEFILE TRUE CACHE BOOL "Verbose Output" FORCE )

set ( EXE_NAME "ctrl" )

## Standalone configuration: use this directory as the test root and
## declare the project here
if ( NOT DEFINED TEST_DIR )
  set ( TEST_DIR ${CMAKE_CURRENT_SOURCE_DIR} )
  project ( ${EXE_NAME} )
  ## Set build environment
  include ( env )
endif ()

## Default the optional inputs to empty strings so they can be passed
## as quoted arguments to the kernel build script below
if ( NOT DEFINED ROCM_ROOT_DIR )
  set ( ROCM_ROOT_DIR "" )
endif ()
if ( NOT DEFINED GPU_TARGETS )
  set ( GPU_TARGETS "" )
endif ()

## Util sources
file( GLOB UTIL_SRC "${TEST_DIR}/util/*.cpp" )

## Test control sources
set ( CTRL_SRC
  ${TEST_DIR}/app/test.cpp
  ${TEST_DIR}/ctrl/test_hsa.cpp
)

## Dummy kernel
## NOTE: execute_process runs while CMake processes this file, not at build time
set ( DUMMY_NAME dummy_kernel )
execute_process ( COMMAND sh -xc "${TEST_DIR}/../script/build_kernel.sh '${TEST_DIR}/${DUMMY_NAME}/${DUMMY_NAME}' '${PROJECT_BINARY_DIR}' '${ROCM_ROOT_DIR}' '${GPU_TARGETS}'" )

## Test kernel
set ( TEST_NAME simple_convolution )
set ( KERN_SRC ${TEST_DIR}/${TEST_NAME}/${TEST_NAME}.cpp )
execute_process ( COMMAND sh -xc "${TEST_DIR}/../script/build_kernel.sh '${TEST_DIR}/${TEST_NAME}/${TEST_NAME}' '${PROJECT_BINARY_DIR}' '${ROCM_ROOT_DIR}' '${GPU_TARGETS}'" )

## Building ctrl test executable
add_executable ( ${EXE_NAME} ${CTRL_SRC} ${UTIL_SRC} ${KERN_SRC} )
target_include_directories ( ${EXE_NAME} PRIVATE ${TEST_DIR} ${ROOT_DIR} ${HSA_RUNTIME_INC_PATH} )
target_link_libraries( ${EXE_NAME} ${HSA_RUNTIME_LIB} ${HSA_KMT_LIB} c stdc++ dl pthread rt )
## Copy the run script next to the build outputs (also done while CMake processes this file)
execute_process ( COMMAND sh -xc "cp ${TEST_DIR}/run.sh ${PROJECT_BINARY_DIR}" )
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*******************************************************************************/ + +#include +#include +#include +#include +#include + +#include "ctrl/run_kernel.h" +#include "ctrl/test_aql.h" +#include "dummy_kernel/dummy_kernel.h" +#include "simple_convolution/simple_convolution.h" + +void thread_fun(const int kiter, const int diter, const uint32_t agents_number) { + const AgentInfo* agent_info[agents_number]; + hsa_queue_t* queue[agents_number]; + HsaRsrcFactory* rsrc = &HsaRsrcFactory::Instance(); + + for (uint32_t n = 0; n < agents_number; ++n) { + uint32_t agent_id = n % rsrc->GetCountOfGpuAgents(); + if (rsrc->GetGpuAgentInfo(agent_id, &agent_info[n]) == false) { + fprintf(stderr, "AgentInfo failed\n"); + abort(); + } + if (rsrc->CreateQueue(agent_info[n], 128, &queue[n]) == false) { + fprintf(stderr, "CreateQueue failed\n"); + abort(); + } + } + + for (int i = 0; i < kiter; ++i) { + for (uint32_t n = 0; n < agents_number; ++n) { + RunKernel(0, NULL, agent_info[n], queue[n], diter); + RunKernel(0, NULL, agent_info[n], queue[n], diter); + } + } + + for (uint32_t n = 0; n < agents_number; ++n) { + hsa_queue_destroy(queue[n]); + } +} + +int main(int argc, char** argv) { + const char* kiter_s = getenv("ROCP_KITER"); + const char* diter_s = getenv("ROCP_DITER"); + const char* agents_s = getenv("ROCP_AGENTS"); + const char* thrs_s = getenv("ROCP_THRS"); + + const int kiter = (kiter_s != NULL) ? atol(kiter_s) : 1; + const int diter = (diter_s != NULL) ? atol(diter_s) : 1; + const uint32_t agents_number = (agents_s != NULL) ? (uint32_t)atol(agents_s) : 1; + const int thrs = (thrs_s != NULL) ? 
atol(thrs_s) : 1; + + TestHsa::HsaInstantiate(); + + std::vector t(thrs); + for (int n = 0; n < thrs; ++n) { + t[n] = std::thread(thread_fun, kiter, diter, agents_number); + } + for (int n = 0; n < thrs; ++n) { + t[n].join(); + } + + TestHsa::HsaShutdown(); + return 0; +} diff --git a/test/hsa/test/ctrl/run_kernel.h b/test/hsa/test/ctrl/run_kernel.h new file mode 100644 index 00000000..846e0b68 --- /dev/null +++ b/test/hsa/test/ctrl/run_kernel.h @@ -0,0 +1,90 @@ +/****************************************************************************** +Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*******************************************************************************/ + +#ifndef TEST_CTRL_RUN_KERNEL_H_ +#define TEST_CTRL_RUN_KERNEL_H_ + +#include "ctrl/test_hsa.h" +#include "util/test_assert.h" + +template bool RunKernel(int argc = 0, char* argv[] = NULL, const AgentInfo* agent_info = NULL, hsa_queue_t* queue = NULL, int count = 1) { + bool ret_val = false; + + if (getenv("ROC_TEST_TRACE") == NULL) std::clog.rdbuf(NULL); + + + // Create test kernel object + Kernel test_kernel; + + TestHsa* test_hsa = new TestHsa(&test_kernel); + test_hsa->SetAgentInfo(agent_info); + test_hsa->SetQueue(queue); + + TestAql* test_aql = new Test(test_hsa); + TEST_ASSERT(test_aql != NULL); + if (test_aql == NULL) return 1; + + // Initialization of Hsa Runtime + ret_val = test_aql->Initialize(argc, argv); + if (ret_val == false) { + std::cerr << "Error in the test initialization" << std::endl; + // TEST_ASSERT(ret_val); + return false; + } + + // Setup Hsa resources needed for execution + ret_val = test_aql->Setup(); + if (ret_val == false) { + std::cerr << "Error in creating hsa resources" << std::endl; + TEST_ASSERT(ret_val); + return false; + } + + // Kernel dspatch iterations + for (int i = 0; i < count; ++i) { + // Run test kernel + ret_val = test_aql->Run(); + if (ret_val == false) { + std::cerr << "Error in running the test kernel" << std::endl; + TEST_ASSERT(ret_val); + return false; + } + + // Verify the results of the execution + ret_val = test_aql->VerifyResults(); + if (ret_val) { + std::clog << "Test : Passed" << std::endl; + } else { + std::clog << "Test : Failed" << std::endl; + } + } + + // Print time taken by sample + test_aql->PrintTime(); + + test_aql->Cleanup(); + delete test_aql; + + return ret_val; +} + +#endif // TEST_CTRL_RUN_KERNEL_H_ diff --git a/test/hsa/test/ctrl/test_aql.h b/test/hsa/test/ctrl/test_aql.h new file mode 100644 index 00000000..d77363ee --- /dev/null +++ b/test/hsa/test/ctrl/test_aql.h @@ -0,0 +1,77 @@ 
+/****************************************************************************** +Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*******************************************************************************/ + +#ifndef TEST_CTRL_TEST_AQL_H_ +#define TEST_CTRL_TEST_AQL_H_ + +#include +#include + +#include "util/hsa_rsrc_factory.h" + +// Test AQL interface +class TestAql { + public: + explicit TestAql(TestAql* t = 0) : test_(t) {} + virtual ~TestAql() { + if (test_) delete test_; + } + + TestAql* Test() { return test_; } + virtual const AgentInfo* GetAgentInfo() { return (test_) ? test_->GetAgentInfo() : 0; } + virtual hsa_queue_t* GetQueue() { return (test_) ? test_->GetQueue() : 0; } + virtual HsaRsrcFactory* GetRsrcFactory() { return (test_) ? 
test_->GetRsrcFactory() : 0; } + + // Initialize application environment including setting + // up of various configuration parameters based on + // command line arguments + // @return bool true on success and false on failure + virtual bool Initialize(int argc, char** argv) { + return (test_) ? test_->Initialize(argc, argv) : true; + } + + // Setup application parameters for exectuion + // @return bool true on success and false on failure + virtual bool Setup() { return (test_) ? test_->Setup() : true; } + + // Run the kernel + // @return bool true on success and false on failure + virtual bool Run() { return (test_) ? test_->Run() : true; } + + // Verify results + // @return bool true on success and false on failure + virtual bool VerifyResults() { return (test_) ? test_->VerifyResults() : true; } + + // Print to console the time taken to execute kernel + virtual void PrintTime() { + if (test_) test_->PrintTime(); + } + + // Release resources e.g. memory allocations + // @return bool true on success and false on failure + virtual bool Cleanup() { return (test_) ? test_->Cleanup() : true; } + + private: + TestAql* const test_; +}; + +#endif // TEST_CTRL_TEST_AQL_H_ diff --git a/test/hsa/test/ctrl/test_hsa.cpp b/test/hsa/test/ctrl/test_hsa.cpp new file mode 100644 index 00000000..638f7b1a --- /dev/null +++ b/test/hsa/test/ctrl/test_hsa.cpp @@ -0,0 +1,279 @@ +/****************************************************************************** +Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*******************************************************************************/ + +#include "ctrl/test_hsa.h" + +#include + +#include "util/test_assert.h" +#include "util/helper_funcs.h" +#include "util/hsa_rsrc_factory.h" + +HsaRsrcFactory* TestHsa::hsa_rsrc_ = NULL; + +HsaRsrcFactory* TestHsa::HsaInstantiate() { + // Instantiate an instance of Hsa Resources Factory + if (hsa_rsrc_ == NULL) { + hsa_rsrc_ = HsaRsrcFactory::Create(); + // Print properties of the agents + hsa_rsrc_->PrintGpuAgents("> GPU agents"); + } + return hsa_rsrc_; +} + +void TestHsa::HsaShutdown() { + if (hsa_rsrc_) hsa_rsrc_->Destroy(); +} + +bool TestHsa::Initialize(int /*arg_cnt*/, char** /*arg_list*/) { + std::clog << "TestHsa::Initialize :" << std::endl; + + // Instantiate a Timer object + setup_timer_idx_ = hsa_timer_.CreateTimer(); + dispatch_timer_idx_ = hsa_timer_.CreateTimer(); + + if (hsa_rsrc_ == NULL) { + TEST_ASSERT(false); + return false; + } + + // Create an instance of Gpu agent + if (agent_info_ == NULL) { + const uint32_t agent_id = 0; + if (!hsa_rsrc_->GetGpuAgentInfo(agent_id, &agent_info_)) { + agent_info_ = NULL; + std::cerr << "> error: agent[" << agent_id << "] is not found" << std::endl; + return false; + } + } + std::clog << "> Using agent[" << agent_info_->dev_index << "] : " << agent_info_->name << std::endl; + + // Create an instance of Aql Queue + if (hsa_queue_ == NULL) { + const uint32_t num_pkts = 128; + if (hsa_rsrc_->CreateQueue(agent_info_, num_pkts, &hsa_queue_) == false) { + hsa_queue_ = NULL; + TEST_ASSERT(false); + } + my_queue_ = true; + } + + // Obtain handle of signal + hsa_rsrc_->CreateSignal(1, &hsa_signal_); + + // Obtain the code object file name + std::string agentName(agent_info_->name); + brig_path_obj_.append(agentName); + brig_path_obj_.append("_" + name_ + ".hsaco"); + + return true; +} + +bool TestHsa::Setup() { + std::clog << "TestHsa::setup :" << std::endl; + + // Start the timer object + hsa_timer_.StartTimer(setup_timer_idx_); + + // 
Load and Finalize Kernel Code Descriptor + const char* brig_path = brig_path_obj_.c_str(); + bool suc = hsa_rsrc_->LoadAndFinalize(agent_info_, brig_path, symb_.c_str(), &hsa_exec_, + &kernel_code_desc_); + if (suc == false) { + std::cerr << "Error in loading and finalizing Kernel" << std::endl; + return false; + } + + mem_map_t& mem_map = test_->GetMemMap(); + for (mem_it_t it = mem_map.begin(); it != mem_map.end(); ++it) { + mem_descr_t& des = it->second; + if (des.size == 0) continue; + + switch (des.id) { + case TestKernel::LOCAL_DES_ID: + des.ptr = hsa_rsrc_->AllocateLocalMemory(agent_info_, des.size); + break; + case TestKernel::KERNARG_DES_ID: { + // Check the kernel args size + const size_t kernarg_size = des.size; + size_t size_info = 0; + const hsa_status_t status = hsa_executable_symbol_get_info( + kernel_code_desc_, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE, &size_info); + TEST_ASSERT(status == HSA_STATUS_SUCCESS); + size_info = kernarg_size; + const bool kernarg_missmatch = (kernarg_size > size_info); + if (kernarg_missmatch) { + std::cout << "kernarg_size = " << kernarg_size << ", size_info = " << size_info + << std::flush << std::endl; + TEST_ASSERT(!kernarg_missmatch); + break; + } + // ALlocate kernarg memory + des.size = size_info; + des.ptr = hsa_rsrc_->AllocateKernArgMemory(agent_info_, size_info); + if (des.ptr) memset(des.ptr, 0, size_info); + break; + } + case TestKernel::SYS_DES_ID: + des.ptr = hsa_rsrc_->AllocateSysMemory(agent_info_, des.size); + if (des.ptr) memset(des.ptr, 0, des.size); + break; + case TestKernel::NULL_DES_ID: + des.ptr = NULL; + break; + default: + break; + } + TEST_ASSERT(des.ptr != NULL); + if (des.ptr == NULL) return false; + } + test_->Init(); + + // Stop the timer object + hsa_timer_.StopTimer(setup_timer_idx_); + setup_time_taken_ = hsa_timer_.ReadTimer(setup_timer_idx_); + total_time_taken_ = setup_time_taken_; + + return true; +} + +bool TestHsa::Run() { + std::clog << "TestHsa::run :" << std::endl; 
+ + const uint32_t work_group_size = 64; + const uint32_t work_grid_size = test_->GetGridSize(); + uint32_t group_segment_size = 0; + uint32_t private_segment_size = 0; + uint64_t code_handle = 0; + + // Retrieve the amount of group memory needed + hsa_executable_symbol_get_info( + kernel_code_desc_, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, &group_segment_size); + + // Retrieve the amount of private memory needed + hsa_executable_symbol_get_info(kernel_code_desc_, + HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, + &private_segment_size); + + + // Retrieve handle of the code block + hsa_executable_symbol_get_info(kernel_code_desc_, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, + &code_handle); + + // Initialize the dispatch packet. + hsa_kernel_dispatch_packet_t aql; + memset(&aql, 0, sizeof(aql)); + // Set the packet's type, barrier bit, acquire and release fences + aql.header = HSA_PACKET_TYPE_KERNEL_DISPATCH; + aql.header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE; + aql.header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE; + // Populate Aql packet with default values + aql.setup = 1; + aql.grid_size_x = work_grid_size; + aql.grid_size_y = 1; + aql.grid_size_z = 1; + aql.workgroup_size_x = work_group_size; + aql.workgroup_size_y = 1; + aql.workgroup_size_z = 1; + // Bind the kernel code descriptor and arguments + aql.kernel_object = code_handle; + aql.kernarg_address = test_->GetKernargPtr(); + aql.group_segment_size = group_segment_size; + aql.private_segment_size = private_segment_size; + // Initialize Aql packet with handle of signal + hsa_signal_store_relaxed(hsa_signal_, 1); + aql.completion_signal = hsa_signal_; + + std::clog << "> Executing kernel: \"" << name_ << "\"" << std::endl; + + // Start the timer object + hsa_timer_.StartTimer(dispatch_timer_idx_); + + // Submit AQL packet to the queue + const uint64_t que_idx = hsa_rsrc_->Submit(hsa_queue_, &aql); + + std::clog << "> Waiting on 
kernel dispatch signal, que_idx=" << que_idx << std::endl << std::flush; + + // Wait on the dispatch signal until the kernel is finished. + // Update wait condition to HSA_WAIT_STATE_ACTIVE for Polling + if (hsa_signal_wait_scacquire(hsa_signal_, HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, + HSA_WAIT_STATE_BLOCKED) != 0) { + TEST_ASSERT("signal_wait failed"); + } + + std::clog << "> DONE, que_idx=" << que_idx << std::endl; + + // Stop the timer object + hsa_timer_.StopTimer(dispatch_timer_idx_); + dispatch_time_taken_ = hsa_timer_.ReadTimer(dispatch_timer_idx_); + total_time_taken_ += dispatch_time_taken_; + + return true; +} + +bool TestHsa::VerifyResults() { + bool cmp = false; + void* output = NULL; + const uint32_t size = test_->GetOutputSize(); + bool suc = false; + + if (size == 0) return true; + + // If the output buffer is local, copy it into a newly allocated system memory buffer; otherwise use the output pointer directly + if (test_->IsOutputLocal()) { + output = hsa_rsrc_->AllocateSysMemory(agent_info_, size); + suc = hsa_rsrc_->Memcpy(agent_info_, output, test_->GetOutputPtr(), size); + if (!suc) std::clog << "> VerifyResults: Memcpy failed" << std::endl << std::flush; + } else { + output = test_->GetOutputPtr(); + suc = true; + } + + if ((output != NULL) && suc) { + // Print the test output + test_->PrintOutput(output); + // Compare the output with the reference output byte-wise (memcmp over the output size) + cmp = (memcmp(output, test_->GetRefOut(), size) == 0); + } + + if (test_->IsOutputLocal() && (output != NULL)) hsa_rsrc_->FreeMemory(output); + + return cmp; +} + +void TestHsa::PrintTime() { + std::clog << "Time taken for Setup by " << this->name_ << " : " << this->setup_time_taken_ + << std::endl; + std::clog << "Time taken for Dispatch by " << this->name_ << " : " << this->dispatch_time_taken_ + << std::endl; + std::clog << "Time taken in Total by " << this->name_ << " : " << this->total_time_taken_ + << std::endl; +} + +bool TestHsa::Cleanup() { + hsa_executable_destroy(hsa_exec_); + hsa_signal_destroy(hsa_signal_); + if (my_queue_) 
hsa_queue_destroy(hsa_queue_); + hsa_queue_ = NULL; + agent_info_ = NULL; + return true; +} diff --git a/test/hsa/test/ctrl/test_hsa.h b/test/hsa/test/ctrl/test_hsa.h new file mode 100644 index 00000000..bb54c600 --- /dev/null +++ b/test/hsa/test/ctrl/test_hsa.h @@ -0,0 +1,129 @@ +/****************************************************************************** +Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*******************************************************************************/ + +#ifndef TEST_CTRL_TEST_HSA_H_ +#define TEST_CTRL_TEST_HSA_H_ + +#include "ctrl/test_aql.h" +#include "ctrl/test_kernel.h" +#include "util/hsa_rsrc_factory.h" +#include "util/perf_timer.h" + +// Class implements HSA test +class TestHsa : public TestAql { + public: + // Instantiate HSA resources + static HsaRsrcFactory* HsaInstantiate(); + static void HsaShutdown(); + + // Constructor + explicit TestHsa(TestKernel* test) : test_(test), name_(test->Name()), symb_(test->SymbName()) { + total_time_taken_ = 0; + setup_time_taken_ = 0; + dispatch_time_taken_ = 0; + agent_info_ = NULL; + hsa_queue_ = NULL; + my_queue_ = false; + hsa_exec_ = {}; + } + + // Get/set methods for Agent Info, HSA queue, HSA Resources Factory + HsaRsrcFactory* GetRsrcFactory() { return hsa_rsrc_; } + hsa_agent_t HsaAgent() { return agent_info_->dev_id; } + const AgentInfo* GetAgentInfo() { return agent_info_; } + void SetAgentInfo(const AgentInfo* agent_info) { agent_info_ = agent_info; } + hsa_queue_t* GetQueue() { return hsa_queue_; } + void SetQueue(hsa_queue_t* queue) { hsa_queue_ = queue; } + + // Initialize application environment including setting + // up of various configuration parameters based on + // command line arguments + // @return bool true on success and false on failure + bool Initialize(int argc, char** argv); + + // Setup application parameters for execution + // @return bool true on success and false on failure + bool Setup(); + + // Run the test kernel + // @return bool true on success and false on failure + bool Run(); + + // Verify against reference implementation + // @return bool true on success and false on failure + bool VerifyResults(); + + // Print to console the time taken to execute kernel + void PrintTime(); + + // Release resources e.g. 
memory allocations + // @return bool true on success and false on failure + bool Cleanup(); + + private: + typedef TestKernel::mem_descr_t mem_descr_t; + typedef TestKernel::mem_map_t mem_map_t; + typedef TestKernel::mem_it_t mem_it_t; + + // Test object + TestKernel* test_; + + // Path of Brig file + std::string brig_path_obj_; + + // Used to track time taken to run the sample + double total_time_taken_; + double setup_time_taken_; + double dispatch_time_taken_; + + // Handle of signal + hsa_signal_t hsa_signal_; + + // Handle of Kernel Code Descriptor + hsa_executable_symbol_t kernel_code_desc_; + + // Timer indices and instance of timer object + uint32_t setup_timer_idx_; + uint32_t dispatch_timer_idx_; + PerfTimer hsa_timer_; + + // Instance of Hsa Resources Factory + static HsaRsrcFactory* hsa_rsrc_; + + // Handle to an Hsa Gpu Agent + const AgentInfo* agent_info_; + + // Handle to an Hsa Queue; my_queue_ tells Cleanup() whether to destroy it + hsa_queue_t* hsa_queue_; + bool my_queue_; + + // Test kernel name + std::string name_; + + // Test kernel symbol name + std::string symb_; + + // Kernel executable + hsa_executable_t hsa_exec_; +}; + +#endif // TEST_CTRL_TEST_HSA_H_ diff --git a/test/hsa/test/ctrl/test_kernel.h b/test/hsa/test/ctrl/test_kernel.h new file mode 100644 index 00000000..0ca89200 --- /dev/null +++ b/test/hsa/test/ctrl/test_kernel.h @@ -0,0 +1,138 @@ +/****************************************************************************** +Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*******************************************************************************/ + +#ifndef TEST_CTRL_TEST_KERNEL_H_ +#define TEST_CTRL_TEST_KERNEL_H_ + +#include +#include +#include +#include + +// Class implements kernel test +class TestKernel { + public: + // Exported buffers IDs + enum buf_id_t { KERNARG_EXP_ID, OUTPUT_EXP_ID, REFOUT_EXP_ID }; + // Memory descriptors IDs + enum des_id_t { NULL_DES_ID, LOCAL_DES_ID, KERNARG_DES_ID, SYS_DES_ID, REFOUT_DES_ID }; + + // Memory descriptor declaration + struct mem_descr_t { + des_id_t id; + void* ptr; + uint32_t size; + }; + + // Memory map declaration + typedef std::map mem_map_t; + typedef mem_map_t::iterator mem_it_t; + typedef mem_map_t::const_iterator mem_const_it_t; + + virtual ~TestKernel() {} + + // Initialize method + virtual void Init() = 0; + + // Return kernel memory map + mem_map_t& GetMemMap() { return mem_map_; } + + // Return NULL descriptor + static mem_descr_t NullDescriptor() { return {NULL_DES_ID, NULL, 0}; } + + // Check if descriptor is local + bool IsLocal(const mem_descr_t& descr) const { return (descr.id == LOCAL_DES_ID); } + + // Methods to get the kernel attributes + const mem_descr_t& GetKernargDescr() { return *test_map_[KERNARG_EXP_ID]; } + const mem_descr_t& GetOutputDescr() { return *test_map_[OUTPUT_EXP_ID]; } + void* GetKernargPtr() { return GetKernargDescr().ptr; } + uint32_t GetKernargSize() { return GetKernargDescr().size; } + void* GetOutputPtr() { return GetOutputDescr().ptr; } + uint32_t GetOutputSize() { return GetOutputDescr().size; } + bool IsOutputLocal() { return IsLocal(GetOutputDescr()); } + virtual uint32_t GetGridSize() const = 0; + + // Return reference output + void* GetRefOut() { return test_map_[REFOUT_EXP_ID]->ptr; } + + // Print output + virtual void PrintOutput(const void* ptr) const = 0; + + // Return name + virtual std::string Name() const = 0; + + // Return kernel symbol name: Name() with the .kd suffix + virtual std::string SymbName() { return Name() + ".kd"; } + + protected: + // Set buffer 
descriptor + bool SetInDescr(const uint32_t& buf_id, const des_id_t& des_id, const uint32_t& size) { + bool suc = SetMemDescr(buf_id, des_id, size); + if (des_id == KERNARG_DES_ID) { + test_map_[KERNARG_EXP_ID] = &mem_map_[buf_id]; + } + return suc; + } + + // Set results descriptor + bool SetOutDescr(const uint32_t& buf_id, const des_id_t& des_id, const uint32_t& size) { + bool suc = SetMemDescr(buf_id, des_id, size); + test_map_[OUTPUT_EXP_ID] = &mem_map_[buf_id]; + return suc; + } + + // Set host descriptor and malloc its buffer; fails if the buffer ID already exists or malloc returns NULL + bool SetHostDescr(const uint32_t& buf_id, const des_id_t& des_id, const uint32_t& size) { + bool suc = SetMemDescr(buf_id, des_id, size); + if (suc) { + mem_descr_t& descr = mem_map_[buf_id]; + descr.ptr = malloc(size); + if (des_id == REFOUT_DES_ID) { + test_map_[REFOUT_EXP_ID] = &descr; + } + if (descr.ptr == NULL) suc = false; + } + return suc; + } + + // Get memory descriptor by buffer ID, or the NULL descriptor if the ID is not in the memory map + mem_descr_t GetDescr(const uint32_t& buf_id) const { + mem_const_it_t it = mem_map_.find(buf_id); + return (it != mem_map_.end()) ? it->second : NullDescriptor(); + } + + private: + // Insert a new memory descriptor with a NULL pointer; returns false if the buffer ID is already present + bool SetMemDescr(const uint32_t& buf_id, const des_id_t& des_id, const uint32_t& size) { + const mem_descr_t des = {des_id, NULL, size}; + auto ret = mem_map_.insert(mem_map_t::value_type(buf_id, des)); + return ret.second; + } + + // Kernel memory map object + mem_map_t mem_map_; + // Exported descriptors map (indexed by the exported buffer IDs, pointing into mem_map_) + std::map test_map_; +}; + +#endif // TEST_CTRL_TEST_KERNEL_H_ diff --git a/test/hsa/test/dummy_kernel/dummy_kernel.cl b/test/hsa/test/dummy_kernel/dummy_kernel.cl new file mode 100644 index 00000000..4ab159c8 --- /dev/null +++ b/test/hsa/test/dummy_kernel/dummy_kernel.cl @@ -0,0 +1,28 @@ +/****************************************************************************** +Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*******************************************************************************/ + +/** + Dummy kernel: reads its global work-item ID and does no other work + */ +__kernel void DummyKernel() { + uint tid = get_global_id(0); +} diff --git a/test/hsa/test/dummy_kernel/dummy_kernel.h b/test/hsa/test/dummy_kernel/dummy_kernel.h new file mode 100644 index 00000000..1b8ce430 --- /dev/null +++ b/test/hsa/test/dummy_kernel/dummy_kernel.h @@ -0,0 +1,71 @@ +/****************************************************************************** +Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*******************************************************************************/ + +#ifndef TEST_DUMMY_KERNEL_DUMMY_KERNEL_H_ +#define TEST_DUMMY_KERNEL_DUMMY_KERNEL_H_ + +#include +#include + +#include "ctrl/test_kernel.h" + +// Class implements DummyKernel kernel parameters +class DummyKernel : public TestKernel { + public: + // Kernel buffers IDs + enum { KERNARG_BUF_ID, LOCAL_BUF_ID }; + + // Constructor: registers zero-sized kernarg and local output descriptors + DummyKernel() : + width_(64), + height_(64) + { + SetInDescr(KERNARG_BUF_ID, KERNARG_DES_ID, 0); + SetOutDescr(LOCAL_BUF_ID, LOCAL_DES_ID, 0); + } + + // Initialize method (nothing to initialize) + void Init() {} + + // Return compute grid size + uint32_t GetGridSize() const { return width_ * height_; } + + // Print output (no-op, the output descriptor has size 0) + void PrintOutput(const void* ptr) const {} + + // Return name + std::string Name() const { return std::string("DummyKernel"); } + + private: + // Reference CPU implementation (stub, always returns true) + bool ReferenceImplementation(uint32_t* output, const uint32_t* input, const float* mask, + const uint32_t width, const uint32_t height, + const uint32_t maskWidth, const uint32_t maskHeight) { return true; } + + // Compute grid width + const uint32_t width_; + + // Compute grid height + const uint32_t height_; +}; + +#endif // TEST_DUMMY_KERNEL_DUMMY_KERNEL_H_ diff --git a/test/hsa/test/run.sh b/test/hsa/test/run.sh new file mode 100755 index 00000000..32848317 --- /dev/null +++ b/test/hsa/test/run.sh @@ -0,0 +1,45 @@ +#!/bin/sh + +################################################################################ +# Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. +################################################################################ + +# test trace +export ROC_TEST_TRACE=1 + +# kernels loading iterations +export ROCP_KITER=50 +# kernels dispatching iterations per kernel load +# dispatching to the same queue +export ROCP_DITER=50 +# GPU agents number +export ROCP_AGENTS=2 +# host threads number +# each thread creates a queue per GPU agent +export ROCP_THRS=3 + +eval ./test/ctrl + +#valgrind --leak-check=full $tbin +#valgrind --tool=massif $tbin +#ms_print massif.out. 
+ +exit 0 diff --git a/test/hsa/test/simple_convolution/simple_convolution.cl b/test/hsa/test/simple_convolution/simple_convolution.cl new file mode 100644 index 00000000..3f8115a6 --- /dev/null +++ b/test/hsa/test/simple_convolution/simple_convolution.cl @@ -0,0 +1,76 @@ +/****************************************************************************** +Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*******************************************************************************/ + +/** + * SimpleConvolution is where each pixel of the output image + * is the weighted sum of the neighborhood pixels of the input image. + * The neighborhood is defined by the dimensions of the mask and + * weight of each neighbor is defined by the mask itself. 
+ * @param output Output matrix after performing convolution + * @param input Input matrix on which convolution is to be performed + * @param mask mask matrix with which the convolution is performed + * @param inputDimensions dimensions of the input matrix + * @param maskDimensions dimensions of the mask matrix + */ +__kernel void SimpleConvolution(__global uint * output, + __global uint * input, + __global float * mask, + const uint2 inputDimensions, + const uint2 maskDimensions) { + + uint tid = get_global_id(0); + + uint width = inputDimensions.x; + uint height = inputDimensions.y; + + uint x = tid%width; + uint y = tid/width; + + uint maskWidth = maskDimensions.x; + uint maskHeight = maskDimensions.y; + + uint vstep = (maskWidth -1)/2; + uint hstep = (maskHeight -1)/2; + + // find the left, right, top and bottom indices such that + // the indices do not go beyond image boundaries + uint left = (x < vstep) ? 0 : (x - vstep); + uint right = ((x + vstep) >= width) ? width - 1 : (x + vstep); + uint top = (y < hstep) ? 0 : (y - hstep); + uint bottom = ((y + hstep) >= height)? height - 1: (y + hstep); + + // initializing weighted sum value + float sumFX = 0; + + for(uint i = left; i <= right; ++i) { + for(uint j = top; j <= bottom; ++j) { + // performing weighted sum within the mask boundaries + uint maskIndex = (j - (y - hstep)) * maskWidth + (i - (x - vstep)); + uint index = j * width + i; + sumFX += ((float)input[index] * mask[maskIndex]); + } + } + + // To round to the nearest integer + sumFX += 0.5f; + output[tid] = (uint)sumFX; +} diff --git a/test/hsa/test/simple_convolution/simple_convolution.cpp b/test/hsa/test/simple_convolution/simple_convolution.cpp new file mode 100644 index 00000000..546f9a6a --- /dev/null +++ b/test/hsa/test/simple_convolution/simple_convolution.cpp @@ -0,0 +1,388 @@ +/****************************************************************************** +Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*******************************************************************************/ + +#include "simple_convolution/simple_convolution.h" + +#include +#include +#include + +#include "util/helper_funcs.h" +#include "util/test_assert.h" + +const uint32_t SimpleConvolution::input_data_[]{ + 15, 201, 51, 89, 92, 34, 96, 66, 11, 225, 161, 96, 81, 211, 108, 124, 202, 244, 182, + 90, 215, 92, 98, 20, 44, 225, 55, 247, 202, 0, 45, 218, 202, 97, 51, 39, 131, 147, + 105, 143, 116, 11, 239, 198, 222, 92, 67, 169, 81, 250, 3, 40, 86, 101, 60, 131, 70, + 116, 123, 17, 117, 168, 236, 64, 10, 31, 103, 142, 179, 209, 29, 40, 220, 13, 239, 187, + 105, 50, 100, 186, 44, 104, 227, 131, 205, 32, 6, 20, 149, 130, 38, 10, 43, 18, 75, + 53, 50, 178, 195, 230, 132, 225, 14, 96, 238, 253, 27, 88, 48, 128, 18, 92, 232, 246, + 224, 182, 23, 231, 203, 172, 105, 241, 183, 148, 4, 2, 202, 55, 181, 142, 29, 57, 111, + 43, 153, 93, 41, 181, 181, 89, 54, 200, 182, 31, 190, 150, 213, 213, 126, 160, 130, 232, + 146, 57, 125, 151, 59, 71, 206, 240, 213, 236, 42, 68, 24, 195, 162, 65, 121, 87, 155, + 175, 31, 81, 207, 222, 232, 164, 180, 102, 69, 55, 79, 216, 112, 204, 112, 171, 19, 63, + 156, 233, 43, 198, 46, 67, 138, 208, 132, 4, 39, 32, 180, 71, 113, 131, 38, 90, 40, + 219, 193, 109, 18, 16, 70, 131, 220, 182, 46, 240, 245, 203, 217, 32, 146, 7, 100, 28, + 216, 233, 32, 255, 9, 213, 71, 123, 88, 110, 213, 128, 74, 150, 238, 93, 166, 52, 224, + 131, 234, 15, 115, 224, 218, 76, 1, 108, 84, 101, 137, 44, 79, 170, 44, 88, 127, 116, + 211, 216, 226, 168, 88, 45, 63, 70, 138, 230, 123, 107, 105, 101, 122, 220, 70, 84, 41, + 71, 193, 125, 173, 75, 169, 252, 245, 213, 84, 117, 73, 40, 77, 44, 209, 166, 90, 16, + 237, 229, 246, 104, 80, 95, 206, 202, 60, 20, 31, 101, 92, 225, 226, 9, 44, 140, 5, + 34, 97, 89, 151, 171, 129, 229, 216, 82, 139, 51, 99, 120, 24, 89, 225, 104, 185, 175, + 50, 246, 196, 82, 91, 32, 51, 62, 42, 96, 202, 47, 130, 44, 137, 26, 215, 10, 255, + 176, 93, 138, 227, 193, 3, 251, 27, 229, 
100, 212, 149, 151, 202, 89, 233, 38, 122, 29, + 100, 164, 125, 46, 212, 0, 90, 93, 26, 50, 103, 25, 226, 197, 164, 198, 135, 168, 194, + 162, 141, 38, 119, 34, 190, 66, 124, 167, 104, 247, 197, 204, 156, 67, 251, 112, 67, 85, + 205, 93, 135, 53, 119, 106, 251, 28, 49, 130, 196, 243, 36, 82, 26, 155, 117, 216, 221, + 241, 128, 70, 233, 70, 18, 133, 137, 14, 245, 204, 99, 195, 42, 235, 248, 161, 86, 243, + 190, 135, 118, 130, 123, 154, 213, 150, 54, 74, 111, 20, 60, 240, 90, 37, 54, 109, 171, + 191, 123, 161, 140, 222, 100, 182, 202, 93, 88, 32, 80, 23, 168, 198, 153, 36, 97, 111, + 187, 151, 185, 43, 172, 245, 27, 6, 27, 82, 115, 199, 18, 239, 104, 158, 206, 205, 85, + 152, 42, 174, 185, 123, 197, 98, 65, 95, 135, 163, 206, 66, 59, 136, 109, 231, 125, 137, + 237, 153, 219, 97, 96, 237, 81, 201, 140, 31, 150, 226, 183, 192, 144, 113, 59, 86, 212, + 125, 182, 91, 33, 132, 158, 92, 12, 12, 68, 138, 149, 50, 36, 113, 147, 133, 95, 229, + 78, 235, 4, 228, 206, 188, 165, 95, 45, 225, 181, 1, 94, 107, 93, 128, 240, 251, 220, + 252, 7, 32, 135, 156, 83, 171, 14, 230, 48, 109, 203, 126, 89, 208, 99, 39, 140, 9, + 134, 185, 234, 60, 187, 73, 167, 24, 201, 152, 20, 166, 148, 27, 199, 28, 184, 26, 199, + 198, 0, 248, 52, 204, 119, 141, 157, 218, 181, 41, 227, 59, 227, 206, 119, 159, 23, 31, + 184, 224, 183, 204, 134, 76, 231, 77, 105, 160, 103, 48, 103, 104, 41, 155, 53, 160, 41, + 210, 123, 222, 252, 95, 26, 223, 45, 146, 126, 68, 177, 54, 37, 105, 3, 171, 182, 235, + 249, 31, 139, 97, 80, 243, 202, 121, 143, 0, 26, 184, 210, 149, 151, 207, 244, 177, 174, + 34, 67, 45, 102, 245, 100, 140, 95, 104, 55, 21, 83, 49, 53, 223, 147, 134, 210, 93, + 0, 97, 93, 26, 26, 48, 175, 178, 255, 164, 99, 174, 198, 167, 220, 45, 156, 64, 185, + 252, 168, 241, 18, 252, 35, 71, 219, 182, 205, 173, 19, 206, 15, 113, 232, 42, 161, 152, + 220, 160, 60, 64, 79, 3, 231, 43, 49, 132, 108, 235, 128, 21, 220, 146, 17, 255, 218, + 236, 182, 168, 154, 201, 118, 170, 58, 94, 212, 220, 246, 177, 125, 51, 
241, 204, 55, 216, + 248, 104, 92, 100, 83, 221, 121, 48, 111, 138, 47, 73, 119, 230, 241, 17, 175, 103, 187, + 234, 198, 144, 199, 188, 65, 68, 240, 51, 17, 39, 11, 9, 143, 104, 109, 227, 70, 231, + 19, 181, 113, 66, 255, 233, 41, 241, 250, 217, 89, 182, 196, 31, 71, 139, 220, 137, 208, + 204, 188, 225, 243, 200, 234, 131, 48, 88, 102, 119, 63, 121, 44, 177, 188, 44, 154, 229, + 29, 149, 190, 118, 76, 130, 150, 147, 14, 114, 28, 222, 62, 217, 191, 50, 161, 170, 181, + 210, 2, 28, 73, 66, 149, 117, 243, 81, 162, 141, 55, 191, 35, 245, 54, 111, 120, 204, + 2, 134, 62, 31, 100, 125, 248, 36, 175, 153, 206, 101, 107, 209, 129, 181, 19, 22, 43, + 7, 104, 205, 149, 159, 140, 184, 149, 195, 39, 14, 143, 42, 148, 205, 73, 249, 74, 66, + 30, 250, 219, 237, 96, 71, 190, 225, 253, 210, 248, 40, 218, 96, 245, 111, 0, 130, 39, + 150, 69, 79, 165, 212, 122, 57, 162, 195, 51, 237, 6, 82, 231, 225, 63, 71, 41, 253, + 41, 38, 208, 33, 78, 170, 130, 68, 26, 131, 198, 66, 26, 12, 145, 191, 224, 11, 249, + 130, 207, 44, 112, 213, 126, 88, 183, 190, 160, 225, 187, 201, 8, 140, 235, 87, 55, 109, + 155, 81, 241, 98, 147, 11, 110, 37, 202, 79, 49, 195, 210, 0, 240, 66, 214, 110, 154, + 142, 44, 58, 111, 232, 4, 119, 117, 239, 207, 172, 93, 106, 254, 78, 205, 145, 89, 59, + 183, 35, 138, 232, 230, 92, 233, 214, 159, 191, 69, 58, 78, 114, 116, 189, 91, 121, 53, + 208, 104, 4, 125, 198, 111, 123, 20, 60, 13, 109, 120, 196, 145, 3, 172, 119, 95, 150, + 78, 255, 85, 147, 57, 163, 6, 174, 97, 97, 39, 151, 50, 144, 155, 175, 86, 11, 43, + 107, 71, 56, 216, 191, 253, 105, 194, 170, 225, 34, 64, 47, 34, 150, 195, 91, 58, 201, + 10, 155, 43, 49, 50, 93, 194, 206, 13, 25, 217, 56, 132, 33, 112, 92, 225, 109, 198, + 164, 23, 167, 199, 88, 215, 234, 238, 155, 69, 40, 100, 80, 196, 144, 129, 246, 237, 68, + 197, 250, 93, 159, 51, 225, 193, 163, 62, 163, 17, 4, 71, 41, 172, 15, 130, 132, 249, + 112, 31, 63, 152, 132, 143, 92, 20, 17, 83, 1, 86, 25, 252, 179, 185, 47, 149, 122, + 211, 211, 29, 229, 216, 
101, 15, 133, 117, 145, 9, 111, 1, 40, 175, 154, 173, 62, 247, + 193, 80, 75, 194, 166, 100, 191, 90, 29, 239, 239, 152, 194, 195, 182, 168, 156, 27, 183, + 33, 145, 73, 43, 0, 75, 83, 175, 229, 0, 238, 221, 194, 63, 40, 133, 230, 140, 68, + 64, 170, 51, 48, 66, 246, 243, 248, 159, 144, 20, 87, 177, 165, 160, 220, 166, 235, 48, + 86, 209, 49, 68, 174, 243, 132, 214, 120, 106, 99, 189, 170, 13, 241, 219, 80, 232, 207, + 72, 135, 95, 92, 223, 16, 2, 127, 237, 169, 107, 29, 255, 61, 79, 68, 236, 67, 200, + 194, 188, 50, 38, 121, 221, 52, 107, 184, 132, 84, 136, 204, 219, 231, 41, 186, 248, 44, + 58, 229, 213, 166, 3, 212, 227, 82, 25, 207, 150, 225, 146, 82, 20, 185, 204, 242, 237, + 55, 170, 113, 139, 50, 62, 103, 26, 103, 34, 18, 148, 93, 247, 105, 3, 251, 62, 231, + 77, 87, 182, 227, 57, 73, 54, 77, 2, 2, 63, 239, 57, 234, 97, 197, 29, 159, 44, + 55, 7, 79, 74, 155, 172, 66, 5, 175, 61, 67, 150, 139, 155, 77, 111, 212, 151, 165, + 34, 153, 167, 98, 137, 225, 77, 234, 166, 107, 138, 211, 163, 145, 34, 237, 45, 206, 47, + 50, 126, 108, 117, 21, 248, 17, 98, 103, 230, 249, 12, 9, 147, 179, 107, 29, 149, 185, + 7, 59, 37, 146, 14, 200, 35, 49, 182, 80, 0, 230, 130, 126, 83, 248, 148, 75, 9, + 247, 178, 240, 240, 190, 249, 132, 114, 101, 161, 7, 30, 169, 67, 68, 59, 82, 12, 95, + 131, 195, 176, 131, 169, 51, 2, 252, 44, 150, 72, 54, 141, 250, 38, 126, 185, 31, 3, + 44, 132, 165, 52, 163, 78, 120, 231, 138, 202, 244, 234, 77, 183, 155, 209, 97, 207, 212, + 94, 251, 107, 166, 49, 249, 161, 88, 120, 91, 120, 123, 135, 253, 33, 188, 160, 112, 52, + 136, 250, 254, 125, 229, 76, 53, 128, 30, 150, 79, 243, 244, 75, 95, 155, 125, 88, 60, + 213, 209, 152, 78, 77, 32, 75, 110, 220, 236, 222, 17, 117, 217, 15, 242, 190, 92, 39, + 63, 123, 190, 143, 111, 178, 219, 206, 78, 88, 38, 138, 46, 247, 34, 124, 69, 66, 199, + 179, 31, 179, 145, 48, 41, 106, 64, 27, 41, 157, 67, 105, 24, 1, 249, 135, 179, 212, + 86, 1, 44, 124, 140, 91, 116, 175, 215, 185, 242, 159, 108, 17, 83, 254, 66, 
124, 105, + 131, 151, 146, 32, 218, 252, 57, 219, 245, 193, 143, 201, 23, 145, 246, 148, 30, 82, 8, + 206, 41, 194, 192, 201, 47, 210, 28, 46, 20, 152, 151, 151, 48, 42, 184, 11, 38, 241, + 231, 28, 179, 119, 230, 202, 8, 220, 94, 39, 46, 103, 245, 88, 42, 181, 33, 90, 136, + 62, 136, 156, 214, 31, 52, 7, 74, 237, 19, 113, 223, 250, 141, 146, 113, 115, 92, 122, + 80, 187, 161, 126, 35, 150, 215, 78, 76, 249, 168, 212, 55, 48, 113, 14, 80, 166, 21, + 154, 147, 40, 12, 114, 35, 153, 5, 148, 12, 98, 15, 92, 29, 176, 219, 65, 71, 179, + 143, 147, 172, 56, 104, 227, 104, 218, 241, 185, 128, 7, 84, 20, 47, 96, 135, 82, 249, + 140, 231, 6, 238, 246, 99, 12, 167, 63, 77, 238, 242, 221, 130, 158, 21, 235, 129, 126, + 197, 114, 56, 69, 121, 140, 90, 169, 237, 225, 252, 231, 109, 228, 237, 91, 219, 81, 104, + 130, 144, 181, 113, 130, 147, 244, 32, 169, 223, 162, 39, 164, 21, 95, 234, 143, 236, 68, + 57, 217, 37, 53, 192, 147, 25, 174, 239, 245, 0, 87, 119, 144, 13, 232, 19, 160, 220, + 51, 73, 188, 214, 113, 96, 235, 209, 75, 122, 190, 144, 179, 151, 181, 233, 88, 73, 3, + 7, 56, 248, 7, 143, 112, 152, 156, 89, 171, 61, 53, 223, 135, 242, 181, 248, 83, 161, + 202, 158, 28, 136, 46, 208, 32, 228, 186, 121, 45, 189, 128, 102, 182, 136, 246, 38, 32, + 147, 127, 204, 208, 181, 171, 87, 167, 97, 80, 250, 2, 26, 153, 31, 163, 200, 239, 195, + 172, 169, 60, 218, 103, 188, 65, 30, 69, 55, 68, 102, 202, 196, 50, 154, 121, 221, 242, + 33, 63, 67, 28, 66, 93, 181, 97, 0, 126, 81, 196, 43, 251, 0, 5, 98, 189, 70, + 128, 3, 126, 197, 105, 72, 137, 155, 227, 3, 121, 214, 36, 184, 25, 65, 250, 118, 247, + 91, 119, 117, 173, 60, 160, 168, 60, 166, 10, 250, 237, 139, 253, 107, 80, 102, 180, 217, + 2, 151, 221, 123, 109, 1, 52, 134, 66, 46, 253, 57, 138, 117, 175, 55, 178, 79, 223, + 239, 245, 234, 233, 226, 117, 231, 78, 198, 78, 2, 159, 80, 154, 124, 204, 7, 126, 0, + 142, 193, 47, 140, 251, 185, 2, 170, 241, 180, 249, 208, 163, 239, 186, 141, 210, 48, 116, + 32, 246, 195, 34, 150, 19, 188, 
19, 224, 196, 146, 224, 83, 83, 15, 224, 78, 201, 226, + 249, 186, 151, 243, 139, 58, 226, 70, 199, 181, 118, 60, 213, 109, 255, 248, 3, 19, 181, + 23, 243, 122, 169, 212, 205, 252, 228, 173, 75, 173, 144, 68, 104, 39, 55, 243, 98, 26, + 57, 41, 207, 175, 102, 165, 29, 102, 158, 32, 121, 83, 56, 109, 205, 225, 66, 155, 222, + 38, 73, 42, 212, 218, 110, 60, 1, 166, 48, 99, 193, 105, 141, 145, 25, 244, 54, 54, + 90, 213, 87, 212, 40, 143, 66, 246, 112, 132, 146, 79, 171, 220, 121, 128, 182, 232, 189, + 184, 143, 237, 27, 80, 86, 169, 226, 112, 158, 25, 166, 248, 238, 253, 204, 23, 141, 15, + 13, 254, 147, 160, 77, 63, 124, 199, 191, 50, 175, 124, 234, 62, 105, 6, 143, 192, 176, + 113, 48, 78, 139, 215, 71, 121, 213, 20, 144, 98, 35, 158, 96, 183, 62, 174, 246, 187, + 117, 182, 237, 37, 50, 216, 99, 156, 223, 243, 93, 143, 101, 142, 222, 240, 101, 37, 106, + 58, 57, 250, 157, 93, 153, 254, 20, 216, 172, 10, 147, 34, 192, 129, 71, 243, 90, 171, + 144, 57, 159, 238, 201, 4, 124, 167, 244, 225, 205, 95, 28, 7, 89, 185, 100, 243, 184, + 121, 203, 100, 131, 95, 135, 68, 224, 207, 56, 58, 122, 201, 115, 25, 183, 61, 30, 51, + 229, 18, 21, 178, 113, 49, 186, 203, 235, 31, 191, 163, 152, 138, 8, 28, 233, 143, 97, + 202, 95, 153, 4, 217, 98, 120, 243, 26, 182, 17, 77, 155, 36, 99, 78, 150, 149, 8, + 98, 128, 39, 33, 36, 192, 172, 45, 220, 149, 189, 61, 96, 28, 215, 100, 246, 58, 221, + 233, 84, 147, 251, 162, 47, 31, 5, 125, 181, 154, 134, 23, 27, 174, 57, 64, 110, 229, + 109, 75, 123, 43, 136, 219, 71, 95, 64, 61, 154, 29, 39, 238, 177, 34, 145, 225, 65, + 150, 94, 247, 49, 229, 15, 77, 147, 72, 141, 2, 45, 251, 77, 169, 38, 213, 132, 110, + 53, 196, 172, 207, 226, 212, 190, 148, 246, 79, 117, 56, 230, 212, 48, 23, 185, 63, 100, + 76, 136, 242, 78, 181, 237, 156, 95, 20, 113, 227, 131, 167, 168, 47, 119, 139, 3, 53, + 31, 250, 133, 149, 50, 107, 105, 99, 130, 34, 162, 231, 111, 42, 217, 190, 224, 199, 90, + 63, 220, 204, 35, 95, 115, 203, 143, 234, 86, 147, 32, 118, 141, 165, 
11, 192, 16, 117, + 35, 147, 152, 198, 123, 7, 240, 84, 198, 209, 28, 33, 17, 248, 237, 52, 88, 97, 255, + 231, 76, 86, 122, 109, 204, 8, 18, 216, 201, 35, 77, 237, 183, 229, 179, 50, 237, 164, + 135, 179, 118, 164, 213, 135, 157, 195, 187, 245, 36, 187, 220, 113, 18, 87, 222, 222, 96, + 241, 183, 42, 21, 4, 23, 205, 233, 203, 0, 214, 112, 136, 138, 230, 44, 95, 110, 201, + 34, 41, 191, 71, 229, 155, 185, 247, 243, 151, 214, 84, 137, 141, 126, 159, 146, 149, 108, + 124, 97, 109, 82, 209, 245, 221, 183, 34, 60, 37, 236, 95, 79, 171, 167, 53, 71, 96, + 45, 58, 248, 3, 142, 129, 145, 12, 33, 36, 162, 142, 160, 3, 251, 243, 213, 240, 208, + 141, 19, 13, 178, 255, 109, 2, 170, 20, 55, 241, 116, 101, 44, 108, 105, 186, 238, 251, + 199, 15, 31, 106, 157, 191, 110, 152, 178, 67, 137, 131, 208, 156, 144, 131, 155, 253, 134, + 70, 18, 190, 55, 134, 35, 99, 243, 140, 30, 225, 135, 230, 240, 166, 81, 142, 102, 191, + 39, 25, 3, 177, 156, 211, 77, 45, 87, 233, 43, 221, 48, 61, 155, 103, 195, 191, 203, + 182, 75, 233, 152, 211, 208, 136, 121, 33, 23, 224, 224, 62, 249, 227, 239, 149, 183, 61, + 195, 15, 39, 238, 236, 87, 43, 136, 191, 239, 71, 138, 166, 147, 116, 62, 102, 68, 199, + 224, 101, 223, 193, 70, 29, 186, 42, 13, 80, 225, 75, 19, 241, 115, 1, 221, 202, 45, + 102, 137, 29, 174, 20, 195, 66, 136, 2, 168, 205, 201, 137, 50, 168, 74, 121, 198, 4, + 163, 212, 85, 133, 31, 105, 118, 146, 106, 84, 93, 152, 187, 231, 181, 105, 251, 121, 171, + 132, 123, 84, 81, 69, 221, 132, 238, 40, 253, 181, 45, 161, 137, 130, 39, 169, 235, 158, + 59, 86, 242, 153, 239, 173, 128, 165, 23, 123, 30, 195, 0, 154, 23, 81, 224, 245, 214, + 206, 30, 212, 131, 75, 117, 12, 206, 157, 181, 186, 59, 241, 17, 45, 138, 0, 219, 11, + 165, 243, 135, 196, 182, 135, 95, 205, 217, 63, 195, 175, 14, 225, 131, 145, 45, 249, 158, + 251, 150, 84, 182, 209, 70, 199, 255, 209, 199, 219, 220, 109, 206, 99, 50, 132, 234, 146, + 82, 195, 209, 22, 114, 223, 247, 246, 113, 37, 239, 16, 33, 134, 100, 215, 88, 170, 158, 
+ 87, 123, 102, 50, 88, 211, 1, 187, 6, 134, 165, 152, 216, 105, 106, 239, 220, 74, 231, + 210, 187, 12, 194, 204, 45, 72, 49, 4, 160, 219, 162, 248, 87, 8, 43, 176, 220, 44, + 107, 227, 178, 17, 124, 139, 122, 230, 122, 87, 48, 97, 42, 236, 110, 236, 185, 155, 53, + 234, 159, 214, 198, 66, 206, 30, 75, 249, 206, 40, 38, 57, 11, 217, 74, 136, 100, 197, + 110, 223, 29, 159, 65, 71, 140, 175, 51, 69, 74, 105, 48, 234, 63, 246, 45, 13, 20, + 121, 7, 226, 161, 46, 28, 173, 7, 103, 53, 108, 45, 164, 76, 74, 68, 141, 145, 208, + 61, 197, 22, 136, 46, 70, 115, 110, 60, 161, 124, 81, 26, 132, 51, 188, 178, 79, 106, + 186, 183, 160, 39, 228, 68, 115, 46, 136, 1, 192, 89, 62, 133, 112, 198, 180, 182, 58, + 34, 243, 219, 158, 69, 245, 34, 120, 178, 213, 200, 28, 143, 128, 188, 182, 100, 1, 41, + 146, 137, 43, 82, 227, 105, 216, 83, 48, 140, 10, 106, 175, 254, 70, 77, 67, 59, 112, + 188, 237, 69, 133, 10, 212, 5, 198, 138, 105, 199, 180, 252, 81, 223, 79, 53, 73, 39, + 137, 121, 180, 148, 228, 99, 146, 42, 177, 214, 102, 33, 147, 84, 102, 25, 94, 59, 31, + 37, 197, 137, 237, 122, 133, 63, 90, 213, 116, 163, 253, 253, 29, 177, 145, 2, 21, 36, + 45, 198, 251, 147, 231, 143, 232, 78, 168, 71, 137, 199, 108, 79, 80, 90, 201, 214, 153, + 35, 172, 13, 199, 169, 11, 228, 91, 157, 231, 112, 193, 20, 54, 189, 167, 30, 77, 144, + 108, 245, 215, 246, 189, 68, 69, 14, 158, 14, 228, 55, 50, 145, 69, 249, 58, 80, 222, + 149, 237, 198, 5, 175, 218, 60, 109, 130, 91, 186, 18, 200, 175, 234, 190, 109, 46, 3, + 123, 204, 18, 96, 4, 68, 241, 73, 62, 44, 154, 29, 193, 136, 227, 199, 55, 189, 4, + 164, 64, 95, 95, 82, 39, 15, 60, 230, 124, 107, 233, 248, 55, 251, 89, 60, 63, 75, + 134, 126, 119, 32, 156, 57, 168, 127, 0, 224, 61, 5, 133, 125, 100, 228, 208, 140, 243, + 12, 114, 111, 119, 92, 104, 175, 87, 193, 236, 151, 13, 114, 21, 132, 146, 177, 189, 59, + 49, 190, 27, 110, 195, 160, 236, 40, 132, 188, 181, 120, 201, 40, 232, 65, 132, 80, 241, + 220, 18, 221, 115, 31, 79, 137, 164, 226, 58, 98, 
29, 108, 32, 57, 219, 228, 218, 199, + 13, 95, 132, 195, 215, 77, 235, 191, 143, 112, 16, 128, 76, 35, 93, 191, 66, 173, 73, + 231, 143, 132, 73, 173, 240, 106, 231, 203, 78, 193, 147, 92, 33, 23, 31, 248, 100, 11, + 184, 243, 123, 201, 115, 200, 236, 209, 135, 47, 126, 209, 22, 14, 85, 95, 188, 69, 202, + 163, 17, 24, 101, 164, 117, 134, 187, 148, 127, 31, 159, 55, 19, 27, 1, 135, 227, 237, + 89, 107, 28, 216, 60, 51, 230, 145, 147, 163, 215, 93, 70, 232, 118, 172, 140, 235, 50, + 71, 128, 177, 103, 32, 233, 123, 60, 234, 2, 31, 216, 91, 139, 244, 52, 200, 40, 26, + 90, 188, 189, 49, 25, 4, 25, 144, 176, 166, 124, 227, 237, 252, 148, 85, 29, 125, 208, + 89, 104, 210, 121, 64, 46, 4, 53, 99, 204, 93, 125, 38, 25, 59, 88, 51, 64, 113, + 195, 241, 23, 64, 212, 5, 60, 104, 90, 90, 230, 42, 179, 78, 253, 44, 143, 44, 49, + 196, 143, 254, 34, 13, 36, 60, 73, 125, 112, 137, 239, 52, 122, 7, 116, 79, 12, 177, + 183, 103, 11, 158, 146, 190, 237, 143, 235, 124, 188, 28, 65, 76, 26, 100, 89, 63, 160, + 163, 188, 17, 44, 172, 69, 167, 179, 185, 246, 191, 107, 174, 38, 118, 76, 184, 53, 58, + 72, 32, 182, 5, 61, 248, 81, 88, 92, 170, 152, 253, 77, 84, 14, 122, 1, 83, 34, + 180, 13, 25, 115, 120, 199, 154, 238, 20, 83, 36, 79, 155, 68, 5, 160, 130, 254, 242, + 218, 90, 156, 114, 87, 234, 199, 101, 101, 200, 185, 135, 124, 198, 160, 240, 62, 104, 138, + 45, 125, 222, 81, 204, 122, 150, 210, 26, 24, 208, 12, 242, 42, 169, 101, 130, 148, 44, + 232, 249, 245, 161, 128, 113, 103, 33, 98, 166, 137, 236, 212, 7, 202, 38, 211, 69, 188, + 165, 95, 212, 118, 108, 199, 161, 22, 45, 35, 170, 90, 11, 163, 79, 173, 36, 193, 20, + 69, 35, 187, 207, 16, 144, 214, 219, 182, 170, 32, 114, 79, 128, 71, 198, 237, 15, 103, + 4, 60, 139, 175, 150, 151, 82, 230, 68, 119, 168, 89, 188, 204, 20, 140, 220, 165, 98, + 184, 91, 12, 217, 205, 92, 90, 20, 35, 71, 36, 138, 76, 96, 22, 251, 247, 173, 78, + 222, 241, 197, 134, 75, 130, 83, 96, 14, 47, 5, 113, 232, 96, 126, 193, 45, 218, 28, + 66, 253, 99, 103, 
136, 176, 200, 158, 171, 191, 76, 249, 158, 62, 190, 37, 137, 65, 120, + 233, 80, 168, 238, 193, 145, 79, 63, 82, 125, 26, 111, 191, 24, 210, 39, 161, 131, 239, + 64, 46, 175, 140, 39, 77, 202, 230, 115, 84, 40, 235, 62, 120, 148, 45, 57, 37, 124, + 121, 120, 249, 148, 231, 185, 172, 186, 224, 77, 61, 207, 141, 107, 126, 26, 147, 204, 229, + 121, 63, 58, 161, 43, 120, 25, 191, 165, 83, 228, 34, 205, 92, 27, 97, 67, 213, 13, + 253, 182, 91, 59, 133, 233, 166, 4, 4, 57, 209, 233, 179, 16, 35, 85, 59, 155, 111, + 250, 65, 194, 223, 99, 144, 59, 127, 241, 127, 85, 255, 125, 11, 90, 184, 145, 68, 95, + 150, 72, 153, 103, 49, 76, 120, 85, 161, 179, 241, 16, 174, 51, 211, 142, 150, 99, 201, + 22, 85, 73, 108, 84, 199, 120, 175, 128, 9, 243, 223, 160, 59, 120, 8, 109, 197, 128, + 194, 103, 52, 180, 119, 227, 231, 75, 113, 126, 175, 59, 148, 4, 132, 1, 89, 75, 121, + 8, 204, 131, 251, 171, 36, 55, 36, 44, 165, 233, 172, 103, 80, 224, 28, 200, 195, 3, + 20, 53, 129, 195, 112, 22, 200, 244, 23, 34, 64, 145, 42, 12, 20, 38, 184, 56, 94, + 220, 101, 3, 198, 17, 107, 22, 242, 135, 222, 182, 138, 243, 235, 11, 182, 91, 34, 127, + 80, 58, 161, 145, 203, 204, 158, 224, 242, 86, 24, 81, 51, 126, 84, 249, 143, 191, 15, + 130, 70, 238, 57, 209, 225, 36, 221, 152, 128, 255, 24, 208, 57, 186, 97, 4, 134, 255, + 229, 121, 86, 254, 202, 137, 124, 31, 130, 12, 222, 146, 142, 37, 129, 199, 247, 98, 236, + 212, 251, 108, 211, 20, 60, 13, 206, 158, 18, 84}; + +SimpleConvolution::SimpleConvolution() { + width_ = 64; + height_ = 64; + mask_width_ = 3; + mask_height_ = mask_width_; + randomize_seed_ = 0; + + if (!IsPowerOf2(width_)) { + width_ = RoundToPowerOf2(width_); + } + + if (!IsPowerOf2(height_)) { + height_ = RoundToPowerOf2(height_); + } + + if (!(mask_width_ % 2)) { + mask_width_++; + } + + if (!(mask_height_ % 2)) { + mask_height_++; + } + + if (width_ * height_ < 256) { + width_ = 64; + height_ = 64; + } + + const uint32_t input_size_bytes = width_ * height_ * sizeof(uint32_t); + 
const uint32_t mask_size_bytes = mask_width_ * mask_height_ * sizeof(float);
+
+  SetInDescr(KERNARG_BUF_ID, KERNARG_DES_ID, sizeof(kernel_args_t));
+  SetInDescr(INPUT_BUF_ID, SYS_DES_ID, input_size_bytes);
+  SetInDescr(MASK_BUF_ID, SYS_DES_ID, mask_size_bytes);
+  SetOutDescr(LOCAL_BUF_ID, LOCAL_DES_ID, input_size_bytes);
+  SetHostDescr(REFOUT_BUF_ID, REFOUT_DES_ID, input_size_bytes);
+
+  if (!randomize_seed_) TEST_ASSERT(sizeof(input_data_) <= input_size_bytes);
+}
+// Fills input and mask, sets up the kernel arguments and computes the CPU reference output.
+void SimpleConvolution::Init() {
+  std::clog << "SimpleConvolution::init :" << std::endl;
+
+  mem_descr_t kernarg_des = GetDescr(KERNARG_BUF_ID);
+  mem_descr_t input_des = GetDescr(INPUT_BUF_ID);
+  mem_descr_t mask_des = GetDescr(MASK_BUF_ID);
+  mem_descr_t output_des = GetDescr(LOCAL_BUF_ID);
+#if 0
+  printf("kernarg_des %p 0x%x\n", kernarg_des.ptr, kernarg_des.size);
+  printf("input_des %p 0x%x\n", input_des.ptr, input_des.size);
+  printf("mask_des %p 0x%x\n", mask_des.ptr, mask_des.size);
+  printf("output_des %p 0x%x\n", output_des.ptr, output_des.size);
+#endif
+  uint32_t* input = reinterpret_cast(input_des.ptr);
+  uint32_t* output_local = reinterpret_cast(output_des.ptr);
+  float* mask = reinterpret_cast(mask_des.ptr);
+  kernel_args_t* kernel_args = reinterpret_cast(kernarg_des.ptr);
+
+  if (randomize_seed_) {
+    // random initialisation of input
+    FillRandom(input, width_, height_, 0, 255, randomize_seed_);
+  } else {
+    // preset values; NOTE(review): reads width_ * height_ words - confirm input_data_ is that big
+    memcpy(input, input_data_, width_ * height_ * sizeof(uint32_t));
+  }
+
+  // Fill a blur filter: zero the mask, then set its centre row and centre column to val
+  const float val = 1.0f / (mask_width_ * 2.0f - 1.0f);
+  for (uint32_t i = 0; i < (mask_width_ * mask_height_); i++) {
+    mask[i] = 0;
+  }
+  for (uint32_t i = 0; i < mask_width_; i++) {
+    uint32_t y = mask_height_ / 2;
+    mask[y * mask_width_ + i] = val;
+  }
+  for (uint32_t i = 0; i < mask_height_; i++) {
+    uint32_t x = mask_width_ / 2;
+    mask[i * mask_width_ + x] = val;
+  }
+
+  // Print the first row of the input array and the whole mask.
+  std::clog << std::dec;
+  PrintArray("> Input[0]", input, width_, 1);
+  PrintArray("> Mask", mask, mask_width_, mask_height_);
+
+  // Fill the kernel args
+  kernel_args->arg1 = output_local;
+  kernel_args->arg2 = input;
+  kernel_args->arg3 = mask;
+  kernel_args->arg4 = width_;
+  kernel_args->arg41 = height_;
+  kernel_args->arg5 = mask_width_;
+  kernel_args->arg51 = mask_height_;
+
+  // Calculate the reference output
+  ReferenceImplementation(reinterpret_cast(GetRefOut()), input, mask, width_, height_,
+                          mask_width_, mask_height_);
+}
+// Prints the first row (width_ elements) of the buffer at ptr.
+void SimpleConvolution::PrintOutput(const void* ptr) const {
+  PrintArray("> Output[0]", reinterpret_cast(ptr), width_, 1);
+}
+// CPU reference: for every pixel, sums input * mask over the mask window clipped to the image.
+bool SimpleConvolution::ReferenceImplementation(uint32_t* output, const uint32_t* input,
+                                                const float* mask, const uint32_t width,
+                                                const uint32_t height, const uint32_t mask_width,
+                                                const uint32_t mask_height) {
+  const uint32_t vstep = (mask_width - 1) / 2;
+  const uint32_t hstep = (mask_height - 1) / 2;
+
+  // for each pixel in the input
+  for (uint32_t x = 0; x < width; x++) {
+    for (uint32_t y = 0; y < height; y++) {
+      // find the left, right, top and bottom indices such that
+      // the indices do not go beyond image boundaries
+      const uint32_t left = (x < vstep) ? 0 : (x - vstep);
+      const uint32_t right = ((x + vstep) >= width) ? width - 1 : (x + vstep);
+      const uint32_t top = (y < hstep) ? 0 : (y - hstep);
+      const uint32_t bottom = ((y + hstep) >= height) ? height - 1 : (y + hstep);
+
+      // initializing weighted sum value
+      float sum_fx = 0;
+      for (uint32_t i = left; i <= right; ++i) {
+        for (uint32_t j = top; j <= bottom; ++j) {
+          // performing weighted sum within the mask boundaries
+          uint32_t mask_idx = (j - (y - hstep)) * mask_width + (i - (x - vstep));
+          uint32_t index = j * width + i;
+
+          // accumulate the input sample scaled by its mask coefficient
+          sum_fx += ((float)input[index] * mask[mask_idx]);
+        }
+      }
+      sum_fx += 0.5f;  // so that the uint32_t conversion below rounds to the nearest integer
+      output[y * width + x] = uint32_t(sum_fx);
+    }
+  }
+
+  return true;
+}
diff --git a/test/hsa/test/simple_convolution/simple_convolution.h b/test/hsa/test/simple_convolution/simple_convolution.h
new file mode 100644
index 00000000..550d1320
--- /dev/null
+++ b/test/hsa/test/simple_convolution/simple_convolution.h
@@ -0,0 +1,94 @@
+/******************************************************************************
+Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*******************************************************************************/
+
+#ifndef TEST_SIMPLE_CONVOLUTION_SIMPLE_CONVOLUTION_H_
+#define TEST_SIMPLE_CONVOLUTION_SIMPLE_CONVOLUTION_H_
+
+#include
+#include
+
+#include "ctrl/test_kernel.h"
+
+// Class implements SimpleConvolution kernel parameters
+class SimpleConvolution : public TestKernel {
+ public:
+  // Kernel buffers IDs
+  enum { INPUT_BUF_ID, LOCAL_BUF_ID, MASK_BUF_ID, KERNARG_BUF_ID, REFOUT_BUF_ID };
+
+  // Constructor: fixes the input and mask dimensions and registers the buffer descriptors
+  SimpleConvolution();
+
+  // Initialize method: fills input, mask and kernel arguments, computes the reference output
+  void Init();
+
+  // Return compute grid size (one work-item per input element)
+  uint32_t GetGridSize() const { return width_ * height_; }
+
+  // Print output: prints the first row (width_ elements) of the buffer at ptr
+  void PrintOutput(const void* ptr) const;
+
+  // Return name
+  std::string Name() const { return std::string("SimpleConvolution"); }
+
+ private:
+  // Local kernel arguments declaration
+  struct kernel_args_t {
+    void* arg1;      // output buffer
+    void* arg2;      // input buffer
+    void* arg3;      // mask buffer
+    uint32_t arg4;   // input width
+    uint32_t arg41;  // input height
+    uint32_t arg5;   // mask width
+    uint32_t arg51;  // mask height
+  };
+
+  // Reference CPU implementation of Simple Convolution
+  // @param output Output matrix after performing convolution
+  // @param input Input matrix on which convolution is to be performed
+  // @param mask mask matrix using which convolution was to be performed
+  // @param width, height dimensions of the input matrix
+  // @param maskWidth, maskHeight dimensions of the mask matrix
+  // @return bool always true in the current implementation
+  bool ReferenceImplementation(uint32_t* output, const uint32_t* input, const float* mask,
+                               const uint32_t width, const uint32_t height,
+                               const uint32_t maskWidth, const uint32_t maskHeight);
+
+  // Width of the Input array
+  uint32_t width_;
+
+  // Height of the Input array
+  uint32_t height_;
+
+  // Mask width
+  uint32_t mask_width_;
+
+  // Mask height
+  uint32_t mask_height_;
+
+  // Seed for random input data; 0 selects the preset input_data_ table instead
+  unsigned randomize_seed_;
+
+  // Preset input data, used when randomize_seed_ is 0
+  static const uint32_t input_data_[];
+};
+
+#endif //
TEST_SIMPLE_CONVOLUTION_SIMPLE_CONVOLUTION_H_
diff --git a/test/hsa/test/util/evt_stats.h b/test/hsa/test/util/evt_stats.h
new file mode 100644
index 00000000..01bc1317
--- /dev/null
+++ b/test/hsa/test/util/evt_stats.h
@@ -0,0 +1,98 @@
+#ifndef EVT_STATS_H_
+#define EVT_STATS_H_
+
+#include
+
+#include
+#include
+#include
+#include
+// Per-event statistics: a count and a running average of weights per id, dumped as CSV rows.
+template
+class EvtStatsT {
+ public:
+  typedef std::mutex mutex_t;
+  typedef uint64_t evt_count_t;
+  typedef double evt_avr_t;
+  struct evt_record_t {
+    uint64_t count;
+    evt_avr_t avr;
+    evt_record_t() : count(0), avr(0) {}
+  };
+  typedef typename std::map map_t;
+  typedef typename std::map labels_t;
+
+  // Comparison function: orders by average, ties broken by id
+  struct cmpfun {
+    template bool operator()(const T& a, const T& b) const {
+      return (a.second.avr != b.second.avr) ? a.second.avr < b.second.avr : a.first < b.first;
+    }
+  };
+  // Records one event for id and folds weight into that id's running average.
+  inline void add_event(evt_id_t id, evt_weight_t weight) {
+    std::lock_guard lck(mutex_);
+    //printf("EvtStats %p ::add_event %u %lu\n", this, id, weight); fflush(stdout);
+
+    evt_record_t& rec = map_[id];
+    const evt_count_t prev_count = rec.count;
+    const evt_count_t new_count = prev_count + 1;
+    const evt_avr_t prev_avr = rec.avr;
+    const evt_avr_t new_avr = ((prev_avr * prev_count) + weight) / new_count;
+
+    rec.count = new_count;
+    rec.avr = new_avr;
+  }
+  // Writes one CSV row per id in cmpfun order, then closes fdes_ (so dump() may run only once).
+  void dump() {
+    std::lock_guard lck(mutex_);
+    fprintf(stdout, "Dumping %s\n", path_); fflush(stdout);
+
+    typedef typename std::set, cmpfun> set_t;
+    set_t s_(map_.begin(), map_.end());
+
+    uint64_t index = 0;
+    for (auto& e : s_) {
+      const evt_id_t id = e.first;
+      const char* label = get_label(id);
+      std::ostringstream oss;
+      oss << index << ",\"" << label << "\"," << e.second.count << "," << (uint64_t)(e.second.avr) << "," << (uint64_t)(e.second.count * e.second.avr);
+      fprintf(fdes_, "%s\n", oss.str().c_str());
+      index += 1;
+    }
+
+    fclose(fdes_);
+  }
+  // Label lookup for numeric ids; never returns NULL because dump() streams the result.
+  const char* get_label(const uint32_t& id) {
+    // Look the id up without inserting; an unknown id or a NULL label yields "".
+    auto it = labels_.find(id);
+    return (it != labels_.end() && it->second != NULL) ? it->second : "";
+  }
+  const char* get_label(const char* id) {
+    return id;
+  }
+  const char* get_label(const std::string& id) {
+    return id.c_str();
+  }
+  // Stores the label pointer for id; the string itself is not copied.
+  void set_label(evt_id_t id, const char* label) {
+    //printf("EvtStats %p ::set_label %u %s\n", this, id, label); fflush(stdout);
+    labels_[id] = label;
+  }
+  // Writes the CSV header to f; f is closed later by dump().
+  EvtStatsT(FILE* f, const char* path) : fdes_(f), path_(path) {
+    //printf("EvtStats %p ::EvtStatsT()\n", this); fflush(stdout);
+    fprintf(fdes_, "Index,Name,Count,Avr,Total\n");
+  }
+
+ private:
+  mutex_t mutex_;
+  map_t map_;
+  labels_t labels_;
+  FILE* fdes_;
+  const char* path_;
+};
+
+typedef EvtStatsT EvtStats;
+
+#endif // EVT_STATS_H_
diff --git a/test/hsa/test/util/helper_funcs.h b/test/hsa/test/util/helper_funcs.h
new file mode 100644
index 00000000..c76854ba
--- /dev/null
+++ b/test/hsa/test/util/helper_funcs.h
@@ -0,0 +1,86 @@
+/******************************************************************************
+Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*******************************************************************************/
+
+#ifndef TEST_UTIL_HELPER_FUNCS_H_
+#define TEST_UTIL_HELPER_FUNCS_H_
+
+#include
+#include
+#include
+#include
+#include
+// Prints error_msg to std::cerr with an "Error: " prefix.
+static inline void Error(std::string error_msg) {
+  std::cerr << "Error: " << error_msg << std::endl;
+}
+// Prints header, then height rows of width elements from the row-major array data to std::clog.
+template
+void PrintArray(const std::string header, const T* data, const int width, const int height) {
+  std::clog << header << " :\n";
+  for (int i = 0; i < height; i++) {
+    std::clog << "> ";
+    for (int j = 0; j < width; j++) {
+      std::clog << data[i * width + j] << " ";
+    }
+    std::clog << "\n";
+  }
+}
+// Fills width * height elements with rand() values in [range_min, range_max]; false on NULL.
+template
+bool FillRandom(T* array_ptr, const int width, const int height, const T range_min,
+                const T range_max, unsigned int seed = 123) {
+  if (!array_ptr) {
+    Error("Cannot fill array. NULL pointer.");
+    return false;
+  }
+
+  if (!seed) seed = (unsigned int)time(NULL);
+
+  srand(seed);
+  double range = double(range_max - range_min) + 1.0;
+
+  /* random initialisation of input */
+  for (int i = 0; i < height; i++)
+    for (int j = 0; j < width; j++) {
+      int index = i * width + j;
+      array_ptr[index] = range_min + T(range * rand() / (RAND_MAX + 1.0));
+    }
+
+  return true;
+}
+// Rounds val up to the next power of two; a power of two is returned unchanged, 0 stays 0.
+template T RoundToPowerOf2(T val) {
+  const int bits = sizeof(T) * 8;  // shifts 1, 2, 4, ... must stay below the bit width of T
+
+  val--;
+  for (int shift = 1; shift < bits; shift <<= 1) val |= val >> shift;
+  val++;
+
+  return val;
+}
+// True when val has exactly one bit set (val & -val isolates the lowest set bit).
+template bool IsPowerOf2(T val) {
+  long long long_val = val;
+  return (((long_val & (-long_val)) - long_val == 0) && (long_val != 0));
+}
+
+#endif  // TEST_UTIL_HELPER_FUNCS_H_
diff --git a/test/hsa/test/util/hsa_rsrc_factory.cpp b/test/hsa/test/util/hsa_rsrc_factory.cpp
new file mode 120000
index 00000000..f3726ccf
--- /dev/null
+++ b/test/hsa/test/util/hsa_rsrc_factory.cpp
@@ -0,0 +1 @@
+../../src/hsa_rsrc_factory.cpp
\ No newline at end of file
diff --git a/test/hsa/test/util/hsa_rsrc_factory.h b/test/hsa/test/util/hsa_rsrc_factory.h
new file mode 120000
index 00000000..64af96f1
--- /dev/null
+++ b/test/hsa/test/util/hsa_rsrc_factory.h
@@ -0,0 +1 @@
+../../src/hsa_rsrc_factory.h
\ No newline at end of file
diff --git a/test/hsa/test/util/perf_timer.cpp b/test/hsa/test/util/perf_timer.cpp
new file mode 100644
index 00000000..85c490b6
--- /dev/null
+++ b/test/hsa/test/util/perf_timer.cpp
@@ -0,0 +1,179 @@
+/******************************************************************************
+Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*******************************************************************************/
+
+#include "util/perf_timer.h"
+
+PerfTimer::PerfTimer() { freq_in_100mhz_ = MeasureTSCFreqHz(); }
+
+PerfTimer::~PerfTimer() {
+  while (!timers_.empty()) {
+    Timer* temp = timers_.back();
+    timers_.pop_back();
+    delete temp;
+  }
+}
+
+// Creates a timer; the index of the newly created timer instance is returned
+int PerfTimer::CreateTimer() {
+  Timer* newTimer = new Timer;
+  newTimer->start = 0;
+  newTimer->clocks = 0;
+
+#ifdef _WIN32
+  QueryPerformanceFrequency((LARGE_INTEGER*)&newTimer->freq);
+#else
+  newTimer->freq = (long long)1.0E3;
+#endif
+
+  /* Push back the address of new Timer instance created */
+  timers_.push_back(newTimer);
+  return (int)(timers_.size() - 1);
+}
+// Records the start timestamp of timer `index`; FAILURE if index is negative or out of range.
+int PerfTimer::StartTimer(int index) {
+  if (index < 0 || index >= (int)timers_.size()) {
+    Error("Cannot reset timer. Invalid handle.");
+    return FAILURE;
+  }
+
+#ifdef _WIN32
+// General Windows timing method
+#ifndef _AMD
+  long long tmpStart;
+  QueryPerformanceCounter((LARGE_INTEGER*)&(tmpStart));
+  timers_[index]->start = (double)tmpStart;
+#else
+// AMD Windows timing method
+#endif
+#else
+// General Linux timing method
+#ifndef _AMD
+  struct timeval s;
+  gettimeofday(&s, 0);
+  timers_[index]->start = s.tv_sec * 1.0E3 + ((double)(s.tv_usec / 1.0E3));
+#else
+  // AMD timing method
+  unsigned int unused;
+  timers_[index]->start = __rdtscp(&unused);
+#endif
+#endif
+
+  return SUCCESS;
+}
+
+// Adds the time since StartTimer to the timer's accumulated clocks; FAILURE on a bad index.
+int PerfTimer::StopTimer(int index) {
+  double n = 0;
+  if (index < 0 || index >= (int)timers_.size()) {
+    Error("Cannot reset timer. Invalid handle.");
+    return FAILURE;
+  }
+#ifdef _WIN32
+#ifndef _AMD
+  long long n1;
+  QueryPerformanceCounter((LARGE_INTEGER*)&(n1));
+  n = (double)n1;
+#else
+// AMD Window Timing
+#endif
+
+#else
+// General Linux timing method
+#ifndef _AMD
+  struct timeval s;
+  gettimeofday(&s, 0);
+  n = s.tv_sec * 1.0E3 + (double)(s.tv_usec / 1.0E3);
+#else
+  // AMD Linux timing
+  unsigned int unused;
+  n = __rdtscp(&unused);
+#endif
+#endif
+
+  n -= timers_[index]->start;
+  timers_[index]->start = 0;
+
+#ifndef _AMD
+  timers_[index]->clocks += n;
+#else
+  // timers_[index]->clocks += 10 * n / freq_in_100mhz_; // unit is ns
+  timers_[index]->clocks += 1.0E-6 * 10 * n / freq_in_100mhz_;  // convert to ms
+#endif
+
+  return SUCCESS;
+}
+
+void PerfTimer::Error(std::string str) { std::cout << str << std::endl; }
+
+// Returns accumulated clocks / freq for timer `index`; returns FAILURE (1) on a bad index.
+double PerfTimer::ReadTimer(int index) {
+  if (index < 0 || index >= (int)timers_.size()) {
+    Error("Cannot read timer. Invalid handle.");
+    return FAILURE;
+  }
+
+  double reading = double(timers_[index]->clocks);
+
+  reading = double(reading / timers_[index]->freq);
+
+  return reading;
+}
+
+// Returns a timestamp in microseconds from the platform's coarse clock.
+uint64_t PerfTimer::CoarseTimestampUs() {
+#ifdef _WIN32
+  uint64_t freqHz, ticks;
+  QueryPerformanceFrequency((LARGE_INTEGER*)&freqHz);
+  QueryPerformanceCounter((LARGE_INTEGER*)&ticks);
+
+  // Scale numerator and divisor until (ticks * 1000000) fits in uint64_t.
+  while (ticks > (1ULL << 44)) {
+    ticks /= 16;
+    freqHz /= 16;
+  }
+
+  return (ticks * 1000000) / freqHz;
+#else
+  struct timespec ts;
+  clock_gettime(CLOCK_MONOTONIC_RAW, &ts);
+  return uint64_t(ts.tv_sec) * 1000000 + ts.tv_nsec / 1000;
+#endif
+}
+
+uint64_t PerfTimer::MeasureTSCFreqHz() {
+  // Make a coarse interval measurement of TSC ticks for 1 gigacycles.
+  unsigned int unused;
+  uint64_t tscTicksEnd;
+
+  uint64_t coarseBeginUs = CoarseTimestampUs();
+  uint64_t tscTicksBegin = __rdtscp(&unused);
+  do {
+    tscTicksEnd = __rdtscp(&unused);
+  } while (tscTicksEnd - tscTicksBegin < 1000000000);
+
+  uint64_t coarseEndUs = CoarseTimestampUs();
+
+  // Compute the TSC frequency in units of 100MHz, rounded to the nearest unit.
+  uint64_t coarseIntervalNs = (coarseEndUs - coarseBeginUs) * 1000;
+  uint64_t tscIntervalTicks = tscTicksEnd - tscTicksBegin;
+  return (tscIntervalTicks * 10 + (coarseIntervalNs / 2)) / coarseIntervalNs;
+}
diff --git a/test/hsa/test/util/perf_timer.h b/test/hsa/test/util/perf_timer.h
new file mode 100644
index 00000000..bfd55324
--- /dev/null
+++ b/test/hsa/test/util/perf_timer.h
@@ -0,0 +1,83 @@
+/******************************************************************************
+Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved.
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*******************************************************************************/
+
+#ifndef TEST_UTIL_PERF_TIMER_H_
+#define TEST_UTIL_PERF_TIMER_H_
+
+// Uses the AMD (TSC) timer or the general OS timer depending on the _AMD compilation flag;
+// the OS timer is selected per platform (Windows or Linux) with _WIN32
+
+#include
+#include
+#include
+
+#if defined(_MSC_VER)
+#include
+#include
+#include
+#else
+#if defined(__GNUC__)
+#include
+#include
+#endif  // __GNUC__
+#endif  // _MSC_VER
+
+#include
+#include
+#include
+
+class PerfTimer {
+ public:
+  enum { SUCCESS = 0, FAILURE = 1 };
+
+  PerfTimer();
+  ~PerfTimer();
+
+  // CreateTimer returns the index used by the other calls; Start/Stop return SUCCESS or FAILURE
+  int CreateTimer();
+  int StartTimer(int index);
+  int StopTimer(int index);
+
+  // retrieve accumulated time (clocks / freq) of the timer
+  double ReadTimer(int index);
+  // write into a file; NOTE(review): no definition found in perf_timer.cpp - confirm it is unused
+  double WriteTimer(int index);
+
+ private:
+  struct Timer {
+    std::string name; /* name of time object */
+    long long freq;   /* frequency, the divisor applied by ReadTimer */
+    double clocks;    /* accumulated elapsed time, added to by StopTimer */
+    double start;     /* timestamp recorded by StartTimer */
+  };
+
+  std::vector timers_; /* Timer objects created by CreateTimer, deleted in the destructor */
+  double freq_in_100mhz_;  /* result of MeasureTSCFreqHz(), set in the constructor */
+
+  // AMD timing method
+  uint64_t CoarseTimestampUs();
+  uint64_t MeasureTSCFreqHz();
+
+  void Error(std::string str);
+};
+
+#endif  // TEST_UTIL_PERF_TIMER_H_
diff --git a/test/hsa/test/util/test_assert.h b/test/hsa/test/util/test_assert.h
new file mode 100644
index 00000000..7803865d
--- /dev/null
+++ b/test/hsa/test/util/test_assert.h
@@ -0,0 +1,35 @@
+/******************************************************************************
+Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*******************************************************************************/
+
+#ifndef TEST_UTIL_TEST_ASSERT_H_
+#define TEST_UTIL_TEST_ASSERT_H_
+// TEST_ASSERT(cond): when cond is false, reports the expression with file and line on std::cerr
+#define TEST_ASSERT(cond)                                                                      \
+  { /* expands to a plain brace block, so `if (x) TEST_ASSERT(c); else ...` does not parse */ \
+    if (!(cond)) {                                                                             \
+      std::cerr << "Assert failed(" << #cond << ") at " << __FILE__ << ", line " << __LINE__   \
+                << std::endl;                                                                  \
+      exit(-1);                                                                                \
+    }                                                                                          \
+  }
+
+#endif  // TEST_UTIL_TEST_ASSERT_H_
diff --git a/test/hsa/test/util/xml.h b/test/hsa/test/util/xml.h
new file mode 100644
index 00000000..eb2f5074
--- /dev/null
+++ b/test/hsa/test/util/xml.h
@@ -0,0 +1,457 @@
+/******************************************************************************
+Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*******************************************************************************/
+
+#ifndef TEST_UTIL_XML_H_
+#define TEST_UTIL_XML_H_
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+
+namespace xml {
+
+class Xml {
+ public:
+  typedef std::vector token_t;
+
+  struct level_t;
+  typedef std::vector nodes_t;
+  typedef std::map opts_t;
+  struct level_t {
+    std::string tag;
+    nodes_t nodes;
+    opts_t opts;
+  };
+  typedef std::vector nodes_vec_t;
+  typedef std::map map_t;
+
+  enum { DECL_STATE, BODY_STATE };
+  // Opens and parses file_name, first recursing into its '#include "..."' files; NULL on failure.
+  static Xml* Create(const std::string& file_name, const Xml* obj = NULL) {
+    Xml* xml = new Xml(file_name, obj);
+    if (xml != NULL) {
+      if (xml->Init() == false) {
+        delete xml;
+        xml = NULL;
+      } else {
+        const std::size_t pos = file_name.rfind('/');
+        const std::string path = (pos != std::string::npos) ? file_name.substr(0, pos + 1) : "";
+
+        xml->PreProcess();
+        nodes_t incl_nodes;
+        for (auto* node : xml->GetNodes("top.include")) {
+          if (node->opts.find("touch") == node->opts.end()) {
+            node->opts["touch"] = "";
+            incl_nodes.push_back(node);
+          }
+        }
+        for (auto* incl : incl_nodes) {
+          const std::string& incl_name = path + incl->opts["file"];
+          Xml* ixml = Create(incl_name, xml);
+          if (ixml == NULL) {
+            delete xml;
+            xml = NULL;
+            break;
+          } else {
+            delete ixml;
+          }
+        }
+        if (xml) {
+          xml->Process();
+        }
+      }
+    }
+
+    return xml;
+  }
+
+  static void Destroy(Xml* xml) { delete xml; }
+
+  std::string GetName() { return file_name_; }
+  // Appends a node under full_tag whose "name" and "expr" options are the given strings.
+  void AddExpr(const std::string& full_tag, const std::string& name, const std::string& expr) {
+    const std::size_t pos = full_tag.rfind('.');
+    const std::size_t pos1 = (pos == std::string::npos) ? 0 : pos + 1;
+    const std::string level_tag = full_tag.substr(pos1);
+    level_t* level = new level_t;
+    (*map_)[full_tag].push_back(level);
+    level->tag = level_tag;
+    level->opts["name"] = name;
+    level->opts["expr"] = expr;
+  }
+  // Same as AddExpr with val formatted as a decimal string.
+  void AddConst(const std::string& full_tag, const std::string& name, const uint64_t& val) {
+    std::ostringstream oss;
+    oss << val;
+    AddExpr(full_tag, name, oss.str());
+  }
+
+  nodes_t GetNodes(const std::string& global_tag) { return (*map_)[global_tag]; }
+  // Calls f.fun(tag, node) for the nodes of every tag; a false return skips the rest of that tag.
+  template F ForEach(const F& f_i) {
+    F f = f_i;
+    if (map_) {
+      for (auto& entry : *map_) {
+        for (auto node : entry.second) {
+          if (f.fun(entry.first, node) == false) break;
+        }
+      }
+    }
+    return f;
+  }
+
+  template F ForEach(const F& f_i) const {
+    F f = f_i;
+    if (map_) {
+      for (auto& entry : *map_) {
+        for (auto node : entry.second) {
+          if (f.fun(entry.first, node) == false) break;
+        }
+      }
+    }
+    return f;
+  }
+
+  struct print_func {
+    bool fun(const std::string& global_tag, level_t* node) {
+      for (auto& opt : node->opts) {
+        std::cout << global_tag << "." << opt.first << " = " << opt.second << std::endl;
+      }
+      return true;
+    }
+  };
+
+  void Print() const {
+    std::cout << "XML file '" << file_name_ << "':" << std::endl;
+    ForEach(print_func());
+  }
+
+ private:
+  Xml(const std::string& file_name, const Xml* obj)
+      : file_name_(file_name),
+        file_line_(0),
+        data_size_(0),
+        index_(0),
+        state_(BODY_STATE),
+        comment_(false),
+        included_(false),
+        level_(NULL),
+        map_(NULL) {
+    if (obj != NULL) {
+      map_ = obj->map_;
+      level_ = obj->level_;
+      included_ = true;
+    }
+  }
+
+  struct delete_func {
+    bool fun(const std::string&, level_t* node) {
+      delete node;
+      return true;
+    }
+  };
+  // Only the top-level (non-included) object owns map_ and the nodes stored in it.
+  ~Xml() {
+    if (included_ == false) {
+      ForEach(delete_func());
+      delete map_;
+    }
+  }
+  // Opens the file read-only and, for a top-level object, creates the map with a "top" level.
+  bool Init() {
+    fd_ = open(file_name_.c_str(), O_RDONLY);
+    if (fd_ == -1) {
+      // perror((std::string("open XML file ") + file_name_).c_str());
+      return false;
+    }
+
+    if (map_ == NULL) {
+      map_ = new map_t;
+      if (map_ == NULL) return false;
+      AddLevel("top");
+    }
+
+    return true;
+  }
+  // Records each '#include "file"' line found at a read position as an "include" level, rewinds.
+  void PreProcess() {
+    uint32_t ind = 0;
+    char buf[kBufSize];
+    bool error = false;
+
+    while (1) {
+      const uint32_t pos = lseek(fd_, 0, SEEK_CUR);
+      const ssize_t nread = read(fd_, buf, kBufSize);
+      if (nread <= 0) break;  // 0 is end of file, -1 is a read error; test before going unsigned
+      uint32_t size = static_cast(nread);
+      buf[size - 1] = '\0';
+      if (strncmp(buf, "#include \"", 10) == 0) {
+        for (ind = 0; (ind < size) && (buf[ind] != '\n'); ++ind) {}
+        if (ind == size) {
+          fprintf(stderr, "XML PreProcess failed, line size limit %zu\n", kBufSize);
+          error = true;
+          break;
+        }
+        buf[ind] = '\0';
+        size = ind;
+        lseek(fd_, pos + ind + 1, SEEK_SET);
+
+        for (ind = 10; (ind < size) && (buf[ind] != '"'); ++ind) {}
+        if (ind == size) {
+          error = true;
+          break;
+        }
+        buf[ind] = '\0';
+
+        AddLevel("include");
+        AddOption("file", &buf[10]);
+        UpLevel();
+      }
+    }
+
+    if (error) {
+      fprintf(stderr, "XML PreProcess failed, line '%s'\n", buf);
+      exit(1);
+    }
+
+    lseek(fd_, 0, SEEK_SET);
+  }
+
+  void Process() {
+    token_t remainder;
+
+    while (1) {
+      token_t token = (remainder.size()) ?
remainder : NextToken(); + remainder.clear(); + + // token_t token1 = token; + // token1.push_back('\0'); + // std::cout << "> " << &token1[0] << std::endl; + + // End of file + if (token.size() == 0) break; + + switch (state_) { + case BODY_STATE: + if (token[0] == '<') { + bool node_begin = true; + unsigned ind = 1; + if (token[1] == '/') { + node_begin = false; + ++ind; + } + + unsigned i = ind; + while (i < token.size()) { + if (token[i] == '>') break; + ++i; + } + for (unsigned j = i + 1; j < token.size(); ++j) remainder.push_back(token[j]); + + if (i == token.size()) { + if (node_begin) + state_ = DECL_STATE; + else + BadFormat(token); + token.push_back('\0'); + } else { + token[i] = '\0'; + } + + const char* tag = &token[ind]; + if (node_begin) { + AddLevel(tag); + } else { + if (strncmp(CurrentLevel().c_str(), tag, strlen(tag)) != 0) { + token.back() = '>'; + BadFormat(token); + } + UpLevel(); + } + } else { + BadFormat(token); + } + break; + case DECL_STATE: + if (token[0] == '>') { + state_ = BODY_STATE; + for (unsigned j = 1; j < token.size(); ++j) remainder.push_back(token[j]); + continue; + } else { + token.push_back('\0'); + unsigned j = 0; + for (j = 0; j < token.size(); ++j) + if (token[j] == '=') break; + if (j == token.size()) BadFormat(token); + token[j] = '\0'; + const char* key = &token[0]; + const char* value = &token[j + 1]; + AddOption(key, value); + } + break; + default: + std::cout << "XML parser error: wrong state: " << state_ << std::endl; + exit(1); + } + } + } + + bool SpaceCheck() const { + bool cond = ((buffer_[index_] == ' ') || (buffer_[index_] == '\t')); + return cond; + } + + bool LineEndCheck() { + bool found = false; + if (buffer_[index_] == '\n') { + buffer_[index_] = ' '; + ++file_line_; + found = true; + comment_ = false; + } else if (comment_ || (buffer_[index_] == '#')) { + found = true; + comment_ = true; + } + return found; + } + + token_t NextToken() { + token_t token; + bool in_string = false; + bool special_symb = 
false; + + while (1) { + if (data_size_ == 0) { + data_size_ = read(fd_, buffer_, kBufSize); + if (data_size_ <= 0) break; + } + + if (token.empty()) { + while ((index_ < data_size_) && (SpaceCheck() || LineEndCheck())) { + ++index_; + } + } + while ((index_ < data_size_) && (in_string || !(SpaceCheck() || LineEndCheck()))) { + const char symb = buffer_[index_]; + bool skip_symb = false; + + switch (symb) { + case '\\': + if (special_symb) { + special_symb = false; + } else { + special_symb = true; + skip_symb = true; + } + break; + case '"': + if (special_symb) { + special_symb = false; + } else { + in_string = !in_string; + if (!in_string) { + buffer_[index_] = ' '; + --index_; + } + skip_symb = true; + } + break; + } + + if (!skip_symb) token.push_back(symb); + ++index_; + } + + if (index_ == data_size_) { + index_ = 0; + data_size_ = 0; + } else { + if (special_symb || in_string) BadFormat(token); + break; + } + } + + return token; + } + + void BadFormat(token_t token) { + token.push_back('\0'); + std::cout << "Error: " << file_name_ << ", line " << file_line_ << ", bad XML token '" + << &token[0] << "'" << std::endl; + exit(1); + } + + void AddLevel(const std::string& tag) { + level_t* level = new level_t; + level->tag = tag; + if (level_) { + level_->nodes.push_back(level); + stack_.push_back(level_); + } + level_ = level; + + std::string global_tag; + for (level_t* level : stack_) { + global_tag += level->tag + "."; + } + global_tag += tag; + (*map_)[global_tag].push_back(level_); + } + + void UpLevel() { + level_ = stack_.back(); + stack_.pop_back(); + } + + std::string CurrentLevel() const { return level_->tag; } + + void AddOption(const std::string& key, const std::string& value) { level_->opts[key] = value; } + + const std::string file_name_; + unsigned file_line_; + int fd_; + + static const size_t kBufSize = 256; + char buffer_[kBufSize]; + + unsigned data_size_; + unsigned index_; + unsigned state_; + bool comment_; + std::vector stack_; + bool 
included_; + level_t* level_; + map_t* map_; +}; + +} // namespace xml + +#endif // TEST_UTIL_XML_H_ From bdf5bb6b23413bf0b0c49ede7b3f8290a4923eb3 Mon Sep 17 00:00:00 2001 From: Ranjith Ramakrishnan Date: Tue, 26 Apr 2022 03:03:02 -0700 Subject: [PATCH 38/47] Populate roctracer.h wrapper file with orginal file contents as dead code Backward comaptibility for components that search for contents in roctracer.h Improvements: Removed redundant code for setting and unsetting variables Added header template file in source code instead of generating it on build time Change-Id: I96aeb7f2a6d53d45eb5aeb5300024cd22dad1324 (cherry picked from commit 8ca752ce2cb13ca7859ea57b6508d0eb2f291c2c) --- header_template.hpp.in | 31 +++++++++++++++++ roctracer-backward-compat.cmake | 60 +++++++++++++-------------------- 2 files changed, 55 insertions(+), 36 deletions(-) create mode 100644 header_template.hpp.in diff --git a/header_template.hpp.in b/header_template.hpp.in new file mode 100644 index 00000000..39178898 --- /dev/null +++ b/header_template.hpp.in @@ -0,0 +1,31 @@ +/* + Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. + */ + +#ifndef @include_guard@ +#define @include_guard@ + +#pragma message("This file is deprecated. Use file from include path /opt/rocm-ver/include/ and prefix with roctracer") +@include_statements@ + +#endif + +@original_contents@ diff --git a/roctracer-backward-compat.cmake b/roctracer-backward-compat.cmake index ccea223e..d0f927c2 100644 --- a/roctracer-backward-compat.cmake +++ b/roctracer-backward-compat.cmake @@ -25,29 +25,19 @@ set(ROCT_WRAPPER_INC_DIR ${ROCT_WRAPPER_DIR}/include) set(ROCT_WRAPPER_LIB_DIR ${ROCT_WRAPPER_DIR}/lib) set(ROCT_WRAPPER_TOOL_DIR ${ROCT_WRAPPER_DIR}/tool) -#Function to generate header template file -function(create_header_template) - file(WRITE ${ROCT_WRAPPER_DIR}/header.hpp.in "/* - Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. +#Function to set actual file contents in wrapper files +#Some components grep for the contents in the file +function(set_file_contents input_file) + set(hashzero_check "#if 0 //Area for original file dump\n +/* The following is a copy of the original file for the benefit of build systems which grep for values + * in this file rather than preprocess it. 
This is just for backward compatibility */") - Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the \"Software\"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - - The above copyright notice and this permission notice shall be included in - all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - THE SOFTWARE. - */\n\n#ifndef @include_guard@\n#define @include_guard@ \n\n#pragma message(\"This file is deprecated. 
Use file from include path /opt/rocm-ver/include/ and prefix with roctracer\")\n@include_statements@ \n\n#endif") + file(READ ${input_file} file_contents) + set(original_contents "${hashzero_check}\n +${file_contents} +#endif") + get_filename_component(file_name ${input_file} NAME) + configure_file(${CMAKE_CURRENT_SOURCE_DIR}/header_template.hpp.in ${ROCT_WRAPPER_INC_DIR}/${file_name}) endfunction() #use header template file and generate wrapper header files @@ -58,33 +48,33 @@ function(generate_wrapper_header) #set include guard get_filename_component(INC_GAURD_NAME ${header_file} NAME_WE) string(TOUPPER ${INC_GAURD_NAME} INC_GAURD_NAME) - set(include_guard "${include_guard}ROCTRACER_WRAPPER_INCLUDE_${INC_GAURD_NAME}_H") + set(include_guard "ROCTRACER_WRAPPER_INCLUDE_${INC_GAURD_NAME}_H") #set include statements get_filename_component(file_name ${header_file} NAME) get_filename_component ( header_subdir ${header_file} DIRECTORY ) if(header_subdir) - set(include_statements "${include_statements}#include \"../../../include/${ROCTRACER_NAME}/${header_subdir}/${file_name}\"\n") - configure_file(${ROCT_WRAPPER_DIR}/header.hpp.in ${ROCT_WRAPPER_INC_DIR}/${header_subdir}/${file_name}) + set(include_statements "#include \"../../../include/${ROCTRACER_NAME}/${header_subdir}/${file_name}\"\n") + configure_file(${CMAKE_CURRENT_SOURCE_DIR}/header_template.hpp.in ${ROCT_WRAPPER_INC_DIR}/${header_subdir}/${file_name}) else() - set(include_statements "${include_statements}#include \"../../include/${ROCTRACER_NAME}/${file_name}\"\n") - configure_file(${ROCT_WRAPPER_DIR}/header.hpp.in ${ROCT_WRAPPER_INC_DIR}/${file_name}) + set(include_statements "#include \"../../include/${ROCTRACER_NAME}/${file_name}\"\n") + if(${file_name} STREQUAL "roctracer.h") + set_file_contents(${CMAKE_CURRENT_SOURCE_DIR}/inc/${file_name}) + else() + configure_file(${CMAKE_CURRENT_SOURCE_DIR}/header_template.hpp.in ${ROCT_WRAPPER_INC_DIR}/${file_name}) + endif() endif() - unset(include_guard) - 
unset(include_statements) endforeach() foreach(header_file ${GEN_HEADERS}) #set include guard get_filename_component(INC_GAURD_NAME ${header_file} NAME_WE) string(TOUPPER ${INC_GAURD_NAME} INC_GAURD_NAME) - set(include_guard "${include_guard}ROCTRACER_WRAPPER_INCLUDE_${INC_GAURD_NAME}_H") + set(include_guard "ROCTRACER_WRAPPER_INCLUDE_${INC_GAURD_NAME}_H") #set include statements get_filename_component(file_name ${header_file} NAME) - set(include_statements "${include_statements}#include \"../../include/${ROCTRACER_NAME}/${file_name}\"\n") - configure_file(${ROCT_WRAPPER_DIR}/header.hpp.in ${ROCT_WRAPPER_INC_DIR}/${file_name}) + set(include_statements "#include \"../../include/${ROCTRACER_NAME}/${file_name}\"\n") + configure_file(${CMAKE_CURRENT_SOURCE_DIR}/header_template.hpp.in ${ROCT_WRAPPER_INC_DIR}/${file_name}) - unset(include_guard) - unset(include_statements) endforeach() endfunction() @@ -117,8 +107,6 @@ function(create_library_symlink) ../../lib/${ROCTRACER_NAME}/${LIB_ROCTRACERTOOL} ${ROCT_WRAPPER_TOOL_DIR}/${LIB_TRACERTOOL}) endfunction() -#Creater a template for header file -create_header_template() #Use template header file and generater wrapper header files generate_wrapper_header() install(DIRECTORY ${ROCT_WRAPPER_INC_DIR} DESTINATION ${ROCTRACER_NAME}) From 6f10c1212792d6154d4602bc1d897e08dd6fee3c Mon Sep 17 00:00:00 2001 From: Sriraksha Nagaraj Date: Wed, 21 Sep 2022 14:52:34 -0500 Subject: [PATCH 39/47] SWDEV-357760:Adding patch to fix regression test failure Change-Id: Ic81c0c20ab295e4120f6fc3b4f055559ebf1a8c5 --- script/hsaap.py | 4 ++-- src/roctracer/roctracer.cpp | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/script/hsaap.py b/script/hsaap.py index 113e005a..6046fb2a 100755 --- a/script/hsaap.py +++ b/script/hsaap.py @@ -335,13 +335,13 @@ def __init__(self, out_h_file, hsa_dir, api_table_h, api_headers, license): self.cpp_content += '#include \"util/callback_table.h\"\n\n' self.cpp_content += '#include \n' 
self.cpp_content += 'namespace roctracer {\n' + self.cpp_content += 'extern activity_correlation_id_t NextCorrelationId();\n' self.cpp_content += 'namespace hsa_support {\n\n' self.cpp_content += 'static CoreApiTable CoreApi_saved_before_cb;\n' self.cpp_content += 'static AmdExtTable AmdExt_saved_before_cb;\n' self.cpp_content += 'static ImageExtTable ImageExt_saved_before_cb;\n\n' - self.cpp_content += 'std::atomic hsa_counter_{1};\n' self.cpp_content += 'static thread_local uint64_t hsa_correlation_id_tls = 0;\n' self.cpp_content += self.add_section('API callback functions', '', self.gen_callbacks) @@ -430,7 +430,7 @@ def gen_callbacks(self, n, name, call, struct): content += ' api_data.args.' + call + '.' + var + '__val = ' + '*(' + var + ');\n' content += ' auto [ api_callback_fun, api_callback_arg ] = cb_table.Get(' + call_id + ');\n' content += ' api_data.phase = 0;\n' - content += ' api_data.correlation_id = hsa_support::hsa_counter_.fetch_add(1, std::memory_order_relaxed);\n' + content += ' api_data.correlation_id = NextCorrelationId();\n' content += ' hsa_correlation_id_tls = api_data.correlation_id;\n' content += ' if (api_callback_fun) api_callback_fun(ACTIVITY_DOMAIN_HSA_API, ' + call_id + ', &api_data, api_callback_arg);\n' if ret_type != 'void': diff --git a/src/roctracer/roctracer.cpp b/src/roctracer/roctracer.cpp index 30bf4928..4a712972 100644 --- a/src/roctracer/roctracer.cpp +++ b/src/roctracer/roctracer.cpp @@ -161,7 +161,7 @@ roctracer_status_t GetExcStatus(const std::exception& e) { return (roctracer_exc_ptr) ? 
roctracer_exc_ptr->status() : ROCTRACER_STATUS_ERROR; } -static auto NextCorrelationId() { +activity_correlation_id_t NextCorrelationId() { static std::atomic counter{1}; return counter.fetch_add(1, std::memory_order_relaxed); } From 72b38dc1dbfd1ed5d4fe0a5ae43fef64f83555aa Mon Sep 17 00:00:00 2001 From: Laurent Morichetti Date: Thu, 8 Sep 2022 18:30:54 -0700 Subject: [PATCH 40/47] Fix hsa_support::timestamp_ns if HSA is not yet initialized Default to the HSA runtime's hsa_system_get_info if the saved HSA functions table is not yet initialized. Change-Id: I3659095a5ad662f7ca8b0d92bd035901c6d66bb0 (cherry picked from commit 87ffbd27f4121f71e5c8e59b19b4e932eb309e8a) --- src/roctracer/roctracer.cpp | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/src/roctracer/roctracer.cpp b/src/roctracer/roctracer.cpp index 4a712972..c5c2290b 100644 --- a/src/roctracer/roctracer.cpp +++ b/src/roctracer/roctracer.cpp @@ -125,18 +125,23 @@ roctracer_stop_cb_t roctracer_stop_cb = nullptr; namespace util { roctracer_timestamp_t timestamp_ns() { - uint64_t sysclock; + // If the HSA intercept is installed, then use the "original" 'hsa_system_get_info' function to + // avoid reporting calls for internal use of the HSA API by the tracer. + auto hsa_system_get_info_fn = hsa_support::saved_core_api.hsa_system_get_info_fn; + + // If the HSA intercept is not installed, use the default 'hsa_system_get_info'. 
+ if (hsa_system_get_info_fn == nullptr) hsa_system_get_info_fn = hsa_system_get_info; - hsa_status_t status = - hsa_support::saved_core_api.hsa_system_get_info_fn(HSA_SYSTEM_INFO_TIMESTAMP, &sysclock); + uint64_t sysclock; + hsa_status_t status = hsa_system_get_info_fn(HSA_SYSTEM_INFO_TIMESTAMP, &sysclock); if (status == HSA_STATUS_ERROR_NOT_INITIALIZED) return 0; CHECK_HSA_STATUS("hsa_system_get_info()", status); - static uint64_t sysclock_period = []() { + static uint64_t sysclock_period = [&]() { uint64_t sysclock_hz = 0; - hsa_status_t status = hsa_support::saved_core_api.hsa_system_get_info_fn( - HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY, &sysclock_hz); + hsa_status_t status = hsa_system_get_info_fn(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY, &sysclock_hz); CHECK_HSA_STATUS("hsa_system_get_info()", status); + return (uint64_t)1000000000 / sysclock_hz; }(); @@ -1294,6 +1299,9 @@ ROCTRACER_EXPORT void OnUnload() { hsa_support::saved_amd_ext_api.hsa_amd_profiling_async_copy_enable_fn(false); assert(status == HSA_STATUS_SUCCESS && "hsa_amd_profiling_async_copy_enable failed"); } + + memset(&hsa_support::saved_core_api, '\0', sizeof(hsa_support::saved_core_api)); + memset(&hsa_support::saved_amd_ext_api, '\0', sizeof(hsa_support::saved_amd_ext_api)); } } // extern "C" \ No newline at end of file From bd2ac1af9e2b5332d0a13c64b722985e8b9ed1d6 Mon Sep 17 00:00:00 2001 From: Laurent Morichetti Date: Wed, 12 Oct 2022 21:00:09 -0700 Subject: [PATCH 41/47] SWDEV-351980 - Squashed commit of the following: commit 8a575d8d6ec5b5cd7f5996494fd3ac9676ee4712 Author: Laurent Morichetti Date: Fri Sep 30 13:24:23 2022 -0700 Remove the thread local begin_timestamp stack Using a thread_local object is problematic as the thread local destructors are called first before any global destructor, making the object invalid while tearing down the process. 
rocblas uses a global destructor to clean up the loaded HIP modules and ends up calling hip_executable_destroy after the timestamp stack is destructed. As a result the begin timestamp for that API function is 0. The solution is to store the phase_enter timestamp in the phase_data. Change-Id: If143f4d123dfb111c72fb20365431d07e73fc570 commit 6416434d3b3a31c04b6c4feeb2fba013b3beb4e7 Author: Laurent Morichetti Date: Fri Sep 30 11:02:27 2022 -0700 Fix a profiling data corrupted error Using rocprof with ROCP_MCOPY_DATA=1 while tracing HSA produces the following error: tblextr.py: Memcpy args "(0x7feb16a00000, 123handle=28593376125, 0x7feb12a00010, 123handle=27558560125, 4194304, 0, 0, 123handle=140661639440000125) = 1" cannot be identified Profiling data corrupted: ' ./out/rpl_data_220930_143009_1826700/input_results_220930_143009/results.txt' There are two issues: 1) The hsa_agent_t handle argument is misprinted: "123handle=...125" Instead of printing '{' and '}', it prints '123' and '125'. The wrong operator<<(unsigned char) is used and an integer value is printed instead of a char. Use std::operator<< instead of hsa_support::detail::operator<< to print '{' and '}' 2) The result value is uninitialized and in some cases printed as a negative integer value. The leading '-' is not matched by the mem_manager regular expression for HSA api calls. Correctly capture the HSA function's return value. Change-Id: If13a1e62eeb4e598447c4b90d53d1b2e3b408696 commit 329c0467cbe441da87339f9ab1e4302e41fb7efd Author: Laurent Morichetti Date: Wed Sep 28 15:41:05 2022 -0700 Fix an issue with async copy timestamps The timestamps coming from the HIP runtime for asynchronous memory copies are corrupted (begin > end) because the HSA setting to record timestamps is turned off by the tracer's HSA intercept. The solution is to intercept hsa_amd_profiling_async_copy_enable and remember the application/runtime's request so that it can be ORed with IsEnabled(ACTIVITY_DOMAIN_HSA_OPS, HSA_OP_ID_COPY).
Change-Id: Ib687cbf36711563e86c2bb8bc934c7c51572bfde commit b664937ebd6e0b89e775bc2393c370ac4b3259df Author: Laurent Morichetti Date: Mon Sep 26 09:35:03 2022 -0700 Use the "safe" Stack for begin_timestamp The tracer tool needs to remember the begin timestamps for API callbacks, and uses a thread_local std::stack for that purpose. The issue with thread_local objects is that they are destructed before anything else when the main thread exits. To work around that issue, we use a "safe" stack in the roctracer API. Use the same "safe" stack in the tracer tool. Change-Id: I0d69d4eb44f0205f4102d0d5ef9803a1ec1800a5 commit a287f20961d73e7b647ede0b18dc22294971ee07 Author: Laurent Morichetti Date: Mon Sep 26 09:27:07 2022 -0700 Fix a typo in HipLoader rocprof errors out with the following message: symbol lookup 'KernelNameRef' failed: libamdhip64.so.5: undefined \ symbol: KernelNameRef The HipLoader is incorrectly looking for a KernelNameRef symbol instead of hipKernelNameRef. Fixed the typo: KernelNameRef -> hipKernelNameRef. Change-Id: Ia4860e1669707b0c83d67e71b78d362b07a6aaa7 commit bb98bc7d856d54bc5dd2b64d95d4752ee396526c Author: Laurent Morichetti Date: Mon Sep 12 13:03:48 2022 -0700 Clean up logger.h Change-Id: Ibcb58d2236b012d00c3fc421a425c03093de5d50 commit 67ce5fae13c28f7a6f904482c101e9f6584f6cfe Author: Laurent Morichetti Date: Thu Sep 15 10:33:38 2022 -0700 Fix an array subscript out-of-bounds error Starting with gcc-11 (verified with gcc-12 as well), an array out-of-bounds subscript error is reported for accessing the registration table element at the operation ID index. Validating the index in the function calling Register/Unregister does not quiet the warning/error in release builds, so, for gcc-11 and gcc-12, we disable that warning just for the RegistrationTable class. 
Change-Id: I6bc4a02aa072cfa8905ecde5e3960aebf32fc912 commit 05ee3ff973fa9a9ad6620254bf39da4b1c87e72a Author: Laurent Morichetti Date: Thu Sep 8 22:08:08 2022 -0700 Cleanup the include files Use #include "header" instead of #include <header>
so that the header files are found when the application #includes with -I /opt/rocm/include. Change-Id: I24feac9a5030d3600aee98084340e246c3990db5 commit 4856d339594d4e54fe55f9b2717be5794970595b Author: Laurent Morichetti Date: Fri Sep 9 10:04:16 2022 -0700 SWDEV-355896 - Fix a data corruption error in post processing The post-processing script cannot handle HIP ops without a correlation ID. The correlation ID is needed to connect the record to a HIP stream and originating thread. This issue was exposed by a change to the tracer API to report asynchronous activities even if their originating synchronous API activity (callback) is not enabled. This was a flaw in the API. Also fix an issue with the API filtering. Undefined API names should not cause an exception; they should be ignored. Change-Id: Iab2221af6180ade2b9c2eb10c256c3a73d872e9f commit 900d5e0a64d3b75d612b9841241117e705400db6 Author: Laurent Morichetti Date: Thu Sep 8 21:04:41 2022 -0700 Fix the symbol name for deprecated functions Change-Id: I53c0af1d1f6a3998992bdaa737e9b10829e5abc3 commit 87ffbd27f4121f71e5c8e59b19b4e932eb309e8a Author: Laurent Morichetti Date: Thu Sep 8 18:30:54 2022 -0700 Fix hsa_support::timestamp_ns if HSA is not yet initialized Default to the HSA runtime's hsa_system_get_info if the saved HSA functions table is not yet initialized. Change-Id: I3659095a5ad662f7ca8b0d92bd035901c6d66bb0 commit db69cc1c9f2116cc0550433069da47c0b88550c4 Author: Laurent Morichetti Date: Thu Sep 8 00:31:03 2022 -0700 Fix the Loader Instead of dlopen'ing RTLD_NOLOAD a library (for example libamdhip64.so) and relying on the dynamic linker search path, search through the already loaded shared objects for a library with a matching name.
Change-Id: I3e74d432bd7ca68df8927ca435b290e86aaaf9e9 commit ab3f361f618e426e9f502a33b8bb34e39c0e9884 Author: Laurent Morichetti Date: Wed Sep 7 21:12:33 2022 -0700 SWDEV-351980 - Remove the ROCtracer private interface from the public header Change-Id: Ib3183e87d0c2bd1679926a4da9bbb6e46d70fb9f commit 2673bf5e2c569387050bf54a67bb0f93e8894264 Author: Laurent Morichetti Date: Fri Sep 2 12:40:15 2022 -0700 SWDEV-351980 - Consolidate registration tables in the roctracer Change-Id: I44cd1cc81cf6a529aed89ee8db1377c0aa67f0dc commit 57867e480363b946813e3963e04c14e55d2c14b4 Author: Laurent Morichetti Date: Thu Aug 18 14:50:51 2022 -0700 Use fatal() and warning() for logging errors Change-Id: I4d525ed2a7dba72beff6fbe43383015e55465fcd commit 9d69e7d49a3ac3d91ed4ed467560ebafc10efa53 Author: Laurent Morichetti Date: Tue Aug 2 09:18:27 2022 -0700 Remove tracker.h Change-Id: I74860431c5f4c4954ddb79fb7e2a613fecc8793b commit 61c232bc693dd317a0be9cb571c936ee3659ace1 Author: Laurent Morichetti Date: Mon Jul 11 08:18:26 2022 -0700 Fix nested timestamps Change-Id: I6385d52cc858670a116f5c2eb65e4f19be73190f commit 9c57b150af4b55630848f6ba9d67709e0601417f Author: Laurent Morichetti Date: Thu Jul 7 13:18:38 2022 -0700 Remove the ROCprofiler loader Was used for the HSA_EVT activities, so no longer needed. 
Change-Id: I7729fb4519f2e3cee73776264647381cb5826067 commit c2b87b1fd783806469a2244ad66e5070432eccfb Author: Laurent Morichetti Date: Fri Jun 10 18:07:30 2022 -0700 Bring the HSA_EVT callbacks back to the roctracer Change-Id: I26080b264d7989880ba7e9f00502cc680b2256d7 commit ac3214d32ab8ea14538c9e209c2fdbb8a84e0dc4 Author: Laurent Morichetti Date: Thu Aug 18 20:55:54 2022 -0700 Use a global correlation_id for all records Change-Id: I87fe16fefb52a95242bc64b7007b71c9d8978d44 commit 340c7cb5536e241f364038e2890326a0d20f6883 Author: Laurent Morichetti Date: Tue Aug 30 18:47:00 2022 -0700 SWDEV-351980 - Use the new hipRegister/RemoveAsyncActivityCallback Remove the hipInitActivityCallback and use the new hipRegister/ RemoveActivityCallback which allows distinct memory pools to be used for HIP_OPS activities. Enable the multi_pool_activities test. Change-Id: I6f6feaedecc9c36285bea975caf24dbf8f5f624b commit f0e082feb17f250dd95e136a5de30f35fe00d427 Author: Laurent Morichetti Date: Tue Aug 16 20:03:10 2022 -0700 SWDEV-351980 - Remove HipApi{Callback|Activity}{Enable|Disable}Check The code is easier to read if calling HIPActivityCallbackTracker enable/disable_check directly. Both enable/disable_check return the new mask, and the check whether a callback is already installed is clearer. Change-Id: Ic90d34489b5b4d9929dc08b4d9e93cc974b136b1 commit 88c6e0a700a85b5b4ed083edbb20f841e937048d Author: Laurent Morichetti Date: Thu Aug 4 11:38:08 2022 -0700 SWDEV-351980 - Don't allocate hip_api_data and record The HIP runtime is now allocating the hip_api_data and record on its stack so we don't need the thread local record_data_pair stack anymore. Refactor the API callback function to handle both the case where synchronous user callbacks are requested and the case where asynchronous records are requested (enable_callback & enable_activity respectively). If the callback argument (memory pool) is not null, then activity records are requested. 
Remove CorrelationIdRegister and CorrelationIdLookup. These were used by the HIP runtime to associate a HIP record id to a ROCtracer correlation id. Instead, the HIP runtime is now using the correlation ID returned in the hip_api_data_t. Added a test to check enabling/disabling concurrent callbacks and activities. Change-Id: I5850cfead9861eb3602a3e8fcb7b22580d5fc979 commit ad01ba513a449341cdd97000b0f729968b1aca36 Author: Laurent Morichetti Date: Tue Sep 6 18:57:20 2022 -0700 Deprecate enable/disable_callback/activity[_expl] These functions have little value as it is very unlikely an application would want to enable all the domains. Change-Id: I4743e8ddf6743e60c95c7ba5240950d2ef734301 commit cfdfa2a2d45d3b20c383cdcc9bc5bf30ce844ca6 Author: Laurent Morichetti Date: Fri Aug 26 11:19:07 2022 -0700 Add multi_pool_activities test This test checks that asynchronous activities can be enabled in distinct memory pools. It enables activity reporting for HIP kernel dispatches in one memory pool, and memory copy reporting in another memory pool. The output of this test to stdout should be a series of kernel dispatch records (10) followed by a series of memory copy records (10). The records should not be interleaved. Change-Id: Idb5cca7e650b2312a1955909932364f914737856 commit 006ce7b65d7930c1294e5dfd2db4b85a002e8def Author: Laurent Morichetti Date: Mon Aug 22 21:20:04 2022 -0700 Remove global variables from the file plugin The plugin's file scope global variables destructors could be called before roctracer_plugin_finalize is called, making the global variables undefined by the time roctracer_plugin_finalize is called. To avoid this issue, remove all non-pod global variables from the file plugin. 
Change-Id: I4b620d67d460d9c99adfd81cbf46b0e64540c503 commit bddb9850de2d82e514ced4e83d0ff1a0bef354c2 Author: Laurent Morichetti Date: Fri Aug 19 10:31:16 2022 -0700 Remove roctracer_mark This function has been deprecated since ROCm-2.9, use ROCTX's roctxMark(const char* message) as a replacement for roctracer_mark. Change-Id: Ie4aeae1db238453fc4451746cc9a338032ba817f commit 4cd7497a87c3369171aa98b89ba2f872e0710e0d Author: Ammar ELWazir Date: Thu Aug 18 21:46:58 2022 -0500 Fixing issues caused by the plugin patches - Multithreaded Applications and plugin destruction - Fixing Async-copy trace in file plugin - Adding the assert checkups for every trace buffer flush function Change-Id: I96e096fd7ee2604931200a0b446edb5ce49959dd commit 753d543022459d6ba10cca4d90b16d88ea6e230c Author: Laurent Morichetti Date: Wed Aug 17 23:19:49 2022 -0700 Use std::dec to print the begin_timestamp Change-Id: I88377b840b2e2cce278575bc398cbdc296e6dfd7 commit 80d363a4bca0b27e7eee09abf2a28f0a22a214b9 Author: Laurent Morichetti Date: Wed Aug 17 14:25:41 2022 -0700 New util library - Add string_printf/string_vprintf. - Add warning and error with backtrace support. Change-Id: I3dd73b4caed0d767bd9e39ffef15ff8484d0b0bf commit 993dcf95038c410e070fd0bdd7ea2e07435a88d5 Author: Laurent Morichetti Date: Wed Jul 13 14:02:38 2022 -0700 Fix tput Don't set the color variables if tput is not available, not working, or if ncolors < 8. Move the color variables outside of eval to avoid calling tput over and over again. 
Change-Id: Id51a742b77ad0f7c99c1c7c5d05bed0f423b75de commit b7e1f7405456c069f0a09b6117f1779820df7993 Author: Ammar ELWazir Date: Thu Jun 23 01:50:07 2022 -0500 Adding File Plugin - Added File plugin as the default plugin - Moved the flush functions to the plugins - Improved the flush to file implementation Change-Id: I80dd448eb8147a8ea4aa63b39bd1d0a4baf7252b commit 1c7c5cc112d48284d6bd5c7984925c6a5217e19e Author: Ammar ELWazir Date: Mon Aug 8 19:20:32 2022 -0500 Adding Plugin Interface - Add roctracer plugins hooks - Add Roctracer plugin environment variable - Add the plugin class - Add the plugin implementation Change-Id: I12ee2e2be035abac14864764fb76837a4533cf60 commit 591db0b7187b7fb0e4fe7bb388616df1f7b92632 Author: Ammar ELWazir Date: Mon Aug 8 20:45:34 2022 -0500 Changing NULL to nullptr (Tracer Tool) Change-Id: I567bf7944599922e5d402e55142c2915ae24fb69 Change-Id: I24f448b3510d3fa2451103621b822421c11e5921 --- CMakeLists.txt | 5 +- doc/Doxyfile.in | 2 +- inc/.clang-format | 4 + inc/ext/prof_protocol.h | 49 +- inc/roctracer.h | 190 +-- inc/roctracer_ext.h | 14 +- inc/roctracer_hcc.h | 5 +- inc/roctracer_hip.h | 46 +- inc/roctracer_hsa.h | 84 +- inc/roctracer_plugin.h | 137 ++ inc/roctracer_roctx.h | 2 +- inc/roctx.h | 3 +- plugin/CMakeLists.txt | 23 + plugin/exportmap | 7 + plugin/file/CMakeLists.txt | 42 + plugin/file/file.cpp | 369 +++++ script/gen_ostream_ops.py | 32 +- script/hsaap.py | 58 +- src/CMakeLists.txt | 29 +- src/roctracer/backward_compat.cpp | 53 +- src/roctracer/correlation_id.cpp | 99 ++ src/roctracer/correlation_id.h | 50 + src/roctracer/exception.h | 1 + src/roctracer/hsa_support.cpp | 640 +++++++++ src/roctracer/hsa_support.h | 54 + src/roctracer/journal.h | 65 - src/roctracer/loader.h | 386 ++--- src/roctracer/memory_pool.h | 7 +- src/roctracer/registration_table.h | 100 ++ src/roctracer/roctracer.cpp | 1271 ++++++----------- src/roctracer/tracker.h | 155 -- src/roctx/exportmap | 5 +- src/roctx/roctx.cpp | 80 +- 
src/tracer_tool/tracer_tool.cpp | 736 ++++------ src/util/callback_table.h | 70 - src/util/debug.cpp | 125 ++ src/util/debug.h | 47 + src/util/logger.h | 95 +- src/util/util.cpp | 51 + src/util/util.h | 36 + test/CMakeLists.txt | 26 +- test/app/codeobj_test.cpp | 76 +- test/app/hsaco_test.cpp | 127 -- test/directed/activity_and_callback.cpp | 139 ++ test/directed/dlopen.cpp | 94 ++ test/directed/multi_pool_activities.cpp | 94 ++ .../MatrixTranspose_hip_flush_trace.txt | 200 --- .../MatrixTranspose_hip_input_trace.txt | 200 --- .../MatrixTranspose_hip_period_trace.txt | 200 --- .../MatrixTranspose_sys_hsa_trace.txt | 200 --- .../MatrixTranspose_sys_trace.txt | 200 --- .../activity_and_callback_trace.txt | 65 + .../multi_pool_activities_trace.txt | 30 + test/golden_traces/tests_trace_cmp_levels.txt | 6 +- test/hip/MatrixTranspose.cpp | 5 - test/run.sh | 31 +- 56 files changed, 3490 insertions(+), 3430 deletions(-) create mode 100644 inc/.clang-format create mode 100644 inc/roctracer_plugin.h create mode 100644 plugin/CMakeLists.txt create mode 100644 plugin/exportmap create mode 100644 plugin/file/CMakeLists.txt create mode 100644 plugin/file/file.cpp create mode 100644 src/roctracer/correlation_id.cpp create mode 100644 src/roctracer/correlation_id.h create mode 100644 src/roctracer/hsa_support.cpp create mode 100644 src/roctracer/hsa_support.h delete mode 100644 src/roctracer/journal.h create mode 100644 src/roctracer/registration_table.h delete mode 100644 src/roctracer/tracker.h delete mode 100644 src/util/callback_table.h create mode 100644 src/util/debug.cpp create mode 100644 src/util/debug.h create mode 100644 src/util/util.cpp create mode 100644 src/util/util.h delete mode 100644 test/app/hsaco_test.cpp create mode 100644 test/directed/activity_and_callback.cpp create mode 100644 test/directed/dlopen.cpp create mode 100644 test/directed/multi_pool_activities.cpp create mode 100644 test/golden_traces/activity_and_callback_trace.txt create mode 100644 
test/golden_traces/multi_pool_activities_trace.txt diff --git a/CMakeLists.txt b/CMakeLists.txt index fd27dffa..89125632 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -58,6 +58,9 @@ if(${LIBRARY_TYPE} STREQUAL SHARED) add_subdirectory(test) endif() +## Build Plugins +add_subdirectory(plugin) + if(${LIBRARY_TYPE} STREQUAL SHARED) ## Installation and packaging @@ -169,7 +172,7 @@ if(DOXYGEN_FOUND) COMMAND ${DOXYGEN_EXECUTABLE} ${DOXYGEN_OUT} COMMAND make -C ${CMAKE_CURRENT_BINARY_DIR}/doc/latex pdf MAIN_DEPENDENCY ${DOXYGEN_OUT} ${DOXYGEN_IN} - DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/inc/roctracer.h + DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/inc/roctracer.h ${CMAKE_CURRENT_SOURCE_DIR}/inc/roctracer_plugin.h COMMENT "Generating documentation") add_custom_target(doc DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/doc/html/index.html diff --git a/doc/Doxyfile.in b/doc/Doxyfile.in index 988e6134..08f3cfa2 100644 --- a/doc/Doxyfile.in +++ b/doc/Doxyfile.in @@ -791,7 +791,7 @@ WARN_LOGFILE = # spaces. See also FILE_PATTERNS and EXTENSION_MAPPING # Note: If this tag is empty the current directory is searched. -INPUT = @CMAKE_CURRENT_SOURCE_DIR@/inc/roctracer.h +INPUT = @CMAKE_CURRENT_SOURCE_DIR@/inc/roctracer.h @CMAKE_CURRENT_SOURCE_DIR@/inc/roctracer_plugin.h # This tag can be used to specify the character encoding of the source files # that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses diff --git a/inc/.clang-format b/inc/.clang-format new file mode 100644 index 00000000..e128ee2f --- /dev/null +++ b/inc/.clang-format @@ -0,0 +1,4 @@ +--- +BasedOnStyle: InheritParentConfig +ColumnLimit: 79 +... 
diff --git a/inc/ext/prof_protocol.h b/inc/ext/prof_protocol.h index 1ba622fe..69a2b0b3 100644 --- a/inc/ext/prof_protocol.h +++ b/inc/ext/prof_protocol.h @@ -21,34 +21,37 @@ #ifndef EXT_PROF_PROTOCOL_H_ #define EXT_PROF_PROTOCOL_H_ +#include #include /* Traced API domains */ typedef enum { - ACTIVITY_DOMAIN_HSA_API = 0, /* HSA API domain */ - ACTIVITY_DOMAIN_HSA_OPS = 1, /* HSA async activity domain */ - ACTIVITY_DOMAIN_HIP_OPS = 2, /* HIP async activity domain */ - ACTIVITY_DOMAIN_HCC_OPS = ACTIVITY_DOMAIN_HIP_OPS, /* HCC async activity domain */ - ACTIVITY_DOMAIN_HIP_VDI = ACTIVITY_DOMAIN_HIP_OPS, /* HIP VDI async activity domain */ - ACTIVITY_DOMAIN_HIP_API = 3, /* HIP API domain */ - ACTIVITY_DOMAIN_KFD_API = 4, /* KFD API domain */ - ACTIVITY_DOMAIN_EXT_API = 5, /* External ID domain */ - ACTIVITY_DOMAIN_ROCTX = 6, /* ROCTX domain */ - ACTIVITY_DOMAIN_HSA_EVT = 7, /* HSA events */ + ACTIVITY_DOMAIN_HSA_API = 0, /* HSA API domain */ + ACTIVITY_DOMAIN_HSA_OPS = 1, /* HSA async activity domain */ + ACTIVITY_DOMAIN_HIP_OPS = 2, /* HIP async activity domain */ + ACTIVITY_DOMAIN_HCC_OPS = + ACTIVITY_DOMAIN_HIP_OPS, /* HCC async activity domain */ + ACTIVITY_DOMAIN_HIP_VDI = + ACTIVITY_DOMAIN_HIP_OPS, /* HIP VDI async activity domain */ + ACTIVITY_DOMAIN_HIP_API = 3, /* HIP API domain */ + ACTIVITY_DOMAIN_KFD_API = 4, /* KFD API domain */ + ACTIVITY_DOMAIN_EXT_API = 5, /* External ID domain */ + ACTIVITY_DOMAIN_ROCTX = 6, /* ROCTX domain */ + ACTIVITY_DOMAIN_HSA_EVT = 7, /* HSA events */ ACTIVITY_DOMAIN_NUMBER } activity_domain_t; -/* Extension API opcodes */ -typedef enum { ACTIVITY_EXT_OP_MARK = 0, ACTIVITY_EXT_OP_EXTERN_ID = 1 } activity_ext_op_t; - /* API callback type */ -typedef void (*activity_rtapi_callback_t)(uint32_t domain, uint32_t cid, const void* data, - void* arg); +typedef void (*activity_rtapi_callback_t)(uint32_t domain, uint32_t cid, + const void* data, void* arg); typedef uint32_t activity_kind_t; typedef uint32_t activity_op_t; /* API callback 
phase */ -typedef enum { ACTIVITY_API_PHASE_ENTER = 0, ACTIVITY_API_PHASE_EXIT = 1 } activity_api_phase_t; +typedef enum { + ACTIVITY_API_PHASE_ENTER = 0, + ACTIVITY_API_PHASE_EXIT = 1 +} activity_api_phase_t; /* Trace record types */ @@ -66,8 +69,8 @@ typedef struct activity_record_s { union { struct { activity_correlation_id_t correlation_id; /* activity ID */ - roctracer_timestamp_t begin_ns; /* host begin timestamp */ - roctracer_timestamp_t end_ns; /* host end timestamp */ + roctracer_timestamp_t begin_ns; /* host begin timestamp */ + roctracer_timestamp_t end_ns; /* host end timestamp */ }; struct { uint32_t se; /* sampled SE */ @@ -89,16 +92,16 @@ typedef struct activity_record_s { }; }; union { - size_t bytes; /* data size bytes */ - const char* kernel_name; + size_t bytes; /* data size bytes */ + const char* kernel_name; /* kernel name */ + const char* mark_message; }; } activity_record_t; /* Activity sync callback type */ -typedef void* (*activity_sync_callback_t)(uint32_t cid, activity_record_t* record, const void* data, - void* arg); +typedef void (*activity_sync_callback_t)(uint32_t cid, activity_record_t* record, const void* data, + void* arg); /* Activity async callback type */ -typedef void (*activity_id_callback_t)(activity_correlation_id_t id); typedef void (*activity_async_callback_t)(uint32_t op, void* record, void* arg); #endif /* EXT_PROF_PROTOCOL_H_ */ diff --git a/inc/roctracer.h b/inc/roctracer.h index 1ac7dbf2..cb9b13fa 100644 --- a/inc/roctracer.h +++ b/inc/roctracer.h @@ -27,8 +27,16 @@ * The goal of the implementation is to provide a generic independent from * specific runtime profiler to trace API and asynchronous activity. * - * The API provides functionality for registering the runtimes API callbacks and - * asynchronous activity records pool support. + * The API provides functionality for registering the runtimes API callbacks + * and asynchronous activity records pool support. 
+ * + * \section known_limitations Known Limitations and Restrictions + * + * The ROCtracer API library implementation currently has the following + * restrictions. Future releases aim to address these restrictions. + * + * 1. The ACTIVITY_DOMAIN_HSA_OPS operations HSA_OP_ID_DISPATCH, + * HSA_OP_ID_BARRIER, and HSA_OP_ID_RESERVED1 are not currently implemented. */ /** @@ -71,10 +79,10 @@ #endif /* !defined (ROCTRACER_EXPORTS) */ #endif /* !defined (ROCTRACER) */ -#include #include +#include -#include +#include "ext/prof_protocol.h" #ifdef __cplusplus extern "C" { @@ -191,6 +199,12 @@ typedef enum { * External correlation ID pop mismatch. */ ROCTRACER_STATUS_ERROR_MISMATCHED_EXTERNAL_CORRELATION_ID = -7, + /** + * The operation is not currently implemented. This error may be reported by + * any function. Check the \ref known_limitations section to determine the + * status of the library implementation of the interface. + */ + ROCTRACER_STATUS_ERROR_NOT_IMPLEMENTED = -8, /** * Deprecated error code. */ @@ -270,8 +284,8 @@ typedef activity_domain_t roctracer_domain_t; * the domain or operation are invalid. The string is owned by the ROC Tracer * library. */ -ROCTRACER_API const char* roctracer_op_string(uint32_t domain, uint32_t op, - uint32_t kind) ROCTRACER_VERSION_4_1; +ROCTRACER_API const char* roctracer_op_string( + uint32_t domain, uint32_t op, uint32_t kind) ROCTRACER_VERSION_4_1; /** * Query the operation code given a domain and the name of an operation. @@ -293,8 +307,9 @@ ROCTRACER_API const char* roctracer_op_string(uint32_t domain, uint32_t op, * @retval ::ROCTRACER_STATUS_ERROR_INVALID_DOMAIN_ID The domain is invalid or * not supported. */ -ROCTRACER_API roctracer_status_t roctracer_op_code(uint32_t domain, const char* str, uint32_t* op, - uint32_t* kind) ROCTRACER_VERSION_4_1; +ROCTRACER_API roctracer_status_t +roctracer_op_code(uint32_t domain, const char* str, uint32_t* op, + uint32_t* kind) ROCTRACER_VERSION_4_1; /** * Set the properties of a domain. 
@@ -308,8 +323,8 @@ ROCTRACER_API roctracer_status_t roctracer_op_code(uint32_t domain, const char* * @retval ::ROCTRACER_STATUS_SUCCESS The function has been executed * successfully. */ -ROCTRACER_API roctracer_status_t roctracer_set_properties(roctracer_domain_t domain, - void* properties) ROCTRACER_VERSION_4_1; +ROCTRACER_API roctracer_status_t roctracer_set_properties( + roctracer_domain_t domain, void* properties) ROCTRACER_VERSION_4_1; /** @} */ @@ -350,9 +365,9 @@ typedef activity_rtapi_callback_t roctracer_rtapi_callback_t; * @retval ::ROCTRACER_STATUS_ERROR_INVALID_ARGUMENT \p op is invalid for \p * domain. */ -ROCTRACER_API roctracer_status_t roctracer_enable_op_callback(activity_domain_t domain, uint32_t op, - activity_rtapi_callback_t callback, - void* arg) ROCTRACER_VERSION_4_1; +ROCTRACER_API roctracer_status_t roctracer_enable_op_callback( + activity_domain_t domain, uint32_t op, activity_rtapi_callback_t callback, + void* arg) ROCTRACER_VERSION_4_1; /** * Enable runtime API callback for all operations of a domain. @@ -370,21 +385,8 @@ ROCTRACER_API roctracer_status_t roctracer_enable_op_callback(activity_domain_t * @retval ::ROCTRACER_STATUS_ERROR_INVALID_DOMAIN_ID \p domain is invalid. */ ROCTRACER_API roctracer_status_t roctracer_enable_domain_callback( - activity_domain_t domain, activity_rtapi_callback_t callback, void* arg) ROCTRACER_VERSION_4_1; - -/** - * Enable runtime API callback for all operations of all domains. - * - * @param callback The callback to invoke each time the operation is performed - * on entry and exit. - * - * @param arg Value to pass as last argument of \p callback. - * - * @retval ::ROCTRACER_STATUS_SUCCESS The function has been executed - * successfully. 
- */ -ROCTRACER_API roctracer_status_t roctracer_enable_callback(activity_rtapi_callback_t callback, - void* arg) ROCTRACER_VERSION_4_1; + activity_domain_t domain, activity_rtapi_callback_t callback, + void* arg) ROCTRACER_VERSION_4_1; /** * Disable runtime API callback for a specific operation of a domain. @@ -401,8 +403,8 @@ ROCTRACER_API roctracer_status_t roctracer_enable_callback(activity_rtapi_callba * @retval ::ROCTRACER_STATUS_ERROR_INVALID_ARGUMENT \p op is invalid for \p * domain. */ -ROCTRACER_API roctracer_status_t roctracer_disable_op_callback(activity_domain_t domain, - uint32_t op) ROCTRACER_VERSION_4_1; +ROCTRACER_API roctracer_status_t roctracer_disable_op_callback( + activity_domain_t domain, uint32_t op) ROCTRACER_VERSION_4_1; /** * Disable runtime API callback for all operations of a domain. @@ -414,25 +416,17 @@ ROCTRACER_API roctracer_status_t roctracer_disable_op_callback(activity_domain_t * * @retval ::ROCTRACER_STATUS_ERROR_INVALID_DOMAIN_ID \p domain is invalid. */ -ROCTRACER_API roctracer_status_t roctracer_disable_domain_callback(activity_domain_t domain) - ROCTRACER_VERSION_4_1; - -/** - * Disable runtime API callback for all operations of all domains. - * - * @retval ::ROCTRACER_STATUS_SUCCESS The function has been executed - * successfully. - */ -ROCTRACER_API roctracer_status_t roctracer_disable_callback() ROCTRACER_VERSION_4_1; +ROCTRACER_API roctracer_status_t roctracer_disable_domain_callback( + activity_domain_t domain) ROCTRACER_VERSION_4_1; /** @} */ /** \defgroup activity_api_group Activity API * - * The activity records are asynchronously logged to the pool and can be associated - * with the respective API callbacks using the correlation ID. Activity API can - * be used to enable collecting of the records with timestamping data for API - * calls and the kernel submits. + * The activity records are asynchronously logged to the pool and can be + * associated with the respective API callbacks using the correlation ID. 
+ * Activity API can be used to enable collecting of the records with + * timestamping data for API calls and the kernel submits. * * @{ */ @@ -458,8 +452,9 @@ typedef activity_record_t roctracer_record_t; * @retval ::ROCTRACER_STATUS_SUCCESS The function has been executed * successfully. */ -ROCTRACER_API roctracer_status_t roctracer_next_record( - const activity_record_t* record, const activity_record_t** next) ROCTRACER_VERSION_4_1; +ROCTRACER_API roctracer_status_t +roctracer_next_record(const activity_record_t* record, + const activity_record_t** next) ROCTRACER_VERSION_4_1; /** * Memory pool allocator callback. @@ -494,7 +489,8 @@ typedef void (*roctracer_allocator_t)(char** ptr, size_t size, void* arg); * * \p arg the argument specified when the callback was defined. */ -typedef void (*roctracer_buffer_callback_t)(const char* begin, const char* end, void* arg); +typedef void (*roctracer_buffer_callback_t)(const char* begin, const char* end, + void* arg); /** * Memory pool properties. @@ -561,8 +557,9 @@ typedef void roctracer_pool_t; * @retval ROCTRACER_STATUS_ERROR_MEMORY_ALLOCATION Unable to allocate memory * for the \p pool. Unable to create the pool. */ -ROCTRACER_API roctracer_status_t roctracer_open_pool_expl( - const roctracer_properties_t* properties, roctracer_pool_t** pool) ROCTRACER_VERSION_4_1; +ROCTRACER_API roctracer_status_t +roctracer_open_pool_expl(const roctracer_properties_t* properties, + roctracer_pool_t** pool) ROCTRACER_VERSION_4_1; /** * Create tracer memory pool. @@ -581,8 +578,8 @@ ROCTRACER_API roctracer_status_t roctracer_open_pool_expl( * @retval ROCTRACER_STATUS_ERROR_MEMORY_ALLOCATION Unable to allocate memory * for the \p pool. Unable to create the pool. */ -ROCTRACER_API roctracer_status_t roctracer_open_pool(const roctracer_properties_t* properties) - ROCTRACER_VERSION_4_1; +ROCTRACER_API roctracer_status_t roctracer_open_pool( + const roctracer_properties_t* properties) ROCTRACER_VERSION_4_1; /** * Close tracer memory pool. 
@@ -597,8 +594,8 @@ ROCTRACER_API roctracer_status_t roctracer_open_pool(const roctracer_properties_ * @retval ::ROCTRACER_STATUS_SUCCESS The function has been executed * successfully or pool was NULL and there is no default pool. */ -ROCTRACER_API roctracer_status_t roctracer_close_pool_expl(roctracer_pool_t* pool) - ROCTRACER_VERSION_4_1; +ROCTRACER_API roctracer_status_t +roctracer_close_pool_expl(roctracer_pool_t* pool) ROCTRACER_VERSION_4_1; /** * Close default tracer memory pool, if defined, and set to undefined. @@ -621,8 +618,8 @@ ROCTRACER_API roctracer_status_t roctracer_close_pool() ROCTRACER_VERSION_4_1; * @return Return the current default memory pool before any change, or NULL if * none is defined. */ -ROCTRACER_API roctracer_pool_t* roctracer_default_pool_expl(roctracer_pool_t* pool) - ROCTRACER_VERSION_4_1; +ROCTRACER_API roctracer_pool_t* roctracer_default_pool_expl( + roctracer_pool_t* pool) ROCTRACER_VERSION_4_1; /** * Query the current default memory pool. @@ -639,16 +636,18 @@ ROCTRACER_API roctracer_pool_t* roctracer_default_pool() ROCTRACER_VERSION_4_1; * * @param[in] op The activity operation ID in \p domain. * - * @param[in] pool The memory pool to write the activity record. If NULL, use the - * default memory pool. + * @param[in] pool The memory pool to write the activity record. If NULL, use + * the default memory pool. * * @retval ::ROCTRACER_STATUS_SUCCESS The function has been executed * successfully. * - * @retval ROCTRACER_STATUS_ERROR \p pool is NULL and no default pool is defined. + * @retval ROCTRACER_STATUS_ERROR \p pool is NULL and no default pool is + * defined. 
*/ ROCTRACER_API roctracer_status_t roctracer_enable_op_activity_expl( - activity_domain_t domain, uint32_t op, roctracer_pool_t* pool) ROCTRACER_VERSION_4_1; + activity_domain_t domain, uint32_t op, + roctracer_pool_t* pool) ROCTRACER_VERSION_4_1; /** * Enable activity record logging for a specified operation of a domain using @@ -663,8 +662,8 @@ ROCTRACER_API roctracer_status_t roctracer_enable_op_activity_expl( * * @retval ROCTRACER_STATUS_ERROR No default pool is defined. */ -ROCTRACER_API roctracer_status_t roctracer_enable_op_activity(activity_domain_t domain, - uint32_t op) ROCTRACER_VERSION_4_1; +ROCTRACER_API roctracer_status_t roctracer_enable_op_activity( + activity_domain_t domain, uint32_t op) ROCTRACER_VERSION_4_1; /** * Enable activity record logging for all operations of a domain providing a @@ -672,13 +671,14 @@ ROCTRACER_API roctracer_status_t roctracer_enable_op_activity(activity_domain_t * * @param[in] domain The domain. * - * @param[in] pool The memory pool to write the activity record. If NULL, use the - * default memory pool. + * @param[in] pool The memory pool to write the activity record. If NULL, use + * the default memory pool. * * @retval ::ROCTRACER_STATUS_SUCCESS The function has been executed * successfully. * - * @retval ROCTRACER_STATUS_ERROR \p pool is NULL and no default pool is defined. + * @retval ROCTRACER_STATUS_ERROR \p pool is NULL and no default pool is + * defined. */ ROCTRACER_API roctracer_status_t roctracer_enable_domain_activity_expl( activity_domain_t domain, roctracer_pool_t* pool) ROCTRACER_VERSION_4_1; @@ -694,34 +694,8 @@ ROCTRACER_API roctracer_status_t roctracer_enable_domain_activity_expl( * * @retval ROCTRACER_STATUS_ERROR No default pool is defined. */ -ROCTRACER_API roctracer_status_t roctracer_enable_domain_activity(activity_domain_t domain) - ROCTRACER_VERSION_4_1; - -/** - * Enable activity record logging for all operations of all domains providing a - * memory pool. 
- * - * @param[in] pool The memory pool to write the activity record. If NULL, use the - * default memory pool. - * - * @retval ::ROCTRACER_STATUS_SUCCESS The function has been executed - * successfully. - * - * @retval ROCTRACER_STATUS_ERROR \p pool is NULL and no default pool is defined. - */ -ROCTRACER_API roctracer_status_t roctracer_enable_activity_expl(roctracer_pool_t* pool) - ROCTRACER_VERSION_4_1; - -/** - * Enable activity record logging for all operations of all domains using the - * default memory pool. - * - * @retval ::ROCTRACER_STATUS_SUCCESS The function has been executed - * successfully. - * - * @retval ROCTRACER_STATUS_ERROR No default pool is defined. - */ -ROCTRACER_API roctracer_status_t roctracer_enable_activity() ROCTRACER_VERSION_4_1; +ROCTRACER_API roctracer_status_t roctracer_enable_domain_activity( + activity_domain_t domain) ROCTRACER_VERSION_4_1; /** * Disable activity record logging for a specified operation of a domain. @@ -733,8 +707,8 @@ ROCTRACER_API roctracer_status_t roctracer_enable_activity() ROCTRACER_VERSION_4 * @retval ::ROCTRACER_STATUS_SUCCESS The function has been executed * successfully. */ -ROCTRACER_API roctracer_status_t roctracer_disable_op_activity(activity_domain_t domain, - uint32_t op) ROCTRACER_VERSION_4_1; +ROCTRACER_API roctracer_status_t roctracer_disable_op_activity( + activity_domain_t domain, uint32_t op) ROCTRACER_VERSION_4_1; /** * Disable activity record logging for all operations of a domain. @@ -744,18 +718,8 @@ ROCTRACER_API roctracer_status_t roctracer_disable_op_activity(activity_domain_t * @retval ::ROCTRACER_STATUS_SUCCESS The function has been executed * successfully. */ -ROCTRACER_API roctracer_status_t roctracer_disable_domain_activity(activity_domain_t domain) - ROCTRACER_VERSION_4_1; - -/** - * Disable activity record logging for all operations of all domains. - * - * @param[in] op The activity operation ID in \p domain. 
- * - * @retval ::ROCTRACER_STATUS_SUCCESS The function has been executed - * successfully. - */ -ROCTRACER_API roctracer_status_t roctracer_disable_activity() ROCTRACER_VERSION_4_1; +ROCTRACER_API roctracer_status_t roctracer_disable_domain_activity( + activity_domain_t domain) ROCTRACER_VERSION_4_1; /** * Flush available activity records for a memory pool. @@ -764,14 +728,14 @@ ROCTRACER_API roctracer_status_t roctracer_disable_activity() ROCTRACER_VERSION_ * stops. Use a subsequent flush when the record has completed being written to * resume the flush. * - * @param[in] pool The memory pool to flush. If NULL, flushes the default memory - * pool. + * @param[in] pool The memory pool to flush. If NULL, flushes the default + * memory pool. * * @retval ::ROCTRACER_STATUS_SUCCESS The function has been executed * successfully. */ -ROCTRACER_API roctracer_status_t roctracer_flush_activity_expl(roctracer_pool_t* pool) - ROCTRACER_VERSION_4_1; +ROCTRACER_API roctracer_status_t +roctracer_flush_activity_expl(roctracer_pool_t* pool) ROCTRACER_VERSION_4_1; /** * Flush available activity records for the default memory pool. @@ -783,7 +747,8 @@ ROCTRACER_API roctracer_status_t roctracer_flush_activity_expl(roctracer_pool_t* * @retval ::ROCTRACER_STATUS_SUCCESS The function has been executed * successfully. */ -ROCTRACER_API roctracer_status_t roctracer_flush_activity() ROCTRACER_VERSION_4_1; +ROCTRACER_API roctracer_status_t roctracer_flush_activity() + ROCTRACER_VERSION_4_1; /** @} */ @@ -802,7 +767,8 @@ ROCTRACER_API roctracer_status_t roctracer_flush_activity() ROCTRACER_VERSION_4_ * @retval ::ROCTRACER_STATUS_SUCCESS The function has been executed * successfully. 
*/ -ROCTRACER_API roctracer_status_t roctracer_get_timestamp(roctracer_timestamp_t* timestamp) ROCTRACER_VERSION_4_1; +ROCTRACER_API roctracer_status_t roctracer_get_timestamp( + roctracer_timestamp_t* timestamp) ROCTRACER_VERSION_4_1; /** @} */ diff --git a/inc/roctracer_ext.h b/inc/roctracer_ext.h index 30c30136..9fc4f6a6 100644 --- a/inc/roctracer_ext.h +++ b/inc/roctracer_ext.h @@ -30,7 +30,13 @@ #ifndef ROCTRACER_EXT_H_ #define ROCTRACER_EXT_H_ -#include +#include "roctracer.h" + +/* Extension API opcodes */ +typedef enum { + ACTIVITY_EXT_OP_MARK = 0, + ACTIVITY_EXT_OP_EXTERN_ID = 1 +} activity_ext_op_t; typedef void (*roctracer_start_cb_t)(); typedef void (*roctracer_stop_cb_t)(); @@ -58,12 +64,14 @@ void ROCTRACER_API roctracer_stop() ROCTRACER_VERSION_4_1; // Notifies that the calling thread is entering an external API region. // Push an external correlation id for the calling thread. roctracer_status_t ROCTRACER_API -roctracer_activity_push_external_correlation_id(activity_correlation_id_t id) ROCTRACER_VERSION_4_1; +roctracer_activity_push_external_correlation_id(activity_correlation_id_t id) + ROCTRACER_VERSION_4_1; // Notifies that the calling thread is leaving an external API region. // Pop an external correlation id for the calling thread. // 'lastId' returns the last external correlation if not NULL -roctracer_status_t ROCTRACER_API roctracer_activity_pop_external_correlation_id( +roctracer_status_t ROCTRACER_API +roctracer_activity_pop_external_correlation_id( activity_correlation_id_t* last_id) ROCTRACER_VERSION_4_1; #ifdef __cplusplus diff --git a/inc/roctracer_hcc.h b/inc/roctracer_hcc.h index 5da6dab0..969282b7 100644 --- a/inc/roctracer_hcc.h +++ b/inc/roctracer_hcc.h @@ -18,6 +18,7 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -#pragma message("This file has been deprecated and marked for removal. 
Please use roctracer_hip.h instead.") +#pragma message( \ + "This file has been deprecated and marked for removal. Please use roctracer_hip.h instead.") -#include \ No newline at end of file +#include "roctracer_hip.h" \ No newline at end of file diff --git a/inc/roctracer_hip.h b/inc/roctracer_hip.h index 427e01e8..5bfab84b 100644 --- a/inc/roctracer_hip.h +++ b/inc/roctracer_hip.h @@ -18,44 +18,20 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -#ifndef INC_ROCTRACER_HIP_H_ -#define INC_ROCTRACER_HIP_H_ +#ifndef ROCTRACER_HIP_H_ +#define ROCTRACER_HIP_H_ -#ifdef __cplusplus -#include - -inline static std::ostream& operator<<(std::ostream& out, const unsigned char& v) { - out << (unsigned int)v; - return out; -} - -inline static std::ostream& operator<<(std::ostream& out, const char& v) { - out << (unsigned char)v; - return out; -} -#endif // __cplusplus +#include "roctracer.h" #include -#include +#include "hip_ostream_ops.h" #include -#include - -enum { HIP_OP_ID_DISPATCH = 0, HIP_OP_ID_COPY = 1, HIP_OP_ID_BARRIER = 2, HIP_OP_ID_NUMBER = 3 }; - -#ifdef __cplusplus -extern "C" { -#endif // __cplusplus - -// Traced calls ID enumeration -typedef enum hip_api_id_t roctracer_hip_api_cid_t; - -typedef void(hipInitAsyncActivityCallback_t)(void* id_callback, void* op_callback, void* arg); -typedef bool(hipEnableAsyncActivityCallback_t)(unsigned op, bool enable); -typedef const char*(hipGetOpName_t)(unsigned op); - -#ifdef __cplusplus -} // extern "C" block -#endif // __cplusplus +typedef enum { + HIP_OP_ID_DISPATCH = 0, + HIP_OP_ID_COPY = 1, + HIP_OP_ID_BARRIER = 2, + HIP_OP_ID_NUMBER = 3 +} hip_op_id_t; -#endif // INC_ROCTRACER_HIP_H_ +#endif // ROCTRACER_HIP_H_ diff --git a/inc/roctracer_hsa.h b/inc/roctracer_hsa.h index 777d4d8b..4af35001 100644 --- a/inc/roctracer_hsa.h +++ b/inc/roctracer_hsa.h @@ -18,16 +18,15 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ -#ifndef INC_ROCTRACER_HSA_H_ -#define INC_ROCTRACER_HSA_H_ +#ifndef ROCTRACER_HSA_H_ +#define ROCTRACER_HSA_H_ -#include +#include "roctracer.h" #include #include -#include - -#include +#include "hsa_ostream_ops.h" +#include "hsa_prof_str.h" // HSA OP ID enumeration enum hsa_op_id_t { @@ -38,9 +37,76 @@ enum hsa_op_id_t { HSA_OP_ID_NUMBER }; +// HSA EVT ID enumeration +enum hsa_evt_id_t { + HSA_EVT_ID_ALLOCATE = 0, // Memory allocate callback + HSA_EVT_ID_DEVICE = 1, // Device assign callback + HSA_EVT_ID_MEMCOPY = 2, // Memcopy callback + HSA_EVT_ID_SUBMIT = 3, // Packet submission callback + HSA_EVT_ID_KSYMBOL = 4, // Loading/unloading of kernel symbol + HSA_EVT_ID_CODEOBJ = 5, // Loading/unloading of device code object + HSA_EVT_ID_NUMBER +}; + struct hsa_ops_properties_t { - void* table; - void* reserved1[3]; + void* reserved1[4]; }; -#endif // INC_ROCTRACER_HSA_H_ +// HSA EVT data type +typedef struct { + union { + struct { + const void* ptr; // allocated area ptr + size_t size; // allocated area size, zero size means 'free' callback + hsa_amd_segment_t segment; // allocated area's memory segment type + hsa_amd_memory_pool_global_flag_t + global_flag; // allocated area's memory global flag + int is_code; // equal to 1 if code is allocated + } allocate; + + struct { + hsa_device_type_t type; // type of assigned device + uint32_t id; // id of assigned device + hsa_agent_t agent; // device HSA agent handle + const void* ptr; // ptr the device is assigned to + } device; + + struct { + const void* dst; // memcopy dst ptr + const void* src; // memcopy src ptr + size_t size; // memcopy size bytes + } memcopy; + + struct { + const void* packet; // submitted to GPU packet + const char* + kernel_name; // kernel name, NULL if not a kernel dispatch packet + hsa_queue_t* queue; // HSA queue the packet was submitted to + uint32_t device_type; // type of device the packet is submitted to + uint32_t device_id; // id of device the packet is submitted to + } submit; + + 
struct { + uint64_t object; // kernel symbol object + const char* name; // kernel symbol name + uint32_t name_length; // kernel symbol name length + int unload; // symbol executable destroy + } ksymbol; + + struct { + uint32_t storage_type; // code object storage type + int storage_file; // origin file descriptor + uint64_t memory_base; // origin memory base + uint64_t memory_size; // origin memory size + uint64_t load_base; // code object load base + uint64_t load_size; // code object load size + uint64_t load_delta; // code object load delta + uint32_t uri_length; // URI string length (not including the terminating + // NUL character) + const char* uri; // URI string + int unload; // unload flag + } codeobj; + }; +} hsa_evt_data_t; + +#endif // ROCTRACER_HSA_H_ diff --git a/inc/roctracer_plugin.h b/inc/roctracer_plugin.h new file mode 100644 index 00000000..da8a6d72 --- /dev/null +++ b/inc/roctracer_plugin.h @@ -0,0 +1,137 @@ +/* Copyright (c) 2022 Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. */ + +/** \section roctracer_plugin_api ROCtracer Plugin API + * + * The ROCtracer Plugin API is used by the ROCtracer Tool to output all tracing + * information. Different implementations of the ROCtracer Plugin API can be + * developed that output the tracing data in different formats. + * The ROCtracer Tool can be configured to load a specific library that + * supports the user's desired format. + * + * The API is not thread safe. It is the responsibility of the ROCtracer Tool + * to ensure the operations are synchronized and not called concurrently. There + * is no requirement for the ROCtracer Tool to report trace data in any + * specific order. If the format supported by the plugin requires specific + * ordering, it is the responsibility of the plugin implementation to perform + * any necessary sorting. + */ + +/** + * \file + * ROCtracer Tool Plugin API interface. + */ + +#ifndef ROCTRACER_PLUGIN_H_ +#define ROCTRACER_PLUGIN_H_ + +#include "roctracer.h" + +#include + +#ifdef __cplusplus +extern "C" { +#endif /* __cplusplus */ + +/** \defgroup initialization_group Initialization and Finalization + * + * The ROCtracer Plugin API must be initialized before using any of the + * operations to report trace data, and finalized after the last trace data has + * been reported. + * + * @{ + */ + +/** + * Initialize plugin. + * + * Must be called before any other operation. + * + * @param[in] roctracer_major_version The major version of the ROCtracer API + * being used by the ROCtracer Tool. An error is reported if this does not + * match the major version of the ROCtracer API used to build the plugin + * library. This ensures compatibility of the trace data format.
+ * + * @param[in] roctracer_minor_version The minor version of the ROCtracer API + * being used by the ROCtracer Tool. An error is reported if the + * \p roctracer_major_version matches and this is greater than the minor + * version of the ROCtracer API used to build the plugin library. This ensures + * compatibility of the trace data format. + * + * @return Returns 0 on success and -1 on error. + */ +ROCTRACER_EXPORT int roctracer_plugin_initialize( + uint32_t roctracer_major_version, uint32_t roctracer_minor_version); + +/** + * Finalize plugin. + * + * This must be called after ::roctracer_plugin_initialize and after all trace + * data has been reported by ::roctracer_plugin_write_callback_record and + * ::roctracer_plugin_write_activity_records. + */ +ROCTRACER_EXPORT void roctracer_plugin_finalize(); + +/** @} */ + +/** \defgroup trace_record_write_functions Trace data reporting + * + * Operations to output trace data. + * + * @{ + */ + +/** + * Report a single callback trace record. + * + * @param[in] record Primarily domain independent trace data. + * + * @param[in] callback_data Domain specific trace data. The type of this + * argument depends on the value of \p record.domain. + * + * @return Returns 0 on success and -1 on error. + */ +ROCTRACER_EXPORT int roctracer_plugin_write_callback_record( + const roctracer_record_t* record, const void* callback_data); + +/** + * Report a range of activity trace data. + * + * Reports a range of primarily domain independent trace data. The range is + * specified by a pointer to the first record and a pointer to one past the + * last record. ::roctracer_next_record is used to iterate the range in forward + * order. + * + * @param[in] begin Pointer to the first record. + * + * @param[in] end Pointer to one past the last record. + * + * @return Returns 0 on success and -1 on error.
+ */ +ROCTRACER_EXPORT int roctracer_plugin_write_activity_records( + const roctracer_record_t* begin, const roctracer_record_t* end); + +/** @} */ + +#ifdef __cplusplus +} /* extern "C" */ +#endif /* __cplusplus */ + +#endif /* ROCTRACER_PLUGIN_H_ */ diff --git a/inc/roctracer_roctx.h b/inc/roctracer_roctx.h index b3335713..8ff3190f 100644 --- a/inc/roctracer_roctx.h +++ b/inc/roctracer_roctx.h @@ -21,7 +21,7 @@ #ifndef ROCTRACER_ROCTX_H_ #define ROCTRACER_ROCTX_H_ -#include +#include "roctx.h" /** * ROCTX API ID enumeration diff --git a/inc/roctx.h b/inc/roctx.h index cec83dc8..ccec5a18 100644 --- a/inc/roctx.h +++ b/inc/roctx.h @@ -211,7 +211,8 @@ typedef uint64_t roctx_range_id_t; * * \return Returns the ID of the new range. */ -ROCTX_API roctx_range_id_t roctxRangeStartA(const char* message) ROCTX_VERSION_4_1; +ROCTX_API roctx_range_id_t roctxRangeStartA(const char* message) + ROCTX_VERSION_4_1; #define roctxRangeStart(message) roctxRangeStartA(message) /** diff --git a/plugin/CMakeLists.txt b/plugin/CMakeLists.txt new file mode 100644 index 00000000..0d0717c5 --- /dev/null +++ b/plugin/CMakeLists.txt @@ -0,0 +1,23 @@ +################################################################################ +## Copyright (c) 2022 Advanced Micro Devices, Inc. +## +## Permission is hereby granted, free of charge, to any person obtaining a copy +## of this software and associated documentation files (the "Software"), to +## deal in the Software without restriction, including without limitation the +## rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +## sell copies of the Software, and to permit persons to whom the Software is +## furnished to do so, subject to the following conditions: +## +## The above copyright notice and this permission notice shall be included in +## all copies or substantial portions of the Software. 
+## +## THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +## IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +## FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +## AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +## LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +## FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +## IN THE SOFTWARE. +################################################################################ + +add_subdirectory(file) \ No newline at end of file diff --git a/plugin/exportmap b/plugin/exportmap new file mode 100644 index 00000000..a189a634 --- /dev/null +++ b/plugin/exportmap @@ -0,0 +1,7 @@ +{ +global: roctracer_plugin_initialize; + roctracer_plugin_finalize; + roctracer_plugin_write_callback_record; + roctracer_plugin_write_activity_records; +local: *; +}; \ No newline at end of file diff --git a/plugin/file/CMakeLists.txt b/plugin/file/CMakeLists.txt new file mode 100644 index 00000000..f0888aee --- /dev/null +++ b/plugin/file/CMakeLists.txt @@ -0,0 +1,42 @@ +################################################################################ +## Copyright (c) 2022 Advanced Micro Devices, Inc. +## +## Permission is hereby granted, free of charge, to any person obtaining a copy +## of this software and associated documentation files (the "Software"), to +## deal in the Software without restriction, including without limitation the +## rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +## sell copies of the Software, and to permit persons to whom the Software is +## furnished to do so, subject to the following conditions: +## +## The above copyright notice and this permission notice shall be included in +## all copies or substantial portions of the Software. 
+## +## THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +## IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +## FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +## AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +## LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +## FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +## IN THE SOFTWARE. +################################################################################ + +file(GLOB FILE_SOURCES "*.cpp") +add_library(file_plugin ${LIBRARY_TYPE} ${FILE_SOURCES}) + +set_target_properties(file_plugin PROPERTIES + CXX_VISIBILITY_PRESET hidden + LINK_DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/../exportmap + LIBRARY_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}) + +target_compile_definitions(file_plugin + PRIVATE HIP_PROF_HIP_API_STRING=1 __HIP_PLATFORM_HCC__=1) + +target_include_directories(file_plugin PRIVATE ${PROJECT_SOURCE_DIR}/inc) + +target_link_options(file_plugin PRIVATE -Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/../exportmap -Wl,--no-undefined) + +target_link_libraries(file_plugin PRIVATE util roctracer hsa-runtime64::hsa-runtime64 stdc++fs) + +install(TARGETS file_plugin LIBRARY + DESTINATION ${CMAKE_INSTALL_LIBDIR}/${PROJECT_NAME} + COMPONENT runtime) \ No newline at end of file diff --git a/plugin/file/file.cpp b/plugin/file/file.cpp new file mode 100644 index 00000000..c0d78b13 --- /dev/null +++ b/plugin/file/file.cpp @@ -0,0 +1,369 @@ +/* Copyright (c) 2022 Advanced Micro Devices, Inc. 
+ + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. */ + +#include "debug.h" + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +// Macro to check ROCtracer calls status +#define CHECK_ROCTRACER(call) \ + do { \ + if ((call) != 0) fatal("%s", roctracer_error_string()); \ + } while (false) + +namespace fs = std::experimental::filesystem; + +namespace { + +uint32_t GetPid() { + static uint32_t pid = syscall(__NR_getpid); + return pid; +} + +/* The function extracts the kernel name from +input string. By using the iterators it finds the +window in the string which contains only the kernel name. 
+For example 'Foo::foo(a[], int (int))' -> 'foo'*/ +std::string truncate_name(const std::string& name) { + auto rit = name.rbegin(); + auto rend = name.rend(); + uint32_t counter = 0; + char open_token = 0; + char close_token = 0; + while (rit != rend) { + if (counter == 0) { + switch (*rit) { + case ')': + counter = 1; + open_token = ')'; + close_token = '('; + break; + case '>': + counter = 1; + open_token = '>'; + close_token = '<'; + break; + case ']': + counter = 1; + open_token = ']'; + close_token = '['; + break; + case ' ': + ++rit; + continue; + } + if (counter == 0) break; + } else { + if (*rit == open_token) counter++; + if (*rit == close_token) counter--; + } + ++rit; + } + auto rbeg = rit; + while ((rit != rend) && (*rit != ' ') && (*rit != ':')) rit++; + return name.substr(rend - rit, rit - rbeg); +} + +// C++ symbol demangle +std::string cxx_demangle(const std::string& symbol) { + int status; + char* demangled = abi::__cxa_demangle(symbol.c_str(), nullptr, nullptr, &status); + if (status != 0) return symbol; + std::string ret(demangled); + free(demangled); + return ret; +} + +class file_plugin_t { + private: + class output_file_t { + public: + output_file_t(std::string name) : name_(std::move(name)) {} + + std::string name() const { return name_; } + + template std::ostream& operator<<(T&& value) { + if (!is_open()) open(); + return stream_ << std::forward(value); + } + + std::ostream& operator<<(std::ostream& (*func)(std::ostream&)) { + if (!is_open()) open(); + return stream_ << func; + } + + void open() { + // If the stream is already in the failed state, there's no need to try to open the file. 
+ if (fail()) return; + + const char* output_dir = getenv("ROCP_OUTPUT_DIR"); + + if (output_dir == nullptr) { + stream_.copyfmt(std::cout); + stream_.clear(std::cout.rdstate()); + stream_.basic_ios::rdbuf(std::cout.rdbuf()); + return; + } + + fs::path output_prefix(output_dir); + if (!fs::is_directory(fs::status(output_prefix))) { + if (!stream_.fail()) warning("Cannot open output directory '%s'", output_dir); + stream_.setstate(std::ios_base::failbit); + return; + } + + std::stringstream ss; + ss << GetPid() << "_" << name_; + stream_.open(output_prefix / ss.str()); + } + + bool is_open() const { return stream_.is_open(); } + bool fail() const { return stream_.fail(); } + + private: + const std::string name_; + std::ofstream stream_; + }; + + output_file_t* get_output_file(uint32_t domain, uint32_t op = 0) { + switch (domain) { + case ACTIVITY_DOMAIN_ROCTX: + return &roctx_file_; + case ACTIVITY_DOMAIN_HSA_API: + return &hsa_api_file_; + case ACTIVITY_DOMAIN_HIP_API: + return &hip_api_file_; + case ACTIVITY_DOMAIN_HIP_OPS: + return &hip_activity_file_; + case ACTIVITY_DOMAIN_HSA_OPS: + if (op == HSA_OP_ID_COPY) { + return &hsa_async_copy_file_; + } else if (op == HSA_OP_ID_RESERVED1) { + return &pc_sample_file_; + } + default: + assert(!"domain/op not supported!"); + break; + } + return nullptr; + } + + public: + file_plugin_t() { + // Dumping HSA handles for agents + output_file_t hsa_handles("hsa_handles.txt"); + + [[maybe_unused]] hsa_status_t status = hsa_iterate_agents( + [](hsa_agent_t agent, void* user_data) { + auto* file = static_cast(user_data); + hsa_device_type_t type; + + if (hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &type) != HSA_STATUS_SUCCESS) + return HSA_STATUS_ERROR; + + *file << std::hex << std::showbase << agent.handle << " agent " + << ((type == HSA_DEVICE_TYPE_CPU) ? 
"cpu" : "gpu") << std::endl; + return HSA_STATUS_SUCCESS; + }, + &hsa_handles); + assert(status == HSA_STATUS_SUCCESS && "failed to iterate HSA agents"); + if (hsa_handles.fail()) { + warning("Cannot write to '%s'", hsa_handles.name().c_str()); + return; + } + + // App begin timestamp begin_ts_file.txt + output_file_t begin_ts("begin_ts_file.txt"); + + roctracer_timestamp_t app_begin_timestamp; + CHECK_ROCTRACER(roctracer_get_timestamp(&app_begin_timestamp)); + begin_ts << std::dec << app_begin_timestamp << std::endl; + if (begin_ts.fail()) { + warning("Cannot write to '%s'", begin_ts.name().c_str()); + return; + } + + valid_ = true; + } + + int write_callback_record(const roctracer_record_t* record, const void* callback_data) { + output_file_t* output_file{nullptr}; + switch (record->domain) { + case ACTIVITY_DOMAIN_ROCTX: { + const roctx_api_data_t* data = reinterpret_cast(callback_data); + output_file = get_output_file(ACTIVITY_DOMAIN_ROCTX); + *output_file << std::dec << record->begin_ns << " " << record->process_id << ":" + << record->thread_id << " " << record->op << ":" << data->args.id << ":\"" + << (data->args.message ? data->args.message : "") << "\"" << std::endl; + break; + } + case ACTIVITY_DOMAIN_HSA_API: { + const hsa_api_data_t* data = reinterpret_cast(callback_data); + output_file = get_output_file(ACTIVITY_DOMAIN_HSA_API); + *output_file << std::dec << record->begin_ns << ":" + << ((record->op == HSA_API_ID_hsa_shut_down) ? 
record->begin_ns + : record->end_ns) + << " " << record->process_id << ":" << record->thread_id << " " + << hsa_api_data_pair_t(record->op, *data) << " :" << data->correlation_id + << std::endl; + break; + } + case ACTIVITY_DOMAIN_HIP_API: { + const hip_api_data_t* data = reinterpret_cast(callback_data); + + std::string kernel_name; + if (record->kernel_name) { + static bool truncate = []() { + const char* env_var = getenv("ROCP_TRUNCATE_NAMES"); + return env_var && std::atoi(env_var) != 0; + }(); + kernel_name = cxx_demangle(record->kernel_name); + if (truncate) kernel_name = truncate_name(kernel_name); + kernel_name = " kernel=" + kernel_name; + } + + output_file = get_output_file(ACTIVITY_DOMAIN_HIP_API); + *output_file << std::dec << record->begin_ns << ":" << record->end_ns << " " + << record->process_id << ":" << record->thread_id << " " + << hipApiString((hip_api_id_t)record->op, data) << kernel_name << " :" + << data->correlation_id << std::endl; + break; + } + default: + warning("write_callback_record: ignored record for domain %d", record->domain); + break; + } + + return (output_file && output_file->fail()) ? -1 : 0; + } + + int write_activity_records(const roctracer_record_t* begin, const roctracer_record_t* end) { + while (begin != end) { + output_file_t* output_file{nullptr}; + const char* name = roctracer_op_string(begin->domain, begin->op, begin->kind); + + switch (begin->domain) { + case ACTIVITY_DOMAIN_HIP_OPS: { + // The post-processing script cannot handle HIP ops without a correlation ID. The + // correlation ID is needed to connect the record to a HIP stream and originating thread. + // The script could be modified to handle ops without correlation IDs, but for backward + // compatibility, we are simply dropping the records here.
+ if (begin->correlation_id == 0) break; + + output_file = get_output_file(ACTIVITY_DOMAIN_HIP_OPS); + *output_file << std::dec << begin->begin_ns << ":" << begin->end_ns << " " + << begin->device_id << ":" << begin->queue_id << " " << name << ":" + << begin->correlation_id << ":" << GetPid() << std::endl; + break; + } + case ACTIVITY_DOMAIN_HSA_OPS: + output_file = get_output_file(ACTIVITY_DOMAIN_HSA_OPS, begin->op); + if (begin->op == HSA_OP_ID_COPY) { + *output_file << std::dec << begin->begin_ns << ":" << begin->end_ns + << " async-copy:" << begin->correlation_id << ":" << GetPid() << std::endl; + break; + } else if (begin->op == HSA_OP_ID_RESERVED1) { + *output_file << std::dec << begin->pc_sample.se << " " << begin->pc_sample.cycle << " " + << std::hex << std::showbase << begin->pc_sample.pc << " " << name + << std::endl; + break; + } + [[fallthrough]]; + default: { + warning("write_activity_records: ignored activity for domain %d", begin->domain); + break; + } + } + if (output_file && output_file->fail()) return -1; + CHECK_ROCTRACER(roctracer_next_record(begin, &begin)); + } + return 0; + } + + bool is_valid() const { return valid_; } + + private: + bool valid_{false}; + + output_file_t roctx_file_{"roctx_trace.txt"}, hsa_api_file_{"hsa_api_trace.txt"}, + hip_api_file_{"hip_api_trace.txt"}, hip_activity_file_{"hcc_ops_trace.txt"}, + hsa_async_copy_file_{"async_copy_trace.txt"}, pc_sample_file_{"pcs_trace.txt"}; +}; + +file_plugin_t* file_plugin = nullptr; + +} // namespace + +ROCTRACER_EXPORT int roctracer_plugin_initialize(uint32_t roctracer_major_version, + uint32_t roctracer_minor_version) { + if (roctracer_major_version != ROCTRACER_VERSION_MAJOR || + roctracer_minor_version < ROCTRACER_VERSION_MINOR) + return -1; + + if (file_plugin != nullptr) return -1; + + file_plugin = new file_plugin_t(); + if (file_plugin->is_valid()) return 0; + + // The plugin failed to initialize, destroy it and return an error.
+ delete file_plugin; + file_plugin = nullptr; + return -1; +} + +ROCTRACER_EXPORT void roctracer_plugin_finalize() { + if (!file_plugin) return; + delete file_plugin; + file_plugin = nullptr; +} + +ROCTRACER_EXPORT int roctracer_plugin_write_callback_record(const roctracer_record_t* record, + const void* callback_data) { + if (!file_plugin || !file_plugin->is_valid()) return -1; + return file_plugin->write_callback_record(record, callback_data); +} + +ROCTRACER_EXPORT int roctracer_plugin_write_activity_records(const roctracer_record_t* begin, + const roctracer_record_t* end) { + if (!file_plugin || !file_plugin->is_valid()) return -1; + return file_plugin->write_activity_records(begin, end); +} diff --git a/script/gen_ostream_ops.py b/script/gen_ostream_ops.py index 0f839acc..b9a52a30 100755 --- a/script/gen_ostream_ops.py +++ b/script/gen_ostream_ops.py @@ -52,12 +52,21 @@ header_basic = \ +'namespace detail {\n' + \ 'template \n' + \ ' inline static std::ostream& operator<<(std::ostream& out, const T& v) {\n' + \ ' using std::operator<<;\n' + \ ' static bool recursion = false;\n' + \ ' if (recursion == false) { recursion = true; out << v; recursion = false; }\n' + \ -' return out; }\n' +' return out;\n }\n' + \ +'\n' + \ +' inline static std::ostream &operator<<(std::ostream &out, const unsigned char &v) {\n' + \ +' out << (unsigned int)v;\n' + \ +' return out;\n }\n' + \ +'\n' + \ +' inline static std::ostream &operator<<(std::ostream &out, const char &v) {\n' + \ +' out << (unsigned char)v;\n' + \ +' return out;\n }\n' structs_analyzed = {} global_ops = '' @@ -111,9 +120,9 @@ def process_struct(file_handle, cppHeader_struct, cppHeader, parent_hier_name, a indent = "" str += " if (std::string(\"" + cppHeader_struct + "::" + name + "\").find(" + apiname.upper() + "_structs_regex" + ") != std::string::npos) {\n" indent = " " - str += indent + " roctracer::" + apiname.lower() + "_support::operator<<(out, \"" + name + "=\");\n" - str += indent + " roctracer::" + 
apiname.lower() + "_support::operator<<(out, v." + name + ");\n" - str += indent + " roctracer::" + apiname.lower() + "_support::operator<<(out, \", \");\n" + str += indent + " roctracer::" + apiname.lower() + "_support::detail::operator<<(out, \"" + name + "=\");\n" + str += indent + " roctracer::" + apiname.lower() + "_support::detail::operator<<(out, v." + name + ");\n" + str += indent + " roctracer::" + apiname.lower() + "_support::detail::operator<<(out, \", \");\n" str += " }\n" if "void" not in mtype: global_str += str @@ -152,11 +161,12 @@ def gen_cppheader(infilepath, outfilepath, rank): header_s = \ '#ifndef INC_' + apiname + '_OSTREAM_OPS_H_\n' + \ '#define INC_' + apiname + '_OSTREAM_OPS_H_\n' + \ + '\n' + \ + '#include "roctracer.h"\n' + \ + '\n' + \ '#ifdef __cplusplus\n' + \ '#include \n' + \ - '\n' + \ - '#include "roctracer.h"\n' - header_s += '#include \n' + '#include \n' output_filename_h.write(header_s) output_filename_h.write('\n') @@ -181,7 +191,7 @@ def gen_cppheader(infilepath, outfilepath, rank): if len(cppHeader.classes[c]["properties"]["public"]) != 0: output_filename_h.write("inline static std::ostream& operator<<(std::ostream& out, const " + c + "& v)\n") output_filename_h.write("{\n") - output_filename_h.write(" roctracer::" + apiname.lower() + "_support::operator<<(out, '{');\n") + output_filename_h.write(" std::operator<<(out, '{');\n") output_filename_h.write(" " + apiname.upper() + "_depth_max_cnt++;\n") output_filename_h.write(" if (" + apiname.upper() + "_depth_max == -1 || " + apiname.upper() + "_depth_max_cnt <= " + apiname.upper() + "_depth_max" + ") {\n" ) process_struct(output_filename_h, c, cppHeader, "", apiname) @@ -190,15 +200,15 @@ def gen_cppheader(infilepath, outfilepath, rank): output_filename_h.write(global_str) output_filename_h.write(" };\n") output_filename_h.write(" " + apiname.upper() + "_depth_max_cnt--;\n") - output_filename_h.write(" roctracer::" + apiname.lower() + "_support::operator<<(out, '}');\n") + 
output_filename_h.write(" std::operator<<(out, '}');\n") output_filename_h.write(" return out;\n") output_filename_h.write("}\n") global_str = '' - global_ops += "inline static std::ostream& operator<<(std::ostream& out, const " + c + "& v)\n" + "{\n" + " roctracer::" + apiname.lower() + "_support::operator<<(out, v);\n" + " return out;\n" + "}\n\n" + global_ops += "inline static std::ostream& operator<<(std::ostream& out, const " + c + "& v)\n" + "{\n" + " roctracer::" + apiname.lower() + "_support::detail::operator<<(out, v);\n" + " return out;\n" + "}\n\n" if rank == 1 or rank == 2: footer = '// end ostream ops for '+ apiname + ' \n' - footer += '};};\n\n' + footer += '};};};\n\n' output_filename_h.write(footer) output_filename_h.write(global_ops) footer = '#endif //__cplusplus\n' + \ diff --git a/script/hsaap.py b/script/hsaap.py index 6046fb2a..ad65ff82 100755 --- a/script/hsaap.py +++ b/script/hsaap.py @@ -332,23 +332,18 @@ def __init__(self, out_h_file, hsa_dir, api_table_h, api_headers, license): self.cpp_content += "/* Generated by " + os.path.basename(__file__) + " */\n" + license + "\n\n" self.cpp_content += '#include \n' - self.cpp_content += '#include \"util/callback_table.h\"\n\n' self.cpp_content += '#include \n' - self.cpp_content += 'namespace roctracer {\n' - self.cpp_content += 'extern activity_correlation_id_t NextCorrelationId();\n' - self.cpp_content += 'namespace hsa_support {\n\n' + self.cpp_content += 'namespace roctracer::hsa_support::detail {\n' self.cpp_content += 'static CoreApiTable CoreApi_saved_before_cb;\n' self.cpp_content += 'static AmdExtTable AmdExt_saved_before_cb;\n' self.cpp_content += 'static ImageExtTable ImageExt_saved_before_cb;\n\n' - self.cpp_content += 'static thread_local uint64_t hsa_correlation_id_tls = 0;\n' - self.cpp_content += self.add_section('API callback functions', '', self.gen_callbacks) self.cpp_content += self.add_section('API intercepting code', '', self.gen_intercept) self.cpp_content += 
self.add_section('API get_name function', ' ', self.gen_get_name) self.cpp_content += self.add_section('API get_code function', ' ', self.gen_get_code) - self.cpp_content += '\n};};\n' + self.cpp_content += '\n};\n' # add code section def add_section(self, title, gap, fun): @@ -406,6 +401,7 @@ def gen_arg_struct(self, n, name, call, struct): content += ' } ' + call + ';\n' else: content += ' } args;\n' + content += ' uint64_t *phase_data;\n' content += '};\n' return content @@ -413,36 +409,50 @@ def gen_arg_struct(self, n, name, call, struct): def gen_callbacks(self, n, name, call, struct): content = '' if n == -1: - content += 'static util::CallbackTable cb_table;\n' + content += '/* section: Static declarations */\n' content += '\n' if call != '-': call_id = self.api_id[call]; ret_type = struct['ret'] content += 'static ' + ret_type + ' ' + call + '_callback(' + struct['args'] + ') {\n' - content += ' hsa_api_data_t api_data{};\n' + + content += ' hsa_trace_data_t trace_data;\n' + content += ' bool enabled{false};\n' + content += '\n' + content += ' if (auto function = report_activity.load(std::memory_order_relaxed); function &&\n' + content += ' (enabled =\n' + content += ' function(ACTIVITY_DOMAIN_HSA_API, ' + call_id + ', &trace_data) == 0)) {\n' + content += ' if (trace_data.phase_enter != nullptr) {\n' + for var in struct['alst']: item = struct['astr'][var]; if re.search(r'char\* ', item): - content += ' api_data.args.' + call + '.' + var + ' = ' + '(' + var + ' != NULL) ? strdup(' + var + ')' + ' : NULL;\n' + # FIXME: we should not strdup the char* arguments here, as the callback will not outlive the scope of this function. Instead, we + # should generate a helper function to capture the content of the arguments similar to hipApiArgsInit for HIP. We also need a + # helper to free the memory that is allocated to capture the content. + content += ' trace_data.api_data.args.' + call + '.' + var + ' = ' + '(' + var + ' != NULL) ? 
strdup(' + var + ')' + ' : NULL;\n' else: - content += ' api_data.args.' + call + '.' + var + ' = ' + var + ';\n' + content += ' trace_data.api_data.args.' + call + '.' + var + ' = ' + var + ';\n' if call == 'hsa_amd_memory_async_copy_rect' and var == 'range': - content += ' api_data.args.' + call + '.' + var + '__val = ' + '*(' + var + ');\n' - content += ' auto [ api_callback_fun, api_callback_arg ] = cb_table.Get(' + call_id + ');\n' - content += ' api_data.phase = 0;\n' - content += ' api_data.correlation_id = NextCorrelationId();\n' - content += ' hsa_correlation_id_tls = api_data.correlation_id;\n' - content += ' if (api_callback_fun) api_callback_fun(ACTIVITY_DOMAIN_HSA_API, ' + call_id + ', &api_data, api_callback_arg);\n' + content += ' trace_data.api_data.args.' + call + '.' + var + '__val = ' + '*(' + var + ');\n' + + content += ' trace_data.phase_enter(' + call_id + ', &trace_data);\n' + content += ' }\n' + content += ' }\n' + content += '\n' + if ret_type != 'void': - content += ' ' + ret_type + ' ret =' + content += ' trace_data.api_data.' + ret_type + '_retval = ' content += ' ' + name + '_saved_before_cb.' + call + '_fn(' + ', '.join(struct['alst']) + ');\n' + + content += '\n' + content += ' if (enabled && trace_data.phase_exit != nullptr)\n' + content += ' trace_data.phase_exit(' + call_id + ', &trace_data);\n' + if ret_type != 'void': - content += ' api_data.' + ret_type + '_retval = ret;\n' - content += ' api_data.phase = 1;\n' - content += ' if (api_callback_fun) api_callback_fun(ACTIVITY_DOMAIN_HSA_API, ' + call_id + ', &api_data, api_callback_arg);\n' - if ret_type != 'void': - content += ' return ret;\n' + content += ' return trace_data.api_data.' 
+ ret_type + '_retval;\n' content += '}\n' + return content # generate API intercepting code @@ -464,7 +474,7 @@ def gen_intercept(self, n, name, call, struct): def gen_get_name(self, n, name, call, struct): content = '' if n == -1: - content += 'static const char* GetApiName(const uint32_t& id) {\n' + content += 'static const char* GetApiName(uint32_t id) {\n' content += ' switch (id) {\n' return content if call != '-': diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 11568f9e..90bd2e90 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -100,6 +100,7 @@ set(PUBLIC_HEADERS roctracer_hcc.h roctracer_hsa.h roctracer_roctx.h + roctracer_plugin.h ext/prof_protocol.h) foreach(header ${PUBLIC_HEADERS}) @@ -120,8 +121,28 @@ foreach(header ${GENERATED_HEADERS}) COMPONENT runtime) endforeach() +## Build the util library +file(GLOB UTIL_SOURCES "util/*.cpp") +add_library(util STATIC ${UTIL_SOURCES}) + +set_target_properties(util PROPERTIES POSITION_INDEPENDENT_CODE ON) + +target_include_directories(util + PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/util) + +check_include_file(backtrace.h BACKTRACE_H) +if(BACKTRACE_H) + target_compile_definitions(util PRIVATE HAVE_BACKTRACE_H) + find_library(BACKTRACE_LIB "backtrace" ${CMAKE_C_IMPLICIT_LINK_DIRECTORIES}) +endif() + +if(BACKTRACE_LIB) + target_compile_definitions(util PRIVATE ENABLE_BACKTRACE) + target_link_libraries(util PRIVATE ${BACKTRACE_LIB}) +endif() + ## Build the ROCtracer library -file(GLOB ROCTRACER_SOURCES "roctracer/*.cpp" "util/*.cpp") +file(GLOB ROCTRACER_SOURCES "roctracer/*.cpp") add_library(roctracer ${LIBRARY_TYPE} ${ROCTRACER_SOURCES} ${GENERATED_HEADERS} hsa_prof_str.inline.h) set_target_properties(roctracer PROPERTIES @@ -145,7 +166,7 @@ target_include_directories(roctracer ${CMAKE_CURRENT_SOURCE_DIR}/roctracer ${CMAKE_CURRENT_SOURCE_DIR}) target_link_options(roctracer PRIVATE -Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/roctracer/exportmap -Wl,--no-undefined) -target_link_libraries(roctracer 
PRIVATE hsa-runtime64::hsa-runtime64 Threads::Threads dl) +target_link_libraries(roctracer PRIVATE util hsa-runtime64::hsa-runtime64 stdc++fs Threads::Threads dl) install(TARGETS roctracer LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} @@ -176,7 +197,7 @@ install(TARGETS roctx LIBRARY ## Build the tracer_tool library if (${LIBRARY_TYPE} STREQUAL SHARED) -file(GLOB TRACER_TOOL_SOURCES "tracer_tool/*.cpp" "util/*.cpp") +file(GLOB TRACER_TOOL_SOURCES "tracer_tool/*.cpp") add_library(roctracer_tool SHARED ${TRACER_TOOL_SOURCES}) set_target_properties(roctracer_tool PROPERTIES @@ -191,7 +212,7 @@ target_include_directories(roctracer_tool ${PROJECT_SOURCE_DIR}/inc ${CMAKE_CURRENT_SOURCE_DIR}/roctracer ${CMAKE_CURRENT_SOURCE_DIR}) -target_link_libraries(roctracer_tool roctracer hsa-runtime64::hsa-runtime64 Threads::Threads atomic dl) +target_link_libraries(roctracer_tool util roctracer hsa-runtime64::hsa-runtime64 stdc++fs Threads::Threads atomic dl) target_link_options(roctracer_tool PRIVATE -Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/tracer_tool/exportmap -Wl,--no-undefined) install(TARGETS roctracer_tool LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}/${PROJECT_NAME} COMPONENT runtime) diff --git a/src/roctracer/backward_compat.cpp b/src/roctracer/backward_compat.cpp index 2fa761b9..1abf58ae 100644 --- a/src/roctracer/backward_compat.cpp +++ b/src/roctracer/backward_compat.cpp @@ -20,9 +20,54 @@ #include "roctracer.h" -#define PUBLIC_API __attribute__((visibility("default"))) +extern "C" { // Deprecated functions: -extern "C" PUBLIC_API int roctracer_load() { return 1; } -extern "C" PUBLIC_API void roctracer_unload() {} -extern "C" PUBLIC_API void roctracer_flush_buf() {} +ROCTRACER_API int roctracer_load() { return 1; } +ROCTRACER_API void roctracer_unload() {} +ROCTRACER_API void roctracer_flush_buf() {} +ROCTRACER_API void roctracer_mark(const char*) {} + +ROCTRACER_API roctracer_status_t roctracer_enable_callback(roctracer_rtapi_callback_t callback, + void* 
user_data) { + for (uint32_t domain = 0; domain < ACTIVITY_DOMAIN_NUMBER; ++domain) + if (auto status = + roctracer_enable_domain_callback((roctracer_domain_t)domain, callback, user_data); + status != ROCTRACER_STATUS_SUCCESS) + return status; + return ROCTRACER_STATUS_SUCCESS; +} + +ROCTRACER_API roctracer_status_t roctracer_disable_callback() { + for (uint32_t domain = 0; domain < ACTIVITY_DOMAIN_NUMBER; ++domain) + if (auto status = roctracer_disable_domain_callback((roctracer_domain_t)domain); + status != ROCTRACER_STATUS_SUCCESS) + return status; + return ROCTRACER_STATUS_SUCCESS; +} + +ROCTRACER_API roctracer_status_t roctracer_enable_activity_expl(roctracer_pool_t* pool) { + for (uint32_t domain = 0; domain < ACTIVITY_DOMAIN_NUMBER; ++domain) + if (auto status = roctracer_enable_domain_activity_expl((roctracer_domain_t)domain, pool); + status != ROCTRACER_STATUS_SUCCESS) + return status; + return ROCTRACER_STATUS_SUCCESS; +} + +ROCTRACER_API roctracer_status_t roctracer_enable_activity() { + for (uint32_t domain = 0; domain < ACTIVITY_DOMAIN_NUMBER; ++domain) + if (auto status = roctracer_enable_domain_activity((roctracer_domain_t)domain); + status != ROCTRACER_STATUS_SUCCESS) + return status; + return ROCTRACER_STATUS_SUCCESS; +} + +ROCTRACER_API roctracer_status_t roctracer_disable_activity() { + for (uint32_t domain = 0; domain < ACTIVITY_DOMAIN_NUMBER; ++domain) + if (auto status = roctracer_disable_domain_activity((roctracer_domain_t)domain); + status != ROCTRACER_STATUS_SUCCESS) + return status; + return ROCTRACER_STATUS_SUCCESS; +} + +} // extern "C" diff --git a/src/roctracer/correlation_id.cpp b/src/roctracer/correlation_id.cpp new file mode 100644 index 00000000..a76cce1f --- /dev/null +++ b/src/roctracer/correlation_id.cpp @@ -0,0 +1,99 @@ +/* Copyright (c) 2022 Advanced Micro Devices, Inc. 
+ + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. */ + +#include "correlation_id.h" +#include "roctracer.h" + +#include +#include +#include + +namespace { + +// A stack that can be used for TLS variables. TLS destructors are invoked before global destructors +// which is a problem if operations invoked by global destructors use TLS variables. If the TLS +// stack is destructed, it still has well defined behavior by always returning a dummy element. +template class Stack : std::stack> { + using parent_type = typename std::stack>; + + public: + Stack() { valid_.store(true, std::memory_order_relaxed); } + ~Stack() { valid_.store(false, std::memory_order_relaxed); } + + template auto& emplace(Args&&... args) { + return is_valid() ? parent_type::emplace(std::forward(args)...) 
+ : dummy_element_ = T(std::forward(args)...); + } + void push(const T& v) { + if (is_valid()) parent_type::push(v); + } + void push(T&& v) { + if (is_valid()) parent_type::push(std::move(v)); + } + void pop() { + if (is_valid()) parent_type::pop(); + } + const auto& top() const { return is_valid() ? parent_type::top() : dummy_element_; } + auto& top() { return is_valid() ? parent_type::top() : (dummy_element_ = {}); } + + bool is_valid() const { return valid_.load(std::memory_order_relaxed); } + size_t size() const { return is_valid() ? parent_type::size() : 0; } + bool empty() const { return size() == 0; } + + private: + std::atomic valid_{false}; + T dummy_element_; // Dummy element used when the stack is not valid. +}; + +thread_local Stack correlation_id_stack{}; +thread_local Stack external_id_stack{}; + +} // namespace + +namespace roctracer { + +activity_correlation_id_t CorrelationIdPush() { + static std::atomic counter{1}; + return correlation_id_stack.emplace(counter.fetch_add(1, std::memory_order_relaxed)); +} + +void CorrelationIdPop() { correlation_id_stack.pop(); } + +activity_correlation_id_t CorrelationId() { + return correlation_id_stack.empty() ? 0 : correlation_id_stack.top(); +} + +void ExternalCorrelationIdPush(activity_correlation_id_t external_id) { + external_id_stack.push(external_id); +} + +std::optional ExternalCorrelationIdPop() { + if (external_id_stack.empty()) return std::nullopt; + + auto external_id = external_id_stack.top(); + external_id_stack.pop(); + return std::make_optional(external_id); +} + +std::optional ExternalCorrelationId() { + return external_id_stack.empty() ? std::nullopt : std::make_optional(external_id_stack.top()); +} + +} // namespace roctracer \ No newline at end of file diff --git a/src/roctracer/correlation_id.h b/src/roctracer/correlation_id.h new file mode 100644 index 00000000..6fe77ead --- /dev/null +++ b/src/roctracer/correlation_id.h @@ -0,0 +1,50 @@ +/* Copyright (c) 2022 Advanced Micro Devices, Inc. 
+ + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. */ + +#pragma once + +#include "roctracer.h" + +#include + +namespace roctracer { + +// Start a new correlation ID region and push it onto the thread local stack. Correlation ID +// regions are nested and per-thread. +activity_correlation_id_t CorrelationIdPush(); + +// Stop the current correlation ID region and pop it from the thread local stack. +void CorrelationIdPop(); + +// Return the ID of the currently active correlation ID region, or 0 if no region is active. +activity_correlation_id_t CorrelationId(); + +// Start a new external correlation ID region for the given \p external_id. As for the internal +// correlation ID regions, external correlation ID regions are nested and per-thread. +void ExternalCorrelationIdPush(activity_correlation_id_t external_id); + +// Stop the current external correlation ID region and return the external_id used to start the +// region. Return a nullopt if no region was active.
+std::optional ExternalCorrelationIdPop(); + +// Return the current external correlation ID or nullopt if no region is active. +std::optional ExternalCorrelationId(); + +} // namespace roctracer \ No newline at end of file diff --git a/src/roctracer/exception.h b/src/roctracer/exception.h index a2a33a03..9efe2ee5 100644 --- a/src/roctracer/exception.h +++ b/src/roctracer/exception.h @@ -24,6 +24,7 @@ #include #include #include +#include #define EXC_RAISING(error, stream) \ do { \ diff --git a/src/roctracer/hsa_support.cpp b/src/roctracer/hsa_support.cpp new file mode 100644 index 00000000..d6bfa7b4 --- /dev/null +++ b/src/roctracer/hsa_support.cpp @@ -0,0 +1,640 @@ +/* Copyright (c) 2022 Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE.
*/ + +#include "hsa_support.h" + +#include "correlation_id.h" +#include "debug.h" +#include "exception.h" +#include "memory_pool.h" +#include "roctracer.h" +#include "roctracer_hsa.h" + +#include +#include +#include +#include +#include +#include + +namespace { + +std::atomic report_activity; + +bool IsEnabled(activity_domain_t domain, uint32_t operation_id) { + auto report = report_activity.load(std::memory_order_relaxed); + return report && report(domain, operation_id, nullptr) == 0; +} + +void ReportActivity(activity_domain_t domain, uint32_t operation_id, void* data) { + if (auto report = report_activity.load(std::memory_order_relaxed)) + report(domain, operation_id, data); +} + +} // namespace + +#include "hsa_prof_str.inline.h" + +namespace roctracer::hsa_support { + +namespace { + +CoreApiTable saved_core_api{}; +AmdExtTable saved_amd_ext_api{}; +hsa_ven_amd_loader_1_01_pfn_t hsa_loader_api{}; + +struct AgentInfo { + int index; + hsa_device_type_t type; +}; +std::unordered_map agent_info_map; + +class Tracker { + public: + enum { ENTRY_INV = 0, ENTRY_INIT = 1, ENTRY_COMPL = 2 }; + + enum entry_type_t { + DFLT_ENTRY_TYPE = 0, + API_ENTRY_TYPE = 1, + COPY_ENTRY_TYPE = 2, + KERNEL_ENTRY_TYPE = 3, + NUM_ENTRY_TYPE = 4 + }; + + struct entry_t { + std::atomic valid; + entry_type_t type; + uint64_t correlation_id; + roctracer_timestamp_t begin; // begin timestamp, ns + roctracer_timestamp_t end; // end timestamp, ns + hsa_agent_t agent; + uint32_t dev_index; + hsa_signal_t orig; + hsa_signal_t signal; + void (*handler)(const entry_t*); + union { + struct { + } copy; + struct { + const char* name; + hsa_agent_t agent; + uint32_t tid; + } kernel; + }; + }; + + // Add tracker entry + inline static void Enable(entry_type_t type, const hsa_agent_t& agent, const hsa_signal_t& signal, + entry_t* entry) { + hsa_status_t status = HSA_STATUS_ERROR; + + // Creating a new tracker entry + entry->type = type; + entry->agent = agent; + entry->dev_index = 0; // 
hsa_rsrc->GetAgentInfo(agent)->dev_index; + entry->orig = signal; + entry->valid.store(ENTRY_INIT, std::memory_order_release); + + // Creating a proxy signal + status = saved_core_api.hsa_signal_create_fn(1, 0, NULL, &(entry->signal)); + if (status != HSA_STATUS_SUCCESS) fatal("hsa_signal_create failed"); + status = saved_amd_ext_api.hsa_amd_signal_async_handler_fn( + entry->signal, HSA_SIGNAL_CONDITION_LT, 1, Handler, entry); + if (status != HSA_STATUS_SUCCESS) fatal("hsa_amd_signal_async_handler failed"); + } + + // Delete tracker entry + inline static void Disable(entry_t* entry) { + saved_core_api.hsa_signal_destroy_fn(entry->signal); + entry->valid.store(ENTRY_INV, std::memory_order_release); + } + + private: + // Entry completion + inline static void Complete(hsa_signal_value_t signal_value, entry_t* entry) { + static roctracer_timestamp_t sysclock_period = []() { + uint64_t sysclock_hz = 0; + hsa_status_t status = + saved_core_api.hsa_system_get_info_fn(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY, &sysclock_hz); + if (status != HSA_STATUS_SUCCESS) fatal("hsa_system_get_info failed"); + return (uint64_t)1000000000 / sysclock_hz; + }(); + + if (entry->type == COPY_ENTRY_TYPE) { + hsa_amd_profiling_async_copy_time_t async_copy_time{}; + hsa_status_t status = saved_amd_ext_api.hsa_amd_profiling_get_async_copy_time_fn( + entry->signal, &async_copy_time); + if (status != HSA_STATUS_SUCCESS) fatal("hsa_amd_profiling_get_async_copy_time failed"); + entry->begin = async_copy_time.start * sysclock_period; + entry->end = async_copy_time.end * sysclock_period; + } else { + assert(false && "should not reach here"); + } + + hsa_signal_t orig = entry->orig; + hsa_signal_t signal = entry->signal; + + // Releasing completed entry + entry->valid.store(ENTRY_COMPL, std::memory_order_release); + + assert(entry->handler != nullptr); + entry->handler(entry); + + // Original intercepted signal completion + if (orig.handle) { + amd_signal_t* orig_signal_ptr = reinterpret_cast(orig.handle); 
+ amd_signal_t* prof_signal_ptr = reinterpret_cast(signal.handle); + orig_signal_ptr->start_ts = prof_signal_ptr->start_ts; + orig_signal_ptr->end_ts = prof_signal_ptr->end_ts; + + [[maybe_unused]] const hsa_signal_value_t new_value = + saved_core_api.hsa_signal_load_relaxed_fn(orig) - 1; + assert(signal_value == new_value && "Tracker::Complete bad signal value"); + saved_core_api.hsa_signal_store_screlease_fn(orig, signal_value); + } + saved_core_api.hsa_signal_destroy_fn(signal); + delete entry; + } + + // Handler for packet completion + static bool Handler(hsa_signal_value_t signal_value, void* arg) { + // Acquire entry + entry_t* entry = reinterpret_cast(arg); + while (entry->valid.load(std::memory_order_acquire) != ENTRY_INIT) sched_yield(); + + // Complete entry + Tracker::Complete(signal_value, entry); + return false; + } +}; + +hsa_status_t HSA_API MemoryAllocateIntercept(hsa_region_t region, size_t size, void** ptr) { + hsa_status_t status = saved_core_api.hsa_memory_allocate_fn(region, size, ptr); + if (status != HSA_STATUS_SUCCESS) return status; + + if (IsEnabled(ACTIVITY_DOMAIN_HSA_EVT, HSA_EVT_ID_ALLOCATE)) { + hsa_evt_data_t data{}; + data.allocate.ptr = *ptr; + data.allocate.size = size; + if (saved_core_api.hsa_region_get_info_fn(region, HSA_REGION_INFO_SEGMENT, + &data.allocate.segment) != HSA_STATUS_SUCCESS || + saved_core_api.hsa_region_get_info_fn(region, HSA_REGION_INFO_GLOBAL_FLAGS, + &data.allocate.global_flag) != HSA_STATUS_SUCCESS) + fatal("hsa_region_get_info failed"); + + ReportActivity(ACTIVITY_DOMAIN_HSA_EVT, HSA_EVT_ID_ALLOCATE, &data); + } + + return HSA_STATUS_SUCCESS; +} + +hsa_status_t MemoryAssignAgentIntercept(void* ptr, hsa_agent_t agent, + hsa_access_permission_t access) { + hsa_status_t status = saved_core_api.hsa_memory_assign_agent_fn(ptr, agent, access); + if (status != HSA_STATUS_SUCCESS) return status; + + if (IsEnabled(ACTIVITY_DOMAIN_HSA_EVT, HSA_EVT_ID_DEVICE)) { + hsa_evt_data_t data{}; + data.device.ptr = ptr; + if 
(saved_core_api.hsa_agent_get_info_fn(agent, HSA_AGENT_INFO_DEVICE, &data.device.type) != + HSA_STATUS_SUCCESS) + fatal("hsa_agent_get_info failed"); + + ReportActivity(ACTIVITY_DOMAIN_HSA_EVT, HSA_EVT_ID_DEVICE, &data); + } + + return HSA_STATUS_SUCCESS; +} + +hsa_status_t MemoryCopyIntercept(void* dst, const void* src, size_t size) { + hsa_status_t status = saved_core_api.hsa_memory_copy_fn(dst, src, size); + if (status != HSA_STATUS_SUCCESS) return status; + + if (IsEnabled(ACTIVITY_DOMAIN_HSA_EVT, HSA_EVT_ID_MEMCOPY)) { + hsa_evt_data_t data{}; + data.memcopy.dst = dst; + data.memcopy.src = src; + data.memcopy.size = size; + + ReportActivity(ACTIVITY_DOMAIN_HSA_EVT, HSA_EVT_ID_MEMCOPY, &data); + } + + return HSA_STATUS_SUCCESS; +} + +hsa_status_t MemoryPoolAllocateIntercept(hsa_amd_memory_pool_t pool, size_t size, uint32_t flags, + void** ptr) { + hsa_status_t status = saved_amd_ext_api.hsa_amd_memory_pool_allocate_fn(pool, size, flags, ptr); + if (size == 0 || status != HSA_STATUS_SUCCESS) return status; + + if (IsEnabled(ACTIVITY_DOMAIN_HSA_EVT, HSA_EVT_ID_ALLOCATE)) { + hsa_evt_data_t data{}; + data.allocate.ptr = *ptr; + data.allocate.size = size; + + if (saved_amd_ext_api.hsa_amd_memory_pool_get_info_fn( + pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &data.allocate.segment) != HSA_STATUS_SUCCESS || + saved_amd_ext_api.hsa_amd_memory_pool_get_info_fn( + pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &data.allocate.global_flag) != + HSA_STATUS_SUCCESS) + fatal("hsa_region_get_info failed"); + + ReportActivity(ACTIVITY_DOMAIN_HSA_EVT, HSA_EVT_ID_ALLOCATE, &data); + } + + if (IsEnabled(ACTIVITY_DOMAIN_HSA_EVT, HSA_EVT_ID_DEVICE)) { + auto callback_data = std::make_pair(pool, ptr); + auto agent_callback = [](hsa_agent_t agent, void* iterate_agent_callback_data) { + auto [pool, ptr] = *reinterpret_cast(iterate_agent_callback_data); + + if (hsa_amd_memory_pool_access_t value; + saved_amd_ext_api.hsa_amd_agent_memory_pool_get_info_fn( + agent, pool, 
HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS, &value) != HSA_STATUS_SUCCESS || + value != HSA_AMD_MEMORY_POOL_ACCESS_ALLOWED_BY_DEFAULT) + return HSA_STATUS_SUCCESS; + + auto it = agent_info_map.find(agent.handle); + if (it == agent_info_map.end()) fatal("agent was not found in the agent_info map"); + + hsa_evt_data_t data{}; + data.device.type = it->second.type; + data.device.id = it->second.index; + data.device.agent = agent; + data.device.ptr = ptr; + + ReportActivity(ACTIVITY_DOMAIN_HSA_EVT, HSA_EVT_ID_DEVICE, &data); + return HSA_STATUS_SUCCESS; + }; + saved_core_api.hsa_iterate_agents_fn(agent_callback, &callback_data); + } + + return HSA_STATUS_SUCCESS; +} + +hsa_status_t MemoryPoolFreeIntercept(void* ptr) { + if (IsEnabled(ACTIVITY_DOMAIN_HSA_EVT, HSA_EVT_ID_ALLOCATE)) { + hsa_evt_data_t data{}; + data.allocate.ptr = ptr; + data.allocate.size = 0; + ReportActivity(ACTIVITY_DOMAIN_HSA_EVT, HSA_EVT_ID_ALLOCATE, &data); + } + + return saved_amd_ext_api.hsa_amd_memory_pool_free_fn(ptr); +} + +// Agent allow access callback 'hsa_amd_agents_allow_access' +hsa_status_t AgentsAllowAccessIntercept(uint32_t num_agents, const hsa_agent_t* agents, + const uint32_t* flags, const void* ptr) { + hsa_status_t status = + saved_amd_ext_api.hsa_amd_agents_allow_access_fn(num_agents, agents, flags, ptr); + if (status != HSA_STATUS_SUCCESS) return status; + + if (IsEnabled(ACTIVITY_DOMAIN_HSA_EVT, HSA_EVT_ID_DEVICE)) { + while (num_agents--) { + hsa_agent_t agent = *agents++; + auto it = agent_info_map.find(agent.handle); + if (it == agent_info_map.end()) fatal("agent was not found in the agent_info map"); + + hsa_evt_data_t data{}; + data.device.type = it->second.type; + data.device.id = it->second.index; + data.device.agent = agent; + data.device.ptr = ptr; + + ReportActivity(ACTIVITY_DOMAIN_HSA_EVT, HSA_EVT_ID_DEVICE, &data); + } + } + return HSA_STATUS_SUCCESS; +} + +struct CodeObjectCallbackArg { + activity_rtapi_callback_t callback_fun; + void* callback_arg; + bool unload; +}; + 
+hsa_status_t CodeObjectCallback(hsa_executable_t executable, + hsa_loaded_code_object_t loaded_code_object, void* arg) { + hsa_evt_data_t data{}; + + if (hsa_loader_api.hsa_ven_amd_loader_loaded_code_object_get_info( + loaded_code_object, HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_CODE_OBJECT_STORAGE_TYPE, + &data.codeobj.storage_type) != HSA_STATUS_SUCCESS) + fatal("hsa_ven_amd_loader_loaded_code_object_get_info failed"); + + if (data.codeobj.storage_type == HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_FILE) { + if (hsa_loader_api.hsa_ven_amd_loader_loaded_code_object_get_info( + loaded_code_object, HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_CODE_OBJECT_STORAGE_FILE, + &data.codeobj.storage_file) != HSA_STATUS_SUCCESS || + data.codeobj.storage_file == -1) + fatal("hsa_ven_amd_loader_loaded_code_object_get_info failed"); + data.codeobj.memory_base = data.codeobj.memory_size = 0; + } else if (data.codeobj.storage_type == HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_MEMORY) { + if (hsa_loader_api.hsa_ven_amd_loader_loaded_code_object_get_info( + loaded_code_object, + HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_CODE_OBJECT_STORAGE_MEMORY_BASE, + &data.codeobj.memory_base) != HSA_STATUS_SUCCESS || + hsa_loader_api.hsa_ven_amd_loader_loaded_code_object_get_info( + loaded_code_object, + HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_CODE_OBJECT_STORAGE_MEMORY_SIZE, + &data.codeobj.memory_size) != HSA_STATUS_SUCCESS) + fatal("hsa_ven_amd_loader_loaded_code_object_get_info failed"); + data.codeobj.storage_file = -1; + } else if (data.codeobj.storage_type == HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_NONE) { + return HSA_STATUS_SUCCESS; // FIXME: do we really not care about these code objects? 
+ } else { + fatal("unknown code object storage type: %d", data.codeobj.storage_type); + } + + if (hsa_loader_api.hsa_ven_amd_loader_loaded_code_object_get_info( + loaded_code_object, HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_LOAD_BASE, + &data.codeobj.load_base) != HSA_STATUS_SUCCESS || + hsa_loader_api.hsa_ven_amd_loader_loaded_code_object_get_info( + loaded_code_object, HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_LOAD_SIZE, + &data.codeobj.load_size) != HSA_STATUS_SUCCESS || + hsa_loader_api.hsa_ven_amd_loader_loaded_code_object_get_info( + loaded_code_object, HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_LOAD_DELTA, + &data.codeobj.load_delta) != HSA_STATUS_SUCCESS) + fatal("hsa_ven_amd_loader_loaded_code_object_get_info failed"); + + if (hsa_loader_api.hsa_ven_amd_loader_loaded_code_object_get_info( + loaded_code_object, HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_URI_LENGTH, + &data.codeobj.uri_length) != HSA_STATUS_SUCCESS) + fatal("hsa_ven_amd_loader_loaded_code_object_get_info failed"); + + std::string uri_str(data.codeobj.uri_length, '\0'); + if (hsa_loader_api.hsa_ven_amd_loader_loaded_code_object_get_info( + loaded_code_object, HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_URI, uri_str.data()) != + HSA_STATUS_SUCCESS) + fatal("hsa_ven_amd_loader_loaded_code_object_get_info failed"); + + data.codeobj.uri = uri_str.c_str(); + data.codeobj.unload = *static_cast(arg) ? 
1 : 0; + ReportActivity(ACTIVITY_DOMAIN_HSA_EVT, HSA_EVT_ID_CODEOBJ, &data); + + return HSA_STATUS_SUCCESS; +} + +hsa_status_t ExecutableFreezeIntercept(hsa_executable_t executable, const char* options) { + hsa_status_t status = saved_core_api.hsa_executable_freeze_fn(executable, options); + if (status != HSA_STATUS_SUCCESS) return status; + + if (IsEnabled(ACTIVITY_DOMAIN_HSA_EVT, HSA_EVT_ID_CODEOBJ)) { + bool unload = false; + hsa_loader_api.hsa_ven_amd_loader_executable_iterate_loaded_code_objects( + executable, CodeObjectCallback, &unload); + } + + return HSA_STATUS_SUCCESS; +} + +hsa_status_t ExecutableDestroyIntercept(hsa_executable_t executable) { + if (IsEnabled(ACTIVITY_DOMAIN_HSA_EVT, HSA_EVT_ID_CODEOBJ)) { + bool unload = true; + hsa_loader_api.hsa_ven_amd_loader_executable_iterate_loaded_code_objects( + executable, CodeObjectCallback, &unload); + } + + return saved_core_api.hsa_executable_destroy_fn(executable); +} + +bool profiling_async_copy_enable = false; + +hsa_status_t ProfilingAsyncCopyEnableIntercept(bool enable) { + hsa_status_t status = saved_amd_ext_api.hsa_amd_profiling_async_copy_enable_fn(enable); + if (status == HSA_STATUS_SUCCESS) profiling_async_copy_enable = enable; + return status; +} + +void MemoryASyncCopyHandler(const Tracker::entry_t* entry) { + activity_record_t record{}; + record.domain = ACTIVITY_DOMAIN_HSA_OPS; + record.op = HSA_OP_ID_COPY; + record.begin_ns = entry->begin; + record.end_ns = entry->end; + record.device_id = 0; + record.correlation_id = entry->correlation_id; + ReportActivity(ACTIVITY_DOMAIN_HSA_OPS, HSA_OP_ID_COPY, &record); +} + +hsa_status_t MemoryASyncCopyIntercept(void* dst, hsa_agent_t dst_agent, const void* src, + hsa_agent_t src_agent, size_t size, uint32_t num_dep_signals, + const hsa_signal_t* dep_signals, + hsa_signal_t completion_signal) { + bool is_enabled = IsEnabled(ACTIVITY_DOMAIN_HSA_OPS, HSA_OP_ID_COPY); + + // FIXME: what happens if the state changes before returning? 
+ [[maybe_unused]] hsa_status_t status = saved_amd_ext_api.hsa_amd_profiling_async_copy_enable_fn( + profiling_async_copy_enable | is_enabled); + assert(status == HSA_STATUS_SUCCESS && "hsa_amd_profiling_async_copy_enable failed"); + + if (!is_enabled) { + return saved_amd_ext_api.hsa_amd_memory_async_copy_fn( + dst, dst_agent, src, src_agent, size, num_dep_signals, dep_signals, completion_signal); + } + + Tracker::entry_t* entry = new Tracker::entry_t(); + entry->handler = MemoryASyncCopyHandler; + entry->correlation_id = CorrelationId(); + Tracker::Enable(Tracker::COPY_ENTRY_TYPE, hsa_agent_t{}, completion_signal, entry); + + status = saved_amd_ext_api.hsa_amd_memory_async_copy_fn( + dst, dst_agent, src, src_agent, size, num_dep_signals, dep_signals, entry->signal); + if (status != HSA_STATUS_SUCCESS) Tracker::Disable(entry); + + return status; +} + +hsa_status_t MemoryASyncCopyRectIntercept(const hsa_pitched_ptr_t* dst, + const hsa_dim3_t* dst_offset, + const hsa_pitched_ptr_t* src, + const hsa_dim3_t* src_offset, const hsa_dim3_t* range, + hsa_agent_t copy_agent, hsa_amd_copy_direction_t dir, + uint32_t num_dep_signals, const hsa_signal_t* dep_signals, + hsa_signal_t completion_signal) { + bool is_enabled = IsEnabled(ACTIVITY_DOMAIN_HSA_OPS, HSA_OP_ID_COPY); + + // FIXME: what happens if the state changes before returning? 
+ [[maybe_unused]] hsa_status_t status = saved_amd_ext_api.hsa_amd_profiling_async_copy_enable_fn( + profiling_async_copy_enable | is_enabled); + assert(status == HSA_STATUS_SUCCESS && "hsa_amd_profiling_async_copy_enable failed"); + + if (!is_enabled) { + return saved_amd_ext_api.hsa_amd_memory_async_copy_rect_fn( + dst, dst_offset, src, src_offset, range, copy_agent, dir, num_dep_signals, dep_signals, + completion_signal); + } + + Tracker::entry_t* entry = new Tracker::entry_t(); + entry->handler = MemoryASyncCopyHandler; + entry->correlation_id = CorrelationId(); + Tracker::Enable(Tracker::COPY_ENTRY_TYPE, hsa_agent_t{}, completion_signal, entry); + + status = saved_amd_ext_api.hsa_amd_memory_async_copy_rect_fn( + dst, dst_offset, src, src_offset, range, copy_agent, dir, num_dep_signals, dep_signals, + entry->signal); + if (status != HSA_STATUS_SUCCESS) Tracker::Disable(entry); + + return status; +} + +} // namespace + +roctracer_timestamp_t timestamp_ns() { + // If the HSA intercept is installed, then use the "original" 'hsa_system_get_info' function to + // avoid reporting calls for internal use of the HSA API by the tracer. + auto hsa_system_get_info_fn = saved_core_api.hsa_system_get_info_fn; + + // If the HSA intercept is not installed, use the default 'hsa_system_get_info'. 
+ if (hsa_system_get_info_fn == nullptr) hsa_system_get_info_fn = hsa_system_get_info; + + uint64_t sysclock; + if (hsa_status_t status = hsa_system_get_info_fn(HSA_SYSTEM_INFO_TIMESTAMP, &sysclock); + status == HSA_STATUS_ERROR_NOT_INITIALIZED) + return 0; + else if (status != HSA_STATUS_SUCCESS) + fatal("hsa_system_get_info failed"); + + static uint64_t sysclock_period = [&]() { + uint64_t sysclock_hz = 0; + if (hsa_status_t status = + hsa_system_get_info_fn(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY, &sysclock_hz); + status != HSA_STATUS_SUCCESS) + fatal("hsa_system_get_info failed"); + + return (uint64_t)1000000000 / sysclock_hz; + }(); + + return sysclock * sysclock_period; +} + +void Initialize(HsaApiTable* table) { + // Save the HSA core api and amd_ext api. + saved_core_api = *table->core_; + saved_amd_ext_api = *table->amd_ext_; + + // Enumerate the agents. + if (hsa_support::saved_core_api.hsa_iterate_agents_fn( + [](hsa_agent_t agent, void* data) { + hsa_support::AgentInfo agent_info; + if (hsa_support::saved_core_api.hsa_agent_get_info_fn( + agent, HSA_AGENT_INFO_DEVICE, &agent_info.type) != HSA_STATUS_SUCCESS) + fatal("hsa_agent_get_info failed"); + switch (agent_info.type) { + case HSA_DEVICE_TYPE_CPU: + static int cpu_agent_count = 0; + agent_info.index = cpu_agent_count++; + break; + case HSA_DEVICE_TYPE_GPU: + static int gpu_agent_count = 0; + agent_info.index = gpu_agent_count++; + break; + default: + static int other_agent_count = 0; + agent_info.index = other_agent_count++; + break; + } + hsa_support::agent_info_map.emplace(agent.handle, agent_info); + return HSA_STATUS_SUCCESS; + }, + nullptr) != HSA_STATUS_SUCCESS) + fatal("hsa_iterate_agents failed"); + + // Install the code object intercept. 
+ hsa_status_t status = table->core_->hsa_system_get_major_extension_table_fn( + HSA_EXTENSION_AMD_LOADER, 1, sizeof(hsa_ven_amd_loader_1_01_pfn_t), &hsa_loader_api); + if (status != HSA_STATUS_SUCCESS) fatal("hsa_system_get_major_extension_table failed"); + + // Install the HSA_OPS intercept + table->amd_ext_->hsa_amd_memory_async_copy_fn = MemoryASyncCopyIntercept; + table->amd_ext_->hsa_amd_memory_async_copy_rect_fn = MemoryASyncCopyRectIntercept; + table->amd_ext_->hsa_amd_profiling_async_copy_enable_fn = ProfilingAsyncCopyEnableIntercept; + + // Install the HSA_EVT intercept + table->core_->hsa_memory_allocate_fn = MemoryAllocateIntercept; + table->core_->hsa_memory_assign_agent_fn = MemoryAssignAgentIntercept; + table->core_->hsa_memory_copy_fn = MemoryCopyIntercept; + table->amd_ext_->hsa_amd_memory_pool_allocate_fn = MemoryPoolAllocateIntercept; + table->amd_ext_->hsa_amd_memory_pool_free_fn = MemoryPoolFreeIntercept; + table->amd_ext_->hsa_amd_agents_allow_access_fn = AgentsAllowAccessIntercept; + table->core_->hsa_executable_freeze_fn = ExecutableFreezeIntercept; + table->core_->hsa_executable_destroy_fn = ExecutableDestroyIntercept; + + // Install the HSA_API wrappers + detail::InstallCoreApiWrappers(table->core_); + detail::InstallAmdExtWrappers(table->amd_ext_); + detail::InstallImageExtWrappers(table->image_ext_); +} + +void Finalize() { + if (hsa_status_t status = + saved_amd_ext_api.hsa_amd_profiling_async_copy_enable_fn(profiling_async_copy_enable); + status != HSA_STATUS_SUCCESS) + assert(!"hsa_amd_profiling_async_copy_enable failed"); + + memset(&saved_core_api, '\0', sizeof(saved_core_api)); + memset(&saved_amd_ext_api, '\0', sizeof(saved_amd_ext_api)); + memset(&hsa_loader_api, '\0', sizeof(hsa_loader_api)); +} + +const char* GetApiName(uint32_t id) { return detail::GetApiName(id); } + +const char* GetEvtName(uint32_t id) { + switch (id) { + case HSA_EVT_ID_ALLOCATE: + return "ALLOCATE"; + case HSA_EVT_ID_DEVICE: + return "DEVICE"; + case 
HSA_EVT_ID_MEMCOPY: + return "MEMCOPY"; + case HSA_EVT_ID_SUBMIT: + return "SUBMIT"; + case HSA_EVT_ID_KSYMBOL: + return "KSYMBOL"; + case HSA_EVT_ID_CODEOBJ: + return "CODEOBJ"; + case HSA_EVT_ID_NUMBER: + break; + } + throw ApiError(ROCTRACER_STATUS_ERROR_INVALID_ARGUMENT, "invalid HSA EVT callback id"); +} + +const char* GetOpsName(uint32_t id) { + switch (id) { + case HSA_OP_ID_DISPATCH: + return "DISPATCH"; + case HSA_OP_ID_COPY: + return "COPY"; + case HSA_OP_ID_BARRIER: + return "BARRIER"; + case HSA_OP_ID_RESERVED1: + return "PCSAMPLE"; + } + throw ApiError(ROCTRACER_STATUS_ERROR_INVALID_ARGUMENT, "invalid HSA OPS callback id"); +} + +uint32_t GetApiCode(const char* str) { return detail::GetApiCode(str); } + +void RegisterTracerCallback(int (*function)(activity_domain_t domain, uint32_t operation_id, + void* data)) { + report_activity.store(function, std::memory_order_relaxed); +} + +} // namespace roctracer::hsa_support diff --git a/src/roctracer/hsa_support.h b/src/roctracer/hsa_support.h new file mode 100644 index 00000000..3e9922ea --- /dev/null +++ b/src/roctracer/hsa_support.h @@ -0,0 +1,54 @@ +/* Copyright (c) 2022 Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. */ + +#ifndef HSA_SUPPORT_H_ +#define HSA_SUPPORT_H_ + +#include "roctracer.h" +#include "roctracer_hsa.h" + +#include + +namespace roctracer::hsa_support { + +struct hsa_trace_data_t { + hsa_api_data_t api_data; + uint64_t phase_enter_timestamp; + uint64_t phase_data; + + void (*phase_enter)(hsa_api_id_t operation_id, hsa_trace_data_t* data); + void (*phase_exit)(hsa_api_id_t operation_id, hsa_trace_data_t* data); +}; + +void Initialize(HsaApiTable* table); +void Finalize(); + +const char* GetApiName(uint32_t id); +const char* GetEvtName(uint32_t id); +const char* GetOpsName(uint32_t id); +uint32_t GetApiCode(const char* str); + +void RegisterTracerCallback(int (*function)(activity_domain_t domain, uint32_t operation_id, + void* data)); +uint64_t timestamp_ns(); + +} // namespace roctracer::hsa_support + +#endif // HSA_SUPPORT_H_ diff --git a/src/roctracer/journal.h b/src/roctracer/journal.h deleted file mode 100644 index 0bb844e7..00000000 --- a/src/roctracer/journal.h +++ /dev/null @@ -1,65 +0,0 @@ -/* Copyright (c) 2018-2022 Advanced Micro Devices, Inc. - - Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - - The above copyright notice and this permission notice shall be included in - all copies or substantial portions of the Software. 
- - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - THE SOFTWARE. */ - -#ifndef SRC_CORE_JOURNAL_H_ -#define SRC_CORE_JOURNAL_H_ - -#include "ext/prof_protocol.h" - -#include -#include -#include - -namespace roctracer { - -template class Journal { - public: - /* Insert { domain, op } into the journal. Return false if the insertion - updated an existing entry. */ - template , int> = 0> - bool Insert(roctracer_domain_t domain, uint32_t op, T&& data) { - std::lock_guard lock(mutex_); - auto result = map_[domain].try_emplace(op, std::forward(data)); - if (!result.second) result.first->second = std::forward(data); - return result.second; - } - - /* Remove { domain, op } from the journal. Return false if the entry did not exist. */ - bool Remove(roctracer_domain_t domain, uint32_t op) { - std::lock_guard lock(mutex_); - return map_[domain].erase(op) == 1; - } - - template void ForEach(Functor&& func) { - std::lock_guard lock(mutex_); - for (auto&& domain : map_) - for (auto&& operation : domain.second) - if (!func(domain.first /* domain */, operation.first /* op */, operation.second /* data */)) - break; /* FIXME: what are we breaking out of? */ - } - - private: - std::mutex mutex_; - std::unordered_map> map_; -}; - -} // namespace roctracer - -#endif // SRC_CORE_JOURNAL_H_ diff --git a/src/roctracer/loader.h b/src/roctracer/loader.h index 5d1adcc7..656f2c1c 100644 --- a/src/roctracer/loader.h +++ b/src/roctracer/loader.h @@ -18,319 +18,175 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ -#ifndef SRC_CORE_LOADER_H_ -#define SRC_CORE_LOADER_H_ +#ifndef ROCTRACER_LOADER_H_ +#define ROCTRACER_LOADER_H_ -#include -#include -#include - -#define ONLD_TRACE(str) \ - if (getenv("ROCP_ONLOAD_TRACE")) do { \ - std::cout << "PID(" << GetPid() << "): TRACER_LOADER::" << __FUNCTION__ << " " << str \ - << std::endl \ - << std::flush; \ - } while (0); - -namespace roctracer { - -// Base runtime loader class -template class BaseLoader : public T { - static uint32_t GetPid() { return syscall(__NR_getpid); } - - public: - typedef std::mutex mutex_t; - typedef BaseLoader loader_t; - - bool Enabled() const { return (handle_ != NULL); } +#include "debug.h" - template fun_t* GetFun(const char* fun_name) { - if (handle_ == NULL) return NULL; +#include - fun_t* f = (fun_t*)dlsym(handle_, fun_name); - if ((to_check_symb_ == true) && (f == NULL)) { - fprintf(stderr, "roctracer: symbol lookup '%s' failed: \"%s\"\n", fun_name, dlerror()); - abort(); - } - return f; - } +#include +#include +#include +#include - static inline loader_t& Instance() { - loader_t* obj = instance_.load(std::memory_order_acquire); - if (obj == NULL) { - std::lock_guard lck(mutex_); - if (instance_.load(std::memory_order_relaxed) == NULL) { - obj = new loader_t(); - instance_.store(obj, std::memory_order_release); - } - } - return *instance_; - } +namespace fs = std::experimental::filesystem; - static loader_t* GetRef() { return instance_; } - static void SetLibName(const char* name) { lib_name_ = name; } +namespace roctracer { - private: - BaseLoader() { - const int flags = (to_load_ == true) ? 
RTLD_LAZY : RTLD_LAZY | RTLD_NOLOAD; - handle_ = dlopen(lib_name_, flags); - ONLD_TRACE("(" << lib_name_ << " = " << handle_ << ")"); - if ((to_check_open_ == true) && (handle_ == NULL)) { - fprintf(stderr, "roctracer: Loading '%s' failed, %s\n", lib_name_, dlerror()); - abort(); - } - - T::init(this); +// Base loader class +template class BaseLoader { + protected: + BaseLoader(const char* pattern) { + // Iterate through the process' loaded shared objects and try to dlopen the first entry with a + // file name starting with the given 'pattern'. This allows the loader to acquire a handle + // to the target library iff it is already loaded. The handle is used to query symbols + // exported by that library. + + auto callback = [this, pattern](dl_phdr_info* info) { + if (handle_ == nullptr && + fs::path(info->dlpi_name).filename().string().rfind(pattern, 0) == 0) + handle_ = ::dlopen(info->dlpi_name, RTLD_LAZY); + }; + dl_iterate_phdr( + [](dl_phdr_info* info, size_t size, void* data) { + (*reinterpret_cast(data))(info); + return 0; + }, + &callback); } ~BaseLoader() { - if (handle_ != NULL) dlclose(handle_); + if (handle_ != nullptr) ::dlclose(handle_); } - static bool to_load_; - static bool to_check_open_; - static bool to_check_symb_; + BaseLoader(const BaseLoader&) = delete; + BaseLoader& operator=(const BaseLoader&) = delete; - static mutex_t mutex_; - static const char* lib_name_; - static std::atomic instance_; - void* handle_; -}; - -// ROCprofiler library loader class -class RocpApi { public: - typedef BaseLoader Loader; - - typedef bool(RegisterCallback_t)(uint32_t op, void* callback, void* arg); - typedef bool(OperateCallback_t)(uint32_t op); - typedef bool(InitCallback_t)(void* callback, void* arg); - typedef bool(EnableCallback_t)(uint32_t op, bool enable); - typedef const char*(NameCallback_t)(uint32_t op); + bool IsEnabled() const { return handle_ != nullptr; } - RegisterCallback_t* RegisterApiCallback; - OperateCallback_t* RemoveApiCallback; - 
InitCallback_t* InitActivityCallback; - EnableCallback_t* EnableActivityCallback; - NameCallback_t* GetOpName; + template FunctionPtr GetFun(const char* symbol) const { + assert(IsEnabled()); - RegisterCallback_t* RegisterEvtCallback; - OperateCallback_t* RemoveEvtCallback; - NameCallback_t* GetEvtName; + auto function_ptr = reinterpret_cast(::dlsym(handle_, symbol)); + if (function_ptr == nullptr) fatal("symbol lookup '%s' failed: %s", symbol, ::dlerror()); + return function_ptr; + } - protected: - void init(Loader* loader) { - RegisterApiCallback = loader->GetFun("RegisterApiCallback"); - RemoveApiCallback = loader->GetFun("RemoveApiCallback"); - InitActivityCallback = loader->GetFun("InitActivityCallback"); - EnableActivityCallback = loader->GetFun("EnableActivityCallback"); - GetOpName = loader->GetFun("GetOpName"); - - RegisterEvtCallback = loader->GetFun("RegisterEvtCallback"); - RemoveEvtCallback = loader->GetFun("RemoveEvtCallback"); - GetEvtName = loader->GetFun("GetEvtName"); + static inline Loader& Instance() { + static Loader instance; + return instance; } + + private: + void* handle_; }; +} // namespace roctracer + // HIP runtime library loader class -#include "roctracer_hip.h" +namespace roctracer { #if STATIC_BUILD -__attribute__((weak)) hipError_t hipRegisterApiCallback(uint32_t id, void* fun, void* arg) { - return hipErrorUnknown; -} -__attribute__((weak)) hipError_t hipRemoveApiCallback(uint32_t id) { return hipErrorUnknown; } -__attribute__((weak)) hipError_t hipRegisterActivityCallback(uint32_t id, void* fun, void* arg) { - return hipErrorUnknown; -} -__attribute__((weak)) hipError_t hipRemoveActivityCallback(uint32_t id) { return hipErrorUnknown; } -__attribute__((weak)) const char* hipKernelNameRef(const hipFunction_t f) { return NULL; } +__attribute__((weak)) const char* hipKernelNameRef(const hipFunction_t f) { return nullptr; } __attribute__((weak)) const char* hipKernelNameRefByPtr(const void* hostFunction, hipStream_t stream) { - return 
NULL; + return nullptr; } __attribute__((weak)) int hipGetStreamDeviceId(hipStream_t stream) { return 0; } -__attribute__((weak)) const char* hipApiName(uint32_t id) { return NULL; } +__attribute__((weak)) const char* hipGetCmdName(unsigned op) { return nullptr; } +__attribute__((weak)) const char* hipApiName(uint32_t id) { return nullptr; } +__attribute__((weak)) void hipRegisterTracerCallback(int (*function)(activity_domain_t domain, + uint32_t operation_id, + void* data)) {} -__attribute__((weak)) void hipInitActivityCallback(void* id_callback, void* op_callback, - void* arg) {} -__attribute__((weak)) bool hipEnableActivityCallback(unsigned op, bool enable) { return false; } -__attribute__((weak)) const char* hipGetCmdName(unsigned op) { return NULL; } +class HipLoader { + private: + HipLoader() {} -class HipLoaderStatic { public: - typedef std::mutex mutex_t; - typedef HipLoaderStatic loader_t; - typedef std::atomic instance_t; - - typedef hipError_t(RegisterApiCallback_t)(uint32_t id, void* fun, void* arg); - typedef hipError_t(RemoveApiCallback_t)(uint32_t id); - typedef hipError_t(RegisterActivityCallback_t)(uint32_t id, void* fun, void* arg); - typedef hipError_t(RemoveActivityCallback_t)(uint32_t id); - typedef const char*(KernelNameRef_t)(const hipFunction_t f); - typedef const char*(KernelNameRefByPtr_t)(const void* hostFunction, hipStream_t stream); - typedef int(GetStreamDeviceId_t)(hipStream_t stream); - typedef const char*(ApiName_t)(uint32_t id); - - RegisterApiCallback_t* RegisterApiCallback; - RemoveApiCallback_t* RemoveApiCallback; - RegisterActivityCallback_t* RegisterActivityCallback; - RemoveActivityCallback_t* RemoveActivityCallback; - KernelNameRef_t* KernelNameRef; - KernelNameRefByPtr_t* KernelNameRefByPtr_; - const char* KernelNameRefByPtr(const void* function, hipStream_t stream = nullptr) const { - return KernelNameRefByPtr_(function, stream); - } - GetStreamDeviceId_t* GetStreamDeviceId; - ApiName_t* ApiName; - - 
hipInitAsyncActivityCallback_t* InitActivityCallback; - hipEnableAsyncActivityCallback_t* EnableActivityCallback; - hipGetOpName_t* GetOpName; - - static inline loader_t& Instance() { - loader_t* obj = instance_.load(std::memory_order_acquire); - if (obj == NULL) { - std::lock_guard lck(mutex_); - if (instance_.load(std::memory_order_relaxed) == NULL) { - obj = new loader_t(); - instance_.store(obj, std::memory_order_release); - } - } - return *instance_; - } + bool IsEnabled() const { return true; } - bool Enabled() const { return true; } - bool& InitActivityDone() { return init_activity_done_; } + int GetStreamDeviceId(hipStream_t stream) const { return hipGetStreamDeviceId(stream); } - private: - HipLoaderStatic() { - RegisterApiCallback = hipRegisterApiCallback; - RemoveApiCallback = hipRemoveApiCallback; - RegisterActivityCallback = hipRegisterActivityCallback; - RemoveActivityCallback = hipRemoveActivityCallback; - KernelNameRef = hipKernelNameRef; - KernelNameRefByPtr_ = hipKernelNameRefByPtr; - GetStreamDeviceId = hipGetStreamDeviceId; - ApiName = hipApiName; - - InitActivityCallback = hipInitActivityCallback; - EnableActivityCallback = hipEnableActivityCallback; - GetOpName = hipGetCmdName; - } + const char* KernelNameRef(const hipFunction_t f) const { return hipKernelNameRef(f); } - static mutex_t mutex_; - static instance_t instance_; - bool init_activity_done_ = false; -}; -#else -class HipApi { - public: - typedef BaseLoader Loader; - - typedef decltype(hipRegisterApiCallback) RegisterApiCallback_t; - typedef decltype(hipRemoveApiCallback) RemoveApiCallback_t; - typedef decltype(hipRegisterActivityCallback) RegisterActivityCallback_t; - typedef decltype(hipRemoveActivityCallback) RemoveActivityCallback_t; - typedef decltype(hipKernelNameRef) KernelNameRef_t; - typedef decltype(hipKernelNameRefByPtr) KernelNameRefByPtr_t; - typedef decltype(hipGetStreamDeviceId) GetStreamDeviceId_t; - typedef decltype(hipApiName) ApiName_t; - - RegisterApiCallback_t* 
RegisterApiCallback; - RemoveApiCallback_t* RemoveApiCallback; - RegisterActivityCallback_t* RegisterActivityCallback; - RemoveActivityCallback_t* RemoveActivityCallback; - KernelNameRef_t* KernelNameRef; - KernelNameRefByPtr_t* KernelNameRefByPtr_; - const char* KernelNameRefByPtr(const void* function, hipStream_t stream = nullptr) const { - return KernelNameRefByPtr_(function, stream); + const char* KernelNameRefByPtr(const void* host_function, hipStream_t stream = nullptr) const { + return hipKernelNameRefByPtr(host_function, stream); } - GetStreamDeviceId_t* GetStreamDeviceId; - ApiName_t* ApiName; - hipInitAsyncActivityCallback_t* InitActivityCallback; - hipEnableAsyncActivityCallback_t* EnableActivityCallback; - hipGetOpName_t* GetOpName; + const char* GetOpName(unsigned op) const { return hipGetCmdName(op); } - bool& InitActivityDone() { return init_activity_done_; } + const char* ApiName(uint32_t id) const { return hipApiName(id); } - protected: - void init(Loader* loader) { - RegisterApiCallback = loader->GetFun("hipRegisterApiCallback"); - RemoveApiCallback = loader->GetFun("hipRemoveApiCallback"); - RegisterActivityCallback = - loader->GetFun("hipRegisterActivityCallback"); - RemoveActivityCallback = loader->GetFun("hipRemoveActivityCallback"); - KernelNameRef = loader->GetFun("hipKernelNameRef"); - KernelNameRefByPtr_ = loader->GetFun("hipKernelNameRefByPtr"); - GetStreamDeviceId = loader->GetFun("hipGetStreamDeviceId"); - ApiName = loader->GetFun("hipApiName"); - - InitActivityCallback = - loader->GetFun("hipInitActivityCallback"); - EnableActivityCallback = - loader->GetFun("hipEnableActivityCallback"); - GetOpName = loader->GetFun("hipGetCmdName"); + void RegisterTracerCallback(int (*callback)(activity_domain_t domain, uint32_t operation_id, + void* data)) const { + return hipRegisterTracerCallback(callback); } - private: - bool init_activity_done_ = false; + static inline HipLoader& Instance() { + static HipLoader instance; + return instance; + } }; 
-#endif +#else +class HipLoader : public BaseLoader { + private: + friend HipLoader& BaseLoader::Instance(); + HipLoader() : BaseLoader("libamdhip64.so") {} -// rocTX runtime library loader class -#include "roctracer_roctx.h" -class RocTxApi { public: - typedef BaseLoader Loader; + int GetStreamDeviceId(hipStream_t stream) const { + static auto function = GetFun("hipGetStreamDeviceId"); + return function(stream); + } - typedef bool(RegisterApiCallback_t)(uint32_t op, void* callback, void* arg); - typedef bool(RemoveApiCallback_t)(uint32_t op); + const char* KernelNameRef(const hipFunction_t f) const { + static auto function = GetFun("hipKernelNameRef"); + return function(f); + } - RegisterApiCallback_t* RegisterApiCallback; - RemoveApiCallback_t* RemoveApiCallback; + const char* KernelNameRefByPtr(const void* host_function, hipStream_t stream = nullptr) const { + static auto function = GetFun( + "hipKernelNameRefByPtr"); + return function(host_function, stream); + } - protected: - void init(Loader* loader) { - RegisterApiCallback = loader->GetFun("RegisterApiCallback"); - RemoveApiCallback = loader->GetFun("RemoveApiCallback"); + const char* GetOpName(unsigned op) const { + static auto function = GetFun("hipGetCmdName"); + return function(op); } -}; -typedef BaseLoader RocpLoader; -typedef BaseLoader RocTxLoader; + const char* ApiName(uint32_t id) const { + static auto function = GetFun("hipApiName"); + return function(id); + } -#if STATIC_BUILD -typedef HipLoaderStatic HipLoader; -#else -typedef BaseLoader HipLoaderShared; -typedef HipLoaderShared HipLoader; + void RegisterTracerCallback(int (*callback)(activity_domain_t domain, uint32_t operation_id, + void* data)) const { + static auto function = GetFun("hipRegisterTracerCallback"); + return function(callback); + } +}; #endif -} // namespace roctracer - -#define LOADER_INSTANTIATE_2() \ - template typename roctracer::BaseLoader::mutex_t roctracer::BaseLoader::mutex_; \ - template std::atomic*> 
roctracer::BaseLoader::instance_{}; \ - template bool roctracer::BaseLoader::to_load_ = false; \ - template bool roctracer::BaseLoader::to_check_open_ = true; \ - template bool roctracer::BaseLoader::to_check_symb_ = true; \ - template <> const char* roctracer::RocpLoader::lib_name_ = "librocprofiler64.so"; \ - template <> bool roctracer::RocpLoader::to_load_ = true; \ - template <> const char* roctracer::RocTxLoader::lib_name_ = "libroctx64.so"; \ - template <> bool roctracer::RocTxLoader::to_load_ = true; +// ROCTX library loader class +class RocTxLoader : public BaseLoader { + private: + friend RocTxLoader& BaseLoader::Instance(); + RocTxLoader() : BaseLoader("libroctx64.so") {} -#if STATIC_BUILD -#define LOADER_INSTANTIATE_HIP() \ - roctracer::HipLoaderStatic::mutex_t roctracer::HipLoaderStatic::mutex_; \ - roctracer::HipLoaderStatic::instance_t roctracer::HipLoaderStatic::instance_{}; -#else -#define LOADER_INSTANTIATE_HIP() \ - template <> const char* roctracer::HipLoaderShared::lib_name_ = "libamdhip64.so"; -#endif + public: + void RegisterTracerCallback(int (*callback)(activity_domain_t domain, uint32_t operation_id, + void* data)) const { + static auto function = + GetFun("roctxRegisterTracerCallback"); + return function(callback); + } +}; -#define LOADER_INSTANTIATE() \ - LOADER_INSTANTIATE_2(); \ - LOADER_INSTANTIATE_HIP(); +} // namespace roctracer -#endif // SRC_CORE_LOADER_H_ +#endif // ROCTRACER_LOADER_H_ \ No newline at end of file diff --git a/src/roctracer/memory_pool.h b/src/roctracer/memory_pool.h index 4f3bfd96..6522dd49 100644 --- a/src/roctracer/memory_pool.h +++ b/src/roctracer/memory_pool.h @@ -21,6 +21,8 @@ #ifndef MEMORY_POOL_H_ #define MEMORY_POOL_H_ +#include "roctracer.h" + #include #include #include @@ -38,7 +40,10 @@ class MemoryPool { // Pool definition: The memory pool is split in 2 buffers of equal size. When first initialized, // the write pointer points to the first element of the first buffer. 
When a buffer is full, or // when Flush() is called, the write pointer moves to the other buffer. - const size_t allocation_size = 2 * properties_.buffer_size; + // Each buffer should be large enough to hold at least 2 activity records, as record pairs may + // be written when external correlation ids are used. + const size_t allocation_size = + 2 * std::max(2 * sizeof(roctracer_record_t), properties_.buffer_size); pool_begin_ = nullptr; AllocateMemory(&pool_begin_, allocation_size); assert(pool_begin_ != nullptr && "pool allocator failed"); diff --git a/src/roctracer/registration_table.h b/src/roctracer/registration_table.h new file mode 100644 index 00000000..f9f6efd4 --- /dev/null +++ b/src/roctracer/registration_table.h @@ -0,0 +1,100 @@ +/* Copyright (c) 2018-2022 Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
*/ + +#ifndef UTIL_CALLBACK_TABLE_H_ +#define UTIL_CALLBACK_TABLE_H_ + +#include "ext/prof_protocol.h" + +#include +#include +#include +#include +#include +#include + +namespace roctracer::util { + +#if __GNUC__ == 11 || __GNUCC__ == 12 +// Starting with gcc-11 (verified with gcc-12 as well), an array out-of-bounds subscript error is +// reported for accessing the registration table element at the operation ID index. Validating the +// index in the function calling Register/Unregister does not quiet the warning/error in release +// builds, so, for gcc-11 and gcc-12, we disable that warning just for this class. +#define IGNORE_GCC_ARRAY_BOUNDS_ERROR 1 +#endif // __GNUC__ == 11 || __GNUCC__ == 12 + +#if IGNORE_GCC_ARRAY_BOUNDS_ERROR +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Warray-bounds" +#endif // IGNORE_GCC_ARRAY_BOUNDS_ERROR + +namespace detail { +struct False { + constexpr bool operator()() { return false; } +}; +} // namespace detail + +// Generic callbacks table +template class RegistrationTable { + public: + template void Register(uint32_t operation_id, Args... 
args) { + assert(operation_id < N && "operation_id is out of range"); + auto& entry = table_[operation_id]; + std::unique_lock lock(entry.mutex); + if (!entry.enabled.exchange(true, std::memory_order_relaxed)) + registered_count_.fetch_add(1, std::memory_order_relaxed); + entry.data = T{std::forward(args)...}; + } + + void Unregister(uint32_t operation_id) { + assert(operation_id < N && "id is out of range"); + auto& entry = table_[operation_id]; + std::unique_lock lock(entry.mutex); + if (entry.enabled.exchange(false, std::memory_order_relaxed)) + registered_count_.fetch_sub(1, std::memory_order_relaxed); + } + + std::optional Get(uint32_t operation_id) const { + assert(operation_id < N && "id is out of range"); + auto& entry = table_[operation_id]; + if (!entry.enabled.load(std::memory_order_relaxed) || IsStopped{}()) return std::nullopt; + std::shared_lock lock(entry.mutex); + return entry.enabled.load(std::memory_order_relaxed) ? std::make_optional(entry.data) + : std::nullopt; + } + + bool IsEmpty() const { return registered_count_.load(std::memory_order_relaxed) == 0; } + + private: + std::atomic registered_count_{0}; + struct { + std::atomic enabled{false}; + mutable std::shared_mutex mutex; + T data; + } table_[N]{}; +}; + +#if IGNORE_GCC_ARRAY_BOUNDS_ERROR +#pragma GCC diagnostic pop +#endif // IGNORE_GCC_ARRAY_BOUNDS_ERROR + +} // namespace roctracer::util + +#endif // UTIL_CALLBACK_TABLE_H_ diff --git a/src/roctracer/roctracer.cpp b/src/roctracer/roctracer.cpp index c5c2290b..945ca1bb 100644 --- a/src/roctracer/roctracer.cpp +++ b/src/roctracer/roctracer.cpp @@ -34,34 +34,18 @@ #include #include #include +#include #include #include -#include "journal.h" +#include "correlation_id.h" +#include "debug.h" +#include "exception.h" +#include "hsa_support.h" #include "loader.h" +#include "logger.h" #include "memory_pool.h" -#include "tracker.h" -#include "exception.h" -#include "util/logger.h" - -#include "hsa_prof_str.inline.h" - -#define CHECK_HSA_STATUS(msg, 
status) \ - do { \ - if ((status) != HSA_STATUS_SUCCESS) { \ - const char* status_string = nullptr; \ - hsa_status_string(status, &status_string); \ - FATAL_LOGGING(msg << ": " << (status_string ? status_string : "")); \ - } \ - } while (false) - -#define HIPAPI_CALL(call) \ - do { \ - hipError_t err = call; \ - if (err != hipSuccess) { \ - FATAL_LOGGING("HIP error: " #call " error(" << err << ")"); \ - } \ - } while (false) +#include "registration_table.h" #define API_METHOD_PREFIX \ roctracer_status_t err = ROCTRACER_STATUS_SUCCESS; \ @@ -82,441 +66,38 @@ (void)err; \ return X; -#define ONLOAD_TRACE(str) \ - if (getenv("ROCP_ONLOAD_TRACE")) do { \ - std::cout << "PID(" << GetPid() << "): TRACER_LIB::" << __FUNCTION__ << " " << str \ - << std::endl \ - << std::flush; \ - } while (false); -#define ONLOAD_TRACE_BEG() ONLOAD_TRACE("begin") -#define ONLOAD_TRACE_END() ONLOAD_TRACE("end") +static inline uint32_t GetPid() { + static auto pid = syscall(__NR_getpid); + return pid; +} +static inline uint32_t GetTid() { + static thread_local auto tid = syscall(__NR_gettid); + return tid; +} -static inline uint32_t GetPid() { return syscall(__NR_getpid); } +using namespace roctracer; -/////////////////////////////////////////////////////////////////////////////////////////////////// -// Mark callback -// -typedef void(mark_api_callback_t)(uint32_t domain, uint32_t cid, const void* callback_data, - void* arg); -mark_api_callback_t* mark_api_callback_ptr = nullptr; +namespace { /////////////////////////////////////////////////////////////////////////////////////////////////// // Internal library methods // -namespace roctracer { - -namespace hsa_support { - -static CoreApiTable saved_core_api; -static AmdExtTable saved_amd_ext_api; -// async copy activity callback -std::mutex init_mutex; -bool async_copy_callback_enabled = false; -MemoryPool* async_copy_callback_memory_pool = nullptr; - -} // namespace hsa_support - -namespace ext_support { roctracer_start_cb_t 
roctracer_start_cb = nullptr; roctracer_stop_cb_t roctracer_stop_cb = nullptr; -} // namespace ext_support - -namespace util { - -roctracer_timestamp_t timestamp_ns() { - // If the HSA intercept is installed, then use the "original" 'hsa_system_get_info' function to - // avoid reporting calls for internal use of the HSA API by the tracer. - auto hsa_system_get_info_fn = hsa_support::saved_core_api.hsa_system_get_info_fn; - - // If the HSA intercept is not installed, use the default 'hsa_system_get_info'. - if (hsa_system_get_info_fn == nullptr) hsa_system_get_info_fn = hsa_system_get_info; - - uint64_t sysclock; - hsa_status_t status = hsa_system_get_info_fn(HSA_SYSTEM_INFO_TIMESTAMP, &sysclock); - if (status == HSA_STATUS_ERROR_NOT_INITIALIZED) return 0; - CHECK_HSA_STATUS("hsa_system_get_info()", status); - - static uint64_t sysclock_period = [&]() { - uint64_t sysclock_hz = 0; - hsa_status_t status = hsa_system_get_info_fn(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY, &sysclock_hz); - CHECK_HSA_STATUS("hsa_system_get_info()", status); - - return (uint64_t)1000000000 / sysclock_hz; - }(); - - return sysclock * sysclock_period; -} - -} // namespace util - -struct CallbackJournalData { - roctracer_rtapi_callback_t callback; - void* user_data; -}; -static Journal cb_journal; - -struct ActivityJournalData { - roctracer_pool_t* pool; -}; -static Journal act_journal; roctracer_status_t GetExcStatus(const std::exception& e) { const ApiError* roctracer_exc_ptr = dynamic_cast(&e); return (roctracer_exc_ptr) ? 
roctracer_exc_ptr->status() : ROCTRACER_STATUS_ERROR; } -activity_correlation_id_t NextCorrelationId() { - static std::atomic counter{1}; - return counter.fetch_add(1, std::memory_order_relaxed); -} - -// Records storage -struct RecordDataPair { - roctracer_record_t record; - union { - hip_api_data_t data; - }; - RecordDataPair() {} -}; -static thread_local std::stack record_data_pair_stack; - -// Correlation id storage -static thread_local activity_correlation_id_t correlation_id_tls = 0; -static std::map correlation_id_map{}; -std::mutex correlation_id_mutex; - -static thread_local std::stack external_id_stack; - -static inline void CorrelationIdRegister(activity_correlation_id_t correlation_id) { - std::lock_guard lock(correlation_id_mutex); - [[maybe_unused]] const auto ret = correlation_id_map.insert({correlation_id, correlation_id_tls}); - assert(ret.second && "HIP activity id is not unique"); - - DEBUG_TRACE("CorrelationIdRegister id(%lu) id_tls(%lu)\n", correlation_id, correlation_id_tls); -} - -static inline activity_correlation_id_t CorrelationIdLookup( - activity_correlation_id_t correlation_id) { - std::lock_guard lock(correlation_id_mutex); - auto it = correlation_id_map.find(correlation_id); - assert(it != correlation_id_map.end() && "HIP activity id lookup failed"); - const activity_correlation_id_t ret_val = it->second; - correlation_id_map.erase(it); - - DEBUG_TRACE("CorrelationIdLookup id(%lu) ret(%lu)\n", correlation_id, ret_val); - - return ret_val; -} - -std::mutex hip_activity_mutex; - -enum { API_CB_MASK = 0x1, ACT_CB_MASK = 0x2 }; - -class HIPActivityCallbackTracker { - public: - uint32_t enable_check(uint32_t op, uint32_t mask) { return data_[op] |= mask; } - uint32_t disable_check(uint32_t op, uint32_t mask) { return data_[op] &= ~mask; } - - private: - std::unordered_map data_; -}; - -static HIPActivityCallbackTracker hip_act_cb_tracker; - -inline uint32_t HipApiActivityEnableCheck(uint32_t op) { - const uint32_t mask = 
hip_act_cb_tracker.enable_check(op, API_CB_MASK); - const uint32_t ret = (mask & ACT_CB_MASK); - return ret; -} - -inline uint32_t HipApiActivityDisableCheck(uint32_t op) { - const uint32_t mask = hip_act_cb_tracker.disable_check(op, API_CB_MASK); - const uint32_t ret = (mask & ACT_CB_MASK); - return ret; -} - -inline uint32_t HipActActivityEnableCheck(uint32_t op) { - hip_act_cb_tracker.enable_check(op, ACT_CB_MASK); - return 0; -} - -inline uint32_t HipActActivityDisableCheck(uint32_t op) { - const uint32_t mask = hip_act_cb_tracker.disable_check(op, ACT_CB_MASK); - const uint32_t ret = (mask & API_CB_MASK); - return ret; -} - -void* HIP_SyncApiDataCallback(uint32_t op_id, roctracer_record_t* record, const void* callback_data, - void* arg) { - void* ret = nullptr; - const hip_api_data_t* data = reinterpret_cast(callback_data); - hip_api_data_t* data_ptr = const_cast(data); - MemoryPool* pool = reinterpret_cast(arg); - - int phase = ACTIVITY_API_PHASE_ENTER; - if (record != nullptr) { - assert(data != nullptr && "ActivityCallback: data is NULL"); - phase = data->phase; - } else if (pool != nullptr) { - phase = ACTIVITY_API_PHASE_EXIT; - } - - if (phase == ACTIVITY_API_PHASE_ENTER) { - // Allocating a record if nullptr passed - if (record == nullptr) { - assert(data == nullptr && "ActivityCallback enter: record is NULL"); - data = &record_data_pair_stack.emplace().data; - data_ptr = const_cast(data); - data_ptr->phase = phase; - data_ptr->correlation_id = 0; - } - - // Correlation ID generating - uint64_t correlation_id = data->correlation_id; - if (correlation_id == 0) { - correlation_id = NextCorrelationId(); - data_ptr->correlation_id = correlation_id; - } - - // Passing correlation ID - correlation_id_tls = correlation_id; - - ret = data_ptr; - } else { - // popping the record entry - assert(!record_data_pair_stack.empty() && - "HIP_SyncApiDataCallback exit: record stack is empty"); - record_data_pair_stack.pop(); - - // Clearing correlation ID - 
correlation_id_tls = 0; - } - - DEBUG_TRACE( - "HIP_SyncApiDataCallback(\"%s\") phase(%d): op(%u) record(%p) data(%p) pool(%p) depth(%d) " - "correlation_id(%lu) time_ns(%lu)\n", - roctracer_op_string(ACTIVITY_DOMAIN_HIP_API, op_id, 0), phase, op_id, record, data, pool, - (int)(record_data_pair_stack.size()), (data_ptr) ? data_ptr->correlation_id : 0, - util::timestamp_ns()); - - return ret; -} - -void* HIP_SyncActivityCallback(uint32_t op_id, roctracer_record_t* record, - const void* callback_data, void* arg) { - const roctracer_timestamp_t timestamp_ns = util::timestamp_ns(); - void* ret = nullptr; - const hip_api_data_t* data = reinterpret_cast(callback_data); - hip_api_data_t* data_ptr = const_cast(data); - MemoryPool* pool = reinterpret_cast(arg); - - int phase = ACTIVITY_API_PHASE_ENTER; - if (record != nullptr) { - assert(data != nullptr && "ActivityCallback: data is NULL"); - phase = data->phase; - } else if (pool != nullptr) { - phase = ACTIVITY_API_PHASE_EXIT; - } - - if (phase == ACTIVITY_API_PHASE_ENTER) { - // Allocating a record if nullptr passed - if (record == nullptr) { - assert(data == nullptr && "ActivityCallback enter: record is NULL"); - auto& top = record_data_pair_stack.emplace(); - record = &(top.record); - data = &(top.data); - data_ptr = const_cast(data); - data_ptr->phase = phase; - data_ptr->correlation_id = 0; - } - - // Filing record info - record->domain = ACTIVITY_DOMAIN_HIP_API; - record->op = op_id; - record->begin_ns = timestamp_ns; - - // Correlation ID generating - uint64_t correlation_id = data->correlation_id; - if (correlation_id == 0) { - correlation_id = NextCorrelationId(); - data_ptr->correlation_id = correlation_id; - } - record->correlation_id = correlation_id; - - // Passing correlation ID - correlation_id_tls = correlation_id; - - ret = data_ptr; - } else { - assert(pool != nullptr && "ActivityCallback exit: pool is NULL"); - assert(!record_data_pair_stack.empty() && "ActivityCallback exit: record stack is empty"); - 
- // Getting record of stacked - if (record == nullptr) record = &record_data_pair_stack.top().record; - - // Filing record info - record->end_ns = timestamp_ns; - record->process_id = syscall(__NR_getpid); - record->thread_id = syscall(__NR_gettid); - - if (!external_id_stack.empty()) { - roctracer_record_t ext_record{}; - ext_record.domain = ACTIVITY_DOMAIN_EXT_API; - ext_record.op = ACTIVITY_EXT_OP_EXTERN_ID; - ext_record.correlation_id = record->correlation_id; - ext_record.external_id = external_id_stack.top(); - pool->Write(ext_record); - } - - // Writing record to the buffer - pool->Write(*record); - - // popping the record entry - record_data_pair_stack.pop(); - - // Clearing correlation ID - correlation_id_tls = 0; - } - - DEBUG_TRACE( - "HIP_SyncActivityCallback(\"%s\") phase(%d): op(%u) record(%p) data(%p) pool(%p) depth(%d) " - "correlation_id(%lu) beg_ns(%lu) end_ns(%lu)\n", - roctracer_op_string(ACTIVITY_DOMAIN_HIP_API, op_id, 0), phase, op_id, record, data, pool, - (int)(record_data_pair_stack.size()), (data_ptr) ? data_ptr->correlation_id : 0, - timestamp_ns); - - return ret; -} - -void HIP_ActivityIdCallback(activity_correlation_id_t correlation_id) { - CorrelationIdRegister(correlation_id); -} - -void HIP_AsyncActivityCallback(uint32_t op_id, void* record_ptr, void* arg) { - MemoryPool* pool = reinterpret_cast(arg); - roctracer_record_t record = *reinterpret_cast(record_ptr); - record.domain = ACTIVITY_DOMAIN_HIP_OPS; - record.correlation_id = CorrelationIdLookup(record.correlation_id); - if (record.correlation_id == 0) return; - - // If the record is for a kernel dispatch, write the kernel name in the pool's data, - // and make the record point to it. Older HIP runtimes do not provide a kernel - // name, so record.kernel_name might be null. 
- if (record.op == HIP_OP_ID_DISPATCH && record.kernel_name != nullptr) - pool->Write(record, record.kernel_name, strlen(record.kernel_name) + 1, - [](auto& record, const void* data) { - record.kernel_name = static_cast(data); - }); - else - pool->Write(record); - - DEBUG_TRACE( - "HIP_AsyncActivityCallback(\"%s\"): op(%u) kind(%u) record(%p) pool(%p) correlation_id(%d) " - "beg_ns(%lu) end_ns(%lu)\n", - roctracer_op_string(ACTIVITY_DOMAIN_HIP_OPS, record_ptr->op, record_ptr->kind), - record_ptr->op, record_ptr->kind, record, pool, record_ptr->correlation_id, - record_ptr->begin_ns, record_ptr->end_ns); -} - -namespace hsa_support { - -struct AgentInfo { - int index; - hsa_device_type_t type; -}; -std::unordered_map agent_info_map; - -void hsa_async_copy_handler(const Tracker::entry_t* entry) { - activity_record_t record{}; - record.domain = ACTIVITY_DOMAIN_HSA_OPS; - record.op = HSA_OP_ID_COPY; - record.begin_ns = entry->begin; - record.end_ns = entry->end; - record.device_id = 0; - record.correlation_id = entry->correlation_id; - entry->pool->Write(record); -} - -hsa_status_t hsa_amd_memory_async_copy_interceptor(void* dst, hsa_agent_t dst_agent, - const void* src, hsa_agent_t src_agent, - size_t size, uint32_t num_dep_signals, - const hsa_signal_t* dep_signals, - hsa_signal_t completion_signal) { - if (!async_copy_callback_enabled) { - return saved_amd_ext_api.hsa_amd_memory_async_copy_fn( - dst, dst_agent, src, src_agent, size, num_dep_signals, dep_signals, completion_signal); - } - - Tracker::entry_t* entry = new Tracker::entry_t(); - entry->handler = hsa_async_copy_handler; - entry->pool = async_copy_callback_memory_pool; - entry->correlation_id = hsa_correlation_id_tls; - Tracker::Enable(Tracker::COPY_ENTRY_TYPE, hsa_agent_t{}, completion_signal, entry); - - hsa_status_t status = saved_amd_ext_api.hsa_amd_memory_async_copy_fn( - dst, dst_agent, src, src_agent, size, num_dep_signals, dep_signals, entry->signal); - if (status != HSA_STATUS_SUCCESS) 
Tracker::Disable(entry); - - return status; -} - -hsa_status_t hsa_amd_memory_async_copy_rect_interceptor( - const hsa_pitched_ptr_t* dst, const hsa_dim3_t* dst_offset, const hsa_pitched_ptr_t* src, - const hsa_dim3_t* src_offset, const hsa_dim3_t* range, hsa_agent_t copy_agent, - hsa_amd_copy_direction_t dir, uint32_t num_dep_signals, const hsa_signal_t* dep_signals, - hsa_signal_t completion_signal) { - if (!async_copy_callback_enabled) { - return saved_amd_ext_api.hsa_amd_memory_async_copy_rect_fn( - dst, dst_offset, src, src_offset, range, copy_agent, dir, num_dep_signals, dep_signals, - completion_signal); - } - - Tracker::entry_t* entry = new Tracker::entry_t(); - entry->handler = hsa_async_copy_handler; - entry->pool = async_copy_callback_memory_pool; - entry->correlation_id = hsa_correlation_id_tls; - Tracker::Enable(Tracker::COPY_ENTRY_TYPE, hsa_agent_t{}, completion_signal, entry); - - hsa_status_t status = saved_amd_ext_api.hsa_amd_memory_async_copy_rect_fn( - dst, dst_offset, src, src_offset, range, copy_agent, dir, num_dep_signals, dep_signals, - entry->signal); - if (status != HSA_STATUS_SUCCESS) Tracker::Disable(entry); - - return status; -} - -} // namespace hsa_support - -void HSA_AsyncActivityCallback(uint32_t op_id, void* record, void* arg) { - MemoryPool* pool = reinterpret_cast(arg); - roctracer_record_t* record_ptr = reinterpret_cast(record); - record_ptr->domain = ACTIVITY_DOMAIN_HSA_OPS; - pool->Write(*record_ptr); -} - -// Logger routines and primitives -util::Logger::mutex_t util::Logger::mutex_; -std::atomic util::Logger::instance_{}; +std::mutex registration_mutex; // Memory pool routines and primitives -MemoryPool* default_memory_pool = nullptr; std::recursive_mutex memory_pool_mutex; +MemoryPool* default_memory_pool = nullptr; -// Stop status routines and primitives -unsigned stop_status_value = 0; -std::mutex stop_status_mutex; -unsigned set_stopped(unsigned val) { - std::lock_guard lock(stop_status_mutex); - const unsigned ret = 
(stop_status_value ^ val); - stop_status_value = val; - return ret; -} -} // namespace roctracer - -using namespace roctracer; - -LOADER_INSTANTIATE(); +} // namespace /////////////////////////////////////////////////////////////////////////////////////////////////// // Public library methods @@ -528,7 +109,7 @@ ROCTRACER_API uint32_t roctracer_version_minor() { return ROCTRACER_VERSION_MINO // Returns the last error ROCTRACER_API const char* roctracer_error_string() { - return strdup(util::Logger::LastMessage().c_str()); + return strdup(util::Logger::Instance().LastMessage().c_str()); } // Return Op string by given domain and activity/API codes @@ -539,9 +120,9 @@ ROCTRACER_API const char* roctracer_op_string(uint32_t domain, uint32_t op, uint case ACTIVITY_DOMAIN_HSA_API: return hsa_support::GetApiName(op); case ACTIVITY_DOMAIN_HSA_EVT: - return RocpLoader::Instance().GetEvtName(op); + return hsa_support::GetEvtName(op); case ACTIVITY_DOMAIN_HSA_OPS: - return RocpLoader::Instance().GetOpName(op); + return hsa_support::GetOpsName(op); case ACTIVITY_DOMAIN_HIP_OPS: return HipLoader::Instance().GetOpName(kind); case ACTIVITY_DOMAIN_HIP_API: @@ -549,7 +130,7 @@ ROCTRACER_API const char* roctracer_op_string(uint32_t domain, uint32_t op, uint case ACTIVITY_DOMAIN_EXT_API: return "EXT_API"; default: - EXC_RAISING(ROCTRACER_STATUS_ERROR_INVALID_DOMAIN_ID, "invalid domain ID(" << domain << ")"); + throw roctracer::ApiError(ROCTRACER_STATUS_ERROR_INVALID_DOMAIN_ID, "invalid domain ID"); } API_METHOD_CATCH(nullptr) } @@ -583,111 +164,352 @@ ROCTRACER_API roctracer_status_t roctracer_op_code(uint32_t domain, const char* API_METHOD_SUFFIX } -static inline uint32_t get_op_begin(uint32_t domain) { +namespace { + +template struct DomainTraits; + +template <> struct DomainTraits { + using ApiData = hip_api_data_t; + using OperationId = hip_api_id_t; + static constexpr size_t kOpIdBegin = HIP_API_ID_FIRST; + static constexpr size_t kOpIdEnd = HIP_API_ID_LAST + 1; +}; + +template 
<> struct DomainTraits { + using ApiData = hsa_api_data_t; + using OperationId = hsa_api_id_t; + static constexpr size_t kOpIdBegin = 0; + static constexpr size_t kOpIdEnd = HSA_API_ID_NUMBER; +}; + +template <> struct DomainTraits { + using ApiData = roctx_api_data_t; + using OperationId = roctx_api_id_t; + static constexpr size_t kOpIdBegin = 0; + static constexpr size_t kOpIdEnd = ROCTX_API_ID_NUMBER; +}; + +template <> struct DomainTraits { + using OperationId = hip_op_id_t; + static constexpr size_t kOpIdBegin = 0; + static constexpr size_t kOpIdEnd = HIP_OP_ID_NUMBER; +}; + +template <> struct DomainTraits { + using OperationId = hsa_op_id_t; + static constexpr size_t kOpIdBegin = 0; + static constexpr size_t kOpIdEnd = HSA_OP_ID_NUMBER; +}; + +template <> struct DomainTraits { + using ApiData = hsa_evt_data_t; + using OperationId = hsa_evt_id_t; + static constexpr size_t kOpIdBegin = 0; + static constexpr size_t kOpIdEnd = HSA_EVT_ID_NUMBER; +}; + +constexpr uint32_t get_op_begin(activity_domain_t domain) { switch (domain) { case ACTIVITY_DOMAIN_HSA_OPS: - return 0; + return DomainTraits::kOpIdBegin; case ACTIVITY_DOMAIN_HSA_API: - return 0; + return DomainTraits::kOpIdBegin; case ACTIVITY_DOMAIN_HSA_EVT: - return 0; + return DomainTraits::kOpIdBegin; case ACTIVITY_DOMAIN_HIP_OPS: - return 0; + return DomainTraits::kOpIdBegin; case ACTIVITY_DOMAIN_HIP_API: - return HIP_API_ID_FIRST; - case ACTIVITY_DOMAIN_EXT_API: - return 0; + return DomainTraits::kOpIdBegin; case ACTIVITY_DOMAIN_ROCTX: + return DomainTraits::kOpIdBegin; + case ACTIVITY_DOMAIN_EXT_API: return 0; default: - EXC_RAISING(ROCTRACER_STATUS_ERROR_INVALID_DOMAIN_ID, "invalid domain ID(" << domain << ")"); + throw roctracer::ApiError(ROCTRACER_STATUS_ERROR_INVALID_DOMAIN_ID, "invalid domain ID"); } - return 0; } -static inline uint32_t get_op_end(uint32_t domain) { +constexpr uint32_t get_op_end(activity_domain_t domain) { switch (domain) { case ACTIVITY_DOMAIN_HSA_OPS: - return HSA_OP_ID_NUMBER; + 
return DomainTraits::kOpIdEnd; case ACTIVITY_DOMAIN_HSA_API: - return HSA_API_ID_NUMBER; + return DomainTraits::kOpIdEnd; case ACTIVITY_DOMAIN_HSA_EVT: - return HSA_EVT_ID_NUMBER; + return DomainTraits::kOpIdEnd; case ACTIVITY_DOMAIN_HIP_OPS: - return HIP_OP_ID_NUMBER; + return DomainTraits::kOpIdEnd; case ACTIVITY_DOMAIN_HIP_API: - return HIP_API_ID_LAST + 1; - case ACTIVITY_DOMAIN_EXT_API: - return 0; + return DomainTraits::kOpIdEnd; case ACTIVITY_DOMAIN_ROCTX: - return ROCTX_API_ID_NUMBER; + return DomainTraits::kOpIdEnd; + case ACTIVITY_DOMAIN_EXT_API: + return get_op_begin(ACTIVITY_DOMAIN_EXT_API); default: - EXC_RAISING(ROCTRACER_STATUS_ERROR_INVALID_DOMAIN_ID, "invalid domain ID(" << domain << ")"); + throw roctracer::ApiError(ROCTRACER_STATUS_ERROR_INVALID_DOMAIN_ID, "invalid domain ID"); } - return 0; } -// Enable runtime API callbacks -static void roctracer_enable_callback_fun(roctracer_domain_t domain, uint32_t op, - roctracer_rtapi_callback_t callback, void* user_data) { - switch (domain) { - case ACTIVITY_DOMAIN_HSA_OPS: - break; - case ACTIVITY_DOMAIN_HSA_API: { -#if 0 - if (op == HSA_API_ID_DISPATCH) { - if (!RocpLoader::Instance().RegisterApiCallback(op, (void*)callback, user_data)) - FATAL_LOGGING("HSA::RegisterApiCallback error(" << op << ") failed"); - break; +std::atomic stopped_status{false}; + +struct IsStopped { + bool operator()() const { return stopped_status.load(std::memory_order_relaxed); } +}; + +struct NeverStopped { + constexpr bool operator()() { return false; } +}; + +using UserCallback = std::pair; + +template +using CallbackRegistrationTable = + util::RegistrationTable::kOpIdEnd, IsStopped>; + +template +using ActivityRegistrationTable = + util::RegistrationTable::kOpIdEnd, IsStopped>; + +template struct ApiTracer { + using ApiData = typename DomainTraits::ApiData; + using OperationId = typename DomainTraits::OperationId; + + struct TraceData { + ApiData api_data; // API specific data (for example, function arguments). 
+ uint64_t phase_enter_timestamp; // timestamp when phase_enter was executed. + uint64_t phase_data; // data that can be shared between phase_enter and phase_exit. + + void (*phase_enter)(OperationId operation_id, TraceData* data); + void (*phase_exit)(OperationId operation_id, TraceData* data); + }; + + static void Exit(OperationId operation_id, TraceData* trace_data) { + if (auto pool = activity_table.Get(operation_id)) { + assert(trace_data != nullptr); + activity_record_t record{}; + + record.domain = domain; + record.op = operation_id; + record.correlation_id = trace_data->api_data.correlation_id; + record.begin_ns = trace_data->phase_enter_timestamp; + record.end_ns = hsa_support::timestamp_ns(); + record.process_id = GetPid(); + record.thread_id = GetTid(); + + if (auto external_id = ExternalCorrelationId()) { + roctracer_record_t ext_record{}; + ext_record.domain = ACTIVITY_DOMAIN_EXT_API; + ext_record.op = ACTIVITY_EXT_OP_EXTERN_ID; + ext_record.correlation_id = record.correlation_id; + ext_record.external_id = *external_id; + // Write the external correlation id record directly followed by the activity record. + (*pool)->Write(std::array{ext_record, record}); + } else { + // Write record to the buffer. 
+ (*pool)->Write(record); } -#endif - if (op >= HSA_API_ID_NUMBER) - EXC_RAISING(ROCTRACER_STATUS_ERROR_INVALID_ARGUMENT, - "invalid HSA API operation ID(" << op << ")"); + } + CorrelationIdPop(); + } - hsa_support::cb_table.Set(op, callback, user_data); - break; + static void Exit_UserCallback(OperationId operation_id, TraceData* trace_data) { + if (auto user_callback = callback_table.Get(operation_id)) { + assert(trace_data != nullptr); + trace_data->api_data.phase = ACTIVITY_API_PHASE_EXIT; + user_callback->first(domain, operation_id, &trace_data->api_data, user_callback->second); } - case ACTIVITY_DOMAIN_HSA_EVT: { - if (!RocpLoader::Instance().RegisterEvtCallback(op, (void*)callback, user_data)) - FATAL_LOGGING("HSA::RegisterEvtCallback error(" << op << ") failed"); - break; + Exit(operation_id, trace_data); + } + + static void Enter_UserCallback(OperationId operation_id, TraceData* trace_data) { + if (auto user_callback = callback_table.Get(operation_id)) { + assert(trace_data != nullptr); + trace_data->api_data.phase = ACTIVITY_API_PHASE_ENTER; + trace_data->api_data.phase_data = &trace_data->phase_data; + user_callback->first(domain, operation_id, &trace_data->api_data, user_callback->second); + trace_data->phase_exit = Exit_UserCallback; + } else { + trace_data->phase_exit = Exit; + } + } + + static int Enter(OperationId operation_id, TraceData* trace_data) { + bool callback_enabled = callback_table.Get(operation_id).has_value(), + activity_enabled = activity_table.Get(operation_id).has_value(); + if (!callback_enabled && !activity_enabled) return -1; + + if (trace_data != nullptr) { + // Generate a new correlation ID. 
+ trace_data->api_data.correlation_id = CorrelationIdPush(); + + if (activity_enabled) { + trace_data->phase_enter_timestamp = hsa_support::timestamp_ns(); + trace_data->phase_enter = nullptr; + trace_data->phase_exit = Exit; + } + if (callback_enabled) { + trace_data->phase_enter = Enter_UserCallback; + trace_data->phase_exit = [](OperationId, TraceData*) { fatal("should not reach here"); }; + } } + return 0; + } + + static CallbackRegistrationTable callback_table; + static ActivityRegistrationTable activity_table; +}; + +template +CallbackRegistrationTable ApiTracer::callback_table; + +template +ActivityRegistrationTable ApiTracer::activity_table; + +using HIP_ApiTracer = ApiTracer; +using HSA_ApiTracer = ApiTracer; + +CallbackRegistrationTable roctx_api_callback_table; +ActivityRegistrationTable hip_ops_activity_table; +ActivityRegistrationTable hsa_ops_activity_table; +CallbackRegistrationTable hsa_evt_callback_table; + +int TracerCallback(activity_domain_t domain, uint32_t operation_id, void* data) { + switch (domain) { + case ACTIVITY_DOMAIN_HSA_API: + return HSA_ApiTracer::Enter(static_cast(operation_id), + static_cast(data)); + + case ACTIVITY_DOMAIN_HIP_API: + return HIP_ApiTracer::Enter(static_cast(operation_id), + static_cast(data)); + case ACTIVITY_DOMAIN_HIP_OPS: + if (auto pool = hip_ops_activity_table.Get(operation_id)) { + if (auto record = static_cast(data)) { + // If the record is for a kernel dispatch, write the kernel name in the pool's data, + // and make the record point to it. Older HIP runtimes do not provide a kernel + // name, so record.kernel_name might be null. 
+ if (operation_id == HIP_OP_ID_DISPATCH && record->kernel_name != nullptr) + (*pool)->Write(*record, record->kernel_name, strlen(record->kernel_name) + 1, + [](auto& record, const void* data) { + record.kernel_name = static_cast(data); + }); + else + (*pool)->Write(*record); + } + return 0; + } break; - case ACTIVITY_DOMAIN_HIP_API: { - if (!HipLoader::Instance().Enabled()) break; - std::lock_guard lock(hip_activity_mutex); - - hipError_t hip_err = - HipLoader::Instance().RegisterApiCallback(op, (void*)callback, user_data); - if (hip_err != hipSuccess) - FATAL_LOGGING("HIP::RegisterApiCallback(" << op << ") error(" << hip_err << ")"); - - if (HipApiActivityEnableCheck(op) == 0) { - hip_err = HipLoader::Instance().RegisterActivityCallback(op, (void*)HIP_SyncApiDataCallback, - (void*)1); - if (hip_err != hipSuccess) - FATAL_LOGGING("HIPAPI: HIP::RegisterActivityCallback(" << op << ") error(" << hip_err - << ")"); + + case ACTIVITY_DOMAIN_ROCTX: + if (auto user_callback = roctx_api_callback_table.Get(operation_id)) { + if (auto api_data = static_cast::ApiData*>(data)) + user_callback->first(ACTIVITY_DOMAIN_ROCTX, operation_id, api_data, + user_callback->second); + return 0; } break; - } - case ACTIVITY_DOMAIN_ROCTX: { - if (RocTxLoader::Instance().Enabled() && - !RocTxLoader::Instance().RegisterApiCallback(op, (void*)callback, user_data)) - FATAL_LOGGING("ROCTX::RegisterApiCallback(" << op << ") failed"); + + case ACTIVITY_DOMAIN_HSA_OPS: + if (auto pool = hsa_ops_activity_table.Get(operation_id)) { + if (auto record = static_cast(data)) (*pool)->Write(*record); + return 0; + } break; - } + + case ACTIVITY_DOMAIN_HSA_EVT: + if (auto user_callback = hsa_evt_callback_table.Get(operation_id)) { + if (auto api_data = static_cast::ApiData*>(data)) + user_callback->first(ACTIVITY_DOMAIN_HSA_EVT, operation_id, api_data, + user_callback->second); + return 0; + } + break; + default: - EXC_RAISING(ROCTRACER_STATUS_ERROR_INVALID_DOMAIN_ID, "invalid domain ID(" << domain << 
")"); + break; } + return -1; } -static void roctracer_enable_callback_impl(roctracer_domain_t domain, uint32_t op, +template struct RegistrationTableGroup { + private: + bool AllEmpty() const { + return std::apply([](auto&&... tables) { return (tables.IsEmpty() && ...); }, tables_); + } + + public: + template + RegistrationTableGroup(Functor1&& engage_tracer, Functor2&& disengage_tracer, Tables&... tables) + : engage_tracer_(std::forward(engage_tracer)), + disengage_tracer_(std::forward(disengage_tracer)), + tables_(tables...) {} + + template + void Register(T& table, uint32_t operation_id, Args... args) const { + if (AllEmpty()) engage_tracer_(); + table.Register(operation_id, std::forward(args)...); + } + + template void Unregister(T& table, uint32_t operation_id) const { + table.Unregister(operation_id); + if (AllEmpty()) disengage_tracer_(); + } + + private: + const std::function engage_tracer_, disengage_tracer_; + const std::tuple tables_; +}; + +RegistrationTableGroup HSA_registration_group( + []() { hsa_support::RegisterTracerCallback(TracerCallback); }, + []() { hsa_support::RegisterTracerCallback(nullptr); }, HSA_ApiTracer::callback_table, + HSA_ApiTracer::activity_table, hsa_ops_activity_table, hsa_evt_callback_table); + +RegistrationTableGroup HIP_registration_group( + []() { HipLoader::Instance().RegisterTracerCallback(TracerCallback); }, + []() { HipLoader::Instance().RegisterTracerCallback(nullptr); }, HIP_ApiTracer::callback_table, + HIP_ApiTracer::activity_table, hip_ops_activity_table); + +RegistrationTableGroup ROCTX_registration_group( + []() { RocTxLoader::Instance().RegisterTracerCallback(TracerCallback); }, + []() { RocTxLoader::Instance().RegisterTracerCallback(nullptr); }, roctx_api_callback_table); + +} // namespace + +// Enable runtime API callbacks +static void roctracer_enable_callback_impl(roctracer_domain_t domain, uint32_t operation_id, roctracer_rtapi_callback_t callback, void* user_data) { - cb_journal.Insert(domain, op, 
{callback, user_data}); - roctracer_enable_callback_fun(domain, op, callback, user_data); + std::lock_guard lock(registration_mutex); + + if (operation_id >= get_op_end(domain) || callback == nullptr) + throw ApiError(ROCTRACER_STATUS_ERROR_INVALID_ARGUMENT, "invalid argument"); + + switch (domain) { + case ACTIVITY_DOMAIN_HSA_EVT: + HSA_registration_group.Register(hsa_evt_callback_table, operation_id, callback, user_data); + break; + case ACTIVITY_DOMAIN_HSA_API: + HSA_registration_group.Register(HSA_ApiTracer::callback_table, operation_id, callback, + user_data); + break; + case ACTIVITY_DOMAIN_HSA_OPS: + break; + case ACTIVITY_DOMAIN_HIP_API: + if (HipLoader::Instance().IsEnabled()) + HIP_registration_group.Register(HIP_ApiTracer::callback_table, operation_id, callback, + user_data); + break; + case ACTIVITY_DOMAIN_HIP_OPS: + break; + case ACTIVITY_DOMAIN_ROCTX: + if (RocTxLoader::Instance().IsEnabled()) + ROCTX_registration_group.Register(roctx_api_callback_table, operation_id, callback, + user_data); + break; + default: + EXC_RAISING(ROCTRACER_STATUS_ERROR_INVALID_DOMAIN_ID, "invalid domain ID(" << domain << ")"); + } } ROCTRACER_API roctracer_status_t roctracer_enable_op_callback(roctracer_domain_t domain, @@ -708,72 +530,37 @@ ROCTRACER_API roctracer_status_t roctracer_enable_domain_callback( API_METHOD_SUFFIX } -ROCTRACER_API roctracer_status_t roctracer_enable_callback(roctracer_rtapi_callback_t callback, - void* user_data) { - API_METHOD_PREFIX - for (uint32_t domain = 0; domain < ACTIVITY_DOMAIN_NUMBER; ++domain) { - const uint32_t op_end = get_op_end(domain); - for (uint32_t op = get_op_begin(domain); op < op_end; ++op) - roctracer_enable_callback_impl((roctracer_domain_t)domain, op, callback, user_data); - } - API_METHOD_SUFFIX -} - // Disable runtime API callbacks -static void roctracer_disable_callback_fun(roctracer_domain_t domain, uint32_t op) { +static void roctracer_disable_callback_impl(roctracer_domain_t domain, uint32_t operation_id) { + 
std::lock_guard lock(registration_mutex); + + if (operation_id >= get_op_end(domain)) + throw ApiError(ROCTRACER_STATUS_ERROR_INVALID_ARGUMENT, "invalid argument"); + switch (domain) { - case ACTIVITY_DOMAIN_HSA_OPS: + case ACTIVITY_DOMAIN_HSA_EVT: + HSA_registration_group.Unregister(hsa_evt_callback_table, operation_id); break; - case ACTIVITY_DOMAIN_HSA_API: { -#if 0 - if (op == HSA_API_ID_DISPATCH && !RocpLoader::Instance().RemoveApiCallback(op)) - FATAL_LOGGING("HSA::RemoveActivityCallback error(" << op << ") failed"); - break; -#endif - if (op >= HSA_API_ID_NUMBER) - EXC_RAISING(ROCTRACER_STATUS_ERROR_INVALID_ARGUMENT, - "invalid HSA API operation ID(" << op << ")"); - hsa_support::cb_table.Set(op, nullptr, nullptr); + case ACTIVITY_DOMAIN_HSA_API: + HSA_registration_group.Unregister(HSA_ApiTracer::callback_table, operation_id); break; - } - case ACTIVITY_DOMAIN_HIP_OPS: + case ACTIVITY_DOMAIN_HSA_OPS: break; - case ACTIVITY_DOMAIN_HIP_API: { - if (!HipLoader::Instance().Enabled()) break; - std::lock_guard lock(hip_activity_mutex); - - const hipError_t hip_err = HipLoader::Instance().RemoveApiCallback(op); - if (hip_err != hipSuccess) - FATAL_LOGGING("HIP::RemoveApiCallback(" << op << "), error(" << hip_err << ")"); - - if (HipApiActivityDisableCheck(op) == 0) { - const hipError_t hip_err = HipLoader::Instance().RemoveActivityCallback(op); - if (hip_err != hipSuccess) - FATAL_LOGGING("HIPAPI: HIP::RemoveActivityCallback op(" << op << "), error(" << hip_err - << ")"); - } + case ACTIVITY_DOMAIN_HIP_API: + if (HipLoader::Instance().IsEnabled()) + HIP_registration_group.Unregister(HIP_ApiTracer::callback_table, operation_id); break; - } - case ACTIVITY_DOMAIN_HSA_EVT: { - if (!RocpLoader::Instance().RemoveEvtCallback(op)) - FATAL_LOGGING("HSA::RemoveEvtCallback error(" << op << ") failed"); + case ACTIVITY_DOMAIN_HIP_OPS: break; - } - case ACTIVITY_DOMAIN_ROCTX: { - if (RocTxLoader::Instance().Enabled() && !RocTxLoader::Instance().RemoveApiCallback(op)) - 
FATAL_LOGGING("ROCTX::RemoveApiCallback(" << op << ") failed"); + case ACTIVITY_DOMAIN_ROCTX: + if (RocTxLoader::Instance().IsEnabled()) + ROCTX_registration_group.Unregister(roctx_api_callback_table, operation_id); break; - } default: EXC_RAISING(ROCTRACER_STATUS_ERROR_INVALID_DOMAIN_ID, "invalid domain ID(" << domain << ")"); } } -static void roctracer_disable_callback_impl(roctracer_domain_t domain, uint32_t op) { - cb_journal.Remove(domain, op); - roctracer_disable_callback_fun(domain, op); -} - ROCTRACER_API roctracer_status_t roctracer_disable_op_callback(roctracer_domain_t domain, uint32_t op) { API_METHOD_PREFIX @@ -789,16 +576,6 @@ ROCTRACER_API roctracer_status_t roctracer_disable_domain_callback(roctracer_dom API_METHOD_SUFFIX } -ROCTRACER_API roctracer_status_t roctracer_disable_callback() { - API_METHOD_PREFIX - for (uint32_t domain = 0; domain < ACTIVITY_DOMAIN_NUMBER; ++domain) { - const uint32_t op_end = get_op_end(domain); - for (uint32_t op = get_op_begin(domain); op < op_end; ++op) - roctracer_disable_callback_impl((roctracer_domain_t)domain, op); - } - API_METHOD_SUFFIX -} - // Return default pool and set new one if parameter pool is not NULL. 
ROCTRACER_API roctracer_pool_t* roctracer_default_pool_expl(roctracer_pool_t* pool) { std::lock_guard lock(memory_pool_mutex); @@ -848,64 +625,35 @@ ROCTRACER_API roctracer_status_t roctracer_next_record(const activity_record_t* } // Enable activity records logging -static void roctracer_enable_activity_fun(roctracer_domain_t domain, uint32_t op, - roctracer_pool_t* pool) { - assert(pool != nullptr); +static void roctracer_enable_activity_impl(roctracer_domain_t domain, uint32_t op, + roctracer_pool_t* pool) { + std::lock_guard lock(registration_mutex); + + MemoryPool* memory_pool = reinterpret_cast(pool); + if (memory_pool == nullptr) memory_pool = default_memory_pool; + if (memory_pool == nullptr) + EXC_RAISING(ROCTRACER_STATUS_ERROR_DEFAULT_POOL_UNDEFINED, "no default pool"); + + if (op >= get_op_end(domain)) + throw ApiError(ROCTRACER_STATUS_ERROR_INVALID_ARGUMENT, "invalid argument"); + switch (domain) { - case ACTIVITY_DOMAIN_HSA_OPS: { - if (op == HSA_OP_ID_COPY) { - std::scoped_lock lock(hsa_support::init_mutex); - - if (hsa_support::saved_amd_ext_api.hsa_amd_profiling_async_copy_enable_fn != nullptr) { - [[maybe_unused]] hsa_status_t status = - hsa_support::saved_amd_ext_api.hsa_amd_profiling_async_copy_enable_fn(true); - assert(status == HSA_STATUS_SUCCESS && "hsa_amd_profiling_async_copy_enable failed"); - } - RocpLoader::Instance(); - hsa_support::async_copy_callback_enabled = true; - hsa_support::async_copy_callback_memory_pool = reinterpret_cast(pool); - } else { - const bool init_phase = (RocpLoader::GetRef() == nullptr); - if (RocpLoader::GetRef() == nullptr) break; - if (init_phase) { - RocpLoader::Instance().InitActivityCallback((void*)HSA_AsyncActivityCallback, - (void*)pool); - } - if (!RocpLoader::Instance().EnableActivityCallback(op, true)) - FATAL_LOGGING("HSA::EnableActivityCallback error"); - } + case ACTIVITY_DOMAIN_HSA_EVT: break; - } case ACTIVITY_DOMAIN_HSA_API: + HSA_registration_group.Register(HSA_ApiTracer::activity_table, op, 
memory_pool); break; - case ACTIVITY_DOMAIN_HSA_EVT: - RocpLoader::Instance(); + case ACTIVITY_DOMAIN_HSA_OPS: + HSA_registration_group.Register(hsa_ops_activity_table, op, memory_pool); break; - case ACTIVITY_DOMAIN_HIP_OPS: { - if (!HipLoader::Instance().Enabled()) break; - std::lock_guard lock(hip_activity_mutex); - - if (!HipLoader::Instance().InitActivityDone()) { - HipLoader::Instance().InitActivityCallback((void*)HIP_ActivityIdCallback, - (void*)HIP_AsyncActivityCallback, (void*)pool); - HipLoader::Instance().InitActivityDone() = true; - } - if (!HipLoader::Instance().EnableActivityCallback(op, true)) - FATAL_LOGGING("HIP::EnableActivityCallback error"); + case ACTIVITY_DOMAIN_HIP_API: + if (HipLoader::Instance().IsEnabled()) + HIP_registration_group.Register(HIP_ApiTracer::activity_table, op, memory_pool); break; - } - case ACTIVITY_DOMAIN_HIP_API: { - if (!HipLoader::Instance().Enabled()) break; - std::lock_guard lock(hip_activity_mutex); - - if (HipActActivityEnableCheck(op) == 0) { - const hipError_t hip_err = HipLoader::Instance().RegisterActivityCallback( - op, (void*)HIP_SyncActivityCallback, (void*)pool); - if (hip_err != hipSuccess) - FATAL_LOGGING("HIP::RegisterActivityCallback(" << op << " error(" << hip_err << ")"); - } + case ACTIVITY_DOMAIN_HIP_OPS: + if (HipLoader::Instance().IsEnabled()) + HIP_registration_group.Register(hip_ops_activity_table, op, memory_pool); break; - } case ACTIVITY_DOMAIN_ROCTX: break; default: @@ -913,15 +661,6 @@ static void roctracer_enable_activity_fun(roctracer_domain_t domain, uint32_t op } } -static void roctracer_enable_activity_impl(roctracer_domain_t domain, uint32_t op, - roctracer_pool_t* pool) { - if (pool == nullptr) pool = default_memory_pool; - if (pool == nullptr) - EXC_RAISING(ROCTRACER_STATUS_ERROR_DEFAULT_POOL_UNDEFINED, "no default pool"); - act_journal.Insert(domain, op, {pool}); - roctracer_enable_activity_fun(domain, op, pool); -} - ROCTRACER_API roctracer_status_t 
roctracer_enable_op_activity_expl(roctracer_domain_t domain, uint32_t op, roctracer_pool_t* pool) { @@ -940,8 +679,11 @@ ROCTRACER_API roctracer_status_t roctracer_enable_op_activity(activity_domain_t static void roctracer_enable_domain_activity_impl(roctracer_domain_t domain, roctracer_pool_t* pool) { const uint32_t op_end = get_op_end(domain); - for (uint32_t op = get_op_begin(domain); op < op_end; ++op) - roctracer_enable_activity_impl(domain, op, pool); + for (uint32_t op = get_op_begin(domain); op < op_end; ++op) try { + roctracer_enable_activity_impl(domain, op, pool); + } catch (const ApiError& err) { + if (err.status() != ROCTRACER_STATUS_ERROR_NOT_IMPLEMENTED) throw; + } } ROCTRACER_API roctracer_status_t roctracer_enable_domain_activity_expl(roctracer_domain_t domain, @@ -957,76 +699,30 @@ ROCTRACER_API roctracer_status_t roctracer_enable_domain_activity(activity_domai API_METHOD_SUFFIX } -static void roctracer_enable_activity_impl(roctracer_pool_t* pool) { - for (uint32_t domain = 0; domain < ACTIVITY_DOMAIN_NUMBER; ++domain) { - const uint32_t op_end = get_op_end(domain); - for (uint32_t op = get_op_begin(domain); op < op_end; ++op) - roctracer_enable_activity_impl((roctracer_domain_t)domain, op, pool); - } -} - -ROCTRACER_API roctracer_status_t roctracer_enable_activity_expl(roctracer_pool_t* pool) { - API_METHOD_PREFIX - roctracer_enable_activity_impl(pool); - API_METHOD_SUFFIX -} +// Disable activity records logging +static void roctracer_disable_activity_impl(roctracer_domain_t domain, uint32_t op) { + std::lock_guard lock(registration_mutex); -ROCTRACER_API roctracer_status_t roctracer_enable_activity() { - API_METHOD_PREFIX - roctracer_enable_activity_impl(nullptr); - API_METHOD_SUFFIX -} + if (op >= get_op_end(domain)) + throw ApiError(ROCTRACER_STATUS_ERROR_INVALID_ARGUMENT, "invalid argument"); -// Disable activity records logging -static void roctracer_disable_activity_fun(roctracer_domain_t domain, uint32_t op) { switch (domain) { - case 
ACTIVITY_DOMAIN_HSA_OPS: { - if (op == HSA_OP_ID_COPY) { - std::scoped_lock lock(hsa_support::init_mutex); - - hsa_support::async_copy_callback_enabled = false; - hsa_support::async_copy_callback_memory_pool = nullptr; - - if (hsa_support::saved_amd_ext_api.hsa_amd_profiling_async_copy_enable_fn != nullptr) { - [[maybe_unused]] hsa_status_t status = - hsa_support::saved_amd_ext_api.hsa_amd_profiling_async_copy_enable_fn(false); - assert(status == HSA_STATUS_SUCCESS || status == HSA_STATUS_ERROR_NOT_INITIALIZED || - !"hsa_amd_profiling_async_copy_enable failed"); - } - } else { - if (RocpLoader::GetRef() == nullptr) break; - if (!RocpLoader::Instance().EnableActivityCallback(op, false)) - FATAL_LOGGING("HSA::EnableActivityCallback(false) error, op(" << op << ")"); - } + case ACTIVITY_DOMAIN_HSA_EVT: break; - } case ACTIVITY_DOMAIN_HSA_API: + HSA_registration_group.Unregister(HSA_ApiTracer::activity_table, op); break; - case ACTIVITY_DOMAIN_HSA_EVT: + case ACTIVITY_DOMAIN_HSA_OPS: + HSA_registration_group.Unregister(hsa_ops_activity_table, op); break; - case ACTIVITY_DOMAIN_HIP_OPS: { - if (HipLoader::Instance().Enabled() && - !HipLoader::Instance().EnableActivityCallback(op, false)) - FATAL_LOGGING("HIP::EnableActivityCallback(nullptr) error, op(" << op << ")"); + case ACTIVITY_DOMAIN_HIP_API: + if (HipLoader::Instance().IsEnabled()) + HIP_registration_group.Unregister(HIP_ApiTracer::activity_table, op); break; - } - case ACTIVITY_DOMAIN_HIP_API: { - if (!HipLoader::Instance().Enabled()) break; - std::lock_guard lock(hip_activity_mutex); - - if (HipActActivityDisableCheck(op) == 0) { - const hipError_t hip_err = HipLoader::Instance().RemoveActivityCallback(op); - if (hip_err != hipSuccess) - FATAL_LOGGING("HIP::RemoveActivityCallback op(" << op << "), error(" << hip_err << ")"); - } else { - const hipError_t hip_err = HipLoader::Instance().RegisterActivityCallback( - op, (void*)HIP_SyncApiDataCallback, (void*)1); - if (hip_err != hipSuccess) - FATAL_LOGGING("HIPACT: 
HIP::RegisterActivityCallback(" << op << ") error(" << hip_err - << ")"); - } + case ACTIVITY_DOMAIN_HIP_OPS: + if (HipLoader::Instance().IsEnabled()) + HIP_registration_group.Unregister(hip_ops_activity_table, op); break; - } case ACTIVITY_DOMAIN_ROCTX: break; default: @@ -1034,11 +730,6 @@ static void roctracer_disable_activity_fun(roctracer_domain_t domain, uint32_t o } } -static void roctracer_disable_activity_impl(roctracer_domain_t domain, uint32_t op) { - act_journal.Remove(domain, op); - roctracer_disable_activity_fun(domain, op); -} - ROCTRACER_API roctracer_status_t roctracer_disable_op_activity(roctracer_domain_t domain, uint32_t op) { API_METHOD_PREFIX @@ -1046,21 +737,18 @@ ROCTRACER_API roctracer_status_t roctracer_disable_op_activity(roctracer_domain_ API_METHOD_SUFFIX } -ROCTRACER_API roctracer_status_t roctracer_disable_domain_activity(roctracer_domain_t domain) { - API_METHOD_PREFIX +static void roctracer_disable_domain_activity_impl(roctracer_domain_t domain) { const uint32_t op_end = get_op_end(domain); - for (uint32_t op = get_op_begin(domain); op < op_end; ++op) - roctracer_disable_activity_impl(domain, op); - API_METHOD_SUFFIX + for (uint32_t op = get_op_begin(domain); op < op_end; ++op) try { + roctracer_disable_activity_impl(domain, op); + } catch (const ApiError& err) { + if (err.status() != ROCTRACER_STATUS_ERROR_NOT_IMPLEMENTED) throw; + } } -ROCTRACER_API roctracer_status_t roctracer_disable_activity() { +ROCTRACER_API roctracer_status_t roctracer_disable_domain_activity(roctracer_domain_t domain) { API_METHOD_PREFIX - for (uint32_t domain = 0; domain < ACTIVITY_DOMAIN_NUMBER; ++domain) { - const uint32_t op_end = get_op_end(domain); - for (uint32_t op = get_op_begin(domain); op < op_end; ++op) - roctracer_disable_activity_impl((roctracer_domain_t)domain, op); - } + roctracer_disable_domain_activity_impl(domain); API_METHOD_SUFFIX } @@ -1072,6 +760,7 @@ static void roctracer_close_pool_impl(roctracer_pool_t* pool) { MemoryPool* p = 
reinterpret_cast(pool); if (p == default_memory_pool) default_memory_pool = nullptr; +#if 0 // Disable any activities that specify the pool being deleted. std::vector> ops; act_journal.ForEach( @@ -1080,6 +769,7 @@ static void roctracer_close_pool_impl(roctracer_pool_t* pool) { return true; }); for (auto&& [domain, op] : ops) roctracer_disable_activity_impl(domain, op); +#endif delete (p); } @@ -1120,70 +810,42 @@ ROCTRACER_API roctracer_status_t roctracer_flush_activity() { ROCTRACER_API roctracer_status_t roctracer_activity_push_external_correlation_id(activity_correlation_id_t id) { API_METHOD_PREFIX - external_id_stack.push(id); + ExternalCorrelationIdPush(id); API_METHOD_SUFFIX } // Notifies that the calling thread is leaving an external API region. -// Pop an external correlation id for the calling thread. -// 'lastId' returns the last external correlation +// Pop an external correlation id for the calling thread, and return it in 'last_id' if not null. ROCTRACER_API roctracer_status_t roctracer_activity_pop_external_correlation_id(activity_correlation_id_t* last_id) { API_METHOD_PREFIX - if (last_id != nullptr) *last_id = 0; - if (external_id_stack.empty()) - EXC_RAISING(ROCTRACER_STATUS_ERROR_MISMATCHED_EXTERNAL_CORRELATION_ID, - "not matching external range pop"); - if (last_id != nullptr) *last_id = external_id_stack.top(); - external_id_stack.pop(); - API_METHOD_SUFFIX -} -// Mark API (FIXME: why isn't it in the roctracer_ext.h header?) 
-extern "C" ROCTRACER_API void roctracer_mark(const char* str) { - if (mark_api_callback_ptr) { - mark_api_callback_ptr(ACTIVITY_DOMAIN_EXT_API, ACTIVITY_EXT_OP_MARK, str, nullptr); - NextCorrelationId(); // account for user-defined markers when tracking - // correlation id + auto external_id = ExternalCorrelationIdPop(); + if (!external_id) { + if (last_id != nullptr) *last_id = 0; + EXC_RAISING(ROCTRACER_STATUS_ERROR_MISMATCHED_EXTERNAL_CORRELATION_ID, + "unbalanced external correlation id pop"); } + + if (last_id != nullptr) *last_id = *external_id; + API_METHOD_SUFFIX } // Start API ROCTRACER_API void roctracer_start() { - if (set_stopped(0)) { - if (ext_support::roctracer_start_cb) ext_support::roctracer_start_cb(); - cb_journal.ForEach([](roctracer_domain_t domain, uint32_t op, const CallbackJournalData& data) { - roctracer_enable_callback_fun(domain, op, data.callback, data.user_data); - return true; - }); - act_journal.ForEach( - [](roctracer_domain_t domain, uint32_t op, const ActivityJournalData& data) { - roctracer_enable_activity_fun(domain, op, data.pool); - return true; - }); - } + if (stopped_status.exchange(false, std::memory_order_relaxed) && roctracer_start_cb) + roctracer_start_cb(); } // Stop API ROCTRACER_API void roctracer_stop() { - if (set_stopped(1)) { - // Must disable the activity first as the spawner checks for the activity being NULL - // to indicate that there is no callback. 
- act_journal.ForEach([](roctracer_domain_t domain, uint32_t op, const ActivityJournalData&) { - roctracer_disable_activity_fun(domain, op); - return true; - }); - cb_journal.ForEach([](roctracer_domain_t domain, uint32_t op, const CallbackJournalData&) { - roctracer_disable_callback_fun(domain, op); - return true; - }); - if (ext_support::roctracer_stop_cb) ext_support::roctracer_stop_cb(); - } + if (!stopped_status.exchange(true, std::memory_order_relaxed) && roctracer_stop_cb) + roctracer_stop_cb(); } ROCTRACER_API roctracer_status_t roctracer_get_timestamp(roctracer_timestamp_t* timestamp) { API_METHOD_PREFIX - *timestamp = util::timestamp_ns(); + *timestamp = hsa_support::timestamp_ns(); API_METHOD_SUFFIX } @@ -1192,25 +854,18 @@ ROCTRACER_API roctracer_status_t roctracer_set_properties(roctracer_domain_t dom void* properties) { API_METHOD_PREFIX switch (domain) { - case ACTIVITY_DOMAIN_HSA_OPS: { - break; - } - case ACTIVITY_DOMAIN_HSA_EVT: { - break; - } - case ACTIVITY_DOMAIN_HSA_API: { - break; - } + case ACTIVITY_DOMAIN_HSA_OPS: + case ACTIVITY_DOMAIN_HSA_EVT: + case ACTIVITY_DOMAIN_HSA_API: case ACTIVITY_DOMAIN_HIP_OPS: case ACTIVITY_DOMAIN_HIP_API: { - mark_api_callback_ptr = reinterpret_cast(properties); break; } case ACTIVITY_DOMAIN_EXT_API: { roctracer_ext_properties_t* ops_properties = reinterpret_cast(properties); - ext_support::roctracer_start_cb = ops_properties->start_cb; - ext_support::roctracer_stop_cb = ops_properties->stop_cb; + roctracer_start_cb = ops_properties->start_cb; + roctracer_stop_cb = ops_properties->stop_cb; break; } default: @@ -1219,89 +874,21 @@ ROCTRACER_API roctracer_status_t roctracer_set_properties(roctracer_domain_t dom API_METHOD_SUFFIX } -__attribute__((constructor)) void constructor() { - ONLOAD_TRACE_BEG(); - util::Logger::Create(); - ONLOAD_TRACE_END(); -} - -__attribute__((destructor)) void destructor() { - ONLOAD_TRACE_BEG(); - util::Logger::Destroy(); - ONLOAD_TRACE_END(); -} - extern "C" { // The 
HSA_AMD_TOOL_PRIORITY variable must be a constant value type initialized by the loader -// itself, not by code during _init. 'extern const' seems do that although that is not a guarantee. +// itself, not by code during _init. 'extern const' seems to do that although that is not a +// guarantee. ROCTRACER_EXPORT extern const uint32_t HSA_AMD_TOOL_PRIORITY = 50; // HSA-runtime tool on-load method ROCTRACER_EXPORT bool OnLoad(HsaApiTable* table, uint64_t runtime_version, uint64_t failed_tool_count, const char* const* failed_tool_names) { - std::scoped_lock lock(hsa_support::init_mutex); - - // Save the HSA core api and amd_ext api. - hsa_support::saved_core_api = *table->core_; - hsa_support::saved_amd_ext_api = *table->amd_ext_; - - // Enumerate the agents. - if (hsa_support::saved_core_api.hsa_iterate_agents_fn( - [](hsa_agent_t agent, void* data) { - hsa_support::AgentInfo agent_info; - if (hsa_support::saved_core_api.hsa_agent_get_info_fn( - agent, HSA_AGENT_INFO_DEVICE, &agent_info.type) != HSA_STATUS_SUCCESS) - FATAL_LOGGING("hsa_agent_get_info failed"); - switch (agent_info.type) { - case HSA_DEVICE_TYPE_CPU: - static int cpu_agent_count = 0; - agent_info.index = cpu_agent_count++; - break; - case HSA_DEVICE_TYPE_GPU: - static int gpu_agent_count = 0; - agent_info.index = gpu_agent_count++; - break; - default: - static int other_agent_count = 0; - agent_info.index = other_agent_count++; - break; - } - hsa_support::agent_info_map.emplace(agent.handle, agent_info); - return HSA_STATUS_SUCCESS; - }, - nullptr) != HSA_STATUS_SUCCESS) - FATAL_LOGGING("hsa_iterate_agents failed"); - - // Install the HSA_OPS intercept - table->amd_ext_->hsa_amd_memory_async_copy_fn = - hsa_support::hsa_amd_memory_async_copy_interceptor; - table->amd_ext_->hsa_amd_memory_async_copy_rect_fn = - hsa_support::hsa_amd_memory_async_copy_rect_interceptor; - - // Install the HSA_API wrappers - hsa_support::InstallCoreApiWrappers(table->core_); - 
hsa_support::InstallAmdExtWrappers(table->amd_ext_); - hsa_support::InstallImageExtWrappers(table->image_ext_); - - if (hsa_support::async_copy_callback_enabled) { - [[maybe_unused]] hsa_status_t status = - hsa_support::saved_amd_ext_api.hsa_amd_profiling_async_copy_enable_fn(true); - assert(status == HSA_STATUS_SUCCESS && "hsa_amd_profiling_async_copy_enable failed"); - } - + [](auto&&...) {}(runtime_version, failed_tool_count, failed_tool_names); + hsa_support::Initialize(table); return true; } -ROCTRACER_EXPORT void OnUnload() { - if (hsa_support::async_copy_callback_enabled) { - [[maybe_unused]] hsa_status_t status = - hsa_support::saved_amd_ext_api.hsa_amd_profiling_async_copy_enable_fn(false); - assert(status == HSA_STATUS_SUCCESS && "hsa_amd_profiling_async_copy_enable failed"); - } - - memset(&hsa_support::saved_core_api, '\0', sizeof(hsa_support::saved_core_api)); - memset(&hsa_support::saved_amd_ext_api, '\0', sizeof(hsa_support::saved_amd_ext_api)); -} +ROCTRACER_EXPORT void OnUnload() { hsa_support::Finalize(); } -} // extern "C" \ No newline at end of file +} // extern "C" diff --git a/src/roctracer/tracker.h b/src/roctracer/tracker.h deleted file mode 100644 index d651c288..00000000 --- a/src/roctracer/tracker.h +++ /dev/null @@ -1,155 +0,0 @@ -/* Copyright (c) 2018-2022 Advanced Micro Devices, Inc. - - Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - - The above copyright notice and this permission notice shall be included in - all copies or substantial portions of the Software. 
- - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - THE SOFTWARE. */ - -#ifndef SRC_CORE_TRACKER_H_ -#define SRC_CORE_TRACKER_H_ - -#include -#include -#include -#include - -#include - -#include "exception.h" -#include "util/logger.h" - -namespace roctracer { -class Tracker { - public: - enum { ENTRY_INV = 0, ENTRY_INIT = 1, ENTRY_COMPL = 2 }; - - enum entry_type_t { - DFLT_ENTRY_TYPE = 0, - API_ENTRY_TYPE = 1, - COPY_ENTRY_TYPE = 2, - KERNEL_ENTRY_TYPE = 3, - NUM_ENTRY_TYPE = 4 - }; - - struct entry_t { - std::atomic valid; - entry_type_t type; - uint64_t correlation_id; - roctracer_timestamp_t begin; // begin timestamp, ns - roctracer_timestamp_t end; // end timestamp, ns - hsa_agent_t agent; - uint32_t dev_index; - hsa_signal_t orig; - hsa_signal_t signal; - void (*handler)(const entry_t*); - MemoryPool* pool; - union { - struct { - } copy; - struct { - const char* name; - hsa_agent_t agent; - uint32_t tid; - } kernel; - }; - }; - - // Add tracker entry - inline static void Enable(entry_type_t type, const hsa_agent_t& agent, const hsa_signal_t& signal, - entry_t* entry) { - hsa_status_t status = HSA_STATUS_ERROR; - - // Creating a new tracker entry - entry->type = type; - entry->agent = agent; - entry->dev_index = 0; // hsa_rsrc->GetAgentInfo(agent)->dev_index; - entry->orig = signal; - entry->valid.store(ENTRY_INIT, std::memory_order_release); - - // Creating a proxy signal - status = hsa_signal_create(1, 0, NULL, &(entry->signal)); - if (status != HSA_STATUS_SUCCESS) FATAL_LOGGING("hsa_signal_create failed"); - status = - 
hsa_amd_signal_async_handler(entry->signal, HSA_SIGNAL_CONDITION_LT, 1, Handler, entry); - if (status != HSA_STATUS_SUCCESS) FATAL_LOGGING("hsa_amd_signal_async_handler failed"); - } - - // Delete tracker entry - inline static void Disable(entry_t* entry) { - hsa_signal_destroy(entry->signal); - entry->valid.store(ENTRY_INV, std::memory_order_release); - } - - private: - // Entry completion - inline static void Complete(hsa_signal_value_t signal_value, entry_t* entry) { - static roctracer_timestamp_t sysclock_period = []() { - uint64_t sysclock_hz = 0; - hsa_status_t status = hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY, &sysclock_hz); - if (status != HSA_STATUS_SUCCESS) FATAL_LOGGING("hsa_system_get_info failed"); - return (uint64_t)1000000000 / sysclock_hz; - }(); - - if (entry->type == COPY_ENTRY_TYPE) { - hsa_amd_profiling_async_copy_time_t async_copy_time{}; - hsa_status_t status = hsa_amd_profiling_get_async_copy_time(entry->signal, &async_copy_time); - if (status != HSA_STATUS_SUCCESS) - FATAL_LOGGING("hsa_amd_profiling_get_async_copy_time failed"); - entry->begin = async_copy_time.start * sysclock_period; - entry->end = async_copy_time.end * sysclock_period; - } else { - assert(false && "should not reach here"); - } - - hsa_signal_t orig = entry->orig; - hsa_signal_t signal = entry->signal; - - // Releasing completed entry - entry->valid.store(ENTRY_COMPL, std::memory_order_release); - - assert(entry->handler != nullptr); - entry->handler(entry); - - // Original intercepted signal completion - if (orig.handle) { - amd_signal_t* orig_signal_ptr = reinterpret_cast(orig.handle); - amd_signal_t* prof_signal_ptr = reinterpret_cast(signal.handle); - orig_signal_ptr->start_ts = prof_signal_ptr->start_ts; - orig_signal_ptr->end_ts = prof_signal_ptr->end_ts; - - [[maybe_unused]] const hsa_signal_value_t new_value = hsa_signal_load_relaxed(orig) - 1; - assert(signal_value == new_value && "Tracker::Complete bad signal value"); - 
hsa_signal_store_screlease(orig, signal_value); - } - hsa_signal_destroy(signal); - delete entry; - } - - // Handler for packet completion - static bool Handler(hsa_signal_value_t signal_value, void* arg) { - // Acquire entry - entry_t* entry = reinterpret_cast(arg); - while (entry->valid.load(std::memory_order_acquire) != ENTRY_INIT) sched_yield(); - - // Complete entry - Tracker::Complete(signal_value, entry); - return false; - } -}; - -} // namespace roctracer - -#endif // SRC_CORE_TRACKER_H_ diff --git a/src/roctx/exportmap b/src/roctx/exportmap index de575163..9018c824 100644 --- a/src/roctx/exportmap +++ b/src/roctx/exportmap @@ -1,11 +1,10 @@ ROCTX_4.1 { -global: RegisterApiCallback; - RemoveApiCallback; - roctxMarkA; +global: roctxMarkA; roctxRangePop; roctxRangePushA; roctxRangeStartA; roctxRangeStop; + roctxRegisterTracerCallback; roctx_version_major; roctx_version_minor; local: *; diff --git a/src/roctx/roctx.cpp b/src/roctx/roctx.cpp index 9da91958..c7baf57a 100644 --- a/src/roctx/roctx.cpp +++ b/src/roctx/roctx.cpp @@ -22,66 +22,72 @@ #include "roctracer_roctx.h" #include "ext/prof_protocol.h" -#include "util/callback_table.h" +#include +#include namespace { -roctracer::util::CallbackTable callbacks; -thread_local int nested_range_level(0); +std::atomic report_activity; +thread_local int nested_range_level{0}; + +void ReportActivity(roctx_api_id_t operation_id, const char* message = nullptr, + roctx_range_id_t id = {}) { + auto function = report_activity.load(std::memory_order_relaxed); + if (!function) return; + + roctx_api_data_t api_data{}; + switch (operation_id) { + case ROCTX_API_ID_roctxMarkA: + api_data.args.roctxMarkA.message = message; + break; + case ROCTX_API_ID_roctxRangePushA: + api_data.args.roctxRangePushA.message = message; + break; + case ROCTX_API_ID_roctxRangePop: + break; + case ROCTX_API_ID_roctxRangeStartA: + api_data.args.roctxRangeStartA.message = message; + api_data.args.roctxRangeStartA.id = id; + break; + case 
ROCTX_API_ID_roctxRangeStop: + api_data.args.roctxRangeStop.id = id; + break; + default: + assert(!"should not reach here"); + } + function(ACTIVITY_DOMAIN_ROCTX, operation_id, &api_data); +} } // namespace ROCTX_API uint32_t roctx_version_major() { return ROCTX_VERSION_MAJOR; } ROCTX_API uint32_t roctx_version_minor() { return ROCTX_VERSION_MINOR; } -ROCTX_API void roctxMarkA(const char* message) { - roctx_api_data_t api_data{}; - api_data.args.roctxMarkA.message = message; - callbacks.Invoke(ROCTX_API_ID_roctxMarkA, &api_data); -} +ROCTX_API void roctxMarkA(const char* message) { ReportActivity(ROCTX_API_ID_roctxMarkA, message); } ROCTX_API int roctxRangePushA(const char* message) { - roctx_api_data_t api_data{}; - api_data.args.roctxRangePushA.message = message; - callbacks.Invoke(ROCTX_API_ID_roctxRangePushA, &api_data); - + ReportActivity(ROCTX_API_ID_roctxRangePushA, message); return nested_range_level++; } ROCTX_API int roctxRangePop() { - roctx_api_data_t api_data{}; - callbacks.Invoke(ROCTX_API_ID_roctxRangePop, &api_data); - + ReportActivity(ROCTX_API_ID_roctxRangePop); if (nested_range_level == 0) return -1; return --nested_range_level; } ROCTX_API roctx_range_id_t roctxRangeStartA(const char* message) { static std::atomic start_stop_range_id(1); - auto id = start_stop_range_id++; - - roctx_api_data_t api_data{}; - api_data.args.roctxRangeStartA.message = message; - api_data.args.roctxRangeStartA.id = id; - callbacks.Invoke(ROCTX_API_ID_roctxRangeStartA, &api_data); - - return id; + auto range_id = start_stop_range_id++; + ReportActivity(ROCTX_API_ID_roctxRangeStartA, message, range_id); + return range_id; } -ROCTX_API void roctxRangeStop(roctx_range_id_t rangeId) { - roctx_api_data_t api_data{}; - api_data.args.roctxRangeStop.id = rangeId; - callbacks.Invoke(ROCTX_API_ID_roctxRangeStop, &api_data); +ROCTX_API void roctxRangeStop(roctx_range_id_t range_id) { + ReportActivity(ROCTX_API_ID_roctxRangeStop, nullptr, range_id); } -extern "C" ROCTX_EXPORT bool 
RegisterApiCallback(uint32_t op, void* callback, void* arg) { - if (op >= ROCTX_API_ID_NUMBER) return false; - callbacks.Set(op, reinterpret_cast(callback), arg); - return true; +extern "C" ROCTX_EXPORT void roctxRegisterTracerCallback(const void* function) { + report_activity.store(reinterpret_cast(function), + std::memory_order_relaxed); } - -extern "C" ROCTX_EXPORT bool RemoveApiCallback(uint32_t op) { - if (op >= ROCTX_API_ID_NUMBER) return false; - callbacks.Set(op, nullptr, nullptr); - return true; -} \ No newline at end of file diff --git a/src/tracer_tool/tracer_tool.cpp b/src/tracer_tool/tracer_tool.cpp index 78912d49..f28c7718 100644 --- a/src/tracer_tool/tracer_tool.cpp +++ b/src/tracer_tool/tracer_tool.cpp @@ -18,17 +18,28 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +#include +#include +#include +#include +#include + +#include #include +#include +#include +#include #include #include -#include #include -#include +#include #include +#include #include #include /* kernel name demangling */ #include +#include #include #include #include @@ -37,58 +48,24 @@ #include #include /* usleep */ - -#include -#include -#include -#include - -#include "util/xml.h" +#include "debug.h" #include "loader.h" #include "trace_buffer.h" +#include "xml.h" + +namespace fs = std::experimental::filesystem; // Macro to check ROC-tracer calls status #define CHECK_ROCTRACER(call) \ do { \ - int err = call; \ - if (err != 0) { \ - std::cerr << roctracer_error_string() << std::endl << std::flush; \ - abort(); \ + if ((call) != ROCTRACER_STATUS_SUCCESS) { \ + fatal(#call " failed: %s", roctracer_error_string()); \ } \ - } while (0) - -#define ONLOAD_TRACE(str) \ - if (getenv("ROCP_ONLOAD_TRACE")) do { \ - std::cout << "PID(" << GetPid() << "): TRACER_TOOL::" << __FUNCTION__ << " " << str \ - << std::endl \ - << std::flush; \ - } while (0); -#define ONLOAD_TRACE_BEG() ONLOAD_TRACE("begin") -#define ONLOAD_TRACE_END() ONLOAD_TRACE("end") - 
-static inline uint32_t GetPid() { return syscall(__NR_getpid); } -static inline uint32_t GetTid() { return syscall(__NR_gettid); } - -#if DEBUG_TRACE_ON -inline static void DEBUG_TRACE(const char* fmt, ...) { - constexpr int size = 256; - char buf[size]; - - va_list valist; - va_start(valist, fmt); - vsnprintf(buf, size, fmt, valist); - printf("%u:%u %s", GetPid(), GetTid(), buf); - fflush(stdout); - va_end(valist); -} -#else -#define DEBUG_TRACE(...) -#endif + } while (false) -thread_local roctracer_timestamp_t hsa_begin_timestamp = 0; -thread_local roctracer_timestamp_t hip_begin_timestamp = 0; +TRACE_BUFFER_INSTANTIATE(); -namespace util { +namespace { inline roctracer_timestamp_t timestamp_ns() { roctracer_timestamp_t timestamp; @@ -96,7 +73,8 @@ inline roctracer_timestamp_t timestamp_ns() { return timestamp; } -} // namespace util +std::vector hsa_api_vec; +std::vector hip_api_vec; bool trace_roctx = false; bool trace_hsa_api = false; @@ -105,93 +83,13 @@ bool trace_hip_api = false; bool trace_hip_activity = false; bool trace_pcs = false; -std::vector hsa_api_vec; -std::vector hip_api_vec; - -LOADER_INSTANTIATE(); -TRACE_BUFFER_INSTANTIATE(); - -// Global output file handle -FILE* begin_ts_file_handle = NULL; -FILE* roctx_file_handle = NULL; -FILE* hsa_api_file_handle = NULL; -FILE* hsa_async_copy_file_handle = NULL; -FILE* hip_api_file_handle = NULL; -FILE* hip_activity_file_handle = NULL; -FILE* pc_sample_file_handle = NULL; - -void close_output_file(FILE* file_handle); -void close_file_handles() { - if (begin_ts_file_handle) close_output_file(begin_ts_file_handle); - if (roctx_file_handle) close_output_file(roctx_file_handle); - if (hsa_api_file_handle) close_output_file(hsa_api_file_handle); - if (hsa_async_copy_file_handle) close_output_file(hsa_async_copy_file_handle); - if (hip_api_file_handle) close_output_file(hip_api_file_handle); - if (hip_activity_file_handle) close_output_file(hip_activity_file_handle); - if (pc_sample_file_handle) 
close_output_file(pc_sample_file_handle); -} - -static const uint32_t my_pid = GetPid(); - -// Error handler -void fatal(const std::string msg) { - close_file_handles(); - fflush(stdout); - fprintf(stderr, "%s\n\n", msg.c_str()); - fflush(stderr); - abort(); +uint32_t GetPid() { + static uint32_t pid = syscall(__NR_getpid); + return pid; } -/* The function extracts the kernel name from -input string. By using the iterators it finds the -window in the string which contains only the kernel name. -For example 'Foo::foo(a[], int (int))' -> 'foo'*/ -std::string truncate_name(const std::string& name) { - auto rit = name.rbegin(); - auto rend = name.rend(); - uint32_t counter = 0; - char open_token = 0; - char close_token = 0; - while (rit != rend) { - if (counter == 0) { - switch (*rit) { - case ')': - counter = 1; - open_token = ')'; - close_token = '('; - break; - case '>': - counter = 1; - open_token = '>'; - close_token = '<'; - break; - case ']': - counter = 1; - open_token = ']'; - close_token = '['; - break; - case ' ': - ++rit; - continue; - } - if (counter == 0) break; - } else { - if (*rit == open_token) counter++; - if (*rit == close_token) counter--; - } - ++rit; - } - auto rbeg = rit; - while ((rit != rend) && (*rit != ' ') && (*rit != ':')) rit++; - return name.substr(rend - rit, rit - rbeg); -} -// C++ symbol demangle -static inline std::string cxx_demangle(const std::string& symbol) { - int status; - char* demangled = abi::__cxa_demangle(symbol.c_str(), nullptr, nullptr, &status); - if (status != 0) return symbol; - std::string ret(demangled); - free(demangled); - return ret; +uint32_t GetTid() { + static thread_local uint32_t tid = syscall(__NR_gettid); + return tid; } // Tracing control thread @@ -230,55 +128,105 @@ void flush_thr_fun() { } } +class roctracer_plugin_t { + public: + roctracer_plugin_t(const std::string& plugin_path) { + plugin_handle_ = dlopen(plugin_path.c_str(), RTLD_LAZY); + if (plugin_handle_ == nullptr) { + warning("dlopen(\"%s\") 
failed: %s", plugin_path.c_str(), dlerror()); + return; + } + + roctracer_plugin_write_callback_record_ = + reinterpret_cast( + dlsym(plugin_handle_, "roctracer_plugin_write_callback_record")); + if (!roctracer_plugin_write_callback_record_) return; + + roctracer_plugin_write_activity_records_ = + reinterpret_cast( + dlsym(plugin_handle_, "roctracer_plugin_write_activity_records")); + if (!roctracer_plugin_write_activity_records_) return; + + roctracer_plugin_finalize_ = reinterpret_cast( + dlsym(plugin_handle_, "roctracer_plugin_finalize")); + if (!roctracer_plugin_finalize_) return; + + if (auto* initialize = reinterpret_cast( + dlsym(plugin_handle_, "roctracer_plugin_initialize")); + initialize != nullptr) + valid_ = initialize(ROCTRACER_VERSION_MAJOR, ROCTRACER_VERSION_MINOR) == 0; + } + + ~roctracer_plugin_t() { + if (is_valid()) roctracer_plugin_finalize_(); + if (plugin_handle_ != nullptr) dlclose(plugin_handle_); + } + + bool is_valid() const { return valid_; } + + template auto write_callback_record(Args... args) const { + assert(is_valid()); + return roctracer_plugin_write_callback_record_(std::forward(args)...); + } + template auto write_activity_records(Args... 
args) const { + assert(is_valid()); + return roctracer_plugin_write_activity_records_(std::forward(args)...); + } + + private: + bool valid_{false}; + void* plugin_handle_; + + decltype(roctracer_plugin_finalize)* roctracer_plugin_finalize_; + decltype(roctracer_plugin_write_callback_record)* roctracer_plugin_write_callback_record_; + decltype(roctracer_plugin_write_activity_records)* roctracer_plugin_write_activity_records_; +}; + +std::optional plugin; + +} // namespace + /////////////////////////////////////////////////////////////////////////////////////////////////////// // rocTX annotation tracing struct roctx_trace_entry_t { std::atomic valid; - uint32_t cid; - roctracer_timestamp_t time; - uint32_t pid; - uint32_t tid; - roctx_range_id_t rid; - const char* message; - - roctx_trace_entry_t(uint32_t cid_, roctracer_timestamp_t time_, uint32_t pid_, uint32_t tid_, - roctx_range_id_t rid_, const char* message_) - : valid(roctracer::TRACE_ENTRY_INIT), - cid(cid_), - time(time_), - pid(pid_), - tid(tid_), - rid(rid_), - message(message_ != nullptr ? strdup(message_) : nullptr) {} + roctracer_record_t record; + union { + roctx_api_data_t data; + }; + + roctx_trace_entry_t(uint32_t cid, roctracer_timestamp_t time, uint32_t pid, uint32_t tid, + roctx_range_id_t rid, const char* message) + : valid(roctracer::TRACE_ENTRY_INIT) { + record.domain = ACTIVITY_DOMAIN_ROCTX; + record.op = cid; + record.kind = 0; + record.begin_ns = time; + record.end_ns = 0; + record.process_id = pid; + record.thread_id = tid; + data.args.message = message != nullptr ? 
strdup(message) : nullptr; + data.args.id = rid; + } ~roctx_trace_entry_t() { - if (message != nullptr) free(const_cast(message)); + if (data.args.message != nullptr) free(const_cast(data.args.message)); } }; -// rocTX buffer flush function -void roctx_flush_cb(roctx_trace_entry_t* entry) { - std::ostringstream os; - os << entry->time << " " << entry->pid << ":" << entry->tid << " " << entry->cid << ":" - << entry->rid; - if (entry->message != NULL) - os << ":\"" << entry->message << "\""; - else - os << ":\"\""; - fprintf(roctx_file_handle, "%s\n", os.str().c_str()); - fflush(roctx_file_handle); -} - -roctracer::TraceBuffer roctx_trace_buffer("rocTX API", 0x200000, - roctx_flush_cb); +roctracer::TraceBuffer roctx_trace_buffer( + "rocTX API", 0x200000, [](roctx_trace_entry_t* entry) { + assert(plugin && "plugin is not initialized"); + plugin->write_callback_record(&entry->record, &entry->data); + }); // rocTX callback function void roctx_api_callback(uint32_t domain, uint32_t cid, const void* callback_data, void* /* user_arg */) { const roctx_api_data_t* data = reinterpret_cast(callback_data); - roctx_trace_entry_t& entry = roctx_trace_buffer.Emplace( - cid, util::timestamp_ns(), GetPid(), GetTid(), data->args.id, data->args.message); + roctx_trace_entry_t& entry = roctx_trace_buffer.Emplace(cid, timestamp_ns(), GetPid(), GetTid(), + data->args.id, data->args.message); entry.valid.store(roctracer::TRACE_ENTRY_COMPLETE, std::memory_order_release); } @@ -287,35 +235,31 @@ void roctx_api_callback(uint32_t domain, uint32_t cid, const void* callback_data struct hsa_api_trace_entry_t { std::atomic valid; - uint32_t cid; - roctracer_timestamp_t begin; - roctracer_timestamp_t end; - uint32_t pid; - uint32_t tid; - hsa_api_data_t data; - - hsa_api_trace_entry_t(uint32_t cid_, roctracer_timestamp_t begin_, roctracer_timestamp_t end_, - uint32_t pid_, uint32_t tid_, const hsa_api_data_t& data_) - : valid(roctracer::TRACE_ENTRY_INIT), - cid(cid_), - begin(begin_), - end(end_), 
- pid(pid_), - tid(tid_), - data(data_) {} + roctracer_record_t record; + union { + hsa_api_data_t data; + }; + + hsa_api_trace_entry_t(uint32_t cid, roctracer_timestamp_t begin, roctracer_timestamp_t end, + uint32_t pid, uint32_t tid, const hsa_api_data_t& hsa_api_data) + : valid(roctracer::TRACE_ENTRY_INIT) { + record.domain = ACTIVITY_DOMAIN_HSA_API; + record.op = cid; + record.kind = 0; + record.begin_ns = begin; + record.end_ns = end; + record.process_id = pid; + record.thread_id = tid; + data = hsa_api_data; + } ~hsa_api_trace_entry_t() {} }; -void hsa_api_flush_cb(hsa_api_trace_entry_t* entry) { - std::ostringstream os; - os << entry->begin << ":" << entry->end << " " << entry->pid << ":" << entry->tid << " " - << hsa_api_data_pair_t(entry->cid, entry->data) << " :" << entry->data.correlation_id; - fprintf(hsa_api_file_handle, "%s\n", os.str().c_str()); - fflush(hsa_api_file_handle); -} - -roctracer::TraceBuffer hsa_api_trace_buffer("HSA API", 0x200000, - hsa_api_flush_cb); +roctracer::TraceBuffer hsa_api_trace_buffer( + "HSA API", 0x200000, [](hsa_api_trace_entry_t* entry) { + assert(plugin && "plugin is not initialized"); + plugin->write_callback_record(&entry->record, &entry->data); + }); // HSA API callback function @@ -323,12 +267,13 @@ void hsa_api_callback(uint32_t domain, uint32_t cid, const void* callback_data, (void)arg; const hsa_api_data_t* data = reinterpret_cast(callback_data); if (data->phase == ACTIVITY_API_PHASE_ENTER) { - hsa_begin_timestamp = util::timestamp_ns(); + *data->phase_data = timestamp_ns(); } else { + const roctracer_timestamp_t begin_timestamp = *data->phase_data; const roctracer_timestamp_t end_timestamp = - (cid == HSA_API_ID_hsa_shut_down) ? hsa_begin_timestamp : util::timestamp_ns(); - hsa_api_trace_entry_t& entry = hsa_api_trace_buffer.Emplace( - cid, hsa_begin_timestamp, end_timestamp, GetPid(), GetTid(), *data); + (cid == HSA_API_ID_hsa_shut_down) ? 
begin_timestamp : timestamp_ns(); + hsa_api_trace_entry_t& entry = hsa_api_trace_buffer.Emplace(cid, begin_timestamp, end_timestamp, + GetPid(), GetTid(), *data); entry.valid.store(roctracer::TRACE_ENTRY_COMPLETE, std::memory_order_release); } } @@ -338,32 +283,28 @@ void hsa_api_callback(uint32_t domain, uint32_t cid, const void* callback_data, struct hip_api_trace_entry_t { std::atomic valid; - activity_domain_t domain; - uint32_t cid; - roctracer_timestamp_t begin; - roctracer_timestamp_t end; - uint32_t pid; - uint32_t tid; - hip_api_data_t data; - const char* name; - void* ptr; - - hip_api_trace_entry_t(activity_domain_t domain_, uint32_t cid_, roctracer_timestamp_t begin_, - roctracer_timestamp_t end_, uint32_t pid_, uint32_t tid_, - const hip_api_data_t& data_, const char* name_, void* ptr_) - : valid(roctracer::TRACE_ENTRY_INIT), - domain(domain_), - cid(cid_), - begin(begin_), - end(end_), - pid(pid_), - tid(tid_), - data(data_), - name(name_ != nullptr ? strdup(name_) : nullptr), - ptr(ptr_) {} + roctracer_record_t record; + union { + hip_api_data_t data; + }; + + hip_api_trace_entry_t(uint32_t cid, roctracer_timestamp_t begin, roctracer_timestamp_t end, + uint32_t pid, uint32_t tid, const hip_api_data_t& hip_api_data, + const char* name) + : valid(roctracer::TRACE_ENTRY_INIT) { + record.domain = ACTIVITY_DOMAIN_HIP_API; + record.op = cid; + record.kind = 0; + record.begin_ns = begin; + record.end_ns = end; + record.process_id = pid; + record.thread_id = tid; + data = hip_api_data; + record.kernel_name = name ? 
strdup(name) : nullptr; + } ~hip_api_trace_entry_t() { - if (name != nullptr) free(const_cast(name)); + if (record.kernel_name != nullptr) free(const_cast(record.kernel_name)); } }; @@ -451,128 +392,32 @@ static std::optional getKernelName(uint32_t cid, const hip_api_data function); } -void hip_api_flush_cb(hip_api_trace_entry_t* entry) { - const uint32_t domain = entry->domain; - const uint32_t cid = entry->cid; - const hip_api_data_t* data = &(entry->data); - const uint64_t correlation_id = data->correlation_id; - const roctracer_timestamp_t begin_timestamp = entry->begin; - const roctracer_timestamp_t end_timestamp = entry->end; - std::ostringstream rec_ss; - std::ostringstream oss; - - const char* str = - (domain != ACTIVITY_DOMAIN_EXT_API) ? roctracer_op_string(domain, cid, 0) : strdup("MARK"); - rec_ss << std::dec << begin_timestamp << ":" << end_timestamp << " " << entry->pid << ":" - << entry->tid; - oss << std::dec << rec_ss.str() << " " << str; - - DEBUG_TRACE( - "hip_api_flush_cb(\"%s\"): domain(%u) cid(%u) entry(%p) name(\"%s\" correlation_id(%lu) " - "beg(%lu) end(%lu))\n", - roctracer_op_string(entry->domain, entry->cid, 0), entry->domain, entry->cid, entry, - entry->name, correlation_id, begin_timestamp, end_timestamp); - - if (domain == ACTIVITY_DOMAIN_HIP_API) { - const char* str = hipApiString((hip_api_id_t)cid, data); - rec_ss << " " << str; - if (entry->name) { - static bool truncate = []() { - const char* env_var = getenv("ROCP_TRUNCATE_NAMES"); - return env_var && std::atoi(env_var) != 0; - }(); - - std::string kernel_name(cxx_demangle(entry->name)); - if (truncate) kernel_name = truncate_name(kernel_name); - rec_ss << " kernel=" << kernel_name; - } - rec_ss << " :" << correlation_id; - fprintf(hip_api_file_handle, "%s\n", rec_ss.str().c_str()); - } else { - fprintf(hip_api_file_handle, "%s(name(%s))\n", oss.str().c_str(), entry->name); - } - fflush(hip_api_file_handle); -} - -roctracer::TraceBuffer hip_api_trace_buffer("HIP API", 0x200000, - 
hip_api_flush_cb); +roctracer::TraceBuffer hip_api_trace_buffer( + "HIP API", 0x200000, [](hip_api_trace_entry_t* entry) { + assert(plugin && "plugin is not initialized"); + plugin->write_callback_record(&entry->record, &entry->data); + }); void hip_api_callback(uint32_t domain, uint32_t cid, const void* callback_data, void* arg) { (void)arg; const hip_api_data_t* data = reinterpret_cast(callback_data); - const roctracer_timestamp_t timestamp = util::timestamp_ns(); + const roctracer_timestamp_t timestamp = timestamp_ns(); + std::optional kernel_name; if (data->phase == ACTIVITY_API_PHASE_ENTER) { - hip_begin_timestamp = timestamp; + *data->phase_data = timestamp; } else { // Post init of HIP APU args hipApiArgsInit((hip_api_id_t)cid, const_cast(data)); - auto kernel_name = getKernelName(cid, data); - hip_api_trace_entry_t& entry = hip_api_trace_buffer.Emplace( - static_cast(domain), cid, hip_begin_timestamp, timestamp, GetPid(), - GetTid(), *data, kernel_name ? kernel_name->c_str() : nullptr, - cid == HIP_API_ID_hipMalloc ? data->args.hipMalloc.ptr : nullptr); + kernel_name = getKernelName(cid, data); + hip_api_trace_entry_t& entry = + hip_api_trace_buffer.Emplace(cid, *data->phase_data, timestamp, GetPid(), GetTid(), *data, + kernel_name ? kernel_name->c_str() : nullptr); entry.valid.store(roctracer::TRACE_ENTRY_COMPLETE, std::memory_order_release); } - - DEBUG_TRACE( - "hip_api_callback(\"%s\") phase(%d): cid(%u) data(%p) entry(%p) name(\"%s\") " - "correlation_id(%lu) timestamp(%lu)\n", - roctracer_op_string(domain, cid, 0), data->phase, cid, data, entry, - (entry.name != nullptr) ? 
entry.name : "", data->correlation_id, timestamp); -} - -void mark_api_callback(uint32_t domain, uint32_t cid, const void* callback_data, void* arg) { - (void)arg; - const char* name = reinterpret_cast(callback_data); - - const roctracer_timestamp_t timestamp = util::timestamp_ns(); - hip_api_trace_entry_t& entry = hip_api_trace_buffer.Emplace( - static_cast(domain), cid, timestamp, timestamp + 1, GetPid(), GetTid(), - hip_api_data_t{}, name, nullptr); - entry.valid.store(roctracer::TRACE_ENTRY_COMPLETE, std::memory_order_release); } /////////////////////////////////////////////////////////////////////////////////////////////////////// -// Activity tracing callback -// hipMalloc id(3) correlation_id(1): begin_ns(1525888652762640464) end_ns(1525888652762877067) -void pool_activity_callback(const char* begin, const char* end, void* arg) { - const roctracer_record_t* record = reinterpret_cast(begin); - const roctracer_record_t* end_record = reinterpret_cast(end); - - while (record < end_record) { - const char* name = roctracer_op_string(record->domain, record->op, record->kind); - DEBUG_TRACE( - "pool_activity_callback(\"%s\"): domain(%u) op(%u) kind(%u) record(%p) correlation_id(%lu) " - "beg(%lu) end(%lu)\n", - name, record->domain, record->op, record->kind, record, record->correlation_id, - record->begin_ns, record->end_ns); - - switch (record->domain) { - case ACTIVITY_DOMAIN_HIP_OPS: - fprintf(hip_activity_file_handle, "%lu:%lu %d:%lu %s:%lu:%u\n", record->begin_ns, - record->end_ns, record->device_id, record->queue_id, name, record->correlation_id, - my_pid); - fflush(hip_activity_file_handle); - break; - case ACTIVITY_DOMAIN_HSA_OPS: - if (record->op == HSA_OP_ID_COPY) { - fprintf(hsa_async_copy_file_handle, "%lu:%lu async-copy:%lu:%u\n", record->begin_ns, - record->end_ns, record->correlation_id, my_pid); - fflush(hsa_async_copy_file_handle); - } else if (record->op == HSA_OP_ID_RESERVED1) { - fprintf(pc_sample_file_handle, "%u %lu 0x%lx %s\n", 
record->pc_sample.se, - record->pc_sample.cycle, record->pc_sample.pc, name); - fflush(pc_sample_file_handle); - } - break; - } - CHECK_ROCTRACER(roctracer_next_record(record, &record)); - } -} - -/////////////////////////////////////////////////////////////////////////////////////////////////////// - // Input parser std::string normalize_token(const std::string& token, bool not_empty, const std::string& label) { const std::string space_chars_set = " \t"; @@ -593,30 +438,30 @@ std::string normalize_token(const std::string& token, bool not_empty, const std: } if (((first_pos != std::string::npos) && (norm_len == 0)) || ((first_pos == std::string::npos) && not_empty)) { - fatal("normalize_token error, " + label + ": '" + token + "'," + error_str); + error("normalize_token error: %s", error_str.c_str()); } return (norm_len != 0) ? token.substr(first_pos, norm_len) : std::string(""); } int get_xml_array(const xml::Xml::level_t* node, const std::string& field, const std::string& delim, - std::vector* vec, const char* label = NULL) { + std::vector* vec, const char* label = nullptr) { int parse_iter = 0; const auto& opts = node->opts; auto it = opts.find(field); if (it != opts.end()) { - const std::string array_string = it->second; - if (label != NULL) printf("%s%s = %s\n", label, field.c_str(), array_string.c_str()); + const std::string& array_string = it->second; + if (label != nullptr) std::cout << label << field << " = " << array_string << std::endl; size_t pos1 = 0; - const size_t string_len = array_string.length(); + size_t string_len = array_string.length(); while (pos1 < string_len) { // set pos2 such that it also handles case of multiple delimiter options. // For example- "hipLaunchKernel, hipExtModuleLaunchKernel, hipMemsetAsync" // in this example delimiters are ' ' and also ',' - const size_t pos2 = array_string.find_first_of(delim, pos1); - const bool found = (pos2 != std::string::npos); - const size_t token_len = (pos2 != std::string::npos) ? 
pos2 - pos1 : string_len - pos1; - const std::string token = array_string.substr(pos1, token_len); - const std::string norm_str = normalize_token(token, found, "get_xml_array"); + size_t pos2 = array_string.find_first_of(delim, pos1); + bool found = (pos2 != std::string::npos); + size_t token_len = (pos2 != std::string::npos) ? pos2 - pos1 : string_len - pos1; + std::string token = array_string.substr(pos1, token_len); + std::string norm_str = normalize_token(token, found, "get_xml_array"); if (norm_str.length() != 0) vec->push_back(norm_str); if (!found) break; // update pos2 such that it represents the first non-delimiter character @@ -628,41 +473,16 @@ int get_xml_array(const xml::Xml::level_t* node, const std::string& field, const return parse_iter; } -// Open output file -FILE* open_output_file(const char* prefix, const char* name, const char** path = NULL) { - FILE* file_handle = NULL; - if (path != NULL) *path = NULL; - - if (prefix != NULL) { - std::ostringstream oss; - oss << prefix << "/" << GetPid() << "_" << name; - file_handle = fopen(oss.str().c_str(), "w"); - if (file_handle == NULL) { - std::ostringstream errmsg; - errmsg << "ROCTracer: fopen error, file '" << oss.str().c_str() << "'"; - perror(errmsg.str().c_str()); - abort(); - } - - if (path != NULL) *path = strdup(oss.str().c_str()); - } else - file_handle = stdout; - return file_handle; -} - -void close_output_file(FILE* file_handle) { - if (file_handle != NULL) { - fflush(file_handle); - if (file_handle != stdout) fclose(file_handle); - } -} - // Allocating tracing pool void open_tracing_pool() { - if (roctracer_default_pool() == NULL) { + if (roctracer_default_pool() == nullptr) { roctracer_properties_t properties{}; properties.buffer_size = 0x80000; - properties.buffer_callback_fun = pool_activity_callback; + properties.buffer_callback_fun = [](const char* begin, const char* end, void* /* arg */) { + assert(plugin && "plugin is not initialized"); + 
plugin->write_activity_records(reinterpret_cast(begin), + reinterpret_cast(end)); + }; CHECK_ROCTRACER(roctracer_open_pool(&properties)); } } @@ -680,8 +500,6 @@ static bool is_loaded = false; // tool unload method void tool_unload() { - ONLOAD_TRACE("begin, loaded(" << is_loaded << ")"); - if (is_loaded == false) return; is_loaded = false; @@ -717,34 +535,24 @@ void tool_unload() { // Flush tracing pool close_tracing_pool(); roctracer::TraceBufferBase::FlushAll(); - - close_file_handles(); - - ONLOAD_TRACE_END(); } // tool load method void tool_load() { - ONLOAD_TRACE("begin, loaded(" << is_loaded << ")"); - if (is_loaded == true) return; is_loaded = true; - // Output file - const char* output_prefix = getenv("ROCP_OUTPUT_DIR"); - if (output_prefix != NULL) { - DIR* dir = opendir(output_prefix); - if (dir == NULL) { - std::ostringstream errmsg; - errmsg << "ROCTracer: Cannot open output directory '" << output_prefix << "'"; - perror(errmsg.str().c_str()); - abort(); - } + // Load output plugin + const char* plugin_name = getenv("ROCTRACER_PLUGIN_LIB"); + if (plugin_name == nullptr) plugin_name = "libfile_plugin.so"; + if (Dl_info dl_info; dladdr((void*)tool_load, &dl_info) != 0) { + if (!plugin.emplace(fs::path(dl_info.dli_fname).replace_filename(plugin_name)).is_valid()) + plugin.reset(); } // API traces switches const char* trace_domain = getenv("ROCTRACER_DOMAIN"); - if (trace_domain != NULL) { + if (trace_domain != nullptr) { // ROCTX domain if (std::string(trace_domain).find("roctx") != std::string::npos) { trace_roctx = true; @@ -771,28 +579,24 @@ void tool_load() { } } - printf("ROCTracer (pid=%d): ", (int)GetPid()); - fflush(stdout); + std::cout << "ROCtracer (" << std::dec << GetPid() << "):"; // XML input const char* xml_name = getenv("ROCP_INPUT"); - if (xml_name != NULL) { + if (xml_name != nullptr) { xml::Xml* xml = xml::Xml::Create(xml_name); - if (xml == NULL) { - fprintf(stderr, "ROCTracer: Input file not found '%s'\n", xml_name); - abort(); - } + 
if (xml == nullptr) error("input file not found '%s'", xml_name); bool found = false; for (const auto* entry : xml->GetNodes("top.trace")) { auto it = entry->opts.find("name"); - if (it == entry->opts.end()) fatal("ROCTracer: trace name is missing"); + if (it == entry->opts.end()) error("trace name is missing"); const std::string& name = it->second; std::vector api_vec; for (const auto* node : entry->nodes) { if (node->tag != "parameters") - fatal("ROCTracer: trace node is not supported '" + name + ":" + node->tag + "'"); + error("trace node is not supported '%s:%%%s'", name.c_str(), node->tag.c_str()); get_xml_array(node, "api", ", ", &api_vec); // delimiter options given as both spaces and commas (' ' and ',') break; @@ -819,33 +623,30 @@ void tool_load() { } } - if (found) printf("input from \"%s\"", xml_name); + if (found) std::cout << " input from \"" << xml_name << "\""; } - printf("\n"); + std::cout << std::endl; // Disable HIP activity if HSA activity was set if (trace_hsa_activity == true) trace_hip_activity = false; // Enable rpcTX callbacks if (trace_roctx) { - roctx_file_handle = open_output_file(output_prefix, "roctx_trace.txt"); - // initialize HSA tracing - fprintf(stdout, " rocTX-trace()\n"); - fflush(stdout); + std::cout << " rocTX-trace()" << std::endl; CHECK_ROCTRACER( - roctracer_enable_domain_callback(ACTIVITY_DOMAIN_ROCTX, roctx_api_callback, NULL)); + roctracer_enable_domain_callback(ACTIVITY_DOMAIN_ROCTX, roctx_api_callback, nullptr)); } const char* ctrl_str = getenv("ROCP_CTRL_RATE"); - if (ctrl_str != NULL) { + if (ctrl_str != nullptr) { uint32_t ctrl_delay = 0; uint32_t ctrl_len = 0; uint32_t ctrl_rate = 0; if (sscanf(ctrl_str, "%d:%d:%d", &ctrl_delay, &ctrl_len, &ctrl_rate) != 3 || ctrl_len > ctrl_rate) - fatal("Invalid ROCP_CTRL_RATE variable (ctrl_delay:ctrl_len:ctrl_rate)"); + error("invalid ROCP_CTRL_RATE variable (ctrl_delay:ctrl_len:ctrl_rate)"); control_dist_us = ctrl_rate - ctrl_len; control_len_us = ctrl_len; @@ -854,42 +655,39 
@@ void tool_load() { roctracer_stop(); if (ctrl_delay != UINT32_MAX) { - fprintf(stdout, "ROCTracer: trace control: delay(%uus), length(%uus), rate(%uus)\n", - ctrl_delay, ctrl_len, ctrl_rate); - fflush(stdout); + std::cout << "ROCtracer: trace control: delay(" << ctrl_delay << "us), length(" << ctrl_len + << "us), rate(" << ctrl_rate << "us)" << std::endl; trace_period_thread = new std::thread(trace_period_fun); } else { - fprintf(stdout, "ROCTracer: trace start disabled\n"); - fflush(stdout); + std::cout << "ROCtracer: trace start disabled" << std::endl; } } const char* flush_str = getenv("ROCP_FLUSH_RATE"); - if (flush_str != NULL) { + if (flush_str != nullptr) { sscanf(flush_str, "%d", &control_flush_us); - if (control_flush_us == 0) { - fprintf(stderr, "ROCTracer: control flush rate bad value\n"); - abort(); - } + if (control_flush_us == 0) error("invalid control flush rate value '%s'", flush_str); - fprintf(stdout, "ROCTracer: trace control flush rate(%uus)\n", control_flush_us); - fflush(stdout); + std::cout << "ROCtracer: trace control flush rate(" << control_flush_us << "us)" << std::endl; flush_thread = new std::thread(flush_thr_fun); } - - ONLOAD_TRACE_END(); } extern "C" { // The HSA_AMD_TOOL_PRIORITY variable must be a constant value type initialized by the loader -// itself, not by code during _init. 'extern const' seems do that although that is not a guarantee. +// itself, not by code during _init. 'extern const' seems to do that although that is not a +// guarantee. 
ROCTRACER_EXPORT extern const uint32_t HSA_AMD_TOOL_PRIORITY = 1050; // HSA-runtime tool on-load method ROCTRACER_EXPORT bool OnLoad(HsaApiTable* table, uint64_t runtime_version, uint64_t failed_tool_count, const char* const* failed_tool_names) { - ONLOAD_TRACE_BEG(); + if (roctracer_version_major() != ROCTRACER_VERSION_MAJOR || + roctracer_version_minor() < ROCTRACER_VERSION_MINOR) { + warning("the ROCtracer API version is not compatible with this tool"); + return true; + } tool_load(); @@ -897,121 +695,85 @@ ROCTRACER_EXPORT bool OnLoad(HsaApiTable* table, uint64_t runtime_version, // application before exiting, so register an atexit handler to unload the tool. std::atexit(tool_unload); - const char* output_prefix = getenv("ROCP_OUTPUT_DIR"); - - // Dumping HSA handles for agents - FILE* handles_file_handle = open_output_file(output_prefix, "hsa_handles.txt"); - auto iterate_agent_data = std::make_pair(table, handles_file_handle); - - [[maybe_unused]] hsa_status_t status = table->core_->hsa_iterate_agents_fn( - [](hsa_agent_t agent, void* user_data) { - auto [hsa_api_table, hsa_handles_file] = - *reinterpret_cast(user_data); - hsa_device_type_t type; - - if (hsa_api_table->core_->hsa_agent_get_info_fn(agent, HSA_AGENT_INFO_DEVICE, &type) != - HSA_STATUS_SUCCESS) - return HSA_STATUS_ERROR; - - fprintf(hsa_handles_file, "0x%lx agent %s\n", agent.handle, - (type == HSA_DEVICE_TYPE_CPU) ? 
"cpu" : "gpu"); - return HSA_STATUS_SUCCESS; - }, - &iterate_agent_data); - assert(status == HSA_STATUS_SUCCESS && "failed to iterate HSA agents"); - - close_output_file(handles_file_handle); - - // App begin timestamp begin_ts_file.txt - begin_ts_file_handle = open_output_file(output_prefix, "begin_ts_file.txt"); - const roctracer_timestamp_t app_start_time = util::timestamp_ns(); - fprintf(begin_ts_file_handle, "%lu\n", app_start_time); - // Enable HSA API callbacks/activity if (trace_hsa_api) { - hsa_api_file_handle = open_output_file(output_prefix, "hsa_api_trace.txt"); - - fprintf(stdout, " HSA-trace("); - fflush(stdout); + std::ostringstream out; + out << " HSA-trace("; if (hsa_api_vec.size() != 0) { + out << "-*"; for (unsigned i = 0; i < hsa_api_vec.size(); ++i) { uint32_t cid = HSA_API_ID_NUMBER; const char* api = hsa_api_vec[i].c_str(); - CHECK_ROCTRACER(roctracer_op_code(ACTIVITY_DOMAIN_HSA_API, api, &cid, NULL)); - CHECK_ROCTRACER( - roctracer_enable_op_callback(ACTIVITY_DOMAIN_HSA_API, cid, hsa_api_callback, NULL)); - printf(" %s", api); + if (roctracer_op_code(ACTIVITY_DOMAIN_HSA_API, api, &cid, nullptr) == + ROCTRACER_STATUS_SUCCESS && + roctracer_enable_op_callback(ACTIVITY_DOMAIN_HSA_API, cid, hsa_api_callback, nullptr) == + ROCTRACER_STATUS_SUCCESS) + out << ' ' << api; + else + warning("Unable to enable HSA_API tracing for invalid operation %s", api); } } else { CHECK_ROCTRACER( - roctracer_enable_domain_callback(ACTIVITY_DOMAIN_HSA_API, hsa_api_callback, NULL)); + roctracer_enable_domain_callback(ACTIVITY_DOMAIN_HSA_API, hsa_api_callback, nullptr)); + out << "*"; } - printf(")\n"); + std::cout << out.str() << ')' << std::endl; } // Enable HSA GPU activity if (trace_hsa_activity) { - hsa_async_copy_file_handle = open_output_file(output_prefix, "async_copy_trace.txt"); - // Allocating tracing pool open_tracing_pool(); - fprintf(stdout, " HSA-activity-trace()\n"); - fflush(stdout); + std::cout << " HSA-activity-trace()" << std::endl; 
CHECK_ROCTRACER(roctracer_enable_op_activity(ACTIVITY_DOMAIN_HSA_OPS, HSA_OP_ID_COPY)); } // Enable HIP API callbacks/activity if (trace_hip_api || trace_hip_activity) { - fprintf(stdout, " HIP-trace()\n"); - fflush(stdout); - // roctracer properties - roctracer_set_properties(ACTIVITY_DOMAIN_HIP_API, (void*)mark_api_callback); + std::ostringstream out; + out << " HIP-trace("; // Allocating tracing pool open_tracing_pool(); // Enable tracing if (trace_hip_api) { - hip_api_file_handle = open_output_file(output_prefix, "hip_api_trace.txt"); if (hip_api_vec.size() != 0) { + out << "-*"; for (unsigned i = 0; i < hip_api_vec.size(); ++i) { uint32_t cid = HIP_API_ID_NONE; const char* api = hip_api_vec[i].c_str(); - CHECK_ROCTRACER(roctracer_op_code(ACTIVITY_DOMAIN_HIP_API, api, &cid, NULL)); - CHECK_ROCTRACER( - roctracer_enable_op_callback(ACTIVITY_DOMAIN_HIP_API, cid, hip_api_callback, NULL)); - printf(" %s", api); + if (roctracer_op_code(ACTIVITY_DOMAIN_HIP_API, api, &cid, nullptr) == + ROCTRACER_STATUS_SUCCESS && + roctracer_enable_op_callback(ACTIVITY_DOMAIN_HIP_API, cid, hip_api_callback, + nullptr) == ROCTRACER_STATUS_SUCCESS) + out << ' ' << api; + else + warning("Unable to enable HIP_API tracing for invalid operation %s", api); } } else { CHECK_ROCTRACER( - roctracer_enable_domain_callback(ACTIVITY_DOMAIN_HIP_API, hip_api_callback, NULL)); + roctracer_enable_domain_callback(ACTIVITY_DOMAIN_HIP_API, hip_api_callback, nullptr)); + out << "*"; } } if (trace_hip_activity) { - hip_activity_file_handle = open_output_file(output_prefix, "hcc_ops_trace.txt"); CHECK_ROCTRACER(roctracer_enable_domain_activity(ACTIVITY_DOMAIN_HIP_OPS)); } + std::cout << out.str() << ')' << std::endl; } // Enable PC sampling if (trace_pcs) { - fprintf(stdout, " PCS-trace()\n"); - fflush(stdout); + std::cout << " PCS-trace()" << std::endl; open_tracing_pool(); - pc_sample_file_handle = open_output_file(output_prefix, "pcs_trace.txt"); 
CHECK_ROCTRACER(roctracer_enable_op_activity(ACTIVITY_DOMAIN_HSA_OPS, HSA_OP_ID_RESERVED1)); } - - ONLOAD_TRACE_END(); return true; } // HSA-runtime on-unload method -ROCTRACER_EXPORT void OnUnload() { - ONLOAD_TRACE_BEG(); - tool_unload(); - ONLOAD_TRACE_END(); -} +ROCTRACER_EXPORT void OnUnload() { tool_unload(); } } // extern "C" diff --git a/src/util/callback_table.h b/src/util/callback_table.h deleted file mode 100644 index 5165ef54..00000000 --- a/src/util/callback_table.h +++ /dev/null @@ -1,70 +0,0 @@ -/* Copyright (c) 2018-2022 Advanced Micro Devices, Inc. - - Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - - The above copyright notice and this permission notice shall be included in - all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - THE SOFTWARE. 
*/ - -#ifndef UTIL_CALLBACK_TABLE_H_ -#define UTIL_CALLBACK_TABLE_H_ - -#include "ext/prof_protocol.h" - -#include -#include -#include -#include -#include - -namespace roctracer::util { - -// Generic callbacks table -template class CallbackTable { - public: - CallbackTable() - // Zero initialize the callbacks array as the function pointer is used to determine if the - // callback is enabled. - : callbacks_() {} - - void Set(uint32_t callback_id, activity_rtapi_callback_t callback_function, void* user_arg) { - assert(callback_id < N && "callback_id is out of range"); - std::lock_guard lock(mutex_); - auto& callback = callbacks_[callback_id]; - callback.first.store(callback_function, std::memory_order_relaxed); - callback.second = user_arg; - } - - auto Get(uint32_t callback_id) const { - assert(callback_id < N && "id is out of range"); - std::lock_guard lock(mutex_); - auto& callback = callbacks_[callback_id]; - return std::make_pair(callback.first.load(std::memory_order_relaxed), callback.second); - } - - template void Invoke(uint32_t callback_id, Args... args) { - if (callbacks_[callback_id].first.load(std::memory_order_relaxed) == nullptr) return; - if (auto [callback_function, user_arg] = Get(callback_id); callback_function != nullptr) - callback_function(Domain, callback_id, std::forward(args)..., user_arg); - } - - private: - std::array, void*>, N> callbacks_; - mutable std::mutex mutex_; -}; - -} // namespace roctracer::util - -#endif // UTIL_CALLBACK_TABLE_H_ diff --git a/src/util/debug.cpp b/src/util/debug.cpp new file mode 100644 index 00000000..9d5ec3b2 --- /dev/null +++ b/src/util/debug.cpp @@ -0,0 +1,125 @@ +/* Copyright (c) 2022 Advanced Micro Devices, Inc. 
+ + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. */ + +#include "debug.h" +#include "util.h" + +#include +#include +#include +#include +#include + +#if defined(ENABLE_BACKTRACE) + +#include +#include + +namespace { + +struct BackTraceInfo { + struct ::backtrace_state* state = nullptr; + std::stringstream sstream{}; + int depth = 0; + int error = 0; +}; + +void errorCallback(void* data, const char* message, int errnum) { + BackTraceInfo* info = static_cast(data); + info->sstream << "ROCtracer error: " << message << '(' << errnum << ')'; + info->error = 1; +} + +void syminfoCallback(void* data, uintptr_t /* pc */, const char* symname, uintptr_t /* symval */, + uintptr_t /* symsize */) { + BackTraceInfo* info = static_cast(data); + + if (symname == nullptr) return; + + int status; + char* demangled = abi::__cxa_demangle(symname, nullptr, nullptr, &status); + info->sstream << ' ' << (status == 0 ? 
demangled : symname); + free(demangled); +} + +int fullCallback(void* data, uintptr_t pc, const char* filename, int lineno, const char* function) { + BackTraceInfo* info = static_cast(data); + + info->sstream << std::endl + << " #" << std::dec << info->depth++ << ' ' << "0x" << std::noshowbase + << std::hex << std::setfill('0') << std::setw(sizeof(pc) * 2) << pc; + if (function == nullptr) + backtrace_syminfo(info->state, pc, syminfoCallback, errorCallback, data); + else { + int status; + char* demangled = abi::__cxa_demangle(function, nullptr, nullptr, &status); + info->sstream << ' ' << (status == 0 ? demangled : function); + free(demangled); + + if (filename != nullptr) { + info->sstream << " in " << filename; + if (lineno) info->sstream << ':' << std::dec << lineno; + } + } + + return info->error; +} + +} // namespace +#endif // defined (ENABLE_BACKTRACE) + +namespace roctracer { + +void warning(const char* format, ...) { + va_list va; + va_start(va, format); + std::cerr << "ROCtracer warning: " << string_vprintf(format, va) << std::endl; + va_end(va); +} + +void error(const char* format, ...) { + va_list va; + va_start(va, format); + std::cerr << "ROCtracer error: " << string_vprintf(format, va) << std::endl; + va_end(va); + exit(EXIT_FAILURE); +} + +void fatal [[noreturn]] (const char* format, ...) 
{ + va_list va; + va_start(va, format); + std::string message = string_vprintf(format, va); + va_end(va); + +#if defined(ENABLE_BACKTRACE) + BackTraceInfo info; + + info.sstream << std::endl << "Backtrace:"; + info.state = ::backtrace_create_state("/proc/self/exe", 0, errorCallback, &info); + ::backtrace_full(info.state, 1, fullCallback, errorCallback, &info); + + message += info.sstream.str(); +#endif /* defined (ENABLE_BACKTRACE) */ + + std::cerr << "ROCtracer fatal error: " << message << std::endl; + abort(); +} + +} // namespace roctracer \ No newline at end of file diff --git a/src/util/debug.h b/src/util/debug.h new file mode 100644 index 00000000..d90afb48 --- /dev/null +++ b/src/util/debug.h @@ -0,0 +1,47 @@ +/* Copyright (c) 2022 Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. */ + +#pragma once + +namespace roctracer { + +extern void warning(const char* format, ...) 
+#if defined(__GNUC__) + __attribute__((format(printf, 1, 2))) +#endif // defined (__GNUC__) + ; + +extern void error [[noreturn]] (const char* format, ...) +#if defined(__GNUC__) +__attribute__((format(printf, 1, 2))) +#endif // defined (__GNUC__) +; + +extern void fatal [[noreturn]] (const char* format, ...) +#if defined(__GNUC__) +__attribute__((format(printf, 1, 2))) +#endif // defined (__GNUC__) +; + +} // namespace roctracer + +using roctracer::error; +using roctracer::fatal; +using roctracer::warning; diff --git a/src/util/logger.h b/src/util/logger.h index daf07a6a..0bbdbe79 100644 --- a/src/util/logger.h +++ b/src/util/logger.h @@ -43,11 +43,9 @@ namespace roctracer::util { class Logger { public: - typedef std::recursive_mutex mutex_t; - - template Logger& operator<<(const T& m) { + template Logger& operator<<(T&& m) { std::ostringstream oss; - oss << m; + oss << std::forward(m); if (!streaming_) Log(oss.str()); else @@ -56,7 +54,7 @@ class Logger { return *this; } - typedef void (*manip_t)(); + using manip_t = void (*)(); Logger& operator<<(manip_t f) { f(); return *this; @@ -65,59 +63,35 @@ class Logger { static void begm() { Instance().ResetStreaming(true); } static void endl() { Instance().ResetStreaming(false); } - static const std::string& LastMessage() { - Logger& logger = Instance(); - std::lock_guard lck(mutex_); - return logger.message_[GetTid()]; - } - - static Logger* Create() { - std::lock_guard lck(mutex_); - Logger* obj = instance_.load(std::memory_order_relaxed); - if (obj == NULL) { - obj = new Logger(); - if (obj == NULL) { - std::cerr << "ROCTracer: log object creation failed" << std::endl << std::flush; - abort(); - } - instance_.store(obj, std::memory_order_release); - } - return obj; - } - - static void Destroy() { - std::lock_guard lck(mutex_); - if (instance_ != NULL) delete instance_.load(); - instance_ = NULL; + const std::string& LastMessage() { + std::lock_guard lock(mutex_); + return message_[GetTid()]; } static Logger& 
Instance() { - Logger* obj = instance_.load(std::memory_order_acquire); - if (obj == NULL) obj = Create(); - return *obj; + static Logger instance; + return instance; } static uint32_t GetPid() { return syscall(__NR_getpid); } static uint32_t GetTid() { return syscall(__NR_gettid); } private: - Logger() : file_(NULL), dirty_(false), streaming_(false), messaging_(false) { - const char* path = getenv("ROCTRACER_LOG"); - if (path != NULL) { - file_ = fopen("/tmp/roctracer_log.txt", "a"); - } + Logger() : file_(nullptr), dirty_(false), streaming_(false), messaging_(false) { + const char* var = getenv("ROCTRACER_LOG"); + if (var != nullptr) file_ = fopen("/tmp/roctracer_log.txt", "a"); ResetStreaming(false); } ~Logger() { - if (file_ != NULL) { + if (file_ != nullptr) { if (dirty_) Put("\n"); fclose(file_); } } void ResetStreaming(const bool messaging) { - std::lock_guard lck(mutex_); + std::lock_guard lock(mutex_); if (messaging) { message_[GetTid()] = ""; } else if (streaming_) { @@ -129,11 +103,11 @@ class Logger { } void Put(const std::string& m) { - std::lock_guard lck(mutex_); + std::lock_guard lock(mutex_); if (messaging_) { message_[GetTid()] += m; } - if (file_ != NULL) { + if (file_ != nullptr) { dirty_ = true; flock(fileno(file_), LOCK_EX); fprintf(file_, "%s", m.c_str()); @@ -143,7 +117,7 @@ class Logger { } void Log(const std::string& m) { - const time_t rawtime = time(NULL); + const time_t rawtime = time(nullptr); tm tm_info; localtime_r(&rawtime, &tm_info); char tm_str[26]; @@ -158,8 +132,7 @@ class Logger { bool streaming_; bool messaging_; - static mutex_t mutex_; - static std::atomic instance_; + std::recursive_mutex mutex_; std::map message_; }; @@ -170,51 +143,25 @@ class Logger { roctracer::util::Logger::Instance() \ << "fatal: " << roctracer::util::Logger::begm << stream << roctracer::util::Logger::endl; \ abort(); \ - } while (0) + } while (false) #define ERR_LOGGING(stream) \ do { \ roctracer::util::Logger::Instance() \ << "error: " << 
roctracer::util::Logger::begm << stream << roctracer::util::Logger::endl; \ - } while (0) + } while (false) #define INFO_LOGGING(stream) \ do { \ roctracer::util::Logger::Instance() \ << "info: " << roctracer::util::Logger::begm << stream << roctracer::util::Logger::endl; \ - } while (0) + } while (false) #define WARN_LOGGING(stream) \ do { \ std::cerr << "ROCProfiler: " << stream << std::endl; \ roctracer::util::Logger::Instance() << "warning: " << roctracer::util::Logger::begm << stream \ << roctracer::util::Logger::endl; \ - } while (0) - -#ifdef DEBUG -#define DBG_LOGGING(stream) \ - do { \ - roctracer::util::Logger::Instance() \ - << roctracer::util::Logger::begm << "debug: \"" << stream << "\"" \ - << " in " << __FUNCTION__ << " at " << __FILE__ << " line " << __LINE__ \ - << roctracer::util::Logger::endl; \ - } while (0) -#endif - -#if DEBUG_TRACE_ON -inline static void DEBUG_TRACE(const char* fmt, ...) { - constexpr int size = 256; - char buf[size]; - - va_list valist; - va_start(valist, fmt); - vsnprintf(buf, size, fmt, valist); - printf("%u:%u %s", roctracer::util::Logger::GetPid(), roctracer::util::Logger::GetTid(), buf); - fflush(stdout); - va_end(valist); -} -#else -#define DEBUG_TRACE(...) -#endif + } while (false) #endif // SRC_UTIL_LOGGER_H_ diff --git a/src/util/util.cpp b/src/util/util.cpp new file mode 100644 index 00000000..61e72961 --- /dev/null +++ b/src/util/util.cpp @@ -0,0 +1,51 @@ +/* Copyright (c) 2022 Advanced Micro Devices, Inc. 
+ + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. */ + +#include "util.h" + +#include +#include +#include + +namespace roctracer { + +std::string string_vprintf(const char* format, va_list va) { + va_list copy; + + va_copy(copy, va); + size_t size = vsnprintf(NULL, 0, format, copy); + va_end(copy); + + std::string str(size, '\0'); + vsprintf(&str[0], format, va); + + return str; +} + +std::string string_printf(const char* format, ...) { + va_list va; + va_start(va, format); + std::string str(string_vprintf(format, va)); + va_end(va); + + return str; +} + +} // namespace roctracer diff --git a/src/util/util.h b/src/util/util.h new file mode 100644 index 00000000..5b286bcb --- /dev/null +++ b/src/util/util.h @@ -0,0 +1,36 @@ +/* Copyright (c) 2022 Advanced Micro Devices, Inc. 
+ + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. */ + +#pragma once + +#include +#include + +namespace roctracer { + +extern std::string string_vprintf(const char* format, va_list va); + +extern std::string string_printf(const char* format, ...) +#if defined(__GNUC__) + __attribute__((format(printf, 1, 2))) +#endif // defined (__GNUC__) + ; + +} // namespace roctracer \ No newline at end of file diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 65d1fbdd..c7c5903b 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -20,6 +20,8 @@ ## IN THE SOFTWARE. ################################################################################ +get_property(HSA_RUNTIME_INCLUDE_DIRECTORIES TARGET hsa-runtime64::hsa-runtime64 PROPERTY INTERFACE_INCLUDE_DIRECTORIES) + # Set the HIP language runtime link flags as FindHIP does not set them. 
set(CMAKE_EXECUTABLE_RUNTIME_HIP_FLAG ${CMAKE_SHARED_LIBRARY_RUNTIME_CXX_FLAG}) set(CMAKE_EXECUTABLE_RUNTIME_HIP_FLAG_SEP ${CMAKE_SHARED_LIBRARY_RUNTIME_CXX_FLAG_SEP}) @@ -68,12 +70,6 @@ target_include_directories(MatrixTranspose_ctest PRIVATE ${PROJECT_SOURCE_DIR}/i target_link_libraries(MatrixTranspose_ctest PRIVATE roctracer roctx) add_dependencies(mytest MatrixTranspose_ctest) -## Build hsaco_test reference test -add_library(hsaco_test SHARED app/hsaco_test.cpp) -target_compile_definitions(hsaco_test PRIVATE AMD_INTERNAL_BUILD) -target_link_libraries(hsaco_test hsa-runtime64::hsa-runtime64) -add_dependencies(mytest hsaco_test) - ## Build codeobj event test add_library(codeobj_test SHARED app/codeobj_test.cpp) target_include_directories(codeobj_test PRIVATE ${CMAKE_CURRENT_SOURCE_DIR} ${PROJECT_SOURCE_DIR} ${PROJECT_SOURCE_DIR}/inc) @@ -133,6 +129,24 @@ target_include_directories(memory_pool PRIVATE ${PROJECT_SOURCE_DIR}/src/roctrac target_link_libraries(memory_pool Threads::Threads atomic) add_dependencies(mytest memory_pool) +## Build the activity_and_callback test +set_source_files_properties(directed/activity_and_callback.cpp PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) +hip_add_executable(activity_and_callback directed/activity_and_callback.cpp) +target_link_libraries(activity_and_callback roctracer) +add_dependencies(mytest activity_and_callback) + +## Build the multi_pool_activities test +set_source_files_properties(directed/multi_pool_activities.cpp PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) +hip_add_executable(multi_pool_activities directed/multi_pool_activities.cpp) +target_link_libraries(multi_pool_activities roctracer) +add_dependencies(mytest multi_pool_activities) + +## Build the dlopen test +add_executable(dlopen directed/dlopen.cpp) +target_include_directories(dlopen PRIVATE ${PROJECT_SOURCE_DIR}/inc ${HSA_RUNTIME_INCLUDE_DIRECTORIES}) +target_link_libraries(dlopen dl) +add_dependencies(mytest dlopen) + ## Copy the golden traces and test scripts 
configure_file(run.sh ${PROJECT_BINARY_DIR} COPYONLY) execute_process(COMMAND ${CMAKE_COMMAND} -E create_symlink run.sh ${PROJECT_BINARY_DIR}/run_ci.sh) diff --git a/test/app/codeobj_test.cpp b/test/app/codeobj_test.cpp index 4593a8f4..0737d49a 100644 --- a/test/app/codeobj_test.cpp +++ b/test/app/codeobj_test.cpp @@ -18,68 +18,52 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -#include -#include -#include +#include +#include +#include -#include "inc/roctracer.h" -#include "inc/roctracer_hsa.h" -#include - -#define PUBLIC_API __attribute__((visibility("default"))) -#define CONSTRUCTOR_API __attribute__((constructor)) -#define DESTRUCTOR_API __attribute__((destructor)) +#include "roctracer.h" +#include "roctracer_hsa.h" +namespace { // Check returned HSA API status -void check_status(roctracer_status_t status) { +inline void CHECK(roctracer_status_t status) { if (status != ROCTRACER_STATUS_SUCCESS) { - const char* error_string = roctracer_error_string(); - fprintf(stderr, "ERROR: %s\n", error_string); + fprintf(stderr, "ERROR: %s\n", roctracer_error_string()); abort(); } } // codeobj callback -void codeobj_callback(uint32_t domain, uint32_t cid, const void* data, void* arg) { +void CodeObjectCallback(uint32_t domain, uint32_t cid, const void* data, void* arg) { const hsa_evt_data_t* evt_data = reinterpret_cast(data); - const char* uri = evt_data->codeobj.uri; - printf( - "codeobj_callback domain(%u) cid(%u): load_base(0x%lx) load_size(0x%lx) load_delta(0x%lx) " - "uri(\"%s\")\n", - domain, cid, evt_data->codeobj.load_base, evt_data->codeobj.load_size, - evt_data->codeobj.load_delta, uri); - free((void*)uri); - fflush(stdout); + fprintf(stdout, + "codeobj_callback domain(%u) cid(%u): load_base(0x%lx) load_size(0x%lx) " + "load_delta(0x%lx) uri(\"%s\") unload(%d)\n", + domain, cid, evt_data->codeobj.load_base, evt_data->codeobj.load_size, + evt_data->codeobj.load_delta, evt_data->codeobj.uri, evt_data->codeobj.unload); } 
-void initialize() { - roctracer_status_t status = roctracer_enable_op_callback( - ACTIVITY_DOMAIN_HSA_EVT, HSA_EVT_ID_CODEOBJ, codeobj_callback, NULL); - check_status(status); -} +} // namespace -void cleanup() { - roctracer_status_t status = roctracer_disable_domain_callback(ACTIVITY_DOMAIN_HSA_EVT); - check_status(status); -} +#include -// Tool constructor -extern "C" PUBLIC_API void OnLoadToolProp(rocprofiler_settings_t* settings) { - // Enable HSA events intercepting - settings->hsa_intercepting = 1; - // Initialize profiling - initialize(); -} +extern "C" { +// The HSA_AMD_TOOL_PRIORITY variable must be a constant value type initialized by the loader +// itself, not by code during _init. 'extern const' seems to do that although that is not a +// guarantee. +ROCTRACER_EXPORT extern const uint32_t HSA_AMD_TOOL_PRIORITY = 1050; -// Tool destructor -extern "C" PUBLIC_API void OnUnloadTool() { - // Final resources cleanup - cleanup(); +// HSA-runtime tool on-load method +ROCTRACER_EXPORT bool OnLoad(HsaApiTable* table, uint64_t runtime_version, + uint64_t failed_tool_count, const char* const* failed_tool_names) { + CHECK(roctracer_enable_op_callback(ACTIVITY_DOMAIN_HSA_EVT, HSA_EVT_ID_CODEOBJ, + CodeObjectCallback, nullptr)); + return true; } -extern "C" CONSTRUCTOR_API void constructor() { - printf("constructor\n"); - fflush(stdout); +ROCTRACER_EXPORT void OnUnload() { + CHECK(roctracer_disable_domain_callback(ACTIVITY_DOMAIN_HSA_EVT)); } -extern "C" DESTRUCTOR_API void destructor() { OnUnloadTool(); } +} // extern "C" diff --git a/test/app/hsaco_test.cpp b/test/app/hsaco_test.cpp deleted file mode 100644 index b6e08cc5..00000000 --- a/test/app/hsaco_test.cpp +++ /dev/null @@ -1,127 +0,0 @@ -/* Copyright (c) 2018-2022 Advanced Micro Devices, Inc. 
- - Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - - The above copyright notice and this permission notice shall be included in - all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - THE SOFTWARE. 
*/ - -#include -#include -#include -#include -#include - -#define PUBLIC_API __attribute__((visibility("default"))) -#define CONSTRUCTOR_API __attribute__((constructor)) -#define DESTRUCTOR_API __attribute__((destructor)) - -#define HSA_RT(call) \ - do { \ - const hsa_status_t status = call; \ - if (status != HSA_STATUS_SUCCESS) { \ - printf("error \"%s\"\n", #call); \ - fflush(stdout); \ - abort(); \ - } \ - } while (0) - -// HSA API intercepting primitives -decltype(hsa_executable_freeze)* hsa_executable_freeze_fn; -hsa_ven_amd_loader_1_01_pfn_t loader_api_table{}; - -hsa_status_t code_object_callback(hsa_executable_t executable, - hsa_loaded_code_object_t loaded_code_object, void* arg) { - printf("code_object_callback\n"); - fflush(stdout); - - uint64_t load_base = 0; - uint64_t load_size = 0; - uint64_t load_delta = 0; - uint32_t uri_len = 0; - char* uri_str = NULL; - - HSA_RT(loader_api_table.hsa_ven_amd_loader_loaded_code_object_get_info( - loaded_code_object, HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_LOAD_BASE, &load_base)); - HSA_RT(loader_api_table.hsa_ven_amd_loader_loaded_code_object_get_info( - loaded_code_object, HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_LOAD_SIZE, &load_size)); - HSA_RT(loader_api_table.hsa_ven_amd_loader_loaded_code_object_get_info( - loaded_code_object, HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_LOAD_DELTA, &load_delta)); - HSA_RT(loader_api_table.hsa_ven_amd_loader_loaded_code_object_get_info( - loaded_code_object, HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_URI_LENGTH, &uri_len)); - - uri_str = (char*)calloc(uri_len + 1, sizeof(char)); - if (!uri_str) { - perror("calloc"); - abort(); - } - - HSA_RT(loader_api_table.hsa_ven_amd_loader_loaded_code_object_get_info( - loaded_code_object, HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_URI, uri_str)); - - printf("load_base(0x%lx)\n", load_base); - fflush(stdout); - printf("load_size(0x%lx)\n", load_size); - fflush(stdout); - printf("load_delta(0x%lx)\n", load_delta); - fflush(stdout); - 
printf("uri_len(%u)\n", uri_len); - fflush(stdout); - printf("uri_str(\"%s\")\n", uri_str); - fflush(stdout); - - free(uri_str); - - return HSA_STATUS_SUCCESS; -} - -hsa_status_t hsa_executable_freeze_interceptor(hsa_executable_t executable, const char* options) { - HSA_RT(loader_api_table.hsa_ven_amd_loader_executable_iterate_loaded_code_objects( - executable, code_object_callback, NULL)); - HSA_RT(hsa_executable_freeze_fn(executable, options)); - return HSA_STATUS_SUCCESS; -} - -// HSA-runtime tool on-load method -extern "C" PUBLIC_API bool OnLoad(HsaApiTable* table, uint64_t runtime_version, - uint64_t failed_tool_count, - const char* const* failed_tool_names) { - printf("OnLoad: begin\n"); - fflush(stdout); - // intercepting hsa_executable_freeze API - hsa_executable_freeze_fn = table->core_->hsa_executable_freeze_fn; - table->core_->hsa_executable_freeze_fn = hsa_executable_freeze_interceptor; - // Fetching AMD Loader HSA extension API - HSA_RT(hsa_system_get_major_extension_table( - HSA_EXTENSION_AMD_LOADER, 1, sizeof(hsa_ven_amd_loader_1_01_pfn_t), &loader_api_table)); - printf("OnLoad: end\n"); - fflush(stdout); - return true; -} - -extern "C" PUBLIC_API void OnUnload() { - printf("OnUnload\n"); - fflush(stdout); -} - -extern "C" CONSTRUCTOR_API void constructor() { - printf("constructor\n"); - fflush(stdout); -} - -extern "C" DESTRUCTOR_API void destructor() { - printf("destructor\n"); - fflush(stdout); -} diff --git a/test/directed/activity_and_callback.cpp b/test/directed/activity_and_callback.cpp new file mode 100644 index 00000000..67f46526 --- /dev/null +++ b/test/directed/activity_and_callback.cpp @@ -0,0 +1,139 @@ +/* Copyright (c) 2022 Advanced Micro Devices, Inc. 
+ + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
*/ + +#include +#include +#define HIP_PROF_HIP_API_STRING 1 +#include + +#include +#include +#include +#include + +__global__ void kernel() {} + +template inline void CHECK(T status); + +template <> inline void CHECK(hipError_t err) { + if (err != hipSuccess) { + std::cerr << hipGetErrorString(err) << std::endl; + abort(); + } +} + +template <> inline void CHECK(roctracer_status_t status) { + if (status != ROCTRACER_STATUS_SUCCESS) { + std::cerr << roctracer_error_string() << std::endl; + abort(); + } +} + +namespace { + +uint32_t GetPid() { + static auto pid = syscall(__NR_getpid); + return pid; +} +uint32_t GetTid() { + static thread_local auto tid = syscall(__NR_gettid); + return tid; +} + +void hip_api_callback(uint32_t domain, uint32_t cid, const void* callback_data, void* arg) { + const hip_api_data_t* data = static_cast(callback_data); + fprintf(stdout, "<%s id(%u)\tcorrelation_id(%lu) %s pid(%d) tid(%d)>\n", + roctracer_op_string(domain, cid, 0), cid, data->correlation_id, + (data->phase == ACTIVITY_API_PHASE_ENTER) ? 
"on-enter" : "on-exit", GetPid(), GetTid()); +} + +void buffer_callback(const char* begin, const char* end, void* arg) { + for (const roctracer_record_t* record = (const roctracer_record_t*)begin; + record < (const roctracer_record_t*)end; CHECK(roctracer_next_record(record, &record))) { + fprintf(stdout, "\t%s\tcorrelation_id(%lu) time_ns(%lu:%lu)\n", + roctracer_op_string(record->domain, record->op, record->kind), record->correlation_id, + record->begin_ns, record->end_ns); + } +} + +} // namespace + +int main() { + CHECK(hipSetDevice(0)); + + roctracer_properties_t properties{}; + properties.buffer_callback_fun = buffer_callback; + properties.buffer_callback_arg = nullptr; + properties.buffer_size = 1024; + CHECK(roctracer_open_pool(&properties)); + + // 1: callbacks only + CHECK(roctracer_enable_domain_callback(ACTIVITY_DOMAIN_HIP_API, hip_api_callback, nullptr)); + CHECK(hipSetDevice(0)); + kernel<<<1, 1>>>(); + CHECK(hipDeviceSynchronize()); + CHECK(roctracer_flush_activity()); + + // 2: callbacks and activities + CHECK(roctracer_enable_domain_activity(ACTIVITY_DOMAIN_HIP_API)); + CHECK(hipSetDevice(0)); + kernel<<<1, 1>>>(); + CHECK(hipDeviceSynchronize()); + CHECK(roctracer_flush_activity()); + + // 3: activities only + CHECK(roctracer_disable_domain_callback(ACTIVITY_DOMAIN_HIP_API)); + CHECK(hipSetDevice(0)); + kernel<<<1, 1>>>(); + CHECK(hipDeviceSynchronize()); + CHECK(roctracer_flush_activity()); + + // 4: callbacks only + CHECK(roctracer_enable_domain_callback(ACTIVITY_DOMAIN_HIP_API, hip_api_callback, nullptr)); + CHECK(roctracer_disable_domain_activity(ACTIVITY_DOMAIN_HIP_API)); + CHECK(hipSetDevice(0)); + kernel<<<1, 1>>>(); + CHECK(hipDeviceSynchronize()); + CHECK(roctracer_flush_activity()); + + // 5: callbacks and activities + CHECK(roctracer_enable_domain_activity(ACTIVITY_DOMAIN_HIP_API)); + CHECK(hipSetDevice(0)); + kernel<<<1, 1>>>(); + CHECK(hipDeviceSynchronize()); + CHECK(roctracer_flush_activity()); + + // 6: callbacks only + 
CHECK(roctracer_disable_domain_activity(ACTIVITY_DOMAIN_HIP_API)); + CHECK(hipSetDevice(0)); + kernel<<<1, 1>>>(); + CHECK(hipDeviceSynchronize()); + CHECK(roctracer_flush_activity()); + + // 7: none + CHECK(roctracer_disable_domain_callback(ACTIVITY_DOMAIN_HIP_API)); + CHECK(roctracer_disable_domain_activity(ACTIVITY_DOMAIN_HIP_API)); + CHECK(hipSetDevice(0)); + kernel<<<1, 1>>>(); + CHECK(hipDeviceSynchronize()); + CHECK(roctracer_flush_activity()); + + return 0; +} \ No newline at end of file diff --git a/test/directed/dlopen.cpp b/test/directed/dlopen.cpp new file mode 100644 index 00000000..7a112c52 --- /dev/null +++ b/test/directed/dlopen.cpp @@ -0,0 +1,94 @@ +/* Copyright (c) 2022 Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
*/ + + +#include "roctracer.h" + +#include +#include + +#include + +using get_timestamp_t = decltype(roctracer_get_timestamp); +using hsa_init_t = decltype(hsa_init); +using hsa_shut_down_t = decltype(hsa_shut_down); + +int main() { + // CASE 1: HSA is not loaded. + // + { + void* tracer_library = dlopen("libroctracer64.so", RTLD_LAZY); + assert(tracer_library != nullptr); + + auto* get_timestamp = + reinterpret_cast<get_timestamp_t*>(dlsym(tracer_library, "roctracer_get_timestamp")); + assert(get_timestamp != nullptr); + + roctracer_timestamp_t timestamp; + (*get_timestamp)(&timestamp); + dlclose(tracer_library); + } + + // CASE 2 Load the roctracer after hsa_init(). + // + void* hsa_library = dlopen("libhsa-runtime64.so.1", RTLD_LAZY); + assert(hsa_library != nullptr); + + auto* hsa_init = reinterpret_cast<hsa_init_t*>(dlsym(hsa_library, "hsa_init")); + auto* hsa_shut_down = reinterpret_cast<hsa_shut_down_t*>(dlsym(hsa_library, "hsa_shut_down")); + assert(hsa_init != nullptr && hsa_shut_down != nullptr); + + { + (*hsa_init)(); + + void* tracer_library = dlopen("libroctracer64.so", RTLD_LAZY); + assert(tracer_library != nullptr); + + auto* get_timestamp = + reinterpret_cast<get_timestamp_t*>(dlsym(tracer_library, "roctracer_get_timestamp")); + assert(get_timestamp != nullptr); + + roctracer_timestamp_t timestamp; + (*get_timestamp)(&timestamp); + + dlclose(tracer_library); + (*hsa_shut_down)(); + } + + // CASE 3: Load and use the roctracer before hsa_init().
+ // + { + void* tracer_library = dlopen("libroctracer64.so", RTLD_LAZY); + assert(tracer_library != nullptr); + + auto* get_timestamp = + reinterpret_cast<get_timestamp_t*>(dlsym(tracer_library, "roctracer_get_timestamp")); + assert(get_timestamp != nullptr); + + roctracer_timestamp_t timestamp; + (*get_timestamp)(&timestamp); + + (*hsa_init)(); + (*hsa_shut_down)(); + dlclose(tracer_library); + } + + return 0; +} \ No newline at end of file diff --git a/test/directed/multi_pool_activities.cpp b/test/directed/multi_pool_activities.cpp new file mode 100644 index 00000000..b948b478 --- /dev/null +++ b/test/directed/multi_pool_activities.cpp @@ -0,0 +1,94 @@ +/* Copyright (c) 2022 Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. */ + +#include +#include +#include + +#include + +// This test checks that asynchronous activities can be enabled in distinct memory pools.
It enables +// activity reporting for HIP kernel dispatches in one memory pool, and memory copy reporting in +// another memory pool. The output of this test to stdout should be a series of kernel dispatch +// records (10) followed by a series of memory copy records (10). The records should not be +// interleaved. + +__global__ void kernel(void* global_memory) {} + +namespace { + +template inline void CHECK(T status); + +template <> inline void CHECK(hipError_t err) { + if (err != hipSuccess) { + std::cerr << hipGetErrorString(err) << std::endl; + abort(); + } +} + +template <> inline void CHECK(roctracer_status_t status) { + if (status != ROCTRACER_STATUS_SUCCESS) { + std::cerr << roctracer_error_string() << std::endl; + abort(); + } +} + +void buffer_callback(const char* begin, const char* end, void* arg) { + for (const roctracer_record_t* record = (const roctracer_record_t*)begin; + record != (const roctracer_record_t*)end; CHECK(roctracer_next_record(record, &record))) { + fprintf(stdout, "\t:%s\t: correlation_id(%lu) time_ns(%lu:%lu)\n", + roctracer_op_string(record->domain, record->op, record->kind), record->correlation_id, + record->begin_ns, record->end_ns); + } +} + +} // namespace + +int main() { + CHECK(hipSetDevice(0)); + + roctracer_properties_t properties{}; + properties.buffer_callback_fun = buffer_callback; + properties.buffer_callback_arg = nullptr; + properties.buffer_size = 1024 * 1024; + + roctracer_pool_t* pool_1; + CHECK(roctracer_open_pool_expl(&properties, &pool_1)); + CHECK(roctracer_enable_op_activity_expl(ACTIVITY_DOMAIN_HIP_OPS, HIP_OP_ID_DISPATCH, pool_1)); + + roctracer_pool_t* pool_2; + CHECK(roctracer_open_pool_expl(&properties, &pool_2)); + CHECK(roctracer_enable_op_activity_expl(ACTIVITY_DOMAIN_HIP_OPS, HIP_OP_ID_COPY, pool_2)); + CHECK(roctracer_enable_op_activity_expl(ACTIVITY_DOMAIN_HIP_API, HIP_API_ID_hipMemcpy, pool_2)); + + int host_array[256] = {0}; + int* device_memory; + CHECK(hipMalloc(&device_memory, 
sizeof(host_array))); + + for (int i = 0; i < 10; ++i) { + CHECK(hipMemcpy(device_memory, host_array, sizeof(host_array), hipMemcpyHostToDevice)); + kernel<<<1, 1>>>(device_memory); + } + CHECK(hipDeviceSynchronize()); + + CHECK(roctracer_flush_activity_expl(pool_1)); + CHECK(roctracer_flush_activity_expl(pool_2)); + return 0; +} diff --git a/test/golden_traces/MatrixTranspose_hip_flush_trace.txt b/test/golden_traces/MatrixTranspose_hip_flush_trace.txt index 6f0c4d17..051d58b2 100644 --- a/test/golden_traces/MatrixTranspose_hip_flush_trace.txt +++ b/test/golden_traces/MatrixTranspose_hip_flush_trace.txt @@ -26,32 +26,24 @@ PASSED! 129855989696159:129855990920319 0:0 KernelExecution:29:14696 129855989668256:129855991384209 0:0 CopyDeviceToHost:31:14696 129855605540988:129855957443403 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :4 -129855957456260:129855957456261 14696:14696 MARK(name(before HIP LaunchKernel)) 129855957507034:129855957514510 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :6 129855957521000:129855957523014 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :7 129855957529950:129855958671150 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :8 -129855958701410:129855958701411 14696:14696 MARK(name(after HIP LaunchKernel)) 129855958708321:129855961719221 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :10 129855971408776:129855972257972 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :11 -129855972261515:129855972261516 14696:14696 MARK(name(before HIP LaunchKernel)) 129855972266736:129855972268234 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :13 129855972271629:129855972272780 14696:14696 
__hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :14 129855972276181:129855972282118 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :15 -129855972663504:129855972663505 14696:14696 MARK(name(after HIP LaunchKernel)) 129855972666015:129855974143463 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :17 129855980222888:129855981023250 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :18 -129855981025473:129855981025474 14696:14696 MARK(name(before HIP LaunchKernel)) 129855981028834:129855981029831 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :20 129855981032043:129855981032913 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :21 129855981035237:129855981038997 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :22 -129855981041265:129855981041266 14696:14696 MARK(name(after HIP LaunchKernel)) 129855981043695:129855982796928 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :24 129855988764565:129855989615901 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :25 -129855989618073:129855989618074 14696:14696 MARK(name(before HIP LaunchKernel)) 129855989621096:129855989622129 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :27 129855989624243:129855989625087 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :28 129855989627271:129855989630934 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) 
kernel=matrixTranspose(float*, float*, int) :29 -129855989632959:129855989632960 14696:14696 MARK(name(after HIP LaunchKernel)) 129855989635351:129855991396402 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :31 PASSED! ## Iteration (95) ################# @@ -114,88 +106,64 @@ PASSED! 129856094767987:129856095991348 0:0 KernelExecution:113:14696 129856094739044:129856096520182 0:0 CopyDeviceToHost:115:14696 129855997303698:129855998134058 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :32 -129855998136242:129855998136243 14696:14696 MARK(name(before HIP LaunchKernel)) 129855998138933:129855998139817 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :34 129855998141918:129855998142773 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :35 129855998144935:129855998149221 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :36 -129855998151431:129855998151432 14696:14696 MARK(name(after HIP LaunchKernel)) 129855998153828:129855999937506 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :38 129856005829520:129856006665192 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :39 -129856006667396:129856006667397 14696:14696 MARK(name(before HIP LaunchKernel)) 129856006670307:129856006671160 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :41 129856006673376:129856006674209 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :42 129856006676323:129856006679651 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :43 
-129856006681635:129856006681636 14696:14696 MARK(name(after HIP LaunchKernel)) 129856006683967:129856008469471 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :45 129856014360174:129856015191285 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :46 -129856015193489:129856015193490 14696:14696 MARK(name(before HIP LaunchKernel)) 129856015196342:129856015197217 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :48 129856015199400:129856015200221 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :49 129856015202314:129856015205930 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :50 -129856015208058:129856015208059 14696:14696 MARK(name(after HIP LaunchKernel)) 129856015210764:129856017001555 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :52 129856022908053:129856023733985 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :53 -129856023736320:129856023736321 14696:14696 MARK(name(before HIP LaunchKernel)) 129856023739178:129856023740063 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :55 129856023742240:129856023743090 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :56 129856023745309:129856023748845 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :57 -129856023750891:129856023750892 14696:14696 MARK(name(after HIP LaunchKernel)) 129856023753396:129856025556257 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :59 129856031530409:129856032503170 14696:14696 
hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :60 -129856032505392:129856032505393 14696:14696 MARK(name(before HIP LaunchKernel)) 129856032508345:129856032509226 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :62 129856032511486:129856032512316 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :63 129856032514599:129856032518036 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :64 -129856032520150:129856032520151 14696:14696 MARK(name(after HIP LaunchKernel)) 129856032522410:129856034373111 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :66 129856040397979:129856041130687 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :67 -129856041132973:129856041132974 14696:14696 MARK(name(before HIP LaunchKernel)) 129856041136399:129856041137389 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :69 129856041139653:129856041140500 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :70 129856041142893:129856041146663 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :71 -129856041148645:129856041148646 14696:14696 MARK(name(after HIP LaunchKernel)) 129856041151128:129856042953843 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :73 129856048994841:129856049829566 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :74 -129856049831724:129856049831725 14696:14696 MARK(name(before HIP LaunchKernel)) 129856049834527:129856049835413 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, 
sharedMem=0, stream=0) :76 129856049837759:129856049838585 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :77 129856049840796:129856049844487 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :78 -129856049846529:129856049846530 14696:14696 MARK(name(after HIP LaunchKernel)) 129856049848934:129856051663797 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :80 129856057798518:129856058633464 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :81 -129856058635650:129856058635651 14696:14696 MARK(name(before HIP LaunchKernel)) 129856058638530:129856058639560 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :83 129856058641994:129856058642826 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :84 129856058645125:129856058648721 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :85 -129856058650749:129856058650750 14696:14696 MARK(name(after HIP LaunchKernel)) 129856058653478:129856060466863 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :87 129856066704603:129856067541502 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :88 -129856067543802:129856067543803 14696:14696 MARK(name(before HIP LaunchKernel)) 129856067546791:129856067547681 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :90 129856067550027:129856067550854 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :91 129856067553125:129856067556952 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, 
args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :92 -129856067559149:129856067559150 14696:14696 MARK(name(after HIP LaunchKernel)) 129856067561903:129856069442958 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :94 129856075719215:129856076572398 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :95 -129856076574828:129856076574829 14696:14696 MARK(name(before HIP LaunchKernel)) 129856076578071:129856076578997 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :97 129856076581286:129856076582119 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :98 129856076584498:129856076588395 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :99 -129856076590554:129856076590555 14696:14696 MARK(name(after HIP LaunchKernel)) 129856076592857:129856078406672 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :101 129856084768530:129856085607081 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :102 -129856085609437:129856085609438 14696:14696 MARK(name(before HIP LaunchKernel)) 129856085612528:129856085613498 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :104 129856085615751:129856085616602 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :105 129856085618831:129856085623039 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :106 -129856085625178:129856085625179 14696:14696 MARK(name(after HIP LaunchKernel)) 129856085627731:129856087451206 14696:14696 hipMemcpy(dst=0x7fd65d707010, 
src=0x7fd65c800000, sizeBytes=4194304, kind=2) :108 129856093846767:129856094686797 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :109 -129856094689153:129856094689154 14696:14696 MARK(name(before HIP LaunchKernel)) 129856094692497:129856094693485 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :111 129856094695727:129856094696598 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :112 129856094698884:129856094702856 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :113 -129856094705178:129856094705179 14696:14696 MARK(name(after HIP LaunchKernel)) 129856094707931:129856096534639 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :115 PASSED! ## Iteration (83) ################# @@ -253,83 +221,61 @@ PASSED! 129856192345329:129856193569809 0:0 KernelExecution:190:14696 129856192317767:129856194105080 0:0 CopyDeviceToHost:192:14696 129856103003811:129856103844379 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :116 -129856103846787:129856103846788 14696:14696 MARK(name(before HIP LaunchKernel)) 129856103849922:129856103850838 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :118 129856103853240:129856103854136 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :119 129856103856444:129856103860149 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :120 -129856103862386:129856103862387 14696:14696 MARK(name(after HIP LaunchKernel)) 129856103864691:129856105741098 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :122 
129856112200226:129856113019342 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :123 -129856113021598:129856113021599 14696:14696 MARK(name(before HIP LaunchKernel)) 129856113024595:129856113025504 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :125 129856113027902:129856113028756 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :126 129856113031010:129856113034968 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :127 -129856113037098:129856113037099 14696:14696 MARK(name(after HIP LaunchKernel)) 129856113039452:129856114918382 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :129 129856121536590:129856122377686 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :130 -129856122380177:129856122380178 14696:14696 MARK(name(before HIP LaunchKernel)) 129856122383242:129856122384157 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :132 129856122386562:129856122387438 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :133 129856122389743:129856122393887 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :134 -129856122395917:129856122395918 14696:14696 MARK(name(after HIP LaunchKernel)) 129856122398705:129856124236553 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :136 129856130930250:129856131721919 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :137 -129856131724534:129856131724535 14696:14696 MARK(name(before HIP LaunchKernel)) 129856131727544:129856131728453 14696:14696 
__hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :139 129856131730840:129856131731718 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :140 129856131734248:129856131738338 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :141 -129856131740508:129856131740509 14696:14696 MARK(name(after HIP LaunchKernel)) 129856131742956:129856133633762 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :143 129856140484642:129856141289559 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :144 -129856141292040:129856141292041 14696:14696 MARK(name(before HIP LaunchKernel)) 129856141295360:129856141296366 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :146 129856141298705:129856141299584 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :147 129856141301885:129856141305904 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :148 -129856141308287:129856141308288 14696:14696 MARK(name(after HIP LaunchKernel)) 129856141310745:129856143207185 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :150 129856150167842:129856151019519 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :151 -129856151021903:129856151021904 14696:14696 MARK(name(before HIP LaunchKernel)) 129856151025430:129856151026339 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :153 129856151028846:129856151029731 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :154 129856151032070:129856151036399 14696:14696 
hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :155 -129856151038525:129856151038526 14696:14696 MARK(name(after HIP LaunchKernel)) 129856151041204:129856152887054 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :157 129856159416500:129856160257922 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :158 -129856160260251:129856160260252 14696:14696 MARK(name(before HIP LaunchKernel)) 129856160263327:129856160264253 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :160 129856160266588:129856160267551 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :161 129856160269815:129856160273583 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :162 -129856160275639:129856160275640 14696:14696 MARK(name(after HIP LaunchKernel)) 129856160277873:129856162154856 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :164 129856167989129:129856168794954 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :165 -129856168796817:129856168796818 14696:14696 MARK(name(before HIP LaunchKernel)) 129856168799680:129856168800356 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :167 129856168802336:129856168803043 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :168 129856168804923:129856168808196 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :169 -129856168810026:129856168810027 14696:14696 MARK(name(after HIP LaunchKernel)) 
129856168811889:129856170642148 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :171 129856175935119:129856176727698 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :172 -129856176729573:129856176729574 14696:14696 MARK(name(before HIP LaunchKernel)) 129856176732312:129856176733001 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :174 129856176734764:129856176735517 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :175 129856176737306:129856176740961 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :176 -129856176742551:129856176742552 14696:14696 MARK(name(after HIP LaunchKernel)) 129856176744384:129856178576608 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :178 129856183733862:129856184521359 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :179 PASSED! 
## Iteration (72) ################# -129856184523202:129856184523203 14696:14696 MARK(name(before HIP LaunchKernel)) 129856184526239:129856184526918 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :181 129856184528695:129856184529339 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :182 129856184531203:129856184534819 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :183 -129856184536444:129856184536445 14696:14696 MARK(name(after HIP LaunchKernel)) 129856184538159:129856186381152 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :185 129856191471466:129856192258965 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :186 -129856192260887:129856192260888 14696:14696 MARK(name(before HIP LaunchKernel)) 129856192264565:129856192265231 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :188 129856192266936:129856192267582 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :189 129856192269493:129856192272647 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :190 -129856192274238:129856192274239 14696:14696 MARK(name(after HIP LaunchKernel)) 129856192276014:129856194117333 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :192 PASSED! ## Iteration (71) ################# @@ -400,102 +346,74 @@ PASSED! 
129856297157310:129856298378111 0:0 KernelExecution:288:14696 129856297129589:129856298914568 0:0 CopyDeviceToHost:290:14696 129856199220209:129856199993256 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :193 -129856199995165:129856199995166 14696:14696 MARK(name(before HIP LaunchKernel)) 129856199998331:129856199999016 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :195 129856200000971:129856200001630 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :196 129856200003348:129856200006409 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :197 -129856200007997:129856200007998 14696:14696 MARK(name(after HIP LaunchKernel)) 129856200009781:129856201864796 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :199 129856206828954:129856207617612 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :200 -129856207619342:129856207619343 14696:14696 MARK(name(before HIP LaunchKernel)) 129856207633427:129856207634203 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :202 129856207635929:129856207636565 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :203 129856207638289:129856207641619 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :204 -129856207643379:129856207643380 14696:14696 MARK(name(after HIP LaunchKernel)) 129856207645338:129856209486625 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :206 129856214367871:129856215199634 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :207 
-129856215201421:129856215201422 14696:14696 MARK(name(before HIP LaunchKernel)) 129856215205034:129856215205701 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :209 129856215207421:129856215208068 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :210 129856215209926:129856215213001 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :211 -129856215214576:129856215214577 14696:14696 MARK(name(after HIP LaunchKernel)) 129856215216591:129856217062762 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :213 129856221865656:129856222702390 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :214 -129856222704143:129856222704144 14696:14696 MARK(name(before HIP LaunchKernel)) 129856222707593:129856222708263 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :216 129856222709907:129856222710533 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :217 129856222712408:129856222715305 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :218 -129856222716820:129856222716821 14696:14696 MARK(name(after HIP LaunchKernel)) 129856222718703:129856224572291 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :220 129856229369321:129856230206171 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :221 -129856230207933:129856230207934 14696:14696 MARK(name(before HIP LaunchKernel)) 129856230211408:129856230212070 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :223 129856230213729:129856230214356 14696:14696 
__hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :224 129856230216306:129856230219552 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :225 -129856230221084:129856230221085 14696:14696 MARK(name(after HIP LaunchKernel)) 129856230222856:129856232061167 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :227 129856236820359:129856237552651 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :228 -129856237554349:129856237554350 14696:14696 MARK(name(before HIP LaunchKernel)) 129856237557958:129856237558615 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :230 129856237560382:129856237561016 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :231 129856237562876:129856237566063 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :232 -129856237567608:129856237567609 14696:14696 MARK(name(after HIP LaunchKernel)) 129856237569296:129856239419101 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :234 129856244174381:129856245010977 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :235 -129856245012718:129856245012719 14696:14696 MARK(name(before HIP LaunchKernel)) 129856245025693:129856245026451 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :237 129856245028210:129856245028855 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :238 129856245030730:129856245034177 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) 
kernel=matrixTranspose(float*, float*, int) :239 -129856245035805:129856245035806 14696:14696 MARK(name(after HIP LaunchKernel)) 129856245038122:129856246876538 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :241 129856251653109:129856252435896 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :242 -129856252437833:129856252437834 14696:14696 MARK(name(before HIP LaunchKernel)) 129856252441362:129856252442017 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :244 129856252443660:129856252444296 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :245 129856252446165:129856252449155 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :246 -129856252450809:129856252450810 14696:14696 MARK(name(after HIP LaunchKernel)) 129856252452579:129856254303055 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :248 129856259101952:129856259882749 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :249 -129856259884515:129856259884516 14696:14696 MARK(name(before HIP LaunchKernel)) 129856259886742:129856259887392 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :251 129856259889040:129856259889671 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :252 129856259891415:129856259894919 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :253 -129856259896631:129856259896632 14696:14696 MARK(name(after HIP LaunchKernel)) 129856259898324:129856261743974 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) 
:255 129856266541050:129856267374498 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :256 -129856267376266:129856267376267 14696:14696 MARK(name(before HIP LaunchKernel)) 129856267379647:129856267380320 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :258 129856267381929:129856267382540 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :259 129856267384409:129856267387474 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :260 -129856267389033:129856267389034 14696:14696 MARK(name(after HIP LaunchKernel)) 129856267390764:129856269239563 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :262 129856274008890:129856274843415 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :263 -129856274845095:129856274845096 14696:14696 MARK(name(before HIP LaunchKernel)) 129856274847806:129856274848470 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :265 129856274850117:129856274850733 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :266 129856274852427:129856274855749 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :267 -129856274857358:129856274857359 14696:14696 MARK(name(after HIP LaunchKernel)) 129856274859228:129856276707873 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :269 129856281498759:129856282330118 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :270 -129856282332044:129856282332045 14696:14696 MARK(name(before HIP LaunchKernel)) 129856282335358:129856282336015 
14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :272 129856282338029:129856282338668 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :273 129856282340644:129856282343485 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :274 -129856282345028:129856282345029 14696:14696 MARK(name(after HIP LaunchKernel)) 129856282347024:129856284203838 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :276 129856288978096:129856289708673 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :277 -129856289710414:129856289710415 14696:14696 MARK(name(before HIP LaunchKernel)) 129856289714250:129856289714924 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :279 129856289716689:129856289717305 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :280 129856289719150:129856289722057 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :281 -129856289723677:129856289723678 14696:14696 MARK(name(after HIP LaunchKernel)) 129856289725380:129856291571314 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :283 129856296341271:129856297072486 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :284 -129856297074313:129856297074314 14696:14696 MARK(name(before HIP LaunchKernel)) 129856297077733:129856297078380 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :286 129856297080109:129856297080733 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :287 129856297082729:129856297085646 
14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :288 -129856297087184:129856297087185 14696:14696 MARK(name(after HIP LaunchKernel)) 129856297089004:129856298926004 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :290 PASSED! ## Iteration (58) ################# @@ -565,95 +483,69 @@ PASSED! 129856394339138:129856395561058 0:0 KernelExecution:379:14696 129856394311639:129856396103600 0:0 CopyDeviceToHost:381:14696 129856303774990:129856304626161 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :291 -129856304627884:129856304627885 14696:14696 MARK(name(before HIP LaunchKernel)) 129856304631072:129856304631723 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :293 129856304633373:129856304634007 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :294 129856304635811:129856304639104 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :295 -129856304640848:129856304640849 14696:14696 MARK(name(after HIP LaunchKernel)) 129856304642651:129856306501959 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :297 129856311264292:129856312048766 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :298 -129856312050539:129856312050540 14696:14696 MARK(name(before HIP LaunchKernel)) 129856312053498:129856312054174 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :300 129856312055946:129856312056653 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :301 129856312058397:129856312061589 14696:14696 hipLaunchKernel(function_address=0x4010c0, 
numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :302 -129856312063201:129856312063202 14696:14696 MARK(name(after HIP LaunchKernel)) 129856312065053:129856313904746 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :304 129856318704110:129856319483869 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :305 -129856319485543:129856319485544 14696:14696 MARK(name(before HIP LaunchKernel)) 129856319499258:129856319500048 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :307 129856319501759:129856319502401 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :308 129856319504307:129856319507787 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :309 -129856319509535:129856319509536 14696:14696 MARK(name(after HIP LaunchKernel)) 129856319511552:129856321356021 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :311 129856326144210:129856326980680 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :312 -129856326982483:129856326982484 14696:14696 MARK(name(before HIP LaunchKernel)) 129856326986163:129856326986815 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :314 129856326988581:129856326989210 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :315 129856326991095:129856326994082 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :316 -129856326995650:129856326995651 14696:14696 MARK(name(after HIP LaunchKernel)) 129856326997461:129856328838450 14696:14696 
hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :318 129856333608209:129856334440902 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :319 -129856334442697:129856334442698 14696:14696 MARK(name(before HIP LaunchKernel)) 129856334446427:129856334447095 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :321 129856334448793:129856334449426 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :322 129856334451308:129856334454120 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :323 -129856334455718:129856334455719 14696:14696 MARK(name(after HIP LaunchKernel)) 129856334457508:129856336307654 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :325 129856341084552:129856341987761 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :326 -129856341989501:129856341989502 14696:14696 MARK(name(before HIP LaunchKernel)) 129856341992961:129856341993616 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :328 129856341995311:129856341995915 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :329 129856341997784:129856342000844 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :330 -129856342002457:129856342002458 14696:14696 MARK(name(after HIP LaunchKernel)) 129856342004209:129856343852827 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :332 129856348628207:129856349358297 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :333 -129856349360014:129856349360015 14696:14696 
MARK(name(before HIP LaunchKernel)) 129856349363641:129856349364301 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :335 129856349365955:129856349366590 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :336 129856349368410:129856349371392 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :337 -129856349373001:129856349373002 14696:14696 MARK(name(after HIP LaunchKernel)) 129856349374736:129856351215163 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :339 129856356026231:129856356823939 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :340 -129856356825939:129856356825940 14696:14696 MARK(name(before HIP LaunchKernel)) 129856356829316:129856356829967 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :342 129856356831607:129856356832235 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :343 129856356834103:129856356837300 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :344 -129856356838880:129856356838881 14696:14696 MARK(name(after HIP LaunchKernel)) 129856356840997:129856358683474 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :346 129856363457621:129856364292098 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :347 -129856364293909:129856364293910 14696:14696 MARK(name(before HIP LaunchKernel)) 129856364296242:129856364296921 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :349 129856364298665:129856364299325 14696:14696 __hipPopCallConfiguration(gridDim={}, 
blockDim={}, sharedMem=0, stream=0) :350 129856364301137:129856364304805 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :351 -129856364306614:129856364306615 14696:14696 MARK(name(after HIP LaunchKernel)) 129856364308432:129856366185192 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :353 129856371019019:129856371802348 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :354 -129856371804072:129856371804073 14696:14696 MARK(name(before HIP LaunchKernel)) 129856371807407:129856371808089 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :356 129856371809769:129856371810408 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :357 129856371812409:129856371815399 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :358 -129856371816938:129856371816939 14696:14696 MARK(name(after HIP LaunchKernel)) 129856371818730:129856373668223 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :360 129856378427685:129856379260530 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :361 -129856379262413:129856379262414 14696:14696 MARK(name(before HIP LaunchKernel)) 129856379266028:129856379266680 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :363 129856379268334:129856379268974 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :364 129856379270951:129856379274011 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :365 
-129856379275576:129856379275577 14696:14696 MARK(name(after HIP LaunchKernel)) 129856379277516:129856381125442 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :367 129856385912709:129856386747747 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :368 -129856386749617:129856386749618 14696:14696 MARK(name(before HIP LaunchKernel)) 129856386753015:129856386753700 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :370 129856386755603:129856386756230 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :371 129856386758107:129856386761145 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :372 -129856386762828:129856386762829 14696:14696 MARK(name(after HIP LaunchKernel)) 129856386764527:129856388613300 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :374 129856393418103:129856394255127 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :375 -129856394257084:129856394257085 14696:14696 MARK(name(before HIP LaunchKernel)) 129856394260727:129856394261393 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :377 129856394263117:129856394263752 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :378 129856394266100:129856394269007 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :379 -129856394270594:129856394270595 14696:14696 MARK(name(after HIP LaunchKernel)) 129856394272528:129856396115719 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :381 PASSED! 
## Iteration (44) ################# @@ -724,102 +616,74 @@ PASSED! 129856498583201:129856499806882 0:0 KernelExecution:477:14696 129856498555486:129856500349740 0:0 CopyDeviceToHost:479:14696 129856400931528:129856401693841 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :382 -129856401695697:129856401695698 14696:14696 MARK(name(before HIP LaunchKernel)) 129856401698086:129856401698763 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :384 129856401700644:129856401701356 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :385 129856401703387:129856401706670 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :386 -129856401708283:129856401708284 14696:14696 MARK(name(after HIP LaunchKernel)) 129856401710202:129856403550731 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :388 129856408346178:129856409079144 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :389 -129856409080946:129856409080947 14696:14696 MARK(name(before HIP LaunchKernel)) 129856409119575:129856409120361 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :391 129856409122350:129856409122982 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :392 129856409124716:129856409127974 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :393 -129856409129722:129856409129723 14696:14696 MARK(name(after HIP LaunchKernel)) 129856409131595:129856410958682 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :395 129856415764088:129856416549283 14696:14696 
hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :396 -129856416551147:129856416551148 14696:14696 MARK(name(before HIP LaunchKernel)) 129856416554753:129856416555457 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :398 129856416557440:129856416558065 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :399 129856416560077:129856416563543 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :400 -129856416565220:129856416565221 14696:14696 MARK(name(after HIP LaunchKernel)) 129856416567086:129856418410890 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :402 129856423185992:129856423965984 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :403 -129856423967686:129856423967687 14696:14696 MARK(name(before HIP LaunchKernel)) 129856423971156:129856423971813 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :405 129856423973453:129856423974058 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :406 129856423975959:129856423979023 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :407 -129856423980620:129856423980621 14696:14696 MARK(name(after HIP LaunchKernel)) 129856423982481:129856425851437 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :409 129856430649566:129856431446819 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :410 -129856431448647:129856431448648 14696:14696 MARK(name(before HIP LaunchKernel)) 129856431451980:129856431452627 14696:14696 __hipPushCallConfiguration(gridDim={}, 
blockDim={}, sharedMem=0, stream=0) :412 129856431454467:129856431455103 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :413 129856431457061:129856431460021 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :414 -129856431461633:129856431461634 14696:14696 MARK(name(after HIP LaunchKernel)) 129856431463427:129856433305223 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :416 129856438060199:129856438896337 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :417 -129856438898056:129856438898057 14696:14696 MARK(name(before HIP LaunchKernel)) 129856438901614:129856438902293 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :419 129856438903944:129856438904582 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :420 129856438906471:129856438909460 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :421 -129856438910995:129856438910996 14696:14696 MARK(name(after HIP LaunchKernel)) 129856438913099:129856440770029 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :423 129856445589904:129856446428787 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :424 -129856446430525:129856446430526 14696:14696 MARK(name(before HIP LaunchKernel)) 129856446434097:129856446434755 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :426 129856446436446:129856446437074 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :427 129856446438958:129856446442103 14696:14696 hipLaunchKernel(function_address=0x4010c0, 
numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :428 -129856446443705:129856446443706 14696:14696 MARK(name(after HIP LaunchKernel)) 129856446445611:129856448319675 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :430 129856453113306:129856453898651 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :431 -129856453900443:129856453900444 14696:14696 MARK(name(before HIP LaunchKernel)) 129856453903924:129856453904588 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :433 129856453906239:129856453906854 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :434 129856453908740:129856453911874 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :435 -129856453913486:129856453913487 14696:14696 MARK(name(after HIP LaunchKernel)) 129856453915356:129856455761272 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :437 129856460531599:129856461270590 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :438 -129856461272368:129856461272369 14696:14696 MARK(name(before HIP LaunchKernel)) 129856461275845:129856461276515 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :440 129856461278198:129856461278850 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :441 129856461280791:129856461283899 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :442 -129856461285595:129856461285596 14696:14696 MARK(name(after HIP LaunchKernel)) 129856461287388:129856463133280 14696:14696 
hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :444 129856467884995:129856468668564 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :445 -129856468670291:129856468670292 14696:14696 MARK(name(before HIP LaunchKernel)) 129856468673055:129856468673710 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :447 129856468675408:129856468676048 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :448 129856468677942:129856468681455 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :449 -129856468683148:129856468683149 14696:14696 MARK(name(after HIP LaunchKernel)) 129856468685101:129856470532724 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :451 129856475326269:129856476110399 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :452 -129856476112220:129856476112221 14696:14696 MARK(name(before HIP LaunchKernel)) 129856476115691:129856476116355 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :454 129856476118083:129856476118692 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :455 129856476120553:129856476123478 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :456 -129856476125144:129856476125145 14696:14696 MARK(name(after HIP LaunchKernel)) 129856476126929:129856477993159 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :458 129856482771986:129856483553655 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :459 -129856483555435:129856483555436 14696:14696 
MARK(name(before HIP LaunchKernel)) 129856483559048:129856483559715 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :461 129856483561368:129856483561995 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :462 129856483563875:129856483567045 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :463 -129856483569037:129856483569038 14696:14696 MARK(name(after HIP LaunchKernel)) 129856483570875:129856485418803 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :465 129856490199703:129856491039451 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :466 -129856491041225:129856491041226 14696:14696 MARK(name(before HIP LaunchKernel)) 129856491044551:129856491045204 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :468 129856491046844:129856491047481 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :469 129856491049291:129856491052245 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :470 -129856491053805:129856491053806 14696:14696 MARK(name(after HIP LaunchKernel)) 129856491055528:129856492907612 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :472 129856497665310:129856498500405 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :473 -129856498502066:129856498502067 14696:14696 MARK(name(before HIP LaunchKernel)) 129856498505506:129856498506141 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :475 129856498507858:129856498508491 14696:14696 __hipPopCallConfiguration(gridDim={}, 
blockDim={}, sharedMem=0, stream=0) :476 129856498510523:129856498513554 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :477 -129856498515137:129856498515138 14696:14696 MARK(name(after HIP LaunchKernel)) 129856498517011:129856500365762 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :479 PASSED! ## Iteration (31) ################# @@ -890,102 +754,74 @@ PASSED! 129856595328424:129856597128257 0:0 CopyDeviceToHost:570:14696 129856601984341:129856602751266 0:0 CopyHostToDevice:571:14696 129856505180003:129856505975222 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :480 -129856505976980:129856505976981 14696:14696 MARK(name(before HIP LaunchKernel)) 129856505980587:129856505981234 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :482 129856505982935:129856505983566 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :483 129856505985434:129856505988514 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :484 -129856505990096:129856505990097 14696:14696 MARK(name(after HIP LaunchKernel)) 129856505991997:129856507832334 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :486 129856512649603:129856513382084 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :487 -129856513384599:129856513384600 14696:14696 MARK(name(before HIP LaunchKernel)) 129856513388119:129856513389080 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :489 129856513391435:129856513392275 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :490 
129856513394697:129856513399367 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :491 -129856513401523:129856513401524 14696:14696 MARK(name(after HIP LaunchKernel)) 129856513404257:129856515239416 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :493 129856519992571:129856520793180 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :494 -129856520794974:129856520794975 14696:14696 MARK(name(before HIP LaunchKernel)) 129856520798420:129856520799070 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :496 129856520800911:129856520801530 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :497 129856520803611:129856520806841 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :498 -129856520808737:129856520808738 14696:14696 MARK(name(after HIP LaunchKernel)) 129856520810545:129856522657358 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :500 129856527425346:129856528218117 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :501 -129856528219874:129856528219875 14696:14696 MARK(name(before HIP LaunchKernel)) 129856528221975:129856528222627 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :503 129856528224439:129856528225291 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :504 129856528227108:129856528230172 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :505 -129856528231752:129856528231753 
14696:14696 MARK(name(after HIP LaunchKernel)) 129856528233473:129856530074548 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :507 129856534899214:129856535681957 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :508 -129856535683676:129856535683677 14696:14696 MARK(name(before HIP LaunchKernel)) 129856535686401:129856535687061 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :510 129856535688790:129856535689423 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :511 129856535691153:129856535694294 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :512 -129856535695868:129856535695869 14696:14696 MARK(name(after HIP LaunchKernel)) 129856535697671:129856537541753 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :514 129856542387175:129856543225418 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :515 -129856543227192:129856543227193 14696:14696 MARK(name(before HIP LaunchKernel)) 129856543230911:129856543231570 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :517 129856543233243:129856543233871 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :518 129856543235930:129856543238762 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :519 -129856543240359:129856543240360 14696:14696 MARK(name(after HIP LaunchKernel)) 129856543242179:129856545084137 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :521 129856549857104:129856550696919 14696:14696 hipMemcpy(dst=0x7fd65ce00000, 
src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :522 -129856550698874:129856550698875 14696:14696 MARK(name(before HIP LaunchKernel)) 129856550702196:129856550702852 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :524 129856550704612:129856550705254 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :525 129856550707079:129856550709869 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :526 -129856550711442:129856550711443 14696:14696 MARK(name(after HIP LaunchKernel)) 129856550713182:129856552568840 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :528 129856557336788:129856558182426 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :529 -129856558184195:129856558184196 14696:14696 MARK(name(before HIP LaunchKernel)) 129856558187727:129856558188380 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :531 129856558190122:129856558190752 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :532 129856558192774:129856558195554 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :533 -129856558197324:129856558197325 14696:14696 MARK(name(after HIP LaunchKernel)) 129856558199234:129856560041419 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :535 129856564809360:129856565545640 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :536 -129856565547393:129856565547394 14696:14696 MARK(name(before HIP LaunchKernel)) 129856565549636:129856565550299 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) 
:538 129856565551969:129856565552581 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :539 129856565554301:129856565557438 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :540 -129856565559047:129856565559048 14696:14696 MARK(name(after HIP LaunchKernel)) 129856565560847:129856567411065 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :542 129856572215770:129856572957492 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :543 -129856572959234:129856572959235 14696:14696 MARK(name(before HIP LaunchKernel)) 129856572962526:129856572963184 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :545 129856572964912:129856572965546 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :546 129856572967421:129856572970453 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :547 -129856572972097:129856572972098 14696:14696 MARK(name(after HIP LaunchKernel)) 129856572974076:129856574823083 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :549 129856579588261:129856580372449 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :550 -129856580374262:129856580374263 14696:14696 MARK(name(before HIP LaunchKernel)) 129856580376547:129856580377227 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :552 129856580378975:129856580379619 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :553 129856580381546:129856580384467 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, 
args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :554 -129856580386225:129856580386226 14696:14696 MARK(name(after HIP LaunchKernel)) 129856580388205:129856582240020 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :556 129856587022783:129856587805709 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :557 -129856587807440:129856587807441 14696:14696 MARK(name(before HIP LaunchKernel)) 129856587811171:129856587811825 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :559 129856587813530:129856587814170 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :560 129856587816040:129856587819243 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :561 -129856587820912:129856587820913 14696:14696 MARK(name(after HIP LaunchKernel)) 129856587822927:129856589666874 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :563 129856594433516:129856595273993 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :564 -129856595275800:129856595275801 14696:14696 MARK(name(before HIP LaunchKernel)) 129856595278990:129856595279652 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :566 129856595281384:129856595282018 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :567 129856595283991:129856595287449 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :568 -129856595289101:129856595289102 14696:14696 MARK(name(after HIP LaunchKernel)) 129856595291045:129856597140491 14696:14696 hipMemcpy(dst=0x7fd65d707010, 
src=0x7fd65c800000, sizeBytes=4194304, kind=2) :570 129856601919460:129856602754655 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :571 -129856602756445:129856602756446 14696:14696 MARK(name(before HIP LaunchKernel)) 129856602769740:129856602770661 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :573 129856602772396:129856602773016 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :574 129856602775079:129856602778192 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :575 -129856602779755:129856602779756 14696:14696 MARK(name(after HIP LaunchKernel)) PASSED! ## Iteration (17) ################# PASSED! @@ -1055,97 +891,71 @@ PASSED! 129856699769937:129856701569372 0:0 CopyDeviceToHost:668:14696 129856602781709:129856604636152 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :577 129856609479851:129856610321075 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :578 -129856610323078:129856610323079 14696:14696 MARK(name(before HIP LaunchKernel)) 129856610326500:129856610327162 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :580 129856610328857:129856610329498 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :581 129856610331492:129856610334664 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :582 -129856610336290:129856610336291 14696:14696 MARK(name(after HIP LaunchKernel)) 129856610338048:129856612222255 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :584 129856616969217:129856617705105 14696:14696 
hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :585 -129856617706989:129856617706990 14696:14696 MARK(name(before HIP LaunchKernel)) 129856617710485:129856617711142 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :587 129856617712846:129856617713491 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :588 129856617715518:129856617718644 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :589 -129856617720274:129856617720275 14696:14696 MARK(name(after HIP LaunchKernel)) 129856617722118:129856619570993 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :591 129856624331436:129856625292310 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :592 -129856625294207:129856625294208 14696:14696 MARK(name(before HIP LaunchKernel)) 129856625297113:129856625297761 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :594 129856625299459:129856625300093 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :595 129856625301835:129856625305409 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :596 -129856625307116:129856625307117 14696:14696 MARK(name(after HIP LaunchKernel)) 129856625309051:129856627159676 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :598 129856631962417:129856632745795 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :599 -129856632747622:129856632747623 14696:14696 MARK(name(before HIP LaunchKernel)) 129856632761013:129856632761762 14696:14696 __hipPushCallConfiguration(gridDim={}, 
blockDim={}, sharedMem=0, stream=0) :601 129856632763565:129856632764219 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :602 129856632766094:129856632769110 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :603 -129856632770707:129856632770708 14696:14696 MARK(name(after HIP LaunchKernel)) 129856632772662:129856634610068 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :605 129856639375744:129856640154106 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :606 -129856640155933:129856640155934 14696:14696 MARK(name(before HIP LaunchKernel)) 129856640159565:129856640160216 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :608 129856640161841:129856640162476 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :609 129856640164410:129856640167293 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :610 -129856640168886:129856640168887 14696:14696 MARK(name(after HIP LaunchKernel)) 129856640170703:129856642054780 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :612 129856646841774:129856647623131 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :613 -129856647624849:129856647624850 14696:14696 MARK(name(before HIP LaunchKernel)) 129856647628076:129856647628742 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :615 129856647630426:129856647631050 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :616 129856647632957:129856647636281 14696:14696 hipLaunchKernel(function_address=0x4010c0, 
numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :617 -129856647637872:129856647637873 14696:14696 MARK(name(after HIP LaunchKernel)) 129856647639599:129856649488719 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :619 129856654273909:129856655105030 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :620 -129856655106878:129856655106879 14696:14696 MARK(name(before HIP LaunchKernel)) 129856655109847:129856655110497 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :622 129856655112292:129856655112914 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :623 129856655114757:129856655118162 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :624 -129856655119835:129856655119836 14696:14696 MARK(name(after HIP LaunchKernel)) 129856655121792:129856656973292 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :626 PASSED! 
## Iteration (4) ################# 129856661755424:129856662589447 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :627 -129856662591236:129856662591237 14696:14696 MARK(name(before HIP LaunchKernel)) 129856662604066:129856662604831 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :629 129856662606611:129856662607261 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :630 129856662608995:129856662611988 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :631 -129856662613644:129856662613645 14696:14696 MARK(name(after HIP LaunchKernel)) 129856662615584:129856664462467 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :633 129856669256336:129856670039683 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :634 -129856670041634:129856670041635 14696:14696 MARK(name(before HIP LaunchKernel)) 129856670054499:129856670055254 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :636 129856670056982:129856670057615 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :637 129856670059351:129856670062513 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :638 -129856670064113:129856670064114 14696:14696 MARK(name(after HIP LaunchKernel)) 129856670066200:129856671906923 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :640 129856676668791:129856677404223 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :641 -129856677406068:129856677406069 14696:14696 MARK(name(before HIP LaunchKernel)) 
129856677408812:129856677409484 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :643 129856677411095:129856677411722 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :644 129856677413461:129856677416941 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :645 -129856677418503:129856677418504 14696:14696 MARK(name(after HIP LaunchKernel)) 129856677420242:129856679269939 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :647 129856684019418:129856684826552 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :648 -129856684828363:129856684828364 14696:14696 MARK(name(before HIP LaunchKernel)) 129856684832034:129856684832695 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :650 129856684834368:129856684834970 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :651 129856684836877:129856684839963 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :652 -129856684841560:129856684841561 14696:14696 MARK(name(after HIP LaunchKernel)) 129856684843320:129856686688518 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :654 129856691504696:129856692288950 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :655 -129856692290798:129856692290799 14696:14696 MARK(name(before HIP LaunchKernel)) 129856692292859:129856692293513 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :657 129856692295227:129856692295860 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :658 
129856692297819:129856692300821 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :659 -129856692302355:129856692302356 14696:14696 MARK(name(after HIP LaunchKernel)) 129856692304530:129856694153679 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :661 129856698928289:129856699716162 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :662 -129856699717890:129856699717891 14696:14696 MARK(name(before HIP LaunchKernel)) 129856699720061:129856699720715 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :664 129856699722330:129856699722941 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :665 129856699724836:129856699728198 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :666 -129856699729953:129856699729954 14696:14696 MARK(name(after HIP LaunchKernel)) 129856699731887:129856701581422 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :668 PASSED! ## Iteration (3) ################# @@ -1172,39 +982,29 @@ PASSED! 
129856737053267:129856738276147 0:0 KernelExecution:701:14696 129856737025461:129856738822547 0:0 CopyDeviceToHost:703:14696 129856706409352:129856707238410 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :669 -129856707240341:129856707240342 14696:14696 MARK(name(before HIP LaunchKernel)) 129856707253495:129856707254390 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :671 129856707256214:129856707256878 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :672 129856707258659:129856707261885 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :673 -129856707263518:129856707263519 14696:14696 MARK(name(after HIP LaunchKernel)) 129856707265698:129856709110388 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :675 129856713891418:129856714734007 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :676 -129856714735794:129856714735795 14696:14696 MARK(name(before HIP LaunchKernel)) 129856714739058:129856714739715 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :678 129856714741339:129856714741972 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :679 129856714743986:129856714747316 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :680 -129856714748993:129856714748994 14696:14696 MARK(name(after HIP LaunchKernel)) 129856714750976:129856716607126 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :682 129856721364192:129856722196489 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :683 
-129856722198322:129856722198323 14696:14696 MARK(name(before HIP LaunchKernel)) 129856722202102:129856722202759 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :685 129856722204452:129856722205080 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :686 129856722207098:129856722210100 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :687 -129856722211652:129856722211653 14696:14696 MARK(name(after HIP LaunchKernel)) 129856722213452:129856724068250 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :689 129856728873958:129856729610520 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :690 -129856729612474:129856729612475 14696:14696 MARK(name(before HIP LaunchKernel)) 129856729615953:129856729616618 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :692 129856729618275:129856729618880 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :693 129856729620844:129856729623983 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :694 -129856729625525:129856729625526 14696:14696 MARK(name(after HIP LaunchKernel)) 129856729627363:129856731472859 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :696 129856736212718:129856736966611 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :697 -129856736968384:129856736968385 14696:14696 MARK(name(before HIP LaunchKernel)) 129856736971498:129856736972186 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :699 129856736973934:129856736974581 14696:14696 
__hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :700 129856736976433:129856736979849 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :701 -129856736981559:129856736981560 14696:14696 MARK(name(after HIP LaunchKernel)) 129856736983603:129856738834349 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :703 129856743571751:129856743607276 14696:14696 hipFree(ptr=0x7fd65ce00000) :704 129856743609591:129856743621235 14696:14696 hipFree(ptr=0x7fd65c800000) :705 diff --git a/test/golden_traces/MatrixTranspose_hip_input_trace.txt b/test/golden_traces/MatrixTranspose_hip_input_trace.txt index 8f68254c..26138161 100644 --- a/test/golden_traces/MatrixTranspose_hip_input_trace.txt +++ b/test/golden_traces/MatrixTranspose_hip_input_trace.txt @@ -408,404 +408,204 @@ PASSED! 4496524903280142:4496524903426608 880592:880592 hipMalloc(ptr=0x7f14c3000000, size=4194304) :1 4496524903446365:4496524903573365 880592:880592 hipMalloc(ptr=0x7f14c2800000, size=4194304) :2 4496524903588203:4496525133627902 880592:880592 hipMemcpy(dst=0x7f14c3000000, src=0x7f14c3cff010, sizeBytes=4194304, kind=1) :3 -4496525133635787:4496525133635788 880592:880592 MARK(name(before HIP LaunchKernel)) -4496525134199640:4496525134199641 880592:880592 MARK(name(after HIP LaunchKernel)) 4496525134207305:4496525140607184 880592:880592 hipMemcpy(dst=0x7f14c38fe010, src=0x7f14c2800000, sizeBytes=4194304, kind=2) :6 4496525154755917:4496525158532879 880592:880592 hipMemcpy(dst=0x7f14c3000000, src=0x7f14c3cff010, sizeBytes=4194304, kind=1) :7 -4496525158535584:4496525158535585 880592:880592 MARK(name(before HIP LaunchKernel)) -4496525158550863:4496525158550864 880592:880592 MARK(name(after HIP LaunchKernel)) 4496525158552125:4496525163335997 880592:880592 hipMemcpy(dst=0x7f14c38fe010, src=0x7f14c2800000, sizeBytes=4194304, kind=2) 
:10 4496525175814741:4496525179506102 880592:880592 hipMemcpy(dst=0x7f14c3000000, src=0x7f14c3cff010, sizeBytes=4194304, kind=1) :11 -4496525179507855:4496525179507856 880592:880592 MARK(name(before HIP LaunchKernel)) -4496525179517513:4496525179517514 880592:880592 MARK(name(after HIP LaunchKernel)) 4496525179519266:4496525184300123 880592:880592 hipMemcpy(dst=0x7f14c38fe010, src=0x7f14c2800000, sizeBytes=4194304, kind=2) :14 4496525196393148:4496525200179318 880592:880592 hipMemcpy(dst=0x7f14c3000000, src=0x7f14c3cff010, sizeBytes=4194304, kind=1) :15 -4496525200180801:4496525200180802 880592:880592 MARK(name(before HIP LaunchKernel)) -4496525200188466:4496525200188467 880592:880592 MARK(name(after HIP LaunchKernel)) 4496525200189638:4496525204936449 880592:880592 hipMemcpy(dst=0x7f14c38fe010, src=0x7f14c2800000, sizeBytes=4194304, kind=2) :18 4496525216744046:4496525220425409 880592:880592 hipMemcpy(dst=0x7f14c3000000, src=0x7f14c3cff010, sizeBytes=4194304, kind=1) :19 -4496525220430760:4496525220430761 880592:880592 MARK(name(before HIP LaunchKernel)) -4496525220437733:4496525220437734 880592:880592 MARK(name(after HIP LaunchKernel)) 4496525220438995:4496525225172542 880592:880592 hipMemcpy(dst=0x7f14c38fe010, src=0x7f14c2800000, sizeBytes=4194304, kind=2) :22 4496525236900238:4496525240619832 880592:880592 hipMemcpy(dst=0x7f14c3000000, src=0x7f14c3cff010, sizeBytes=4194304, kind=1) :23 -4496525240621355:4496525240621356 880592:880592 MARK(name(before HIP LaunchKernel)) -4496525240632276:4496525240632277 880592:880592 MARK(name(after HIP LaunchKernel)) 4496525240633478:4496525245363489 880592:880592 hipMemcpy(dst=0x7f14c38fe010, src=0x7f14c2800000, sizeBytes=4194304, kind=2) :26 4496525257076899:4496525260752009 880592:880592 hipMemcpy(dst=0x7f14c3000000, src=0x7f14c3cff010, sizeBytes=4194304, kind=1) :27 -4496525260753792:4496525260753793 880592:880592 MARK(name(before HIP LaunchKernel)) -4496525260764202:4496525260764203 880592:880592 MARK(name(after HIP 
LaunchKernel)) 4496525260765474:4496525265528037 880592:880592 hipMemcpy(dst=0x7f14c38fe010, src=0x7f14c2800000, sizeBytes=4194304, kind=2) :30 4496525278601381:4496525282344690 880592:880592 hipMemcpy(dst=0x7f14c3000000, src=0x7f14c3cff010, sizeBytes=4194304, kind=1) :31 -4496525282346172:4496525282346173 880592:880592 MARK(name(before HIP LaunchKernel)) -4496525282352745:4496525282352746 880592:880592 MARK(name(after HIP LaunchKernel)) 4496525282356923:4496525287062988 880592:880592 hipMemcpy(dst=0x7f14c38fe010, src=0x7f14c2800000, sizeBytes=4194304, kind=2) :34 4496525298199228:4496525301976453 880592:880592 hipMemcpy(dst=0x7f14c3000000, src=0x7f14c3cff010, sizeBytes=4194304, kind=1) :35 -4496525301977805:4496525301977806 880592:880592 MARK(name(before HIP LaunchKernel)) -4496525301985179:4496525301985180 880592:880592 MARK(name(after HIP LaunchKernel)) 4496525301989357:4496525306701493 880592:880592 hipMemcpy(dst=0x7f14c38fe010, src=0x7f14c2800000, sizeBytes=4194304, kind=2) :38 4496525319442590:4496525323149401 880592:880592 hipMemcpy(dst=0x7f14c3000000, src=0x7f14c3cff010, sizeBytes=4194304, kind=1) :39 -4496525323150894:4496525323150895 880592:880592 MARK(name(before HIP LaunchKernel)) -4496525323157957:4496525323157958 880592:880592 MARK(name(after HIP LaunchKernel)) 4496525323159049:4496525327878419 880592:880592 hipMemcpy(dst=0x7f14c38fe010, src=0x7f14c2800000, sizeBytes=4194304, kind=2) :42 4496525340212129:4496525343964345 880592:880592 hipMemcpy(dst=0x7f14c3000000, src=0x7f14c3cff010, sizeBytes=4194304, kind=1) :43 -4496525343965688:4496525343965689 880592:880592 MARK(name(before HIP LaunchKernel)) -4496525343972501:4496525343972502 880592:880592 MARK(name(after HIP LaunchKernel)) 4496525343973583:4496525348741845 880592:880592 hipMemcpy(dst=0x7f14c38fe010, src=0x7f14c2800000, sizeBytes=4194304, kind=2) :46 4496525360729852:4496525364398150 880592:880592 hipMemcpy(dst=0x7f14c3000000, src=0x7f14c3cff010, sizeBytes=4194304, kind=1) :47 
-4496525364403590:4496525364403591 880592:880592 MARK(name(before HIP LaunchKernel)) -4496525364410854:4496525364410855 880592:880592 MARK(name(after HIP LaunchKernel)) 4496525364412076:4496525369144271 880592:880592 hipMemcpy(dst=0x7f14c38fe010, src=0x7f14c2800000, sizeBytes=4194304, kind=2) :50 4496525381014837:4496525384667765 880592:880592 hipMemcpy(dst=0x7f14c3000000, src=0x7f14c3cff010, sizeBytes=4194304, kind=1) :51 -4496525384672134:4496525384672135 880592:880592 MARK(name(before HIP LaunchKernel)) -4496525384679758:4496525384679759 880592:880592 MARK(name(after HIP LaunchKernel)) 4496525384680900:4496525389438431 880592:880592 hipMemcpy(dst=0x7f14c38fe010, src=0x7f14c2800000, sizeBytes=4194304, kind=2) :54 4496525401191155:4496525404934986 880592:880592 hipMemcpy(dst=0x7f14c3000000, src=0x7f14c3cff010, sizeBytes=4194304, kind=1) :55 -4496525404936389:4496525404936390 880592:880592 MARK(name(before HIP LaunchKernel)) -4496525404945757:4496525404945758 880592:880592 MARK(name(after HIP LaunchKernel)) 4496525404946919:4496525410298471 880592:880592 hipMemcpy(dst=0x7f14c38fe010, src=0x7f14c2800000, sizeBytes=4194304, kind=2) :58 4496525423502872:4496525427227196 880592:880592 hipMemcpy(dst=0x7f14c3000000, src=0x7f14c3cff010, sizeBytes=4194304, kind=1) :59 -4496525427228929:4496525427228930 880592:880592 MARK(name(before HIP LaunchKernel)) -4496525427240851:4496525427240852 880592:880592 MARK(name(after HIP LaunchKernel)) 4496525427242074:4496525431931367 880592:880592 hipMemcpy(dst=0x7f14c38fe010, src=0x7f14c2800000, sizeBytes=4194304, kind=2) :62 4496525444940650:4496525448596826 880592:880592 hipMemcpy(dst=0x7f14c3000000, src=0x7f14c3cff010, sizeBytes=4194304, kind=1) :63 -4496525448598570:4496525448598571 880592:880592 MARK(name(before HIP LaunchKernel)) -4496525448605503:4496525448605504 880592:880592 MARK(name(after HIP LaunchKernel)) 4496525448609119:4496525453319692 880592:880592 hipMemcpy(dst=0x7f14c38fe010, src=0x7f14c2800000, sizeBytes=4194304, 
kind=2) :66 4496525464825952:4496525468507253 880592:880592 hipMemcpy(dst=0x7f14c3000000, src=0x7f14c3cff010, sizeBytes=4194304, kind=1) :67 -4496525468508586:4496525468508587 880592:880592 MARK(name(before HIP LaunchKernel)) -4496525468515599:4496525468515600 880592:880592 MARK(name(after HIP LaunchKernel)) 4496525468516811:4496525473221545 880592:880592 hipMemcpy(dst=0x7f14c38fe010, src=0x7f14c2800000, sizeBytes=4194304, kind=2) :70 4496525484776165:4496525488428212 880592:880592 hipMemcpy(dst=0x7f14c3000000, src=0x7f14c3cff010, sizeBytes=4194304, kind=1) :71 -4496525488429845:4496525488429846 880592:880592 MARK(name(before HIP LaunchKernel)) -4496525488437419:4496525488437420 880592:880592 MARK(name(after HIP LaunchKernel)) 4496525488438561:4496525493128415 880592:880592 hipMemcpy(dst=0x7f14c38fe010, src=0x7f14c2800000, sizeBytes=4194304, kind=2) :74 4496525504724173:4496525508381501 880592:880592 hipMemcpy(dst=0x7f14c3000000, src=0x7f14c3cff010, sizeBytes=4194304, kind=1) :75 -4496525508383023:4496525508383024 880592:880592 MARK(name(before HIP LaunchKernel)) -4496525508389967:4496525508389968 880592:880592 MARK(name(after HIP LaunchKernel)) 4496525508391049:4496525513111039 880592:880592 hipMemcpy(dst=0x7f14c38fe010, src=0x7f14c2800000, sizeBytes=4194304, kind=2) :78 4496525524703772:4496525528368253 880592:880592 hipMemcpy(dst=0x7f14c3000000, src=0x7f14c3cff010, sizeBytes=4194304, kind=1) :79 -4496525528373653:4496525528373654 880592:880592 MARK(name(before HIP LaunchKernel)) -4496525528380236:4496525528380237 880592:880592 MARK(name(after HIP LaunchKernel)) 4496525528381448:4496525533087222 880592:880592 hipMemcpy(dst=0x7f14c38fe010, src=0x7f14c2800000, sizeBytes=4194304, kind=2) :82 4496525544743344:4496525548393326 880592:880592 hipMemcpy(dst=0x7f14c3000000, src=0x7f14c3cff010, sizeBytes=4194304, kind=1) :83 -4496525548394819:4496525548394820 880592:880592 MARK(name(before HIP LaunchKernel)) -4496525548404588:4496525548404589 880592:880592 MARK(name(after 
HIP LaunchKernel)) 4496525548405820:4496525553108910 880592:880592 hipMemcpy(dst=0x7f14c38fe010, src=0x7f14c2800000, sizeBytes=4194304, kind=2) :86 4496525564776353:4496525568419633 880592:880592 hipMemcpy(dst=0x7f14c3000000, src=0x7f14c3cff010, sizeBytes=4194304, kind=1) :87 -4496525568421026:4496525568421027 880592:880592 MARK(name(before HIP LaunchKernel)) -4496525568430434:4496525568430435 880592:880592 MARK(name(after HIP LaunchKernel)) 4496525568431526:4496525573146797 880592:880592 hipMemcpy(dst=0x7f14c38fe010, src=0x7f14c2800000, sizeBytes=4194304, kind=2) :90 4496525584773363:4496525588428076 880592:880592 hipMemcpy(dst=0x7f14c3000000, src=0x7f14c3cff010, sizeBytes=4194304, kind=1) :91 -4496525588429449:4496525588429450 880592:880592 MARK(name(before HIP LaunchKernel)) -4496525588436592:4496525588436593 880592:880592 MARK(name(after HIP LaunchKernel)) 4496525588441251:4496525593151694 880592:880592 hipMemcpy(dst=0x7f14c38fe010, src=0x7f14c2800000, sizeBytes=4194304, kind=2) :94 4496525604951276:4496525608612772 880592:880592 hipMemcpy(dst=0x7f14c3000000, src=0x7f14c3cff010, sizeBytes=4194304, kind=1) :95 -4496525608614626:4496525608614627 880592:880592 MARK(name(before HIP LaunchKernel)) -4496525608621939:4496525608621940 880592:880592 MARK(name(after HIP LaunchKernel)) 4496525608626318:4496525613335408 880592:880592 hipMemcpy(dst=0x7f14c38fe010, src=0x7f14c2800000, sizeBytes=4194304, kind=2) :98 4496525624925244:4496525628593993 880592:880592 hipMemcpy(dst=0x7f14c3000000, src=0x7f14c3cff010, sizeBytes=4194304, kind=1) :99 -4496525628595485:4496525628595486 880592:880592 MARK(name(before HIP LaunchKernel)) -4496525628603070:4496525628603071 880592:880592 MARK(name(after HIP LaunchKernel)) 4496525628604623:4496525633324024 880592:880592 hipMemcpy(dst=0x7f14c38fe010, src=0x7f14c2800000, sizeBytes=4194304, kind=2) :102 4496525644955148:4496525648653793 880592:880592 hipMemcpy(dst=0x7f14c3000000, src=0x7f14c3cff010, sizeBytes=4194304, kind=1) :103 
-4496525648655416:4496525648655417 880592:880592 MARK(name(before HIP LaunchKernel)) -4496525648662599:4496525648662600 880592:880592 MARK(name(after HIP LaunchKernel)) 4496525648663752:4496525653416343 880592:880592 hipMemcpy(dst=0x7f14c38fe010, src=0x7f14c2800000, sizeBytes=4194304, kind=2) :106 4496525665022200:4496525668655764 880592:880592 hipMemcpy(dst=0x7f14c3000000, src=0x7f14c3cff010, sizeBytes=4194304, kind=1) :107 -4496525668661865:4496525668661866 880592:880592 MARK(name(before HIP LaunchKernel)) -4496525668668568:4496525668668569 880592:880592 MARK(name(after HIP LaunchKernel)) 4496525668669820:4496525673415239 880592:880592 hipMemcpy(dst=0x7f14c38fe010, src=0x7f14c2800000, sizeBytes=4194304, kind=2) :110 4496525685021526:4496525688656242 880592:880592 hipMemcpy(dst=0x7f14c3000000, src=0x7f14c3cff010, sizeBytes=4194304, kind=1) :111 -4496525688661842:4496525688661843 880592:880592 MARK(name(before HIP LaunchKernel)) -4496525688668305:4496525688668306 880592:880592 MARK(name(after HIP LaunchKernel)) 4496525688669547:4496525693410117 880592:880592 hipMemcpy(dst=0x7f14c38fe010, src=0x7f14c2800000, sizeBytes=4194304, kind=2) :114 4496525705048605:4496525708697446 880592:880592 hipMemcpy(dst=0x7f14c3000000, src=0x7f14c3cff010, sizeBytes=4194304, kind=1) :115 -4496525708698938:4496525708698939 880592:880592 MARK(name(before HIP LaunchKernel)) -4496525708709679:4496525708709680 880592:880592 MARK(name(after HIP LaunchKernel)) 4496525708710921:4496525713450741 880592:880592 hipMemcpy(dst=0x7f14c38fe010, src=0x7f14c2800000, sizeBytes=4194304, kind=2) :118 4496525725057529:4496525728701911 880592:880592 hipMemcpy(dst=0x7f14c3000000, src=0x7f14c3cff010, sizeBytes=4194304, kind=1) :119 -4496525728703444:4496525728703445 880592:880592 MARK(name(before HIP LaunchKernel)) -4496525728713643:4496525728713644 880592:880592 MARK(name(after HIP LaunchKernel)) 4496525728714956:4496525733453982 880592:880592 hipMemcpy(dst=0x7f14c38fe010, src=0x7f14c2800000, 
sizeBytes=4194304, kind=2) :122 4496525745101558:4496525748716245 880592:880592 hipMemcpy(dst=0x7f14c3000000, src=0x7f14c3cff010, sizeBytes=4194304, kind=1) :123 -4496525748717458:4496525748717459 880592:880592 MARK(name(before HIP LaunchKernel)) -4496525748724611:4496525748724612 880592:880592 MARK(name(after HIP LaunchKernel)) 4496525748728729:4496525753469559 880592:880592 hipMemcpy(dst=0x7f14c38fe010, src=0x7f14c2800000, sizeBytes=4194304, kind=2) :126 4496525765097578:4496525768729748 880592:880592 hipMemcpy(dst=0x7f14c3000000, src=0x7f14c3cff010, sizeBytes=4194304, kind=1) :127 -4496525768730961:4496525768730962 880592:880592 MARK(name(before HIP LaunchKernel)) -4496525768738184:4496525768738185 880592:880592 MARK(name(after HIP LaunchKernel)) 4496525768739346:4496525773486869 880592:880592 hipMemcpy(dst=0x7f14c38fe010, src=0x7f14c2800000, sizeBytes=4194304, kind=2) :130 4496525785153981:4496525788889836 880592:880592 hipMemcpy(dst=0x7f14c3000000, src=0x7f14c3cff010, sizeBytes=4194304, kind=1) :131 -4496525788891139:4496525788891140 880592:880592 MARK(name(before HIP LaunchKernel)) -4496525788898142:4496525788898143 880592:880592 MARK(name(after HIP LaunchKernel)) 4496525788899334:4496525793625478 880592:880592 hipMemcpy(dst=0x7f14c38fe010, src=0x7f14c2800000, sizeBytes=4194304, kind=2) :134 4496525806694594:4496525810379663 880592:880592 hipMemcpy(dst=0x7f14c3000000, src=0x7f14c3cff010, sizeBytes=4194304, kind=1) :135 -4496525810381096:4496525810381097 880592:880592 MARK(name(before HIP LaunchKernel)) -4496525810388580:4496525810388581 880592:880592 MARK(name(after HIP LaunchKernel)) 4496525810389762:4496525815062083 880592:880592 hipMemcpy(dst=0x7f14c38fe010, src=0x7f14c2800000, sizeBytes=4194304, kind=2) :138 4496525829025717:4496525832652416 880592:880592 hipMemcpy(dst=0x7f14c3000000, src=0x7f14c3cff010, sizeBytes=4194304, kind=1) :139 -4496525832657686:4496525832657687 880592:880592 MARK(name(before HIP LaunchKernel)) -4496525832664930:4496525832664931 
880592:880592 MARK(name(after HIP LaunchKernel)) 4496525832666232:4496525838020960 880592:880592 hipMemcpy(dst=0x7f14c38fe010, src=0x7f14c2800000, sizeBytes=4194304, kind=2) :142 4496525851335449:4496525855079079 880592:880592 hipMemcpy(dst=0x7f14c3000000, src=0x7f14c3cff010, sizeBytes=4194304, kind=1) :143 -4496525855085421:4496525855085422 880592:880592 MARK(name(before HIP LaunchKernel)) -4496525855093967:4496525855093968 880592:880592 MARK(name(after HIP LaunchKernel)) 4496525855095120:4496525859768263 880592:880592 hipMemcpy(dst=0x7f14c38fe010, src=0x7f14c2800000, sizeBytes=4194304, kind=2) :146 4496525873262060:4496525876972367 880592:880592 hipMemcpy(dst=0x7f14c3000000, src=0x7f14c3cff010, sizeBytes=4194304, kind=1) :147 -4496525876974080:4496525876974081 880592:880592 MARK(name(before HIP LaunchKernel)) -4496525876984951:4496525876984952 880592:880592 MARK(name(after HIP LaunchKernel)) 4496525876986173:4496525881732493 880592:880592 hipMemcpy(dst=0x7f14c38fe010, src=0x7f14c2800000, sizeBytes=4194304, kind=2) :150 4496525894813432:4496525898344842 880592:880592 hipMemcpy(dst=0x7f14c3000000, src=0x7f14c3cff010, sizeBytes=4194304, kind=1) :151 -4496525898346185:4496525898346186 880592:880592 MARK(name(before HIP LaunchKernel)) -4496525898353449:4496525898353450 880592:880592 MARK(name(after HIP LaunchKernel)) 4496525898357967:4496525903025769 880592:880592 hipMemcpy(dst=0x7f14c38fe010, src=0x7f14c2800000, sizeBytes=4194304, kind=2) :154 4496525914049899:4496525917708138 880592:880592 hipMemcpy(dst=0x7f14c3000000, src=0x7f14c3cff010, sizeBytes=4194304, kind=1) :155 -4496525917709640:4496525917709641 880592:880592 MARK(name(before HIP LaunchKernel)) -4496525917717645:4496525917717646 880592:880592 MARK(name(after HIP LaunchKernel)) 4496525917721533:4496525922419974 880592:880592 hipMemcpy(dst=0x7f14c38fe010, src=0x7f14c2800000, sizeBytes=4194304, kind=2) :158 4496525933083213:4496525936861418 880592:880592 hipMemcpy(dst=0x7f14c3000000, src=0x7f14c3cff010, 
sizeBytes=4194304, kind=1) :159 -4496525936863392:4496525936863393 880592:880592 MARK(name(before HIP LaunchKernel)) -4496525936870265:4496525936870266 880592:880592 MARK(name(after HIP LaunchKernel)) 4496525936871808:4496525941569528 880592:880592 hipMemcpy(dst=0x7f14c38fe010, src=0x7f14c2800000, sizeBytes=4194304, kind=2) :162 4496525952271099:4496525956051338 880592:880592 hipMemcpy(dst=0x7f14c3000000, src=0x7f14c3cff010, sizeBytes=4194304, kind=1) :163 -4496525956052891:4496525956052892 880592:880592 MARK(name(before HIP LaunchKernel)) -4496525956060355:4496525956060356 880592:880592 MARK(name(after HIP LaunchKernel)) 4496525956061647:4496525960747143 880592:880592 hipMemcpy(dst=0x7f14c38fe010, src=0x7f14c2800000, sizeBytes=4194304, kind=2) :166 4496525973345872:4496525977113719 880592:880592 hipMemcpy(dst=0x7f14c3000000, src=0x7f14c3cff010, sizeBytes=4194304, kind=1) :167 -4496525977118778:4496525977118779 880592:880592 MARK(name(before HIP LaunchKernel)) -4496525977126052:4496525977126053 880592:880592 MARK(name(after HIP LaunchKernel)) 4496525977127134:4496525981829511 880592:880592 hipMemcpy(dst=0x7f14c38fe010, src=0x7f14c2800000, sizeBytes=4194304, kind=2) :170 4496525996273891:4496525999995089 880592:880592 hipMemcpy(dst=0x7f14c3000000, src=0x7f14c3cff010, sizeBytes=4194304, kind=1) :171 -4496526000000189:4496526000000190 880592:880592 MARK(name(before HIP LaunchKernel)) -4496526000007312:4496526000007313 880592:880592 MARK(name(after HIP LaunchKernel)) 4496526000008564:4496526004736400 880592:880592 hipMemcpy(dst=0x7f14c38fe010, src=0x7f14c2800000, sizeBytes=4194304, kind=2) :174 4496526016480127:4496526020175927 880592:880592 hipMemcpy(dst=0x7f14c3000000, src=0x7f14c3cff010, sizeBytes=4194304, kind=1) :175 -4496526020177280:4496526020177281 880592:880592 MARK(name(before HIP LaunchKernel)) -4496526020187950:4496526020187951 880592:880592 MARK(name(after HIP LaunchKernel)) 4496526020189152:4496526024907620 880592:880592 hipMemcpy(dst=0x7f14c38fe010, 
src=0x7f14c2800000, sizeBytes=4194304, kind=2) :178 4496526036737941:4496526040392402 880592:880592 hipMemcpy(dst=0x7f14c3000000, src=0x7f14c3cff010, sizeBytes=4194304, kind=1) :179 -4496526040393895:4496526040393896 880592:880592 MARK(name(before HIP LaunchKernel)) -4496526040404845:4496526040404846 880592:880592 MARK(name(after HIP LaunchKernel)) 4496526040406088:4496526045126310 880592:880592 hipMemcpy(dst=0x7f14c38fe010, src=0x7f14c2800000, sizeBytes=4194304, kind=2) :182 4496526056750972:4496526060387640 880592:880592 hipMemcpy(dst=0x7f14c3000000, src=0x7f14c3cff010, sizeBytes=4194304, kind=1) :183 -4496526060388933:4496526060388934 880592:880592 MARK(name(before HIP LaunchKernel)) -4496526060396076:4496526060396077 880592:880592 MARK(name(after HIP LaunchKernel)) 4496526060400274:4496526065106138 880592:880592 hipMemcpy(dst=0x7f14c38fe010, src=0x7f14c2800000, sizeBytes=4194304, kind=2) :186 4496526076767389:4496526080447340 880592:880592 hipMemcpy(dst=0x7f14c3000000, src=0x7f14c3cff010, sizeBytes=4194304, kind=1) :187 -4496526080448782:4496526080448783 880592:880592 MARK(name(before HIP LaunchKernel)) -4496526080455275:4496526080455276 880592:880592 MARK(name(after HIP LaunchKernel)) 4496526080456357:4496526085186276 880592:880592 hipMemcpy(dst=0x7f14c38fe010, src=0x7f14c2800000, sizeBytes=4194304, kind=2) :190 4496526096786342:4496526100434311 880592:880592 hipMemcpy(dst=0x7f14c3000000, src=0x7f14c3cff010, sizeBytes=4194304, kind=1) :191 -4496526100435814:4496526100435815 880592:880592 MARK(name(before HIP LaunchKernel)) -4496526100442497:4496526100442498 880592:880592 MARK(name(after HIP LaunchKernel)) 4496526100443629:4496526105161156 880592:880592 hipMemcpy(dst=0x7f14c38fe010, src=0x7f14c2800000, sizeBytes=4194304, kind=2) :194 4496526116801218:4496526120501776 880592:880592 hipMemcpy(dst=0x7f14c3000000, src=0x7f14c3cff010, sizeBytes=4194304, kind=1) :195 -4496526120503329:4496526120503330 880592:880592 MARK(name(before HIP LaunchKernel)) 
-4496526120510072:4496526120510073 880592:880592 MARK(name(after HIP LaunchKernel)) 4496526120511514:4496526125217650 880592:880592 hipMemcpy(dst=0x7f14c38fe010, src=0x7f14c2800000, sizeBytes=4194304, kind=2) :198 4496526136832464:4496526140494920 880592:880592 hipMemcpy(dst=0x7f14c3000000, src=0x7f14c3cff010, sizeBytes=4194304, kind=1) :199 -4496526140500110:4496526140500111 880592:880592 MARK(name(before HIP LaunchKernel)) -4496526140507885:4496526140507886 880592:880592 MARK(name(after HIP LaunchKernel)) 4496526140509047:4496526145220241 880592:880592 hipMemcpy(dst=0x7f14c38fe010, src=0x7f14c2800000, sizeBytes=4194304, kind=2) :202 4496526156843100:4496526160502872 880592:880592 hipMemcpy(dst=0x7f14c3000000, src=0x7f14c3cff010, sizeBytes=4194304, kind=1) :203 -4496526160507441:4496526160507442 880592:880592 MARK(name(before HIP LaunchKernel)) -4496526160514635:4496526160514636 880592:880592 MARK(name(after HIP LaunchKernel)) 4496526160515847:4496526165229165 880592:880592 hipMemcpy(dst=0x7f14c38fe010, src=0x7f14c2800000, sizeBytes=4194304, kind=2) :206 4496526176867433:4496526180510052 880592:880592 hipMemcpy(dst=0x7f14c3000000, src=0x7f14c3cff010, sizeBytes=4194304, kind=1) :207 -4496526180511384:4496526180511385 880592:880592 MARK(name(before HIP LaunchKernel)) -4496526180521073:4496526180521074 880592:880592 MARK(name(after HIP LaunchKernel)) 4496526180522345:4496526185229233 880592:880592 hipMemcpy(dst=0x7f14c38fe010, src=0x7f14c2800000, sizeBytes=4194304, kind=2) :210 4496526196882068:4496526200511312 880592:880592 hipMemcpy(dst=0x7f14c3000000, src=0x7f14c3cff010, sizeBytes=4194304, kind=1) :211 -4496526200512644:4496526200512645 880592:880592 MARK(name(before HIP LaunchKernel)) -4496526200519086:4496526200519087 880592:880592 MARK(name(after HIP LaunchKernel)) 4496526200522904:4496526205229030 880592:880592 hipMemcpy(dst=0x7f14c38fe010, src=0x7f14c2800000, sizeBytes=4194304, kind=2) :214 4496526216885692:4496526220532459 880592:880592 
hipMemcpy(dst=0x7f14c3000000, src=0x7f14c3cff010, sizeBytes=4194304, kind=1) :215 -4496526220533741:4496526220533742 880592:880592 MARK(name(before HIP LaunchKernel)) -4496526220540234:4496526220540235 880592:880592 MARK(name(after HIP LaunchKernel)) 4496526220544051:4496526225248913 880592:880592 hipMemcpy(dst=0x7f14c38fe010, src=0x7f14c2800000, sizeBytes=4194304, kind=2) :218 4496526236953526:4496526240608640 880592:880592 hipMemcpy(dst=0x7f14c3000000, src=0x7f14c3cff010, sizeBytes=4194304, kind=1) :219 -4496526240610032:4496526240610033 880592:880592 MARK(name(before HIP LaunchKernel)) -4496526240617316:4496526240617317 880592:880592 MARK(name(after HIP LaunchKernel)) 4496526240618608:4496526245336946 880592:880592 hipMemcpy(dst=0x7f14c38fe010, src=0x7f14c2800000, sizeBytes=4194304, kind=2) :222 4496526257032562:4496526260697764 880592:880592 hipMemcpy(dst=0x7f14c3000000, src=0x7f14c3cff010, sizeBytes=4194304, kind=1) :223 -4496526260699086:4496526260699087 880592:880592 MARK(name(before HIP LaunchKernel)) -4496526260705799:4496526260705800 880592:880592 MARK(name(after HIP LaunchKernel)) 4496526260706901:4496526266068673 880592:880592 hipMemcpy(dst=0x7f14c38fe010, src=0x7f14c2800000, sizeBytes=4194304, kind=2) :226 4496526279332747:4496526283108167 880592:880592 hipMemcpy(dst=0x7f14c3000000, src=0x7f14c3cff010, sizeBytes=4194304, kind=1) :227 -4496526283114379:4496526283114380 880592:880592 MARK(name(before HIP LaunchKernel)) -4496526283122765:4496526283122766 880592:880592 MARK(name(after HIP LaunchKernel)) 4496526283124067:4496526287796549 880592:880592 hipMemcpy(dst=0x7f14c38fe010, src=0x7f14c2800000, sizeBytes=4194304, kind=2) :230 4496526301304112:4496526305009459 880592:880592 hipMemcpy(dst=0x7f14c3000000, src=0x7f14c3cff010, sizeBytes=4194304, kind=1) :231 -4496526305014168:4496526305014169 880592:880592 MARK(name(before HIP LaunchKernel)) -4496526305022153:4496526305022154 880592:880592 MARK(name(after HIP LaunchKernel)) 
4496526305023315:4496526309749930 880592:880592 hipMemcpy(dst=0x7f14c38fe010, src=0x7f14c2800000, sizeBytes=4194304, kind=2) :234 4496526322801743:4496526326516899 880592:880592 hipMemcpy(dst=0x7f14c3000000, src=0x7f14c3cff010, sizeBytes=4194304, kind=1) :235 -4496526326518893:4496526326518894 880592:880592 MARK(name(before HIP LaunchKernel)) -4496526326533039:4496526326533040 880592:880592 MARK(name(after HIP LaunchKernel)) 4496526326534903:4496526331245847 880592:880592 hipMemcpy(dst=0x7f14c38fe010, src=0x7f14c2800000, sizeBytes=4194304, kind=2) :238 4496526340107579:4496526343906984 880592:880592 hipMemcpy(dst=0x7f14c3000000, src=0x7f14c3cff010, sizeBytes=4194304, kind=1) :239 -4496526343908958:4496526343908959 880592:880592 MARK(name(before HIP LaunchKernel)) -4496526343922443:4496526343922444 880592:880592 MARK(name(after HIP LaunchKernel)) 4496526343924247:4496526348732695 880592:880592 hipMemcpy(dst=0x7f14c38fe010, src=0x7f14c2800000, sizeBytes=4194304, kind=2) :242 4496526360672632:4496526364342794 880592:880592 hipMemcpy(dst=0x7f14c3000000, src=0x7f14c3cff010, sizeBytes=4194304, kind=1) :243 -4496526364344246:4496526364344247 880592:880592 MARK(name(before HIP LaunchKernel)) -4496526364351580:4496526364351581 880592:880592 MARK(name(after HIP LaunchKernel)) 4496526364356309:4496526369073956 880592:880592 hipMemcpy(dst=0x7f14c38fe010, src=0x7f14c2800000, sizeBytes=4194304, kind=2) :246 4496526380921729:4496526384554339 880592:880592 hipMemcpy(dst=0x7f14c3000000, src=0x7f14c3cff010, sizeBytes=4194304, kind=1) :247 -4496526384555681:4496526384555682 880592:880592 MARK(name(before HIP LaunchKernel)) -4496526384563015:4496526384563016 880592:880592 MARK(name(after HIP LaunchKernel)) 4496526384564297:4496526389280813 880592:880592 hipMemcpy(dst=0x7f14c38fe010, src=0x7f14c2800000, sizeBytes=4194304, kind=2) :250 4496526401055448:4496526404706814 880592:880592 hipMemcpy(dst=0x7f14c3000000, src=0x7f14c3cff010, sizeBytes=4194304, kind=1) :251 
-4496526404708236:4496526404708237 880592:880592 MARK(name(before HIP LaunchKernel)) -4496526404715179:4496526404715180 880592:880592 MARK(name(after HIP LaunchKernel)) 4496526404716291:4496526409453775 880592:880592 hipMemcpy(dst=0x7f14c38fe010, src=0x7f14c2800000, sizeBytes=4194304, kind=2) :254 4496526421231786:4496526424938237 880592:880592 hipMemcpy(dst=0x7f14c3000000, src=0x7f14c3cff010, sizeBytes=4194304, kind=1) :255 -4496526424939570:4496526424939571 880592:880592 MARK(name(before HIP LaunchKernel)) -4496526424946152:4496526424946153 880592:880592 MARK(name(after HIP LaunchKernel)) 4496526424947284:4496526429644452 880592:880592 hipMemcpy(dst=0x7f14c38fe010, src=0x7f14c2800000, sizeBytes=4194304, kind=2) :258 4496526442708689:4496526446413777 880592:880592 hipMemcpy(dst=0x7f14c3000000, src=0x7f14c3cff010, sizeBytes=4194304, kind=1) :259 -4496526446418566:4496526446418567 880592:880592 MARK(name(before HIP LaunchKernel)) -4496526446425690:4496526446425691 880592:880592 MARK(name(after HIP LaunchKernel)) 4496526446426842:4496526451217265 880592:880592 hipMemcpy(dst=0x7f14c38fe010, src=0x7f14c2800000, sizeBytes=4194304, kind=2) :262 4496526463698402:4496526467373874 880592:880592 hipMemcpy(dst=0x7f14c3000000, src=0x7f14c3cff010, sizeBytes=4194304, kind=1) :263 -4496526467379615:4496526467379616 880592:880592 MARK(name(before HIP LaunchKernel)) -4496526467388151:4496526467388152 880592:880592 MARK(name(after HIP LaunchKernel)) 4496526467389634:4496526472081051 880592:880592 hipMemcpy(dst=0x7f14c38fe010, src=0x7f14c2800000, sizeBytes=4194304, kind=2) :266 4496526484017822:4496526487658527 880592:880592 hipMemcpy(dst=0x7f14c3000000, src=0x7f14c3cff010, sizeBytes=4194304, kind=1) :267 -4496526487660120:4496526487660121 880592:880592 MARK(name(before HIP LaunchKernel)) -4496526487671221:4496526487671222 880592:880592 MARK(name(after HIP LaunchKernel)) 4496526487672484:4496526492365545 880592:880592 hipMemcpy(dst=0x7f14c38fe010, src=0x7f14c2800000, 
sizeBytes=4194304, kind=2) :270 4496526504217596:4496526507940206 880592:880592 hipMemcpy(dst=0x7f14c3000000, src=0x7f14c3cff010, sizeBytes=4194304, kind=1) :271 -4496526507941599:4496526507941600 880592:880592 MARK(name(before HIP LaunchKernel)) -4496526507948341:4496526507948342 880592:880592 MARK(name(after HIP LaunchKernel)) 4496526507952269:4496526512732904 880592:880592 hipMemcpy(dst=0x7f14c38fe010, src=0x7f14c2800000, sizeBytes=4194304, kind=2) :274 4496526524481750:4496526528181919 880592:880592 hipMemcpy(dst=0x7f14c3000000, src=0x7f14c3cff010, sizeBytes=4194304, kind=1) :275 -4496526528183281:4496526528183282 880592:880592 MARK(name(before HIP LaunchKernel)) -4496526528190385:4496526528190386 880592:880592 MARK(name(after HIP LaunchKernel)) 4496526528194643:4496526532912630 880592:880592 hipMemcpy(dst=0x7f14c38fe010, src=0x7f14c2800000, sizeBytes=4194304, kind=2) :278 4496526544545337:4496526548219016 880592:880592 hipMemcpy(dst=0x7f14c3000000, src=0x7f14c3cff010, sizeBytes=4194304, kind=1) :279 -4496526548220469:4496526548220470 880592:880592 MARK(name(before HIP LaunchKernel)) -4496526548227472:4496526548227473 880592:880592 MARK(name(after HIP LaunchKernel)) 4496526548228644:4496526552953164 880592:880592 hipMemcpy(dst=0x7f14c38fe010, src=0x7f14c2800000, sizeBytes=4194304, kind=2) :282 4496526564603244:4496526568298141 880592:880592 hipMemcpy(dst=0x7f14c3000000, src=0x7f14c3cff010, sizeBytes=4194304, kind=1) :283 -4496526568300125:4496526568300126 880592:880592 MARK(name(before HIP LaunchKernel)) -4496526568306718:4496526568306719 880592:880592 MARK(name(after HIP LaunchKernel)) 4496526568308411:4496526573022752 880592:880592 hipMemcpy(dst=0x7f14c38fe010, src=0x7f14c2800000, sizeBytes=4194304, kind=2) :286 4496526584675307:4496526588335099 880592:880592 hipMemcpy(dst=0x7f14c3000000, src=0x7f14c3cff010, sizeBytes=4194304, kind=1) :287 -4496526588336581:4496526588336582 880592:880592 MARK(name(before HIP LaunchKernel)) -4496526588343565:4496526588343566 
880592:880592 MARK(name(after HIP LaunchKernel)) 4496526588344907:4496526593063886 880592:880592 hipMemcpy(dst=0x7f14c38fe010, src=0x7f14c2800000, sizeBytes=4194304, kind=2) :290 4496526604685893:4496526608347148 880592:880592 hipMemcpy(dst=0x7f14c3000000, src=0x7f14c3cff010, sizeBytes=4194304, kind=1) :291 -4496526608352378:4496526608352379 880592:880592 MARK(name(before HIP LaunchKernel)) -4496526608359341:4496526608359342 880592:880592 MARK(name(after HIP LaunchKernel)) 4496526608360543:4496526613066458 880592:880592 hipMemcpy(dst=0x7f14c38fe010, src=0x7f14c2800000, sizeBytes=4194304, kind=2) :294 4496526624712120:4496526628367043 880592:880592 hipMemcpy(dst=0x7f14c3000000, src=0x7f14c3cff010, sizeBytes=4194304, kind=1) :295 -4496526628368456:4496526628368457 880592:880592 MARK(name(before HIP LaunchKernel)) -4496526628378575:4496526628378576 880592:880592 MARK(name(after HIP LaunchKernel)) 4496526628379787:4496526633084449 880592:880592 hipMemcpy(dst=0x7f14c38fe010, src=0x7f14c2800000, sizeBytes=4194304, kind=2) :298 4496526644736553:4496526648383901 880592:880592 hipMemcpy(dst=0x7f14c3000000, src=0x7f14c3cff010, sizeBytes=4194304, kind=1) :299 -4496526648385494:4496526648385495 880592:880592 MARK(name(before HIP LaunchKernel)) -4496526648394792:4496526648394793 880592:880592 MARK(name(after HIP LaunchKernel)) 4496526648395974:4496526653098974 880592:880592 hipMemcpy(dst=0x7f14c38fe010, src=0x7f14c2800000, sizeBytes=4194304, kind=2) :302 4496526664769242:4496526668420808 880592:880592 hipMemcpy(dst=0x7f14c3000000, src=0x7f14c3cff010, sizeBytes=4194304, kind=1) :303 -4496526668422211:4496526668422212 880592:880592 MARK(name(before HIP LaunchKernel)) -4496526668428923:4496526668428924 880592:880592 MARK(name(after HIP LaunchKernel)) 4496526668433191:4496526673144235 880592:880592 hipMemcpy(dst=0x7f14c38fe010, src=0x7f14c2800000, sizeBytes=4194304, kind=2) :306 4496526684807961:4496526689128249 880592:880592 hipMemcpy(dst=0x7f14c3000000, src=0x7f14c3cff010, 
sizeBytes=4194304, kind=1) :307 -4496526689129702:4496526689129703 880592:880592 MARK(name(before HIP LaunchKernel)) -4496526689136916:4496526689136917 880592:880592 MARK(name(after HIP LaunchKernel)) 4496526689138038:4496526693842629 880592:880592 hipMemcpy(dst=0x7f14c38fe010, src=0x7f14c2800000, sizeBytes=4194304, kind=2) :310 4496526706935721:4496526710671858 880592:880592 hipMemcpy(dst=0x7f14c3000000, src=0x7f14c3cff010, sizeBytes=4194304, kind=1) :311 -4496526710673421:4496526710673422 880592:880592 MARK(name(before HIP LaunchKernel)) -4496526710680895:4496526710680896 880592:880592 MARK(name(after HIP LaunchKernel)) 4496526710682528:4496526715357624 880592:880592 hipMemcpy(dst=0x7f14c38fe010, src=0x7f14c2800000, sizeBytes=4194304, kind=2) :314 4496526728844507:4496526732497026 880592:880592 hipMemcpy(dst=0x7f14c3000000, src=0x7f14c3cff010, sizeBytes=4194304, kind=1) :315 -4496526732498649:4496526732498650 880592:880592 MARK(name(before HIP LaunchKernel)) -4496526732506164:4496526732506165 880592:880592 MARK(name(after HIP LaunchKernel)) 4496526732507246:4496526737198843 880592:880592 hipMemcpy(dst=0x7f14c38fe010, src=0x7f14c2800000, sizeBytes=4194304, kind=2) :318 4496526748886113:4496526752517561 880592:880592 hipMemcpy(dst=0x7f14c3000000, src=0x7f14c3cff010, sizeBytes=4194304, kind=1) :319 -4496526752522671:4496526752522672 880592:880592 MARK(name(before HIP LaunchKernel)) -4496526752529534:4496526752529535 880592:880592 MARK(name(after HIP LaunchKernel)) 4496526752530505:4496526757248855 880592:880592 hipMemcpy(dst=0x7f14c38fe010, src=0x7f14c2800000, sizeBytes=4194304, kind=2) :322 4496526768940533:4496526772608911 880592:880592 hipMemcpy(dst=0x7f14c3000000, src=0x7f14c3cff010, sizeBytes=4194304, kind=1) :323 -4496526772614301:4496526772614302 880592:880592 MARK(name(before HIP LaunchKernel)) -4496526772620994:4496526772620995 880592:880592 MARK(name(after HIP LaunchKernel)) 4496526772622196:4496526777347416 880592:880592 hipMemcpy(dst=0x7f14c38fe010, 
src=0x7f14c2800000, sizeBytes=4194304, kind=2) :326 4496526789025531:4496526792698958 880592:880592 hipMemcpy(dst=0x7f14c3000000, src=0x7f14c3cff010, sizeBytes=4194304, kind=1) :327 -4496526792700511:4496526792700512 880592:880592 MARK(name(before HIP LaunchKernel)) -4496526792710941:4496526792710942 880592:880592 MARK(name(after HIP LaunchKernel)) 4496526792712093:4496526797464554 880592:880592 hipMemcpy(dst=0x7f14c38fe010, src=0x7f14c2800000, sizeBytes=4194304, kind=2) :330 4496526809125575:4496526812718051 880592:880592 hipMemcpy(dst=0x7f14c3000000, src=0x7f14c3cff010, sizeBytes=4194304, kind=1) :331 -4496526812719434:4496526812719435 880592:880592 MARK(name(before HIP LaunchKernel)) -4496526812727048:4496526812727049 880592:880592 MARK(name(after HIP LaunchKernel)) 4496526812731336:4496526817481033 880592:880592 hipMemcpy(dst=0x7f14c38fe010, src=0x7f14c2800000, sizeBytes=4194304, kind=2) :334 4496526829138567:4496526832879692 880592:880592 hipMemcpy(dst=0x7f14c3000000, src=0x7f14c3cff010, sizeBytes=4194304, kind=1) :335 -4496526832880944:4496526832880945 880592:880592 MARK(name(before HIP LaunchKernel)) -4496526832887747:4496526832887748 880592:880592 MARK(name(after HIP LaunchKernel)) 4496526832891444:4496526837636884 880592:880592 hipMemcpy(dst=0x7f14c38fe010, src=0x7f14c2800000, sizeBytes=4194304, kind=2) :338 4496526849206473:4496526852946255 880592:880592 hipMemcpy(dst=0x7f14c3000000, src=0x7f14c3cff010, sizeBytes=4194304, kind=1) :339 -4496526852947527:4496526852947528 880592:880592 MARK(name(before HIP LaunchKernel)) -4496526852953829:4496526852953830 880592:880592 MARK(name(after HIP LaunchKernel)) 4496526852954871:4496526857658531 880592:880592 hipMemcpy(dst=0x7f14c38fe010, src=0x7f14c2800000, sizeBytes=4194304, kind=2) :342 4496526869245494:4496526873593022 880592:880592 hipMemcpy(dst=0x7f14c3000000, src=0x7f14c3cff010, sizeBytes=4194304, kind=1) :343 -4496526873594705:4496526873594706 880592:880592 MARK(name(before HIP LaunchKernel)) 
-4496526873602520:4496526873602521 880592:880592 MARK(name(after HIP LaunchKernel)) 4496526873604183:4496526878299818 880592:880592 hipMemcpy(dst=0x7f14c38fe010, src=0x7f14c2800000, sizeBytes=4194304, kind=2) :346 4496526891597135:4496526895318503 880592:880592 hipMemcpy(dst=0x7f14c3000000, src=0x7f14c3cff010, sizeBytes=4194304, kind=1) :347 -4496526895320266:4496526895320267 880592:880592 MARK(name(before HIP LaunchKernel)) -4496526895328381:4496526895328382 880592:880592 MARK(name(after HIP LaunchKernel)) 4496526895329664:4496526900009158 880592:880592 hipMemcpy(dst=0x7f14c38fe010, src=0x7f14c2800000, sizeBytes=4194304, kind=2) :350 4496526913485803:4496526917173899 880592:880592 hipMemcpy(dst=0x7f14c3000000, src=0x7f14c3cff010, sizeBytes=4194304, kind=1) :351 -4496526917179309:4496526917179310 880592:880592 MARK(name(before HIP LaunchKernel)) -4496526917186683:4496526917186684 880592:880592 MARK(name(after HIP LaunchKernel)) 4496526917187725:4496526921890463 880592:880592 hipMemcpy(dst=0x7f14c38fe010, src=0x7f14c2800000, sizeBytes=4194304, kind=2) :354 4496526934966763:4496526938716654 880592:880592 hipMemcpy(dst=0x7f14c3000000, src=0x7f14c3cff010, sizeBytes=4194304, kind=1) :355 -4496526938717977:4496526938717978 880592:880592 MARK(name(before HIP LaunchKernel)) -4496526938728427:4496526938728428 880592:880592 MARK(name(after HIP LaunchKernel)) 4496526938729629:4496526943439361 880592:880592 hipMemcpy(dst=0x7f14c38fe010, src=0x7f14c2800000, sizeBytes=4194304, kind=2) :358 4496526957411730:4496526961125205 880592:880592 hipMemcpy(dst=0x7f14c3000000, src=0x7f14c3cff010, sizeBytes=4194304, kind=1) :359 -4496526961126547:4496526961126548 880592:880592 MARK(name(before HIP LaunchKernel)) -4496526961136536:4496526961136537 880592:880592 MARK(name(after HIP LaunchKernel)) 4496526961138059:4496526965826390 880592:880592 hipMemcpy(dst=0x7f14c38fe010, src=0x7f14c2800000, sizeBytes=4194304, kind=2) :362 4496526978879125:4496526982620470 880592:880592 
hipMemcpy(dst=0x7f14c3000000, src=0x7f14c3cff010, sizeBytes=4194304, kind=1) :363 -4496526982621833:4496526982621834 880592:880592 MARK(name(before HIP LaunchKernel)) -4496526982628466:4496526982628467 880592:880592 MARK(name(after HIP LaunchKernel)) 4496526982632643:4496526987311768 880592:880592 hipMemcpy(dst=0x7f14c38fe010, src=0x7f14c2800000, sizeBytes=4194304, kind=2) :366 4496527000778785:4496527004436191 880592:880592 hipMemcpy(dst=0x7f14c3000000, src=0x7f14c3cff010, sizeBytes=4194304, kind=1) :367 -4496527004437654:4496527004437655 880592:880592 MARK(name(before HIP LaunchKernel)) -4496527004445078:4496527004445079 880592:880592 MARK(name(after HIP LaunchKernel)) 4496527004448955:4496527009161343 880592:880592 hipMemcpy(dst=0x7f14c38fe010, src=0x7f14c2800000, sizeBytes=4194304, kind=2) :370 4496527020723638:4496527024395191 880592:880592 hipMemcpy(dst=0x7f14c3000000, src=0x7f14c3cff010, sizeBytes=4194304, kind=1) :371 -4496527024396794:4496527024396795 880592:880592 MARK(name(before HIP LaunchKernel)) -4496527024403457:4496527024403458 880592:880592 MARK(name(after HIP LaunchKernel)) 4496527024404639:4496527029129138 880592:880592 hipMemcpy(dst=0x7f14c38fe010, src=0x7f14c2800000, sizeBytes=4194304, kind=2) :374 4496527040699580:4496527044353630 880592:880592 hipMemcpy(dst=0x7f14c3000000, src=0x7f14c3cff010, sizeBytes=4194304, kind=1) :375 -4496527044355043:4496527044355044 880592:880592 MARK(name(before HIP LaunchKernel)) -4496527044361916:4496527044361917 880592:880592 MARK(name(after HIP LaunchKernel)) 4496527044363048:4496527049060306 880592:880592 hipMemcpy(dst=0x7f14c38fe010, src=0x7f14c2800000, sizeBytes=4194304, kind=2) :378 4496527060632609:4496527064307921 880592:880592 hipMemcpy(dst=0x7f14c3000000, src=0x7f14c3cff010, sizeBytes=4194304, kind=1) :379 -4496527064313732:4496527064313733 880592:880592 MARK(name(before HIP LaunchKernel)) -4496527064320805:4496527064320806 880592:880592 MARK(name(after HIP LaunchKernel)) 
4496527064321907:4496527069027702 880592:880592 hipMemcpy(dst=0x7f14c38fe010, src=0x7f14c2800000, sizeBytes=4194304, kind=2) :382 4496527080621436:4496527084288380 880592:880592 hipMemcpy(dst=0x7f14c3000000, src=0x7f14c3cff010, sizeBytes=4194304, kind=1) :383 -4496527084292949:4496527084292950 880592:880592 MARK(name(before HIP LaunchKernel)) -4496527084299932:4496527084299933 880592:880592 MARK(name(after HIP LaunchKernel)) 4496527084301195:4496527089011037 880592:880592 hipMemcpy(dst=0x7f14c38fe010, src=0x7f14c2800000, sizeBytes=4194304, kind=2) :386 4496527100657080:4496527104315719 880592:880592 hipMemcpy(dst=0x7f14c3000000, src=0x7f14c3cff010, sizeBytes=4194304, kind=1) :387 -4496527104317693:4496527104317694 880592:880592 MARK(name(before HIP LaunchKernel)) -4496527104327251:4496527104327252 880592:880592 MARK(name(after HIP LaunchKernel)) 4496527104328433:4496527109036020 880592:880592 hipMemcpy(dst=0x7f14c38fe010, src=0x7f14c2800000, sizeBytes=4194304, kind=2) :390 4496527120665231:4496527125002391 880592:880592 hipMemcpy(dst=0x7f14c3000000, src=0x7f14c3cff010, sizeBytes=4194304, kind=1) :391 -4496527125004065:4496527125004066 880592:880592 MARK(name(before HIP LaunchKernel)) -4496527125011499:4496527125011500 880592:880592 MARK(name(after HIP LaunchKernel)) 4496527125016488:4496527129737942 880592:880592 hipMemcpy(dst=0x7f14c38fe010, src=0x7f14c2800000, sizeBytes=4194304, kind=2) :394 4496527143046309:4496527146717213 880592:880592 hipMemcpy(dst=0x7f14c3000000, src=0x7f14c3cff010, sizeBytes=4194304, kind=1) :395 -4496527146718826:4496527146718827 880592:880592 MARK(name(before HIP LaunchKernel)) -4496527146726520:4496527146726521 880592:880592 MARK(name(after HIP LaunchKernel)) 4496527146730868:4496527151451590 880592:880592 hipMemcpy(dst=0x7f14c38fe010, src=0x7f14c2800000, sizeBytes=4194304, kind=2) :398 4496527163132549:4496527166937384 880592:880592 hipMemcpy(dst=0x7f14c3000000, src=0x7f14c3cff010, sizeBytes=4194304, kind=1) :399 
-4496527166938797:4496527166938798 880592:880592 MARK(name(before HIP LaunchKernel)) -4496527166946050:4496527166946051 880592:880592 MARK(name(after HIP LaunchKernel)) 4496527166947223:4496527171616068 880592:880592 hipMemcpy(dst=0x7f14c38fe010, src=0x7f14c2800000, sizeBytes=4194304, kind=2) :402 4496527185123250:4496527185153196 880592:880592 hipFree(ptr=0x7f14c3000000) :403 4496527185154519:4496527185168716 880592:880592 hipFree(ptr=0x7f14c2800000) :404 diff --git a/test/golden_traces/MatrixTranspose_hip_period_trace.txt b/test/golden_traces/MatrixTranspose_hip_period_trace.txt index caa2d638..249eceda 100644 --- a/test/golden_traces/MatrixTranspose_hip_period_trace.txt +++ b/test/golden_traces/MatrixTranspose_hip_period_trace.txt @@ -208,251 +208,51 @@ PASSED! 3802699752571489:3802699752686289 1983:1983 hipMalloc(ptr=0x7f6c121ff010, size=4194304) 3802699752688639:3802699752749390 1983:1983 hipMalloc(ptr=0x7fffefcadf28, size=4194304) 3802699752763840:3802700027958750 1983:1983 hipMemcpy(dst=0x7f6c11400000, src=0x7f6c121ff010, sizeBytes=4194304, kind=1) -3802700027966800:3802700027966801 1983:1983 MARK(name(before HIP LaunchKernel)) -3802700028567724:3802700028567725 1983:1983 MARK(name(after HIP LaunchKernel)) -3802700041950374:3802700041950375 1983:1983 MARK(name(before HIP LaunchKernel)) -3802700041963674:3802700041963675 1983:1983 MARK(name(after HIP LaunchKernel)) -3802700054151914:3802700054151915 1983:1983 MARK(name(before HIP LaunchKernel)) -3802700054162714:3802700054162715 1983:1983 MARK(name(after HIP LaunchKernel)) -3802700066165433:3802700066165434 1983:1983 MARK(name(before HIP LaunchKernel)) -3802700066176343:3802700066176344 1983:1983 MARK(name(after HIP LaunchKernel)) -3802700078181322:3802700078181323 1983:1983 MARK(name(before HIP LaunchKernel)) -3802700078192012:3802700078192013 1983:1983 MARK(name(after HIP LaunchKernel)) -3802700090220561:3802700090220562 1983:1983 MARK(name(before HIP LaunchKernel)) -3802700090239211:3802700090239212 
1983:1983 MARK(name(after HIP LaunchKernel)) -3802700102271721:3802700102271722 1983:1983 MARK(name(before HIP LaunchKernel)) -3802700102282171:3802700102282172 1983:1983 MARK(name(after HIP LaunchKernel)) -3802700114144958:3802700114144959 1983:1983 MARK(name(before HIP LaunchKernel)) -3802700114162049:3802700114162050 1983:1983 MARK(name(after HIP LaunchKernel)) -3802700126128128:3802700126128129 1983:1983 MARK(name(before HIP LaunchKernel)) -3802700126138758:3802700126138759 1983:1983 MARK(name(after HIP LaunchKernel)) -3802700138129156:3802700138129157 1983:1983 MARK(name(before HIP LaunchKernel)) -3802700138139446:3802700138139447 1983:1983 MARK(name(after HIP LaunchKernel)) -3802700150136865:3802700150136866 1983:1983 MARK(name(before HIP LaunchKernel)) -3802700150148016:3802700150148017 1983:1983 MARK(name(after HIP LaunchKernel)) -3802700162246915:3802700162246916 1983:1983 MARK(name(before HIP LaunchKernel)) -3802700162258105:3802700162258106 1983:1983 MARK(name(after HIP LaunchKernel)) -3802700174131823:3802700174131824 1983:1983 MARK(name(before HIP LaunchKernel)) -3802700174149233:3802700174149234 1983:1983 MARK(name(after HIP LaunchKernel)) -3802700186413294:3802700186413295 1983:1983 MARK(name(before HIP LaunchKernel)) -3802700186424475:3802700186424476 1983:1983 MARK(name(after HIP LaunchKernel)) -3802700198692895:3802700198692896 1983:1983 MARK(name(before HIP LaunchKernel)) -3802700198703415:3802700198703416 1983:1983 MARK(name(after HIP LaunchKernel)) -3802700210532173:3802700210532174 1983:1983 MARK(name(before HIP LaunchKernel)) -3802700210542783:3802700210542784 1983:1983 MARK(name(after HIP LaunchKernel)) -3802700222880184:3802700222880185 1983:1983 MARK(name(before HIP LaunchKernel)) -3802700222891274:3802700222891275 1983:1983 MARK(name(after HIP LaunchKernel)) -3802700234962094:3802700234962095 1983:1983 MARK(name(before HIP LaunchKernel)) -3802700234972984:3802700234972985 1983:1983 MARK(name(after HIP LaunchKernel)) 
-3802700247111934:3802700247111935 1983:1983 MARK(name(before HIP LaunchKernel)) -3802700247122294:3802700247122295 1983:1983 MARK(name(after HIP LaunchKernel)) -3802700259114883:3802700259114884 1983:1983 MARK(name(before HIP LaunchKernel)) -3802700259131593:3802700259131594 1983:1983 MARK(name(after HIP LaunchKernel)) -3802700270919381:3802700270919382 1983:1983 MARK(name(before HIP LaunchKernel)) -3802700270930441:3802700270930442 1983:1983 MARK(name(after HIP LaunchKernel)) -3802700282944209:3802700282944210 1983:1983 MARK(name(before HIP LaunchKernel)) -3802700282954850:3802700282954851 1983:1983 MARK(name(after HIP LaunchKernel)) -3802700294960369:3802700294960370 1983:1983 MARK(name(before HIP LaunchKernel)) -3802700294970439:3802700294970440 1983:1983 MARK(name(after HIP LaunchKernel)) -3802700306951068:3802700306951069 1983:1983 MARK(name(before HIP LaunchKernel)) -3802700306963188:3802700306963189 1983:1983 MARK(name(after HIP LaunchKernel)) -3802700318935636:3802700318935637 1983:1983 MARK(name(before HIP LaunchKernel)) -3802700318952436:3802700318952437 1983:1983 MARK(name(after HIP LaunchKernel)) -3802700330939575:3802700330939576 1983:1983 MARK(name(before HIP LaunchKernel)) -3802700330957096:3802700330957097 1983:1983 MARK(name(after HIP LaunchKernel)) -3802700342957675:3802700342957676 1983:1983 MARK(name(before HIP LaunchKernel)) -3802700342976555:3802700342976556 1983:1983 MARK(name(after HIP LaunchKernel)) -3802700354958353:3802700354958354 1983:1983 MARK(name(before HIP LaunchKernel)) -3802700354969733:3802700354969734 1983:1983 MARK(name(after HIP LaunchKernel)) -3802700367116224:3802700367116225 1983:1983 MARK(name(before HIP LaunchKernel)) -3802700367127874:3802700367127875 1983:1983 MARK(name(after HIP LaunchKernel)) -3802700378910551:3802700378910552 1983:1983 MARK(name(before HIP LaunchKernel)) -3802700378921781:3802700378921782 1983:1983 MARK(name(after HIP LaunchKernel)) -3802700391300403:3802700391300404 1983:1983 MARK(name(before HIP 
LaunchKernel)) -3802700391311253:3802700391311254 1983:1983 MARK(name(after HIP LaunchKernel)) -3802700403119421:3802700403119422 1983:1983 MARK(name(before HIP LaunchKernel)) -3802700403149901:3802700403149902 1983:1983 MARK(name(after HIP LaunchKernel)) -3802700414928588:3802700414928589 1983:1983 MARK(name(before HIP LaunchKernel)) -3802700414939088:3802700414939089 1983:1983 MARK(name(after HIP LaunchKernel)) -3802700426957197:3802700426957198 1983:1983 MARK(name(before HIP LaunchKernel)) -3802700426969607:3802700426969608 1983:1983 MARK(name(after HIP LaunchKernel)) -3802700438945256:3802700438945257 1983:1983 MARK(name(before HIP LaunchKernel)) -3802700438956156:3802700438956157 1983:1983 MARK(name(after HIP LaunchKernel)) -3802700450955785:3802700450955786 1983:1983 MARK(name(before HIP LaunchKernel)) -3802700450966535:3802700450966536 1983:1983 MARK(name(after HIP LaunchKernel)) -3802700462947734:3802700462947735 1983:1983 MARK(name(before HIP LaunchKernel)) -3802700462958494:3802700462958495 1983:1983 MARK(name(after HIP LaunchKernel)) -3802700475120764:3802700475120765 1983:1983 MARK(name(before HIP LaunchKernel)) -3802700475133244:3802700475133245 1983:1983 MARK(name(after HIP LaunchKernel)) -3802700486943952:3802700486943953 1983:1983 MARK(name(before HIP LaunchKernel)) -3802700486963842:3802700486963843 1983:1983 MARK(name(after HIP LaunchKernel)) -3802700498936501:3802700498936502 1983:1983 MARK(name(before HIP LaunchKernel)) -3802700498947611:3802700498947612 1983:1983 MARK(name(after HIP LaunchKernel)) -3802700510957970:3802700510957971 1983:1983 MARK(name(before HIP LaunchKernel)) -3802700510969340:3802700510969341 1983:1983 MARK(name(after HIP LaunchKernel)) -3802700522956379:3802700522956380 1983:1983 MARK(name(before HIP LaunchKernel)) -3802700522968409:3802700522968410 1983:1983 MARK(name(after HIP LaunchKernel)) -3802700534942538:3802700534942539 1983:1983 MARK(name(before HIP LaunchKernel)) -3802700534953908:3802700534953909 1983:1983 
MARK(name(after HIP LaunchKernel)) -3802700546789315:3802700546789316 1983:1983 MARK(name(before HIP LaunchKernel)) -3802700546806236:3802700546806237 1983:1983 MARK(name(after HIP LaunchKernel)) -3802700558559853:3802700558559854 1983:1983 MARK(name(before HIP LaunchKernel)) -3802700558571313:3802700558571314 1983:1983 MARK(name(after HIP LaunchKernel)) -3802700570153708:3802700570153709 1983:1983 MARK(name(before HIP LaunchKernel)) -3802700570176129:3802700570176130 1983:1983 MARK(name(after HIP LaunchKernel)) -3802700581726404:3802700581726405 1983:1983 MARK(name(before HIP LaunchKernel)) -3802700581741565:3802700581741566 1983:1983 MARK(name(after HIP LaunchKernel)) -3802700593057879:3802700593057880 1983:1983 MARK(name(before HIP LaunchKernel)) -3802700593070449:3802700593070450 1983:1983 MARK(name(after HIP LaunchKernel)) -3802700604400132:3802700604400133 1983:1983 MARK(name(before HIP LaunchKernel)) -3802700604413112:3802700604413113 1983:1983 MARK(name(after HIP LaunchKernel)) -3802700615730637:3802700615730638 1983:1983 MARK(name(before HIP LaunchKernel)) -3802700615743157:3802700615743158 1983:1983 MARK(name(after HIP LaunchKernel)) -3802700627079061:3802700627079062 1983:1983 MARK(name(before HIP LaunchKernel)) -3802700627101981:3802700627101982 1983:1983 MARK(name(after HIP LaunchKernel)) -3802700638410875:3802700638410876 1983:1983 MARK(name(before HIP LaunchKernel)) -3802700638425755:3802700638425756 1983:1983 MARK(name(after HIP LaunchKernel)) -3802700649752129:3802700649752130 1983:1983 MARK(name(before HIP LaunchKernel)) -3802700649766079:3802700649766080 1983:1983 MARK(name(after HIP LaunchKernel)) -3802700661088702:3802700661088703 1983:1983 MARK(name(before HIP LaunchKernel)) -3802700661101353:3802700661101354 1983:1983 MARK(name(after HIP LaunchKernel)) -3802700672436427:3802700672436428 1983:1983 MARK(name(before HIP LaunchKernel)) -3802700672449997:3802700672449998 1983:1983 MARK(name(after HIP LaunchKernel)) 
-3802700683973653:3802700683973654 1983:1983 MARK(name(before HIP LaunchKernel)) -3802700683987193:3802700683987194 1983:1983 MARK(name(after HIP LaunchKernel)) -3802700695366597:3802700695366598 1983:1983 MARK(name(before HIP LaunchKernel)) -3802700695379007:3802700695379008 1983:1983 MARK(name(after HIP LaunchKernel)) -3802700706708861:3802700706708862 1983:1983 MARK(name(before HIP LaunchKernel)) -3802700706735151:3802700706735152 1983:1983 MARK(name(after HIP LaunchKernel)) -3802700718019795:3802700718019796 1983:1983 MARK(name(before HIP LaunchKernel)) -3802700718034725:3802700718034726 1983:1983 MARK(name(after HIP LaunchKernel)) -3802700729346979:3802700729346980 1983:1983 MARK(name(before HIP LaunchKernel)) -3802700729360039:3802700729360040 1983:1983 MARK(name(after HIP LaunchKernel)) -3802700740688063:3802700740688064 1983:1983 MARK(name(before HIP LaunchKernel)) -3802700740702553:3802700740702554 1983:1983 MARK(name(after HIP LaunchKernel)) -3802700752045097:3802700752045098 1983:1983 MARK(name(before HIP LaunchKernel)) -3802700752060217:3802700752060218 1983:1983 MARK(name(after HIP LaunchKernel)) -3802700763470772:3802700763470773 1983:1983 MARK(name(before HIP LaunchKernel)) -3802700763485762:3802700763485763 1983:1983 MARK(name(after HIP LaunchKernel)) -3802700774801006:3802700774801007 1983:1983 MARK(name(before HIP LaunchKernel)) -3802700774813776:3802700774813777 1983:1983 MARK(name(after HIP LaunchKernel)) -3802700786122470:3802700786122471 1983:1983 MARK(name(before HIP LaunchKernel)) -3802700786147830:3802700786147831 1983:1983 MARK(name(after HIP LaunchKernel)) -3802700797490594:3802700797490595 1983:1983 MARK(name(before HIP LaunchKernel)) -3802700797504834:3802700797504835 1983:1983 MARK(name(after HIP LaunchKernel)) -3802700808806748:3802700808806749 1983:1983 MARK(name(before HIP LaunchKernel)) -3802700808822388:3802700808822389 1983:1983 MARK(name(after HIP LaunchKernel)) -3802700820150282:3802700820150283 1983:1983 MARK(name(before HIP 
LaunchKernel)) -3802700820163482:3802700820163483 1983:1983 MARK(name(after HIP LaunchKernel)) -3802700831687737:3802700831687738 1983:1983 MARK(name(before HIP LaunchKernel)) -3802700831701867:3802700831701868 1983:1983 MARK(name(after HIP LaunchKernel)) -3802700843073042:3802700843073043 1983:1983 MARK(name(before HIP LaunchKernel)) -3802700843096032:3802700843096033 1983:1983 MARK(name(after HIP LaunchKernel)) -3802700854417226:3802700854417227 1983:1983 MARK(name(before HIP LaunchKernel)) -3802700854429236:3802700854429237 1983:1983 MARK(name(after HIP LaunchKernel)) -3802700865753490:3802700865753491 1983:1983 MARK(name(before HIP LaunchKernel)) -3802700865775180:3802700865775181 1983:1983 MARK(name(after HIP LaunchKernel)) -3802700877105074:3802700877105075 1983:1983 MARK(name(before HIP LaunchKernel)) -3802700877120554:3802700877120555 1983:1983 MARK(name(after HIP LaunchKernel)) -3802700888576349:3802700888576350 1983:1983 MARK(name(before HIP LaunchKernel)) -3802700888589579:3802700888589580 1983:1983 MARK(name(after HIP LaunchKernel)) -3802700900086965:3802700900086966 1983:1983 MARK(name(before HIP LaunchKernel)) -3802700900101025:3802700900101026 1983:1983 MARK(name(after HIP LaunchKernel)) -3802700911461388:3802700911461389 1983:1983 MARK(name(before HIP LaunchKernel)) -3802700911474589:3802700911474590 1983:1983 MARK(name(after HIP LaunchKernel)) -3802700922810673:3802700922810674 1983:1983 MARK(name(before HIP LaunchKernel)) -3802700922833153:3802700922833154 1983:1983 MARK(name(after HIP LaunchKernel)) 3802700932447414:3802700934135107 1983:1983 hipMemcpy(dst=0x7f6c11400000, src=0x7f6c121ff010, sizeBytes=4194304, kind=1) -3802700934139057:3802700934139058 1983:1983 MARK(name(before HIP LaunchKernel)) 3802700934143817:3802700934144527 1983:1983 __hipPushCallConfiguration(gridDim=, blockDim=, sharedMem=0, stream=0) 3802700934146607:3802700934147267 1983:1983 __hipPopCallConfiguration(gridDim=, blockDim=, sharedMem=140106682958042, 
stream=0xd8282e03f3099) 3802700934158787:3802700934164967 1983:1983 hipLaunchKernel(function_address=0x401030, numBlocks=, dimBlocks=, args=0x3b9aca00, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) -3802700934191267:3802700934191268 1983:1983 MARK(name(after HIP LaunchKernel)) 3802700934192847:3802700936775947 1983:1983 hipMemcpy(dst=0x7f6c11dfe010, src=0x7f6c10e00000, sizeBytes=4194304, kind=2) 3802700943795998:3802700945501111 1983:1983 hipMemcpy(dst=0x7f6c11400000, src=0x7f6c121ff010, sizeBytes=4194304, kind=1) -3802700945513191:3802700945513192 1983:1983 MARK(name(before HIP LaunchKernel)) 3802700945517031:3802700945517901 1983:1983 __hipPushCallConfiguration(gridDim=, blockDim=, sharedMem=0, stream=0) 3802700945519841:3802700945520521 1983:1983 __hipPopCallConfiguration(gridDim=, blockDim=, sharedMem=140106682958042, stream=0xd8282e0ecbb86) 3802700945522671:3802700945530171 1983:1983 hipLaunchKernel(function_address=0x401030, numBlocks=, dimBlocks=, args=0x3b9aca00, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) -3802700945531971:3802700945531972 1983:1983 MARK(name(after HIP LaunchKernel)) 3802700945534701:3802700948131020 1983:1983 hipMemcpy(dst=0x7f6c11dfe010, src=0x7f6c10e00000, sizeBytes=4194304, kind=2) 3802700955136442:3802700956839355 1983:1983 hipMemcpy(dst=0x7f6c11400000, src=0x7f6c121ff010, sizeBytes=4194304, kind=1) -3802700956843375:3802700956843376 1983:1983 MARK(name(before HIP LaunchKernel)) 3802700956847725:3802700956848495 1983:1983 __hipPushCallConfiguration(gridDim=, blockDim=, sharedMem=0, stream=0) 3802700956850235:3802700956850825 1983:1983 __hipPopCallConfiguration(gridDim=, blockDim=, sharedMem=140106682958042, stream=0xd8282e1999f61) 3802700956860545:3802700956868795 1983:1983 hipLaunchKernel(function_address=0x401030, numBlocks=, dimBlocks=, args=0x3b9aca00, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) -3802700956870695:3802700956870696 1983:1983 
MARK(name(after HIP LaunchKernel)) 3802700956872065:3802700959479235 1983:1983 hipMemcpy(dst=0x7f6c11dfe010, src=0x7f6c10e00000, sizeBytes=4194304, kind=2) 3802700966505397:3802700968203670 1983:1983 hipMemcpy(dst=0x7f6c11400000, src=0x7f6c121ff010, sizeBytes=4194304, kind=1) -3802700968207780:3802700968207781 1983:1983 MARK(name(before HIP LaunchKernel)) 3802700968219030:3802700968219770 1983:1983 __hipPushCallConfiguration(gridDim=, blockDim=, sharedMem=0, stream=0) 3802700968221700:3802700968222280 1983:1983 __hipPopCallConfiguration(gridDim=, blockDim=, sharedMem=140106682958042, stream=0xd8282e247222e) 3802700968225090:3802700968233560 1983:1983 hipLaunchKernel(function_address=0x401030, numBlocks=, dimBlocks=, args=0x3b9aca00, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) -3802700968235360:3802700968235361 1983:1983 MARK(name(after HIP LaunchKernel)) 3802700968241120:3802700970853059 1983:1983 hipMemcpy(dst=0x7f6c11dfe010, src=0x7f6c10e00000, sizeBytes=4194304, kind=2) 3802700977859821:3802700979559833 1983:1983 hipMemcpy(dst=0x7f6c11400000, src=0x7f6c121ff010, sizeBytes=4194304, kind=1) -3802700979563253:3802700979563254 1983:1983 MARK(name(before HIP LaunchKernel)) 3802700979567803:3802700979568553 1983:1983 __hipPushCallConfiguration(gridDim=, blockDim=, sharedMem=0, stream=0) 3802700979570433:3802700979571073 1983:1983 __hipPopCallConfiguration(gridDim=, blockDim=, sharedMem=140106682958042, stream=0xd8282e2f44d18) 3802700979581243:3802700979589274 1983:1983 hipLaunchKernel(function_address=0x401030, numBlocks=, dimBlocks=, args=0x3b9aca00, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) -3802700979590674:3802700979590675 1983:1983 MARK(name(after HIP LaunchKernel)) 3802700979592044:3802700982222943 1983:1983 hipMemcpy(dst=0x7f6c11dfe010, src=0x7f6c10e00000, sizeBytes=4194304, kind=2) 3802700989239045:3802700990944838 1983:1983 hipMemcpy(dst=0x7f6c11400000, src=0x7f6c121ff010, sizeBytes=4194304, kind=1) 
-3802700990948338:3802700990948339 1983:1983 MARK(name(before HIP LaunchKernel)) 3802700990960008:3802700990960828 1983:1983 __hipPushCallConfiguration(gridDim=, blockDim=, sharedMem=0, stream=0) 3802700990963068:3802700990963638 1983:1983 __hipPopCallConfiguration(gridDim=, blockDim=, sharedMem=140106682958042, stream=0xd8282e3a221d9) 3802700990966328:3802700990975628 1983:1983 hipLaunchKernel(function_address=0x401030, numBlocks=, dimBlocks=, args=0x3b9aca00, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) -3802700990977238:3802700990977239 1983:1983 MARK(name(after HIP LaunchKernel)) 3802700990978718:3802700993694078 1983:1983 hipMemcpy(dst=0x7f6c11dfe010, src=0x7f6c10e00000, sizeBytes=4194304, kind=2) 3802701000919212:3802701002625515 1983:1983 hipMemcpy(dst=0x7f6c11400000, src=0x7f6c121ff010, sizeBytes=4194304, kind=1) -3802701002628745:3802701002628746 1983:1983 MARK(name(before HIP LaunchKernel)) 3802701002633405:3802701002634215 1983:1983 __hipPushCallConfiguration(gridDim=, blockDim=, sharedMem=0, stream=0) 3802701002635935:3802701002636515 1983:1983 __hipPopCallConfiguration(gridDim=, blockDim=, sharedMem=140106682958042, stream=0xd8282e45440c4) 3802701002649885:3802701002657855 1983:1983 hipLaunchKernel(function_address=0x401030, numBlocks=, dimBlocks=, args=0x3b9aca00, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) -3802701002659335:3802701002659336 1983:1983 MARK(name(after HIP LaunchKernel)) 3802701002660835:3802701005267024 1983:1983 hipMemcpy(dst=0x7f6c11dfe010, src=0x7f6c10e00000, sizeBytes=4194304, kind=2) 3802701012322026:3802701014008789 1983:1983 hipMemcpy(dst=0x7f6c11400000, src=0x7f6c121ff010, sizeBytes=4194304, kind=1) -3802701014011999:3802701014012000 1983:1983 MARK(name(before HIP LaunchKernel)) 3802701014023469:3802701014024239 1983:1983 __hipPushCallConfiguration(gridDim=, blockDim=, sharedMem=0, stream=0) 3802701014028089:3802701014028669 1983:1983 __hipPopCallConfiguration(gridDim=, 
blockDim=, sharedMem=140106682958042, stream=0xd8282e5020cc5) 3802701014031569:3802701014039849 1983:1983 hipLaunchKernel(function_address=0x401030, numBlocks=, dimBlocks=, args=0x3b9aca00, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) -3802701014041409:3802701014041410 1983:1983 MARK(name(after HIP LaunchKernel)) 3802701014042919:3802701016640288 1983:1983 hipMemcpy(dst=0x7f6c11dfe010, src=0x7f6c10e00000, sizeBytes=4194304, kind=2) 3802701023688501:3802701025398903 1983:1983 hipMemcpy(dst=0x7f6c11400000, src=0x7f6c121ff010, sizeBytes=4194304, kind=1) -3802701025402873:3802701025402874 1983:1983 MARK(name(before HIP LaunchKernel)) 3802701025407454:3802701025408214 1983:1983 __hipPushCallConfiguration(gridDim=, blockDim=, sharedMem=0, stream=0) 3802701025410224:3802701025411104 1983:1983 __hipPopCallConfiguration(gridDim=, blockDim=, sharedMem=140106682958042, stream=0xd8282e5afc125) 3802701025412944:3802701025420534 1983:1983 hipLaunchKernel(function_address=0x401030, numBlocks=, dimBlocks=, args=0x3b9aca00, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) -3802701025429704:3802701025429705 1983:1983 MARK(name(after HIP LaunchKernel)) 3802701025431374:3802701028050563 1983:1983 hipMemcpy(dst=0x7f6c11dfe010, src=0x7f6c10e00000, sizeBytes=4194304, kind=2) -3802701036808678:3802701036808679 1983:1983 MARK(name(before HIP LaunchKernel)) -3802701036822078:3802701036822079 1983:1983 MARK(name(after HIP LaunchKernel)) -3802701048170132:3802701048170133 1983:1983 MARK(name(before HIP LaunchKernel)) -3802701048184912:3802701048184913 1983:1983 MARK(name(after HIP LaunchKernel)) -3802701059593377:3802701059593378 1983:1983 MARK(name(before HIP LaunchKernel)) -3802701059607287:3802701059607288 1983:1983 MARK(name(after HIP LaunchKernel)) -3802701070985111:3802701070985112 1983:1983 MARK(name(before HIP LaunchKernel)) -3802701071008911:3802701071008912 1983:1983 MARK(name(after HIP LaunchKernel)) 
-3802701082354665:3802701082354666 1983:1983 MARK(name(before HIP LaunchKernel)) -3802701082369396:3802701082369397 1983:1983 MARK(name(after HIP LaunchKernel)) -3802701093751910:3802701093751911 1983:1983 MARK(name(before HIP LaunchKernel)) -3802701093766810:3802701093766811 1983:1983 MARK(name(after HIP LaunchKernel)) -3802701105308045:3802701105308046 1983:1983 MARK(name(before HIP LaunchKernel)) -3802701105323296:3802701105323297 1983:1983 MARK(name(after HIP LaunchKernel)) -3802701116675540:3802701116675541 1983:1983 MARK(name(before HIP LaunchKernel)) -3802701116689570:3802701116689571 1983:1983 MARK(name(after HIP LaunchKernel)) -3802701128156035:3802701128156036 1983:1983 MARK(name(before HIP LaunchKernel)) -3802701128181736:3802701128181737 1983:1983 MARK(name(after HIP LaunchKernel)) -3802701139551739:3802701139551740 1983:1983 MARK(name(before HIP LaunchKernel)) -3802701139565579:3802701139565580 1983:1983 MARK(name(after HIP LaunchKernel)) -3802701150939144:3802701150939145 1983:1983 MARK(name(before HIP LaunchKernel)) -3802701150961354:3802701150961355 1983:1983 MARK(name(after HIP LaunchKernel)) -3802701162295078:3802701162295079 1983:1983 MARK(name(before HIP LaunchKernel)) -3802701162308528:3802701162308529 1983:1983 MARK(name(after HIP LaunchKernel)) -3802701173678182:3802701173678183 1983:1983 MARK(name(before HIP LaunchKernel)) -3802701173692162:3802701173692163 1983:1983 MARK(name(after HIP LaunchKernel)) -3802701185053367:3802701185053368 1983:1983 MARK(name(before HIP LaunchKernel)) -3802701185066667:3802701185066668 1983:1983 MARK(name(after HIP LaunchKernel)) 3802700025923715:3802700027953920 0:0 CopyHostToDevice:4:1983 3802700932468645:3802700934131397 0:0 CopyHostToDevice:159:1983 3802700934227596:3802700935424394 0:0 KernelExecution:163:1983 diff --git a/test/golden_traces/MatrixTranspose_sys_hsa_trace.txt b/test/golden_traces/MatrixTranspose_sys_hsa_trace.txt index 136cf148..a92b7d58 100644 --- 
a/test/golden_traces/MatrixTranspose_sys_hsa_trace.txt +++ b/test/golden_traces/MatrixTranspose_sys_hsa_trace.txt @@ -5144,704 +5144,504 @@ ROCTracer (pid=158131): 337766975606005:337766975727544 158131:158131 hipMalloc(ptr=0x7f3e43000000, size=4194304) :2 337766975729067:337766975854995 158131:158131 hipMalloc(ptr=0x7f3e42800000, size=4194304) :3 337766975868801:337767194313754 158131:158131 hipMemcpy(dst=0x7f3e43000000, src=0x7f3e7c0ff010, sizeBytes=4194304, kind=1) :4 -337767194318883:337767194318884 158131:158131 MARK(name(before HIP LaunchKernel)) 337767194342478:337767194343830 158131:158131 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :6 337767194346004:337767194346485 158131:158131 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :7 337767194348239:337767194857128 158131:158131 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7fff99cd0d68, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :8 -337767194872548:337767194872549 158131:158131 MARK(name(after HIP LaunchKernel)) 337767194877096:337767198600589 158131:158131 hipMemcpy(dst=0x7f3e43bf9010, src=0x7f3e42800000, sizeBytes=4194304, kind=2) :10 337767211443633:337767213698847 158131:158131 hipMemcpy(dst=0x7f3e43000000, src=0x7f3e7c0ff010, sizeBytes=4194304, kind=1) :11 -337767213700460:337767213700461 158131:158131 MARK(name(before HIP LaunchKernel)) 337767213705559:337767213706150 158131:158131 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :13 337767213707283:337767213707753 158131:158131 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :14 337767213709958:337767213724806 158131:158131 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7fff99cd0d68, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :15 -337767213726569:337767213726570 158131:158131 MARK(name(after HIP LaunchKernel)) 
337767213728493:337767215930456 158131:158131 hipMemcpy(dst=0x7f3e43bf9010, src=0x7f3e42800000, sizeBytes=4194304, kind=2) :17 337767228387752:337767230638347 158131:158131 hipMemcpy(dst=0x7f3e43000000, src=0x7f3e7c0ff010, sizeBytes=4194304, kind=1) :18 -337767230639619:337767230639620 158131:158131 MARK(name(before HIP LaunchKernel)) 337767230645540:337767230646081 158131:158131 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :20 337767230647023:337767230647865 158131:158131 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :21 337767230648726:337767230660429 158131:158131 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7fff99cd0d68, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :22 -337767230661911:337767230661912 158131:158131 MARK(name(after HIP LaunchKernel)) 337767230664256:337767232863504 158131:158131 hipMemcpy(dst=0x7f3e43bf9010, src=0x7f3e42800000, sizeBytes=4194304, kind=2) :24 337767243411499:337767245697039 158131:158131 hipMemcpy(dst=0x7f3e43000000, src=0x7f3e7c0ff010, sizeBytes=4194304, kind=1) :25 -337767245698882:337767245698883 158131:158131 MARK(name(before HIP LaunchKernel)) 337767245705184:337767245705765 158131:158131 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :27 337767245707098:337767245707689 158131:158131 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :28 337767245711677:337767245727186 158131:158131 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7fff99cd0d68, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :29 -337767245729220:337767245729221 158131:158131 MARK(name(after HIP LaunchKernel)) 337767245733127:337767247954105 158131:158131 hipMemcpy(dst=0x7f3e43bf9010, src=0x7f3e42800000, sizeBytes=4194304, kind=2) :31 337767256142289:337767258320928 158131:158131 hipMemcpy(dst=0x7f3e43000000, src=0x7f3e7c0ff010, 
sizeBytes=4194304, kind=1) :32 -337767258322752:337767258322753 158131:158131 MARK(name(before HIP LaunchKernel)) 337767258328102:337767258328503 158131:158131 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :34 337767258329374:337767258330206 158131:158131 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :35 337767258330987:337767258342148 158131:158131 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7fff99cd0d68, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :36 -337767258343721:337767258343722 158131:158131 MARK(name(after HIP LaunchKernel)) 337767258346667:337767260513855 158131:158131 hipMemcpy(dst=0x7f3e43bf9010, src=0x7f3e42800000, sizeBytes=4194304, kind=2) :38 337767270817268:337767273062733 158131:158131 hipMemcpy(dst=0x7f3e43000000, src=0x7f3e7c0ff010, sizeBytes=4194304, kind=1) :39 -337767273064547:337767273064548 158131:158131 MARK(name(before HIP LaunchKernel)) 337767273070348:337767273070919 158131:158131 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :41 337767273072292:337767273072913 158131:158131 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :42 337767273077181:337767273093912 158131:158131 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7fff99cd0d68, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :43 -337767273095986:337767273095987 158131:158131 MARK(name(after HIP LaunchKernel)) 337767273099132:337767275379964 158131:158131 hipMemcpy(dst=0x7f3e43bf9010, src=0x7f3e42800000, sizeBytes=4194304, kind=2) :45 337767283563870:337767285732370 158131:158131 hipMemcpy(dst=0x7f3e43000000, src=0x7f3e7c0ff010, sizeBytes=4194304, kind=1) :46 -337767285733633:337767285733634 158131:158131 MARK(name(before HIP LaunchKernel)) 337767285739734:337767285740285 158131:158131 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) 
:48 337767285741207:337767285741668 158131:158131 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :49 337767285742469:337767285753019 158131:158131 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7fff99cd0d68, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :50 -337767285755083:337767285755084 158131:158131 MARK(name(after HIP LaunchKernel)) 337767285757818:337767287927761 158131:158131 hipMemcpy(dst=0x7f3e43bf9010, src=0x7f3e42800000, sizeBytes=4194304, kind=2) :52 337767297920088:337767300172035 158131:158131 hipMemcpy(dst=0x7f3e43000000, src=0x7f3e7c0ff010, sizeBytes=4194304, kind=1) :53 -337767300173809:337767300173810 158131:158131 MARK(name(before HIP LaunchKernel)) 337767300179620:337767300180261 158131:158131 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :55 337767300181513:337767300182285 158131:158131 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :56 337767300186312:337767300202703 158131:158131 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7fff99cd0d68, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :57 -337767300204958:337767300204959 158131:158131 MARK(name(after HIP LaunchKernel)) 337767300208164:337767302399998 158131:158131 hipMemcpy(dst=0x7f3e43bf9010, src=0x7f3e42800000, sizeBytes=4194304, kind=2) :59 337767310566371:337767312698212 158131:158131 hipMemcpy(dst=0x7f3e43000000, src=0x7f3e7c0ff010, sizeBytes=4194304, kind=1) :60 -337767312701368:337767312701369 158131:158131 MARK(name(before HIP LaunchKernel)) 337767312705676:337767312706147 158131:158131 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :62 337767312707008:337767312707459 158131:158131 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :63 337767312708261:337767312720814 158131:158131 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, 
dimBlocks={}, args=0x7fff99cd0d68, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :64 -337767312722297:337767312722298 158131:158131 MARK(name(after HIP LaunchKernel)) 337767312725162:337767314905485 158131:158131 hipMemcpy(dst=0x7f3e43bf9010, src=0x7f3e42800000, sizeBytes=4194304, kind=2) :66 337767324225302:337767326630218 158131:158131 hipMemcpy(dst=0x7f3e43000000, src=0x7f3e7c0ff010, sizeBytes=4194304, kind=1) :67 -337767326632171:337767326632172 158131:158131 MARK(name(before HIP LaunchKernel)) 337767326638293:337767326639004 158131:158131 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :69 337767326640347:337767326641088 158131:158131 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :70 337767326644284:337767326659503 158131:158131 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7fff99cd0d68, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :71 -337767326662058:337767326662059 158131:158131 MARK(name(after HIP LaunchKernel)) 337767326665324:337767328874220 158131:158131 hipMemcpy(dst=0x7f3e43bf9010, src=0x7f3e42800000, sizeBytes=4194304, kind=2) :73 337767341730008:337767343889150 158131:158131 hipMemcpy(dst=0x7f3e43000000, src=0x7f3e7c0ff010, sizeBytes=4194304, kind=1) :74 -337767343891364:337767343891365 158131:158131 MARK(name(before HIP LaunchKernel)) 337767343895582:337767343895963 158131:158131 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :76 337767343896855:337767343897276 158131:158131 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :77 337767343898067:337767343909118 158131:158131 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7fff99cd0d68, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :78 -337767343910821:337767343910822 158131:158131 MARK(name(after HIP LaunchKernel)) 337767343914097:337767346149393 
158131:158131 hipMemcpy(dst=0x7f3e43bf9010, src=0x7f3e42800000, sizeBytes=4194304, kind=2) :80 337767357340201:337767359602798 158131:158131 hipMemcpy(dst=0x7f3e43000000, src=0x7f3e7c0ff010, sizeBytes=4194304, kind=1) :81 -337767359604041:337767359604042 158131:158131 MARK(name(before HIP LaunchKernel)) 337767359607898:337767359608229 158131:158131 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :83 337767359609191:337767359609641 158131:158131 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :84 337767359612637:337767359624790 158131:158131 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7fff99cd0d68, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :85 -337767359626142:337767359626143 158131:158131 MARK(name(after HIP LaunchKernel)) 337767359628226:337767362392901 158131:158131 hipMemcpy(dst=0x7f3e43bf9010, src=0x7f3e42800000, sizeBytes=4194304, kind=2) :87 337767370583520:337767372727504 158131:158131 hipMemcpy(dst=0x7f3e43000000, src=0x7f3e7c0ff010, sizeBytes=4194304, kind=1) :88 -337767372729918:337767372729919 158131:158131 MARK(name(before HIP LaunchKernel)) 337767372734026:337767372734597 158131:158131 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :90 337767372735338:337767372736060 158131:158131 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :91 337767372736931:337767372748102 158131:158131 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7fff99cd0d68, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :92 -337767372750006:337767372750007 158131:158131 MARK(name(after HIP LaunchKernel)) 337767372753082:337767374911783 158131:158131 hipMemcpy(dst=0x7f3e43bf9010, src=0x7f3e42800000, sizeBytes=4194304, kind=2) :94 337767386842927:337767389026054 158131:158131 hipMemcpy(dst=0x7f3e43000000, src=0x7f3e7c0ff010, sizeBytes=4194304, kind=1) :95 
-337767389027407:337767389027408 158131:158131 MARK(name(before HIP LaunchKernel)) 337767389032176:337767389032757 158131:158131 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :97 337767389034631:337767389035061 158131:158131 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :98 337767389035893:337767389047835 158131:158131 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7fff99cd0d68, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :99 -337767389049208:337767389049209 158131:158131 MARK(name(after HIP LaunchKernel)) 337767389051653:337767391262963 158131:158131 hipMemcpy(dst=0x7f3e43bf9010, src=0x7f3e42800000, sizeBytes=4194304, kind=2) :101 337767401302348:337767403625540 158131:158131 hipMemcpy(dst=0x7f3e43000000, src=0x7f3e7c0ff010, sizeBytes=4194304, kind=1) :102 -337767403631150:337767403631151 158131:158131 MARK(name(before HIP LaunchKernel)) 337767403636711:337767403637442 158131:158131 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :104 337767403638775:337767403639406 158131:158131 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :105 337767403640658:337767403655196 158131:158131 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7fff99cd0d68, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :106 -337767403657119:337767403657120 158131:158131 MARK(name(after HIP LaunchKernel)) 337767403661508:337767405850096 158131:158131 hipMemcpy(dst=0x7f3e43bf9010, src=0x7f3e42800000, sizeBytes=4194304, kind=2) :108 337767414038200:337767416155293 158131:158131 hipMemcpy(dst=0x7f3e43000000, src=0x7f3e7c0ff010, sizeBytes=4194304, kind=1) :109 -337767416157006:337767416157007 158131:158131 MARK(name(before HIP LaunchKernel)) 337767416161023:337767416161414 158131:158131 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :111 
337767416164199:337767416164761 158131:158131 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :112 337767416165562:337767416175861 158131:158131 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7fff99cd0d68, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :113 -337767416177174:337767416177175 158131:158131 MARK(name(after HIP LaunchKernel)) 337767416179899:337767418323041 158131:158131 hipMemcpy(dst=0x7f3e43bf9010, src=0x7f3e42800000, sizeBytes=4194304, kind=2) :115 337767426540801:337767428694984 158131:158131 hipMemcpy(dst=0x7f3e43000000, src=0x7f3e7c0ff010, sizeBytes=4194304, kind=1) :116 -337767428697219:337767428697220 158131:158131 MARK(name(before HIP LaunchKernel)) 337767428701006:337767428701396 158131:158131 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :118 337767428702178:337767428702759 158131:158131 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :119 337767428703530:337767428713249 158131:158131 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7fff99cd0d68, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :120 -337767428715854:337767428715855 158131:158131 MARK(name(after HIP LaunchKernel)) 337767428718048:337767430853946 158131:158131 hipMemcpy(dst=0x7f3e43bf9010, src=0x7f3e42800000, sizeBytes=4194304, kind=2) :122 337767442974988:337767445133299 158131:158131 hipMemcpy(dst=0x7f3e43000000, src=0x7f3e7c0ff010, sizeBytes=4194304, kind=1) :123 -337767445134602:337767445134603 158131:158131 MARK(name(before HIP LaunchKernel)) 337767445138419:337767445138820 158131:158131 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :125 337767445141455:337767445141895 158131:158131 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :126 337767445142757:337767445153267 158131:158131 hipLaunchKernel(function_address=0x200ee0, 
numBlocks={}, dimBlocks={}, args=0x7fff99cd0d68, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :127 -337767445154730:337767445154731 158131:158131 MARK(name(after HIP LaunchKernel)) 337767445156874:337767447277203 158131:158131 hipMemcpy(dst=0x7f3e43bf9010, src=0x7f3e42800000, sizeBytes=4194304, kind=2) :129 337767457262296:337767459631826 158131:158131 hipMemcpy(dst=0x7f3e43000000, src=0x7f3e7c0ff010, sizeBytes=4194304, kind=1) :130 -337767459635052:337767459635053 158131:158131 MARK(name(before HIP LaunchKernel)) 337767459640913:337767459641674 158131:158131 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :132 337767459643007:337767459643618 158131:158131 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :133 337767459644880:337767459659969 158131:158131 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7fff99cd0d68, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :134 -337767459663044:337767459663045 158131:158131 MARK(name(after HIP LaunchKernel)) 337767459666571:337767461850200 158131:158131 hipMemcpy(dst=0x7f3e43bf9010, src=0x7f3e42800000, sizeBytes=4194304, kind=2) :136 337767470021230:337767472126100 158131:158131 hipMemcpy(dst=0x7f3e43000000, src=0x7f3e7c0ff010, sizeBytes=4194304, kind=1) :137 -337767472127232:337767472127233 158131:158131 MARK(name(before HIP LaunchKernel)) 337767472131229:337767472131680 158131:158131 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :139 337767472133854:337767472134395 158131:158131 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :140 337767472135177:337767472145406 158131:158131 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7fff99cd0d68, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :141 -337767472147189:337767472147190 158131:158131 MARK(name(after HIP LaunchKernel)) 
337767472149313:337767474294991 158131:158131 hipMemcpy(dst=0x7f3e43bf9010, src=0x7f3e42800000, sizeBytes=4194304, kind=2) :143 337767482547276:337767484700437 158131:158131 hipMemcpy(dst=0x7f3e43000000, src=0x7f3e7c0ff010, sizeBytes=4194304, kind=1) :144 -337767484701639:337767484701640 158131:158131 MARK(name(before HIP LaunchKernel)) 337767484705637:337767484706107 158131:158131 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :146 337767484707009:337767484707460 158131:158131 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :147 337767484708231:337767484717819 158131:158131 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7fff99cd0d68, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :148 -337767484721026:337767484721027 158131:158131 MARK(name(after HIP LaunchKernel)) 337767484723540:337767486915875 158131:158131 hipMemcpy(dst=0x7f3e43bf9010, src=0x7f3e42800000, sizeBytes=4194304, kind=2) :150 337767498181053:337767500271376 158131:158131 hipMemcpy(dst=0x7f3e43000000, src=0x7f3e7c0ff010, sizeBytes=4194304, kind=1) :151 -337767500272478:337767500272479 158131:158131 MARK(name(before HIP LaunchKernel)) 337767500276305:337767500276836 158131:158131 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :153 337767500279020:337767500279471 158131:158131 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :154 337767500280313:337767500290712 158131:158131 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7fff99cd0d68, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :155 -337767500292085:337767500292086 158131:158131 MARK(name(after HIP LaunchKernel)) 337767500294219:337767502424908 158131:158131 hipMemcpy(dst=0x7f3e43bf9010, src=0x7f3e42800000, sizeBytes=4194304, kind=2) :157 337767514185079:337767516608650 158131:158131 hipMemcpy(dst=0x7f3e43000000, src=0x7f3e7c0ff010, 
sizeBytes=4194304, kind=1) :158 -337767516609902:337767516609903 158131:158131 MARK(name(before HIP LaunchKernel)) 337767516613960:337767516614401 158131:158131 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :160 337767516615273:337767516615693 158131:158131 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :161 337767516616465:337767516625923 158131:158131 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7fff99cd0d68, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :162 -337767516628097:337767516628098 158131:158131 MARK(name(after HIP LaunchKernel)) 337767516630201:337767518783622 158131:158131 hipMemcpy(dst=0x7f3e43bf9010, src=0x7f3e42800000, sizeBytes=4194304, kind=2) :164 337767528416742:337767530629065 158131:158131 hipMemcpy(dst=0x7f3e43000000, src=0x7f3e7c0ff010, sizeBytes=4194304, kind=1) :165 -337767530630918:337767530630919 158131:158131 MARK(name(before HIP LaunchKernel)) 337767530636539:337767530637591 158131:158131 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :167 337767530641548:337767530642310 158131:158131 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :168 337767530643622:337767530657498 158131:158131 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7fff99cd0d68, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :169 -337767530659362:337767530659363 158131:158131 MARK(name(after HIP LaunchKernel)) 337767530662688:337767532867857 158131:158131 hipMemcpy(dst=0x7f3e43bf9010, src=0x7f3e42800000, sizeBytes=4194304, kind=2) :171 337767545437393:337767547614971 158131:158131 hipMemcpy(dst=0x7f3e43000000, src=0x7f3e7c0ff010, sizeBytes=4194304, kind=1) :172 -337767547616493:337767547616494 158131:158131 MARK(name(before HIP LaunchKernel)) 337767547621162:337767547621713 158131:158131 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, 
stream=0) :174 337767547622695:337767547623116 158131:158131 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :175 337767547623907:337767547635189 158131:158131 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7fff99cd0d68, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :176 -337767547638194:337767547638195 158131:158131 MARK(name(after HIP LaunchKernel)) 337767547640258:337767549833144 158131:158131 hipMemcpy(dst=0x7f3e43bf9010, src=0x7f3e42800000, sizeBytes=4194304, kind=2) :178 337767561223869:337767563606182 158131:158131 hipMemcpy(dst=0x7f3e43000000, src=0x7f3e7c0ff010, sizeBytes=4194304, kind=1) :179 -337767563607365:337767563607366 158131:158131 MARK(name(before HIP LaunchKernel)) 337767563613396:337767563613977 158131:158131 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :181 337767563614849:337767563615299 158131:158131 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :182 337767563616141:337767563627482 158131:158131 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7fff99cd0d68, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :183 -337767563628835:337767563628836 158131:158131 MARK(name(after HIP LaunchKernel)) 337767563631961:337767565791204 158131:158131 hipMemcpy(dst=0x7f3e43bf9010, src=0x7f3e42800000, sizeBytes=4194304, kind=2) :185 337767574231283:337767576611813 158131:158131 hipMemcpy(dst=0x7f3e43000000, src=0x7f3e7c0ff010, sizeBytes=4194304, kind=1) :186 -337767576613155:337767576613156 158131:158131 MARK(name(before HIP LaunchKernel)) 337767576617123:337767576617614 158131:158131 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :188 337767576618505:337767576618946 158131:158131 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :189 337767576619868:337767576630779 158131:158131 
hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7fff99cd0d68, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :190 -337767576634506:337767576634507 158131:158131 MARK(name(after HIP LaunchKernel)) 337767576636640:337767578794640 158131:158131 hipMemcpy(dst=0x7f3e43bf9010, src=0x7f3e42800000, sizeBytes=4194304, kind=2) :192 337767590051562:337767592160019 158131:158131 hipMemcpy(dst=0x7f3e43000000, src=0x7f3e7c0ff010, sizeBytes=4194304, kind=1) :193 -337767592161472:337767592161473 158131:158131 MARK(name(before HIP LaunchKernel)) 337767592166561:337767592166952 158131:158131 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :195 337767592167914:337767592168345 158131:158131 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :196 337767592169216:337767592179626 158131:158131 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7fff99cd0d68, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :197 -337767592180968:337767592180969 158131:158131 MARK(name(after HIP LaunchKernel)) 337767592183203:337767594314973 158131:158131 hipMemcpy(dst=0x7f3e43bf9010, src=0x7f3e42800000, sizeBytes=4194304, kind=2) :199 337767605936183:337767608091408 158131:158131 hipMemcpy(dst=0x7f3e43000000, src=0x7f3e7c0ff010, sizeBytes=4194304, kind=1) :200 -337767608092600:337767608092601 158131:158131 MARK(name(before HIP LaunchKernel)) 337767608096537:337767608096988 158131:158131 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :202 337767608097750:337767608098311 158131:158131 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :203 337767608101026:337767608112738 158131:158131 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7fff99cd0d68, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :204 -337767608114020:337767608114021 158131:158131 
MARK(name(after HIP LaunchKernel)) 337767608116084:337767610323438 158131:158131 hipMemcpy(dst=0x7f3e43bf9010, src=0x7f3e42800000, sizeBytes=4194304, kind=2) :206 337767618518844:337767620683096 158131:158131 hipMemcpy(dst=0x7f3e43000000, src=0x7f3e7c0ff010, sizeBytes=4194304, kind=1) :207 -337767620684288:337767620684289 158131:158131 MARK(name(before HIP LaunchKernel)) 337767620689798:337767620690259 158131:158131 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :209 337767620691031:337767620691481 158131:158131 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :210 337767620692273:337767620702853 158131:158131 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7fff99cd0d68, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :211 -337767620704055:337767620704056 158131:158131 MARK(name(after HIP LaunchKernel)) 337767620706219:337767622841567 158131:158131 hipMemcpy(dst=0x7f3e43bf9010, src=0x7f3e42800000, sizeBytes=4194304, kind=2) :213 337767634728658:337767636912607 158131:158131 hipMemcpy(dst=0x7f3e43000000, src=0x7f3e7c0ff010, sizeBytes=4194304, kind=1) :214 -337767636913829:337767636913830 158131:158131 MARK(name(before HIP LaunchKernel)) 337767636917446:337767636917907 158131:158131 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :216 337767636918768:337767636919219 158131:158131 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :217 337767636922566:337767636933456 158131:158131 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7fff99cd0d68, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :218 -337767636934718:337767636934719 158131:158131 MARK(name(after HIP LaunchKernel)) 337767636936953:337767639147723 158131:158131 hipMemcpy(dst=0x7f3e43bf9010, src=0x7f3e42800000, sizeBytes=4194304, kind=2) :220 337767648806150:337767651012391 158131:158131 
hipMemcpy(dst=0x7f3e43000000, src=0x7f3e7c0ff010, sizeBytes=4194304, kind=1) :221 -337767651014295:337767651014296 158131:158131 MARK(name(before HIP LaunchKernel)) 337767651022560:337767651023131 158131:158131 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :223 337767651024564:337767651025175 158131:158131 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :224 337767651026508:337767651042107 158131:158131 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7fff99cd0d68, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :225 -337767651044301:337767651044302 158131:158131 MARK(name(after HIP LaunchKernel)) 337767651047637:337767653313171 158131:158131 hipMemcpy(dst=0x7f3e43bf9010, src=0x7f3e42800000, sizeBytes=4194304, kind=2) :227 337767661488300:337767663643385 158131:158131 hipMemcpy(dst=0x7f3e43000000, src=0x7f3e7c0ff010, sizeBytes=4194304, kind=1) :228 -337767663644467:337767663644468 158131:158131 MARK(name(before HIP LaunchKernel)) 337767663648374:337767663648785 158131:158131 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :230 337767663649857:337767663650298 158131:158131 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :231 337767663652402:337767663662471 158131:158131 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7fff99cd0d68, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :232 -337767663663723:337767663663724 158131:158131 MARK(name(after HIP LaunchKernel)) 337767663665907:337767665806926 158131:158131 hipMemcpy(dst=0x7f3e43bf9010, src=0x7f3e42800000, sizeBytes=4194304, kind=2) :234 337767673986283:337767676162668 158131:158131 hipMemcpy(dst=0x7f3e43000000, src=0x7f3e7c0ff010, sizeBytes=4194304, kind=1) :235 -337767676163920:337767676163921 158131:158131 MARK(name(before HIP LaunchKernel)) 337767676170162:337767676170573 158131:158131 
__hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :237 337767676171475:337767676171915 158131:158131 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :238 337767676172757:337767676183457 158131:158131 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7fff99cd0d68, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :239 -337767676184780:337767676184781 158131:158131 MARK(name(after HIP LaunchKernel)) 337767676188406:337767678326950 158131:158131 hipMemcpy(dst=0x7f3e43bf9010, src=0x7f3e42800000, sizeBytes=4194304, kind=2) :241 337767689596174:337767691883598 158131:158131 hipMemcpy(dst=0x7f3e43000000, src=0x7f3e7c0ff010, sizeBytes=4194304, kind=1) :242 -337767691884750:337767691884751 158131:158131 MARK(name(before HIP LaunchKernel)) 337767691889128:337767691889539 158131:158131 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :244 337767691890451:337767691890882 158131:158131 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :245 337767691893787:337767691905870 158131:158131 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7fff99cd0d68, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :246 -337767691906972:337767691906973 158131:158131 MARK(name(after HIP LaunchKernel)) 337767691909086:337767694152087 158131:158131 hipMemcpy(dst=0x7f3e43bf9010, src=0x7f3e42800000, sizeBytes=4194304, kind=2) :248 337767705566316:337767707738723 158131:158131 hipMemcpy(dst=0x7f3e43000000, src=0x7f3e7c0ff010, sizeBytes=4194304, kind=1) :249 -337767707739965:337767707739966 158131:158131 MARK(name(before HIP LaunchKernel)) 337767707745015:337767707745466 158131:158131 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :251 337767707746347:337767707746788 158131:158131 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :252 
337767707747540:337767707757609 158131:158131 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7fff99cd0d68, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :253 -337767707758841:337767707758842 158131:158131 MARK(name(after HIP LaunchKernel)) 337767707762127:337767709917513 158131:158131 hipMemcpy(dst=0x7f3e43bf9010, src=0x7f3e42800000, sizeBytes=4194304, kind=2) :255 337767718100777:337767720221597 158131:158131 hipMemcpy(dst=0x7f3e43000000, src=0x7f3e7c0ff010, sizeBytes=4194304, kind=1) :256 -337767720222760:337767720222761 158131:158131 MARK(name(before HIP LaunchKernel)) 337767720226607:337767720227148 158131:158131 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :258 337767720228020:337767720228460 158131:158131 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :259 337767720231025:337767720241485 158131:158131 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7fff99cd0d68, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :260 -337767720242727:337767720242728 158131:158131 MARK(name(after HIP LaunchKernel)) 337767720244851:337767722379548 158131:158131 hipMemcpy(dst=0x7f3e43bf9010, src=0x7f3e42800000, sizeBytes=4194304, kind=2) :262 337767733662679:337767735850947 158131:158131 hipMemcpy(dst=0x7f3e43000000, src=0x7f3e7c0ff010, sizeBytes=4194304, kind=1) :263 -337767735855846:337767735855847 158131:158131 MARK(name(before HIP LaunchKernel)) 337767735859793:337767735860264 158131:158131 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :265 337767735861136:337767735861567 158131:158131 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :266 337767735862328:337767735872968 158131:158131 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7fff99cd0d68, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :267 
-337767735874281:337767735874282 158131:158131 MARK(name(after HIP LaunchKernel)) 337767735877266:337767738154572 158131:158131 hipMemcpy(dst=0x7f3e43bf9010, src=0x7f3e42800000, sizeBytes=4194304, kind=2) :269 337767749496464:337767751689019 158131:158131 hipMemcpy(dst=0x7f3e43000000, src=0x7f3e7c0ff010, sizeBytes=4194304, kind=1) :270 -337767751690242:337767751690243 158131:158131 MARK(name(before HIP LaunchKernel)) 337767751695231:337767751695752 158131:158131 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :272 337767751696614:337767751697055 158131:158131 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :273 337767751699119:337767751709298 158131:158131 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7fff99cd0d68, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :274 -337767751710520:337767751710521 158131:158131 MARK(name(after HIP LaunchKernel)) 337767751712754:337767753849585 158131:158131 hipMemcpy(dst=0x7f3e43bf9010, src=0x7f3e42800000, sizeBytes=4194304, kind=2) :276 337767762008121:337767764124393 158131:158131 hipMemcpy(dst=0x7f3e43000000, src=0x7f3e7c0ff010, sizeBytes=4194304, kind=1) :277 -337767764127208:337767764127209 158131:158131 MARK(name(before HIP LaunchKernel)) 337767764130875:337767764131336 158131:158131 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :279 337767764132227:337767764133149 158131:158131 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :280 337767764133941:337767764144380 158131:158131 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7fff99cd0d68, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :281 -337767764145683:337767764145684 158131:158131 MARK(name(after HIP LaunchKernel)) 337767764148638:337767766289286 158131:158131 hipMemcpy(dst=0x7f3e43bf9010, src=0x7f3e42800000, sizeBytes=4194304, kind=2) :283 
337767777579200:337767779697586 158131:158131 hipMemcpy(dst=0x7f3e43000000, src=0x7f3e7c0ff010, sizeBytes=4194304, kind=1) :284 -337767779698878:337767779698879 158131:158131 MARK(name(before HIP LaunchKernel)) 337767779702736:337767779703196 158131:158131 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :286 337767779705741:337767779706352 158131:158131 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :287 337767779707164:337767779717423 158131:158131 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7fff99cd0d68, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :288 -337767779718676:337767779718677 158131:158131 MARK(name(after HIP LaunchKernel)) 337767779720840:337767781914698 158131:158131 hipMemcpy(dst=0x7f3e43bf9010, src=0x7f3e42800000, sizeBytes=4194304, kind=2) :290 337767793066662:337767795230303 158131:158131 hipMemcpy(dst=0x7f3e43000000, src=0x7f3e7c0ff010, sizeBytes=4194304, kind=1) :291 -337767795232837:337767795232838 158131:158131 MARK(name(before HIP LaunchKernel)) 337767795236725:337767795237186 158131:158131 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :293 337767795238047:337767795238799 158131:158131 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :294 337767795239580:337767795250210 158131:158131 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7fff99cd0d68, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :295 -337767795251523:337767795251524 158131:158131 MARK(name(after HIP LaunchKernel)) 337767795254298:337767797393343 158131:158131 hipMemcpy(dst=0x7f3e43bf9010, src=0x7f3e42800000, sizeBytes=4194304, kind=2) :297 337767805564384:337767807730991 158131:158131 hipMemcpy(dst=0x7f3e43000000, src=0x7f3e7c0ff010, sizeBytes=4194304, kind=1) :298 -337767807732183:337767807732184 158131:158131 MARK(name(before HIP LaunchKernel)) 
337767807736431:337767807736832 158131:158131 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :300 337767807738886:337767807739326 158131:158131 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :301 337767807740128:337767807750718 158131:158131 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7fff99cd0d68, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :302 -337767807752381:337767807752382 158131:158131 MARK(name(after HIP LaunchKernel)) 337767807754445:337767809910952 158131:158131 hipMemcpy(dst=0x7f3e43bf9010, src=0x7f3e42800000, sizeBytes=4194304, kind=2) :304 337767821070741:337767823217330 158131:158131 hipMemcpy(dst=0x7f3e43000000, src=0x7f3e7c0ff010, sizeBytes=4194304, kind=1) :305 -337767823220606:337767823220607 158131:158131 MARK(name(before HIP LaunchKernel)) 337767823224113:337767823224584 158131:158131 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :307 337767823225435:337767823226117 158131:158131 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :308 337767823226888:337767823237809 158131:158131 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7fff99cd0d68, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :309 -337767823239061:337767823239062 158131:158131 MARK(name(after HIP LaunchKernel)) 337767823242508:337767825378306 158131:158131 hipMemcpy(dst=0x7f3e43bf9010, src=0x7f3e42800000, sizeBytes=4194304, kind=2) :311 337767836186891:337767838611975 158131:158131 hipMemcpy(dst=0x7f3e43000000, src=0x7f3e7c0ff010, sizeBytes=4194304, kind=1) :312 -337767838613157:337767838613158 158131:158131 MARK(name(before HIP LaunchKernel)) 337767838616974:337767838617435 158131:158131 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :314 337767838619529:337767838619980 158131:158131 __hipPopCallConfiguration(gridDim={}, blockDim={}, 
sharedMem=0, stream=0) :315 337767838620751:337767838631151 158131:158131 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7fff99cd0d68, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :316 -337767838632373:337767838632374 158131:158131 MARK(name(after HIP LaunchKernel)) 337767838634497:337767841389975 158131:158131 hipMemcpy(dst=0x7f3e43bf9010, src=0x7f3e42800000, sizeBytes=4194304, kind=2) :318 337767849602334:337767851741900 158131:158131 hipMemcpy(dst=0x7f3e43000000, src=0x7f3e7c0ff010, sizeBytes=4194304, kind=1) :319 -337767851744665:337767851744666 158131:158131 MARK(name(before HIP LaunchKernel)) 337767851748422:337767851748953 158131:158131 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :321 337767851749865:337767851750326 158131:158131 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :322 337767851751257:337767851762939 158131:158131 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7fff99cd0d68, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :323 -337767851765164:337767851765165 158131:158131 MARK(name(after HIP LaunchKernel)) 337767851767288:337767853916672 158131:158131 hipMemcpy(dst=0x7f3e43bf9010, src=0x7f3e42800000, sizeBytes=4194304, kind=2) :325 337767865022429:337767867180790 158131:158131 hipMemcpy(dst=0x7f3e43000000, src=0x7f3e7c0ff010, sizeBytes=4194304, kind=1) :326 -337767867182052:337767867182053 158131:158131 MARK(name(before HIP LaunchKernel)) 337767867186040:337767867186581 158131:158131 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :328 337767867189446:337767867189897 158131:158131 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :329 337767867190779:337767867201489 158131:158131 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7fff99cd0d68, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, 
float*, int) :330 -337767867202901:337767867202902 158131:158131 MARK(name(after HIP LaunchKernel)) 337767867204965:337767869336385 158131:158131 hipMemcpy(dst=0x7f3e43bf9010, src=0x7f3e42800000, sizeBytes=4194304, kind=2) :332 337767877529990:337767879695264 158131:158131 hipMemcpy(dst=0x7f3e43000000, src=0x7f3e7c0ff010, sizeBytes=4194304, kind=1) :333 -337767879697488:337767879697489 158131:158131 MARK(name(before HIP LaunchKernel)) 337767879701245:337767879701716 158131:158131 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :335 337767879702618:337767879703058 158131:158131 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :336 337767879703930:337767879713799 158131:158131 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7fff99cd0d68, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :337 -337767879715792:337767879715793 158131:158131 MARK(name(after HIP LaunchKernel)) 337767879717967:337767881913167 158131:158131 hipMemcpy(dst=0x7f3e43bf9010, src=0x7f3e42800000, sizeBytes=4194304, kind=2) :339 337767892995540:337767895138812 158131:158131 hipMemcpy(dst=0x7f3e43000000, src=0x7f3e7c0ff010, sizeBytes=4194304, kind=1) :340 -337767895140095:337767895140096 158131:158131 MARK(name(before HIP LaunchKernel)) 337767895143782:337767895144343 158131:158131 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :342 337767895147399:337767895147850 158131:158131 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :343 337767895148741:337767895159992 158131:158131 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7fff99cd0d68, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :344 -337767895161385:337767895161386 158131:158131 MARK(name(after HIP LaunchKernel)) 337767895163479:337767897306631 158131:158131 hipMemcpy(dst=0x7f3e43bf9010, src=0x7f3e42800000, sizeBytes=4194304, kind=2) 
:346 337767905491088:337767907644809 158131:158131 hipMemcpy(dst=0x7f3e43000000, src=0x7f3e7c0ff010, sizeBytes=4194304, kind=1) :347 -337767907645981:337767907645982 158131:158131 MARK(name(before HIP LaunchKernel)) 337767907649959:337767907650369 158131:158131 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :349 337767907651271:337767907651702 158131:158131 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :350 337767907652503:337767907662182 158131:158131 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7fff99cd0d68, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :351 -337767907665338:337767907665339 158131:158131 MARK(name(after HIP LaunchKernel)) 337767907667542:337767909844217 158131:158131 hipMemcpy(dst=0x7f3e43bf9010, src=0x7f3e42800000, sizeBytes=4194304, kind=2) :353 337767920834587:337767923015841 158131:158131 hipMemcpy(dst=0x7f3e43000000, src=0x7f3e7c0ff010, sizeBytes=4194304, kind=1) :354 -337767923016993:337767923016994 158131:158131 MARK(name(before HIP LaunchKernel)) 337767923020860:337767923021301 158131:158131 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :356 337767923023625:337767923024216 158131:158131 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :357 337767923025018:337767923034997 158131:158131 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7fff99cd0d68, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :358 -337767923036169:337767923036170 158131:158131 MARK(name(after HIP LaunchKernel)) 337767923038564:337767925257779 158131:158131 hipMemcpy(dst=0x7f3e43bf9010, src=0x7f3e42800000, sizeBytes=4194304, kind=2) :360 337767933415867:337767935607120 158131:158131 hipMemcpy(dst=0x7f3e43000000, src=0x7f3e7c0ff010, sizeBytes=4194304, kind=1) :361 -337767935608392:337767935608393 158131:158131 MARK(name(before HIP LaunchKernel)) 
337767935614504:337767935614894 158131:158131 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :363 337767935615796:337767935616227 158131:158131 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :364 337767935617028:337767935628791 158131:158131 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7fff99cd0d68, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :365 -337767935630814:337767935630815 158131:158131 MARK(name(after HIP LaunchKernel)) 337767935633069:337767937795447 158131:158131 hipMemcpy(dst=0x7f3e43bf9010, src=0x7f3e42800000, sizeBytes=4194304, kind=2) :367 337767949313502:337767951612979 158131:158131 hipMemcpy(dst=0x7f3e43000000, src=0x7f3e7c0ff010, sizeBytes=4194304, kind=1) :368 -337767951614161:337767951614162 158131:158131 MARK(name(before HIP LaunchKernel)) 337767951619331:337767951619782 158131:158131 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :370 337767951620653:337767951621084 158131:158131 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :371 337767951621866:337767951631975 158131:158131 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7fff99cd0d68, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :372 -337767951633117:337767951633118 158131:158131 MARK(name(after HIP LaunchKernel)) 337767951635461:337767953790546 158131:158131 hipMemcpy(dst=0x7f3e43bf9010, src=0x7f3e42800000, sizeBytes=4194304, kind=2) :374 337767961974873:337767964090794 158131:158131 hipMemcpy(dst=0x7f3e43000000, src=0x7f3e7c0ff010, sizeBytes=4194304, kind=1) :375 -337767964091946:337767964091947 158131:158131 MARK(name(before HIP LaunchKernel)) 337767964095883:337767964096334 158131:158131 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :377 337767964097106:337767964097537 158131:158131 __hipPopCallConfiguration(gridDim={}, blockDim={}, 
sharedMem=0, stream=0) :378 337767964098328:337767964109599 158131:158131 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7fff99cd0d68, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :379 -337767964112976:337767964112977 158131:158131 MARK(name(after HIP LaunchKernel)) 337767964115200:337767966322082 158131:158131 hipMemcpy(dst=0x7f3e43bf9010, src=0x7f3e42800000, sizeBytes=4194304, kind=2) :381 337767978141735:337767980312048 158131:158131 hipMemcpy(dst=0x7f3e43000000, src=0x7f3e7c0ff010, sizeBytes=4194304, kind=1) :382 -337767980313270:337767980313271 158131:158131 MARK(name(before HIP LaunchKernel)) 337767980318530:337767980318991 158131:158131 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :384 337767980319862:337767980320283 158131:158131 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :385 337767980321075:337767980333218 158131:158131 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7fff99cd0d68, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :386 -337767980334370:337767980334371 158131:158131 MARK(name(after HIP LaunchKernel)) 337767980336434:337767982463225 158131:158131 hipMemcpy(dst=0x7f3e43bf9010, src=0x7f3e42800000, sizeBytes=4194304, kind=2) :388 337767990643965:337767992831180 158131:158131 hipMemcpy(dst=0x7f3e43000000, src=0x7f3e7c0ff010, sizeBytes=4194304, kind=1) :389 -337767992832242:337767992832243 158131:158131 MARK(name(before HIP LaunchKernel)) 337767992835949:337767992836400 158131:158131 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :391 337767992837272:337767992838023 158131:158131 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :392 337767992838895:337767992849154 158131:158131 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7fff99cd0d68, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, 
float*, int) :393 -337767992852671:337767992852672 158131:158131 MARK(name(after HIP LaunchKernel)) 337767992854985:337767995170302 158131:158131 hipMemcpy(dst=0x7f3e43bf9010, src=0x7f3e42800000, sizeBytes=4194304, kind=2) :395 337768006421925:337768008620131 158131:158131 hipMemcpy(dst=0x7f3e43000000, src=0x7f3e7c0ff010, sizeBytes=4194304, kind=1) :396 -337768008621293:337768008621294 158131:158131 MARK(name(before HIP LaunchKernel)) 337768008627004:337768008627585 158131:158131 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :398 337768008628486:337768008628917 158131:158131 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :399 337768008629709:337768008640940 158131:158131 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7fff99cd0d68, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :400 -337768008642232:337768008642233 158131:158131 MARK(name(after HIP LaunchKernel)) 337768008644336:337768010807607 158131:158131 hipMemcpy(dst=0x7f3e43bf9010, src=0x7f3e42800000, sizeBytes=4194304, kind=2) :402 337768022826035:337768025034360 158131:158131 hipMemcpy(dst=0x7f3e43000000, src=0x7f3e7c0ff010, sizeBytes=4194304, kind=1) :403 -337768025035673:337768025035674 158131:158131 MARK(name(before HIP LaunchKernel)) 337768025040201:337768025040742 158131:158131 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :405 337768025041624:337768025042085 158131:158131 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :406 337768025045702:337768025059588 158131:158131 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7fff99cd0d68, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :407 -337768025061241:337768025061242 158131:158131 MARK(name(after HIP LaunchKernel)) 337768025063405:337768027279074 158131:158131 hipMemcpy(dst=0x7f3e43bf9010, src=0x7f3e42800000, sizeBytes=4194304, kind=2) 
:409 337768035511752:337768037677757 158131:158131 hipMemcpy(dst=0x7f3e43000000, src=0x7f3e7c0ff010, sizeBytes=4194304, kind=1) :410 -337768037679060:337768037679061 158131:158131 MARK(name(before HIP LaunchKernel)) 337768037684300:337768037684831 158131:158131 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :412 337768037685733:337768037686173 158131:158131 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :413 337768037687045:337768037698837 158131:158131 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7fff99cd0d68, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :414 -337768037700029:337768037700030 158131:158131 MARK(name(after HIP LaunchKernel)) 337768037702123:337768039835918 158131:158131 hipMemcpy(dst=0x7f3e43bf9010, src=0x7f3e42800000, sizeBytes=4194304, kind=2) :416 337768052464758:337768054677029 158131:158131 hipMemcpy(dst=0x7f3e43000000, src=0x7f3e7c0ff010, sizeBytes=4194304, kind=1) :417 -337768054678492:337768054678493 158131:158131 MARK(name(before HIP LaunchKernel)) 337768054682209:337768054682760 158131:158131 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :419 337768054683682:337768054684123 158131:158131 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :420 337768054687028:337768054699391 158131:158131 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7fff99cd0d68, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :421 -337768054700824:337768054700825 158131:158131 MARK(name(after HIP LaunchKernel)) 337768054702888:337768056852582 158131:158131 hipMemcpy(dst=0x7f3e43bf9010, src=0x7f3e42800000, sizeBytes=4194304, kind=2) :423 337768067696706:337768069885404 158131:158131 hipMemcpy(dst=0x7f3e43000000, src=0x7f3e7c0ff010, sizeBytes=4194304, kind=1) :424 -337768069886696:337768069886697 158131:158131 MARK(name(before HIP LaunchKernel)) 
337768069892627:337768069893168 158131:158131 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :426 337768069894090:337768069894531 158131:158131 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :427 337768069895333:337768069906854 158131:158131 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7fff99cd0d68, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :428 -337768069908016:337768069908017 158131:158131 MARK(name(after HIP LaunchKernel)) 337768069910862:337768072170173 158131:158131 hipMemcpy(dst=0x7f3e43bf9010, src=0x7f3e42800000, sizeBytes=4194304, kind=2) :430 337768080351174:337768082619121 158131:158131 hipMemcpy(dst=0x7f3e43000000, src=0x7f3e7c0ff010, sizeBytes=4194304, kind=1) :431 -337768082620604:337768082620605 158131:158131 MARK(name(before HIP LaunchKernel)) 337768082624071:337768082624542 158131:158131 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :433 337768082625433:337768082625864 158131:158131 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :434 337768082627908:337768082637987 158131:158131 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7fff99cd0d68, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :435 -337768082639350:337768082639351 158131:158131 MARK(name(after HIP LaunchKernel)) 337768082641434:337768084796047 158131:158131 hipMemcpy(dst=0x7f3e43bf9010, src=0x7f3e42800000, sizeBytes=4194304, kind=2) :437 337768095528300:337768097722809 158131:158131 hipMemcpy(dst=0x7f3e43000000, src=0x7f3e7c0ff010, sizeBytes=4194304, kind=1) :438 -337768097724172:337768097724173 158131:158131 MARK(name(before HIP LaunchKernel)) 337768097729221:337768097729712 158131:158131 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :440 337768097730584:337768097731155 158131:158131 __hipPopCallConfiguration(gridDim={}, blockDim={}, 
sharedMem=0, stream=0) :441 337768097732016:337768097741915 158131:158131 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7fff99cd0d68, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :442 -337768097743117:337768097743118 158131:158131 MARK(name(after HIP LaunchKernel)) 337768097746354:337768099921176 158131:158131 hipMemcpy(dst=0x7f3e43bf9010, src=0x7f3e42800000, sizeBytes=4194304, kind=2) :444 337768108134717:337768110237323 158131:158131 hipMemcpy(dst=0x7f3e43000000, src=0x7f3e7c0ff010, sizeBytes=4194304, kind=1) :445 -337768110238696:337768110238697 158131:158131 MARK(name(before HIP LaunchKernel)) 337768110242483:337768110242954 158131:158131 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :447 337768110243806:337768110244246 158131:158131 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :448 337768110246992:337768110258072 158131:158131 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7fff99cd0d68, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :449 -337768110259235:337768110259236 158131:158131 MARK(name(after HIP LaunchKernel)) 337768110261339:337768112386066 158131:158131 hipMemcpy(dst=0x7f3e43bf9010, src=0x7f3e42800000, sizeBytes=4194304, kind=2) :451 337768122890659:337768125074838 158131:158131 hipMemcpy(dst=0x7f3e43000000, src=0x7f3e7c0ff010, sizeBytes=4194304, kind=1) :452 -337768125076772:337768125076773 158131:158131 MARK(name(before HIP LaunchKernel)) 337768125081792:337768125082333 158131:158131 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :454 337768125083224:337768125083976 158131:158131 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :455 337768125084777:337768125095507 158131:158131 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7fff99cd0d68, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, 
float*, int) :456 -337768125096680:337768125096681 158131:158131 MARK(name(after HIP LaunchKernel)) 337768125100236:337768127306697 158131:158131 hipMemcpy(dst=0x7f3e43bf9010, src=0x7f3e42800000, sizeBytes=4194304, kind=2) :458 337768135501804:337768137646259 158131:158131 hipMemcpy(dst=0x7f3e43000000, src=0x7f3e7c0ff010, sizeBytes=4194304, kind=1) :459 -337768137647451:337768137647452 158131:158131 MARK(name(before HIP LaunchKernel)) 337768137651519:337768137651969 158131:158131 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :461 337768137652901:337768137653332 158131:158131 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :462 337768137655256:337768137665595 158131:158131 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7fff99cd0d68, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :463 -337768137667489:337768137667490 158131:158131 MARK(name(after HIP LaunchKernel)) 337768137669593:337768139840788 158131:158131 hipMemcpy(dst=0x7f3e43bf9010, src=0x7f3e42800000, sizeBytes=4194304, kind=2) :465 337768150007965:337768152183808 158131:158131 hipMemcpy(dst=0x7f3e43000000, src=0x7f3e7c0ff010, sizeBytes=4194304, kind=1) :466 -337768152188658:337768152188659 158131:158131 MARK(name(before HIP LaunchKernel)) 337768152194328:337768152195040 158131:158131 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :468 337768152196402:337768152197013 158131:158131 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :469 337768152198296:337768152212402 158131:158131 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7fff99cd0d68, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :470 -337768152214216:337768152214217 158131:158131 MARK(name(after HIP LaunchKernel)) 337768152219596:337768154397694 158131:158131 hipMemcpy(dst=0x7f3e43bf9010, src=0x7f3e42800000, sizeBytes=4194304, kind=2) 
:472 337768162580378:337768164727518 158131:158131 hipMemcpy(dst=0x7f3e43000000, src=0x7f3e7c0ff010, sizeBytes=4194304, kind=1) :473 -337768164728680:337768164728681 158131:158131 MARK(name(before HIP LaunchKernel)) 337768164733259:337768164733719 158131:158131 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :475 337768164734801:337768164735282 158131:158131 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :476 337768164737577:337768164748748 158131:158131 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7fff99cd0d68, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :477 -337768164750000:337768164750001 158131:158131 MARK(name(after HIP LaunchKernel)) 337768164752154:337768166918771 158131:158131 hipMemcpy(dst=0x7f3e43bf9010, src=0x7f3e42800000, sizeBytes=4194304, kind=2) :479 337768176494672:337768178718727 158131:158131 hipMemcpy(dst=0x7f3e43000000, src=0x7f3e7c0ff010, sizeBytes=4194304, kind=1) :480 -337768178721673:337768178721674 158131:158131 MARK(name(before HIP LaunchKernel)) 337768178727373:337768178727954 158131:158131 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :482 337768178729638:337768178730259 158131:158131 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :483 337768178731531:337768178746109 158131:158131 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7fff99cd0d68, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :484 -337768178747892:337768178747893 158131:158131 MARK(name(after HIP LaunchKernel)) 337768178752310:337768180959564 158131:158131 hipMemcpy(dst=0x7f3e43bf9010, src=0x7f3e42800000, sizeBytes=4194304, kind=2) :486 337768193581480:337768195710897 158131:158131 hipMemcpy(dst=0x7f3e43000000, src=0x7f3e7c0ff010, sizeBytes=4194304, kind=1) :487 -337768195712069:337768195712070 158131:158131 MARK(name(before HIP LaunchKernel)) 
337768195716056:337768195716457 158131:158131 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :489 337768195719553:337768195720294 158131:158131 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :490 337768195721106:337768195731295 158131:158131 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7fff99cd0d68, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :491 -337768195732517:337768195732518 158131:158131 MARK(name(after HIP LaunchKernel)) 337768195735303:337768197924692 158131:158131 hipMemcpy(dst=0x7f3e43bf9010, src=0x7f3e42800000, sizeBytes=4194304, kind=2) :493 337768209190018:337768211608961 158131:158131 hipMemcpy(dst=0x7f3e43000000, src=0x7f3e7c0ff010, sizeBytes=4194304, kind=1) :494 -337768211611596:337768211611597 158131:158131 MARK(name(before HIP LaunchKernel)) 337768211615473:337768211615864 158131:158131 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :496 337768211617006:337768211617567 158131:158131 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :497 337768211618379:337768211629159 158131:158131 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7fff99cd0d68, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :498 -337768211630471:337768211630472 158131:158131 MARK(name(after HIP LaunchKernel)) 337768211633337:337768213792179 158131:158131 hipMemcpy(dst=0x7f3e43bf9010, src=0x7f3e42800000, sizeBytes=4194304, kind=2) :500 337768221985442:337768224096274 158131:158131 hipMemcpy(dst=0x7f3e43000000, src=0x7f3e7c0ff010, sizeBytes=4194304, kind=1) :501 -337768224097586:337768224097587 158131:158131 MARK(name(before HIP LaunchKernel)) 337768224101524:337768224101924 158131:158131 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :503 337768224104630:337768224105050 158131:158131 __hipPopCallConfiguration(gridDim={}, blockDim={}, 
sharedMem=0, stream=0) :504 337768224105842:337768224116632 158131:158131 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7fff99cd0d68, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :505 -337768224117704:337768224117705 158131:158131 MARK(name(after HIP LaunchKernel)) 337768224119818:337768226336920 158131:158131 hipMemcpy(dst=0x7f3e43bf9010, src=0x7f3e42800000, sizeBytes=4194304, kind=2) :507 337768237853602:337768240033864 158131:158131 hipMemcpy(dst=0x7f3e43000000, src=0x7f3e7c0ff010, sizeBytes=4194304, kind=1) :508 -337768240036840:337768240036841 158131:158131 MARK(name(before HIP LaunchKernel)) 337768240040687:337768240041078 158131:158131 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :510 337768240042400:337768240042821 158131:158131 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :511 337768240043613:337768240053391 158131:158131 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7fff99cd0d68, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :512 -337768240055445:337768240055446 158131:158131 MARK(name(after HIP LaunchKernel)) 337768240057689:337768242274030 158131:158131 hipMemcpy(dst=0x7f3e43bf9010, src=0x7f3e42800000, sizeBytes=4194304, kind=2) :514 337768250469578:337768252848905 158131:158131 hipMemcpy(dst=0x7f3e43000000, src=0x7f3e7c0ff010, sizeBytes=4194304, kind=1) :515 -337768252855277:337768252855278 158131:158131 MARK(name(before HIP LaunchKernel)) 337768252869364:337768252870586 158131:158131 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :517 337768252876868:337768252877830 158131:158131 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :518 337768252880114:337768252911774 158131:158131 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7fff99cd0d68, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, 
float*, int) :519 -337768252915411:337768252915412 158131:158131 MARK(name(after HIP LaunchKernel)) 337768252920370:337768255347979 158131:158131 hipMemcpy(dst=0x7f3e43bf9010, src=0x7f3e42800000, sizeBytes=4194304, kind=2) :521 337768268551523:337768270751402 158131:158131 hipMemcpy(dst=0x7f3e43000000, src=0x7f3e7c0ff010, sizeBytes=4194304, kind=1) :522 -337768270755650:337768270755651 158131:158131 MARK(name(before HIP LaunchKernel)) 337768270761180:337768270761641 158131:158131 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :524 337768270762723:337768270763194 158131:158131 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :525 337768270765088:337768270783793 158131:158131 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7fff99cd0d68, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :526 -337768270786478:337768270786479 158131:158131 MARK(name(after HIP LaunchKernel)) 337768270788863:337768272953826 158131:158131 hipMemcpy(dst=0x7f3e43bf9010, src=0x7f3e42800000, sizeBytes=4194304, kind=2) :528 337768285726044:337768287887231 158131:158131 hipMemcpy(dst=0x7f3e43000000, src=0x7f3e7c0ff010, sizeBytes=4194304, kind=1) :529 -337768287888854:337768287888855 158131:158131 MARK(name(before HIP LaunchKernel)) 337768287892821:337768287893372 158131:158131 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :531 337768287896709:337768287897280 158131:158131 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :532 337768287898051:337768287909623 158131:158131 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7fff99cd0d68, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :533 -337768287911166:337768287911167 158131:158131 MARK(name(after HIP LaunchKernel)) 337768287913500:337768290155559 158131:158131 hipMemcpy(dst=0x7f3e43bf9010, src=0x7f3e42800000, sizeBytes=4194304, kind=2) 
:535 337768301488935:337768303702591 158131:158131 hipMemcpy(dst=0x7f3e43000000, src=0x7f3e7c0ff010, sizeBytes=4194304, kind=1) :536 -337768303705937:337768303705938 158131:158131 MARK(name(before HIP LaunchKernel)) 337768303710095:337768303710626 158131:158131 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :538 337768303711508:337768303711928 158131:158131 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :539 337768303712700:337768303724151 158131:158131 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7fff99cd0d68, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :540 -337768303726566:337768303726567 158131:158131 MARK(name(after HIP LaunchKernel)) 337768303728660:337768305927337 158131:158131 hipMemcpy(dst=0x7f3e43bf9010, src=0x7f3e42800000, sizeBytes=4194304, kind=2) :542 337768314071007:337768316174444 158131:158131 hipMemcpy(dst=0x7f3e43000000, src=0x7f3e7c0ff010, sizeBytes=4194304, kind=1) :543 -337768316175637:337768316175638 158131:158131 MARK(name(before HIP LaunchKernel)) 337768316179454:337768316180315 158131:158131 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :545 337768316183652:337768316184063 158131:158131 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :546 337768316184874:337768316197488 158131:158131 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7fff99cd0d68, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :547 -337768316198820:337768316198821 158131:158131 MARK(name(after HIP LaunchKernel)) 337768316200924:337768318880829 158131:158131 hipMemcpy(dst=0x7f3e43bf9010, src=0x7f3e42800000, sizeBytes=4194304, kind=2) :549 337768330621273:337768332854295 158131:158131 hipMemcpy(dst=0x7f3e43000000, src=0x7f3e7c0ff010, sizeBytes=4194304, kind=1) :550 -337768332855597:337768332855598 158131:158131 MARK(name(before HIP LaunchKernel)) 
337768332859845:337768332860316 158131:158131 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :552 337768332861198:337768332861759 158131:158131 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :553 337768332862540:337768332874884 158131:158131 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7fff99cd0d68, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :554 -337768332878350:337768332878351 158131:158131 MARK(name(after HIP LaunchKernel)) 337768332880615:337768335154463 158131:158131 hipMemcpy(dst=0x7f3e43bf9010, src=0x7f3e42800000, sizeBytes=4194304, kind=2) :556 337768344608986:337768346866314 158131:158131 hipMemcpy(dst=0x7f3e43000000, src=0x7f3e7c0ff010, sizeBytes=4194304, kind=1) :557 -337768346868297:337768346868298 158131:158131 MARK(name(before HIP LaunchKernel)) 337768346873848:337768346874479 158131:158131 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :559 337768346878667:337768346879689 158131:158131 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :560 337768346880981:337768346896671 158131:158131 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7fff99cd0d68, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :561 -337768346898554:337768346898555 158131:158131 MARK(name(after HIP LaunchKernel)) 337768346901931:337768349187070 158131:158131 hipMemcpy(dst=0x7f3e43bf9010, src=0x7f3e42800000, sizeBytes=4194304, kind=2) :563 337768357366927:337768359620077 158131:158131 hipMemcpy(dst=0x7f3e43000000, src=0x7f3e7c0ff010, sizeBytes=4194304, kind=1) :564 -337768359621420:337768359621421 158131:158131 MARK(name(before HIP LaunchKernel)) 337768359626189:337768359626810 158131:158131 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :566 337768359627762:337768359628192 158131:158131 __hipPopCallConfiguration(gridDim={}, blockDim={}, 
sharedMem=0, stream=0) :567 337768359629154:337768359643301 158131:158131 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7fff99cd0d68, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :568 -337768359645665:337768359645666 158131:158131 MARK(name(after HIP LaunchKernel)) 337768359648020:337768361807924 158131:158131 hipMemcpy(dst=0x7f3e43bf9010, src=0x7f3e42800000, sizeBytes=4194304, kind=2) :570 337768370013571:337768372141474 158131:158131 hipMemcpy(dst=0x7f3e43000000, src=0x7f3e7c0ff010, sizeBytes=4194304, kind=1) :571 -337768372142707:337768372142708 158131:158131 MARK(name(before HIP LaunchKernel)) 337768372147686:337768372148157 158131:158131 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :573 337768372149049:337768372149469 158131:158131 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :574 337768372150241:337768372160180 158131:158131 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7fff99cd0d68, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :575 -337768372161652:337768372161653 158131:158131 MARK(name(after HIP LaunchKernel)) 337768372163756:337768374308411 158131:158131 hipMemcpy(dst=0x7f3e43bf9010, src=0x7f3e42800000, sizeBytes=4194304, kind=2) :577 337768385673678:337768387892924 158131:158131 hipMemcpy(dst=0x7f3e43000000, src=0x7f3e7c0ff010, sizeBytes=4194304, kind=1) :578 -337768387894387:337768387894388 158131:158131 MARK(name(before HIP LaunchKernel)) 337768387898584:337768387899166 158131:158131 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :580 337768387900057:337768387900478 158131:158131 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :581 337768387901350:337768387912661 158131:158131 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7fff99cd0d68, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, 
float*, int) :582 -337768387916108:337768387916109 158131:158131 MARK(name(after HIP LaunchKernel)) 337768387918372:337768390163166 158131:158131 hipMemcpy(dst=0x7f3e43bf9010, src=0x7f3e42800000, sizeBytes=4194304, kind=2) :584 337768398340079:337768400614609 158131:158131 hipMemcpy(dst=0x7f3e43000000, src=0x7f3e7c0ff010, sizeBytes=4194304, kind=1) :585 -337768400615961:337768400615962 158131:158131 MARK(name(before HIP LaunchKernel)) 337768400622193:337768400622644 158131:158131 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :587 337768400623596:337768400624016 158131:158131 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :588 337768400624818:337768400637362 158131:158131 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7fff99cd0d68, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :589 -337768400638854:337768400638855 158131:158131 MARK(name(after HIP LaunchKernel)) 337768400641169:337768402797706 158131:158131 hipMemcpy(dst=0x7f3e43bf9010, src=0x7f3e42800000, sizeBytes=4194304, kind=2) :591 337768414385352:337768416618845 158131:158131 hipMemcpy(dst=0x7f3e43000000, src=0x7f3e7c0ff010, sizeBytes=4194304, kind=1) :592 -337768416620398:337768416620399 158131:158131 MARK(name(before HIP LaunchKernel)) 337768416624315:337768416625017 158131:158131 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :594 337768416626008:337768416626429 158131:158131 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :595 337768416627221:337768416637450 158131:158131 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7fff99cd0d68, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :596 -337768416640917:337768416640918 158131:158131 MARK(name(after HIP LaunchKernel)) 337768416643371:337768418792214 158131:158131 hipMemcpy(dst=0x7f3e43bf9010, src=0x7f3e42800000, sizeBytes=4194304, kind=2) 
:598 337768427961818:337768430177186 158131:158131 hipMemcpy(dst=0x7f3e43000000, src=0x7f3e7c0ff010, sizeBytes=4194304, kind=1) :599 -337768430179080:337768430179081 158131:158131 MARK(name(before HIP LaunchKernel)) 337768430186484:337768430187155 158131:158131 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :601 337768430188517:337768430189089 158131:158131 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :602 337768430190351:337768430204648 158131:158131 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7fff99cd0d68, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :603 -337768430206511:337768430206512 158131:158131 MARK(name(after HIP LaunchKernel)) 337768430209727:337768432405409 158131:158131 hipMemcpy(dst=0x7f3e43bf9010, src=0x7f3e42800000, sizeBytes=4194304, kind=2) :605 337768445362658:337768447619615 158131:158131 hipMemcpy(dst=0x7f3e43000000, src=0x7f3e7c0ff010, sizeBytes=4194304, kind=1) :606 -337768447620907:337768447620908 158131:158131 MARK(name(before HIP LaunchKernel)) 337768447624705:337768447625246 158131:158131 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :608 337768447626007:337768447626418 158131:158131 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :609 337768447629554:337768447639172 158131:158131 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7fff99cd0d68, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :610 -337768447640605:337768447640606 158131:158131 MARK(name(after HIP LaunchKernel)) 337768447642769:337768449799557 158131:158131 hipMemcpy(dst=0x7f3e43bf9010, src=0x7f3e42800000, sizeBytes=4194304, kind=2) :612 337768460754209:337768463014171 158131:158131 hipMemcpy(dst=0x7f3e43000000, src=0x7f3e7c0ff010, sizeBytes=4194304, kind=1) :613 -337768463015454:337768463015455 158131:158131 MARK(name(before HIP LaunchKernel)) 
337768463020914:337768463021425 158131:158131 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :615 337768463022307:337768463022697 158131:158131 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :616 337768463023549:337768463033768 158131:158131 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7fff99cd0d68, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :617 -337768463035271:337768463035272 158131:158131 MARK(name(after HIP LaunchKernel)) 337768463037485:337768465249006 158131:158131 hipMemcpy(dst=0x7f3e43bf9010, src=0x7f3e42800000, sizeBytes=4194304, kind=2) :619 337768473400381:337768475617864 158131:158131 hipMemcpy(dst=0x7f3e43000000, src=0x7f3e7c0ff010, sizeBytes=4194304, kind=1) :620 -337768475619156:337768475619157 158131:158131 MARK(name(before HIP LaunchKernel)) 337768475623053:337768475623524 158131:158131 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :622 337768475624506:337768475624927 158131:158131 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :623 337768475627251:337768475637441 158131:158131 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7fff99cd0d68, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :624 -337768475638953:337768475638954 158131:158131 MARK(name(after HIP LaunchKernel)) 337768475641158:337768477801122 158131:158131 hipMemcpy(dst=0x7f3e43bf9010, src=0x7f3e42800000, sizeBytes=4194304, kind=2) :626 337768489348752:337768491615487 158131:158131 hipMemcpy(dst=0x7f3e43000000, src=0x7f3e7c0ff010, sizeBytes=4194304, kind=1) :627 -337768491616780:337768491616781 158131:158131 MARK(name(before HIP LaunchKernel)) 337768491622821:337768491623262 158131:158131 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :629 337768491624254:337768491624685 158131:158131 __hipPopCallConfiguration(gridDim={}, blockDim={}, 
sharedMem=0, stream=0) :630 337768491625486:337768491635575 158131:158131 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7fff99cd0d68, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :631 -337768491637098:337768491637099 158131:158131 MARK(name(after HIP LaunchKernel)) 337768491641817:337768493801620 158131:158131 hipMemcpy(dst=0x7f3e43bf9010, src=0x7f3e42800000, sizeBytes=4194304, kind=2) :633 337768501990595:337768504118268 158131:158131 hipMemcpy(dst=0x7f3e43000000, src=0x7f3e7c0ff010, sizeBytes=4194304, kind=1) :634 -337768504120121:337768504120122 158131:158131 MARK(name(before HIP LaunchKernel)) 337768504124329:337768504124770 158131:158131 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :636 337768504125662:337768504126072 158131:158131 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :637 337768504128397:337768504138977 158131:158131 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7fff99cd0d68, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :638 -337768504140389:337768504140390 158131:158131 MARK(name(after HIP LaunchKernel)) 337768504142463:337768506366899 158131:158131 hipMemcpy(dst=0x7f3e43bf9010, src=0x7f3e42800000, sizeBytes=4194304, kind=2) :640 337768518232669:337768520625573 158131:158131 hipMemcpy(dst=0x7f3e43000000, src=0x7f3e7c0ff010, sizeBytes=4194304, kind=1) :641 -337768520626815:337768520626816 158131:158131 MARK(name(before HIP LaunchKernel)) 337768520631905:337768520632456 158131:158131 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :643 337768520633878:337768520634299 158131:158131 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :644 337768520635061:337768520645651 158131:158131 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7fff99cd0d68, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, 
float*, int) :645 -337768520647073:337768520647074 158131:158131 MARK(name(after HIP LaunchKernel)) 337768520650029:337768522810183 158131:158131 hipMemcpy(dst=0x7f3e43bf9010, src=0x7f3e42800000, sizeBytes=4194304, kind=2) :647 337768532969966:337768535149747 158131:158131 hipMemcpy(dst=0x7f3e43000000, src=0x7f3e7c0ff010, sizeBytes=4194304, kind=1) :648 -337768535151751:337768535151752 158131:158131 MARK(name(before HIP LaunchKernel)) 337768535157873:337768535158514 158131:158131 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :650 337768535159816:337768535160448 158131:158131 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :651 337768535165006:337768535180515 158131:158131 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7fff99cd0d68, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :652 -337768535182409:337768535182410 158131:158131 MARK(name(after HIP LaunchKernel)) 337768535185725:337768537380745 158131:158131 hipMemcpy(dst=0x7f3e43bf9010, src=0x7f3e42800000, sizeBytes=4194304, kind=2) :654 337768545542419:337768547693376 158131:158131 hipMemcpy(dst=0x7f3e43000000, src=0x7f3e7c0ff010, sizeBytes=4194304, kind=1) :655 -337768547695821:337768547695822 158131:158131 MARK(name(before HIP LaunchKernel)) 337768547699348:337768547699789 158131:158131 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :657 337768547700690:337768547701111 158131:158131 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :658 337768547701913:337768547711901 158131:158131 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7fff99cd0d68, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :659 -337768547713164:337768547713165 158131:158131 MARK(name(after HIP LaunchKernel)) 337768547716490:337768549856496 158131:158131 hipMemcpy(dst=0x7f3e43bf9010, src=0x7f3e42800000, sizeBytes=4194304, kind=2) 
:661 337768558079085:337768560202931 158131:158131 hipMemcpy(dst=0x7f3e43000000, src=0x7f3e7c0ff010, sizeBytes=4194304, kind=1) :662 -337768560204103:337768560204104 158131:158131 MARK(name(before HIP LaunchKernel)) 337768560208211:337768560208672 158131:158131 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :664 337768560209604:337768560210025 158131:158131 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :665 337768560213241:337768560225203 158131:158131 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7fff99cd0d68, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :666 -337768560227177:337768560227178 158131:158131 MARK(name(after HIP LaunchKernel)) 337768560229411:337768562358307 158131:158131 hipMemcpy(dst=0x7f3e43bf9010, src=0x7f3e42800000, sizeBytes=4194304, kind=2) :668 337768574142772:337768576292777 158131:158131 hipMemcpy(dst=0x7f3e43000000, src=0x7f3e7c0ff010, sizeBytes=4194304, kind=1) :669 -337768576296725:337768576296726 158131:158131 MARK(name(before HIP LaunchKernel)) 337768576300492:337768576301013 158131:158131 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :671 337768576301914:337768576302335 158131:158131 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :672 337768576303127:337768576313827 158131:158131 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7fff99cd0d68, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :673 -337768576315029:337768576315030 158131:158131 MARK(name(after HIP LaunchKernel)) 337768576318716:337768578450637 158131:158131 hipMemcpy(dst=0x7f3e43bf9010, src=0x7f3e42800000, sizeBytes=4194304, kind=2) :675 337768586668517:337768588863888 158131:158131 hipMemcpy(dst=0x7f3e43000000, src=0x7f3e7c0ff010, sizeBytes=4194304, kind=1) :676 -337768588865100:337768588865101 158131:158131 MARK(name(before HIP LaunchKernel)) 
337768588869188:337768588869729 158131:158131 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :678 337768588870801:337768588871352 158131:158131 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :679 337768588873446:337768588884256 158131:158131 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7fff99cd0d68, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :680 -337768588885429:337768588885430 158131:158131 MARK(name(after HIP LaunchKernel)) 337768588887593:337768591162173 158131:158131 hipMemcpy(dst=0x7f3e43bf9010, src=0x7f3e42800000, sizeBytes=4194304, kind=2) :682 337768603357906:337768605622036 158131:158131 hipMemcpy(dst=0x7f3e43000000, src=0x7f3e7c0ff010, sizeBytes=4194304, kind=1) :683 -337768605625142:337768605625143 158131:158131 MARK(name(before HIP LaunchKernel)) 337768605629520:337768605630041 158131:158131 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :685 337768605631043:337768605631464 158131:158131 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :686 337768605632225:337768605644258 158131:158131 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7fff99cd0d68, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :687 -337768605645821:337768605645822 158131:158131 MARK(name(after HIP LaunchKernel)) 337768605648927:337768607800946 158131:158131 hipMemcpy(dst=0x7f3e43bf9010, src=0x7f3e42800000, sizeBytes=4194304, kind=2) :689 337768617929891:337768620088853 158131:158131 hipMemcpy(dst=0x7f3e43000000, src=0x7f3e7c0ff010, sizeBytes=4194304, kind=1) :690 -337768620090907:337768620090908 158131:158131 MARK(name(before HIP LaunchKernel)) 337768620097259:337768620097810 158131:158131 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :692 337768620101827:337768620102438 158131:158131 __hipPopCallConfiguration(gridDim={}, blockDim={}, 
sharedMem=0, stream=0) :693 337768620103691:337768620119190 158131:158131 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7fff99cd0d68, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :694 -337768620121354:337768620121355 158131:158131 MARK(name(after HIP LaunchKernel)) 337768620124650:337768622380325 158131:158131 hipMemcpy(dst=0x7f3e43bf9010, src=0x7f3e42800000, sizeBytes=4194304, kind=2) :696 337768630606661:337768632738702 158131:158131 hipMemcpy(dst=0x7f3e43000000, src=0x7f3e7c0ff010, sizeBytes=4194304, kind=1) :697 -337768632742249:337768632742250 158131:158131 MARK(name(before HIP LaunchKernel)) 337768632747128:337768632747529 158131:158131 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :699 337768632748420:337768632748871 158131:158131 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :700 337768632749663:337768632761475 158131:158131 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7fff99cd0d68, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :701 -337768632763158:337768632763159 158131:158131 MARK(name(after HIP LaunchKernel)) 337768632766214:337768634923603 158131:158131 hipMemcpy(dst=0x7f3e43bf9010, src=0x7f3e42800000, sizeBytes=4194304, kind=2) :703 337768644579574:337768644620882 158131:158131 hipFree(ptr=0x7f3e43000000) :704 337768644622795:337768644639537 158131:158131 hipFree(ptr=0x7f3e42800000) :705 diff --git a/test/golden_traces/MatrixTranspose_sys_trace.txt b/test/golden_traces/MatrixTranspose_sys_trace.txt index bee7a240..b9f5aa67 100644 --- a/test/golden_traces/MatrixTranspose_sys_trace.txt +++ b/test/golden_traces/MatrixTranspose_sys_trace.txt @@ -4041,704 +4041,504 @@ ROCTracer (pid=158125): 337764704442075:337764704580366 158125:158125 hipMalloc(ptr=0x7f95c0800000, size=4194304) :2 337764704581889:337764704737823 158125:158125 hipMalloc(ptr=0x7f9484c00000, size=4194304) :3 
337764704752891:337764922533350 158125:158125 hipMemcpy(dst=0x7f95c0800000, src=0x7f95c88ff010, sizeBytes=4194304, kind=1) :4 -337764922538480:337764922538481 158125:158125 MARK(name(before HIP LaunchKernel)) 337764922562495:337764922564128 158125:158125 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :6 337764922566293:337764922566894 158125:158125 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :7 337764922569459:337764923031149 158125:158125 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7ffea16f38c8, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :8 -337764923044474:337764923044475 158125:158125 MARK(name(after HIP LaunchKernel)) 337764923047921:337764925574527 158125:158125 hipMemcpy(dst=0x7f95c84fe010, src=0x7f9484c00000, sizeBytes=4194304, kind=2) :10 337764938684604:337764941181393 158125:158125 hipMemcpy(dst=0x7f95c0800000, src=0x7f95c88ff010, sizeBytes=4194304, kind=1) :11 -337764941182806:337764941182807 158125:158125 MARK(name(before HIP LaunchKernel)) 337764941188697:337764941189108 158125:158125 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :13 337764941190230:337764941190701 158125:158125 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :14 337764941194668:337764941210328 158125:158125 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7ffea16f38c8, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :15 -337764941212081:337764941212082 158125:158125 MARK(name(after HIP LaunchKernel)) 337764941214015:337764942339087 158125:158125 hipMemcpy(dst=0x7f95c84fe010, src=0x7f9484c00000, sizeBytes=4194304, kind=2) :17 337764950623693:337764953056311 158125:158125 hipMemcpy(dst=0x7f95c0800000, src=0x7f95c88ff010, sizeBytes=4194304, kind=1) :18 -337764953057774:337764953057775 158125:158125 MARK(name(before HIP LaunchKernel)) 
337764953063685:337764953064206 158125:158125 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :20 337764953065609:337764953066049 158125:158125 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :21 337764953067041:337764953079134 158125:158125 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7ffea16f38c8, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :22 -337764953081038:337764953081039 158125:158125 MARK(name(after HIP LaunchKernel)) 337764953083192:337764954180501 158125:158125 hipMemcpy(dst=0x7f95c84fe010, src=0x7f9484c00000, sizeBytes=4194304, kind=2) :24 337764962423108:337764964979740 158125:158125 hipMemcpy(dst=0x7f95c0800000, src=0x7f95c88ff010, sizeBytes=4194304, kind=1) :25 -337764964980832:337764964980833 158125:158125 MARK(name(before HIP LaunchKernel)) 337764964984519:337764964985050 158125:158125 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :27 337764964986002:337764964986443 158125:158125 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :28 337764964988627:337764964999588 158125:158125 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7ffea16f38c8, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :29 -337764965000860:337764965000861 158125:158125 MARK(name(after HIP LaunchKernel)) 337764965002904:337764966123257 158125:158125 hipMemcpy(dst=0x7f95c84fe010, src=0x7f9484c00000, sizeBytes=4194304, kind=2) :31 337764974365011:337764977266363 158125:158125 hipMemcpy(dst=0x7f95c0800000, src=0x7f95c88ff010, sizeBytes=4194304, kind=1) :32 -337764977267505:337764977267506 158125:158125 MARK(name(before HIP LaunchKernel)) 337764977273266:337764977273797 158125:158125 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :34 337764977274759:337764977275220 158125:158125 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, 
stream=0) :35 337764977276171:337764977287002 158125:158125 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7ffea16f38c8, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :36 -337764977288485:337764977288486 158125:158125 MARK(name(after HIP LaunchKernel)) 337764977291550:337764978407134 158125:158125 hipMemcpy(dst=0x7f95c84fe010, src=0x7f9484c00000, sizeBytes=4194304, kind=2) :38 337764986653789:337764989132825 158125:158125 hipMemcpy(dst=0x7f95c0800000, src=0x7f95c88ff010, sizeBytes=4194304, kind=1) :39 -337764989134297:337764989134298 158125:158125 MARK(name(before HIP LaunchKernel)) 337764989138095:337764989138666 158125:158125 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :41 337764989139617:337764989140068 158125:158125 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :42 337764989143615:337764989154375 158125:158125 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7ffea16f38c8, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :43 -337764989155898:337764989155899 158125:158125 MARK(name(after HIP LaunchKernel)) 337764989158022:337764990265090 158125:158125 hipMemcpy(dst=0x7f95c84fe010, src=0x7f9484c00000, sizeBytes=4194304, kind=2) :45 337764998786773:337765001226896 158125:158125 hipMemcpy(dst=0x7f95c0800000, src=0x7f95c88ff010, sizeBytes=4194304, kind=1) :46 -337765001227958:337765001227959 158125:158125 MARK(name(before HIP LaunchKernel)) 337765001234560:337765001235011 158125:158125 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :48 337765001235843:337765001236294 158125:158125 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :49 337765001237235:337765001247825 158125:158125 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7ffea16f38c8, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :50 
-337765001249288:337765001249289 158125:158125 MARK(name(after HIP LaunchKernel)) 337765001252113:337765002370653 158125:158125 hipMemcpy(dst=0x7f95c84fe010, src=0x7f9484c00000, sizeBytes=4194304, kind=2) :52 337765010705494:337765013159863 158125:158125 hipMemcpy(dst=0x7f95c0800000, src=0x7f95c88ff010, sizeBytes=4194304, kind=1) :53 -337765013160895:337765013160896 158125:158125 MARK(name(before HIP LaunchKernel)) 337765013164893:337765013165353 158125:158125 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :55 337765013166506:337765013166976 158125:158125 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :56 337765013170443:337765013182155 158125:158125 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7ffea16f38c8, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :57 -337765013183498:337765013183499 158125:158125 MARK(name(after HIP LaunchKernel)) 337765013185662:337765014295144 158125:158125 hipMemcpy(dst=0x7f95c84fe010, src=0x7f9484c00000, sizeBytes=4194304, kind=2) :59 337765022617712:337765025088462 158125:158125 hipMemcpy(dst=0x7f95c0800000, src=0x7f95c88ff010, sizeBytes=4194304, kind=1) :60 -337765025090746:337765025090747 158125:158125 MARK(name(before HIP LaunchKernel)) 337765025095195:337765025095586 158125:158125 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :62 337765025096567:337765025097018 158125:158125 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :63 337765025097990:337765025108670 158125:158125 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7ffea16f38c8, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :64 -337765025109913:337765025109914 158125:158125 MARK(name(after HIP LaunchKernel)) 337765025112798:337765026232400 158125:158125 hipMemcpy(dst=0x7f95c84fe010, src=0x7f9484c00000, sizeBytes=4194304, kind=2) :66 
337765034457684:337765036908086 158125:158125 hipMemcpy(dst=0x7f95c0800000, src=0x7f95c88ff010, sizeBytes=4194304, kind=1) :67 -337765036909138:337765036909139 158125:158125 MARK(name(before HIP LaunchKernel)) 337765036913035:337765036913426 158125:158125 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :69 337765036914247:337765036914708 158125:158125 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :70 337765036918836:337765036929165 158125:158125 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7ffea16f38c8, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :71 -337765036930508:337765036930509 158125:158125 MARK(name(after HIP LaunchKernel)) 337765036932752:337765038039760 158125:158125 hipMemcpy(dst=0x7f95c84fe010, src=0x7f9484c00000, sizeBytes=4194304, kind=2) :73 337765046315278:337765048767192 158125:158125 hipMemcpy(dst=0x7f95c0800000, src=0x7f95c88ff010, sizeBytes=4194304, kind=1) :74 -337765048770388:337765048770389 158125:158125 MARK(name(before HIP LaunchKernel)) 337765048774425:337765048774806 158125:158125 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :76 337765048775738:337765048776078 158125:158125 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :77 337765048777030:337765048788001 158125:158125 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7ffea16f38c8, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :78 -337765048789774:337765048789775 158125:158125 MARK(name(after HIP LaunchKernel)) 337765048793040:337765049915989 158125:158125 hipMemcpy(dst=0x7f95c84fe010, src=0x7f9484c00000, sizeBytes=4194304, kind=2) :80 337765058187129:337765060574001 158125:158125 hipMemcpy(dst=0x7f95c0800000, src=0x7f95c88ff010, sizeBytes=4194304, kind=1) :81 -337765060575344:337765060575345 158125:158125 MARK(name(before HIP LaunchKernel)) 
337765060578850:337765060579231 158125:158125 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :83 337765060580163:337765060580503 158125:158125 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :84 337765060584280:337765060595902 158125:158125 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7ffea16f38c8, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :85 -337765060597225:337765060597226 158125:158125 MARK(name(after HIP LaunchKernel)) 337765060599449:337765061740842 158125:158125 hipMemcpy(dst=0x7f95c84fe010, src=0x7f9484c00000, sizeBytes=4194304, kind=2) :87 337765069951218:337765072362576 158125:158125 hipMemcpy(dst=0x7f95c0800000, src=0x7f95c88ff010, sizeBytes=4194304, kind=1) :88 -337765072364740:337765072364741 158125:158125 MARK(name(before HIP LaunchKernel)) 337765072369058:337765072369489 158125:158125 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :90 337765072370421:337765072370821 158125:158125 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :91 337765072371773:337765072381732 158125:158125 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7ffea16f38c8, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :92 -337765072383095:337765072383096 158125:158125 MARK(name(after HIP LaunchKernel)) 337765072386431:337765073501153 158125:158125 hipMemcpy(dst=0x7f95c84fe010, src=0x7f9484c00000, sizeBytes=4194304, kind=2) :94 337765081732048:337765084159937 158125:158125 hipMemcpy(dst=0x7f95c0800000, src=0x7f95c88ff010, sizeBytes=4194304, kind=1) :95 -337765084160999:337765084161000 158125:158125 MARK(name(before HIP LaunchKernel)) 337765084165328:337765084165688 158125:158125 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :97 337765084170227:337765084170567 158125:158125 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, 
stream=0) :98 337765084171619:337765084182320 158125:158125 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7ffea16f38c8, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :99 -337765084183682:337765084183683 158125:158125 MARK(name(after HIP LaunchKernel)) 337765084185656:337765085297784 158125:158125 hipMemcpy(dst=0x7f95c84fe010, src=0x7f9484c00000, sizeBytes=4194304, kind=2) :101 337765093537775:337765096025317 158125:158125 hipMemcpy(dst=0x7f95c0800000, src=0x7f95c88ff010, sizeBytes=4194304, kind=1) :102 -337765096029174:337765096029175 158125:158125 MARK(name(before HIP LaunchKernel)) 337765096032892:337765096033272 158125:158125 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :104 337765096034655:337765096034985 158125:158125 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :105 337765096035937:337765096046277 158125:158125 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7ffea16f38c8, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :106 -337765096047579:337765096047580 158125:158125 MARK(name(after HIP LaunchKernel)) 337765096050314:337765097356057 158125:158125 hipMemcpy(dst=0x7f95c84fe010, src=0x7f9484c00000, sizeBytes=4194304, kind=2) :108 337765105649560:337765108080996 158125:158125 hipMemcpy(dst=0x7f95c0800000, src=0x7f95c88ff010, sizeBytes=4194304, kind=1) :109 -337765108082258:337765108082259 158125:158125 MARK(name(before HIP LaunchKernel)) 337765108086076:337765108086627 158125:158125 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :111 337765108091215:337765108091636 158125:158125 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :112 337765108092828:337765108106043 158125:158125 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7ffea16f38c8, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) 
:113 -337765108107466:337765108107467 158125:158125 MARK(name(after HIP LaunchKernel)) 337765108109790:337765109224292 158125:158125 hipMemcpy(dst=0x7f95c84fe010, src=0x7f9484c00000, sizeBytes=4194304, kind=2) :115 337765117727170:337765120171099 158125:158125 hipMemcpy(dst=0x7f95c0800000, src=0x7f95c88ff010, sizeBytes=4194304, kind=1) :116 -337765120173344:337765120173345 158125:158125 MARK(name(before HIP LaunchKernel)) 337765120177231:337765120177742 158125:158125 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :118 337765120178804:337765120179125 158125:158125 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :119 337765120180006:337765120191999 158125:158125 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7ffea16f38c8, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :120 -337765120194123:337765120194124 158125:158125 MARK(name(after HIP LaunchKernel)) 337765120196457:337765121307212 158125:158125 hipMemcpy(dst=0x7f95c84fe010, src=0x7f9484c00000, sizeBytes=4194304, kind=2) :122 337765129537163:337765131980392 158125:158125 hipMemcpy(dst=0x7f95c0800000, src=0x7f95c88ff010, sizeBytes=4194304, kind=1) :123 -337765131981414:337765131981415 158125:158125 MARK(name(before HIP LaunchKernel)) 337765131984740:337765131985341 158125:158125 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :125 337765131987676:337765131988016 158125:158125 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :126 337765131989018:337765131999758 158125:158125 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7ffea16f38c8, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :127 -337765132009206:337765132009207 158125:158125 MARK(name(after HIP LaunchKernel)) 337765132011340:337765133127245 158125:158125 hipMemcpy(dst=0x7f95c84fe010, src=0x7f9484c00000, sizeBytes=4194304, kind=2) :129 
337765141404317:337765143867803 158125:158125 hipMemcpy(dst=0x7f95c0800000, src=0x7f95c88ff010, sizeBytes=4194304, kind=1) :130 -337765143871109:337765143871110 158125:158125 MARK(name(before HIP LaunchKernel)) 337765143875017:337765143875528 158125:158125 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :132 337765143876369:337765143876690 158125:158125 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :133 337765143877562:337765143889744 158125:158125 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7ffea16f38c8, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :134 -337765143891989:337765143891990 158125:158125 MARK(name(after HIP LaunchKernel)) 337765143894754:337765145003225 158125:158125 hipMemcpy(dst=0x7f95c84fe010, src=0x7f9484c00000, sizeBytes=4194304, kind=2) :136 337765153330852:337765155802584 158125:158125 hipMemcpy(dst=0x7f95c0800000, src=0x7f95c88ff010, sizeBytes=4194304, kind=1) :137 -337765155803606:337765155803607 158125:158125 MARK(name(before HIP LaunchKernel)) 337765155807283:337765155807814 158125:158125 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :139 337765155811080:337765155811430 158125:158125 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :140 337765155812422:337765155823523 158125:158125 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7ffea16f38c8, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :141 -337765155824956:337765155824957 158125:158125 MARK(name(after HIP LaunchKernel)) 337765155826960:337765156943876 158125:158125 hipMemcpy(dst=0x7f95c84fe010, src=0x7f9484c00000, sizeBytes=4194304, kind=2) :143 337765165226198:337765167700264 158125:158125 hipMemcpy(dst=0x7f95c0800000, src=0x7f95c88ff010, sizeBytes=4194304, kind=1) :144 -337765167702008:337765167702009 158125:158125 MARK(name(before HIP LaunchKernel)) 
337765167705424:337765167705855 158125:158125 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :146 337765167706757:337765167707087 158125:158125 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :147 337765167708179:337765167720783 158125:158125 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7ffea16f38c8, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :148 -337765167723098:337765167723099 158125:158125 MARK(name(after HIP LaunchKernel)) 337765167725672:337765168845485 158125:158125 hipMemcpy(dst=0x7f95c84fe010, src=0x7f9484c00000, sizeBytes=4194304, kind=2) :150 337765177111916:337765179504549 158125:158125 hipMemcpy(dst=0x7f95c0800000, src=0x7f95c88ff010, sizeBytes=4194304, kind=1) :151 -337765179505631:337765179505632 158125:158125 MARK(name(before HIP LaunchKernel)) 337765179509078:337765179509548 158125:158125 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :153 337765179512033:337765179512364 158125:158125 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :154 337765179513296:337765179523705 158125:158125 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7ffea16f38c8, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :155 -337765179525068:337765179525069 158125:158125 MARK(name(after HIP LaunchKernel)) 337765179527733:337765180639169 158125:158125 hipMemcpy(dst=0x7f95c84fe010, src=0x7f9484c00000, sizeBytes=4194304, kind=2) :157 337765188914788:337765191325545 158125:158125 hipMemcpy(dst=0x7f95c0800000, src=0x7f95c88ff010, sizeBytes=4194304, kind=1) :158 -337765191326677:337765191326678 158125:158125 MARK(name(before HIP LaunchKernel)) 337765191330795:337765191331176 158125:158125 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :160 337765191332067:337765191332398 158125:158125 __hipPopCallConfiguration(gridDim={}, blockDim={}, 
sharedMem=0, stream=0) :161 337765191333390:337765191344000 158125:158125 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7ffea16f38c8, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :162 -337765191347256:337765191347257 158125:158125 MARK(name(after HIP LaunchKernel)) 337765191349470:337765192463481 158125:158125 hipMemcpy(dst=0x7f95c84fe010, src=0x7f9484c00000, sizeBytes=4194304, kind=2) :164 337765200792860:337765203211492 158125:158125 hipMemcpy(dst=0x7f95c0800000, src=0x7f95c88ff010, sizeBytes=4194304, kind=1) :165 -337765203212494:337765203212495 158125:158125 MARK(name(before HIP LaunchKernel)) 337765203215940:337765203216341 158125:158125 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :167 337765203219537:337765203219968 158125:158125 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :168 337765203220870:337765203231429 158125:158125 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7ffea16f38c8, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :169 -337765203232592:337765203232593 158125:158125 MARK(name(after HIP LaunchKernel)) 337765203234595:337765204352734 158125:158125 hipMemcpy(dst=0x7f95c84fe010, src=0x7f9484c00000, sizeBytes=4194304, kind=2) :171 337765212603176:337765215045302 158125:158125 hipMemcpy(dst=0x7f95c0800000, src=0x7f95c88ff010, sizeBytes=4194304, kind=1) :172 -337765215046304:337765215046305 158125:158125 MARK(name(before HIP LaunchKernel)) 337765215049851:337765215050272 158125:158125 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :174 337765215051123:337765215051534 158125:158125 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :175 337765215052546:337765215064719 158125:158125 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7ffea16f38c8, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, 
float*, int) :176 -337765215067103:337765215067104 158125:158125 MARK(name(after HIP LaunchKernel)) 337765215069267:337765216191183 158125:158125 hipMemcpy(dst=0x7f95c84fe010, src=0x7f9484c00000, sizeBytes=4194304, kind=2) :178 337765224482622:337765226963902 158125:158125 hipMemcpy(dst=0x7f95c0800000, src=0x7f95c88ff010, sizeBytes=4194304, kind=1) :179 -337765226964894:337765226964895 158125:158125 MARK(name(before HIP LaunchKernel)) 337765226970044:337765226970405 158125:158125 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :181 337765226971416:337765226971747 158125:158125 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :182 337765226972659:337765226982998 158125:158125 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7ffea16f38c8, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :183 -337765226984441:337765226984442 158125:158125 MARK(name(after HIP LaunchKernel)) 337765226988078:337765228101498 158125:158125 hipMemcpy(dst=0x7f95c84fe010, src=0x7f9484c00000, sizeBytes=4194304, kind=2) :185 337765236379211:337765238861964 158125:158125 hipMemcpy(dst=0x7f95c0800000, src=0x7f95c88ff010, sizeBytes=4194304, kind=1) :186 -337765238862966:337765238862967 158125:158125 MARK(name(before HIP LaunchKernel)) 337765238866653:337765238867003 158125:158125 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :188 337765238867865:337765238868195 158125:158125 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :189 337765238869147:337765238880008 158125:158125 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7ffea16f38c8, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :190 -337765238883504:337765238883505 158125:158125 MARK(name(after HIP LaunchKernel)) 337765238885749:337765239998748 158125:158125 hipMemcpy(dst=0x7f95c84fe010, src=0x7f9484c00000, sizeBytes=4194304, kind=2) 
:192 337765248323740:337765250795292 158125:158125 hipMemcpy(dst=0x7f95c0800000, src=0x7f95c88ff010, sizeBytes=4194304, kind=1) :193 -337765250796604:337765250796605 158125:158125 MARK(name(before HIP LaunchKernel)) 337765250802485:337765250802916 158125:158125 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :195 337765250804259:337765250804589 158125:158125 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :196 337765250805441:337765250816071 158125:158125 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7ffea16f38c8, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :197 -337765250817353:337765250817354 158125:158125 MARK(name(after HIP LaunchKernel)) 337765250819517:337765251927607 158125:158125 hipMemcpy(dst=0x7f95c84fe010, src=0x7f9484c00000, sizeBytes=4194304, kind=2) :199 337765260173130:337765262594897 158125:158125 hipMemcpy(dst=0x7f95c0800000, src=0x7f95c88ff010, sizeBytes=4194304, kind=1) :200 -337765262595959:337765262595960 158125:158125 MARK(name(before HIP LaunchKernel)) 337765262600167:337765262600618 158125:158125 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :202 337765262601500:337765262601851 158125:158125 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :203 337765262604976:337765262616528 158125:158125 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7ffea16f38c8, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :204 -337765262617680:337765262617681 158125:158125 MARK(name(after HIP LaunchKernel)) 337765262619774:337765263725670 158125:158125 hipMemcpy(dst=0x7f95c84fe010, src=0x7f9484c00000, sizeBytes=4194304, kind=2) :206 337765272031895:337765274457500 158125:158125 hipMemcpy(dst=0x7f95c0800000, src=0x7f95c88ff010, sizeBytes=4194304, kind=1) :207 -337765274458662:337765274458663 158125:158125 MARK(name(before HIP LaunchKernel)) 
337765274464183:337765274464634 158125:158125 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :209 337765274465525:337765274465986 158125:158125 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :210 337765274466978:337765274477879 158125:158125 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7ffea16f38c8, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :211 -337765274479241:337765274479242 158125:158125 MARK(name(after HIP LaunchKernel)) 337765274481415:337765275593242 158125:158125 hipMemcpy(dst=0x7f95c84fe010, src=0x7f9484c00000, sizeBytes=4194304, kind=2) :213 337765283871867:337765286274118 158125:158125 hipMemcpy(dst=0x7f95c0800000, src=0x7f95c88ff010, sizeBytes=4194304, kind=1) :214 -337765286275260:337765286275261 158125:158125 MARK(name(before HIP LaunchKernel)) 337765286279107:337765286279578 158125:158125 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :216 337765286280470:337765286280811 158125:158125 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :217 337765286283075:337765286294416 158125:158125 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7ffea16f38c8, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :218 -337765286295799:337765286295800 158125:158125 MARK(name(after HIP LaunchKernel)) 337765286297883:337765287415781 158125:158125 hipMemcpy(dst=0x7f95c84fe010, src=0x7f9484c00000, sizeBytes=4194304, kind=2) :220 337765295729662:337765298174083 158125:158125 hipMemcpy(dst=0x7f95c0800000, src=0x7f95c88ff010, sizeBytes=4194304, kind=1) :221 -337765298175245:337765298175246 158125:158125 MARK(name(before HIP LaunchKernel)) 337765298180806:337765298181657 158125:158125 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :223 337765298182549:337765298182890 158125:158125 __hipPopCallConfiguration(gridDim={}, blockDim={}, 
sharedMem=0, stream=0) :224 337765298183851:337765298195233 158125:158125 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7ffea16f38c8, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :225 -337765298196646:337765298196647 158125:158125 MARK(name(after HIP LaunchKernel)) 337765298198950:337765299310807 158125:158125 hipMemcpy(dst=0x7f95c84fe010, src=0x7f9484c00000, sizeBytes=4194304, kind=2) :227 337765307576858:337765310015998 158125:158125 hipMemcpy(dst=0x7f95c0800000, src=0x7f95c88ff010, sizeBytes=4194304, kind=1) :228 -337765310017191:337765310017192 158125:158125 MARK(name(before HIP LaunchKernel)) 337765310020557:337765310021088 158125:158125 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :230 337765310022240:337765310022601 158125:158125 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :231 337765310025797:337765310037178 158125:158125 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7ffea16f38c8, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :232 -337765310038511:337765310038512 158125:158125 MARK(name(after HIP LaunchKernel)) 337765310041296:337765311153774 158125:158125 hipMemcpy(dst=0x7f95c84fe010, src=0x7f9484c00000, sizeBytes=4194304, kind=2) :234 337765319462296:337765321917687 158125:158125 hipMemcpy(dst=0x7f95c0800000, src=0x7f95c88ff010, sizeBytes=4194304, kind=1) :235 -337765321918829:337765321918830 158125:158125 MARK(name(before HIP LaunchKernel)) 337765321924690:337765321925111 158125:158125 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :237 337765321926043:337765321926383 158125:158125 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :238 337765321927325:337765321939187 158125:158125 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7ffea16f38c8, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, 
float*, int) :239 -337765321940840:337765321940841 158125:158125 MARK(name(after HIP LaunchKernel)) 337765321943836:337765323047217 158125:158125 hipMemcpy(dst=0x7f95c84fe010, src=0x7f9484c00000, sizeBytes=4194304, kind=2) :241 337765331334799:337765333800680 158125:158125 hipMemcpy(dst=0x7f95c0800000, src=0x7f95c88ff010, sizeBytes=4194304, kind=1) :242 -337765333801632:337765333801633 158125:158125 MARK(name(before HIP LaunchKernel)) 337765333805689:337765333806020 158125:158125 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :244 337765333806881:337765333807292 158125:158125 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :245 337765333809647:337765333820417 158125:158125 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7ffea16f38c8, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :246 -337765333821790:337765333821791 158125:158125 MARK(name(after HIP LaunchKernel)) 337765333823934:337765334937344 158125:158125 hipMemcpy(dst=0x7f95c84fe010, src=0x7f9484c00000, sizeBytes=4194304, kind=2) :248 337765343248378:337765345732022 158125:158125 hipMemcpy(dst=0x7f95c0800000, src=0x7f95c88ff010, sizeBytes=4194304, kind=1) :249 -337765345733114:337765345733115 158125:158125 MARK(name(before HIP LaunchKernel)) 337765345739366:337765345739777 158125:158125 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :251 337765345740558:337765345740879 158125:158125 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :252 337765345741750:337765345752751 158125:158125 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7ffea16f38c8, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :253 -337765345754084:337765345754085 158125:158125 MARK(name(after HIP LaunchKernel)) 337765345758552:337765346872854 158125:158125 hipMemcpy(dst=0x7f95c84fe010, src=0x7f9484c00000, sizeBytes=4194304, kind=2) 
:255 337765355158251:337765357542558 158125:158125 hipMemcpy(dst=0x7f95c0800000, src=0x7f95c88ff010, sizeBytes=4194304, kind=1) :256 -337765357543580:337765357543581 158125:158125 MARK(name(before HIP LaunchKernel)) 337765357547237:337765357548089 158125:158125 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :258 337765357548880:337765357549201 158125:158125 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :259 337765357552347:337765357562656 158125:158125 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7ffea16f38c8, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :260 -337765357564540:337765357564541 158125:158125 MARK(name(after HIP LaunchKernel)) 337765357566714:337765358703679 158125:158125 hipMemcpy(dst=0x7f95c84fe010, src=0x7f9484c00000, sizeBytes=4194304, kind=2) :262 337765366968407:337765369387249 158125:158125 hipMemcpy(dst=0x7f95c0800000, src=0x7f95c88ff010, sizeBytes=4194304, kind=1) :263 -337765369390796:337765369390797 158125:158125 MARK(name(before HIP LaunchKernel)) 337765369394683:337765369395074 158125:158125 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :265 337765369395886:337765369396226 158125:158125 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :266 337765369397238:337765369409070 158125:158125 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7ffea16f38c8, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :267 -337765369410593:337765369410594 158125:158125 MARK(name(after HIP LaunchKernel)) 337765369413920:337765370514896 158125:158125 hipMemcpy(dst=0x7f95c84fe010, src=0x7f9484c00000, sizeBytes=4194304, kind=2) :269 337765378813919:337765381249433 158125:158125 hipMemcpy(dst=0x7f95c0800000, src=0x7f95c88ff010, sizeBytes=4194304, kind=1) :270 -337765381250495:337765381250496 158125:158125 MARK(name(before HIP LaunchKernel)) 
337765381255795:337765381256146 158125:158125 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :272 337765381257067:337765381257398 158125:158125 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :273 337765381259262:337765381270573 158125:158125 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7ffea16f38c8, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :274 -337765381271925:337765381271926 158125:158125 MARK(name(after HIP LaunchKernel)) 337765381273999:337765382389143 158125:158125 hipMemcpy(dst=0x7f95c84fe010, src=0x7f9484c00000, sizeBytes=4194304, kind=2) :276 337765390675582:337765393103622 158125:158125 hipMemcpy(dst=0x7f95c0800000, src=0x7f95c88ff010, sizeBytes=4194304, kind=1) :277 -337765393106477:337765393106478 158125:158125 MARK(name(before HIP LaunchKernel)) 337765393110024:337765393110414 158125:158125 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :279 337765393111286:337765393111627 158125:158125 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :280 337765393112599:337765393123419 158125:158125 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7ffea16f38c8, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :281 -337765393124852:337765393124853 158125:158125 MARK(name(after HIP LaunchKernel)) 337765393127787:337765394243081 158125:158125 hipMemcpy(dst=0x7f95c84fe010, src=0x7f9484c00000, sizeBytes=4194304, kind=2) :283 337765402542775:337765405018485 158125:158125 hipMemcpy(dst=0x7f95c0800000, src=0x7f95c88ff010, sizeBytes=4194304, kind=1) :284 -337765405019727:337765405019728 158125:158125 MARK(name(before HIP LaunchKernel)) 337765405023564:337765405024035 158125:158125 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :286 337765405027412:337765405027862 158125:158125 __hipPopCallConfiguration(gridDim={}, blockDim={}, 
sharedMem=0, stream=0) :287 337765405028914:337765405040186 158125:158125 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7ffea16f38c8, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :288 -337765405041428:337765405041429 158125:158125 MARK(name(after HIP LaunchKernel)) 337765405043412:337765406153906 158125:158125 hipMemcpy(dst=0x7f95c84fe010, src=0x7f9484c00000, sizeBytes=4194304, kind=2) :290 337765414439543:337765416968903 158125:158125 hipMemcpy(dst=0x7f95c0800000, src=0x7f95c88ff010, sizeBytes=4194304, kind=1) :291 -337765416971859:337765416971860 158125:158125 MARK(name(before HIP LaunchKernel)) 337765416975906:337765416976508 158125:158125 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :293 337765416977560:337765416977880 158125:158125 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :294 337765416978852:337765416990364 158125:158125 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7ffea16f38c8, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :295 -337765416991516:337765416991517 158125:158125 MARK(name(after HIP LaunchKernel)) 337765416994742:337765418107140 158125:158125 hipMemcpy(dst=0x7f95c84fe010, src=0x7f9484c00000, sizeBytes=4194304, kind=2) :297 337765426444636:337765428930194 158125:158125 hipMemcpy(dst=0x7f95c0800000, src=0x7f95c88ff010, sizeBytes=4194304, kind=1) :298 -337765428931216:337765428931217 158125:158125 MARK(name(before HIP LaunchKernel)) 337765428935253:337765428935804 158125:158125 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :300 337765428937708:337765428938029 158125:158125 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :301 337765428938940:337765428949350 158125:158125 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7ffea16f38c8, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, 
float*, int) :302 -337765428951043:337765428951044 158125:158125 MARK(name(after HIP LaunchKernel)) 337765428953277:337765430060145 158125:158125 hipMemcpy(dst=0x7f95c84fe010, src=0x7f9484c00000, sizeBytes=4194304, kind=2) :304 337765438339150:337765441268235 158125:158125 hipMemcpy(dst=0x7f95c0800000, src=0x7f95c88ff010, sizeBytes=4194304, kind=1) :305 -337765441270489:337765441270490 158125:158125 MARK(name(before HIP LaunchKernel)) 337765441274216:337765441274747 158125:158125 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :307 337765441275729:337765441276060 158125:158125 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :308 337765441276962:337765441287842 158125:158125 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7ffea16f38c8, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :309 -337765441289235:337765441289236 158125:158125 MARK(name(after HIP LaunchKernel)) 337765441292160:337765442412533 158125:158125 hipMemcpy(dst=0x7f95c84fe010, src=0x7f9484c00000, sizeBytes=4194304, kind=2) :311 337765450709312:337765453167268 158125:158125 hipMemcpy(dst=0x7f95c0800000, src=0x7f95c88ff010, sizeBytes=4194304, kind=1) :312 -337765453168290:337765453168291 158125:158125 MARK(name(before HIP LaunchKernel)) 337765453171837:337765453172288 158125:158125 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :314 337765453175113:337765453175684 158125:158125 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :315 337765453176656:337765453187697 158125:158125 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7ffea16f38c8, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :316 -337765453189130:337765453189131 158125:158125 MARK(name(after HIP LaunchKernel)) 337765453191354:337765454301107 158125:158125 hipMemcpy(dst=0x7f95c84fe010, src=0x7f9484c00000, sizeBytes=4194304, kind=2) 
:318 337765462848969:337765465268904 158125:158125 hipMemcpy(dst=0x7f95c0800000, src=0x7f95c88ff010, sizeBytes=4194304, kind=1) :319 -337765465272340:337765465272341 158125:158125 MARK(name(before HIP LaunchKernel)) 337765465276338:337765465276859 158125:158125 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :321 337765465277761:337765465278171 158125:158125 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :322 337765465279123:337765465290104 158125:158125 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7ffea16f38c8, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :323 -337765465292639:337765465292640 158125:158125 MARK(name(after HIP LaunchKernel)) 337765465294733:337765466406910 158125:158125 hipMemcpy(dst=0x7f95c84fe010, src=0x7f9484c00000, sizeBytes=4194304, kind=2) :325 337765474686697:337765477122020 158125:158125 hipMemcpy(dst=0x7f95c0800000, src=0x7f95c88ff010, sizeBytes=4194304, kind=1) :326 -337765477122992:337765477122993 158125:158125 MARK(name(before HIP LaunchKernel)) 337765477126609:337765477126970 158125:158125 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :328 337765477130416:337765477130767 158125:158125 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :329 337765477131749:337765477144473 158125:158125 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7ffea16f38c8, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :330 -337765477145815:337765477145816 158125:158125 MARK(name(after HIP LaunchKernel)) 337765477147809:337765478265337 158125:158125 hipMemcpy(dst=0x7f95c84fe010, src=0x7f9484c00000, sizeBytes=4194304, kind=2) :332 337765486542638:337765489033045 158125:158125 hipMemcpy(dst=0x7f95c0800000, src=0x7f95c88ff010, sizeBytes=4194304, kind=1) :333 -337765489035209:337765489035210 158125:158125 MARK(name(before HIP LaunchKernel)) 
337765489038946:337765489039336 158125:158125 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :335 337765489040108:337765489040549 158125:158125 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :336 337765489041490:337765489052010 158125:158125 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7ffea16f38c8, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :337 -337765489054255:337765489054256 158125:158125 MARK(name(after HIP LaunchKernel)) 337765489056579:337765490169488 158125:158125 hipMemcpy(dst=0x7f95c84fe010, src=0x7f9484c00000, sizeBytes=4194304, kind=2) :339 337765498462309:337765500912371 158125:158125 hipMemcpy(dst=0x7f95c0800000, src=0x7f95c88ff010, sizeBytes=4194304, kind=1) :340 -337765500913433:337765500913434 158125:158125 MARK(name(before HIP LaunchKernel)) 337765500917410:337765500917801 158125:158125 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :342 337765500920967:337765500921307 158125:158125 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :343 337765500922309:337765500934001 158125:158125 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7ffea16f38c8, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :344 -337765500935354:337765500935355 158125:158125 MARK(name(after HIP LaunchKernel)) 337765500937468:337765502051439 158125:158125 hipMemcpy(dst=0x7f95c84fe010, src=0x7f9484c00000, sizeBytes=4194304, kind=2) :346 337765510338339:337765512803419 158125:158125 hipMemcpy(dst=0x7f95c0800000, src=0x7f95c88ff010, sizeBytes=4194304, kind=1) :347 -337765512804631:337765512804632 158125:158125 MARK(name(before HIP LaunchKernel)) 337765512808448:337765512808879 158125:158125 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :349 337765512809771:337765512810091 158125:158125 __hipPopCallConfiguration(gridDim={}, blockDim={}, 
sharedMem=0, stream=0) :350 337765512811123:337765512821753 158125:158125 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7ffea16f38c8, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :351 -337765512826432:337765512826433 158125:158125 MARK(name(after HIP LaunchKernel)) 337765512828546:337765513936426 158125:158125 hipMemcpy(dst=0x7f95c84fe010, src=0x7f9484c00000, sizeBytes=4194304, kind=2) :353 337765522202487:337765524571746 158125:158125 hipMemcpy(dst=0x7f95c0800000, src=0x7f95c88ff010, sizeBytes=4194304, kind=1) :354 -337765524573359:337765524573360 158125:158125 MARK(name(before HIP LaunchKernel)) 337765524577146:337765524577546 158125:158125 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :356 337765524581193:337765524581524 158125:158125 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :357 337765524582496:337765524594358 158125:158125 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7ffea16f38c8, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :358 -337765524595821:337765524595822 158125:158125 MARK(name(after HIP LaunchKernel)) 337765524598015:337765525748215 158125:158125 hipMemcpy(dst=0x7f95c84fe010, src=0x7f9484c00000, sizeBytes=4194304, kind=2) :360 337765534071724:337765536497660 158125:158125 hipMemcpy(dst=0x7f95c0800000, src=0x7f95c88ff010, sizeBytes=4194304, kind=1) :361 -337765536499463:337765536499464 158125:158125 MARK(name(before HIP LaunchKernel)) 337765536504743:337765536505134 158125:158125 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :363 337765536505985:337765536506326 158125:158125 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :364 337765536507728:337765536518278 158125:158125 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7ffea16f38c8, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, 
float*, int) :365 -337765536521014:337765536521015 158125:158125 MARK(name(after HIP LaunchKernel)) 337765536523027:337765537637159 158125:158125 hipMemcpy(dst=0x7f95c84fe010, src=0x7f9484c00000, sizeBytes=4194304, kind=2) :367 337765545903460:337765548307234 158125:158125 hipMemcpy(dst=0x7f95c0800000, src=0x7f95c88ff010, sizeBytes=4194304, kind=1) :368 -337765548308867:337765548308868 158125:158125 MARK(name(before HIP LaunchKernel)) 337765548314498:337765548314909 158125:158125 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :370 337765548315930:337765548316411 158125:158125 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :371 337765548317443:337765548328604 158125:158125 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7ffea16f38c8, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :372 -337765548329837:337765548329838 158125:158125 MARK(name(after HIP LaunchKernel)) 337765548332271:337765549443808 158125:158125 hipMemcpy(dst=0x7f95c84fe010, src=0x7f9484c00000, sizeBytes=4194304, kind=2) :374 337765557720648:337765560151162 158125:158125 hipMemcpy(dst=0x7f95c0800000, src=0x7f95c88ff010, sizeBytes=4194304, kind=1) :375 -337765560152143:337765560152144 158125:158125 MARK(name(before HIP LaunchKernel)) 337765560156001:337765560156351 158125:158125 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :377 337765560157363:337765560157684 158125:158125 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :378 337765560158596:337765560169526 158125:158125 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7ffea16f38c8, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :379 -337765560173414:337765560173415 158125:158125 MARK(name(after HIP LaunchKernel)) 337765560175588:337765561286232 158125:158125 hipMemcpy(dst=0x7f95c84fe010, src=0x7f9484c00000, sizeBytes=4194304, kind=2) 
:381 337765569553195:337765572027742 158125:158125 hipMemcpy(dst=0x7f95c0800000, src=0x7f95c88ff010, sizeBytes=4194304, kind=1) :382 -337765572029295:337765572029296 158125:158125 MARK(name(before HIP LaunchKernel)) 337765572036198:337765572036739 158125:158125 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :384 337765572037741:337765572038072 158125:158125 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :385 337765572039034:337765572051167 158125:158125 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7ffea16f38c8, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :386 -337765572052609:337765572052610 158125:158125 MARK(name(after HIP LaunchKernel)) 337765572055224:337765573176419 158125:158125 hipMemcpy(dst=0x7f95c84fe010, src=0x7f9484c00000, sizeBytes=4194304, kind=2) :388 337765581527320:337765583981769 158125:158125 hipMemcpy(dst=0x7f95c0800000, src=0x7f95c88ff010, sizeBytes=4194304, kind=1) :389 -337765583982862:337765583982863 158125:158125 MARK(name(before HIP LaunchKernel)) 337765583986849:337765583987270 158125:158125 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :391 337765583988041:337765583988372 158125:158125 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :392 337765583989234:337765584005755 158125:158125 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7ffea16f38c8, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :393 -337765584008750:337765584008751 158125:158125 MARK(name(after HIP LaunchKernel)) 337765584010904:337765585124174 158125:158125 hipMemcpy(dst=0x7f95c84fe010, src=0x7f9484c00000, sizeBytes=4194304, kind=2) :395 337765593393521:337765595851157 158125:158125 hipMemcpy(dst=0x7f95c0800000, src=0x7f95c88ff010, sizeBytes=4194304, kind=1) :396 -337765595852249:337765595852250 158125:158125 MARK(name(before HIP LaunchKernel)) 
337765595857308:337765595858080 158125:158125 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :398 337765595858982:337765595859312 158125:158125 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :399 337765595860174:337765595870794 158125:158125 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7ffea16f38c8, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :400 -337765595872116:337765595872117 158125:158125 MARK(name(after HIP LaunchKernel)) 337765595874491:337765596989704 158125:158125 hipMemcpy(dst=0x7f95c84fe010, src=0x7f9484c00000, sizeBytes=4194304, kind=2) :402 337765605549108:337765608027193 158125:158125 hipMemcpy(dst=0x7f95c0800000, src=0x7f95c88ff010, sizeBytes=4194304, kind=1) :403 -337765608028255:337765608028256 158125:158125 MARK(name(before HIP LaunchKernel)) 337765608032172:337765608032733 158125:158125 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :405 337765608033524:337765608033965 158125:158125 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :406 337765608039165:337765608050527 158125:158125 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7ffea16f38c8, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :407 -337765608051949:337765608051950 158125:158125 MARK(name(after HIP LaunchKernel)) 337765608054193:337765609169507 158125:158125 hipMemcpy(dst=0x7f95c84fe010, src=0x7f9484c00000, sizeBytes=4194304, kind=2) :409 337765617468711:337765619920355 158125:158125 hipMemcpy(dst=0x7f95c0800000, src=0x7f95c88ff010, sizeBytes=4194304, kind=1) :410 -337765619921347:337765619921348 158125:158125 MARK(name(before HIP LaunchKernel)) 337765619927148:337765619927679 158125:158125 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :412 337765619928660:337765619929001 158125:158125 __hipPopCallConfiguration(gridDim={}, blockDim={}, 
sharedMem=0, stream=0) :413 337765619929913:337765619939821 158125:158125 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7ffea16f38c8, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :414 -337765619941124:337765619941125 158125:158125 MARK(name(after HIP LaunchKernel)) 337765619943298:337765621073860 158125:158125 hipMemcpy(dst=0x7f95c84fe010, src=0x7f9484c00000, sizeBytes=4194304, kind=2) :416 337765629366101:337765631863661 158125:158125 hipMemcpy(dst=0x7f95c0800000, src=0x7f95c88ff010, sizeBytes=4194304, kind=1) :417 -337765631864673:337765631864674 158125:158125 MARK(name(before HIP LaunchKernel)) 337765631868260:337765631868631 158125:158125 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :419 337765631869402:337765631869823 158125:158125 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :420 337765631873550:337765631884220 158125:158125 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7ffea16f38c8, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :421 -337765631885433:337765631885434 158125:158125 MARK(name(after HIP LaunchKernel)) 337765631887536:337765632994073 158125:158125 hipMemcpy(dst=0x7f95c84fe010, src=0x7f9484c00000, sizeBytes=4194304, kind=2) :423 337765641290129:337765643763524 158125:158125 hipMemcpy(dst=0x7f95c0800000, src=0x7f95c88ff010, sizeBytes=4194304, kind=1) :424 -337765643764546:337765643764547 158125:158125 MARK(name(before HIP LaunchKernel)) 337765643769886:337765643770227 158125:158125 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :426 337765643771119:337765643771439 158125:158125 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :427 337765643772311:337765643782781 158125:158125 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7ffea16f38c8, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, 
float*, int) :428 -337765643784173:337765643784174 158125:158125 MARK(name(after HIP LaunchKernel)) 337765643787810:337765644905058 158125:158125 hipMemcpy(dst=0x7f95c84fe010, src=0x7f9484c00000, sizeBytes=4194304, kind=2) :430 337765653182400:337765655559503 158125:158125 hipMemcpy(dst=0x7f95c0800000, src=0x7f95c88ff010, sizeBytes=4194304, kind=1) :431 -337765655561036:337765655561037 158125:158125 MARK(name(before HIP LaunchKernel)) 337765655564733:337765655565074 158125:158125 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :433 337765655565925:337765655566336 158125:158125 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :434 337765655569733:337765655579972 158125:158125 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7ffea16f38c8, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :435 -337765655581435:337765655581436 158125:158125 MARK(name(after HIP LaunchKernel)) 337765655583769:337765656701417 158125:158125 hipMemcpy(dst=0x7f95c84fe010, src=0x7f9484c00000, sizeBytes=4194304, kind=2) :437 337765665019797:337765667416377 158125:158125 hipMemcpy(dst=0x7f95c0800000, src=0x7f95c88ff010, sizeBytes=4194304, kind=1) :438 -337765667417629:337765667417630 158125:158125 MARK(name(before HIP LaunchKernel)) 337765667423450:337765667423791 158125:158125 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :440 337765667424873:337765667425294 158125:158125 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :441 337765667426136:337765667436074 158125:158125 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7ffea16f38c8, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :442 -337765667437407:337765667437408 158125:158125 MARK(name(after HIP LaunchKernel)) 337765667440122:337765668548412 158125:158125 hipMemcpy(dst=0x7f95c84fe010, src=0x7f9484c00000, sizeBytes=4194304, kind=2) 
:444 337765676812980:337765679222275 158125:158125 hipMemcpy(dst=0x7f95c0800000, src=0x7f95c88ff010, sizeBytes=4194304, kind=1) :445 -337765679223267:337765679223268 158125:158125 MARK(name(before HIP LaunchKernel)) 337765679227645:337765679227995 158125:158125 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :447 337765679228957:337765679229378 158125:158125 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :448 337765679232354:337765679244447 158125:158125 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7ffea16f38c8, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :449 -337765679245809:337765679245810 158125:158125 MARK(name(after HIP LaunchKernel)) 337765679247953:337765680356955 158125:158125 hipMemcpy(dst=0x7f95c84fe010, src=0x7f9484c00000, sizeBytes=4194304, kind=2) :451 337765688606675:337765691044864 158125:158125 hipMemcpy(dst=0x7f95c0800000, src=0x7f95c88ff010, sizeBytes=4194304, kind=1) :452 -337765691045926:337765691045927 158125:158125 MARK(name(before HIP LaunchKernel)) 337765691051106:337765691051466 158125:158125 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :454 337765691052448:337765691052869 158125:158125 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :455 337765691053670:337765691064300 158125:158125 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7ffea16f38c8, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :456 -337765691065613:337765691065614 158125:158125 MARK(name(after HIP LaunchKernel)) 337765691068518:337765692184794 158125:158125 hipMemcpy(dst=0x7f95c84fe010, src=0x7f9484c00000, sizeBytes=4194304, kind=2) :458 337765700496681:337765702965147 158125:158125 hipMemcpy(dst=0x7f95c0800000, src=0x7f95c88ff010, sizeBytes=4194304, kind=1) :459 -337765702966329:337765702966330 158125:158125 MARK(name(before HIP LaunchKernel)) 
337765702970397:337765702970738 158125:158125 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :461 337765702971629:337765702971960 158125:158125 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :462 337765702975026:337765702985896 158125:158125 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7ffea16f38c8, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :463 -337765702987239:337765702987240 158125:158125 MARK(name(after HIP LaunchKernel)) 337765702989493:337765704101801 158125:158125 hipMemcpy(dst=0x7f95c84fe010, src=0x7f9484c00000, sizeBytes=4194304, kind=2) :465 337765712370696:337765714849511 158125:158125 hipMemcpy(dst=0x7f95c0800000, src=0x7f95c88ff010, sizeBytes=4194304, kind=1) :466 -337765714852497:337765714852498 158125:158125 MARK(name(before HIP LaunchKernel)) 337765714856404:337765714856865 158125:158125 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :468 337765714857817:337765714858157 158125:158125 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :469 337765714859129:337765714869088 158125:158125 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7ffea16f38c8, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :470 -337765714870220:337765714870221 158125:158125 MARK(name(after HIP LaunchKernel)) 337765714873206:337765715984822 158125:158125 hipMemcpy(dst=0x7f95c84fe010, src=0x7f9484c00000, sizeBytes=4194304, kind=2) :472 337765724257526:337765726752442 158125:158125 hipMemcpy(dst=0x7f95c0800000, src=0x7f95c88ff010, sizeBytes=4194304, kind=1) :473 -337765726753554:337765726753555 158125:158125 MARK(name(before HIP LaunchKernel)) 337765726757381:337765726757902 158125:158125 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :475 337765726758763:337765726759094 158125:158125 __hipPopCallConfiguration(gridDim={}, blockDim={}, 
sharedMem=0, stream=0) :476 337765726762511:337765726773591 158125:158125 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7ffea16f38c8, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :477 -337765726774964:337765726774965 158125:158125 MARK(name(after HIP LaunchKernel)) 337765726777028:337765727891159 158125:158125 hipMemcpy(dst=0x7f95c84fe010, src=0x7f9484c00000, sizeBytes=4194304, kind=2) :479 337765736175495:337765738571424 158125:158125 hipMemcpy(dst=0x7f95c0800000, src=0x7f95c88ff010, sizeBytes=4194304, kind=1) :480 -337765738573508:337765738573509 158125:158125 MARK(name(before HIP LaunchKernel)) 337765738577195:337765738577716 158125:158125 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :482 337765738578677:337765738578998 158125:158125 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :483 337765738580090:337765738590129 158125:158125 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7ffea16f38c8, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :484 -337765738591672:337765738591673 158125:158125 MARK(name(after HIP LaunchKernel)) 337765738594668:337765739707977 158125:158125 hipMemcpy(dst=0x7f95c84fe010, src=0x7f9484c00000, sizeBytes=4194304, kind=2) :486 337765748070500:337765750493230 158125:158125 hipMemcpy(dst=0x7f95c0800000, src=0x7f95c88ff010, sizeBytes=4194304, kind=1) :487 -337765750494372:337765750494373 158125:158125 MARK(name(before HIP LaunchKernel)) 337765750497859:337765750498270 158125:158125 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :489 337765750500444:337765750500804 158125:158125 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :490 337765750501846:337765750512667 158125:158125 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7ffea16f38c8, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, 
float*, int) :491 -337765750514159:337765750514160 158125:158125 MARK(name(after HIP LaunchKernel)) 337765750516434:337765751633932 158125:158125 hipMemcpy(dst=0x7f95c84fe010, src=0x7f9484c00000, sizeBytes=4194304, kind=2) :493 337765759933566:337765762341477 158125:158125 hipMemcpy(dst=0x7f95c0800000, src=0x7f95c88ff010, sizeBytes=4194304, kind=1) :494 -337765762344814:337765762344815 158125:158125 MARK(name(before HIP LaunchKernel)) 337765762348350:337765762348841 158125:158125 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :496 337765762349723:337765762350174 158125:158125 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :497 337765762351566:337765762361946 158125:158125 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7ffea16f38c8, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :498 -337765762363499:337765762363500 158125:158125 MARK(name(after HIP LaunchKernel)) 337765762366384:337765763484233 158125:158125 hipMemcpy(dst=0x7f95c84fe010, src=0x7f9484c00000, sizeBytes=4194304, kind=2) :500 337765771735786:337765774156773 158125:158125 hipMemcpy(dst=0x7f95c0800000, src=0x7f95c88ff010, sizeBytes=4194304, kind=1) :501 -337765774157805:337765774157806 158125:158125 MARK(name(before HIP LaunchKernel)) 337765774161492:337765774162063 158125:158125 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :503 337765774165299:337765774165640 158125:158125 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :504 337765774166712:337765774177812 158125:158125 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7ffea16f38c8, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :505 -337765774179285:337765774179286 158125:158125 MARK(name(after HIP LaunchKernel)) 337765774181369:337765775302103 158125:158125 hipMemcpy(dst=0x7f95c84fe010, src=0x7f9484c00000, sizeBytes=4194304, kind=2) 
:507 337765783578812:337765786029635 158125:158125 hipMemcpy(dst=0x7f95c0800000, src=0x7f95c88ff010, sizeBytes=4194304, kind=1) :508 -337765786033162:337765786033163 158125:158125 MARK(name(before HIP LaunchKernel)) 337765786036808:337765786037349 158125:158125 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :510 337765786038131:337765786038582 158125:158125 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :511 337765786039473:337765786052037 158125:158125 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7ffea16f38c8, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :512 -337765786054091:337765786054092 158125:158125 MARK(name(after HIP LaunchKernel)) 337765786056656:337765787172160 158125:158125 hipMemcpy(dst=0x7f95c84fe010, src=0x7f9484c00000, sizeBytes=4194304, kind=2) :514 337765795490429:337765797965808 158125:158125 hipMemcpy(dst=0x7f95c0800000, src=0x7f95c88ff010, sizeBytes=4194304, kind=1) :515 -337765797966980:337765797966981 158125:158125 MARK(name(before HIP LaunchKernel)) 337765797970798:337765797971409 158125:158125 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :517 337765797973573:337765797973923 158125:158125 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :518 337765797974775:337765797985515 158125:158125 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7ffea16f38c8, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :519 -337765797986738:337765797986739 158125:158125 MARK(name(after HIP LaunchKernel)) 337765797988832:337765799105267 158125:158125 hipMemcpy(dst=0x7f95c84fe010, src=0x7f9484c00000, sizeBytes=4194304, kind=2) :521 337765807418016:337765809874289 158125:158125 hipMemcpy(dst=0x7f95c0800000, src=0x7f95c88ff010, sizeBytes=4194304, kind=1) :522 -337765809877956:337765809877957 158125:158125 MARK(name(before HIP LaunchKernel)) 
337765809881653:337765809882114 158125:158125 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :524 337765809883016:337765809883427 158125:158125 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :525 337765809884378:337765809895339 158125:158125 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7ffea16f38c8, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :526 -337765809897333:337765809897334 158125:158125 MARK(name(after HIP LaunchKernel)) 337765809899497:337765811016343 158125:158125 hipMemcpy(dst=0x7f95c84fe010, src=0x7f9484c00000, sizeBytes=4194304, kind=2) :528 337765819288546:337765821755830 158125:158125 hipMemcpy(dst=0x7f95c0800000, src=0x7f95c88ff010, sizeBytes=4194304, kind=1) :529 -337765821757352:337765821757353 158125:158125 MARK(name(before HIP LaunchKernel)) 337765821760979:337765821761450 158125:158125 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :531 337765821764346:337765821764816 158125:158125 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :532 337765821765718:337765821776088 158125:158125 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7ffea16f38c8, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :533 -337765821777450:337765821777451 158125:158125 MARK(name(after HIP LaunchKernel)) 337765821779594:337765822898395 158125:158125 hipMemcpy(dst=0x7f95c84fe010, src=0x7f9484c00000, sizeBytes=4194304, kind=2) :535 337765831205783:337765833694167 158125:158125 hipMemcpy(dst=0x7f95c0800000, src=0x7f95c88ff010, sizeBytes=4194304, kind=1) :536 -337765833697303:337765833697304 158125:158125 MARK(name(before HIP LaunchKernel)) 337765833701471:337765833701831 158125:158125 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :538 337765833702623:337765833702953 158125:158125 __hipPopCallConfiguration(gridDim={}, blockDim={}, 
sharedMem=0, stream=0) :539 337765833703825:337765833715307 158125:158125 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7ffea16f38c8, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :540 -337765833717601:337765833717602 158125:158125 MARK(name(after HIP LaunchKernel)) 337765833719785:337765834853393 158125:158125 hipMemcpy(dst=0x7f95c84fe010, src=0x7f9484c00000, sizeBytes=4194304, kind=2) :542 337765843156995:337765845538948 158125:158125 hipMemcpy(dst=0x7f95c0800000, src=0x7f95c88ff010, sizeBytes=4194304, kind=1) :543 -337765845540120:337765845540121 158125:158125 MARK(name(before HIP LaunchKernel)) 337765845544178:337765845544508 158125:158125 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :545 337765845546542:337765845546883 158125:158125 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :546 337765845547704:337765845557583 158125:158125 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7ffea16f38c8, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :547 -337765845558916:337765845558917 158125:158125 MARK(name(after HIP LaunchKernel)) 337765845561130:337765846664872 158125:158125 hipMemcpy(dst=0x7f95c84fe010, src=0x7f9484c00000, sizeBytes=4194304, kind=2) :549 337765854951950:337765857350254 158125:158125 hipMemcpy(dst=0x7f95c0800000, src=0x7f95c88ff010, sizeBytes=4194304, kind=1) :550 -337765857351296:337765857351297 158125:158125 MARK(name(before HIP LaunchKernel)) 337765857354933:337765857355293 158125:158125 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :552 337765857356055:337765857356546 158125:158125 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :553 337765857357508:337765857367727 158125:158125 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7ffea16f38c8, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, 
float*, int) :554 -337765857371213:337765857371214 158125:158125 MARK(name(after HIP LaunchKernel)) 337765857373287:337765858496676 158125:158125 hipMemcpy(dst=0x7f95c84fe010, src=0x7f9484c00000, sizeBytes=4194304, kind=2) :556 337765866775191:337765869163646 158125:158125 hipMemcpy(dst=0x7f95c0800000, src=0x7f95c88ff010, sizeBytes=4194304, kind=1) :557 -337765869164748:337765869164749 158125:158125 MARK(name(before HIP LaunchKernel)) 337765869168926:337765869169316 158125:158125 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :559 337765869174366:337765869174827 158125:158125 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :560 337765869175788:337765869186388 158125:158125 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7ffea16f38c8, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :561 -337765869187891:337765869187892 158125:158125 MARK(name(after HIP LaunchKernel)) 337765869190156:337765870296673 158125:158125 hipMemcpy(dst=0x7f95c84fe010, src=0x7f9484c00000, sizeBytes=4194304, kind=2) :563 337765878585186:337765881030107 158125:158125 hipMemcpy(dst=0x7f95c0800000, src=0x7f95c88ff010, sizeBytes=4194304, kind=1) :564 -337765881031149:337765881031150 158125:158125 MARK(name(before HIP LaunchKernel)) 337765881034957:337765881035347 158125:158125 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :566 337765881036219:337765881036550 158125:158125 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :567 337765881037982:337765881050857 158125:158125 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7ffea16f38c8, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :568 -337765881053522:337765881053523 158125:158125 MARK(name(after HIP LaunchKernel)) 337765881055616:337765882175217 158125:158125 hipMemcpy(dst=0x7f95c84fe010, src=0x7f9484c00000, sizeBytes=4194304, kind=2) 
:570 337765890460034:337765892915986 158125:158125 hipMemcpy(dst=0x7f95c0800000, src=0x7f95c88ff010, sizeBytes=4194304, kind=1) :571 -337765892917308:337765892917309 158125:158125 MARK(name(before HIP LaunchKernel)) 337765892922147:337765892922558 158125:158125 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :573 337765892923320:337765892923650 158125:158125 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :574 337765892925103:337765892935142 158125:158125 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7ffea16f38c8, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :575 -337765892936464:337765892936465 158125:158125 MARK(name(after HIP LaunchKernel)) 337765892938639:337765894040557 158125:158125 hipMemcpy(dst=0x7f95c84fe010, src=0x7f9484c00000, sizeBytes=4194304, kind=2) :577 337765902321336:337765904774573 158125:158125 hipMemcpy(dst=0x7f95c0800000, src=0x7f95c88ff010, sizeBytes=4194304, kind=1) :578 -337765904775605:337765904775606 158125:158125 MARK(name(before HIP LaunchKernel)) 337765904779151:337765904779552 158125:158125 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :580 337765904780414:337765904780754 158125:158125 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :581 337765904781706:337765904792316 158125:158125 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7ffea16f38c8, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :582 -337765904794911:337765904794912 158125:158125 MARK(name(after HIP LaunchKernel)) 337765904797416:337765906407563 158125:158125 hipMemcpy(dst=0x7f95c84fe010, src=0x7f9484c00000, sizeBytes=4194304, kind=2) :584 337765914710163:337765917154774 158125:158125 hipMemcpy(dst=0x7f95c0800000, src=0x7f95c88ff010, sizeBytes=4194304, kind=1) :585 -337765917155946:337765917155947 158125:158125 MARK(name(before HIP LaunchKernel)) 
337765917162037:337765917162428 158125:158125 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :587 337765917163400:337765917163841 158125:158125 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :588 337765917164873:337765917176795 158125:158125 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7ffea16f38c8, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :589 -337765917178248:337765917178249 158125:158125 MARK(name(after HIP LaunchKernel)) 337765917181103:337765918298240 158125:158125 hipMemcpy(dst=0x7f95c84fe010, src=0x7f9484c00000, sizeBytes=4194304, kind=2) :591 337765926830502:337765929245907 158125:158125 hipMemcpy(dst=0x7f95c0800000, src=0x7f95c88ff010, sizeBytes=4194304, kind=1) :592 -337765929247080:337765929247081 158125:158125 MARK(name(before HIP LaunchKernel)) 337765929250937:337765929251328 158125:158125 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :594 337765929252269:337765929252720 158125:158125 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :595 337765929253702:337765929266025 158125:158125 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7ffea16f38c8, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :596 -337765929269762:337765929269763 158125:158125 MARK(name(after HIP LaunchKernel)) 337765929272007:337765930388232 158125:158125 hipMemcpy(dst=0x7f95c84fe010, src=0x7f9484c00000, sizeBytes=4194304, kind=2) :598 337765938673068:337765941095628 158125:158125 hipMemcpy(dst=0x7f95c0800000, src=0x7f95c88ff010, sizeBytes=4194304, kind=1) :599 -337765941096650:337765941096651 158125:158125 MARK(name(before HIP LaunchKernel)) 337765941101008:337765941101419 158125:158125 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :601 337765941102370:337765941102821 158125:158125 __hipPopCallConfiguration(gridDim={}, blockDim={}, 
sharedMem=0, stream=0) :602 337765941103783:337765941115335 158125:158125 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7ffea16f38c8, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :603 -337765941116758:337765941116759 158125:158125 MARK(name(after HIP LaunchKernel)) 337765941118952:337765942236489 158125:158125 hipMemcpy(dst=0x7f95c84fe010, src=0x7f9484c00000, sizeBytes=4194304, kind=2) :605 337765950583563:337765953028034 158125:158125 hipMemcpy(dst=0x7f95c0800000, src=0x7f95c88ff010, sizeBytes=4194304, kind=1) :606 -337765953029146:337765953029147 158125:158125 MARK(name(before HIP LaunchKernel)) 337765953032813:337765953033184 158125:158125 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :608 337765953034396:337765953034847 158125:158125 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :609 337765953037271:337765953047330 158125:158125 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7ffea16f38c8, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :610 -337765953048773:337765953048774 158125:158125 MARK(name(after HIP LaunchKernel)) 337765953050847:337765954167944 158125:158125 hipMemcpy(dst=0x7f95c84fe010, src=0x7f9484c00000, sizeBytes=4194304, kind=2) :612 337765962480472:337765964922048 158125:158125 hipMemcpy(dst=0x7f95c0800000, src=0x7f95c88ff010, sizeBytes=4194304, kind=1) :613 -337765964923280:337765964923281 158125:158125 MARK(name(before HIP LaunchKernel)) 337765964928680:337765964929111 158125:158125 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :615 337765964930153:337765964930594 158125:158125 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :616 337765964931365:337765964942516 158125:158125 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7ffea16f38c8, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, 
float*, int) :617 -337765964943658:337765964943659 158125:158125 MARK(name(after HIP LaunchKernel)) 337765964945752:337765966055586 158125:158125 hipMemcpy(dst=0x7f95c84fe010, src=0x7f9484c00000, sizeBytes=4194304, kind=2) :619 337765974287412:337765976754355 158125:158125 hipMemcpy(dst=0x7f95c0800000, src=0x7f95c88ff010, sizeBytes=4194304, kind=1) :620 -337765976755427:337765976755428 158125:158125 MARK(name(before HIP LaunchKernel)) 337765976759224:337765976759605 158125:158125 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :622 337765976760457:337765976760897 158125:158125 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :623 337765976763903:337765976774673 158125:158125 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7ffea16f38c8, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :624 -337765976776036:337765976776037 158125:158125 MARK(name(after HIP LaunchKernel)) 337765976778220:337765977896830 158125:158125 hipMemcpy(dst=0x7f95c84fe010, src=0x7f9484c00000, sizeBytes=4194304, kind=2) :626 337765986168812:337765988577395 158125:158125 hipMemcpy(dst=0x7f95c0800000, src=0x7f95c88ff010, sizeBytes=4194304, kind=1) :627 -337765988578958:337765988578959 158125:158125 MARK(name(before HIP LaunchKernel)) 337765988584569:337765988584999 158125:158125 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :629 337765988585891:337765988586482 158125:158125 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :630 337765988587334:337765988598775 158125:158125 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7ffea16f38c8, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :631 -337765988600238:337765988600239 158125:158125 MARK(name(after HIP LaunchKernel)) 337765988604055:337765989709069 158125:158125 hipMemcpy(dst=0x7f95c84fe010, src=0x7f9484c00000, sizeBytes=4194304, kind=2) 
:633 337765998093663:337766000492497 158125:158125 hipMemcpy(dst=0x7f95c0800000, src=0x7f95c88ff010, sizeBytes=4194304, kind=1) :634 -337766000493649:337766000493650 158125:158125 MARK(name(before HIP LaunchKernel)) 337766000497646:337766000498087 158125:158125 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :636 337766000498999:337766000499330 158125:158125 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :637 337766000501814:337766000511843 158125:158125 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7ffea16f38c8, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :638 -337766000513607:337766000513608 158125:158125 MARK(name(after HIP LaunchKernel)) 337766000515690:337766001640322 158125:158125 hipMemcpy(dst=0x7f95c84fe010, src=0x7f9484c00000, sizeBytes=4194304, kind=2) :640 337766009956637:337766012392983 158125:158125 hipMemcpy(dst=0x7f95c0800000, src=0x7f95c88ff010, sizeBytes=4194304, kind=1) :641 -337766012393985:337766012393986 158125:158125 MARK(name(before HIP LaunchKernel)) 337766012399555:337766012400226 158125:158125 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :643 337766012401198:337766012401659 158125:158125 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :644 337766012402521:337766012412960 158125:158125 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7ffea16f38c8, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :645 -337766012414383:337766012414384 158125:158125 MARK(name(after HIP LaunchKernel)) 337766012417790:337766013529516 158125:158125 hipMemcpy(dst=0x7f95c84fe010, src=0x7f9484c00000, sizeBytes=4194304, kind=2) :647 337766021796369:337766024223527 158125:158125 hipMemcpy(dst=0x7f95c0800000, src=0x7f95c88ff010, sizeBytes=4194304, kind=1) :648 -337766024224779:337766024224780 158125:158125 MARK(name(before HIP LaunchKernel)) 
337766024228837:337766024229218 158125:158125 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :650 337766024229979:337766024230440 158125:158125 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :651 337766024234097:337766024245328 158125:158125 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7ffea16f38c8, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :652 -337766024246560:337766024246561 158125:158125 MARK(name(after HIP LaunchKernel)) 337766024248784:337766025360712 158125:158125 hipMemcpy(dst=0x7f95c84fe010, src=0x7f9484c00000, sizeBytes=4194304, kind=2) :654 337766033651129:337766036557400 158125:158125 hipMemcpy(dst=0x7f95c0800000, src=0x7f95c88ff010, sizeBytes=4194304, kind=1) :655 -337766036560857:337766036560858 158125:158125 MARK(name(before HIP LaunchKernel)) 337766036564965:337766036565345 158125:158125 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :657 337766036566287:337766036566758 158125:158125 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :658 337766036567640:337766036580564 158125:158125 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7ffea16f38c8, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :659 -337766036582197:337766036582198 158125:158125 MARK(name(after HIP LaunchKernel)) 337766036585143:337766037708642 158125:158125 hipMemcpy(dst=0x7f95c84fe010, src=0x7f9484c00000, sizeBytes=4194304, kind=2) :661 337766045986585:337766048388886 158125:158125 hipMemcpy(dst=0x7f95c0800000, src=0x7f95c88ff010, sizeBytes=4194304, kind=1) :662 -337766048389958:337766048389959 158125:158125 MARK(name(before HIP LaunchKernel)) 337766048393825:337766048394376 158125:158125 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :664 337766048395278:337766048395819 158125:158125 __hipPopCallConfiguration(gridDim={}, blockDim={}, 
sharedMem=0, stream=0) :665 337766048397773:337766048409445 158125:158125 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7ffea16f38c8, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :666 -337766048410777:337766048410778 158125:158125 MARK(name(after HIP LaunchKernel)) 337766048413082:337766049518447 158125:158125 hipMemcpy(dst=0x7f95c84fe010, src=0x7f9484c00000, sizeBytes=4194304, kind=2) :668 337766058038627:337766060454343 158125:158125 hipMemcpy(dst=0x7f95c0800000, src=0x7f95c88ff010, sizeBytes=4194304, kind=1) :669 -337766060457078:337766060457079 158125:158125 MARK(name(before HIP LaunchKernel)) 337766060461226:337766060461647 158125:158125 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :671 337766060462509:337766060462950 158125:158125 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :672 337766060463811:337766060475754 158125:158125 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7ffea16f38c8, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :673 -337766060476986:337766060476987 158125:158125 MARK(name(after HIP LaunchKernel)) 337766060479932:337766061599133 158125:158125 hipMemcpy(dst=0x7f95c84fe010, src=0x7f9484c00000, sizeBytes=4194304, kind=2) :675 337766069885611:337766072274686 158125:158125 hipMemcpy(dst=0x7f95c0800000, src=0x7f95c88ff010, sizeBytes=4194304, kind=1) :676 -337766072275738:337766072275739 158125:158125 MARK(name(before HIP LaunchKernel)) 337766072279706:337766072280117 158125:158125 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :678 337766072280978:337766072281419 158125:158125 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :679 337766072284986:337766072296648 158125:158125 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7ffea16f38c8, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, 
float*, int) :680 -337766072298511:337766072298512 158125:158125 MARK(name(after HIP LaunchKernel)) 337766072300575:337766073410839 158125:158125 hipMemcpy(dst=0x7f95c84fe010, src=0x7f9484c00000, sizeBytes=4194304, kind=2) :682 337766081680868:337766084149093 158125:158125 hipMemcpy(dst=0x7f95c0800000, src=0x7f95c88ff010, sizeBytes=4194304, kind=1) :683 -337766084152289:337766084152290 158125:158125 MARK(name(before HIP LaunchKernel)) 337766084156237:337766084156647 158125:158125 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :685 337766084157629:337766084157970 158125:158125 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :686 337766084158992:337766084170043 158125:158125 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7ffea16f38c8, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :687 -337766084171385:337766084171386 158125:158125 MARK(name(after HIP LaunchKernel)) 337766084174451:337766085284064 158125:158125 hipMemcpy(dst=0x7f95c84fe010, src=0x7f9484c00000, sizeBytes=4194304, kind=2) :689 337766093580402:337766096024893 158125:158125 hipMemcpy(dst=0x7f95c0800000, src=0x7f95c88ff010, sizeBytes=4194304, kind=1) :690 -337766096026455:337766096026456 158125:158125 MARK(name(before HIP LaunchKernel)) 337766096030032:337766096030433 158125:158125 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :692 337766096034350:337766096034831 158125:158125 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :693 337766096035793:337766096047385 158125:158125 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7ffea16f38c8, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :694 -337766096048818:337766096048819 158125:158125 MARK(name(after HIP LaunchKernel)) 337766096051563:337766097168119 158125:158125 hipMemcpy(dst=0x7f95c84fe010, src=0x7f9484c00000, sizeBytes=4194304, kind=2) 
:696 337766105477451:337766107930458 158125:158125 hipMemcpy(dst=0x7f95c0800000, src=0x7f95c88ff010, sizeBytes=4194304, kind=1) :697 -337766107932672:337766107932673 158125:158125 MARK(name(before HIP LaunchKernel)) 337766107936439:337766107936860 158125:158125 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :699 337766107937852:337766107938193 158125:158125 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :700 337766107939084:337766107949224 158125:158125 hipLaunchKernel(function_address=0x200ee0, numBlocks={}, dimBlocks={}, args=0x7ffea16f38c8, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :701 -337766107950656:337766107950657 158125:158125 MARK(name(after HIP LaunchKernel)) 337766107953562:337766109068685 158125:158125 hipMemcpy(dst=0x7f95c84fe010, src=0x7f9484c00000, sizeBytes=4194304, kind=2) :703 337766117332762:337766117364702 158125:158125 hipFree(ptr=0x7f95c0800000) :704 337766117365825:337766117379931 158125:158125 hipFree(ptr=0x7f9484c00000) :705 diff --git a/test/golden_traces/activity_and_callback_trace.txt b/test/golden_traces/activity_and_callback_trace.txt new file mode 100644 index 00000000..4669e6c3 --- /dev/null +++ b/test/golden_traces/activity_and_callback_trace.txt @@ -0,0 +1,65 @@ + + +<__hipPushCallConfiguration id(2) correlation_id(2) on-enter pid(877336) tid(877336)> +<__hipPushCallConfiguration id(2) correlation_id(2) on-exit pid(877336) tid(877336)> +<__hipPopCallConfiguration id(1) correlation_id(3) on-enter pid(877336) tid(877336)> +<__hipPopCallConfiguration id(1) correlation_id(3) on-exit pid(877336) tid(877336)> + + + + + + +<__hipPushCallConfiguration id(2) correlation_id(7) on-enter pid(877336) tid(877336)> +<__hipPushCallConfiguration id(2) correlation_id(7) on-exit pid(877336) tid(877336)> +<__hipPopCallConfiguration id(1) correlation_id(8) on-enter pid(877336) tid(877336)> +<__hipPopCallConfiguration id(1) correlation_id(8) on-exit pid(877336) 
tid(877336)> + + + + + hipSetDevice correlation_id(6) time_ns(861794298279896:861794298283613) + __hipPushCallConfiguration correlation_id(7) time_ns(861794298290125:861794298293211) + __hipPopCallConfiguration correlation_id(8) time_ns(861794298293903:861794298295325) + hipLaunchKernel correlation_id(9) time_ns(861794298296377:861794298313029) + hipDeviceSynchronize correlation_id(10) time_ns(861794298313470:861794298331113) + hipSetDevice correlation_id(11) time_ns(861794298565986:861794298566277) + __hipPushCallConfiguration correlation_id(12) time_ns(861794298566738:861794298567148) + __hipPopCallConfiguration correlation_id(13) time_ns(861794298567569:861794298568010) + hipLaunchKernel correlation_id(14) time_ns(861794298568391:861794298577638) + hipDeviceSynchronize correlation_id(15) time_ns(861794298578069:861794298594841) + + +<__hipPushCallConfiguration id(2) correlation_id(17) on-enter pid(877336) tid(877336)> +<__hipPushCallConfiguration id(2) correlation_id(17) on-exit pid(877336) tid(877336)> +<__hipPopCallConfiguration id(1) correlation_id(18) on-enter pid(877336) tid(877336)> +<__hipPopCallConfiguration id(1) correlation_id(18) on-exit pid(877336) tid(877336)> + + + + + + +<__hipPushCallConfiguration id(2) correlation_id(22) on-enter pid(877336) tid(877336)> +<__hipPushCallConfiguration id(2) correlation_id(22) on-exit pid(877336) tid(877336)> +<__hipPopCallConfiguration id(1) correlation_id(23) on-enter pid(877336) tid(877336)> +<__hipPopCallConfiguration id(1) correlation_id(23) on-exit pid(877336) tid(877336)> + + + + + hipSetDevice correlation_id(21) time_ns(861794299364583:861794299365585) + __hipPushCallConfiguration correlation_id(22) time_ns(861794299366106:861794299367329) + __hipPopCallConfiguration correlation_id(23) time_ns(861794299367830:861794299369082) + hipLaunchKernel correlation_id(24) time_ns(861794299369523:861794299377227) + hipDeviceSynchronize correlation_id(25) time_ns(861794299377748:861794299394730) + + 
+<__hipPushCallConfiguration id(2) correlation_id(27) on-enter pid(877336) tid(877336)> +<__hipPushCallConfiguration id(2) correlation_id(27) on-exit pid(877336) tid(877336)> +<__hipPopCallConfiguration id(1) correlation_id(28) on-enter pid(877336) tid(877336)> +<__hipPopCallConfiguration id(1) correlation_id(28) on-exit pid(877336) tid(877336)> + + + + diff --git a/test/golden_traces/multi_pool_activities_trace.txt b/test/golden_traces/multi_pool_activities_trace.txt new file mode 100644 index 00000000..fda36886 --- /dev/null +++ b/test/golden_traces/multi_pool_activities_trace.txt @@ -0,0 +1,30 @@ + :KernelExecution : correlation_id(0) time_ns(109660011583441:109660011588241) + :KernelExecution : correlation_id(0) time_ns(109660011855280:109660011859600) + :KernelExecution : correlation_id(0) time_ns(109660011969840:109660011973679) + :KernelExecution : correlation_id(0) time_ns(109660012037039:109660012041199) + :KernelExecution : correlation_id(0) time_ns(109660012081199:109660012084559) + :KernelExecution : correlation_id(0) time_ns(109660012122799:109660012126159) + :KernelExecution : correlation_id(0) time_ns(109660012164399:109660012168239) + :KernelExecution : correlation_id(0) time_ns(109660012206158:109660012209838) + :KernelExecution : correlation_id(0) time_ns(109660012247118:109660012250478) + :KernelExecution : correlation_id(0) time_ns(109660012289678:109660012293518) + :CopyHostToDevice : correlation_id(1) time_ns(109660008446578:109660008452178) + :hipMemcpy : correlation_id(1) time_ns(109659777462237:109660008474607) + :CopyHostToDevice : correlation_id(2) time_ns(109660011646881:109660011651041) + :hipMemcpy : correlation_id(2) time_ns(109660011115400:109660011817555) + :CopyHostToDevice : correlation_id(3) time_ns(109660011942080:109660011946240) + :hipMemcpy : correlation_id(3) time_ns(109660011846359:109660011951538) + :CopyHostToDevice : correlation_id(4) time_ns(109660011985759:109660011989919) + :hipMemcpy : correlation_id(4) 
time_ns(109660011961286:109660011994288) + :CopyHostToDevice : correlation_id(5) time_ns(109660012053439:109660012057599) + :hipMemcpy : correlation_id(5) time_ns(109660012029645:109660012062688) + :CopyHostToDevice : correlation_id(6) time_ns(109660012096639:109660012100799) + :hipMemcpy : correlation_id(6) time_ns(109660012073037:109660012105278) + :CopyHostToDevice : correlation_id(7) time_ns(109660012138239:109660012142879) + :hipMemcpy : correlation_id(7) time_ns(109660012114796:109660012147087) + :CopyHostToDevice : correlation_id(8) time_ns(109660012180158:109660012184478) + :hipMemcpy : correlation_id(8) time_ns(109660012156274:109660012188795) + :CopyHostToDevice : correlation_id(9) time_ns(109660012221438:109660012225758) + :hipMemcpy : correlation_id(9) time_ns(109660012198213:109660012230234) + :CopyHostToDevice : correlation_id(10) time_ns(109660012262398:109660012266878) + :hipMemcpy : correlation_id(10) time_ns(109660012239211:109660012271171) diff --git a/test/golden_traces/tests_trace_cmp_levels.txt b/test/golden_traces/tests_trace_cmp_levels.txt index c224382e..f6ceb02c 100644 --- a/test/golden_traces/tests_trace_cmp_levels.txt +++ b/test/golden_traces/tests_trace_cmp_levels.txt @@ -14,9 +14,11 @@ MatrixTranspose_hip_input_trace --check-events .* copy_hsa_trace --check-events .* copy_hsa_input_trace --check-events .* load_unload_reload_trace --check-order .* --ignore-count hsa_agent_get_info -hsa_co_trace --check-none code_obj_trace --check-none trace_buffer --check-none memory_pool --check-none +activity_and_callback_trace --check-order .* +multi_pool_activities_trace --check-order .* roctx_test_trace --check-count .* -backward_compat_test_trace --check-none \ No newline at end of file +backward_compat_test_trace --check-none +dlopen --check-none \ No newline at end of file diff --git a/test/hip/MatrixTranspose.cpp b/test/hip/MatrixTranspose.cpp index 7f19fe99..b96f2b80 100644 --- a/test/hip/MatrixTranspose.cpp +++ b/test/hip/MatrixTranspose.cpp 
@@ -44,9 +44,6 @@ #define THREADS_PER_BLOCK_Y 4 #define THREADS_PER_BLOCK_Z 1 -// Mark API -extern "C" void roctracer_mark(const char* str); - // Device (Kernel) function, it must be void __global__ void matrixTranspose(float* out, float* in, const int width) { int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; @@ -100,7 +97,6 @@ int main() { // Memory transfer from host to device HIP_CALL(hipMemcpy(gpuMatrix, Matrix, NUM * sizeof(float), hipMemcpyHostToDevice)); - roctracer_mark("before HIP LaunchKernel"); roctxMark("before hipLaunchKernel"); int rangeId = roctxRangeStart("hipLaunchKernel range"); roctxRangePush("hipLaunchKernel"); @@ -108,7 +104,6 @@ int main() { hipLaunchKernelGGL( matrixTranspose, dim3(WIDTH / THREADS_PER_BLOCK_X, WIDTH / THREADS_PER_BLOCK_Y), dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, 0, gpuTransposeMatrix, gpuMatrix, WIDTH); - roctracer_mark("after HIP LaunchKernel"); roctxMark("after hipLaunchKernel"); // Memory transfer from device to host diff --git a/test/run.sh b/test/run.sh index 5a5ed005..7cd1cbd5 100755 --- a/test/run.sh +++ b/test/run.sh @@ -62,13 +62,16 @@ xeval_test() { test_number=$test_number } -eval_test() { - bright=$(tput bold) - red=$(tput setaf 1) - green=$(tput setaf 2) - blue=$(tput setaf 4) - normal=$(tput sgr0) +ncolors=$(tput colors || echo 0) +if [ -n "$ncolors" ] && [ $ncolors -ge 8 ]; then + bright="$(tput bold || echo)" + red="$(tput setaf 1 || echo)" + green="$(tput setaf 2 || echo)" + blue="$(tput setaf 4 || echo)" + normal="$(tput sgr0 || echo)" +fi +eval_test() { label=$1 cmdline=$2 test_name=$3 @@ -95,9 +98,9 @@ eval_test() { fi fi if [ $is_failed = 0 ] ; then - echo "${bright}${blue}$test_name: ${green}PASSED${normal}" + echo "${bright:-}${blue:-}$test_name: ${green:-}PASSED${normal:-}" else - echo "${bright}${blue}$test_name: ${red}FAILED${normal}" + echo "${bright:-}${blue:-}$test_name: ${red:-}FAILED${normal:-}" failed_tests="$failed_tests\n $test_number: $test_name - \"$label\"" 
test_status=$(($test_status + 1)) fi @@ -166,21 +169,21 @@ unset ROCP_INPUT # Check that the tracer tool can be unloaded and then reloaded. eval_test "Load/Unload/Reload the tracer tool" ./test/load_unload_reload_test load_unload_reload_trace -export HSA_TOOLS_LIB="./test/libhsaco_test.so" -eval_test "tool HSA codeobj" ./test/MatrixTranspose hsa_co_trace - -export ROCP_TOOL_LIB=./test/libcodeobj_test.so -export HSA_TOOLS_LIB="librocprofiler64.so" +export LD_PRELOAD=./test/libcodeobj_test.so eval_test "tool tracer codeobj" ./test/MatrixTranspose code_obj_trace +unset LD_PRELOAD #valgrind --leak-check=full $tbin #valgrind --tool=massif $tbin #ms_print massif.out. eval_test "directed TraceBuffer test" ./test/trace_buffer trace_buffer eval_test "directed MemoryPool test" ./test/memory_pool memory_pool +eval_test "enable/disable callbacks and activities test" ./test/activity_and_callback activity_and_callback_trace +eval_test "use multiple memory pools in HIP activities test" ./test/multi_pool_activities multi_pool_activities_trace +eval_test "Dynamically load the tracer library test" ./test/dlopen dlopen -eval_test "backward compatibilty tests" ./test/backward_compat_test backward_compat_test_trace +eval_test "backward compatibility tests" ./test/backward_compat_test backward_compat_test_trace echo "$test_number tests total / $test_runnum tests run / $test_status tests failed" if [ $test_status != 0 ] ; then From 8b7434ec2d88e0c3192ecf73c771997dc79bc78b Mon Sep 17 00:00:00 2001 From: Laurent Morichetti Date: Fri, 14 Oct 2022 09:52:18 -0700 Subject: [PATCH 42/47] SWDEV-362165 - Escape strings in the API function's arguments Strings ([const] char *, [const] char[]) passed as arguments to API functions may not always contain printable characters. All string arguments should be quoted and escaped in the trace logs. 
Change-Id: Ie39058f2190048b1a0090df16d9ac6bc6507e28a (cherry picked from commit b556f8681e30e942c79ed34516ef1d4d918e08e4) --- plugin/file/file.cpp | 6 +++--- script/gen_ostream_ops.py | 32 ++++++++++++++++++++++++++++---- 2 files changed, 31 insertions(+), 7 deletions(-) diff --git a/plugin/file/file.cpp b/plugin/file/file.cpp index c0d78b13..fc68a6e7 100644 --- a/plugin/file/file.cpp +++ b/plugin/file/file.cpp @@ -243,8 +243,8 @@ class file_plugin_t { << ((record->op == HSA_API_ID_hsa_shut_down) ? record->begin_ns : record->end_ns) << " " << record->process_id << ":" << record->thread_id << " " - << hsa_api_data_pair_t(record->op, *data) << " :" << data->correlation_id - << std::endl; + << hsa_api_data_pair_t(record->op, *data) << " :" << std::dec + << data->correlation_id << std::endl; break; } case ACTIVITY_DOMAIN_HIP_API: { @@ -265,7 +265,7 @@ class file_plugin_t { *output_file << std::dec << record->begin_ns << ":" << record->end_ns << " " << record->process_id << ":" << record->thread_id << " " << hipApiString((hip_api_id_t)record->op, data) << kernel_name << " :" - << data->correlation_id << std::endl; + << std::dec << data->correlation_id << std::endl; break; } default: diff --git a/script/gen_ostream_ops.py b/script/gen_ostream_ops.py index b9a52a30..ec762c4e 100755 --- a/script/gen_ostream_ops.py +++ b/script/gen_ostream_ops.py @@ -53,7 +53,20 @@ header_basic = \ 'namespace detail {\n' + \ -'template \n' + \ +' inline static void print_escaped_string(std::ostream& out, const char *v, size_t len) {\n' + \ +' out << \'"\'; \n' + \ +' for (size_t i = 0; i < len && v[i]; ++i) {\n' + \ +' if (std::isprint((unsigned char)v[i])) std::operator<<(out, v[i]);\n' + \ +' else {\n' + \ +' std::ios_base::fmtflags flags(out.flags());\n' + \ +' out << "\\\\x" << std::setfill(\'0\') << std::setw(2) << std::hex << (unsigned int)(unsigned char)v[i];\n' + \ +' out.flags(flags);\n' + \ +' }\n' + \ +' }\n' + \ +' out << \'"\'; \n' + \ +' }\n' + \ +'\n' + \ +' template \n' + \ 
' inline static std::ostream& operator<<(std::ostream& out, const T& v) {\n' + \ ' using std::operator<<;\n' + \ ' static bool recursion = false;\n' + \ @@ -66,6 +79,15 @@ '\n' + \ ' inline static std::ostream &operator<<(std::ostream &out, const char &v) {\n' + \ ' out << (unsigned char)v;\n' + \ +' return out;\n }\n' + \ +'\n' + \ +' template \n' + \ +' inline static std::ostream &operator<<(std::ostream &out, const char (&v)[N]) {\n' + \ +' print_escaped_string(out, v, N);\n' + \ +' return out;\n }\n' + \ +'\n' + \ +' inline static std::ostream &operator<<(std::ostream &out, const char *v) {\n' + \ +' print_escaped_string(out, v, strlen(v));\n' + \ ' return out;\n }\n' structs_analyzed = {} @@ -120,9 +142,9 @@ def process_struct(file_handle, cppHeader_struct, cppHeader, parent_hier_name, a indent = "" str += " if (std::string(\"" + cppHeader_struct + "::" + name + "\").find(" + apiname.upper() + "_structs_regex" + ") != std::string::npos) {\n" indent = " " - str += indent + " roctracer::" + apiname.lower() + "_support::detail::operator<<(out, \"" + name + "=\");\n" + str += indent + " std::operator<<(out, \"" + name + "=\");\n" str += indent + " roctracer::" + apiname.lower() + "_support::detail::operator<<(out, v." 
+ name + ");\n" - str += indent + " roctracer::" + apiname.lower() + "_support::detail::operator<<(out, \", \");\n" + str += indent + " std::operator<<(out, \", \");\n" str += " }\n" if "void" not in mtype: global_str += str @@ -166,7 +188,9 @@ def gen_cppheader(infilepath, outfilepath, rank): '\n' + \ '#ifdef __cplusplus\n' + \ '#include \n' + \ - '#include \n' + '#include \n' + \ + '#include \n' + \ + '#include \n' output_filename_h.write(header_s) output_filename_h.write('\n') From c7ae4795b2db0dd1593f4ced2fe50d73d2168d01 Mon Sep 17 00:00:00 2001 From: Laurent Morichetti Date: Fri, 14 Oct 2022 09:52:18 -0700 Subject: [PATCH 43/47] SWDEV-362165 - Escape strings in the API function's arguments Strings ([const] char *, [const] char[]) passed as arguments to API functions may not always contain printable characters. All string arguments should be quoted and escaped in the trace logs. Change-Id: Ie39058f2190048b1a0090df16d9ac6bc6507e28a (cherry picked from commit b556f8681e30e942c79ed34516ef1d4d918e08e4) --- plugin/file/file.cpp | 6 +++--- script/gen_ostream_ops.py | 32 ++++++++++++++++++++++++++++---- 2 files changed, 31 insertions(+), 7 deletions(-) diff --git a/plugin/file/file.cpp b/plugin/file/file.cpp index c0d78b13..fc68a6e7 100644 --- a/plugin/file/file.cpp +++ b/plugin/file/file.cpp @@ -243,8 +243,8 @@ class file_plugin_t { << ((record->op == HSA_API_ID_hsa_shut_down) ? 
record->begin_ns : record->end_ns) << " " << record->process_id << ":" << record->thread_id << " " - << hsa_api_data_pair_t(record->op, *data) << " :" << data->correlation_id - << std::endl; + << hsa_api_data_pair_t(record->op, *data) << " :" << std::dec + << data->correlation_id << std::endl; break; } case ACTIVITY_DOMAIN_HIP_API: { @@ -265,7 +265,7 @@ class file_plugin_t { *output_file << std::dec << record->begin_ns << ":" << record->end_ns << " " << record->process_id << ":" << record->thread_id << " " << hipApiString((hip_api_id_t)record->op, data) << kernel_name << " :" - << data->correlation_id << std::endl; + << std::dec << data->correlation_id << std::endl; break; } default: diff --git a/script/gen_ostream_ops.py b/script/gen_ostream_ops.py index b9a52a30..ec762c4e 100755 --- a/script/gen_ostream_ops.py +++ b/script/gen_ostream_ops.py @@ -53,7 +53,20 @@ header_basic = \ 'namespace detail {\n' + \ -'template \n' + \ +' inline static void print_escaped_string(std::ostream& out, const char *v, size_t len) {\n' + \ +' out << \'"\'; \n' + \ +' for (size_t i = 0; i < len && v[i]; ++i) {\n' + \ +' if (std::isprint((unsigned char)v[i])) std::operator<<(out, v[i]);\n' + \ +' else {\n' + \ +' std::ios_base::fmtflags flags(out.flags());\n' + \ +' out << "\\\\x" << std::setfill(\'0\') << std::setw(2) << std::hex << (unsigned int)(unsigned char)v[i];\n' + \ +' out.flags(flags);\n' + \ +' }\n' + \ +' }\n' + \ +' out << \'"\'; \n' + \ +' }\n' + \ +'\n' + \ +' template \n' + \ ' inline static std::ostream& operator<<(std::ostream& out, const T& v) {\n' + \ ' using std::operator<<;\n' + \ ' static bool recursion = false;\n' + \ @@ -66,6 +79,15 @@ '\n' + \ ' inline static std::ostream &operator<<(std::ostream &out, const char &v) {\n' + \ ' out << (unsigned char)v;\n' + \ +' return out;\n }\n' + \ +'\n' + \ +' template \n' + \ +' inline static std::ostream &operator<<(std::ostream &out, const char (&v)[N]) {\n' + \ +' print_escaped_string(out, v, N);\n' + \ +' return out;\n 
}\n' + \ +'\n' + \ +' inline static std::ostream &operator<<(std::ostream &out, const char *v) {\n' + \ +' print_escaped_string(out, v, strlen(v));\n' + \ ' return out;\n }\n' structs_analyzed = {} @@ -120,9 +142,9 @@ def process_struct(file_handle, cppHeader_struct, cppHeader, parent_hier_name, a indent = "" str += " if (std::string(\"" + cppHeader_struct + "::" + name + "\").find(" + apiname.upper() + "_structs_regex" + ") != std::string::npos) {\n" indent = " " - str += indent + " roctracer::" + apiname.lower() + "_support::detail::operator<<(out, \"" + name + "=\");\n" + str += indent + " std::operator<<(out, \"" + name + "=\");\n" str += indent + " roctracer::" + apiname.lower() + "_support::detail::operator<<(out, v." + name + ");\n" - str += indent + " roctracer::" + apiname.lower() + "_support::detail::operator<<(out, \", \");\n" + str += indent + " std::operator<<(out, \", \");\n" str += " }\n" if "void" not in mtype: global_str += str @@ -166,7 +188,9 @@ def gen_cppheader(infilepath, outfilepath, rank): '\n' + \ '#ifdef __cplusplus\n' + \ '#include \n' + \ - '#include \n' + '#include \n' + \ + '#include \n' + \ + '#include \n' output_filename_h.write(header_s) output_filename_h.write('\n') From 2f5c8d9a5a73611fa9eeb46a4e9783cb364aa8c2 Mon Sep 17 00:00:00 2001 From: Laurent Morichetti Date: Tue, 18 Oct 2022 20:06:44 -0700 Subject: [PATCH 44/47] Report HSA_OPS activities using the ROCr driver_node_id instead of the device's index When multiple ranks are used, each rank's first logical device always has GPU ID 0, regardless of which physical device is selected with CUDA_VISIBLE_DEVICES. Because of this, when merging trace files from multiple ranks, GPU IDs from different processes may overlap. The long term solution is to use the KFD's gpu_id which is stable across APIs and processes. Unfortunately the gpu_id is not yet exposed by the ROCr, so for now use the driver's node id. 
Change-Id: I2f5af8d2a7e8a89efeb5e0a1b86bdfa547b25fc8 (cherry picked from commit 799f0323cdaf5d020c9e8f8b24b2cd7e5571539f) --- src/roctracer/hsa_support.cpp | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/src/roctracer/hsa_support.cpp b/src/roctracer/hsa_support.cpp index d6bfa7b4..31153987 100644 --- a/src/roctracer/hsa_support.cpp +++ b/src/roctracer/hsa_support.cpp @@ -61,7 +61,7 @@ AmdExtTable saved_amd_ext_api{}; hsa_ven_amd_loader_1_01_pfn_t hsa_loader_api{}; struct AgentInfo { - int index; + uint32_t id; hsa_device_type_t type; }; std::unordered_map agent_info_map; @@ -275,7 +275,7 @@ hsa_status_t MemoryPoolAllocateIntercept(hsa_amd_memory_pool_t pool, size_t size hsa_evt_data_t data{}; data.device.type = it->second.type; - data.device.id = it->second.index; + data.device.id = it->second.id; data.device.agent = agent; data.device.ptr = ptr; @@ -314,7 +314,7 @@ hsa_status_t AgentsAllowAccessIntercept(uint32_t num_agents, const hsa_agent_t* hsa_evt_data_t data{}; data.device.type = it->second.type; - data.device.id = it->second.index; + data.device.id = it->second.id; data.device.agent = agent; data.device.ptr = ptr; @@ -540,15 +540,20 @@ void Initialize(HsaApiTable* table) { switch (agent_info.type) { case HSA_DEVICE_TYPE_CPU: static int cpu_agent_count = 0; - agent_info.index = cpu_agent_count++; - break; - case HSA_DEVICE_TYPE_GPU: - static int gpu_agent_count = 0; - agent_info.index = gpu_agent_count++; + agent_info.id = cpu_agent_count++; break; + case HSA_DEVICE_TYPE_GPU: { + uint32_t driver_node_id; + if (hsa_support::saved_core_api.hsa_agent_get_info_fn( + agent, static_cast(HSA_AMD_AGENT_INFO_DRIVER_NODE_ID), + &driver_node_id) != HSA_STATUS_SUCCESS) + fatal("hsa_agent_get_info failed"); + + agent_info.id = driver_node_id; + } break; default: static int other_agent_count = 0; - agent_info.index = other_agent_count++; + agent_info.id = other_agent_count++; break; } 
hsa_support::agent_info_map.emplace(agent.handle, agent_info); From 080875fbe0c06f328fcd519063ad4ba1c1106c23 Mon Sep 17 00:00:00 2001 From: Ranjith Ramakrishnan Date: Mon, 8 May 2023 16:16:36 -0700 Subject: [PATCH 45/47] SWDEV-383221 - Set the default value of ROCM_HEADER_WRAPPER_WERROR to OFF Using wrapper header files will result in #warning message by default Change-Id: Ib8a05d11f2391dfcdac8601da26e1096821cd555 --- src/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 80598d24..1bf88004 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -253,7 +253,7 @@ if(FILE_REORG_BACKWARD_COMPATIBILITY) set(ROCM_HEADER_WRAPPER_WERROR "$ENV{ROCM_HEADER_WRAPPER_WERROR}" CACHE STRING "Header wrapper warnings as errors.") else() - set(ROCM_HEADER_WRAPPER_WERROR "ON" CACHE STRING "Header wrapper warnings as errors.") + set(ROCM_HEADER_WRAPPER_WERROR "OFF" CACHE STRING "Header wrapper warnings as errors.") endif() endif() if(ROCM_HEADER_WRAPPER_WERROR) From 4a788c92d6a3087a4f0e18ba95083341335bc219 Mon Sep 17 00:00:00 2001 From: gobhardw Date: Fri, 12 May 2023 20:29:22 +0530 Subject: [PATCH 46/47] SWDEV-398161, SWDEV-398764 Fixed hsa-trace failures for profiling data corrupted Change-Id: I3d8dbb2a40d948cd06cb1278acc50dc5be4ca0ef (cherry picked from commit ee713682a11fbebaf61c1cc7fced2cb5aac8e32a) --- src/roctracer/hsa_support.cpp | 44 +++++++++++++++++++++++++++++++---- 1 file changed, 39 insertions(+), 5 deletions(-) diff --git a/src/roctracer/hsa_support.cpp b/src/roctracer/hsa_support.cpp index 31153987..c5ca4640 100644 --- a/src/roctracer/hsa_support.cpp +++ b/src/roctracer/hsa_support.cpp @@ -27,6 +27,7 @@ #include "roctracer.h" #include "roctracer_hsa.h" +#include #include #include #include @@ -415,11 +416,13 @@ hsa_status_t ExecutableDestroyIntercept(hsa_executable_t executable) { return saved_core_api.hsa_executable_destroy_fn(executable); } -bool profiling_async_copy_enable = false; 
+std::atomic profiling_async_copy_enable{false}; hsa_status_t ProfilingAsyncCopyEnableIntercept(bool enable) { hsa_status_t status = saved_amd_ext_api.hsa_amd_profiling_async_copy_enable_fn(enable); - if (status == HSA_STATUS_SUCCESS) profiling_async_copy_enable = enable; + if (status == HSA_STATUS_SUCCESS) { + profiling_async_copy_enable.exchange(enable, std::memory_order_release); + } return status; } @@ -434,6 +437,36 @@ void MemoryASyncCopyHandler(const Tracker::entry_t* entry) { ReportActivity(ACTIVITY_DOMAIN_HSA_OPS, HSA_OP_ID_COPY, &record); } +hsa_status_t MemoryASyncCopyOnEngineIntercept( + void* dst, hsa_agent_t dst_agent, const void* src, hsa_agent_t src_agent, size_t size, + uint32_t num_dep_signals, const hsa_signal_t* dep_signals, hsa_signal_t completion_signal, + hsa_amd_sdma_engine_id_t engine_id, bool force_copy_on_sdma) { + bool is_enabled = IsEnabled(ACTIVITY_DOMAIN_HSA_OPS, HSA_OP_ID_COPY); + + // FIXME: what happens if the state changes before returning? + [[maybe_unused]] hsa_status_t status = saved_amd_ext_api.hsa_amd_profiling_async_copy_enable_fn( + profiling_async_copy_enable.load(std::memory_order_relaxed) || is_enabled); + assert(status == HSA_STATUS_SUCCESS && "hsa_amd_profiling_async_copy_enable failed"); + + if (!is_enabled) { + return saved_amd_ext_api.hsa_amd_memory_async_copy_on_engine_fn( + dst, dst_agent, src, src_agent, size, num_dep_signals, dep_signals, completion_signal, + engine_id, force_copy_on_sdma); + } + + Tracker::entry_t* entry = new Tracker::entry_t(); + entry->handler = MemoryASyncCopyHandler; + entry->correlation_id = CorrelationId(); + Tracker::Enable(Tracker::COPY_ENTRY_TYPE, hsa_agent_t{}, completion_signal, entry); + + status = saved_amd_ext_api.hsa_amd_memory_async_copy_on_engine_fn( + dst, dst_agent, src, src_agent, size, num_dep_signals, dep_signals, entry->signal, engine_id, + force_copy_on_sdma); + if (status != HSA_STATUS_SUCCESS) Tracker::Disable(entry); + + return status; +} + hsa_status_t 
MemoryASyncCopyIntercept(void* dst, hsa_agent_t dst_agent, const void* src, hsa_agent_t src_agent, size_t size, uint32_t num_dep_signals, const hsa_signal_t* dep_signals, @@ -442,7 +475,7 @@ hsa_status_t MemoryASyncCopyIntercept(void* dst, hsa_agent_t dst_agent, const vo // FIXME: what happens if the state changes before returning? [[maybe_unused]] hsa_status_t status = saved_amd_ext_api.hsa_amd_profiling_async_copy_enable_fn( - profiling_async_copy_enable | is_enabled); + profiling_async_copy_enable.load(std::memory_order_relaxed) || is_enabled); assert(status == HSA_STATUS_SUCCESS && "hsa_amd_profiling_async_copy_enable failed"); if (!is_enabled) { @@ -473,7 +506,7 @@ hsa_status_t MemoryASyncCopyRectIntercept(const hsa_pitched_ptr_t* dst, // FIXME: what happens if the state changes before returning? [[maybe_unused]] hsa_status_t status = saved_amd_ext_api.hsa_amd_profiling_async_copy_enable_fn( - profiling_async_copy_enable | is_enabled); + profiling_async_copy_enable.load(std::memory_order_relaxed) || is_enabled); assert(status == HSA_STATUS_SUCCESS && "hsa_amd_profiling_async_copy_enable failed"); if (!is_enabled) { @@ -570,6 +603,7 @@ void Initialize(HsaApiTable* table) { // Install the HSA_OPS intercept table->amd_ext_->hsa_amd_memory_async_copy_fn = MemoryASyncCopyIntercept; table->amd_ext_->hsa_amd_memory_async_copy_rect_fn = MemoryASyncCopyRectIntercept; + table->amd_ext_->hsa_amd_memory_async_copy_on_engine_fn = MemoryASyncCopyOnEngineIntercept; table->amd_ext_->hsa_amd_profiling_async_copy_enable_fn = ProfilingAsyncCopyEnableIntercept; // Install the HSA_EVT intercept @@ -590,7 +624,7 @@ void Initialize(HsaApiTable* table) { void Finalize() { if (hsa_status_t status = - saved_amd_ext_api.hsa_amd_profiling_async_copy_enable_fn(profiling_async_copy_enable); + saved_amd_ext_api.hsa_amd_profiling_async_copy_enable_fn(profiling_async_copy_enable.load(std::memory_order_relaxed)); status != HSA_STATUS_SUCCESS) assert(!"hsa_amd_profiling_async_copy_enable 
failed"); From 166d3d43a5aebbd39f63d6c65d3509bdd32d058b Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Sun, 30 Jul 2023 05:36:17 -0700 Subject: [PATCH 47/47] Simplify finding the hip package On Fedora, where hip is installed as an rpm, its cmake files can not be found and are reported as an error. CMake Error at test/CMakeLists.txt:32 (find_package): No "FindHIP.cmake" found in CMAKE_MODULE_PATH. This change treats hip as a the normal package. Signed-off-by: Tom Rix --- test/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index c7c5903b..3b5d14ce 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -29,7 +29,7 @@ set(CMAKE_EXECUTABLE_RPATH_LINK_HIP_FLAG ${CMAKE_SHARED_LIBRARY_RPATH_LINK_CXX_F set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${ROCM_PATH}/lib/cmake/hip") set(CMAKE_HIP_ARCHITECTURES OFF) -find_package(HIP REQUIRED MODULE) +find_package(HIP REQUIRED) find_package(Clang REQUIRED CONFIG PATHS "${ROCM_PATH}"