diff --git a/CMakeLists.txt b/CMakeLists.txt index f1ad1982..f5c1d73c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -100,12 +100,15 @@ set ( PUBLIC_HEADERS roctracer_kfd.h roctracer_roctx.h roctracer_cb_table.h + ext/prof_protocol.h + ext/hsa_rt_utils.hpp +) +set ( GEN_HEADERS hip_ostream_ops.h hsa_prof_str.h - kfd_ostream_ops.h + hsa_ostream_ops.h kfd_prof_str.h - ext/prof_protocol.h - ext/hsa_rt_utils.hpp + kfd_ostream_ops.h ) if ( ${LIBRARY_TYPE} STREQUAL SHARED ) @@ -137,6 +140,11 @@ foreach ( header ${PUBLIC_HEADERS} ) install ( FILES ${CMAKE_CURRENT_SOURCE_DIR}/inc/${header} DESTINATION ${DEST_NAME}/include/${header_subdir} ) install ( FILES ${CMAKE_CURRENT_SOURCE_DIR}/inc/${header} DESTINATION include/${DEST_NAME}/${header_subdir} ) endforeach () +foreach ( header ${GEN_HEADERS} ) + get_filename_component ( header_subdir ${header} DIRECTORY ) + install ( FILES ${PROJECT_BINARY_DIR}/inc/${header} DESTINATION ${DEST_NAME}/include/${header_subdir} ) + install ( FILES ${PROJECT_BINARY_DIR}/inc/${header} DESTINATION include/${DEST_NAME}/${header_subdir} ) +endforeach () #install ( FILES ${PROJECT_BINARY_DIR}/inc-link DESTINATION include RENAME ${DEST_NAME} ) install ( FILES ${PROJECT_BINARY_DIR}/so-link DESTINATION lib RENAME ${ROCTRACER_LIBRARY}.so ) install ( FILES ${PROJECT_BINARY_DIR}/so-major-link DESTINATION lib RENAME ${ROCTRACER_LIBRARY}.so.${LIB_VERSION_MAJOR} ) @@ -169,18 +177,49 @@ install ( TARGETS "kfdwrapper64" LIBRARY DESTINATION ${DEST_NAME}/lib ) ## Packaging directives set ( CPACK_GENERATOR "DEB" "RPM" "TGZ" ) set ( CPACK_PACKAGE_NAME "${ROCTRACER_NAME}-dev" ) -set ( CPACK_PACKAGE_VENDOR "AMD" ) +set ( CPACK_PACKAGE_VENDOR "Advanced Micro Devices, Inc." ) set ( CPACK_PACKAGE_VERSION_MAJOR ${BUILD_VERSION_MAJOR} ) set ( CPACK_PACKAGE_VERSION_MINOR ${BUILD_VERSION_MINOR} ) set ( CPACK_PACKAGE_VERSION_PATCH ${BUILD_VERSION_PATCH} ) -set ( CPACK_PACKAGE_CONTACT "Advanced Micro Devices Inc." ) +set ( CPACK_PACKAGE_VERSION "${CPACK_PACKAGE_VERSION_MAJOR}.${CPACK_PACKAGE_VERSION_MINOR}.${CPACK_PACKAGE_VERSION_PATCH}" ) +if ( DEFINED ENV{ROCM_LIBPATCH_VERSION} ) + set ( CPACK_PACKAGE_VERSION "${CPACK_PACKAGE_VERSION}.$ENV{ROCM_LIBPATCH_VERSION}" ) +endif() +message ( "-- CPACK_PACKAGE_VERSION: ${CPACK_PACKAGE_VERSION}" ) +set ( CPACK_PACKAGE_CONTACT "TODO " ) set ( CPACK_PACKAGE_DESCRIPTION_SUMMARY "AMD ROCTRACER library" ) set ( CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE" ) ## Debian package specific variables +if ( DEFINED ENV{CPACK_DEBIAN_PACKAGE_RELEASE} ) + set ( CPACK_DEBIAN_PACKAGE_RELEASE $ENV{CPACK_DEBIAN_PACKAGE_RELEASE} ) +else() + set ( CPACK_DEBIAN_PACKAGE_RELEASE "local" ) +endif() +message ( "Using CPACK_DEBIAN_PACKAGE_RELEASE ${CPACK_DEBIAN_PACKAGE_RELEASE}" ) +set ( CPACK_DEBIAN_FILE_NAME "DEB-DEFAULT" ) set ( CPACK_DEBIAN_PACKAGE_CONTROL_EXTRA "${CMAKE_CURRENT_SOURCE_DIR}/DEBIAN/postinst;${CMAKE_CURRENT_SOURCE_DIR}/DEBIAN/prerm" ) ## RPM package specific variables +if ( DEFINED ENV{CPACK_RPM_PACKAGE_RELEASE} ) + set ( CPACK_RPM_PACKAGE_RELEASE $ENV{CPACK_RPM_PACKAGE_RELEASE} ) +else() + set ( CPACK_RPM_PACKAGE_RELEASE "local" ) +endif() +message ( "Using CPACK_RPM_PACKAGE_RELEASE ${CPACK_RPM_PACKAGE_RELEASE}" ) + +## 'dist' breaks manual builds on debian systems due to empty Provides +execute_process( COMMAND rpm --eval %{?dist} + RESULT_VARIABLE PROC_RESULT + OUTPUT_VARIABLE EVAL_RESULT + OUTPUT_STRIP_TRAILING_WHITESPACE ) +message("RESULT_VARIABLE ${PROC_RESULT} OUTPUT_VARIABLE: ${EVAL_RESULT}") + +if ( PROC_RESULT EQUAL "0" AND NOT EVAL_RESULT STREQUAL "" ) + string ( APPEND CPACK_RPM_PACKAGE_RELEASE "%{?dist}" ) +endif() +set ( CPACK_RPM_FILE_NAME "RPM-DEFAULT" ) +message("CPACK_RPM_PACKAGE_RELEASE: ${CPACK_RPM_PACKAGE_RELEASE}") set ( CPACK_RPM_POST_INSTALL_SCRIPT_FILE "${CMAKE_CURRENT_SOURCE_DIR}/RPM/rpm_post" ) set ( CPACK_RPM_POST_UNINSTALL_SCRIPT_FILE "${CMAKE_CURRENT_SOURCE_DIR}/RPM/rpm_postun" ) diff --git a/README.md b/README.md index 8f3e8481..5dce1e8d 100644 --- a/README.md +++ b/README.md @@ -57,9 +57,8 @@ rocTX API: - Set environment: export CMAKE_PREFIX_PATH=/opt/rocm - - To use custom HIP/HCC versions: + - To use custom HIP version: export HIP_PATH=/opt/rocm/hip - export HCC_HOME=/opt/rocm/hcc - To build roctracer library: export CMAKE_BUILD_TYPE= # release by default @@ -74,3 +73,8 @@ rocTX API: or make package && dpkg -i *.deb ``` + +## Known Issues: +- For workloads where the hip application might make more than 10 million HIP API calls, the application might crash with the error - "Profiling data corrupted" + - Suggested Workaround - Instead of profiling for the complete run, it is suggested to run profiling in parts by using the --trace-period option. +- OpenMP applications are not fully supported by the roctracer. diff --git a/build.sh b/build.sh index a8515b69..a5201275 100755 --- a/build.sh +++ b/build.sh @@ -1,4 +1,4 @@ -#!/bin/bash -x +#!/bin/bash -e SRC_DIR=`dirname $0` COMPONENT="roctracer" ROCM_PATH="${ROCM_PATH:=/opt/rocm}" @@ -17,7 +17,6 @@ if [ -e "$DEFAULTS" ] ; then source "$DEFAULTS"; fi if [ -z "$ROCTRACER_ROOT" ]; then ROCTRACER_ROOT=$SRC_DIR; fi if [ -z "$BUILD_DIR" ] ; then BUILD_DIR=$PWD; fi if [ -z "$HIP_PATH" ] ; then export HIP_PATH="$ROCM_PATH/hip"; fi -if [ -z "$HCC_HOME" ] ; then export HCC_HOME="$ROCM_PATH/hcc"; fi if [ -z "$BUILD_TYPE" ] ; then BUILD_TYPE="release"; fi if [ -z "$PACKAGE_ROOT" ] ; then PACKAGE_ROOT=$ROCM_PATH; fi if [ -z "$PACKAGE_PREFIX" ] ; then PACKAGE_PREFIX="$ROCM_PATH/$COMPONENT"; fi diff --git a/build_static.sh b/build_static.sh index 938df3ce..bb6ecf29 100755 --- a/build_static.sh +++ b/build_static.sh @@ -17,7 +17,6 @@ if [ -e "$DEFAULTS" ] ; then source "$DEFAULTS"; fi if [ -z "$ROCTRACER_ROOT" ]; then ROCTRACER_ROOT=$SRC_DIR; fi if [ -z "$BUILD_DIR" ] ; then BUILD_DIR=$PWD; fi if [ -z "$HIP_PATH" ] ; then export HIP_PATH="$ROCM_PATH/hip"; fi -if [ -z "$HCC_HOME" ] ; then export HCC_HOME="$ROCM_PATH/hcc"; fi if [ -z "$BUILD_TYPE" ] ; then BUILD_TYPE="release"; fi if [ -z "$PACKAGE_ROOT" ] ; then PACKAGE_ROOT=$ROCM_PATH; fi if [ -z "$PACKAGE_PREFIX" ] ; then PACKAGE_PREFIX="$ROCM_PATH/$COMPONENT"; fi diff --git a/cmake_modules/env.cmake b/cmake_modules/env.cmake index 8dbf2c9c..3f5dec60 100644 --- a/cmake_modules/env.cmake +++ b/cmake_modules/env.cmake @@ -53,6 +53,11 @@ set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC" ) set ( CMAKE_SHARED_LINKER_FLAGS "-Wl,-Bdynamic -Wl,-z,noexecstack" ) +## Set RUNPATH if ROCM_RPATH is defined and passed by the environment +if ( DEFINED ROCM_RPATH ) + set ( CMAKE_SHARED_LINKER_FLAGS " -Wl,--enable-new-dtags -Wl,--rpath,${ROCM_RPATH} ${CMAKE_SHARED_LINKER_FLAGS}" ) +endif () + set ( CMAKE_SKIP_BUILD_RPATH TRUE ) ## CLANG options @@ -61,8 +66,11 @@ if ( "$ENV{CXX}" STREQUAL "/usr/bin/clang++" ) endif() ## Enable debug trace +if ( DEFINED CMAKE_DEBUG_TRACE ) + add_definitions ( -DDEBUG_TRACE_ON=1 ) +endif() if ( DEFINED ENV{CMAKE_DEBUG_TRACE} ) - add_definitions ( -DDEBUG_TRACE=1 ) + add_definitions ( -DDEBUG_TRACE_ON=1 ) endif() if ( NOT DEFINED LIBRARY_TYPE ) @@ -89,7 +97,7 @@ else() set ( HIP_DEFINES "-D__HIP_PLATFORM_HCC__=1") endif() -## Enable HIP/HCC local build +## Enable HIP local build if ( DEFINED LOCAL_BUILD ) add_definitions ( -DLOCAL_BUILD=${LOCAL_BUILD} ) else() @@ -114,15 +122,10 @@ if ( NOT DEFINED CMAKE_PREFIX_PATH AND DEFINED ENV{CMAKE_PREFIX_PATH} ) endif() set ( ENV{CMAKE_PREFIX_PATH} ${CMAKE_PREFIX_PATH} ) -set ( HCC_HOME "/opt/rocm/hcc" ) set ( HIP_PATH "/opt/rocm/hip" ) -if ( DEFINED ENV{HCC_HOME} ) - set ( HCC_HOME $ENV{HCC_HOME} ) -endif() if ( DEFINED ENV{HIP_PATH} ) set ( HIP_PATH $ENV{HIP_PATH} ) endif() -set ( HCC_INC_DIR "${HCC_HOME}/include" ) set ( HIP_INC_DIR "${HIP_PATH}/include" ) ## Extend Compiler flags based on build type @@ -153,10 +156,10 @@ get_filename_component ( HSA_RUNTIME_LIB_PATH "${HSA_RUNTIME_LIB}" DIRECTORY ) find_library ( HSA_KMT_LIB "libhsakmt.so" ) get_filename_component ( HSA_KMT_LIB_PATH "${HSA_KMT_LIB}" DIRECTORY ) -get_filename_component ( ROCM_ROOT_DIR "${HSA_KMT_LIB_PATH}" DIRECTORY ) - set ( HSA_KMT_INC_PATH "${HSA_KMT_LIB_PATH}/../include" ) -set ( ROCM_INC_PATH "${HSA_KMT_INC_PATH}" ) + +get_filename_component ( ROCM_ROOT_DIR "${HSA_KMT_LIB_PATH}" DIRECTORY ) +set ( ROCM_INC_PATH "${ROCM_ROOT_DIR}/include" ) ## Basic Tool Chain Information message ( "----------------NBit: ${NBIT}" ) @@ -169,8 +172,8 @@ message ( "-----HSA-Runtime-Inc: ${HSA_RUNTIME_INC_PATH}" ) message ( "-----HSA-Runtime-Lib: ${HSA_RUNTIME_LIB_PATH}" ) message ( "----HSA_KMT_LIB_PATH: ${HSA_KMT_LIB_PATH}" ) message ( "-------ROCM_ROOT_DIR: ${ROCM_ROOT_DIR}" ) +message ( "-------ROCM_INC_PATH: ${ROCM_INC_PATH}" ) message ( "-------------KFD-Inc: ${HSA_KMT_INC_PATH}" ) -message ( "-------------HCC-Inc: ${HCC_INC_DIR}" ) message ( "-------------HIP-Inc: ${HIP_INC_DIR}" ) message ( "-------------HIP-VDI: ${HIP_VDI}" ) message ( "-----CMAKE_CXX_FLAGS: ${CMAKE_CXX_FLAGS}" ) diff --git a/doc/roctracer_spec.md b/doc/roctracer_spec.md index 33666bd9..e7148eb6 100644 --- a/doc/roctracer_spec.md +++ b/doc/roctracer_spec.md @@ -3,38 +3,33 @@ ROC Tracer API version 2 ROC-TX API version 1 -- The rocTracer API is agnostic to specific runtime and may trace -the runtime API calls and asynchronous GPU activity. +- The rocTracer API is runtime-agnostic and can trace the runtime API calls and asynchronous GPU activity. - The rocTX API is provided for application code annotation. ``` ## 1. High level overview ``` -The goal of the implementation is to provide a runtime independent API +The goal of the implementation is to provide a runtime-independent API for tracing of runtime calls and asynchronous activity, like GPU kernel -dispatches and memory moves. The tracing includes callback API for +dispatches and memory move. The tracing includes callback API for runtime API tracing and activity API for asynchronous activity records logging. -Depending on particular runtime intercepting mechanism, the rocTracer -library can be dynamically linked, dynamically loaded by the runtime as -a plugin or some API wrapper can be loaded using LD_PRELOAD. +Depending on particular runtime intercepting mechanism, the rocTracer library can be dynamically linked and loaded by the runtime as +a plugin or an API wrapper can be loaded using LD_PRELOAD. The library has a C API. The rocTracer library is an API that intercepts runtime API calls and -traces asynchronous activity. The activity tracing results are recorded -in a ring buffer. +traces asynchronous activity. The activity-tracing results are recorded in a ring buffer. -The rocTX contains application code instrumentation API to support high -level correlation of runtime API/activity events. The API includes mark +RocTX contains application code instrumentation API to support high-level correlation of runtime API/activity events. The API includes mark and nested ranges. ``` ## 2. General API ### 2.1. Description ``` -The library supports method for getting the error number and error string -of the last failed library API call. It allows to check the conformance -of used library API header and the library binary, the version macros and -API methods can be used. +The library supports methods for getting the error number and error string +of the last failed library API call. It allows checking the conformance +of the used library API header, the library binary, the version macros, and the used API methods. Returning the error and error string methods: • roctracer_status_t – error code enumeration @@ -66,14 +61,14 @@ const char* roctracer_error_string(); ``` ### 2.3. Library version ``` -The library provides major and minor versions. Major version is for -incompatible API changes and minor version for bug fixes. +The library provides a major version for +incompatible API changes and a minor version for bug fixes. API version macros defined in the library API header ‘roctracer.h’: ROCTRACER_VERSION_MAJOR ROCTRACER_VERSION_MINOR -Methods to check library major and minor venison: +Methods to check library major and minor version: uint32_t roctracer_major_version(); uint32_t roctracer_minor_version(); ``` @@ -84,33 +79,26 @@ The rocTracer provides support for runtime API callbacks and activity records logging. The APIs of different runtimes at different levels are considered as different API domains with assigned domain IDs. For example, language level and driver level. The API callbacks provide -the API calls arguments and are called on two phases on “enter” and -on “exit”. The activity records are logged to the ring buffer and can +the API calls arguments and are invoked on “enter” and +on “exit” phases. The activity records are logged to the ring buffer and can be associated with the respective API calls using the correlation ID. -Activity API can be used to enable collecting of the records with +Activity API can be used to enable the collection of records with timestamping data for API calls and asynchronous activity like the -kernel submits, memory copies and barriers +kernel submits, memory copies, and barriers. Tracing domains: -• roctracer_domain_t – runtime API domains, HIP, HSA, etc… -• roctracer_op_string – Return Op string by given domain and - activity Op code -• roctracer_op_code – Return Op code and kind by given string +• roctracer_domain_t – Runtime API domains, HIP, HSA, etc. +• roctracer_op_string – Returns Op string using given domain and activity Opcode. +• roctracer_op_code – Returns Opcode using given string. Callback API: -• roctracer_rtapi_callback_t – runtime API callback type -• roctracer_enable_op_callback – enable runtime API callback - by domain and Op code -• roctracer_enable_domain_callback – enable runtime API callback - by domain for all Ops -• roctracer_enable_callback – enable runtime API callback for - all domains, all Ops -• roctracer_disable_op_callback – disable runtime API callback - by domain and Op code -• roctracer_enable_op_callback – enable runtime API callback - by domain for all Ops -• roctracer_enable_op_callback – enable runtime API callback for - all domains, all Ops +• roctracer_rtapi_callback_t – Runtime API callback type +• roctracer_enable_op_callback – Enable runtime API callback using domain and Opcode. +• roctracer_enable_domain_callback – Enable runtime API callback using the domain for all Ops. +• roctracer_enable_callback – Enable runtime API callback for all domains and all Ops. +• roctracer_disable_op_callback – Disable runtime API callback using domain and Opcode. +• roctracer_enable_op_callback – Enable runtime API callback using the domain for all Ops. +• roctracer_enable_op_callback – Enable runtime API callback for all domains and all Ops. Activity API: • roctracer_record_t – activity record @@ -132,19 +120,17 @@ Activity API: • roctracer_get_timestamp – return correlated GPU/CPU system timestamp External correlation ID API: -• roctracer_activity_push_external_correlation_id - push an external - correlation id for the calling thread -• roctracer_activity_pop_external_correlation_id - pop an external - correlation id for the calling thread +• roctracer_activity_push_external_correlation_id - push an external correlation id for the calling thread +• roctracer_activity_pop_external_correlation_id - pop an external correlation id for the calling thread Tracing control API: -• roctracer_start – tracing start -• roctracer_stop – tracer stop +• roctracer_start – start tracing +• roctracer_stop – stop tracing ``` ### 3.2. Tracing Domains ``` -Various tracing domains are supported. Each domain is assigned with +Various tracing domains are supported where each domain is assigned with a domain ID. The domains include HSA, HIP, and HCC runtime levels. Traced API domains: @@ -159,12 +145,12 @@ typedef enum { ACTIVITY_DOMAIN_NUMBER = 7 } activity_domain_t; -Return name by given domain and Op code: +Return name using given domain and Opcode: const char* roctracer_op_string( // NULL returned on error and error number is set uint32_t domain, // tracing domain - uint32_t op, // activity op code - uint32_t kind); // activity kind -Return Op code and kind by given string: + uint32_t op, // activity opcode + uint32_t kind); // kind of activity +Return Op code and kind using given string: roctracer_status_t roctracer_op_code( uint32_t domain, // tracing domain const char* str, // [in] op string @@ -174,8 +160,7 @@ roctracer_status_t roctracer_op_code( ### 3.3. Callback API ``` The tracer provides support for runtime API callbacks and activity records -logging. The API callbacks provide the API calls arguments and are called -on two phases on “enter”, on “exit”. +logging. The API callbacks provide the API call arguments and are invoked on the “enter” and “exit” phases. API phase passed to the callbacks: typedef enum { @@ -222,9 +207,8 @@ roctracer_status_t roctracer_disable_callback(); ``` The activity records are asynchronously logged to the pool and can be associated with the respective API callbacks using the correlation ID. -Activity API can be used to enable collecting the records with -timestamp data for API calls and GPU activity like kernel submits, -memory copies, and barriers. +Activity API can be used to enable the collection of records with +timestamp data for API calls and GPU activity like kernel submits, memory copies, and barriers. // Correlation id typedef uint64_t activity_correlation_id_t; @@ -337,6 +321,7 @@ roctracer_status_t roctracer_enable_activity(); roctracer_status_t roctracer_enable_activity_expl( roctracer_pool_t* pool); // memory pool, NULL means default pool + Disable activity records logging: roctracer_status_t roctracer_disable_op_activity( activity_domain_t domain, // tracing domain @@ -462,13 +447,13 @@ int main() { ``` ### 4.2. MatrixTranspose HIP sample with all APIs/activity tracing enabled ``` -This shows a MatrixTranspose HIP sample with enabled tracing of -all HIP API and all GPU asynchronous activity. +This shows a MatrixTranspose HIP sample with the tracing of +all HIP API and all GPU asynchronous activity enabled. /* Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved. -Permission is hereby granted, free of charge, to any person obtaining a copy +Permission is here by granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell @@ -480,7 +465,7 @@ all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN @@ -734,15 +719,15 @@ void stop_tracing() { ## 5. rocTX application code annotation API ``` Basic annotation API: markers and nested ranges. -// A marker created by given ASCII massage +// A marker created by given ASCII message void roctxMark(const char* message); -// Returns the 0 based level of a nested range being started by given message associated to this range. +// Returns the 0 based level of a nested range started by a given message associated with this range. // A negative value is returned on the error. int roctxRangePush(const char* message); // Marks the end of a nested range. -// Returns the 0 based level the range. -// A negative value is returned on the error. +// Returns the 0 based level of the range. +// A negative value is returned on error. int roctxRangePop(); ``` diff --git a/inc/ext/prof_protocol.h b/inc/ext/prof_protocol.h index c29ff0e6..1c00e972 100644 --- a/inc/ext/prof_protocol.h +++ b/inc/ext/prof_protocol.h @@ -36,6 +36,7 @@ typedef enum { ACTIVITY_DOMAIN_KFD_API = 4, // KFD API domain ACTIVITY_DOMAIN_EXT_API = 5, // External ID domain ACTIVITY_DOMAIN_ROCTX = 6, // ROCTX domain + ACTIVITY_DOMAIN_HSA_EVT = 7, // HSA events ACTIVITY_DOMAIN_NUMBER } activity_domain_t; diff --git a/inc/roctracer_hip.h b/inc/roctracer_hip.h index 86ffc1ae..391831bf 100644 --- a/inc/roctracer_hip.h +++ b/inc/roctracer_hip.h @@ -23,9 +23,23 @@ THE SOFTWARE. #ifndef INC_ROCTRACER_HIP_H_ #define INC_ROCTRACER_HIP_H_ -#include +#ifdef __cplusplus +#include + +inline static std::ostream& operator<<(std::ostream& out, const unsigned char& v) { + out << (unsigned int)v; + return out; +} + +inline static std::ostream& operator<<(std::ostream& out, const char& v) { + out << (unsigned char)v; + return out; +} +#endif // __cplusplus + #include -#include +#include +#include #include diff --git a/inc/roctracer_hsa.h b/inc/roctracer_hsa.h index b9b0cf98..1e50c3ab 100644 --- a/inc/roctracer_hsa.h +++ b/inc/roctracer_hsa.h @@ -27,6 +27,7 @@ THE SOFTWARE. #include #include +#include // HSA OP ID enumeration enum hsa_op_id_t { @@ -34,7 +35,7 @@ enum hsa_op_id_t { HSA_OP_ID_COPY = 1, HSA_OP_ID_BARRIER = 2, HSA_OP_ID_RESERVED1 = 3, - HSA_OP_ID_NUMBER = 4 + HSA_OP_ID_NUMBER }; #ifdef __cplusplus @@ -65,23 +66,6 @@ typedef hsa_support::ops_properties_t hsa_ops_properties_t; #include "hsa_ostream_ops.h" -std::ostream& operator<<(std::ostream& out, const hsa_amd_memory_pool_t& v) -{ - roctracer::hsa_support::operator<<(out, v); - return out; -} - -std::ostream& operator<<(std::ostream& out, const hsa_ext_image_t& v) -{ - roctracer::hsa_support::operator<<(out, v); - return out; -} - -std::ostream& operator<<(std::ostream& out, const hsa_ext_sampler_t& v) -{ - roctracer::hsa_support::operator<<(out, v); - return out; -} #else // !__cplusplus typedef void* hsa_amd_queue_intercept_handler; diff --git a/run_test.sh b/run_test.sh index c2ea74a6..61f1b301 100755 --- a/run_test.sh +++ b/run_test.sh @@ -7,7 +7,6 @@ fatal() { } if [ -z "$BUILD_DIR" ] ; then export BUILD_DIR=$PWD; fi -if [ -z "$HCC_HOME" ] ; then export HCC_HOME="$ROCM_PATH/hcc"; fi cd $BUILD_DIR ./run.sh diff --git a/script/check_trace.py b/script/check_trace.py index 29baff8a..c10eb3c5 100644 --- a/script/check_trace.py +++ b/script/check_trace.py @@ -1,5 +1,3 @@ -#!/usr/bin/python - #Copyright (c) 2015-present Advanced Micro Devices, Inc. All rights reserved. # #Permission is hereby granted, free of charge, to any person obtaining a copy @@ -120,14 +118,14 @@ def diff_strings(cnt_r, cnt, metric): for evt2 in events_order[tid]: if diff_cnt == diff_cnt_r: if evt != evt2: - print (">I< Difference starts at index: " + str(diff_cnt_r) + ", tid_r " + str(tid_r) + ", tid " + str(tid) + ", with evts " + evt + " and " + evt2 + "\n") + print (">I< Difference starts at tid rank: " + str(cnt_tid) + " event index: " + str(diff_cnt_r) + ", tid_r " + str(tid_r) + ", tid " + str(tid) + ", with evts " + evt + " and " + evt2 + "\n") found_diff_evt = 1 break diff_cnt += 1 diff_cnt_r += 1 if found_diff_evt: break if len(events_order_r[tid_r]) != len(events_order[tid]) and found_diff_evt == 0: - print (">I< Difference starts at index: " + str(min(len(events_order_r[tid_r]), len(events_order[tid]))) + ", with missing evts\n") + print (">I< Difference starts at tid rank: " + str(cnt_tid) + " event index: " + str(min(len(events_order_r[tid_r]), len(events_order[tid]))) + ", with missing evts\n") break cnt_tid += 1 cnt_tid_r += 1 @@ -200,10 +198,10 @@ def check_trace_status(tracename, verbose, check_trace_flag): events_order_r[tid] = events_order[tid] cnt = gen_events_info(trace,trace_level,no_events_cnt,events2ignore,events2chkcnt,events2chkord,verbose) if verbose: - print '\n' + rtrace + ':\n' - print cnt_r - print '\n' + trace + ':\n' - print cnt + print ('\n' + rtrace + ':\n') + print (cnt_r) + print ('\n' + trace + ':\n') + print (cnt) diff_strings(cnt_r, cnt, metric) if cnt_r == cnt: @@ -292,6 +290,10 @@ def gen_events_info(tracefile, trace_level, no_events_cnt, events2ignore, events if metric == 'or': for tid in sorted (events_order.keys()) : res = res + str(events_order[tid]) + if metric == 'cnt': + newres = res.split('\n') + newres = sorted(newres) + res = str(newres) return res parser = argparse.ArgumentParser(description='check_trace.py: check a trace aainst golden ref. Returns 0 for success, 1 for failure') diff --git a/script/gen_ostream_ops.py b/script/gen_ostream_ops.py index 142ec98e..180103a5 100755 --- a/script/gen_ostream_ops.py +++ b/script/gen_ostream_ops.py @@ -1,8 +1,7 @@ -#!/usr/bin/python - import os, sys, re import CppHeaderParser import argparse +import string LICENSE = \ '/*\n' + \ @@ -28,12 +27,7 @@ '*/\n' -header = 'template \n' + \ -'struct output_streamer {\n' + \ -' inline static std::ostream& put(std::ostream& out, const T& v) { return out; }\n' + \ -'};\n\n' - -header_hip = \ +header_basic = \ 'template \n' + \ ' inline static std::ostream& operator<<(std::ostream& out, const T& v) {\n' + \ ' using std::operator<<;\n' + \ @@ -42,7 +36,10 @@ ' return out; }\n' structs_analyzed = {} -global_ops_hip = '' +global_ops = '' +global_str = '' +output_filename_h = None +apiname = "" # process_struct traverses recursively all structs to extract all fields def process_struct(file_handle, cppHeader_struct, cppHeader, parent_hier_name, apiname): @@ -51,6 +48,7 @@ def process_struct(file_handle, cppHeader_struct, cppHeader, parent_hier_name, a # cppHeader: cppHeader object created by CppHeaderParser.CppHeader(...) # parent_hier_name: parent hierarchical name used for nested structs/enums # apiname: for example hip, kfd. + global global_str if cppHeader_struct == 'max_align_t': #function pointers not working in cppheaderparser return @@ -58,8 +56,7 @@ def process_struct(file_handle, cppHeader_struct, cppHeader, parent_hier_name, a return if cppHeader_struct in structs_analyzed: return - - structs_analyzed[cppHeader_struct] = 1; + structs_analyzed[cppHeader_struct] = 1 for l in reversed(range(len(cppHeader.classes[cppHeader_struct]["properties"]["public"]))): key = 'name' name = "" @@ -85,16 +82,17 @@ def process_struct(file_handle, cppHeader_struct, cppHeader, parent_hier_name, a if key4 in cppHeader.classes[cppHeader_struct]["properties"]["public"][l]: prop = cppHeader.classes[cppHeader_struct]["properties"]["public"][l][key4] + str = '' if "union" not in mtype: - if apiname.lower() == 'hip' or apiname.lower() == 'hsa': - str = " roctracer::" + apiname.lower() + "_support::operator<<(out, v."+name+");\n" - else: - if array_size == "": - str = " roctracer::" + apiname.lower() + "_support::output_streamer<"+mtype+">::put(out,v."+name+");\n" - else: - str = " roctracer::" + apiname.lower() + "_support::output_streamer<"+mtype+"["+array_size+"]>::put(out,v."+name+");\n" + indent = "" + str += " if (std::string(\"" + cppHeader_struct + "::" + name + "\").find(" + apiname.upper() + "_structs_regex" + ")) {\n" + indent = " " + str += indent + " roctracer::" + apiname.lower() + "_support::operator<<(out, \"" + name + "=\");\n" + str += indent + " roctracer::" + apiname.lower() + "_support::operator<<(out, v." + name + ");\n" + str += indent + " roctracer::" + apiname.lower() + "_support::operator<<(out, \", \");\n" + str += " }\n" if "void" not in mtype: - file_handle.write(str) + global_str += str else: if prop != '': next_cppHeader_struct = prop + "::" @@ -105,99 +103,104 @@ def process_struct(file_handle, cppHeader_struct, cppHeader, parent_hier_name, a process_struct(file_handle, next_cppHeader_struct, cppHeader, name, apiname) # Parses API header file and generates ostream ops files ostream_ops.h -def gen_cppheader(infilepath, outfilepath): +def gen_cppheader(infilepath, outfilepath, rank): # infilepath: API Header file to be parsed # outfilepath: Output file where ostream operators are written - global_ops_hip = '' - global_ops_hsa = '' + global global_ops + global output_filename_h + global apiname + global global_str try: cppHeader = CppHeaderParser.CppHeader(infilepath) except CppHeaderParser.CppParseError as e: print(e) sys.exit(1) - mpath = os.path.dirname(outfilepath) - if mpath == "": - mpath = os.getcwd() - apiname = outfilepath.replace(mpath+"/","") - apiname = apiname.replace("_ostream_ops.h","") - apiname = apiname.upper() - f = open(outfilepath,"w+") - f.write("// automatically generated\n") - f.write(LICENSE + '\n') - header_s = \ - '#ifndef INC_' + apiname + '_OSTREAM_OPS_H_\n' + \ - '#define INC_' + apiname + '_OSTREAM_OPS_H_\n' + \ - '#ifdef __cplusplus\n' + \ - '#include \n' + \ - '\n' + \ - '#include "roctracer.h"\n' - if apiname.lower() == 'hip': - header_s = header_s + '\n' + \ - '#include "hip/hip_runtime_api.h"\n' + \ - '#include "hip/hcc_detail/hip_vector_types.h"\n\n' - - f.write(header_s) - f.write('\n') - f.write('namespace roctracer {\n') - f.write('namespace ' + apiname.lower() + '_support {\n') - f.write('// begin ostream ops for '+ apiname + ' \n') - if apiname.lower() == "hip" or apiname.lower() == "hsa": - f.write("// basic ostream ops\n") - f.write(header_hip) - f.write("// End of basic ostream ops\n\n") - else: - f.write(header) + if rank == 0 or rank == 2: + mpath = os.path.dirname(outfilepath) + if mpath == "": + mpath = os.getcwd() + apiname = outfilepath.replace(mpath + "/","") + output_filename_h = open(outfilepath,"w+") + apiname = apiname.replace("_ostream_ops.h","") + apiname = apiname.upper() + output_filename_h.write("// automatically generated\n") + output_filename_h.write(LICENSE + '\n') + header_s = \ + '#ifndef INC_' + apiname + '_OSTREAM_OPS_H_\n' + \ + '#define INC_' + apiname + '_OSTREAM_OPS_H_\n' + \ + '#ifdef __cplusplus\n' + \ + '#include \n' + \ + '\n' + \ + '#include "roctracer.h"\n' + header_s += '#include \n' + + output_filename_h.write(header_s) + output_filename_h.write('\n') + output_filename_h.write('namespace roctracer {\n') + output_filename_h.write('namespace ' + apiname.lower() + '_support {\n') + output_filename_h.write('static int ' + apiname.upper() + '_depth_max = 1;\n') + output_filename_h.write('static int ' + apiname.upper() + '_depth_max_cnt = 0;\n') + output_filename_h.write('static std::string ' + apiname.upper() + '_structs_regex = \"\";\n') + output_filename_h.write('// begin ostream ops for '+ apiname + ' \n') + output_filename_h.write("// basic ostream ops\n") + output_filename_h.write(header_basic) + output_filename_h.write("// End of basic ostream ops\n\n") for c in cppHeader.classes: if "union" in c: continue - if apiname.lower() == 'hsa': - if c == 'max_align_t' or c == '__fsid_t': #already defined for hip + if c in structs_analyzed: continue - #if apiname.lower() == 'hip' and c == 'hipIpcEventHandle_t': #feature is TBD - # continue - if len(cppHeader.classes[c]["properties"]["public"])!=0: - if apiname.lower() == 'hip' or apiname.lower() == 'hsa': - f.write("std::ostream& operator<<(std::ostream& out, const " + c + "& v)\n") - f.write("{\n") - process_struct(f, c, cppHeader, "", apiname) - f.write(" return out;\n") - f.write("}\n") - else: - f.write("\ntemplate<>\n") - f.write("struct output_streamer<" + c + "&> {\n") - f.write(" inline static std::ostream& put(std::ostream& out, "+c+"& v)\n") - f.write("{\n") - process_struct(f, c, cppHeader, "", apiname) - f.write(" return out;\n") - f.write("}\n") - f.write("};\n") - if apiname.lower() == 'hip': - global_ops_hip += "inline static std::ostream& operator<<(std::ostream& out, const " + c + "& v)\n" + "{\n" + " roctracer::hip_support::operator<<(out, v);\n" + " return out;\n" + "}\n\n" - if apiname.lower() == 'hsa': - global_ops_hsa += "inline static std::ostream& operator<<(std::ostream& out, const " + c + "& v)\n" + "{\n" + " roctracer::hsa_support::operator<<(out, v);\n" + " return out;\n" + "}\n\n" - - footer = \ - '// end ostream ops for '+ apiname + ' \n' - footer += '};};\n\n' - f.write(footer) - f.write(global_ops_hip) - f.write(global_ops_hsa) - footer = '#endif //__cplusplus\n' + \ - '#endif // INC_' + apiname + '_OSTREAM_OPS_H_\n' + \ - ' \n' - f.write(footer) - f.close() - print('File ' + outfilepath + ' generated') + if c == 'max_align_t' or c == '__fsid_t': # Skipping as it is defined in multiple domains + continue + if len(cppHeader.classes[c]["properties"]["public"]) != 0: + output_filename_h.write("inline static std::ostream& operator<<(std::ostream& out, const " + c + "& v)\n") + output_filename_h.write("{\n") + output_filename_h.write(" roctracer::" + apiname.lower() + "_support::operator<<(out, '{');\n") + output_filename_h.write(" " + apiname.upper() + "_depth_max_cnt++;\n") + output_filename_h.write(" if (" + apiname.upper() + "_depth_max == -1 || " + apiname.upper() + "_depth_max_cnt <= " + apiname.upper() + "_depth_max" + ") {\n" ) + process_struct(output_filename_h, c, cppHeader, "", apiname) + global_str = "\n".join(global_str.split("\n")[0:-3]) + if global_str != '': global_str += "\n }\n" + output_filename_h.write(global_str) + output_filename_h.write(" };\n") + output_filename_h.write(" " + apiname.upper() + "_depth_max_cnt--;\n") + output_filename_h.write(" roctracer::" + apiname.lower() + "_support::operator<<(out, '}');\n") + output_filename_h.write(" return out;\n") + output_filename_h.write("}\n") + global_str = '' + global_ops += "inline static std::ostream& operator<<(std::ostream& out, const " + c + "& v)\n" + "{\n" + " roctracer::" + apiname.lower() + "_support::operator<<(out, v);\n" + " return out;\n" + "}\n\n" + + if rank == 1 or rank == 2: + footer = '// end ostream ops for '+ apiname + ' \n' + footer += '};};\n\n' + output_filename_h.write(footer) + output_filename_h.write(global_ops) + footer = '#endif //__cplusplus\n' + \ + '#endif // INC_' + apiname + '_OSTREAM_OPS_H_\n' + \ + ' \n' + output_filename_h.write(footer) + output_filename_h.close() + print('File ' + outfilepath + ' generated') return parser = argparse.ArgumentParser(description='genOstreamOps.py: generates ostream operators for all typedefs in provided input file.') requiredNamed = parser.add_argument_group('Required arguments') -requiredNamed.add_argument('-in', metavar='file', help='Header file to be parsed', required=True) +requiredNamed.add_argument('-in', metavar='fileList', help='Comma separated list of header files to be parsed', required=True) requiredNamed.add_argument('-out', metavar='file', help='Output file with ostream operators', required=True) args = vars(parser.parse_args()) if __name__ == '__main__': - gen_cppheader(args['in'], args['out']) + flist = args['in'].split(',') + if len(flist) == 1: + gen_cppheader(flist[0], args['out'],2) + else: + for i in range(len(flist)): + if i == 0: + gen_cppheader(flist[i], args['out'],0) + elif i == len(flist)-1: + gen_cppheader(flist[i], args['out'],1) + else: + gen_cppheader(flist[i], args['out'],-1) diff --git a/script/hsaap.py b/script/hsaap.py index a50b3d4d..e1a3d717 100755 --- a/script/hsaap.py +++ b/script/hsaap.py @@ -1,4 +1,3 @@ -#!/usr/bin/python from __future__ import print_function import os, sys, re @@ -456,7 +455,10 @@ def gen_out_stream(self, n, name, call, struct): for ind in range(len(arg_list)): arg_var = arg_list[ind] arg_val = 'api_data.args.' + call + '.' + arg_var - self.content += ' out << ' + arg_val + if re.search(r'char\* ', struct['astr'][arg_var]): + self.content += ' out << "0x" << std::hex << (uint64_t)' + arg_val + else: + self.content += ' out << ' + arg_val ''' arg_item = struct['tlst'][ind] if re.search(r'\(\* ', arg_item): arg_pref = '' @@ -490,15 +492,15 @@ def gen_out_stream(self, n, name, call, struct): # main # Usage if len(sys.argv) != 3: - print ("Usage:", sys.argv[0], " ", file=sys.stderr) + print ("Usage:", sys.argv[0], " ", file=sys.stderr) sys.exit(1) else: - ROOT = sys.argv[1] + '/' + PREFIX = sys.argv[1] + '/' HSA_DIR = sys.argv[2] + '/' descr = API_DescrParser(OUT, HSA_DIR, API_TABLES_H, API_HEADERS_H, LICENSE) -out_file = ROOT + OUT +out_file = PREFIX + OUT print ('Generating "' + out_file + '"') f = open(out_file, 'w') f.write(descr.content[:-1]) diff --git a/script/kfdap.py b/script/kfdap.py index e920bbf6..8de1d19e 100755 --- a/script/kfdap.py +++ b/script/kfdap.py @@ -1,11 +1,10 @@ -#!/usr/bin/python from __future__ import print_function import os, sys, re -OUT_H = 'inc/kfd_prof_str.h' -OUT_CPP = 'src/kfd/kfd_wrapper.cpp' -API_HEADERS_H = ( - ('HSAKMTAPI', 'hsakmt.h'), +OUT_H = 'inc/kfd_prof_str.h' +OUT_CPP = 'src/kfd_wrapper.cpp' +API_HEADERS_H = ( + ('HSAKMTAPI', 'hsakmt.h'), ) LICENSE = \ @@ -38,7 +37,7 @@ def fatal(module, msg): sys.exit(1) # Get next text block -def NextBlock(pos, record): +def NextBlock(pos, record): if len(record) == 0: return pos space_pattern = re.compile(r'(\s+)') @@ -80,10 +79,10 @@ def __init__(self, header, name, full_fct): if not os.path.isfile(header): self.fatal("file '" + header + "' not found") - self.inp = open(header, 'r') + self.inp = open(header, 'r', encoding='utf-8') - self.beg_pattern = re.compile(name) - self.end_pattern = re.compile('.*\)\s*;\s*$'); + self.beg_pattern = re.compile(name) + self.end_pattern = re.compile('.*\)\s*;\s*$'); self.array = [] self.parse() @@ -92,10 +91,10 @@ def norm_line(self, line): return re.sub(r'^\s+', r' ', line) def fix_comment_line(self, line): - return re.sub(r'\/\/.*', r'', line) + return re.sub(r'\/\/.*', r'', line) def remove_ret_line(self, line): - return re.sub(r'\n', r'', line) + return re.sub(r'\n', r'', line) # check for start record def is_start(self, record): @@ -107,7 +106,7 @@ def is_end(self, record): # check for declaration entry record def is_entry(self, record): - return re.match(r'^\s*HSAKMTAPI\s*(.*)\s*\((.*)\)', record) + return re.match(r'^\s*HSAKMTAPI\s*(.*)\s*\((.*)\)', record) # parse method def parse(self): @@ -121,7 +120,7 @@ def parse(self): line = self.norm_line(line) line = self.fix_comment_line(line) - if cumulate == 1: record += " " + line; + if cumulate == 1: record += " " + line; else: record = line; if self.is_start(line): rettype = prev_line.strip(); cumulate = 1; prev_line = line; continue; if self.is_end(line): record = self.remove_ret_line(record); cumulate = 0; active = 1; @@ -132,7 +131,7 @@ def parse(self): mycall_full = rettype + " " + m.group(1) + ' (' + m.group(2) + ')' mycall = m.group(1) self.full_fct[mycall] = mycall_full - self.array.append(mycall) + self.array.append(mycall) rettype = ""; prev_line = line @@ -146,7 +145,7 @@ def __init__(self, header, array, data, full_fct): if not os.path.isfile(header): self.fatal("file '" + header + "' not found") - self.inp = open(header, 'r') + self.inp = open(header, 'r', encoding='utf-8') self.end_pattern = re.compile('\)\s*;\s*$') self.data = data @@ -173,7 +172,7 @@ def get_args(self, record): struct = {'ret': '', 'args': '', 'astr': {}, 'alst': [], 'tlst': []} record = re.sub(r'^\s+', r'', record) record = re.sub(r'\s*(\*+)\s*', r'\1 ', record) - rind = NextBlock(0, record) + rind = NextBlock(0, record) struct['ret'] = record[0:rind] pos = record.find('(') end = NextBlock(pos, record); @@ -184,7 +183,7 @@ def get_args(self, record): struct['args'] = re.sub(r',', r', ', args) if args == "void": return struct - + if len(args) == 0: return struct pos = 0 @@ -217,7 +216,7 @@ def get_args(self, record): # parse given api def parse(self, call, full_fct): - if call in full_fct: + if call in full_fct: self.data[call] = self.get_args(full_fct[call]) else: self.data[call] = self.get_args(call) @@ -238,7 +237,7 @@ def __init__(self, out_file, kfd_dir, api_headers, license): self.api_calls = {} self.api_rettypes = set() self.api_id = {} - + api_data = {} full_fct = {} api_list = [] @@ -271,7 +270,7 @@ def __init__(self, out_file, kfd_dir, api_headers, license): self.ns_calls = ns_calls self.content_h += "// automatically generated\n\n" + license + '\n' - + self.content_h += "/////////////////////////////////////////////////////////////////////////////\n" for call in self.ns_calls: self.content_h += '// ' + call + ' was not parsed\n' @@ -298,7 +297,7 @@ def __init__(self, out_file, kfd_dir, api_headers, license): self.content_h += 'namespace kfd_support {\n' self.add_section('API get_name function', ' ', self.gen_get_name) - self.add_section('API get_code function', ' ', self.gen_get_code) + self.add_section('API get_code function', ' ', self.gen_get_code) self.add_section('API intercepting code', '', self.gen_intercept_decl) self.add_section('API intercepting code', '', self.gen_intercept) @@ -369,7 +368,7 @@ def gen_id_enum(self, n, name, call, data): self.content_h += ' KFD_API_ID_NUMBER = ' + str(n) + ',\n' self.content_h += ' KFD_API_ID_ANY = ' + str(n + 1) + ',\n' self.content_h += '};\n' - + # generate API args structure def gen_arg_struct(self, n, name, call, struct): if n == -1: @@ -396,7 +395,7 @@ def gen_arg_struct(self, n, name, call, struct): else: self.content_h += ' } args;\n' self.content_h += '} kfd_api_data_t;\n' - + # generate API callbacks def gen_callbacks(self, n, name, call, struct): if n == -1: @@ -406,7 +405,7 @@ def gen_callbacks(self, n, name, call, struct): if call != '-': call_id = self.api_id[call]; ret_type = struct['ret'] - self.content_h += ret_type + ' ' + call + '_callback(' + struct['args'] + ') {\n' # 'static ' + + self.content_h += ret_type + ' ' + call + '_callback(' + struct['args'] + ') {\n' # 'static ' + self.content_h += ' if (' + name + '_table == NULL) intercept_KFDApiTable();\n' self.content_h += ' kfd_api_data_t api_data{};\n' for var in struct['alst']: @@ -448,7 +447,7 @@ def gen_intercept(self, n, name, call, struct): if call != '-': self.content_h += ' typedef decltype(' + name + '_table_t::' + call + '_fn) ' + call + '_t;\n' - self.content_h += ' ' + name + '_table->' + call + '_fn = (' + call + '_t)' + 'dlsym(RTLD_NEXT,\"' + call + '\");\n' + self.content_h += ' ' + name + '_table->' + call + '_fn = (' + call + '_t)' + 'dlsym(RTLD_NEXT,\"' + call + '\");\n' # generate API name function def gen_get_name(self, n, name, call, struct): @@ -493,9 +492,8 @@ def gen_out_stream(self, n, name, call, struct): arg_var = arg_list[ind] arg_val = 'api_data.args.' + call + '.' + arg_var if re.search(r'MemFlags',arg_var): - continue - self.content_h += ' typedef decltype(' + arg_val.replace("[]","") + ') arg_val_type_t' + str(ind) + ';\n' - self.content_h += ' roctracer::kfd_support::output_streamer::put(out, ' + arg_val.replace("[]","") + ')' + continue + self.content_h += ' out << ' + arg_val.replace("[]","") if ind < len(arg_list)-1: self.content_h += ' << ", ";\n' else: self.content_h += ';\n' if struct['ret'] != 'void': @@ -510,11 +508,10 @@ def gen_out_stream(self, n, name, call, struct): self.content_h += ' abort();\n' self.content_h += ' }\n' self.content_h += ' return out;\n' - self.content_h += '}\n' + self.content_h += '}\n' self.content_h += '#endif\n' - self.content_cpp += 'inline std::ostream& operator<< (std::ostream& out, const HsaMemFlags& v) { out << "HsaMemFlags"; return out; }\n' - # generate PUBLIC_API for all API fcts + # generate PUBLIC_API for all API fcts def gen_public_api(self, n, name, call, struct): if n == -1: self.content_cpp += 'extern "C" {\n' @@ -540,21 +537,21 @@ def gen_public_api(self, n, name, call, struct): # main # Usage if len(sys.argv) != 3: - print ("Usage:", sys.argv[0], " ", file = sys.stderr) + print ("Usage:", sys.argv[0], " ", file = sys.stderr) sys.exit(1) else: - ROOT = sys.argv[1] + '/' + PREFIX = sys.argv[1] + '/' KFD_DIR = sys.argv[2] + '/' descr = API_DescrParser(OUT_H, KFD_DIR, API_HEADERS_H, LICENSE) -out_file = ROOT + OUT_H +out_file = PREFIX + OUT_H print ('Generating "' + out_file + '"') f = open(out_file, 'w') f.write(descr.content_h[:-1]) f.close() -out_file = ROOT + OUT_CPP +out_file = PREFIX + OUT_CPP print ('Generating "' + out_file + '"') f = open(out_file, 'w') f.write(descr.content_cpp[:-1]) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index ceb33c74..556ea16d 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -1,7 +1,23 @@ -# +# Generating tracing primitives +set ( GEN_INC_DIR ${PROJECT_BINARY_DIR}/inc ) +set ( GEN_SRC_DIR ${PROJECT_BINARY_DIR}/src ) +execute_process ( COMMAND sh -xc "mkdir -p ${GEN_INC_DIR}" ) +execute_process ( COMMAND sh -xc "mkdir -p ${GEN_SRC_DIR}" ) +execute_process ( COMMAND sh -xc "${CMAKE_C_COMPILER} -E ${HSA_RUNTIME_INC_PATH}/hsa.h > ${GEN_INC_DIR}/hsa_pp.h" ) +execute_process ( COMMAND sh -xc "${CMAKE_C_COMPILER} -E ${HSA_RUNTIME_INC_PATH}/hsa_ext_amd.h > ${GEN_INC_DIR}/hsa_ext_amd_pp.h" ) +execute_process ( COMMAND sh -xc "python3 ${ROOT_DIR}/script/gen_ostream_ops.py -in ${GEN_INC_DIR}/hsa_pp.h,${GEN_INC_DIR}/hsa_ext_amd_pp.h -out ${GEN_INC_DIR}/hsa_ostream_ops.h" ) +execute_process ( COMMAND sh -xc "python3 ${ROOT_DIR}/script/hsaap.py ${PROJECT_BINARY_DIR} ${HSA_RUNTIME_INC_PATH}" ) +execute_process ( COMMAND sh -xc "python3 ${ROOT_DIR}/script/kfdap.py ${PROJECT_BINARY_DIR} ${HSA_KMT_INC_PATH}" ) +execute_process ( COMMAND sh -xc "${CMAKE_C_COMPILER} -E ${HSA_KMT_INC_PATH}/hsakmttypes.h > ${GEN_INC_DIR}/hsakmttypes_pp.h" ) +execute_process ( COMMAND sh -xc "python3 ${ROOT_DIR}/script/gen_ostream_ops.py -in ${GEN_INC_DIR}/hsakmttypes_pp.h -out ${GEN_INC_DIR}/kfd_ostream_ops.h" ) +execute_process ( COMMAND sh -xc "${CMAKE_C_COMPILER} -E ${HIP_PATH}/include/hip/hip_runtime_api.h ${HIP_DEFINES} -I${HIP_PATH}/include -I${ROCM_ROOT_DIR}/hsa/include > ${GEN_INC_DIR}/hip_runtime_api_pp.h" ) +execute_process ( COMMAND sh -xc "python3 ${ROOT_DIR}/script/gen_ostream_ops.py -in ${GEN_INC_DIR}/hip_runtime_api_pp.h -out ${GEN_INC_DIR}/hip_ostream_ops.h" ) +execute_process ( COMMAND sh -xc "mkdir ${GEN_INC_DIR}/rocprofiler" ) +execute_process ( COMMAND sh -xc "ln -s ${ROOT_DIR}/../rocprofiler/inc/rocprofiler.h ${GEN_INC_DIR}/rocprofiler/rocprofiler.h" ) +execute_process ( COMMAND sh -xc "ln -s ${ROOT_DIR}/../rocprofiler/src/core/activity.h ${GEN_INC_DIR}/rocprofiler/activity.h" ) + # Build dynamic Library object -# -set ( TARGET_LIB "${TARGET_NAME}" ) +set ( TARGET_LIB ${TARGET_NAME} ) set ( LIB_SRC ${LIB_DIR}/core/roctracer.cpp ${LIB_DIR}/proxy/proxy_queue.cpp @@ -10,33 +26,22 @@ set ( LIB_SRC ${LIB_DIR}/util/hsa_rsrc_factory.cpp ) add_library ( ${TARGET_LIB} ${LIBRARY_TYPE} ${LIB_SRC} ) -target_include_directories ( ${TARGET_LIB} PRIVATE ${LIB_DIR} ${ROOT_DIR} ${ROOT_DIR}/inc ${HSA_RUNTIME_INC_PATH} ${HSA_RUNTIME_HSA_INC_PATH} ${HIP_INC_DIR} ${HCC_INC_DIR} ${HSA_KMT_INC_PATH} ) +target_include_directories ( ${TARGET_LIB} PRIVATE ${LIB_DIR} ${ROOT_DIR} ${ROOT_DIR}/inc ${HSA_RUNTIME_INC_PATH} ${HIP_INC_DIR} ${HSA_KMT_INC_PATH} ${ROCM_INC_PATH} ${GEN_INC_DIR} ) target_link_libraries( ${TARGET_LIB} PRIVATE ${HSA_RUNTIME_LIB} c stdc++ ) -# Generating HSA tracing primitives -execute_process ( COMMAND sh -xc "${ROOT_DIR}/script/hsaap.py ${ROOT_DIR} ${HSA_RUNTIME_INC_PATH}" ) - -# Generating KFD/Thunk tracing primitives +# Build KFD/Thunk tracing library set ( KFD_LIB "kfdwrapper64" ) -set ( KFD_LIB_SRC - ${LIB_DIR}/kfd/kfd_wrapper.cpp -) -execute_process ( COMMAND sh -xc "${CMAKE_CXX_COMPILER} -E ${HSA_KMT_INC_PATH}/hsakmttypes.h > ${PROJECT_BINARY_DIR}/hsakmttypes_pp.h" ) -execute_process ( COMMAND sh -xc "${ROOT_DIR}/script/gen_ostream_ops.py -in ${PROJECT_BINARY_DIR}/hsakmttypes_pp.h -out ${ROOT_DIR}/inc/kfd_ostream_ops.h" ) -execute_process ( COMMAND sh -xc "${CMAKE_C_COMPILER} ${HIP_DEFINES} -I${HIP_PATH}/include -I${ROCM_ROOT_DIR}/hsa/include -E ${HIP_PATH}/include/hip/hip_runtime_api.h > ${PROJECT_BINARY_DIR}/hip_runtime_api_pp.h" ) -execute_process ( COMMAND sh -xc "${ROOT_DIR}/script/gen_ostream_ops.py -in ${PROJECT_BINARY_DIR}/hip_runtime_api_pp.h -out ${ROOT_DIR}/inc/hip_ostream_ops.h" ) -execute_process ( COMMAND sh -xc "${CMAKE_C_COMPILER} -E ${HSA_RUNTIME_INC_PATH}/hsa.h > ${PROJECT_BINARY_DIR}/hsa_pp.h" ) -execute_process ( COMMAND sh -xc "${ROOT_DIR}/script/gen_ostream_ops.py -in ${PROJECT_BINARY_DIR}/hsa_pp.h -out ${ROOT_DIR}/inc/hsa_ostream_ops.h" ) +set ( KFD_LIB_SRC ${GEN_SRC_DIR}/kfd_wrapper.cpp) add_library ( ${KFD_LIB} SHARED ${KFD_LIB_SRC} ) -target_include_directories ( ${KFD_LIB} PRIVATE ${LIB_DIR} ${ROOT_DIR} ${ROOT_DIR}/inc ${HSA_RUNTIME_INC_PATH} ${HSA_RUNTIME_HSA_INC_PATH} ${HSA_KMT_INC_PATH} ) +target_include_directories ( ${KFD_LIB} PRIVATE ${LIB_DIR} ${ROOT_DIR} ${ROOT_DIR}/inc ${HSA_RUNTIME_INC_PATH} ${HSA_KMT_INC_PATH} ${GEN_INC_DIR} ) target_link_libraries( ${KFD_LIB} PRIVATE c stdc++ ) -execute_process ( COMMAND sh -xc "${ROOT_DIR}/script/kfdap.py ${ROOT_DIR} ${HSA_KMT_INC_PATH}" ) +# Build ROCTX tracing library set ( ROCTX_LIB "roctx64" ) set ( ROCTX_LIB_SRC ${LIB_DIR}/roctx/roctx.cpp ${LIB_DIR}/roctx/roctx_intercept.cpp ) add_library ( ${ROCTX_LIB} SHARED ${ROCTX_LIB_SRC} ) -target_include_directories ( ${ROCTX_LIB} PRIVATE ${LIB_DIR} ${ROOT_DIR} ${ROOT_DIR}/inc ${HSA_RUNTIME_INC_PATH} ${HSA_RUNTIME_HSA_INC_PATH} ) +target_include_directories ( ${ROCTX_LIB} PRIVATE ${LIB_DIR} ${ROOT_DIR} ${ROOT_DIR}/inc ${HSA_RUNTIME_INC_PATH} ${GEN_INC_DIR} ) target_link_libraries( ${ROCTX_LIB} PRIVATE c stdc++ ) diff --git a/src/core/loader.h b/src/core/loader.h index 5d6e0d6c..946521d1 100644 --- a/src/core/loader.h +++ b/src/core/loader.h @@ -94,6 +94,10 @@ class RocpApi { EnableCallback_t* EnableActivityCallback; NameCallback_t* GetOpName; + RegisterCallback_t* RegisterEvtCallback; + OperateCallback_t* RemoveEvtCallback; + NameCallback_t* GetEvtName; + protected: void init(Loader* loader) { RegisterApiCallback = loader->GetFun("RegisterApiCallback"); @@ -101,6 +105,10 @@ class RocpApi { InitActivityCallback = loader->GetFun("InitActivityCallback"); EnableActivityCallback = loader->GetFun("EnableActivityCallback"); GetOpName = loader->GetFun("GetOpName"); + + RegisterEvtCallback = loader->GetFun("RegisterEvtCallback"); + RemoveEvtCallback = loader->GetFun("RemoveEvtCallback"); + GetEvtName = loader->GetFun("GetEvtName"); } }; @@ -290,8 +298,8 @@ typedef HipLoaderShared HipLoader; template bool roctracer::BaseLoader::to_check_open_ = true; \ template bool roctracer::BaseLoader::to_check_symb_ = true; \ template<> const char* roctracer::RocpLoader::lib_name_ = "librocprofiler64.so"; \ + template<> bool roctracer::RocpLoader::to_load_ = true; \ template<> const char* roctracer::HccLoader::lib_name_ = "libamdhip64.so"; \ - template<> bool roctracer::HccLoader::to_check_open_ = false; \ template<> const char* roctracer::KfdLoader::lib_name_ = "libkfdwrapper64.so"; \ template<> const char* roctracer::RocTxLoader::lib_name_ = "libroctx64.so"; \ template<> bool roctracer::RocTxLoader::to_load_ = true; @@ -302,8 +310,7 @@ typedef HipLoaderShared HipLoader; roctracer::HipLoaderStatic::instance_t roctracer::HipLoaderStatic::instance_{}; #else #define LOADER_INSTANTIATE_HIP() \ - template<> const char* roctracer::HipLoaderShared::lib_name_ = "libamdhip64.so"; \ - template<> bool roctracer::HipLoaderShared::to_check_open_ = false; + template<> const char* roctracer::HipLoaderShared::lib_name_ = "libamdhip64.so"; #endif #if HIP_VDI diff --git a/src/core/roctracer.cpp b/src/core/roctracer.cpp index 2d15bbba..272bad26 100644 --- a/src/core/roctracer.cpp +++ b/src/core/roctracer.cpp @@ -93,7 +93,6 @@ THE SOFTWARE. #define ONLOAD_TRACE_BEG() ONLOAD_TRACE("begin") #define ONLOAD_TRACE_END() ONLOAD_TRACE("end") - static inline uint32_t GetPid() { return syscall(__NR_getpid); } /////////////////////////////////////////////////////////////////////////////////////////////////// @@ -172,6 +171,9 @@ void RestoreHsaApi() { } namespace roctracer { +// timestamp definitino +typedef hsa_rt_utils::Timer::timestamp_t timestamp_t; + decltype(hsa_amd_memory_async_copy)* hsa_amd_memory_async_copy_fn; decltype(hsa_amd_memory_async_copy_rect)* hsa_amd_memory_async_copy_rect_fn; @@ -218,11 +220,11 @@ template<> bool act_en_functor_t::fun(const act_en_functor_t::record_t& record) void hsa_async_copy_handler(::proxy::Tracker::entry_t* entry); void hsa_kernel_handler(::proxy::Tracker::entry_t* entry); -TraceBuffer::flush_prm_t trace_buffer_prm[] = { +constexpr TraceBuffer::flush_prm_t trace_buffer_prm[] = { {COPY_ENTRY_TYPE, hsa_async_copy_handler}, {KERNEL_ENTRY_TYPE, hsa_kernel_handler} }; -TraceBuffer trace_buffer("HSA GPU", 0x200000, trace_buffer_prm, 2); +TraceBuffer* trace_buffer = NULL; namespace hsa_support { // callbacks table @@ -277,7 +279,8 @@ struct record_pair_t { roctracer_api_data_t data; record_pair_t() {}; }; -static thread_local std::stack record_pair_stack; +typedef std::stack record_pair_stack_t; +static thread_local record_pair_stack_t* record_pair_stack = NULL; // Correlation id storage static thread_local activity_correlation_id_t correlation_id_tls = 0; @@ -294,13 +297,19 @@ static inline void CorrelationIdRegistr(const activity_correlation_id_t& correla if (correlation_id_map == NULL) correlation_id_map = new correlation_id_map_t; const auto ret = correlation_id_map->insert({correlation_id, correlation_id_tls}); if (ret.second == false) EXC_ABORT(ROCTRACER_STATUS_ERROR, "HCC activity id is not unique(" << correlation_id << ")"); + + DEBUG_TRACE("CorrelationIdRegistr id(%lu) id_tls(%lu)\n", correlation_id, correlation_id_tls); } static inline activity_correlation_id_t CorrelationIdLookup(const activity_correlation_id_t& correlation_id) { auto it = correlation_id_map->find(correlation_id); if (correlation_id_wait) while (it == correlation_id_map->end()) it = correlation_id_map->find(correlation_id); if (it == correlation_id_map->end()) EXC_ABORT(ROCTRACER_STATUS_ERROR, "HCC activity id lookup failed(" << correlation_id << ")"); - return it->second; + const activity_correlation_id_t ret_val = it->second; + + DEBUG_TRACE("CorrelationIdLookup id(%lu) ret(%lu)\n", correlation_id, ret_val); + + return ret_val; } typedef std::mutex hip_activity_mutex_t; @@ -341,6 +350,10 @@ void* HIP_SyncApiDataCallback( const void* callback_data, void* arg) { + static hsa_rt_utils::Timer timer; + if (record_pair_stack == NULL) record_pair_stack = new record_pair_stack_t; + + void* ret = NULL; const hip_api_data_t* data = reinterpret_cast(callback_data); hip_api_data_t* data_ptr = const_cast(data); MemoryPool* pool = reinterpret_cast(arg); @@ -357,8 +370,8 @@ void* HIP_SyncApiDataCallback( // Allocating a record if NULL passed if (record == NULL) { if (data != NULL) EXC_ABORT(ROCTRACER_STATUS_ERROR, "ActivityCallback enter: record is NULL"); - record_pair_stack.push({}); - auto& top = record_pair_stack.top(); + record_pair_stack->push({}); + auto& top = record_pair_stack->top(); data = &(top.data.hip); data_ptr = const_cast(data); data_ptr->phase = phase; @@ -375,16 +388,20 @@ void* HIP_SyncApiDataCallback( // Passing correlatin ID correlation_id_tls = correlation_id; - return data_ptr; + ret = data_ptr; } else { // popping the record entry - if (!record_pair_stack.empty()) record_pair_stack.pop(); + if (!record_pair_stack->empty()) record_pair_stack->pop(); // Clearing correlatin ID correlation_id_tls = 0; - - return NULL; } + + const char * name = roctracer_op_string(ACTIVITY_DOMAIN_HIP_API, op_id, 0); + DEBUG_TRACE("HIP_SyncApiDataCallback(\"%s\") phase(%d): op(%u) record(%p) data(%p) pool(%p) depth(%d) correlation_id(%lu) time_ns(%lu)\n", + name, phase, op_id, record, data, pool, (int)(record_pair_stack->size()), (data_ptr) ? data_ptr->correlation_id : 0, timer.timestamp_ns()); + + return ret; } void* HIP_SyncActivityCallback( @@ -394,7 +411,10 @@ void* HIP_SyncActivityCallback( void* arg) { static hsa_rt_utils::Timer timer; + const timestamp_t timestamp_ns = timer.timestamp_ns(); + if (record_pair_stack == NULL) record_pair_stack = new record_pair_stack_t; + void* ret = NULL; const hip_api_data_t* data = reinterpret_cast(callback_data); hip_api_data_t* data_ptr = const_cast(data); MemoryPool* pool = reinterpret_cast(arg); @@ -411,8 +431,8 @@ void* HIP_SyncActivityCallback( // Allocating a record if NULL passed if (record == NULL) { if (data != NULL) EXC_ABORT(ROCTRACER_STATUS_ERROR, "ActivityCallback enter: record is NULL"); - record_pair_stack.push({}); - auto& top = record_pair_stack.top(); + record_pair_stack->push({}); + auto& top = record_pair_stack->top(); record = &(top.record); data = &(top.data.hip); data_ptr = const_cast(data); @@ -423,7 +443,7 @@ void* HIP_SyncActivityCallback( // Filing record info record->domain = ACTIVITY_DOMAIN_HIP_API; record->op = op_id; - record->begin_ns = timer.timestamp_ns(); + record->begin_ns = timestamp_ns; // Correlation ID generating uint64_t correlation_id = data->correlation_id; @@ -436,19 +456,19 @@ void* HIP_SyncActivityCallback( // Passing correlatin ID correlation_id_tls = correlation_id; - return data_ptr; + ret = data_ptr; } else { if (pool == NULL) EXC_ABORT(ROCTRACER_STATUS_ERROR, "ActivityCallback exit: pool is NULL"); // Getting record of stacked if (record == NULL) { - if (record_pair_stack.empty()) EXC_ABORT(ROCTRACER_STATUS_ERROR, "ActivityCallback exit: record stack is empty"); - auto& top = record_pair_stack.top(); + if (record_pair_stack->empty()) EXC_ABORT(ROCTRACER_STATUS_ERROR, "ActivityCallback exit: record stack is empty"); + auto& top = record_pair_stack->top(); record = &(top.record); } // Filing record info - record->end_ns = timer.timestamp_ns(); + record->end_ns = timestamp_ns; record->process_id = syscall(__NR_getpid); record->thread_id = syscall(__NR_gettid); @@ -465,13 +485,17 @@ void* HIP_SyncActivityCallback( pool->Write(*record); // popping the record entry - if (!record_pair_stack.empty()) record_pair_stack.pop(); + if (!record_pair_stack->empty()) record_pair_stack->pop(); // Clearing correlatin ID correlation_id_tls = 0; - - return NULL; } + + const char * name = roctracer_op_string(ACTIVITY_DOMAIN_HIP_API, op_id, 0); + DEBUG_TRACE("HIP_SyncActivityCallback(\"%s\") phase(%d): op(%u) record(%p) data(%p) pool(%p) depth(%d) correlation_id(%lu) beg_ns(%lu) end_ns(%lu)\n", + name, phase, op_id, record, data, pool, (int)(record_pair_stack->size()), (data_ptr) ? data_ptr->correlation_id : 0, timestamp_ns); + + return ret; } void HCC_ActivityIdCallback(activity_correlation_id_t correlation_id) { @@ -484,6 +508,10 @@ void HCC_AsyncActivityCallback(uint32_t op_id, void* record, void* arg) { record_ptr->domain = ACTIVITY_DOMAIN_HCC_OPS; record_ptr->correlation_id = CorrelationIdLookup(record_ptr->correlation_id); pool->Write(*record_ptr); + + const char * name = roctracer_op_string(ACTIVITY_DOMAIN_HCC_OPS, record_ptr->op, record_ptr->kind); + DEBUG_TRACE("HCC_AsyncActivityCallback(\"%s\"): op(%u) kind(%u) record(%p) pool(%p) correlation_id(%d) beg_ns(%lu) end_ns(%lu)\n", + name, record_ptr->op, record_ptr->kind, record, pool, record_ptr->correlation_id, record_ptr->begin_ns, record_ptr->end_ns); } // Open output file @@ -567,7 +595,7 @@ hsa_status_t hsa_amd_memory_async_copy_interceptor( { hsa_status_t status = HSA_STATUS_SUCCESS; if (hsa_support::async_copy_callback_enabled) { - trace_entry_t* entry = trace_buffer.GetEntry(); + trace_entry_t* entry = trace_buffer->GetEntry(); ::proxy::Tracker::Enable(COPY_ENTRY_TYPE, hsa_agent_t{}, completion_signal, entry); status = hsa_amd_memory_async_copy_fn(dst, dst_agent, src, src_agent, size, num_dep_signals, @@ -591,7 +619,7 @@ hsa_status_t hsa_amd_memory_async_copy_rect_interceptor( { hsa_status_t status = HSA_STATUS_SUCCESS; if (hsa_support::async_copy_callback_enabled) { - trace_entry_t* entry = trace_buffer.GetEntry(); + trace_entry_t* entry = trace_buffer->GetEntry(); ::proxy::Tracker::Enable(COPY_ENTRY_TYPE, hsa_agent_t{}, completion_signal, entry); status = hsa_amd_memory_async_copy_rect_fn(dst, dst_offset, src, src_offset, range, copy_agent, @@ -665,6 +693,8 @@ PUBLIC_API const char* roctracer_op_string( switch (domain) { case ACTIVITY_DOMAIN_HSA_API: return roctracer::hsa_support::GetApiName(op); + case ACTIVITY_DOMAIN_HSA_EVT: + return roctracer::RocpLoader::Instance().GetEvtName(op); case ACTIVITY_DOMAIN_HSA_OPS: return roctracer::RocpLoader::Instance().GetOpName(op); case ACTIVITY_DOMAIN_HCC_OPS: @@ -673,6 +703,8 @@ PUBLIC_API const char* roctracer_op_string( return roctracer::HipLoader::Instance().ApiName(op); case ACTIVITY_DOMAIN_KFD_API: return roctracer::kfd_support::GetApiName(op); + case ACTIVITY_DOMAIN_EXT_API: + return "EXT_API"; default: EXC_RAISING(ROCTRACER_STATUS_BAD_DOMAIN, "invalid domain ID(" << domain << ")"); } @@ -690,11 +722,25 @@ PUBLIC_API roctracer_status_t roctracer_op_code( switch (domain) { case ACTIVITY_DOMAIN_HSA_API: { *op = roctracer::hsa_support::GetApiCode(str); + if (*op == HSA_API_ID_NUMBER) { + EXC_RAISING(ROCTRACER_STATUS_BAD_PARAMETER, "Invalid API name \"" << str << "\", domain ID(" << domain << ")"); + } if (kind != NULL) *kind = 0; break; } case ACTIVITY_DOMAIN_KFD_API: { *op = roctracer::kfd_support::GetApiCode(str); + if (*op == KFD_API_ID_NUMBER) { + EXC_RAISING(ROCTRACER_STATUS_BAD_PARAMETER, "Invalid API name \"" << str << "\", domain ID(" << domain << ")"); + } + if (kind != NULL) *kind = 0; + break; + } + case ACTIVITY_DOMAIN_HIP_API: { + *op = hipApiIdByName(str); + if (*op == HIP_API_ID_NUMBER) { + EXC_RAISING(ROCTRACER_STATUS_BAD_PARAMETER, "Invalid API name \"" << str << "\", domain ID(" << domain << ")"); + } if (kind != NULL) *kind = 0; break; } @@ -708,6 +754,7 @@ static inline uint32_t get_op_num(const uint32_t& domain) { switch (domain) { case ACTIVITY_DOMAIN_HSA_OPS: return HSA_OP_ID_NUMBER; case ACTIVITY_DOMAIN_HSA_API: return HSA_API_ID_NUMBER; + case ACTIVITY_DOMAIN_HSA_EVT: return HSA_EVT_ID_NUMBER; case ACTIVITY_DOMAIN_HCC_OPS: return HIP_OP_ID_NUMBER; case ACTIVITY_DOMAIN_HIP_API: return HIP_API_ID_NUMBER; case ACTIVITY_DOMAIN_KFD_API: return KFD_API_ID_NUMBER; @@ -737,13 +784,18 @@ static roctracer_status_t roctracer_enable_callback_fun( #if 0 if (op == HSA_API_ID_DISPATCH) { const bool succ = roctracer::RocpLoader::Instance().RegisterApiCallback(op, (void*)callback, user_data); - if (succ == false) HCC_EXC_RAISING(ROCTRACER_STATUS_HSA_ERR, "HSA::EnableActivityCallback error(" << op << ") failed"); + if (succ == false) HCC_EXC_RAISING(ROCTRACER_STATUS_HSA_ERR, "HSA::RegisterApiCallback error(" << op << ") failed"); break; } #endif roctracer::hsa_support::cb_table.set(op, callback, user_data); break; } + case ACTIVITY_DOMAIN_HSA_EVT: { + const bool succ = roctracer::RocpLoader::Instance().RegisterEvtCallback(op, (void*)callback, user_data); + if (succ == false) HCC_EXC_RAISING(ROCTRACER_STATUS_HSA_ERR, "HSA::RegisterEvtCallback error(" << op << ") failed"); + break; + } case ACTIVITY_DOMAIN_HCC_OPS: break; case ACTIVITY_DOMAIN_HIP_API: { if (roctracer::HipLoader::Instance().Enabled() == false) break; @@ -852,6 +904,11 @@ static roctracer_status_t roctracer_disable_callback_fun( } break; } + case ACTIVITY_DOMAIN_HSA_EVT: { + const bool succ = roctracer::RocpLoader::Instance().RemoveEvtCallback(op); + if (succ == false) HCC_EXC_RAISING(ROCTRACER_STATUS_HSA_ERR, "HSA::RemoveEvtCallback error(" << op << ") failed"); + break; + } case ACTIVITY_DOMAIN_ROCTX: { if (roctracer::RocTxLoader::Instance().Enabled()) { const bool suc = roctracer::RocTxLoader::Instance().RemoveApiCallback(op); @@ -961,6 +1018,7 @@ static roctracer_status_t roctracer_enable_activity_fun( break; } case ACTIVITY_DOMAIN_HSA_API: break; + case ACTIVITY_DOMAIN_HSA_EVT: break; case ACTIVITY_DOMAIN_KFD_API: break; case ACTIVITY_DOMAIN_HCC_OPS: { const bool init_phase = (roctracer::HccLoader::GetRef() == NULL); @@ -1057,6 +1115,7 @@ static roctracer_status_t roctracer_disable_activity_fun( break; } case ACTIVITY_DOMAIN_HSA_API: break; + case ACTIVITY_DOMAIN_HSA_EVT: break; case ACTIVITY_DOMAIN_KFD_API: break; case ACTIVITY_DOMAIN_HCC_OPS: { if (roctracer::HccLoader::Instance().Enabled() == false) break; @@ -1126,7 +1185,7 @@ PUBLIC_API roctracer_status_t roctracer_flush_activity_expl(roctracer_pool_t* po API_METHOD_PREFIX if (pool == NULL) pool = roctracer_default_pool(); roctracer::MemoryPool* memory_pool = reinterpret_cast(pool); - memory_pool->Flush(); + if (memory_pool != NULL) memory_pool->Flush(); roctracer::TraceBufferBase::FlushAll(); API_METHOD_SUFFIX } @@ -1227,6 +1286,9 @@ PUBLIC_API roctracer_status_t roctracer_set_properties( roctracer::kfd_support::intercept_KFDApiTable(); break; } + case ACTIVITY_DOMAIN_HSA_EVT: { + break; + } case ACTIVITY_DOMAIN_HSA_API: { // HSA API properties HsaApiTable* table = reinterpret_cast(properties); @@ -1289,13 +1351,14 @@ PUBLIC_API void roctracer_unload() { PUBLIC_API void roctracer_flush_buf() { ONLOAD_TRACE_BEG(); - roctracer::trace_buffer.Flush(); + roctracer::trace_buffer->Flush(); ONLOAD_TRACE_END(); } CONSTRUCTOR_API void constructor() { ONLOAD_TRACE_BEG(); roctracer::util::Logger::Create(); + roctracer::trace_buffer = new roctracer::TraceBuffer("HSA GPU", 0x200000, roctracer::trace_buffer_prm, 2); roctracer_load(); ONLOAD_TRACE_END(); } diff --git a/src/core/trace_buffer.h b/src/core/trace_buffer.h index cd62dda7..cb6767f2 100644 --- a/src/core/trace_buffer.h +++ b/src/core/trace_buffer.h @@ -36,15 +36,17 @@ enum { TRACE_ENTRY_COMPL = 2 }; -enum { - API_ENTRY_TYPE, - COPY_ENTRY_TYPE, - KERNEL_ENTRY_TYPE +enum entry_type_t { + DFLT_ENTRY_TYPE = 0, + API_ENTRY_TYPE = 1, + COPY_ENTRY_TYPE = 2, + KERNEL_ENTRY_TYPE = 3, + NUM_ENTRY_TYPE = 4 }; struct trace_entry_t { std::atomic valid; - uint32_t type; + entry_type_t type; uint64_t dispatch; uint64_t begin; // kernel begin timestamp, ns uint64_t end; // kernel end timestamp, ns @@ -67,14 +69,26 @@ struct trace_entry_t { template struct push_element_fun { T* const elem_; - void fun(T* node) { if (node->next_elem_ == NULL) node->next_elem_ = elem_; } - push_element_fun(T* elem) : elem_(elem) {} + T** prev_; + bool fun(T* node) { + if (node->priority_ > elem_->priority_) { + *prev_ = elem_; + elem_->next_elem_ = node; + } else if (node->next_elem_ == NULL) { + node->next_elem_ = elem_; + } else { + prev_ = &(node->next_elem_); + return false; + } + return true; + } + push_element_fun(T* elem, T** prev) : elem_(elem), prev_(prev) {} }; template struct call_element_fun { void (T::*fptr_)(); - void fun(T* node) { (node->*fptr_)(); } + bool fun(T* node) const { (node->*fptr_)(); return false; } call_element_fun(void (T::*f)()) : fptr_(f) {} }; @@ -89,10 +103,10 @@ struct TraceBufferBase { static void Push(TraceBufferBase* elem) { if (head_elem_ == NULL) head_elem_ = elem; - else foreach(push_element_fun(elem)); + else foreach(push_element_fun(elem, &head_elem_)); } - TraceBufferBase() : next_elem_(NULL) {} + TraceBufferBase(const uint32_t& prior) : priority_(prior), next_elem_(NULL) {} template static void foreach(const F& f_in) { @@ -101,11 +115,12 @@ struct TraceBufferBase { TraceBufferBase* p = head_elem_; while (p != NULL) { TraceBufferBase* next = p->next_elem_; - f.fun(p); + if (f.fun(p) == true) break; p = next; } } + const uint32_t priority_; TraceBufferBase* next_elem_; static TraceBufferBase* head_elem_; static mutex_t mutex_; @@ -118,26 +133,34 @@ class TraceBuffer : protected TraceBufferBase { typedef TraceBuffer Obj; typedef uint64_t pointer_t; typedef std::recursive_mutex mutex_t; + typedef typename std::list buf_list_t; + typedef typename buf_list_t::iterator buf_list_it_t; struct flush_prm_t { - uint32_t type; + entry_type_t type; callback_t fun; }; - TraceBuffer(const char* name, uint32_t size, flush_prm_t* flush_prm_arr, uint32_t flush_prm_count) : - is_flushed_(false), + TraceBuffer(const char* name, uint32_t size, const flush_prm_t* flush_prm_arr, uint32_t flush_prm_count, uint32_t prior = 0) : + TraceBufferBase(prior), + size_(size), work_thread_started_(false) { name_ = strdup(name); - size_ = size; data_ = allocate_fun(); next_ = allocate_fun(); read_pointer_ = 0; + write_pointer_ = 0; end_pointer_ = size; buf_list_.push_back(data_); - flush_prm_arr_ = flush_prm_arr; - flush_prm_count_ = flush_prm_count; + memset(f_array_, 0, sizeof(f_array_)); + for (const flush_prm_t* prm = flush_prm_arr; prm < flush_prm_arr + flush_prm_count; prm++) { + const entry_type_t type = prm->type; + if (type >= NUM_ENTRY_TYPE) FATAL("out of f_array bounds (" << type << ")"); + if (f_array_[type] != NULL) FATAL("handler function ptr redefinition (" << type << ")"); + f_array_[type] = prm->fun; + } TraceBufferBase::Push(this); } @@ -169,10 +192,13 @@ class TraceBuffer : protected TraceBufferBase { } Entry* GetEntry() { - const pointer_t pointer = read_pointer_.fetch_add(1); + const pointer_t pointer = write_pointer_.fetch_add(1); if (pointer >= end_pointer_) wrap_buffer(pointer); if (pointer >= end_pointer_) FATAL("pointer >= end_pointer_ after buffer wrap"); - return data_ + (pointer + size_ - end_pointer_); + Entry* entry = data_ + (size_ + pointer - end_pointer_); + entry->valid = TRACE_ENTRY_INV; + entry->type = DFLT_ENTRY_TYPE; + return entry; } void Flush() { flush_buf(); } @@ -180,31 +206,38 @@ class TraceBuffer : protected TraceBufferBase { private: void flush_buf() { std::lock_guard lck(mutex_); - const bool is_flushed = is_flushed_.exchange(true, std::memory_order_acquire); - - if (is_flushed == false) { - for (flush_prm_t* prm = flush_prm_arr_; prm < flush_prm_arr_ + flush_prm_count_; prm++) { - // Flushed entries type - uint32_t type = prm->type; - // Flushing function - callback_t fun = prm->fun; - if (fun == NULL) FATAL("flush function is not set"); - - pointer_t pointer = 0; - for (Entry* ptr : buf_list_) { - Entry* end = ptr + size_; - while ((ptr < end) && (pointer < read_pointer_)) { - if (ptr->type == type) { - if (ptr->valid == TRACE_ENTRY_COMPL) { - fun(ptr); - } - } - ptr++; - pointer++; - } - } + + pointer_t pointer = read_pointer_; + pointer_t curr_pointer = write_pointer_.load(std::memory_order_relaxed); + buf_list_it_t it = buf_list_.begin(); + buf_list_it_t end_it = buf_list_.end(); + while(it != end_it) { + Entry* buf = *it; + Entry* ptr = buf + (pointer % size_); + Entry* end_ptr = buf + size_; + while ((ptr < end_ptr) && (pointer < curr_pointer)) { + if (ptr->valid != TRACE_ENTRY_COMPL) break; + + entry_type_t type = ptr->type; + if (type >= NUM_ENTRY_TYPE) FATAL("out of f_array bounds (" << type << ")"); + callback_t f_ptr = f_array_[type]; + if (f_ptr == NULL) FATAL("f_ptr == NULL"); + (*f_ptr)(ptr); + + ptr++; + pointer++; } + + buf_list_it_t prev = it; + it++; + if (ptr == end_ptr) { + free_fun(*prev); + buf_list_.erase(prev); + } + if (pointer == curr_pointer) break; } + + read_pointer_ = pointer; } inline Entry* allocate_fun() { @@ -214,6 +247,10 @@ class TraceBuffer : protected TraceBufferBase { return ptr; } + inline void free_fun(void* ptr) { + free(ptr); + } + static void* allocate_worker(void* arg) { Obj* obj = (Obj*)arg; @@ -246,16 +283,14 @@ class TraceBuffer : protected TraceBufferBase { } const char* name_; - uint32_t size_; + const uint32_t size_; Entry* data_; Entry* next_; - volatile std::atomic read_pointer_; + pointer_t read_pointer_; + volatile std::atomic write_pointer_; volatile std::atomic end_pointer_; - std::list buf_list_; - - flush_prm_t* flush_prm_arr_; - uint32_t flush_prm_count_; - volatile std::atomic is_flushed_; + buf_list_t buf_list_; + callback_t f_array_[NUM_ENTRY_TYPE]; pthread_t work_thread_; pthread_mutex_t work_mutex_; diff --git a/src/kfd/.gitignore b/src/kfd/.gitignore deleted file mode 100644 index 0c2acea7..00000000 --- a/src/kfd/.gitignore +++ /dev/null @@ -1 +0,0 @@ -kfd_wrapper.cpp diff --git a/src/proxy/intercept_queue.h b/src/proxy/intercept_queue.h index f92f1ce6..000c7e88 100644 --- a/src/proxy/intercept_queue.h +++ b/src/proxy/intercept_queue.h @@ -39,7 +39,7 @@ THE SOFTWARE. #include "util/hsa_rsrc_factory.h" #include "util/exception.h" -namespace roctracer { extern TraceBuffer trace_buffer; } +namespace roctracer { extern TraceBuffer* trace_buffer; } namespace rocprofiler { extern decltype(hsa_queue_create)* hsa_queue_create_fn; @@ -160,7 +160,7 @@ class InterceptQueue { const char* kernel_name = GetKernelName(kernel_symbol); // Adding kernel timing tracker - ::proxy::Tracker::entry_t* entry = roctracer::trace_buffer.GetEntry(); + ::proxy::Tracker::entry_t* entry = roctracer::trace_buffer->GetEntry(); entry->kernel.tid = syscall(__NR_gettid); entry->kernel.name = kernel_name; ::proxy::Tracker::Enable(roctracer::KERNEL_ENTRY_TYPE, obj->agent_info_->dev_id, completion_signal, entry); diff --git a/src/proxy/tracker.h b/src/proxy/tracker.h index edb223b0..dc0322bd 100644 --- a/src/proxy/tracker.h +++ b/src/proxy/tracker.h @@ -40,9 +40,10 @@ class Tracker { public: typedef util::HsaRsrcFactory::timestamp_t timestamp_t; typedef roctracer::trace_entry_t entry_t; + typedef roctracer::entry_type_t entry_type_t; // Add tracker entry - inline static void Enable(uint32_t type, const hsa_agent_t& agent, const hsa_signal_t& signal, entry_t* entry) { + inline static void Enable(entry_type_t type, const hsa_agent_t& agent, const hsa_signal_t& signal, entry_t* entry) { hsa_status_t status = HSA_STATUS_ERROR; util::HsaRsrcFactory* hsa_rsrc = &(util::HsaRsrcFactory::Instance()); @@ -88,13 +89,16 @@ class Tracker { } entry->complete = hsa_rsrc->TimestampNs(); + hsa_signal_t orig = entry->orig; + hsa_signal_t signal = entry->signal; + + // Releasing completed entry entry->valid.store(roctracer::TRACE_ENTRY_COMPL, std::memory_order_release); // Original intercepted signal completion - hsa_signal_t orig = entry->orig; if (orig.handle) { amd_signal_t* orig_signal_ptr = reinterpret_cast(orig.handle); - amd_signal_t* prof_signal_ptr = reinterpret_cast(entry->signal.handle); + amd_signal_t* prof_signal_ptr = reinterpret_cast(signal.handle); orig_signal_ptr->start_ts = prof_signal_ptr->start_ts; orig_signal_ptr->end_ts = prof_signal_ptr->end_ts; @@ -102,7 +106,7 @@ class Tracker { if (signal_value != new_value) EXC_ABORT(HSA_STATUS_ERROR, "Tracker::Complete bad signal value"); hsa_signal_store_screlease(orig, signal_value); } - hsa_signal_destroy(entry->signal); + hsa_signal_destroy(signal); } // Handler for packet completion @@ -113,7 +117,6 @@ class Tracker { // Complete entry Tracker::Complete(signal_value, entry); - return false; } }; diff --git a/src/util/hsa_rsrc_factory.cpp b/src/util/hsa_rsrc_factory.cpp index e1ef9268..cf172cae 100644 --- a/src/util/hsa_rsrc_factory.cpp +++ b/src/util/hsa_rsrc_factory.cpp @@ -742,6 +742,19 @@ hsa_status_t HsaRsrcFactory::hsa_executable_freeze_interceptor(hsa_executable_t return hsa_api_.hsa_executable_freeze(executable, options);; } +void HsaRsrcFactory::DumpHandles(FILE* file) { + auto beg = agent_map_.begin(); + auto end = agent_map_.end(); + for (auto it = beg; it != end; ++it) { + const AgentInfo* agent_info = it->second; + fprintf(file, "0x%lx agent %s\n", agent_info->dev_id.handle, (agent_info->dev_type == HSA_DEVICE_TYPE_CPU) ? "cpu" : "gpu"); + if (agent_info->cpu_pool.handle != 0) fprintf(file, "0x%lx pool cpu\n", agent_info->cpu_pool.handle); + if (agent_info->kern_arg_pool.handle != 0) fprintf(file, "0x%lx pool cpu kernarg\n", agent_info->kern_arg_pool.handle); + if (agent_info->gpu_pool.handle != 0) fprintf(file, "0x%lx pool gpu\n", agent_info->gpu_pool.handle); + } + fflush(file); +} + std::atomic HsaRsrcFactory::instance_{}; HsaRsrcFactory::mutex_t HsaRsrcFactory::mutex_; HsaRsrcFactory::timestamp_t HsaRsrcFactory::timeout_ns_ = HsaTimer::TIMESTAMP_MAX; diff --git a/src/util/hsa_rsrc_factory.h b/src/util/hsa_rsrc_factory.h index 466ccf1f..3bfeda68 100644 --- a/src/util/hsa_rsrc_factory.h +++ b/src/util/hsa_rsrc_factory.h @@ -439,6 +439,8 @@ class HsaRsrcFactory { return HSA_STATUS_SUCCESS; } + void DumpHandles(FILE* output_file); + private: // System agents iterating callback static hsa_status_t GetHsaAgentsCallback(hsa_agent_t agent, void* data); diff --git a/src/util/logger.h b/src/util/logger.h index cd8dd470..8e525f68 100644 --- a/src/util/logger.h +++ b/src/util/logger.h @@ -100,10 +100,10 @@ class Logger { return *obj; } - private: static uint32_t GetPid() { return syscall(__NR_getpid); } static uint32_t GetTid() { return syscall(__NR_gettid); } + private: Logger() : file_(NULL), dirty_(false), streaming_(false), messaging_(false) { const char* path = getenv("ROCTRACER_LOG"); if (path != NULL) { @@ -198,4 +198,20 @@ class Logger { } while(0) #endif +#if DEBUG_TRACE_ON +inline static void DEBUG_TRACE(const char* fmt, ...) { + constexpr int size = 256; + char buf[size]; + + va_list valist; + va_start(valist, fmt); + vsnprintf(buf, size, fmt, valist); + printf("%u:%u %s", + roctracer::util::Logger::GetPid(), roctracer::util::Logger::GetTid(), buf); fflush(stdout); + va_end(valist); +} +#else +inline static void DEBUG_TRACE(const char* fmt, ...) {} +#endif + #endif // SRC_UTIL_LOGGER_H_ diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 3e3b9654..e07e7d8b 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -44,12 +44,14 @@ endif () ## Path to HSA test set ( HSA_TEST_DIR "${TEST_DIR}/hsa/test" ) -set ( HSA_REV "5b47aae" ) +set ( HSA_REV "a4fcdae" ) ## test run script set ( RUN_SCRIPT "${TEST_DIR}/run.sh" ) ## build HIP tests +set ( GEN_INC_DIR ${PROJECT_BINARY_DIR}/inc ) +set ( INC_PATH "${INC_PATH} ${GEN_INC_DIR}" ) set ( TEST_ENV HIP_VDI=${HIP_VDI} ROCM_PATH=${ROCM_ROOT_DIR} HSA_PATH=${ROCM_ROOT_DIR}/hsa INC_PATH=${INC_PATH} LIB_PATH=${LIB_PATH} HIPCC_VERBOSE=3 ) add_custom_target( mytest COMMAND ${TEST_ENV} make -C "${TEST_DIR}/MatrixTranspose" @@ -67,6 +69,14 @@ add_custom_target( mytest COMMAND sh -xc "cp ${TEST_DIR}/golden_traces/tests_trace_cmp_levels.txt ${PROJECT_BINARY_DIR}/test/" ) +## Build HSA test +execute_process ( COMMAND sh -xc "if [ ! -e ${TEST_DIR}/hsa ] ; then git clone https://github.com/ROCmSoftwarePlatform/hsa-class.git ${TEST_DIR}/hsa; fi" ) +execute_process ( COMMAND sh -xc "if [ -e ${TEST_DIR}/hsa ] ; then cd ${TEST_DIR}/hsa && git fetch origin && git checkout ${HSA_REV}; fi" ) +set ( TMP ${TEST_DIR} ) +set ( TEST_DIR ${HSA_TEST_DIR} ) +add_subdirectory ( ${HSA_TEST_DIR} ${PROJECT_BINARY_DIR}/test/hsa ) +set ( TEST_DIR ${TMP} ) + ## Util sources file( GLOB UTIL_SRC "${HSA_TEST_DIR}/util/*.cpp" ) @@ -75,15 +85,23 @@ if ( DEFINED ROCTRACER_TARGET ) set ( TEST_LIB "tracer_tool" ) set ( TEST_LIB_SRC ${TEST_DIR}/tool/tracer_tool.cpp ${UTIL_SRC} ) add_library ( ${TEST_LIB} SHARED ${TEST_LIB_SRC} ) - target_include_directories ( ${TEST_LIB} PRIVATE ${HSA_TEST_DIR} ${ROOT_DIR} ${ROOT_DIR}/inc ${HSA_RUNTIME_INC_PATH} ${HSA_RUNTIME_HSA_INC_PATH} ${HIP_INC_DIR} ${HCC_INC_DIR} ${HSA_KMT_INC_PATH} ) + target_include_directories ( ${TEST_LIB} PRIVATE ${HSA_TEST_DIR} ${ROOT_DIR} ${ROOT_DIR}/inc ${HSA_RUNTIME_INC_PATH} ${HIP_INC_DIR} ${HSA_KMT_INC_PATH} ${GEN_INC_DIR} ) target_link_libraries ( ${TEST_LIB} ${ROCTRACER_TARGET} ${HSA_RUNTIME_LIB} c stdc++ dl pthread rt ) endif () -## Build HSA test -execute_process ( COMMAND sh -xc "if [ ! -e ${TEST_DIR}/hsa ] ; then git clone https://github.com/ROCmSoftwarePlatform/hsa-class.git ${TEST_DIR}/hsa; fi" ) -execute_process ( COMMAND sh -xc "if [ -e ${TEST_DIR}/hsa ] ; then cd ${TEST_DIR}/hsa && git fetch origin && git checkout ${HSA_REV}; fi" ) -set ( TEST_DIR ${HSA_TEST_DIR} ) -add_subdirectory ( ${TEST_DIR} ${PROJECT_BINARY_DIR}/test/hsa ) +## Build hsaco_test.cpp referenc test +set ( CO_LIB_NAME "hsaco_test" ) +set ( CO_LIB_SRC ${TEST_DIR}/app/hsaco_test.cpp ) +add_library ( ${CO_LIB_NAME} SHARED ${CO_LIB_SRC} ) +target_include_directories ( ${CO_LIB_NAME} PRIVATE ${HSA_RUNTIME_INC_PATH} ) +target_link_libraries ( ${CO_LIB_NAME} ${HSA_RUNTIME_LIB} c stdc++ ) + +## Build codeobj event test +set ( CO_LIB_NAME "codeobj_test" ) +set ( CO_LIB_SRC ${TEST_DIR}/app/codeobj_test.cpp ) +add_library ( ${CO_LIB_NAME} SHARED ${CO_LIB_SRC} ) +target_include_directories ( ${CO_LIB_NAME} PRIVATE ${TEST_DIR} ${ROOT_DIR} ${ROOT_DIR}/inc ${GEN_INC_DIR} ${HSA_RUNTIME_INC_PATH} ${ROCM_INC_PATH} ) +target_link_libraries ( ${CO_LIB_NAME} ${ROCTRACER_TARGET} c stdc++ ) ## copying run script execute_process ( COMMAND sh -xc "cp ${RUN_SCRIPT} ${PROJECT_BINARY_DIR}" ) diff --git a/test/MatrixTranspose/Makefile b/test/MatrixTranspose/Makefile index 647067dd..9a805fb1 100644 --- a/test/MatrixTranspose/Makefile +++ b/test/MatrixTranspose/Makefile @@ -23,7 +23,7 @@ EXECUTABLE=./MatrixTranspose all: clean $(EXECUTABLE) -CXXFLAGS =-g -I$(INC_PATH) -DLOCAL_BUILD=1 --rocm-path=$(ROCM_PATH) +CXXFLAGS =-g $(INC_PATH:%=-I%) -DLOCAL_BUILD=1 --rocm-path=$(ROCM_PATH) CXX=$(HIPCC) $(EXECUTABLE): $(OBJECTS) diff --git a/test/MatrixTranspose_test/Makefile b/test/MatrixTranspose_test/Makefile index 3e879ee8..758f8d94 100644 --- a/test/MatrixTranspose_test/Makefile +++ b/test/MatrixTranspose_test/Makefile @@ -17,7 +17,7 @@ TARGET=hcc EXECUTABLE=./MatrixTranspose OBJECTS = MatrixTranspose.o -FLAGS =-g -I$(INC_PATH) -I$(ROCM_PATH)/hsa/include/hsa -I$(ROCM_PATH)/hsa/include -I$(ROCM_PATH)/hip/include -I$(ROCM_PATH)/include -DLOCAL_BUILD=1 -DHIP_VDI=${HIP_VDI} -DITERATIONS=$(ITERATIONS) -DAMD_INTERNAL_BUILD=1 +FLAGS =-g $(INC_PATH:%=-I%) -I$(ROCM_PATH)/hsa/include/hsa -I$(ROCM_PATH)/hsa/include -I$(ROCM_PATH)/hip/include -I$(ROCM_PATH)/include -DLOCAL_BUILD=1 -DHIP_VDI=${HIP_VDI} -DITERATIONS=$(ITERATIONS) -DAMD_INTERNAL_BUILD=1 ifeq ($(C_TEST), 1) COMP=${CC} diff --git a/test/app/codeobj_test.cpp b/test/app/codeobj_test.cpp new file mode 100644 index 00000000..124715cd --- /dev/null +++ b/test/app/codeobj_test.cpp @@ -0,0 +1,89 @@ +/****************************************************************************** +Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*******************************************************************************/ + +#include +#include +#include + +#include "inc/roctracer.h" +#include "inc/roctracer_hsa.h" +#include + +#define PUBLIC_API __attribute__((visibility("default"))) +#define CONSTRUCTOR_API __attribute__((constructor)) +#define DESTRUCTOR_API __attribute__((destructor)) + +// Check returned HSA API status +void check_status(roctracer_status_t status) { + if (status != ROCTRACER_STATUS_SUCCESS) { + const char* error_string = roctracer_error_string(); + fprintf(stderr, "ERROR: %s\n", error_string); + abort(); + } +} + +// codeobj callback +void codeobj_callback(uint32_t domain, uint32_t cid, const void* data, void* arg) { + const hsa_evt_data_t* evt_data = reinterpret_cast(data); + const char* uri = evt_data->codeobj.uri; + printf("codeobj_callback domain(%u) cid(%u): load_base(0x%lx) load_size(0x%lx) load_delta(0x%lx) uri(\"%s\")\n", + domain, + cid, + evt_data->codeobj.load_base, + evt_data->codeobj.load_size, + evt_data->codeobj.load_delta, + uri); + free((void*)uri); + fflush(stdout); +} + +void initialize() { + roctracer_status_t status = roctracer_enable_op_callback(ACTIVITY_DOMAIN_HSA_EVT, HSA_EVT_ID_CODEOBJ, codeobj_callback, NULL); + check_status(status); +} + +void cleanup() { + roctracer_status_t status = roctracer_disable_domain_callback(ACTIVITY_DOMAIN_HSA_EVT); + check_status(status); +} + +// Tool constructor +extern "C" PUBLIC_API void OnLoadToolProp(rocprofiler_settings_t* settings) { + // Enable HSA events intercepting + settings->hsa_intercepting = 1; + // Initialize profiling + initialize(); +} + +// Tool destructor +extern "C" PUBLIC_API void OnUnloadTool() { + // Final resources cleanup + cleanup(); +} + +extern "C" CONSTRUCTOR_API void constructor() { + printf("constructor\n"); fflush(stdout); +} + +extern "C" DESTRUCTOR_API void destructor() { + OnUnloadTool(); +} diff --git a/test/app/hsaco_test.cpp b/test/app/hsaco_test.cpp new file mode 100644 index 00000000..23200137 --- /dev/null +++ b/test/app/hsaco_test.cpp @@ -0,0 +1,142 @@ +/****************************************************************************** +Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*******************************************************************************/ + +#include +#include +#include +#include +#include + +#define PUBLIC_API __attribute__((visibility("default"))) +#define CONSTRUCTOR_API __attribute__((constructor)) +#define DESTRUCTOR_API __attribute__((destructor)) + +#define HSA_RT(call) \ + do { \ + const hsa_status_t status = call; \ + if (status != HSA_STATUS_SUCCESS) { \ + printf("error \"%s\"\n", #call); fflush(stdout); \ + abort(); \ + } \ + } while(0) + +// HSA API intercepting primitives +decltype(hsa_executable_freeze)* hsa_executable_freeze_fn; +hsa_ven_amd_loader_1_01_pfn_t loader_api_table{}; + +hsa_status_t code_object_callback( + hsa_executable_t executable, + hsa_loaded_code_object_t loaded_code_object, + void* arg) +{ + printf("code_object_callback\n"); fflush(stdout); + + uint64_t load_base = 0; + uint64_t load_size = 0; + uint64_t load_delta = 0; + uint32_t uri_len = 0; + char* uri_str = NULL; + + HSA_RT(loader_api_table.hsa_ven_amd_loader_loaded_code_object_get_info( + loaded_code_object, + HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_LOAD_BASE, + &load_base)); + HSA_RT(loader_api_table.hsa_ven_amd_loader_loaded_code_object_get_info( + loaded_code_object, + HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_LOAD_SIZE, + &load_size)); + HSA_RT(loader_api_table.hsa_ven_amd_loader_loaded_code_object_get_info( + loaded_code_object, + HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_LOAD_DELTA, + &load_delta)); + HSA_RT(loader_api_table.hsa_ven_amd_loader_loaded_code_object_get_info( + loaded_code_object, + HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_URI_LENGTH, + &uri_len)); + + uri_str = (char*)calloc(uri_len + 1, sizeof(char)); + if (!uri_str) { + perror("calloc"); + abort(); + } + + HSA_RT(loader_api_table.hsa_ven_amd_loader_loaded_code_object_get_info( + loaded_code_object, + HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_URI, + uri_str)); + + printf("load_base(0x%lx)\n", load_base); fflush(stdout); + printf("load_size(0x%lx)\n", load_size); fflush(stdout); + printf("load_delta(0x%lx)\n", load_delta); fflush(stdout); + printf("uri_len(%u)\n", uri_len); fflush(stdout); + printf("uri_str(\"%s\")\n", uri_str); fflush(stdout); + + free(uri_str); + + return HSA_STATUS_SUCCESS; +} + +hsa_status_t hsa_executable_freeze_interceptor( + hsa_executable_t executable, + const char *options) +{ + HSA_RT(loader_api_table.hsa_ven_amd_loader_executable_iterate_loaded_code_objects( + executable, + code_object_callback, + NULL)); + HSA_RT(hsa_executable_freeze_fn( + executable, + options)); + return HSA_STATUS_SUCCESS; +} + +// HSA-runtime tool on-load method +extern "C" PUBLIC_API bool OnLoad(HsaApiTable* table, + uint64_t runtime_version, + uint64_t failed_tool_count, + const char* const* failed_tool_names) +{ + printf("OnLoad: begin\n"); fflush(stdout); + // intercepting hsa_executable_freeze API + hsa_executable_freeze_fn = table->core_->hsa_executable_freeze_fn; + table->core_->hsa_executable_freeze_fn = hsa_executable_freeze_interceptor; + // Fetching AMD Loader HSA extension API + HSA_RT(hsa_system_get_major_extension_table( + HSA_EXTENSION_AMD_LOADER, + 1, + sizeof(hsa_ven_amd_loader_1_01_pfn_t), + &loader_api_table)); + printf("OnLoad: end\n"); fflush(stdout); + return true; +} + +extern "C" PUBLIC_API void OnUnload() { + printf("OnUnload\n"); fflush(stdout); +} + +extern "C" CONSTRUCTOR_API void constructor() { + printf("constructor\n"); fflush(stdout); +} + +extern "C" DESTRUCTOR_API void destructor() { + printf("destructor\n"); fflush(stdout); +} diff --git a/test/golden_traces/MatrixTranspose_dryrun_trace.txt b/test/golden_traces/MatrixTranspose_dryrun_trace.txt new file mode 100644 index 00000000..e69de29b diff --git a/test/golden_traces/MatrixTranspose_hip_flush_trace.txt b/test/golden_traces/MatrixTranspose_hip_flush_trace.txt index 27ef8e95..6f0c4d17 100644 --- a/test/golden_traces/MatrixTranspose_hip_flush_trace.txt +++ b/test/golden_traces/MatrixTranspose_hip_flush_trace.txt @@ -1,25 +1,58 @@ -+ ROCP_FLUSH_RATE=100000 ./test/MatrixTranspose -ROCTracer (pid=1991): +ROCTracer (pid=14696): ROCTracer: trace control flush rate(100000us) -3802701299772587 +129855595266140 HIP-trace() -Device name Device 687f +Device name Device 738c ## Iteration (99) ################# -3802701304199730:3802701304207180 1991:1991 hipGetDeviceProperties(props=, device=0) -3802701305255618:3802701305368889 1991:1991 hipMalloc(ptr=0x7fce16e0dec3, size=4194304) -3802701305370969:3802701305429809 1991:1991 hipMalloc(ptr=0x7fffc1295178, size=4194304) +129855603476896:129855603483734 14696:14696 hipGetDeviceProperties(props={}, device=0) :1 +129855604686134:129855605152950 14696:14696 hipMalloc(ptr=0x7fd65ce00000, size=4194304) :2 +129855605160451:129855605528247 14696:14696 hipMalloc(ptr=0x7fd65c800000, size=4194304) :3 PASSED! ## Iteration (98) ################# -3802701580515709:3802701582582904 0:0 CopyHostToDevice:4:1991 -3802701583225872:3802701584425191 0:0 KernelExecution:8:1991 -3802701583217109:3802701586447303 0:0 CopyDeviceToHost:10:1991 -3802701594795564:3802701596533727 0:0 CopyHostToDevice:11:1991 -3802701596646592:3802701597848875 0:0 KernelExecution:15:1991 -3802701596604988:3802701599522360 0:0 CopyDeviceToHost:17:1991 PASSED! ## Iteration (97) ################# PASSED! ## Iteration (96) ################# +129855955913848:129855957428192 0:0 CopyHostToDevice:4:14696 +129855958763342:129855959991823 0:0 KernelExecution:8:14696 +129855958734601:129855961705377 0:0 CopyDeviceToHost:10:14696 +129855971471522:129855972254607 0:0 CopyHostToDevice:11:14696 +129855972381516:129855973633356 0:0 KernelExecution:15:14696 +129855972673800:129855974135421 0:0 CopyDeviceToHost:17:14696 +129855980290261:129855981019714 0:0 CopyHostToDevice:18:14696 +129855981112002:129855982336482 0:0 KernelExecution:22:14696 +129855981076333:129855982783351 0:0 CopyDeviceToHost:24:14696 +129855988849671:129855989612220 0:0 CopyHostToDevice:25:14696 +129855989696159:129855990920319 0:0 KernelExecution:29:14696 +129855989668256:129855991384209 0:0 CopyDeviceToHost:31:14696 +129855605540988:129855957443403 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :4 +129855957456260:129855957456261 14696:14696 MARK(name(before HIP LaunchKernel)) +129855957507034:129855957514510 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :6 +129855957521000:129855957523014 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :7 +129855957529950:129855958671150 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :8 +129855958701410:129855958701411 14696:14696 MARK(name(after HIP LaunchKernel)) +129855958708321:129855961719221 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :10 +129855971408776:129855972257972 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :11 +129855972261515:129855972261516 14696:14696 MARK(name(before HIP LaunchKernel)) +129855972266736:129855972268234 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :13 +129855972271629:129855972272780 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :14 +129855972276181:129855972282118 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :15 +129855972663504:129855972663505 14696:14696 MARK(name(after HIP LaunchKernel)) +129855972666015:129855974143463 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :17 +129855980222888:129855981023250 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :18 +129855981025473:129855981025474 14696:14696 MARK(name(before HIP LaunchKernel)) +129855981028834:129855981029831 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :20 +129855981032043:129855981032913 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :21 +129855981035237:129855981038997 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :22 +129855981041265:129855981041266 14696:14696 MARK(name(after HIP LaunchKernel)) +129855981043695:129855982796928 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :24 +129855988764565:129855989615901 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :25 +129855989618073:129855989618074 14696:14696 MARK(name(before HIP LaunchKernel)) +129855989621096:129855989622129 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :27 +129855989624243:129855989625087 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :28 +129855989627271:129855989630934 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :29 +129855989632959:129855989632960 14696:14696 MARK(name(after HIP LaunchKernel)) +129855989635351:129855991396402 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :31 PASSED! ## Iteration (95) ################# PASSED! @@ -32,30 +65,6 @@ PASSED! ## Iteration (91) ################# PASSED! ## Iteration (90) ################# -3802701606826614:3802701608688328 0:0 CopyHostToDevice:18:1991 -3802701608781496:3802701609988668 0:0 KernelExecution:22:1991 -3802701608758548:3802701611510159 0:0 CopyDeviceToHost:24:1991 -3802701618702082:3802701620571865 0:0 CopyHostToDevice:25:1991 -3802701620675087:3802701621878110 0:0 KernelExecution:29:1991 -3802701620650876:3802701623502597 0:0 CopyDeviceToHost:31:1991 -3802701630690881:3802701632557164 0:0 CopyHostToDevice:32:1991 -3802701632661061:3802701633864973 0:0 KernelExecution:36:1991 -3802701632637885:3802701635182424 0:0 CopyDeviceToHost:38:1991 -3802701642392578:3802701644307152 0:0 CopyHostToDevice:39:1991 -3802701644410516:3802701645608650 0:0 KernelExecution:43:1991 -3802701644387082:3802701647064112 0:0 CopyDeviceToHost:45:1991 -3802701654288485:3802701656163049 0:0 CopyHostToDevice:46:1991 -3802701656267334:3802701657467098 0:0 KernelExecution:50:1991 -3802701656244070:3802701658916870 0:0 CopyDeviceToHost:52:1991 -3802701666450396:3802701668378780 0:0 CopyHostToDevice:53:1991 -3802701668482438:3802701669683832 0:0 KernelExecution:57:1991 -3802701668458481:3802701671148361 0:0 CopyDeviceToHost:59:1991 -3802701678631556:3802701680505490 0:0 CopyHostToDevice:60:1991 -3802701680609945:3802701681806894 0:0 KernelExecution:64:1991 -3802701680586811:3802701683591443 0:0 CopyDeviceToHost:66:1991 -3802701691032768:3802701692918102 0:0 CopyHostToDevice:67:1991 -3802701693021896:3802701694223438 0:0 KernelExecution:71:1991 -3802701692999202:3802701695886464 0:0 CopyDeviceToHost:73:1991 PASSED! ## Iteration (89) ################# PASSED! @@ -68,36 +77,132 @@ PASSED! ## Iteration (85) ################# PASSED! ## Iteration (84) ################# +129855997366746:129855998130772 0:0 CopyHostToDevice:32:14696 +129855998225065:129855999449385 0:0 KernelExecution:36:14696 +129855998197249:129855999925825 0:0 CopyDeviceToHost:38:14696 +129856005895171:129856006661973 0:0 CopyHostToDevice:39:14696 +129856006745770:129856007968491 0:0 KernelExecution:43:14696 +129856006717709:129856008455141 0:0 CopyDeviceToHost:45:14696 +129856014425283:129856015187951 0:0 CopyHostToDevice:46:14696 +129856015270363:129856016493884 0:0 KernelExecution:50:14696 +129856015242633:129856016989490 0:0 CopyDeviceToHost:52:14696 +129856022971470:129856023730704 0:0 CopyHostToDevice:53:14696 +129856023813883:129856025033244 0:0 KernelExecution:57:14696 +129856023785712:129856025544334 0:0 CopyDeviceToHost:59:14696 +129856031596064:129856032498907 0:0 CopyHostToDevice:60:14696 +129856032586758:129856033809639 0:0 KernelExecution:64:14696 +129856032558443:129856034354036 0:0 CopyDeviceToHost:66:14696 +129856040416553:129856041127473 0:0 CopyHostToDevice:67:14696 +129856041212287:129856042435488 0:0 KernelExecution:71:14696 +129856041184491:129856042941958 0:0 CopyDeviceToHost:73:14696 +129856049061163:129856049826011 0:0 CopyHostToDevice:74:14696 +129856049910719:129856051134400 0:0 KernelExecution:78:14696 +129856049882831:129856051651620 0:0 CopyDeviceToHost:80:14696 +129856057864499:129856058629610 0:0 CopyHostToDevice:81:14696 +129856058712855:129856059935896 0:0 KernelExecution:85:14696 +129856058684894:129856060452569 0:0 CopyDeviceToHost:87:14696 +129856066769721:129856067537899 0:0 CopyHostToDevice:88:14696 +129856067621801:129856068845321 0:0 KernelExecution:92:14696 +129856067594217:129856069423348 0:0 CopyDeviceToHost:94:14696 +129856075784739:129856076568384 0:0 CopyHostToDevice:95:14696 +129856076658166:129856077880567 0:0 KernelExecution:99:14696 +129856076630540:129856078394130 0:0 CopyDeviceToHost:101:14696 +129856084835135:129856085603333 0:0 CopyHostToDevice:102:14696 +129856085689351:129856086911912 0:0 KernelExecution:106:14696 +129856085661614:129856087438495 0:0 CopyDeviceToHost:108:14696 +129856093911070:129856094682948 0:0 CopyHostToDevice:109:14696 +129856094767987:129856095991348 0:0 KernelExecution:113:14696 +129856094739044:129856096520182 0:0 CopyDeviceToHost:115:14696 +129855997303698:129855998134058 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :32 +129855998136242:129855998136243 14696:14696 MARK(name(before HIP LaunchKernel)) +129855998138933:129855998139817 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :34 +129855998141918:129855998142773 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :35 +129855998144935:129855998149221 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :36 +129855998151431:129855998151432 14696:14696 MARK(name(after HIP LaunchKernel)) +129855998153828:129855999937506 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :38 +129856005829520:129856006665192 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :39 +129856006667396:129856006667397 14696:14696 MARK(name(before HIP LaunchKernel)) +129856006670307:129856006671160 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :41 +129856006673376:129856006674209 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :42 +129856006676323:129856006679651 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :43 +129856006681635:129856006681636 14696:14696 MARK(name(after HIP LaunchKernel)) +129856006683967:129856008469471 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :45 +129856014360174:129856015191285 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :46 +129856015193489:129856015193490 14696:14696 MARK(name(before HIP LaunchKernel)) +129856015196342:129856015197217 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :48 +129856015199400:129856015200221 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :49 +129856015202314:129856015205930 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :50 +129856015208058:129856015208059 14696:14696 MARK(name(after HIP LaunchKernel)) +129856015210764:129856017001555 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :52 +129856022908053:129856023733985 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :53 +129856023736320:129856023736321 14696:14696 MARK(name(before HIP LaunchKernel)) +129856023739178:129856023740063 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :55 +129856023742240:129856023743090 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :56 +129856023745309:129856023748845 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :57 +129856023750891:129856023750892 14696:14696 MARK(name(after HIP LaunchKernel)) +129856023753396:129856025556257 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :59 +129856031530409:129856032503170 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :60 +129856032505392:129856032505393 14696:14696 MARK(name(before HIP LaunchKernel)) +129856032508345:129856032509226 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :62 +129856032511486:129856032512316 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :63 +129856032514599:129856032518036 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :64 +129856032520150:129856032520151 14696:14696 MARK(name(after HIP LaunchKernel)) +129856032522410:129856034373111 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :66 +129856040397979:129856041130687 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :67 +129856041132973:129856041132974 14696:14696 MARK(name(before HIP LaunchKernel)) +129856041136399:129856041137389 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :69 +129856041139653:129856041140500 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :70 +129856041142893:129856041146663 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :71 +129856041148645:129856041148646 14696:14696 MARK(name(after HIP LaunchKernel)) +129856041151128:129856042953843 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :73 +129856048994841:129856049829566 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :74 +129856049831724:129856049831725 14696:14696 MARK(name(before HIP LaunchKernel)) +129856049834527:129856049835413 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :76 +129856049837759:129856049838585 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :77 +129856049840796:129856049844487 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :78 +129856049846529:129856049846530 14696:14696 MARK(name(after HIP LaunchKernel)) +129856049848934:129856051663797 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :80 +129856057798518:129856058633464 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :81 +129856058635650:129856058635651 14696:14696 MARK(name(before HIP LaunchKernel)) +129856058638530:129856058639560 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :83 +129856058641994:129856058642826 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :84 +129856058645125:129856058648721 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :85 +129856058650749:129856058650750 14696:14696 MARK(name(after HIP LaunchKernel)) +129856058653478:129856060466863 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :87 +129856066704603:129856067541502 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :88 +129856067543802:129856067543803 14696:14696 MARK(name(before HIP LaunchKernel)) +129856067546791:129856067547681 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :90 +129856067550027:129856067550854 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :91 +129856067553125:129856067556952 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :92 +129856067559149:129856067559150 14696:14696 MARK(name(after HIP LaunchKernel)) +129856067561903:129856069442958 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :94 +129856075719215:129856076572398 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :95 +129856076574828:129856076574829 14696:14696 MARK(name(before HIP LaunchKernel)) +129856076578071:129856076578997 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :97 +129856076581286:129856076582119 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :98 +129856076584498:129856076588395 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :99 +129856076590554:129856076590555 14696:14696 MARK(name(after HIP LaunchKernel)) +129856076592857:129856078406672 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :101 +129856084768530:129856085607081 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :102 +129856085609437:129856085609438 14696:14696 MARK(name(before HIP LaunchKernel)) +129856085612528:129856085613498 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :104 +129856085615751:129856085616602 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :105 +129856085618831:129856085623039 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :106 +129856085625178:129856085625179 14696:14696 MARK(name(after HIP LaunchKernel)) +129856085627731:129856087451206 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :108 +129856093846767:129856094686797 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :109 +129856094689153:129856094689154 14696:14696 MARK(name(before HIP LaunchKernel)) +129856094692497:129856094693485 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :111 +129856094695727:129856094696598 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :112 +129856094698884:129856094702856 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :113 +129856094705178:129856094705179 14696:14696 MARK(name(after HIP LaunchKernel)) +129856094707931:129856096534639 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :115 PASSED! ## Iteration (83) ################# PASSED! ## Iteration (82) ################# PASSED! ## Iteration (81) ################# -3802701703288299:3802701705170783 0:0 CopyHostToDevice:74:1991 -3802701705274243:3802701706486156 0:0 KernelExecution:78:1991 -3802701705250604:3802701707936074 0:0 CopyDeviceToHost:80:1991 -3802701715184407:3802701716946440 0:0 CopyHostToDevice:81:1991 -3802701717062173:3802701718258234 0:0 KernelExecution:85:1991 -3802701717027281:3802701719895352 0:0 CopyDeviceToHost:87:1991 -3802701727144976:3802701729139460 0:0 CopyHostToDevice:88:1991 -3802701729244175:3802701730445125 0:0 KernelExecution:92:1991 -3802701729220511:3802701732165583 0:0 CopyDeviceToHost:94:1991 -3802701739387037:3802701741142680 0:0 CopyHostToDevice:95:1991 -3802701741249310:3802701742453815 0:0 KernelExecution:99:1991 -3802701741225710:3802701744149042 0:0 CopyDeviceToHost:101:1991 -3802701751388465:3802701753137668 0:0 CopyHostToDevice:102:1991 -3802701753243075:3802701754440321 0:0 KernelExecution:106:1991 -3802701753219589:3802701756153951 0:0 CopyDeviceToHost:108:1991 -3802701763443335:3802701765498080 0:0 CopyHostToDevice:109:1991 -3802701765603802:3802701766820456 0:0 KernelExecution:113:1991 -3802701765580171:3802701768590463 0:0 CopyDeviceToHost:115:1991 -3802701775866137:3802701777758951 0:0 CopyHostToDevice:116:1991 -3802701777862528:3802701779073255 0:0 KernelExecution:120:1991 -3802701777839322:3802701780544442 0:0 CopyDeviceToHost:122:1991 -3802701787979987:3802701790138553 0:0 CopyHostToDevice:123:1991 -3802701790243940:3802701791446371 0:0 KernelExecution:127:1991 -3802701790220103:3802701792896973 0:0 CopyDeviceToHost:129:1991 PASSED! ## Iteration (80) ################# PASSED! @@ -114,35 +219,118 @@ PASSED! ## Iteration (74) ################# PASSED! ## Iteration (73) ################# -3802701800291738:3802701802179392 0:0 CopyHostToDevice:130:1991 -3802701802285163:3802701803481223 0:0 KernelExecution:134:1991 -3802701802261733:3802701804931343 0:0 CopyDeviceToHost:136:1991 -3802701812337128:3802701814252581 0:0 CopyHostToDevice:137:1991 -3802701814356366:3802701815565464 0:0 KernelExecution:141:1991 -3802701814332902:3802701817015292 0:0 CopyDeviceToHost:143:1991 -3802701824392847:3802701826310401 0:0 CopyHostToDevice:144:1991 -3802701826415256:3802701827613539 0:0 KernelExecution:148:1991 -3802701826391761:3802701829071431 0:0 CopyDeviceToHost:150:1991 -3802701836291435:3802701838179779 0:0 CopyHostToDevice:151:1991 -3802701838283081:3802701839480623 0:0 KernelExecution:155:1991 -3802701838259290:3802701840931690 0:0 CopyDeviceToHost:157:1991 -3802701848294054:3802701850186618 0:0 CopyHostToDevice:158:1991 -3802701850293201:3802701851487632 0:0 KernelExecution:162:1991 -3802701850269869:3802701852937908 0:0 CopyDeviceToHost:164:1991 -3802701860182332:3802701862143417 0:0 CopyHostToDevice:165:1991 -3802701862248805:3802701863444865 0:0 KernelExecution:169:1991 -3802701862224967:3802701865141909 0:0 CopyDeviceToHost:171:1991 -3802701872353003:3802701874265587 0:0 CopyHostToDevice:172:1991 -3802701874371291:3802701875572092 0:0 KernelExecution:176:1991 -3802701874348307:3802701877019147 0:0 CopyDeviceToHost:178:1991 -3802701884267750:3802701886153054 0:0 CopyHostToDevice:179:1991 -3802701886259179:3802701887463536 0:0 KernelExecution:183:1991 -3802701886235615:3802701888914085 0:0 CopyDeviceToHost:185:1991 -3802701896155929:3802701898142244 0:0 CopyHostToDevice:186:1991 -3802701898246687:3802701899454155 0:0 KernelExecution:190:1991 -3802701898223504:3802701901145246 0:0 CopyDeviceToHost:192:1991 +129856103067958:129856103841032 0:0 CopyHostToDevice:116:14696 +129856103927769:129856105150970 0:0 KernelExecution:120:14696 +129856103899316:129856105721054 0:0 CopyDeviceToHost:122:14696 +129856112245852:129856113015798 0:0 CopyHostToDevice:123:14696 +129856113100485:129856114323526 0:0 KernelExecution:127:14696 +129856113072690:129856114900649 0:0 CopyDeviceToHost:129:14696 +129856121600998:129856122374148 0:0 CopyHostToDevice:130:14696 +129856122460856:129856123685017 0:0 KernelExecution:134:14696 +129856122432406:129856124221503 0:0 CopyDeviceToHost:136:14696 +129856130996154:129856131718339 0:0 CopyHostToDevice:137:14696 +129856131803770:129856133026171 0:0 KernelExecution:141:14696 +129856131775718:129856133613724 0:0 CopyDeviceToHost:143:14696 +129856140505813:129856141285491 0:0 CopyHostToDevice:144:14696 +129856141371337:129856142594218 0:0 KernelExecution:148:14696 +129856141343575:129856143188801 0:0 CopyDeviceToHost:150:14696 +129856150234971:129856151016053 0:0 CopyHostToDevice:151:14696 +129856151102892:129856152327053 0:0 KernelExecution:155:14696 +129856151074919:129856152872907 0:0 CopyDeviceToHost:157:14696 +129856159481376:129856160253347 0:0 CopyHostToDevice:158:14696 +129856160343525:129856161566086 0:0 KernelExecution:162:14696 +129856160315355:129856162137295 0:0 CopyDeviceToHost:164:14696 +129856168059715:129856168791250 0:0 CopyHostToDevice:165:14696 +129856168876828:129856170099709 0:0 KernelExecution:169:14696 +129856168849139:129856170629902 0:0 CopyDeviceToHost:171:14696 +129856176005269:129856176724156 0:0 CopyHostToDevice:172:14696 +129856176811979:129856178033100 0:0 KernelExecution:176:14696 +129856176783784:129856178564862 0:0 CopyDeviceToHost:178:14696 +129856183804454:129856184516916 0:0 CopyHostToDevice:179:14696 +129856184609470:129856185832511 0:0 KernelExecution:183:14696 +129856184581802:129856186368858 0:0 CopyDeviceToHost:185:14696 +129856191541921:129856192254454 0:0 CopyHostToDevice:186:14696 +129856192345329:129856193569809 0:0 KernelExecution:190:14696 +129856192317767:129856194105080 0:0 CopyDeviceToHost:192:14696 +129856103003811:129856103844379 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :116 +129856103846787:129856103846788 14696:14696 MARK(name(before HIP LaunchKernel)) +129856103849922:129856103850838 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :118 +129856103853240:129856103854136 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :119 +129856103856444:129856103860149 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :120 +129856103862386:129856103862387 14696:14696 MARK(name(after HIP LaunchKernel)) +129856103864691:129856105741098 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :122 +129856112200226:129856113019342 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :123 +129856113021598:129856113021599 14696:14696 MARK(name(before HIP LaunchKernel)) +129856113024595:129856113025504 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :125 +129856113027902:129856113028756 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :126 +129856113031010:129856113034968 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :127 +129856113037098:129856113037099 14696:14696 MARK(name(after HIP LaunchKernel)) +129856113039452:129856114918382 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :129 +129856121536590:129856122377686 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :130 +129856122380177:129856122380178 14696:14696 MARK(name(before HIP LaunchKernel)) +129856122383242:129856122384157 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :132 +129856122386562:129856122387438 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :133 +129856122389743:129856122393887 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :134 +129856122395917:129856122395918 14696:14696 MARK(name(after HIP LaunchKernel)) +129856122398705:129856124236553 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :136 +129856130930250:129856131721919 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :137 +129856131724534:129856131724535 14696:14696 MARK(name(before HIP LaunchKernel)) +129856131727544:129856131728453 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :139 +129856131730840:129856131731718 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :140 +129856131734248:129856131738338 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :141 +129856131740508:129856131740509 14696:14696 MARK(name(after HIP LaunchKernel)) +129856131742956:129856133633762 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :143 +129856140484642:129856141289559 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :144 +129856141292040:129856141292041 14696:14696 MARK(name(before HIP LaunchKernel)) +129856141295360:129856141296366 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :146 +129856141298705:129856141299584 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :147 +129856141301885:129856141305904 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :148 +129856141308287:129856141308288 14696:14696 MARK(name(after HIP LaunchKernel)) +129856141310745:129856143207185 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :150 +129856150167842:129856151019519 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :151 +129856151021903:129856151021904 14696:14696 MARK(name(before HIP LaunchKernel)) +129856151025430:129856151026339 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :153 +129856151028846:129856151029731 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :154 +129856151032070:129856151036399 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :155 +129856151038525:129856151038526 14696:14696 MARK(name(after HIP LaunchKernel)) +129856151041204:129856152887054 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :157 +129856159416500:129856160257922 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :158 +129856160260251:129856160260252 14696:14696 MARK(name(before HIP LaunchKernel)) +129856160263327:129856160264253 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :160 +129856160266588:129856160267551 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :161 +129856160269815:129856160273583 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :162 +129856160275639:129856160275640 14696:14696 MARK(name(after HIP LaunchKernel)) +129856160277873:129856162154856 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :164 +129856167989129:129856168794954 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :165 +129856168796817:129856168796818 14696:14696 MARK(name(before HIP LaunchKernel)) +129856168799680:129856168800356 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :167 +129856168802336:129856168803043 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :168 +129856168804923:129856168808196 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :169 +129856168810026:129856168810027 14696:14696 MARK(name(after HIP LaunchKernel)) +129856168811889:129856170642148 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :171 +129856175935119:129856176727698 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :172 +129856176729573:129856176729574 14696:14696 MARK(name(before HIP LaunchKernel)) +129856176732312:129856176733001 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :174 +129856176734764:129856176735517 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :175 +129856176737306:129856176740961 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :176 +129856176742551:129856176742552 14696:14696 MARK(name(after HIP LaunchKernel)) +129856176744384:129856178576608 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :178 +129856183733862:129856184521359 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :179 PASSED! ## Iteration (72) ################# +129856184523202:129856184523203 14696:14696 MARK(name(before HIP LaunchKernel)) +129856184526239:129856184526918 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :181 +129856184528695:129856184529339 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :182 +129856184531203:129856184534819 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :183 +129856184536444:129856184536445 14696:14696 MARK(name(after HIP LaunchKernel)) +129856184538159:129856186381152 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :185 +129856191471466:129856192258965 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :186 +129856192260887:129856192260888 14696:14696 MARK(name(before HIP LaunchKernel)) +129856192264565:129856192265231 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :188 +129856192266936:129856192267582 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :189 +129856192269493:129856192272647 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :190 +129856192274238:129856192274239 14696:14696 MARK(name(after HIP LaunchKernel)) +129856192276014:129856194117333 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :192 PASSED! ## Iteration (71) ################# PASSED! @@ -157,30 +345,6 @@ PASSED! ## Iteration (66) ################# PASSED! ## Iteration (65) ################# -3802701908363640:3802701910282004 0:0 CopyHostToDevice:193:1991 -3802701910388686:3802701911593636 0:0 KernelExecution:197:1991 -3802701910364944:3802701913041924 0:0 CopyDeviceToHost:199:1991 -3802701920274197:3802701922171761 0:0 CopyHostToDevice:200:1991 -3802701922278125:3802701923475222 0:0 KernelExecution:204:1991 -3802701922254592:3802701924925132 0:0 CopyDeviceToHost:206:1991 -3802701932168496:3802701934142771 0:0 CopyHostToDevice:207:1991 -3802701934246976:3802701935438295 0:0 KernelExecution:211:1991 -3802701934223551:3802701937141613 0:0 CopyDeviceToHost:213:1991 -3802701944352056:3802701946257570 0:0 CopyHostToDevice:214:1991 -3802701946362997:3802701947574317 0:0 KernelExecution:218:1991 -3802701946339571:3802701949023790 0:0 CopyDeviceToHost:220:1991 -3802701956400665:3802701958316110 0:0 CopyHostToDevice:221:1991 -3802701958422590:3802701959641615 0:0 KernelExecution:225:1991 -3802701958399130:3802701961106280 0:0 CopyDeviceToHost:227:1991 -3802701968320724:3802701970208178 0:0 CopyHostToDevice:228:1991 -3802701970318670:3802701971521693 0:0 KernelExecution:232:1991 -3802701970295529:3802701972971609 0:0 CopyDeviceToHost:234:1991 -3802701980199792:3802701982142436 0:0 CopyHostToDevice:235:1991 -3802701982245928:3802701983440062 0:0 KernelExecution:239:1991 -3802701982222487:3802701985143188 0:0 CopyDeviceToHost:241:1991 -3802701992355642:3802701994267646 0:0 CopyHostToDevice:242:1991 -3802701994371730:3802701995578753 0:0 KernelExecution:246:1991 -3802701994348667:3802701997026937 0:0 CopyDeviceToHost:248:1991 PASSED! ## Iteration (64) ################# PASSED! @@ -193,39 +357,152 @@ PASSED! ## Iteration (60) ################# PASSED! ## Iteration (59) ################# +129856199280943:129856199989681 0:0 CopyHostToDevice:193:14696 +129856200075190:129856201299831 0:0 KernelExecution:197:14696 +129856200047538:129856201850341 0:0 CopyDeviceToHost:199:14696 +129856206897412:129856207614253 0:0 CopyHostToDevice:200:14696 +129856207705498:129856208928859 0:0 KernelExecution:204:14696 +129856207676917:129856209473592 0:0 CopyDeviceToHost:206:14696 +129856214432984:129856215196409 0:0 CopyHostToDevice:207:14696 +129856215281304:129856216504825 0:0 KernelExecution:211:14696 +129856215253529:129856217050195 0:0 CopyDeviceToHost:213:14696 +129856221931666:129856222699124 0:0 CopyHostToDevice:214:14696 +129856222785050:129856224007611 0:0 KernelExecution:218:14696 +129856222756874:129856224558196 0:0 CopyDeviceToHost:220:14696 +129856229435728:129856230202586 0:0 CopyHostToDevice:221:14696 +129856230289822:129856231510942 0:0 KernelExecution:225:14696 +129856230262176:129856232049379 0:0 CopyDeviceToHost:227:14696 +129856236838217:129856237549415 0:0 CopyHostToDevice:228:14696 +129856237635376:129856238857136 0:0 KernelExecution:232:14696 +129856237607782:129856239407224 0:0 CopyDeviceToHost:234:14696 +129856244299394:129856245007567 0:0 CopyHostToDevice:235:14696 +129856245099279:129856246322159 0:0 KernelExecution:239:14696 +129856245071193:129856246864706 0:0 CopyDeviceToHost:241:14696 +129856251723187:129856252431603 0:0 CopyHostToDevice:242:14696 +129856252521404:129856253744124 0:0 KernelExecution:246:14696 +129856252493576:129856254289474 0:0 CopyDeviceToHost:248:14696 +129856259171693:129856259879626 0:0 CopyHostToDevice:249:14696 +129856259964936:129856261188937 0:0 KernelExecution:253:14696 +129856259937195:129856261731637 0:0 CopyDeviceToHost:255:14696 +129856266605795:129856267371070 0:0 CopyHostToDevice:256:14696 +129856267455912:129856268680233 0:0 KernelExecution:260:14696 +129856267428297:129856269227260 0:0 CopyDeviceToHost:262:14696 +129856274075448:129856274840296 0:0 CopyHostToDevice:263:14696 +129856274927804:129856276150525 0:0 KernelExecution:267:14696 +129856274899679:129856276695018 0:0 CopyDeviceToHost:269:14696 +129856281565009:129856282326831 0:0 CopyHostToDevice:270:14696 +129856282411157:129856283637077 0:0 KernelExecution:274:14696 +129856282383503:129856284175523 0:0 CopyDeviceToHost:276:14696 +129856288995752:129856289705630 0:0 CopyHostToDevice:277:14696 +129856289793308:129856291014269 0:0 KernelExecution:281:14696 +129856289765547:129856291559219 0:0 CopyDeviceToHost:283:14696 +129856296360197:129856297069117 0:0 CopyHostToDevice:284:14696 +129856297157310:129856298378111 0:0 KernelExecution:288:14696 +129856297129589:129856298914568 0:0 CopyDeviceToHost:290:14696 +129856199220209:129856199993256 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :193 +129856199995165:129856199995166 14696:14696 MARK(name(before HIP LaunchKernel)) +129856199998331:129856199999016 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :195 +129856200000971:129856200001630 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :196 +129856200003348:129856200006409 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :197 +129856200007997:129856200007998 14696:14696 MARK(name(after HIP LaunchKernel)) +129856200009781:129856201864796 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :199 +129856206828954:129856207617612 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :200 +129856207619342:129856207619343 14696:14696 MARK(name(before HIP LaunchKernel)) +129856207633427:129856207634203 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :202 +129856207635929:129856207636565 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :203 +129856207638289:129856207641619 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :204 +129856207643379:129856207643380 14696:14696 MARK(name(after HIP LaunchKernel)) +129856207645338:129856209486625 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :206 +129856214367871:129856215199634 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :207 +129856215201421:129856215201422 14696:14696 MARK(name(before HIP LaunchKernel)) +129856215205034:129856215205701 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :209 +129856215207421:129856215208068 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :210 +129856215209926:129856215213001 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :211 +129856215214576:129856215214577 14696:14696 MARK(name(after HIP LaunchKernel)) +129856215216591:129856217062762 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :213 +129856221865656:129856222702390 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :214 +129856222704143:129856222704144 14696:14696 MARK(name(before HIP LaunchKernel)) +129856222707593:129856222708263 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :216 +129856222709907:129856222710533 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :217 +129856222712408:129856222715305 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :218 +129856222716820:129856222716821 14696:14696 MARK(name(after HIP LaunchKernel)) +129856222718703:129856224572291 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :220 +129856229369321:129856230206171 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :221 +129856230207933:129856230207934 14696:14696 MARK(name(before HIP LaunchKernel)) +129856230211408:129856230212070 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :223 +129856230213729:129856230214356 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :224 +129856230216306:129856230219552 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :225 +129856230221084:129856230221085 14696:14696 MARK(name(after HIP LaunchKernel)) +129856230222856:129856232061167 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :227 +129856236820359:129856237552651 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :228 +129856237554349:129856237554350 14696:14696 MARK(name(before HIP LaunchKernel)) +129856237557958:129856237558615 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :230 +129856237560382:129856237561016 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :231 +129856237562876:129856237566063 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :232 +129856237567608:129856237567609 14696:14696 MARK(name(after HIP LaunchKernel)) +129856237569296:129856239419101 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :234 +129856244174381:129856245010977 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :235 +129856245012718:129856245012719 14696:14696 MARK(name(before HIP LaunchKernel)) +129856245025693:129856245026451 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :237 +129856245028210:129856245028855 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :238 +129856245030730:129856245034177 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :239 +129856245035805:129856245035806 14696:14696 MARK(name(after HIP LaunchKernel)) +129856245038122:129856246876538 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :241 +129856251653109:129856252435896 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :242 +129856252437833:129856252437834 14696:14696 MARK(name(before HIP LaunchKernel)) +129856252441362:129856252442017 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :244 +129856252443660:129856252444296 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :245 +129856252446165:129856252449155 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :246 +129856252450809:129856252450810 14696:14696 MARK(name(after HIP LaunchKernel)) +129856252452579:129856254303055 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :248 +129856259101952:129856259882749 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :249 +129856259884515:129856259884516 14696:14696 MARK(name(before HIP LaunchKernel)) +129856259886742:129856259887392 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :251 +129856259889040:129856259889671 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :252 +129856259891415:129856259894919 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :253 +129856259896631:129856259896632 14696:14696 MARK(name(after HIP LaunchKernel)) +129856259898324:129856261743974 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :255 +129856266541050:129856267374498 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :256 +129856267376266:129856267376267 14696:14696 MARK(name(before HIP LaunchKernel)) +129856267379647:129856267380320 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :258 +129856267381929:129856267382540 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :259 +129856267384409:129856267387474 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :260 +129856267389033:129856267389034 14696:14696 MARK(name(after HIP LaunchKernel)) +129856267390764:129856269239563 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :262 +129856274008890:129856274843415 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :263 +129856274845095:129856274845096 14696:14696 MARK(name(before HIP LaunchKernel)) +129856274847806:129856274848470 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :265 +129856274850117:129856274850733 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :266 +129856274852427:129856274855749 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :267 +129856274857358:129856274857359 14696:14696 MARK(name(after HIP LaunchKernel)) +129856274859228:129856276707873 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :269 +129856281498759:129856282330118 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :270 +129856282332044:129856282332045 14696:14696 MARK(name(before HIP LaunchKernel)) +129856282335358:129856282336015 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :272 +129856282338029:129856282338668 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :273 +129856282340644:129856282343485 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :274 +129856282345028:129856282345029 14696:14696 MARK(name(after HIP LaunchKernel)) +129856282347024:129856284203838 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :276 +129856288978096:129856289708673 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :277 +129856289710414:129856289710415 14696:14696 MARK(name(before HIP LaunchKernel)) +129856289714250:129856289714924 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :279 +129856289716689:129856289717305 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :280 +129856289719150:129856289722057 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :281 +129856289723677:129856289723678 14696:14696 MARK(name(after HIP LaunchKernel)) +129856289725380:129856291571314 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :283 +129856296341271:129856297072486 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :284 +129856297074313:129856297074314 14696:14696 MARK(name(before HIP LaunchKernel)) +129856297077733:129856297078380 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :286 +129856297080109:129856297080733 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :287 +129856297082729:129856297085646 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :288 +129856297087184:129856297087185 14696:14696 MARK(name(after HIP LaunchKernel)) +129856297089004:129856298926004 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :290 PASSED! ## Iteration (58) ################# PASSED! ## Iteration (57) ################# PASSED! ## Iteration (56) ################# -3802702004315971:3802702006430907 0:0 CopyHostToDevice:249:1991 -3802702006513343:3802702007718885 0:0 KernelExecution:253:1991 -3802702006490217:3802702009041896 0:0 CopyDeviceToHost:255:1991 -3802702018262184:3802702019943876 0:0 CopyHostToDevice:256:1991 -3802702020050568:3802702021249295 0:0 KernelExecution:260:1991 -3802702020026907:3802702022584386 0:0 CopyDeviceToHost:262:1991 -3802702029541468:3802702031219270 0:0 CopyHostToDevice:263:1991 -3802702031312763:3802702032510305 0:0 KernelExecution:267:1991 -3802702031289161:3802702033843490 0:0 CopyDeviceToHost:269:1991 -3802702040805082:3802702042480244 0:0 CopyHostToDevice:270:1991 -3802702042572785:3802702043776105 0:0 KernelExecution:274:1991 -3802702042549004:3802702045110673 0:0 CopyDeviceToHost:276:1991 -3802702052065204:3802702053741167 0:0 CopyHostToDevice:277:1991 -3802702053835958:3802702055052463 0:0 KernelExecution:281:1991 -3802702053813487:3802702056374447 0:0 CopyDeviceToHost:283:1991 -3802702063333568:3802702065014061 0:0 CopyHostToDevice:284:1991 -3802702065111999:3802702066319615 0:0 KernelExecution:288:1991 -3802702065088771:3802702067654340 0:0 CopyDeviceToHost:290:1991 -3802702074618962:3802702076284625 0:0 CopyHostToDevice:291:1991 -3802702076384443:3802702077569835 0:0 KernelExecution:295:1991 -3802702076360685:3802702078904404 0:0 CopyDeviceToHost:297:1991 -3802702085881125:3802702087555758 0:0 CopyHostToDevice:298:1991 -3802702087649675:3802702088847958 0:0 KernelExecution:302:1991 -3802702087626608:3802702090183277 0:0 CopyDeviceToHost:304:1991 -3802702097151929:3802702098830722 0:0 CopyHostToDevice:305:1991 -3802702098924116:3802702100140473 0:0 KernelExecution:309:1991 -3802702098901192:3802702101472621 0:0 CopyDeviceToHost:311:1991 PASSED! ## Iteration (55) ################# PASSED! @@ -244,35 +521,140 @@ PASSED! ## Iteration (48) ################# PASSED! ## Iteration (47) ################# -3802702108615424:3802702110296796 0:0 CopyHostToDevice:312:1991 -3802702110392443:3802702111600207 0:0 KernelExecution:316:1991 -3802702110368957:3802702112934696 0:0 CopyDeviceToHost:318:1991 -3802702119898217:3802702121579670 0:0 CopyHostToDevice:319:1991 -3802702121673899:3802702122873960 0:0 KernelExecution:323:1991 -3802702121650880:3802702124193909 0:0 CopyDeviceToHost:325:1991 -3802702131156331:3802702132834494 0:0 CopyHostToDevice:326:1991 -3802702132927702:3802702134121984 0:0 KernelExecution:330:1991 -3802702132904324:3802702135456513 0:0 CopyDeviceToHost:332:1991 -3802702142434925:3802702144099207 0:0 CopyHostToDevice:333:1991 -3802702144200141:3802702145401090 0:0 KernelExecution:337:1991 -3802702144175248:3802702146735777 0:0 CopyDeviceToHost:339:1991 -3802702153706898:3802702155385711 0:0 CopyHostToDevice:340:1991 -3802702155488005:3802702156685843 0:0 KernelExecution:344:1991 -3802702155464581:3802702158018890 0:0 CopyDeviceToHost:346:1991 -3802702164987312:3802702166668385 0:0 CopyHostToDevice:347:1991 -3802702166762069:3802702167965537 0:0 KernelExecution:351:1991 -3802702166739105:3802702169298644 0:0 CopyDeviceToHost:353:1991 -3802702176260016:3802702177933188 0:0 CopyHostToDevice:354:1991 -3802702178026430:3802702179223971 0:0 KernelExecution:358:1991 -3802702178002518:3802702180540757 0:0 CopyDeviceToHost:360:1991 -3802702187490789:3802702189167931 0:0 CopyHostToDevice:361:1991 -3802702189262737:3802702190474501 0:0 KernelExecution:365:1991 -3802702189239082:3802702191808141 0:0 CopyDeviceToHost:367:1991 -3802702198761922:3802702200425845 0:0 CopyHostToDevice:368:1991 PASSED! ## Iteration (46) ################# PASSED! ## Iteration (45) ################# +129856303845436:129856304622018 0:0 CopyHostToDevice:291:14696 +129856304714456:129856305941176 0:0 KernelExecution:295:14696 +129856304686879:129856306490313 0:0 CopyDeviceToHost:297:14696 +129856311333818:129856312045157 0:0 CopyHostToDevice:298:14696 +129856312128568:129856313351929 0:0 KernelExecution:302:14696 +129856312100713:129856313892452 0:0 CopyDeviceToHost:304:14696 +129856318773490:129856319480599 0:0 CopyHostToDevice:305:14696 +129856319573103:129856320793904 0:0 KernelExecution:309:14696 +129856319544959:129856321343459 0:0 CopyDeviceToHost:311:14696 +129856326211019:129856326977511 0:0 CopyHostToDevice:312:14696 +129856327061875:129856328282996 0:0 KernelExecution:316:14696 +129856327034134:129856328825473 0:0 CopyDeviceToHost:318:14696 +129856333673698:129856334437330 0:0 CopyHostToDevice:319:14696 +129856334523567:129856335745168 0:0 KernelExecution:323:14696 +129856334495713:129856336293262 0:0 CopyDeviceToHost:325:14696 +129856341101442:129856341984561 0:0 CopyHostToDevice:326:14696 +129856342071670:129856343294870 0:0 KernelExecution:330:14696 +129856342043988:129856343840850 0:0 CopyDeviceToHost:332:14696 +129856348646308:129856349354803 0:0 CopyHostToDevice:333:14696 +129856349441279:129856350662399 0:0 KernelExecution:337:14696 +129856349413003:129856351203503 0:0 CopyDeviceToHost:339:14696 +129856356094471:129856356820623 0:0 CopyHostToDevice:340:14696 +129856356907355:129856358130235 0:0 KernelExecution:344:14696 +129856356879789:129856358671945 0:0 CopyDeviceToHost:346:14696 +129856363528023:129856364288036 0:0 CopyHostToDevice:347:14696 +129856364405580:129856365626380 0:0 KernelExecution:351:14696 +129856364377906:129856366172703 0:0 CopyDeviceToHost:353:14696 +129856371087592:129856371798847 0:0 CopyHostToDevice:354:14696 +129856371883929:129856373108889 0:0 KernelExecution:358:14696 +129856371855593:129856373655534 0:0 CopyDeviceToHost:360:14696 +129856378493711:129856379257336 0:0 CopyHostToDevice:361:14696 +129856379342581:129856380565301 0:0 KernelExecution:365:14696 +129856379314699:129856381113012 0:0 CopyDeviceToHost:367:14696 +129856385977586:129856386744228 0:0 CopyHostToDevice:368:14696 +129856386831442:129856388055123 0:0 KernelExecution:372:14696 +129856386803378:129856388598263 0:0 CopyDeviceToHost:374:14696 +129856393484361:129856394251866 0:0 CopyHostToDevice:375:14696 +129856394339138:129856395561058 0:0 KernelExecution:379:14696 +129856394311639:129856396103600 0:0 CopyDeviceToHost:381:14696 +129856303774990:129856304626161 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :291 +129856304627884:129856304627885 14696:14696 MARK(name(before HIP LaunchKernel)) +129856304631072:129856304631723 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :293 +129856304633373:129856304634007 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :294 +129856304635811:129856304639104 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :295 +129856304640848:129856304640849 14696:14696 MARK(name(after HIP LaunchKernel)) +129856304642651:129856306501959 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :297 +129856311264292:129856312048766 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :298 +129856312050539:129856312050540 14696:14696 MARK(name(before HIP LaunchKernel)) +129856312053498:129856312054174 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :300 +129856312055946:129856312056653 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :301 +129856312058397:129856312061589 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :302 +129856312063201:129856312063202 14696:14696 MARK(name(after HIP LaunchKernel)) +129856312065053:129856313904746 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :304 +129856318704110:129856319483869 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :305 +129856319485543:129856319485544 14696:14696 MARK(name(before HIP LaunchKernel)) +129856319499258:129856319500048 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :307 +129856319501759:129856319502401 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :308 +129856319504307:129856319507787 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :309 +129856319509535:129856319509536 14696:14696 MARK(name(after HIP LaunchKernel)) +129856319511552:129856321356021 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :311 +129856326144210:129856326980680 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :312 +129856326982483:129856326982484 14696:14696 MARK(name(before HIP LaunchKernel)) +129856326986163:129856326986815 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :314 +129856326988581:129856326989210 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :315 +129856326991095:129856326994082 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :316 +129856326995650:129856326995651 14696:14696 MARK(name(after HIP LaunchKernel)) +129856326997461:129856328838450 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :318 +129856333608209:129856334440902 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :319 +129856334442697:129856334442698 14696:14696 MARK(name(before HIP LaunchKernel)) +129856334446427:129856334447095 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :321 +129856334448793:129856334449426 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :322 +129856334451308:129856334454120 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :323 +129856334455718:129856334455719 14696:14696 MARK(name(after HIP LaunchKernel)) +129856334457508:129856336307654 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :325 +129856341084552:129856341987761 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :326 +129856341989501:129856341989502 14696:14696 MARK(name(before HIP LaunchKernel)) +129856341992961:129856341993616 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :328 +129856341995311:129856341995915 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :329 +129856341997784:129856342000844 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :330 +129856342002457:129856342002458 14696:14696 MARK(name(after HIP LaunchKernel)) +129856342004209:129856343852827 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :332 +129856348628207:129856349358297 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :333 +129856349360014:129856349360015 14696:14696 MARK(name(before HIP LaunchKernel)) +129856349363641:129856349364301 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :335 +129856349365955:129856349366590 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :336 +129856349368410:129856349371392 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :337 +129856349373001:129856349373002 14696:14696 MARK(name(after HIP LaunchKernel)) +129856349374736:129856351215163 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :339 +129856356026231:129856356823939 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :340 +129856356825939:129856356825940 14696:14696 MARK(name(before HIP LaunchKernel)) +129856356829316:129856356829967 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :342 +129856356831607:129856356832235 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :343 +129856356834103:129856356837300 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :344 +129856356838880:129856356838881 14696:14696 MARK(name(after HIP LaunchKernel)) +129856356840997:129856358683474 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :346 +129856363457621:129856364292098 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :347 +129856364293909:129856364293910 14696:14696 MARK(name(before HIP LaunchKernel)) +129856364296242:129856364296921 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :349 +129856364298665:129856364299325 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :350 +129856364301137:129856364304805 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :351 +129856364306614:129856364306615 14696:14696 MARK(name(after HIP LaunchKernel)) +129856364308432:129856366185192 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :353 +129856371019019:129856371802348 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :354 +129856371804072:129856371804073 14696:14696 MARK(name(before HIP LaunchKernel)) +129856371807407:129856371808089 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :356 +129856371809769:129856371810408 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :357 +129856371812409:129856371815399 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :358 +129856371816938:129856371816939 14696:14696 MARK(name(after HIP LaunchKernel)) +129856371818730:129856373668223 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :360 +129856378427685:129856379260530 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :361 +129856379262413:129856379262414 14696:14696 MARK(name(before HIP LaunchKernel)) +129856379266028:129856379266680 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :363 +129856379268334:129856379268974 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :364 +129856379270951:129856379274011 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :365 +129856379275576:129856379275577 14696:14696 MARK(name(after HIP LaunchKernel)) +129856379277516:129856381125442 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :367 +129856385912709:129856386747747 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :368 +129856386749617:129856386749618 14696:14696 MARK(name(before HIP LaunchKernel)) +129856386753015:129856386753700 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :370 +129856386755603:129856386756230 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :371 +129856386758107:129856386761145 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :372 +129856386762828:129856386762829 14696:14696 MARK(name(after HIP LaunchKernel)) +129856386764527:129856388613300 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :374 +129856393418103:129856394255127 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :375 +129856394257084:129856394257085 14696:14696 MARK(name(before HIP LaunchKernel)) +129856394260727:129856394261393 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :377 +129856394263117:129856394263752 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :378 +129856394266100:129856394269007 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :379 +129856394270594:129856394270595 14696:14696 MARK(name(after HIP LaunchKernel)) +129856394272528:129856396115719 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :381 PASSED! ## Iteration (44) ################# PASSED! @@ -287,33 +669,6 @@ PASSED! ## Iteration (39) ################# PASSED! ## Iteration (38) ################# -3802702200526879:3802702201738792 0:0 KernelExecution:372:1991 -3802702200502735:3802702203073505 0:0 CopyDeviceToHost:374:1991 -3802702210101096:3802702211781099 0:0 CopyHostToDevice:375:1991 -3802702211874278:3802702213070339 0:0 KernelExecution:379:1991 -3802702211851149:3802702214405528 0:0 CopyDeviceToHost:381:1991 -3802702221371170:3802702223046872 0:0 CopyHostToDevice:382:1991 -3802702223141537:3802702224348264 0:0 KernelExecution:386:1991 -3802702223118273:3802702225680452 0:0 CopyDeviceToHost:388:1991 -3802702232644404:3802702234313936 0:0 CopyHostToDevice:389:1991 -3802702234409358:3802702235607788 0:0 KernelExecution:393:1991 -3802702234385867:3802702236943196 0:0 CopyDeviceToHost:395:1991 -3802702243900787:3802702245580279 0:0 CopyHostToDevice:396:1991 -3802702245674663:3802702246878279 0:0 KernelExecution:400:1991 -3802702245651760:3802702248198969 0:0 CopyDeviceToHost:402:1991 -3802702255168930:3802702256847073 0:0 CopyHostToDevice:403:1991 -3802702256941454:3802702258155589 0:0 KernelExecution:407:1991 -3802702256918733:3802702259489683 0:0 CopyDeviceToHost:409:1991 -3802702266456174:3802702268121957 0:0 CopyHostToDevice:410:1991 -3802702268222984:3802702269419637 0:0 KernelExecution:414:1991 -3802702268198287:3802702270718936 0:0 CopyDeviceToHost:416:1991 -3802702277684438:3802702279355020 0:0 CopyHostToDevice:417:1991 -3802702279449065:3802702280656977 0:0 KernelExecution:421:1991 -3802702279425380:3802702281990519 0:0 CopyDeviceToHost:423:1991 -3802702288963001:3802702290626813 0:0 CopyHostToDevice:424:1991 -3802702290725647:3802702291916077 0:0 KernelExecution:428:1991 -3802702290702274:3802702293249973 0:0 CopyDeviceToHost:430:1991 -3802702300213905:3802702301888607 0:0 CopyHostToDevice:431:1991 PASSED! ## Iteration (37) ################# PASSED! @@ -326,38 +681,152 @@ PASSED! ## Iteration (33) ################# PASSED! ## Iteration (32) ################# +129856400949298:129856401690102 0:0 CopyHostToDevice:382:14696 +129856401774737:129856402998097 0:0 KernelExecution:386:14696 +129856401746598:129856403538591 0:0 CopyDeviceToHost:388:14696 +129856408364229:129856409075649 0:0 CopyHostToDevice:389:14696 +129856409181579:129856410405739 0:0 KernelExecution:393:14696 +129856409154049:129856410946890 0:0 CopyDeviceToHost:395:14696 +129856415833858:129856416545026 0:0 CopyHostToDevice:396:14696 +129856416634688:129856417856288 0:0 KernelExecution:400:14696 +129856416607076:129856418397645 0:0 CopyDeviceToHost:402:14696 +129856423255064:129856423962733 0:0 CopyHostToDevice:403:14696 +129856424049344:129856425272224 0:0 KernelExecution:407:14696 +129856424021555:129856425837337 0:0 CopyDeviceToHost:409:14696 +129856430719717:129856431443207 0:0 CopyHostToDevice:410:14696 +129856431530370:129856432753411 0:0 KernelExecution:414:14696 +129856431502760:129856433290891 0:0 CopyDeviceToHost:416:14696 +129856438127461:129856438893077 0:0 CopyHostToDevice:417:14696 +129856438981153:129856440204834 0:0 KernelExecution:421:14696 +129856438953062:129856440755527 0:0 CopyDeviceToHost:423:14696 +129856445658301:129856446425541 0:0 CopyHostToDevice:424:14696 +129856446512512:129856447734433 0:0 KernelExecution:428:14696 +129856446484748:129856448303143 0:0 CopyDeviceToHost:430:14696 +129856453131279:129856453895371 0:0 CopyHostToDevice:431:14696 +129856453982502:129856455205222 0:0 KernelExecution:435:14696 +129856453954390:129856455747092 0:0 CopyDeviceToHost:437:14696 +129856460549446:129856461267384 0:0 CopyHostToDevice:438:14696 +129856461354488:129856462578648 0:0 KernelExecution:442:14696 +129856461327009:129856463119514 0:0 CopyDeviceToHost:444:14696 +129856467954463:129856468665082 0:0 CopyHostToDevice:445:14696 +129856468756966:129856469978566 0:0 KernelExecution:449:14696 +129856468728958:129856470519550 0:0 CopyDeviceToHost:451:14696 +129856475396016:129856476106990 0:0 CopyHostToDevice:452:14696 +129856476191506:129856477415026 0:0 KernelExecution:456:14696 +129856476164143:129856477979522 0:0 CopyDeviceToHost:458:14696 +129856482841902:129856483550322 0:0 CopyHostToDevice:459:14696 +129856483636804:129856484858245 0:0 KernelExecution:463:14696 +129856483608842:129856485404598 0:0 CopyDeviceToHost:465:14696 +129856490264533:129856491036044 0:0 CopyHostToDevice:466:14696 +129856491121979:129856492348219 0:0 KernelExecution:470:14696 +129856491094217:129856492893929 0:0 CopyDeviceToHost:472:14696 +129856497730065:129856498496809 0:0 CopyHostToDevice:473:14696 +129856498583201:129856499806882 0:0 KernelExecution:477:14696 +129856498555486:129856500349740 0:0 CopyDeviceToHost:479:14696 +129856400931528:129856401693841 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :382 +129856401695697:129856401695698 14696:14696 MARK(name(before HIP LaunchKernel)) +129856401698086:129856401698763 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :384 +129856401700644:129856401701356 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :385 +129856401703387:129856401706670 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :386 +129856401708283:129856401708284 14696:14696 MARK(name(after HIP LaunchKernel)) +129856401710202:129856403550731 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :388 +129856408346178:129856409079144 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :389 +129856409080946:129856409080947 14696:14696 MARK(name(before HIP LaunchKernel)) +129856409119575:129856409120361 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :391 +129856409122350:129856409122982 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :392 +129856409124716:129856409127974 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :393 +129856409129722:129856409129723 14696:14696 MARK(name(after HIP LaunchKernel)) +129856409131595:129856410958682 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :395 +129856415764088:129856416549283 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :396 +129856416551147:129856416551148 14696:14696 MARK(name(before HIP LaunchKernel)) +129856416554753:129856416555457 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :398 +129856416557440:129856416558065 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :399 +129856416560077:129856416563543 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :400 +129856416565220:129856416565221 14696:14696 MARK(name(after HIP LaunchKernel)) +129856416567086:129856418410890 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :402 +129856423185992:129856423965984 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :403 +129856423967686:129856423967687 14696:14696 MARK(name(before HIP LaunchKernel)) +129856423971156:129856423971813 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :405 +129856423973453:129856423974058 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :406 +129856423975959:129856423979023 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :407 +129856423980620:129856423980621 14696:14696 MARK(name(after HIP LaunchKernel)) +129856423982481:129856425851437 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :409 +129856430649566:129856431446819 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :410 +129856431448647:129856431448648 14696:14696 MARK(name(before HIP LaunchKernel)) +129856431451980:129856431452627 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :412 +129856431454467:129856431455103 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :413 +129856431457061:129856431460021 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :414 +129856431461633:129856431461634 14696:14696 MARK(name(after HIP LaunchKernel)) +129856431463427:129856433305223 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :416 +129856438060199:129856438896337 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :417 +129856438898056:129856438898057 14696:14696 MARK(name(before HIP LaunchKernel)) +129856438901614:129856438902293 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :419 +129856438903944:129856438904582 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :420 +129856438906471:129856438909460 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :421 +129856438910995:129856438910996 14696:14696 MARK(name(after HIP LaunchKernel)) +129856438913099:129856440770029 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :423 +129856445589904:129856446428787 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :424 +129856446430525:129856446430526 14696:14696 MARK(name(before HIP LaunchKernel)) +129856446434097:129856446434755 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :426 +129856446436446:129856446437074 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :427 +129856446438958:129856446442103 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :428 +129856446443705:129856446443706 14696:14696 MARK(name(after HIP LaunchKernel)) +129856446445611:129856448319675 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :430 +129856453113306:129856453898651 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :431 +129856453900443:129856453900444 14696:14696 MARK(name(before HIP LaunchKernel)) +129856453903924:129856453904588 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :433 +129856453906239:129856453906854 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :434 +129856453908740:129856453911874 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :435 +129856453913486:129856453913487 14696:14696 MARK(name(after HIP LaunchKernel)) +129856453915356:129856455761272 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :437 +129856460531599:129856461270590 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :438 +129856461272368:129856461272369 14696:14696 MARK(name(before HIP LaunchKernel)) +129856461275845:129856461276515 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :440 +129856461278198:129856461278850 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :441 +129856461280791:129856461283899 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :442 +129856461285595:129856461285596 14696:14696 MARK(name(after HIP LaunchKernel)) +129856461287388:129856463133280 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :444 +129856467884995:129856468668564 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :445 +129856468670291:129856468670292 14696:14696 MARK(name(before HIP LaunchKernel)) +129856468673055:129856468673710 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :447 +129856468675408:129856468676048 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :448 +129856468677942:129856468681455 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :449 +129856468683148:129856468683149 14696:14696 MARK(name(after HIP LaunchKernel)) +129856468685101:129856470532724 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :451 +129856475326269:129856476110399 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :452 +129856476112220:129856476112221 14696:14696 MARK(name(before HIP LaunchKernel)) +129856476115691:129856476116355 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :454 +129856476118083:129856476118692 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :455 +129856476120553:129856476123478 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :456 +129856476125144:129856476125145 14696:14696 MARK(name(after HIP LaunchKernel)) +129856476126929:129856477993159 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :458 +129856482771986:129856483553655 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :459 +129856483555435:129856483555436 14696:14696 MARK(name(before HIP LaunchKernel)) +129856483559048:129856483559715 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :461 +129856483561368:129856483561995 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :462 +129856483563875:129856483567045 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :463 +129856483569037:129856483569038 14696:14696 MARK(name(after HIP LaunchKernel)) +129856483570875:129856485418803 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :465 +129856490199703:129856491039451 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :466 +129856491041225:129856491041226 14696:14696 MARK(name(before HIP LaunchKernel)) +129856491044551:129856491045204 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :468 +129856491046844:129856491047481 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :469 +129856491049291:129856491052245 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :470 +129856491053805:129856491053806 14696:14696 MARK(name(after HIP LaunchKernel)) +129856491055528:129856492907612 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :472 +129856497665310:129856498500405 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :473 +129856498502066:129856498502067 14696:14696 MARK(name(before HIP LaunchKernel)) +129856498505506:129856498506141 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :475 +129856498507858:129856498508491 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :476 +129856498510523:129856498513554 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :477 +129856498515137:129856498515138 14696:14696 MARK(name(after HIP LaunchKernel)) +129856498517011:129856500365762 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :479 PASSED! ## Iteration (31) ################# PASSED! ## Iteration (30) ################# PASSED! ## Iteration (29) ################# -3802702301982442:3802702303186799 0:0 KernelExecution:435:1991 -3802702301959178:3802702304503997 0:0 CopyDeviceToHost:437:1991 -3802702311466108:3802702313146390 0:0 CopyHostToDevice:438:1991 -3802702313238825:3802702314439626 0:0 KernelExecution:442:1991 -3802702313215791:3802702315773720 0:0 CopyDeviceToHost:444:1991 -3802702322736361:3802702324399864 0:0 CopyHostToDevice:445:1991 -3802702324503098:3802702325721085 0:0 KernelExecution:449:1991 -3802702324478794:3802702327055594 0:0 CopyDeviceToHost:451:1991 -3802702334030715:3802702335709388 0:0 CopyHostToDevice:452:1991 -3802702335806620:3802702337014532 0:0 KernelExecution:456:1991 -3802702335783968:3802702338348468 0:0 CopyDeviceToHost:458:1991 -3802702345327399:3802702347004601 0:0 CopyHostToDevice:459:1991 -3802702347117082:3802702348318476 0:0 KernelExecution:463:1991 -3802702347074012:3802702349651691 0:0 CopyDeviceToHost:465:1991 -3802702356616483:3802702358289405 0:0 CopyHostToDevice:466:1991 -3802702358382881:3802702359585164 0:0 KernelExecution:470:1991 -3802702358359406:3802702360920335 0:0 CopyDeviceToHost:472:1991 -3802702367904117:3802702369585909 0:0 CopyHostToDevice:473:1991 -3802702369679903:3802702370875371 0:0 KernelExecution:477:1991 -3802702369656630:3802702372196308 0:0 CopyDeviceToHost:479:1991 -3802702379156600:3802702380837792 0:0 CopyHostToDevice:480:1991 -3802702380930326:3802702382134830 0:0 KernelExecution:484:1991 -3802702380907402:3802702383472292 0:0 CopyDeviceToHost:486:1991 -3802702390441713:3802702392106996 0:0 CopyHostToDevice:487:1991 -3802702392207713:3802702393417847 0:0 KernelExecution:491:1991 -3802702392183556:3802702394752325 0:0 CopyDeviceToHost:493:1991 PASSED! ## Iteration (28) ################# PASSED! @@ -376,37 +845,147 @@ PASSED! ## Iteration (21) ################# PASSED! ## Iteration (20) ################# -3802702401709737:3802702403387670 0:0 CopyHostToDevice:494:1991 -3802702403489293:3802702404695872 0:0 KernelExecution:498:1991 -3802702403465280:3802702406030229 0:0 CopyDeviceToHost:500:1991 -3802702412982171:3802702414646163 0:0 CopyHostToDevice:501:1991 -3802702414739104:3802702415951461 0:0 KernelExecution:505:1991 -3802702414715973:3802702417282642 0:0 CopyDeviceToHost:507:1991 -3802702424250984:3802702425925207 0:0 CopyHostToDevice:508:1991 -3802702426022614:3802702427240009 0:0 KernelExecution:512:1991 -3802702425999277:3802702428556726 0:0 CopyDeviceToHost:514:1991 -3802702435521608:3802702437497583 0:0 CopyHostToDevice:515:1991 -3802702437591756:3802702438798483 0:0 KernelExecution:519:1991 -3802702437567843:3802702440117692 0:0 CopyDeviceToHost:521:1991 -3802702447076184:3802702448752496 0:0 CopyHostToDevice:522:1991 -3802702448844326:3802702450040979 0:0 KernelExecution:526:1991 -3802702448821457:3802702451374905 0:0 CopyDeviceToHost:528:1991 -3802702458338087:3802702460115460 0:0 CopyHostToDevice:529:1991 -3802702460219046:3802702461421625 0:0 KernelExecution:533:1991 -3802702460192041:3802702462758090 0:0 CopyDeviceToHost:535:1991 -3802702469730872:3802702471408304 0:0 CopyHostToDevice:536:1991 -3802702471502923:3802702472699724 0:0 KernelExecution:540:1991 -3802702471478905:3802702474035724 0:0 CopyDeviceToHost:542:1991 -3802702481000815:3802702482659947 0:0 CopyHostToDevice:543:1991 -3802702482757759:3802702483952190 0:0 KernelExecution:547:1991 -3802702482734898:3802702485283566 0:0 CopyDeviceToHost:549:1991 -3802702492244298:3802702493917401 0:0 CopyHostToDevice:550:1991 -3802702494011385:3802702495222705 0:0 KernelExecution:554:1991 -3802702493988441:3802702496538570 0:0 CopyDeviceToHost:556:1991 PASSED! ## Iteration (19) ################# PASSED! ## Iteration (18) ################# +129856505198157:129856505972108 0:0 CopyHostToDevice:480:14696 +129856506058878:129856507279678 0:0 KernelExecution:484:14696 +129856506031181:129856507818608 0:0 CopyDeviceToHost:486:14696 +129856512668452:129856513378344 0:0 CopyHostToDevice:487:14696 +129856513463906:129856514683906 0:0 KernelExecution:491:14696 +129856513435880:129856515225665 0:0 CopyDeviceToHost:493:14696 +129856520057898:129856520789533 0:0 CopyHostToDevice:494:14696 +129856520877018:129856522100858 0:0 KernelExecution:498:14696 +129856520849406:129856522643928 0:0 CopyDeviceToHost:500:14696 +129856527495540:129856528214422 0:0 CopyHostToDevice:501:14696 +129856528300948:129856529522228 0:0 KernelExecution:505:14696 +129856528273469:129856530060374 0:0 CopyDeviceToHost:507:14696 +129856534970413:129856535678341 0:0 CopyHostToDevice:508:14696 +129856535767312:129856536986193 0:0 KernelExecution:512:14696 +129856535739484:129856537527830 0:0 CopyDeviceToHost:514:14696 +129856542452848:129856543222239 0:0 CopyHostToDevice:515:14696 +129856543308707:129856544531907 0:0 KernelExecution:519:14696 +129856543281047:129856545069937 0:0 CopyDeviceToHost:521:14696 +129856549924160:129856550693828 0:0 CopyHostToDevice:522:14696 +129856550779510:129856552004150 0:0 KernelExecution:526:14696 +129856550751409:129856552552270 0:0 CopyDeviceToHost:528:14696 +129856557413139:129856558179223 0:0 CopyHostToDevice:529:14696 +129856558266309:129856559487269 0:0 KernelExecution:533:14696 +129856558237736:129856560027323 0:0 CopyDeviceToHost:535:14696 +129856564827841:129856565542599 0:0 CopyHostToDevice:536:14696 +129856565630041:129856566854841 0:0 KernelExecution:540:14696 +129856565602389:129856567397324 0:0 CopyDeviceToHost:542:14696 +129856572247710:129856572954375 0:0 CopyHostToDevice:543:14696 +129856573041963:129856574264203 0:0 KernelExecution:547:14696 +129856573013452:129856574809983 0:0 CopyDeviceToHost:549:14696 +129856579656436:129856580368439 0:0 CopyHostToDevice:550:14696 +129856580456039:129856581680039 0:0 KernelExecution:554:14696 +129856580428344:129856582226693 0:0 CopyDeviceToHost:556:14696 +129856587092681:129856587802199 0:0 CopyHostToDevice:557:14696 +129856587888587:129856589111627 0:0 KernelExecution:561:14696 +129856587861029:129856589654526 0:0 CopyDeviceToHost:563:14696 +129856594498640:129856595270698 0:0 CopyHostToDevice:564:14696 +129856595356053:129856596579733 0:0 KernelExecution:568:14696 +129856595328424:129856597128257 0:0 CopyDeviceToHost:570:14696 +129856601984341:129856602751266 0:0 CopyHostToDevice:571:14696 +129856505180003:129856505975222 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :480 +129856505976980:129856505976981 14696:14696 MARK(name(before HIP LaunchKernel)) +129856505980587:129856505981234 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :482 +129856505982935:129856505983566 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :483 +129856505985434:129856505988514 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :484 +129856505990096:129856505990097 14696:14696 MARK(name(after HIP LaunchKernel)) +129856505991997:129856507832334 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :486 +129856512649603:129856513382084 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :487 +129856513384599:129856513384600 14696:14696 MARK(name(before HIP LaunchKernel)) +129856513388119:129856513389080 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :489 +129856513391435:129856513392275 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :490 +129856513394697:129856513399367 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :491 +129856513401523:129856513401524 14696:14696 MARK(name(after HIP LaunchKernel)) +129856513404257:129856515239416 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :493 +129856519992571:129856520793180 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :494 +129856520794974:129856520794975 14696:14696 MARK(name(before HIP LaunchKernel)) +129856520798420:129856520799070 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :496 +129856520800911:129856520801530 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :497 +129856520803611:129856520806841 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :498 +129856520808737:129856520808738 14696:14696 MARK(name(after HIP LaunchKernel)) +129856520810545:129856522657358 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :500 +129856527425346:129856528218117 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :501 +129856528219874:129856528219875 14696:14696 MARK(name(before HIP LaunchKernel)) +129856528221975:129856528222627 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :503 +129856528224439:129856528225291 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :504 +129856528227108:129856528230172 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :505 +129856528231752:129856528231753 14696:14696 MARK(name(after HIP LaunchKernel)) +129856528233473:129856530074548 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :507 +129856534899214:129856535681957 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :508 +129856535683676:129856535683677 14696:14696 MARK(name(before HIP LaunchKernel)) +129856535686401:129856535687061 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :510 +129856535688790:129856535689423 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :511 +129856535691153:129856535694294 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :512 +129856535695868:129856535695869 14696:14696 MARK(name(after HIP LaunchKernel)) +129856535697671:129856537541753 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :514 +129856542387175:129856543225418 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :515 +129856543227192:129856543227193 14696:14696 MARK(name(before HIP LaunchKernel)) +129856543230911:129856543231570 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :517 +129856543233243:129856543233871 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :518 +129856543235930:129856543238762 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :519 +129856543240359:129856543240360 14696:14696 MARK(name(after HIP LaunchKernel)) +129856543242179:129856545084137 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :521 +129856549857104:129856550696919 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :522 +129856550698874:129856550698875 14696:14696 MARK(name(before HIP LaunchKernel)) +129856550702196:129856550702852 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :524 +129856550704612:129856550705254 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :525 +129856550707079:129856550709869 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :526 +129856550711442:129856550711443 14696:14696 MARK(name(after HIP LaunchKernel)) +129856550713182:129856552568840 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :528 +129856557336788:129856558182426 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :529 +129856558184195:129856558184196 14696:14696 MARK(name(before HIP LaunchKernel)) +129856558187727:129856558188380 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :531 +129856558190122:129856558190752 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :532 +129856558192774:129856558195554 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :533 +129856558197324:129856558197325 14696:14696 MARK(name(after HIP LaunchKernel)) +129856558199234:129856560041419 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :535 +129856564809360:129856565545640 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :536 +129856565547393:129856565547394 14696:14696 MARK(name(before HIP LaunchKernel)) +129856565549636:129856565550299 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :538 +129856565551969:129856565552581 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :539 +129856565554301:129856565557438 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :540 +129856565559047:129856565559048 14696:14696 MARK(name(after HIP LaunchKernel)) +129856565560847:129856567411065 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :542 +129856572215770:129856572957492 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :543 +129856572959234:129856572959235 14696:14696 MARK(name(before HIP LaunchKernel)) +129856572962526:129856572963184 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :545 +129856572964912:129856572965546 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :546 +129856572967421:129856572970453 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :547 +129856572972097:129856572972098 14696:14696 MARK(name(after HIP LaunchKernel)) +129856572974076:129856574823083 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :549 +129856579588261:129856580372449 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :550 +129856580374262:129856580374263 14696:14696 MARK(name(before HIP LaunchKernel)) +129856580376547:129856580377227 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :552 +129856580378975:129856580379619 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :553 +129856580381546:129856580384467 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :554 +129856580386225:129856580386226 14696:14696 MARK(name(after HIP LaunchKernel)) +129856580388205:129856582240020 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :556 +129856587022783:129856587805709 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :557 +129856587807440:129856587807441 14696:14696 MARK(name(before HIP LaunchKernel)) +129856587811171:129856587811825 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :559 +129856587813530:129856587814170 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :560 +129856587816040:129856587819243 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :561 +129856587820912:129856587820913 14696:14696 MARK(name(after HIP LaunchKernel)) +129856587822927:129856589666874 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :563 +129856594433516:129856595273993 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :564 +129856595275800:129856595275801 14696:14696 MARK(name(before HIP LaunchKernel)) +129856595278990:129856595279652 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :566 +129856595281384:129856595282018 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :567 +129856595283991:129856595287449 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :568 +129856595289101:129856595289102 14696:14696 MARK(name(after HIP LaunchKernel)) +129856595291045:129856597140491 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :570 +129856601919460:129856602754655 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :571 +129856602756445:129856602756446 14696:14696 MARK(name(before HIP LaunchKernel)) +129856602769740:129856602770661 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :573 +129856602772396:129856602773016 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :574 +129856602775079:129856602778192 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :575 +129856602779755:129856602779756 14696:14696 MARK(name(after HIP LaunchKernel)) PASSED! ## Iteration (17) ################# PASSED! @@ -419,33 +998,6 @@ PASSED! ## Iteration (13) ################# PASSED! ## Iteration (12) ################# -3802702503514682:3802702505171794 0:0 CopyHostToDevice:557:1991 -3802702505267652:3802702506468157 0:0 KernelExecution:561:1991 -3802702505243905:3802702507802814 0:0 CopyDeviceToHost:563:1991 -3802702514761386:3802702516425008 0:0 CopyHostToDevice:564:1991 -3802702516524080:3802702517731252 0:0 KernelExecution:568:1991 -3802702516500289:3802702519068477 0:0 CopyDeviceToHost:570:1991 -3802702526022439:3802702527700141 0:0 CopyHostToDevice:571:1991 -3802702527794104:3802702528982164 0:0 KernelExecution:575:1991 -3802702527771042:3802702530315231 0:0 CopyDeviceToHost:577:1991 -3802702537274272:3802702538953635 0:0 CopyHostToDevice:578:1991 -3802702539050334:3802702540254987 0:0 KernelExecution:582:1991 -3802702539025425:3802702541587134 0:0 CopyDeviceToHost:584:1991 -3802702548553016:3802702550225609 0:0 CopyHostToDevice:585:1991 -3802702550319337:3802702551518805 0:0 KernelExecution:589:1991 -3802702550295569:3802702552853758 0:0 CopyDeviceToHost:591:1991 -3802702559816479:3802702561495482 0:0 CopyHostToDevice:592:1991 -3802702561589855:3802702562788137 0:0 KernelExecution:596:1991 -3802702561565542:3802702564108601 0:0 CopyDeviceToHost:598:1991 -3802702571069693:3802702572746995 0:0 CopyHostToDevice:599:1991 -3802702572840650:3802702574041747 0:0 KernelExecution:603:1991 -3802702572817856:3802702575375565 0:0 CopyDeviceToHost:605:1991 -3802702582343137:3802702584028249 0:0 CopyHostToDevice:606:1991 -3802702584131111:3802702585314874 0:0 KernelExecution:610:1991 -3802702584098390:3802702586648988 0:0 CopyDeviceToHost:612:1991 -3802702593620890:3802702595300582 0:0 CopyHostToDevice:613:1991 -3802702595394737:3802702596603391 0:0 KernelExecution:617:1991 -3802702595371233:3802702597936882 0:0 CopyDeviceToHost:619:1991 PASSED! ## Iteration (11) ################# PASSED! @@ -460,37 +1012,143 @@ PASSED! ## Iteration (6) ################# PASSED! ## Iteration (5) ################# +129856602843500:129856604064780 0:0 KernelExecution:575:14696 +129856602815760:129856604621212 0:0 CopyDeviceToHost:577:14696 +129856609545997:129856610317997 0:0 CopyHostToDevice:578:14696 +129856610406851:129856611631491 0:0 KernelExecution:582:14696 +129856610379025:129856612168754 0:0 CopyDeviceToHost:584:14696 +129856616987475:129856617701731 0:0 CopyHostToDevice:585:14696 +129856617790176:129856619014496 0:0 KernelExecution:589:14696 +129856617761809:129856619559063 0:0 CopyDeviceToHost:591:14696 +129856624349579:129856625289209 0:0 CopyHostToDevice:592:14696 +129856625377836:129856626603916 0:0 KernelExecution:596:14696 +129856625350001:129856627147692 0:0 CopyDeviceToHost:598:14696 +129856632033149:129856632742303 0:0 CopyHostToDevice:599:14696 +129856632836527:129856634057647 0:0 KernelExecution:603:14696 +129856632808948:129856634598487 0:0 CopyDeviceToHost:605:14696 +129856639443412:129856640151030 0:0 CopyHostToDevice:606:14696 +129856640260250:129856641484890 0:0 KernelExecution:610:14696 +129856640232509:129856642041965 0:0 CopyDeviceToHost:612:14696 +129856646912100:129856647619752 0:0 CopyHostToDevice:613:14696 +129856647705914:129856648930874 0:0 KernelExecution:617:14696 +129856647678197:129856649476287 0:0 CopyDeviceToHost:619:14696 +129856654338593:129856655101879 0:0 CopyHostToDevice:620:14696 +129856655189659:129856656412699 0:0 KernelExecution:624:14696 +129856655161891:129856656960409 0:0 CopyDeviceToHost:626:14696 +129856661822483:129856662586330 0:0 CopyHostToDevice:627:14696 +129856662679432:129856663900712 0:0 KernelExecution:631:14696 +129856662650940:129856664447428 0:0 CopyDeviceToHost:633:14696 +129856669274444:129856670036595 0:0 CopyHostToDevice:634:14696 +129856670129015:129856671350615 0:0 KernelExecution:638:14696 +129856670101388:129856671895354 0:0 CopyDeviceToHost:640:14696 +129856676687339:129856677401038 0:0 CopyHostToDevice:641:14696 +129856677491350:129856678712950 0:0 KernelExecution:645:14696 +129856677463387:129856679258027 0:0 CopyDeviceToHost:647:14696 +129856684088485:129856684823542 0:0 CopyHostToDevice:648:14696 +129856684910895:129856686132975 0:0 KernelExecution:652:14696 +129856684882539:129856686675228 0:0 CopyDeviceToHost:654:14696 +129856691574066:129856692284982 0:0 CopyHostToDevice:655:14696 +129856692371897:129856693594617 0:0 KernelExecution:659:14696 +129856692344278:129856694142257 0:0 CopyDeviceToHost:661:14696 +129856699000899:129856699713058 0:0 CopyHostToDevice:662:14696 +129856699797526:129856701023446 0:0 KernelExecution:666:14696 +129856699769937:129856701569372 0:0 CopyDeviceToHost:668:14696 +129856602781709:129856604636152 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :577 +129856609479851:129856610321075 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :578 +129856610323078:129856610323079 14696:14696 MARK(name(before HIP LaunchKernel)) +129856610326500:129856610327162 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :580 +129856610328857:129856610329498 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :581 +129856610331492:129856610334664 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :582 +129856610336290:129856610336291 14696:14696 MARK(name(after HIP LaunchKernel)) +129856610338048:129856612222255 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :584 +129856616969217:129856617705105 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :585 +129856617706989:129856617706990 14696:14696 MARK(name(before HIP LaunchKernel)) +129856617710485:129856617711142 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :587 +129856617712846:129856617713491 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :588 +129856617715518:129856617718644 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :589 +129856617720274:129856617720275 14696:14696 MARK(name(after HIP LaunchKernel)) +129856617722118:129856619570993 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :591 +129856624331436:129856625292310 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :592 +129856625294207:129856625294208 14696:14696 MARK(name(before HIP LaunchKernel)) +129856625297113:129856625297761 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :594 +129856625299459:129856625300093 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :595 +129856625301835:129856625305409 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :596 +129856625307116:129856625307117 14696:14696 MARK(name(after HIP LaunchKernel)) +129856625309051:129856627159676 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :598 +129856631962417:129856632745795 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :599 +129856632747622:129856632747623 14696:14696 MARK(name(before HIP LaunchKernel)) +129856632761013:129856632761762 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :601 +129856632763565:129856632764219 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :602 +129856632766094:129856632769110 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :603 +129856632770707:129856632770708 14696:14696 MARK(name(after HIP LaunchKernel)) +129856632772662:129856634610068 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :605 +129856639375744:129856640154106 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :606 +129856640155933:129856640155934 14696:14696 MARK(name(before HIP LaunchKernel)) +129856640159565:129856640160216 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :608 +129856640161841:129856640162476 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :609 +129856640164410:129856640167293 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :610 +129856640168886:129856640168887 14696:14696 MARK(name(after HIP LaunchKernel)) +129856640170703:129856642054780 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :612 +129856646841774:129856647623131 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :613 +129856647624849:129856647624850 14696:14696 MARK(name(before HIP LaunchKernel)) +129856647628076:129856647628742 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :615 +129856647630426:129856647631050 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :616 +129856647632957:129856647636281 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :617 +129856647637872:129856647637873 14696:14696 MARK(name(after HIP LaunchKernel)) +129856647639599:129856649488719 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :619 +129856654273909:129856655105030 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :620 +129856655106878:129856655106879 14696:14696 MARK(name(before HIP LaunchKernel)) +129856655109847:129856655110497 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :622 +129856655112292:129856655112914 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :623 +129856655114757:129856655118162 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :624 +129856655119835:129856655119836 14696:14696 MARK(name(after HIP LaunchKernel)) +129856655121792:129856656973292 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :626 PASSED! ## Iteration (4) ################# +129856661755424:129856662589447 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :627 +129856662591236:129856662591237 14696:14696 MARK(name(before HIP LaunchKernel)) +129856662604066:129856662604831 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :629 +129856662606611:129856662607261 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :630 +129856662608995:129856662611988 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :631 +129856662613644:129856662613645 14696:14696 MARK(name(after HIP LaunchKernel)) +129856662615584:129856664462467 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :633 +129856669256336:129856670039683 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :634 +129856670041634:129856670041635 14696:14696 MARK(name(before HIP LaunchKernel)) +129856670054499:129856670055254 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :636 +129856670056982:129856670057615 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :637 +129856670059351:129856670062513 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :638 +129856670064113:129856670064114 14696:14696 MARK(name(after HIP LaunchKernel)) +129856670066200:129856671906923 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :640 +129856676668791:129856677404223 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :641 +129856677406068:129856677406069 14696:14696 MARK(name(before HIP LaunchKernel)) +129856677408812:129856677409484 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :643 +129856677411095:129856677411722 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :644 +129856677413461:129856677416941 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :645 +129856677418503:129856677418504 14696:14696 MARK(name(after HIP LaunchKernel)) +129856677420242:129856679269939 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :647 +129856684019418:129856684826552 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :648 +129856684828363:129856684828364 14696:14696 MARK(name(before HIP LaunchKernel)) +129856684832034:129856684832695 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :650 +129856684834368:129856684834970 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :651 +129856684836877:129856684839963 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :652 +129856684841560:129856684841561 14696:14696 MARK(name(after HIP LaunchKernel)) +129856684843320:129856686688518 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :654 +129856691504696:129856692288950 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :655 +129856692290798:129856692290799 14696:14696 MARK(name(before HIP LaunchKernel)) +129856692292859:129856692293513 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :657 +129856692295227:129856692295860 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :658 +129856692297819:129856692300821 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :659 +129856692302355:129856692302356 14696:14696 MARK(name(after HIP LaunchKernel)) +129856692304530:129856694153679 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :661 +129856698928289:129856699716162 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :662 +129856699717890:129856699717891 14696:14696 MARK(name(before HIP LaunchKernel)) +129856699720061:129856699720715 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :664 +129856699722330:129856699722941 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :665 +129856699724836:129856699728198 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :666 +129856699729953:129856699729954 14696:14696 MARK(name(after HIP LaunchKernel)) +129856699731887:129856701581422 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :668 PASSED! ## Iteration (3) ################# -3802702605023015:3802702606699537 0:0 CopyHostToDevice:620:1991 -3802702606793386:3802702607994484 0:0 KernelExecution:624:1991 -3802702606770418:3802702609331847 0:0 CopyDeviceToHost:626:1991 -3802702616295619:3802702617971351 0:0 CopyHostToDevice:627:1991 -3802702618064582:3802702619276198 0:0 KernelExecution:631:1991 -3802702618041252:3802702620593170 0:0 CopyDeviceToHost:633:1991 -3802702627572022:3802702629249514 0:0 CopyHostToDevice:634:1991 -3802702629343204:3802702630550228 0:0 KernelExecution:638:1991 -3802702629319715:3802702631886524 0:0 CopyDeviceToHost:640:1991 -3802702638854896:3802702640514568 0:0 CopyHostToDevice:641:1991 -3802702640601153:3802702641794250 0:0 KernelExecution:645:1991 -3802702640583338:3802702643131137 0:0 CopyDeviceToHost:647:1991 -3802702650106259:3802702651784942 0:0 CopyHostToDevice:648:1991 -3802702651876671:3802702653079250 0:0 KernelExecution:652:1991 -3802702651853582:3802702654414351 0:0 CopyDeviceToHost:654:1991 -3802702661383522:3802702663061155 0:0 CopyHostToDevice:655:1991 -3802702663154356:3802702664347453 0:0 KernelExecution:659:1991 -3802702663130645:3802702665680984 0:0 CopyDeviceToHost:661:1991 -3802702672630496:3802702674303238 0:0 CopyHostToDevice:662:1991 -3802702674398093:3802702675599190 0:0 KernelExecution:666:1991 -3802702674374489:3802702676932868 0:0 CopyDeviceToHost:668:1991 -3802702683898880:3802702685606503 0:0 CopyHostToDevice:669:1991 -3802702685701165:3802702686898410 0:0 KernelExecution:673:1991 -3802702685678193:3802702688219002 0:0 CopyDeviceToHost:675:1991 -3802702695162453:3802702696838515 0:0 CopyHostToDevice:676:1991 -3802702696932444:3802702698137097 0:0 KernelExecution:680:1991 -3802702696909796:3802702699473165 0:0 CopyDeviceToHost:682:1991 PASSED! ## Iteration (2) ################# PASSED! @@ -498,12 +1156,55 @@ PASSED! PASSED! ## Iteration (0) ################# PASSED! -3802702706580728:3802702708245350 0:0 CopyHostToDevice:683:1991 -3802702708346791:3802702709549370 0:0 KernelExecution:687:1991 -3802702708322181:3802702710885410 0:0 CopyDeviceToHost:689:1991 -3802702717849822:3802702719525044 0:0 CopyHostToDevice:690:1991 -3802702719618857:3802702720813139 0:0 KernelExecution:694:1991 -3802702719594825:3802702722149644 0:0 CopyDeviceToHost:696:1991 -3802702729111215:3802702730788167 0:0 CopyHostToDevice:697:1991 -3802702730881622:3802702732076497 0:0 KernelExecution:701:1991 -3802702730858498:3802702733412517 0:0 CopyDeviceToHost:703:1991 +129856706468741:129856707235310 0:0 CopyHostToDevice:669:14696 +129856707327230:129856708548510 0:0 KernelExecution:673:14696 +129856707299810:129856709098218 0:0 CopyDeviceToHost:675:14696 +129856713958124:129856714730788 0:0 CopyHostToDevice:676:14696 +129856714818472:129856716040872 0:0 KernelExecution:680:14696 +129856714790211:129856716592662 0:0 CopyDeviceToHost:682:14696 +129856721429109:129856722193080 0:0 CopyHostToDevice:683:14696 +129856722282194:129856723505714 0:0 KernelExecution:687:14696 +129856722254384:129856724056420 0:0 CopyDeviceToHost:689:14696 +129856728891611:129856729607012 0:0 CopyHostToDevice:690:14696 +129856729693911:129856730917431 0:0 KernelExecution:694:14696 +129856729665766:129856731460761 0:0 CopyDeviceToHost:696:14696 +129856736249266:129856736963101 0:0 CopyHostToDevice:697:14696 +129856737053267:129856738276147 0:0 KernelExecution:701:14696 +129856737025461:129856738822547 0:0 CopyDeviceToHost:703:14696 +129856706409352:129856707238410 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :669 +129856707240341:129856707240342 14696:14696 MARK(name(before HIP LaunchKernel)) +129856707253495:129856707254390 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :671 +129856707256214:129856707256878 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :672 +129856707258659:129856707261885 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :673 +129856707263518:129856707263519 14696:14696 MARK(name(after HIP LaunchKernel)) +129856707265698:129856709110388 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :675 +129856713891418:129856714734007 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :676 +129856714735794:129856714735795 14696:14696 MARK(name(before HIP LaunchKernel)) +129856714739058:129856714739715 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :678 +129856714741339:129856714741972 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :679 +129856714743986:129856714747316 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :680 +129856714748993:129856714748994 14696:14696 MARK(name(after HIP LaunchKernel)) +129856714750976:129856716607126 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :682 +129856721364192:129856722196489 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :683 +129856722198322:129856722198323 14696:14696 MARK(name(before HIP LaunchKernel)) +129856722202102:129856722202759 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :685 +129856722204452:129856722205080 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :686 +129856722207098:129856722210100 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :687 +129856722211652:129856722211653 14696:14696 MARK(name(after HIP LaunchKernel)) +129856722213452:129856724068250 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :689 +129856728873958:129856729610520 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :690 +129856729612474:129856729612475 14696:14696 MARK(name(before HIP LaunchKernel)) +129856729615953:129856729616618 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :692 +129856729618275:129856729618880 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :693 +129856729620844:129856729623983 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :694 +129856729625525:129856729625526 14696:14696 MARK(name(after HIP LaunchKernel)) +129856729627363:129856731472859 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :696 +129856736212718:129856736966611 14696:14696 hipMemcpy(dst=0x7fd65ce00000, src=0x7fd7781ff010, sizeBytes=4194304, kind=1) :697 +129856736968384:129856736968385 14696:14696 MARK(name(before HIP LaunchKernel)) +129856736971498:129856736972186 14696:14696 __hipPushCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :699 +129856736973934:129856736974581 14696:14696 __hipPopCallConfiguration(gridDim={}, blockDim={}, sharedMem=0, stream=0) :700 +129856736976433:129856736979849 14696:14696 hipLaunchKernel(function_address=0x4010c0, numBlocks={}, dimBlocks={}, args=0x7ffe6d9cea08, sharedMemBytes=0, stream=0) kernel=matrixTranspose(float*, float*, int) :701 +129856736981559:129856736981560 14696:14696 MARK(name(after HIP LaunchKernel)) +129856736983603:129856738834349 14696:14696 hipMemcpy(dst=0x7fd65d707010, src=0x7fd65c800000, sizeBytes=4194304, kind=2) :703 +129856743571751:129856743607276 14696:14696 hipFree(ptr=0x7fd65ce00000) :704 +129856743609591:129856743621235 14696:14696 hipFree(ptr=0x7fd65c800000) :705 diff --git a/test/golden_traces/ctrl_dryrun_trace.txt b/test/golden_traces/ctrl_dryrun_trace.txt new file mode 100644 index 00000000..e69de29b diff --git a/test/golden_traces/tests_trace_cmp_levels.txt b/test/golden_traces/tests_trace_cmp_levels.txt index 5e6dbaa7..ed27e868 100644 --- a/test/golden_traces/tests_trace_cmp_levels.txt +++ b/test/golden_traces/tests_trace_cmp_levels.txt @@ -1,4 +1,6 @@ # dummy +MatrixTranspose_dryrun_trace --check-none +ctrl_dryrun_trace --check-none MatrixTranspose_ctest_trace --check-count .* MatrixTranspose_test_trace --check-count .* --ignore-count hsaKmt.* MatrixTranspose_hipaact_test_trace --check-count .* --ignore-count hsaKmt.*|hipMemcpy|__hipPushCallConfiguration|hipLaunchKernel|__hipPopCallConfiguration @@ -10,3 +12,5 @@ MatrixTranspose_hip_flush_trace --check-order .* MatrixTranspose_kfd_trace --check-events .* ctrl_hsa_trace --check-event .* ctrl_hsa_input_trace --check-event .* +hsa_co_trace --check-none +code_obj_trace --check-none diff --git a/test/run.sh b/test/run.sh index 962033f6..9a7ffc74 100755 --- a/test/run.sh +++ b/test/run.sh @@ -1,4 +1,4 @@ -#!/bin/sh -x +#!/bin/sh ################################################################################ # Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. @@ -79,15 +79,18 @@ eval_test() { test_runnum=$((test_runnum + 1)) eval "$cmdline" >$test_trace 2>&1 is_failed=$? - cat $test_trace + if [ $is_failed != 0 ] ; then + cat $test_trace + fi if [ $IS_CI = 1 ] ; then is_failed=0; else if [ $is_failed = 0 ] ; then - python ./test/check_trace.py -in $test_name -ck $check_trace_flag + python3 ./test/check_trace.py -in $test_name -ck $check_trace_flag is_failed=$? if [ $is_failed != 0 ] ; then - python ./test/check_trace.py -v -in $test_name -ck $check_trace_flag + echo "Trace checker error:" + python3 ./test/check_trace.py -v -in $test_name -ck $check_trace_flag fi fi fi @@ -103,6 +106,10 @@ eval_test() { test_number=$((test_number + 1)) } +# Tests dry run +eval_test "MatrixTranspose dry run" ./test/MatrixTranspose MatrixTranspose_dryrun_trace +eval_test "ctrl dry run" ./test/hsa/ctrl ctrl_dryrun_trace + # Standalone test # rocTrecer is used explicitely by test eval_test "standalone C test" "LD_PRELOAD=libkfdwrapper64.so ./test/MatrixTranspose_ctest" MatrixTranspose_ctest_trace @@ -127,6 +134,11 @@ export ROCTRACER_DOMAIN="hip" eval_test "tool period test" "ROCP_CTRL_RATE=10:100000:1000000 ./test/MatrixTranspose" MatrixTranspose_hip_period_trace eval_test "tool flushing test" "ROCP_FLUSH_RATE=100000 ./test/MatrixTranspose" MatrixTranspose_hip_flush_trace +#API records filtering +echo "" > input.xml +export ROCP_INPUT=input.xml +eval_test "tool HIP test input" ./test/MatrixTranspose hip_input_trace + # HSA test export ROCTRACER_DOMAIN="hsa" # test trace @@ -148,6 +160,13 @@ echo " diff --git a/test/tool/tracer_tool.cpp b/test/tool/tracer_tool.cpp index df2530a5..cf555747 100644 --- a/test/tool/tracer_tool.cpp +++ b/test/tool/tracer_tool.cpp @@ -26,6 +26,7 @@ THE SOFTWARE. #include /* names denangle */ #include #include +#include #include #include #include /* SYS_xxx definitions */ @@ -42,6 +43,7 @@ THE SOFTWARE. #include "src/core/loader.h" #include "src/core/trace_buffer.h" +#include "util/evt_stats.h" #include "util/hsa_rsrc_factory.h" #include "util/xml.h" @@ -77,6 +79,24 @@ THE SOFTWARE. #define ONLOAD_TRACE_BEG() ONLOAD_TRACE("begin") #define ONLOAD_TRACE_END() ONLOAD_TRACE("end") +static inline uint32_t GetPid() { return syscall(__NR_getpid); } +static inline uint32_t GetTid() { return syscall(__NR_gettid); } + +#if DEBUG_TRACE_ON +inline static void DEBUG_TRACE(const char* fmt, ...) { + constexpr int size = 256; + char buf[size]; + + va_list valist; + va_start(valist, fmt); + vsnprintf(buf, size, fmt, valist); + printf("%u:%u %s", GetPid(), GetTid(), buf); fflush(stdout); + va_end(valist); +} +#else +inline static void DEBUG_TRACE(const char* fmt, ...) {} +#endif + typedef hsa_rt_utils::Timer::timestamp_t timestamp_t; hsa_rt_utils::Timer* timer = NULL; thread_local timestamp_t hsa_begin_timestamp = 0; @@ -92,10 +112,17 @@ bool trace_pcs = false; // API trace vector std::vector hsa_api_vec; std::vector kfd_api_vec; +std::vector hip_api_vec; LOADER_INSTANTIATE(); TRACE_BUFFER_INSTANTIATE(); +typedef EvtStatsT EvtStatsA; +// HIP stats +EvtStats* hip_api_stats = NULL; +EvtStatsA* hip_kernel_stats = NULL; +EvtStatsA* hip_memcpy_stats = NULL; + // Global output file handle FILE* begin_ts_file_handle = NULL; FILE* roctx_file_handle = NULL; @@ -118,9 +145,6 @@ void close_file_handles() { if (pc_sample_file_handle) close_output_file(pc_sample_file_handle); } -static inline uint32_t GetPid() { return syscall(__NR_getpid); } -static inline uint32_t GetTid() { return syscall(__NR_gettid); } - static const uint32_t my_pid = GetPid(); // Error handler @@ -137,7 +161,7 @@ static inline const char* cxx_demangle(const char* symbol) { size_t funcnamesize; int status; const char* ret = (symbol != NULL) ? abi::__cxa_demangle(symbol, NULL, &funcnamesize, &status) : symbol; - return (ret != NULL) ? ret : symbol; + return (ret != NULL) ? ret : strdup(symbol); } // Tracing control thread @@ -169,6 +193,8 @@ void* control_thr_fun(void*) { usleep(dist_us); } } + + return NULL; } // Flushing control thread @@ -197,8 +223,8 @@ void* flush_thr_fun(void*) { // rocTX annotation tracing struct roctx_trace_entry_t { - uint32_t valid; - uint32_t type; + std::atomic valid; + roctracer::entry_type_t type; uint32_t cid; timestamp_t time; uint32_t pid; @@ -208,8 +234,8 @@ struct roctx_trace_entry_t { }; void roctx_flush_cb(roctx_trace_entry_t* entry); -roctracer::TraceBuffer::flush_prm_t roctx_flush_prm[1] = {{0, roctx_flush_cb}}; -roctracer::TraceBuffer roctx_trace_buffer("rocTX API", 0x200000, roctx_flush_prm, 1); +constexpr roctracer::TraceBuffer::flush_prm_t roctx_flush_prm = {roctracer::DFLT_ENTRY_TYPE, roctx_flush_cb}; +roctracer::TraceBuffer* roctx_trace_buffer = NULL; // rocTX callback function static inline void roctx_callback_fun( @@ -224,15 +250,14 @@ static inline void roctx_callback_fun( #else const timestamp_t time = timer->timestamp_fn_ns(); #endif - roctx_trace_entry_t* entry = roctx_trace_buffer.GetEntry(); - entry->valid = roctracer::TRACE_ENTRY_COMPL; - entry->type = 0; + roctx_trace_entry_t* entry = roctx_trace_buffer->GetEntry(); entry->cid = cid; entry->time = time; entry->pid = GetPid(); entry->tid = tid; entry->rid = rid; entry->message = (message != NULL) ? strdup(message) : NULL; + entry->valid.store(roctracer::TRACE_ENTRY_COMPL, std::memory_order_release); } void roctx_api_callback( @@ -275,8 +300,8 @@ void roctx_flush_cb(roctx_trace_entry_t* entry) { // HSA API tracing struct hsa_api_trace_entry_t { - uint32_t valid; - uint32_t type; + std::atomic valid; + roctracer::entry_type_t type; uint32_t cid; timestamp_t begin; timestamp_t end; @@ -286,8 +311,8 @@ struct hsa_api_trace_entry_t { }; void hsa_api_flush_cb(hsa_api_trace_entry_t* entry); -roctracer::TraceBuffer::flush_prm_t hsa_flush_prm[1] = {{0, hsa_api_flush_cb}}; -roctracer::TraceBuffer hsa_api_trace_buffer("HSA API", 0x200000, hsa_flush_prm, 1); +constexpr roctracer::TraceBuffer::flush_prm_t hsa_flush_prm = {roctracer::DFLT_ENTRY_TYPE, hsa_api_flush_cb}; +roctracer::TraceBuffer* hsa_api_trace_buffer = NULL; // HSA API callback function void hsa_api_callback( @@ -302,15 +327,14 @@ void hsa_api_callback( hsa_begin_timestamp = timer->timestamp_fn_ns(); } else { const timestamp_t end_timestamp = (cid == HSA_API_ID_hsa_shut_down) ? hsa_begin_timestamp : timer->timestamp_fn_ns(); - hsa_api_trace_entry_t* entry = hsa_api_trace_buffer.GetEntry(); - entry->valid = roctracer::TRACE_ENTRY_COMPL; - entry->type = 0; + hsa_api_trace_entry_t* entry = hsa_api_trace_buffer->GetEntry(); entry->cid = cid; entry->begin = hsa_begin_timestamp; entry->end = end_timestamp; entry->pid = GetPid(); entry->tid = GetTid(); entry->data = *data; + entry->valid.store(roctracer::TRACE_ENTRY_COMPL, std::memory_order_release); } } @@ -334,8 +358,8 @@ void hsa_activity_callback( // HIP API tracing struct hip_api_trace_entry_t { - uint32_t valid; - uint32_t type; + std::atomic valid; + roctracer::entry_type_t type; uint32_t domain; uint32_t cid; timestamp_t begin; @@ -348,8 +372,8 @@ struct hip_api_trace_entry_t { }; void hip_api_flush_cb(hip_api_trace_entry_t* entry); -roctracer::TraceBuffer::flush_prm_t hip_flush_prm[1] = {{0, hip_api_flush_cb}}; -roctracer::TraceBuffer hip_api_trace_buffer("HIP", 0x200000, hip_flush_prm, 1); +constexpr roctracer::TraceBuffer::flush_prm_t hip_api_flush_prm = {roctracer::DFLT_ENTRY_TYPE, hip_api_flush_cb}; +roctracer::TraceBuffer* hip_api_trace_buffer = NULL; static inline bool is_hip_kernel_launch_api(const uint32_t& cid) { bool ret = @@ -371,21 +395,20 @@ void hip_api_callback( { (void)arg; const hip_api_data_t* data = reinterpret_cast(callback_data); + const timestamp_t timestamp = timer->timestamp_fn_ns(); + hip_api_trace_entry_t* entry = NULL; if (data->phase == ACTIVITY_API_PHASE_ENTER) { - hip_begin_timestamp = timer->timestamp_fn_ns(); + hip_begin_timestamp = timestamp; } else { // Post onit of HIP APU args hipApiArgsInit((hip_api_id_t)cid, const_cast(data)); - const timestamp_t end_timestamp = timer->timestamp_fn_ns(); - hip_api_trace_entry_t* entry = hip_api_trace_buffer.GetEntry(); - entry->valid = roctracer::TRACE_ENTRY_COMPL; - entry->type = 0; + entry = hip_api_trace_buffer->GetEntry(); entry->cid = cid; entry->domain = domain; entry->begin = hip_begin_timestamp; - entry->end = end_timestamp; + entry->end = timestamp; entry->pid = GetPid(); entry->tid = GetTid(); entry->data = *data; @@ -427,7 +450,13 @@ void hip_api_callback( } } } + + entry->valid.store(roctracer::TRACE_ENTRY_COMPL, std::memory_order_release); } + + const char * name = roctracer_op_string(domain, cid, 0); + DEBUG_TRACE("hip_api_callback(\"%s\") phase(%d): cid(%u) data(%p) entry(%p) name(\"%s\") correlation_id(%lu) timestamp(%lu)\n", + name, data->phase, cid, data, entry, (entry) ? entry->name : NULL, data->correlation_id, timestamp); } void mark_api_callback( @@ -440,9 +469,7 @@ void mark_api_callback( const char* name = reinterpret_cast(callback_data); const timestamp_t timestamp = timer->timestamp_fn_ns(); - hip_api_trace_entry_t* entry = hip_api_trace_buffer.GetEntry(); - entry->valid = roctracer::TRACE_ENTRY_COMPL; - entry->type = 0; + hip_api_trace_entry_t* entry = hip_api_trace_buffer->GetEntry(); entry->cid = 0; entry->domain = domain; entry->begin = timestamp; @@ -452,12 +479,18 @@ void mark_api_callback( entry->data = {}; entry->name = strdup(name); entry->ptr = NULL; + entry->valid.store(roctracer::TRACE_ENTRY_COMPL, std::memory_order_release); } +typedef std::map hip_kernel_map_t; +hip_kernel_map_t* hip_kernel_map = NULL; +std::mutex hip_kernel_mutex; + void hip_api_flush_cb(hip_api_trace_entry_t* entry) { const uint32_t domain = entry->domain; const uint32_t cid = entry->cid; const hip_api_data_t* data = &(entry->data); + const uint64_t correlation_id = data->correlation_id; const timestamp_t begin_timestamp = entry->begin; const timestamp_t end_timestamp = entry->end; std::ostringstream rec_ss; @@ -467,14 +500,29 @@ void hip_api_flush_cb(hip_api_trace_entry_t* entry) { rec_ss << std::dec << begin_timestamp << ":" << end_timestamp << " " << entry->pid << ":" << entry->tid; oss << std::dec << rec_ss.str() << " " << str; + const char * name = roctracer_op_string(entry->domain, entry->cid, 0); + DEBUG_TRACE("hip_api_flush_cb(\"%s\"): domain(%u) cid(%u) entry(%p) name(\"%s\" correlation_id(%lu) beg(%lu) end(%lu))\n", + name, entry->domain, entry->cid, entry, entry->name, correlation_id, begin_timestamp, end_timestamp); + if (domain == ACTIVITY_DOMAIN_HIP_API) { #if HIP_PROF_HIP_API_STRING - const char* str = hipApiString((hip_api_id_t)cid, data); - rec_ss << " " << str; - if (is_hip_kernel_launch_api(cid)) { - if (entry->name) rec_ss << " kernel=" << cxx_demangle(entry->name); + if (hip_api_stats != NULL) { + hip_api_stats->add_event(cid, end_timestamp - begin_timestamp); + if (is_hip_kernel_launch_api(cid)) { + hip_kernel_mutex.lock(); + (*hip_kernel_map)[correlation_id] = entry->name; + hip_kernel_mutex.unlock(); + } + } else { + const char* str = hipApiString((hip_api_id_t)cid, data); + rec_ss << " " << str; + if (is_hip_kernel_launch_api(cid) && entry->name) { + const char* kernel_name = cxx_demangle(entry->name); + rec_ss << " kernel=" << kernel_name; + } + rec_ss<< " :" << correlation_id; + fprintf(hip_api_file_handle, "%s\n", rec_ss.str().c_str()); } - fprintf(hip_api_file_handle, "%s\n", rec_ss.str().c_str()); #else // !HIP_PROF_HIP_API_STRING switch (cid) { case HIP_API_ID_hipMemcpy: @@ -536,6 +584,46 @@ void hip_api_flush_cb(hip_api_trace_entry_t* entry) { fflush(hip_api_file_handle); } +/////////////////////////////////////////////////////////////////////////////////////////////////////// +// HSA API tracing + +struct hip_act_trace_entry_t { + std::atomic valid; + roctracer::entry_type_t type; + uint32_t kind; + timestamp_t dur; + uint64_t correlation_id; +}; + +void hip_act_flush_cb(hip_act_trace_entry_t* entry); +constexpr roctracer::TraceBuffer::flush_prm_t hip_act_flush_prm = {roctracer::DFLT_ENTRY_TYPE, hip_act_flush_cb}; +roctracer::TraceBuffer* hip_act_trace_buffer = NULL; + +// HIP ACT trace buffer flush callback +void hip_act_flush_cb(hip_act_trace_entry_t* entry) { + const uint32_t domain = ACTIVITY_DOMAIN_HCC_OPS; + const uint32_t op = 0; + const char * name = roctracer_op_string(domain, op, entry->kind); + if (name == NULL) { + printf("hip_act_flush_cb name is NULL\n"); fflush(stdout); + abort(); + } + + if (strncmp("Kernel", name, 6) == 0) { + hip_kernel_mutex.lock(); + if (hip_kernel_stats == NULL) { + printf("hip_act_flush_cb hip_kernel_stats is NULL\n"); fflush(stdout); + abort(); + } + name = (*hip_kernel_map)[entry->correlation_id]; + hip_kernel_mutex.unlock(); + const char* kernel_name = cxx_demangle(name); + hip_kernel_stats->add_event(kernel_name, entry->dur); + } else { + hip_memcpy_stats->add_event(name, entry->dur); + } +} + // Activity tracing callback // hipMalloc id(3) correlation_id(1): begin_ns(1525888652762640464) end_ns(1525888652762877067) void pool_activity_callback(const char* begin, const char* end, void* arg) { @@ -544,13 +632,24 @@ void pool_activity_callback(const char* begin, const char* end, void* arg) { while (record < end_record) { const char * name = roctracer_op_string(record->domain, record->op, record->kind); + DEBUG_TRACE("pool_activity_callback(\"%s\"): domain(%u) op(%u) kind(%u) record(%p) correlation_id(%lu) beg(%lu) end(%lu)\n", + name, record->domain, record->op, record->kind, record, record->correlation_id, record->begin_ns, record->end_ns); + switch(record->domain) { case ACTIVITY_DOMAIN_HCC_OPS: - fprintf(hcc_activity_file_handle, "%lu:%lu %d:%lu %s:%lu:%u\n", - record->begin_ns, record->end_ns, - record->device_id, record->queue_id, - name, record->correlation_id, my_pid); - fflush(hcc_activity_file_handle); + if (hip_memcpy_stats != NULL) { + hip_act_trace_entry_t* entry = hip_act_trace_buffer->GetEntry(); + entry->kind = record->kind; + entry->dur = record->end_ns - record->begin_ns; + entry->correlation_id = record->correlation_id; + entry->valid.store(roctracer::TRACE_ENTRY_COMPL, std::memory_order_release); + } else { + fprintf(hcc_activity_file_handle, "%lu:%lu %d:%lu %s:%lu:%u\n", + record->begin_ns, record->end_ns, + record->device_id, record->queue_id, + name, record->correlation_id, my_pid); + fflush(hcc_activity_file_handle); + } break; case ACTIVITY_DOMAIN_HSA_OPS: if (record->op == HSA_OP_ID_RESERVED1) { @@ -639,8 +738,10 @@ int get_xml_array(const xml::Xml::level_t* node, const std::string& field, const } // Open output file -FILE* open_output_file(const char* prefix, const char* name) { +FILE* open_output_file(const char* prefix, const char* name, const char** path = NULL) { FILE* file_handle = NULL; + if (path != NULL) *path = NULL; + if (prefix != NULL) { std::ostringstream oss; oss << prefix << "/" << GetPid() << "_" << name; @@ -651,6 +752,8 @@ FILE* open_output_file(const char* prefix, const char* name) { perror(errmsg.str().c_str()); abort(); } + + if (path != NULL) *path = strdup(oss.str().c_str()); } else file_handle = stdout; return file_handle; } @@ -720,7 +823,6 @@ void tool_unload() { // Flush tracing pool close_tracing_pool(); roctracer::TraceBufferBase::FlushAll(); - close_file_handles(); ONLOAD_TRACE_END(); } @@ -819,6 +921,7 @@ void tool_load() { found = true; trace_hip_api = true; trace_hip_activity = true; + hip_api_vec = api_vec; } if (name == "KFD") { found = true; @@ -920,10 +1023,16 @@ void tool_load() { ONLOAD_TRACE_END(); } +void exit_handler(int status, void* arg) { + ONLOAD_TRACE("status(" << status << ") arg(" << arg << ")"); +} + // HSA-runtime tool on-load method extern "C" PUBLIC_API bool OnLoad(HsaApiTable* table, uint64_t runtime_version, uint64_t failed_tool_count, const char* const* failed_tool_names) { ONLOAD_TRACE_BEG(); + on_exit(exit_handler, NULL); + timer = new hsa_rt_utils::Timer(table->core_->hsa_system_get_info_fn); const char* output_prefix = getenv("ROCP_OUTPUT_DIR"); @@ -979,15 +1088,51 @@ extern "C" PUBLIC_API bool OnLoad(HsaApiTable* table, uint64_t runtime_version, roctracer_set_properties(ACTIVITY_DOMAIN_HIP_API, (void*)mark_api_callback); // Allocating tracing pool open_tracing_pool(); + + // Check for optimized stats + const bool is_stats_opt = (getenv("ROCP_STATS_OPT") != NULL); + + // HIP kernel ma pinstantiation + if (is_stats_opt) hip_kernel_map = new hip_kernel_map_t; + // Enable tracing if (trace_hip_api) { hip_api_file_handle = open_output_file(output_prefix, "hip_api_trace.txt"); - ROCTRACER_CALL(roctracer_enable_domain_callback(ACTIVITY_DOMAIN_HIP_API, hip_api_callback, NULL)); - ROCTRACER_CALL(roctracer_enable_domain_activity(ACTIVITY_DOMAIN_HIP_API)); + if (hip_api_vec.size() != 0) { + for (unsigned i = 0; i < hip_api_vec.size(); ++i) { + uint32_t cid = HIP_API_ID_NUMBER; + const char* api = hip_api_vec[i].c_str(); + ROCTRACER_CALL(roctracer_op_code(ACTIVITY_DOMAIN_HIP_API, api, &cid, NULL)); + ROCTRACER_CALL(roctracer_enable_op_callback(ACTIVITY_DOMAIN_HIP_API, cid, hip_api_callback, NULL)); + printf(" %s", api); + } + } else { + ROCTRACER_CALL(roctracer_enable_domain_callback(ACTIVITY_DOMAIN_HIP_API, hip_api_callback, NULL)); + } + + if (is_stats_opt) { + const char* path = NULL; + FILE* f = open_output_file(output_prefix, "hip_api_stats.csv", &path); + hip_api_stats = new EvtStats(f, path); + for (uint32_t id = 0; id < HIP_API_ID_NUMBER; id += 1) { + const char* label = roctracer_op_string(ACTIVITY_DOMAIN_HIP_API, id, 0); + hip_api_stats->set_label(id, label); + } + } } + if (trace_hip_activity) { hcc_activity_file_handle = open_output_file(output_prefix, "hcc_ops_trace.txt"); ROCTRACER_CALL(roctracer_enable_domain_activity(ACTIVITY_DOMAIN_HCC_OPS)); + + if (is_stats_opt) { + FILE* f = NULL; + const char* path = NULL; + f = open_output_file(output_prefix, "hip_kernel_stats.csv", &path); + hip_kernel_stats = new EvtStatsA(f, path); + f = open_output_file(output_prefix, "hip_memcpy_stats.csv", &path); + hip_memcpy_stats = new EvtStatsA(f, path); + } } } @@ -999,6 +1144,11 @@ extern "C" PUBLIC_API bool OnLoad(HsaApiTable* table, uint64_t runtime_version, ROCTRACER_CALL(roctracer_enable_op_activity(ACTIVITY_DOMAIN_HSA_OPS, HSA_OP_ID_RESERVED1)); } + // Dumping HSA handles for agents and pools + FILE* handles_file_handle = open_output_file(output_prefix, "hsa_handles.txt"); + HsaRsrcFactory::Instance().DumpHandles(handles_file_handle); + close_output_file(handles_file_handle); + ONLOAD_TRACE_END(); return true; } @@ -1010,15 +1160,26 @@ extern "C" PUBLIC_API void OnUnload() { extern "C" CONSTRUCTOR_API void constructor() { ONLOAD_TRACE_BEG(); + roctracer::hip_support::HIP_depth_max = 0; + roctx_trace_buffer = new roctracer::TraceBuffer("rocTX API", 0x200000, &roctx_flush_prm, 1); + hip_api_trace_buffer = new roctracer::TraceBuffer("HIP API", 0x200000, &hip_api_flush_prm, 1); + hip_act_trace_buffer = new roctracer::TraceBuffer("HIP ACT", 0x200000, &hip_act_flush_prm, 1, 1); + hsa_api_trace_buffer = new roctracer::TraceBuffer("HSA API", 0x200000, &hsa_flush_prm, 1); roctracer_load(); tool_load(); ONLOAD_TRACE_END(); } extern "C" DESTRUCTOR_API void destructor() { ONLOAD_TRACE_BEG(); - roctracer_flush_buf(); tool_unload(); + roctracer_flush_buf(); + close_file_handles(); + + + if (hip_api_stats) hip_api_stats->dump(); + if (hip_kernel_stats) hip_kernel_stats->dump(); + if (hip_memcpy_stats) hip_memcpy_stats->dump(); + roctracer_unload(); ONLOAD_TRACE_END(); } -