From 2e4c3ce9c0cc7200a46f99531782731a574f3753 Mon Sep 17 00:00:00 2001 From: Igor Chorazewicz Date: Fri, 8 Nov 2024 19:53:27 +0100 Subject: [PATCH] [L0 v2] implement enqueueCooperativeKernelLaunchExp --- .../v2/queue_immediate_in_order.cpp | 60 ++++++++++++++++--- 1 file changed, 51 insertions(+), 9 deletions(-) diff --git a/source/adapters/level_zero/v2/queue_immediate_in_order.cpp b/source/adapters/level_zero/v2/queue_immediate_in_order.cpp index 08fae0719f..57620f9330 100644 --- a/source/adapters/level_zero/v2/queue_immediate_in_order.cpp +++ b/source/adapters/level_zero/v2/queue_immediate_in_order.cpp @@ -971,15 +971,57 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueCooperativeKernelLaunchExp( const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, const size_t *pLocalWorkSize, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - std::ignore = hKernel; - std::ignore = workDim; - std::ignore = pGlobalWorkOffset; - std::ignore = pGlobalWorkSize; - std::ignore = pLocalWorkSize; - std::ignore = numEventsInWaitList; - std::ignore = phEventWaitList; - std::ignore = phEvent; - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + TRACK_SCOPE_LATENCY( + "ur_queue_immediate_in_order_t::enqueueCooperativeKernelLaunchExp"); + + UR_ASSERT(hKernel, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(hKernel->getProgramHandle(), UR_RESULT_ERROR_INVALID_NULL_POINTER); + + UR_ASSERT(workDim > 0, UR_RESULT_ERROR_INVALID_WORK_DIMENSION); + UR_ASSERT(workDim < 4, UR_RESULT_ERROR_INVALID_WORK_DIMENSION); + + ze_kernel_handle_t hZeKernel = hKernel->getZeHandle(hDevice); + + std::scoped_lock Lock(this->Mutex, + hKernel->Mutex); + + ze_group_count_t zeThreadGroupDimensions{1, 1, 1}; + uint32_t WG[3]{}; + UR_CALL(calculateKernelWorkDimensions(hZeKernel, hDevice, + zeThreadGroupDimensions, WG, workDim, + pGlobalWorkSize, pLocalWorkSize)); + + auto signalEvent = getSignalEvent(phEvent, UR_COMMAND_KERNEL_LAUNCH); + + auto waitList = getWaitListView(phEventWaitList, numEventsInWaitList); + + bool memoryMigrated = false; + auto memoryMigrate = [&](void *src, void *dst, size_t size) { + ZE2UR_CALL_THROWS(zeCommandListAppendMemoryCopy, + (handler.commandList.get(), dst, src, size, nullptr, + waitList.second, waitList.first)); + memoryMigrated = true; + }; + + UR_CALL(hKernel->prepareForSubmission(hContext, hDevice, pGlobalWorkOffset, + workDim, WG[0], WG[1], WG[2], + memoryMigrate)); + + if (memoryMigrated) { + // If memory was migrated, we don't need to pass the wait list to + // the copy command again. + waitList.first = nullptr; + waitList.second = 0; + } + + TRACK_SCOPE_LATENCY("ur_queue_immediate_in_order_t::" + "zeCommandListAppendLaunchCooperativeKernel"); + auto zeSignalEvent = signalEvent ? signalEvent->getZeEvent() : nullptr; + ZE2UR_CALL(zeCommandListAppendLaunchCooperativeKernel, + (handler.commandList.get(), hZeKernel, &zeThreadGroupDimensions, + zeSignalEvent, waitList.second, waitList.first)); + + return UR_RESULT_SUCCESS; } ur_result_t ur_queue_immediate_in_order_t::enqueueTimestampRecordingExp(