Skip to content

Commit

Permalink
[SYCL][L0] Add experimental options for fine-tune of dynamic batching (
Browse files Browse the repository at this point in the history
…#4492)

Signed-off-by: Sergey V Maslov <[email protected]>
  • Loading branch information
smaslov-intel authored Sep 10, 2021
1 parent 9dd1ea3 commit 5342ec1
Show file tree
Hide file tree
Showing 2 changed files with 110 additions and 39 deletions.
132 changes: 107 additions & 25 deletions sycl/plugins/level_zero/pi_level_zero.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -170,7 +170,10 @@ enum DebugLevel {
};

// Controls Level Zero calls tracing.
static int ZeDebug = ZE_DEBUG_NONE;
// Tracing level for Level Zero calls. It is computed once, during static
// initialization, from the ZE_DEBUG environment variable.
static const int ZeDebug = [] {
  // When ZE_DEBUG is set, its text is converted with std::atoi and used as
  // the debug level.
  if (const char *EnvText = std::getenv("ZE_DEBUG"))
    return std::atoi(EnvText);
  // Variable not set: tracing stays off.
  return static_cast<int>(ZE_DEBUG_NONE);
}();

static void zePrint(const char *Format, ...) {
if (ZeDebug & ZE_DEBUG_BASIC) {
Expand Down Expand Up @@ -843,20 +846,100 @@ static const int ZeMaxCommandListCacheSize = [] {
return CommandListCacheSizeValue;
}();

static const pi_uint32 ZeCommandListBatchSize = [] {
// Configuration of the command-list batching.
//
// Declared as a named struct rather than a typedef of an unnamed one: the
// type has default member initializers and member functions, and C++20 no
// longer allows an unnamed class that gets its name for linkage purposes from
// a typedef to contain either of those.
struct zeCommandListBatchConfig {
  // Default value of 0. This specifies to use dynamic batch size adjustment.
  // Other values will try to collect the specified amount of commands.
  pi_uint32 Size{0};

  // If doing dynamic batching, specifies the starting batch size.
  pi_uint32 DynamicSizeStart{4};

  // The maximum size for a dynamic batch.
  pi_uint32 DynamicSizeMax{16};

  // The step size for dynamic batch increases.
  pi_uint32 DynamicSizeStep{1};

  // Thresholds for when to increase the batch size (number of closed early is
  // small and number of closed full is high).
  pi_uint32 NumTimesClosedEarlyThreshold{2};
  pi_uint32 NumTimesClosedFullThreshold{10};

  // Tells the starting size of a batch: the fixed Size when one was
  // requested, otherwise the dynamic start size.
  pi_uint32 startSize() const { return Size > 0 ? Size : DynamicSizeStart; }
  // Tells if we are doing dynamic batch size adjustment (Size left at 0).
  bool dynamic() const { return Size == 0; }
};

static const zeCommandListBatchConfig ZeCommandListBatch = [] {
zeCommandListBatchConfig Config{}; // default initialize

// Default value of 0. This specifies to use dynamic batch size adjustment.
pi_uint32 BatchSizeVal = 0;
const auto BatchSizeStr = std::getenv("SYCL_PI_LEVEL_ZERO_BATCH_SIZE");
if (BatchSizeStr) {
pi_int32 BatchSizeStrVal = std::atoi(BatchSizeStr);
// Level Zero may only support a limited number of commands per command
// list. The actual upper limit is not specified by the Level Zero
// Specification. For now we allow an arbitrary upper limit.
// Negative numbers will be silently ignored.
if (BatchSizeStrVal >= 0)
BatchSizeVal = BatchSizeStrVal;
if (BatchSizeStrVal > 0) {
Config.Size = BatchSizeStrVal;
} else if (BatchSizeStrVal == 0) {
Config.Size = 0;
// We are requested to do dynamic batching. Collect specifics, if any.
// The extended format supported is ":" separated values.
//
// NOTE: these extra settings are experimental and are intended to
// be used only for finding a better default heuristic.
//
std::string BatchConfig(BatchSizeStr);
size_t Ord = 0;
size_t Pos = 0;
while (true) {
if (++Ord > 5)
break;

Pos = BatchConfig.find(":", Pos);
if (Pos == std::string::npos)
break;
++Pos; // past the ":"

pi_uint32 Val;
try {
Val = std::stoi(BatchConfig.substr(Pos));
} catch (...) {
zePrint("SYCL_PI_LEVEL_ZERO_BATCH_SIZE: failed to parse value\n");
break;
}
switch (Ord) {
case 1:
Config.DynamicSizeStart = Val;
break;
case 2:
Config.DynamicSizeMax = Val;
break;
case 3:
Config.DynamicSizeStep = Val;
break;
case 4:
Config.NumTimesClosedEarlyThreshold = Val;
break;
case 5:
Config.NumTimesClosedFullThreshold = Val;
break;
default:
die("Unexpected batch config");
}
zePrint("SYCL_PI_LEVEL_ZERO_BATCH_SIZE: dynamic batch param #%d: %d\n",
(int)Ord, (int)Val);
};

} else {
// Negative batch sizes are silently ignored.
zePrint("SYCL_PI_LEVEL_ZERO_BATCH_SIZE: ignored negative value\n");
}
}
return BatchSizeVal;
return Config;
}();

// Retrieve an available command list to be used in a PI call
Expand Down Expand Up @@ -1000,7 +1083,7 @@ pi_result _pi_context::getAvailableCommandList(

void _pi_queue::adjustBatchSizeForFullBatch() {
// QueueBatchSize of 0 means never allow batching.
if (QueueBatchSize == 0 || !UseDynamicBatching)
if (QueueBatchSize == 0 || !ZeCommandListBatch.dynamic())
return;

NumTimesClosedFull += 1;
Expand All @@ -1009,19 +1092,20 @@ void _pi_queue::adjustBatchSizeForFullBatch() {
// the number of times it has been closed full is high, then raise
// the batching size slowly. Don't raise it if it is already pretty
// high.
if (NumTimesClosedEarly <= 2 && NumTimesClosedFull > 10) {
if (QueueBatchSize < 16) {
QueueBatchSize = QueueBatchSize + 1;
if (NumTimesClosedEarly <= ZeCommandListBatch.NumTimesClosedEarlyThreshold &&
NumTimesClosedFull > ZeCommandListBatch.NumTimesClosedFullThreshold) {
if (QueueBatchSize < ZeCommandListBatch.DynamicSizeMax) {
QueueBatchSize += ZeCommandListBatch.DynamicSizeStep;
zePrint("Raising QueueBatchSize to %d\n", QueueBatchSize);
}
NumTimesClosedEarly = 0;
NumTimesClosedFull = 0;
}
}

void _pi_queue::adjustBatchSizeForPartialBatch(pi_uint32 PartialBatchSize) {
void _pi_queue::adjustBatchSizeForPartialBatch() {
// QueueBatchSize of 0 means never allow batching.
if (QueueBatchSize == 0 || !UseDynamicBatching)
if (QueueBatchSize == 0 || !ZeCommandListBatch.dynamic())
return;

NumTimesClosedEarly += 1;
Expand All @@ -1032,7 +1116,7 @@ void _pi_queue::adjustBatchSizeForPartialBatch(pi_uint32 PartialBatchSize) {
// batch size that will be able to be closed full at least once
// in a while.
if (NumTimesClosedEarly > (NumTimesClosedFull + 1) * 3) {
QueueBatchSize = PartialBatchSize - 1;
QueueBatchSize = OpenCommandList->second.size() - 1;
if (QueueBatchSize < 1)
QueueBatchSize = 1;
zePrint("Lowering QueueBatchSize to %d\n", QueueBatchSize);
Expand All @@ -1057,10 +1141,11 @@ pi_result _pi_queue::executeCommandList(pi_command_list_ptr_t CommandList,
// kernels started as soon as possible when there are no kernels from this
// queue awaiting execution, while allowing batching to occur when there
// are kernels already executing. Also, if we are using fixed size batching,
// as indicated by !UseDynamicBatching, then just ignore CurrentlyEmpty
// as we want to strictly follow the batching the user specified.
// as indicated by !ZeCommandListBatch.dynamic(), then just ignore
// CurrentlyEmpty as we want to strictly follow the batching the user
// specified.
if (OKToBatchCommand && this->isBatchingAllowed() &&
(!UseDynamicBatching || !CurrentlyEmpty)) {
(!ZeCommandListBatch.dynamic() || !CurrentlyEmpty)) {

if (hasOpenCommandList() && OpenCommandList != CommandList)
die("executeCommandList: OpenCommandList should be equal to"
Expand Down Expand Up @@ -1207,7 +1292,7 @@ pi_result _pi_queue::executeOpenCommandList() {
// If there are any commands still in the open command list for this
// queue, then close and execute that command list now.
if (hasOpenCommandList()) {
adjustBatchSizeForPartialBatch(OpenCommandList->second.size());
adjustBatchSizeForPartialBatch();
auto Res = executeCommandList(OpenCommandList, false, false);
OpenCommandList = CommandListMap.end();
return Res;
Expand Down Expand Up @@ -1444,10 +1529,6 @@ pi_result piPlatformsGet(pi_uint32 NumEntries, pi_platform *Platforms,
PrintPiTrace = true;
}

static const char *DebugMode = std::getenv("ZE_DEBUG");
static const int DebugModeValue = DebugMode ? std::stoi(DebugMode) : 0;
ZeDebug = DebugModeValue;

if (ZeDebug & ZE_DEBUG_CALL_COUNT) {
ZeCallCount = new std::map<const char *, int>;
}
Expand Down Expand Up @@ -2694,8 +2775,9 @@ pi_result piQueueCreate(pi_context Context, pi_device Device,
PI_ASSERT(Queue, PI_INVALID_QUEUE);

try {
*Queue = new _pi_queue(ZeComputeCommandQueue, ZeCopyCommandQueues, Context,
Device, ZeCommandListBatchSize, true, Properties);
*Queue =
new _pi_queue(ZeComputeCommandQueue, ZeCopyCommandQueues, Context,
Device, ZeCommandListBatch.startSize(), true, Properties);
} catch (const std::bad_alloc &) {
return PI_OUT_OF_HOST_MEMORY;
} catch (...) {
Expand Down Expand Up @@ -2879,7 +2961,7 @@ pi_result piextQueueCreateWithNativeHandle(pi_native_handle NativeHandle,
// compute vs. copy Level-Zero queue.
std::vector<ze_command_queue_handle_t> ZeroCopyQueues;
*Queue = new _pi_queue(ZeQueue, ZeroCopyQueues, Context, Device,
ZeCommandListBatchSize, OwnNativeHandle);
ZeCommandListBatch.startSize(), OwnNativeHandle);
return PI_SUCCESS;
}

Expand Down
17 changes: 3 additions & 14 deletions sycl/plugins/level_zero/pi_level_zero.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -587,19 +587,14 @@ struct _pi_context : _pi_object {
std::mutex NumEventsUnreleasedInEventPoolMutex;
};

// If doing dynamic batching, start batch size at 4.
const pi_uint32 DynamicBatchStartSize = 4;

struct _pi_queue : _pi_object {
_pi_queue(ze_command_queue_handle_t Queue,
std::vector<ze_command_queue_handle_t> &CopyQueues,
pi_context Context, pi_device Device, pi_uint32 BatchSize,
bool OwnZeCommandQueue, pi_queue_properties PiQueueProperties = 0)
: ZeComputeCommandQueue{Queue},
ZeCopyCommandQueues{CopyQueues}, Context{Context}, Device{Device},
QueueBatchSize{BatchSize > 0 ? BatchSize : DynamicBatchStartSize},
OwnZeCommandQueue{OwnZeCommandQueue}, UseDynamicBatching{BatchSize ==
0},
QueueBatchSize{BatchSize}, OwnZeCommandQueue{OwnZeCommandQueue},
PiQueueProperties(PiQueueProperties) {
OpenCommandList = CommandListMap.end();
}
Expand Down Expand Up @@ -668,11 +663,6 @@ struct _pi_queue : _pi_object {
// asked to not transfer the ownership to SYCL RT.
bool OwnZeCommandQueue;

// specifies whether this queue will be using dynamic batch size adjustment
// or not. This is set only at queue creation time, and is therefore
// const for the life of the queue.
const bool UseDynamicBatching;

// These two members are used to keep track of how often the
// batching closes and executes a command list before reaching the
// QueueBatchSize limit, versus how often we reach the limit.
Expand Down Expand Up @@ -704,9 +694,8 @@ struct _pi_queue : _pi_object {
void adjustBatchSizeForFullBatch();

// adjust the queue's batch size, knowing that the current command list
// is being closed with only a partial batch of commands. How many commands
// are in this partial closure is passed as the parameter.
void adjustBatchSizeForPartialBatch(pi_uint32 PartialBatchSize);
// is being closed with only a partial batch of commands.
void adjustBatchSizeForPartialBatch();

// Resets the Command List and Associated fence in the ZeCommandListFenceMap.
// If the reset command list should be made available, then MakeAvailable
Expand Down

0 comments on commit 5342ec1

Please sign in to comment.