Skip to content

Commit

Permalink
Merge pull request #2273 from rafbiels/rafbiels/improve-memset
Browse files Browse the repository at this point in the history
Fix incorrect outputs and improve performance of commonMemSetLargePattern
  • Loading branch information
callumfare authored Nov 15, 2024
2 parents 29ea893 + 6f9d5c5 commit e3247c2
Show file tree
Hide file tree
Showing 2 changed files with 123 additions and 44 deletions.
86 changes: 61 additions & 25 deletions source/adapters/cuda/enqueue.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -961,35 +961,71 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferCopyRect(

// CUDA has no memset functions that allow setting values more than 4 bytes. UR
// API lets you pass an arbitrary "pattern" to the buffer fill, which can be
// more than 4 bytes. We must break up the pattern into 1, 2 or 4-byte values
// and set the buffer using multiple strided calls.
//
// Parameters:
//   Stream      - stream on which every memset call is enqueued.
//   PatternSize - size of the pattern in bytes.
//   Size        - number of bytes to fill. Only Size / PatternSize whole
//                 repetitions are written by the strided fills, so Size is
//                 assumed to be a multiple of PatternSize (TODO confirm
//                 against callers).
//   pPattern    - pointer to PatternSize bytes that are read on the host.
//                 NOTE(review): it is dereferenced as uint16_t/uint32_t when
//                 PatternSize is even / a multiple of 4, so it is assumed to
//                 be suitably aligned for that word type.
//   Ptr         - device address of the first byte to fill.
//
// PatternSize is assumed to be non-zero (it is used as a divisor).
//
// Returns UR_RESULT_SUCCESS; driver failures are handled by UR_CHECK_ERROR.
ur_result_t commonMemSetLargePattern(CUstream Stream, uint32_t PatternSize,
                                     size_t Size, const void *pPattern,
                                     CUdeviceptr Ptr) {
  // Find the largest supported word size into which the pattern can be divided
  const auto BackendWordSize = PatternSize % 4u == 0u   ? 4u
                               : PatternSize % 2u == 0u ? 2u
                                                        : 1u;

  // Calculate the number of words in the pattern, the stride (in bytes)
  // between two consecutive repetitions of the pattern, and the number of
  // times the pattern needs to be applied
  const auto NumberOfSteps = PatternSize / BackendWordSize;
  const auto Pitch = NumberOfSteps * BackendWordSize;
  const auto Height = Size / PatternSize;

  // Same implementation works for any pattern word type (uint8_t, uint16_t,
  // uint32_t); the matching pair of driver entry points is passed in.
  auto memsetImpl = [BackendWordSize, NumberOfSteps, Pitch, Height, Size, Ptr,
                     &Stream](const auto *pPatternWords,
                              auto &&continuousMemset, auto &&stridedMemset) {
    // If the pattern is 1 word or the first word is repeated throughout, a
    // fast continuous fill can be used without the need for slower strided
    // fills
    bool UseOnlyFirstValue{true};
    for (auto Step{1u}; Step < NumberOfSteps; ++Step) {
      if (*(pPatternWords + Step) != *pPatternWords) {
        UseOnlyFirstValue = false;
        break;
      }
    }

    // Fill the whole range with the first word using a continuous fill,
    // because it's faster than a strided fill.
    UR_CHECK_ERROR(continuousMemset(Ptr, *pPatternWords,
                                    Size / BackendWordSize, Stream));
    if (UseOnlyFirstValue) {
      return;
    }

    // Overwrite the remaining word positions of every repetition: one strided
    // fill per pattern word, each touching a single word (width 1) in each of
    // the Height rows that are Pitch bytes apart.
    for (auto Step{1u}; Step < NumberOfSteps; ++Step) {
      UR_CHECK_ERROR(stridedMemset(Ptr + Step * BackendWordSize, Pitch,
                                   *(pPatternWords + Step), 1u, Height,
                                   Stream));
    }
  };

  // Apply the implementation to the chosen pattern word type
  switch (BackendWordSize) {
  case 4u: {
    memsetImpl(static_cast<const uint32_t *>(pPattern), cuMemsetD32Async,
               cuMemsetD2D32Async);
    break;
  }
  case 2u: {
    memsetImpl(static_cast<const uint16_t *>(pPattern), cuMemsetD16Async,
               cuMemsetD2D16Async);
    break;
  }
  default: {
    memsetImpl(static_cast<const uint8_t *>(pPattern), cuMemsetD8Async,
               cuMemsetD2D8Async);
    break;
  }
  }

  return UR_RESULT_SUCCESS;
}

Expand Down
81 changes: 62 additions & 19 deletions source/adapters/hip/enqueue.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -712,25 +712,22 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferCopyRect(

static inline void memsetRemainPattern(hipStream_t Stream, uint32_t PatternSize,
size_t Size, const void *pPattern,
hipDeviceptr_t Ptr) {
hipDeviceptr_t Ptr,
uint32_t StartOffset) {
// Calculate the number of times the pattern needs to be applied
auto Height = Size / PatternSize;

// Calculate the number of patterns, stride and the number of times the
// pattern needs to be applied.
auto NumberOfSteps = PatternSize / sizeof(uint8_t);
auto Pitch = NumberOfSteps * sizeof(uint8_t);
auto Height = Size / NumberOfSteps;

for (auto step = 4u; step < NumberOfSteps; ++step) {
for (auto step = StartOffset; step < PatternSize; ++step) {
// take 1 byte of the pattern
auto Value = *(static_cast<const uint8_t *>(pPattern) + step);

// offset the pointer to the part of the buffer we want to write to
auto OffsetPtr = reinterpret_cast<void *>(reinterpret_cast<uint8_t *>(Ptr) +
(step * sizeof(uint8_t)));
auto OffsetPtr =
reinterpret_cast<void *>(reinterpret_cast<uint8_t *>(Ptr) + step);

// set all of the pattern chunks
UR_CHECK_ERROR(hipMemset2DAsync(OffsetPtr, Pitch, Value, sizeof(uint8_t),
Height, Stream));
UR_CHECK_ERROR(
hipMemset2DAsync(OffsetPtr, PatternSize, Value, 1u, Height, Stream));
}
}

Expand All @@ -743,11 +740,55 @@ static inline void memsetRemainPattern(hipStream_t Stream, uint32_t PatternSize,
ur_result_t commonMemSetLargePattern(hipStream_t Stream, uint32_t PatternSize,
size_t Size, const void *pPattern,
hipDeviceptr_t Ptr) {
// Find the largest supported word size into which the pattern can be divided
auto BackendWordSize = PatternSize % 4u == 0u ? 4u
: PatternSize % 2u == 0u ? 2u
: 1u;

// Calculate the number of patterns
auto NumberOfSteps = PatternSize / BackendWordSize;

// If the pattern is 1 word or the first word is repeated throughout, a fast
// continuous fill can be used without the need for slower strided fills
bool UseOnlyFirstValue{true};
auto checkIfFirstWordRepeats = [&UseOnlyFirstValue,
NumberOfSteps](const auto *pPatternWords) {
for (auto Step{1u}; (Step < NumberOfSteps) && UseOnlyFirstValue; ++Step) {
if (*(pPatternWords + Step) != *pPatternWords) {
UseOnlyFirstValue = false;
}
}
};

// Get 4-byte chunk of the pattern and call hipMemsetD32Async
auto Count32 = Size / sizeof(uint32_t);
auto Value = *(static_cast<const uint32_t *>(pPattern));
UR_CHECK_ERROR(hipMemsetD32Async(Ptr, Value, Count32, Stream));
// Use a continuous fill for the first word in the pattern because it's faster
// than a strided fill. Then, overwrite the other values in subsequent steps.
switch (BackendWordSize) {
case 4u: {
auto *pPatternWords = static_cast<const uint32_t *>(pPattern);
checkIfFirstWordRepeats(pPatternWords);
UR_CHECK_ERROR(
hipMemsetD32Async(Ptr, *pPatternWords, Size / BackendWordSize, Stream));
break;
}
case 2u: {
auto *pPatternWords = static_cast<const uint16_t *>(pPattern);
checkIfFirstWordRepeats(pPatternWords);
UR_CHECK_ERROR(
hipMemsetD16Async(Ptr, *pPatternWords, Size / BackendWordSize, Stream));
break;
}
default: {
auto *pPatternWords = static_cast<const uint8_t *>(pPattern);
checkIfFirstWordRepeats(pPatternWords);
UR_CHECK_ERROR(
hipMemsetD8Async(Ptr, *pPatternWords, Size / BackendWordSize, Stream));
break;
}
}

if (UseOnlyFirstValue) {
return UR_RESULT_SUCCESS;
}

// There is a bug in ROCm prior to 6.0.0 version which causes hipMemset2D
// to behave incorrectly when acting on host pinned memory.
Expand All @@ -761,7 +802,7 @@ ur_result_t commonMemSetLargePattern(hipStream_t Stream, uint32_t PatternSize,
// we need to check that isManaged attribute is false.
if (ptrAttribs.hostPointer && !ptrAttribs.isManaged) {
const auto NumOfCopySteps = Size / PatternSize;
const auto Offset = sizeof(uint32_t);
const auto Offset = BackendWordSize;
const auto LeftPatternSize = PatternSize - Offset;
const auto OffsetPatternPtr = reinterpret_cast<const void *>(
reinterpret_cast<const uint8_t *>(pPattern) + Offset);
Expand All @@ -776,10 +817,12 @@ ur_result_t commonMemSetLargePattern(hipStream_t Stream, uint32_t PatternSize,
Stream));
}
} else {
memsetRemainPattern(Stream, PatternSize, Size, pPattern, Ptr);
memsetRemainPattern(Stream, PatternSize, Size, pPattern, Ptr,
BackendWordSize);
}
#else
memsetRemainPattern(Stream, PatternSize, Size, pPattern, Ptr);
memsetRemainPattern(Stream, PatternSize, Size, pPattern, Ptr,
BackendWordSize);
#endif
return UR_RESULT_SUCCESS;
}
Expand Down

0 comments on commit e3247c2

Please sign in to comment.