RPP Gaussian filter HOST #478

Open · wants to merge 18 commits into develop
9 changes: 8 additions & 1 deletion CHANGELOG.md
@@ -2,7 +2,14 @@

Full documentation for RPP is available at [https://rocm.docs.amd.com/projects/rpp/en/latest](https://rocm.docs.amd.com/projects/rpp/en/latest)

## (Unreleased) RPP 1.9.4
## RPP 1.9.5 (Unreleased)
Contributor:

Suggested change: replace "## RPP 1.9.5 (Unreleased)" with "## (Unreleased) RPP 1.9.5"

Unreleased versions need to be labelled with (Unreleased) first so that they are not picked up by the release notes script.


### Changes
Contributor:

Suggested change: replace "### Changes" with "### Changed"


* RPP Tensor Gaussian Filter support on HOST

## RPP 1.9.1 for ROCm 6.3.0

### Changes

* RPP Tensor Box Filter support on HOST
2 changes: 1 addition & 1 deletion CMakeLists.txt
@@ -29,7 +29,7 @@ endif()
set(CMAKE_CXX_STANDARD 17)

# RPP Version
set(VERSION "1.9.4")
set(VERSION "1.9.5")

# Set Project Version and Language
project(rpp VERSION ${VERSION} LANGUAGES CXX)
2 changes: 1 addition & 1 deletion include/rpp_version.h
@@ -40,7 +40,7 @@ extern "C" {
// NOTE: IMPORTANT: Match the version with CMakelists.txt version
#define RPP_VERSION_MAJOR 1
#define RPP_VERSION_MINOR 9
#define RPP_VERSION_PATCH 4
#define RPP_VERSION_PATCH 5
#ifdef __cplusplus
}
#endif
23 changes: 23 additions & 0 deletions include/rppt_tensor_filter_augmentations.h
@@ -90,6 +90,29 @@ RppStatus rppt_box_filter_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t
RppStatus rppt_box_filter_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, Rpp32u kernelSize, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle);
#endif // GPU_SUPPORT

/*! \brief Gaussian Filter augmentation on HOST backend for a NCHW/NHWC layout tensor
* \details The Gaussian filter augmentation runs for a batch of RGB(3 channel) / greyscale(1 channel) images with an NHWC/NCHW tensor layout.<br>
* - srcPtr depth ranges - Rpp8u (0 to 255), Rpp16f (0 to 1), Rpp32f (0 to 1), Rpp8s (-128 to 127).
* - dstPtr depth ranges - Will be the same depth as srcPtr.
* \image html img150x150.png Sample Input
* \image html filter_augmentations_gaussian_filter_kSize3_img150x150.png Sample 3x3 Output
* \image html filter_augmentations_gaussian_filter_kSize5_img150x150.png Sample 5x5 Output
* \image html filter_augmentations_gaussian_filter_kSize7_img150x150.png Sample 7x7 Output
* \image html filter_augmentations_gaussian_filter_kSize9_img150x150.png Sample 9x9 Output
* \param [in] srcPtr source tensor in HOST memory
* \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3)
* \param [out] dstPtr destination tensor in HOST memory
* \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr)
* \param [in] stdDevTensor stdDev values for gaussian calculation (1D tensor in HOST memory, of size batchSize, for each image in batch)
* \param [in] kernelSize kernel size for gaussian filter (a single Rpp32u number with kernelSize > 0 that applies to all images in the batch. kernelSize = 3/5/7/9 are optimized to run faster)
* \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HOST handle created with <tt>\ref rppCreateWithBatchSize()</tt>
* \return A <tt> \ref RppStatus</tt> enumeration.
* \retval RPP_SUCCESS Successful completion.
* \retval RPP_ERROR* Unsuccessful completion.
*/
RppStatus rppt_gaussian_filter_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, Rpp32f *stdDevTensor, Rpp32u kernelSize, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle);
#ifdef GPU_SUPPORT
/*! \brief Gaussian Filter augmentation on HIP backend for a NCHW/NHWC layout tensor
* \details The gaussian filter augmentation runs for a batch of RGB(3 channel) / greyscale(1 channel) images with an NHWC/NCHW tensor layout.<br>
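
For reviewers trying out the new call, here is a minimal HOST-side usage sketch (not part of this PR). It assumes the descriptors and handle are already set up as for the existing rppt_box_filter_host call, that RpptDesc exposes n/h/w members and RpptROI the xywhROI fields named in the doc comment above, and the stdDev/kernelSize values are arbitrary examples; check rppdefs.h for the exact types in the RPP version under review.

#include "rpp.h"
#include <vector>

void gaussianFilterHostExample(Rpp8u *srcHost, Rpp8u *dstHost,
                               RpptDescPtr srcDescPtr, RpptDescPtr dstDescPtr,
                               rppHandle_t handle)
{
    Rpp32u batchSize = srcDescPtr->n;

    std::vector<Rpp32f> stdDev(batchSize, 1.5f);   // one stdDev per image in the batch
    Rpp32u kernelSize = 5;                         // 3/5/7/9 take the optimized paths

    std::vector<RpptROI> roi(batchSize);           // full-image ROI per image, XYWH form
    for (Rpp32u i = 0; i < batchSize; i++)
    {
        roi[i].xywhROI.xy.x = 0;
        roi[i].xywhROI.xy.y = 0;
        roi[i].xywhROI.roiWidth = srcDescPtr->w;
        roi[i].xywhROI.roiHeight = srcDescPtr->h;
    }

    RppStatus status = rppt_gaussian_filter_host(srcHost, srcDescPtr, dstHost, dstDescPtr,
                                                 stdDev.data(), kernelSize,
                                                 roi.data(), RpptRoiType::XYWH, handle);
    // status is RPP_SUCCESS on success, an RPP_ERROR* value otherwise
}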
771 changes: 771 additions & 0 deletions src/include/cpu/rpp_cpu_filter.hpp

Large diffs are not rendered by default.

200 changes: 200 additions & 0 deletions src/include/cpu/rpp_cpu_simd.hpp
@@ -273,6 +273,17 @@ inline void rpp_mm256_print_epi32(__m256i vPrintArray)
}
}

inline void rpp_mm256_print_epi16(__m256i vPrintArray)
{
unsigned short int printArray[16];
_mm256_storeu_si256((__m256i *)printArray, vPrintArray);
printf("\n");
for (int ct = 0; ct < 16; ct++)
{
printf("%hu ", printArray[ct]);
}
}

inline void rpp_mm256_print_ps(__m256 vPrintArray)
{
float printArray[8];
@@ -1425,6 +1436,43 @@ inline void rpp_load16_u8_to_f32_avx(Rpp8u *srcPtr, __m256 *p)
p[1] = _mm256_cvtepi32_ps(_mm256_setr_m128i(_mm_shuffle_epi8(px, xmm_pxMask08To11), _mm_shuffle_epi8(px, xmm_pxMask12To15))); /* Contains pixels 09-16 */
}

inline void rpp_load24_u8_to_f32_avx(Rpp8u *srcPtr, __m256 *p)
{
__m128i px1, px2;
px1 = _mm_loadu_si128((__m128i *)(srcPtr));
px2 = _mm_loadl_epi64((__m128i *)(srcPtr + 16));

p[0] = _mm256_cvtepi32_ps(_mm256_setr_m128i(_mm_shuffle_epi8(px1, xmm_pxMask00To03), _mm_shuffle_epi8(px1, xmm_pxMask04To07))); /* Contains pixels 01-08 */
p[1] = _mm256_cvtepi32_ps(_mm256_setr_m128i(_mm_shuffle_epi8(px1, xmm_pxMask08To11), _mm_shuffle_epi8(px1, xmm_pxMask12To15))); /* Contains pixels 09-16 */
p[2] = _mm256_cvtepi32_ps(_mm256_setr_m128i(_mm_shuffle_epi8(px2, xmm_pxMask00To03), _mm_shuffle_epi8(px2, xmm_pxMask04To07))); /* Contains pixels 17-24 */
}

inline void rpp_load32_u8_to_f32_avx(Rpp8u *srcPtr, __m256 *p)
{
__m256i px = _mm256_loadu_si256((__m256i *)srcPtr);
__m128i px1 = _mm256_castsi256_si128(px);
__m128i px2 = _mm256_extractf128_si256(px, 1);

p[0] = _mm256_cvtepi32_ps(_mm256_setr_m128i(_mm_shuffle_epi8(px1, xmm_pxMask00To03), _mm_shuffle_epi8(px1, xmm_pxMask04To07))); // Contains pixels 01-08
p[1] = _mm256_cvtepi32_ps(_mm256_setr_m128i(_mm_shuffle_epi8(px1, xmm_pxMask08To11), _mm_shuffle_epi8(px1, xmm_pxMask12To15))); // Contains pixels 09-16
p[2] = _mm256_cvtepi32_ps(_mm256_setr_m128i(_mm_shuffle_epi8(px2, xmm_pxMask00To03), _mm_shuffle_epi8(px2, xmm_pxMask04To07))); // Contains pixels 17-24
p[3] = _mm256_cvtepi32_ps(_mm256_setr_m128i(_mm_shuffle_epi8(px2, xmm_pxMask08To11), _mm_shuffle_epi8(px2, xmm_pxMask12To15))); // Contains pixels 25-32
}

inline void rpp_load40_u8_to_f32_avx(Rpp8u *srcPtr, __m256 *p)
{
__m256i px1 = _mm256_loadu_si256((__m256i *)srcPtr); // Load the first 32 bytes
__m128i px2 = _mm_loadu_si128((__m128i *)(srcPtr + 32)); // Load the next 16 bytes (only the first 8, pixels 33-40, are used)
__m128i px1Low = _mm256_castsi256_si128(px1);
__m128i px1High = _mm256_extractf128_si256(px1, 1);

p[0] = _mm256_cvtepi32_ps(_mm256_setr_m128i(_mm_shuffle_epi8(px1Low, xmm_pxMask00To03), _mm_shuffle_epi8(px1Low, xmm_pxMask04To07))); // Pixels 01-08
p[1] = _mm256_cvtepi32_ps(_mm256_setr_m128i(_mm_shuffle_epi8(px1Low, xmm_pxMask08To11), _mm_shuffle_epi8(px1Low, xmm_pxMask12To15))); // Pixels 09-16
p[2] = _mm256_cvtepi32_ps(_mm256_setr_m128i(_mm_shuffle_epi8(px1High, xmm_pxMask00To03), _mm_shuffle_epi8(px1High, xmm_pxMask04To07))); // Pixels 17-24
p[3] = _mm256_cvtepi32_ps(_mm256_setr_m128i(_mm_shuffle_epi8(px1High, xmm_pxMask08To11), _mm_shuffle_epi8(px1High, xmm_pxMask12To15))); // Pixels 25-32
p[4] = _mm256_cvtepi32_ps(_mm256_setr_m128i(_mm_shuffle_epi8(px2, xmm_pxMask00To03), _mm_shuffle_epi8(px2, xmm_pxMask04To07))); // Pixels 33-40
}

inline void rpp_load8_u8_to_f32_avx(Rpp8u *srcPtr, __m256 *p)
{
__m128i px;
@@ -1468,6 +1516,13 @@ inline void rpp_load8_i8_to_f64_avx(Rpp8s *srcPtr, __m256d *p)
p[1] = _mm256_cvtepi32_pd(_mm_shuffle_epi8(px, xmm_pxMask04To07)); /* Contains pixels 05-08 */
}

inline void rpp_load8_i8_to_f32_avx(Rpp8s *srcPtr, __m256 *p)
{
__m128i px;
px = _mm_add_epi8(xmm_pxConvertI8, _mm_loadu_si128((__m128i *)srcPtr));
p[0] = _mm256_cvtepi32_ps(_mm256_setr_m128i(_mm_shuffle_epi8(px, xmm_pxMask00To03), _mm_shuffle_epi8(px, xmm_pxMask04To07))); /* Contains pixels 01-08 */
}

inline void rpp_load16_u8_to_u32_avx(Rpp8u *srcPtr, __m256i *p)
{
__m128i px;
@@ -1638,6 +1693,30 @@ inline void rpp_load16_f32_to_f32_avx(Rpp32f *srcPtr, __m256 *p)
p[1] = _mm256_loadu_ps(srcPtr + 8);
}

inline void rpp_load24_f32_to_f32_avx(Rpp32f *srcPtr, __m256 *p)
{
p[0] = _mm256_loadu_ps(srcPtr);
p[1] = _mm256_loadu_ps(srcPtr + 8);
p[2] = _mm256_loadu_ps(srcPtr + 16);
}

inline void rpp_load32_f32_to_f32_avx(Rpp32f *srcPtr, __m256 *p)
{
p[0] = _mm256_loadu_ps(srcPtr);
p[1] = _mm256_loadu_ps(srcPtr + 8);
p[2] = _mm256_loadu_ps(srcPtr + 16);
p[3] = _mm256_loadu_ps(srcPtr + 24);
}

inline void rpp_load40_f32_to_f32_avx(Rpp32f *srcPtr, __m256 *p)
{
p[0] = _mm256_loadu_ps(srcPtr);
p[1] = _mm256_loadu_ps(srcPtr + 8);
p[2] = _mm256_loadu_ps(srcPtr + 16);
p[3] = _mm256_loadu_ps(srcPtr + 24);
p[4] = _mm256_loadu_ps(srcPtr + 32);
}

inline void rpp_store16_f32_to_f32_avx(Rpp32f *dstPtr, __m256 *p)
{
_mm256_storeu_ps(dstPtr, p[0]);
@@ -1878,6 +1957,43 @@ inline void rpp_load16_i8_to_f32_avx(Rpp8s *srcPtr, __m256 *p)
p[1] = _mm256_cvtepi32_ps(_mm256_setr_m128i(_mm_shuffle_epi8(px, xmm_pxMask08To11), _mm_shuffle_epi8(px, xmm_pxMask12To15))); /* Contains pixels 09-16 */
}

inline void rpp_load24_i8_to_f32_avx(Rpp8s *srcPtr, __m256 *p)
{
__m128i px1, px2;
px1 = _mm_add_epi8(xmm_pxConvertI8, _mm_loadu_si128((__m128i *)(srcPtr)));
px2 = _mm_add_epi8(xmm_pxConvertI8, _mm_loadl_epi64((__m128i *)(srcPtr + 16)));

p[0] = _mm256_cvtepi32_ps(_mm256_setr_m128i(_mm_shuffle_epi8(px1, xmm_pxMask00To03), _mm_shuffle_epi8(px1, xmm_pxMask04To07))); /* Contains pixels 01-08 */
p[1] = _mm256_cvtepi32_ps(_mm256_setr_m128i(_mm_shuffle_epi8(px1, xmm_pxMask08To11), _mm_shuffle_epi8(px1, xmm_pxMask12To15))); /* Contains pixels 09-16 */
p[2] = _mm256_cvtepi32_ps(_mm256_setr_m128i(_mm_shuffle_epi8(px2, xmm_pxMask00To03), _mm_shuffle_epi8(px2, xmm_pxMask04To07))); /* Contains pixels 17-24 */
}

inline void rpp_load32_i8_to_f32_avx(Rpp8s *srcPtr, __m256 *p)
{
__m256i px = _mm256_add_epi8(avx_pxConvertI8, _mm256_loadu_si256((__m256i *)srcPtr));
__m128i px1 = _mm256_castsi256_si128(px);
__m128i px2 = _mm256_extractf128_si256(px, 1);

p[0] = _mm256_cvtepi32_ps(_mm256_setr_m128i(_mm_shuffle_epi8(px1, xmm_pxMask00To03), _mm_shuffle_epi8(px1, xmm_pxMask04To07))); // Contains pixels 01-08
p[1] = _mm256_cvtepi32_ps(_mm256_setr_m128i(_mm_shuffle_epi8(px1, xmm_pxMask08To11), _mm_shuffle_epi8(px1, xmm_pxMask12To15))); // Contains pixels 09-16
p[2] = _mm256_cvtepi32_ps(_mm256_setr_m128i(_mm_shuffle_epi8(px2, xmm_pxMask00To03), _mm_shuffle_epi8(px2, xmm_pxMask04To07))); // Contains pixels 17-24
p[3] = _mm256_cvtepi32_ps(_mm256_setr_m128i(_mm_shuffle_epi8(px2, xmm_pxMask08To11), _mm_shuffle_epi8(px2, xmm_pxMask12To15))); // Contains pixels 25-32
}

inline void rpp_load40_i8_to_f32_avx(Rpp8s *srcPtr, __m256 *p)
{
__m256i px1 = _mm256_add_epi8(avx_pxConvertI8, _mm256_loadu_si256((__m256i *)srcPtr)); // Load the first 32 bytes
__m128i px2 = _mm_add_epi8(xmm_pxConvertI8, _mm_loadu_si128((__m128i *)(srcPtr + 32))); // Load the next 16 bytes (only the first 8, pixels 33-40, are used)
__m128i px1Low = _mm256_castsi256_si128(px1);
__m128i px1High = _mm256_extractf128_si256(px1, 1);

p[0] = _mm256_cvtepi32_ps(_mm256_setr_m128i(_mm_shuffle_epi8(px1Low, xmm_pxMask00To03), _mm_shuffle_epi8(px1Low, xmm_pxMask04To07))); // Pixels 01-08
p[1] = _mm256_cvtepi32_ps(_mm256_setr_m128i(_mm_shuffle_epi8(px1Low, xmm_pxMask08To11), _mm_shuffle_epi8(px1Low, xmm_pxMask12To15))); // Pixels 09-16
p[2] = _mm256_cvtepi32_ps(_mm256_setr_m128i(_mm_shuffle_epi8(px1High, xmm_pxMask00To03), _mm_shuffle_epi8(px1High, xmm_pxMask04To07))); // Pixels 17-24
p[3] = _mm256_cvtepi32_ps(_mm256_setr_m128i(_mm_shuffle_epi8(px1High, xmm_pxMask08To11), _mm_shuffle_epi8(px1High, xmm_pxMask12To15))); // Pixels 25-32
p[4] = _mm256_cvtepi32_ps(_mm256_setr_m128i(_mm_shuffle_epi8(px2, xmm_pxMask00To03), _mm_shuffle_epi8(px2, xmm_pxMask04To07))); // Pixels 33-40
}

inline void rpp_load16_i8_to_f32_mirror_avx(Rpp8s *srcPtr, __m256 *p)
{
__m128i px;
@@ -4274,6 +4390,36 @@ inline void rpp_store16_float(Rpp16f *dstPtrTemp, __m256 *pDst)
_mm_storeu_si128((__m128i *)(dstPtrTemp + 8), pxDst[1]);
}

inline void rpp_load16_f16_to_f32_avx(Rpp16f *srcPtr, __m256 *p)
{
p[0] = _mm256_cvtph_ps(_mm_castps_si128(_mm_loadu_ps(reinterpret_cast<Rpp32f *>(srcPtr))));
p[1] = _mm256_cvtph_ps(_mm_castps_si128(_mm_loadu_ps(reinterpret_cast<Rpp32f *>(srcPtr + 8))));
}

inline void rpp_load24_f16_to_f32_avx(Rpp16f *srcPtr, __m256 *p)
{
p[0] = _mm256_cvtph_ps(_mm_castps_si128(_mm_loadu_ps(reinterpret_cast<Rpp32f *>(srcPtr))));
p[1] = _mm256_cvtph_ps(_mm_castps_si128(_mm_loadu_ps(reinterpret_cast<Rpp32f *>(srcPtr + 8))));
p[2] = _mm256_cvtph_ps(_mm_castps_si128(_mm_loadu_ps(reinterpret_cast<Rpp32f *>(srcPtr + 16))));
}

inline void rpp_load32_f16_to_f32_avx(Rpp16f *srcPtr, __m256 *p)
{
p[0] = _mm256_cvtph_ps(_mm_castps_si128(_mm_loadu_ps(reinterpret_cast<Rpp32f *>(srcPtr))));
p[1] = _mm256_cvtph_ps(_mm_castps_si128(_mm_loadu_ps(reinterpret_cast<Rpp32f *>(srcPtr + 8))));
p[2] = _mm256_cvtph_ps(_mm_castps_si128(_mm_loadu_ps(reinterpret_cast<Rpp32f *>(srcPtr + 16))));
p[3] = _mm256_cvtph_ps(_mm_castps_si128(_mm_loadu_ps(reinterpret_cast<Rpp32f *>(srcPtr + 24))));
}

inline void rpp_load40_f16_to_f32_avx(Rpp16f *srcPtr, __m256 *p)
{
p[0] = _mm256_cvtph_ps(_mm_castps_si128(_mm_loadu_ps(reinterpret_cast<Rpp32f *>(srcPtr))));
p[1] = _mm256_cvtph_ps(_mm_castps_si128(_mm_loadu_ps(reinterpret_cast<Rpp32f *>(srcPtr + 8))));
p[2] = _mm256_cvtph_ps(_mm_castps_si128(_mm_loadu_ps(reinterpret_cast<Rpp32f *>(srcPtr + 16))));
p[3] = _mm256_cvtph_ps(_mm_castps_si128(_mm_loadu_ps(reinterpret_cast<Rpp32f *>(srcPtr + 24))));
p[4] = _mm256_cvtph_ps(_mm_castps_si128(_mm_loadu_ps(reinterpret_cast<Rpp32f *>(srcPtr + 32))));
}

inline void rpp_store12_float_pkd_pln(Rpp32f **dstPtrTempChannels, __m128 *pDst)
{
_mm_storeu_ps(dstPtrTempChannels[0], pDst[0]);
@@ -4292,4 +4438,58 @@ inline void rpp_store12_float_pkd_pln(Rpp16f **dstPtrTempChannels, __m128 *pDst)
_mm_storeu_si128((__m128i *)(dstPtrTempChannels[2]), pxDst[2]);
}

inline void rpp_store12_float_pkd_pln(Rpp8u **dstPtrTempChannels, __m128 *pDst)
{
__m128i px[4];
for(int i = 0; i < 3; i++)
{
px[0] = _mm_cvtps_epi32(pDst[i]); /* pixels 0-3 */
px[1] = _mm_cvtps_epi32(xmm_p0); /* pixels 4-7 */
px[2] = _mm_cvtps_epi32(xmm_p0); /* pixels 8-11 */
px[3] = _mm_cvtps_epi32(xmm_p0); /* pixels 12-15 */
px[0] = _mm_packus_epi32(px[0], px[1]); /* pixels 0-7 */
px[1] = _mm_packus_epi32(px[2], px[3]); /* pixels 8-15 */
px[0] = _mm_packus_epi16(px[0], px[1]); /* pixels 0-15 */
_mm_storeu_si32((__m128i *)dstPtrTempChannels[i], px[0]); /* store 4 pixels in channel i */
}
}

inline void rpp_store12_float_pkd_pln(Rpp8s **dstPtrTempChannels, __m128 *pDst)
{
__m128i px[4];
for(int i = 0; i < 3; i++)
{
px[0] = _mm_cvtps_epi32(pDst[i]); /* pixels 0-3 */
px[1] = _mm_cvtps_epi32(xmm_p0); /* pixels 4-7 */
px[2] = _mm_cvtps_epi32(xmm_p0); /* pixels 8-11 */
px[3] = _mm_cvtps_epi32(xmm_p0); /* pixels 12-15 */
px[0] = _mm_packus_epi32(px[0], px[1]); /* pixels 0-7 */
px[1] = _mm_packus_epi32(px[2], px[3]); /* pixels 8-15 */
px[0] = _mm_packus_epi16(px[0], px[1]); /* pixels 0-15 */
px[0] = _mm_sub_epi8(px[0], xmm_pxConvertI8); /* convert back to i8 for px0 store */
_mm_storeu_si32((__m128i *)dstPtrTempChannels[i], px[0]); /* store 4 pixels in channel i */
}
}

inline void rpp_store8_f32_to_u8_avx(Rpp8u *dstPtrTemp, __m256 pDst)
{
__m256i px1 = _mm256_cvtps_epi32(pDst);
// Pack int32 values to uint16
__m128i px2 = _mm_packus_epi32(_mm256_castsi256_si128(px1), _mm256_extracti128_si256(px1, 1));
// Pack uint16 values to uint8
__m128i px3 = _mm_packus_epi16(px2, _mm_setzero_si128());
// Store the result to dst
_mm_storeu_si64((__m128i*)dstPtrTemp, px3);
}

inline void rpp_store8_f32_to_i8_avx(Rpp8s *dstPtrTemp, __m256 pDst)
{
__m256i px1 = _mm256_cvtps_epi32(pDst);
__m128i px2 = _mm_packus_epi32(_mm256_castsi256_si128(px1), _mm256_extracti128_si256(px1, 1));
__m128i px3 = _mm_packus_epi16(px2, _mm_setzero_si128());
px3 = _mm_sub_epi8(px3, xmm_pxConvertI8); /* convert back to i8 before store */
// Store the result to dst
_mm_storeu_si64((__m128i*)dstPtrTemp, px3);
}

#endif //AMD_RPP_RPP_CPU_SIMD_HPP
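
Side note for reviewers on the i8 paths above: the signed loads and stores reuse the unsigned shuffle/pack routines by adding a 128 bias on load and subtracting it on store. A standalone sketch of that round trip follows (not part of this PR); it assumes xmm_pxConvertI8 is equivalent to _mm_set1_epi8((char)128), and it uses _mm256_cvtepu8_epi32 in place of the mask shuffles purely to keep the example short. Requires AVX2.

#include <immintrin.h>
#include <cstdio>

int main()
{
    signed char src[8] = { -128, -1, 0, 1, 64, 100, 127, -50 };
    __m128i convert = _mm_set1_epi8((char)128);

    // i8 -> u8 bias, widen to 8 x i32, convert to 8 x f32 (mirrors rpp_load8_i8_to_f32_avx)
    __m128i px = _mm_add_epi8(convert, _mm_loadl_epi64((__m128i *)src));
    __m256 pf = _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(px));

    // f32 -> i32 -> u16 -> u8 pack, then remove the bias (mirrors rpp_store8_f32_to_i8_avx)
    __m256i pi = _mm256_cvtps_epi32(pf);
    __m128i pu16 = _mm_packus_epi32(_mm256_castsi256_si128(pi), _mm256_extracti128_si256(pi, 1));
    __m128i pu8 = _mm_packus_epi16(pu16, _mm_setzero_si128());
    __m128i pi8 = _mm_sub_epi8(pu8, convert);

    signed char dst[16];
    _mm_storeu_si128((__m128i *)dst, pi8);
    for (int i = 0; i < 8; i++)
        printf("%d -> %d\n", src[i], dst[i]);   // each value should round-trip unchanged
    return 0;
}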
1 change: 1 addition & 0 deletions src/modules/cpu/host_tensor_filter_augmentations.hpp
@@ -25,6 +25,7 @@ SOFTWARE.
#ifndef HOST_TENSOR_FILTER_AUGMENTATIONS_HPP
#define HOST_TENSOR_FILTER_AUGMENTATIONS_HPP

#include "kernel/gaussian_filter.hpp"
#include "kernel/box_filter.hpp"

#endif // HOST_TENSOR_FILTER_AUGMENTATIONS_HPP