Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

RPP Threshold on HOST and HIP #456

Open
wants to merge 30 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
8921e17
added initial support for U8 PLN1-PLN1 variant
sampath1117 Aug 12, 2024
04616a0
added support for U8 PKD3, PLN3 variants
sampath1117 Aug 13, 2024
007cb7c
modified algorithm to give RGB output for RGB images
sampath1117 Aug 13, 2024
081b8fd
moved common code outside the layout branch conditions
sampath1117 Aug 13, 2024
57b5325
added support for toggle variation of U8
sampath1117 Aug 19, 2024
8507392
added golden output for threshold
sampath1117 Aug 19, 2024
8c66fa3
added threshold output for doxygen
sampath1117 Aug 19, 2024
e9f9347
added support for F32 bit depth
sampath1117 Aug 19, 2024
f61cb53
added support for I8 bitdepth
sampath1117 Aug 20, 2024
0a904ac
added F16 bitdepth support
sampath1117 Aug 20, 2024
8faa129
added HIP support for U8 bitdepth
sampath1117 Aug 23, 2024
d361da5
made changes to support remaining bitdepths
sampath1117 Aug 26, 2024
d28e55d
fixed output issues with I8 variant
sampath1117 Aug 26, 2024
2acef01
removed commented code in HOST
sampath1117 Aug 26, 2024
d739456
added threshold test case in maps used in common.py
sampath1117 Aug 26, 2024
dd5c0ce
Merge branch 'develop' into sr/opt_threshold
sampath1117 Aug 26, 2024
588d639
modified RPP_VERSION_MINOR value and changelog
sampath1117 Aug 26, 2024
5941efa
fixed issues with doxygen
sampath1117 Aug 26, 2024
67529fd
Merge branch 'develop' into sr/opt_threshold
sampath1117 Sep 25, 2024
658acaa
made changes in I8 variants as per review comments
sampath1117 Sep 25, 2024
367b117
added more details for threshold documentation
sampath1117 Sep 25, 2024
7564f90
Merge pull request #322 from sampath1117/sr/opt_threshold
r-abishek Sep 25, 2024
f89204b
Merge remote-tracking branch 'develop' into ar/opt_threshold
HazarathKumarM Oct 21, 2024
c73b9e1
Merge pull request #357 from HazarathKumarM/hk/threshold
r-abishek Oct 22, 2024
822f715
Merge branch 'develop' of https://github.com/ROCm/rpp into ar/opt_thr…
r-abishek Oct 22, 2024
17fa810
Merge branch 'develop' into ar/opt_threshold
r-abishek Oct 29, 2024
74a503e
Merge branch 'develop' into ar/opt_threshold
r-abishek Nov 2, 2024
0c67e82
Merge branch 'develop' into ar/opt_threshold
kiritigowda Nov 5, 2024
772a2cb
Merge branch 'develop' into ar/opt_threshold
r-abishek Nov 7, 2024
72e91ba
Merge branch 'develop' into ar/opt_threshold
kiritigowda Nov 8, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 8 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,14 @@

Full documentation for RPP is available at [https://rocm.docs.amd.com/projects/rpp/en/latest](https://rocm.docs.amd.com/projects/rpp/en/latest)

## (Unreleased) RPP 1.9.7

### Changes

* RPP Threshold on HOST and HIP

## (Unreleased) RPP 1.9.4

### Changes

* RPP Tensor Box Filter support on HOST
Expand Down Expand Up @@ -457,4 +464,4 @@ RPP Audio Support HIP - Mel Filter Bank

### Known issues

* `CPU` backend is not enabled
* `CPU` backend is not enabled
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ endif()
set(CMAKE_CXX_STANDARD 17)

# RPP Version
set(VERSION "1.9.4")
set(VERSION "1.9.7")

# Set Project Version and Language
project(rpp VERSION ${VERSION} LANGUAGES CXX)
Expand Down
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
2 changes: 1 addition & 1 deletion include/rpp_version.h
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ extern "C" {
// NOTE: IMPORTANT: Match the version with CMakelists.txt version
#define RPP_VERSION_MAJOR 1
#define RPP_VERSION_MINOR 9
#define RPP_VERSION_PATCH 4
#define RPP_VERSION_PATCH 7
#ifdef __cplusplus
}
#endif
Expand Down
56 changes: 56 additions & 0 deletions include/rppt_tensor_statistical_operations.h
Original file line number Diff line number Diff line change
Expand Up @@ -267,6 +267,62 @@ RppStatus rppt_tensor_stddev_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPt
RppStatus rppt_tensor_stddev_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t tensorStddevArr, Rpp32u tensorStddevArrLength, Rpp32f *meanTensor, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle);
#endif // GPU_SUPPORT

/*! \brief Threshold augmentation on HOST backend for a NCHW/NHWC layout tensor
* \details The Threshold augmentation outputs a black/white binary mask image, based on whether or not each pixel is within the user-specified pixel-range bounds, for a batch of RGB(3 channel) / greyscale(1 channel) images with an NHWC/NCHW tensor layout.<br>
* - srcPtr depth ranges - Rpp8u (0 to 255), Rpp16f (0 to 1), Rpp32f (0 to 1), Rpp8s (-128 to 127).
* - dstPtr depth ranges - Will be same depth as srcPtr.<br>
* Note: Returns a black image for below 2 cases:
* 1. If the minimum cutoff value greater than the maximum cutoff value for the given input in a batch.<br>
* 2. Values provided for minimum cutoff value, maximum cutoff value are beyond the below specified min and max values.<br>
Rpp8u (0 to 255), Rpp16f (0 to 1), Rpp32f (0 to 1), Rpp8s (-128 to 127).
* \image html img150x150.png Sample Input
* \image html statistical_operations_threshold_img150x150.png Sample Output
* \param [in] srcPtr source tensor in HOST memory
* \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3)
* \param [out] dstPtr destination tensor in HOST memory
* \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr)
* \param [in] minTensor minimum cutoff value (1D tensor in HOST memory, of size batchSize * channels)
* - minTensor ranges - Rpp8u (0 to 255), Rpp16f (0 to 1), Rpp32f (0 to 1), Rpp8s (-128 to 127).
* \param [in] maxTensor maximum cutoff value (1D tensor in HOST memory, of size batchSize * channels)
* - maxTensor ranges - Rpp8u (0 to 255), Rpp16f (0 to 1), Rpp32f (0 to 1), Rpp8s (-128 to 127).
* \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HOST handle created with <tt>\ref rppCreateWithBatchSize()</tt>
* \return A <tt> \ref RppStatus</tt> enumeration.
* \retval RPP_SUCCESS Successful completion.
* \retval RPP_ERROR* Unsuccessful completion.
*/
RppStatus rppt_threshold_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, Rpp32f *minTensor, Rpp32f *maxTensor, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle);

#ifdef GPU_SUPPORT
/*! \brief Threshold augmentation on HIP backend for a NCHW/NHWC layout tensor
* \details The Threshold augmentation outputs a black/white binary mask image, based on whether or not each pixel is within the user-specified pixel-range bounds, for a batch of RGB(3 channel) / greyscale(1 channel) images with an NHWC/NCHW tensor layout.<br>
* - srcPtr depth ranges - Rpp8u (0 to 255), Rpp16f (0 to 1), Rpp32f (0 to 1), Rpp8s (-128 to 127).
* - dstPtr depth ranges - Will be same depth as srcPtr.<br>
* Note: Returns a black image for below 2 cases:
* 1. If the minimum cutoff value greater than the maximum cutoff value for the given input in a batch.<br>
* 2. Values provided for minimum cutoff value, maximum cutoff value are beyond the below specified min and max values.<br>
Rpp8u (0 to 255), Rpp16f (0 to 1), Rpp32f (0 to 1), Rpp8s (-128 to 127).
* \image html img150x150.png Sample Input
* \image html statistical_operations_threshold_img150x150.png Sample Output
* \param [in] srcPtr source tensor in HIP memory
* \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3)
* \param [out] dstPtr destination tensor in HIP memory
* \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr)
* \param [in] minTensor minimum cutoff value (1D tensor in pinned/HIP memory, of size batchSize * channels)
* - minTensor ranges - Rpp8u (0 to 255), Rpp16f (0 to 1), Rpp32f (0 to 1), Rpp8s (-128 to 127).
* \param [in] maxTensor maximum cutoff value (1D tensor in pinned/HIP memory, of size batchSize * channels)
* - maxTensor ranges - Rpp8u (0 to 255), Rpp16f (0 to 1), Rpp32f (0 to 1), Rpp8s (-128 to 127).
* \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorSrc[i].xywhROI.roiWidth <= 3840 and roiTensorSrc[i].xywhROI.roiHeight <= 2160)
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HIP handle created with <tt>\ref rppCreateWithStreamAndBatchSize()</tt>
* \return A <tt> \ref RppStatus</tt> enumeration.
* \retval RPP_SUCCESS Successful completion.
* \retval RPP_ERROR* Unsuccessful completion.
*/
RppStatus rppt_threshold_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, Rpp32f *minTensor, Rpp32f *maxTensor, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle);
#endif

/*! @}
*/

Expand Down
40 changes: 40 additions & 0 deletions src/include/cpu/rpp_cpu_common.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -6599,4 +6599,44 @@ inline RPP_HOST_DEVICE Rpp32s get_idx_reflect(Rpp32s loc, Rpp32s minLoc, Rpp32s
return loc;
}

inline void compute_threshold_8_host(__m256 *p, __m256 *pThresholdParams)
{
p[0] = _mm256_blendv_ps(avx_p0, avx_p1, _mm256_and_ps(_mm256_cmp_ps(p[0], pThresholdParams[0], _CMP_GE_OQ), _mm256_cmp_ps(p[0], pThresholdParams[1],_CMP_LE_OQ)));
}

inline void compute_threshold_16_host(__m256 *p, __m256 *pThresholdParams)
{
p[0] = _mm256_blendv_ps(avx_p0, avx_p255, _mm256_and_ps(_mm256_cmp_ps(p[0], pThresholdParams[0], _CMP_GE_OQ), _mm256_cmp_ps(p[0], pThresholdParams[1],_CMP_LE_OQ)));
p[1] = _mm256_blendv_ps(avx_p0, avx_p255, _mm256_and_ps(_mm256_cmp_ps(p[1], pThresholdParams[0], _CMP_GE_OQ), _mm256_cmp_ps(p[1], pThresholdParams[1],_CMP_LE_OQ)));
}

inline void compute_threshold_24_host(__m256 *p, __m256 *pThresholdParams)
{
__m256 pChannelCheck[3];
pChannelCheck[0] = _mm256_and_ps(_mm256_cmp_ps(p[0], pThresholdParams[0], _CMP_GE_OQ), _mm256_cmp_ps(p[0], pThresholdParams[1],_CMP_LE_OQ));
pChannelCheck[1] = _mm256_and_ps(_mm256_cmp_ps(p[1], pThresholdParams[2], _CMP_GE_OQ), _mm256_cmp_ps(p[1], pThresholdParams[3],_CMP_LE_OQ));
pChannelCheck[2] = _mm256_and_ps(_mm256_cmp_ps(p[2], pThresholdParams[4], _CMP_GE_OQ), _mm256_cmp_ps(p[2], pThresholdParams[5],_CMP_LE_OQ));
p[0] = _mm256_blendv_ps(avx_p0, avx_p1, _mm256_and_ps(_mm256_and_ps(pChannelCheck[0], pChannelCheck[1]), pChannelCheck[2]));
p[1] = p[0];
p[2] = p[0];
}

inline void compute_threshold_48_host(__m256 *p, __m256 *pThresholdParams)
{
__m256 pChannelCheck[3];
pChannelCheck[0] = _mm256_and_ps(_mm256_cmp_ps(p[0], pThresholdParams[0], _CMP_GE_OQ), _mm256_cmp_ps(p[0], pThresholdParams[1],_CMP_LE_OQ));
pChannelCheck[1] = _mm256_and_ps(_mm256_cmp_ps(p[2], pThresholdParams[2], _CMP_GE_OQ), _mm256_cmp_ps(p[2], pThresholdParams[3],_CMP_LE_OQ));
pChannelCheck[2] = _mm256_and_ps(_mm256_cmp_ps(p[4], pThresholdParams[4], _CMP_GE_OQ), _mm256_cmp_ps(p[4], pThresholdParams[5],_CMP_LE_OQ));
p[0] = _mm256_blendv_ps(avx_p0, avx_p255, _mm256_and_ps(_mm256_and_ps(pChannelCheck[0], pChannelCheck[1]), pChannelCheck[2]));
p[2] = p[0];
p[4] = p[0];

pChannelCheck[0] = _mm256_and_ps(_mm256_cmp_ps(p[1], pThresholdParams[0], _CMP_GE_OQ), _mm256_cmp_ps(p[1], pThresholdParams[1],_CMP_LE_OQ));
pChannelCheck[1] = _mm256_and_ps(_mm256_cmp_ps(p[3], pThresholdParams[2], _CMP_GE_OQ), _mm256_cmp_ps(p[3], pThresholdParams[3],_CMP_LE_OQ));
pChannelCheck[2] = _mm256_and_ps(_mm256_cmp_ps(p[5], pThresholdParams[4], _CMP_GE_OQ), _mm256_cmp_ps(p[5], pThresholdParams[5],_CMP_LE_OQ));
p[1] = _mm256_blendv_ps(avx_p0, avx_p255, _mm256_and_ps(_mm256_and_ps(pChannelCheck[0], pChannelCheck[1]), pChannelCheck[2]));
p[3] = p[1];
p[5] = p[1];
}

#endif //RPP_CPU_COMMON_H
30 changes: 30 additions & 0 deletions src/include/cpu/rpp_cpu_simd.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1538,6 +1538,24 @@ inline void rpp_store24_f32pln3_to_f32pln3_avx(Rpp32f *dstPtrR, Rpp32f *dstPtrG,
_mm256_storeu_ps(dstPtrB, p[2]);
}

inline void rpp_load24_f16pkd3_to_f32pln3_avx(Rpp16f *srcPtr, __m256 *p)
{
__m128 p128[8];
p128[0] = _mm_cvtph_ps(_mm_castps_si128(_mm_loadu_ps(reinterpret_cast<Rpp32f *>(srcPtr))));
p128[1] = _mm_cvtph_ps(_mm_castps_si128(_mm_loadu_ps(reinterpret_cast<Rpp32f *>(srcPtr + 3))));
p128[2] = _mm_cvtph_ps(_mm_castps_si128(_mm_loadu_ps(reinterpret_cast<Rpp32f *>(srcPtr + 6))));
p128[3] = _mm_cvtph_ps(_mm_castps_si128(_mm_loadu_ps(reinterpret_cast<Rpp32f *>(srcPtr + 9))));
_MM_TRANSPOSE4_PS(p128[0], p128[1], p128[2], p128[3]);
p128[4] = _mm_cvtph_ps(_mm_castps_si128(_mm_loadu_ps(reinterpret_cast<Rpp32f *>(srcPtr + 12))));
p128[5] = _mm_cvtph_ps(_mm_castps_si128(_mm_loadu_ps(reinterpret_cast<Rpp32f *>(srcPtr + 15))));
p128[6] = _mm_cvtph_ps(_mm_castps_si128(_mm_loadu_ps(reinterpret_cast<Rpp32f *>(srcPtr + 18))));
p128[7] = _mm_cvtph_ps(_mm_castps_si128(_mm_loadu_ps(reinterpret_cast<Rpp32f *>(srcPtr + 21))));
_MM_TRANSPOSE4_PS(p128[4], p128[5], p128[6], p128[7]);
p[0] = _mm256_setr_m128(p128[0], p128[4]);
p[1] = _mm256_setr_m128(p128[1], p128[5]);
p[2] = _mm256_setr_m128(p128[2], p128[6]);
}

inline void rpp_load24_f32pkd3_to_f64pln3_avx(Rpp32f *srcPtr, __m256d *p)
{
__m128 p128[8];
Expand Down Expand Up @@ -1611,6 +1629,13 @@ inline void rpp_store24_f32pln3_to_f32pkd3_avx(Rpp32f *dstPtr, __m256 *p)
_mm_storeu_ps(dstPtr + 21, p128[3]);
}

inline void rpp_load24_f16pln3_to_f32pln3_avx(Rpp16f *srcPtrR, Rpp16f *srcPtrG, Rpp16f *srcPtrB, __m256 *p)
{
p[0] = _mm256_cvtph_ps(_mm_castps_si128(_mm_loadu_ps(reinterpret_cast<Rpp32f *>(srcPtrR))));
p[1] = _mm256_cvtph_ps(_mm_castps_si128(_mm_loadu_ps(reinterpret_cast<Rpp32f *>(srcPtrG))));
p[2] = _mm256_cvtph_ps(_mm_castps_si128(_mm_loadu_ps(reinterpret_cast<Rpp32f *>(srcPtrB))));
}

inline void rpp_load24_f32pln3_to_f64pln3_avx(Rpp32f *srcPtrR, Rpp32f *srcPtrG, Rpp32f *srcPtrB, __m256d *p)
{
__m128 px128[6];
Expand Down Expand Up @@ -1671,6 +1696,11 @@ inline void rpp_store8_f32_to_f32_avx(Rpp32f *dstPtr, __m256 *p)
_mm256_storeu_ps(dstPtr, p[0]);
}

inline void rpp_load8_f16_to_f32_avx(Rpp16f *srcPtr, __m256 *p)
{
p[0] = _mm256_cvtph_ps(_mm_castps_si128(_mm_loadu_ps(reinterpret_cast<Rpp32f *>(srcPtr))));
}

inline void rpp_load8_f32_to_f64_avx(Rpp32f *srcPtr, __m256d *p)
{
__m128 px128[2];
Expand Down
1 change: 1 addition & 0 deletions src/modules/cpu/host_tensor_statistical_operations.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,5 +31,6 @@ SOFTWARE.
#include "kernel/tensor_mean.hpp"
#include "kernel/tensor_stddev.hpp"
#include "kernel/normalize.hpp"
#include "kernel/threshold.hpp"

#endif // HOST_TENSOR_STATISTICAL_OPERATIONS_HPP
Loading