diff --git a/CHANGELOG.md b/CHANGELOG.md index f1c66e8d2..00fa320ee 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,7 @@ Full documentation for RPP is available at [https://rocm.docs.amd.com/projects/r * RPP Tensor Gaussian Filter support on HOST * RPP Fog augmentation on HOST and HIP +* RPP Rain augmentation on HOST and HIP * RPP Warp Perspective on HOST and HIP * RPP Threshold on HOST and HIP diff --git a/docs/data/doxygenOutputs/effects_augmentations_rain_img640x480.png b/docs/data/doxygenOutputs/effects_augmentations_rain_img640x480.png new file mode 100644 index 000000000..1ac729ff9 Binary files /dev/null and b/docs/data/doxygenOutputs/effects_augmentations_rain_img640x480.png differ diff --git a/include/rppt_tensor_effects_augmentations.h b/include/rppt_tensor_effects_augmentations.h index 4fb4c0fe8..bfab33a71 100644 --- a/include/rppt_tensor_effects_augmentations.h +++ b/include/rppt_tensor_effects_augmentations.h @@ -645,6 +645,60 @@ RppStatus rppt_glitch_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dst RppStatus rppt_glitch_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, RpptChannelOffsets *rgbOffsets, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle); #endif // GPU_SUPPORT +/*! \brief Rain augmentation on HOST backend for a NCHW/NHWC layout tensor + * \details The rain augmentation simulates a rain effect for a batch of RGB (3-channel) / greyscale (1-channel) images with an NHWC/NCHW tensor layout.
+ * NOTE: This augmentation produces a more realistic rain output when all images in a batch are of the same or similar sizes.
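+ * A minimal usage sketch follows (illustrative only; it assumes srcPtr, dstPtr, both tensor descriptors, roiTensorPtrSrc, and the HOST handle are already set up as for any other RPP HOST API, and the parameter values shown are arbitrary examples):
+ * \code
+ * Rpp32f alpha[8];                 // one blend factor per image in a batch of 8
+ * std::fill_n(alpha, 8, 0.4f);     // 0 <= alpha <= 1
+ * // rainPercentage = 50, rainWidth = 1 pixel, rainHeight = 12 pixels, slantAngle = 10 degrees
+ * rppt_rain_host(srcPtr, srcDescPtr, dstPtr, dstDescPtr, 50.0f, 1, 12, 10.0f, alpha, roiTensorPtrSrc, RpptRoiType::XYWH, handle);
+ * \endcode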
+ * - srcPtr depth ranges - Rpp8u (0 to 255), Rpp16f (0 to 1), Rpp32f (0 to 1), Rpp8s (-128 to 127). + * - dstPtr depth ranges - Will be the same depth as srcPtr. + * \image html img640x480.png Sample Input + * \image html effects_augmentations_rain_img640x480.png Sample Output + * \param [in] srcPtr source tensor in HOST memory + * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) + * \param [out] dstPtr destination tensor in HOST memory + * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) + * \param [in] rainPercentage The percentage of the rain effect to be applied (0 <= rainPercentage <= 100) + * \param [in] rainWidth Width of the rain drops in pixels. To be tuned by user depending on size of the image. + * \param [in] rainHeight Height of the rain drops in pixels. To be tuned by user depending on size of the image. + * \param [in] slantAngle Slant angle of the rain drops (positive value for right slant, negative for left slant). A single Rpp32s/f representing the slant of raindrops in degrees. + * Values range from [-90, 90], where -90 represents extreme left slant, 0 is vertical, and 90 is extreme right slant. + * \param [in] alpha An array of alpha blending values to be used for blending the rainLayer and the input image for each image in the batch (0 ≤ alpha ≤ 1 for each image in the batch). + * \param [in] roiTensorPtrSrc ROI data for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) + * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() + * \return A \ref RppStatus enumeration. + * \retval RPP_SUCCESS Successful completion. + * \retval RPP_ERROR* Unsuccessful completion. + */ +RppStatus rppt_rain_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, Rpp32f rainPercentage, Rpp32u rainWidth, Rpp32u rainHeight, Rpp32f slantAngle, Rpp32f *alpha, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle); + +#ifdef GPU_SUPPORT +/*! \brief Rain augmentation on HIP backend for a NCHW/NHWC layout tensor + * \details The rain augmentation simulates a rain effect for a batch of RGB (3-channel) / greyscale (1-channel) images with an NHWC/NCHW tensor layout.
+ * NOTE: This augmentation produces a more realistic rain output when all images in a batch are of the same or similar sizes.
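+ * A minimal usage sketch follows (illustrative only; it assumes the device-side srcPtr/dstPtr, both tensor descriptors, roiTensorPtrSrc, and the HIP handle are already set up, and the parameter values shown are arbitrary examples; note that alpha must reside in pinned or HIP memory):
+ * \code
+ * Rpp32f *alpha;
+ * hipHostMalloc(reinterpret_cast<void **>(&alpha), 8 * sizeof(Rpp32f), hipHostMallocDefault);   // pinned memory, one blend factor per image in a batch of 8
+ * std::fill_n(alpha, 8, 0.4f);     // 0 <= alpha <= 1
+ * rppt_rain_gpu(srcPtr, srcDescPtr, dstPtr, dstDescPtr, 50.0f, 1, 12, 10.0f, alpha, roiTensorPtrSrc, RpptRoiType::XYWH, handle);
+ * \endcode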
+ * - srcPtr depth ranges - Rpp8u (0 to 255), Rpp16f (0 to 1), Rpp32f (0 to 1), Rpp8s (-128 to 127). + * - dstPtr depth ranges - Will be the same depth as srcPtr. + * \image html img640x480.png Sample Input + * \image html effects_augmentations_rain_img640x480.png Sample Output + * \param [in] srcPtr source tensor in HIP memory + * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) + * \param [out] dstPtr destination tensor in HIP memory + * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) + * \param [in] rainPercentage The percentage of the rain effect to be applied (0 <= rainPercentage <= 100) + * \param [in] rainWidth Width of the rain drops in pixels. To be tuned by user depending on size of the image. + * \param [in] rainHeight Height of the rain drops in pixels. To be tuned by user depending on size of the image. + * \param [in] slantAngle Slant angle of the rain drops (positive value for right slant, negative for left slant). A single Rpp32s/f representing the slant of raindrops in degrees. + * Values range from [-90, 90], where -90 represents extreme left slant, 0 is vertical, and 90 is extreme right slant. + * \param [in] alpha An array of alpha blending values in pinned / HIP memory is used for blending the rainLayer and the input image for each image in the batch (0 ≤ alpha ≤ 1 for each image in the batch). + * \param [in] roiTensorPtrSrc ROI data for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) + * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() + * \return A \ref RppStatus enumeration. + * \retval RPP_SUCCESS Successful completion. + * \retval RPP_ERROR* Unsuccessful completion. + */ +RppStatus rppt_rain_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, Rpp32f rainPercentage, Rpp32u rainWidth, Rpp32u rainHeight, Rpp32f slantAngle, Rpp32f *alpha, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle); +#endif // GPU_SUPPORT + /*! \brief Pixelate augmentation on HOST backend for a NCHW/NHWC layout tensor * \details The pixelate augmentation performs a pixelate transformation for a batch of RGB(3 channel) / greyscale(1 channel) images with an NHWC/NCHW tensor layout.
* - srcPtr depth ranges - Rpp8u (0 to 255), Rpp16f (0 to 1), Rpp32f (0 to 1), Rpp8s (-128 to 127). diff --git a/src/include/cpu/rpp_cpu_common.hpp b/src/include/cpu/rpp_cpu_common.hpp index aaca7090a..abb531001 100644 --- a/src/include/cpu/rpp_cpu_common.hpp +++ b/src/include/cpu/rpp_cpu_common.hpp @@ -6630,6 +6630,31 @@ inline void compute_transpose4x8_avx(__m256 *pSrc, __m128 *pDst) pDst[7] = _mm256_extractf128_ps(pSrc[3], 1); /* extract [P08|P16|P24|P32] */ } +inline void compute_rain_48_host(__m256 *p1, __m256 *p2, __m256 &pMul) +{ + p1[0] = _mm256_fmadd_ps(_mm256_sub_ps(p2[0], p1[0]), pMul, p1[0]); // alpha-blending adjustment + p1[1] = _mm256_fmadd_ps(_mm256_sub_ps(p2[1], p1[1]), pMul, p1[1]); // alpha-blending adjustment + p1[2] = _mm256_fmadd_ps(_mm256_sub_ps(p2[0], p1[2]), pMul, p1[2]); // alpha-blending adjustment + p1[3] = _mm256_fmadd_ps(_mm256_sub_ps(p2[1], p1[3]), pMul, p1[3]); // alpha-blending adjustment + p1[4] = _mm256_fmadd_ps(_mm256_sub_ps(p2[0], p1[4]), pMul, p1[4]); // alpha-blending adjustment + p1[5] = _mm256_fmadd_ps(_mm256_sub_ps(p2[1], p1[5]), pMul, p1[5]); // alpha-blending adjustment +} + +inline void compute_rain_32_host(__m256 *p1, __m256 *p2, __m256 &pMul) +{ + p1[0] = _mm256_fmadd_ps(_mm256_sub_ps(p2[0], p1[0]), pMul, p1[0]); // alpha-blending adjustment + p1[1] = _mm256_fmadd_ps(_mm256_sub_ps(p2[1], p1[1]), pMul, p1[1]); // alpha-blending adjustment + p1[2] = _mm256_fmadd_ps(_mm256_sub_ps(p2[2], p1[2]), pMul, p1[2]); // alpha-blending adjustment + p1[3] = _mm256_fmadd_ps(_mm256_sub_ps(p2[3], p1[3]), pMul, p1[3]); // alpha-blending adjustment +} + +inline void compute_rain_24_host(__m256 *p1, __m256 p2, __m256 &pMul) +{ + p1[0] = _mm256_fmadd_ps(_mm256_sub_ps(p2, p1[0]), pMul, p1[0]); // alpha-blending adjustment + p1[1] = _mm256_fmadd_ps(_mm256_sub_ps(p2, p1[1]), pMul, p1[1]); // alpha-blending adjustment + p1[2] = _mm256_fmadd_ps(_mm256_sub_ps(p2, p1[2]), pMul, p1[2]); // alpha-blending adjustment +} + // Compute hanning window inline RPP_HOST_DEVICE void hann_window(Rpp32f *output, Rpp32s windowSize) { diff --git a/src/modules/cpu/host_tensor_effects_augmentations.hpp b/src/modules/cpu/host_tensor_effects_augmentations.hpp index 968a6cd11..636510e17 100644 --- a/src/modules/cpu/host_tensor_effects_augmentations.hpp +++ b/src/modules/cpu/host_tensor_effects_augmentations.hpp @@ -39,5 +39,6 @@ SOFTWARE. #include "kernel/resize.hpp" //pixelate dependency #include "kernel/erase.hpp" #include "kernel/fog.hpp" +#include "kernel/rain.hpp" #endif // HOST_TENSOR_EFFECTS_AUGMENTATIONS_HPP diff --git a/src/modules/cpu/kernel/rain.hpp b/src/modules/cpu/kernel/rain.hpp new file mode 100644 index 000000000..45b1a4f42 --- /dev/null +++ b/src/modules/cpu/kernel/rain.hpp @@ -0,0 +1,1204 @@ +/* +MIT License + +Copyright (c) 2019 - 2024 Advanced Micro Devices, Inc. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. 
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+*/
+
+#include "rppdefs.h"
+#include "rpp_cpu_simd.hpp"
+#include "rpp_cpu_common.hpp"
+
+// Constants to represent the rain intensity for different data types
+#define RAIN_INTENSITY_8U 200                      // Intensity value for Rpp8u
+#define RAIN_INTENSITY_8S 72                       // Intensity value for Rpp8s
+#define RAIN_INTENSITY_FLOAT (200 * ONE_OVER_255)  // Intensity value for Rpp32f and Rpp16f
+
+template <typename T>
+inline void create_rain_layer(T *rainLayer, Rpp32f rainPercentage, RpptDescPtr srcDescPtr, Rpp32f slantAngle, Rpp32u dropLength, Rpp32u rainWidth)
+{
+    Rpp32f rainPercent = rainPercentage * 0.004f;    // Scaling factor to convert percentage to a range suitable for rain effect intensity
+    Rpp32u numDrops = static_cast<Rpp32u>(rainPercent * srcDescPtr->h * srcDescPtr->w);
+    Rpp32f slant = sin(slantAngle) * dropLength;
+
+    // Seed the random number generator and set up the uniform distributions
+    std::mt19937 rng(std::random_device{}());
+    std::uniform_int_distribution<Rpp32u> distX(0, srcDescPtr->w - slant - 1);
+    std::uniform_int_distribution<Rpp32u> distY(0, srcDescPtr->h - dropLength - 1);
+
+    // Choose the rain intensity value based on the data type
+    T rainValue = std::is_same<T, Rpp8u>::value ? static_cast<T>(RAIN_INTENSITY_8U) :
+                  std::is_same<T, Rpp8s>::value ? static_cast<T>(RAIN_INTENSITY_8S) :
+                                                  static_cast<T>(RAIN_INTENSITY_FLOAT);
+    Rpp32f slantPerDropLength = static_cast<Rpp32f>(slant) / dropLength;
+    for (Rpp32u i = 0; i < numDrops; i++)
+    {
+        Rpp32u xStart = distX(rng);
+        Rpp32u yStart = distY(rng);
+        for (Rpp32u j = 0; j < dropLength; j++)
+        {
+            Rpp32u x = xStart + j * slantPerDropLength;
+            Rpp32u y = yStart + j;
+
+            if ((x >= 0) && (x < srcDescPtr->w) && (y < srcDescPtr->h))
+            {
+                T *rainLayerTemp = rainLayer + y * srcDescPtr->w + x;
+                for (Rpp32u k = 0; k < rainWidth; k++)
+                    rainLayerTemp[k] = rainValue;
+            }
+        }
+    }
+}
+
+RppStatus rain_u8_u8_host_tensor(Rpp8u *srcPtr,
+                                 RpptDescPtr srcDescPtr,
+                                 Rpp8u *dstPtr,
+                                 RpptDescPtr dstDescPtr,
+                                 Rpp32f rainPercentage,
+                                 Rpp32u rainWidth,
+                                 Rpp32u rainHeight,
+                                 Rpp32f slantAngle,
+                                 Rpp32f *alphaValues,
+                                 RpptROIPtr roiTensorPtrSrc,
+                                 RpptRoiType roiType,
+                                 RppLayoutParams layoutParams,
+                                 rpp::Handle& handle)
+{
+    RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h};
+    Rpp32u numThreads = handle.GetNumThreads();
+
+    Rpp8u *rainLayer = reinterpret_cast<Rpp8u *>(handle.GetInitHandle()->mem.mcpu.scratchBufferHost);
+    std::memset(rainLayer, 0, srcDescPtr->w * srcDescPtr->h * sizeof(Rpp8u));
+    create_rain_layer(rainLayer, rainPercentage, srcDescPtr, slantAngle, rainHeight, rainWidth);
+
+    omp_set_dynamic(0);
+#pragma omp parallel for num_threads(numThreads)
+    for (Rpp32u batchCount = 0; batchCount < dstDescPtr->n; batchCount++)
+    {
+        RpptROI roi;
+        RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount];
+        compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType);
+
+        Rpp8u *srcPtrImage = srcPtr + batchCount * srcDescPtr->strides.nStride;
+        Rpp8u *dstPtrImage = dstPtr + batchCount * dstDescPtr->strides.nStride;
+
+        Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier;
+        Rpp8u *srcPtrChannel, *dstPtrChannel;
+        
srcPtrChannel = srcPtrImage + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier); + dstPtrChannel = dstPtrImage; + + Rpp32f alpha = alphaValues[batchCount]; +#if __AVX2__ + __m256 pMul = _mm256_set1_ps(alpha); +#endif + Rpp32u vectorIncrement = 48; + Rpp32u vectorIncrementPerChannel = 16; + Rpp32u alignedLength = (bufferLength / vectorIncrement) * vectorIncrement; + + // Rain with fused output-layout toggle (NHWC -> NCHW) + if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NCHW)) + { + Rpp8u *srcPtr1Row, *srcPtr2Row, *dstPtrRowR, *dstPtrRowG, *dstPtrRowB; + srcPtr1Row = srcPtrChannel; + srcPtr2Row = rainLayer; + dstPtrRowR = dstPtrChannel; + dstPtrRowG = dstPtrRowR + dstDescPtr->strides.cStride; + dstPtrRowB = dstPtrRowG + dstDescPtr->strides.cStride; + for(Rpp32u i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp8u *srcPtr1Temp, *srcPtr2Temp, *dstPtrTempR, *dstPtrTempG, *dstPtrTempB; + srcPtr1Temp = srcPtr1Row; + srcPtr2Temp = srcPtr2Row; + dstPtrTempR = dstPtrRowR; + dstPtrTempG = dstPtrRowG; + dstPtrTempB = dstPtrRowB; + Rpp32u vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + __m256 p1[6], p2[2]; + rpp_simd_load(rpp_load48_u8pkd3_to_f32pln3_avx, srcPtr1Temp, p1); // simd loads + rpp_simd_load(rpp_load16_u8_to_f32_avx, srcPtr2Temp, p2); // simd loads + compute_rain_48_host(p1, p2, pMul); + rpp_simd_store(rpp_store48_f32pln3_to_u8pln3_avx, dstPtrTempR, dstPtrTempG, dstPtrTempB, p1); // simd stores + srcPtr1Temp += vectorIncrement; + srcPtr2Temp += vectorIncrementPerChannel; + dstPtrTempR += vectorIncrementPerChannel; + dstPtrTempG += vectorIncrementPerChannel; + dstPtrTempB += vectorIncrementPerChannel; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount += 3) + { + *dstPtrTempR++ = static_cast(RPPPIXELCHECK(std::nearbyintf(static_cast((*srcPtr2Temp - srcPtr1Temp[0]) * alpha + srcPtr1Temp[0])))); + *dstPtrTempG++ = static_cast(RPPPIXELCHECK(std::nearbyintf(static_cast((*srcPtr2Temp - srcPtr1Temp[1]) * alpha + srcPtr1Temp[1])))); + *dstPtrTempB++ = static_cast(RPPPIXELCHECK(std::nearbyintf(static_cast((*srcPtr2Temp - srcPtr1Temp[2]) * alpha + srcPtr1Temp[2])))); + srcPtr1Temp += 3; + srcPtr2Temp++; + } + srcPtr1Row += srcDescPtr->strides.hStride; + srcPtr2Row += srcDescPtr->w; + dstPtrRowR += dstDescPtr->strides.hStride; + dstPtrRowG += dstDescPtr->strides.hStride; + dstPtrRowB += dstDescPtr->strides.hStride; + } + } + // Rain with fused output-layout toggle (NCHW -> NHWC) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NHWC)) + { + Rpp8u *srcPtr1RowR, *srcPtr1RowG, *srcPtr1RowB, *srcPtr2Row, *dstPtrRow; + srcPtr1RowR = srcPtrChannel; + srcPtr1RowG = srcPtr1RowR + srcDescPtr->strides.cStride; + srcPtr1RowB = srcPtr1RowG + srcDescPtr->strides.cStride; + srcPtr2Row = rainLayer; + dstPtrRow = dstPtrChannel; + for(Rpp32u i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp8u *srcPtr1TempR, *srcPtr1TempG, *srcPtr1TempB, *srcPtr2Temp, *dstPtrTemp; + srcPtr1TempR = srcPtr1RowR; + srcPtr1TempG = srcPtr1RowG; + srcPtr1TempB = srcPtr1RowB; + srcPtr2Temp = srcPtr2Row; + dstPtrTemp = dstPtrRow; + Rpp32u vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + __m256 p1[6], p2[2]; + rpp_simd_load(rpp_load48_u8pln3_to_f32pln3_avx, srcPtr1TempR, srcPtr1TempG, srcPtr1TempB, p1); // 
simd loads + rpp_simd_load(rpp_load16_u8_to_f32_avx, srcPtr2Temp, p2); // simd loads + compute_rain_48_host(p1, p2, pMul); + rpp_simd_store(rpp_store48_f32pln3_to_u8pkd3_avx, dstPtrTemp, p1); // simd stores + srcPtr1TempR += vectorIncrementPerChannel; + srcPtr1TempG += vectorIncrementPerChannel; + srcPtr1TempB += vectorIncrementPerChannel; + srcPtr2Temp += vectorIncrementPerChannel; + dstPtrTemp += vectorIncrement; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + dstPtrTemp[0] = static_cast(RPPPIXELCHECK(std::nearbyintf(static_cast((*srcPtr2Temp - *srcPtr1TempR) * alpha + *srcPtr1TempR)))); + dstPtrTemp[1] = static_cast(RPPPIXELCHECK(std::nearbyintf(static_cast((*srcPtr2Temp - *srcPtr1TempG) * alpha + *srcPtr1TempG)))); + dstPtrTemp[2] = static_cast(RPPPIXELCHECK(std::nearbyintf(static_cast((*srcPtr2Temp - *srcPtr1TempB) * alpha + *srcPtr1TempB)))); + dstPtrTemp += 3; + srcPtr2Temp++; + srcPtr1TempR++; + srcPtr1TempG++; + srcPtr1TempB++; + } + srcPtr1RowR += srcDescPtr->strides.hStride; + srcPtr1RowG += srcDescPtr->strides.hStride; + srcPtr1RowB += srcDescPtr->strides.hStride; + srcPtr2Row += srcDescPtr->strides.hStride; + dstPtrRow += dstDescPtr->strides.hStride; + } + } + // Rain without fused output-layout toggle (NHWC -> NHWC) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NHWC)) + { + Rpp8u *srcPtr1Row, *srcPtr2Row, *dstPtrRow; + srcPtr1Row = srcPtrChannel; + srcPtr2Row = rainLayer; + dstPtrRow = dstPtrChannel; + for(Rpp32u i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp8u *srcPtr1Temp, *srcPtr2Temp, *dstPtrTemp; + srcPtr1Temp = srcPtr1Row; + srcPtr2Temp = srcPtr2Row; + dstPtrTemp = dstPtrRow; + Rpp32u vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + __m256 p1[6], p2[2]; + rpp_simd_load(rpp_load48_u8pkd3_to_f32pln3_avx, srcPtr1Temp, p1); // simd loads + rpp_simd_load(rpp_load16_u8_to_f32_avx, srcPtr2Temp, p2); // simd loads + compute_rain_48_host(p1, p2, pMul); + rpp_simd_store(rpp_store48_f32pln3_to_u8pkd3_avx, dstPtrTemp, p1); // simd stores + srcPtr1Temp += vectorIncrement; + srcPtr2Temp += vectorIncrementPerChannel; + dstPtrTemp += vectorIncrement; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount += 3) + { + *dstPtrTemp++ = static_cast(RPPPIXELCHECK(std::nearbyintf(static_cast((*srcPtr2Temp - *srcPtr1Temp) * alpha + *srcPtr1Temp)))); + *dstPtrTemp++ = static_cast(RPPPIXELCHECK(std::nearbyintf(static_cast((*srcPtr2Temp - *(srcPtr1Temp + 1)) * alpha + *(srcPtr1Temp + 1))))); + *dstPtrTemp++ = static_cast(RPPPIXELCHECK(std::nearbyintf(static_cast((*srcPtr2Temp - *(srcPtr1Temp + 2)) * alpha + *(srcPtr1Temp + 2))))); + srcPtr2Temp++; + srcPtr1Temp += 3; + } + srcPtr1Row += srcDescPtr->strides.hStride; + srcPtr2Row += srcDescPtr->w; + dstPtrRow += dstDescPtr->strides.hStride; + } + } + // Rain without fused output-layout toggle (NCHW -> NCHW) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NCHW)) + { + Rpp8u *srcPtr1RowR, *srcPtr1RowG, *srcPtr1RowB, *srcPtr2Row, *dstPtrRowR, *dstPtrRowG, *dstPtrRowB; + srcPtr1RowR = srcPtrChannel; + srcPtr1RowG = srcPtr1RowR + srcDescPtr->strides.cStride; + srcPtr1RowB = srcPtr1RowG + srcDescPtr->strides.cStride; + srcPtr2Row = rainLayer; + dstPtrRowR = dstPtrChannel; + dstPtrRowG = dstPtrRowR + dstDescPtr->strides.cStride; + dstPtrRowB = dstPtrRowG + dstDescPtr->strides.cStride; + for(Rpp32u i = 0; i < 
roi.xywhROI.roiHeight; i++) + { + Rpp8u *srcPtr1TempR, *srcPtr1TempG, *srcPtr1TempB, *srcPtr2Temp, *dstPtrTempR, *dstPtrTempG, *dstPtrTempB; + srcPtr1TempR = srcPtr1RowR; + srcPtr1TempG = srcPtr1RowG; + srcPtr1TempB = srcPtr1RowB; + srcPtr2Temp = srcPtr2Row; + dstPtrTempR = dstPtrRowR; + dstPtrTempG = dstPtrRowG; + dstPtrTempB = dstPtrRowB; + Rpp32u vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + __m256 p1[6], p2[2]; + rpp_simd_load(rpp_load48_u8pln3_to_f32pln3_avx, srcPtr1TempR, srcPtr1TempG, srcPtr1TempB, p1); // simd loads + rpp_simd_load(rpp_load16_u8_to_f32_avx, srcPtr2Temp, p2); // simd loads + compute_rain_48_host(p1, p2, pMul); + rpp_simd_store(rpp_store48_f32pln3_to_u8pln3_avx, dstPtrTempR, dstPtrTempG, dstPtrTempB, p1); // simd stores + srcPtr1TempR += vectorIncrementPerChannel; + srcPtr1TempG += vectorIncrementPerChannel; + srcPtr1TempB += vectorIncrementPerChannel; + srcPtr2Temp += vectorIncrementPerChannel; + dstPtrTempR += vectorIncrementPerChannel; + dstPtrTempG += vectorIncrementPerChannel; + dstPtrTempB += vectorIncrementPerChannel; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + *dstPtrTempR++ = static_cast(RPPPIXELCHECK(std::nearbyintf(static_cast((*srcPtr2Temp - *srcPtr1TempR) * alpha + *srcPtr1TempR)))); + *dstPtrTempG++ = static_cast(RPPPIXELCHECK(std::nearbyintf(static_cast((*srcPtr2Temp - *srcPtr1TempG) * alpha + *srcPtr1TempG)))); + *dstPtrTempB++ = static_cast(RPPPIXELCHECK(std::nearbyintf(static_cast((*srcPtr2Temp - *srcPtr1TempB) * alpha + *srcPtr1TempB)))); + srcPtr2Temp++; + srcPtr1TempR++; + srcPtr1TempG++; + srcPtr1TempB++; + } + srcPtr1RowR += srcDescPtr->strides.hStride; + srcPtr1RowG += srcDescPtr->strides.hStride; + srcPtr1RowB += srcDescPtr->strides.hStride; + srcPtr2Row += srcDescPtr->strides.hStride; + dstPtrRowR += dstDescPtr->strides.hStride; + dstPtrRowG += dstDescPtr->strides.hStride; + dstPtrRowB += dstDescPtr->strides.hStride; } + } + // Rain single channel without fused output-layout toggle (NCHW -> NCHW) + else if ((srcDescPtr->c == 1) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NCHW)) + { + alignedLength = (bufferLength / vectorIncrementPerChannel) * vectorIncrementPerChannel; + Rpp8u *srcPtr1Row, *srcPtr2Row, *dstPtrRow; + srcPtr1Row = srcPtrChannel; + srcPtr2Row = rainLayer; + dstPtrRow = dstPtrChannel; + for(Rpp32u i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp8u *srcPtr1Temp, *srcPtr2Temp, *dstPtrTemp; + srcPtr1Temp = srcPtr1Row; + srcPtr2Temp = srcPtr2Row; + dstPtrTemp = dstPtrRow; + Rpp32u vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + __m256 p1[2], p2[2]; + rpp_simd_load(rpp_load16_u8_to_f32_avx, srcPtr1Temp, p1); // simd loads + rpp_simd_load(rpp_load16_u8_to_f32_avx, srcPtr2Temp, p2); // simd loads + compute_rain_32_host(p1, p2, pMul); + rpp_simd_store(rpp_store16_f32_to_u8_avx, dstPtrTemp, p1); // simd stores + srcPtr1Temp += vectorIncrementPerChannel; + srcPtr2Temp += vectorIncrementPerChannel; + dstPtrTemp += vectorIncrementPerChannel; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + *dstPtrTemp++ = static_cast(RPPPIXELCHECK(std::nearbyintf(static_cast((*srcPtr2Temp - *srcPtr1Temp) * alpha + *srcPtr1Temp)))); + srcPtr1Temp++; + srcPtr2Temp++; + } + srcPtr1Row += srcDescPtr->strides.hStride; + srcPtr2Row += srcDescPtr->strides.hStride; + dstPtrRow += 
dstDescPtr->strides.hStride; + } + } + } + return RPP_SUCCESS; +} + +RppStatus rain_f32_f32_host_tensor(Rpp32f *srcPtr, + RpptDescPtr srcDescPtr, + Rpp32f *dstPtr, + RpptDescPtr dstDescPtr, + Rpp32f rainPercentage, + Rpp32u rainWidth, + Rpp32u rainHeight, + Rpp32f slantAngle, + Rpp32f *alphaValues, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + RppLayoutParams layoutParams, + rpp::Handle& handle) + { + RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h}; + Rpp32u numThreads = handle.GetNumThreads(); + + Rpp32f *rainLayer = handle.GetInitHandle()->mem.mcpu.scratchBufferHost; + std::memset(rainLayer, 0, srcDescPtr->w * srcDescPtr->h * sizeof(Rpp32f)); + create_rain_layer(rainLayer, rainPercentage, srcDescPtr, slantAngle, rainHeight, rainWidth); + + omp_set_dynamic(0); +#pragma omp parallel for num_threads(numThreads) + for (Rpp32u batchCount = 0; batchCount < dstDescPtr->n; batchCount++) + { + RpptROI roi; + RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount]; + compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType); + + Rpp32f *srcPtrImage = srcPtr + batchCount * srcDescPtr->strides.nStride; + Rpp32f *dstPtrImage = dstPtr + batchCount * dstDescPtr->strides.nStride; + + Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier; + Rpp32f *srcPtrChannel, *dstPtrChannel; + srcPtrChannel = srcPtrImage + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier); + dstPtrChannel = dstPtrImage; + + Rpp32f alpha = alphaValues[batchCount]; +#if __AVX2__ + __m256 pMul = _mm256_set1_ps(alpha); +#endif + Rpp32u vectorIncrement = 24; + Rpp32u vectorIncrementPerChannel = 8; + Rpp32u alignedLength = (bufferLength / vectorIncrement) * vectorIncrement; + + // Rain with fused output-layout toggle (NHWC -> NCHW) + if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NCHW)) + { + Rpp32f *srcPtr1Row, *srcPtr2Row, *dstPtrRowR, *dstPtrRowG, *dstPtrRowB; + srcPtr1Row = srcPtrChannel; + srcPtr2Row = rainLayer; + dstPtrRowR = dstPtrChannel; + dstPtrRowG = dstPtrRowR + dstDescPtr->strides.cStride; + dstPtrRowB = dstPtrRowG + dstDescPtr->strides.cStride; + for(Rpp32u i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp32f *srcPtr1Temp, *srcPtr2Temp, *dstPtrTempR, *dstPtrTempG, *dstPtrTempB; + srcPtr1Temp = srcPtr1Row; + srcPtr2Temp = srcPtr2Row; + dstPtrTempR = dstPtrRowR; + dstPtrTempG = dstPtrRowG; + dstPtrTempB = dstPtrRowB; + Rpp32u vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + __m256 p1[3], p2; + rpp_simd_load(rpp_load24_f32pkd3_to_f32pln3_avx, srcPtr1Temp, p1); // simd loads + rpp_simd_load(rpp_load8_f32_to_f32_avx, srcPtr2Temp, &p2); // simd loads + compute_rain_24_host(p1, p2, pMul); + rpp_simd_store(rpp_store24_f32pln3_to_f32pln3_avx, dstPtrTempR, dstPtrTempG, dstPtrTempB, p1); // simd stores + srcPtr1Temp += vectorIncrement; + srcPtr2Temp += vectorIncrementPerChannel; + dstPtrTempR += vectorIncrementPerChannel; + dstPtrTempG += vectorIncrementPerChannel; + dstPtrTempB += vectorIncrementPerChannel; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount += 3) + { + *dstPtrTempR++ = RPPPIXELCHECKF32((*srcPtr2Temp - srcPtr1Temp[0]) * alpha + srcPtr1Temp[0]); + *dstPtrTempG++ = RPPPIXELCHECKF32((*srcPtr2Temp - srcPtr1Temp[1]) * alpha + srcPtr1Temp[1]); + *dstPtrTempB++ = RPPPIXELCHECKF32((*srcPtr2Temp - srcPtr1Temp[2]) * alpha + srcPtr1Temp[2]); + srcPtr1Temp += 3; + 
srcPtr2Temp++; + } + srcPtr1Row += srcDescPtr->strides.hStride; + srcPtr2Row += srcDescPtr->w; + dstPtrRowR += dstDescPtr->strides.hStride; + dstPtrRowG += dstDescPtr->strides.hStride; + dstPtrRowB += dstDescPtr->strides.hStride; + } + } + // Rain with fused output-layout toggle (NCHW -> NHWC) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NHWC)) + { + Rpp32f *srcPtr1RowR, *srcPtr1RowG, *srcPtr1RowB, *srcPtr2Row, *dstPtrRow; + srcPtr1RowR = srcPtrChannel; + srcPtr1RowG = srcPtr1RowR + srcDescPtr->strides.cStride; + srcPtr1RowB = srcPtr1RowG + srcDescPtr->strides.cStride; + srcPtr2Row = rainLayer; + dstPtrRow = dstPtrChannel; + for(Rpp32u i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp32f *srcPtr1TempR, *srcPtr1TempG, *srcPtr1TempB, *srcPtr2Temp, *dstPtrTemp; + srcPtr1TempR = srcPtr1RowR; + srcPtr1TempG = srcPtr1RowG; + srcPtr1TempB = srcPtr1RowB; + srcPtr2Temp = srcPtr2Row; + dstPtrTemp = dstPtrRow; + Rpp32u vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + __m256 p1[3], p2; + rpp_simd_load(rpp_load24_f32pln3_to_f32pln3_avx, srcPtr1TempR, srcPtr1TempG, srcPtr1TempB, p1); // simd loads + rpp_simd_load(rpp_load8_f32_to_f32_avx, srcPtr2Temp, &p2); // simd loads + compute_rain_24_host(p1, p2, pMul); + rpp_simd_store(rpp_store24_f32pln3_to_f32pkd3_avx, dstPtrTemp, p1); // simd stores + srcPtr1TempR += vectorIncrementPerChannel; + srcPtr1TempG += vectorIncrementPerChannel; + srcPtr1TempB += vectorIncrementPerChannel; + srcPtr2Temp += vectorIncrementPerChannel; + dstPtrTemp += vectorIncrement; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + dstPtrTemp[0] = RPPPIXELCHECKF32((*srcPtr2Temp - *srcPtr1TempR) * alpha + *srcPtr1TempR); + dstPtrTemp[1] = RPPPIXELCHECKF32((*srcPtr2Temp - *srcPtr1TempG) * alpha + *srcPtr1TempG); + dstPtrTemp[2] = RPPPIXELCHECKF32((*srcPtr2Temp - *srcPtr1TempB) * alpha + *srcPtr1TempB); + dstPtrTemp += 3; + srcPtr2Temp++; + srcPtr1TempR++; + srcPtr1TempG++; + srcPtr1TempB++; + } + srcPtr1RowR += srcDescPtr->strides.hStride; + srcPtr1RowG += srcDescPtr->strides.hStride; + srcPtr1RowB += srcDescPtr->strides.hStride; + srcPtr2Row += srcDescPtr->strides.hStride; + dstPtrRow += dstDescPtr->strides.hStride; + } + } + // Rain without fused output-layout toggle (NHWC -> NHWC) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NHWC)) + { + Rpp32f *srcPtr1Row, *srcPtr2Row, *dstPtrRow; + srcPtr1Row = srcPtrChannel; + srcPtr2Row = rainLayer; + dstPtrRow = dstPtrChannel; + for(Rpp32u i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp32f *srcPtr1Temp, *srcPtr2Temp, *dstPtrTemp; + srcPtr1Temp = srcPtr1Row; + srcPtr2Temp = srcPtr2Row; + dstPtrTemp = dstPtrRow; + Rpp32u vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + __m256 p1[3], p2; + rpp_simd_load(rpp_load24_f32pkd3_to_f32pln3_avx, srcPtr1Temp, p1); // simd loads + rpp_simd_load(rpp_load8_f32_to_f32_avx, srcPtr2Temp, &p2); // simd loads + compute_rain_24_host(p1, p2, pMul); + rpp_simd_store(rpp_store24_f32pln3_to_f32pkd3_avx, dstPtrTemp, p1); // simd stores + srcPtr1Temp += vectorIncrement; + srcPtr2Temp += vectorIncrementPerChannel; + dstPtrTemp += vectorIncrement; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount += 3) + { + *dstPtrTemp++ = RPPPIXELCHECKF32((*srcPtr2Temp - *srcPtr1Temp) * alpha + *srcPtr1Temp); + 
*dstPtrTemp++ = RPPPIXELCHECKF32((*srcPtr2Temp - *(srcPtr1Temp + 1)) * alpha + *(srcPtr1Temp + 1)); + *dstPtrTemp++ = RPPPIXELCHECKF32((*srcPtr2Temp - *(srcPtr1Temp + 2)) * alpha + *(srcPtr1Temp + 2)); + srcPtr2Temp++; + srcPtr1Temp += 3; + } + srcPtr1Row += srcDescPtr->strides.hStride; + srcPtr2Row += srcDescPtr->w; + dstPtrRow += dstDescPtr->strides.hStride; + } + } + // Rain without fused output-layout toggle (NCHW -> NCHW) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NCHW)) + { + Rpp32f *srcPtr1RowR, *srcPtr1RowG, *srcPtr1RowB, *srcPtr2Row, *dstPtrRowR, *dstPtrRowG, *dstPtrRowB; + srcPtr1RowR = srcPtrChannel; + srcPtr1RowG = srcPtr1RowR + srcDescPtr->strides.cStride; + srcPtr1RowB = srcPtr1RowG + srcDescPtr->strides.cStride; + srcPtr2Row = rainLayer; + dstPtrRowR = dstPtrChannel; + dstPtrRowG = dstPtrRowR + dstDescPtr->strides.cStride; + dstPtrRowB = dstPtrRowG + dstDescPtr->strides.cStride; + for(Rpp32u i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp32f *srcPtr1TempR, *srcPtr1TempG, *srcPtr1TempB, *srcPtr2Temp, *dstPtrTempR, *dstPtrTempG, *dstPtrTempB; + srcPtr1TempR = srcPtr1RowR; + srcPtr1TempG = srcPtr1RowG; + srcPtr1TempB = srcPtr1RowB; + srcPtr2Temp = srcPtr2Row; + dstPtrTempR = dstPtrRowR; + dstPtrTempG = dstPtrRowG; + dstPtrTempB = dstPtrRowB; + Rpp32u vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + __m256 p1[3], p2; + rpp_simd_load(rpp_load24_f32pln3_to_f32pln3_avx, srcPtr1TempR, srcPtr1TempG, srcPtr1TempB, p1); // simd loads + rpp_simd_load(rpp_load8_f32_to_f32_avx, srcPtr2Temp, &p2); // simd loads + compute_rain_24_host(p1, p2, pMul); + rpp_simd_store(rpp_store24_f32pln3_to_f32pln3_avx, dstPtrTempR, dstPtrTempG, dstPtrTempB, p1); // simd stores + srcPtr1TempR += vectorIncrementPerChannel; + srcPtr1TempG += vectorIncrementPerChannel; + srcPtr1TempB += vectorIncrementPerChannel; + srcPtr2Temp += vectorIncrementPerChannel; + dstPtrTempR += vectorIncrementPerChannel; + dstPtrTempG += vectorIncrementPerChannel; + dstPtrTempB += vectorIncrementPerChannel; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + *dstPtrTempR++ = RPPPIXELCHECK((*srcPtr2Temp - *srcPtr1TempR) * alpha + *srcPtr1TempR); + *dstPtrTempG++ = RPPPIXELCHECK((*srcPtr2Temp - *srcPtr1TempG) * alpha + *srcPtr1TempG); + *dstPtrTempB++ = RPPPIXELCHECK((*srcPtr2Temp - *srcPtr1TempB) * alpha + *srcPtr1TempB); + srcPtr2Temp++; + srcPtr1TempR++; + srcPtr1TempG++; + srcPtr1TempB++; + } + srcPtr1RowR += srcDescPtr->strides.hStride; + srcPtr1RowG += srcDescPtr->strides.hStride; + srcPtr1RowB += srcDescPtr->strides.hStride; + srcPtr2Row += srcDescPtr->strides.hStride; + dstPtrRowR += dstDescPtr->strides.hStride; + dstPtrRowG += dstDescPtr->strides.hStride; + dstPtrRowB += dstDescPtr->strides.hStride; } + } + // Rain single channel without fused output-layout toggle (NCHW -> NCHW) + else if ((srcDescPtr->c == 1) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NCHW)) + { + alignedLength = (bufferLength / vectorIncrementPerChannel) * vectorIncrementPerChannel; + Rpp32f *srcPtr1Row, *srcPtr2Row, *dstPtrRow; + srcPtr1Row = srcPtrChannel; + srcPtr2Row = rainLayer; + dstPtrRow = dstPtrChannel; + for(Rpp32u i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp32f *srcPtr1Temp, *srcPtr2Temp, *dstPtrTemp; + srcPtr1Temp = srcPtr1Row; + srcPtr2Temp = srcPtr2Row; + dstPtrTemp = dstPtrRow; + Rpp32u vectorLoopCount = 0; +#if __AVX2__ + for 
(; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + __m256 p1, p2; + rpp_simd_load(rpp_load8_f32_to_f32_avx, srcPtr1Temp, &p1); // simd loads + rpp_simd_load(rpp_load8_f32_to_f32_avx, srcPtr2Temp, &p2); // simd loads + p1 = _mm256_fmadd_ps(_mm256_sub_ps(p2, p1), pMul, p1); // alpha-blending adjustment + rpp_simd_store(rpp_store8_f32_to_f32_avx, dstPtrTemp, &p1); // simd stores + srcPtr1Temp += vectorIncrementPerChannel; + srcPtr2Temp += vectorIncrementPerChannel; + dstPtrTemp += vectorIncrementPerChannel; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + *dstPtrTemp++ = RPPPIXELCHECKF32((*srcPtr2Temp - *srcPtr1Temp) * alpha + *srcPtr1Temp); + srcPtr1Temp++; + srcPtr2Temp++; + } + srcPtr1Row += srcDescPtr->strides.hStride; + srcPtr2Row += srcDescPtr->strides.hStride; + dstPtrRow += dstDescPtr->strides.hStride; + } + } + } + return RPP_SUCCESS; +} + +RppStatus rain_f16_f16_host_tensor(Rpp16f *srcPtr, + RpptDescPtr srcDescPtr, + Rpp16f *dstPtr, + RpptDescPtr dstDescPtr, + Rpp32f rainPercentage, + Rpp32u rainWidth, + Rpp32u rainHeight, + Rpp32f slantAngle, + Rpp32f *alphaValues, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + RppLayoutParams layoutParams, + rpp::Handle& handle) +{ + RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h}; + Rpp32u numThreads = handle.GetNumThreads(); + + Rpp16f *rainLayer = reinterpret_cast(handle.GetInitHandle()->mem.mcpu.scratchBufferHost); + std::memset(rainLayer, 0, srcDescPtr->w * srcDescPtr->h * sizeof(Rpp16f)); + create_rain_layer(rainLayer, rainPercentage, srcDescPtr, slantAngle, rainHeight, rainWidth); + + omp_set_dynamic(0); +#pragma omp parallel for num_threads(numThreads) + for (Rpp32u batchCount = 0; batchCount < dstDescPtr->n; batchCount++) + { + RpptROI roi; + RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount]; + compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType); + + Rpp16f *srcPtrImage = srcPtr + batchCount * srcDescPtr->strides.nStride; + Rpp16f *dstPtrImage = dstPtr + batchCount * dstDescPtr->strides.nStride; + + Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier; + Rpp16f *srcPtrChannel, *dstPtrChannel; + srcPtrChannel = srcPtrImage + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier); + dstPtrChannel = dstPtrImage; + + Rpp32f alpha = alphaValues[batchCount]; +#if __AVX2__ + __m256 pMul = _mm256_set1_ps(alpha); +#endif + Rpp32u vectorIncrement = 24; + Rpp32u vectorIncrementPerChannel = 8; + Rpp32u alignedLength = (bufferLength / vectorIncrement) * vectorIncrement; + + // Rain with fused output-layout toggle (NHWC -> NCHW) + if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NCHW)) + { + Rpp16f *srcPtr1Row, *srcPtr2Row, *dstPtrRowR, *dstPtrRowG, *dstPtrRowB; + srcPtr1Row = srcPtrChannel; + srcPtr2Row = rainLayer; + dstPtrRowR = dstPtrChannel; + dstPtrRowG = dstPtrRowR + dstDescPtr->strides.cStride; + dstPtrRowB = dstPtrRowG + dstDescPtr->strides.cStride; + for(Rpp32u i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp16f *srcPtr1Temp, *srcPtr2Temp, *dstPtrTempR, *dstPtrTempG, *dstPtrTempB; + srcPtr1Temp = srcPtr1Row; + srcPtr2Temp = srcPtr2Row; + dstPtrTempR = dstPtrRowR; + dstPtrTempG = dstPtrRowG; + dstPtrTempB = dstPtrRowB; + Rpp32u vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + __m256 p1[3], p2; + 
rpp_simd_load(rpp_load24_f16pkd3_to_f32pln3_avx, srcPtr1Temp, p1); // simd loads + rpp_simd_load(rpp_load8_f16_to_f32_avx, srcPtr2Temp, &p2); // simd loads + compute_rain_24_host(p1, p2, pMul); + rpp_simd_store(rpp_store24_f32pln3_to_f16pln3_avx, dstPtrTempR, dstPtrTempG, dstPtrTempB, p1); // simd stores + srcPtr1Temp += vectorIncrement; + srcPtr2Temp += vectorIncrementPerChannel; + dstPtrTempR += vectorIncrementPerChannel; + dstPtrTempG += vectorIncrementPerChannel; + dstPtrTempB += vectorIncrementPerChannel; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount += 3) + { + *dstPtrTempR++ = static_cast(RPPPIXELCHECKF32(static_cast((*srcPtr2Temp - srcPtr1Temp[0]) * alpha + srcPtr1Temp[0]))); + *dstPtrTempG++ = static_cast(RPPPIXELCHECKF32(static_cast((*srcPtr2Temp - srcPtr1Temp[1]) * alpha + srcPtr1Temp[1]))); + *dstPtrTempB++ = static_cast(RPPPIXELCHECKF32(static_cast((*srcPtr2Temp - srcPtr1Temp[2]) * alpha + srcPtr1Temp[2]))); + srcPtr1Temp += 3; + srcPtr2Temp++; + } + srcPtr1Row += srcDescPtr->strides.hStride; + srcPtr2Row += srcDescPtr->w; + dstPtrRowR += dstDescPtr->strides.hStride; + dstPtrRowG += dstDescPtr->strides.hStride; + dstPtrRowB += dstDescPtr->strides.hStride; + } + } + // Rain with fused output-layout toggle (NCHW -> NHWC) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NHWC)) + { + Rpp16f *srcPtr1RowR, *srcPtr1RowG, *srcPtr1RowB, *srcPtr2Row, *dstPtrRow; + srcPtr1RowR = srcPtrChannel; + srcPtr1RowG = srcPtr1RowR + srcDescPtr->strides.cStride; + srcPtr1RowB = srcPtr1RowG + srcDescPtr->strides.cStride; + srcPtr2Row = rainLayer; + dstPtrRow = dstPtrChannel; + for(Rpp32u i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp16f *srcPtr1TempR, *srcPtr1TempG, *srcPtr1TempB, *srcPtr2Temp, *dstPtrTemp; + srcPtr1TempR = srcPtr1RowR; + srcPtr1TempG = srcPtr1RowG; + srcPtr1TempB = srcPtr1RowB; + srcPtr2Temp = srcPtr2Row; + dstPtrTemp = dstPtrRow; + Rpp32u vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + __m256 p1[3], p2; + rpp_simd_load(rpp_load24_f16pln3_to_f32pln3_avx, srcPtr1TempR, srcPtr1TempG, srcPtr1TempB, p1); // simd loads + rpp_simd_load(rpp_load8_f16_to_f32_avx, srcPtr2Temp, &p2); // simd loads + compute_rain_24_host(p1, p2, pMul); + rpp_simd_store(rpp_store24_f32pln3_to_f16pkd3_avx, dstPtrTemp, p1); // simd stores + srcPtr1TempR += vectorIncrementPerChannel; + srcPtr1TempG += vectorIncrementPerChannel; + srcPtr1TempB += vectorIncrementPerChannel; + srcPtr2Temp += vectorIncrementPerChannel; + dstPtrTemp += vectorIncrement; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + dstPtrTemp[0] = static_cast(RPPPIXELCHECKF32(static_cast((*srcPtr2Temp - *srcPtr1TempR) * alpha + *srcPtr1TempR))); + dstPtrTemp[1] = static_cast(RPPPIXELCHECKF32(static_cast((*srcPtr2Temp - *srcPtr1TempG) * alpha + *srcPtr1TempG))); + dstPtrTemp[2] = static_cast(RPPPIXELCHECKF32(static_cast((*srcPtr2Temp - *srcPtr1TempB) * alpha + *srcPtr1TempB))); + dstPtrTemp += 3; + srcPtr2Temp++; + srcPtr1TempR++; + srcPtr1TempG++; + srcPtr1TempB++; + } + srcPtr1RowR += srcDescPtr->strides.hStride; + srcPtr1RowG += srcDescPtr->strides.hStride; + srcPtr1RowB += srcDescPtr->strides.hStride; + srcPtr2Row += srcDescPtr->strides.hStride; + dstPtrRow += dstDescPtr->strides.hStride; + } + } + // Rain without fused output-layout toggle (NHWC -> NHWC) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && 
(dstDescPtr->layout == RpptLayout::NHWC)) + { + Rpp16f *srcPtr1Row, *srcPtr2Row, *dstPtrRow; + srcPtr1Row = srcPtrChannel; + srcPtr2Row = rainLayer; + dstPtrRow = dstPtrChannel; + for(Rpp32u i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp16f *srcPtr1Temp, *srcPtr2Temp, *dstPtrTemp; + srcPtr1Temp = srcPtr1Row; + srcPtr2Temp = srcPtr2Row; + dstPtrTemp = dstPtrRow; + Rpp32u vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + __m256 p1[3], p2; + rpp_simd_load(rpp_load24_f16pkd3_to_f32pln3_avx, srcPtr1Temp, p1); // simd loads + rpp_simd_load(rpp_load8_f16_to_f32_avx, srcPtr2Temp, &p2); // simd loads + compute_rain_24_host(p1, p2, pMul); + rpp_simd_store(rpp_store24_f32pln3_to_f16pkd3_avx, dstPtrTemp, p1); // simd stores + srcPtr1Temp += vectorIncrement; + srcPtr2Temp += vectorIncrementPerChannel; + dstPtrTemp += vectorIncrement; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount += 3) + { + *dstPtrTemp++ = static_cast(RPPPIXELCHECKF32(static_cast((*srcPtr2Temp - *srcPtr1Temp) * alpha + *srcPtr1Temp))); + *dstPtrTemp++ = static_cast(RPPPIXELCHECKF32(static_cast((*srcPtr2Temp - *(srcPtr1Temp + 1)) * alpha + *(srcPtr1Temp + 1)))); + *dstPtrTemp++ = static_cast(RPPPIXELCHECKF32(static_cast((*srcPtr2Temp - *(srcPtr1Temp + 2)) * alpha + *(srcPtr1Temp + 2)))); + srcPtr2Temp++; + srcPtr1Temp += 3; + } + srcPtr1Row += srcDescPtr->strides.hStride; + srcPtr2Row += srcDescPtr->w; + dstPtrRow += dstDescPtr->strides.hStride; + } + } + // Rain without fused output-layout toggle (NCHW -> NCHW) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NCHW)) + { + Rpp16f *srcPtr1RowR, *srcPtr1RowG, *srcPtr1RowB, *srcPtr2Row, *dstPtrRowR, *dstPtrRowG, *dstPtrRowB; + srcPtr1RowR = srcPtrChannel; + srcPtr1RowG = srcPtr1RowR + srcDescPtr->strides.cStride; + srcPtr1RowB = srcPtr1RowG + srcDescPtr->strides.cStride; + srcPtr2Row = rainLayer; + dstPtrRowR = dstPtrChannel; + dstPtrRowG = dstPtrRowR + dstDescPtr->strides.cStride; + dstPtrRowB = dstPtrRowG + dstDescPtr->strides.cStride; + for(Rpp32u i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp16f *srcPtr1TempR, *srcPtr1TempG, *srcPtr1TempB, *srcPtr2Temp, *dstPtrTempR, *dstPtrTempG, *dstPtrTempB; + srcPtr1TempR = srcPtr1RowR; + srcPtr1TempG = srcPtr1RowG; + srcPtr1TempB = srcPtr1RowB; + srcPtr2Temp = srcPtr2Row; + dstPtrTempR = dstPtrRowR; + dstPtrTempG = dstPtrRowG; + dstPtrTempB = dstPtrRowB; + Rpp32u vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + __m256 p1[3], p2; + rpp_simd_load(rpp_load24_f16pln3_to_f32pln3_avx, srcPtr1TempR, srcPtr1TempG, srcPtr1TempB, p1); // simd loads + rpp_simd_load(rpp_load8_f16_to_f32_avx, srcPtr2Temp, &p2); // simd loads + compute_rain_24_host(p1, p2, pMul); + rpp_simd_store(rpp_store24_f32pln3_to_f16pln3_avx, dstPtrTempR, dstPtrTempG, dstPtrTempB, p1); // simd stores + srcPtr1TempR += vectorIncrementPerChannel; + srcPtr1TempG += vectorIncrementPerChannel; + srcPtr1TempB += vectorIncrementPerChannel; + srcPtr2Temp += vectorIncrementPerChannel; + dstPtrTempR += vectorIncrementPerChannel; + dstPtrTempG += vectorIncrementPerChannel; + dstPtrTempB += vectorIncrementPerChannel; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + *dstPtrTempR++ = static_cast(RPPPIXELCHECKF32(static_cast((*srcPtr2Temp - *srcPtr1TempR) * alpha + *srcPtr1TempR))); + *dstPtrTempG++ = 
static_cast(RPPPIXELCHECKF32(static_cast((*srcPtr2Temp - *srcPtr1TempG) * alpha + *srcPtr1TempG))); + *dstPtrTempB++ = static_cast(RPPPIXELCHECKF32(static_cast((*srcPtr2Temp - *srcPtr1TempB) * alpha + *srcPtr1TempB))); + srcPtr2Temp++; + srcPtr1TempR++; + srcPtr1TempG++; + srcPtr1TempB++; + } + srcPtr1RowR += srcDescPtr->strides.hStride; + srcPtr1RowG += srcDescPtr->strides.hStride; + srcPtr1RowB += srcDescPtr->strides.hStride; + srcPtr2Row += srcDescPtr->strides.hStride; + dstPtrRowR += dstDescPtr->strides.hStride; + dstPtrRowG += dstDescPtr->strides.hStride; + dstPtrRowB += dstDescPtr->strides.hStride; } + } + // Rain single channel without fused output-layout toggle (NCHW -> NCHW) + else if ((srcDescPtr->c == 1) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NCHW)) + { + alignedLength = (bufferLength / vectorIncrementPerChannel) * vectorIncrementPerChannel; + Rpp16f *srcPtr1Row, *srcPtr2Row, *dstPtrRow; + srcPtr1Row = srcPtrChannel; + srcPtr2Row = rainLayer; + dstPtrRow = dstPtrChannel; + for(Rpp32u i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp16f *srcPtr1Temp, *srcPtr2Temp, *dstPtrTemp; + srcPtr1Temp = srcPtr1Row; + srcPtr2Temp = srcPtr2Row; + dstPtrTemp = dstPtrRow; + Rpp32u vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + __m256 p1, p2; + rpp_simd_load(rpp_load8_f16_to_f32_avx, srcPtr1Temp, &p1); // simd loads + rpp_simd_load(rpp_load8_f16_to_f32_avx, srcPtr2Temp, &p2); // simd loads + p1 = _mm256_fmadd_ps(_mm256_sub_ps(p2, p1), pMul, p1); // alpha-blending adjustment + rpp_simd_store(rpp_store8_f32_to_f16_avx, dstPtrTemp, &p1); // simd stores + srcPtr1Temp += vectorIncrementPerChannel; + srcPtr2Temp += vectorIncrementPerChannel; + dstPtrTemp += vectorIncrementPerChannel; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + *dstPtrTemp++ = static_cast(RPPPIXELCHECKF32(static_cast((*srcPtr2Temp - *srcPtr1Temp) * alpha + *srcPtr1Temp))); + srcPtr1Temp++; + srcPtr2Temp++; + } + srcPtr1Row += srcDescPtr->strides.hStride; + srcPtr2Row += srcDescPtr->strides.hStride; + dstPtrRow += dstDescPtr->strides.hStride; + } + } + } + return RPP_SUCCESS; +} + +RppStatus rain_i8_i8_host_tensor(Rpp8s *srcPtr, + RpptDescPtr srcDescPtr, + Rpp8s *dstPtr, + RpptDescPtr dstDescPtr, + Rpp32f rainPercentage, + Rpp32u rainWidth, + Rpp32u rainHeight, + Rpp32f slantAngle, + Rpp32f *alphaValues, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + RppLayoutParams layoutParams, + rpp::Handle& handle) +{ + RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h}; + Rpp32u numThreads = handle.GetNumThreads(); + + Rpp8s *rainLayer = reinterpret_cast(handle.GetInitHandle()->mem.mcpu.scratchBufferHost); + std::memset(rainLayer, 0x81, srcDescPtr->w * srcDescPtr->h * sizeof(Rpp8s)); + create_rain_layer(rainLayer, rainPercentage, srcDescPtr, slantAngle, rainHeight, rainWidth); + + omp_set_dynamic(0); +#pragma omp parallel for num_threads(numThreads) + for (Rpp32u batchCount = 0; batchCount < dstDescPtr->n; batchCount++) + { + RpptROI roi; + RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount]; + compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType); + + Rpp8s *srcPtrImage = srcPtr + batchCount * srcDescPtr->strides.nStride; + Rpp8s *dstPtrImage = dstPtr + batchCount * dstDescPtr->strides.nStride; + + Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier; + Rpp8s *srcPtrChannel, *dstPtrChannel; + srcPtrChannel = srcPtrImage + 
(roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier); + dstPtrChannel = dstPtrImage; + + Rpp32f alpha = alphaValues[batchCount]; +#if __AVX2__ + __m256 pMul = _mm256_set1_ps(alpha); +#endif + Rpp32u vectorIncrement = 48; + Rpp32u vectorIncrementPerChannel = 16; + Rpp32u alignedLength = (bufferLength / vectorIncrement) * vectorIncrement; + + // Rain with fused output-layout toggle (NHWC -> NCHW) + if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NCHW)) + { + Rpp8s *srcPtr1Row, *srcPtr2Row, *dstPtrRowR, *dstPtrRowG, *dstPtrRowB; + srcPtr1Row = srcPtrChannel; + srcPtr2Row = rainLayer; + dstPtrRowR = dstPtrChannel; + dstPtrRowG = dstPtrRowR + dstDescPtr->strides.cStride; + dstPtrRowB = dstPtrRowG + dstDescPtr->strides.cStride; + for(Rpp32u i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp8s *srcPtr1Temp, *srcPtr2Temp, *dstPtrTempR, *dstPtrTempG, *dstPtrTempB; + srcPtr1Temp = srcPtr1Row; + srcPtr2Temp = srcPtr2Row; + dstPtrTempR = dstPtrRowR; + dstPtrTempG = dstPtrRowG; + dstPtrTempB = dstPtrRowB; + Rpp32u vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + __m256 p1[6], p2[2]; + rpp_simd_load(rpp_load48_i8pkd3_to_f32pln3_avx, srcPtr1Temp, p1); // simd loads + rpp_simd_load(rpp_load16_i8_to_f32_avx, srcPtr2Temp, p2); // simd loads + compute_rain_48_host(p1, p2, pMul); + rpp_simd_store(rpp_store48_f32pln3_to_i8pln3_avx, dstPtrTempR, dstPtrTempG, dstPtrTempB, p1); // simd stores + srcPtr1Temp += vectorIncrement; + srcPtr2Temp += vectorIncrementPerChannel; + dstPtrTempR += vectorIncrementPerChannel; + dstPtrTempG += vectorIncrementPerChannel; + dstPtrTempB += vectorIncrementPerChannel; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount += 3) + { + *dstPtrTempR++ = static_cast(RPPPIXELCHECKI8(std::nearbyintf(static_cast((*srcPtr2Temp - srcPtr1Temp[0]) * alpha + srcPtr1Temp[0])))); + *dstPtrTempG++ = static_cast(RPPPIXELCHECKI8(std::nearbyintf(static_cast((*srcPtr2Temp - srcPtr1Temp[1]) * alpha + srcPtr1Temp[1])))); + *dstPtrTempB++ = static_cast(RPPPIXELCHECKI8(std::nearbyintf(static_cast((*srcPtr2Temp - srcPtr1Temp[2]) * alpha + srcPtr1Temp[2])))); + srcPtr1Temp += 3; + srcPtr2Temp++; + } + srcPtr1Row += srcDescPtr->strides.hStride; + srcPtr2Row += srcDescPtr->w; + dstPtrRowR += dstDescPtr->strides.hStride; + dstPtrRowG += dstDescPtr->strides.hStride; + dstPtrRowB += dstDescPtr->strides.hStride; + } + } + // Rain with fused output-layout toggle (NCHW -> NHWC) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NHWC)) + { + Rpp8s *srcPtr1RowR, *srcPtr1RowG, *srcPtr1RowB, *srcPtr2Row, *dstPtrRow; + srcPtr1RowR = srcPtrChannel; + srcPtr1RowG = srcPtr1RowR + srcDescPtr->strides.cStride; + srcPtr1RowB = srcPtr1RowG + srcDescPtr->strides.cStride; + srcPtr2Row = rainLayer; + dstPtrRow = dstPtrChannel; + for(Rpp32u i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp8s *srcPtr1TempR, *srcPtr1TempG, *srcPtr1TempB, *srcPtr2Temp, *dstPtrTemp; + srcPtr1TempR = srcPtr1RowR; + srcPtr1TempG = srcPtr1RowG; + srcPtr1TempB = srcPtr1RowB; + srcPtr2Temp = srcPtr2Row; + dstPtrTemp = dstPtrRow; + Rpp32u vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + __m256 p1[6], p2[2]; + rpp_simd_load(rpp_load48_i8pln3_to_f32pln3_avx, srcPtr1TempR, srcPtr1TempG, srcPtr1TempB, p1); // simd loads + 
rpp_simd_load(rpp_load16_i8_to_f32_avx, srcPtr2Temp, p2); // simd loads + compute_rain_48_host(p1, p2, pMul); + rpp_simd_store(rpp_store48_f32pln3_to_i8pkd3_avx, dstPtrTemp, p1); // simd stores + srcPtr1TempR += vectorIncrementPerChannel; + srcPtr1TempG += vectorIncrementPerChannel; + srcPtr1TempB += vectorIncrementPerChannel; + srcPtr2Temp += vectorIncrementPerChannel; + dstPtrTemp += vectorIncrement; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + dstPtrTemp[0] = static_cast(RPPPIXELCHECKI8(std::nearbyintf(static_cast((*srcPtr2Temp - *srcPtr1TempR) * alpha + *srcPtr1TempR)))); + dstPtrTemp[1] = static_cast(RPPPIXELCHECKI8(std::nearbyintf(static_cast((*srcPtr2Temp - *srcPtr1TempG) * alpha + *srcPtr1TempG)))); + dstPtrTemp[2] = static_cast(RPPPIXELCHECKI8(std::nearbyintf(static_cast((*srcPtr2Temp - *srcPtr1TempB) * alpha + *srcPtr1TempB)))); + dstPtrTemp += 3; + srcPtr2Temp++; + srcPtr1TempR++; + srcPtr1TempG++; + srcPtr1TempB++; + } + srcPtr1RowR += srcDescPtr->strides.hStride; + srcPtr1RowG += srcDescPtr->strides.hStride; + srcPtr1RowB += srcDescPtr->strides.hStride; + srcPtr2Row += srcDescPtr->strides.hStride; + dstPtrRow += dstDescPtr->strides.hStride; + } + } + // Rain without fused output-layout toggle (NHWC -> NHWC) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NHWC)) + { + Rpp8s *srcPtr1Row, *srcPtr2Row, *dstPtrRow; + srcPtr1Row = srcPtrChannel; + srcPtr2Row = rainLayer; + dstPtrRow = dstPtrChannel; + for(Rpp32u i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp8s *srcPtr1Temp, *srcPtr2Temp, *dstPtrTemp; + srcPtr1Temp = srcPtr1Row; + srcPtr2Temp = srcPtr2Row; + dstPtrTemp = dstPtrRow; + Rpp32u vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + __m256 p1[6], p2[2]; + rpp_simd_load(rpp_load48_i8pkd3_to_f32pln3_avx, srcPtr1Temp, p1); // simd loads + rpp_simd_load(rpp_load16_i8_to_f32_avx, srcPtr2Temp, p2); // simd loads + compute_rain_48_host(p1, p2, pMul); + rpp_simd_store(rpp_store48_f32pln3_to_i8pkd3_avx, dstPtrTemp, p1); // simd stores + srcPtr1Temp += vectorIncrement; + srcPtr2Temp += vectorIncrementPerChannel; + dstPtrTemp += vectorIncrement; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount += 3) + { + *dstPtrTemp++ = static_cast(RPPPIXELCHECKI8(std::nearbyintf(static_cast((*srcPtr2Temp - *srcPtr1Temp) * alpha + *srcPtr1Temp)))); + *dstPtrTemp++ = static_cast(RPPPIXELCHECKI8(std::nearbyintf(static_cast((*srcPtr2Temp - *(srcPtr1Temp + 1)) * alpha + *(srcPtr1Temp + 1))))); + *dstPtrTemp++ = static_cast(RPPPIXELCHECKI8(std::nearbyintf(static_cast((*srcPtr2Temp - *(srcPtr1Temp + 2)) * alpha + *(srcPtr1Temp + 2))))); + srcPtr2Temp++; + srcPtr1Temp += 3; + } + srcPtr1Row += srcDescPtr->strides.hStride; + srcPtr2Row += srcDescPtr->w; + dstPtrRow += dstDescPtr->strides.hStride; + } + } + // Rain without fused output-layout toggle (NCHW -> NCHW) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NCHW)) + { + Rpp8s *srcPtr1RowR, *srcPtr1RowG, *srcPtr1RowB, *srcPtr2Row, *dstPtrRowR, *dstPtrRowG, *dstPtrRowB; + srcPtr1RowR = srcPtrChannel; + srcPtr1RowG = srcPtr1RowR + srcDescPtr->strides.cStride; + srcPtr1RowB = srcPtr1RowG + srcDescPtr->strides.cStride; + srcPtr2Row = rainLayer; + dstPtrRowR = dstPtrChannel; + dstPtrRowG = dstPtrRowR + dstDescPtr->strides.cStride; + dstPtrRowB = dstPtrRowG + dstDescPtr->strides.cStride; + for(Rpp32u i = 0; i < 
roi.xywhROI.roiHeight; i++) + { + Rpp8s *srcPtr1TempR, *srcPtr1TempG, *srcPtr1TempB, *srcPtr2Temp, *dstPtrTempR, *dstPtrTempG, *dstPtrTempB; + srcPtr1TempR = srcPtr1RowR; + srcPtr1TempG = srcPtr1RowG; + srcPtr1TempB = srcPtr1RowB; + srcPtr2Temp = srcPtr2Row; + dstPtrTempR = dstPtrRowR; + dstPtrTempG = dstPtrRowG; + dstPtrTempB = dstPtrRowB; + Rpp32u vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + __m256 p1[6], p2[2]; + rpp_simd_load(rpp_load48_i8pln3_to_f32pln3_avx, srcPtr1TempR, srcPtr1TempG, srcPtr1TempB, p1); // simd loads + rpp_simd_load(rpp_load16_i8_to_f32_avx, srcPtr2Temp, p2); // simd loads + compute_rain_48_host(p1, p2, pMul); + rpp_simd_store(rpp_store48_f32pln3_to_i8pln3_avx, dstPtrTempR, dstPtrTempG, dstPtrTempB, p1); // simd stores + srcPtr1TempR += vectorIncrementPerChannel; + srcPtr1TempG += vectorIncrementPerChannel; + srcPtr1TempB += vectorIncrementPerChannel; + srcPtr2Temp += vectorIncrementPerChannel; + dstPtrTempR += vectorIncrementPerChannel; + dstPtrTempG += vectorIncrementPerChannel; + dstPtrTempB += vectorIncrementPerChannel; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + *dstPtrTempR++ = static_cast(RPPPIXELCHECKI8(std::nearbyintf(static_cast((*srcPtr2Temp - *srcPtr1TempR) * alpha + *srcPtr1TempR)))); + *dstPtrTempG++ = static_cast(RPPPIXELCHECKI8(std::nearbyintf(static_cast((*srcPtr2Temp - *srcPtr1TempG) * alpha + *srcPtr1TempG)))); + *dstPtrTempB++ = static_cast(RPPPIXELCHECKI8(std::nearbyintf(static_cast((*srcPtr2Temp - *srcPtr1TempB) * alpha + *srcPtr1TempB)))); + srcPtr2Temp++; + srcPtr1TempR++; + srcPtr1TempG++; + srcPtr1TempB++; + } + srcPtr1RowR += srcDescPtr->strides.hStride; + srcPtr1RowG += srcDescPtr->strides.hStride; + srcPtr1RowB += srcDescPtr->strides.hStride; + srcPtr2Row += srcDescPtr->strides.hStride; + dstPtrRowR += dstDescPtr->strides.hStride; + dstPtrRowG += dstDescPtr->strides.hStride; + dstPtrRowB += dstDescPtr->strides.hStride; } + } + // Rain single channel without fused output-layout toggle (NCHW -> NCHW) + else if ((srcDescPtr->c == 1) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NCHW)) + { + alignedLength = (bufferLength / vectorIncrementPerChannel) * vectorIncrementPerChannel; + Rpp8s *srcPtr1Row, *srcPtr2Row, *dstPtrRow; + srcPtr1Row = srcPtrChannel; + srcPtr2Row = rainLayer; + dstPtrRow = dstPtrChannel; + for(Rpp32u i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp8s *srcPtr1Temp, *srcPtr2Temp, *dstPtrTemp; + srcPtr1Temp = srcPtr1Row; + srcPtr2Temp = srcPtr2Row; + dstPtrTemp = dstPtrRow; + Rpp32u vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + __m256 p1[2], p2[2]; + rpp_simd_load(rpp_load16_i8_to_f32_avx, srcPtr1Temp, p1); // simd loads + rpp_simd_load(rpp_load16_i8_to_f32_avx, srcPtr2Temp, p2); // simd loads + compute_rain_32_host(p1, p2, pMul); + rpp_simd_store(rpp_store16_f32_to_i8_avx, dstPtrTemp, p1); // simd stores + srcPtr1Temp += vectorIncrementPerChannel; + srcPtr2Temp += vectorIncrementPerChannel; + dstPtrTemp += vectorIncrementPerChannel; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + *dstPtrTemp++ = static_cast(RPPPIXELCHECKI8(std::nearbyintf(static_cast((*srcPtr2Temp - *srcPtr1Temp) * alpha + *srcPtr1Temp)))); + srcPtr1Temp++; + srcPtr2Temp++; + } + srcPtr1Row += srcDescPtr->strides.hStride; + srcPtr2Row += srcDescPtr->strides.hStride; + dstPtrRow += 
dstDescPtr->strides.hStride; + } + } + } + return RPP_SUCCESS; +}
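Note on the host paths above: every layout branch applies the same per-pixel blend of the input against the precomputed rain layer, dst = src + alpha * (rain - src), rounded to nearest and clamped to the Rpp8s range. A minimal scalar sketch of that blend, with a hypothetical helper name that is not part of RPP:

#include <algorithm>
#include <cmath>

// Scalar reference for the blend the SIMD paths above implement:
// lerp the source pixel toward the rain layer by alpha, round, then
// clamp to [-128, 127] (equivalent in spirit to RPPPIXELCHECKI8).
static inline signed char rain_blend_i8(signed char src, signed char rain, float alpha)
{
    float blended = (rain - src) * alpha + src;
    blended = std::nearbyintf(blended);
    return static_cast<signed char>(std::max(-128.0f, std::min(127.0f, blended)));
}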
diff --git a/src/modules/hip/hip_tensor_effects_augmentations.hpp b/src/modules/hip/hip_tensor_effects_augmentations.hpp index d5c6aba75..27cba780e 100644 --- a/src/modules/hip/hip_tensor_effects_augmentations.hpp +++ b/src/modules/hip/hip_tensor_effects_augmentations.hpp @@ -39,5 +39,6 @@ SOFTWARE. #include "kernel/resize.hpp" //pixelate dependency #include "kernel/erase.hpp" #include "kernel/fog.hpp" +#include "kernel/rain.hpp" #endif // HIP_TENSOR_EFFECTS_AUGMENTATIONS_HPP diff --git a/src/modules/hip/kernel/rain.hpp b/src/modules/hip/kernel/rain.hpp new file mode 100644 index 000000000..712060d23 --- /dev/null +++ b/src/modules/hip/kernel/rain.hpp @@ -0,0 +1,304 @@ +#include <random> +#include "rpp_hip_common.hpp" + +// Constants to represent the rain intensity for different data types +#define RAIN_INTENSITY_8U 200 // Intensity value for Rpp8u +#define RAIN_INTENSITY_8S 72 // Intensity value for Rpp8s +#define RAIN_INTENSITY_FLOAT 200 * ONE_OVER_255 // Intensity value for Rpp32f and Rpp16f + +__device__ __forceinline__ void rain_hip_compute(uchar *srcPtr, d_float8 *src1_f8, d_float8 *src2_f8, d_float8 *dst_f8, float4 *alpha_f4) +{ + dst_f8->f4[0] = rpp_hip_pixel_check_0to255((src2_f8->f4[0] - src1_f8->f4[0]) * *alpha_f4 + src1_f8->f4[0]); + dst_f8->f4[1] = rpp_hip_pixel_check_0to255((src2_f8->f4[1] - src1_f8->f4[1]) * *alpha_f4 + src1_f8->f4[1]); +} + +__device__ __forceinline__ void rain_hip_compute(float *srcPtr, d_float8 *src1_f8, d_float8 *src2_f8, d_float8 *dst_f8, float4 *alpha_f4) +{ + dst_f8->f4[0] = rpp_hip_pixel_check_0to1((src2_f8->f4[0] - src1_f8->f4[0]) * *alpha_f4 + src1_f8->f4[0]); + dst_f8->f4[1] = rpp_hip_pixel_check_0to1((src2_f8->f4[1] - src1_f8->f4[1]) * *alpha_f4 + src1_f8->f4[1]); +} + +__device__ __forceinline__ void rain_hip_compute(schar *srcPtr, d_float8 *src1_f8, d_float8 *src2_f8, d_float8 *dst_f8, float4 *alpha_f4) +{ + dst_f8->f4[0] = rpp_hip_pixel_check_0to255((src2_f8->f4[0] - src1_f8->f4[0]) * *alpha_f4 + src1_f8->f4[0] + (float4)128) - (float4)128; + dst_f8->f4[1] = rpp_hip_pixel_check_0to255((src2_f8->f4[1] - src1_f8->f4[1]) * *alpha_f4 + src1_f8->f4[1] + (float4)128) - (float4)128; +} + +__device__ __forceinline__ void rain_hip_compute(half *srcPtr, d_float8 *src1_f8, d_float8 *src2_f8, d_float8 *dst_f8, float4 *alpha_f4) +{ + dst_f8->f4[0] = rpp_hip_pixel_check_0to1((src2_f8->f4[0] - src1_f8->f4[0]) * *alpha_f4 + src1_f8->f4[0]); + dst_f8->f4[1] = rpp_hip_pixel_check_0to1((src2_f8->f4[1] - src1_f8->f4[1]) * *alpha_f4 + src1_f8->f4[1]); +} + +template <typename T> +__global__ void rain_pkd_hip_tensor(T *srcPtr1, + T *srcPtr2, + uint3 srcStridesNHW, + T *dstPtr, + uint2 dstStridesNH, + float *alpha, + RpptROIPtr roiTensorPtrSrc) +{ + int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; + int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + + if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth * 3)) + { + return; + } + + uint srcIdx1 = (id_z * srcStridesNHW.x) + ((id_y + roiTensorPtrSrc[id_z].xywhROI.xy.y) * srcStridesNHW.y) + ((id_x + roiTensorPtrSrc[id_z].xywhROI.xy.x) * 3); + uint srcIdx2 = ((id_y + roiTensorPtrSrc[id_z].xywhROI.xy.y) * srcStridesNHW.z) + (id_x + roiTensorPtrSrc[id_z].xywhROI.xy.x); + uint dstIdx = (id_z * dstStridesNH.x) + (id_y * dstStridesNH.y) + id_x * 3; + + float4 alpha_f4 = static_cast<float4>(alpha[id_z]); + d_float24 src1_f24, dst_f24; + d_float8 src2_f8; + rpp_hip_load24_pkd3_and_unpack_to_float24_pln3(srcPtr1 + srcIdx1, &src1_f24); + rpp_hip_load8_and_unpack_to_float8(srcPtr2 + srcIdx2, &src2_f8); + rain_hip_compute(srcPtr1, &src1_f24.f8[0], &src2_f8, &dst_f24.f8[0], &alpha_f4); + rain_hip_compute(srcPtr1, &src1_f24.f8[1], &src2_f8, &dst_f24.f8[1], &alpha_f4); + rain_hip_compute(srcPtr1, &src1_f24.f8[2], &src2_f8, &dst_f24.f8[2], &alpha_f4); + rpp_hip_pack_float24_pln3_and_store24_pkd3(dstPtr + dstIdx, &dst_f24); +} + +template <typename T> +__global__ void rain_pln_hip_tensor(T *srcPtr1, + T *srcPtr2, + uint3 srcStridesNCH, + T *dstPtr, + uint3 dstStridesNCH, + int channelsDst, + float *alpha, + RpptROIPtr roiTensorPtrSrc) +{ + int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; + int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + + if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth)) + { + return; + } + + uint srcIdx1 = (id_z * srcStridesNCH.x) + ((id_y + roiTensorPtrSrc[id_z].xywhROI.xy.y) * srcStridesNCH.z) + (id_x + roiTensorPtrSrc[id_z].xywhROI.xy.x); + uint srcIdx2 = ((id_y + roiTensorPtrSrc[id_z].xywhROI.xy.y) * srcStridesNCH.z) + (id_x + roiTensorPtrSrc[id_z].xywhROI.xy.x); + uint dstIdx = (id_z * dstStridesNCH.x) + (id_y * dstStridesNCH.z) + id_x; + + float4 alpha_f4 = static_cast<float4>(alpha[id_z]); + d_float8 src1_f8, src2_f8, dst_f8; + rpp_hip_load8_and_unpack_to_float8(srcPtr1 + srcIdx1, &src1_f8); + rpp_hip_load8_and_unpack_to_float8(srcPtr2 + srcIdx2, &src2_f8); + rain_hip_compute(srcPtr1, &src1_f8, &src2_f8, &dst_f8, &alpha_f4); + rpp_hip_pack_float8_and_store8(dstPtr + dstIdx, &dst_f8); + + if (channelsDst == 3) + { + srcIdx1 += srcStridesNCH.y; + dstIdx += dstStridesNCH.y; + rpp_hip_load8_and_unpack_to_float8(srcPtr1 + srcIdx1, &src1_f8); + rain_hip_compute(srcPtr1, &src1_f8, &src2_f8, &dst_f8, &alpha_f4); + rpp_hip_pack_float8_and_store8(dstPtr + dstIdx, &dst_f8); + srcIdx1 += srcStridesNCH.y; + dstIdx += dstStridesNCH.y; + rpp_hip_load8_and_unpack_to_float8(srcPtr1 + srcIdx1, &src1_f8); + rain_hip_compute(srcPtr1, &src1_f8, &src2_f8, &dst_f8, &alpha_f4); + rpp_hip_pack_float8_and_store8(dstPtr + dstIdx, &dst_f8); + } +} + +template <typename T> +__global__ void rain_pkd3_pln3_hip_tensor(T *srcPtr1, + T *srcPtr2, + uint3 srcStridesNHW, + T *dstPtr, + uint3 dstStridesNCH, + float *alpha, + RpptROIPtr roiTensorPtrSrc) +{ + int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; + int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + + if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth)) + { + return; + } + + uint srcIdx1 = (id_z * srcStridesNHW.x) + ((id_y + roiTensorPtrSrc[id_z].xywhROI.xy.y) * srcStridesNHW.y) + ((id_x + roiTensorPtrSrc[id_z].xywhROI.xy.x) * 3); + uint srcIdx2 = ((id_y + roiTensorPtrSrc[id_z].xywhROI.xy.y) * srcStridesNHW.z) + (id_x + roiTensorPtrSrc[id_z].xywhROI.xy.x); + uint dstIdx = (id_z * dstStridesNCH.x) + (id_y * dstStridesNCH.z) + id_x; + + float4 alpha_f4 = static_cast<float4>(alpha[id_z]); + d_float24 src1_f24, dst_f24; + d_float8 src2_f8; + rpp_hip_load24_pkd3_and_unpack_to_float24_pln3(srcPtr1 + srcIdx1, &src1_f24); + rpp_hip_load8_and_unpack_to_float8(srcPtr2 + srcIdx2, &src2_f8); + rain_hip_compute(srcPtr1, &src1_f24.f8[0], &src2_f8, &dst_f24.f8[0], &alpha_f4); + rain_hip_compute(srcPtr1, &src1_f24.f8[1], &src2_f8, &dst_f24.f8[1], &alpha_f4); + 
rain_hip_compute(srcPtr1, &src1_f24.f8[2], &src2_f8, &dst_f24.f8[2], &alpha_f4); + rpp_hip_pack_float24_pln3_and_store24_pln3(dstPtr + dstIdx, dstStridesNCH.y, &dst_f24); +} + +template <typename T> +__global__ void rain_pln3_pkd3_hip_tensor(T *srcPtr1, + T *srcPtr2, + uint3 srcStridesNCH, + T *dstPtr, + uint2 dstStridesNH, + float *alpha, + RpptROIPtr roiTensorPtrSrc) +{ + int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; + int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + + if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth)) + { + return; + } + + uint srcIdx1 = (id_z * srcStridesNCH.x) + ((id_y + roiTensorPtrSrc[id_z].xywhROI.xy.y) * srcStridesNCH.z) + (id_x + roiTensorPtrSrc[id_z].xywhROI.xy.x); + uint srcIdx2 = ((id_y + roiTensorPtrSrc[id_z].xywhROI.xy.y) * srcStridesNCH.z) + (id_x + roiTensorPtrSrc[id_z].xywhROI.xy.x); + uint dstIdx = (id_z * dstStridesNH.x) + (id_y * dstStridesNH.y) + id_x * 3; + + float4 alpha_f4 = static_cast<float4>(alpha[id_z]); + d_float24 src1_f24, dst_f24; + d_float8 src2_f8; + rpp_hip_load24_pln3_and_unpack_to_float24_pln3(srcPtr1 + srcIdx1, srcStridesNCH.y, &src1_f24); + rpp_hip_load8_and_unpack_to_float8(srcPtr2 + srcIdx2, &src2_f8); + rain_hip_compute(srcPtr1, &src1_f24.f8[0], &src2_f8, &dst_f24.f8[0], &alpha_f4); + rain_hip_compute(srcPtr1, &src1_f24.f8[1], &src2_f8, &dst_f24.f8[1], &alpha_f4); + rain_hip_compute(srcPtr1, &src1_f24.f8[2], &src2_f8, &dst_f24.f8[2], &alpha_f4); + rpp_hip_pack_float24_pln3_and_store24_pkd3(dstPtr + dstIdx, &dst_f24); +} + +template <typename T> +RppStatus hip_exec_rain_tensor(T *srcPtr, + RpptDescPtr srcDescPtr, + T *dstPtr, + RpptDescPtr dstDescPtr, + Rpp32f rainPercentage, + Rpp32u rainWidth, + Rpp32u rainHeight, + Rpp32f slantAngle, + Rpp32f *alpha, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + rpp::Handle& handle) +{ + if (roiType == RpptRoiType::LTRB) + hip_exec_roi_converison_ltrb_to_xywh(roiTensorPtrSrc, handle); + + Rpp32f rainPercent = rainPercentage * 0.004f; //Scaling factor to convert percentage to a range suitable for rain effect intensity + Rpp32u numDrops = static_cast<Rpp32u>(rainPercent * srcDescPtr->h * srcDescPtr->w); + Rpp32f slant = sin(slantAngle) * rainHeight; + + // Seed the random number generator and set up the uniform distributions + std::mt19937 rng(std::random_device{}()); + std::uniform_int_distribution<Rpp32u> distX(0, srcDescPtr->w - slant - 1); + std::uniform_int_distribution<Rpp32u> distY(0, srcDescPtr->h - rainHeight - 1); + + T *rainLayer = reinterpret_cast<T *>(handle.GetInitHandle()->mem.mcpu.scratchBufferHost); + T initValue = 0; + if constexpr (std::is_same<T, Rpp8s>::value) + initValue = static_cast<T>(0x81); // 0x81 represents -127 in signed 8-bit integer (Rpp8s). + std::memset(rainLayer, initValue, srcDescPtr->w * srcDescPtr->h * sizeof(T)); + // Choose the rain intensity value based on the data type + T rainValue = std::is_same<T, Rpp8u>::value ? static_cast<T>(RAIN_INTENSITY_8U) : + std::is_same<T, Rpp8s>::value ? 
static_cast<T>(RAIN_INTENSITY_8S) : + static_cast<T>(RAIN_INTENSITY_FLOAT); + Rpp32f slantPerDropLength = static_cast<Rpp32f>(slant) / rainHeight; + for (Rpp32u i = 0; i < numDrops; i++) + { + Rpp32u xStart = distX(rng); + Rpp32u yStart = distY(rng); + for (Rpp32u j = 0; j < rainHeight; j++) + { + Rpp32u x = xStart + j * slantPerDropLength; + Rpp32u y = yStart + j; + + if (x >= 0 && x < srcDescPtr->w && y < srcDescPtr->h) + { + T *rainLayerTemp = rainLayer + y * srcDescPtr->w + x; + for (Rpp32u k = 0; k < rainWidth; k++) + rainLayerTemp[k] = rainValue; + } + } + } + + T *rainLayerHip = reinterpret_cast<T *>(handle.GetInitHandle()->mem.mgpu.scratchBufferHip.floatmem); + CHECK_RETURN_STATUS(hipMemcpyAsync(rainLayerHip, rainLayer, srcDescPtr->w * srcDescPtr->h * sizeof(T), hipMemcpyHostToDevice, handle.GetStream())); + + int globalThreads_x = (dstDescPtr->w + 7) >> 3; + int globalThreads_y = dstDescPtr->h; + int globalThreads_z = dstDescPtr->n; + + if ((srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NHWC)) + { + hipLaunchKernelGGL(rain_pkd_hip_tensor, + dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + srcPtr, + rainLayerHip, + make_uint3(srcDescPtr->strides.nStride, srcDescPtr->strides.hStride, srcDescPtr->w), + dstPtr, + make_uint2(dstDescPtr->strides.nStride, dstDescPtr->strides.hStride), + alpha, + roiTensorPtrSrc); + } + else if ((srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NCHW)) + { + hipLaunchKernelGGL(rain_pln_hip_tensor, + dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + srcPtr, + rainLayerHip, + make_uint3(srcDescPtr->strides.nStride, srcDescPtr->strides.cStride, srcDescPtr->strides.hStride), + dstPtr, + make_uint3(dstDescPtr->strides.nStride, dstDescPtr->strides.cStride, dstDescPtr->strides.hStride), + dstDescPtr->c, + alpha, + roiTensorPtrSrc); + } + else if ((srcDescPtr->c == 3) && (dstDescPtr->c == 3)) + { + if ((srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NCHW)) + { + hipLaunchKernelGGL(rain_pkd3_pln3_hip_tensor, + dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + srcPtr, + rainLayerHip, + make_uint3(srcDescPtr->strides.nStride, srcDescPtr->strides.hStride, srcDescPtr->w), + dstPtr, + make_uint3(dstDescPtr->strides.nStride, dstDescPtr->strides.cStride, dstDescPtr->strides.hStride), + alpha, + roiTensorPtrSrc); + } + else if ((srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NHWC)) + { + hipLaunchKernelGGL(rain_pln3_pkd3_hip_tensor, + dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + srcPtr, + rainLayerHip, + make_uint3(srcDescPtr->strides.nStride, srcDescPtr->strides.cStride, srcDescPtr->strides.hStride), + dstPtr, + make_uint2(dstDescPtr->strides.nStride, dstDescPtr->strides.hStride), + alpha, + roiTensorPtrSrc); + } + } + + return RPP_SUCCESS; +}
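The rain layer above is rasterized once per call into the host scratch buffer, copied to the device, and reused for every image in the batch: each drop is a rainHeight-row streak that shifts horizontally by slant/rainHeight per row and is rainWidth pixels wide. A self-contained sketch of that rasterization, assuming an 8-bit single-channel buffer; the function and variable names here are illustrative only, not RPP symbols:

#include <cstdint>
#include <cstring>
#include <random>
#include <vector>

// Hypothetical standalone version of the rain-layer generation in hip_exec_rain_tensor.
std::vector<uint8_t> make_rain_layer(int w, int h, int numDrops, int dropHeight, int dropWidth, float slant)
{
    std::vector<uint8_t> layer(static_cast<size_t>(w) * h, 0);   // background stays empty
    std::mt19937 rng(std::random_device{}());                    // nondeterministic seed, as above
    std::uniform_int_distribution<int> distX(0, w - static_cast<int>(slant) - 1);
    std::uniform_int_distribution<int> distY(0, h - dropHeight - 1);
    const float slantPerRow = slant / dropHeight;                // horizontal shift per streak row
    for (int i = 0; i < numDrops; i++)
    {
        const int xStart = distX(rng);
        const int yStart = distY(rng);
        for (int j = 0; j < dropHeight; j++)                     // draw one slanted streak
        {
            const int x = xStart + static_cast<int>(j * slantPerRow);
            const int y = yStart + j;
            if (x >= 0 && x + dropWidth <= w && y < h)
                std::memset(&layer[static_cast<size_t>(y) * w + x], 200, dropWidth); // 200 ~ RAIN_INTENSITY_8U
        }
    }
    return layer;
}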
diff --git a/src/modules/rppt_tensor_effects_augmentations.cpp b/src/modules/rppt_tensor_effects_augmentations.cpp index d5862c846..ee40f9a31 100644 --- a/src/modules/rppt_tensor_effects_augmentations.cpp +++ b/src/modules/rppt_tensor_effects_augmentations.cpp @@ -1252,6 +1252,91 @@ RppStatus rppt_fog_host(RppPtr_t srcPtr, return RPP_SUCCESS; } +/******************** rain ********************/ + +RppStatus rppt_rain_host(RppPtr_t srcPtr, + RpptDescPtr srcDescPtr, + RppPtr_t dstPtr, + RpptDescPtr dstDescPtr, + Rpp32f rainPercentage, + Rpp32u rainWidth, + Rpp32u rainHeight, + Rpp32f slantAngle, + Rpp32f *alpha, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + rppHandle_t rppHandle) +{ + RppLayoutParams layoutParams = get_layout_params(srcDescPtr->layout, srcDescPtr->c); + + if ((srcDescPtr->dataType == RpptDataType::U8) && (dstDescPtr->dataType == RpptDataType::U8)) + { + rain_u8_u8_host_tensor(static_cast<Rpp8u*>(srcPtr) + srcDescPtr->offsetInBytes, + srcDescPtr, + static_cast<Rpp8u*>(dstPtr) + dstDescPtr->offsetInBytes, + dstDescPtr, + rainPercentage, + rainWidth, + rainHeight, + slantAngle, + alpha, + roiTensorPtrSrc, + roiType, + layoutParams, + rpp::deref(rppHandle)); + } + else if ((srcDescPtr->dataType == RpptDataType::F16) && (dstDescPtr->dataType == RpptDataType::F16)) + { + rain_f16_f16_host_tensor(reinterpret_cast<Rpp16f*>(static_cast<Rpp8u*>(srcPtr) + srcDescPtr->offsetInBytes), + srcDescPtr, + reinterpret_cast<Rpp16f*>(static_cast<Rpp8u*>(dstPtr) + dstDescPtr->offsetInBytes), + dstDescPtr, + rainPercentage, + rainWidth, + rainHeight, + slantAngle, + alpha, + roiTensorPtrSrc, + roiType, + layoutParams, + rpp::deref(rppHandle)); + } + else if ((srcDescPtr->dataType == RpptDataType::F32) && (dstDescPtr->dataType == RpptDataType::F32)) + { + rain_f32_f32_host_tensor(reinterpret_cast<Rpp32f*>(static_cast<Rpp8u*>(srcPtr) + srcDescPtr->offsetInBytes), + srcDescPtr, + reinterpret_cast<Rpp32f*>(static_cast<Rpp8u*>(dstPtr) + dstDescPtr->offsetInBytes), + dstDescPtr, + rainPercentage, + rainWidth, + rainHeight, + slantAngle, + alpha, + roiTensorPtrSrc, + roiType, + layoutParams, + rpp::deref(rppHandle)); + } + else if ((srcDescPtr->dataType == RpptDataType::I8) && (dstDescPtr->dataType == RpptDataType::I8)) + { + rain_i8_i8_host_tensor(static_cast<Rpp8s*>(srcPtr) + srcDescPtr->offsetInBytes, + srcDescPtr, + static_cast<Rpp8s*>(dstPtr) + dstDescPtr->offsetInBytes, + dstDescPtr, + rainPercentage, + rainWidth, + rainHeight, + slantAngle, + alpha, + roiTensorPtrSrc, + roiType, + layoutParams, + rpp::deref(rppHandle)); + } + + return RPP_SUCCESS; +} + /********************************************************************************************************************/ /*********************************************** RPP_GPU_SUPPORT = ON ***********************************************/ /********************************************************************************************************************/ @@ -2572,4 +2657,87 @@ RppStatus rppt_fog_gpu(RppPtr_t srcPtr, #endif // backend } +/******************** rain ********************/ + +RppStatus rppt_rain_gpu(RppPtr_t srcPtr, + RpptDescPtr srcDescPtr, + RppPtr_t dstPtr, + RpptDescPtr dstDescPtr, + Rpp32f rainPercentage, + Rpp32u rainWidth, + Rpp32u rainHeight, + Rpp32f slantAngle, + Rpp32f *alpha, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + rppHandle_t rppHandle) +{ +#ifdef HIP_COMPILE + if ((srcDescPtr->dataType == RpptDataType::U8) && (dstDescPtr->dataType == RpptDataType::U8)) + { + hip_exec_rain_tensor(static_cast<Rpp8u*>(srcPtr) + srcDescPtr->offsetInBytes, + srcDescPtr, + static_cast<Rpp8u*>(dstPtr) + dstDescPtr->offsetInBytes, + dstDescPtr, 
+ rainPercentage, + rainWidth, + rainHeight, + slantAngle, + alpha, + roiTensorPtrSrc, + roiType, + rpp::deref(rppHandle)); + } + else if ((srcDescPtr->dataType == RpptDataType::F16) && (dstDescPtr->dataType == RpptDataType::F16)) + { + hip_exec_rain_tensor(reinterpret_cast<half*>(static_cast<Rpp8u*>(srcPtr) + srcDescPtr->offsetInBytes), + srcDescPtr, + reinterpret_cast<half*>(static_cast<Rpp8u*>(dstPtr) + dstDescPtr->offsetInBytes), + dstDescPtr, + rainPercentage, + rainWidth, + rainHeight, + slantAngle, + alpha, + roiTensorPtrSrc, + roiType, + rpp::deref(rppHandle)); + } + else if ((srcDescPtr->dataType == RpptDataType::F32) && (dstDescPtr->dataType == RpptDataType::F32)) + { + hip_exec_rain_tensor(reinterpret_cast<Rpp32f*>(static_cast<Rpp8u*>(srcPtr) + srcDescPtr->offsetInBytes), + srcDescPtr, + reinterpret_cast<Rpp32f*>(static_cast<Rpp8u*>(dstPtr) + dstDescPtr->offsetInBytes), + dstDescPtr, + rainPercentage, + rainWidth, + rainHeight, + slantAngle, + alpha, + roiTensorPtrSrc, + roiType, + rpp::deref(rppHandle)); + } + else if ((srcDescPtr->dataType == RpptDataType::I8) && (dstDescPtr->dataType == RpptDataType::I8)) + { + hip_exec_rain_tensor(static_cast<Rpp8s*>(srcPtr) + srcDescPtr->offsetInBytes, + srcDescPtr, + static_cast<Rpp8s*>(dstPtr) + dstDescPtr->offsetInBytes, + dstDescPtr, + rainPercentage, + rainWidth, + rainHeight, + slantAngle, + alpha, + roiTensorPtrSrc, + roiType, + rpp::deref(rppHandle)); + } + + return RPP_SUCCESS; +#elif defined(OCL_COMPILE) + return RPP_ERROR_NOT_IMPLEMENTED; +#endif // backend +} + #endif // GPU_SUPPORT
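Because hip_exec_rain_tensor seeds its generator from std::random_device on every call, two identical rppt_rain_* calls produce different rain patterns; that is why the test-suite changes below register case 11 under randomOutputCase and nonQACaseList rather than golden-output QA. A reproducible variant would simply pin the seed; the following is a hypothetical illustration, not an RPP API:

#include <random>

// Fixed seed -> repeatable rain layer; std::random_device seed -> behavior of rain.hpp above.
inline std::mt19937 make_rain_rng(bool deterministic)
{
    return deterministic ? std::mt19937(0x5EED)
                         : std::mt19937(std::random_device{}());
}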
diff --git a/utilities/test_suite/HIP/Tensor_image_hip.cpp b/utilities/test_suite/HIP/Tensor_image_hip.cpp index 2f86282c5..1f50908ba 100644 --- a/utilities/test_suite/HIP/Tensor_image_hip.cpp +++ b/utilities/test_suite/HIP/Tensor_image_hip.cpp @@ -63,7 +63,7 @@ int main(int argc, char **argv) bool additionalParamCase = (testCase == 8 || testCase == 21 || testCase == 23|| testCase == 24 || testCase == 28 || testCase == 40 || testCase == 41 || testCase == 49 || testCase == 54 || testCase == 79); bool kernelSizeCase = (testCase == 40 || testCase == 41 || testCase == 49 || testCase == 54); bool dualInputCase = (testCase == 2 || testCase == 30 || testCase == 33 || testCase == 61 || testCase == 63 || testCase == 65 || testCase == 68); - bool randomOutputCase = (testCase == 6 || testCase == 8 || testCase == 10 || testCase == 84 || testCase == 49 || testCase == 54); + bool randomOutputCase = (testCase == 6 || testCase == 8 || testCase == 10 || testCase == 11 || testCase == 84 || testCase == 49 || testCase == 54); bool nonQACase = (testCase == 24 || testCase == 28 || testCase == 54); bool interpolationTypeCase = (testCase == 21 || testCase == 23 || testCase == 24|| testCase == 28 || testCase == 79); bool reductionTypeCase = (testCase == 87 || testCase == 88 || testCase == 89 || testCase == 90 || testCase == 91); @@ -427,11 +427,15 @@ int main(int argc, char **argv) void *d_interDstPtr; if(testCase == 5) CHECK_RETURN_STATUS(hipHostMalloc(&d_interDstPtr, srcDescPtr->strides.nStride * srcDescPtr->n * sizeof(Rpp32f))); - + Rpp32f *perspectiveTensorPtr = NULL; if(testCase == 28) CHECK_RETURN_STATUS(hipHostMalloc(&perspectiveTensorPtr, batchSize * 9 * sizeof(Rpp32f))); + Rpp32f *alpha = nullptr; + if(testCase == 11) + CHECK_RETURN_STATUS(hipHostMalloc(&alpha, batchSize * sizeof(Rpp32f))); + Rpp32f *minTensor = nullptr, *maxTensor = nullptr; if(testCase == 15) { @@ -710,6 +714,25 @@ int main(int argc, char **argv) break; } + case 11: + { + testCaseName = "rain"; + + Rpp32f rainPercentage = 7; + Rpp32u rainHeight = 6; + Rpp32u rainWidth = 1; + Rpp32f slantAngle = 0; + for (int i = 0; i < batchSize; i++) + alpha[i] = 0.4; + + startWallTime = omp_get_wtime(); + if (inputBitDepth == 0 || inputBitDepth == 1 || inputBitDepth == 2 || inputBitDepth == 5) + rppt_rain_gpu(d_input, srcDescPtr, d_output, dstDescPtr, rainPercentage, rainWidth, rainHeight, slantAngle, alpha, roiTensorPtrSrc, roiTypeSrc, handle); + else + missingFuncFlag = 1; + + break; + } case 13: { testCaseName = "exposure"; @@ -1690,7 +1713,7 @@ int main(int argc, char **argv) // Calculate exact dstROI in XYWH format for OpenCV dump if (roiTypeSrc == RpptRoiType::LTRB) convert_roi(roiTensorPtrDst, RpptRoiType::XYWH, dstDescPtr->n); - + // Check if the ROI values for each input is within the bounds of the max buffer allocated RpptROI roiDefault; RpptROIPtr roiPtrDefault = &roiDefault; @@ -1791,6 +1814,8 @@ CHECK_RETURN_STATUS(hipFree(d_output)); if(testCase == 5) CHECK_RETURN_STATUS(hipFree(d_interDstPtr)); + if(alpha != NULL) + CHECK_RETURN_STATUS(hipHostFree(alpha)); if (minTensor != nullptr) CHECK_RETURN_STATUS(hipHostFree(minTensor)); if (maxTensor != nullptr) diff --git a/utilities/test_suite/HIP/runImageTests.py b/utilities/test_suite/HIP/runImageTests.py index 13a5d4c58..0e15f1eac 100644 --- a/utilities/test_suite/HIP/runImageTests.py +++ b/utilities/test_suite/HIP/runImageTests.py @@ -272,7 +272,7 @@ def rpp_test_suite_parser_and_validator(): subprocess.call(["make", "-j16"], cwd=".") # nosec # List of cases supported -supportedCaseList = ['0', '1', '2', '4', '5', '6', '8', '10', '13', '15', '20', '21', '23', '24', '26', '28', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '45', '46', '49', '54', '61', '63', '65', '68', '70', '79', '80', '82', '83', '84', '85', '86', '87', '88', '89', '90', '91', '92'] +supportedCaseList = ['0', '1', '2', '4', '5', '6', '8', '10', '11', '13', '15', '20', '21', '23', '24', '26', '28', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '45', '46', '49', '54', '61', '63', '65', '68', '70', '79', '80', '82', '83', '84', '85', '86', '87', '88', '89', '90', '91', '92'] # Create folders based on testType and profilingOption if testType == 1 and profilingOption == "YES": @@ -478,7 +478,7 @@ def rpp_test_suite_parser_and_validator(): print_performance_tests_summary(logFile, functionalityGroupList, numRuns) # print the results of qa tests -nonQACaseList = ['6', '8', '10', '24', '28', '54', '84'] # Add cases present in supportedCaseList, but without QA support +nonQACaseList = ['6', '8', '10', '11', '24', '28', '54', '84'] # Add cases present in supportedCaseList, but without QA support if qaMode and testType == 0: qaFilePath = os.path.join(outFilePath, "QA_results.txt") diff --git a/utilities/test_suite/HOST/Tensor_image_host.cpp b/utilities/test_suite/HOST/Tensor_image_host.cpp index bed1ba467..8914e576c 100644 --- a/utilities/test_suite/HOST/Tensor_image_host.cpp +++ b/utilities/test_suite/HOST/Tensor_image_host.cpp @@ -64,8 +64,8 @@ int main(int argc, char **argv) bool kernelSizeCase = (testCase == 49 || testCase == 54); bool dualInputCase = (testCase == 2 || testCase == 30 || testCase == 33 || testCase == 61 || testCase == 63 || testCase == 65 || testCase == 68); bool randomOutputCase = (testCase == 6 || testCase == 8 || testCase == 10 || testCase == 84); - bool nonQACase = (testCase == 24 || testCase == 28); - bool interpolationTypeCase = (testCase == 21 || testCase == 23 || testCase == 24 || testCase == 28 || testCase == 79); + bool nonQACase = (testCase == 24); + bool 
interpolationTypeCase = (testCase == 21 || testCase == 23 || testCase == 24 || testCase == 79); bool reductionTypeCase = (testCase == 87 || testCase == 88 || testCase == 89 || testCase == 90 || testCase == 91); bool noiseTypeCase = (testCase == 8); bool pln1OutTypeCase = (testCase == 86); @@ -655,6 +655,27 @@ int main(int argc, char **argv) break; } + case 11: + { + testCaseName = "rain"; + + Rpp32f rainPercentage = 7; + Rpp32u rainHeight = 6; + Rpp32u rainWidth = 1; + Rpp32f slantAngle = 0; + Rpp32f alpha[batchSize]; + for (int i = 0; i < batchSize; i++) + alpha[i] = 0.4; + + startWallTime = omp_get_wtime(); + startCpuTime = clock(); + if (inputBitDepth == 0 || inputBitDepth == 1 || inputBitDepth == 2 || inputBitDepth == 5) + rppt_rain_host(input, srcDescPtr, output, dstDescPtr, rainPercentage, rainWidth, rainHeight, slantAngle, alpha, roiTensorPtrSrc, roiTypeSrc, handle); + else + missingFuncFlag = 1; + + break; + } case 13: { testCaseName = "exposure"; diff --git a/utilities/test_suite/HOST/runImageTests.py b/utilities/test_suite/HOST/runImageTests.py index 9b14af45b..b50f20819 100644 --- a/utilities/test_suite/HOST/runImageTests.py +++ b/utilities/test_suite/HOST/runImageTests.py @@ -260,7 +260,7 @@ def rpp_test_suite_parser_and_validator(): subprocess.call(["make", "-j16"], cwd=".") # nosec # List of cases supported -supportedCaseList = ['0', '1', '2', '4', '5', '6', '8', '10', '13', '15', '20', '21', '23', '24', '26', '28', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '45', '46', '49', '54', '61', '63', '65', '68', '70', '79', '80', '81', '82', '83', '84', '85', '86', '87', '88', '89', '90', '91', '92'] +supportedCaseList = ['0', '1', '2', '4', '5', '6', '8', '10', '11', '13', '15', '20', '21', '23', '24', '26', '28', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '45', '46', '49', '54', '61', '63', '65', '68', '70', '79', '80', '81', '82', '83', '84', '85', '86', '87', '88', '89', '90', '91', '92'] if testType == 0: noCaseSupported = all(case not in supportedCaseList for case in caseList) @@ -314,7 +314,7 @@ def rpp_test_suite_parser_and_validator(): run_performance_test(loggingFolder, logFileLayout, srcPath1, srcPath2, dstPath, case, numRuns, testType, layout, qaMode, decoderType, batchSize, roiList) # print the results of qa tests -nonQACaseList = ['6', '8', '10', '24', '28', '54', '84'] # Add cases present in supportedCaseList, but without QA support +nonQACaseList = ['6', '8', '10', '11', '24', '28', '54', '84'] # Add cases present in supportedCaseList, but without QA support if qaMode and testType == 0: qaFilePath = os.path.join(outFilePath, "QA_results.txt") diff --git a/utilities/test_suite/common.py b/utilities/test_suite/common.py index 83a0df959..ab896c191 100644 --- a/utilities/test_suite/common.py +++ b/utilities/test_suite/common.py @@ -44,6 +44,7 @@ 6: ["jitter", "HOST", "HIP"], 8: ["noise", "HOST", "HIP"], 10: ["fog", "HOST", "HIP"], + 11: ["rain", "HOST", "HIP"], 13: ["exposure", "HOST", "HIP"], 15: ["threshold", "HOST", "HIP"], 20: ["flip", "HOST", "HIP"], @@ -117,7 +118,7 @@ ImageAugmentationGroupMap = { "color_augmentations" : [0, 1, 2, 3, 4, 13, 31, 34, 36, 45, 81], - "effects_augmentations" : [5, 6, 8, 10, 29, 30, 32, 35, 46, 82, 83, 84], + "effects_augmentations" : [5, 6, 8, 10, 11, 29, 30, 32, 35, 46, 82, 83, 84], "geometric_augmentations" : [20, 21, 23, 24, 26, 28, 33, 37, 38, 39, 63, 79, 80, 92], "filter_augmentations" : [49, 54], "arithmetic_operations" : [61], diff --git a/utilities/test_suite/rpp_test_suite_image.h 
b/utilities/test_suite/rpp_test_suite_image.h index 457f3e8de..48d0dd778 100644 --- a/utilities/test_suite/rpp_test_suite_image.h +++ b/utilities/test_suite/rpp_test_suite_image.h @@ -72,6 +72,7 @@ std::map<int, string> augmentationMap = {6, "jitter"}, {8, "noise"}, {10, "fog"}, + {11, "rain"}, {13, "exposure"}, {15, "threshold"}, {20, "flip"},