diff --git a/CHANGELOG.md b/CHANGELOG.md
index f1c66e8d2..00fa320ee 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -8,6 +8,7 @@ Full documentation for RPP is available at [https://rocm.docs.amd.com/projects/r
* RPP Tensor Gaussian Filter support on HOST
* RPP Fog augmentation on HOST and HIP
+* RPP Rain augmentation on HOST and HIP
* RPP Warp Perspective on HOST and HIP
* RPP Threshold on HOST and HIP
diff --git a/docs/data/doxygenOutputs/effects_augmentations_rain_img640x480.png b/docs/data/doxygenOutputs/effects_augmentations_rain_img640x480.png
new file mode 100644
index 000000000..1ac729ff9
Binary files /dev/null and b/docs/data/doxygenOutputs/effects_augmentations_rain_img640x480.png differ
diff --git a/include/rppt_tensor_effects_augmentations.h b/include/rppt_tensor_effects_augmentations.h
index 4fb4c0fe8..bfab33a71 100644
--- a/include/rppt_tensor_effects_augmentations.h
+++ b/include/rppt_tensor_effects_augmentations.h
@@ -645,6 +645,60 @@ RppStatus rppt_glitch_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dst
RppStatus rppt_glitch_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, RpptChannelOffsets *rgbOffsets, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle);
#endif // GPU_SUPPORT
+/*! \brief Rain augmentation on HOST backend for a NCHW/NHWC layout tensor
+ * \details The rain augmentation simulates a rain effect for a batch of RGB (3-channel) / greyscale (1-channel) images with an NHWC/NCHW tensor layout.
+ * NOTE: This augmentation gives a more realistic rain output when all images in a batch are of similar or identical sizes.
+ * - srcPtr depth ranges - Rpp8u (0 to 255), Rpp16f (0 to 1), Rpp32f (0 to 1), Rpp8s (-128 to 127).
+ * - dstPtr depth ranges - Will be the same depth as srcPtr.
+ * \image html img640x480.png Sample Input
+ * \image html effects_augmentations_rain_img640x480.png Sample Output
+ * \param [in] srcPtr source tensor in HOST memory
+ * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3)
+ * \param [out] dstPtr destination tensor in HOST memory
+ * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr)
+ * \param [in] rainPercentage The percentage of the rain effect to be applied (0 <= rainPercentage <= 100)
+ * \param [in] rainWidth Width of the rain drops in pixels, to be tuned by the user depending on the image size.
+ * \param [in] rainHeight Height of the rain drops in pixels, to be tuned by the user depending on the image size.
+ * \param [in] slantAngle Slant angle of the rain drops (positive value for a right slant, negative for a left slant). A single Rpp32f representing the slant of the raindrops in degrees.
+ * Values range from [-90, 90], where -90 represents an extreme left slant, 0 is vertical, and 90 is an extreme right slant.
+ * \param [in] alpha An array of alpha-blending values used to blend the rainLayer and the input image for each image in the batch (0 <= alpha <= 1 for each image in the batch).
+ * \param [in] roiTensorPtrSrc ROI data for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
+ * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize()
+ * \return A \ref RppStatus enumeration.
+ * \retval RPP_SUCCESS Successful completion.
+ * \retval RPP_ERROR* Unsuccessful completion.
+ */
+RppStatus rppt_rain_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, Rpp32f rainPercentage, Rpp32u rainWidth, Rpp32u rainHeight, Rpp32f slantAngle, Rpp32f *alpha, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle);
+
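+// A minimal usage sketch for rppt_rain_host (illustrative only; descriptor setup, buffer
+// allocation, ROI setup, and handle creation are assumed to follow the usual RPP tensor-API
+// conventions, and all parameter values below are hypothetical):
+//     Rpp32f alpha[batchSize];
+//     std::fill_n(alpha, batchSize, 0.4f);                 // one blend factor per image, 0 <= alpha <= 1
+//     rppt_rain_host(srcPtr, srcDescPtr, dstPtr, dstDescPtr,
+//                    60.0f,                                // rainPercentage (0 to 100)
+//                    1, 12,                                // rainWidth, rainHeight in pixels
+//                    10.0f,                                // slantAngle in degrees (slight right slant)
+//                    alpha, roiTensorPtrSrc, RpptRoiType::XYWH, handle);
+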
+#ifdef GPU_SUPPORT
+/*! \brief Rain augmentation on HIP backend for a NCHW/NHWC layout tensor
+ * \details The rain augmentation simulates a rain effect for a batch of RGB (3-channel) / greyscale (1-channel) images with an NHWC/NCHW tensor layout.
+ * NOTE: This augmentation gives a more realistic rain output when all images in a batch are of similar or identical sizes.
+ * - srcPtr depth ranges - Rpp8u (0 to 255), Rpp16f (0 to 1), Rpp32f (0 to 1), Rpp8s (-128 to 127).
+ * - dstPtr depth ranges - Will be the same depth as srcPtr.
+ * \image html img640x480.png Sample Input
+ * \image html effects_augmentations_rain_img640x480.png Sample Output
+ * \param [in] srcPtr source tensor in HIP memory
+ * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3)
+ * \param [out] dstPtr destination tensor in HIP memory
+ * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr)
+ * \param [in] rainPercentage The percentage of the rain effect to be applied (0 <= rainPercentage <= 100)
+ * \param [in] rainWidth Width of the rain drops in pixels, to be tuned by the user depending on the image size.
+ * \param [in] rainHeight Height of the rain drops in pixels, to be tuned by the user depending on the image size.
+ * \param [in] slantAngle Slant angle of the rain drops (positive value for a right slant, negative for a left slant). A single Rpp32f representing the slant of the raindrops in degrees.
+ * Values range from [-90, 90], where -90 represents an extreme left slant, 0 is vertical, and 90 is an extreme right slant.
+ * \param [in] alpha An array of alpha-blending values in pinned / HIP memory, used to blend the rainLayer and the input image for each image in the batch (0 <= alpha <= 1 for each image in the batch).
+ * \param [in] roiTensorPtrSrc ROI data for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
+ * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize()
+ * \return A \ref RppStatus enumeration.
+ * \retval RPP_SUCCESS Successful completion.
+ * \retval RPP_ERROR* Unsuccessful completion.
+ */
+RppStatus rppt_rain_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, Rpp32f rainPercentage, Rpp32u rainWidth, Rpp32u rainHeight, Rpp32f slantAngle, Rpp32f *alpha, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle);
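+
+// A minimal usage sketch for rppt_rain_gpu (illustrative only; assumes the same descriptor and
+// ROI setup as the HOST variant, with src/dst tensors already in HIP device memory, and uses
+// pinned host memory for the per-image alpha array as required above):
+//     Rpp32f *alpha;
+//     hipHostMalloc((void **)&alpha, batchSize * sizeof(Rpp32f));  // pinned memory
+//     for (int i = 0; i < batchSize; i++) alpha[i] = 0.4f;
+//     rppt_rain_gpu(srcPtr, srcDescPtr, dstPtr, dstDescPtr, 60.0f, 1, 12, 10.0f,
+//                   alpha, roiTensorPtrSrc, RpptRoiType::XYWH, handle);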
+#endif // GPU_SUPPORT
+
/*! \brief Pixelate augmentation on HOST backend for a NCHW/NHWC layout tensor
* \details The pixelate augmentation performs a pixelate transformation for a batch of RGB(3 channel) / greyscale(1 channel) images with an NHWC/NCHW tensor layout.
* - srcPtr depth ranges - Rpp8u (0 to 255), Rpp16f (0 to 1), Rpp32f (0 to 1), Rpp8s (-128 to 127).
diff --git a/src/include/cpu/rpp_cpu_common.hpp b/src/include/cpu/rpp_cpu_common.hpp
index aaca7090a..abb531001 100644
--- a/src/include/cpu/rpp_cpu_common.hpp
+++ b/src/include/cpu/rpp_cpu_common.hpp
@@ -6630,6 +6630,31 @@ inline void compute_transpose4x8_avx(__m256 *pSrc, __m128 *pDst)
pDst[7] = _mm256_extractf128_ps(pSrc[3], 1); /* extract [P08|P16|P24|P32] */
}
+inline void compute_rain_48_host(__m256 *p1, __m256 *p2, __m256 &pMul)
+{
+ p1[0] = _mm256_fmadd_ps(_mm256_sub_ps(p2[0], p1[0]), pMul, p1[0]); // alpha-blending adjustment
+ p1[1] = _mm256_fmadd_ps(_mm256_sub_ps(p2[1], p1[1]), pMul, p1[1]); // alpha-blending adjustment
+ p1[2] = _mm256_fmadd_ps(_mm256_sub_ps(p2[0], p1[2]), pMul, p1[2]); // alpha-blending adjustment
+ p1[3] = _mm256_fmadd_ps(_mm256_sub_ps(p2[1], p1[3]), pMul, p1[3]); // alpha-blending adjustment
+ p1[4] = _mm256_fmadd_ps(_mm256_sub_ps(p2[0], p1[4]), pMul, p1[4]); // alpha-blending adjustment
+ p1[5] = _mm256_fmadd_ps(_mm256_sub_ps(p2[1], p1[5]), pMul, p1[5]); // alpha-blending adjustment
+}
+
+inline void compute_rain_32_host(__m256 *p1, __m256 *p2, __m256 &pMul)
+{
+ p1[0] = _mm256_fmadd_ps(_mm256_sub_ps(p2[0], p1[0]), pMul, p1[0]); // alpha-blending adjustment
+ p1[1] = _mm256_fmadd_ps(_mm256_sub_ps(p2[1], p1[1]), pMul, p1[1]); // alpha-blending adjustment
+ p1[2] = _mm256_fmadd_ps(_mm256_sub_ps(p2[2], p1[2]), pMul, p1[2]); // alpha-blending adjustment
+ p1[3] = _mm256_fmadd_ps(_mm256_sub_ps(p2[3], p1[3]), pMul, p1[3]); // alpha-blending adjustment
+}
+
+inline void compute_rain_24_host(__m256 *p1, __m256 p2, __m256 &pMul)
+{
+ p1[0] = _mm256_fmadd_ps(_mm256_sub_ps(p2, p1[0]), pMul, p1[0]); // alpha-blending adjustment
+ p1[1] = _mm256_fmadd_ps(_mm256_sub_ps(p2, p1[1]), pMul, p1[1]); // alpha-blending adjustment
+ p1[2] = _mm256_fmadd_ps(_mm256_sub_ps(p2, p1[2]), pMul, p1[2]); // alpha-blending adjustment
+}
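+
+// Scalar reference for the SIMD helpers above (a sketch for clarity, not called by the
+// vector paths): each lane computes dst = src + (rain - src) * alpha, i.e. the standard
+// convex alpha blend dst = src * (1 - alpha) + rain * alpha, matching the _mm256_fmadd_ps lane math.
+inline Rpp32f compute_rain_1_host(Rpp32f srcVal, Rpp32f rainVal, Rpp32f alpha)
+{
+ return (rainVal - srcVal) * alpha + srcVal; // alpha-blending adjustment for a single value
+}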
+
// Compute hanning window
inline RPP_HOST_DEVICE void hann_window(Rpp32f *output, Rpp32s windowSize)
{
diff --git a/src/modules/cpu/host_tensor_effects_augmentations.hpp b/src/modules/cpu/host_tensor_effects_augmentations.hpp
index 968a6cd11..636510e17 100644
--- a/src/modules/cpu/host_tensor_effects_augmentations.hpp
+++ b/src/modules/cpu/host_tensor_effects_augmentations.hpp
@@ -39,5 +39,6 @@ SOFTWARE.
#include "kernel/resize.hpp" //pixelate dependency
#include "kernel/erase.hpp"
#include "kernel/fog.hpp"
+#include "kernel/rain.hpp"
#endif // HOST_TENSOR_EFFECTS_AUGMENTATIONS_HPP
diff --git a/src/modules/cpu/kernel/rain.hpp b/src/modules/cpu/kernel/rain.hpp
new file mode 100644
index 000000000..45b1a4f42
--- /dev/null
+++ b/src/modules/cpu/kernel/rain.hpp
@@ -0,0 +1,1204 @@
+/*
+MIT License
+
+Copyright (c) 2019 - 2024 Advanced Micro Devices, Inc.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+*/
+
+#include "rppdefs.h"
+#include "rpp_cpu_simd.hpp"
+#include "rpp_cpu_common.hpp"
+
+// Constants to represent the rain intensity for different data types
+#define RAIN_INTENSITY_8U 200 // Intensity value for Rpp8u
+#define RAIN_INTENSITY_8S 72 // Intensity value for Rpp8s
+#define RAIN_INTENSITY_FLOAT (200 * ONE_OVER_255) // Intensity value for Rpp32f and Rpp16f
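+// Note: RAIN_INTENSITY_8S is 200 - 128 = 72, the u8 intensity shifted into the signed i8 range,
+// and RAIN_INTENSITY_FLOAT evaluates to 200/255 ~= 0.784, the same level rescaled to [0, 1].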
+
+template <typename T>
+inline void create_rain_layer(T *rainLayer, Rpp32f rainPercentage, RpptDescPtr srcDescPtr, Rpp32f slantAngle, Rpp32u dropLength, Rpp32u rainWidth)
+{
+ Rpp32f rainPercent = rainPercentage * 0.004f; // Scaling factor to convert percentage to a range suitable for rain effect intensity
+ Rpp32u numDrops = static_cast<Rpp32u>(rainPercent * srcDescPtr->h * srcDescPtr->w);
+ Rpp32f slant = sin(slantAngle) * dropLength;
+
+ // Seed the random number generator and set up the uniform distributions
+ std::mt19937 rng(std::random_device{}());
+ std::uniform_int_distribution<Rpp32u> distX(0, srcDescPtr->w - slant - 1);
+ std::uniform_int_distribution<Rpp32u> distY(0, srcDescPtr->h - dropLength - 1);
+
+ // Choose the rain intensity value based on the data type
+ T rainValue = std::is_same<T, Rpp8u>::value ? static_cast<T>(RAIN_INTENSITY_8U) :
+ std::is_same<T, Rpp8s>::value ? static_cast<T>(RAIN_INTENSITY_8S) :
+ static_cast<T>(RAIN_INTENSITY_FLOAT);
+ Rpp32f slantPerDropLength = static_cast<Rpp32f>(slant) / dropLength;
+ for (Rpp32u i = 0; i < numDrops; i++)
+ {
+ Rpp32u xStart = distX(rng);
+ Rpp32u yStart = distY(rng);
+ for (Rpp32u j = 0; j < dropLength; j++)
+ {
+ Rpp32u x = xStart + j * slantPerDropLength;
+ Rpp32u y = yStart + j;
+
+ if ((x >= 0) && (x < srcDescPtr->w) && (y < srcDescPtr->h))
+ {
+ T *rainLayerTemp = rainLayer + y * srcDescPtr->w + x;
+ for (Rpp32u k = 0; k < rainWidth; k++)
+ rainLayerTemp[k] = rainValue;
+ }
+ }
+ }
+}
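+
+// Worked example of the drop-count math above (illustrative values): for a 640x480 image with
+// rainPercentage = 50, rainPercent = 50 * 0.004 = 0.2 and numDrops = 0.2 * 640 * 480 = 61440,
+// each drop drawn as a dropLength-pixel streak, rainWidth pixels wide, offset by slantPerDropLength per row.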
+
+RppStatus rain_u8_u8_host_tensor(Rpp8u *srcPtr,
+ RpptDescPtr srcDescPtr,
+ Rpp8u *dstPtr,
+ RpptDescPtr dstDescPtr,
+ Rpp32f rainPercentage,
+ Rpp32u rainWidth,
+ Rpp32u rainHeight,
+ Rpp32f slantAngle,
+ Rpp32f *alphaValues,
+ RpptROIPtr roiTensorPtrSrc,
+ RpptRoiType roiType,
+ RppLayoutParams layoutParams,
+ rpp::Handle& handle)
+{
+ RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h};
+ Rpp32u numThreads = handle.GetNumThreads();
+
+ Rpp8u *rainLayer = reinterpret_cast<Rpp8u *>(handle.GetInitHandle()->mem.mcpu.scratchBufferHost);
+ std::memset(rainLayer, 0, srcDescPtr->w * srcDescPtr->h * sizeof(Rpp8u));
+ create_rain_layer(rainLayer, rainPercentage, srcDescPtr, slantAngle, rainHeight, rainWidth);
+
+ omp_set_dynamic(0);
+#pragma omp parallel for num_threads(numThreads)
+ for (Rpp32u batchCount = 0; batchCount < dstDescPtr->n; batchCount++)
+ {
+ RpptROI roi;
+ RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount];
+ compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType);
+
+ Rpp8u *srcPtrImage = srcPtr + batchCount * srcDescPtr->strides.nStride;
+ Rpp8u *dstPtrImage = dstPtr + batchCount * dstDescPtr->strides.nStride;
+
+ Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier;
+ Rpp8u *srcPtrChannel, *dstPtrChannel;
+ srcPtrChannel = srcPtrImage + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier);
+ dstPtrChannel = dstPtrImage;
+
+ Rpp32f alpha = alphaValues[batchCount];
+#if __AVX2__
+ __m256 pMul = _mm256_set1_ps(alpha);
+#endif
+ Rpp32u vectorIncrement = 48;
+ Rpp32u vectorIncrementPerChannel = 16;
+ Rpp32u alignedLength = (bufferLength / vectorIncrement) * vectorIncrement;
+
+ // Rain with fused output-layout toggle (NHWC -> NCHW)
+ if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NCHW))
+ {
+ Rpp8u *srcPtr1Row, *srcPtr2Row, *dstPtrRowR, *dstPtrRowG, *dstPtrRowB;
+ srcPtr1Row = srcPtrChannel;
+ srcPtr2Row = rainLayer;
+ dstPtrRowR = dstPtrChannel;
+ dstPtrRowG = dstPtrRowR + dstDescPtr->strides.cStride;
+ dstPtrRowB = dstPtrRowG + dstDescPtr->strides.cStride;
+ for(Rpp32u i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp8u *srcPtr1Temp, *srcPtr2Temp, *dstPtrTempR, *dstPtrTempG, *dstPtrTempB;
+ srcPtr1Temp = srcPtr1Row;
+ srcPtr2Temp = srcPtr2Row;
+ dstPtrTempR = dstPtrRowR;
+ dstPtrTempG = dstPtrRowG;
+ dstPtrTempB = dstPtrRowB;
+ Rpp32u vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement)
+ {
+ __m256 p1[6], p2[2];
+ rpp_simd_load(rpp_load48_u8pkd3_to_f32pln3_avx, srcPtr1Temp, p1); // simd loads
+ rpp_simd_load(rpp_load16_u8_to_f32_avx, srcPtr2Temp, p2); // simd loads
+ compute_rain_48_host(p1, p2, pMul);
+ rpp_simd_store(rpp_store48_f32pln3_to_u8pln3_avx, dstPtrTempR, dstPtrTempG, dstPtrTempB, p1); // simd stores
+ srcPtr1Temp += vectorIncrement;
+ srcPtr2Temp += vectorIncrementPerChannel;
+ dstPtrTempR += vectorIncrementPerChannel;
+ dstPtrTempG += vectorIncrementPerChannel;
+ dstPtrTempB += vectorIncrementPerChannel;
+ }
+#endif
+ for (; vectorLoopCount < bufferLength; vectorLoopCount += 3)
+ {
+ *dstPtrTempR++ = static_cast<Rpp8u>(RPPPIXELCHECK(std::nearbyintf(static_cast<Rpp32f>((*srcPtr2Temp - srcPtr1Temp[0]) * alpha + srcPtr1Temp[0]))));
+ *dstPtrTempG++ = static_cast<Rpp8u>(RPPPIXELCHECK(std::nearbyintf(static_cast<Rpp32f>((*srcPtr2Temp - srcPtr1Temp[1]) * alpha + srcPtr1Temp[1]))));
+ *dstPtrTempB++ = static_cast<Rpp8u>(RPPPIXELCHECK(std::nearbyintf(static_cast<Rpp32f>((*srcPtr2Temp - srcPtr1Temp[2]) * alpha + srcPtr1Temp[2]))));
+ srcPtr1Temp += 3;
+ srcPtr2Temp++;
+ }
+ srcPtr1Row += srcDescPtr->strides.hStride;
+ srcPtr2Row += srcDescPtr->w;
+ dstPtrRowR += dstDescPtr->strides.hStride;
+ dstPtrRowG += dstDescPtr->strides.hStride;
+ dstPtrRowB += dstDescPtr->strides.hStride;
+ }
+ }
+ // Rain with fused output-layout toggle (NCHW -> NHWC)
+ else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NHWC))
+ {
+ Rpp8u *srcPtr1RowR, *srcPtr1RowG, *srcPtr1RowB, *srcPtr2Row, *dstPtrRow;
+ srcPtr1RowR = srcPtrChannel;
+ srcPtr1RowG = srcPtr1RowR + srcDescPtr->strides.cStride;
+ srcPtr1RowB = srcPtr1RowG + srcDescPtr->strides.cStride;
+ srcPtr2Row = rainLayer;
+ dstPtrRow = dstPtrChannel;
+ for(Rpp32u i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp8u *srcPtr1TempR, *srcPtr1TempG, *srcPtr1TempB, *srcPtr2Temp, *dstPtrTemp;
+ srcPtr1TempR = srcPtr1RowR;
+ srcPtr1TempG = srcPtr1RowG;
+ srcPtr1TempB = srcPtr1RowB;
+ srcPtr2Temp = srcPtr2Row;
+ dstPtrTemp = dstPtrRow;
+ Rpp32u vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel)
+ {
+ __m256 p1[6], p2[2];
+ rpp_simd_load(rpp_load48_u8pln3_to_f32pln3_avx, srcPtr1TempR, srcPtr1TempG, srcPtr1TempB, p1); // simd loads
+ rpp_simd_load(rpp_load16_u8_to_f32_avx, srcPtr2Temp, p2); // simd loads
+ compute_rain_48_host(p1, p2, pMul);
+ rpp_simd_store(rpp_store48_f32pln3_to_u8pkd3_avx, dstPtrTemp, p1); // simd stores
+ srcPtr1TempR += vectorIncrementPerChannel;
+ srcPtr1TempG += vectorIncrementPerChannel;
+ srcPtr1TempB += vectorIncrementPerChannel;
+ srcPtr2Temp += vectorIncrementPerChannel;
+ dstPtrTemp += vectorIncrement;
+ }
+#endif
+ for (; vectorLoopCount < bufferLength; vectorLoopCount++)
+ {
+ dstPtrTemp[0] = static_cast<Rpp8u>(RPPPIXELCHECK(std::nearbyintf(static_cast<Rpp32f>((*srcPtr2Temp - *srcPtr1TempR) * alpha + *srcPtr1TempR))));
+ dstPtrTemp[1] = static_cast<Rpp8u>(RPPPIXELCHECK(std::nearbyintf(static_cast<Rpp32f>((*srcPtr2Temp - *srcPtr1TempG) * alpha + *srcPtr1TempG))));
+ dstPtrTemp[2] = static_cast<Rpp8u>(RPPPIXELCHECK(std::nearbyintf(static_cast<Rpp32f>((*srcPtr2Temp - *srcPtr1TempB) * alpha + *srcPtr1TempB))));
+ dstPtrTemp += 3;
+ srcPtr2Temp++;
+ srcPtr1TempR++;
+ srcPtr1TempG++;
+ srcPtr1TempB++;
+ }
+ srcPtr1RowR += srcDescPtr->strides.hStride;
+ srcPtr1RowG += srcDescPtr->strides.hStride;
+ srcPtr1RowB += srcDescPtr->strides.hStride;
+ srcPtr2Row += srcDescPtr->strides.hStride;
+ dstPtrRow += dstDescPtr->strides.hStride;
+ }
+ }
+ // Rain without fused output-layout toggle (NHWC -> NHWC)
+ else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NHWC))
+ {
+ Rpp8u *srcPtr1Row, *srcPtr2Row, *dstPtrRow;
+ srcPtr1Row = srcPtrChannel;
+ srcPtr2Row = rainLayer;
+ dstPtrRow = dstPtrChannel;
+ for(Rpp32u i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp8u *srcPtr1Temp, *srcPtr2Temp, *dstPtrTemp;
+ srcPtr1Temp = srcPtr1Row;
+ srcPtr2Temp = srcPtr2Row;
+ dstPtrTemp = dstPtrRow;
+ Rpp32u vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement)
+ {
+ __m256 p1[6], p2[2];
+ rpp_simd_load(rpp_load48_u8pkd3_to_f32pln3_avx, srcPtr1Temp, p1); // simd loads
+ rpp_simd_load(rpp_load16_u8_to_f32_avx, srcPtr2Temp, p2); // simd loads
+ compute_rain_48_host(p1, p2, pMul);
+ rpp_simd_store(rpp_store48_f32pln3_to_u8pkd3_avx, dstPtrTemp, p1); // simd stores
+ srcPtr1Temp += vectorIncrement;
+ srcPtr2Temp += vectorIncrementPerChannel;
+ dstPtrTemp += vectorIncrement;
+ }
+#endif
+ for (; vectorLoopCount < bufferLength; vectorLoopCount += 3)
+ {
+ *dstPtrTemp++ = static_cast<Rpp8u>(RPPPIXELCHECK(std::nearbyintf(static_cast<Rpp32f>((*srcPtr2Temp - *srcPtr1Temp) * alpha + *srcPtr1Temp))));
+ *dstPtrTemp++ = static_cast<Rpp8u>(RPPPIXELCHECK(std::nearbyintf(static_cast<Rpp32f>((*srcPtr2Temp - *(srcPtr1Temp + 1)) * alpha + *(srcPtr1Temp + 1)))));
+ *dstPtrTemp++ = static_cast<Rpp8u>(RPPPIXELCHECK(std::nearbyintf(static_cast<Rpp32f>((*srcPtr2Temp - *(srcPtr1Temp + 2)) * alpha + *(srcPtr1Temp + 2)))));
+ srcPtr2Temp++;
+ srcPtr1Temp += 3;
+ }
+ srcPtr1Row += srcDescPtr->strides.hStride;
+ srcPtr2Row += srcDescPtr->w;
+ dstPtrRow += dstDescPtr->strides.hStride;
+ }
+ }
+ // Rain without fused output-layout toggle (NCHW -> NCHW)
+ else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NCHW))
+ {
+ Rpp8u *srcPtr1RowR, *srcPtr1RowG, *srcPtr1RowB, *srcPtr2Row, *dstPtrRowR, *dstPtrRowG, *dstPtrRowB;
+ srcPtr1RowR = srcPtrChannel;
+ srcPtr1RowG = srcPtr1RowR + srcDescPtr->strides.cStride;
+ srcPtr1RowB = srcPtr1RowG + srcDescPtr->strides.cStride;
+ srcPtr2Row = rainLayer;
+ dstPtrRowR = dstPtrChannel;
+ dstPtrRowG = dstPtrRowR + dstDescPtr->strides.cStride;
+ dstPtrRowB = dstPtrRowG + dstDescPtr->strides.cStride;
+ for(Rpp32u i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp8u *srcPtr1TempR, *srcPtr1TempG, *srcPtr1TempB, *srcPtr2Temp, *dstPtrTempR, *dstPtrTempG, *dstPtrTempB;
+ srcPtr1TempR = srcPtr1RowR;
+ srcPtr1TempG = srcPtr1RowG;
+ srcPtr1TempB = srcPtr1RowB;
+ srcPtr2Temp = srcPtr2Row;
+ dstPtrTempR = dstPtrRowR;
+ dstPtrTempG = dstPtrRowG;
+ dstPtrTempB = dstPtrRowB;
+ Rpp32u vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel)
+ {
+ __m256 p1[6], p2[2];
+ rpp_simd_load(rpp_load48_u8pln3_to_f32pln3_avx, srcPtr1TempR, srcPtr1TempG, srcPtr1TempB, p1); // simd loads
+ rpp_simd_load(rpp_load16_u8_to_f32_avx, srcPtr2Temp, p2); // simd loads
+ compute_rain_48_host(p1, p2, pMul);
+ rpp_simd_store(rpp_store48_f32pln3_to_u8pln3_avx, dstPtrTempR, dstPtrTempG, dstPtrTempB, p1); // simd stores
+ srcPtr1TempR += vectorIncrementPerChannel;
+ srcPtr1TempG += vectorIncrementPerChannel;
+ srcPtr1TempB += vectorIncrementPerChannel;
+ srcPtr2Temp += vectorIncrementPerChannel;
+ dstPtrTempR += vectorIncrementPerChannel;
+ dstPtrTempG += vectorIncrementPerChannel;
+ dstPtrTempB += vectorIncrementPerChannel;
+ }
+#endif
+ for (; vectorLoopCount < bufferLength; vectorLoopCount++)
+ {
+ *dstPtrTempR++ = static_cast<Rpp8u>(RPPPIXELCHECK(std::nearbyintf(static_cast<Rpp32f>((*srcPtr2Temp - *srcPtr1TempR) * alpha + *srcPtr1TempR))));
+ *dstPtrTempG++ = static_cast<Rpp8u>(RPPPIXELCHECK(std::nearbyintf(static_cast<Rpp32f>((*srcPtr2Temp - *srcPtr1TempG) * alpha + *srcPtr1TempG))));
+ *dstPtrTempB++ = static_cast<Rpp8u>(RPPPIXELCHECK(std::nearbyintf(static_cast<Rpp32f>((*srcPtr2Temp - *srcPtr1TempB) * alpha + *srcPtr1TempB))));
+ srcPtr2Temp++;
+ srcPtr1TempR++;
+ srcPtr1TempG++;
+ srcPtr1TempB++;
+ }
+ srcPtr1RowR += srcDescPtr->strides.hStride;
+ srcPtr1RowG += srcDescPtr->strides.hStride;
+ srcPtr1RowB += srcDescPtr->strides.hStride;
+ srcPtr2Row += srcDescPtr->strides.hStride;
+ dstPtrRowR += dstDescPtr->strides.hStride;
+ dstPtrRowG += dstDescPtr->strides.hStride;
+ dstPtrRowB += dstDescPtr->strides.hStride;
+ }
+ }
+ // Rain single channel without fused output-layout toggle (NCHW -> NCHW)
+ else if ((srcDescPtr->c == 1) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NCHW))
+ {
+ alignedLength = (bufferLength / vectorIncrementPerChannel) * vectorIncrementPerChannel;
+ Rpp8u *srcPtr1Row, *srcPtr2Row, *dstPtrRow;
+ srcPtr1Row = srcPtrChannel;
+ srcPtr2Row = rainLayer;
+ dstPtrRow = dstPtrChannel;
+ for(Rpp32u i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp8u *srcPtr1Temp, *srcPtr2Temp, *dstPtrTemp;
+ srcPtr1Temp = srcPtr1Row;
+ srcPtr2Temp = srcPtr2Row;
+ dstPtrTemp = dstPtrRow;
+ Rpp32u vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel)
+ {
+ __m256 p1[2], p2[2];
+ rpp_simd_load(rpp_load16_u8_to_f32_avx, srcPtr1Temp, p1); // simd loads
+ rpp_simd_load(rpp_load16_u8_to_f32_avx, srcPtr2Temp, p2); // simd loads
+ compute_rain_32_host(p1, p2, pMul);
+ rpp_simd_store(rpp_store16_f32_to_u8_avx, dstPtrTemp, p1); // simd stores
+ srcPtr1Temp += vectorIncrementPerChannel;
+ srcPtr2Temp += vectorIncrementPerChannel;
+ dstPtrTemp += vectorIncrementPerChannel;
+ }
+#endif
+ for (; vectorLoopCount < bufferLength; vectorLoopCount++)
+ {
+ *dstPtrTemp++ = static_cast<Rpp8u>(RPPPIXELCHECK(std::nearbyintf(static_cast<Rpp32f>((*srcPtr2Temp - *srcPtr1Temp) * alpha + *srcPtr1Temp))));
+ srcPtr1Temp++;
+ srcPtr2Temp++;
+ }
+ srcPtr1Row += srcDescPtr->strides.hStride;
+ srcPtr2Row += srcDescPtr->strides.hStride;
+ dstPtrRow += dstDescPtr->strides.hStride;
+ }
+ }
+ }
+ return RPP_SUCCESS;
+}
+
+RppStatus rain_f32_f32_host_tensor(Rpp32f *srcPtr,
+ RpptDescPtr srcDescPtr,
+ Rpp32f *dstPtr,
+ RpptDescPtr dstDescPtr,
+ Rpp32f rainPercentage,
+ Rpp32u rainWidth,
+ Rpp32u rainHeight,
+ Rpp32f slantAngle,
+ Rpp32f *alphaValues,
+ RpptROIPtr roiTensorPtrSrc,
+ RpptRoiType roiType,
+ RppLayoutParams layoutParams,
+ rpp::Handle& handle)
+{
+ RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h};
+ Rpp32u numThreads = handle.GetNumThreads();
+
+ Rpp32f *rainLayer = handle.GetInitHandle()->mem.mcpu.scratchBufferHost;
+ std::memset(rainLayer, 0, srcDescPtr->w * srcDescPtr->h * sizeof(Rpp32f));
+ create_rain_layer(rainLayer, rainPercentage, srcDescPtr, slantAngle, rainHeight, rainWidth);
+
+ omp_set_dynamic(0);
+#pragma omp parallel for num_threads(numThreads)
+ for (Rpp32u batchCount = 0; batchCount < dstDescPtr->n; batchCount++)
+ {
+ RpptROI roi;
+ RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount];
+ compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType);
+
+ Rpp32f *srcPtrImage = srcPtr + batchCount * srcDescPtr->strides.nStride;
+ Rpp32f *dstPtrImage = dstPtr + batchCount * dstDescPtr->strides.nStride;
+
+ Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier;
+ Rpp32f *srcPtrChannel, *dstPtrChannel;
+ srcPtrChannel = srcPtrImage + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier);
+ dstPtrChannel = dstPtrImage;
+
+ Rpp32f alpha = alphaValues[batchCount];
+#if __AVX2__
+ __m256 pMul = _mm256_set1_ps(alpha);
+#endif
+ Rpp32u vectorIncrement = 24;
+ Rpp32u vectorIncrementPerChannel = 8;
+ Rpp32u alignedLength = (bufferLength / vectorIncrement) * vectorIncrement;
+
+ // Rain with fused output-layout toggle (NHWC -> NCHW)
+ if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NCHW))
+ {
+ Rpp32f *srcPtr1Row, *srcPtr2Row, *dstPtrRowR, *dstPtrRowG, *dstPtrRowB;
+ srcPtr1Row = srcPtrChannel;
+ srcPtr2Row = rainLayer;
+ dstPtrRowR = dstPtrChannel;
+ dstPtrRowG = dstPtrRowR + dstDescPtr->strides.cStride;
+ dstPtrRowB = dstPtrRowG + dstDescPtr->strides.cStride;
+ for(Rpp32u i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp32f *srcPtr1Temp, *srcPtr2Temp, *dstPtrTempR, *dstPtrTempG, *dstPtrTempB;
+ srcPtr1Temp = srcPtr1Row;
+ srcPtr2Temp = srcPtr2Row;
+ dstPtrTempR = dstPtrRowR;
+ dstPtrTempG = dstPtrRowG;
+ dstPtrTempB = dstPtrRowB;
+ Rpp32u vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement)
+ {
+ __m256 p1[3], p2;
+ rpp_simd_load(rpp_load24_f32pkd3_to_f32pln3_avx, srcPtr1Temp, p1); // simd loads
+ rpp_simd_load(rpp_load8_f32_to_f32_avx, srcPtr2Temp, &p2); // simd loads
+ compute_rain_24_host(p1, p2, pMul);
+ rpp_simd_store(rpp_store24_f32pln3_to_f32pln3_avx, dstPtrTempR, dstPtrTempG, dstPtrTempB, p1); // simd stores
+ srcPtr1Temp += vectorIncrement;
+ srcPtr2Temp += vectorIncrementPerChannel;
+ dstPtrTempR += vectorIncrementPerChannel;
+ dstPtrTempG += vectorIncrementPerChannel;
+ dstPtrTempB += vectorIncrementPerChannel;
+ }
+#endif
+ for (; vectorLoopCount < bufferLength; vectorLoopCount += 3)
+ {
+ *dstPtrTempR++ = RPPPIXELCHECKF32((*srcPtr2Temp - srcPtr1Temp[0]) * alpha + srcPtr1Temp[0]);
+ *dstPtrTempG++ = RPPPIXELCHECKF32((*srcPtr2Temp - srcPtr1Temp[1]) * alpha + srcPtr1Temp[1]);
+ *dstPtrTempB++ = RPPPIXELCHECKF32((*srcPtr2Temp - srcPtr1Temp[2]) * alpha + srcPtr1Temp[2]);
+ srcPtr1Temp += 3;
+ srcPtr2Temp++;
+ }
+ srcPtr1Row += srcDescPtr->strides.hStride;
+ srcPtr2Row += srcDescPtr->w;
+ dstPtrRowR += dstDescPtr->strides.hStride;
+ dstPtrRowG += dstDescPtr->strides.hStride;
+ dstPtrRowB += dstDescPtr->strides.hStride;
+ }
+ }
+ // Rain with fused output-layout toggle (NCHW -> NHWC)
+ else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NHWC))
+ {
+ Rpp32f *srcPtr1RowR, *srcPtr1RowG, *srcPtr1RowB, *srcPtr2Row, *dstPtrRow;
+ srcPtr1RowR = srcPtrChannel;
+ srcPtr1RowG = srcPtr1RowR + srcDescPtr->strides.cStride;
+ srcPtr1RowB = srcPtr1RowG + srcDescPtr->strides.cStride;
+ srcPtr2Row = rainLayer;
+ dstPtrRow = dstPtrChannel;
+ for(Rpp32u i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp32f *srcPtr1TempR, *srcPtr1TempG, *srcPtr1TempB, *srcPtr2Temp, *dstPtrTemp;
+ srcPtr1TempR = srcPtr1RowR;
+ srcPtr1TempG = srcPtr1RowG;
+ srcPtr1TempB = srcPtr1RowB;
+ srcPtr2Temp = srcPtr2Row;
+ dstPtrTemp = dstPtrRow;
+ Rpp32u vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel)
+ {
+ __m256 p1[3], p2;
+ rpp_simd_load(rpp_load24_f32pln3_to_f32pln3_avx, srcPtr1TempR, srcPtr1TempG, srcPtr1TempB, p1); // simd loads
+ rpp_simd_load(rpp_load8_f32_to_f32_avx, srcPtr2Temp, &p2); // simd loads
+ compute_rain_24_host(p1, p2, pMul);
+ rpp_simd_store(rpp_store24_f32pln3_to_f32pkd3_avx, dstPtrTemp, p1); // simd stores
+ srcPtr1TempR += vectorIncrementPerChannel;
+ srcPtr1TempG += vectorIncrementPerChannel;
+ srcPtr1TempB += vectorIncrementPerChannel;
+ srcPtr2Temp += vectorIncrementPerChannel;
+ dstPtrTemp += vectorIncrement;
+ }
+#endif
+ for (; vectorLoopCount < bufferLength; vectorLoopCount++)
+ {
+ dstPtrTemp[0] = RPPPIXELCHECKF32((*srcPtr2Temp - *srcPtr1TempR) * alpha + *srcPtr1TempR);
+ dstPtrTemp[1] = RPPPIXELCHECKF32((*srcPtr2Temp - *srcPtr1TempG) * alpha + *srcPtr1TempG);
+ dstPtrTemp[2] = RPPPIXELCHECKF32((*srcPtr2Temp - *srcPtr1TempB) * alpha + *srcPtr1TempB);
+ dstPtrTemp += 3;
+ srcPtr2Temp++;
+ srcPtr1TempR++;
+ srcPtr1TempG++;
+ srcPtr1TempB++;
+ }
+ srcPtr1RowR += srcDescPtr->strides.hStride;
+ srcPtr1RowG += srcDescPtr->strides.hStride;
+ srcPtr1RowB += srcDescPtr->strides.hStride;
+ srcPtr2Row += srcDescPtr->strides.hStride;
+ dstPtrRow += dstDescPtr->strides.hStride;
+ }
+ }
+ // Rain without fused output-layout toggle (NHWC -> NHWC)
+ else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NHWC))
+ {
+ Rpp32f *srcPtr1Row, *srcPtr2Row, *dstPtrRow;
+ srcPtr1Row = srcPtrChannel;
+ srcPtr2Row = rainLayer;
+ dstPtrRow = dstPtrChannel;
+ for(Rpp32u i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp32f *srcPtr1Temp, *srcPtr2Temp, *dstPtrTemp;
+ srcPtr1Temp = srcPtr1Row;
+ srcPtr2Temp = srcPtr2Row;
+ dstPtrTemp = dstPtrRow;
+ Rpp32u vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement)
+ {
+ __m256 p1[3], p2;
+ rpp_simd_load(rpp_load24_f32pkd3_to_f32pln3_avx, srcPtr1Temp, p1); // simd loads
+ rpp_simd_load(rpp_load8_f32_to_f32_avx, srcPtr2Temp, &p2); // simd loads
+ compute_rain_24_host(p1, p2, pMul);
+ rpp_simd_store(rpp_store24_f32pln3_to_f32pkd3_avx, dstPtrTemp, p1); // simd stores
+ srcPtr1Temp += vectorIncrement;
+ srcPtr2Temp += vectorIncrementPerChannel;
+ dstPtrTemp += vectorIncrement;
+ }
+#endif
+ for (; vectorLoopCount < bufferLength; vectorLoopCount += 3)
+ {
+ *dstPtrTemp++ = RPPPIXELCHECKF32((*srcPtr2Temp - *srcPtr1Temp) * alpha + *srcPtr1Temp);
+ *dstPtrTemp++ = RPPPIXELCHECKF32((*srcPtr2Temp - *(srcPtr1Temp + 1)) * alpha + *(srcPtr1Temp + 1));
+ *dstPtrTemp++ = RPPPIXELCHECKF32((*srcPtr2Temp - *(srcPtr1Temp + 2)) * alpha + *(srcPtr1Temp + 2));
+ srcPtr2Temp++;
+ srcPtr1Temp += 3;
+ }
+ srcPtr1Row += srcDescPtr->strides.hStride;
+ srcPtr2Row += srcDescPtr->w;
+ dstPtrRow += dstDescPtr->strides.hStride;
+ }
+ }
+ // Rain without fused output-layout toggle (NCHW -> NCHW)
+ else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NCHW))
+ {
+ Rpp32f *srcPtr1RowR, *srcPtr1RowG, *srcPtr1RowB, *srcPtr2Row, *dstPtrRowR, *dstPtrRowG, *dstPtrRowB;
+ srcPtr1RowR = srcPtrChannel;
+ srcPtr1RowG = srcPtr1RowR + srcDescPtr->strides.cStride;
+ srcPtr1RowB = srcPtr1RowG + srcDescPtr->strides.cStride;
+ srcPtr2Row = rainLayer;
+ dstPtrRowR = dstPtrChannel;
+ dstPtrRowG = dstPtrRowR + dstDescPtr->strides.cStride;
+ dstPtrRowB = dstPtrRowG + dstDescPtr->strides.cStride;
+ for(Rpp32u i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp32f *srcPtr1TempR, *srcPtr1TempG, *srcPtr1TempB, *srcPtr2Temp, *dstPtrTempR, *dstPtrTempG, *dstPtrTempB;
+ srcPtr1TempR = srcPtr1RowR;
+ srcPtr1TempG = srcPtr1RowG;
+ srcPtr1TempB = srcPtr1RowB;
+ srcPtr2Temp = srcPtr2Row;
+ dstPtrTempR = dstPtrRowR;
+ dstPtrTempG = dstPtrRowG;
+ dstPtrTempB = dstPtrRowB;
+ Rpp32u vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel)
+ {
+ __m256 p1[3], p2;
+ rpp_simd_load(rpp_load24_f32pln3_to_f32pln3_avx, srcPtr1TempR, srcPtr1TempG, srcPtr1TempB, p1); // simd loads
+ rpp_simd_load(rpp_load8_f32_to_f32_avx, srcPtr2Temp, &p2); // simd loads
+ compute_rain_24_host(p1, p2, pMul);
+ rpp_simd_store(rpp_store24_f32pln3_to_f32pln3_avx, dstPtrTempR, dstPtrTempG, dstPtrTempB, p1); // simd stores
+ srcPtr1TempR += vectorIncrementPerChannel;
+ srcPtr1TempG += vectorIncrementPerChannel;
+ srcPtr1TempB += vectorIncrementPerChannel;
+ srcPtr2Temp += vectorIncrementPerChannel;
+ dstPtrTempR += vectorIncrementPerChannel;
+ dstPtrTempG += vectorIncrementPerChannel;
+ dstPtrTempB += vectorIncrementPerChannel;
+ }
+#endif
+ for (; vectorLoopCount < bufferLength; vectorLoopCount++)
+ {
+ *dstPtrTempR++ = RPPPIXELCHECKF32((*srcPtr2Temp - *srcPtr1TempR) * alpha + *srcPtr1TempR);
+ *dstPtrTempG++ = RPPPIXELCHECKF32((*srcPtr2Temp - *srcPtr1TempG) * alpha + *srcPtr1TempG);
+ *dstPtrTempB++ = RPPPIXELCHECKF32((*srcPtr2Temp - *srcPtr1TempB) * alpha + *srcPtr1TempB);
+ srcPtr2Temp++;
+ srcPtr1TempR++;
+ srcPtr1TempG++;
+ srcPtr1TempB++;
+ }
+ srcPtr1RowR += srcDescPtr->strides.hStride;
+ srcPtr1RowG += srcDescPtr->strides.hStride;
+ srcPtr1RowB += srcDescPtr->strides.hStride;
+ srcPtr2Row += srcDescPtr->strides.hStride;
+ dstPtrRowR += dstDescPtr->strides.hStride;
+ dstPtrRowG += dstDescPtr->strides.hStride;
+ dstPtrRowB += dstDescPtr->strides.hStride;
+ }
+ }
+ // Rain single channel without fused output-layout toggle (NCHW -> NCHW)
+ else if ((srcDescPtr->c == 1) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NCHW))
+ {
+ alignedLength = (bufferLength / vectorIncrementPerChannel) * vectorIncrementPerChannel;
+ Rpp32f *srcPtr1Row, *srcPtr2Row, *dstPtrRow;
+ srcPtr1Row = srcPtrChannel;
+ srcPtr2Row = rainLayer;
+ dstPtrRow = dstPtrChannel;
+ for(Rpp32u i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp32f *srcPtr1Temp, *srcPtr2Temp, *dstPtrTemp;
+ srcPtr1Temp = srcPtr1Row;
+ srcPtr2Temp = srcPtr2Row;
+ dstPtrTemp = dstPtrRow;
+ Rpp32u vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel)
+ {
+ __m256 p1, p2;
+ rpp_simd_load(rpp_load8_f32_to_f32_avx, srcPtr1Temp, &p1); // simd loads
+ rpp_simd_load(rpp_load8_f32_to_f32_avx, srcPtr2Temp, &p2); // simd loads
+ p1 = _mm256_fmadd_ps(_mm256_sub_ps(p2, p1), pMul, p1); // alpha-blending adjustment
+ rpp_simd_store(rpp_store8_f32_to_f32_avx, dstPtrTemp, &p1); // simd stores
+ srcPtr1Temp += vectorIncrementPerChannel;
+ srcPtr2Temp += vectorIncrementPerChannel;
+ dstPtrTemp += vectorIncrementPerChannel;
+ }
+#endif
+ for (; vectorLoopCount < bufferLength; vectorLoopCount++)
+ {
+ *dstPtrTemp++ = RPPPIXELCHECKF32((*srcPtr2Temp - *srcPtr1Temp) * alpha + *srcPtr1Temp);
+ srcPtr1Temp++;
+ srcPtr2Temp++;
+ }
+ srcPtr1Row += srcDescPtr->strides.hStride;
+ srcPtr2Row += srcDescPtr->strides.hStride;
+ dstPtrRow += dstDescPtr->strides.hStride;
+ }
+ }
+ }
+ return RPP_SUCCESS;
+}
+
+RppStatus rain_f16_f16_host_tensor(Rpp16f *srcPtr,
+ RpptDescPtr srcDescPtr,
+ Rpp16f *dstPtr,
+ RpptDescPtr dstDescPtr,
+ Rpp32f rainPercentage,
+ Rpp32u rainWidth,
+ Rpp32u rainHeight,
+ Rpp32f slantAngle,
+ Rpp32f *alphaValues,
+ RpptROIPtr roiTensorPtrSrc,
+ RpptRoiType roiType,
+ RppLayoutParams layoutParams,
+ rpp::Handle& handle)
+{
+ RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h};
+ Rpp32u numThreads = handle.GetNumThreads();
+
+ Rpp16f *rainLayer = reinterpret_cast<Rpp16f *>(handle.GetInitHandle()->mem.mcpu.scratchBufferHost);
+ std::memset(rainLayer, 0, srcDescPtr->w * srcDescPtr->h * sizeof(Rpp16f));
+ create_rain_layer(rainLayer, rainPercentage, srcDescPtr, slantAngle, rainHeight, rainWidth);
+
+ omp_set_dynamic(0);
+#pragma omp parallel for num_threads(numThreads)
+ for (Rpp32u batchCount = 0; batchCount < dstDescPtr->n; batchCount++)
+ {
+ RpptROI roi;
+ RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount];
+ compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType);
+
+ Rpp16f *srcPtrImage = srcPtr + batchCount * srcDescPtr->strides.nStride;
+ Rpp16f *dstPtrImage = dstPtr + batchCount * dstDescPtr->strides.nStride;
+
+ Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier;
+ Rpp16f *srcPtrChannel, *dstPtrChannel;
+ srcPtrChannel = srcPtrImage + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier);
+ dstPtrChannel = dstPtrImage;
+
+ Rpp32f alpha = alphaValues[batchCount];
+#if __AVX2__
+ __m256 pMul = _mm256_set1_ps(alpha);
+#endif
+ Rpp32u vectorIncrement = 24;
+ Rpp32u vectorIncrementPerChannel = 8;
+ Rpp32u alignedLength = (bufferLength / vectorIncrement) * vectorIncrement;
+
+ // Rain with fused output-layout toggle (NHWC -> NCHW)
+ if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NCHW))
+ {
+ Rpp16f *srcPtr1Row, *srcPtr2Row, *dstPtrRowR, *dstPtrRowG, *dstPtrRowB;
+ srcPtr1Row = srcPtrChannel;
+ srcPtr2Row = rainLayer;
+ dstPtrRowR = dstPtrChannel;
+ dstPtrRowG = dstPtrRowR + dstDescPtr->strides.cStride;
+ dstPtrRowB = dstPtrRowG + dstDescPtr->strides.cStride;
+ for(Rpp32u i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp16f *srcPtr1Temp, *srcPtr2Temp, *dstPtrTempR, *dstPtrTempG, *dstPtrTempB;
+ srcPtr1Temp = srcPtr1Row;
+ srcPtr2Temp = srcPtr2Row;
+ dstPtrTempR = dstPtrRowR;
+ dstPtrTempG = dstPtrRowG;
+ dstPtrTempB = dstPtrRowB;
+ Rpp32u vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement)
+ {
+ __m256 p1[3], p2;
+ rpp_simd_load(rpp_load24_f16pkd3_to_f32pln3_avx, srcPtr1Temp, p1); // simd loads
+ rpp_simd_load(rpp_load8_f16_to_f32_avx, srcPtr2Temp, &p2); // simd loads
+ compute_rain_24_host(p1, p2, pMul);
+ rpp_simd_store(rpp_store24_f32pln3_to_f16pln3_avx, dstPtrTempR, dstPtrTempG, dstPtrTempB, p1); // simd stores
+ srcPtr1Temp += vectorIncrement;
+ srcPtr2Temp += vectorIncrementPerChannel;
+ dstPtrTempR += vectorIncrementPerChannel;
+ dstPtrTempG += vectorIncrementPerChannel;
+ dstPtrTempB += vectorIncrementPerChannel;
+ }
+#endif
+ for (; vectorLoopCount < bufferLength; vectorLoopCount += 3)
+ {
+ *dstPtrTempR++ = static_cast<Rpp16f>(RPPPIXELCHECKF32(static_cast<Rpp32f>((*srcPtr2Temp - srcPtr1Temp[0]) * alpha + srcPtr1Temp[0])));
+ *dstPtrTempG++ = static_cast<Rpp16f>(RPPPIXELCHECKF32(static_cast<Rpp32f>((*srcPtr2Temp - srcPtr1Temp[1]) * alpha + srcPtr1Temp[1])));
+ *dstPtrTempB++ = static_cast<Rpp16f>(RPPPIXELCHECKF32(static_cast<Rpp32f>((*srcPtr2Temp - srcPtr1Temp[2]) * alpha + srcPtr1Temp[2])));
+ srcPtr1Temp += 3;
+ srcPtr2Temp++;
+ }
+ srcPtr1Row += srcDescPtr->strides.hStride;
+ srcPtr2Row += srcDescPtr->w;
+ dstPtrRowR += dstDescPtr->strides.hStride;
+ dstPtrRowG += dstDescPtr->strides.hStride;
+ dstPtrRowB += dstDescPtr->strides.hStride;
+ }
+ }
+ // Rain with fused output-layout toggle (NCHW -> NHWC)
+ else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NHWC))
+ {
+ Rpp16f *srcPtr1RowR, *srcPtr1RowG, *srcPtr1RowB, *srcPtr2Row, *dstPtrRow;
+ srcPtr1RowR = srcPtrChannel;
+ srcPtr1RowG = srcPtr1RowR + srcDescPtr->strides.cStride;
+ srcPtr1RowB = srcPtr1RowG + srcDescPtr->strides.cStride;
+ srcPtr2Row = rainLayer;
+ dstPtrRow = dstPtrChannel;
+ for(Rpp32u i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp16f *srcPtr1TempR, *srcPtr1TempG, *srcPtr1TempB, *srcPtr2Temp, *dstPtrTemp;
+ srcPtr1TempR = srcPtr1RowR;
+ srcPtr1TempG = srcPtr1RowG;
+ srcPtr1TempB = srcPtr1RowB;
+ srcPtr2Temp = srcPtr2Row;
+ dstPtrTemp = dstPtrRow;
+ Rpp32u vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel)
+ {
+ __m256 p1[3], p2;
+ rpp_simd_load(rpp_load24_f16pln3_to_f32pln3_avx, srcPtr1TempR, srcPtr1TempG, srcPtr1TempB, p1); // simd loads
+ rpp_simd_load(rpp_load8_f16_to_f32_avx, srcPtr2Temp, &p2); // simd loads
+ compute_rain_24_host(p1, p2, pMul);
+ rpp_simd_store(rpp_store24_f32pln3_to_f16pkd3_avx, dstPtrTemp, p1); // simd stores
+ srcPtr1TempR += vectorIncrementPerChannel;
+ srcPtr1TempG += vectorIncrementPerChannel;
+ srcPtr1TempB += vectorIncrementPerChannel;
+ srcPtr2Temp += vectorIncrementPerChannel;
+ dstPtrTemp += vectorIncrement;
+ }
+#endif
+ for (; vectorLoopCount < bufferLength; vectorLoopCount++)
+ {
+ dstPtrTemp[0] = static_cast<Rpp16f>(RPPPIXELCHECKF32(static_cast<Rpp32f>((*srcPtr2Temp - *srcPtr1TempR) * alpha + *srcPtr1TempR)));
+ dstPtrTemp[1] = static_cast<Rpp16f>(RPPPIXELCHECKF32(static_cast<Rpp32f>((*srcPtr2Temp - *srcPtr1TempG) * alpha + *srcPtr1TempG)));
+ dstPtrTemp[2] = static_cast<Rpp16f>(RPPPIXELCHECKF32(static_cast<Rpp32f>((*srcPtr2Temp - *srcPtr1TempB) * alpha + *srcPtr1TempB)));
+ dstPtrTemp += 3;
+ srcPtr2Temp++;
+ srcPtr1TempR++;
+ srcPtr1TempG++;
+ srcPtr1TempB++;
+ }
+ srcPtr1RowR += srcDescPtr->strides.hStride;
+ srcPtr1RowG += srcDescPtr->strides.hStride;
+ srcPtr1RowB += srcDescPtr->strides.hStride;
+ srcPtr2Row += srcDescPtr->strides.hStride;
+ dstPtrRow += dstDescPtr->strides.hStride;
+ }
+ }
+ // Rain without fused output-layout toggle (NHWC -> NHWC)
+ else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NHWC))
+ {
+ Rpp16f *srcPtr1Row, *srcPtr2Row, *dstPtrRow;
+ srcPtr1Row = srcPtrChannel;
+ srcPtr2Row = rainLayer;
+ dstPtrRow = dstPtrChannel;
+ for(Rpp32u i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp16f *srcPtr1Temp, *srcPtr2Temp, *dstPtrTemp;
+ srcPtr1Temp = srcPtr1Row;
+ srcPtr2Temp = srcPtr2Row;
+ dstPtrTemp = dstPtrRow;
+ Rpp32u vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement)
+ {
+ __m256 p1[3], p2;
+ rpp_simd_load(rpp_load24_f16pkd3_to_f32pln3_avx, srcPtr1Temp, p1); // simd loads
+ rpp_simd_load(rpp_load8_f16_to_f32_avx, srcPtr2Temp, &p2); // simd loads
+ compute_rain_24_host(p1, p2, pMul);
+ rpp_simd_store(rpp_store24_f32pln3_to_f16pkd3_avx, dstPtrTemp, p1); // simd stores
+ srcPtr1Temp += vectorIncrement;
+ srcPtr2Temp += vectorIncrementPerChannel;
+ dstPtrTemp += vectorIncrement;
+ }
+#endif
+ for (; vectorLoopCount < bufferLength; vectorLoopCount += 3)
+ {
+ *dstPtrTemp++ = static_cast<Rpp16f>(RPPPIXELCHECKF32(static_cast<Rpp32f>((*srcPtr2Temp - *srcPtr1Temp) * alpha + *srcPtr1Temp)));
+ *dstPtrTemp++ = static_cast<Rpp16f>(RPPPIXELCHECKF32(static_cast<Rpp32f>((*srcPtr2Temp - *(srcPtr1Temp + 1)) * alpha + *(srcPtr1Temp + 1))));
+ *dstPtrTemp++ = static_cast<Rpp16f>(RPPPIXELCHECKF32(static_cast<Rpp32f>((*srcPtr2Temp - *(srcPtr1Temp + 2)) * alpha + *(srcPtr1Temp + 2))));
+ srcPtr2Temp++;
+ srcPtr1Temp += 3;
+ }
+ srcPtr1Row += srcDescPtr->strides.hStride;
+ srcPtr2Row += srcDescPtr->w;
+ dstPtrRow += dstDescPtr->strides.hStride;
+ }
+ }
+ // Rain without fused output-layout toggle (NCHW -> NCHW)
+ else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NCHW))
+ {
+ Rpp16f *srcPtr1RowR, *srcPtr1RowG, *srcPtr1RowB, *srcPtr2Row, *dstPtrRowR, *dstPtrRowG, *dstPtrRowB;
+ srcPtr1RowR = srcPtrChannel;
+ srcPtr1RowG = srcPtr1RowR + srcDescPtr->strides.cStride;
+ srcPtr1RowB = srcPtr1RowG + srcDescPtr->strides.cStride;
+ srcPtr2Row = rainLayer;
+ dstPtrRowR = dstPtrChannel;
+ dstPtrRowG = dstPtrRowR + dstDescPtr->strides.cStride;
+ dstPtrRowB = dstPtrRowG + dstDescPtr->strides.cStride;
+ for(Rpp32u i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp16f *srcPtr1TempR, *srcPtr1TempG, *srcPtr1TempB, *srcPtr2Temp, *dstPtrTempR, *dstPtrTempG, *dstPtrTempB;
+ srcPtr1TempR = srcPtr1RowR;
+ srcPtr1TempG = srcPtr1RowG;
+ srcPtr1TempB = srcPtr1RowB;
+ srcPtr2Temp = srcPtr2Row;
+ dstPtrTempR = dstPtrRowR;
+ dstPtrTempG = dstPtrRowG;
+ dstPtrTempB = dstPtrRowB;
+ Rpp32u vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel)
+ {
+ __m256 p1[3], p2;
+ rpp_simd_load(rpp_load24_f16pln3_to_f32pln3_avx, srcPtr1TempR, srcPtr1TempG, srcPtr1TempB, p1); // simd loads
+ rpp_simd_load(rpp_load8_f16_to_f32_avx, srcPtr2Temp, &p2); // simd loads
+ compute_rain_24_host(p1, p2, pMul);
+ rpp_simd_store(rpp_store24_f32pln3_to_f16pln3_avx, dstPtrTempR, dstPtrTempG, dstPtrTempB, p1); // simd stores
+ srcPtr1TempR += vectorIncrementPerChannel;
+ srcPtr1TempG += vectorIncrementPerChannel;
+ srcPtr1TempB += vectorIncrementPerChannel;
+ srcPtr2Temp += vectorIncrementPerChannel;
+ dstPtrTempR += vectorIncrementPerChannel;
+ dstPtrTempG += vectorIncrementPerChannel;
+ dstPtrTempB += vectorIncrementPerChannel;
+ }
+#endif
+ for (; vectorLoopCount < bufferLength; vectorLoopCount++)
+ {
+ *dstPtrTempR++ = static_cast<Rpp16f>(RPPPIXELCHECKF32(static_cast<Rpp32f>((*srcPtr2Temp - *srcPtr1TempR) * alpha + *srcPtr1TempR)));
+ *dstPtrTempG++ = static_cast<Rpp16f>(RPPPIXELCHECKF32(static_cast<Rpp32f>((*srcPtr2Temp - *srcPtr1TempG) * alpha + *srcPtr1TempG)));
+ *dstPtrTempB++ = static_cast<Rpp16f>(RPPPIXELCHECKF32(static_cast<Rpp32f>((*srcPtr2Temp - *srcPtr1TempB) * alpha + *srcPtr1TempB)));
+ srcPtr2Temp++;
+ srcPtr1TempR++;
+ srcPtr1TempG++;
+ srcPtr1TempB++;
+ }
+ srcPtr1RowR += srcDescPtr->strides.hStride;
+ srcPtr1RowG += srcDescPtr->strides.hStride;
+ srcPtr1RowB += srcDescPtr->strides.hStride;
+ srcPtr2Row += srcDescPtr->strides.hStride;
+ dstPtrRowR += dstDescPtr->strides.hStride;
+ dstPtrRowG += dstDescPtr->strides.hStride;
+ dstPtrRowB += dstDescPtr->strides.hStride;
+ }
+ }
+ // Rain single channel without fused output-layout toggle (NCHW -> NCHW)
+ else if ((srcDescPtr->c == 1) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NCHW))
+ {
+ alignedLength = (bufferLength / vectorIncrementPerChannel) * vectorIncrementPerChannel;
+ Rpp16f *srcPtr1Row, *srcPtr2Row, *dstPtrRow;
+ srcPtr1Row = srcPtrChannel;
+ srcPtr2Row = rainLayer;
+ dstPtrRow = dstPtrChannel;
+ for(Rpp32u i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp16f *srcPtr1Temp, *srcPtr2Temp, *dstPtrTemp;
+ srcPtr1Temp = srcPtr1Row;
+ srcPtr2Temp = srcPtr2Row;
+ dstPtrTemp = dstPtrRow;
+ Rpp32u vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel)
+ {
+ __m256 p1, p2;
+ rpp_simd_load(rpp_load8_f16_to_f32_avx, srcPtr1Temp, &p1); // simd loads
+ rpp_simd_load(rpp_load8_f16_to_f32_avx, srcPtr2Temp, &p2); // simd loads
+ p1 = _mm256_fmadd_ps(_mm256_sub_ps(p2, p1), pMul, p1); // alpha-blending adjustment
+ rpp_simd_store(rpp_store8_f32_to_f16_avx, dstPtrTemp, &p1); // simd stores
+ srcPtr1Temp += vectorIncrementPerChannel;
+ srcPtr2Temp += vectorIncrementPerChannel;
+ dstPtrTemp += vectorIncrementPerChannel;
+ }
+#endif
+ for (; vectorLoopCount < bufferLength; vectorLoopCount++)
+ {
+ *dstPtrTemp++ = static_cast<Rpp16f>(RPPPIXELCHECKF32(static_cast<Rpp32f>((*srcPtr2Temp - *srcPtr1Temp) * alpha + *srcPtr1Temp)));
+ srcPtr1Temp++;
+ srcPtr2Temp++;
+ }
+ srcPtr1Row += srcDescPtr->strides.hStride;
+ srcPtr2Row += srcDescPtr->strides.hStride;
+ dstPtrRow += dstDescPtr->strides.hStride;
+ }
+ }
+ }
+ return RPP_SUCCESS;
+}
+
+RppStatus rain_i8_i8_host_tensor(Rpp8s *srcPtr,
+ RpptDescPtr srcDescPtr,
+ Rpp8s *dstPtr,
+ RpptDescPtr dstDescPtr,
+ Rpp32f rainPercentage,
+ Rpp32u rainWidth,
+ Rpp32u rainHeight,
+ Rpp32f slantAngle,
+ Rpp32f *alphaValues,
+ RpptROIPtr roiTensorPtrSrc,
+ RpptRoiType roiType,
+ RppLayoutParams layoutParams,
+ rpp::Handle& handle)
+{
+ RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h};
+ Rpp32u numThreads = handle.GetNumThreads();
+
+ Rpp8s *rainLayer = reinterpret_cast<Rpp8s *>(handle.GetInitHandle()->mem.mcpu.scratchBufferHost);
+ std::memset(rainLayer, 0x81, srcDescPtr->w * srcDescPtr->h * sizeof(Rpp8s));
+ create_rain_layer(rainLayer, rainPercentage, srcDescPtr, slantAngle, rainHeight, rainWidth);
+
+ omp_set_dynamic(0);
+#pragma omp parallel for num_threads(numThreads)
+ for (Rpp32u batchCount = 0; batchCount < dstDescPtr->n; batchCount++)
+ {
+ RpptROI roi;
+ RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount];
+ compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType);
+
+ Rpp8s *srcPtrImage = srcPtr + batchCount * srcDescPtr->strides.nStride;
+ Rpp8s *dstPtrImage = dstPtr + batchCount * dstDescPtr->strides.nStride;
+
+ Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier;
+ Rpp8s *srcPtrChannel, *dstPtrChannel;
+ srcPtrChannel = srcPtrImage + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier);
+ dstPtrChannel = dstPtrImage;
+
+ Rpp32f alpha = alphaValues[batchCount];
+#if __AVX2__
+ __m256 pMul = _mm256_set1_ps(alpha);
+#endif
+ Rpp32u vectorIncrement = 48;
+ Rpp32u vectorIncrementPerChannel = 16;
+ Rpp32u alignedLength = (bufferLength / vectorIncrement) * vectorIncrement;
+
+ // Rain with fused output-layout toggle (NHWC -> NCHW)
+ if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NCHW))
+ {
+ Rpp8s *srcPtr1Row, *srcPtr2Row, *dstPtrRowR, *dstPtrRowG, *dstPtrRowB;
+ srcPtr1Row = srcPtrChannel;
+ srcPtr2Row = rainLayer;
+ dstPtrRowR = dstPtrChannel;
+ dstPtrRowG = dstPtrRowR + dstDescPtr->strides.cStride;
+ dstPtrRowB = dstPtrRowG + dstDescPtr->strides.cStride;
+ for(Rpp32u i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp8s *srcPtr1Temp, *srcPtr2Temp, *dstPtrTempR, *dstPtrTempG, *dstPtrTempB;
+ srcPtr1Temp = srcPtr1Row;
+ srcPtr2Temp = srcPtr2Row;
+ dstPtrTempR = dstPtrRowR;
+ dstPtrTempG = dstPtrRowG;
+ dstPtrTempB = dstPtrRowB;
+ Rpp32u vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement)
+ {
+ __m256 p1[6], p2[2];
+ rpp_simd_load(rpp_load48_i8pkd3_to_f32pln3_avx, srcPtr1Temp, p1); // simd loads
+ rpp_simd_load(rpp_load16_i8_to_f32_avx, srcPtr2Temp, p2); // simd loads
+ compute_rain_48_host(p1, p2, pMul);
+ rpp_simd_store(rpp_store48_f32pln3_to_i8pln3_avx, dstPtrTempR, dstPtrTempG, dstPtrTempB, p1); // simd stores
+ srcPtr1Temp += vectorIncrement;
+ srcPtr2Temp += vectorIncrementPerChannel;
+ dstPtrTempR += vectorIncrementPerChannel;
+ dstPtrTempG += vectorIncrementPerChannel;
+ dstPtrTempB += vectorIncrementPerChannel;
+ }
+#endif
+ for (; vectorLoopCount < bufferLength; vectorLoopCount += 3)
+ {
+ *dstPtrTempR++ = static_cast<Rpp8s>(RPPPIXELCHECKI8(std::nearbyintf(static_cast<Rpp32f>((*srcPtr2Temp - srcPtr1Temp[0]) * alpha + srcPtr1Temp[0]))));
+ *dstPtrTempG++ = static_cast<Rpp8s>(RPPPIXELCHECKI8(std::nearbyintf(static_cast<Rpp32f>((*srcPtr2Temp - srcPtr1Temp[1]) * alpha + srcPtr1Temp[1]))));
+ *dstPtrTempB++ = static_cast<Rpp8s>(RPPPIXELCHECKI8(std::nearbyintf(static_cast<Rpp32f>((*srcPtr2Temp - srcPtr1Temp[2]) * alpha + srcPtr1Temp[2]))));
+ srcPtr1Temp += 3;
+ srcPtr2Temp++;
+ }
+ srcPtr1Row += srcDescPtr->strides.hStride;
+ srcPtr2Row += srcDescPtr->w;
+ dstPtrRowR += dstDescPtr->strides.hStride;
+ dstPtrRowG += dstDescPtr->strides.hStride;
+ dstPtrRowB += dstDescPtr->strides.hStride;
+ }
+ }
+ // Rain with fused output-layout toggle (NCHW -> NHWC)
+ else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NHWC))
+ {
+ Rpp8s *srcPtr1RowR, *srcPtr1RowG, *srcPtr1RowB, *srcPtr2Row, *dstPtrRow;
+ srcPtr1RowR = srcPtrChannel;
+ srcPtr1RowG = srcPtr1RowR + srcDescPtr->strides.cStride;
+ srcPtr1RowB = srcPtr1RowG + srcDescPtr->strides.cStride;
+ srcPtr2Row = rainLayer;
+ dstPtrRow = dstPtrChannel;
+ for(Rpp32u i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp8s *srcPtr1TempR, *srcPtr1TempG, *srcPtr1TempB, *srcPtr2Temp, *dstPtrTemp;
+ srcPtr1TempR = srcPtr1RowR;
+ srcPtr1TempG = srcPtr1RowG;
+ srcPtr1TempB = srcPtr1RowB;
+ srcPtr2Temp = srcPtr2Row;
+ dstPtrTemp = dstPtrRow;
+ Rpp32u vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel)
+ {
+ __m256 p1[6], p2[2];
+ rpp_simd_load(rpp_load48_i8pln3_to_f32pln3_avx, srcPtr1TempR, srcPtr1TempG, srcPtr1TempB, p1); // simd loads
+ rpp_simd_load(rpp_load16_i8_to_f32_avx, srcPtr2Temp, p2); // simd loads
+ compute_rain_48_host(p1, p2, pMul);
+ rpp_simd_store(rpp_store48_f32pln3_to_i8pkd3_avx, dstPtrTemp, p1); // simd stores
+ srcPtr1TempR += vectorIncrementPerChannel;
+ srcPtr1TempG += vectorIncrementPerChannel;
+ srcPtr1TempB += vectorIncrementPerChannel;
+ srcPtr2Temp += vectorIncrementPerChannel;
+ dstPtrTemp += vectorIncrement;
+ }
+#endif
+ for (; vectorLoopCount < bufferLength; vectorLoopCount++)
+ {
+ dstPtrTemp[0] = static_cast<Rpp8s>(RPPPIXELCHECKI8(std::nearbyintf(static_cast<Rpp32f>((*srcPtr2Temp - *srcPtr1TempR) * alpha + *srcPtr1TempR))));
+ dstPtrTemp[1] = static_cast<Rpp8s>(RPPPIXELCHECKI8(std::nearbyintf(static_cast<Rpp32f>((*srcPtr2Temp - *srcPtr1TempG) * alpha + *srcPtr1TempG))));
+ dstPtrTemp[2] = static_cast<Rpp8s>(RPPPIXELCHECKI8(std::nearbyintf(static_cast<Rpp32f>((*srcPtr2Temp - *srcPtr1TempB) * alpha + *srcPtr1TempB))));
+ dstPtrTemp += 3;
+ srcPtr2Temp++;
+ srcPtr1TempR++;
+ srcPtr1TempG++;
+ srcPtr1TempB++;
+ }
+ srcPtr1RowR += srcDescPtr->strides.hStride;
+ srcPtr1RowG += srcDescPtr->strides.hStride;
+ srcPtr1RowB += srcDescPtr->strides.hStride;
+ srcPtr2Row += srcDescPtr->strides.hStride;
+ dstPtrRow += dstDescPtr->strides.hStride;
+ }
+ }
+ // Rain without fused output-layout toggle (NHWC -> NHWC)
+ else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NHWC))
+ {
+ Rpp8s *srcPtr1Row, *srcPtr2Row, *dstPtrRow;
+ srcPtr1Row = srcPtrChannel;
+ srcPtr2Row = rainLayer;
+ dstPtrRow = dstPtrChannel;
+ for(Rpp32u i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp8s *srcPtr1Temp, *srcPtr2Temp, *dstPtrTemp;
+ srcPtr1Temp = srcPtr1Row;
+ srcPtr2Temp = srcPtr2Row;
+ dstPtrTemp = dstPtrRow;
+ Rpp32u vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement)
+ {
+ __m256 p1[6], p2[2];
+ rpp_simd_load(rpp_load48_i8pkd3_to_f32pln3_avx, srcPtr1Temp, p1); // simd loads
+ rpp_simd_load(rpp_load16_i8_to_f32_avx, srcPtr2Temp, p2); // simd loads
+ compute_rain_48_host(p1, p2, pMul);
+ rpp_simd_store(rpp_store48_f32pln3_to_i8pkd3_avx, dstPtrTemp, p1); // simd stores
+ srcPtr1Temp += vectorIncrement;
+ srcPtr2Temp += vectorIncrementPerChannel;
+ dstPtrTemp += vectorIncrement;
+ }
+#endif
+ for (; vectorLoopCount < bufferLength; vectorLoopCount += 3)
+ {
+ *dstPtrTemp++ = static_cast<Rpp8s>(RPPPIXELCHECKI8(std::nearbyintf(static_cast<Rpp32f>((*srcPtr2Temp - *srcPtr1Temp) * alpha + *srcPtr1Temp))));
+ *dstPtrTemp++ = static_cast<Rpp8s>(RPPPIXELCHECKI8(std::nearbyintf(static_cast<Rpp32f>((*srcPtr2Temp - *(srcPtr1Temp + 1)) * alpha + *(srcPtr1Temp + 1)))));
+ *dstPtrTemp++ = static_cast<Rpp8s>(RPPPIXELCHECKI8(std::nearbyintf(static_cast<Rpp32f>((*srcPtr2Temp - *(srcPtr1Temp + 2)) * alpha + *(srcPtr1Temp + 2)))));
+ srcPtr2Temp++;
+ srcPtr1Temp += 3;
+ }
+ srcPtr1Row += srcDescPtr->strides.hStride;
+ srcPtr2Row += srcDescPtr->w;
+ dstPtrRow += dstDescPtr->strides.hStride;
+ }
+ }
+ // Rain without fused output-layout toggle (NCHW -> NCHW)
+ else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NCHW))
+ {
+ Rpp8s *srcPtr1RowR, *srcPtr1RowG, *srcPtr1RowB, *srcPtr2Row, *dstPtrRowR, *dstPtrRowG, *dstPtrRowB;
+ srcPtr1RowR = srcPtrChannel;
+ srcPtr1RowG = srcPtr1RowR + srcDescPtr->strides.cStride;
+ srcPtr1RowB = srcPtr1RowG + srcDescPtr->strides.cStride;
+ srcPtr2Row = rainLayer;
+ dstPtrRowR = dstPtrChannel;
+ dstPtrRowG = dstPtrRowR + dstDescPtr->strides.cStride;
+ dstPtrRowB = dstPtrRowG + dstDescPtr->strides.cStride;
+ for(Rpp32u i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp8s *srcPtr1TempR, *srcPtr1TempG, *srcPtr1TempB, *srcPtr2Temp, *dstPtrTempR, *dstPtrTempG, *dstPtrTempB;
+ srcPtr1TempR = srcPtr1RowR;
+ srcPtr1TempG = srcPtr1RowG;
+ srcPtr1TempB = srcPtr1RowB;
+ srcPtr2Temp = srcPtr2Row;
+ dstPtrTempR = dstPtrRowR;
+ dstPtrTempG = dstPtrRowG;
+ dstPtrTempB = dstPtrRowB;
+ Rpp32u vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel)
+ {
+ __m256 p1[6], p2[2];
+ rpp_simd_load(rpp_load48_i8pln3_to_f32pln3_avx, srcPtr1TempR, srcPtr1TempG, srcPtr1TempB, p1); // simd loads
+ rpp_simd_load(rpp_load16_i8_to_f32_avx, srcPtr2Temp, p2); // simd loads
+ compute_rain_48_host(p1, p2, pMul);
+ rpp_simd_store(rpp_store48_f32pln3_to_i8pln3_avx, dstPtrTempR, dstPtrTempG, dstPtrTempB, p1); // simd stores
+ srcPtr1TempR += vectorIncrementPerChannel;
+ srcPtr1TempG += vectorIncrementPerChannel;
+ srcPtr1TempB += vectorIncrementPerChannel;
+ srcPtr2Temp += vectorIncrementPerChannel;
+ dstPtrTempR += vectorIncrementPerChannel;
+ dstPtrTempG += vectorIncrementPerChannel;
+ dstPtrTempB += vectorIncrementPerChannel;
+ }
+#endif
+ for (; vectorLoopCount < bufferLength; vectorLoopCount++)
+ {
+ *dstPtrTempR++ = static_cast<Rpp8s>(RPPPIXELCHECKI8(std::nearbyintf(static_cast<Rpp32f>((*srcPtr2Temp - *srcPtr1TempR) * alpha + *srcPtr1TempR))));
+ *dstPtrTempG++ = static_cast<Rpp8s>(RPPPIXELCHECKI8(std::nearbyintf(static_cast<Rpp32f>((*srcPtr2Temp - *srcPtr1TempG) * alpha + *srcPtr1TempG))));
+ *dstPtrTempB++ = static_cast<Rpp8s>(RPPPIXELCHECKI8(std::nearbyintf(static_cast<Rpp32f>((*srcPtr2Temp - *srcPtr1TempB) * alpha + *srcPtr1TempB))));
+ srcPtr2Temp++;
+ srcPtr1TempR++;
+ srcPtr1TempG++;
+ srcPtr1TempB++;
+ }
+ srcPtr1RowR += srcDescPtr->strides.hStride;
+ srcPtr1RowG += srcDescPtr->strides.hStride;
+ srcPtr1RowB += srcDescPtr->strides.hStride;
+ srcPtr2Row += srcDescPtr->strides.hStride;
+ dstPtrRowR += dstDescPtr->strides.hStride;
+ dstPtrRowG += dstDescPtr->strides.hStride;
+ dstPtrRowB += dstDescPtr->strides.hStride;
+ }
+ }
+ // Rain single channel without fused output-layout toggle (NCHW -> NCHW)
+ else if ((srcDescPtr->c == 1) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NCHW))
+ {
+ alignedLength = (bufferLength / vectorIncrementPerChannel) * vectorIncrementPerChannel;
+ Rpp8s *srcPtr1Row, *srcPtr2Row, *dstPtrRow;
+ srcPtr1Row = srcPtrChannel;
+ srcPtr2Row = rainLayer;
+ dstPtrRow = dstPtrChannel;
+ for(Rpp32u i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp8s *srcPtr1Temp, *srcPtr2Temp, *dstPtrTemp;
+ srcPtr1Temp = srcPtr1Row;
+ srcPtr2Temp = srcPtr2Row;
+ dstPtrTemp = dstPtrRow;
+ Rpp32u vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel)
+ {
+ __m256 p1[2], p2[2];
+ rpp_simd_load(rpp_load16_i8_to_f32_avx, srcPtr1Temp, p1); // simd loads
+ rpp_simd_load(rpp_load16_i8_to_f32_avx, srcPtr2Temp, p2); // simd loads
+ compute_rain_32_host(p1, p2, pMul);
+ rpp_simd_store(rpp_store16_f32_to_i8_avx, dstPtrTemp, p1); // simd stores
+ srcPtr1Temp += vectorIncrementPerChannel;
+ srcPtr2Temp += vectorIncrementPerChannel;
+ dstPtrTemp += vectorIncrementPerChannel;
+ }
+#endif
+ for (; vectorLoopCount < bufferLength; vectorLoopCount++)
+ {
+ *dstPtrTemp++ = static_cast<Rpp8s>(RPPPIXELCHECKI8(std::nearbyintf(static_cast<Rpp32f>((*srcPtr2Temp - *srcPtr1Temp) * alpha + *srcPtr1Temp))));
+ srcPtr1Temp++;
+ srcPtr2Temp++;
+ }
+ srcPtr1Row += srcDescPtr->strides.hStride;
+ srcPtr2Row += srcDescPtr->strides.hStride;
+ dstPtrRow += dstDescPtr->strides.hStride;
+ }
+ }
+ }
+ return RPP_SUCCESS;
+}
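
For reference: every scalar tail loop in the host kernels above reduces to the same alpha blend, dst = src + alpha * (rain - src), saturated to the destination range. A minimal standalone sketch of that per-pixel step (`clamp_i8` and `blend_rain_i8` are hypothetical stand-ins for RPPPIXELCHECKI8 and the loop body, not part of this patch):

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>

// Hypothetical clamp mirroring RPPPIXELCHECKI8's saturation to the Rpp8s range.
static inline float clamp_i8(float v) { return std::min(std::max(v, -128.0f), 127.0f); }

// One pixel of the rain blend: linear interpolation between source and rain layer.
static inline int8_t blend_rain_i8(int8_t src, int8_t rain, float alpha)
{
    float blended = (static_cast<float>(rain) - src) * alpha + src;
    return static_cast<int8_t>(clamp_i8(std::nearbyintf(blended)));
}

int main()
{
    // (-50) + 0.4 * (72 - (-50)) = -1.2, which rounds to -1.
    return blend_rain_i8(-50, 72, 0.4f) == -1 ? 0 : 1;
}
```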
diff --git a/src/modules/hip/hip_tensor_effects_augmentations.hpp b/src/modules/hip/hip_tensor_effects_augmentations.hpp
index d5c6aba75..27cba780e 100644
--- a/src/modules/hip/hip_tensor_effects_augmentations.hpp
+++ b/src/modules/hip/hip_tensor_effects_augmentations.hpp
@@ -39,5 +39,6 @@ SOFTWARE.
#include "kernel/resize.hpp" //pixelate dependency
#include "kernel/erase.hpp"
#include "kernel/fog.hpp"
+#include "kernel/rain.hpp"
#endif // HIP_TENSOR_EFFECTS_AUGMENTATIONS_HPP
diff --git a/src/modules/hip/kernel/rain.hpp b/src/modules/hip/kernel/rain.hpp
new file mode 100644
index 000000000..712060d23
--- /dev/null
+++ b/src/modules/hip/kernel/rain.hpp
@@ -0,0 +1,304 @@
+#include <random>
+#include "rpp_hip_common.hpp"
+
+// Constants to represent the rain intensity for different data types
+#define RAIN_INTENSITY_8U 200 // Intensity value for Rpp8u
+#define RAIN_INTENSITY_8S 72 // Intensity value for Rpp8s
+#define RAIN_INTENSITY_FLOAT (200 * ONE_OVER_255) // Intensity value for Rpp32f and Rpp16f
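+// Note: for Rpp8s, the unsigned intensity 200 maps to 200 - 128 = 72 once shifted into the signed range.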
+
+__device__ __forceinline__ void rain_hip_compute(uchar *srcPtr, d_float8 *src1_f8, d_float8 *src2_f8, d_float8 *dst_f8, float4 *alpha_f4)
+{
+ dst_f8->f4[0] = rpp_hip_pixel_check_0to255((src2_f8->f4[0] - src1_f8->f4[0]) * *alpha_f4 + src1_f8->f4[0]);
+ dst_f8->f4[1] = rpp_hip_pixel_check_0to255((src2_f8->f4[1] - src1_f8->f4[1]) * *alpha_f4 + src1_f8->f4[1]);
+}
+
+__device__ __forceinline__ void rain_hip_compute(float *srcPtr, d_float8 *src1_f8, d_float8 *src2_f8, d_float8 *dst_f8, float4 *alpha_f4)
+{
+ dst_f8->f4[0] = rpp_hip_pixel_check_0to1((src2_f8->f4[0] - src1_f8->f4[0]) * *alpha_f4 + src1_f8->f4[0]);
+ dst_f8->f4[1] = rpp_hip_pixel_check_0to1((src2_f8->f4[1] - src1_f8->f4[1]) * *alpha_f4 + src1_f8->f4[1]);
+}
+
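+// The schar variant shifts inputs into the 0..255 domain before clamping, then shifts back,
+// so the same 0-to-255 saturation helper can be reused for signed 8-bit data.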
+__device__ __forceinline__ void rain_hip_compute(schar *srcPtr, d_float8 *src1_f8, d_float8 *src2_f8, d_float8 *dst_f8, float4 *alpha_f4)
+{
+ dst_f8->f4[0] = rpp_hip_pixel_check_0to255((src2_f8->f4[0] - src1_f8->f4[0]) * *alpha_f4 + src1_f8->f4[0] + (float4)128) - (float4)128;
+ dst_f8->f4[1] = rpp_hip_pixel_check_0to255((src2_f8->f4[1] - src1_f8->f4[1]) * *alpha_f4 + src1_f8->f4[1] + (float4)128) - (float4)128;
+}
+
+__device__ __forceinline__ void rain_hip_compute(half *srcPtr, d_float8 *src1_f8, d_float8 *src2_f8, d_float8 *dst_f8, float4 *alpha_f4)
+{
+ dst_f8->f4[0] = rpp_hip_pixel_check_0to1((src2_f8->f4[0] - src1_f8->f4[0]) * *alpha_f4 + src1_f8->f4[0]);
+ dst_f8->f4[1] = rpp_hip_pixel_check_0to1((src2_f8->f4[1] - src1_f8->f4[1]) * *alpha_f4 + src1_f8->f4[1]);
+}
+
+template <typename T>
+__global__ void rain_pkd_hip_tensor(T *srcPtr1,
+ T *srcPtr2,
+ uint3 srcStridesNHW,
+ T *dstPtr,
+ uint2 dstStridesNH,
+ float *alpha,
+ RpptROIPtr roiTensorPtrSrc)
+{
+ int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8;
+ int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
+ int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z;
+
+ if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth))
+ {
+ return;
+ }
+
+ uint srcIdx1 = (id_z * srcStridesNHW.x) + ((id_y + roiTensorPtrSrc[id_z].xywhROI.xy.y) * srcStridesNHW.y) + ((id_x + roiTensorPtrSrc[id_z].xywhROI.xy.x) * 3);
+ uint srcIdx2 = ((id_y + roiTensorPtrSrc[id_z].xywhROI.xy.y) * srcStridesNHW.z) + (id_x + roiTensorPtrSrc[id_z].xywhROI.xy.x);
+ uint dstIdx = (id_z * dstStridesNH.x) + (id_y * dstStridesNH.y) + id_x * 3;
+
+ float4 alpha_f4 = static_cast<float4>(alpha[id_z]);
+ d_float24 src1_f24, dst_f24;
+ d_float8 src2_f8;
+ rpp_hip_load24_pkd3_and_unpack_to_float24_pln3(srcPtr1 + srcIdx1, &src1_f24);
+ rpp_hip_load8_and_unpack_to_float8(srcPtr2 + srcIdx2, &src2_f8);
+ rain_hip_compute(srcPtr1, &src1_f24.f8[0], &src2_f8, &dst_f24.f8[0], &alpha_f4);
+ rain_hip_compute(srcPtr1, &src1_f24.f8[1], &src2_f8, &dst_f24.f8[1], &alpha_f4);
+ rain_hip_compute(srcPtr1, &src1_f24.f8[2], &src2_f8, &dst_f24.f8[2], &alpha_f4);
+ rpp_hip_pack_float24_pln3_and_store24_pkd3(dstPtr + dstIdx, &dst_f24);
+}
+
+template <typename T>
+__global__ void rain_pln_hip_tensor(T *srcPtr1,
+ T *srcPtr2,
+ uint3 srcStridesNCH,
+ T *dstPtr,
+ uint3 dstStridesNCH,
+ int channelsDst,
+ float *alpha,
+ RpptROIPtr roiTensorPtrSrc)
+{
+ int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8;
+ int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
+ int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z;
+
+ if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth))
+ {
+ return;
+ }
+
+ uint srcIdx1 = (id_z * srcStridesNCH.x) + ((id_y + roiTensorPtrSrc[id_z].xywhROI.xy.y) * srcStridesNCH.z) + (id_x + roiTensorPtrSrc[id_z].xywhROI.xy.x);
+ uint srcIdx2 = ((id_y + roiTensorPtrSrc[id_z].xywhROI.xy.y) * srcStridesNCH.z) + (id_x + roiTensorPtrSrc[id_z].xywhROI.xy.x);
+ uint dstIdx = (id_z * dstStridesNCH.x) + (id_y * dstStridesNCH.z) + id_x;
+
+ float4 alpha_f4 = static_cast<float4>(alpha[id_z]);
+ d_float8 src1_f8, src2_f8, dst_f8;
+ rpp_hip_load8_and_unpack_to_float8(srcPtr1 + srcIdx1, &src1_f8);
+ rpp_hip_load8_and_unpack_to_float8(srcPtr2 + srcIdx2, &src2_f8);
+ rain_hip_compute(srcPtr1, &src1_f8, &src2_f8, &dst_f8, &alpha_f4);
+ rpp_hip_pack_float8_and_store8(dstPtr + dstIdx, &dst_f8);
+
+ if (channelsDst == 3)
+ {
+ srcIdx1 += srcStridesNCH.y;
+ dstIdx += dstStridesNCH.y;
+ rpp_hip_load8_and_unpack_to_float8(srcPtr1 + srcIdx1, &src1_f8);
+ rain_hip_compute(srcPtr1, &src1_f8, &src2_f8, &dst_f8, &alpha_f4);
+ rpp_hip_pack_float8_and_store8(dstPtr + dstIdx, &dst_f8);
+ srcIdx1 += srcStridesNCH.y;
+ dstIdx += dstStridesNCH.y;
+ rpp_hip_load8_and_unpack_to_float8(srcPtr1 + srcIdx1, &src1_f8);
+ rain_hip_compute(srcPtr1, &src1_f8, &src2_f8, &dst_f8, &alpha_f4);
+ rpp_hip_pack_float8_and_store8(dstPtr + dstIdx, &dst_f8);
+ }
+}
+
+template <typename T>
+__global__ void rain_pkd3_pln3_hip_tensor(T *srcPtr1,
+ T *srcPtr2,
+ uint3 srcStridesNHW,
+ T *dstPtr,
+ uint3 dstStridesNCH,
+ float *alpha,
+ RpptROIPtr roiTensorPtrSrc)
+{
+ int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8;
+ int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
+ int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z;
+
+ if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth))
+ {
+ return;
+ }
+
+ uint srcIdx1 = (id_z * srcStridesNHW.x) + ((id_y + roiTensorPtrSrc[id_z].xywhROI.xy.y) * srcStridesNHW.y) + ((id_x + roiTensorPtrSrc[id_z].xywhROI.xy.x) * 3);
+ uint srcIdx2 = ((id_y + roiTensorPtrSrc[id_z].xywhROI.xy.y) * srcStridesNHW.z) + (id_x + roiTensorPtrSrc[id_z].xywhROI.xy.x);
+ uint dstIdx = (id_z * dstStridesNCH.x) + (id_y * dstStridesNCH.z) + id_x;
+
+ float4 alpha_f4 = static_cast<float4>(alpha[id_z]);
+ d_float24 src1_f24, dst_f24;
+ d_float8 src2_f8;
+ rpp_hip_load24_pkd3_and_unpack_to_float24_pln3(srcPtr1 + srcIdx1, &src1_f24);
+ rpp_hip_load8_and_unpack_to_float8(srcPtr2 + srcIdx2, &src2_f8);
+ rain_hip_compute(srcPtr1, &src1_f24.f8[0], &src2_f8, &dst_f24.f8[0], &alpha_f4);
+ rain_hip_compute(srcPtr1, &src1_f24.f8[1], &src2_f8, &dst_f24.f8[1], &alpha_f4);
+ rain_hip_compute(srcPtr1, &src1_f24.f8[2], &src2_f8, &dst_f24.f8[2], &alpha_f4);
+ rpp_hip_pack_float24_pln3_and_store24_pln3(dstPtr + dstIdx, dstStridesNCH.y, &dst_f24);
+}
+
+template <typename T>
+__global__ void rain_pln3_pkd3_hip_tensor(T *srcPtr1,
+ T *srcPtr2,
+ uint3 srcStridesNCH,
+ T *dstPtr,
+ uint2 dstStridesNH,
+ float *alpha,
+ RpptROIPtr roiTensorPtrSrc)
+{
+ int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8;
+ int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
+ int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z;
+
+ if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth))
+ {
+ return;
+ }
+
+ uint srcIdx1 = (id_z * srcStridesNCH.x) + ((id_y + roiTensorPtrSrc[id_z].xywhROI.xy.y) * srcStridesNCH.z) + (id_x + roiTensorPtrSrc[id_z].xywhROI.xy.x);
+ uint srcIdx2 = ((id_y + roiTensorPtrSrc[id_z].xywhROI.xy.y) * srcStridesNCH.z) + (id_x + roiTensorPtrSrc[id_z].xywhROI.xy.x);
+ uint dstIdx = (id_z * dstStridesNH.x) + (id_y * dstStridesNH.y) + id_x * 3;
+
+ float4 alpha_f4 = static_cast<float4>(alpha[id_z]);
+ d_float24 src1_f24, dst_f24;
+ d_float8 src2_f8;
+ rpp_hip_load24_pln3_and_unpack_to_float24_pln3(srcPtr1 + srcIdx1, srcStridesNCH.y, &src1_f24);
+ rpp_hip_load8_and_unpack_to_float8(srcPtr2 + srcIdx2, &src2_f8);
+ rain_hip_compute(srcPtr1, &src1_f24.f8[0], &src2_f8, &dst_f24.f8[0], &alpha_f4);
+ rain_hip_compute(srcPtr1, &src1_f24.f8[1], &src2_f8, &dst_f24.f8[1], &alpha_f4);
+ rain_hip_compute(srcPtr1, &src1_f24.f8[2], &src2_f8, &dst_f24.f8[2], &alpha_f4);
+ rpp_hip_pack_float24_pln3_and_store24_pkd3(dstPtr + dstIdx, &dst_f24);
+}
+
+template <typename T>
+RppStatus hip_exec_rain_tensor(T *srcPtr,
+ RpptDescPtr srcDescPtr,
+ T *dstPtr,
+ RpptDescPtr dstDescPtr,
+ Rpp32f rainPercentage,
+ Rpp32u rainWidth,
+ Rpp32u rainHeight,
+ Rpp32f slantAngle,
+ Rpp32f *alpha,
+ RpptROIPtr roiTensorPtrSrc,
+ RpptRoiType roiType,
+ rpp::Handle& handle)
+{
+ if (roiType == RpptRoiType::LTRB)
+ hip_exec_roi_converison_ltrb_to_xywh(roiTensorPtrSrc, handle);
+
+ Rpp32f rainPercent = rainPercentage * 0.004f; // Scaling factor that converts the percentage into the fraction of pixels seeded as rain drops
+ Rpp32u numDrops = static_cast<Rpp32u>(rainPercent * srcDescPtr->h * srcDescPtr->w);
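+ // e.g., rainPercentage = 7 on a 640x480 image gives numDrops = 7 * 0.004 * 480 * 640 ≈ 8601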
+ Rpp32f slant = sin(slantAngle) * rainHeight;
+
+ // Seed the random number generator and set up the uniform distributions
+ std::mt19937 rng(std::random_device{}());
+ std::uniform_int_distribution<Rpp32u> distX(0, srcDescPtr->w - slant - 1);
+ std::uniform_int_distribution<Rpp32u> distY(0, srcDescPtr->h - rainHeight - 1);
+
+ T *rainLayer = reinterpret_cast<T *>(handle.GetInitHandle()->mem.mcpu.scratchBufferHost);
+ T initValue = 0;
+ if constexpr (std::is_same<T, Rpp8s>::value)
+ initValue = static_cast<T>(0x81); // 0x81 represents -127 in signed 8-bit integer (Rpp8s).
+ std::memset(rainLayer, initValue, srcDescPtr->w * srcDescPtr->h * sizeof(T));
+ // Choose the rain intensity value based on the data type
+ T rainValue = std::is_same<T, Rpp8u>::value ? static_cast<T>(RAIN_INTENSITY_8U) :
+ std::is_same<T, Rpp8s>::value ? static_cast<T>(RAIN_INTENSITY_8S) :
+ static_cast<T>(RAIN_INTENSITY_FLOAT);
+ Rpp32f slantPerDropLength = static_cast(slant) / rainHeight;
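+ // Each drop is rasterized as a rainHeight-row polyline: row j is shifted by
+ // j * slantPerDropLength pixels (left or right, depending on the slant sign)
+ // and widened to rainWidth pixels.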
+ for (Rpp32u i = 0; i < numDrops; i++)
+ {
+ Rpp32u xStart = distX(rng);
+ Rpp32u yStart = distY(rng);
+ for (Rpp32u j = 0; j < rainHeight; j++)
+ {
+ Rpp32u x = xStart + j * slantPerDropLength;
+ Rpp32u y = yStart + j;
+
+ if (x >= 0 && x < srcDescPtr->w && y < srcDescPtr->h)
+ {
+ T *rainLayerTemp = rainLayer + y * srcDescPtr->w + x;
+ for (Rpp32u k = 0; k < rainWidth; k++)
+ rainLayerTemp[k] = rainValue;
+ }
+ }
+ }
+
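+ // The rain layer is generated once per call on the host and uploaded to HIP scratch
+ // memory, so every image in the batch shares the same drop pattern.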
+ T *rainLayerHip = reinterpret_cast<T *>(handle.GetInitHandle()->mem.mgpu.scratchBufferHip.floatmem);
+ CHECK_RETURN_STATUS(hipMemcpyAsync(rainLayerHip, rainLayer, srcDescPtr->w * srcDescPtr->h * sizeof(T), hipMemcpyHostToDevice, handle.GetStream()));
+
+ int globalThreads_x = (dstDescPtr->w + 7) >> 3;
+ int globalThreads_y = dstDescPtr->h;
+ int globalThreads_z = dstDescPtr->n;
+
+ if ((srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NHWC))
+ {
+ hipLaunchKernelGGL(rain_pkd_hip_tensor,
+ dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)),
+ dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z),
+ 0,
+ handle.GetStream(),
+ srcPtr,
+ rainLayerHip,
+ make_uint3(srcDescPtr->strides.nStride, srcDescPtr->strides.hStride, srcDescPtr->w),
+ dstPtr,
+ make_uint2(dstDescPtr->strides.nStride, dstDescPtr->strides.hStride),
+ alpha,
+ roiTensorPtrSrc);
+ }
+ else if ((srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NCHW))
+ {
+ hipLaunchKernelGGL(rain_pln_hip_tensor,
+ dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)),
+ dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z),
+ 0,
+ handle.GetStream(),
+ srcPtr,
+ rainLayerHip,
+ make_uint3(srcDescPtr->strides.nStride, srcDescPtr->strides.cStride, srcDescPtr->strides.hStride),
+ dstPtr,
+ make_uint3(dstDescPtr->strides.nStride, dstDescPtr->strides.cStride, dstDescPtr->strides.hStride),
+ dstDescPtr->c,
+ alpha,
+ roiTensorPtrSrc);
+ }
+ else if ((srcDescPtr->c == 3) && (dstDescPtr->c == 3))
+ {
+ if ((srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NCHW))
+ {
+ hipLaunchKernelGGL(rain_pkd3_pln3_hip_tensor,
+ dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)),
+ dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z),
+ 0,
+ handle.GetStream(),
+ srcPtr,
+ rainLayerHip,
+ make_uint3(srcDescPtr->strides.nStride, srcDescPtr->strides.hStride, srcDescPtr->w),
+ dstPtr,
+ make_uint3(dstDescPtr->strides.nStride, dstDescPtr->strides.cStride, dstDescPtr->strides.hStride),
+ alpha,
+ roiTensorPtrSrc);
+ }
+ else if ((srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NHWC))
+ {
+ hipLaunchKernelGGL(rain_pln3_pkd3_hip_tensor,
+ dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)),
+ dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z),
+ 0,
+ handle.GetStream(),
+ srcPtr,
+ rainLayerHip,
+ make_uint3(srcDescPtr->strides.nStride, srcDescPtr->strides.cStride, srcDescPtr->strides.hStride),
+ dstPtr,
+ make_uint2(dstDescPtr->strides.nStride, dstDescPtr->strides.hStride),
+ alpha,
+ roiTensorPtrSrc);
+ }
+ }
+
+ return RPP_SUCCESS;
+}
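
A note on the launch geometry above: each thread handles 8 output pixels along x, which is where `(dstDescPtr->w + 7) >> 3` comes from. A small sketch of the grid arithmetic (the `kLocal*` workgroup shape is an assumption for illustration; RPP defines its own LOCAL_THREADS_* constants):

```cpp
#include <cstdio>

// Assumed workgroup shape for illustration only.
constexpr int kLocalX = 16, kLocalY = 16, kLocalZ = 1;

int main()
{
    int w = 640, h = 480, n = 4;                            // image size and batch size
    int globalThreads_x = (w + 7) >> 3;                     // 8 pixels per thread along x -> 80
    int gridX = (globalThreads_x + kLocalX - 1) / kLocalX;  // ceil(80 / 16)  = 5
    int gridY = (h + kLocalY - 1) / kLocalY;                // ceil(480 / 16) = 30
    int gridZ = (n + kLocalZ - 1) / kLocalZ;                // one z-slice per batch image -> 4
    std::printf("grid = (%d, %d, %d)\n", gridX, gridY, gridZ);
    return 0;
}
```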
diff --git a/src/modules/rppt_tensor_effects_augmentations.cpp b/src/modules/rppt_tensor_effects_augmentations.cpp
index d5862c846..ee40f9a31 100644
--- a/src/modules/rppt_tensor_effects_augmentations.cpp
+++ b/src/modules/rppt_tensor_effects_augmentations.cpp
@@ -1252,6 +1252,91 @@ RppStatus rppt_fog_host(RppPtr_t srcPtr,
return RPP_SUCCESS;
}
+/******************** rain ********************/
+
+RppStatus rppt_rain_host(RppPtr_t srcPtr,
+ RpptDescPtr srcDescPtr,
+ RppPtr_t dstPtr,
+ RpptDescPtr dstDescPtr,
+ Rpp32f rainPercentage,
+ Rpp32u rainWidth,
+ Rpp32u rainHeight,
+ Rpp32f slantAngle,
+ Rpp32f *alpha,
+ RpptROIPtr roiTensorPtrSrc,
+ RpptRoiType roiType,
+ rppHandle_t rppHandle)
+{
+ RppLayoutParams layoutParams = get_layout_params(srcDescPtr->layout, srcDescPtr->c);
+
+ if ((srcDescPtr->dataType == RpptDataType::U8) && (dstDescPtr->dataType == RpptDataType::U8))
+ {
+ rain_u8_u8_host_tensor(static_cast<Rpp8u *>(srcPtr) + srcDescPtr->offsetInBytes,
+ srcDescPtr,
+ static_cast<Rpp8u *>(dstPtr) + dstDescPtr->offsetInBytes,
+ dstDescPtr,
+ rainPercentage,
+ rainWidth,
+ rainHeight,
+ slantAngle,
+ alpha,
+ roiTensorPtrSrc,
+ roiType,
+ layoutParams,
+ rpp::deref(rppHandle));
+ }
+ else if ((srcDescPtr->dataType == RpptDataType::F16) && (dstDescPtr->dataType == RpptDataType::F16))
+ {
+ rain_f16_f16_host_tensor(reinterpret_cast<Rpp16f *>(static_cast<Rpp8u *>(srcPtr) + srcDescPtr->offsetInBytes),
+ srcDescPtr,
+ reinterpret_cast<Rpp16f *>(static_cast<Rpp8u *>(dstPtr) + dstDescPtr->offsetInBytes),
+ dstDescPtr,
+ rainPercentage,
+ rainWidth,
+ rainHeight,
+ slantAngle,
+ alpha,
+ roiTensorPtrSrc,
+ roiType,
+ layoutParams,
+ rpp::deref(rppHandle));
+ }
+ else if ((srcDescPtr->dataType == RpptDataType::F32) && (dstDescPtr->dataType == RpptDataType::F32))
+ {
+ rain_f32_f32_host_tensor(reinterpret_cast<Rpp32f *>(static_cast<Rpp8u *>(srcPtr) + srcDescPtr->offsetInBytes),
+ srcDescPtr,
+ reinterpret_cast<Rpp32f *>(static_cast<Rpp8u *>(dstPtr) + dstDescPtr->offsetInBytes),
+ dstDescPtr,
+ rainPercentage,
+ rainWidth,
+ rainHeight,
+ slantAngle,
+ alpha,
+ roiTensorPtrSrc,
+ roiType,
+ layoutParams,
+ rpp::deref(rppHandle));
+ }
+ else if ((srcDescPtr->dataType == RpptDataType::I8) && (dstDescPtr->dataType == RpptDataType::I8))
+ {
+ rain_i8_i8_host_tensor(static_cast<Rpp8s *>(srcPtr) + srcDescPtr->offsetInBytes,
+ srcDescPtr,
+ static_cast<Rpp8s *>(dstPtr) + dstDescPtr->offsetInBytes,
+ dstDescPtr,
+ rainPercentage,
+ rainWidth,
+ rainHeight,
+ slantAngle,
+ alpha,
+ roiTensorPtrSrc,
+ roiType,
+ layoutParams,
+ rpp::deref(rppHandle));
+ }
+
+ return RPP_SUCCESS;
+}
+
/********************************************************************************************************************/
/*********************************************** RPP_GPU_SUPPORT = ON ***********************************************/
/********************************************************************************************************************/
@@ -2572,4 +2657,87 @@ RppStatus rppt_fog_gpu(RppPtr_t srcPtr,
#endif // backend
}
+/******************** rain ********************/
+
+RppStatus rppt_rain_gpu(RppPtr_t srcPtr,
+ RpptDescPtr srcDescPtr,
+ RppPtr_t dstPtr,
+ RpptDescPtr dstDescPtr,
+ Rpp32f rainPercentage,
+ Rpp32u rainWidth,
+ Rpp32u rainHeight,
+ Rpp32f slantAngle,
+ Rpp32f *alpha,
+ RpptROIPtr roiTensorPtrSrc,
+ RpptRoiType roiType,
+ rppHandle_t rppHandle)
+{
+#ifdef HIP_COMPILE
+ if ((srcDescPtr->dataType == RpptDataType::U8) && (dstDescPtr->dataType == RpptDataType::U8))
+ {
+ hip_exec_rain_tensor(static_cast<Rpp8u *>(srcPtr) + srcDescPtr->offsetInBytes,
+ srcDescPtr,
+ static_cast<Rpp8u *>(dstPtr) + dstDescPtr->offsetInBytes,
+ dstDescPtr,
+ rainPercentage,
+ rainWidth,
+ rainHeight,
+ slantAngle,
+ alpha,
+ roiTensorPtrSrc,
+ roiType,
+ rpp::deref(rppHandle));
+ }
+ else if ((srcDescPtr->dataType == RpptDataType::F16) && (dstDescPtr->dataType == RpptDataType::F16))
+ {
+ hip_exec_rain_tensor(reinterpret_cast<half *>(static_cast<Rpp8u *>(srcPtr) + srcDescPtr->offsetInBytes),
+ srcDescPtr,
+ reinterpret_cast<half *>(static_cast<Rpp8u *>(dstPtr) + dstDescPtr->offsetInBytes),
+ dstDescPtr,
+ rainPercentage,
+ rainWidth,
+ rainHeight,
+ slantAngle,
+ alpha,
+ roiTensorPtrSrc,
+ roiType,
+ rpp::deref(rppHandle));
+ }
+ else if ((srcDescPtr->dataType == RpptDataType::F32) && (dstDescPtr->dataType == RpptDataType::F32))
+ {
+ hip_exec_rain_tensor(reinterpret_cast<Rpp32f *>(static_cast<Rpp8u *>(srcPtr) + srcDescPtr->offsetInBytes),
+ srcDescPtr,
+ reinterpret_cast<Rpp32f *>(static_cast<Rpp8u *>(dstPtr) + dstDescPtr->offsetInBytes),
+ dstDescPtr,
+ rainPercentage,
+ rainWidth,
+ rainHeight,
+ slantAngle,
+ alpha,
+ roiTensorPtrSrc,
+ roiType,
+ rpp::deref(rppHandle));
+ }
+ else if ((srcDescPtr->dataType == RpptDataType::I8) && (dstDescPtr->dataType == RpptDataType::I8))
+ {
+ hip_exec_rain_tensor(static_cast<Rpp8s *>(srcPtr) + srcDescPtr->offsetInBytes,
+ srcDescPtr,
+ static_cast<Rpp8s *>(dstPtr) + dstDescPtr->offsetInBytes,
+ dstDescPtr,
+ rainPercentage,
+ rainWidth,
+ rainHeight,
+ slantAngle,
+ alpha,
+ roiTensorPtrSrc,
+ roiType,
+ rpp::deref(rppHandle));
+ }
+
+ return RPP_SUCCESS;
+#elif defined(OCL_COMPILE)
+ return RPP_ERROR_NOT_IMPLEMENTED;
+#endif // backend
+}
+
#endif // GPU_SUPPORT
diff --git a/utilities/test_suite/HIP/Tensor_image_hip.cpp b/utilities/test_suite/HIP/Tensor_image_hip.cpp
index 2f86282c5..1f50908ba 100644
--- a/utilities/test_suite/HIP/Tensor_image_hip.cpp
+++ b/utilities/test_suite/HIP/Tensor_image_hip.cpp
@@ -63,7 +63,7 @@ int main(int argc, char **argv)
bool additionalParamCase = (testCase == 8 || testCase == 21 || testCase == 23|| testCase == 24 || testCase == 28 || testCase == 40 || testCase == 41 || testCase == 49 || testCase == 54 || testCase == 79);
bool kernelSizeCase = (testCase == 40 || testCase == 41 || testCase == 49 || testCase == 54);
bool dualInputCase = (testCase == 2 || testCase == 30 || testCase == 33 || testCase == 61 || testCase == 63 || testCase == 65 || testCase == 68);
- bool randomOutputCase = (testCase == 6 || testCase == 8 || testCase == 10 || testCase == 84 || testCase == 49 || testCase == 54);
+ bool randomOutputCase = (testCase == 6 || testCase == 8 || testCase == 10 || testCase == 11 || testCase == 84 || testCase == 49 || testCase == 54);
bool nonQACase = (testCase == 24 || testCase == 28 || testCase == 54);
bool interpolationTypeCase = (testCase == 21 || testCase == 23 || testCase == 24|| testCase == 28 || testCase == 79);
bool reductionTypeCase = (testCase == 87 || testCase == 88 || testCase == 89 || testCase == 90 || testCase == 91);
@@ -427,11 +427,15 @@ int main(int argc, char **argv)
void *d_interDstPtr;
if(testCase == 5)
CHECK_RETURN_STATUS(hipHostMalloc(&d_interDstPtr, srcDescPtr->strides.nStride * srcDescPtr->n * sizeof(Rpp32f)));
-
+
Rpp32f *perspectiveTensorPtr = NULL;
if(testCase == 28)
CHECK_RETURN_STATUS(hipHostMalloc(&perspectiveTensorPtr, batchSize * 9 * sizeof(Rpp32f)));
+ Rpp32f *alpha = nullptr;
+ if(testCase == 11)
+ CHECK_RETURN_STATUS(hipHostMalloc(&alpha, batchSize * sizeof(Rpp32f)));
+
Rpp32f *minTensor = nullptr, *maxTensor = nullptr;
if(testCase == 15)
{
@@ -710,6 +714,25 @@ int main(int argc, char **argv)
break;
}
+ case 11:
+ {
+ testCaseName = "rain";
+
+ Rpp32f rainPercentage = 7;
+ Rpp32u rainHeight = 6;
+ Rpp32u rainWidth = 1;
+ Rpp32f slantAngle = 0;
+ for (int i = 0; i < batchSize; i++)
+ alpha[i] = 0.4;
+
+ startWallTime = omp_get_wtime();
+ if (inputBitDepth == 0 || inputBitDepth == 1 || inputBitDepth == 2 || inputBitDepth == 5)
+ rppt_rain_gpu(d_input, srcDescPtr, d_output, dstDescPtr, rainPercentage, rainWidth, rainHeight, slantAngle, alpha, roiTensorPtrSrc, roiTypeSrc, handle);
+ else
+ missingFuncFlag = 1;
+
+ break;
+ }
case 13:
{
testCaseName = "exposure";
@@ -1690,7 +1713,7 @@ int main(int argc, char **argv)
// Calculate exact dstROI in XYWH format for OpenCV dump
if (roiTypeSrc == RpptRoiType::LTRB)
convert_roi(roiTensorPtrDst, RpptRoiType::XYWH, dstDescPtr->n);
-
+
// Check if the ROI values for each input is within the bounds of the max buffer allocated
RpptROI roiDefault;
RpptROIPtr roiPtrDefault = &roiDefault;
@@ -1791,6 +1814,8 @@ int main(int argc, char **argv)
CHECK_RETURN_STATUS(hipFree(d_output));
if(testCase == 5)
CHECK_RETURN_STATUS(hipFree(d_interDstPtr));
+ if(alpha != nullptr)
+ CHECK_RETURN_STATUS(hipHostFree(alpha));
if (minTensor != nullptr)
CHECK_RETURN_STATUS(hipHostFree(minTensor));
if (maxTensor != nullptr)
diff --git a/utilities/test_suite/HIP/runImageTests.py b/utilities/test_suite/HIP/runImageTests.py
index 13a5d4c58..0e15f1eac 100644
--- a/utilities/test_suite/HIP/runImageTests.py
+++ b/utilities/test_suite/HIP/runImageTests.py
@@ -272,7 +272,7 @@ def rpp_test_suite_parser_and_validator():
subprocess.call(["make", "-j16"], cwd=".") # nosec
# List of cases supported
-supportedCaseList = ['0', '1', '2', '4', '5', '6', '8', '10', '13', '15', '20', '21', '23', '24', '26', '28', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '45', '46', '49', '54', '61', '63', '65', '68', '70', '79', '80', '82', '83', '84', '85', '86', '87', '88', '89', '90', '91', '92']
+supportedCaseList = ['0', '1', '2', '4', '5', '6', '8', '10', '11', '13', '15', '20', '21', '23', '24', '26', '28', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '45', '46', '49', '54', '61', '63', '65', '68', '70', '79', '80', '82', '83', '84', '85', '86', '87', '88', '89', '90', '91', '92']
# Create folders based on testType and profilingOption
if testType == 1 and profilingOption == "YES":
@@ -478,7 +478,7 @@ def rpp_test_suite_parser_and_validator():
print_performance_tests_summary(logFile, functionalityGroupList, numRuns)
# print the results of qa tests
-nonQACaseList = ['6', '8', '10', '24', '28', '54', '84'] # Add cases present in supportedCaseList, but without QA support
+nonQACaseList = ['6', '8', '10', '11', '24', '28', '54', '84'] # Add cases present in supportedCaseList, but without QA support
if qaMode and testType == 0:
qaFilePath = os.path.join(outFilePath, "QA_results.txt")
diff --git a/utilities/test_suite/HOST/Tensor_image_host.cpp b/utilities/test_suite/HOST/Tensor_image_host.cpp
index bed1ba467..8914e576c 100644
--- a/utilities/test_suite/HOST/Tensor_image_host.cpp
+++ b/utilities/test_suite/HOST/Tensor_image_host.cpp
@@ -64,8 +64,8 @@ int main(int argc, char **argv)
bool kernelSizeCase = (testCase == 49 || testCase == 54);
bool dualInputCase = (testCase == 2 || testCase == 30 || testCase == 33 || testCase == 61 || testCase == 63 || testCase == 65 || testCase == 68);
bool randomOutputCase = (testCase == 6 || testCase == 8 || testCase == 10 || testCase == 84);
- bool nonQACase = (testCase == 24 || testCase == 28);
- bool interpolationTypeCase = (testCase == 21 || testCase == 23 || testCase == 24 || testCase == 28 || testCase == 79);
+ bool nonQACase = (testCase == 24);
+ bool interpolationTypeCase = (testCase == 21 || testCase == 23 || testCase == 24 || testCase == 79);
bool reductionTypeCase = (testCase == 87 || testCase == 88 || testCase == 89 || testCase == 90 || testCase == 91);
bool noiseTypeCase = (testCase == 8);
bool pln1OutTypeCase = (testCase == 86);
@@ -655,6 +655,27 @@ int main(int argc, char **argv)
break;
}
+ case 11:
+ {
+ testCaseName = "rain";
+
+ Rpp32f rainPercentage = 7;
+ Rpp32u rainHeight = 6;
+ Rpp32u rainWidth = 1;
+ Rpp32f slantAngle = 0;
+ Rpp32f alpha[batchSize];
+ for (int i = 0; i < batchSize; i++)
+ alpha[i] = 0.4;
+
+ startWallTime = omp_get_wtime();
+ startCpuTime = clock();
+ if (inputBitDepth == 0 || inputBitDepth == 1 || inputBitDepth == 2 || inputBitDepth == 5)
+ rppt_rain_host(input, srcDescPtr, output, dstDescPtr, rainPercentage, rainWidth, rainHeight, slantAngle, alpha, roiTensorPtrSrc, roiTypeSrc, handle);
+ else
+ missingFuncFlag = 1;
+
+ break;
+ }
case 13:
{
testCaseName = "exposure";
diff --git a/utilities/test_suite/HOST/runImageTests.py b/utilities/test_suite/HOST/runImageTests.py
index 9b14af45b..b50f20819 100644
--- a/utilities/test_suite/HOST/runImageTests.py
+++ b/utilities/test_suite/HOST/runImageTests.py
@@ -260,7 +260,7 @@ def rpp_test_suite_parser_and_validator():
subprocess.call(["make", "-j16"], cwd=".") # nosec
# List of cases supported
-supportedCaseList = ['0', '1', '2', '4', '5', '6', '8', '10', '13', '15', '20', '21', '23', '24', '26', '28', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '45', '46', '49', '54', '61', '63', '65', '68', '70', '79', '80', '81', '82', '83', '84', '85', '86', '87', '88', '89', '90', '91', '92']
+supportedCaseList = ['0', '1', '2', '4', '5', '6', '8', '10', '11', '13', '15', '20', '21', '23', '24', '26', '28', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '45', '46', '49', '54', '61', '63', '65', '68', '70', '79', '80', '81', '82', '83', '84', '85', '86', '87', '88', '89', '90', '91', '92']
if testType == 0:
noCaseSupported = all(case not in supportedCaseList for case in caseList)
@@ -314,7 +314,7 @@ def rpp_test_suite_parser_and_validator():
run_performance_test(loggingFolder, logFileLayout, srcPath1, srcPath2, dstPath, case, numRuns, testType, layout, qaMode, decoderType, batchSize, roiList)
# print the results of qa tests
-nonQACaseList = ['6', '8', '10', '24', '28', '54', '84'] # Add cases present in supportedCaseList, but without QA support
+nonQACaseList = ['6', '8', '10', '11', '24', '28', '54', '84'] # Add cases present in supportedCaseList, but without QA support
if qaMode and testType == 0:
qaFilePath = os.path.join(outFilePath, "QA_results.txt")
diff --git a/utilities/test_suite/common.py b/utilities/test_suite/common.py
index 83a0df959..ab896c191 100644
--- a/utilities/test_suite/common.py
+++ b/utilities/test_suite/common.py
@@ -44,6 +44,7 @@
6: ["jitter", "HOST", "HIP"],
8: ["noise", "HOST", "HIP"],
10: ["fog", "HOST", "HIP"],
+ 11: ["rain", "HOST", "HIP"],
13: ["exposure", "HOST", "HIP"],
15: ["threshold", "HOST", "HIP"],
20: ["flip", "HOST", "HIP"],
@@ -117,7 +118,7 @@
ImageAugmentationGroupMap = {
"color_augmentations" : [0, 1, 2, 3, 4, 13, 31, 34, 36, 45, 81],
- "effects_augmentations" : [5, 6, 8, 10, 29, 30, 32, 35, 46, 82, 83, 84],
+ "effects_augmentations" : [5, 6, 8, 10, 11, 29, 30, 32, 35, 46, 82, 83, 84],
"geometric_augmentations" : [20, 21, 23, 24, 26, 28, 33, 37, 38, 39, 63, 79, 80, 92],
"filter_augmentations" : [49, 54],
"arithmetic_operations" : [61],
diff --git a/utilities/test_suite/rpp_test_suite_image.h b/utilities/test_suite/rpp_test_suite_image.h
index 457f3e8de..48d0dd778 100644
--- a/utilities/test_suite/rpp_test_suite_image.h
+++ b/utilities/test_suite/rpp_test_suite_image.h
@@ -72,6 +72,7 @@ std::map augmentationMap =
{6, "jitter"},
{8, "noise"},
{10, "fog"},
+ {11, "rain"},
{13, "exposure"},
{15, "threshold"},
{20, "flip"},