diff --git a/CHANGELOG.md b/CHANGELOG.md
index 00fa320ee..d20747386 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -10,6 +10,7 @@ Full documentation for RPP is available at [https://rocm.docs.amd.com/projects/r
* RPP Fog augmentation on HOST and HIP
* RPP Rain augmentation on HOST and HIP
* RPP Warp Perspective on HOST and HIP
+* RPP Tensor Exclusive-Or support on HOST and HIP
* RPP Threshold on HOST and HIP
## (Unreleased) RPP 1.9.4
@@ -24,7 +25,7 @@ Full documentation for RPP is available at [https://rocm.docs.amd.com/projects/r
* RPP Audio Support HIP - Spectrogram
-## (Unreleased) RPP 1.9.2
+## (Unreleased) RPP 1.9.2
### Changed
diff --git a/docs/data/doxygenOutputs/logical_operations_exclusive_or_img150x150.png b/docs/data/doxygenOutputs/logical_operations_exclusive_or_img150x150.png
new file mode 100644
index 000000000..78ec2feca
Binary files /dev/null and b/docs/data/doxygenOutputs/logical_operations_exclusive_or_img150x150.png differ
diff --git a/include/rppt_tensor_logical_operations.h b/include/rppt_tensor_logical_operations.h
index 28dff69ce..730eb4fa0 100644
--- a/include/rppt_tensor_logical_operations.h
+++ b/include/rppt_tensor_logical_operations.h
@@ -86,6 +86,50 @@ RppStatus rppt_bitwise_and_host(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPtr
RppStatus rppt_bitwise_and_gpu(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle);
#endif // GPU_SUPPORT
+/*! \brief Exclusive OR computation on HOST backend for a NCHW/NHWC layout tensor
+ * \details This function computes exclusive OR of corresponding pixels for a batch of RGB(3 channel) / greyscale(1 channel) images with an NHWC/NCHW tensor layout.
+ * srcPtr depth ranges - Rpp8u (0 to 255), Rpp16f (0 to 1), Rpp32f (0 to 1), Rpp8s (-128 to 127).
+ * dstPtr depth ranges - Will be same depth as srcPtr.
+ * \image html img150x150.png Sample Input1
+ * \image html img150x150_2.png Sample Input2
+ * \image html logical_operations_exclusive_or_img150x150.png Sample Output
+ * \param [in] srcPtr1 source1 tensor in HOST memory
+ * \param [in] srcPtr2 source2 tensor in HOST memory
+ * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3)
+ * \param [out] dstPtr destination tensor in HOST memory
+ * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr)
+ * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
+ * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize()
+ * \return A \ref RppStatus enumeration.
+ * \retval RPP_SUCCESS Successful completion.
+ * \retval RPP_ERROR* Unsuccessful completion.
+ */
+RppStatus rppt_exclusive_or_host(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle);
+
+#ifdef GPU_SUPPORT
+/*! \brief Exclusive OR computation on HIP backend for a NCHW/NHWC layout tensor
+ * \details This function computes exclusive OR of corresponding pixels for a batch of RGB(3 channel) / greyscale(1 channel) images with an NHWC/NCHW tensor layout.
+ * srcPtr depth ranges - Rpp8u (0 to 255), Rpp16f (0 to 1), Rpp32f (0 to 1), Rpp8s (-128 to 127).
+ * dstPtr depth ranges - Will be same depth as srcPtr.
+ * \image html img150x150.png Sample Input1
+ * \image html img150x150_2.png Sample Input2
+ * \image html logical_operations_exclusive_or_img150x150.png Sample Output
+ * \param [in] srcPtr1 source1 tensor in HIP memory
+ * \param [in] srcPtr2 source2 tensor in HIP memory
+ * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3)
+ * \param [out] dstPtr destination tensor in HIP memory
+ * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr)
+ * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
+ * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize()
+ * \return A \ref RppStatus enumeration.
+ * \retval RPP_SUCCESS Successful completion.
+ * \retval RPP_ERROR* Unsuccessful completion.
+ */
+RppStatus rppt_exclusive_or_gpu(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle);
+#endif // GPU_SUPPORT
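+
+/* Illustrative usage sketch only (not part of the documented API surface): buffers, descriptors, the ROI
+   array, and the handle below are placeholders, assumed to be set up exactly as for the other logical
+   operations in this header.
+       rppt_exclusive_or_host(srcPtr1, srcPtr2, srcDescPtr, dstPtr, dstDescPtr, roiTensorPtrSrc, RpptRoiType::XYWH, rppHandle);
+*/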
+
/*! \brief Bitwise OR computation on HOST backend for a NCHW/NHWC layout tensor
* \details This function computes bitwise OR of corresponding pixels for a batch of RGB(3 channel) / greyscale(1 channel) images with an NHWC/NCHW tensor layout.
* srcPtr depth ranges - Rpp8u (0 to 255), Rpp16f (0 to 1), Rpp32f (0 to 1), Rpp8s (-128 to 127).
diff --git a/src/include/cpu/rpp_cpu_simd.hpp b/src/include/cpu/rpp_cpu_simd.hpp
index c58f207d3..b02cd23b5 100644
--- a/src/include/cpu/rpp_cpu_simd.hpp
+++ b/src/include/cpu/rpp_cpu_simd.hpp
@@ -470,6 +470,30 @@ inline void rpp_load48_u8pkd3_to_u8pln3(Rpp8u *srcPtr, __m128i *px)
px[2] = _mm_shuffle_epi8(_mm_unpacklo_epi8(pxSrc[6], pxSrc[7]), pxMaskRGB); /* unpack 8 lo-pixels of pxSrc[6] and pxSrc[7] to get B01-16 */
}
+inline void rpp_load96_u8pkd3_to_u8pln3(Rpp8u *srcPtr, __m256i *px)
+{
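+    // AVX2 counterpart of rpp_load48_u8pkd3_to_u8pln3 - de-interleaves 96 packed U8 values (32 RGB pixels) into three planar 256-bit registers: px[0] = R01-32, px[1] = G01-32, px[2] = B01-32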
+ __m256i pxSrc[8];
+ __m256i pxMask = _mm256_castsi128_si256(_mm_setr_epi8(0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11, 12, 13, 14, 15));
+ pxMask = _mm256_permute2f128_si256(pxMask, pxMask, 0);
+ __m256i pxMaskRGB = _mm256_castsi128_si256(_mm_setr_epi8(0, 4, 8, 12, 2, 6, 10, 14, 1, 5, 9, 13, 3, 7, 11, 15));
+ pxMaskRGB = _mm256_permute2f128_si256(pxMaskRGB, pxMaskRGB, 0);
+ pxSrc[0] = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i *)srcPtr)), _mm_loadu_si128((__m128i *)(srcPtr + 48)), 1); /* load [R01|G01|B01|R02|G02|B02|R03|G03|B03|R04|G04|B04|R05|G05|B05|R06|R17|G17|B17|R18|G18|B18|R19|G19|B19|R20|G20|B20|R21|G21|B21|R22] - Need RGB 01-04, 17-20 */
+ pxSrc[1] = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(srcPtr + 12))), _mm_loadu_si128((__m128i *)(srcPtr + 60)), 1); /* load [R05|G05|B05|R06|G06|B06|R07|G07|B07|R08|G08|B08|R09|G09|B09|R10|R21|G21|B21|R22|G22|B22|R23|G23|B23|R24|G24|B24|R25|G25|B25|R26] - Need RGB 05-08, 21-24 */
+ pxSrc[2] = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(srcPtr + 24))), _mm_loadu_si128((__m128i *)(srcPtr + 72)), 1); /* load [R09|G09|B09|R10|G10|B10|R11|G11|B11|R12|G12|B12|R13|G13|B13|R14|R25|G25|B25|R26|G26|B26|R27|G27|B27|R28|G28|B28|R29|G29|B29|R30] - Need RGB 09-12, 25-28 */
+ pxSrc[3] = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(srcPtr + 36))), _mm_loadu_si128((__m128i *)(srcPtr + 84)), 1); /* load [R13|G13|B13|R14|G14|B14|R15|G15|B15|R16|G16|B16|R17|G17|B17|R18|R29|G29|B29|R30|G30|B30|R31|G31|B31|R32|G32|B32|R33|G33|B33|R34] - Need RGB 13-16, 29-32 */
+ pxSrc[4] = _mm256_shuffle_epi8(pxSrc[0], pxMask); /* shuffle to get [R01|R02|R03|R04|G01|G02|G03|G04 || B01|B02|B03|B04|R05|G05|B05|R06 || R17|R18|R19|R20|G17|G18|G19|G20 || B17|B18|B19|B20|R21|G21|B21|R22] - Need R01-04, G01-04, B01-04, R17-20, G17-20, B17-20 */
+ pxSrc[5] = _mm256_shuffle_epi8(pxSrc[1], pxMask); /* shuffle to get [R05|R06|R07|R08|G05|G06|G07|G08 || B05|B06|B07|B08|R09|G09|B09|R10 || R21|R22|R23|R24|G21|G22|G23|G24 || B21|B22|B23|B24|R25|G25|B25|R26] - Need R05-08, G05-08, B05-08, R21-24, G21-24, B21-24 */
+ pxSrc[6] = _mm256_shuffle_epi8(pxSrc[2], pxMask); /* shuffle to get [R09|R10|R11|R12|G09|G10|G11|G12 || B09|B10|B11|B12|R13|G13|B13|R14 || R25|R26|R27|R28|G25|G26|G27|G28 || B25|B26|B27|B28|R29|G29|B29|R30] - Need R09-12, G09-12, B09-12, R25-28, G25-28, B25-28 */
+ pxSrc[7] = _mm256_shuffle_epi8(pxSrc[3], pxMask); /* shuffle to get [R13|R14|R15|R16|G13|G14|G15|G16 || B13|B14|B15|B16|R17|G17|B17|R18 || R29|R30|R31|R32|G29|G30|G31|G32 || B29|B30|B31|B32|R33|G33|B33|R34] - Need R13-16, G13-16, B13-16, R29-32, G29-32, B29-32 */
+ pxSrc[0] = _mm256_unpacklo_epi8(pxSrc[4], pxSrc[5]); /* unpack 8 lo-pixels of pxSrc[4] and pxSrc[5] */
+ pxSrc[1] = _mm256_unpacklo_epi8(pxSrc[6], pxSrc[7]); /* unpack 8 lo-pixels of pxSrc[6] and pxSrc[7] */
+ pxSrc[2] = _mm256_unpackhi_epi8(pxSrc[4], pxSrc[5]); /* unpack 8 hi-pixels of pxSrc[4] and pxSrc[5] */
+ pxSrc[3] = _mm256_unpackhi_epi8(pxSrc[6], pxSrc[7]); /* unpack 8 hi-pixels of pxSrc[6] and pxSrc[7] */
+    px[0] = _mm256_shuffle_epi8(_mm256_unpacklo_epi8(pxSrc[0], pxSrc[1]), pxMaskRGB); /* unpack lo-pixels of pxSrc[0] and pxSrc[1] in each 128-bit lane to get R01-32 */
+    px[1] = _mm256_shuffle_epi8(_mm256_unpackhi_epi8(pxSrc[0], pxSrc[1]), pxMaskRGB); /* unpack hi-pixels of pxSrc[0] and pxSrc[1] in each 128-bit lane to get G01-32 */
+    px[2] = _mm256_shuffle_epi8(_mm256_unpacklo_epi8(pxSrc[2], pxSrc[3]), pxMaskRGB); /* unpack lo-pixels of pxSrc[2] and pxSrc[3] in each 128-bit lane to get B01-32 */
+}
+
inline void rpp_store48_u8pln3_to_u8pln3(Rpp8u *dstPtrR, Rpp8u *dstPtrG, Rpp8u *dstPtrB, __m128i *px)
{
_mm_storeu_si128((__m128i *)dstPtrR, px[0]); /* store [R01|R02|R03|R04|R05|R06|R07|R08|R09|R10|R11|R12|R13|R14|R15|R16] */
@@ -477,6 +501,13 @@ inline void rpp_store48_u8pln3_to_u8pln3(Rpp8u *dstPtrR, Rpp8u *dstPtrG, Rpp8u *
_mm_storeu_si128((__m128i *)dstPtrB, px[2]); /* store [B01|B02|B03|B04|B05|B06|B07|B08|B09|B10|B11|B12|B13|B14|B15|B16] */
}
+inline void rpp_store96_u8pln3_to_u8pln3(Rpp8u *dstPtrR, Rpp8u *dstPtrG, Rpp8u *dstPtrB, __m256i *px)
+{
+ _mm256_storeu_si256((__m256i *)dstPtrR, px[0]); /* store [R01|R02|R03|R04|R05|R06|R07|R08|R09|R10|R11|R12|R13|R14|R15|R16|R17|R18|R19|R20|R21|R22|R23|R24|R25|R26|R27|R28|R29|R30|R31|R32] */
+ _mm256_storeu_si256((__m256i *)dstPtrG, px[1]); /* store [G01|G02|G03|G04|G05|G06|G07|G08|G09|G10|G11|G12|G13|G14|G15|G16|G17|G18|G19|G20|G21|G22|G23|G24|G25|G26|G27|G28|G29|G30|G31|G32] */
+ _mm256_storeu_si256((__m256i *)dstPtrB, px[2]); /* store [B01|B02|B03|B04|B05|B06|B07|B08|B09|B10|B11|B12|B13|B14|B15|B16|B17|B18|B19|B20|B21|B22|B23|B24|B25|B26|B27|B28|B29|B30|B31|B32] */
+}
+
inline void rpp_load48_u8pln3_to_u8pln3(Rpp8u *srcPtrR, Rpp8u *srcPtrG, Rpp8u *srcPtrB, __m128i *px)
{
px[0] = _mm_loadu_si128((__m128i *)srcPtrR); /* load [R01|R02|R03|R04|R05|R06|R07|R08|R09|R10|R11|R12|R13|R14|R15|R16] */
@@ -499,6 +530,29 @@ inline void rpp_store48_u8pln3_to_u8pkd3(Rpp8u *dstPtr, __m128i *px)
_mm_storeu_si128((__m128i *)(dstPtr + 36), _mm_shuffle_epi8(_mm_unpackhi_epi8(pxDst[3], pxDst[1]), pxMaskRGBAtoRGB)); /* store [R13|G13|B13|R14|G14|B14|R15|G15|B15|R16|G16|B16|00|00|00|00] */
}
+inline void rpp_store96_u8pln3_to_u8pkd3(Rpp8u *dstPtr, __m256i *px)
+{
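+    // AVX2 counterpart of rpp_store48_u8pln3_to_u8pkd3 - re-interleaves three planar 256-bit registers (R01-32, G01-32, B01-32) into 96 packed U8 values (32 RGB pixels)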
+ __m256i pxDst[8];
+ __m256i pxMaskRGBAtoRGB = _mm256_castsi128_si256(_mm_setr_epi8(0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 3, 7, 11, 15));
+ pxMaskRGBAtoRGB = _mm256_permute2f128_si256(pxMaskRGBAtoRGB, pxMaskRGBAtoRGB, 0);
+ pxDst[0] = _mm256_unpacklo_epi8(px[1], avx_px0);
+ pxDst[1] = _mm256_unpackhi_epi8(px[1], avx_px0);
+ pxDst[2] = _mm256_unpacklo_epi8(px[0], px[2]);
+ pxDst[3] = _mm256_unpackhi_epi8(px[0], px[2]);
+ pxDst[4] = _mm256_shuffle_epi8(_mm256_unpacklo_epi8(pxDst[2], pxDst[0]), pxMaskRGBAtoRGB);
+ pxDst[5] = _mm256_shuffle_epi8(_mm256_unpackhi_epi8(pxDst[2], pxDst[0]), pxMaskRGBAtoRGB);
+ pxDst[6] = _mm256_shuffle_epi8(_mm256_unpacklo_epi8(pxDst[3], pxDst[1]), pxMaskRGBAtoRGB);
+ pxDst[7] = _mm256_shuffle_epi8(_mm256_unpackhi_epi8(pxDst[3], pxDst[1]), pxMaskRGBAtoRGB);
+ _mm_storeu_si128((__m128i *)dstPtr, _mm256_castsi256_si128(pxDst[4])); /* store [R01|G01|B01|R02|G02|B02|R03|G03|B03|R04|G04|B04|00|00|00|00] */
+ _mm_storeu_si128((__m128i *)(dstPtr + 12), _mm256_castsi256_si128(pxDst[5])); /* store [R05|G05|B05|R06|G06|B06|R07|G07|B07|R08|G08|B08|00|00|00|00] */
+ _mm_storeu_si128((__m128i *)(dstPtr + 24), _mm256_castsi256_si128(pxDst[6])); /* store [R09|G09|B09|R10|G10|B10|R11|G11|B11|R12|G12|B12|00|00|00|00] */
+ _mm_storeu_si128((__m128i *)(dstPtr + 36), _mm256_castsi256_si128(pxDst[7])); /* store [R13|G13|B13|R14|G14|B14|R15|G15|B15|R16|G16|B16|00|00|00|00] */
+ _mm_storeu_si128((__m128i *)(dstPtr + 48), _mm256_extractf128_si256(pxDst[4], 1)); /* store [R17|G17|B17|R18|G18|B18|R19|G19|B19|R20|G20|B20|00|00|00|00] */
+ _mm_storeu_si128((__m128i *)(dstPtr + 60), _mm256_extractf128_si256(pxDst[5], 1)); /* store [R21|G21|B21|R22|G22|B22|R23|G23|B23|R24|G24|B24|00|00|00|00] */
+ _mm_storeu_si128((__m128i *)(dstPtr + 72), _mm256_extractf128_si256(pxDst[6], 1)); /* store [R25|G25|B25|R26|G26|B26|R27|G27|B27|R28|G28|B28|00|00|00|00] */
+ _mm_storeu_si128((__m128i *)(dstPtr + 84), _mm256_extractf128_si256(pxDst[7], 1)); /* store [R29|G29|B29|R30|G30|B30|R31|G31|B31|R32|G32|B32|00|00|00|00] */
+}
+
inline void rpp_load16_u8_to_f32(Rpp8u *srcPtr, __m128 *p)
{
__m128i px = _mm_loadu_si128((__m128i *)srcPtr); /* load pixels 0-15 */
@@ -765,6 +819,30 @@ inline void rpp_load48_i8pkd3_to_u8pln3(Rpp8s *srcPtr, __m128i *px)
px[2] = _mm_add_epi8(xmm_pxConvertI8, _mm_shuffle_epi8(_mm_unpacklo_epi8(pxSrc[6], pxSrc[7]), pxMaskRGB)); /* unpack 8 lo-pixels of pxSrc[6] and pxSrc[7] to get B01-16 and add 128 to get u8 from i8 */
}
+inline void rpp_load96_i8pkd3_to_u8pln3(Rpp8s *srcPtr, __m256i *px)
+{
+ __m256i pxSrc[8];
+ __m256i pxMask = _mm256_castsi128_si256(_mm_setr_epi8(0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11, 12, 13, 14, 15));
+ pxMask = _mm256_permute2f128_si256(pxMask, pxMask, 0);
+ __m256i pxMaskRGB = _mm256_castsi128_si256(_mm_setr_epi8(0, 4, 8, 12, 2, 6, 10, 14, 1, 5, 9, 13, 3, 7, 11, 15));
+ pxMaskRGB = _mm256_permute2f128_si256(pxMaskRGB, pxMaskRGB, 0);
+ pxSrc[0] = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i *)srcPtr)), _mm_loadu_si128((__m128i *)(srcPtr + 48)), 1); /* load [R01|G01|B01|R02|G02|B02|R03|G03|B03|R04|G04|B04|R05|G05|B05|R06|R17|G17|B17|R18|G18|B18|R19|G19|B19|R20|G20|B20|R21|G21|B21|R22] - Need RGB 01-04, 17-20 */
+ pxSrc[1] = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(srcPtr + 12))), _mm_loadu_si128((__m128i *)(srcPtr + 60)), 1); /* load [R05|G05|B05|R06|G06|B06|R07|G07|B07|R08|G08|B08|R09|G09|B09|R10|R21|G21|B21|R22|G22|B22|R23|G23|B23|R24|G24|B24|R25|G25|B25|R26] - Need RGB 05-08, 21-24 */
+ pxSrc[2] = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(srcPtr + 24))), _mm_loadu_si128((__m128i *)(srcPtr + 72)), 1); /* load [R09|G09|B09|R10|G10|B10|R11|G11|B11|R12|G12|B12|R13|G13|B13|R14|R25|G25|B25|R26|G26|B26|R27|G27|B27|R28|G28|B28|R29|G29|B29|R30] - Need RGB 09-12, 25-28 */
+ pxSrc[3] = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(srcPtr + 36))), _mm_loadu_si128((__m128i *)(srcPtr + 84)), 1); /* load [R13|G13|B13|R14|G14|B14|R15|G15|B15|R16|G16|B16|R17|G17|B17|R18|R29|G29|B29|R30|G30|B30|R31|G31|B31|R32|G32|B32|R33|G33|B33|R34] - Need RGB 13-16, 29-32 */
+ pxSrc[4] = _mm256_shuffle_epi8(pxSrc[0], pxMask); /* shuffle to get [R01|R02|R03|R04|G01|G02|G03|G04 || B01|B02|B03|B04|R05|G05|B05|R06 || R17|R18|R19|R20|G17|G18|G19|G20 || B17|B18|B19|B20|R21|G21|B21|R22] - Need R01-04, G01-04, B01-04, R17-20, G17-20, B17-20 */
+ pxSrc[5] = _mm256_shuffle_epi8(pxSrc[1], pxMask); /* shuffle to get [R05|R06|R07|R08|G05|G06|G07|G08 || B05|B06|B07|B08|R09|G09|B09|R10 || R21|R22|R23|R24|G21|G22|G23|G24 || B21|B22|B23|B24|R25|G25|B25|R26] - Need R05-08, G05-08, B05-08, R21-24, G21-24, B21-24 */
+ pxSrc[6] = _mm256_shuffle_epi8(pxSrc[2], pxMask); /* shuffle to get [R09|R10|R11|R12|G09|G10|G11|G12 || B09|B10|B11|B12|R13|G13|B13|R14 || R25|R26|R27|R28|G25|G26|G27|G28 || B25|B26|B27|B28|R29|G29|B29|R30] - Need R09-12, G09-12, B09-12, R25-28, G25-28, B25-28 */
+ pxSrc[7] = _mm256_shuffle_epi8(pxSrc[3], pxMask); /* shuffle to get [R13|R14|R15|R16|G13|G14|G15|G16 || B13|B14|B15|B16|R17|G17|B17|R18 || R29|R30|R31|R32|G29|G30|G31|G32 || B29|B30|B31|B32|R33|G33|B33|R34] - Need R13-16, G13-16, B13-16, R29-32, G29-32, B29-32 */
+ pxSrc[0] = _mm256_unpacklo_epi8(pxSrc[4], pxSrc[5]); /* unpack 8 lo-pixels of pxSrc[4] and pxSrc[5] */
+ pxSrc[1] = _mm256_unpacklo_epi8(pxSrc[6], pxSrc[7]); /* unpack 8 lo-pixels of pxSrc[6] and pxSrc[7] */
+ pxSrc[2] = _mm256_unpackhi_epi8(pxSrc[4], pxSrc[5]); /* unpack 8 hi-pixels of pxSrc[4] and pxSrc[5] */
+ pxSrc[3] = _mm256_unpackhi_epi8(pxSrc[6], pxSrc[7]); /* unpack 8 hi-pixels of pxSrc[6] and pxSrc[7] */
+    px[0] = _mm256_add_epi8(avx_pxConvertI8, _mm256_shuffle_epi8(_mm256_unpacklo_epi8(pxSrc[0], pxSrc[1]), pxMaskRGB)); /* unpack lo-pixels of pxSrc[0] and pxSrc[1] in each 128-bit lane to get R01-32 and add 128 to get u8 from i8 */
+    px[1] = _mm256_add_epi8(avx_pxConvertI8, _mm256_shuffle_epi8(_mm256_unpackhi_epi8(pxSrc[0], pxSrc[1]), pxMaskRGB)); /* unpack hi-pixels of pxSrc[0] and pxSrc[1] in each 128-bit lane to get G01-32 and add 128 to get u8 from i8 */
+    px[2] = _mm256_add_epi8(avx_pxConvertI8, _mm256_shuffle_epi8(_mm256_unpacklo_epi8(pxSrc[2], pxSrc[3]), pxMaskRGB)); /* unpack lo-pixels of pxSrc[2] and pxSrc[3] in each 128-bit lane to get B01-32 and add 128 to get u8 from i8 */
+}
+
inline void rpp_store48_i8pln3_to_i8pln3(Rpp8s *dstPtrR, Rpp8s *dstPtrG, Rpp8s *dstPtrB, __m128i *px)
{
_mm_storeu_si128((__m128i *)dstPtrR, px[0]); /* store [R01|R02|R03|R04|R05|R06|R07|R08|R09|R10|R11|R12|R13|R14|R15|R16] */
@@ -779,6 +857,13 @@ inline void rpp_store48_u8pln3_to_i8pln3(Rpp8s *dstPtrR, Rpp8s *dstPtrG, Rpp8s *
_mm_storeu_si128((__m128i *)dstPtrB, _mm_sub_epi8(px[2], xmm_pxConvertI8)); /* store [B01|B02|B03|B04|B05|B06|B07|B08|B09|B10|B11|B12|B13|B14|B15|B16] */
}
+inline void rpp_store96_u8pln3_to_i8pln3(Rpp8s *dstPtrR, Rpp8s *dstPtrG, Rpp8s *dstPtrB, __m256i *px)
+{
+ _mm256_storeu_si256((__m256i *)dstPtrR, _mm256_sub_epi8(px[0], avx_pxConvertI8)); /* store [R01|R02|R03|R04|R05|R06|R07|R08|R09|R10|R11|R12|R13|R14|R15|R16|R17|R18|R19|R20|R21|R22|R23|R24|R25|R26|R27|R28|R29|R30|R31|R32] */
+ _mm256_storeu_si256((__m256i *)dstPtrG, _mm256_sub_epi8(px[1], avx_pxConvertI8)); /* store [G01|G02|G03|G04|G05|G06|G07|G08|G09|G10|G11|G12|G13|G14|G15|G16|G17|G18|G19|G20|G21|G22|G23|G24|G25|G26|G27|G28|G29|G30|G31|G32] */
+ _mm256_storeu_si256((__m256i *)dstPtrB, _mm256_sub_epi8(px[2], avx_pxConvertI8)); /* store [B01|B02|B03|B04|B05|B06|B07|B08|B09|B10|B11|B12|B13|B14|B15|B16|B17|B18|B19|B20|B21|B22|B23|B24|B25|B26|B27|B28|B29|B30|B31|B32] */
+}
+
inline void rpp_load48_i8pkd3_to_i32pln3_avx(Rpp8s *srcPtr, __m256i *p)
{
__m128i pxSrc[8];
@@ -819,6 +904,13 @@ inline void rpp_load48_i8pln3_to_u8pln3(Rpp8s *srcPtrR, Rpp8s *srcPtrG, Rpp8s *s
px[2] = _mm_add_epi8(xmm_pxConvertI8, _mm_loadu_si128((__m128i *)srcPtrB)); /* load and convert to u8 [B01|B02|B03|B04|B05|B06|B07|B08|B09|B10|B11|B12|B13|B14|B15|B16] */
}
+inline void rpp_load96_i8pln3_to_u8pln3(Rpp8s *srcPtrR, Rpp8s *srcPtrG, Rpp8s *srcPtrB, __m256i *px)
+{
+ px[0] = _mm256_add_epi8(avx_pxConvertI8, _mm256_loadu_si256((__m256i *)srcPtrR)); /* load and convert to u8 [R01|R02|R03|R04|R05|R06|R07|R08|R09|R10|R11|R12|R13|R14|R15|R16|R17|R18|R19|R20|R21|R22|R23|R24|R25|R26|R27|R28|R29|R30|R31|R32] */
+ px[1] = _mm256_add_epi8(avx_pxConvertI8, _mm256_loadu_si256((__m256i *)srcPtrG)); /* load and convert to u8 [G01|G02|G03|G04|G05|G06|G07|G08|G09|G10|G11|G12|G13|G14|G15|G16|G17|G18|G19|G20|G21|G22|G23|G24|G25|G26|G27|G28|G29|G30|G31|G32] */
+ px[2] = _mm256_add_epi8(avx_pxConvertI8, _mm256_loadu_si256((__m256i *)srcPtrB)); /* load and convert to u8 [B01|B02|B03|B04|B05|B06|B07|B08|B09|B10|B11|B12|B13|B14|B15|B16|B17|B18|B19|B20|B21|B22|B23|B24|B25|B26|B27|B28|B29|B30|B31|B32] */
+}
+
inline void rpp_store48_i8pln3_to_i8pkd3(Rpp8s *dstPtr, __m128i *px)
{
__m128i pxDst[4];
@@ -849,6 +941,29 @@ inline void rpp_store48_u8pln3_to_i8pkd3(Rpp8s *dstPtr, __m128i *px)
_mm_storeu_si128((__m128i *)(dstPtr + 36), _mm_sub_epi8(_mm_shuffle_epi8(_mm_unpackhi_epi8(pxDst[3], pxDst[1]), pxMaskRGBAtoRGB), xmm_pxConvertI8)); /* store [R13|G13|B13|R14|G14|B14|R15|G15|B15|R16|G16|B16|00|00|00|00] */
}
+inline void rpp_store96_u8pln3_to_i8pkd3(Rpp8s *dstPtr, __m256i *px)
+{
+ __m256i pxDst[8];
+ __m256i pxMaskRGBAtoRGB = _mm256_castsi128_si256(_mm_setr_epi8(0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 3, 7, 11, 15));
+ pxMaskRGBAtoRGB = _mm256_permute2f128_si256(pxMaskRGBAtoRGB, pxMaskRGBAtoRGB, 0);
+ pxDst[0] = _mm256_unpacklo_epi8(px[1], avx_px0);
+ pxDst[1] = _mm256_unpackhi_epi8(px[1], avx_px0);
+ pxDst[2] = _mm256_unpacklo_epi8(px[0], px[2]);
+ pxDst[3] = _mm256_unpackhi_epi8(px[0], px[2]);
+ pxDst[4] = _mm256_sub_epi8(_mm256_shuffle_epi8(_mm256_unpacklo_epi8(pxDst[2], pxDst[0]), pxMaskRGBAtoRGB), avx_pxConvertI8);
+ pxDst[5] = _mm256_sub_epi8(_mm256_shuffle_epi8(_mm256_unpackhi_epi8(pxDst[2], pxDst[0]), pxMaskRGBAtoRGB), avx_pxConvertI8);
+ pxDst[6] = _mm256_sub_epi8(_mm256_shuffle_epi8(_mm256_unpacklo_epi8(pxDst[3], pxDst[1]), pxMaskRGBAtoRGB), avx_pxConvertI8);
+ pxDst[7] = _mm256_sub_epi8(_mm256_shuffle_epi8(_mm256_unpackhi_epi8(pxDst[3], pxDst[1]), pxMaskRGBAtoRGB), avx_pxConvertI8);
+ _mm_storeu_si128((__m128i *)dstPtr, _mm256_castsi256_si128(pxDst[4])); /* store [R01|G01|B01|R02|G02|B02|R03|G03|B03|R04|G04|B04|00|00|00|00] */
+ _mm_storeu_si128((__m128i *)(dstPtr + 12), _mm256_castsi256_si128(pxDst[5])); /* store [R05|G05|B05|R06|G06|B06|R07|G07|B07|R08|G08|B08|00|00|00|00] */
+ _mm_storeu_si128((__m128i *)(dstPtr + 24), _mm256_castsi256_si128(pxDst[6])); /* store [R09|G09|B09|R10|G10|B10|R11|G11|B11|R12|G12|B12|00|00|00|00] */
+ _mm_storeu_si128((__m128i *)(dstPtr + 36), _mm256_castsi256_si128(pxDst[7])); /* store [R13|G13|B13|R14|G14|B14|R15|G15|B15|R16|G16|B16|00|00|00|00] */
+ _mm_storeu_si128((__m128i *)(dstPtr + 48), _mm256_extractf128_si256(pxDst[4], 1)); /* store [R17|G17|B17|R18|G18|B18|R19|G19|B19|R20|G20|B20|00|00|00|00] */
+ _mm_storeu_si128((__m128i *)(dstPtr + 60), _mm256_extractf128_si256(pxDst[5], 1)); /* store [R21|G21|B21|R22|G22|B22|R23|G23|B23|R24|G24|B24|00|00|00|00] */
+ _mm_storeu_si128((__m128i *)(dstPtr + 72), _mm256_extractf128_si256(pxDst[6], 1)); /* store [R25|G25|B25|R26|G26|B26|R27|G27|B27|R28|G28|B28|00|00|00|00] */
+ _mm_storeu_si128((__m128i *)(dstPtr + 84), _mm256_extractf128_si256(pxDst[7], 1)); /* store [R29|G29|B29|R30|G30|B30|R31|G31|B31|R32|G32|B32|00|00|00|00] */
+}
+
inline void rpp_load16_i8_to_f32(Rpp8s *srcPtr, __m128 *p)
{
__m128i px = _mm_loadu_si128((__m128i *)srcPtr); /* load pixels 0-15 */
diff --git a/src/include/hip/rpp_hip_common.hpp b/src/include/hip/rpp_hip_common.hpp
index bab13ded8..e1eeb4eaa 100644
--- a/src/include/hip/rpp_hip_common.hpp
+++ b/src/include/hip/rpp_hip_common.hpp
@@ -1822,6 +1822,18 @@ __device__ __forceinline__ void rpp_hip_math_bitwiseOr8(d_float8 *src1_f8, d_flo
dst_f8->f1[7] = (float)((uchar)(src1_f8->f1[7]) | (uchar)(src2_f8->f1[7]));
}
+__device__ __forceinline__ void rpp_hip_math_exclusiveOr8(d_float8 *src1_f8, d_float8 *src2_f8, d_float8 *dst_f8)
+{
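+    // round each float pixel value to the nearest integer, XOR the uchar representations, and convert the result back to float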
+ dst_f8->f1[0] = (float)((uchar)(std::nearbyintf)(src1_f8->f1[0]) ^ (uchar)(std::nearbyintf)(src2_f8->f1[0]));
+ dst_f8->f1[1] = (float)((uchar)(std::nearbyintf)(src1_f8->f1[1]) ^ (uchar)(std::nearbyintf)(src2_f8->f1[1]));
+ dst_f8->f1[2] = (float)((uchar)(std::nearbyintf)(src1_f8->f1[2]) ^ (uchar)(std::nearbyintf)(src2_f8->f1[2]));
+ dst_f8->f1[3] = (float)((uchar)(std::nearbyintf)(src1_f8->f1[3]) ^ (uchar)(std::nearbyintf)(src2_f8->f1[3]));
+ dst_f8->f1[4] = (float)((uchar)(std::nearbyintf)(src1_f8->f1[4]) ^ (uchar)(std::nearbyintf)(src2_f8->f1[4]));
+ dst_f8->f1[5] = (float)((uchar)(std::nearbyintf)(src1_f8->f1[5]) ^ (uchar)(std::nearbyintf)(src2_f8->f1[5]));
+ dst_f8->f1[6] = (float)((uchar)(std::nearbyintf)(src1_f8->f1[6]) ^ (uchar)(std::nearbyintf)(src2_f8->f1[6]));
+ dst_f8->f1[7] = (float)((uchar)(std::nearbyintf)(src1_f8->f1[7]) ^ (uchar)(std::nearbyintf)(src2_f8->f1[7]));
+}
+
__device__ __forceinline__ float rpp_hip_math_inverse_sqrt1(float x)
{
float xHalf = 0.5f * x;
diff --git a/src/modules/cpu/host_tensor_logical_operations.hpp b/src/modules/cpu/host_tensor_logical_operations.hpp
index 0fb3fe5eb..e984fced2 100644
--- a/src/modules/cpu/host_tensor_logical_operations.hpp
+++ b/src/modules/cpu/host_tensor_logical_operations.hpp
@@ -26,6 +26,7 @@ SOFTWARE.
#define HOST_TENSOR_LOGICAL_OPERATIONS_HPP
#include "kernel/bitwise_and.hpp"
+#include "kernel/exclusive_or.hpp"
#include "kernel/bitwise_or.hpp"
#endif // HOST_TENSOR_LOGICAL_OPERATIONS_HPP
\ No newline at end of file
diff --git a/src/modules/cpu/kernel/exclusive_or.hpp b/src/modules/cpu/kernel/exclusive_or.hpp
new file mode 100644
index 000000000..5ea193d41
--- /dev/null
+++ b/src/modules/cpu/kernel/exclusive_or.hpp
@@ -0,0 +1,1281 @@
+/*
+MIT License
+
+Copyright (c) 2019 - 2024 Advanced Micro Devices, Inc.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE OR NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+*/
+
+#include "rppdefs.h"
+#include "rpp_cpu_simd.hpp"
+#include "rpp_cpu_common.hpp"
+
+/* Exclusive OR is a logical operation defined only for U8/I8 types.
+   For a Rpp32f/Rpp16f precision image (pixel values from 0-1), the exclusive OR is applied on a 0-255
+   range-translated approximation of the original 0-1 decimal-range image, and the result is scaled
+   back to the 0-1 range before being stored */
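+/* Worked example for the float path (illustrative values only): for inputs 0.5 and 0.25,
+   nearbyintf(0.5 * 255) ^ nearbyintf(0.25 * 255) = 128 ^ 64 = 192, and the stored result is 192 / 255 ≈ 0.753 */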
+
+RppStatus exclusive_or_u8_u8_host_tensor(Rpp8u *srcPtr1,
+ Rpp8u *srcPtr2,
+ RpptDescPtr srcDescPtr,
+ Rpp8u *dstPtr,
+ RpptDescPtr dstDescPtr,
+ RpptROIPtr roiTensorPtrSrc,
+ RpptRoiType roiType,
+ RppLayoutParams layoutParams,
+ rpp::Handle& Handle)
+{
+ RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h};
+ Rpp32u numThreads = Handle.GetNumThreads();
+
+ omp_set_dynamic(0);
+#pragma omp parallel for num_threads(numThreads)
+ for (int batchCount = 0; batchCount < dstDescPtr->n; batchCount++)
+ {
+ RpptROI roi;
+ RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount];
+ compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType);
+
+ Rpp8u *srcPtr1Image, *srcPtr2Image, *dstPtrImage;
+ srcPtr1Image = srcPtr1 + batchCount * srcDescPtr->strides.nStride;
+ srcPtr2Image = srcPtr2 + batchCount * srcDescPtr->strides.nStride;
+ dstPtrImage = dstPtr + batchCount * dstDescPtr->strides.nStride;
+
+ Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier;
+
+ Rpp8u *srcPtr1Channel, *srcPtr2Channel, *dstPtrChannel;
+ srcPtr1Channel = srcPtr1Image + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier);
+ srcPtr2Channel = srcPtr2Image + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier);
+ dstPtrChannel = dstPtrImage;
+
+#if __AVX2__
+ Rpp32u alignedLength = (bufferLength / 96) * 96;
+ Rpp32u vectorIncrement = 96;
+ Rpp32u vectorIncrementPerChannel = 32;
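+        // 96 U8 values per SIMD iteration = 32 pixels x 3 channels; planar pointers advance by 32 per channel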
+#endif
+
+ // Exclusive OR with fused output-layout toggle (NHWC -> NCHW)
+ if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NCHW))
+ {
+ Rpp8u *srcPtr1Row, *srcPtr2Row, *dstPtrRowR, *dstPtrRowG, *dstPtrRowB;
+ srcPtr1Row = srcPtr1Channel;
+ srcPtr2Row = srcPtr2Channel;
+ dstPtrRowR = dstPtrChannel;
+ dstPtrRowG = dstPtrRowR + dstDescPtr->strides.cStride;
+ dstPtrRowB = dstPtrRowG + dstDescPtr->strides.cStride;
+
+ for (int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp8u *srcPtr1Temp, *srcPtr2Temp, *dstPtrTempR, *dstPtrTempG, *dstPtrTempB;
+ srcPtr1Temp = srcPtr1Row;
+ srcPtr2Temp = srcPtr2Row;
+ dstPtrTempR = dstPtrRowR;
+ dstPtrTempG = dstPtrRowG;
+ dstPtrTempB = dstPtrRowB;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement)
+ {
+ __m256i p1[3], p2[3];
+
+ rpp_simd_load(rpp_load96_u8pkd3_to_u8pln3, srcPtr1Temp, p1); // simd loads
+ rpp_simd_load(rpp_load96_u8pkd3_to_u8pln3, srcPtr2Temp, p2); // simd loads
+ p1[0] = _mm256_xor_si256(p1[0], p2[0]); // exclusive_or computation
+ p1[1] = _mm256_xor_si256(p1[1], p2[1]); // exclusive_or computation
+ p1[2] = _mm256_xor_si256(p1[2], p2[2]); // exclusive_or computation
+ rpp_simd_store(rpp_store96_u8pln3_to_u8pln3, dstPtrTempR, dstPtrTempG, dstPtrTempB, p1); // simd stores
+
+ srcPtr1Temp += vectorIncrement;
+ srcPtr2Temp += vectorIncrement;
+ dstPtrTempR += vectorIncrementPerChannel;
+ dstPtrTempG += vectorIncrementPerChannel;
+ dstPtrTempB += vectorIncrementPerChannel;
+ }
+#endif
+ for (; vectorLoopCount < bufferLength; vectorLoopCount += 3)
+ {
+ *dstPtrTempR++ = srcPtr1Temp[0] ^ srcPtr2Temp[0];
+ *dstPtrTempG++ = srcPtr1Temp[1] ^ srcPtr2Temp[1];
+ *dstPtrTempB++ = srcPtr1Temp[2] ^ srcPtr2Temp[2];
+
+ srcPtr1Temp += 3;
+ srcPtr2Temp += 3;
+ }
+
+ srcPtr1Row += srcDescPtr->strides.hStride;
+ srcPtr2Row += srcDescPtr->strides.hStride;
+ dstPtrRowR += dstDescPtr->strides.hStride;
+ dstPtrRowG += dstDescPtr->strides.hStride;
+ dstPtrRowB += dstDescPtr->strides.hStride;
+ }
+ }
+
+ // Exclusive OR with fused output-layout toggle (NCHW -> NHWC)
+ else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NHWC))
+ {
+ Rpp8u *srcPtr1RowR, *srcPtr1RowG, *srcPtr1RowB, *srcPtr2RowR, *srcPtr2RowG, *srcPtr2RowB, *dstPtrRow;
+ srcPtr1RowR = srcPtr1Channel;
+ srcPtr1RowG = srcPtr1RowR + srcDescPtr->strides.cStride;
+ srcPtr1RowB = srcPtr1RowG + srcDescPtr->strides.cStride;
+ srcPtr2RowR = srcPtr2Channel;
+ srcPtr2RowG = srcPtr2RowR + srcDescPtr->strides.cStride;
+ srcPtr2RowB = srcPtr2RowG + srcDescPtr->strides.cStride;
+ dstPtrRow = dstPtrChannel;
+
+ for (int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp8u *srcPtr1TempR, *srcPtr1TempG, *srcPtr1TempB, *srcPtr2TempR, *srcPtr2TempG, *srcPtr2TempB, *dstPtrTemp;
+ srcPtr1TempR = srcPtr1RowR;
+ srcPtr1TempG = srcPtr1RowG;
+ srcPtr1TempB = srcPtr1RowB;
+ srcPtr2TempR = srcPtr2RowR;
+ srcPtr2TempG = srcPtr2RowG;
+ srcPtr2TempB = srcPtr2RowB;
+ dstPtrTemp = dstPtrRow;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel)
+ {
+ __m256i p1[3], p2[3];
+
+ rpp_simd_load(rpp_load96_u8_avx, srcPtr1TempR, srcPtr1TempG, srcPtr1TempB, p1); // simd loads
+ rpp_simd_load(rpp_load96_u8_avx, srcPtr2TempR, srcPtr2TempG, srcPtr2TempB, p2); // simd loads
+ p1[0] = _mm256_xor_si256(p1[0], p2[0]); // exclusive_or computation
+ p1[1] = _mm256_xor_si256(p1[1], p2[1]); // exclusive_or computation
+ p1[2] = _mm256_xor_si256(p1[2], p2[2]); // exclusive_or computation
+ rpp_simd_store(rpp_store96_u8pln3_to_u8pkd3, dstPtrTemp, p1); // simd stores
+
+ srcPtr1TempR += vectorIncrementPerChannel;
+ srcPtr1TempG += vectorIncrementPerChannel;
+ srcPtr1TempB += vectorIncrementPerChannel;
+ srcPtr2TempR += vectorIncrementPerChannel;
+ srcPtr2TempG += vectorIncrementPerChannel;
+ srcPtr2TempB += vectorIncrementPerChannel;
+ dstPtrTemp += vectorIncrement;
+ }
+#endif
+ for (; vectorLoopCount < bufferLength; vectorLoopCount++)
+ {
+ dstPtrTemp[0] = *srcPtr1TempR ^ *srcPtr2TempR;
+ dstPtrTemp[1] = *srcPtr1TempG ^ *srcPtr2TempG;
+ dstPtrTemp[2] = *srcPtr1TempB ^ *srcPtr2TempB;
+
+ srcPtr1TempR++;
+ srcPtr1TempG++;
+ srcPtr1TempB++;
+ srcPtr2TempR++;
+ srcPtr2TempG++;
+ srcPtr2TempB++;
+ dstPtrTemp += 3;
+ }
+
+ srcPtr1RowR += srcDescPtr->strides.hStride;
+ srcPtr1RowG += srcDescPtr->strides.hStride;
+ srcPtr1RowB += srcDescPtr->strides.hStride;
+ srcPtr2RowR += srcDescPtr->strides.hStride;
+ srcPtr2RowG += srcDescPtr->strides.hStride;
+ srcPtr2RowB += srcDescPtr->strides.hStride;
+ dstPtrRow += dstDescPtr->strides.hStride;
+ }
+ }
+
+ // Exclusive OR without fused output-layout toggle (NCHW -> NCHW for 3 channel)
+ else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NCHW))
+ {
+#if __AVX2__
+ alignedLength = bufferLength & ~31;
+#endif
+
+ Rpp8u *srcPtr1RowR, *srcPtr1RowG, *srcPtr1RowB, *srcPtr2RowR, *srcPtr2RowG, *srcPtr2RowB, *dstPtrRowR, *dstPtrRowG, *dstPtrRowB;
+ srcPtr1RowR = srcPtr1Channel;
+ srcPtr1RowG = srcPtr1RowR + srcDescPtr->strides.cStride;
+ srcPtr1RowB = srcPtr1RowG + srcDescPtr->strides.cStride;
+ srcPtr2RowR = srcPtr2Channel;
+ srcPtr2RowG = srcPtr2RowR + srcDescPtr->strides.cStride;
+ srcPtr2RowB = srcPtr2RowG + srcDescPtr->strides.cStride;
+ dstPtrRowR = dstPtrChannel;
+ dstPtrRowG = dstPtrRowR + dstDescPtr->strides.cStride;
+ dstPtrRowB = dstPtrRowG + dstDescPtr->strides.cStride;
+ for (int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp8u *srcPtr1TempR, *srcPtr1TempG, *srcPtr1TempB, *srcPtr2TempR, *srcPtr2TempG, *srcPtr2TempB, *dstPtrTempR, *dstPtrTempG, *dstPtrTempB;
+ srcPtr1TempR = srcPtr1RowR;
+ srcPtr1TempG = srcPtr1RowG;
+ srcPtr1TempB = srcPtr1RowB;
+ srcPtr2TempR = srcPtr2RowR;
+ srcPtr2TempG = srcPtr2RowG;
+ srcPtr2TempB = srcPtr2RowB;
+ dstPtrTempR = dstPtrRowR;
+ dstPtrTempG = dstPtrRowG;
+ dstPtrTempB = dstPtrRowB;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel)
+ {
+ __m256i p1[3], p2[3];
+
+ rpp_simd_load(rpp_load96_u8_avx, srcPtr1TempR, srcPtr1TempG, srcPtr1TempB, p1); // simd loads
+ rpp_simd_load(rpp_load96_u8_avx, srcPtr2TempR, srcPtr2TempG, srcPtr2TempB, p2); // simd loads
+ p1[0] = _mm256_xor_si256(p1[0], p2[0]); // exclusive_or computation
+ p1[1] = _mm256_xor_si256(p1[1], p2[1]); // exclusive_or computation
+ p1[2] = _mm256_xor_si256(p1[2], p2[2]); // exclusive_or computation
+ rpp_simd_store(rpp_store96_u8pln3_to_u8pln3, dstPtrTempR, dstPtrTempG, dstPtrTempB, p1);
+
+ srcPtr1TempR += vectorIncrementPerChannel;
+ srcPtr1TempG += vectorIncrementPerChannel;
+ srcPtr1TempB += vectorIncrementPerChannel;
+ srcPtr2TempR += vectorIncrementPerChannel;
+ srcPtr2TempG += vectorIncrementPerChannel;
+ srcPtr2TempB += vectorIncrementPerChannel;
+ dstPtrTempR += vectorIncrementPerChannel;
+ dstPtrTempG += vectorIncrementPerChannel;
+ dstPtrTempB += vectorIncrementPerChannel;
+ }
+#endif
+ for (; vectorLoopCount < bufferLength; vectorLoopCount++)
+ {
+ *dstPtrTempR = *srcPtr1TempR ^ *srcPtr2TempR;
+ *dstPtrTempG = *srcPtr1TempG ^ *srcPtr2TempG;
+ *dstPtrTempB = *srcPtr1TempB ^ *srcPtr2TempB;
+
+ srcPtr1TempR++;
+ srcPtr1TempG++;
+ srcPtr1TempB++;
+ srcPtr2TempR++;
+ srcPtr2TempG++;
+ srcPtr2TempB++;
+ dstPtrTempR++;
+ dstPtrTempG++;
+ dstPtrTempB++;
+ }
+
+ srcPtr1RowR += srcDescPtr->strides.hStride;
+ srcPtr1RowG += srcDescPtr->strides.hStride;
+ srcPtr1RowB += srcDescPtr->strides.hStride;
+ srcPtr2RowR += srcDescPtr->strides.hStride;
+ srcPtr2RowG += srcDescPtr->strides.hStride;
+ srcPtr2RowB += srcDescPtr->strides.hStride;
+ dstPtrRowR += dstDescPtr->strides.hStride;
+ dstPtrRowG += dstDescPtr->strides.hStride;
+ dstPtrRowB += dstDescPtr->strides.hStride;
+ }
+ }
+
+ // Exclusive OR without fused output-layout toggle (NHWC -> NHWC or NCHW -> NCHW for 1 channel)
+ else
+ {
+#if __AVX2__
+ alignedLength = bufferLength & ~31;
+#endif
+
+ Rpp8u *srcPtr1Row, *srcPtr2Row, *dstPtrRow;
+ srcPtr1Row = srcPtr1Channel;
+ srcPtr2Row = srcPtr2Channel;
+ dstPtrRow = dstPtrChannel;
+
+ for (int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp8u *srcPtr1Temp, *srcPtr2Temp, *dstPtrTemp;
+ srcPtr1Temp = srcPtr1Row;
+ srcPtr2Temp = srcPtr2Row;
+ dstPtrTemp = dstPtrRow;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel)
+ {
+ __m256i p1, p2;
+
+ p1 = _mm256_loadu_si256((const __m256i *)srcPtr1Temp); // simd loads
+ p2 = _mm256_loadu_si256((const __m256i *)srcPtr2Temp); // simd loads
+ p1 = _mm256_xor_si256(p1, p2); // exclusive_or computation
+ _mm256_storeu_si256((__m256i *)dstPtrTemp, p1); // simd stores
+
+ srcPtr1Temp += vectorIncrementPerChannel;
+ srcPtr2Temp += vectorIncrementPerChannel;
+ dstPtrTemp += vectorIncrementPerChannel;
+ }
+#endif
+ for (; vectorLoopCount < bufferLength; vectorLoopCount++)
+ {
+ *dstPtrTemp++ = *srcPtr1Temp ^ *srcPtr2Temp;
+
+ srcPtr1Temp++;
+ srcPtr2Temp++;
+ }
+
+ srcPtr1Row += srcDescPtr->strides.hStride;
+ srcPtr2Row += srcDescPtr->strides.hStride;
+ dstPtrRow += dstDescPtr->strides.hStride;
+ }
+
+ srcPtr1Channel += srcDescPtr->strides.cStride;
+ srcPtr2Channel += srcDescPtr->strides.cStride;
+ dstPtrChannel += dstDescPtr->strides.cStride;
+ }
+ }
+
+ return RPP_SUCCESS;
+}
+
+RppStatus exclusive_or_f32_f32_host_tensor(Rpp32f *srcPtr1,
+ Rpp32f *srcPtr2,
+ RpptDescPtr srcDescPtr,
+ Rpp32f *dstPtr,
+ RpptDescPtr dstDescPtr,
+ RpptROIPtr roiTensorPtrSrc,
+ RpptRoiType roiType,
+ RppLayoutParams layoutParams,
+ rpp::Handle& Handle)
+{
+ RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h};
+ Rpp32u numThreads = Handle.GetNumThreads();
+
+ omp_set_dynamic(0);
+#pragma omp parallel for num_threads(numThreads)
+ for (int batchCount = 0; batchCount < dstDescPtr->n; batchCount++)
+ {
+ RpptROI roi;
+ RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount];
+ compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType);
+
+ Rpp32f *srcPtr1Image, *srcPtr2Image, *dstPtrImage;
+ srcPtr1Image = srcPtr1 + batchCount * srcDescPtr->strides.nStride;
+ srcPtr2Image = srcPtr2 + batchCount * srcDescPtr->strides.nStride;
+ dstPtrImage = dstPtr + batchCount * dstDescPtr->strides.nStride;
+
+ Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier;
+
+ Rpp32f *srcPtr1Channel, *srcPtr2Channel, *dstPtrChannel;
+ srcPtr1Channel = srcPtr1Image + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier);
+ srcPtr2Channel = srcPtr2Image + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier);
+ dstPtrChannel = dstPtrImage;
+
+#if __AVX2__
+ Rpp32u alignedLength = (bufferLength / 24) * 24;
+ Rpp32u vectorIncrement = 24;
+ Rpp32u vectorIncrementPerChannel = 8;
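+        // 24 F32 values per SIMD iteration = 8 pixels x 3 channels; planar pointers advance by 8 per channel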
+#endif
+
+ // Exclusive OR with fused output-layout toggle (NHWC -> NCHW)
+ if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NCHW))
+ {
+ Rpp32f *srcPtr1Row, *srcPtr2Row, *dstPtrRowR, *dstPtrRowG, *dstPtrRowB;
+ srcPtr1Row = srcPtr1Channel;
+ srcPtr2Row = srcPtr2Channel;
+ dstPtrRowR = dstPtrChannel;
+ dstPtrRowG = dstPtrRowR + dstDescPtr->strides.cStride;
+ dstPtrRowB = dstPtrRowG + dstDescPtr->strides.cStride;
+
+ for (int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp32f *srcPtr1Temp, *srcPtr2Temp, *dstPtrTempR, *dstPtrTempG, *dstPtrTempB;
+ srcPtr1Temp = srcPtr1Row;
+ srcPtr2Temp = srcPtr2Row;
+ dstPtrTempR = dstPtrRowR;
+ dstPtrTempG = dstPtrRowG;
+ dstPtrTempB = dstPtrRowB;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement)
+ {
+ __m256 p1[3], p2[3];
+
+ rpp_simd_load(rpp_load24_f32pkd3_to_f32pln3_avx, srcPtr1Temp, p1); // simd loads
+ rpp_simd_load(rpp_load24_f32pkd3_to_f32pln3_avx, srcPtr2Temp, p2); // simd loads
+ p1[0] = _mm256_cvtepi32_ps(_mm256_xor_si256(_mm256_cvtps_epi32(_mm256_mul_ps(p1[0], avx_p255)), _mm256_cvtps_epi32(_mm256_mul_ps(p2[0], avx_p255)))); // exclusive_or computation
+ p1[1] = _mm256_cvtepi32_ps(_mm256_xor_si256(_mm256_cvtps_epi32(_mm256_mul_ps(p1[1], avx_p255)), _mm256_cvtps_epi32(_mm256_mul_ps(p2[1], avx_p255)))); // exclusive_or computation
+ p1[2] = _mm256_cvtepi32_ps(_mm256_xor_si256(_mm256_cvtps_epi32(_mm256_mul_ps(p1[2], avx_p255)), _mm256_cvtps_epi32(_mm256_mul_ps(p2[2], avx_p255)))); // exclusive_or computation
+ p1[0] = _mm256_mul_ps(p1[0], avx_p1op255);
+ p1[1] = _mm256_mul_ps(p1[1], avx_p1op255);
+ p1[2] = _mm256_mul_ps(p1[2], avx_p1op255);
+ rpp_simd_store(rpp_store24_f32pln3_to_f32pln3_avx, dstPtrTempR, dstPtrTempG, dstPtrTempB, p1); // simd stores
+
+ srcPtr1Temp += vectorIncrement;
+ srcPtr2Temp += vectorIncrement;
+ dstPtrTempR += vectorIncrementPerChannel;
+ dstPtrTempG += vectorIncrementPerChannel;
+ dstPtrTempB += vectorIncrementPerChannel;
+ }
+#endif
+ for (; vectorLoopCount < bufferLength; vectorLoopCount += 3)
+ {
+ *dstPtrTempR++ = RPPPIXELCHECKF32((float)((uint)(std::nearbyintf)(srcPtr1Temp[0] * 255) ^ (uint)(std::nearbyintf)(srcPtr2Temp[0] * 255)) / 255);
+ *dstPtrTempG++ = RPPPIXELCHECKF32((float)((uint)(std::nearbyintf)(srcPtr1Temp[1] * 255) ^ (uint)(std::nearbyintf)(srcPtr2Temp[1] * 255)) / 255);
+ *dstPtrTempB++ = RPPPIXELCHECKF32((float)((uint)(std::nearbyintf)(srcPtr1Temp[2] * 255) ^ (uint)(std::nearbyintf)(srcPtr2Temp[2] * 255)) / 255);
+
+ srcPtr1Temp += 3;
+ srcPtr2Temp += 3;
+ }
+
+ srcPtr1Row += srcDescPtr->strides.hStride;
+ srcPtr2Row += srcDescPtr->strides.hStride;
+ dstPtrRowR += dstDescPtr->strides.hStride;
+ dstPtrRowG += dstDescPtr->strides.hStride;
+ dstPtrRowB += dstDescPtr->strides.hStride;
+ }
+ }
+
+ // Exclusive OR with fused output-layout toggle (NCHW -> NHWC)
+ else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NHWC))
+ {
+ Rpp32f *srcPtr1RowR, *srcPtr1RowG, *srcPtr1RowB, *srcPtr2RowR, *srcPtr2RowG, *srcPtr2RowB, *dstPtrRow;
+ srcPtr1RowR = srcPtr1Channel;
+ srcPtr1RowG = srcPtr1RowR + srcDescPtr->strides.cStride;
+ srcPtr1RowB = srcPtr1RowG + srcDescPtr->strides.cStride;
+ srcPtr2RowR = srcPtr2Channel;
+ srcPtr2RowG = srcPtr2RowR + srcDescPtr->strides.cStride;
+ srcPtr2RowB = srcPtr2RowG + srcDescPtr->strides.cStride;
+ dstPtrRow = dstPtrChannel;
+
+ for (int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp32f *srcPtr1TempR, *srcPtr1TempG, *srcPtr1TempB, *srcPtr2TempR, *srcPtr2TempG, *srcPtr2TempB, *dstPtrTemp;
+ srcPtr1TempR = srcPtr1RowR;
+ srcPtr1TempG = srcPtr1RowG;
+ srcPtr1TempB = srcPtr1RowB;
+ srcPtr2TempR = srcPtr2RowR;
+ srcPtr2TempG = srcPtr2RowG;
+ srcPtr2TempB = srcPtr2RowB;
+ dstPtrTemp = dstPtrRow;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel)
+ {
+ __m256 p1[3], p2[3];
+
+ rpp_simd_load(rpp_load24_f32pln3_to_f32pln3_avx, srcPtr1TempR, srcPtr1TempG, srcPtr1TempB, p1); // simd loads
+ rpp_simd_load(rpp_load24_f32pln3_to_f32pln3_avx, srcPtr2TempR, srcPtr2TempG, srcPtr2TempB, p2); // simd loads
+ p1[0] = _mm256_cvtepi32_ps(_mm256_xor_si256(_mm256_cvtps_epi32(_mm256_mul_ps(p1[0], avx_p255)), _mm256_cvtps_epi32(_mm256_mul_ps(p2[0], avx_p255)))); // exclusive_or computation
+ p1[1] = _mm256_cvtepi32_ps(_mm256_xor_si256(_mm256_cvtps_epi32(_mm256_mul_ps(p1[1], avx_p255)), _mm256_cvtps_epi32(_mm256_mul_ps(p2[1], avx_p255)))); // exclusive_or computation
+ p1[2] = _mm256_cvtepi32_ps(_mm256_xor_si256(_mm256_cvtps_epi32(_mm256_mul_ps(p1[2], avx_p255)), _mm256_cvtps_epi32(_mm256_mul_ps(p2[2], avx_p255)))); // exclusive_or computation
+ p1[0] = _mm256_mul_ps(p1[0], avx_p1op255);
+ p1[1] = _mm256_mul_ps(p1[1], avx_p1op255);
+ p1[2] = _mm256_mul_ps(p1[2], avx_p1op255);
+ rpp_simd_store(rpp_store24_f32pln3_to_f32pkd3_avx, dstPtrTemp, p1); // simd stores
+
+ srcPtr1TempR += vectorIncrementPerChannel;
+ srcPtr1TempG += vectorIncrementPerChannel;
+ srcPtr1TempB += vectorIncrementPerChannel;
+ srcPtr2TempR += vectorIncrementPerChannel;
+ srcPtr2TempG += vectorIncrementPerChannel;
+ srcPtr2TempB += vectorIncrementPerChannel;
+ dstPtrTemp += vectorIncrement;
+ }
+#endif
+ for (; vectorLoopCount < bufferLength; vectorLoopCount++)
+ {
+ dstPtrTemp[0] = RPPPIXELCHECKF32((float)((uint)(std::nearbyintf)(*srcPtr1TempR * 255) ^ (uint)(std::nearbyintf)(*srcPtr2TempR * 255)) / 255);
+ dstPtrTemp[1] = RPPPIXELCHECKF32((float)((uint)(std::nearbyintf)(*srcPtr1TempG * 255) ^ (uint)(std::nearbyintf)(*srcPtr2TempG * 255)) / 255);
+ dstPtrTemp[2] = RPPPIXELCHECKF32((float)((uint)(std::nearbyintf)(*srcPtr1TempB * 255) ^ (uint)(std::nearbyintf)(*srcPtr2TempB * 255)) / 255);
+
+ srcPtr1TempR++;
+ srcPtr1TempG++;
+ srcPtr1TempB++;
+ srcPtr2TempR++;
+ srcPtr2TempG++;
+ srcPtr2TempB++;
+ dstPtrTemp += 3;
+ }
+
+ srcPtr1RowR += srcDescPtr->strides.hStride;
+ srcPtr1RowG += srcDescPtr->strides.hStride;
+ srcPtr1RowB += srcDescPtr->strides.hStride;
+ srcPtr2RowR += srcDescPtr->strides.hStride;
+ srcPtr2RowG += srcDescPtr->strides.hStride;
+ srcPtr2RowB += srcDescPtr->strides.hStride;
+ dstPtrRow += dstDescPtr->strides.hStride;
+ }
+ }
+
+ // Exclusive OR without fused output-layout toggle (NCHW -> NCHW for 3 channel)
+ else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NCHW))
+ {
+#if __AVX2__
+ alignedLength = bufferLength & ~7;
+#endif
+
+ Rpp32f *srcPtr1RowR, *srcPtr1RowG, *srcPtr1RowB, *srcPtr2RowR, *srcPtr2RowG, *srcPtr2RowB, *dstPtrRowR, *dstPtrRowG, *dstPtrRowB;
+ srcPtr1RowR = srcPtr1Channel;
+ srcPtr1RowG = srcPtr1RowR + srcDescPtr->strides.cStride;
+ srcPtr1RowB = srcPtr1RowG + srcDescPtr->strides.cStride;
+ srcPtr2RowR = srcPtr2Channel;
+ srcPtr2RowG = srcPtr2RowR + srcDescPtr->strides.cStride;
+ srcPtr2RowB = srcPtr2RowG + srcDescPtr->strides.cStride;
+ dstPtrRowR = dstPtrChannel;
+ dstPtrRowG = dstPtrRowR + dstDescPtr->strides.cStride;
+ dstPtrRowB = dstPtrRowG + dstDescPtr->strides.cStride;
+
+ for (int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp32f *srcPtr1TempR, *srcPtr1TempG, *srcPtr1TempB, *srcPtr2TempR, *srcPtr2TempG, *srcPtr2TempB, *dstPtrTempR, *dstPtrTempG, *dstPtrTempB;
+ srcPtr1TempR = srcPtr1RowR;
+ srcPtr1TempG = srcPtr1RowG;
+ srcPtr1TempB = srcPtr1RowB;
+ srcPtr2TempR = srcPtr2RowR;
+ srcPtr2TempG = srcPtr2RowG;
+ srcPtr2TempB = srcPtr2RowB;
+ dstPtrTempR = dstPtrRowR;
+ dstPtrTempG = dstPtrRowG;
+ dstPtrTempB = dstPtrRowB;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel)
+ {
+ __m256 p1[3], p2[3];
+
+ rpp_simd_load(rpp_load24_f32pln3_to_f32pln3_avx, srcPtr1TempR, srcPtr1TempG, srcPtr1TempB, p1); // simd loads
+ rpp_simd_load(rpp_load24_f32pln3_to_f32pln3_avx, srcPtr2TempR, srcPtr2TempG, srcPtr2TempB, p2); // simd loads
+ p1[0] = _mm256_cvtepi32_ps(_mm256_xor_si256(_mm256_cvtps_epi32(_mm256_mul_ps(p1[0], avx_p255)), _mm256_cvtps_epi32(_mm256_mul_ps(p2[0], avx_p255)))); // exclusive_or computation
+ p1[1] = _mm256_cvtepi32_ps(_mm256_xor_si256(_mm256_cvtps_epi32(_mm256_mul_ps(p1[1], avx_p255)), _mm256_cvtps_epi32(_mm256_mul_ps(p2[1], avx_p255)))); // exclusive_or computation
+ p1[2] = _mm256_cvtepi32_ps(_mm256_xor_si256(_mm256_cvtps_epi32(_mm256_mul_ps(p1[2], avx_p255)), _mm256_cvtps_epi32(_mm256_mul_ps(p2[2], avx_p255)))); // exclusive_or computation
+ p1[0] = _mm256_mul_ps(p1[0], avx_p1op255);
+ p1[1] = _mm256_mul_ps(p1[1], avx_p1op255);
+ p1[2] = _mm256_mul_ps(p1[2], avx_p1op255);
+ rpp_simd_store(rpp_store24_f32pln3_to_f32pln3_avx, dstPtrTempR, dstPtrTempG, dstPtrTempB, p1); // simd stores
+
+ srcPtr1TempR += vectorIncrementPerChannel;
+ srcPtr1TempG += vectorIncrementPerChannel;
+ srcPtr1TempB += vectorIncrementPerChannel;
+ srcPtr2TempR += vectorIncrementPerChannel;
+ srcPtr2TempG += vectorIncrementPerChannel;
+ srcPtr2TempB += vectorIncrementPerChannel;
+ dstPtrTempR += vectorIncrementPerChannel;
+ dstPtrTempG += vectorIncrementPerChannel;
+ dstPtrTempB += vectorIncrementPerChannel;
+ }
+#endif
+ for (; vectorLoopCount < bufferLength; vectorLoopCount++)
+ {
+ *dstPtrTempR = RPPPIXELCHECKF32((float)((uint)(std::nearbyintf)(*srcPtr1TempR * 255) ^ (uint)(std::nearbyintf)(*srcPtr2TempR * 255)) / 255);
+ *dstPtrTempG = RPPPIXELCHECKF32((float)((uint)(std::nearbyintf)(*srcPtr1TempG * 255) ^ (uint)(std::nearbyintf)(*srcPtr2TempG * 255)) / 255);
+ *dstPtrTempB = RPPPIXELCHECKF32((float)((uint)(std::nearbyintf)(*srcPtr1TempB * 255) ^ (uint)(std::nearbyintf)(*srcPtr2TempB * 255)) / 255);
+
+ srcPtr1TempR++;
+ srcPtr1TempG++;
+ srcPtr1TempB++;
+ srcPtr2TempR++;
+ srcPtr2TempG++;
+ srcPtr2TempB++;
+ dstPtrTempR++;
+ dstPtrTempG++;
+ dstPtrTempB++;
+ }
+
+ srcPtr1RowR += srcDescPtr->strides.hStride;
+ srcPtr1RowG += srcDescPtr->strides.hStride;
+ srcPtr1RowB += srcDescPtr->strides.hStride;
+ srcPtr2RowR += srcDescPtr->strides.hStride;
+ srcPtr2RowG += srcDescPtr->strides.hStride;
+ srcPtr2RowB += srcDescPtr->strides.hStride;
+ dstPtrRowR += dstDescPtr->strides.hStride;
+ dstPtrRowG += dstDescPtr->strides.hStride;
+ dstPtrRowB += dstDescPtr->strides.hStride;
+ }
+ }
+
+ // Exclusive OR without fused output-layout toggle (NHWC -> NHWC or NCHW -> NCHW for 1 channel)
+ else
+ {
+#if __AVX2__
+ alignedLength = bufferLength & ~7;
+#endif
+
+ Rpp32f *srcPtr1Row, *srcPtr2Row, *dstPtrRow;
+ srcPtr1Row = srcPtr1Channel;
+ srcPtr2Row = srcPtr2Channel;
+ dstPtrRow = dstPtrChannel;
+
+ for (int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp32f *srcPtr1Temp, *srcPtr2Temp, *dstPtrTemp;
+ srcPtr1Temp = srcPtr1Row;
+ srcPtr2Temp = srcPtr2Row;
+ dstPtrTemp = dstPtrRow;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel)
+ {
+ __m256 p1[1], p2[1];
+
+ rpp_simd_load(rpp_load8_f32_to_f32_avx, srcPtr1Temp, p1); // simd loads
+ rpp_simd_load(rpp_load8_f32_to_f32_avx, srcPtr2Temp, p2); // simd loads
+ p1[0] = _mm256_cvtepi32_ps(_mm256_xor_si256(_mm256_cvtps_epi32(_mm256_mul_ps(p1[0], avx_p255)), _mm256_cvtps_epi32(_mm256_mul_ps(p2[0], avx_p255)))); // exclusive_or computation
+ p1[0] = _mm256_mul_ps(p1[0], avx_p1op255);
+ rpp_simd_store(rpp_store8_f32_to_f32_avx, dstPtrTemp, p1); // simd stores
+
+ srcPtr1Temp += vectorIncrementPerChannel;
+ srcPtr2Temp += vectorIncrementPerChannel;
+ dstPtrTemp += vectorIncrementPerChannel;
+ }
+#endif
+ for (; vectorLoopCount < bufferLength; vectorLoopCount++)
+ {
+ *dstPtrTemp++ = RPPPIXELCHECKF32((float)((uint)(std::nearbyintf)(*srcPtr1Temp * 255) ^ (uint)(std::nearbyintf)(*srcPtr2Temp * 255)) / 255);
+
+ srcPtr1Temp++;
+ srcPtr2Temp++;
+ }
+
+ srcPtr1Row += srcDescPtr->strides.hStride;
+ srcPtr2Row += srcDescPtr->strides.hStride;
+ dstPtrRow += dstDescPtr->strides.hStride;
+ }
+
+ srcPtr1Channel += srcDescPtr->strides.cStride;
+ srcPtr2Channel += srcDescPtr->strides.cStride;
+ dstPtrChannel += dstDescPtr->strides.cStride;
+
+ }
+ }
+
+ return RPP_SUCCESS;
+}
+
+RppStatus exclusive_or_f16_f16_host_tensor(Rpp16f *srcPtr1,
+ Rpp16f *srcPtr2,
+ RpptDescPtr srcDescPtr,
+ Rpp16f *dstPtr,
+ RpptDescPtr dstDescPtr,
+ RpptROIPtr roiTensorPtrSrc,
+ RpptRoiType roiType,
+ RppLayoutParams layoutParams,
+ rpp::Handle& Handle)
+{
+ RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h};
+ Rpp32u numThreads = Handle.GetNumThreads();
+
+ omp_set_dynamic(0);
+#pragma omp parallel for num_threads(numThreads)
+ for (int batchCount = 0; batchCount < dstDescPtr->n; batchCount++)
+ {
+ RpptROI roi;
+ RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount];
+ compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType);
+
+ Rpp16f *srcPtr1Image, *srcPtr2Image, *dstPtrImage;
+ srcPtr1Image = srcPtr1 + batchCount * srcDescPtr->strides.nStride;
+ srcPtr2Image = srcPtr2 + batchCount * srcDescPtr->strides.nStride;
+ dstPtrImage = dstPtr + batchCount * dstDescPtr->strides.nStride;
+
+ Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier;
+
+ Rpp16f *srcPtr1Channel, *srcPtr2Channel, *dstPtrChannel;
+ srcPtr1Channel = srcPtr1Image + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier);
+ srcPtr2Channel = srcPtr2Image + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier);
+ dstPtrChannel = dstPtrImage;
+
+#if __AVX2__
+ Rpp32u alignedLength = (bufferLength / 24) * 24;
+ Rpp32u vectorIncrement = 24;
+ Rpp32u vectorIncrementPerChannel = 8;
+#endif
+
+ // Exclusive OR with fused output-layout toggle (NHWC -> NCHW)
+ if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NCHW))
+ {
+ Rpp16f *srcPtr1Row, *srcPtr2Row, *dstPtrRowR, *dstPtrRowG, *dstPtrRowB;
+ srcPtr1Row = srcPtr1Channel;
+ srcPtr2Row = srcPtr2Channel;
+ dstPtrRowR = dstPtrChannel;
+ dstPtrRowG = dstPtrRowR + dstDescPtr->strides.cStride;
+ dstPtrRowB = dstPtrRowG + dstDescPtr->strides.cStride;
+
+ for (int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp16f *srcPtr1Temp, *srcPtr2Temp, *dstPtrTempR, *dstPtrTempG, *dstPtrTempB;
+ srcPtr1Temp = srcPtr1Row;
+ srcPtr2Temp = srcPtr2Row;
+ dstPtrTempR = dstPtrRowR;
+ dstPtrTempG = dstPtrRowG;
+ dstPtrTempB = dstPtrRowB;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement)
+ {
+ __m256 p1[3], p2[3];
+
+ rpp_simd_load(rpp_load24_f16pkd3_to_f32pln3_avx, srcPtr1Temp, p1); // simd loads
+ rpp_simd_load(rpp_load24_f16pkd3_to_f32pln3_avx, srcPtr2Temp, p2); // simd loads
+ p1[0] = _mm256_cvtepi32_ps(_mm256_xor_si256(_mm256_cvtps_epi32(_mm256_mul_ps(p1[0], avx_p255)), _mm256_cvtps_epi32(_mm256_mul_ps(p2[0], avx_p255)))); // exclusive_or computation
+ p1[1] = _mm256_cvtepi32_ps(_mm256_xor_si256(_mm256_cvtps_epi32(_mm256_mul_ps(p1[1], avx_p255)), _mm256_cvtps_epi32(_mm256_mul_ps(p2[1], avx_p255)))); // exclusive_or computation
+ p1[2] = _mm256_cvtepi32_ps(_mm256_xor_si256(_mm256_cvtps_epi32(_mm256_mul_ps(p1[2], avx_p255)), _mm256_cvtps_epi32(_mm256_mul_ps(p2[2], avx_p255)))); // exclusive_or computation
+ p1[0] = _mm256_mul_ps(p1[0], avx_p1op255);
+ p1[1] = _mm256_mul_ps(p1[1], avx_p1op255);
+ p1[2] = _mm256_mul_ps(p1[2], avx_p1op255);
+ rpp_simd_store(rpp_store24_f32pln3_to_f16pln3_avx, dstPtrTempR, dstPtrTempG, dstPtrTempB, p1); // simd stores
+
+ srcPtr1Temp += vectorIncrement;
+ srcPtr2Temp += vectorIncrement;
+ dstPtrTempR += vectorIncrementPerChannel;
+ dstPtrTempG += vectorIncrementPerChannel;
+ dstPtrTempB += vectorIncrementPerChannel;
+ }
+#endif
+ for (; vectorLoopCount < bufferLength; vectorLoopCount += 3)
+ {
+                    *dstPtrTempR++ = static_cast<Rpp16f>(RPPPIXELCHECKF32((float)((uint)(std::nearbyintf)(srcPtr1Temp[0] * 255) ^ (uint)(std::nearbyintf)(srcPtr2Temp[0] * 255)) / 255));
+                    *dstPtrTempG++ = static_cast<Rpp16f>(RPPPIXELCHECKF32((float)((uint)(std::nearbyintf)(srcPtr1Temp[1] * 255) ^ (uint)(std::nearbyintf)(srcPtr2Temp[1] * 255)) / 255));
+                    *dstPtrTempB++ = static_cast<Rpp16f>(RPPPIXELCHECKF32((float)((uint)(std::nearbyintf)(srcPtr1Temp[2] * 255) ^ (uint)(std::nearbyintf)(srcPtr2Temp[2] * 255)) / 255));
+
+ srcPtr1Temp += 3;
+ srcPtr2Temp += 3;
+ }
+
+ srcPtr1Row += srcDescPtr->strides.hStride;
+ srcPtr2Row += srcDescPtr->strides.hStride;
+ dstPtrRowR += dstDescPtr->strides.hStride;
+ dstPtrRowG += dstDescPtr->strides.hStride;
+ dstPtrRowB += dstDescPtr->strides.hStride;
+ }
+ }
+
+ // Exclusive OR with fused output-layout toggle (NCHW -> NHWC)
+ else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NHWC))
+ {
+ Rpp16f *srcPtr1RowR, *srcPtr1RowG, *srcPtr1RowB, *srcPtr2RowR, *srcPtr2RowG, *srcPtr2RowB, *dstPtrRow;
+ srcPtr1RowR = srcPtr1Channel;
+ srcPtr1RowG = srcPtr1RowR + srcDescPtr->strides.cStride;
+ srcPtr1RowB = srcPtr1RowG + srcDescPtr->strides.cStride;
+ srcPtr2RowR = srcPtr2Channel;
+ srcPtr2RowG = srcPtr2RowR + srcDescPtr->strides.cStride;
+ srcPtr2RowB = srcPtr2RowG + srcDescPtr->strides.cStride;
+ dstPtrRow = dstPtrChannel;
+
+ for (int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp16f *srcPtr1TempR, *srcPtr1TempG, *srcPtr1TempB, *srcPtr2TempR, *srcPtr2TempG, *srcPtr2TempB, *dstPtrTemp;
+ srcPtr1TempR = srcPtr1RowR;
+ srcPtr1TempG = srcPtr1RowG;
+ srcPtr1TempB = srcPtr1RowB;
+ srcPtr2TempR = srcPtr2RowR;
+ srcPtr2TempG = srcPtr2RowG;
+ srcPtr2TempB = srcPtr2RowB;
+ dstPtrTemp = dstPtrRow;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel)
+ {
+ __m256 p1[3], p2[3];
+
+ rpp_simd_load(rpp_load24_f16pln3_to_f32pln3_avx, srcPtr1TempR, srcPtr1TempG, srcPtr1TempB, p1); // simd loads
+ rpp_simd_load(rpp_load24_f16pln3_to_f32pln3_avx, srcPtr2TempR, srcPtr2TempG, srcPtr2TempB, p2); // simd loads
+ p1[0] = _mm256_cvtepi32_ps(_mm256_xor_si256(_mm256_cvtps_epi32(_mm256_mul_ps(p1[0], avx_p255)), _mm256_cvtps_epi32(_mm256_mul_ps(p2[0], avx_p255)))); // exclusive_or computation
+ p1[1] = _mm256_cvtepi32_ps(_mm256_xor_si256(_mm256_cvtps_epi32(_mm256_mul_ps(p1[1], avx_p255)), _mm256_cvtps_epi32(_mm256_mul_ps(p2[1], avx_p255)))); // exclusive_or computation
+ p1[2] = _mm256_cvtepi32_ps(_mm256_xor_si256(_mm256_cvtps_epi32(_mm256_mul_ps(p1[2], avx_p255)), _mm256_cvtps_epi32(_mm256_mul_ps(p2[2], avx_p255)))); // exclusive_or computation
+ p1[0] = _mm256_mul_ps(p1[0], avx_p1op255);
+ p1[1] = _mm256_mul_ps(p1[1], avx_p1op255);
+ p1[2] = _mm256_mul_ps(p1[2], avx_p1op255);
+ rpp_simd_store(rpp_store24_f32pln3_to_f16pkd3_avx, dstPtrTemp, p1); // simd stores
+
+ srcPtr1TempR += vectorIncrementPerChannel;
+ srcPtr1TempG += vectorIncrementPerChannel;
+ srcPtr1TempB += vectorIncrementPerChannel;
+ srcPtr2TempR += vectorIncrementPerChannel;
+ srcPtr2TempG += vectorIncrementPerChannel;
+ srcPtr2TempB += vectorIncrementPerChannel;
+ dstPtrTemp += vectorIncrement;
+ }
+#endif
+ for (; vectorLoopCount < bufferLength; vectorLoopCount++)
+ {
+ dstPtrTemp[0] = static_cast<Rpp16f>(RPPPIXELCHECKF32((float)((uint)(std::nearbyintf)(*srcPtr1TempR * 255) ^ (uint)(std::nearbyintf)(*srcPtr2TempR * 255)) / 255));
+ dstPtrTemp[1] = static_cast<Rpp16f>(RPPPIXELCHECKF32((float)((uint)(std::nearbyintf)(*srcPtr1TempG * 255) ^ (uint)(std::nearbyintf)(*srcPtr2TempG * 255)) / 255));
+ dstPtrTemp[2] = static_cast<Rpp16f>(RPPPIXELCHECKF32((float)((uint)(std::nearbyintf)(*srcPtr1TempB * 255) ^ (uint)(std::nearbyintf)(*srcPtr2TempB * 255)) / 255));
+
+ srcPtr1TempR++;
+ srcPtr1TempG++;
+ srcPtr1TempB++;
+ srcPtr2TempR++;
+ srcPtr2TempG++;
+ srcPtr2TempB++;
+ dstPtrTemp += 3;
+ }
+
+ srcPtr1RowR += srcDescPtr->strides.hStride;
+ srcPtr1RowG += srcDescPtr->strides.hStride;
+ srcPtr1RowB += srcDescPtr->strides.hStride;
+ srcPtr2RowR += srcDescPtr->strides.hStride;
+ srcPtr2RowG += srcDescPtr->strides.hStride;
+ srcPtr2RowB += srcDescPtr->strides.hStride;
+ dstPtrRow += dstDescPtr->strides.hStride;
+ }
+ }
+
+ // Exclusive OR without fused output-layout toggle (NCHW -> NCHW for 3 channel)
+ else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NCHW))
+ {
+#if __AVX2__
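+ // the planar AVX2 path below processes 8 F16 pixels per channel per iteration (24 floats across R, G and B), so the aligned length is masked down to a multiple of 8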
+ alignedLength = bufferLength & ~7;
+#endif
+
+ Rpp16f *srcPtr1RowR, *srcPtr1RowG, *srcPtr1RowB, *srcPtr2RowR, *srcPtr2RowG, *srcPtr2RowB, *dstPtrRowR, *dstPtrRowG, *dstPtrRowB;
+ srcPtr1RowR = srcPtr1Channel;
+ srcPtr1RowG = srcPtr1RowR + srcDescPtr->strides.cStride;
+ srcPtr1RowB = srcPtr1RowG + srcDescPtr->strides.cStride;
+ srcPtr2RowR = srcPtr2Channel;
+ srcPtr2RowG = srcPtr2RowR + srcDescPtr->strides.cStride;
+ srcPtr2RowB = srcPtr2RowG + srcDescPtr->strides.cStride;
+ dstPtrRowR = dstPtrChannel;
+ dstPtrRowG = dstPtrRowR + dstDescPtr->strides.cStride;
+ dstPtrRowB = dstPtrRowG + dstDescPtr->strides.cStride;
+ for (int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp16f *srcPtr1TempR, *srcPtr1TempG, *srcPtr1TempB, *srcPtr2TempR, *srcPtr2TempG, *srcPtr2TempB, *dstPtrTempR, *dstPtrTempG, *dstPtrTempB;
+ srcPtr1TempR = srcPtr1RowR;
+ srcPtr1TempG = srcPtr1RowG;
+ srcPtr1TempB = srcPtr1RowB;
+ srcPtr2TempR = srcPtr2RowR;
+ srcPtr2TempG = srcPtr2RowG;
+ srcPtr2TempB = srcPtr2RowB;
+ dstPtrTempR = dstPtrRowR;
+ dstPtrTempG = dstPtrRowG;
+ dstPtrTempB = dstPtrRowB;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel)
+ {
+ __m256 p1[3], p2[3];
+
+ rpp_simd_load(rpp_load24_f16pln3_to_f32pln3_avx, srcPtr1TempR, srcPtr1TempG, srcPtr1TempB, p1); // simd loads
+ rpp_simd_load(rpp_load24_f16pln3_to_f32pln3_avx, srcPtr2TempR, srcPtr2TempG, srcPtr2TempB, p2); // simd loads
+ p1[0] = _mm256_cvtepi32_ps(_mm256_xor_si256(_mm256_cvtps_epi32(_mm256_mul_ps(p1[0], avx_p255)), _mm256_cvtps_epi32(_mm256_mul_ps(p2[0], avx_p255)))); // exclusive_or computation
+ p1[1] = _mm256_cvtepi32_ps(_mm256_xor_si256(_mm256_cvtps_epi32(_mm256_mul_ps(p1[1], avx_p255)), _mm256_cvtps_epi32(_mm256_mul_ps(p2[1], avx_p255)))); // exclusive_or computation
+ p1[2] = _mm256_cvtepi32_ps(_mm256_xor_si256(_mm256_cvtps_epi32(_mm256_mul_ps(p1[2], avx_p255)), _mm256_cvtps_epi32(_mm256_mul_ps(p2[2], avx_p255)))); // exclusive_or computation
+ p1[0] = _mm256_mul_ps(p1[0], avx_p1op255);
+ p1[1] = _mm256_mul_ps(p1[1], avx_p1op255);
+ p1[2] = _mm256_mul_ps(p1[2], avx_p1op255);
+ rpp_simd_store(rpp_store24_f32pln3_to_f16pln3_avx, dstPtrTempR, dstPtrTempG, dstPtrTempB, p1);
+
+ srcPtr1TempR += vectorIncrementPerChannel;
+ srcPtr1TempG += vectorIncrementPerChannel;
+ srcPtr1TempB += vectorIncrementPerChannel;
+ srcPtr2TempR += vectorIncrementPerChannel;
+ srcPtr2TempG += vectorIncrementPerChannel;
+ srcPtr2TempB += vectorIncrementPerChannel;
+ dstPtrTempR += vectorIncrementPerChannel;
+ dstPtrTempG += vectorIncrementPerChannel;
+ dstPtrTempB += vectorIncrementPerChannel;
+ }
+#endif
+ for (; vectorLoopCount < bufferLength; vectorLoopCount++)
+ {
+ *dstPtrTempR = static_cast<Rpp16f>(RPPPIXELCHECKF32((float)((uint)(std::nearbyintf)(*srcPtr1TempR * 255) ^ (uint)(std::nearbyintf)(*srcPtr2TempR * 255)) / 255));
+ *dstPtrTempG = static_cast<Rpp16f>(RPPPIXELCHECKF32((float)((uint)(std::nearbyintf)(*srcPtr1TempG * 255) ^ (uint)(std::nearbyintf)(*srcPtr2TempG * 255)) / 255));
+ *dstPtrTempB = static_cast<Rpp16f>(RPPPIXELCHECKF32((float)((uint)(std::nearbyintf)(*srcPtr1TempB * 255) ^ (uint)(std::nearbyintf)(*srcPtr2TempB * 255)) / 255));
+
+ srcPtr1TempR++;
+ srcPtr1TempG++;
+ srcPtr1TempB++;
+ srcPtr2TempR++;
+ srcPtr2TempG++;
+ srcPtr2TempB++;
+ dstPtrTempR++;
+ dstPtrTempG++;
+ dstPtrTempB++;
+ }
+
+ srcPtr1RowR += srcDescPtr->strides.hStride;
+ srcPtr1RowG += srcDescPtr->strides.hStride;
+ srcPtr1RowB += srcDescPtr->strides.hStride;
+ srcPtr2RowR += srcDescPtr->strides.hStride;
+ srcPtr2RowG += srcDescPtr->strides.hStride;
+ srcPtr2RowB += srcDescPtr->strides.hStride;
+ dstPtrRowR += dstDescPtr->strides.hStride;
+ dstPtrRowG += dstDescPtr->strides.hStride;
+ dstPtrRowB += dstDescPtr->strides.hStride;
+ }
+ }
+
+ // Exclusive OR without fused output-layout toggle (NHWC -> NHWC or NCHW -> NCHW)
+ else
+ {
+#if __AVX2__
+ alignedLength = bufferLength & ~7;
+#endif
+
+ for (int c = 0; c < layoutParams.channelParam; c++)
+ {
+ Rpp16f *srcPtr1Row, *srcPtr2Row, *dstPtrRow;
+ srcPtr1Row = srcPtr1Channel;
+ srcPtr2Row = srcPtr2Channel;
+ dstPtrRow = dstPtrChannel;
+
+ for (int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp16f *srcPtr1Temp, *srcPtr2Temp, *dstPtrTemp;
+ srcPtr1Temp = srcPtr1Row;
+ srcPtr2Temp = srcPtr2Row;
+ dstPtrTemp = dstPtrRow;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel)
+ {
+ __m256 p1[1], p2[1];
+
+ rpp_simd_load(rpp_load8_f16_to_f32_avx, srcPtr1Temp, p1); // simd loads
+ rpp_simd_load(rpp_load8_f16_to_f32_avx, srcPtr2Temp, p2); // simd loads
+ p1[0] = _mm256_cvtepi32_ps(_mm256_xor_si256(_mm256_cvtps_epi32(_mm256_mul_ps(p1[0], avx_p255)), _mm256_cvtps_epi32(_mm256_mul_ps(p2[0], avx_p255)))); // exclusive_or computation
+ p1[0] = _mm256_mul_ps(p1[0], avx_p1op255);
+ rpp_simd_store(rpp_store8_f32_to_f16_avx, dstPtrTemp, p1); // simd stores
+
+ srcPtr1Temp += vectorIncrementPerChannel;
+ srcPtr2Temp += vectorIncrementPerChannel;
+ dstPtrTemp += vectorIncrementPerChannel;
+ }
+#endif
+ for (; vectorLoopCount < bufferLength; vectorLoopCount++)
+ {
+ *dstPtrTemp++ = static_cast<Rpp16f>(RPPPIXELCHECKF32((float)((uint)(std::nearbyintf)(*srcPtr1Temp * 255) ^ (uint)(std::nearbyintf)(*srcPtr2Temp * 255)) / 255));
+
+ srcPtr1Temp++;
+ srcPtr2Temp++;
+ }
+
+ srcPtr1Row += srcDescPtr->strides.hStride;
+ srcPtr2Row += srcDescPtr->strides.hStride;
+ dstPtrRow += dstDescPtr->strides.hStride;
+ }
+
+ srcPtr1Channel += srcDescPtr->strides.cStride;
+ srcPtr2Channel += srcDescPtr->strides.cStride;
+ dstPtrChannel += dstDescPtr->strides.cStride;
+ }
+ }
+ }
+
+ return RPP_SUCCESS;
+}
+
+RppStatus exclusive_or_i8_i8_host_tensor(Rpp8s *srcPtr1,
+ Rpp8s *srcPtr2,
+ RpptDescPtr srcDescPtr,
+ Rpp8s *dstPtr,
+ RpptDescPtr dstDescPtr,
+ RpptROIPtr roiTensorPtrSrc,
+ RpptRoiType roiType,
+ RppLayoutParams layoutParams,
+ rpp::Handle& Handle)
+{
+ RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h};
+ Rpp32u numThreads = Handle.GetNumThreads();
+
+ omp_set_dynamic(0);
+#pragma omp parallel for num_threads(numThreads)
+ for (int batchCount = 0; batchCount < dstDescPtr->n; batchCount++)
+ {
+ RpptROI roi;
+ RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount];
+ compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType);
+
+ Rpp8s *srcPtr1Image, *srcPtr2Image, *dstPtrImage;
+ srcPtr1Image = srcPtr1 + batchCount * srcDescPtr->strides.nStride;
+ srcPtr2Image = srcPtr2 + batchCount * srcDescPtr->strides.nStride;
+ dstPtrImage = dstPtr + batchCount * dstDescPtr->strides.nStride;
+
+ Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier;
+
+ Rpp8s *srcPtr1Channel, *srcPtr2Channel, *dstPtrChannel;
+ srcPtr1Channel = srcPtr1Image + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier);
+ srcPtr2Channel = srcPtr2Image + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier);
+ dstPtrChannel = dstPtrImage;
+
+#if __AVX2__
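+ // each AVX2 iteration consumes 96 packed I8 values: 32 pixels per channel, one 256-bit register per channel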
+ Rpp32u alignedLength = (bufferLength / 96) * 96;
+ Rpp32u vectorIncrement = 96;
+ Rpp32u vectorIncrementPerChannel = 32;
+#endif
+
+ // Exclusive OR with fused output-layout toggle (NHWC -> NCHW)
+ if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NCHW))
+ {
+ Rpp8s *srcPtr1Row, *srcPtr2Row, *dstPtrRowR, *dstPtrRowG, *dstPtrRowB;
+ srcPtr1Row = srcPtr1Channel;
+ srcPtr2Row = srcPtr2Channel;
+ dstPtrRowR = dstPtrChannel;
+ dstPtrRowG = dstPtrRowR + dstDescPtr->strides.cStride;
+ dstPtrRowB = dstPtrRowG + dstDescPtr->strides.cStride;
+
+ for (int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp8s *srcPtr1Temp, *srcPtr2Temp, *dstPtrTempR, *dstPtrTempG, *dstPtrTempB;
+ srcPtr1Temp = srcPtr1Row;
+ srcPtr2Temp = srcPtr2Row;
+ dstPtrTempR = dstPtrRowR;
+ dstPtrTempG = dstPtrRowG;
+ dstPtrTempB = dstPtrRowB;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement)
+ {
+ __m256i p1[3], p2[3];
+
+ rpp_simd_load(rpp_load96_i8pkd3_to_u8pln3, srcPtr1Temp, p1); // simd loads
+ rpp_simd_load(rpp_load96_i8pkd3_to_u8pln3, srcPtr2Temp, p2); // simd loads
+ p1[0] = _mm256_xor_si256(p1[0], p2[0]); // exclusive_or computation
+ p1[1] = _mm256_xor_si256(p1[1], p2[1]); // exclusive_or computation
+ p1[2] = _mm256_xor_si256(p1[2], p2[2]); // exclusive_or computation
+ rpp_simd_store(rpp_store96_u8pln3_to_i8pln3, dstPtrTempR, dstPtrTempG, dstPtrTempB, p1); // simd stores
+
+ srcPtr1Temp += vectorIncrement;
+ srcPtr2Temp += vectorIncrement;
+ dstPtrTempR += vectorIncrementPerChannel;
+ dstPtrTempG += vectorIncrementPerChannel;
+ dstPtrTempB += vectorIncrementPerChannel;
+ }
+#endif
+ for (; vectorLoopCount < bufferLength; vectorLoopCount += 3)
+ {
+ *dstPtrTempR++ = static_cast<Rpp8s>(RPPPIXELCHECKI8(((srcPtr1Temp[0] + 128) ^ (srcPtr2Temp[0] + 128)) - 128));
+ *dstPtrTempG++ = static_cast<Rpp8s>(RPPPIXELCHECKI8(((srcPtr1Temp[1] + 128) ^ (srcPtr2Temp[1] + 128)) - 128));
+ *dstPtrTempB++ = static_cast<Rpp8s>(RPPPIXELCHECKI8(((srcPtr1Temp[2] + 128) ^ (srcPtr2Temp[2] + 128)) - 128));
+
+ srcPtr1Temp += 3;
+ srcPtr2Temp += 3;
+ }
+
+ srcPtr1Row += srcDescPtr->strides.hStride;
+ srcPtr2Row += srcDescPtr->strides.hStride;
+ dstPtrRowR += dstDescPtr->strides.hStride;
+ dstPtrRowG += dstDescPtr->strides.hStride;
+ dstPtrRowB += dstDescPtr->strides.hStride;
+ }
+ }
+
+ // Exclusive OR with fused output-layout toggle (NCHW -> NHWC)
+ else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NHWC))
+ {
+ Rpp8s *srcPtr1RowR, *srcPtr1RowG, *srcPtr1RowB, *srcPtr2RowR, *srcPtr2RowG, *srcPtr2RowB, *dstPtrRow;
+ srcPtr1RowR = srcPtr1Channel;
+ srcPtr1RowG = srcPtr1RowR + srcDescPtr->strides.cStride;
+ srcPtr1RowB = srcPtr1RowG + srcDescPtr->strides.cStride;
+ srcPtr2RowR = srcPtr2Channel;
+ srcPtr2RowG = srcPtr2RowR + srcDescPtr->strides.cStride;
+ srcPtr2RowB = srcPtr2RowG + srcDescPtr->strides.cStride;
+ dstPtrRow = dstPtrChannel;
+
+ for (int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp8s *srcPtr1TempR, *srcPtr1TempG, *srcPtr1TempB, *srcPtr2TempR, *srcPtr2TempG, *srcPtr2TempB, *dstPtrTemp;
+ srcPtr1TempR = srcPtr1RowR;
+ srcPtr1TempG = srcPtr1RowG;
+ srcPtr1TempB = srcPtr1RowB;
+ srcPtr2TempR = srcPtr2RowR;
+ srcPtr2TempG = srcPtr2RowG;
+ srcPtr2TempB = srcPtr2RowB;
+ dstPtrTemp = dstPtrRow;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel)
+ {
+ __m256i p1[3], p2[3];
+
+ rpp_simd_load(rpp_load96_i8pln3_to_u8pln3, srcPtr1TempR, srcPtr1TempG, srcPtr1TempB, p1); // simd loads
+ rpp_simd_load(rpp_load96_i8pln3_to_u8pln3, srcPtr2TempR, srcPtr2TempG, srcPtr2TempB, p2); // simd loads
+ p1[0] = _mm256_xor_si256(p1[0], p2[0]); // exclusive_or computation
+ p1[1] = _mm256_xor_si256(p1[1], p2[1]); // exclusive_or computation
+ p1[2] = _mm256_xor_si256(p1[2], p2[2]); // exclusive_or computation
+ rpp_simd_store(rpp_store96_u8pln3_to_i8pkd3, dstPtrTemp, p1); // simd stores
+
+ srcPtr1TempR += vectorIncrementPerChannel;
+ srcPtr1TempG += vectorIncrementPerChannel;
+ srcPtr1TempB += vectorIncrementPerChannel;
+ srcPtr2TempR += vectorIncrementPerChannel;
+ srcPtr2TempG += vectorIncrementPerChannel;
+ srcPtr2TempB += vectorIncrementPerChannel;
+ dstPtrTemp += vectorIncrement;
+ }
+#endif
+ for (; vectorLoopCount < bufferLength; vectorLoopCount++)
+ {
+ dstPtrTemp[0] = static_cast<Rpp8s>(RPPPIXELCHECKI8((static_cast<Rpp8u>(*srcPtr1TempR + 128) ^ static_cast<Rpp8u>(*srcPtr2TempR + 128)) - 128));
+ dstPtrTemp[1] = static_cast<Rpp8s>(RPPPIXELCHECKI8((static_cast<Rpp8u>(*srcPtr1TempG + 128) ^ static_cast<Rpp8u>(*srcPtr2TempG + 128)) - 128));
+ dstPtrTemp[2] = static_cast<Rpp8s>(RPPPIXELCHECKI8((static_cast<Rpp8u>(*srcPtr1TempB + 128) ^ static_cast<Rpp8u>(*srcPtr2TempB + 128)) - 128));
+
+ srcPtr1TempR++;
+ srcPtr1TempG++;
+ srcPtr1TempB++;
+ srcPtr2TempR++;
+ srcPtr2TempG++;
+ srcPtr2TempB++;
+ dstPtrTemp += 3;
+ }
+
+ srcPtr1RowR += srcDescPtr->strides.hStride;
+ srcPtr1RowG += srcDescPtr->strides.hStride;
+ srcPtr1RowB += srcDescPtr->strides.hStride;
+ srcPtr2RowR += srcDescPtr->strides.hStride;
+ srcPtr2RowG += srcDescPtr->strides.hStride;
+ srcPtr2RowB += srcDescPtr->strides.hStride;
+ dstPtrRow += dstDescPtr->strides.hStride;
+ }
+ }
+
+ // Exclusive OR without fused output-layout toggle (NCHW -> NCHW for 3 channel)
+ else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NCHW))
+ {
+#if __AVX2__
+ alignedLength = bufferLength & ~31;
+#endif
+
+ Rpp8s *srcPtr1RowR, *srcPtr1RowG, *srcPtr1RowB, *srcPtr2RowR, *srcPtr2RowG, *srcPtr2RowB, *dstPtrRowR, *dstPtrRowG, *dstPtrRowB;
+ srcPtr1RowR = srcPtr1Channel;
+ srcPtr1RowG = srcPtr1RowR + srcDescPtr->strides.cStride;
+ srcPtr1RowB = srcPtr1RowG + srcDescPtr->strides.cStride;
+ srcPtr2RowR = srcPtr2Channel;
+ srcPtr2RowG = srcPtr2RowR + srcDescPtr->strides.cStride;
+ srcPtr2RowB = srcPtr2RowG + srcDescPtr->strides.cStride;
+ dstPtrRowR = dstPtrChannel;
+ dstPtrRowG = dstPtrRowR + dstDescPtr->strides.cStride;
+ dstPtrRowB = dstPtrRowG + dstDescPtr->strides.cStride;
+ for (int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp8s *srcPtr1TempR, *srcPtr1TempG, *srcPtr1TempB, *srcPtr2TempR, *srcPtr2TempG, *srcPtr2TempB, *dstPtrTempR, *dstPtrTempG, *dstPtrTempB;
+ srcPtr1TempR = srcPtr1RowR;
+ srcPtr1TempG = srcPtr1RowG;
+ srcPtr1TempB = srcPtr1RowB;
+ srcPtr2TempR = srcPtr2RowR;
+ srcPtr2TempG = srcPtr2RowG;
+ srcPtr2TempB = srcPtr2RowB;
+ dstPtrTempR = dstPtrRowR;
+ dstPtrTempG = dstPtrRowG;
+ dstPtrTempB = dstPtrRowB;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel)
+ {
+ __m256i p1[3], p2[3];
+
+ rpp_simd_load(rpp_load96_i8pln3_to_u8pln3, srcPtr1TempR, srcPtr1TempG, srcPtr1TempB, p1); // simd loads
+ rpp_simd_load(rpp_load96_i8pln3_to_u8pln3, srcPtr2TempR, srcPtr2TempG, srcPtr2TempB, p2); // simd loads
+ p1[0] = _mm256_xor_si256(p1[0], p2[0]); // exclusive_or computation
+ p1[1] = _mm256_xor_si256(p1[1], p2[1]); // exclusive_or computation
+ p1[2] = _mm256_xor_si256(p1[2], p2[2]); // exclusive_or computation
+ rpp_simd_store(rpp_store96_u8pln3_to_i8pln3, dstPtrTempR, dstPtrTempG, dstPtrTempB, p1);
+
+ srcPtr1TempR += vectorIncrementPerChannel;
+ srcPtr1TempG += vectorIncrementPerChannel;
+ srcPtr1TempB += vectorIncrementPerChannel;
+ srcPtr2TempR += vectorIncrementPerChannel;
+ srcPtr2TempG += vectorIncrementPerChannel;
+ srcPtr2TempB += vectorIncrementPerChannel;
+ dstPtrTempR += vectorIncrementPerChannel;
+ dstPtrTempG += vectorIncrementPerChannel;
+ dstPtrTempB += vectorIncrementPerChannel;
+ }
+#endif
+ for (; vectorLoopCount < bufferLength; vectorLoopCount++)
+ {
+ *dstPtrTempR = static_cast<Rpp8s>(RPPPIXELCHECKI8((static_cast<Rpp8u>(*srcPtr1TempR + 128) ^ static_cast<Rpp8u>(*srcPtr2TempR + 128)) - 128));
+ *dstPtrTempG = static_cast<Rpp8s>(RPPPIXELCHECKI8((static_cast<Rpp8u>(*srcPtr1TempG + 128) ^ static_cast<Rpp8u>(*srcPtr2TempG + 128)) - 128));
+ *dstPtrTempB = static_cast<Rpp8s>(RPPPIXELCHECKI8((static_cast<Rpp8u>(*srcPtr1TempB + 128) ^ static_cast<Rpp8u>(*srcPtr2TempB + 128)) - 128));
+
+ srcPtr1TempR++;
+ srcPtr1TempG++;
+ srcPtr1TempB++;
+ srcPtr2TempR++;
+ srcPtr2TempG++;
+ srcPtr2TempB++;
+ dstPtrTempR++;
+ dstPtrTempG++;
+ dstPtrTempB++;
+ }
+
+ srcPtr1RowR += srcDescPtr->strides.hStride;
+ srcPtr1RowG += srcDescPtr->strides.hStride;
+ srcPtr1RowB += srcDescPtr->strides.hStride;
+ srcPtr2RowR += srcDescPtr->strides.hStride;
+ srcPtr2RowG += srcDescPtr->strides.hStride;
+ srcPtr2RowB += srcDescPtr->strides.hStride;
+ dstPtrRowR += dstDescPtr->strides.hStride;
+ dstPtrRowG += dstDescPtr->strides.hStride;
+ dstPtrRowB += dstDescPtr->strides.hStride;
+ }
+ }
+
+ // Exclusive OR without fused output-layout toggle (NHWC -> NHWC or NCHW -> NCHW for 1 channel)
+ else
+ {
+#if __AVX2__
+ alignedLength = bufferLength & ~31;
+#endif
+
+ Rpp8s *srcPtr1Row, *srcPtr2Row, *dstPtrRow;
+ srcPtr1Row = srcPtr1Channel;
+ srcPtr2Row = srcPtr2Channel;
+ dstPtrRow = dstPtrChannel;
+
+ for (int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp8s *srcPtr1Temp, *srcPtr2Temp, *dstPtrTemp;
+ srcPtr1Temp = srcPtr1Row;
+ srcPtr2Temp = srcPtr2Row;
+ dstPtrTemp = dstPtrRow;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel)
+ {
+ __m256i p1, p2;
+
+ p1 = _mm256_add_epi8(avx_pxConvertI8, _mm256_loadu_si256((__m256i *)srcPtr1Temp)); // simd loads
+ p2 = _mm256_add_epi8(avx_pxConvertI8, _mm256_loadu_si256((__m256i *)srcPtr2Temp)); // simd loads
+ p1 = _mm256_xor_si256(p1, p2); // exclusive_or computation
+ _mm256_storeu_si256((__m256i *)dstPtrTemp, _mm256_sub_epi8(p1, avx_pxConvertI8)); // simd stores
+
+ srcPtr1Temp += vectorIncrementPerChannel;
+ srcPtr2Temp += vectorIncrementPerChannel;
+ dstPtrTemp += vectorIncrementPerChannel;
+ }
+#endif
+ for (; vectorLoopCount < bufferLength; vectorLoopCount++)
+ {
+ *dstPtrTemp++ = static_cast<Rpp8s>(RPPPIXELCHECKI8((static_cast<Rpp8u>(*srcPtr1Temp + 128) ^ static_cast<Rpp8u>(*srcPtr2Temp + 128)) - 128));
+
+ srcPtr1Temp++;
+ srcPtr2Temp++;
+ }
+
+ srcPtr1Row += srcDescPtr->strides.hStride;
+ srcPtr2Row += srcDescPtr->strides.hStride;
+ dstPtrRow += dstDescPtr->strides.hStride;
+ }
+
+ srcPtr1Channel += srcDescPtr->strides.cStride;
+ srcPtr2Channel += srcDescPtr->strides.cStride;
+ dstPtrChannel += dstDescPtr->strides.cStride;
+ }
+ }
+
+ return RPP_SUCCESS;
+}
diff --git a/src/modules/hip/hip_tensor_logical_operations.hpp b/src/modules/hip/hip_tensor_logical_operations.hpp
index 636789246..946dfe0f5 100644
--- a/src/modules/hip/hip_tensor_logical_operations.hpp
+++ b/src/modules/hip/hip_tensor_logical_operations.hpp
@@ -26,6 +26,7 @@ SOFTWARE.
#define HIP_TENSOR_LOGICAL_OPERATIONS_HPP
#include "kernel/bitwise_and.hpp"
+#include "kernel/exclusive_or.hpp"
#include "kernel/bitwise_or.hpp"
#endif // HIP_TENSOR_LOGICAL_OPERATIONS_HPP
\ No newline at end of file
diff --git a/src/modules/hip/kernel/exclusive_or.hpp b/src/modules/hip/kernel/exclusive_or.hpp
new file mode 100644
index 000000000..bd5c5cd5f
--- /dev/null
+++ b/src/modules/hip/kernel/exclusive_or.hpp
@@ -0,0 +1,247 @@
+#include <hip/hip_runtime.h>
+#include "rpp_hip_common.hpp"
+
+/* Exclusive OR is a bitwise logical operation defined only on U8/I8 types.
+ For an Rpp32f/Rpp16f precision image (pixel values from 0-1), the exclusive OR is applied on a 0-255
+ range-translated approximation of the original 0-1 decimal-range image:
+ the pixel values are scaled to 0-255, converted to integers, XOR-ed bitwise, and scaled back to 0-1. */
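+
+/* Illustrative walk-through of the F32/F16 path (host-reference arithmetic; the exact conversion on the HIP backend is delegated to rpp_hip_math_exclusiveOr8):
+ src1 = 0.50 -> nearbyint(0.50 * 255) = 128
+ src2 = 0.25 -> nearbyint(0.25 * 255) = 64
+ 128 ^ 64 = 192, and 192 / 255 ~= 0.7529 is written back as the output pixel */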
+
+template <typename T>
+__device__ void exclusive_or_hip_compute(T *srcPtr, d_float8 *src1_f8, d_float8 *src2_f8, d_float8 *dst_f8)
+{
+ if constexpr ((std::is_same<T, float>::value) || (std::is_same<T, half>::value))
+ {
+ rpp_hip_math_multiply8_const(src1_f8, src1_f8, static_cast<float>(255));
+ rpp_hip_math_multiply8_const(src2_f8, src2_f8, static_cast<float>(255));
+ rpp_hip_math_exclusiveOr8(src1_f8, src2_f8, dst_f8);
+ rpp_hip_math_multiply8_const(dst_f8, dst_f8, static_cast<float>(ONE_OVER_255));
+ }
+ else if constexpr (std::is_same<T, schar>::value)
+ {
+ rpp_hip_math_add8_const(src1_f8, src1_f8, static_cast<float>(128));
+ rpp_hip_math_add8_const(src2_f8, src2_f8, static_cast<float>(128));
+ rpp_hip_math_exclusiveOr8(src1_f8, src2_f8, dst_f8);
+ rpp_hip_math_subtract8_const(dst_f8, dst_f8, static_cast<float>(128));
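+ // e.g. I8 inputs -28 and 100 shift to 100 and 228; 100 ^ 228 = 128, so the stored I8 result is 128 - 128 = 0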
+ }
+ else
+ rpp_hip_math_exclusiveOr8(src1_f8, src2_f8, dst_f8);
+}
+
+template <typename T>
+__global__ void exclusive_or_pkd_hip_tensor(T *srcPtr1,
+ T *srcPtr2,
+ uint2 srcStridesNH,
+ T *dstPtr,
+ uint2 dstStridesNH,
+ RpptROIPtr roiTensorPtrSrc)
+{
+ int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8;
+ int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
+ int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z;
+
+ if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth))
+ {
+ return;
+ }
+
+ uint srcIdx = (id_z * srcStridesNH.x) + ((id_y + roiTensorPtrSrc[id_z].xywhROI.xy.y) * srcStridesNH.y) + (id_x + roiTensorPtrSrc[id_z].xywhROI.xy.x) * 3;
+ uint dstIdx = (id_z * dstStridesNH.x) + (id_y * dstStridesNH.y) + id_x * 3;
+
+ d_float24 src1_f24, src2_f24, dst_f24;
+
+ rpp_hip_load24_pkd3_and_unpack_to_float24_pln3(srcPtr1 + srcIdx, &src1_f24);
+ rpp_hip_load24_pkd3_and_unpack_to_float24_pln3(srcPtr2 + srcIdx, &src2_f24);
+ exclusive_or_hip_compute(srcPtr1, &src1_f24.f8[0], &src2_f24.f8[0], &dst_f24.f8[0]);
+ exclusive_or_hip_compute(srcPtr1, &src1_f24.f8[1], &src2_f24.f8[1], &dst_f24.f8[1]);
+ exclusive_or_hip_compute(srcPtr1, &src1_f24.f8[2], &src2_f24.f8[2], &dst_f24.f8[2]);
+ rpp_hip_pack_float24_pln3_and_store24_pkd3(dstPtr + dstIdx, &dst_f24);
+}
+
+template <typename T>
+__global__ void exclusive_or_pln_hip_tensor(T *srcPtr1,
+ T *srcPtr2,
+ uint3 srcStridesNCH,
+ T *dstPtr,
+ uint3 dstStridesNCH,
+ int channelsDst,
+ RpptROIPtr roiTensorPtrSrc)
+{
+ int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8;
+ int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
+ int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z;
+
+ if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth))
+ {
+ return;
+ }
+
+ uint srcIdx = (id_z * srcStridesNCH.x) + ((id_y + roiTensorPtrSrc[id_z].xywhROI.xy.y) * srcStridesNCH.z) + (id_x + roiTensorPtrSrc[id_z].xywhROI.xy.x);
+ uint dstIdx = (id_z * dstStridesNCH.x) + (id_y * dstStridesNCH.z) + id_x;
+
+ d_float8 src1_f8, src2_f8, dst_f8;
+
+ rpp_hip_load8_and_unpack_to_float8(srcPtr1 + srcIdx, &src1_f8);
+ rpp_hip_load8_and_unpack_to_float8(srcPtr2 + srcIdx, &src2_f8);
+ exclusive_or_hip_compute(srcPtr1, &src1_f8, &src2_f8, &dst_f8);
+ rpp_hip_pack_float8_and_store8(dstPtr + dstIdx, &dst_f8);
+
+ if (channelsDst == 3)
+ {
+ srcIdx += srcStridesNCH.y;
+ dstIdx += dstStridesNCH.y;
+
+ rpp_hip_load8_and_unpack_to_float8(srcPtr1 + srcIdx, &src1_f8);
+ rpp_hip_load8_and_unpack_to_float8(srcPtr2 + srcIdx, &src2_f8);
+ exclusive_or_hip_compute(srcPtr1, &src1_f8, &src2_f8, &dst_f8);
+ rpp_hip_pack_float8_and_store8(dstPtr + dstIdx, &dst_f8);
+
+ srcIdx += srcStridesNCH.y;
+ dstIdx += dstStridesNCH.y;
+
+ rpp_hip_load8_and_unpack_to_float8(srcPtr1 + srcIdx, &src1_f8);
+ rpp_hip_load8_and_unpack_to_float8(srcPtr2 + srcIdx, &src2_f8);
+ exclusive_or_hip_compute(srcPtr1, &src1_f8, &src2_f8, &dst_f8);
+ rpp_hip_pack_float8_and_store8(dstPtr + dstIdx, &dst_f8);
+ }
+}
+
+template <typename T>
+__global__ void exclusive_or_pkd3_pln3_hip_tensor(T *srcPtr1,
+ T *srcPtr2,
+ uint2 srcStridesNH,
+ T *dstPtr,
+ uint3 dstStridesNCH,
+ RpptROIPtr roiTensorPtrSrc)
+{
+ int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8;
+ int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
+ int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z;
+
+ if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth))
+ {
+ return;
+ }
+
+ uint srcIdx = (id_z * srcStridesNH.x) + ((id_y + roiTensorPtrSrc[id_z].xywhROI.xy.y) * srcStridesNH.y) + ((id_x + roiTensorPtrSrc[id_z].xywhROI.xy.x) * 3);
+ uint dstIdx = (id_z * dstStridesNCH.x) + (id_y * dstStridesNCH.z) + id_x;
+
+ d_float24 src1_f24, src2_f24, dst_f24;
+
+ rpp_hip_load24_pkd3_and_unpack_to_float24_pln3(srcPtr1 + srcIdx, &src1_f24);
+ rpp_hip_load24_pkd3_and_unpack_to_float24_pln3(srcPtr2 + srcIdx, &src2_f24);
+ exclusive_or_hip_compute(srcPtr1, &src1_f24.f8[0], &src2_f24.f8[0], &dst_f24.f8[0]);
+ exclusive_or_hip_compute(srcPtr1, &src1_f24.f8[1], &src2_f24.f8[1], &dst_f24.f8[1]);
+ exclusive_or_hip_compute(srcPtr1, &src1_f24.f8[2], &src2_f24.f8[2], &dst_f24.f8[2]);
+ rpp_hip_pack_float24_pln3_and_store24_pln3(dstPtr + dstIdx, dstStridesNCH.y, &dst_f24);
+}
+
+template <typename T>
+__global__ void exclusive_or_pln3_pkd3_hip_tensor(T *srcPtr1,
+ T *srcPtr2,
+ uint3 srcStridesNCH,
+ T *dstPtr,
+ uint2 dstStridesNH,
+ RpptROIPtr roiTensorPtrSrc)
+{
+ int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8;
+ int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
+ int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z;
+
+ if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth))
+ {
+ return;
+ }
+
+ uint srcIdx = (id_z * srcStridesNCH.x) + ((id_y + roiTensorPtrSrc[id_z].xywhROI.xy.y) * srcStridesNCH.z) + (id_x + roiTensorPtrSrc[id_z].xywhROI.xy.x);
+ uint dstIdx = (id_z * dstStridesNH.x) + (id_y * dstStridesNH.y) + id_x * 3;
+
+ d_float24 src1_f24, src2_f24, dst_f24;
+
+ rpp_hip_load24_pln3_and_unpack_to_float24_pkd3(srcPtr1 + srcIdx, srcStridesNCH.y, &src1_f24);
+ rpp_hip_load24_pln3_and_unpack_to_float24_pkd3(srcPtr2 + srcIdx, srcStridesNCH.y, &src2_f24);
+ exclusive_or_hip_compute(srcPtr1, &src1_f24.f8[0], &src2_f24.f8[0], &dst_f24.f8[0]);
+ exclusive_or_hip_compute(srcPtr1, &src1_f24.f8[1], &src2_f24.f8[1], &dst_f24.f8[1]);
+ exclusive_or_hip_compute(srcPtr1, &src1_f24.f8[2], &src2_f24.f8[2], &dst_f24.f8[2]);
+ rpp_hip_pack_float24_pkd3_and_store24_pkd3(dstPtr + dstIdx, &dst_f24);
+}
+
+template <typename T>
+RppStatus hip_exec_exclusive_or_tensor(T *srcPtr1,
+ T *srcPtr2,
+ RpptDescPtr srcDescPtr,
+ T *dstPtr,
+ RpptDescPtr dstDescPtr,
+ RpptROIPtr roiTensorPtrSrc,
+ RpptRoiType roiType,
+ rpp::Handle& handle)
+{
+ if (roiType == RpptRoiType::LTRB)
+ hip_exec_roi_converison_ltrb_to_xywh(roiTensorPtrSrc, handle);
+
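+ // each thread computes 8 output pixels along the width (matching the id_x * 8 indexing in the kernels above), so the x grid dimension is ceil(width / 8)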
+ int globalThreads_x = (dstDescPtr->w + 7) >> 3;
+ int globalThreads_y = dstDescPtr->h;
+ int globalThreads_z = dstDescPtr->n;
+
+ if ((srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NHWC))
+ {
+ hipLaunchKernelGGL(exclusive_or_pkd_hip_tensor,
+ dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)),
+ dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z),
+ 0,
+ handle.GetStream(),
+ srcPtr1,
+ srcPtr2,
+ make_uint2(srcDescPtr->strides.nStride, srcDescPtr->strides.hStride),
+ dstPtr,
+ make_uint2(dstDescPtr->strides.nStride, dstDescPtr->strides.hStride),
+ roiTensorPtrSrc);
+ }
+ else if ((srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NCHW))
+ {
+ hipLaunchKernelGGL(exclusive_or_pln_hip_tensor,
+ dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)),
+ dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z),
+ 0,
+ handle.GetStream(),
+ srcPtr1,
+ srcPtr2,
+ make_uint3(srcDescPtr->strides.nStride, srcDescPtr->strides.cStride, srcDescPtr->strides.hStride),
+ dstPtr,
+ make_uint3(dstDescPtr->strides.nStride, dstDescPtr->strides.cStride, dstDescPtr->strides.hStride),
+ dstDescPtr->c,
+ roiTensorPtrSrc);
+ }
+ else if ((srcDescPtr->c == 3) && (dstDescPtr->c == 3))
+ {
+ if ((srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NCHW))
+ {
+ hipLaunchKernelGGL(exclusive_or_pkd3_pln3_hip_tensor,
+ dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)),
+ dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z),
+ 0,
+ handle.GetStream(),
+ srcPtr1,
+ srcPtr2,
+ make_uint2(srcDescPtr->strides.nStride, srcDescPtr->strides.hStride),
+ dstPtr,
+ make_uint3(dstDescPtr->strides.nStride, dstDescPtr->strides.cStride, dstDescPtr->strides.hStride),
+ roiTensorPtrSrc);
+ }
+ else if ((srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NHWC))
+ {
+ hipLaunchKernelGGL(exclusive_or_pln3_pkd3_hip_tensor,
+ dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)),
+ dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z),
+ 0,
+ handle.GetStream(),
+ srcPtr1,
+ srcPtr2,
+ make_uint3(srcDescPtr->strides.nStride, srcDescPtr->strides.cStride, srcDescPtr->strides.hStride),
+ dstPtr,
+ make_uint2(dstDescPtr->strides.nStride, dstDescPtr->strides.hStride),
+ roiTensorPtrSrc);
+ }
+ }
+
+ return RPP_SUCCESS;
+}
diff --git a/src/modules/rppt_tensor_logical_operations.cpp b/src/modules/rppt_tensor_logical_operations.cpp
index dc4adbc9b..636a9c6c5 100644
--- a/src/modules/rppt_tensor_logical_operations.cpp
+++ b/src/modules/rppt_tensor_logical_operations.cpp
@@ -96,6 +96,71 @@ RppStatus rppt_bitwise_and_host(RppPtr_t srcPtr1,
return RPP_SUCCESS;
}
+/******************** exclusive OR ********************/
+
+RppStatus rppt_exclusive_or_host(RppPtr_t srcPtr1,
+ RppPtr_t srcPtr2,
+ RpptDescPtr srcDescPtr,
+ RppPtr_t dstPtr,
+ RpptDescPtr dstDescPtr,
+ RpptROIPtr roiTensorPtrSrc,
+ RpptRoiType roiType,
+ rppHandle_t rppHandle)
+{
+ RppLayoutParams layoutParams = get_layout_params(srcDescPtr->layout, srcDescPtr->c);
+
+ if ((srcDescPtr->dataType == RpptDataType::U8) && (dstDescPtr->dataType == RpptDataType::U8))
+ {
+ exclusive_or_u8_u8_host_tensor(static_cast<Rpp8u*>(srcPtr1) + srcDescPtr->offsetInBytes,
+ static_cast<Rpp8u*>(srcPtr2) + srcDescPtr->offsetInBytes,
+ srcDescPtr,
+ static_cast<Rpp8u*>(dstPtr) + dstDescPtr->offsetInBytes,
+ dstDescPtr,
+ roiTensorPtrSrc,
+ roiType,
+ layoutParams,
+ rpp::deref(rppHandle));
+ }
+ else if ((srcDescPtr->dataType == RpptDataType::F16) && (dstDescPtr->dataType == RpptDataType::F16))
+ {
+ exclusive_or_f16_f16_host_tensor(reinterpret_cast<Rpp16f*>(static_cast<Rpp8u*>(srcPtr1) + srcDescPtr->offsetInBytes),
+ reinterpret_cast<Rpp16f*>(static_cast<Rpp8u*>(srcPtr2) + srcDescPtr->offsetInBytes),
+ srcDescPtr,
+ reinterpret_cast<Rpp16f*>(static_cast<Rpp8u*>(dstPtr) + dstDescPtr->offsetInBytes),
+ dstDescPtr,
+ roiTensorPtrSrc,
+ roiType,
+ layoutParams,
+ rpp::deref(rppHandle));
+ }
+ else if ((srcDescPtr->dataType == RpptDataType::F32) && (dstDescPtr->dataType == RpptDataType::F32))
+ {
+ exclusive_or_f32_f32_host_tensor(reinterpret_cast<Rpp32f*>(static_cast<Rpp8u*>(srcPtr1) + srcDescPtr->offsetInBytes),
+ reinterpret_cast<Rpp32f*>(static_cast<Rpp8u*>(srcPtr2) + srcDescPtr->offsetInBytes),
+ srcDescPtr,
+ reinterpret_cast<Rpp32f*>(static_cast<Rpp8u*>(dstPtr) + dstDescPtr->offsetInBytes),
+ dstDescPtr,
+ roiTensorPtrSrc,
+ roiType,
+ layoutParams,
+ rpp::deref(rppHandle));
+ }
+ else if ((srcDescPtr->dataType == RpptDataType::I8) && (dstDescPtr->dataType == RpptDataType::I8))
+ {
+ exclusive_or_i8_i8_host_tensor(static_cast<Rpp8s*>(srcPtr1) + srcDescPtr->offsetInBytes,
+ static_cast<Rpp8s*>(srcPtr2) + srcDescPtr->offsetInBytes,
+ srcDescPtr,
+ static_cast<Rpp8s*>(dstPtr) + dstDescPtr->offsetInBytes,
+ dstDescPtr,
+ roiTensorPtrSrc,
+ roiType,
+ layoutParams,
+ rpp::deref(rppHandle));
+ }
+
+ return RPP_SUCCESS;
+}
+
/******************** bitwise OR ********************/
RppStatus rppt_bitwise_or_host(RppPtr_t srcPtr1,
@@ -232,6 +297,70 @@ RppStatus rppt_bitwise_and_gpu(RppPtr_t srcPtr1,
#endif // backend
}
+/******************** exclusive OR ********************/
+
+RppStatus rppt_exclusive_or_gpu(RppPtr_t srcPtr1,
+ RppPtr_t srcPtr2,
+ RpptDescPtr srcDescPtr,
+ RppPtr_t dstPtr,
+ RpptDescPtr dstDescPtr,
+ RpptROIPtr roiTensorPtrSrc,
+ RpptRoiType roiType,
+ rppHandle_t rppHandle)
+{
+#ifdef HIP_COMPILE
+
+ if ((srcDescPtr->dataType == RpptDataType::U8) && (dstDescPtr->dataType == RpptDataType::U8))
+ {
+ hip_exec_exclusive_or_tensor(static_cast<Rpp8u*>(srcPtr1) + srcDescPtr->offsetInBytes,
+ static_cast<Rpp8u*>(srcPtr2) + srcDescPtr->offsetInBytes,
+ srcDescPtr,
+ static_cast<Rpp8u*>(dstPtr) + dstDescPtr->offsetInBytes,
+ dstDescPtr,
+ roiTensorPtrSrc,
+ roiType,
+ rpp::deref(rppHandle));
+ }
+ else if ((srcDescPtr->dataType == RpptDataType::F16) && (dstDescPtr->dataType == RpptDataType::F16))
+ {
+ hip_exec_exclusive_or_tensor(reinterpret_cast<half*>(static_cast<Rpp8u*>(srcPtr1) + srcDescPtr->offsetInBytes),
+ reinterpret_cast<half*>(static_cast<Rpp8u*>(srcPtr2) + srcDescPtr->offsetInBytes),
+ srcDescPtr,
+ reinterpret_cast<half*>(static_cast<Rpp8u*>(dstPtr) + dstDescPtr->offsetInBytes),
+ dstDescPtr,
+ roiTensorPtrSrc,
+ roiType,
+ rpp::deref(rppHandle));
+ }
+ else if ((srcDescPtr->dataType == RpptDataType::F32) && (dstDescPtr->dataType == RpptDataType::F32))
+ {
+ hip_exec_exclusive_or_tensor(reinterpret_cast<Rpp32f*>(static_cast<Rpp8u*>(srcPtr1) + srcDescPtr->offsetInBytes),
+ reinterpret_cast<Rpp32f*>(static_cast<Rpp8u*>(srcPtr2) + srcDescPtr->offsetInBytes),
+ srcDescPtr,
+ reinterpret_cast<Rpp32f*>(static_cast<Rpp8u*>(dstPtr) + dstDescPtr->offsetInBytes),
+ dstDescPtr,
+ roiTensorPtrSrc,
+ roiType,
+ rpp::deref(rppHandle));
+ }
+ else if ((srcDescPtr->dataType == RpptDataType::I8) && (dstDescPtr->dataType == RpptDataType::I8))
+ {
+ hip_exec_exclusive_or_tensor(static_cast<Rpp8s*>(srcPtr1) + srcDescPtr->offsetInBytes,
+ static_cast<Rpp8s*>(srcPtr2) + srcDescPtr->offsetInBytes,
+ srcDescPtr,
+ static_cast<Rpp8s*>(dstPtr) + dstDescPtr->offsetInBytes,
+ dstDescPtr,
+ roiTensorPtrSrc,
+ roiType,
+ rpp::deref(rppHandle));
+ }
+
+ return RPP_SUCCESS;
+#elif defined(OCL_COMPILE)
+ return RPP_ERROR_NOT_IMPLEMENTED;
+#endif // backend
+}
+
/******************** bitwise OR ********************/
RppStatus rppt_bitwise_or_gpu(RppPtr_t srcPtr1,
diff --git a/utilities/test_suite/HIP/Tensor_image_hip.cpp b/utilities/test_suite/HIP/Tensor_image_hip.cpp
index 1f50908ba..997f3d2dc 100644
--- a/utilities/test_suite/HIP/Tensor_image_hip.cpp
+++ b/utilities/test_suite/HIP/Tensor_image_hip.cpp
@@ -62,7 +62,7 @@ int main(int argc, char **argv)
bool additionalParamCase = (testCase == 8 || testCase == 21 || testCase == 23|| testCase == 24 || testCase == 28 || testCase == 40 || testCase == 41 || testCase == 49 || testCase == 54 || testCase == 79);
bool kernelSizeCase = (testCase == 40 || testCase == 41 || testCase == 49 || testCase == 54);
- bool dualInputCase = (testCase == 2 || testCase == 30 || testCase == 33 || testCase == 61 || testCase == 63 || testCase == 65 || testCase == 68);
+ bool dualInputCase = (testCase == 2 || testCase == 30 || testCase == 33 || testCase == 61 || testCase == 63 || testCase == 65 || testCase == 67 || testCase == 68);
bool randomOutputCase = (testCase == 6 || testCase == 8 || testCase == 10 || testCase == 11 || testCase == 84 || testCase == 49 || testCase == 54);
bool nonQACase = (testCase == 24 || testCase == 28 || testCase == 54);
bool interpolationTypeCase = (testCase == 21 || testCase == 23 || testCase == 24|| testCase == 28 || testCase == 79);
@@ -1305,6 +1305,18 @@ int main(int argc, char **argv)
break;
}
+ case 67:
+ {
+ testCaseName = "exclusive_or";
+
+ startWallTime = omp_get_wtime();
+ if (inputBitDepth == 0 || inputBitDepth == 1 || inputBitDepth == 2 || inputBitDepth == 5)
+ rppt_exclusive_or_gpu(d_input, d_input_second, srcDescPtr, d_output, dstDescPtr, roiTensorPtrSrc, roiTypeSrc, handle);
+ else
+ missingFuncFlag = 1;
+
+ break;
+ }
case 68:
{
testCaseName = "bitwise_or";
diff --git a/utilities/test_suite/HIP/runImageTests.py b/utilities/test_suite/HIP/runImageTests.py
index 0e15f1eac..e8e3b0b26 100644
--- a/utilities/test_suite/HIP/runImageTests.py
+++ b/utilities/test_suite/HIP/runImageTests.py
@@ -272,7 +272,7 @@ def rpp_test_suite_parser_and_validator():
subprocess.call(["make", "-j16"], cwd=".") # nosec
# List of cases supported
-supportedCaseList = ['0', '1', '2', '4', '5', '6', '8', '10', '11', '13', '15', '20', '21', '23', '24', '26', '28', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '45', '46', '49', '54', '61', '63', '65', '68', '70', '79', '80', '82', '83', '84', '85', '86', '87', '88', '89', '90', '91', '92']
+supportedCaseList = ['0', '1', '2', '4', '5', '6', '8', '10', '11', '13', '15', '20', '21', '23', '24', '26', '28', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '45', '46', '49', '54', '61', '63', '65', '67', '68', '70', '79', '80', '82', '83', '84', '85', '86', '87', '88', '89', '90', '91', '92']
# Create folders based on testType and profilingOption
if testType == 1 and profilingOption == "YES":
@@ -329,7 +329,7 @@ def rpp_test_suite_parser_and_validator():
run_performance_test(loggingFolder, logFileLayout, srcPath1, srcPath2, dstPath, case, numRuns, testType, layout, qaMode, decoderType, batchSize, roiList)
elif (testType == 1 and profilingOption == "YES"):
- NEW_FUNC_GROUP_LIST = [0, 15, 20, 29, 36, 40, 42, 49, 56, 65, 69]
+ NEW_FUNC_GROUP_LIST = [0, 15, 20, 29, 36, 40, 42, 49, 56, 65, 67, 69]
noCaseSupported = all(case not in supportedCaseList for case in caseList)
if noCaseSupported:
diff --git a/utilities/test_suite/HOST/Tensor_image_host.cpp b/utilities/test_suite/HOST/Tensor_image_host.cpp
index 8914e576c..bc4857c9e 100644
--- a/utilities/test_suite/HOST/Tensor_image_host.cpp
+++ b/utilities/test_suite/HOST/Tensor_image_host.cpp
@@ -62,7 +62,7 @@ int main(int argc, char **argv)
bool additionalParamCase = (testCase == 8 || testCase == 21 || testCase == 23 || testCase == 24 || testCase == 28 || testCase == 49 || testCase ==54 || testCase == 79);
bool kernelSizeCase = (testCase == 49 || testCase == 54);
- bool dualInputCase = (testCase == 2 || testCase == 30 || testCase == 33 || testCase == 61 || testCase == 63 || testCase == 65 || testCase == 68);
+ bool dualInputCase = (testCase == 2 || testCase == 30 || testCase == 33 || testCase == 61 || testCase == 63 || testCase == 65 || testCase == 67 || testCase == 68);
bool randomOutputCase = (testCase == 6 || testCase == 8 || testCase == 10 || testCase == 84);
bool nonQACase = (testCase == 24);
bool interpolationTypeCase = (testCase == 21 || testCase == 23 || testCase == 24 || testCase == 79);
@@ -1285,6 +1285,19 @@ int main(int argc, char **argv)
break;
}
+ case 67:
+ {
+ testCaseName = "exclusive_or";
+
+ startWallTime = omp_get_wtime();
+ startCpuTime = clock();
+ if (inputBitDepth == 0 || inputBitDepth == 1 || inputBitDepth == 2 || inputBitDepth == 5)
+ rppt_exclusive_or_host(input, input_second, srcDescPtr, output, dstDescPtr, roiTensorPtrSrc, roiTypeSrc, handle);
+ else
+ missingFuncFlag = 1;
+
+ break;
+ }
case 68:
{
testCaseName = "bitwise_or";
diff --git a/utilities/test_suite/HOST/runImageTests.py b/utilities/test_suite/HOST/runImageTests.py
index b50f20819..ff39a7db9 100644
--- a/utilities/test_suite/HOST/runImageTests.py
+++ b/utilities/test_suite/HOST/runImageTests.py
@@ -260,7 +260,7 @@ def rpp_test_suite_parser_and_validator():
subprocess.call(["make", "-j16"], cwd=".") # nosec
# List of cases supported
-supportedCaseList = ['0', '1', '2', '4', '5', '6', '8', '10', '11', '13', '15', '20', '21', '23', '24', '26', '28', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '45', '46', '49', '54', '61', '63', '65', '68', '70', '79', '80', '81', '82', '83', '84', '85', '86', '87', '88', '89', '90', '91', '92']
+supportedCaseList = ['0', '1', '2', '4', '5', '6', '8', '10', '11', '13', '15', '20', '21', '23', '24', '26', '28', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '45', '46', '49', '54', '61', '63', '65', '67', '68', '70', '79', '80', '81', '82', '83', '84', '85', '86', '87', '88', '89', '90', '91', '92']
if testType == 0:
noCaseSupported = all(case not in supportedCaseList for case in caseList)
diff --git a/utilities/test_suite/REFERENCE_OUTPUT/exclusive_or/exclusive_or_u8_Tensor.bin b/utilities/test_suite/REFERENCE_OUTPUT/exclusive_or/exclusive_or_u8_Tensor.bin
new file mode 100644
index 000000000..1c6464a29
Binary files /dev/null and b/utilities/test_suite/REFERENCE_OUTPUT/exclusive_or/exclusive_or_u8_Tensor.bin differ
diff --git a/utilities/test_suite/common.py b/utilities/test_suite/common.py
index ab896c191..691999bfd 100644
--- a/utilities/test_suite/common.py
+++ b/utilities/test_suite/common.py
@@ -71,6 +71,7 @@
61: ["magnitude", "HOST", "HIP"],
63: ["phase", "HOST", "HIP"],
65: ["bitwise_and", "HOST", "HIP"],
+ 67: ["exclusive_or", "HOST", "HIP"],
68: ["bitwise_or", "HOST", "HIP"],
70: ["copy", "HOST", "HIP"],
79: ["remap", "HOST", "HIP"],
@@ -122,7 +123,7 @@
"geometric_augmentations" : [20, 21, 23, 24, 26, 28, 33, 37, 38, 39, 63, 79, 80, 92],
"filter_augmentations" : [49, 54],
"arithmetic_operations" : [61],
- "logical_operations" : [65, 68],
+ "logical_operations" : [65, 67, 68],
"data_exchange_operations" : [70, 85, 86],
"statistical_operations" : [15, 87, 88, 89, 90, 91]
}
diff --git a/utilities/test_suite/rpp_test_suite_image.h b/utilities/test_suite/rpp_test_suite_image.h
index 48d0dd778..6ae9dfa5d 100644
--- a/utilities/test_suite/rpp_test_suite_image.h
+++ b/utilities/test_suite/rpp_test_suite_image.h
@@ -99,6 +99,7 @@ std::map<int, string> augmentationMap =
{61, "magnitude"},
{63, "phase"},
{65, "bitwise_and"},
+ {67, "exclusive_or"},
{68, "bitwise_or"},
{70, "copy"},
{79, "remap"},