diff --git a/CHANGELOG.md b/CHANGELOG.md index 00fa320ee..d20747386 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,7 @@ Full documentation for RPP is available at [https://rocm.docs.amd.com/projects/r * RPP Fog augmentation on HOST and HIP * RPP Rain augmentation on HOST and HIP * RPP Warp Perspective on HOST and HIP +* RPP Tensor Exclusive-Or support on HOST and HIP * RPP Threshold on HOST and HIP ## (Unreleased) RPP 1.9.4 @@ -24,7 +25,7 @@ Full documentation for RPP is available at [https://rocm.docs.amd.com/projects/r * RPP Audio Support HIP - Spectrogram -## (Unreleased) RPP 1.9.2 +## (Unreleased) RPP 1.9.2 ### Changed diff --git a/docs/data/doxygenOutputs/logical_operations_exclusive_or_img150x150.png b/docs/data/doxygenOutputs/logical_operations_exclusive_or_img150x150.png new file mode 100644 index 000000000..78ec2feca Binary files /dev/null and b/docs/data/doxygenOutputs/logical_operations_exclusive_or_img150x150.png differ diff --git a/include/rppt_tensor_logical_operations.h b/include/rppt_tensor_logical_operations.h index 28dff69ce..730eb4fa0 100644 --- a/include/rppt_tensor_logical_operations.h +++ b/include/rppt_tensor_logical_operations.h @@ -86,6 +86,50 @@ RppStatus rppt_bitwise_and_host(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPtr RppStatus rppt_bitwise_and_gpu(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle); #endif // GPU_SUPPORT +/*! \brief Exclusive OR computation on HOST backend for a NCHW/NHWC layout tensor + * \details This function computes exclusive OR of corresponding pixels for a batch of RGB(3 channel) / greyscale(1 channel) images with an NHWC/NCHW tensor layout.
+ * srcPtr depth ranges - Rpp8u (0 to 255), Rpp16f (0 to 1), Rpp32f (0 to 1), Rpp8s (-128 to 127). + * dstPtr depth ranges - Will be same depth as srcPtr. + * \image html img150x150.png Sample Input1 + * \image html img150x150_2.png Sample Input2 + * \image html logical_operations_exclusive_or_img150x150.png Sample Output + * \param [in] srcPtr1 source1 tensor in HOST memory + * \param [in] srcPtr2 source2 tensor in HOST memory + * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) + * \param [out] dstPtr destination tensor in HOST memory + * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) + * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) + * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() + * \return A \ref RppStatus enumeration. + * \retval RPP_SUCCESS Successful completion. + * \retval RPP_ERROR* Unsuccessful completion. + */ +RppStatus rppt_exclusive_or_host(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle); + +#ifdef GPU_SUPPORT +/*! \brief Exclusive OR computation on HIP backend for a NCHW/NHWC layout tensor + * \details This function computes exclusive OR of corresponding pixels for a batch of RGB(3 channel) / greyscale(1 channel) images with an NHWC/NCHW tensor layout.
+ * srcPtr depth ranges - Rpp8u (0 to 255), Rpp16f (0 to 1), Rpp32f (0 to 1), Rpp8s (-128 to 127). + * dstPtr depth ranges - Will be same depth as srcPtr. + * \image html img150x150.png Sample Input1 + * \image html img150x150_2.png Sample Input2 + * \image html logical_operations_exclusive_or_img150x150.png Sample Output + * \param [in] srcPtr1 source1 tensor in HIP memory + * \param [in] srcPtr2 source2 tensor in HIP memory + * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) + * \param [out] dstPtr destination tensor in HIP memory + * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) + * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) + * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() + * \return A \ref RppStatus enumeration. + * \retval RPP_SUCCESS Successful completion. + * \retval RPP_ERROR* Unsuccessful completion. + */ +RppStatus rppt_exclusive_or_gpu(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle); +#endif // GPU_SUPPORT + /*! \brief Bitwise OR computation on HOST backend for a NCHW/NHWC layout tensor * \details This function computes bitwise OR of corresponding pixels for a batch of RGB(3 channel) / greyscale(1 channel) images with an NHWC/NCHW tensor layout.
* srcPtr depth ranges - Rpp8u (0 to 255), Rpp16f (0 to 1), Rpp32f (0 to 1), Rpp8s (-128 to 127). diff --git a/src/include/cpu/rpp_cpu_simd.hpp b/src/include/cpu/rpp_cpu_simd.hpp index c58f207d3..b02cd23b5 100644 --- a/src/include/cpu/rpp_cpu_simd.hpp +++ b/src/include/cpu/rpp_cpu_simd.hpp @@ -470,6 +470,30 @@ inline void rpp_load48_u8pkd3_to_u8pln3(Rpp8u *srcPtr, __m128i *px) px[2] = _mm_shuffle_epi8(_mm_unpacklo_epi8(pxSrc[6], pxSrc[7]), pxMaskRGB); /* unpack 8 lo-pixels of pxSrc[6] and pxSrc[7] to get B01-16 */ } +inline void rpp_load96_u8pkd3_to_u8pln3(Rpp8u *srcPtr, __m256i *px) +{ + __m256i pxSrc[8]; + __m256i pxMask = _mm256_castsi128_si256(_mm_setr_epi8(0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11, 12, 13, 14, 15)); + pxMask = _mm256_permute2f128_si256(pxMask, pxMask, 0); + __m256i pxMaskRGB = _mm256_castsi128_si256(_mm_setr_epi8(0, 4, 8, 12, 2, 6, 10, 14, 1, 5, 9, 13, 3, 7, 11, 15)); + pxMaskRGB = _mm256_permute2f128_si256(pxMaskRGB, pxMaskRGB, 0); + pxSrc[0] = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i *)srcPtr)), _mm_loadu_si128((__m128i *)(srcPtr + 48)), 1); /* load [R01|G01|B01|R02|G02|B02|R03|G03|B03|R04|G04|B04|R05|G05|B05|R06|R17|G17|B17|R18|G18|B18|R19|G19|B19|R20|G20|B20|R21|G21|B21|R22] - Need RGB 01-04, 17-20 */ + pxSrc[1] = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(srcPtr + 12))), _mm_loadu_si128((__m128i *)(srcPtr + 60)), 1); /* load [R05|G05|B05|R06|G06|B06|R07|G07|B07|R08|G08|B08|R09|G09|B09|R10|R21|G21|B21|R22|G22|B22|R23|G23|B23|R24|G24|B24|R25|G25|B25|R26] - Need RGB 05-08, 21-24 */ + pxSrc[2] = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(srcPtr + 24))), _mm_loadu_si128((__m128i *)(srcPtr + 72)), 1); /* load [R09|G09|B09|R10|G10|B10|R11|G11|B11|R12|G12|B12|R13|G13|B13|R14|R25|G25|B25|R26|G26|B26|R27|G27|B27|R28|G28|B28|R29|G29|B29|R30] - Need RGB 09-12, 25-28 */ + pxSrc[3] = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(srcPtr + 36))), _mm_loadu_si128((__m128i *)(srcPtr + 84)), 1); /* load [R13|G13|B13|R14|G14|B14|R15|G15|B15|R16|G16|B16|R17|G17|B17|R18|R29|G29|B29|R30|G30|B30|R31|G31|B31|R32|G32|B32|R33|G33|B33|R34] - Need RGB 13-16, 29-32 */ + pxSrc[4] = _mm256_shuffle_epi8(pxSrc[0], pxMask); /* shuffle to get [R01|R02|R03|R04|G01|G02|G03|G04 || B01|B02|B03|B04|R05|G05|B05|R06 || R17|R18|R19|R20|G17|G18|G19|G20 || B17|B18|B19|B20|R21|G21|B21|R22] - Need R01-04, G01-04, B01-04, R17-20, G17-20, B17-20 */ + pxSrc[5] = _mm256_shuffle_epi8(pxSrc[1], pxMask); /* shuffle to get [R05|R06|R07|R08|G05|G06|G07|G08 || B05|B06|B07|B08|R09|G09|B09|R10 || R21|R22|R23|R24|G21|G22|G23|G24 || B21|B22|B23|B24|R25|G25|B25|R26] - Need R05-08, G05-08, B05-08, R21-24, G21-24, B21-24 */ + pxSrc[6] = _mm256_shuffle_epi8(pxSrc[2], pxMask); /* shuffle to get [R09|R10|R11|R12|G09|G10|G11|G12 || B09|B10|B11|B12|R13|G13|B13|R14 || R25|R26|R27|R28|G25|G26|G27|G28 || B25|B26|B27|B28|R29|G29|B29|R30] - Need R09-12, G09-12, B09-12, R25-28, G25-28, B25-28 */ + pxSrc[7] = _mm256_shuffle_epi8(pxSrc[3], pxMask); /* shuffle to get [R13|R14|R15|R16|G13|G14|G15|G16 || B13|B14|B15|B16|R17|G17|B17|R18 || R29|R30|R31|R32|G29|G30|G31|G32 || B29|B30|B31|B32|R33|G33|B33|R34] - Need R13-16, G13-16, B13-16, R29-32, G29-32, B29-32 */ + pxSrc[0] = _mm256_unpacklo_epi8(pxSrc[4], pxSrc[5]); /* unpack 8 lo-pixels of pxSrc[4] and pxSrc[5] */ + pxSrc[1] = _mm256_unpacklo_epi8(pxSrc[6], pxSrc[7]); /* unpack 8 lo-pixels of pxSrc[6] and pxSrc[7] */ + pxSrc[2] = _mm256_unpackhi_epi8(pxSrc[4], pxSrc[5]); /* 
unpack 8 hi-pixels of pxSrc[4] and pxSrc[5] */ + pxSrc[3] = _mm256_unpackhi_epi8(pxSrc[6], pxSrc[7]); /* unpack 8 hi-pixels of pxSrc[6] and pxSrc[7] */ + px[0] = _mm256_shuffle_epi8(_mm256_unpacklo_epi8(pxSrc[0], pxSrc[1]), pxMaskRGB); /* unpack 8 lo-pixels of pxSrc[0] and pxSrc[1] to get R01-16 */ + px[1] = _mm256_shuffle_epi8(_mm256_unpackhi_epi8(pxSrc[0], pxSrc[1]), pxMaskRGB); /* unpack 8 hi-pixels of pxSrc[0] and pxSrc[1] to get G01-16 */ + px[2] = _mm256_shuffle_epi8(_mm256_unpacklo_epi8(pxSrc[2], pxSrc[3]), pxMaskRGB); /* unpack 8 lo-pixels of pxSrc[2] and pxSrc[3] to get B01-16 */ +} + inline void rpp_store48_u8pln3_to_u8pln3(Rpp8u *dstPtrR, Rpp8u *dstPtrG, Rpp8u *dstPtrB, __m128i *px) { _mm_storeu_si128((__m128i *)dstPtrR, px[0]); /* store [R01|R02|R03|R04|R05|R06|R07|R08|R09|R10|R11|R12|R13|R14|R15|R16] */ @@ -477,6 +501,13 @@ inline void rpp_store48_u8pln3_to_u8pln3(Rpp8u *dstPtrR, Rpp8u *dstPtrG, Rpp8u * _mm_storeu_si128((__m128i *)dstPtrB, px[2]); /* store [B01|B02|B03|B04|B05|B06|B07|B08|B09|B10|B11|B12|B13|B14|B15|B16] */ } +inline void rpp_store96_u8pln3_to_u8pln3(Rpp8u *dstPtrR, Rpp8u *dstPtrG, Rpp8u *dstPtrB, __m256i *px) +{ + _mm256_storeu_si256((__m256i *)dstPtrR, px[0]); /* store [R01|R02|R03|R04|R05|R06|R07|R08|R09|R10|R11|R12|R13|R14|R15|R16|R17|R18|R19|R20|R21|R22|R23|R24|R25|R26|R27|R28|R29|R30|R31|R32] */ + _mm256_storeu_si256((__m256i *)dstPtrG, px[1]); /* store [G01|G02|G03|G04|G05|G06|G07|G08|G09|G10|G11|G12|G13|G14|G15|G16|G17|G18|G19|G20|G21|G22|G23|G24|G25|G26|G27|G28|G29|G30|G31|G32] */ + _mm256_storeu_si256((__m256i *)dstPtrB, px[2]); /* store [B01|B02|B03|B04|B05|B06|B07|B08|B09|B10|B11|B12|B13|B14|B15|B16|B17|B18|B19|B20|B21|B22|B23|B24|B25|B26|B27|B28|B29|B30|B31|B32] */ +} + inline void rpp_load48_u8pln3_to_u8pln3(Rpp8u *srcPtrR, Rpp8u *srcPtrG, Rpp8u *srcPtrB, __m128i *px) { px[0] = _mm_loadu_si128((__m128i *)srcPtrR); /* load [R01|R02|R03|R04|R05|R06|R07|R08|R09|R10|R11|R12|R13|R14|R15|R16] */ @@ -499,6 +530,29 @@ inline void rpp_store48_u8pln3_to_u8pkd3(Rpp8u *dstPtr, __m128i *px) _mm_storeu_si128((__m128i *)(dstPtr + 36), _mm_shuffle_epi8(_mm_unpackhi_epi8(pxDst[3], pxDst[1]), pxMaskRGBAtoRGB)); /* store [R13|G13|B13|R14|G14|B14|R15|G15|B15|R16|G16|B16|00|00|00|00] */ } +inline void rpp_store96_u8pln3_to_u8pkd3(Rpp8u *dstPtr, __m256i *px) +{ + __m256i pxDst[8]; + __m256i pxMaskRGBAtoRGB = _mm256_castsi128_si256(_mm_setr_epi8(0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 3, 7, 11, 15)); + pxMaskRGBAtoRGB = _mm256_permute2f128_si256(pxMaskRGBAtoRGB, pxMaskRGBAtoRGB, 0); + pxDst[0] = _mm256_unpacklo_epi8(px[1], avx_px0); + pxDst[1] = _mm256_unpackhi_epi8(px[1], avx_px0); + pxDst[2] = _mm256_unpacklo_epi8(px[0], px[2]); + pxDst[3] = _mm256_unpackhi_epi8(px[0], px[2]); + pxDst[4] = _mm256_shuffle_epi8(_mm256_unpacklo_epi8(pxDst[2], pxDst[0]), pxMaskRGBAtoRGB); + pxDst[5] = _mm256_shuffle_epi8(_mm256_unpackhi_epi8(pxDst[2], pxDst[0]), pxMaskRGBAtoRGB); + pxDst[6] = _mm256_shuffle_epi8(_mm256_unpacklo_epi8(pxDst[3], pxDst[1]), pxMaskRGBAtoRGB); + pxDst[7] = _mm256_shuffle_epi8(_mm256_unpackhi_epi8(pxDst[3], pxDst[1]), pxMaskRGBAtoRGB); + _mm_storeu_si128((__m128i *)dstPtr, _mm256_castsi256_si128(pxDst[4])); /* store [R01|G01|B01|R02|G02|B02|R03|G03|B03|R04|G04|B04|00|00|00|00] */ + _mm_storeu_si128((__m128i *)(dstPtr + 12), _mm256_castsi256_si128(pxDst[5])); /* store [R05|G05|B05|R06|G06|B06|R07|G07|B07|R08|G08|B08|00|00|00|00] */ + _mm_storeu_si128((__m128i *)(dstPtr + 24), _mm256_castsi256_si128(pxDst[6])); /* store 
[R09|G09|B09|R10|G10|B10|R11|G11|B11|R12|G12|B12|00|00|00|00] */ + _mm_storeu_si128((__m128i *)(dstPtr + 36), _mm256_castsi256_si128(pxDst[7])); /* store [R13|G13|B13|R14|G14|B14|R15|G15|B15|R16|G16|B16|00|00|00|00] */ + _mm_storeu_si128((__m128i *)(dstPtr + 48), _mm256_extractf128_si256(pxDst[4], 1)); /* store [R17|G17|B17|R18|G18|B18|R19|G19|B19|R20|G20|B20|00|00|00|00] */ + _mm_storeu_si128((__m128i *)(dstPtr + 60), _mm256_extractf128_si256(pxDst[5], 1)); /* store [R21|G21|B21|R22|G22|B22|R23|G23|B23|R24|G24|B24|00|00|00|00] */ + _mm_storeu_si128((__m128i *)(dstPtr + 72), _mm256_extractf128_si256(pxDst[6], 1)); /* store [R25|G25|B25|R26|G26|B26|R27|G27|B27|R28|G28|B28|00|00|00|00] */ + _mm_storeu_si128((__m128i *)(dstPtr + 84), _mm256_extractf128_si256(pxDst[7], 1)); /* store [R29|G29|B29|R30|G30|B30|R31|G31|B31|R32|G32|B32|00|00|00|00] */ +} + inline void rpp_load16_u8_to_f32(Rpp8u *srcPtr, __m128 *p) { __m128i px = _mm_loadu_si128((__m128i *)srcPtr); /* load pixels 0-15 */ @@ -765,6 +819,30 @@ inline void rpp_load48_i8pkd3_to_u8pln3(Rpp8s *srcPtr, __m128i *px) px[2] = _mm_add_epi8(xmm_pxConvertI8, _mm_shuffle_epi8(_mm_unpacklo_epi8(pxSrc[6], pxSrc[7]), pxMaskRGB)); /* unpack 8 lo-pixels of pxSrc[6] and pxSrc[7] to get B01-16 and add 128 to get u8 from i8 */ } +inline void rpp_load96_i8pkd3_to_u8pln3(Rpp8s *srcPtr, __m256i *px) +{ + __m256i pxSrc[8]; + __m256i pxMask = _mm256_castsi128_si256(_mm_setr_epi8(0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11, 12, 13, 14, 15)); + pxMask = _mm256_permute2f128_si256(pxMask, pxMask, 0); + __m256i pxMaskRGB = _mm256_castsi128_si256(_mm_setr_epi8(0, 4, 8, 12, 2, 6, 10, 14, 1, 5, 9, 13, 3, 7, 11, 15)); + pxMaskRGB = _mm256_permute2f128_si256(pxMaskRGB, pxMaskRGB, 0); + pxSrc[0] = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i *)srcPtr)), _mm_loadu_si128((__m128i *)(srcPtr + 48)), 1); /* load [R01|G01|B01|R02|G02|B02|R03|G03|B03|R04|G04|B04|R05|G05|B05|R06|R17|G17|B17|R18|G18|B18|R19|G19|B19|R20|G20|B20|R21|G21|B21|R22] - Need RGB 01-04, 17-20 */ + pxSrc[1] = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(srcPtr + 12))), _mm_loadu_si128((__m128i *)(srcPtr + 60)), 1); /* load [R05|G05|B05|R06|G06|B06|R07|G07|B07|R08|G08|B08|R09|G09|B09|R10|R21|G21|B21|R22|G22|B22|R23|G23|B23|R24|G24|B24|R25|G25|B25|R26] - Need RGB 05-08, 21-24 */ + pxSrc[2] = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(srcPtr + 24))), _mm_loadu_si128((__m128i *)(srcPtr + 72)), 1); /* load [R09|G09|B09|R10|G10|B10|R11|G11|B11|R12|G12|B12|R13|G13|B13|R14|R25|G25|B25|R26|G26|B26|R27|G27|B27|R28|G28|B28|R29|G29|B29|R30] - Need RGB 09-12, 25-28 */ + pxSrc[3] = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(srcPtr + 36))), _mm_loadu_si128((__m128i *)(srcPtr + 84)), 1); /* load [R13|G13|B13|R14|G14|B14|R15|G15|B15|R16|G16|B16|R17|G17|B17|R18|R29|G29|B29|R30|G30|B30|R31|G31|B31|R32|G32|B32|R33|G33|B33|R34] - Need RGB 13-16, 29-32 */ + pxSrc[4] = _mm256_shuffle_epi8(pxSrc[0], pxMask); /* shuffle to get [R01|R02|R03|R04|G01|G02|G03|G04 || B01|B02|B03|B04|R05|G05|B05|R06 || R17|R18|R19|R20|G17|G18|G19|G20 || B17|B18|B19|B20|R21|G21|B21|R22] - Need R01-04, G01-04, B01-04, R17-20, G17-20, B17-20 */ + pxSrc[5] = _mm256_shuffle_epi8(pxSrc[1], pxMask); /* shuffle to get [R05|R06|R07|R08|G05|G06|G07|G08 || B05|B06|B07|B08|R09|G09|B09|R10 || R21|R22|R23|R24|G21|G22|G23|G24 || B21|B22|B23|B24|R25|G25|B25|R26] - Need R05-08, G05-08, B05-08, R21-24, G21-24, B21-24 */ + pxSrc[6] = 
_mm256_shuffle_epi8(pxSrc[2], pxMask); /* shuffle to get [R09|R10|R11|R12|G09|G10|G11|G12 || B09|B10|B11|B12|R13|G13|B13|R14 || R25|R26|R27|R28|G25|G26|G27|G28 || B25|B26|B27|B28|R29|G29|B29|R30] - Need R09-12, G09-12, B09-12, R25-28, G25-28, B25-28 */ + pxSrc[7] = _mm256_shuffle_epi8(pxSrc[3], pxMask); /* shuffle to get [R13|R14|R15|R16|G13|G14|G15|G16 || B13|B14|B15|B16|R17|G17|B17|R18 || R29|R30|R31|R32|G29|G30|G31|G32 || B29|B30|B31|B32|R33|G33|B33|R34] - Need R13-16, G13-16, B13-16, R29-32, G29-32, B29-32 */ + pxSrc[0] = _mm256_unpacklo_epi8(pxSrc[4], pxSrc[5]); /* unpack 8 lo-pixels of pxSrc[4] and pxSrc[5] */ + pxSrc[1] = _mm256_unpacklo_epi8(pxSrc[6], pxSrc[7]); /* unpack 8 lo-pixels of pxSrc[6] and pxSrc[7] */ + pxSrc[2] = _mm256_unpackhi_epi8(pxSrc[4], pxSrc[5]); /* unpack 8 hi-pixels of pxSrc[4] and pxSrc[5] */ + pxSrc[3] = _mm256_unpackhi_epi8(pxSrc[6], pxSrc[7]); /* unpack 8 hi-pixels of pxSrc[6] and pxSrc[7] */ + px[0] = _mm256_add_epi8(avx_pxConvertI8, _mm256_shuffle_epi8(_mm256_unpacklo_epi8(pxSrc[0], pxSrc[1]), pxMaskRGB)); /* unpack 8 lo-pixels of pxSrc[0] and pxSrc[1] to get R01-16 */ + px[1] = _mm256_add_epi8(avx_pxConvertI8, _mm256_shuffle_epi8(_mm256_unpackhi_epi8(pxSrc[0], pxSrc[1]), pxMaskRGB)); /* unpack 8 hi-pixels of pxSrc[0] and pxSrc[1] to get G01-16 */ + px[2] = _mm256_add_epi8(avx_pxConvertI8, _mm256_shuffle_epi8(_mm256_unpacklo_epi8(pxSrc[2], pxSrc[3]), pxMaskRGB)); /* unpack 8 lo-pixels of pxSrc[2] and pxSrc[3] to get B01-16 */ +} + inline void rpp_store48_i8pln3_to_i8pln3(Rpp8s *dstPtrR, Rpp8s *dstPtrG, Rpp8s *dstPtrB, __m128i *px) { _mm_storeu_si128((__m128i *)dstPtrR, px[0]); /* store [R01|R02|R03|R04|R05|R06|R07|R08|R09|R10|R11|R12|R13|R14|R15|R16] */ @@ -779,6 +857,13 @@ inline void rpp_store48_u8pln3_to_i8pln3(Rpp8s *dstPtrR, Rpp8s *dstPtrG, Rpp8s * _mm_storeu_si128((__m128i *)dstPtrB, _mm_sub_epi8(px[2], xmm_pxConvertI8)); /* store [B01|B02|B03|B04|B05|B06|B07|B08|B09|B10|B11|B12|B13|B14|B15|B16] */ } +inline void rpp_store96_u8pln3_to_i8pln3(Rpp8s *dstPtrR, Rpp8s *dstPtrG, Rpp8s *dstPtrB, __m256i *px) +{ + _mm256_storeu_si256((__m256i *)dstPtrR, _mm256_sub_epi8(px[0], avx_pxConvertI8)); /* store [R01|R02|R03|R04|R05|R06|R07|R08|R09|R10|R11|R12|R13|R14|R15|R16|R17|R18|R19|R20|R21|R22|R23|R24|R25|R26|R27|R28|R29|R30|R31|R32] */ + _mm256_storeu_si256((__m256i *)dstPtrG, _mm256_sub_epi8(px[1], avx_pxConvertI8)); /* store [G01|G02|G03|G04|G05|G06|G07|G08|G09|G10|G11|G12|G13|G14|G15|G16|G17|G18|G19|G20|G21|G22|G23|G24|G25|G26|G27|G28|G29|G30|G31|G32] */ + _mm256_storeu_si256((__m256i *)dstPtrB, _mm256_sub_epi8(px[2], avx_pxConvertI8)); /* store [B01|B02|B03|B04|B05|B06|B07|B08|B09|B10|B11|B12|B13|B14|B15|B16|B17|B18|B19|B20|B21|B22|B23|B24|B25|B26|B27|B28|B29|B30|B31|B32] */ +} + inline void rpp_load48_i8pkd3_to_i32pln3_avx(Rpp8s *srcPtr, __m256i *p) { __m128i pxSrc[8]; @@ -819,6 +904,13 @@ inline void rpp_load48_i8pln3_to_u8pln3(Rpp8s *srcPtrR, Rpp8s *srcPtrG, Rpp8s *s px[2] = _mm_add_epi8(xmm_pxConvertI8, _mm_loadu_si128((__m128i *)srcPtrB)); /* load and convert to u8 [B01|B02|B03|B04|B05|B06|B07|B08|B09|B10|B11|B12|B13|B14|B15|B16] */ } +inline void rpp_load96_i8pln3_to_u8pln3(Rpp8s *srcPtrR, Rpp8s *srcPtrG, Rpp8s *srcPtrB, __m256i *px) +{ + px[0] = _mm256_add_epi8(avx_pxConvertI8, _mm256_loadu_si256((__m256i *)srcPtrR)); /* load and convert to u8 [R01|R02|R03|R04|R05|R06|R07|R08|R09|R10|R11|R12|R13|R14|R15|R16|R17|R18|R19|R20|R21|R22|R23|R24|R25|R26|R27|R28|R29|R30|R31|R32] */ + px[1] = _mm256_add_epi8(avx_pxConvertI8, _mm256_loadu_si256((__m256i 
*)srcPtrG)); /* load and convert to u8 [G01|G02|G03|G04|G05|G06|G07|G08|G09|G10|G11|G12|G13|G14|G15|G16|G17|G18|G19|G20|G21|G22|G23|G24|G25|G26|G27|G28|G29|G30|G31|G32] */ + px[2] = _mm256_add_epi8(avx_pxConvertI8, _mm256_loadu_si256((__m256i *)srcPtrB)); /* load and convert to u8 [B01|B02|B03|B04|B05|B06|B07|B08|B09|B10|B11|B12|B13|B14|B15|B16|B17|B18|B19|B20|B21|B22|B23|B24|B25|B26|B27|B28|B29|B30|B31|B32] */ +} + inline void rpp_store48_i8pln3_to_i8pkd3(Rpp8s *dstPtr, __m128i *px) { __m128i pxDst[4]; @@ -849,6 +941,29 @@ inline void rpp_store48_u8pln3_to_i8pkd3(Rpp8s *dstPtr, __m128i *px) _mm_storeu_si128((__m128i *)(dstPtr + 36), _mm_sub_epi8(_mm_shuffle_epi8(_mm_unpackhi_epi8(pxDst[3], pxDst[1]), pxMaskRGBAtoRGB), xmm_pxConvertI8)); /* store [R13|G13|B13|R14|G14|B14|R15|G15|B15|R16|G16|B16|00|00|00|00] */ } +inline void rpp_store96_u8pln3_to_i8pkd3(Rpp8s *dstPtr, __m256i *px) +{ + __m256i pxDst[8]; + __m256i pxMaskRGBAtoRGB = _mm256_castsi128_si256(_mm_setr_epi8(0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 3, 7, 11, 15)); + pxMaskRGBAtoRGB = _mm256_permute2f128_si256(pxMaskRGBAtoRGB, pxMaskRGBAtoRGB, 0); + pxDst[0] = _mm256_unpacklo_epi8(px[1], avx_px0); + pxDst[1] = _mm256_unpackhi_epi8(px[1], avx_px0); + pxDst[2] = _mm256_unpacklo_epi8(px[0], px[2]); + pxDst[3] = _mm256_unpackhi_epi8(px[0], px[2]); + pxDst[4] = _mm256_sub_epi8(_mm256_shuffle_epi8(_mm256_unpacklo_epi8(pxDst[2], pxDst[0]), pxMaskRGBAtoRGB), avx_pxConvertI8); + pxDst[5] = _mm256_sub_epi8(_mm256_shuffle_epi8(_mm256_unpackhi_epi8(pxDst[2], pxDst[0]), pxMaskRGBAtoRGB), avx_pxConvertI8); + pxDst[6] = _mm256_sub_epi8(_mm256_shuffle_epi8(_mm256_unpacklo_epi8(pxDst[3], pxDst[1]), pxMaskRGBAtoRGB), avx_pxConvertI8); + pxDst[7] = _mm256_sub_epi8(_mm256_shuffle_epi8(_mm256_unpackhi_epi8(pxDst[3], pxDst[1]), pxMaskRGBAtoRGB), avx_pxConvertI8); + _mm_storeu_si128((__m128i *)dstPtr, _mm256_castsi256_si128(pxDst[4])); /* store [R01|G01|B01|R02|G02|B02|R03|G03|B03|R04|G04|B04|00|00|00|00] */ + _mm_storeu_si128((__m128i *)(dstPtr + 12), _mm256_castsi256_si128(pxDst[5])); /* store [R05|G05|B05|R06|G06|B06|R07|G07|B07|R08|G08|B08|00|00|00|00] */ + _mm_storeu_si128((__m128i *)(dstPtr + 24), _mm256_castsi256_si128(pxDst[6])); /* store [R09|G09|B09|R10|G10|B10|R11|G11|B11|R12|G12|B12|00|00|00|00] */ + _mm_storeu_si128((__m128i *)(dstPtr + 36), _mm256_castsi256_si128(pxDst[7])); /* store [R13|G13|B13|R14|G14|B14|R15|G15|B15|R16|G16|B16|00|00|00|00] */ + _mm_storeu_si128((__m128i *)(dstPtr + 48), _mm256_extractf128_si256(pxDst[4], 1)); /* store [R17|G17|B17|R18|G18|B18|R19|G19|B19|R20|G20|B20|00|00|00|00] */ + _mm_storeu_si128((__m128i *)(dstPtr + 60), _mm256_extractf128_si256(pxDst[5], 1)); /* store [R21|G21|B21|R22|G22|B22|R23|G23|B23|R24|G24|B24|00|00|00|00] */ + _mm_storeu_si128((__m128i *)(dstPtr + 72), _mm256_extractf128_si256(pxDst[6], 1)); /* store [R25|G25|B25|R26|G26|B26|R27|G27|B27|R28|G28|B28|00|00|00|00] */ + _mm_storeu_si128((__m128i *)(dstPtr + 84), _mm256_extractf128_si256(pxDst[7], 1)); /* store [R29|G29|B29|R30|G30|B30|R31|G31|B31|R32|G32|B32|00|00|00|00] */ +} + inline void rpp_load16_i8_to_f32(Rpp8s *srcPtr, __m128 *p) { __m128i px = _mm_loadu_si128((__m128i *)srcPtr); /* load pixels 0-15 */ diff --git a/src/include/hip/rpp_hip_common.hpp b/src/include/hip/rpp_hip_common.hpp index bab13ded8..e1eeb4eaa 100644 --- a/src/include/hip/rpp_hip_common.hpp +++ b/src/include/hip/rpp_hip_common.hpp @@ -1822,6 +1822,18 @@ __device__ __forceinline__ void rpp_hip_math_bitwiseOr8(d_float8 *src1_f8, d_flo dst_f8->f1[7] = 
(float)((uchar)(src1_f8->f1[7]) | (uchar)(src2_f8->f1[7])); } +__device__ __forceinline__ void rpp_hip_math_exclusiveOr8(d_float8 *src1_f8, d_float8 *src2_f8, d_float8 *dst_f8) +{ + dst_f8->f1[0] = (float)((uchar)(std::nearbyintf)(src1_f8->f1[0]) ^ (uchar)(std::nearbyintf)(src2_f8->f1[0])); + dst_f8->f1[1] = (float)((uchar)(std::nearbyintf)(src1_f8->f1[1]) ^ (uchar)(std::nearbyintf)(src2_f8->f1[1])); + dst_f8->f1[2] = (float)((uchar)(std::nearbyintf)(src1_f8->f1[2]) ^ (uchar)(std::nearbyintf)(src2_f8->f1[2])); + dst_f8->f1[3] = (float)((uchar)(std::nearbyintf)(src1_f8->f1[3]) ^ (uchar)(std::nearbyintf)(src2_f8->f1[3])); + dst_f8->f1[4] = (float)((uchar)(std::nearbyintf)(src1_f8->f1[4]) ^ (uchar)(std::nearbyintf)(src2_f8->f1[4])); + dst_f8->f1[5] = (float)((uchar)(std::nearbyintf)(src1_f8->f1[5]) ^ (uchar)(std::nearbyintf)(src2_f8->f1[5])); + dst_f8->f1[6] = (float)((uchar)(std::nearbyintf)(src1_f8->f1[6]) ^ (uchar)(std::nearbyintf)(src2_f8->f1[6])); + dst_f8->f1[7] = (float)((uchar)(std::nearbyintf)(src1_f8->f1[7]) ^ (uchar)(std::nearbyintf)(src2_f8->f1[7])); +} + __device__ __forceinline__ float rpp_hip_math_inverse_sqrt1(float x) { float xHalf = 0.5f * x; diff --git a/src/modules/cpu/host_tensor_logical_operations.hpp b/src/modules/cpu/host_tensor_logical_operations.hpp index 0fb3fe5eb..e984fced2 100644 --- a/src/modules/cpu/host_tensor_logical_operations.hpp +++ b/src/modules/cpu/host_tensor_logical_operations.hpp @@ -26,6 +26,7 @@ SOFTWARE. #define HOST_TENSOR_LOGICAL_OPERATIONS_HPP #include "kernel/bitwise_and.hpp" +#include "kernel/exclusive_or.hpp" #include "kernel/bitwise_or.hpp" #endif // HOST_TENSOR_LOGICAL_OPERATIONS_HPP \ No newline at end of file diff --git a/src/modules/cpu/kernel/exclusive_or.hpp b/src/modules/cpu/kernel/exclusive_or.hpp new file mode 100644 index 000000000..5ea193d41 --- /dev/null +++ b/src/modules/cpu/kernel/exclusive_or.hpp @@ -0,0 +1,1281 @@ +/* +MIT License + +Copyright (c) 2019 - 2024 Advanced Micro Devices, Inc. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE OR NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ + +#include "rppdefs.h" +#include "rpp_cpu_simd.hpp" +#include "rpp_cpu_common.hpp" + +/* exclusiveOR is a logical operation defined only on U8/I8 types. + For an Rpp32f precision image (pixel values from 0-1), the exclusiveOR is applied on a 0-255 + range-translated approximation of the original 0-1 decimal-range image. 
+ The bitwise operation is applied to the char representation of the raw floating-point data in memory */ + +RppStatus exclusive_or_u8_u8_host_tensor(Rpp8u *srcPtr1, + Rpp8u *srcPtr2, + RpptDescPtr srcDescPtr, + Rpp8u *dstPtr, + RpptDescPtr dstDescPtr, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + RppLayoutParams layoutParams, + rpp::Handle& Handle) +{ + RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h}; + Rpp32u numThreads = Handle.GetNumThreads(); + + omp_set_dynamic(0); +#pragma omp parallel for num_threads(numThreads) + for (int batchCount = 0; batchCount < dstDescPtr->n; batchCount++) + { + RpptROI roi; + RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount]; + compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType); + + Rpp8u *srcPtr1Image, *srcPtr2Image, *dstPtrImage; + srcPtr1Image = srcPtr1 + batchCount * srcDescPtr->strides.nStride; + srcPtr2Image = srcPtr2 + batchCount * srcDescPtr->strides.nStride; + dstPtrImage = dstPtr + batchCount * dstDescPtr->strides.nStride; + + Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier; + + Rpp8u *srcPtr1Channel, *srcPtr2Channel, *dstPtrChannel; + srcPtr1Channel = srcPtr1Image + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier); + srcPtr2Channel = srcPtr2Image + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier); + dstPtrChannel = dstPtrImage; + +#if __AVX2__ + Rpp32u alignedLength = (bufferLength / 96) * 96; + Rpp32u vectorIncrement = 96; + Rpp32u vectorIncrementPerChannel = 32; +#endif + + // Exclusive OR with fused output-layout toggle (NHWC -> NCHW) + if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NCHW)) + { + Rpp8u *srcPtr1Row, *srcPtr2Row, *dstPtrRowR, *dstPtrRowG, *dstPtrRowB; + srcPtr1Row = srcPtr1Channel; + srcPtr2Row = srcPtr2Channel; + dstPtrRowR = dstPtrChannel; + dstPtrRowG = dstPtrRowR + dstDescPtr->strides.cStride; + dstPtrRowB = dstPtrRowG + dstDescPtr->strides.cStride; + + for (int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp8u *srcPtr1Temp, *srcPtr2Temp, *dstPtrTempR, *dstPtrTempG, *dstPtrTempB; + srcPtr1Temp = srcPtr1Row; + srcPtr2Temp = srcPtr2Row; + dstPtrTempR = dstPtrRowR; + dstPtrTempG = dstPtrRowG; + dstPtrTempB = dstPtrRowB; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + __m256i p1[3], p2[3]; + + rpp_simd_load(rpp_load96_u8pkd3_to_u8pln3, srcPtr1Temp, p1); // simd loads + rpp_simd_load(rpp_load96_u8pkd3_to_u8pln3, srcPtr2Temp, p2); // simd loads + p1[0] = _mm256_xor_si256(p1[0], p2[0]); // exclusive_or computation + p1[1] = _mm256_xor_si256(p1[1], p2[1]); // exclusive_or computation + p1[2] = _mm256_xor_si256(p1[2], p2[2]); // exclusive_or computation + rpp_simd_store(rpp_store96_u8pln3_to_u8pln3, dstPtrTempR, dstPtrTempG, dstPtrTempB, p1); // simd stores + + srcPtr1Temp += vectorIncrement; + srcPtr2Temp += vectorIncrement; + dstPtrTempR += vectorIncrementPerChannel; + dstPtrTempG += vectorIncrementPerChannel; + dstPtrTempB += vectorIncrementPerChannel; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount += 3) + { + *dstPtrTempR++ = srcPtr1Temp[0] ^ srcPtr2Temp[0]; + *dstPtrTempG++ = srcPtr1Temp[1] ^ srcPtr2Temp[1]; + *dstPtrTempB++ = srcPtr1Temp[2] ^ srcPtr2Temp[2]; + + srcPtr1Temp += 3; + srcPtr2Temp += 3; + } + + srcPtr1Row += srcDescPtr->strides.hStride; + srcPtr2Row += 
srcDescPtr->strides.hStride; + dstPtrRowR += dstDescPtr->strides.hStride; + dstPtrRowG += dstDescPtr->strides.hStride; + dstPtrRowB += dstDescPtr->strides.hStride; + } + } + + // Exclusive OR with fused output-layout toggle (NCHW -> NHWC) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NHWC)) + { + Rpp8u *srcPtr1RowR, *srcPtr1RowG, *srcPtr1RowB, *srcPtr2RowR, *srcPtr2RowG, *srcPtr2RowB, *dstPtrRow; + srcPtr1RowR = srcPtr1Channel; + srcPtr1RowG = srcPtr1RowR + srcDescPtr->strides.cStride; + srcPtr1RowB = srcPtr1RowG + srcDescPtr->strides.cStride; + srcPtr2RowR = srcPtr2Channel; + srcPtr2RowG = srcPtr2RowR + srcDescPtr->strides.cStride; + srcPtr2RowB = srcPtr2RowG + srcDescPtr->strides.cStride; + dstPtrRow = dstPtrChannel; + + for (int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp8u *srcPtr1TempR, *srcPtr1TempG, *srcPtr1TempB, *srcPtr2TempR, *srcPtr2TempG, *srcPtr2TempB, *dstPtrTemp; + srcPtr1TempR = srcPtr1RowR; + srcPtr1TempG = srcPtr1RowG; + srcPtr1TempB = srcPtr1RowB; + srcPtr2TempR = srcPtr2RowR; + srcPtr2TempG = srcPtr2RowG; + srcPtr2TempB = srcPtr2RowB; + dstPtrTemp = dstPtrRow; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + __m256i p1[3], p2[3]; + + rpp_simd_load(rpp_load96_u8_avx, srcPtr1TempR, srcPtr1TempG, srcPtr1TempB, p1); // simd loads + rpp_simd_load(rpp_load96_u8_avx, srcPtr2TempR, srcPtr2TempG, srcPtr2TempB, p2); // simd loads + p1[0] = _mm256_xor_si256(p1[0], p2[0]); // exclusive_or computation + p1[1] = _mm256_xor_si256(p1[1], p2[1]); // exclusive_or computation + p1[2] = _mm256_xor_si256(p1[2], p2[2]); // exclusive_or computation + rpp_simd_store(rpp_store96_u8pln3_to_u8pkd3, dstPtrTemp, p1); // simd stores + + srcPtr1TempR += vectorIncrementPerChannel; + srcPtr1TempG += vectorIncrementPerChannel; + srcPtr1TempB += vectorIncrementPerChannel; + srcPtr2TempR += vectorIncrementPerChannel; + srcPtr2TempG += vectorIncrementPerChannel; + srcPtr2TempB += vectorIncrementPerChannel; + dstPtrTemp += vectorIncrement; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + dstPtrTemp[0] = *srcPtr1TempR ^ *srcPtr2TempR; + dstPtrTemp[1] = *srcPtr1TempG ^ *srcPtr2TempG; + dstPtrTemp[2] = *srcPtr1TempB ^ *srcPtr2TempB; + + srcPtr1TempR++; + srcPtr1TempG++; + srcPtr1TempB++; + srcPtr2TempR++; + srcPtr2TempG++; + srcPtr2TempB++; + dstPtrTemp += 3; + } + + srcPtr1RowR += srcDescPtr->strides.hStride; + srcPtr1RowG += srcDescPtr->strides.hStride; + srcPtr1RowB += srcDescPtr->strides.hStride; + srcPtr2RowR += srcDescPtr->strides.hStride; + srcPtr2RowG += srcDescPtr->strides.hStride; + srcPtr2RowB += srcDescPtr->strides.hStride; + dstPtrRow += dstDescPtr->strides.hStride; + } + } + + // Exclusive OR without fused output-layout toggle (NCHW -> NCHW for 3 channel) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NCHW)) + { +#if __AVX2__ + alignedLength = bufferLength & ~31; +#endif + + Rpp8u *srcPtr1RowR, *srcPtr1RowG, *srcPtr1RowB, *srcPtr2RowR, *srcPtr2RowG, *srcPtr2RowB, *dstPtrRowR, *dstPtrRowG, *dstPtrRowB; + srcPtr1RowR = srcPtr1Channel; + srcPtr1RowG = srcPtr1RowR + srcDescPtr->strides.cStride; + srcPtr1RowB = srcPtr1RowG + srcDescPtr->strides.cStride; + srcPtr2RowR = srcPtr2Channel; + srcPtr2RowG = srcPtr2RowR + srcDescPtr->strides.cStride; + srcPtr2RowB = srcPtr2RowG + srcDescPtr->strides.cStride; + dstPtrRowR = dstPtrChannel; + dstPtrRowG = 
dstPtrRowR + dstDescPtr->strides.cStride; + dstPtrRowB = dstPtrRowG + dstDescPtr->strides.cStride; + for (int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp8u *srcPtr1TempR, *srcPtr1TempG, *srcPtr1TempB, *srcPtr2TempR, *srcPtr2TempG, *srcPtr2TempB, *dstPtrTempR, *dstPtrTempG, *dstPtrTempB; + srcPtr1TempR = srcPtr1RowR; + srcPtr1TempG = srcPtr1RowG; + srcPtr1TempB = srcPtr1RowB; + srcPtr2TempR = srcPtr2RowR; + srcPtr2TempG = srcPtr2RowG; + srcPtr2TempB = srcPtr2RowB; + dstPtrTempR = dstPtrRowR; + dstPtrTempG = dstPtrRowG; + dstPtrTempB = dstPtrRowB; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + __m256i p1[3], p2[3]; + + rpp_simd_load(rpp_load96_u8_avx, srcPtr1TempR, srcPtr1TempG, srcPtr1TempB, p1); // simd loads + rpp_simd_load(rpp_load96_u8_avx, srcPtr2TempR, srcPtr2TempG, srcPtr2TempB, p2); // simd loads + p1[0] = _mm256_xor_si256(p1[0], p2[0]); // exclusive_or computation + p1[1] = _mm256_xor_si256(p1[1], p2[1]); // exclusive_or computation + p1[2] = _mm256_xor_si256(p1[2], p2[2]); // exclusive_or computation + rpp_simd_store(rpp_store96_u8pln3_to_u8pln3, dstPtrTempR, dstPtrTempG, dstPtrTempB, p1); + + srcPtr1TempR += vectorIncrementPerChannel; + srcPtr1TempG += vectorIncrementPerChannel; + srcPtr1TempB += vectorIncrementPerChannel; + srcPtr2TempR += vectorIncrementPerChannel; + srcPtr2TempG += vectorIncrementPerChannel; + srcPtr2TempB += vectorIncrementPerChannel; + dstPtrTempR += vectorIncrementPerChannel; + dstPtrTempG += vectorIncrementPerChannel; + dstPtrTempB += vectorIncrementPerChannel; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + *dstPtrTempR = *srcPtr1TempR ^ *srcPtr2TempR; + *dstPtrTempG = *srcPtr1TempG ^ *srcPtr2TempG; + *dstPtrTempB = *srcPtr1TempB ^ *srcPtr2TempB; + + srcPtr1TempR++; + srcPtr1TempG++; + srcPtr1TempB++; + srcPtr2TempR++; + srcPtr2TempG++; + srcPtr2TempB++; + dstPtrTempR++; + dstPtrTempG++; + dstPtrTempB++; + } + + srcPtr1RowR += srcDescPtr->strides.hStride; + srcPtr1RowG += srcDescPtr->strides.hStride; + srcPtr1RowB += srcDescPtr->strides.hStride; + srcPtr2RowR += srcDescPtr->strides.hStride; + srcPtr2RowG += srcDescPtr->strides.hStride; + srcPtr2RowB += srcDescPtr->strides.hStride; + dstPtrRowR += dstDescPtr->strides.hStride; + dstPtrRowG += dstDescPtr->strides.hStride; + dstPtrRowB += dstDescPtr->strides.hStride; + } + } + + // Exclusive OR without fused output-layout toggle (NHWC -> NHWC or NCHW -> NCHW for 1 channel) + else + { +#if __AVX2__ + alignedLength = bufferLength & ~31; +#endif + + Rpp8u *srcPtr1Row, *srcPtr2Row, *dstPtrRow; + srcPtr1Row = srcPtr1Channel; + srcPtr2Row = srcPtr2Channel; + dstPtrRow = dstPtrChannel; + + for (int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp8u *srcPtr1Temp, *srcPtr2Temp, *dstPtrTemp; + srcPtr1Temp = srcPtr1Row; + srcPtr2Temp = srcPtr2Row; + dstPtrTemp = dstPtrRow; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + __m256i p1, p2; + + p1 = _mm256_loadu_si256((const __m256i *)srcPtr1Temp); // simd loads + p2 = _mm256_loadu_si256((const __m256i *)srcPtr2Temp); // simd loads + p1 = _mm256_xor_si256(p1, p2); // exclusive_or computation + _mm256_storeu_si256((__m256i *)dstPtrTemp, p1); // simd stores + + srcPtr1Temp += vectorIncrementPerChannel; + srcPtr2Temp += vectorIncrementPerChannel; + dstPtrTemp += vectorIncrementPerChannel; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + 
*dstPtrTemp++ = *srcPtr1Temp ^ *srcPtr2Temp; + + srcPtr1Temp++; + srcPtr2Temp++; + } + + srcPtr1Row += srcDescPtr->strides.hStride; + srcPtr2Row += srcDescPtr->strides.hStride; + dstPtrRow += dstDescPtr->strides.hStride; + } + + srcPtr1Channel += srcDescPtr->strides.cStride; + srcPtr2Channel += srcDescPtr->strides.cStride; + dstPtrChannel += dstDescPtr->strides.cStride; + } + } + + return RPP_SUCCESS; +} + +RppStatus exclusive_or_f32_f32_host_tensor(Rpp32f *srcPtr1, + Rpp32f *srcPtr2, + RpptDescPtr srcDescPtr, + Rpp32f *dstPtr, + RpptDescPtr dstDescPtr, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + RppLayoutParams layoutParams, + rpp::Handle& Handle) +{ + RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h}; + Rpp32u numThreads = Handle.GetNumThreads(); + + omp_set_dynamic(0); +#pragma omp parallel for num_threads(numThreads) + for (int batchCount = 0; batchCount < dstDescPtr->n; batchCount++) + { + RpptROI roi; + RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount]; + compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType); + + Rpp32f *srcPtr1Image, *srcPtr2Image, *dstPtrImage; + srcPtr1Image = srcPtr1 + batchCount * srcDescPtr->strides.nStride; + srcPtr2Image = srcPtr2 + batchCount * srcDescPtr->strides.nStride; + dstPtrImage = dstPtr + batchCount * dstDescPtr->strides.nStride; + + Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier; + + Rpp32f *srcPtr1Channel, *srcPtr2Channel, *dstPtrChannel; + srcPtr1Channel = srcPtr1Image + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier); + srcPtr2Channel = srcPtr2Image + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier); + dstPtrChannel = dstPtrImage; + +#if __AVX2__ + Rpp32u alignedLength = (bufferLength / 24) * 24; + Rpp32u vectorIncrement = 24; + Rpp32u vectorIncrementPerChannel = 8; +#endif + + // Exclusive OR with fused output-layout toggle (NHWC -> NCHW) + if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NCHW)) + { + Rpp32f *srcPtr1Row, *srcPtr2Row, *dstPtrRowR, *dstPtrRowG, *dstPtrRowB; + srcPtr1Row = srcPtr1Channel; + srcPtr2Row = srcPtr2Channel; + dstPtrRowR = dstPtrChannel; + dstPtrRowG = dstPtrRowR + dstDescPtr->strides.cStride; + dstPtrRowB = dstPtrRowG + dstDescPtr->strides.cStride; + + for (int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp32f *srcPtr1Temp, *srcPtr2Temp, *dstPtrTempR, *dstPtrTempG, *dstPtrTempB; + srcPtr1Temp = srcPtr1Row; + srcPtr2Temp = srcPtr2Row; + dstPtrTempR = dstPtrRowR; + dstPtrTempG = dstPtrRowG; + dstPtrTempB = dstPtrRowB; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + __m256 p1[3], p2[3]; + + rpp_simd_load(rpp_load24_f32pkd3_to_f32pln3_avx, srcPtr1Temp, p1); // simd loads + rpp_simd_load(rpp_load24_f32pkd3_to_f32pln3_avx, srcPtr2Temp, p2); // simd loads + p1[0] = _mm256_cvtepi32_ps(_mm256_xor_si256(_mm256_cvtps_epi32(_mm256_mul_ps(p1[0], avx_p255)), _mm256_cvtps_epi32(_mm256_mul_ps(p2[0], avx_p255)))); // exclusive_or computation + p1[1] = _mm256_cvtepi32_ps(_mm256_xor_si256(_mm256_cvtps_epi32(_mm256_mul_ps(p1[1], avx_p255)), _mm256_cvtps_epi32(_mm256_mul_ps(p2[1], avx_p255)))); // exclusive_or computation + p1[2] = _mm256_cvtepi32_ps(_mm256_xor_si256(_mm256_cvtps_epi32(_mm256_mul_ps(p1[2], avx_p255)), _mm256_cvtps_epi32(_mm256_mul_ps(p2[2], avx_p255)))); // exclusive_or computation + p1[0] = 
_mm256_mul_ps(p1[0], avx_p1op255); + p1[1] = _mm256_mul_ps(p1[1], avx_p1op255); + p1[2] = _mm256_mul_ps(p1[2], avx_p1op255); + rpp_simd_store(rpp_store24_f32pln3_to_f32pln3_avx, dstPtrTempR, dstPtrTempG, dstPtrTempB, p1); // simd stores + + srcPtr1Temp += vectorIncrement; + srcPtr2Temp += vectorIncrement; + dstPtrTempR += vectorIncrementPerChannel; + dstPtrTempG += vectorIncrementPerChannel; + dstPtrTempB += vectorIncrementPerChannel; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount += 3) + { + *dstPtrTempR++ = RPPPIXELCHECKF32((float)((uint)(std::nearbyintf)(srcPtr1Temp[0] * 255) ^ (uint)(std::nearbyintf)(srcPtr2Temp[0] * 255)) / 255); + *dstPtrTempG++ = RPPPIXELCHECKF32((float)((uint)(std::nearbyintf)(srcPtr1Temp[1] * 255) ^ (uint)(std::nearbyintf)(srcPtr2Temp[1] * 255)) / 255); + *dstPtrTempB++ = RPPPIXELCHECKF32((float)((uint)(std::nearbyintf)(srcPtr1Temp[2] * 255) ^ (uint)(std::nearbyintf)(srcPtr2Temp[2] * 255)) / 255); + + srcPtr1Temp += 3; + srcPtr2Temp += 3; + } + + srcPtr1Row += srcDescPtr->strides.hStride; + srcPtr2Row += srcDescPtr->strides.hStride; + dstPtrRowR += dstDescPtr->strides.hStride; + dstPtrRowG += dstDescPtr->strides.hStride; + dstPtrRowB += dstDescPtr->strides.hStride; + } + } + + // Exclusive OR with fused output-layout toggle (NCHW -> NHWC) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NHWC)) + { + Rpp32f *srcPtr1RowR, *srcPtr1RowG, *srcPtr1RowB, *srcPtr2RowR, *srcPtr2RowG, *srcPtr2RowB, *dstPtrRow; + srcPtr1RowR = srcPtr1Channel; + srcPtr1RowG = srcPtr1RowR + srcDescPtr->strides.cStride; + srcPtr1RowB = srcPtr1RowG + srcDescPtr->strides.cStride; + srcPtr2RowR = srcPtr2Channel; + srcPtr2RowG = srcPtr2RowR + srcDescPtr->strides.cStride; + srcPtr2RowB = srcPtr2RowG + srcDescPtr->strides.cStride; + dstPtrRow = dstPtrChannel; + + for (int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp32f *srcPtr1TempR, *srcPtr1TempG, *srcPtr1TempB, *srcPtr2TempR, *srcPtr2TempG, *srcPtr2TempB, *dstPtrTemp; + srcPtr1TempR = srcPtr1RowR; + srcPtr1TempG = srcPtr1RowG; + srcPtr1TempB = srcPtr1RowB; + srcPtr2TempR = srcPtr2RowR; + srcPtr2TempG = srcPtr2RowG; + srcPtr2TempB = srcPtr2RowB; + dstPtrTemp = dstPtrRow; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + __m256 p1[3], p2[3]; + + rpp_simd_load(rpp_load24_f32pln3_to_f32pln3_avx, srcPtr1TempR, srcPtr1TempG, srcPtr1TempB, p1); // simd loads + rpp_simd_load(rpp_load24_f32pln3_to_f32pln3_avx, srcPtr2TempR, srcPtr2TempG, srcPtr2TempB, p2); // simd loads + p1[0] = _mm256_cvtepi32_ps(_mm256_xor_si256(_mm256_cvtps_epi32(_mm256_mul_ps(p1[0], avx_p255)), _mm256_cvtps_epi32(_mm256_mul_ps(p2[0], avx_p255)))); // exclusive_or computation + p1[1] = _mm256_cvtepi32_ps(_mm256_xor_si256(_mm256_cvtps_epi32(_mm256_mul_ps(p1[1], avx_p255)), _mm256_cvtps_epi32(_mm256_mul_ps(p2[1], avx_p255)))); // exclusive_or computation + p1[2] = _mm256_cvtepi32_ps(_mm256_xor_si256(_mm256_cvtps_epi32(_mm256_mul_ps(p1[2], avx_p255)), _mm256_cvtps_epi32(_mm256_mul_ps(p2[2], avx_p255)))); // exclusive_or computation + p1[0] = _mm256_mul_ps(p1[0], avx_p1op255); + p1[1] = _mm256_mul_ps(p1[1], avx_p1op255); + p1[2] = _mm256_mul_ps(p1[2], avx_p1op255); + rpp_simd_store(rpp_store24_f32pln3_to_f32pkd3_avx, dstPtrTemp, p1); // simd stores + + srcPtr1TempR += vectorIncrementPerChannel; + srcPtr1TempG += vectorIncrementPerChannel; + srcPtr1TempB += vectorIncrementPerChannel; + srcPtr2TempR += 
vectorIncrementPerChannel; + srcPtr2TempG += vectorIncrementPerChannel; + srcPtr2TempB += vectorIncrementPerChannel; + dstPtrTemp += vectorIncrement; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + dstPtrTemp[0] = RPPPIXELCHECKF32((float)((uint)(std::nearbyintf)(*srcPtr1TempR * 255) ^ (uint)(std::nearbyintf)(*srcPtr2TempR * 255)) / 255); + dstPtrTemp[1] = RPPPIXELCHECKF32((float)((uint)(std::nearbyintf)(*srcPtr1TempG * 255) ^ (uint)(std::nearbyintf)(*srcPtr2TempG * 255)) / 255); + dstPtrTemp[2] = RPPPIXELCHECKF32((float)((uint)(std::nearbyintf)(*srcPtr1TempB * 255) ^ (uint)(std::nearbyintf)(*srcPtr2TempB * 255)) / 255); + + srcPtr1TempR++; + srcPtr1TempG++; + srcPtr1TempB++; + srcPtr2TempR++; + srcPtr2TempG++; + srcPtr2TempB++; + dstPtrTemp += 3; + } + + srcPtr1RowR += srcDescPtr->strides.hStride; + srcPtr1RowG += srcDescPtr->strides.hStride; + srcPtr1RowB += srcDescPtr->strides.hStride; + srcPtr2RowR += srcDescPtr->strides.hStride; + srcPtr2RowG += srcDescPtr->strides.hStride; + srcPtr2RowB += srcDescPtr->strides.hStride; + dstPtrRow += dstDescPtr->strides.hStride; + } + } + + // Exclusive OR without fused output-layout toggle (NCHW -> NCHW for 3 channel) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NCHW)) + { +#if __AVX2__ + alignedLength = bufferLength & ~7; +#endif + + Rpp32f *srcPtr1RowR, *srcPtr1RowG, *srcPtr1RowB, *srcPtr2RowR, *srcPtr2RowG, *srcPtr2RowB, *dstPtrRowR, *dstPtrRowG, *dstPtrRowB; + srcPtr1RowR = srcPtr1Channel; + srcPtr1RowG = srcPtr1RowR + srcDescPtr->strides.cStride; + srcPtr1RowB = srcPtr1RowG + srcDescPtr->strides.cStride; + srcPtr2RowR = srcPtr2Channel; + srcPtr2RowG = srcPtr2RowR + srcDescPtr->strides.cStride; + srcPtr2RowB = srcPtr2RowG + srcDescPtr->strides.cStride; + dstPtrRowR = dstPtrChannel; + dstPtrRowG = dstPtrRowR + dstDescPtr->strides.cStride; + dstPtrRowB = dstPtrRowG + dstDescPtr->strides.cStride; + + for (int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp32f *srcPtr1TempR, *srcPtr1TempG, *srcPtr1TempB, *srcPtr2TempR, *srcPtr2TempG, *srcPtr2TempB, *dstPtrTempR, *dstPtrTempG, *dstPtrTempB; + srcPtr1TempR = srcPtr1RowR; + srcPtr1TempG = srcPtr1RowG; + srcPtr1TempB = srcPtr1RowB; + srcPtr2TempR = srcPtr2RowR; + srcPtr2TempG = srcPtr2RowG; + srcPtr2TempB = srcPtr2RowB; + dstPtrTempR = dstPtrRowR; + dstPtrTempG = dstPtrRowG; + dstPtrTempB = dstPtrRowB; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + __m256 p1[3], p2[3]; + + rpp_simd_load(rpp_load24_f32pln3_to_f32pln3_avx, srcPtr1TempR, srcPtr1TempG, srcPtr1TempB, p1); // simd loads + rpp_simd_load(rpp_load24_f32pln3_to_f32pln3_avx, srcPtr2TempR, srcPtr2TempG, srcPtr2TempB, p2); // simd loads + p1[0] = _mm256_cvtepi32_ps(_mm256_xor_si256(_mm256_cvtps_epi32(_mm256_mul_ps(p1[0], avx_p255)), _mm256_cvtps_epi32(_mm256_mul_ps(p2[0], avx_p255)))); // exclusive_or computation + p1[1] = _mm256_cvtepi32_ps(_mm256_xor_si256(_mm256_cvtps_epi32(_mm256_mul_ps(p1[1], avx_p255)), _mm256_cvtps_epi32(_mm256_mul_ps(p2[1], avx_p255)))); // exclusive_or computation + p1[2] = _mm256_cvtepi32_ps(_mm256_xor_si256(_mm256_cvtps_epi32(_mm256_mul_ps(p1[2], avx_p255)), _mm256_cvtps_epi32(_mm256_mul_ps(p2[2], avx_p255)))); // exclusive_or computation + p1[0] = _mm256_mul_ps(p1[0], avx_p1op255); + p1[1] = _mm256_mul_ps(p1[1], avx_p1op255); + p1[2] = _mm256_mul_ps(p1[2], avx_p1op255); + rpp_simd_store(rpp_store24_f32pln3_to_f32pln3_avx, dstPtrTempR, 
dstPtrTempG, dstPtrTempB, p1); // simd stores + + srcPtr1TempR += vectorIncrementPerChannel; + srcPtr1TempG += vectorIncrementPerChannel; + srcPtr1TempB += vectorIncrementPerChannel; + srcPtr2TempR += vectorIncrementPerChannel; + srcPtr2TempG += vectorIncrementPerChannel; + srcPtr2TempB += vectorIncrementPerChannel; + dstPtrTempR += vectorIncrementPerChannel; + dstPtrTempG += vectorIncrementPerChannel; + dstPtrTempB += vectorIncrementPerChannel; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + *dstPtrTempR = RPPPIXELCHECKF32((float)((uint)(std::nearbyintf)(*srcPtr1TempR * 255) ^ (uint)(std::nearbyintf)(*srcPtr2TempR * 255)) / 255); + *dstPtrTempG = RPPPIXELCHECKF32((float)((uint)(std::nearbyintf)(*srcPtr1TempG * 255) ^ (uint)(std::nearbyintf)(*srcPtr2TempG * 255)) / 255); + *dstPtrTempB = RPPPIXELCHECKF32((float)((uint)(std::nearbyintf)(*srcPtr1TempB * 255) ^ (uint)(std::nearbyintf)(*srcPtr2TempB * 255)) / 255); + + srcPtr1TempR++; + srcPtr1TempG++; + srcPtr1TempB++; + srcPtr2TempR++; + srcPtr2TempG++; + srcPtr2TempB++; + dstPtrTempR++; + dstPtrTempG++; + dstPtrTempB++; + } + + srcPtr1RowR += srcDescPtr->strides.hStride; + srcPtr1RowG += srcDescPtr->strides.hStride; + srcPtr1RowB += srcDescPtr->strides.hStride; + srcPtr2RowR += srcDescPtr->strides.hStride; + srcPtr2RowG += srcDescPtr->strides.hStride; + srcPtr2RowB += srcDescPtr->strides.hStride; + dstPtrRowR += dstDescPtr->strides.hStride; + dstPtrRowG += dstDescPtr->strides.hStride; + dstPtrRowB += dstDescPtr->strides.hStride; + } + } + + // Exclusive OR without fused output-layout toggle (NHWC -> NHWC or NCHW -> NCHW for 1 channel) + else + { +#if __AVX2__ + alignedLength = bufferLength & ~7; +#endif + + Rpp32f *srcPtr1Row, *srcPtr2Row, *dstPtrRow; + srcPtr1Row = srcPtr1Channel; + srcPtr2Row = srcPtr2Channel; + dstPtrRow = dstPtrChannel; + + for (int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp32f *srcPtr1Temp, *srcPtr2Temp, *dstPtrTemp; + srcPtr1Temp = srcPtr1Row; + srcPtr2Temp = srcPtr2Row; + dstPtrTemp = dstPtrRow; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + __m256 p1[1], p2[1]; + + rpp_simd_load(rpp_load8_f32_to_f32_avx, srcPtr1Temp, p1); // simd loads + rpp_simd_load(rpp_load8_f32_to_f32_avx, srcPtr2Temp, p2); // simd loads + p1[0] = _mm256_cvtepi32_ps(_mm256_xor_si256(_mm256_cvtps_epi32(_mm256_mul_ps(p1[0], avx_p255)), _mm256_cvtps_epi32(_mm256_mul_ps(p2[0], avx_p255)))); // exclusive_or computation + p1[0] = _mm256_mul_ps(p1[0], avx_p1op255); + rpp_simd_store(rpp_store8_f32_to_f32_avx, dstPtrTemp, p1); // simd stores + + srcPtr1Temp += vectorIncrementPerChannel; + srcPtr2Temp += vectorIncrementPerChannel; + dstPtrTemp += vectorIncrementPerChannel; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + *dstPtrTemp++ = RPPPIXELCHECKF32((float)((uint)(std::nearbyintf)(*srcPtr1Temp * 255) ^ (uint)(std::nearbyintf)(*srcPtr2Temp * 255)) / 255); + + srcPtr1Temp++; + srcPtr2Temp++; + } + + srcPtr1Row += srcDescPtr->strides.hStride; + srcPtr2Row += srcDescPtr->strides.hStride; + dstPtrRow += dstDescPtr->strides.hStride; + } + + srcPtr1Channel += srcDescPtr->strides.cStride; + srcPtr2Channel += srcDescPtr->strides.cStride; + dstPtrChannel += dstDescPtr->strides.cStride; + + } + } + + return RPP_SUCCESS; +} + +RppStatus exclusive_or_f16_f16_host_tensor(Rpp16f *srcPtr1, + Rpp16f *srcPtr2, + RpptDescPtr srcDescPtr, + Rpp16f *dstPtr, + RpptDescPtr dstDescPtr, + RpptROIPtr roiTensorPtrSrc, + 
RpptRoiType roiType, + RppLayoutParams layoutParams, + rpp::Handle& Handle) +{ + RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h}; + Rpp32u numThreads = Handle.GetNumThreads(); + + omp_set_dynamic(0); +#pragma omp parallel for num_threads(numThreads) + for (int batchCount = 0; batchCount < dstDescPtr->n; batchCount++) + { + RpptROI roi; + RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount]; + compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType); + + Rpp16f *srcPtr1Image, *srcPtr2Image, *dstPtrImage; + srcPtr1Image = srcPtr1 + batchCount * srcDescPtr->strides.nStride; + srcPtr2Image = srcPtr2 + batchCount * srcDescPtr->strides.nStride; + dstPtrImage = dstPtr + batchCount * dstDescPtr->strides.nStride; + + Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier; + + Rpp16f *srcPtr1Channel, *srcPtr2Channel, *dstPtrChannel; + srcPtr1Channel = srcPtr1Image + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier); + srcPtr2Channel = srcPtr2Image + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier); + dstPtrChannel = dstPtrImage; + +#if __AVX2__ + Rpp32u alignedLength = (bufferLength / 24) * 24; + Rpp32u vectorIncrement = 24; + Rpp32u vectorIncrementPerChannel = 8; +#endif + + // Exclusive OR with fused output-layout toggle (NHWC -> NCHW) + if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NCHW)) + { + Rpp16f *srcPtr1Row, *srcPtr2Row, *dstPtrRowR, *dstPtrRowG, *dstPtrRowB; + srcPtr1Row = srcPtr1Channel; + srcPtr2Row = srcPtr2Channel; + dstPtrRowR = dstPtrChannel; + dstPtrRowG = dstPtrRowR + dstDescPtr->strides.cStride; + dstPtrRowB = dstPtrRowG + dstDescPtr->strides.cStride; + + for (int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp16f *srcPtr1Temp, *srcPtr2Temp, *dstPtrTempR, *dstPtrTempG, *dstPtrTempB; + srcPtr1Temp = srcPtr1Row; + srcPtr2Temp = srcPtr2Row; + dstPtrTempR = dstPtrRowR; + dstPtrTempG = dstPtrRowG; + dstPtrTempB = dstPtrRowB; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + __m256 p1[3], p2[3]; + + rpp_simd_load(rpp_load24_f16pkd3_to_f32pln3_avx, srcPtr1Temp, p1); // simd loads + rpp_simd_load(rpp_load24_f16pkd3_to_f32pln3_avx, srcPtr2Temp, p2); // simd loads + p1[0] = _mm256_cvtepi32_ps(_mm256_xor_si256(_mm256_cvtps_epi32(_mm256_mul_ps(p1[0], avx_p255)), _mm256_cvtps_epi32(_mm256_mul_ps(p2[0], avx_p255)))); // exclusive_or computation + p1[1] = _mm256_cvtepi32_ps(_mm256_xor_si256(_mm256_cvtps_epi32(_mm256_mul_ps(p1[1], avx_p255)), _mm256_cvtps_epi32(_mm256_mul_ps(p2[1], avx_p255)))); // exclusive_or computation + p1[2] = _mm256_cvtepi32_ps(_mm256_xor_si256(_mm256_cvtps_epi32(_mm256_mul_ps(p1[2], avx_p255)), _mm256_cvtps_epi32(_mm256_mul_ps(p2[2], avx_p255)))); // exclusive_or computation + p1[0] = _mm256_mul_ps(p1[0], avx_p1op255); + p1[1] = _mm256_mul_ps(p1[1], avx_p1op255); + p1[2] = _mm256_mul_ps(p1[2], avx_p1op255); + rpp_simd_store(rpp_store24_f32pln3_to_f16pln3_avx, dstPtrTempR, dstPtrTempG, dstPtrTempB, p1); // simd stores + + srcPtr1Temp += vectorIncrement; + srcPtr2Temp += vectorIncrement; + dstPtrTempR += vectorIncrementPerChannel; + dstPtrTempG += vectorIncrementPerChannel; + dstPtrTempB += vectorIncrementPerChannel; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount += 3) + { + *dstPtrTempR++ = 
static_cast(RPPPIXELCHECKF32((float)((uint)(std::nearbyintf)(srcPtr1Temp[0] * 255) ^ (uint)(std::nearbyintf)(srcPtr2Temp[0] * 255)) / 255)); + *dstPtrTempG++ = static_cast(RPPPIXELCHECKF32((float)((uint)(std::nearbyintf)(srcPtr1Temp[1] * 255) ^ (uint)(std::nearbyintf)(srcPtr2Temp[1] * 255)) / 255)); + *dstPtrTempB++ = static_cast(RPPPIXELCHECKF32((float)((uint)(std::nearbyintf)(srcPtr1Temp[2] * 255) ^ (uint)(std::nearbyintf)(srcPtr2Temp[2] * 255)) / 255)); + + srcPtr1Temp += 3; + srcPtr2Temp += 3; + } + + srcPtr1Row += srcDescPtr->strides.hStride; + srcPtr2Row += srcDescPtr->strides.hStride; + dstPtrRowR += dstDescPtr->strides.hStride; + dstPtrRowG += dstDescPtr->strides.hStride; + dstPtrRowB += dstDescPtr->strides.hStride; + } + } + + // Exclusive OR with fused output-layout toggle (NCHW -> NHWC) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NHWC)) + { + Rpp16f *srcPtr1RowR, *srcPtr1RowG, *srcPtr1RowB, *srcPtr2RowR, *srcPtr2RowG, *srcPtr2RowB, *dstPtrRow; + srcPtr1RowR = srcPtr1Channel; + srcPtr1RowG = srcPtr1RowR + srcDescPtr->strides.cStride; + srcPtr1RowB = srcPtr1RowG + srcDescPtr->strides.cStride; + srcPtr2RowR = srcPtr2Channel; + srcPtr2RowG = srcPtr2RowR + srcDescPtr->strides.cStride; + srcPtr2RowB = srcPtr2RowG + srcDescPtr->strides.cStride; + dstPtrRow = dstPtrChannel; + + for (int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp16f *srcPtr1TempR, *srcPtr1TempG, *srcPtr1TempB, *srcPtr2TempR, *srcPtr2TempG, *srcPtr2TempB, *dstPtrTemp; + srcPtr1TempR = srcPtr1RowR; + srcPtr1TempG = srcPtr1RowG; + srcPtr1TempB = srcPtr1RowB; + srcPtr2TempR = srcPtr2RowR; + srcPtr2TempG = srcPtr2RowG; + srcPtr2TempB = srcPtr2RowB; + dstPtrTemp = dstPtrRow; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + __m256 p1[3], p2[3]; + + rpp_simd_load(rpp_load24_f16pln3_to_f32pln3_avx, srcPtr1TempR, srcPtr1TempG, srcPtr1TempB, p1); // simd loads + rpp_simd_load(rpp_load24_f16pln3_to_f32pln3_avx, srcPtr2TempR, srcPtr2TempG, srcPtr2TempB, p2); // simd loads + p1[0] = _mm256_cvtepi32_ps(_mm256_xor_si256(_mm256_cvtps_epi32(_mm256_mul_ps(p1[0], avx_p255)), _mm256_cvtps_epi32(_mm256_mul_ps(p2[0], avx_p255)))); // exclusive_or computation + p1[1] = _mm256_cvtepi32_ps(_mm256_xor_si256(_mm256_cvtps_epi32(_mm256_mul_ps(p1[1], avx_p255)), _mm256_cvtps_epi32(_mm256_mul_ps(p2[1], avx_p255)))); // exclusive_or computation + p1[2] = _mm256_cvtepi32_ps(_mm256_xor_si256(_mm256_cvtps_epi32(_mm256_mul_ps(p1[2], avx_p255)), _mm256_cvtps_epi32(_mm256_mul_ps(p2[2], avx_p255)))); // exclusive_or computation + p1[0] = _mm256_mul_ps(p1[0], avx_p1op255); + p1[1] = _mm256_mul_ps(p1[1], avx_p1op255); + p1[2] = _mm256_mul_ps(p1[2], avx_p1op255); + rpp_simd_store(rpp_store24_f32pln3_to_f16pkd3_avx, dstPtrTemp, p1); // simd stores + + srcPtr1TempR += vectorIncrementPerChannel; + srcPtr1TempG += vectorIncrementPerChannel; + srcPtr1TempB += vectorIncrementPerChannel; + srcPtr2TempR += vectorIncrementPerChannel; + srcPtr2TempG += vectorIncrementPerChannel; + srcPtr2TempB += vectorIncrementPerChannel; + dstPtrTemp += vectorIncrement; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + dstPtrTemp[0] = static_cast(RPPPIXELCHECKF32((float)((uint)(std::nearbyintf)(*srcPtr1TempR * 255) ^ (uint)(std::nearbyintf)(*srcPtr2TempR * 255)) / 255)); + dstPtrTemp[1] = static_cast(RPPPIXELCHECKF32((float)((uint)(std::nearbyintf)(*srcPtr1TempG * 255) ^ 
(uint)(std::nearbyintf)(*srcPtr2TempG * 255)) / 255)); + dstPtrTemp[2] = static_cast(RPPPIXELCHECKF32((float)((uint)(std::nearbyintf)(*srcPtr1TempB * 255) ^ (uint)(std::nearbyintf)(*srcPtr2TempB * 255)) / 255)); + + srcPtr1TempR++; + srcPtr1TempG++; + srcPtr1TempB++; + srcPtr2TempR++; + srcPtr2TempG++; + srcPtr2TempB++; + dstPtrTemp += 3; + } + + srcPtr1RowR += srcDescPtr->strides.hStride; + srcPtr1RowG += srcDescPtr->strides.hStride; + srcPtr1RowB += srcDescPtr->strides.hStride; + srcPtr2RowR += srcDescPtr->strides.hStride; + srcPtr2RowG += srcDescPtr->strides.hStride; + srcPtr2RowB += srcDescPtr->strides.hStride; + dstPtrRow += dstDescPtr->strides.hStride; + } + } + + // Exclusive OR without fused output-layout toggle (NCHW -> NCHW for 3 channel) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NCHW)) + { +#if __AVX2__ + alignedLength = bufferLength & ~7; +#endif + + Rpp16f *srcPtr1RowR, *srcPtr1RowG, *srcPtr1RowB, *srcPtr2RowR, *srcPtr2RowG, *srcPtr2RowB, *dstPtrRowR, *dstPtrRowG, *dstPtrRowB; + srcPtr1RowR = srcPtr1Channel; + srcPtr1RowG = srcPtr1RowR + srcDescPtr->strides.cStride; + srcPtr1RowB = srcPtr1RowG + srcDescPtr->strides.cStride; + srcPtr2RowR = srcPtr2Channel; + srcPtr2RowG = srcPtr2RowR + srcDescPtr->strides.cStride; + srcPtr2RowB = srcPtr2RowG + srcDescPtr->strides.cStride; + dstPtrRowR = dstPtrChannel; + dstPtrRowG = dstPtrRowR + dstDescPtr->strides.cStride; + dstPtrRowB = dstPtrRowG + dstDescPtr->strides.cStride; + for (int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp16f *srcPtr1TempR, *srcPtr1TempG, *srcPtr1TempB, *srcPtr2TempR, *srcPtr2TempG, *srcPtr2TempB, *dstPtrTempR, *dstPtrTempG, *dstPtrTempB; + srcPtr1TempR = srcPtr1RowR; + srcPtr1TempG = srcPtr1RowG; + srcPtr1TempB = srcPtr1RowB; + srcPtr2TempR = srcPtr2RowR; + srcPtr2TempG = srcPtr2RowG; + srcPtr2TempB = srcPtr2RowB; + dstPtrTempR = dstPtrRowR; + dstPtrTempG = dstPtrRowG; + dstPtrTempB = dstPtrRowB; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + __m256 p1[3], p2[3]; + + rpp_simd_load(rpp_load24_f16pln3_to_f32pln3_avx, srcPtr1TempR, srcPtr1TempG, srcPtr1TempB, p1); // simd loads + rpp_simd_load(rpp_load24_f16pln3_to_f32pln3_avx, srcPtr2TempR, srcPtr2TempG, srcPtr2TempB, p2); // simd loads + p1[0] = _mm256_cvtepi32_ps(_mm256_xor_si256(_mm256_cvtps_epi32(_mm256_mul_ps(p1[0], avx_p255)), _mm256_cvtps_epi32(_mm256_mul_ps(p2[0], avx_p255)))); // exclusive_or computation + p1[1] = _mm256_cvtepi32_ps(_mm256_xor_si256(_mm256_cvtps_epi32(_mm256_mul_ps(p1[1], avx_p255)), _mm256_cvtps_epi32(_mm256_mul_ps(p2[1], avx_p255)))); // exclusive_or computation + p1[2] = _mm256_cvtepi32_ps(_mm256_xor_si256(_mm256_cvtps_epi32(_mm256_mul_ps(p1[2], avx_p255)), _mm256_cvtps_epi32(_mm256_mul_ps(p2[2], avx_p255)))); // exclusive_or computation + p1[0] = _mm256_mul_ps(p1[0], avx_p1op255); + p1[1] = _mm256_mul_ps(p1[1], avx_p1op255); + p1[2] = _mm256_mul_ps(p1[2], avx_p1op255); + rpp_simd_store(rpp_store24_f32pln3_to_f16pln3_avx, dstPtrTempR, dstPtrTempG, dstPtrTempB, p1); + + srcPtr1TempR += vectorIncrementPerChannel; + srcPtr1TempG += vectorIncrementPerChannel; + srcPtr1TempB += vectorIncrementPerChannel; + srcPtr2TempR += vectorIncrementPerChannel; + srcPtr2TempG += vectorIncrementPerChannel; + srcPtr2TempB += vectorIncrementPerChannel; + dstPtrTempR += vectorIncrementPerChannel; + dstPtrTempG += vectorIncrementPerChannel; + dstPtrTempB += vectorIncrementPerChannel; + } +#endif + 
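+            // Scalar remainder for the F16 planar case: each normalized pixel is rounded to its 0-255 integer
+            // approximation, the two integers are XORed, and the result is rescaled to 0-1 before the
+            // RPPPIXELCHECKF32 clamp (e.g. inputs 0.5 and 0.25 -> 128 ^ 64 = 192 -> 192/255 ≈ 0.753).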
for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + *dstPtrTempR = static_cast(RPPPIXELCHECKF32((float)((uint)(std::nearbyintf)(*srcPtr1TempR * 255) ^ (uint)(std::nearbyintf)(*srcPtr2TempR * 255)) / 255)); + *dstPtrTempG = static_cast(RPPPIXELCHECKF32((float)((uint)(std::nearbyintf)(*srcPtr1TempG * 255) ^ (uint)(std::nearbyintf)(*srcPtr2TempG * 255)) / 255)); + *dstPtrTempB = static_cast(RPPPIXELCHECKF32((float)((uint)(std::nearbyintf)(*srcPtr1TempB * 255) ^ (uint)(std::nearbyintf)(*srcPtr2TempB * 255)) / 255)); + + srcPtr1TempR++; + srcPtr1TempG++; + srcPtr1TempB++; + srcPtr2TempR++; + srcPtr2TempG++; + srcPtr2TempB++; + dstPtrTempR++; + dstPtrTempG++; + dstPtrTempB++; + } + + srcPtr1RowR += srcDescPtr->strides.hStride; + srcPtr1RowG += srcDescPtr->strides.hStride; + srcPtr1RowB += srcDescPtr->strides.hStride; + srcPtr2RowR += srcDescPtr->strides.hStride; + srcPtr2RowG += srcDescPtr->strides.hStride; + srcPtr2RowB += srcDescPtr->strides.hStride; + dstPtrRowR += dstDescPtr->strides.hStride; + dstPtrRowG += dstDescPtr->strides.hStride; + dstPtrRowB += dstDescPtr->strides.hStride; + } + } + + // Exclusive OR without fused output-layout toggle (NHWC -> NHWC or NCHW -> NCHW) + else + { +#if __AVX2__ + alignedLength = bufferLength & ~7; +#endif + + for (int c = 0; c < layoutParams.channelParam; c++) + { + Rpp16f *srcPtr1Row, *srcPtr2Row, *dstPtrRow; + srcPtr1Row = srcPtr1Channel; + srcPtr2Row = srcPtr2Channel; + dstPtrRow = dstPtrChannel; + + for (int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp16f *srcPtr1Temp, *srcPtr2Temp, *dstPtrTemp; + srcPtr1Temp = srcPtr1Row; + srcPtr2Temp = srcPtr2Row; + dstPtrTemp = dstPtrRow; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + __m256 p1[1], p2[1]; + + rpp_simd_load(rpp_load8_f16_to_f32_avx, srcPtr1Temp, p1); // simd loads + rpp_simd_load(rpp_load8_f16_to_f32_avx, srcPtr2Temp, p2); // simd loads + p1[0] = _mm256_cvtepi32_ps(_mm256_xor_si256(_mm256_cvtps_epi32(_mm256_mul_ps(p1[0], avx_p255)), _mm256_cvtps_epi32(_mm256_mul_ps(p2[0], avx_p255)))); // exclusive_or computation + p1[0] = _mm256_mul_ps(p1[0], avx_p1op255); + rpp_simd_store(rpp_store8_f32_to_f16_avx, dstPtrTemp, p1); // simd stores + + srcPtr1Temp += vectorIncrementPerChannel; + srcPtr2Temp += vectorIncrementPerChannel; + dstPtrTemp += vectorIncrementPerChannel; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + *dstPtrTemp++ = static_cast(RPPPIXELCHECKF32((float)((uint)(std::nearbyintf)(*srcPtr1Temp * 255) ^ (uint)(std::nearbyintf)(*srcPtr2Temp * 255)) / 255)); + + srcPtr1Temp++; + srcPtr2Temp++; + } + + srcPtr1Row += srcDescPtr->strides.hStride; + srcPtr2Row += srcDescPtr->strides.hStride; + dstPtrRow += dstDescPtr->strides.hStride; + } + + srcPtr1Channel += srcDescPtr->strides.cStride; + srcPtr2Channel += srcDescPtr->strides.cStride; + dstPtrChannel += dstDescPtr->strides.cStride; + } + } + } + + return RPP_SUCCESS; +} + +RppStatus exclusive_or_i8_i8_host_tensor(Rpp8s *srcPtr1, + Rpp8s *srcPtr2, + RpptDescPtr srcDescPtr, + Rpp8s *dstPtr, + RpptDescPtr dstDescPtr, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + RppLayoutParams layoutParams, + rpp::Handle& Handle) +{ + RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h}; + Rpp32u numThreads = Handle.GetNumThreads(); + + omp_set_dynamic(0); +#pragma omp parallel for num_threads(numThreads) + for (int batchCount = 0; batchCount < dstDescPtr->n; batchCount++) + { + RpptROI roi; + RpptROIPtr 
roiPtrInput = &roiTensorPtrSrc[batchCount]; + compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType); + + Rpp8s *srcPtr1Image, *srcPtr2Image, *dstPtrImage; + srcPtr1Image = srcPtr1 + batchCount * srcDescPtr->strides.nStride; + srcPtr2Image = srcPtr2 + batchCount * srcDescPtr->strides.nStride; + dstPtrImage = dstPtr + batchCount * dstDescPtr->strides.nStride; + + Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier; + + Rpp8s *srcPtr1Channel, *srcPtr2Channel, *dstPtrChannel; + srcPtr1Channel = srcPtr1Image + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier); + srcPtr2Channel = srcPtr2Image + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier); + dstPtrChannel = dstPtrImage; + +#if __AVX2__ + Rpp32u alignedLength = (bufferLength / 96) * 96; + Rpp32u vectorIncrement = 96; + Rpp32u vectorIncrementPerChannel = 32; +#endif + + // Exclusive OR with fused output-layout toggle (NHWC -> NCHW) + if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NCHW)) + { + Rpp8s *srcPtr1Row, *srcPtr2Row, *dstPtrRowR, *dstPtrRowG, *dstPtrRowB; + srcPtr1Row = srcPtr1Channel; + srcPtr2Row = srcPtr2Channel; + dstPtrRowR = dstPtrChannel; + dstPtrRowG = dstPtrRowR + dstDescPtr->strides.cStride; + dstPtrRowB = dstPtrRowG + dstDescPtr->strides.cStride; + + for (int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp8s *srcPtr1Temp, *srcPtr2Temp, *dstPtrTempR, *dstPtrTempG, *dstPtrTempB; + srcPtr1Temp = srcPtr1Row; + srcPtr2Temp = srcPtr2Row; + dstPtrTempR = dstPtrRowR; + dstPtrTempG = dstPtrRowG; + dstPtrTempB = dstPtrRowB; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + __m256i p1[3], p2[3]; + + rpp_simd_load(rpp_load96_i8pkd3_to_u8pln3, srcPtr1Temp, p1); // simd loads + rpp_simd_load(rpp_load96_i8pkd3_to_u8pln3, srcPtr2Temp, p2); // simd loads + p1[0] = _mm256_xor_si256(p1[0], p2[0]); // exclusive_or computation + p1[1] = _mm256_xor_si256(p1[1], p2[1]); // exclusive_or computation + p1[2] = _mm256_xor_si256(p1[2], p2[2]); // exclusive_or computation + rpp_simd_store(rpp_store96_u8pln3_to_i8pln3, dstPtrTempR, dstPtrTempG, dstPtrTempB, p1); // simd stores + + srcPtr1Temp += vectorIncrement; + srcPtr2Temp += vectorIncrement; + dstPtrTempR += vectorIncrementPerChannel; + dstPtrTempG += vectorIncrementPerChannel; + dstPtrTempB += vectorIncrementPerChannel; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount += 3) + { + *dstPtrTempR++ = static_cast(RPPPIXELCHECKI8(((srcPtr1Temp[0] + 128) ^ (srcPtr2Temp[0] + 128)) - 128)); + *dstPtrTempG++ = static_cast(RPPPIXELCHECKI8(((srcPtr1Temp[1] + 128) ^ (srcPtr2Temp[1] + 128)) - 128)); + *dstPtrTempB++ = static_cast(RPPPIXELCHECKI8(((srcPtr1Temp[2] + 128) ^ (srcPtr2Temp[2] + 128)) - 128)); + + srcPtr1Temp += 3; + srcPtr2Temp += 3; + } + + srcPtr1Row += srcDescPtr->strides.hStride; + srcPtr2Row += srcDescPtr->strides.hStride; + dstPtrRowR += dstDescPtr->strides.hStride; + dstPtrRowG += dstDescPtr->strides.hStride; + dstPtrRowB += dstDescPtr->strides.hStride; + } + } + + // Exclusive OR with fused output-layout toggle (NCHW -> NHWC) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NHWC)) + { + Rpp8s *srcPtr1RowR, *srcPtr1RowG, *srcPtr1RowB, *srcPtr2RowR, *srcPtr2RowG, *srcPtr2RowB, *dstPtrRow; + srcPtr1RowR = srcPtr1Channel; + srcPtr1RowG = 
srcPtr1RowR + srcDescPtr->strides.cStride; + srcPtr1RowB = srcPtr1RowG + srcDescPtr->strides.cStride; + srcPtr2RowR = srcPtr2Channel; + srcPtr2RowG = srcPtr2RowR + srcDescPtr->strides.cStride; + srcPtr2RowB = srcPtr2RowG + srcDescPtr->strides.cStride; + dstPtrRow = dstPtrChannel; + + for (int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp8s *srcPtr1TempR, *srcPtr1TempG, *srcPtr1TempB, *srcPtr2TempR, *srcPtr2TempG, *srcPtr2TempB, *dstPtrTemp; + srcPtr1TempR = srcPtr1RowR; + srcPtr1TempG = srcPtr1RowG; + srcPtr1TempB = srcPtr1RowB; + srcPtr2TempR = srcPtr2RowR; + srcPtr2TempG = srcPtr2RowG; + srcPtr2TempB = srcPtr2RowB; + dstPtrTemp = dstPtrRow; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + __m256i p1[3], p2[3]; + + rpp_simd_load(rpp_load96_i8pln3_to_u8pln3, srcPtr1TempR, srcPtr1TempG, srcPtr1TempB, p1); // simd loads + rpp_simd_load(rpp_load96_i8pln3_to_u8pln3, srcPtr2TempR, srcPtr2TempG, srcPtr2TempB, p2); // simd loads + p1[0] = _mm256_xor_si256(p1[0], p2[0]); // exclusive_or computation + p1[1] = _mm256_xor_si256(p1[1], p2[1]); // exclusive_or computation + p1[2] = _mm256_xor_si256(p1[2], p2[2]); // exclusive_or computation + rpp_simd_store(rpp_store96_u8pln3_to_i8pkd3, dstPtrTemp, p1); // simd stores + + srcPtr1TempR += vectorIncrementPerChannel; + srcPtr1TempG += vectorIncrementPerChannel; + srcPtr1TempB += vectorIncrementPerChannel; + srcPtr2TempR += vectorIncrementPerChannel; + srcPtr2TempG += vectorIncrementPerChannel; + srcPtr2TempB += vectorIncrementPerChannel; + dstPtrTemp += vectorIncrement; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + dstPtrTemp[0] = static_cast(RPPPIXELCHECKI8(((static_cast((*srcPtr1TempR + 128) ^ static_cast(*srcPtr2TempR + 128)))) - 128)); + dstPtrTemp[1] = static_cast(RPPPIXELCHECKI8(((static_cast((*srcPtr1TempG + 128) ^ static_cast(*srcPtr2TempG + 128)))) - 128)); + dstPtrTemp[2] = static_cast(RPPPIXELCHECKI8(((static_cast((*srcPtr1TempB + 128) ^ static_cast(*srcPtr2TempB + 128)))) - 128)); + + srcPtr1TempR++; + srcPtr1TempG++; + srcPtr1TempB++; + srcPtr2TempR++; + srcPtr2TempG++; + srcPtr2TempB++; + dstPtrTemp += 3; + } + + srcPtr1RowR += srcDescPtr->strides.hStride; + srcPtr1RowG += srcDescPtr->strides.hStride; + srcPtr1RowB += srcDescPtr->strides.hStride; + srcPtr2RowR += srcDescPtr->strides.hStride; + srcPtr2RowG += srcDescPtr->strides.hStride; + srcPtr2RowB += srcDescPtr->strides.hStride; + dstPtrRow += dstDescPtr->strides.hStride; + } + } + + // Exclusive OR without fused output-layout toggle (NCHW -> NCHW for 3 channel) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NCHW)) + { +#if __AVX2__ + alignedLength = bufferLength & ~31; +#endif + + Rpp8s *srcPtr1RowR, *srcPtr1RowG, *srcPtr1RowB, *srcPtr2RowR, *srcPtr2RowG, *srcPtr2RowB, *dstPtrRowR, *dstPtrRowG, *dstPtrRowB; + srcPtr1RowR = srcPtr1Channel; + srcPtr1RowG = srcPtr1RowR + srcDescPtr->strides.cStride; + srcPtr1RowB = srcPtr1RowG + srcDescPtr->strides.cStride; + srcPtr2RowR = srcPtr2Channel; + srcPtr2RowG = srcPtr2RowR + srcDescPtr->strides.cStride; + srcPtr2RowB = srcPtr2RowG + srcDescPtr->strides.cStride; + dstPtrRowR = dstPtrChannel; + dstPtrRowG = dstPtrRowR + dstDescPtr->strides.cStride; + dstPtrRowB = dstPtrRowG + dstDescPtr->strides.cStride; + for (int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp8s *srcPtr1TempR, *srcPtr1TempG, *srcPtr1TempB, *srcPtr2TempR, *srcPtr2TempG, *srcPtr2TempB, 
*dstPtrTempR, *dstPtrTempG, *dstPtrTempB; + srcPtr1TempR = srcPtr1RowR; + srcPtr1TempG = srcPtr1RowG; + srcPtr1TempB = srcPtr1RowB; + srcPtr2TempR = srcPtr2RowR; + srcPtr2TempG = srcPtr2RowG; + srcPtr2TempB = srcPtr2RowB; + dstPtrTempR = dstPtrRowR; + dstPtrTempG = dstPtrRowG; + dstPtrTempB = dstPtrRowB; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + __m256i p1[3], p2[3]; + + rpp_simd_load(rpp_load96_i8pln3_to_u8pln3, srcPtr1TempR, srcPtr1TempG, srcPtr1TempB, p1); // simd loads + rpp_simd_load(rpp_load96_i8pln3_to_u8pln3, srcPtr2TempR, srcPtr2TempG, srcPtr2TempB, p2); // simd loads + p1[0] = _mm256_xor_si256(p1[0], p2[0]); // exclusive_or computation + p1[1] = _mm256_xor_si256(p1[1], p2[1]); // exclusive_or computation + p1[2] = _mm256_xor_si256(p1[2], p2[2]); // exclusive_or computation + rpp_simd_store(rpp_store96_u8pln3_to_i8pln3, dstPtrTempR, dstPtrTempG, dstPtrTempB, p1); + + srcPtr1TempR += vectorIncrementPerChannel; + srcPtr1TempG += vectorIncrementPerChannel; + srcPtr1TempB += vectorIncrementPerChannel; + srcPtr2TempR += vectorIncrementPerChannel; + srcPtr2TempG += vectorIncrementPerChannel; + srcPtr2TempB += vectorIncrementPerChannel; + dstPtrTempR += vectorIncrementPerChannel; + dstPtrTempG += vectorIncrementPerChannel; + dstPtrTempB += vectorIncrementPerChannel; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + *dstPtrTempR = static_cast(RPPPIXELCHECKI8(((static_cast((*srcPtr1TempR + 128) ^ static_cast(*srcPtr2TempR + 128)))) - 128)); + *dstPtrTempG = static_cast(RPPPIXELCHECKI8(((static_cast((*srcPtr1TempG + 128) ^ static_cast(*srcPtr2TempG + 128)))) - 128)); + *dstPtrTempB = static_cast(RPPPIXELCHECKI8(((static_cast((*srcPtr1TempB + 128) ^ static_cast(*srcPtr2TempB + 128)))) - 128)); + + srcPtr1TempR++; + srcPtr1TempG++; + srcPtr1TempB++; + srcPtr2TempR++; + srcPtr2TempG++; + srcPtr2TempB++; + dstPtrTempR++; + dstPtrTempG++; + dstPtrTempB++; + } + + srcPtr1RowR += srcDescPtr->strides.hStride; + srcPtr1RowG += srcDescPtr->strides.hStride; + srcPtr1RowB += srcDescPtr->strides.hStride; + srcPtr2RowR += srcDescPtr->strides.hStride; + srcPtr2RowG += srcDescPtr->strides.hStride; + srcPtr2RowB += srcDescPtr->strides.hStride; + dstPtrRowR += dstDescPtr->strides.hStride; + dstPtrRowG += dstDescPtr->strides.hStride; + dstPtrRowB += dstDescPtr->strides.hStride; + } + } + + // Exclusive OR without fused output-layout toggle (NHWC -> NHWC or NCHW -> NCHW for 1 channel) + else + { +#if __AVX2__ + alignedLength = bufferLength & ~31; +#endif + + Rpp8s *srcPtr1Row, *srcPtr2Row, *dstPtrRow; + srcPtr1Row = srcPtr1Channel; + srcPtr2Row = srcPtr2Channel; + dstPtrRow = dstPtrChannel; + + for (int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp8s *srcPtr1Temp, *srcPtr2Temp, *dstPtrTemp; + srcPtr1Temp = srcPtr1Row; + srcPtr2Temp = srcPtr2Row; + dstPtrTemp = dstPtrRow; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + __m256i p1, p2; + + p1 = _mm256_add_epi8(avx_pxConvertI8, _mm256_loadu_si256((__m256i *)srcPtr1Temp)); // simd loads + p2 = _mm256_add_epi8(avx_pxConvertI8, _mm256_loadu_si256((__m256i *)srcPtr2Temp)); // simd loads + p1 = _mm256_xor_si256(p1, p2); // exclusive_or computation + _mm256_storeu_si256((__m256i *)dstPtrTemp, _mm256_sub_epi8(p1, avx_pxConvertI8)); // simd stores + + srcPtr1Temp += vectorIncrementPerChannel; + srcPtr2Temp += vectorIncrementPerChannel; + dstPtrTemp += 
vectorIncrementPerChannel; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + *dstPtrTemp++ = static_cast<Rpp8s>(RPPPIXELCHECKI8((static_cast<Rpp8u>(*srcPtr1Temp + 128) ^ static_cast<Rpp8u>(*srcPtr2Temp + 128)) - 128)); + + srcPtr1Temp++; + srcPtr2Temp++; + } + + srcPtr1Row += srcDescPtr->strides.hStride; + srcPtr2Row += srcDescPtr->strides.hStride; + dstPtrRow += dstDescPtr->strides.hStride; + } + + srcPtr1Channel += srcDescPtr->strides.cStride; + srcPtr2Channel += srcDescPtr->strides.cStride; + dstPtrChannel += dstDescPtr->strides.cStride; + } + } + + return RPP_SUCCESS; +} diff --git a/src/modules/hip/hip_tensor_logical_operations.hpp b/src/modules/hip/hip_tensor_logical_operations.hpp index 636789246..946dfe0f5 100644 --- a/src/modules/hip/hip_tensor_logical_operations.hpp +++ b/src/modules/hip/hip_tensor_logical_operations.hpp @@ -26,6 +26,7 @@ SOFTWARE. #define HIP_TENSOR_LOGICAL_OPERATIONS_HPP #include "kernel/bitwise_and.hpp" +#include "kernel/exclusive_or.hpp" #include "kernel/bitwise_or.hpp" #endif // HIP_TENSOR_LOGICAL_OPERATIONS_HPP \ No newline at end of file diff --git a/src/modules/hip/kernel/exclusive_or.hpp b/src/modules/hip/kernel/exclusive_or.hpp new file mode 100644 index 000000000..bd5c5cd5f --- /dev/null +++ b/src/modules/hip/kernel/exclusive_or.hpp @@ -0,0 +1,247 @@ +#include <hip/hip_runtime.h> +#include "rpp_hip_common.hpp" + +/* ExclusiveOR is a logical operation defined only on U8/I8 types. + For Rpp32f/Rpp16f precision images (pixel values from 0-1), the ExclusiveOR is applied on a 0-255 + range-translated approximation of the original 0-1 decimal-range image. + The rounded 0-255 values are XORed as integers and the result is scaled back to the 0-1 range. */ + +template <typename T> +__device__ void exclusive_or_hip_compute(T *srcPtr, d_float8 *src1_f8, d_float8 *src2_f8, d_float8 *dst_f8) +{ + if constexpr ((std::is_same<T, Rpp32f>::value) || (std::is_same<T, half>::value)) + { + rpp_hip_math_multiply8_const(src1_f8, src1_f8, static_cast<float4>(255)); + rpp_hip_math_multiply8_const(src2_f8, src2_f8, static_cast<float4>(255)); + rpp_hip_math_exclusiveOr8(src1_f8, src2_f8, dst_f8); + rpp_hip_math_multiply8_const(dst_f8, dst_f8, static_cast<float4>(ONE_OVER_255)); + } + else if constexpr (std::is_same<T, Rpp8s>::value) + { + rpp_hip_math_add8_const(src1_f8, src1_f8, static_cast<float4>(128)); + rpp_hip_math_add8_const(src2_f8, src2_f8, static_cast<float4>(128)); + rpp_hip_math_exclusiveOr8(src1_f8, src2_f8, dst_f8); + rpp_hip_math_subtract8_const(dst_f8, dst_f8, static_cast<float4>(128)); + } + else + rpp_hip_math_exclusiveOr8(src1_f8, src2_f8, dst_f8); +} + +template <typename T> +__global__ void exclusive_or_pkd_hip_tensor(T *srcPtr1, + T *srcPtr2, + uint2 srcStridesNH, + T *dstPtr, + uint2 dstStridesNH, + RpptROIPtr roiTensorPtrSrc) +{ + int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; + int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + + if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth)) + { + return; + } + + uint srcIdx = (id_z * srcStridesNH.x) + ((id_y + roiTensorPtrSrc[id_z].xywhROI.xy.y) * srcStridesNH.y) + (id_x + roiTensorPtrSrc[id_z].xywhROI.xy.x) * 3; + uint dstIdx = (id_z * dstStridesNH.x) + (id_y * dstStridesNH.y) + id_x * 3; + + d_float24 src1_f24, src2_f24, dst_f24; + + rpp_hip_load24_pkd3_and_unpack_to_float24_pln3(srcPtr1 + srcIdx, &src1_f24); + rpp_hip_load24_pkd3_and_unpack_to_float24_pln3(srcPtr2 + srcIdx, &src2_f24); + exclusive_or_hip_compute(srcPtr1, &src1_f24.f8[0], &src2_f24.f8[0], &dst_f24.f8[0]); + 
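+    // exclusive_or_hip_compute is applied once per 8-pixel planar block of the d_float24; the dtype-specific
+    // handling (0-255 scaling for F16/F32, +128/-128 offset for I8, direct XOR for U8) lives in that helper,
+    // so the layout variants of this kernel differ only in their load/store calls.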
exclusive_or_hip_compute(srcPtr1, &src1_f24.f8[1], &src2_f24.f8[1], &dst_f24.f8[1]); + exclusive_or_hip_compute(srcPtr1, &src1_f24.f8[2], &src2_f24.f8[2], &dst_f24.f8[2]); + rpp_hip_pack_float24_pln3_and_store24_pkd3(dstPtr + dstIdx, &dst_f24); +} + +template +__global__ void exclusive_or_pln_hip_tensor(T *srcPtr1, + T *srcPtr2, + uint3 srcStridesNCH, + T *dstPtr, + uint3 dstStridesNCH, + int channelsDst, + RpptROIPtr roiTensorPtrSrc) +{ + int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; + int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + + if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth)) + { + return; + } + + uint srcIdx = (id_z * srcStridesNCH.x) + ((id_y + roiTensorPtrSrc[id_z].xywhROI.xy.y) * srcStridesNCH.z) + (id_x + roiTensorPtrSrc[id_z].xywhROI.xy.x); + uint dstIdx = (id_z * dstStridesNCH.x) + (id_y * dstStridesNCH.z) + id_x; + + d_float8 src1_f8, src2_f8, dst_f8; + + rpp_hip_load8_and_unpack_to_float8(srcPtr1 + srcIdx, &src1_f8); + rpp_hip_load8_and_unpack_to_float8(srcPtr2 + srcIdx, &src2_f8); + exclusive_or_hip_compute(srcPtr1, &src1_f8, &src2_f8, &dst_f8); + rpp_hip_pack_float8_and_store8(dstPtr + dstIdx, &dst_f8); + + if (channelsDst == 3) + { + srcIdx += srcStridesNCH.y; + dstIdx += dstStridesNCH.y; + + rpp_hip_load8_and_unpack_to_float8(srcPtr1 + srcIdx, &src1_f8); + rpp_hip_load8_and_unpack_to_float8(srcPtr2 + srcIdx, &src2_f8); + exclusive_or_hip_compute(srcPtr1, &src1_f8, &src2_f8, &dst_f8); + rpp_hip_pack_float8_and_store8(dstPtr + dstIdx, &dst_f8); + + srcIdx += srcStridesNCH.y; + dstIdx += dstStridesNCH.y; + + rpp_hip_load8_and_unpack_to_float8(srcPtr1 + srcIdx, &src1_f8); + rpp_hip_load8_and_unpack_to_float8(srcPtr2 + srcIdx, &src2_f8); + exclusive_or_hip_compute(srcPtr1, &src1_f8, &src2_f8, &dst_f8); + rpp_hip_pack_float8_and_store8(dstPtr + dstIdx, &dst_f8); + } +} + +template +__global__ void exclusive_or_pkd3_pln3_hip_tensor(T *srcPtr1, + T *srcPtr2, + uint2 srcStridesNH, + T *dstPtr, + uint3 dstStridesNCH, + RpptROIPtr roiTensorPtrSrc) +{ + int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; + int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + + if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth)) + { + return; + } + + uint srcIdx = (id_z * srcStridesNH.x) + ((id_y + roiTensorPtrSrc[id_z].xywhROI.xy.y) * srcStridesNH.y) + ((id_x + roiTensorPtrSrc[id_z].xywhROI.xy.x) * 3); + uint dstIdx = (id_z * dstStridesNCH.x) + (id_y * dstStridesNCH.z) + id_x; + + d_float24 src1_f24, src2_f24, dst_f24; + + rpp_hip_load24_pkd3_and_unpack_to_float24_pln3(srcPtr1 + srcIdx, &src1_f24); + rpp_hip_load24_pkd3_and_unpack_to_float24_pln3(srcPtr2 + srcIdx, &src2_f24); + exclusive_or_hip_compute(srcPtr1, &src1_f24.f8[0], &src2_f24.f8[0], &dst_f24.f8[0]); + exclusive_or_hip_compute(srcPtr1, &src1_f24.f8[1], &src2_f24.f8[1], &dst_f24.f8[1]); + exclusive_or_hip_compute(srcPtr1, &src1_f24.f8[2], &src2_f24.f8[2], &dst_f24.f8[2]); + rpp_hip_pack_float24_pln3_and_store24_pln3(dstPtr + dstIdx, dstStridesNCH.y, &dst_f24); +} + +template +__global__ void exclusive_or_pln3_pkd3_hip_tensor(T *srcPtr1, + T *srcPtr2, + uint3 srcStridesNCH, + T *dstPtr, + uint2 dstStridesNH, + RpptROIPtr roiTensorPtrSrc) +{ + int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; + int id_y = hipBlockIdx_y * hipBlockDim_y + 
hipThreadIdx_y; + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + + if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth)) + { + return; + } + + uint srcIdx = (id_z * srcStridesNCH.x) + ((id_y + roiTensorPtrSrc[id_z].xywhROI.xy.y) * srcStridesNCH.z) + (id_x + roiTensorPtrSrc[id_z].xywhROI.xy.x); + uint dstIdx = (id_z * dstStridesNH.x) + (id_y * dstStridesNH.y) + id_x * 3; + + d_float24 src1_f24, src2_f24, dst_f24; + + rpp_hip_load24_pln3_and_unpack_to_float24_pkd3(srcPtr1 + srcIdx, srcStridesNCH.y, &src1_f24); + rpp_hip_load24_pln3_and_unpack_to_float24_pkd3(srcPtr2 + srcIdx, srcStridesNCH.y, &src2_f24); + exclusive_or_hip_compute(srcPtr1, &src1_f24.f8[0], &src2_f24.f8[0], &dst_f24.f8[0]); + exclusive_or_hip_compute(srcPtr1, &src1_f24.f8[1], &src2_f24.f8[1], &dst_f24.f8[1]); + exclusive_or_hip_compute(srcPtr1, &src1_f24.f8[2], &src2_f24.f8[2], &dst_f24.f8[2]); + rpp_hip_pack_float24_pkd3_and_store24_pkd3(dstPtr + dstIdx, &dst_f24); +} + +template +RppStatus hip_exec_exclusive_or_tensor(T *srcPtr1, + T *srcPtr2, + RpptDescPtr srcDescPtr, + T *dstPtr, + RpptDescPtr dstDescPtr, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + rpp::Handle& handle) +{ + if (roiType == RpptRoiType::LTRB) + hip_exec_roi_converison_ltrb_to_xywh(roiTensorPtrSrc, handle); + + int globalThreads_x = (dstDescPtr->w + 7) >> 3; + int globalThreads_y = dstDescPtr->h; + int globalThreads_z = dstDescPtr->n; + + if ((srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NHWC)) + { + hipLaunchKernelGGL(exclusive_or_pkd_hip_tensor, + dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + srcPtr1, + srcPtr2, + make_uint2(srcDescPtr->strides.nStride, srcDescPtr->strides.hStride), + dstPtr, + make_uint2(dstDescPtr->strides.nStride, dstDescPtr->strides.hStride), + roiTensorPtrSrc); + } + else if ((srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NCHW)) + { + hipLaunchKernelGGL(exclusive_or_pln_hip_tensor, + dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + srcPtr1, + srcPtr2, + make_uint3(srcDescPtr->strides.nStride, srcDescPtr->strides.cStride, srcDescPtr->strides.hStride), + dstPtr, + make_uint3(dstDescPtr->strides.nStride, dstDescPtr->strides.cStride, dstDescPtr->strides.hStride), + dstDescPtr->c, + roiTensorPtrSrc); + } + else if ((srcDescPtr->c == 3) && (dstDescPtr->c == 3)) + { + if ((srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NCHW)) + { + hipLaunchKernelGGL(exclusive_or_pkd3_pln3_hip_tensor, + dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + srcPtr1, + srcPtr2, + make_uint2(srcDescPtr->strides.nStride, srcDescPtr->strides.hStride), + dstPtr, + make_uint3(dstDescPtr->strides.nStride, dstDescPtr->strides.cStride, dstDescPtr->strides.hStride), + roiTensorPtrSrc); + } + else if ((srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NHWC)) + { + hipLaunchKernelGGL(exclusive_or_pln3_pkd3_hip_tensor, + 
dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + srcPtr1, + srcPtr2, + make_uint3(srcDescPtr->strides.nStride, srcDescPtr->strides.cStride, srcDescPtr->strides.hStride), + dstPtr, + make_uint2(dstDescPtr->strides.nStride, dstDescPtr->strides.hStride), + roiTensorPtrSrc); + } + } + + return RPP_SUCCESS; +} diff --git a/src/modules/rppt_tensor_logical_operations.cpp b/src/modules/rppt_tensor_logical_operations.cpp index dc4adbc9b..636a9c6c5 100644 --- a/src/modules/rppt_tensor_logical_operations.cpp +++ b/src/modules/rppt_tensor_logical_operations.cpp @@ -96,6 +96,71 @@ RppStatus rppt_bitwise_and_host(RppPtr_t srcPtr1, return RPP_SUCCESS; } +/******************** exclusive OR ********************/ + +RppStatus rppt_exclusive_or_host(RppPtr_t srcPtr1, + RppPtr_t srcPtr2, + RpptDescPtr srcDescPtr, + RppPtr_t dstPtr, + RpptDescPtr dstDescPtr, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + rppHandle_t rppHandle) +{ + RppLayoutParams layoutParams = get_layout_params(srcDescPtr->layout, srcDescPtr->c); + + if ((srcDescPtr->dataType == RpptDataType::U8) && (dstDescPtr->dataType == RpptDataType::U8)) + { + exclusive_or_u8_u8_host_tensor(static_cast(srcPtr1) + srcDescPtr->offsetInBytes, + static_cast(srcPtr2) + srcDescPtr->offsetInBytes, + srcDescPtr, + static_cast(dstPtr) + dstDescPtr->offsetInBytes, + dstDescPtr, + roiTensorPtrSrc, + roiType, + layoutParams, + rpp::deref(rppHandle)); + } + else if ((srcDescPtr->dataType == RpptDataType::F16) && (dstDescPtr->dataType == RpptDataType::F16)) + { + exclusive_or_f16_f16_host_tensor(reinterpret_cast(static_cast(srcPtr1) + srcDescPtr->offsetInBytes), + reinterpret_cast(static_cast(srcPtr2) + srcDescPtr->offsetInBytes), + srcDescPtr, + reinterpret_cast(static_cast(dstPtr) + dstDescPtr->offsetInBytes), + dstDescPtr, + roiTensorPtrSrc, + roiType, + layoutParams, + rpp::deref(rppHandle)); + } + else if ((srcDescPtr->dataType == RpptDataType::F32) && (dstDescPtr->dataType == RpptDataType::F32)) + { + exclusive_or_f32_f32_host_tensor(reinterpret_cast(static_cast(srcPtr1) + srcDescPtr->offsetInBytes), + reinterpret_cast(static_cast(srcPtr2) + srcDescPtr->offsetInBytes), + srcDescPtr, + reinterpret_cast(static_cast(dstPtr) + dstDescPtr->offsetInBytes), + dstDescPtr, + roiTensorPtrSrc, + roiType, + layoutParams, + rpp::deref(rppHandle)); + } + else if ((srcDescPtr->dataType == RpptDataType::I8) && (dstDescPtr->dataType == RpptDataType::I8)) + { + exclusive_or_i8_i8_host_tensor(static_cast(srcPtr1) + srcDescPtr->offsetInBytes, + static_cast(srcPtr2) + srcDescPtr->offsetInBytes, + srcDescPtr, + static_cast(dstPtr) + dstDescPtr->offsetInBytes, + dstDescPtr, + roiTensorPtrSrc, + roiType, + layoutParams, + rpp::deref(rppHandle)); + } + + return RPP_SUCCESS; +} + /******************** bitwise OR ********************/ RppStatus rppt_bitwise_or_host(RppPtr_t srcPtr1, @@ -232,6 +297,70 @@ RppStatus rppt_bitwise_and_gpu(RppPtr_t srcPtr1, #endif // backend } +/******************** exclusive XOR ********************/ + +RppStatus rppt_exclusive_or_gpu(RppPtr_t srcPtr1, + RppPtr_t srcPtr2, + RpptDescPtr srcDescPtr, + RppPtr_t dstPtr, + RpptDescPtr dstDescPtr, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + rppHandle_t rppHandle) +{ +#ifdef HIP_COMPILE + + if ((srcDescPtr->dataType == RpptDataType::U8) && (dstDescPtr->dataType == RpptDataType::U8)) + { + 
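+        // srcPtr1/srcPtr2/dstPtr arrive as untyped RppPtr_t: offsetInBytes is added on the Rpp8u view of each
+        // buffer, and the F16/F32 branches below then reinterpret_cast the offset pointer to the kernel's
+        // pixel type before handing it to hip_exec_exclusive_or_tensor.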
hip_exec_exclusive_or_tensor(static_cast(srcPtr1) + srcDescPtr->offsetInBytes, + static_cast(srcPtr2) + srcDescPtr->offsetInBytes, + srcDescPtr, + static_cast(dstPtr) + dstDescPtr->offsetInBytes, + dstDescPtr, + roiTensorPtrSrc, + roiType, + rpp::deref(rppHandle)); + } + else if ((srcDescPtr->dataType == RpptDataType::F16) && (dstDescPtr->dataType == RpptDataType::F16)) + { + hip_exec_exclusive_or_tensor(reinterpret_cast(static_cast(srcPtr1) + srcDescPtr->offsetInBytes), + reinterpret_cast(static_cast(srcPtr2) + srcDescPtr->offsetInBytes), + srcDescPtr, + reinterpret_cast(static_cast(dstPtr) + dstDescPtr->offsetInBytes), + dstDescPtr, + roiTensorPtrSrc, + roiType, + rpp::deref(rppHandle)); + } + else if ((srcDescPtr->dataType == RpptDataType::F32) && (dstDescPtr->dataType == RpptDataType::F32)) + { + hip_exec_exclusive_or_tensor(reinterpret_cast(static_cast(srcPtr1) + srcDescPtr->offsetInBytes), + reinterpret_cast(static_cast(srcPtr2) + srcDescPtr->offsetInBytes), + srcDescPtr, + reinterpret_cast(static_cast(dstPtr) + dstDescPtr->offsetInBytes), + dstDescPtr, + roiTensorPtrSrc, + roiType, + rpp::deref(rppHandle)); + } + else if ((srcDescPtr->dataType == RpptDataType::I8) && (dstDescPtr->dataType == RpptDataType::I8)) + { + hip_exec_exclusive_or_tensor(static_cast(srcPtr1) + srcDescPtr->offsetInBytes, + static_cast(srcPtr2) + srcDescPtr->offsetInBytes, + srcDescPtr, + static_cast(dstPtr) + dstDescPtr->offsetInBytes, + dstDescPtr, + roiTensorPtrSrc, + roiType, + rpp::deref(rppHandle)); + } + + return RPP_SUCCESS; +#elif defined(OCL_COMPILE) + return RPP_ERROR_NOT_IMPLEMENTED; +#endif // backend +} + /******************** bitwise OR ********************/ RppStatus rppt_bitwise_or_gpu(RppPtr_t srcPtr1, diff --git a/utilities/test_suite/HIP/Tensor_image_hip.cpp b/utilities/test_suite/HIP/Tensor_image_hip.cpp index 1f50908ba..997f3d2dc 100644 --- a/utilities/test_suite/HIP/Tensor_image_hip.cpp +++ b/utilities/test_suite/HIP/Tensor_image_hip.cpp @@ -62,7 +62,7 @@ int main(int argc, char **argv) bool additionalParamCase = (testCase == 8 || testCase == 21 || testCase == 23|| testCase == 24 || testCase == 28 || testCase == 40 || testCase == 41 || testCase == 49 || testCase == 54 || testCase == 79); bool kernelSizeCase = (testCase == 40 || testCase == 41 || testCase == 49 || testCase == 54); - bool dualInputCase = (testCase == 2 || testCase == 30 || testCase == 33 || testCase == 61 || testCase == 63 || testCase == 65 || testCase == 68); + bool dualInputCase = (testCase == 2 || testCase == 30 || testCase == 33 || testCase == 61 || testCase == 63 || testCase == 65 || testCase == 67 || testCase == 68); bool randomOutputCase = (testCase == 6 || testCase == 8 || testCase == 10 || testCase == 11 || testCase == 84 || testCase == 49 || testCase == 54); bool nonQACase = (testCase == 24 || testCase == 28 || testCase == 54); bool interpolationTypeCase = (testCase == 21 || testCase == 23 || testCase == 24|| testCase == 28 || testCase == 79); @@ -1305,6 +1305,18 @@ int main(int argc, char **argv) break; } + case 67: + { + testCaseName = "exclusive_or"; + + startWallTime = omp_get_wtime(); + if (inputBitDepth == 0 || inputBitDepth == 1 || inputBitDepth == 2 || inputBitDepth == 5) + rppt_exclusive_or_gpu(d_input, d_input_second, srcDescPtr, d_output, dstDescPtr, roiTensorPtrSrc, roiTypeSrc, handle); + else + missingFuncFlag = 1; + + break; + } case 68: { testCaseName = "bitwise_or"; diff --git a/utilities/test_suite/HIP/runImageTests.py b/utilities/test_suite/HIP/runImageTests.py index 0e15f1eac..e8e3b0b26 100644 
--- a/utilities/test_suite/HIP/runImageTests.py +++ b/utilities/test_suite/HIP/runImageTests.py @@ -272,7 +272,7 @@ def rpp_test_suite_parser_and_validator(): subprocess.call(["make", "-j16"], cwd=".") # nosec # List of cases supported -supportedCaseList = ['0', '1', '2', '4', '5', '6', '8', '10', '11', '13', '15', '20', '21', '23', '24', '26', '28', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '45', '46', '49', '54', '61', '63', '65', '68', '70', '79', '80', '82', '83', '84', '85', '86', '87', '88', '89', '90', '91', '92'] +supportedCaseList = ['0', '1', '2', '4', '5', '6', '8', '10', '11', '13', '15', '20', '21', '23', '24', '26', '28', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '45', '46', '49', '54', '61', '63', '65', '67', '68', '70', '79', '80', '82', '83', '84', '85', '86', '87', '88', '89', '90', '91', '92'] # Create folders based on testType and profilingOption if testType == 1 and profilingOption == "YES": @@ -329,7 +329,7 @@ def rpp_test_suite_parser_and_validator(): run_performance_test(loggingFolder, logFileLayout, srcPath1, srcPath2, dstPath, case, numRuns, testType, layout, qaMode, decoderType, batchSize, roiList) elif (testType == 1 and profilingOption == "YES"): - NEW_FUNC_GROUP_LIST = [0, 15, 20, 29, 36, 40, 42, 49, 56, 65, 69] + NEW_FUNC_GROUP_LIST = [0, 15, 20, 29, 36, 40, 42, 49, 56, 65, 67, 69] noCaseSupported = all(case not in supportedCaseList for case in caseList) if noCaseSupported: diff --git a/utilities/test_suite/HOST/Tensor_image_host.cpp b/utilities/test_suite/HOST/Tensor_image_host.cpp index 8914e576c..bc4857c9e 100644 --- a/utilities/test_suite/HOST/Tensor_image_host.cpp +++ b/utilities/test_suite/HOST/Tensor_image_host.cpp @@ -62,7 +62,7 @@ int main(int argc, char **argv) bool additionalParamCase = (testCase == 8 || testCase == 21 || testCase == 23 || testCase == 24 || testCase == 28 || testCase == 49 || testCase ==54 || testCase == 79); bool kernelSizeCase = (testCase == 49 || testCase == 54); - bool dualInputCase = (testCase == 2 || testCase == 30 || testCase == 33 || testCase == 61 || testCase == 63 || testCase == 65 || testCase == 68); + bool dualInputCase = (testCase == 2 || testCase == 30 || testCase == 33 || testCase == 61 || testCase == 63 || testCase == 65 || testCase == 67 || testCase == 68); bool randomOutputCase = (testCase == 6 || testCase == 8 || testCase == 10 || testCase == 84); bool nonQACase = (testCase == 24); bool interpolationTypeCase = (testCase == 21 || testCase == 23 || testCase == 24 || testCase == 79); @@ -1285,6 +1285,19 @@ int main(int argc, char **argv) break; } + case 67: + { + testCaseName = "exclusive_or"; + + startWallTime = omp_get_wtime(); + startCpuTime = clock(); + if (inputBitDepth == 0 || inputBitDepth == 1 || inputBitDepth == 2 || inputBitDepth == 5) + rppt_exclusive_or_host(input, input_second, srcDescPtr, output, dstDescPtr, roiTensorPtrSrc, roiTypeSrc, handle); + else + missingFuncFlag = 1; + + break; + } case 68: { testCaseName = "bitwise_or"; diff --git a/utilities/test_suite/HOST/runImageTests.py b/utilities/test_suite/HOST/runImageTests.py index b50f20819..ff39a7db9 100644 --- a/utilities/test_suite/HOST/runImageTests.py +++ b/utilities/test_suite/HOST/runImageTests.py @@ -260,7 +260,7 @@ def rpp_test_suite_parser_and_validator(): subprocess.call(["make", "-j16"], cwd=".") # nosec # List of cases supported -supportedCaseList = ['0', '1', '2', '4', '5', '6', '8', '10', '11', '13', '15', '20', '21', '23', '24', '26', '28', '29', '30', '31', '32', '33', '34', '35', 
'36', '37', '38', '39', '45', '46', '49', '54', '61', '63', '65', '68', '70', '79', '80', '81', '82', '83', '84', '85', '86', '87', '88', '89', '90', '91', '92'] +supportedCaseList = ['0', '1', '2', '4', '5', '6', '8', '10', '11', '13', '15', '20', '21', '23', '24', '26', '28', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '45', '46', '49', '54', '61', '63', '65', '67', '68', '70', '79', '80', '81', '82', '83', '84', '85', '86', '87', '88', '89', '90', '91', '92'] if testType == 0: noCaseSupported = all(case not in supportedCaseList for case in caseList) diff --git a/utilities/test_suite/REFERENCE_OUTPUT/exclusive_or/exclusive_or_u8_Tensor.bin b/utilities/test_suite/REFERENCE_OUTPUT/exclusive_or/exclusive_or_u8_Tensor.bin new file mode 100644 index 000000000..1c6464a29 Binary files /dev/null and b/utilities/test_suite/REFERENCE_OUTPUT/exclusive_or/exclusive_or_u8_Tensor.bin differ diff --git a/utilities/test_suite/common.py b/utilities/test_suite/common.py index ab896c191..691999bfd 100644 --- a/utilities/test_suite/common.py +++ b/utilities/test_suite/common.py @@ -71,6 +71,7 @@ 61: ["magnitude", "HOST", "HIP"], 63: ["phase", "HOST", "HIP"], 65: ["bitwise_and", "HOST", "HIP"], + 67: ["exclusive_or", "HOST", "HIP"], 68: ["bitwise_or", "HOST", "HIP"], 70: ["copy", "HOST", "HIP"], 79: ["remap", "HOST", "HIP"], @@ -122,7 +123,7 @@ "geometric_augmentations" : [20, 21, 23, 24, 26, 28, 33, 37, 38, 39, 63, 79, 80, 92], "filter_augmentations" : [49, 54], "arithmetic_operations" : [61], - "logical_operations" : [65, 68], + "logical_operations" : [65, 67, 68], "data_exchange_operations" : [70, 85, 86], "statistical_operations" : [15, 87, 88, 89, 90, 91] } diff --git a/utilities/test_suite/rpp_test_suite_image.h b/utilities/test_suite/rpp_test_suite_image.h index 48d0dd778..6ae9dfa5d 100644 --- a/utilities/test_suite/rpp_test_suite_image.h +++ b/utilities/test_suite/rpp_test_suite_image.h @@ -99,6 +99,7 @@ std::map augmentationMap = {61, "magnitude"}, {63, "phase"}, {65, "bitwise_and"}, + {67, "exclusive_or"}, {68, "bitwise_or"}, {70, "copy"}, {79, "remap"},
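
For reference, a minimal host-side usage sketch of the new rppt_exclusive_or_host entry point. Buffer names, image sizes, thread count, and the handle-creation arguments are illustrative assumptions; the descriptor, ROI, and call shape follow the U8/NHWC pattern exercised by test case 67.

    #include <vector>
    #include "rpp.h"

    int main()
    {
        Rpp32u batchSize = 2, height = 150, width = 150, channels = 3;    // assumed sizes

        // One NHWC / U8 descriptor shared by both sources and the destination.
        RpptDesc desc;
        desc.dataType = RpptDataType::U8;
        desc.layout = RpptLayout::NHWC;
        desc.numDims = 4;
        desc.offsetInBytes = 0;
        desc.n = batchSize; desc.h = height; desc.w = width; desc.c = channels;
        desc.strides.nStride = channels * width * height;
        desc.strides.hStride = channels * width;
        desc.strides.wStride = channels;
        desc.strides.cStride = 1;
        RpptDescPtr srcDescPtr = &desc, dstDescPtr = &desc;

        // Two input batches and one output batch in HOST memory.
        std::vector<Rpp8u> src1(batchSize * desc.strides.nStride, 170);
        std::vector<Rpp8u> src2(batchSize * desc.strides.nStride, 85);
        std::vector<Rpp8u> dst(batchSize * desc.strides.nStride, 0);

        // Full-image XYWH ROIs, one per image in the batch.
        std::vector<RpptROI> rois(batchSize);
        for (Rpp32u i = 0; i < batchSize; i++)
            rois[i].xywhROI = {{0, 0}, (Rpp32s)width, (Rpp32s)height};

        rppHandle_t handle;
        rppCreateWithBatchSize(&handle, batchSize, 8);    // assumed signature (batch size + thread count); check your RPP version

        rppt_exclusive_or_host(src1.data(), src2.data(), srcDescPtr,
                               dst.data(), dstDescPtr,
                               rois.data(), RpptRoiType::XYWH, handle);    // every output pixel here is 170 ^ 85 == 255

        rppDestroyHost(handle);
        return 0;
    }

rppt_exclusive_or_gpu takes the same argument list but expects HIP device buffers (d_input, d_input_second, d_output in Tensor_image_hip.cpp) and returns RPP_ERROR_NOT_IMPLEMENTED on the OCL backend.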