From 0a09a27e010a20b05fb0d05fc4adbc6957601b5a Mon Sep 17 00:00:00 2001 From: Srihari-mcw Date: Mon, 21 Oct 2024 22:15:26 +0530 Subject: [PATCH 01/38] Update the initial CODE for HIP Implementation of Exclusive OR --- include/rppt_tensor_logical_operations.h | 23 +++++++ src/include/hip/rpp_hip_common.hpp | 12 ++++ .../hip/hip_tensor_logical_operations.hpp | 1 + .../rppt_tensor_logical_operations.cpp | 64 +++++++++++++++++++ utilities/test_suite/HIP/Tensor_hip.cpp | 14 +++- utilities/test_suite/HIP/runTests.py | 4 +- utilities/test_suite/common.py | 3 +- 7 files changed, 117 insertions(+), 4 deletions(-) diff --git a/include/rppt_tensor_logical_operations.h b/include/rppt_tensor_logical_operations.h index 28dff69ce..e25bb333b 100644 --- a/include/rppt_tensor_logical_operations.h +++ b/include/rppt_tensor_logical_operations.h @@ -86,6 +86,29 @@ RppStatus rppt_bitwise_and_host(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPtr RppStatus rppt_bitwise_and_gpu(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle); #endif // GPU_SUPPORT +#ifdef GPU_SUPPORT +/*! \brief Exclusive OR computation on HIP backend for a NCHW/NHWC layout tensor + * \details This function computes bitwise exclusive OR (XOR) of corresponding pixels for a batch of RGB(3 channel) / greyscale(1 channel) images with an NHWC/NCHW tensor layout.
+ * srcPtr depth ranges - Rpp8u (0 to 255), Rpp16f (0 to 1), Rpp32f (0 to 1), Rpp8s (-128 to 127). + * dstPtr depth ranges - Will be same depth as srcPtr. + * \image html img150x150.png Sample Input1 + * \image html img150x150_2.png Sample Input2 + * \image html logical_operations_bitwise_and_img150x150.png Sample Output + * \param [in] srcPtr1 source1 tensor in HIP memory + * \param [in] srcPtr2 source2 tensor in HIP memory + * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) + * \param [out] dstPtr destination tensor in HIP memory + * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) + * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) + * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() + * \return A \ref RppStatus enumeration. + * \retval RPP_SUCCESS Successful completion. + * \retval RPP_ERROR* Unsuccessful completion. + */ +RppStatus rppt_exclusive_or_gpu(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle); +#endif // GPU_SUPPORT + /*! \brief Bitwise OR computation on HOST backend for a NCHW/NHWC layout tensor * \details This function computes bitwise OR of corresponding pixels for a batch of RGB(3 channel) / greyscale(1 channel) images with an NHWC/NCHW tensor layout.
* srcPtr depth ranges - Rpp8u (0 to 255), Rpp16f (0 to 1), Rpp32f (0 to 1), Rpp8s (-128 to 127). diff --git a/src/include/hip/rpp_hip_common.hpp b/src/include/hip/rpp_hip_common.hpp index 65ec4e06c..1bdf362a3 100644 --- a/src/include/hip/rpp_hip_common.hpp +++ b/src/include/hip/rpp_hip_common.hpp @@ -1826,6 +1826,18 @@ __device__ __forceinline__ void rpp_hip_math_bitwiseOr8(d_float8 *src1_f8, d_flo dst_f8->f1[7] = (float)((uchar)(src1_f8->f1[7]) | (uchar)(src2_f8->f1[7])); } +__device__ __forceinline__ void rpp_hip_math_exclusiveOr8(d_float8 *src1_f8, d_float8 *src2_f8, d_float8 *dst_f8) +{ + dst_f8->f1[0] = (float)((uchar)(src1_f8->f1[0]) ^ (uchar)(src2_f8->f1[0])); + dst_f8->f1[1] = (float)((uchar)(src1_f8->f1[1]) ^ (uchar)(src2_f8->f1[1])); + dst_f8->f1[2] = (float)((uchar)(src1_f8->f1[2]) ^ (uchar)(src2_f8->f1[2])); + dst_f8->f1[3] = (float)((uchar)(src1_f8->f1[3]) ^ (uchar)(src2_f8->f1[3])); + dst_f8->f1[4] = (float)((uchar)(src1_f8->f1[4]) ^ (uchar)(src2_f8->f1[4])); + dst_f8->f1[5] = (float)((uchar)(src1_f8->f1[5]) ^ (uchar)(src2_f8->f1[5])); + dst_f8->f1[6] = (float)((uchar)(src1_f8->f1[6]) ^ (uchar)(src2_f8->f1[6])); + dst_f8->f1[7] = (float)((uchar)(src1_f8->f1[7]) ^ (uchar)(src2_f8->f1[7])); +} + __device__ __forceinline__ float rpp_hip_math_inverse_sqrt1(float x) { float xHalf = 0.5f * x; diff --git a/src/modules/hip/hip_tensor_logical_operations.hpp b/src/modules/hip/hip_tensor_logical_operations.hpp index 636789246..946dfe0f5 100644 --- a/src/modules/hip/hip_tensor_logical_operations.hpp +++ b/src/modules/hip/hip_tensor_logical_operations.hpp @@ -26,6 +26,7 @@ SOFTWARE. 
#define HIP_TENSOR_LOGICAL_OPERATIONS_HPP #include "kernel/bitwise_and.hpp" +#include "kernel/exclusive_or.hpp" #include "kernel/bitwise_or.hpp" #endif // HIP_TENSOR_LOGICAL_OPERATIONS_HPP \ No newline at end of file diff --git a/src/modules/rppt_tensor_logical_operations.cpp b/src/modules/rppt_tensor_logical_operations.cpp index 7d28fe96b..84cf49b73 100644 --- a/src/modules/rppt_tensor_logical_operations.cpp +++ b/src/modules/rppt_tensor_logical_operations.cpp @@ -233,6 +233,70 @@ RppStatus rppt_bitwise_and_gpu(RppPtr_t srcPtr1, #endif // backend } +/******************** exclusive XOR ********************/ + +RppStatus rppt_exclusive_or_gpu(RppPtr_t srcPtr1, + RppPtr_t srcPtr2, + RpptDescPtr srcDescPtr, + RppPtr_t dstPtr, + RpptDescPtr dstDescPtr, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + rppHandle_t rppHandle) +{ +#ifdef HIP_COMPILE + + if ((srcDescPtr->dataType == RpptDataType::U8) && (dstDescPtr->dataType == RpptDataType::U8)) + { + hip_exec_exclusive_or_tensor(static_cast(srcPtr1) + srcDescPtr->offsetInBytes, + static_cast(srcPtr2) + srcDescPtr->offsetInBytes, + srcDescPtr, + static_cast(dstPtr) + dstDescPtr->offsetInBytes, + dstDescPtr, + roiTensorPtrSrc, + roiType, + rpp::deref(rppHandle)); + } + else if ((srcDescPtr->dataType == RpptDataType::F16) && (dstDescPtr->dataType == RpptDataType::F16)) + { + hip_exec_exclusive_or_tensor(reinterpret_cast(static_cast(srcPtr1) + srcDescPtr->offsetInBytes), + reinterpret_cast(static_cast(srcPtr2) + srcDescPtr->offsetInBytes), + srcDescPtr, + reinterpret_cast(static_cast(dstPtr) + dstDescPtr->offsetInBytes), + dstDescPtr, + roiTensorPtrSrc, + roiType, + rpp::deref(rppHandle)); + } + else if ((srcDescPtr->dataType == RpptDataType::F32) && (dstDescPtr->dataType == RpptDataType::F32)) + { + hip_exec_exclusive_or_tensor(reinterpret_cast(static_cast(srcPtr1) + srcDescPtr->offsetInBytes), + reinterpret_cast(static_cast(srcPtr2) + srcDescPtr->offsetInBytes), + srcDescPtr, + reinterpret_cast(static_cast(dstPtr) + 
dstDescPtr->offsetInBytes), + dstDescPtr, + roiTensorPtrSrc, + roiType, + rpp::deref(rppHandle)); + } + else if ((srcDescPtr->dataType == RpptDataType::I8) && (dstDescPtr->dataType == RpptDataType::I8)) + { + hip_exec_exclusive_or_tensor(static_cast(srcPtr1) + srcDescPtr->offsetInBytes, + static_cast(srcPtr2) + srcDescPtr->offsetInBytes, + srcDescPtr, + static_cast(dstPtr) + dstDescPtr->offsetInBytes, + dstDescPtr, + roiTensorPtrSrc, + roiType, + rpp::deref(rppHandle)); + } + + return RPP_SUCCESS; +#elif defined(OCL_COMPILE) + return RPP_ERROR_NOT_IMPLEMENTED; +#endif // backend +} + /******************** bitwise OR ********************/ RppStatus rppt_bitwise_or_gpu(RppPtr_t srcPtr1, diff --git a/utilities/test_suite/HIP/Tensor_hip.cpp b/utilities/test_suite/HIP/Tensor_hip.cpp index 84097e675..a95c6f9d6 100644 --- a/utilities/test_suite/HIP/Tensor_hip.cpp +++ b/utilities/test_suite/HIP/Tensor_hip.cpp @@ -65,7 +65,7 @@ int main(int argc, char **argv) bool additionalParamCase = (testCase == 8 || testCase == 21 || testCase == 23|| testCase == 24 || testCase == 40 || testCase == 41 || testCase == 49 || testCase == 54 || testCase == 79); bool kernelSizeCase = (testCase == 40 || testCase == 41 || testCase == 49 || testCase == 54); - bool dualInputCase = (testCase == 2 || testCase == 30 || testCase == 33 || testCase == 61 || testCase == 63 || testCase == 65 || testCase == 68); + bool dualInputCase = (testCase == 2 || testCase == 30 || testCase == 33 || testCase == 61 || testCase == 63 || testCase == 65 || testCase == 67 || testCase == 68); bool randomOutputCase = (testCase == 6 || testCase == 8 || testCase == 84 || testCase == 49 || testCase == 54); bool nonQACase = (testCase == 24); bool interpolationTypeCase = (testCase == 21 || testCase == 23 || testCase == 24 || testCase == 79); @@ -1186,6 +1186,18 @@ int main(int argc, char **argv) break; } + case 67: + { + testCaseName = "exclusive_or"; + + startWallTime = omp_get_wtime(); + if (inputBitDepth == 0 || inputBitDepth 
== 1 || inputBitDepth == 2 || inputBitDepth == 5) + rppt_exclusive_or_gpu(d_input, d_input_second, srcDescPtr, d_output, dstDescPtr, roiTensorPtrSrc, roiTypeSrc, handle); + else + missingFuncFlag = 1; + + break; + } case 68: { testCaseName = "bitwise_or"; diff --git a/utilities/test_suite/HIP/runTests.py b/utilities/test_suite/HIP/runTests.py index a5fcc0d91..402576189 100644 --- a/utilities/test_suite/HIP/runTests.py +++ b/utilities/test_suite/HIP/runTests.py @@ -272,7 +272,7 @@ def rpp_test_suite_parser_and_validator(): subprocess.call(["make", "-j16"], cwd=".") # nosec # List of cases supported -supportedCaseList = ['0', '1', '2', '4', '5', '6', '8', '13', '20', '21', '23', '26', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '45', '46', '54', '61', '63', '65', '68', '70', '79', '80', '82', '83', '84', '85', '86', '87', '88', '89', '90', '91', '92'] +supportedCaseList = ['0', '1', '2', '4', '5', '6', '8', '13', '20', '21', '23', '26', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '45', '46', '54', '61', '63', '65', '67', '68', '70', '79', '80', '82', '83', '84', '85', '86', '87', '88', '89', '90', '91', '92'] # Create folders based on testType and profilingOption if testType == 1 and profilingOption == "YES": @@ -329,7 +329,7 @@ def rpp_test_suite_parser_and_validator(): run_performance_test(loggingFolder, logFileLayout, srcPath1, srcPath2, dstPath, case, numRuns, testType, layout, qaMode, decoderType, batchSize, roiList) elif (testType == 1 and profilingOption == "YES"): - NEW_FUNC_GROUP_LIST = [0, 15, 20, 29, 36, 40, 42, 49, 56, 65, 69] + NEW_FUNC_GROUP_LIST = [0, 15, 20, 29, 36, 40, 42, 49, 56, 65, 67, 69] noCaseSupported = all(case not in supportedCaseList for case in caseList) if noCaseSupported: diff --git a/utilities/test_suite/common.py b/utilities/test_suite/common.py index 68f97d219..37afe90be 100644 --- a/utilities/test_suite/common.py +++ b/utilities/test_suite/common.py @@ -67,6 +67,7 @@ 61: ["magnitude", 
"HOST", "HIP"], 63: ["phase", "HOST", "HIP"], 65: ["bitwise_and", "HOST", "HIP"], + 67: ["exclusive_or", "HOST", "HIP"], 68: ["bitwise_or", "HOST", "HIP"], 70: ["copy", "HOST", "HIP"], 79: ["remap", "HOST", "HIP"], @@ -118,7 +119,7 @@ "geometric_augmentations" : [20, 21, 23, 24, 26, 33, 37, 38, 39, 63, 79, 80, 92], "filter_augmentations" : [49, 54], "arithmetic_operations" : [61], - "logical_operations" : [65, 68], + "logical_operations" : [65, 67, 68], "data_exchange_operations" : [70, 85, 86], "statistical_operations" : [87, 88, 89, 90, 91] } From 4f5d6f0d9c2a31e49509e55ca7ae3c1cd5eff327 Mon Sep 17 00:00:00 2001 From: Srihari-mcw Date: Mon, 9 Sep 2024 19:48:39 +0530 Subject: [PATCH 02/38] Add exclusive_or.hpp hip file --- src/modules/hip/kernel/exclusive_or.hpp | 248 ++++++++++++++++++++++++ 1 file changed, 248 insertions(+) create mode 100644 src/modules/hip/kernel/exclusive_or.hpp diff --git a/src/modules/hip/kernel/exclusive_or.hpp b/src/modules/hip/kernel/exclusive_or.hpp new file mode 100644 index 000000000..2a17c912d --- /dev/null +++ b/src/modules/hip/kernel/exclusive_or.hpp @@ -0,0 +1,248 @@ +#include +#include "rpp_hip_common.hpp" + +/* Exclusive OR is a logical operation only on U8/I8 types. + For a Rpp32f precision image (pixel values from 0-1), the Exclusive OR is applied on a 0-255 + range-translated approximation, of the original 0-1 decimal-range image.
+ The bitwise operation is applied to the char representation of the raw floating-point data in memory */ + +template +__device__ void exclusive_or_hip_compute(T *srcPtr, d_float8 *src1_f8, d_float8 *src2_f8, d_float8 *dst_f8) +{ + if constexpr ((std::is_same::value) || (std::is_same::value)) + { + rpp_hip_math_multiply8_const(src1_f8, src1_f8, (float4)255); + rpp_hip_math_multiply8_const(src2_f8, src2_f8, (float4)255); + rpp_hip_math_exclusiveOr8(src1_f8, src2_f8, dst_f8); + rpp_hip_math_multiply8_const(dst_f8, dst_f8, (float4)ONE_OVER_255); + } + else if constexpr (std::is_same::value) + { + rpp_hip_math_add8_const(src1_f8, src1_f8, (float4)128); + rpp_hip_math_add8_const(src2_f8, src2_f8, (float4)128); + rpp_hip_math_exclusiveOr8(src1_f8, src2_f8, dst_f8); + rpp_hip_math_subtract8_const(dst_f8, dst_f8, (float4)128); + } + else + rpp_hip_math_exclusiveOr8(src1_f8, src2_f8, dst_f8); +} + +template +__global__ void exclusive_or_pkd_hip_tensor(T *srcPtr1, + T *srcPtr2, + uint2 srcStridesNH, + T *dstPtr, + uint2 dstStridesNH, + RpptROIPtr roiTensorPtrSrc) +{ + int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; + int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + + if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth)) + { + return; + } + + uint srcIdx = (id_z * srcStridesNH.x) + ((id_y + roiTensorPtrSrc[id_z].xywhROI.xy.y) * srcStridesNH.y) + (id_x + roiTensorPtrSrc[id_z].xywhROI.xy.x) * 3; + uint dstIdx = (id_z * dstStridesNH.x) + (id_y * dstStridesNH.y) + id_x * 3; + + d_float24 src1_f24, src2_f24, dst_f24; + + rpp_hip_load24_pkd3_and_unpack_to_float24_pln3(srcPtr1 + srcIdx, &src1_f24); + rpp_hip_load24_pkd3_and_unpack_to_float24_pln3(srcPtr2 + srcIdx, &src2_f24); + exclusive_or_hip_compute(srcPtr1, &src1_f24.f8[0], &src2_f24.f8[0], &dst_f24.f8[0]); + exclusive_or_hip_compute(srcPtr1, &src1_f24.f8[1], &src2_f24.f8[1], 
&dst_f24.f8[1]); + exclusive_or_hip_compute(srcPtr1, &src1_f24.f8[2], &src2_f24.f8[2], &dst_f24.f8[2]); + rpp_hip_pack_float24_pln3_and_store24_pkd3(dstPtr + dstIdx, &dst_f24); +} + +template +__global__ void exclusive_or_pln_hip_tensor(T *srcPtr1, + T *srcPtr2, + uint3 srcStridesNCH, + T *dstPtr, + uint3 dstStridesNCH, + int channelsDst, + RpptROIPtr roiTensorPtrSrc) +{ + int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; + int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + + if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth)) + { + return; + } + + uint srcIdx = (id_z * srcStridesNCH.x) + ((id_y + roiTensorPtrSrc[id_z].xywhROI.xy.y) * srcStridesNCH.z) + (id_x + roiTensorPtrSrc[id_z].xywhROI.xy.x); + uint dstIdx = (id_z * dstStridesNCH.x) + (id_y * dstStridesNCH.z) + id_x; + + d_float8 src1_f8, src2_f8, dst_f8; + + rpp_hip_load8_and_unpack_to_float8(srcPtr1 + srcIdx, &src1_f8); + rpp_hip_load8_and_unpack_to_float8(srcPtr2 + srcIdx, &src2_f8); + exclusive_or_hip_compute(srcPtr1, &src1_f8, &src2_f8, &dst_f8); + rpp_hip_pack_float8_and_store8(dstPtr + dstIdx, &dst_f8); + + if (channelsDst == 3) + { + srcIdx += srcStridesNCH.y; + dstIdx += dstStridesNCH.y; + + rpp_hip_load8_and_unpack_to_float8(srcPtr1 + srcIdx, &src1_f8); + rpp_hip_load8_and_unpack_to_float8(srcPtr2 + srcIdx, &src2_f8); + exclusive_or_hip_compute(srcPtr1, &src1_f8, &src2_f8, &dst_f8); + rpp_hip_pack_float8_and_store8(dstPtr + dstIdx, &dst_f8); + + srcIdx += srcStridesNCH.y; + dstIdx += dstStridesNCH.y; + + rpp_hip_load8_and_unpack_to_float8(srcPtr1 + srcIdx, &src1_f8); + rpp_hip_load8_and_unpack_to_float8(srcPtr2 + srcIdx, &src2_f8); + exclusive_or_hip_compute(srcPtr1, &src1_f8, &src2_f8, &dst_f8); + rpp_hip_pack_float8_and_store8(dstPtr + dstIdx, &dst_f8); + } +} + +template +__global__ void exclusive_or_pkd3_pln3_hip_tensor(T *srcPtr1, + T *srcPtr2, + uint2 
srcStridesNH, + T *dstPtr, + uint3 dstStridesNCH, + RpptROIPtr roiTensorPtrSrc) +{ + int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; + int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + + if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth)) + { + return; + } + + uint srcIdx = (id_z * srcStridesNH.x) + ((id_y + roiTensorPtrSrc[id_z].xywhROI.xy.y) * srcStridesNH.y) + ((id_x + roiTensorPtrSrc[id_z].xywhROI.xy.x) * 3); + uint dstIdx = (id_z * dstStridesNCH.x) + (id_y * dstStridesNCH.z) + id_x; + + d_float24 src1_f24, src2_f24, dst_f24; + + rpp_hip_load24_pkd3_and_unpack_to_float24_pln3(srcPtr1 + srcIdx, &src1_f24); + rpp_hip_load24_pkd3_and_unpack_to_float24_pln3(srcPtr2 + srcIdx, &src2_f24); + exclusive_or_hip_compute(srcPtr1, &src1_f24.f8[0], &src2_f24.f8[0], &dst_f24.f8[0]); + exclusive_or_hip_compute(srcPtr1, &src1_f24.f8[1], &src2_f24.f8[1], &dst_f24.f8[1]); + exclusive_or_hip_compute(srcPtr1, &src1_f24.f8[2], &src2_f24.f8[2], &dst_f24.f8[2]); + rpp_hip_pack_float24_pln3_and_store24_pln3(dstPtr + dstIdx, dstStridesNCH.y, &dst_f24); +} + +template +__global__ void exclusive_or_pln3_pkd3_hip_tensor(T *srcPtr1, + T *srcPtr2, + uint3 srcStridesNCH, + T *dstPtr, + uint2 dstStridesNH, + RpptROIPtr roiTensorPtrSrc) +{ + int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; + int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + + if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth)) + { + return; + } + + uint srcIdx = (id_z * srcStridesNCH.x) + ((id_y + roiTensorPtrSrc[id_z].xywhROI.xy.y) * srcStridesNCH.z) + (id_x + roiTensorPtrSrc[id_z].xywhROI.xy.x); + uint dstIdx = (id_z * dstStridesNH.x) + (id_y * dstStridesNH.y) + id_x * 3; + + d_float24 src1_f24, src2_f24, dst_f24; + + 
rpp_hip_load24_pln3_and_unpack_to_float24_pkd3(srcPtr1 + srcIdx, srcStridesNCH.y, &src1_f24); + rpp_hip_load24_pln3_and_unpack_to_float24_pkd3(srcPtr2 + srcIdx, srcStridesNCH.y, &src2_f24); + exclusive_or_hip_compute(srcPtr1, &src1_f24.f8[0], &src2_f24.f8[0], &dst_f24.f8[0]); + exclusive_or_hip_compute(srcPtr1, &src1_f24.f8[1], &src2_f24.f8[1], &dst_f24.f8[1]); + exclusive_or_hip_compute(srcPtr1, &src1_f24.f8[2], &src2_f24.f8[2], &dst_f24.f8[2]); + rpp_hip_pack_float24_pkd3_and_store24_pkd3(dstPtr + dstIdx, &dst_f24); +} + +template +RppStatus hip_exec_exclusive_or_tensor(T *srcPtr1, + T *srcPtr2, + RpptDescPtr srcDescPtr, + T *dstPtr, + RpptDescPtr dstDescPtr, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + rpp::Handle& handle) +{ + if (roiType == RpptRoiType::LTRB) + hip_exec_roi_converison_ltrb_to_xywh(roiTensorPtrSrc, handle); + + int globalThreads_x = (dstDescPtr->w + 7) >> 3; + int globalThreads_y = dstDescPtr->h; + int globalThreads_z = handle.GetBatchSize(); + + if ((srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NHWC)) + { + hipLaunchKernelGGL(exclusive_or_pkd_hip_tensor, + dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + srcPtr1, + srcPtr2, + make_uint2(srcDescPtr->strides.nStride, srcDescPtr->strides.hStride), + dstPtr, + make_uint2(dstDescPtr->strides.nStride, dstDescPtr->strides.hStride), + roiTensorPtrSrc); + } + else if ((srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NCHW)) + { + hipLaunchKernelGGL(exclusive_or_pln_hip_tensor, + dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + srcPtr1, + srcPtr2, + 
make_uint3(srcDescPtr->strides.nStride, srcDescPtr->strides.cStride, srcDescPtr->strides.hStride), + dstPtr, + make_uint3(dstDescPtr->strides.nStride, dstDescPtr->strides.cStride, dstDescPtr->strides.hStride), + dstDescPtr->c, + roiTensorPtrSrc); + } + else if ((srcDescPtr->c == 3) && (dstDescPtr->c == 3)) + { + if ((srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NCHW)) + { + hipLaunchKernelGGL(exclusive_or_pkd3_pln3_hip_tensor, + dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + srcPtr1, + srcPtr2, + make_uint2(srcDescPtr->strides.nStride, srcDescPtr->strides.hStride), + dstPtr, + make_uint3(dstDescPtr->strides.nStride, dstDescPtr->strides.cStride, dstDescPtr->strides.hStride), + roiTensorPtrSrc); + } + else if ((srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NHWC)) + { + globalThreads_x = (srcDescPtr->strides.hStride + 7) >> 3; + hipLaunchKernelGGL(exclusive_or_pln3_pkd3_hip_tensor, + dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + srcPtr1, + srcPtr2, + make_uint3(srcDescPtr->strides.nStride, srcDescPtr->strides.cStride, srcDescPtr->strides.hStride), + dstPtr, + make_uint2(dstDescPtr->strides.nStride, dstDescPtr->strides.hStride), + roiTensorPtrSrc); + } + } + + return RPP_SUCCESS; +} \ No newline at end of file From 885e808b9e8d1fb899be1b25eac60035927b3cb8 Mon Sep 17 00:00:00 2001 From: Srihari-mcw Date: Tue, 10 Sep 2024 11:49:58 +0530 Subject: [PATCH 03/38] Update the code for initial HOST Code --- include/rppt_tensor_logical_operations.h | 21 + .../cpu/host_tensor_logical_operations.hpp | 1 + src/modules/cpu/kernel/exclusive_or.hpp | 526 
++++++++++++++++++ .../rppt_tensor_logical_operations.cpp | 41 ++ utilities/test_suite/HOST/Tensor_host.cpp | 15 +- utilities/test_suite/HOST/runTests.py | 2 +- 6 files changed, 604 insertions(+), 2 deletions(-) create mode 100644 src/modules/cpu/kernel/exclusive_or.hpp diff --git a/include/rppt_tensor_logical_operations.h b/include/rppt_tensor_logical_operations.h index e25bb333b..8ade92dfe 100644 --- a/include/rppt_tensor_logical_operations.h +++ b/include/rppt_tensor_logical_operations.h @@ -153,6 +153,27 @@ RppStatus rppt_bitwise_or_host(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPtr s RppStatus rppt_bitwise_or_gpu(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle); #endif // GPU_SUPPORT +/*! \brief Exclusive OR computation on HOST backend for a NCHW/NHWC layout tensor + * \details This function computes bitwise exclusive OR (XOR) of corresponding pixels for a batch of RGB(3 channel) / greyscale(1 channel) images with an NHWC/NCHW tensor layout.
+ * srcPtr depth ranges - Rpp8u (0 to 255), Rpp16f (0 to 1), Rpp32f (0 to 1), Rpp8s (-128 to 127). + * dstPtr depth ranges - Will be same depth as srcPtr. + * \image html img150x150.png Sample Input1 + * \image html img150x150_2.png Sample Input2 + * \image html logical_operations_bitwise_and_img150x150.png Sample Output + * \param [in] srcPtr1 source1 tensor in HOST memory + * \param [in] srcPtr2 source2 tensor in HOST memory + * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) + * \param [out] dstPtr destination tensor in HOST memory + * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) + * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) + * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() + * \return A \ref RppStatus enumeration. + * \retval RPP_SUCCESS Successful completion. + * \retval RPP_ERROR* Unsuccessful completion. + */ +RppStatus rppt_exclusive_or_host(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle); + /*! @} */ diff --git a/src/modules/cpu/host_tensor_logical_operations.hpp b/src/modules/cpu/host_tensor_logical_operations.hpp index 0fb3fe5eb..e984fced2 100644 --- a/src/modules/cpu/host_tensor_logical_operations.hpp +++ b/src/modules/cpu/host_tensor_logical_operations.hpp @@ -26,6 +26,7 @@ SOFTWARE. 
#define HOST_TENSOR_LOGICAL_OPERATIONS_HPP #include "kernel/bitwise_and.hpp" +#include "kernel/exclusive_or.hpp" #include "kernel/bitwise_or.hpp" #endif // HOST_TENSOR_LOGICAL_OPERATIONS_HPP \ No newline at end of file diff --git a/src/modules/cpu/kernel/exclusive_or.hpp b/src/modules/cpu/kernel/exclusive_or.hpp new file mode 100644 index 000000000..a740d5b3d --- /dev/null +++ b/src/modules/cpu/kernel/exclusive_or.hpp @@ -0,0 +1,526 @@ +/* +MIT License + +Copyright (c) 2019 - 2024 Advanced Micro Devices, Inc. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ + +#include "rppdefs.h" +#include "rpp_cpu_simd.hpp" +#include "rpp_cpu_common.hpp" + +/* Exclusive OR is a logical operation only on U8/I8 types. + For a Rpp32f precision image (pixel values from 0-1), the Exclusive OR is applied on a 0-255 + range-translated approximation, of the original 0-1 decimal-range image.
+ The bitwise operation is applied to the char representation of the raw floating-point data in memory */ +RppStatus exclusive_or_f32_f32_host_tensor(Rpp32f *srcPtr1, + Rpp32f *srcPtr2, + RpptDescPtr srcDescPtr, + Rpp32f *dstPtr, + RpptDescPtr dstDescPtr, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + RppLayoutParams layoutParams, + rpp::Handle& Handle) +{ + RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h}; + Rpp32u numThreads = Handle.GetNumThreads(); + + omp_set_dynamic(0); +#pragma omp parallel for num_threads(numThreads) + for(int batchCount = 0; batchCount < dstDescPtr->n; batchCount++) + { + RpptROI roi; + RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount]; + compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType); + + Rpp32f *srcPtr1Image, *srcPtr2Image, *dstPtrImage; + srcPtr1Image = srcPtr1 + batchCount * srcDescPtr->strides.nStride; + srcPtr2Image = srcPtr2 + batchCount * srcDescPtr->strides.nStride; + dstPtrImage = dstPtr + batchCount * dstDescPtr->strides.nStride; + + Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier; + + Rpp32f *srcPtr1Channel, *srcPtr2Channel, *dstPtrChannel; + srcPtr1Channel = srcPtr1Image + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier); + srcPtr2Channel = srcPtr2Image + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier); + dstPtrChannel = dstPtrImage; + +#if __AVX2__ + Rpp32u alignedLength = (bufferLength / 24) * 24; + Rpp32u vectorIncrement = 24; + Rpp32u vectorIncrementPerChannel = 8; +#endif + + // Exclusive OR with fused output-layout toggle (NHWC -> NCHW) + if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NCHW)) + { + Rpp32f *srcPtr1Row, *srcPtr2Row, *dstPtrRowR, *dstPtrRowG, *dstPtrRowB; + srcPtr1Row = srcPtr1Channel; + srcPtr2Row = srcPtr2Channel; + dstPtrRowR = dstPtrChannel; + dstPtrRowG = 
dstPtrRowR + dstDescPtr->strides.cStride; + dstPtrRowB = dstPtrRowG + dstDescPtr->strides.cStride; + + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp32f *srcPtr1Temp, *srcPtr2Temp, *dstPtrTempR, *dstPtrTempG, *dstPtrTempB; + srcPtr1Temp = srcPtr1Row; + srcPtr2Temp = srcPtr2Row; + dstPtrTempR = dstPtrRowR; + dstPtrTempG = dstPtrRowG; + dstPtrTempB = dstPtrRowB; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + __m256 p1[3], p2[3]; + + rpp_simd_load(rpp_load24_f32pkd3_to_f32pln3_avx, srcPtr1Temp, p1); // simd loads + rpp_simd_load(rpp_load24_f32pkd3_to_f32pln3_avx, srcPtr2Temp, p2); // simd loads + p1[0] = _mm256_cvtepi32_ps(_mm256_xor_si256(_mm256_cvttps_epi32(_mm256_mul_ps(p1[0], avx_p255)), _mm256_cvttps_epi32(_mm256_mul_ps(p2[0], avx_p255)))); // exclusive_or computation + p1[1] = _mm256_cvtepi32_ps(_mm256_xor_si256(_mm256_cvttps_epi32(_mm256_mul_ps(p1[1], avx_p255)), _mm256_cvttps_epi32(_mm256_mul_ps(p2[1], avx_p255)))); // exclusive_or computation + p1[2] = _mm256_cvtepi32_ps(_mm256_xor_si256(_mm256_cvttps_epi32(_mm256_mul_ps(p1[2], avx_p255)), _mm256_cvttps_epi32(_mm256_mul_ps(p2[2], avx_p255)))); // exclusive_or computation + p1[0] = _mm256_mul_ps(p1[0], avx_p1op255); + p1[1] = _mm256_mul_ps(p1[1], avx_p1op255); + p1[2] = _mm256_mul_ps(p1[2], avx_p1op255); + rpp_simd_store(rpp_store24_f32pln3_to_f32pln3_avx, dstPtrTempR, dstPtrTempG, dstPtrTempB, p1); // simd stores + + srcPtr1Temp += vectorIncrement; + srcPtr2Temp += vectorIncrement; + dstPtrTempR += vectorIncrementPerChannel; + dstPtrTempG += vectorIncrementPerChannel; + dstPtrTempB += vectorIncrementPerChannel; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount += 3) + { + *dstPtrTempR++ = RPPPIXELCHECKF32((float)((uint)(srcPtr1Temp[0] * 255) ^ (uint)(srcPtr2Temp[0] * 255)) / 255); + *dstPtrTempG++ = RPPPIXELCHECKF32((float)((uint)(srcPtr1Temp[1] * 255) ^ (uint)(srcPtr2Temp[1] * 255)) / 255); + 
*dstPtrTempB++ = RPPPIXELCHECKF32((float)((uint)(srcPtr1Temp[2] * 255) ^ (uint)(srcPtr2Temp[2] * 255)) / 255); + + srcPtr1Temp += 3; + srcPtr2Temp += 3; + } + + srcPtr1Row += srcDescPtr->strides.hStride; + srcPtr2Row += srcDescPtr->strides.hStride; + dstPtrRowR += dstDescPtr->strides.hStride; + dstPtrRowG += dstDescPtr->strides.hStride; + dstPtrRowB += dstDescPtr->strides.hStride; + } + } + + // Exclusive OR with fused output-layout toggle (NCHW -> NHWC) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NHWC)) + { + Rpp32f *srcPtr1RowR, *srcPtr1RowG, *srcPtr1RowB, *srcPtr2RowR, *srcPtr2RowG, *srcPtr2RowB, *dstPtrRow; + srcPtr1RowR = srcPtr1Channel; + srcPtr1RowG = srcPtr1RowR + srcDescPtr->strides.cStride; + srcPtr1RowB = srcPtr1RowG + srcDescPtr->strides.cStride; + srcPtr2RowR = srcPtr2Channel; + srcPtr2RowG = srcPtr2RowR + srcDescPtr->strides.cStride; + srcPtr2RowB = srcPtr2RowG + srcDescPtr->strides.cStride; + dstPtrRow = dstPtrChannel; + + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp32f *srcPtr1TempR, *srcPtr1TempG, *srcPtr1TempB, *srcPtr2TempR, *srcPtr2TempG, *srcPtr2TempB, *dstPtrTemp; + srcPtr1TempR = srcPtr1RowR; + srcPtr1TempG = srcPtr1RowG; + srcPtr1TempB = srcPtr1RowB; + srcPtr2TempR = srcPtr2RowR; + srcPtr2TempG = srcPtr2RowG; + srcPtr2TempB = srcPtr2RowB; + dstPtrTemp = dstPtrRow; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + __m256 p1[3], p2[3]; + + rpp_simd_load(rpp_load24_f32pln3_to_f32pln3_avx, srcPtr1TempR, srcPtr1TempG, srcPtr1TempB, p1); // simd loads + rpp_simd_load(rpp_load24_f32pln3_to_f32pln3_avx, srcPtr2TempR, srcPtr2TempG, srcPtr2TempB, p2); // simd loads + p1[0] = _mm256_cvtepi32_ps(_mm256_xor_si256(_mm256_cvttps_epi32(_mm256_mul_ps(p1[0], avx_p255)), _mm256_cvttps_epi32(_mm256_mul_ps(p2[0], avx_p255)))); // exclusive_or computation + p1[1] = 
_mm256_cvtepi32_ps(_mm256_xor_si256(_mm256_cvttps_epi32(_mm256_mul_ps(p1[1], avx_p255)), _mm256_cvttps_epi32(_mm256_mul_ps(p2[1], avx_p255)))); // exclusive_or computation + p1[2] = _mm256_cvtepi32_ps(_mm256_xor_si256(_mm256_cvttps_epi32(_mm256_mul_ps(p1[2], avx_p255)), _mm256_cvttps_epi32(_mm256_mul_ps(p2[2], avx_p255)))); // exclusive_or computation + p1[0] = _mm256_mul_ps(p1[0], avx_p1op255); + p1[1] = _mm256_mul_ps(p1[1], avx_p1op255); + p1[2] = _mm256_mul_ps(p1[2], avx_p1op255); + rpp_simd_store(rpp_store24_f32pln3_to_f32pkd3_avx, dstPtrTemp, p1); // simd stores + + srcPtr1TempR += vectorIncrementPerChannel; + srcPtr1TempG += vectorIncrementPerChannel; + srcPtr1TempB += vectorIncrementPerChannel; + srcPtr2TempR += vectorIncrementPerChannel; + srcPtr2TempG += vectorIncrementPerChannel; + srcPtr2TempB += vectorIncrementPerChannel; + dstPtrTemp += vectorIncrement; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + dstPtrTemp[0] = RPPPIXELCHECKF32((float)((uint)(*srcPtr1TempR * 255) ^ (uint)(*srcPtr2TempR * 255)) / 255); + dstPtrTemp[1] = RPPPIXELCHECKF32((float)((uint)(*srcPtr1TempG * 255) ^ (uint)(*srcPtr2TempG * 255)) / 255); + dstPtrTemp[2] = RPPPIXELCHECKF32((float)((uint)(*srcPtr1TempB * 255) ^ (uint)(*srcPtr2TempB * 255)) / 255); + + srcPtr1TempR++; + srcPtr1TempG++; + srcPtr1TempB++; + srcPtr2TempR++; + srcPtr2TempG++; + srcPtr2TempB++; + dstPtrTemp += 3; + } + + srcPtr1RowR += srcDescPtr->strides.hStride; + srcPtr1RowG += srcDescPtr->strides.hStride; + srcPtr1RowB += srcDescPtr->strides.hStride; + srcPtr2RowR += srcDescPtr->strides.hStride; + srcPtr2RowG += srcDescPtr->strides.hStride; + srcPtr2RowB += srcDescPtr->strides.hStride; + dstPtrRow += dstDescPtr->strides.hStride; + } + } + + // Exclusive OR without fused output-layout toggle (NHWC -> NHWC or NCHW -> NCHW) + else + { +#if __AVX2__ + alignedLength = bufferLength & ~7; +#endif + + for(int c = 0; c < layoutParams.channelParam; c++) + { + Rpp32f *srcPtr1Row, *srcPtr2Row, 
*dstPtrRow; + srcPtr1Row = srcPtr1Channel; + srcPtr2Row = srcPtr2Channel; + dstPtrRow = dstPtrChannel; + + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp32f *srcPtr1Temp, *srcPtr2Temp, *dstPtrTemp; + srcPtr1Temp = srcPtr1Row; + srcPtr2Temp = srcPtr2Row; + dstPtrTemp = dstPtrRow; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + __m256 p1[1], p2[1]; + + rpp_simd_load(rpp_load8_f32_to_f32_avx, srcPtr1Temp, p1); // simd loads + rpp_simd_load(rpp_load8_f32_to_f32_avx, srcPtr2Temp, p2); // simd loads + p1[0] = _mm256_cvtepi32_ps(_mm256_xor_si256(_mm256_cvttps_epi32(_mm256_mul_ps(p1[0], avx_p255)), _mm256_cvttps_epi32(_mm256_mul_ps(p2[0], avx_p255)))); // exclusive_or computation + p1[0] = _mm256_mul_ps(p1[0], avx_p1op255); + rpp_simd_store(rpp_store8_f32_to_f32_avx, dstPtrTemp, p1); // simd stores + + srcPtr1Temp += vectorIncrementPerChannel; + srcPtr2Temp += vectorIncrementPerChannel; + dstPtrTemp += vectorIncrementPerChannel; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + *dstPtrTemp++ = RPPPIXELCHECKF32((float)((uint)(*srcPtr1Temp * 255) ^ (uint)(*srcPtr2Temp * 255)) / 255); + + srcPtr1Temp++; + srcPtr2Temp++; + } + + srcPtr1Row += srcDescPtr->strides.hStride; + srcPtr2Row += srcDescPtr->strides.hStride; + dstPtrRow += dstDescPtr->strides.hStride; + } + + srcPtr1Channel += srcDescPtr->strides.cStride; + srcPtr2Channel += srcDescPtr->strides.cStride; + dstPtrChannel += dstDescPtr->strides.cStride; + } + } + } + + return RPP_SUCCESS; +} + +RppStatus exclusive_or_f16_f16_host_tensor(Rpp16f *srcPtr1, + Rpp16f *srcPtr2, + RpptDescPtr srcDescPtr, + Rpp16f *dstPtr, + RpptDescPtr dstDescPtr, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + RppLayoutParams layoutParams, + rpp::Handle& Handle) +{ + RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h}; + Rpp32u numThreads = Handle.GetNumThreads(); + + omp_set_dynamic(0); 
+#pragma omp parallel for num_threads(numThreads) + for(int batchCount = 0; batchCount < dstDescPtr->n; batchCount++) + { + RpptROI roi; + RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount]; + compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType); + + Rpp16f *srcPtr1Image, *srcPtr2Image, *dstPtrImage; + srcPtr1Image = srcPtr1 + batchCount * srcDescPtr->strides.nStride; + srcPtr2Image = srcPtr2 + batchCount * srcDescPtr->strides.nStride; + dstPtrImage = dstPtr + batchCount * dstDescPtr->strides.nStride; + + Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier; + + Rpp16f *srcPtr1Channel, *srcPtr2Channel, *dstPtrChannel; + srcPtr1Channel = srcPtr1Image + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier); + srcPtr2Channel = srcPtr2Image + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier); + dstPtrChannel = dstPtrImage; + +#if __AVX2__ + Rpp32u alignedLength = (bufferLength / 24) * 24; + Rpp32u vectorIncrement = 24; + Rpp32u vectorIncrementPerChannel = 8; +#endif + + // Exclusive OR with fused output-layout toggle (NHWC -> NCHW) + if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NCHW)) + { + Rpp16f *srcPtr1Row, *srcPtr2Row, *dstPtrRowR, *dstPtrRowG, *dstPtrRowB; + srcPtr1Row = srcPtr1Channel; + srcPtr2Row = srcPtr2Channel; + dstPtrRowR = dstPtrChannel; + dstPtrRowG = dstPtrRowR + dstDescPtr->strides.cStride; + dstPtrRowB = dstPtrRowG + dstDescPtr->strides.cStride; + + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp16f *srcPtr1Temp, *srcPtr2Temp, *dstPtrTempR, *dstPtrTempG, *dstPtrTempB; + srcPtr1Temp = srcPtr1Row; + srcPtr2Temp = srcPtr2Row; + dstPtrTempR = dstPtrRowR; + dstPtrTempG = dstPtrRowG; + dstPtrTempB = dstPtrRowB; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + Rpp32f 
srcPtr1Temp_ps[24], srcPtr2Temp_ps[24]; + + for(int cnt = 0; cnt < vectorIncrement; cnt++) + { + srcPtr1Temp_ps[cnt] = static_cast(srcPtr1Temp[cnt]); + srcPtr2Temp_ps[cnt] = static_cast(srcPtr2Temp[cnt]); + } + + __m256 p1[3], p2[3]; + + rpp_simd_load(rpp_load24_f32pkd3_to_f32pln3_avx, srcPtr1Temp_ps, p1); // simd loads + rpp_simd_load(rpp_load24_f32pkd3_to_f32pln3_avx, srcPtr2Temp_ps, p2); // simd loads + p1[0] = _mm256_cvtepi32_ps(_mm256_xor_si256(_mm256_cvttps_epi32(_mm256_mul_ps(p1[0], avx_p255)), _mm256_cvttps_epi32(_mm256_mul_ps(p2[0], avx_p255)))); // exclusive_or computation + p1[1] = _mm256_cvtepi32_ps(_mm256_xor_si256(_mm256_cvttps_epi32(_mm256_mul_ps(p1[1], avx_p255)), _mm256_cvttps_epi32(_mm256_mul_ps(p2[1], avx_p255)))); // exclusive_or computation + p1[2] = _mm256_cvtepi32_ps(_mm256_xor_si256(_mm256_cvttps_epi32(_mm256_mul_ps(p1[2], avx_p255)), _mm256_cvttps_epi32(_mm256_mul_ps(p2[2], avx_p255)))); // exclusive_or computation + p1[0] = _mm256_mul_ps(p1[0], avx_p1op255); + p1[1] = _mm256_mul_ps(p1[1], avx_p1op255); + p1[2] = _mm256_mul_ps(p1[2], avx_p1op255); + rpp_simd_store(rpp_store24_f32pln3_to_f16pln3_avx, dstPtrTempR, dstPtrTempG, dstPtrTempB, p1); // simd stores + + srcPtr1Temp += vectorIncrement; + srcPtr2Temp += vectorIncrement; + dstPtrTempR += vectorIncrementPerChannel; + dstPtrTempG += vectorIncrementPerChannel; + dstPtrTempB += vectorIncrementPerChannel; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount += 3) + { + *dstPtrTempR++ = static_cast(RPPPIXELCHECKF32((float)((uint)(srcPtr1Temp[0] * 255) ^ (uint)(srcPtr2Temp[0] * 255)) / 255)); + *dstPtrTempG++ = static_cast(RPPPIXELCHECKF32((float)((uint)(srcPtr1Temp[1] * 255) ^ (uint)(srcPtr2Temp[1] * 255)) / 255)); + *dstPtrTempB++ = static_cast(RPPPIXELCHECKF32((float)((uint)(srcPtr1Temp[2] * 255) ^ (uint)(srcPtr2Temp[2] * 255)) / 255)); + + srcPtr1Temp += 3; + srcPtr2Temp += 3; + } + + srcPtr1Row += srcDescPtr->strides.hStride; + srcPtr2Row += srcDescPtr->strides.hStride; + 
dstPtrRowR += dstDescPtr->strides.hStride; + dstPtrRowG += dstDescPtr->strides.hStride; + dstPtrRowB += dstDescPtr->strides.hStride; + } + } + + // Exclusive OR with fused output-layout toggle (NCHW -> NHWC) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NHWC)) + { + Rpp16f *srcPtr1RowR, *srcPtr1RowG, *srcPtr1RowB, *srcPtr2RowR, *srcPtr2RowG, *srcPtr2RowB, *dstPtrRow; + srcPtr1RowR = srcPtr1Channel; + srcPtr1RowG = srcPtr1RowR + srcDescPtr->strides.cStride; + srcPtr1RowB = srcPtr1RowG + srcDescPtr->strides.cStride; + srcPtr2RowR = srcPtr2Channel; + srcPtr2RowG = srcPtr2RowR + srcDescPtr->strides.cStride; + srcPtr2RowB = srcPtr2RowG + srcDescPtr->strides.cStride; + dstPtrRow = dstPtrChannel; + + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp16f *srcPtr1TempR, *srcPtr1TempG, *srcPtr1TempB, *srcPtr2TempR, *srcPtr2TempG, *srcPtr2TempB, *dstPtrTemp; + srcPtr1TempR = srcPtr1RowR; + srcPtr1TempG = srcPtr1RowG; + srcPtr1TempB = srcPtr1RowB; + srcPtr2TempR = srcPtr2RowR; + srcPtr2TempG = srcPtr2RowG; + srcPtr2TempB = srcPtr2RowB; + dstPtrTemp = dstPtrRow; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + Rpp32f srcPtr1Temp_ps[24], srcPtr2Temp_ps[24]; + + for(int cnt = 0; cnt < vectorIncrementPerChannel; cnt++) + { + srcPtr1Temp_ps[cnt] = static_cast(srcPtr1TempR[cnt]); + srcPtr1Temp_ps[cnt + 8] = static_cast(srcPtr1TempG[cnt]); + srcPtr1Temp_ps[cnt + 16] = static_cast(srcPtr1TempB[cnt]); + + srcPtr2Temp_ps[cnt] = static_cast(srcPtr2TempR[cnt]); + srcPtr2Temp_ps[cnt + 8] = static_cast(srcPtr2TempG[cnt]); + srcPtr2Temp_ps[cnt + 16] = static_cast(srcPtr2TempB[cnt]); + } + + __m256 p1[4], p2[4]; + + rpp_simd_load(rpp_load24_f32pln3_to_f32pln3_avx, srcPtr1Temp_ps, srcPtr1Temp_ps + 8, srcPtr1Temp_ps + 16, p1); // simd loads + rpp_simd_load(rpp_load24_f32pln3_to_f32pln3_avx, srcPtr2Temp_ps, srcPtr2Temp_ps + 8, 
srcPtr2Temp_ps + 16, p2); // simd loads + p1[0] = _mm256_cvtepi32_ps(_mm256_xor_si256(_mm256_cvttps_epi32(_mm256_mul_ps(p1[0], avx_p255)), _mm256_cvttps_epi32(_mm256_mul_ps(p2[0], avx_p255)))); // exclusive_or computation + p1[1] = _mm256_cvtepi32_ps(_mm256_xor_si256(_mm256_cvttps_epi32(_mm256_mul_ps(p1[1], avx_p255)), _mm256_cvttps_epi32(_mm256_mul_ps(p2[1], avx_p255)))); // exclusive_or computation + p1[2] = _mm256_cvtepi32_ps(_mm256_xor_si256(_mm256_cvttps_epi32(_mm256_mul_ps(p1[2], avx_p255)), _mm256_cvttps_epi32(_mm256_mul_ps(p2[2], avx_p255)))); // exclusive_or computation + p1[0] = _mm256_mul_ps(p1[0], avx_p1op255); + p1[1] = _mm256_mul_ps(p1[1], avx_p1op255); + p1[2] = _mm256_mul_ps(p1[2], avx_p1op255); + rpp_simd_store(rpp_store24_f32pln3_to_f16pkd3_avx, dstPtrTemp, p1); // simd stores + + srcPtr1TempR += vectorIncrementPerChannel; + srcPtr1TempG += vectorIncrementPerChannel; + srcPtr1TempB += vectorIncrementPerChannel; + srcPtr2TempR += vectorIncrementPerChannel; + srcPtr2TempG += vectorIncrementPerChannel; + srcPtr2TempB += vectorIncrementPerChannel; + dstPtrTemp += vectorIncrement; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + dstPtrTemp[0] = static_cast(RPPPIXELCHECKF32((float)((uint)(*srcPtr1TempR * 255) ^ (uint)(*srcPtr2TempR * 255)) / 255)); + dstPtrTemp[1] = static_cast(RPPPIXELCHECKF32((float)((uint)(*srcPtr1TempG * 255) ^ (uint)(*srcPtr2TempG * 255)) / 255)); + dstPtrTemp[2] = static_cast(RPPPIXELCHECKF32((float)((uint)(*srcPtr1TempB * 255) ^ (uint)(*srcPtr2TempB * 255)) / 255)); + + srcPtr1TempR++; + srcPtr1TempG++; + srcPtr1TempB++; + srcPtr2TempR++; + srcPtr2TempG++; + srcPtr2TempB++; + dstPtrTemp += 3; + } + + srcPtr1RowR += srcDescPtr->strides.hStride; + srcPtr1RowG += srcDescPtr->strides.hStride; + srcPtr1RowB += srcDescPtr->strides.hStride; + srcPtr2RowR += srcDescPtr->strides.hStride; + srcPtr2RowG += srcDescPtr->strides.hStride; + srcPtr2RowB += srcDescPtr->strides.hStride; + dstPtrRow += 
dstDescPtr->strides.hStride; + } + } + + // Exclusive OR without fused output-layout toggle (NHWC -> NHWC or NCHW -> NCHW) + else + { +#if __AVX2__ + alignedLength = bufferLength & ~7; +#endif + + for(int c = 0; c < layoutParams.channelParam; c++) + { + Rpp16f *srcPtr1Row, *srcPtr2Row, *dstPtrRow; + srcPtr1Row = srcPtr1Channel; + srcPtr2Row = srcPtr2Channel; + dstPtrRow = dstPtrChannel; + + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp16f *srcPtr1Temp, *srcPtr2Temp, *dstPtrTemp; + srcPtr1Temp = srcPtr1Row; + srcPtr2Temp = srcPtr2Row; + dstPtrTemp = dstPtrRow; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + Rpp32f srcPtr1Temp_ps[8], srcPtr2Temp_ps[8]; + + for(int cnt = 0; cnt < vectorIncrementPerChannel; cnt++) + { + srcPtr1Temp_ps[cnt] = static_cast(srcPtr1Temp[cnt]); + srcPtr2Temp_ps[cnt] = static_cast(srcPtr2Temp[cnt]); + } + + __m256 p1[1], p2[1]; + + rpp_simd_load(rpp_load8_f32_to_f32_avx, srcPtr1Temp_ps, p1); // simd loads + rpp_simd_load(rpp_load8_f32_to_f32_avx, srcPtr2Temp_ps, p2); // simd loads + p1[0] = _mm256_cvtepi32_ps(_mm256_xor_si256(_mm256_cvttps_epi32(_mm256_mul_ps(p1[0], avx_p255)), _mm256_cvttps_epi32(_mm256_mul_ps(p2[0], avx_p255)))); // exclusive_or computation + p1[0] = _mm256_mul_ps(p1[0], avx_p1op255); + rpp_simd_store(rpp_store8_f32_to_f16_avx, dstPtrTemp, p1); // simd stores + + srcPtr1Temp += vectorIncrementPerChannel; + srcPtr2Temp += vectorIncrementPerChannel; + dstPtrTemp += vectorIncrementPerChannel; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + *dstPtrTemp++ = static_cast(RPPPIXELCHECKF32((float)((uint)(*srcPtr1Temp * 255) ^ (uint)(*srcPtr2Temp * 255)) / 255)); + + srcPtr1Temp++; + srcPtr2Temp++; + } + + srcPtr1Row += srcDescPtr->strides.hStride; + srcPtr2Row += srcDescPtr->strides.hStride; + dstPtrRow += dstDescPtr->strides.hStride; + } + + srcPtr1Channel += srcDescPtr->strides.cStride; + srcPtr2Channel 
+= srcDescPtr->strides.cStride; + dstPtrChannel += dstDescPtr->strides.cStride; + } + } + } + + return RPP_SUCCESS; +} \ No newline at end of file diff --git a/src/modules/rppt_tensor_logical_operations.cpp b/src/modules/rppt_tensor_logical_operations.cpp index 84cf49b73..4e74eac2d 100644 --- a/src/modules/rppt_tensor_logical_operations.cpp +++ b/src/modules/rppt_tensor_logical_operations.cpp @@ -97,6 +97,47 @@ RppStatus rppt_bitwise_and_host(RppPtr_t srcPtr1, return RPP_SUCCESS; } +/******************** exclusive OR ********************/ + +RppStatus rppt_exclusive_or_host(RppPtr_t srcPtr1, + RppPtr_t srcPtr2, + RpptDescPtr srcDescPtr, + RppPtr_t dstPtr, + RpptDescPtr dstDescPtr, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + rppHandle_t rppHandle) +{ + RppLayoutParams layoutParams = get_layout_params(srcDescPtr->layout, srcDescPtr->c); + + if ((srcDescPtr->dataType == RpptDataType::F16) && (dstDescPtr->dataType == RpptDataType::F16)) + { + exclusive_or_f16_f16_host_tensor(reinterpret_cast(static_cast(srcPtr1) + srcDescPtr->offsetInBytes), + reinterpret_cast(static_cast(srcPtr2) + srcDescPtr->offsetInBytes), + srcDescPtr, + reinterpret_cast(static_cast(dstPtr) + dstDescPtr->offsetInBytes), + dstDescPtr, + roiTensorPtrSrc, + roiType, + layoutParams, + rpp::deref(rppHandle)); + } + else if ((srcDescPtr->dataType == RpptDataType::F32) && (dstDescPtr->dataType == RpptDataType::F32)) + { + exclusive_or_f32_f32_host_tensor(reinterpret_cast(static_cast(srcPtr1) + srcDescPtr->offsetInBytes), + reinterpret_cast(static_cast(srcPtr2) + srcDescPtr->offsetInBytes), + srcDescPtr, + reinterpret_cast(static_cast(dstPtr) + dstDescPtr->offsetInBytes), + dstDescPtr, + roiTensorPtrSrc, + roiType, + layoutParams, + rpp::deref(rppHandle)); + } + + return RPP_SUCCESS; +} + /******************** bitwise OR ********************/ RppStatus rppt_bitwise_or_host(RppPtr_t srcPtr1, diff --git a/utilities/test_suite/HOST/Tensor_host.cpp b/utilities/test_suite/HOST/Tensor_host.cpp index 
53c9dbaf9..e1eedb0d7 100644 --- a/utilities/test_suite/HOST/Tensor_host.cpp +++ b/utilities/test_suite/HOST/Tensor_host.cpp @@ -65,7 +65,7 @@ int main(int argc, char **argv) int batchSize = atoi(argv[14]); bool additionalParamCase = (testCase == 8 || testCase == 21 || testCase == 23 || testCase == 24 || testCase == 79); - bool dualInputCase = (testCase == 2 || testCase == 30 || testCase == 33 || testCase == 61 || testCase == 63 || testCase == 65 || testCase == 68); + bool dualInputCase = (testCase == 2 || testCase == 30 || testCase == 33 || testCase == 61 || testCase == 63 || testCase == 65 || testCase == 67 || testCase == 68); bool randomOutputCase = (testCase == 6 || testCase == 8 || testCase == 84); bool nonQACase = (testCase == 24); bool interpolationTypeCase = (testCase == 21 || testCase == 23 || testCase == 24 || testCase == 79); @@ -1141,6 +1141,19 @@ int main(int argc, char **argv) break; } + case 67: + { + testCaseName = "exclusive_or"; + + startWallTime = omp_get_wtime(); + startCpuTime = clock(); + if (inputBitDepth == 0 || inputBitDepth == 1 || inputBitDepth == 2 || inputBitDepth == 5) + rppt_exclusive_or_host(input, input_second, srcDescPtr, output, dstDescPtr, roiTensorPtrSrc, roiTypeSrc, handle); + else + missingFuncFlag = 1; + + break; + } case 68: { testCaseName = "bitwise_or"; diff --git a/utilities/test_suite/HOST/runTests.py b/utilities/test_suite/HOST/runTests.py index df32c02a5..537f6127c 100644 --- a/utilities/test_suite/HOST/runTests.py +++ b/utilities/test_suite/HOST/runTests.py @@ -252,7 +252,7 @@ def rpp_test_suite_parser_and_validator(): subprocess.call(["make", "-j16"], cwd=".") # nosec # List of cases supported -supportedCaseList = ['0', '1', '2', '4', '5', '6', '8', '13', '20', '21', '23', '26', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '45', '46', '54', '61', '63', '65', '68', '70', '79', '80', '81', '82', '83', '84', '85', '86', '87', '88', '89', '90', '91', '92'] +supportedCaseList = ['0', '1', '2', '4', 
'5', '6', '8', '13', '20', '21', '23', '26', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '45', '46', '54', '61', '63', '65', '67', '68', '70', '79', '80', '81', '82', '83', '84', '85', '86', '87', '88', '89', '90', '91', '92'] if testType == 0: noCaseSupported = all(case not in supportedCaseList for case in caseList) From e130368ef81fa7df5c29d14f78b114f96f7a5b7b Mon Sep 17 00:00:00 2001 From: Srihari-mcw Date: Tue, 10 Sep 2024 14:45:20 +0530 Subject: [PATCH 04/38] Make SSE based updates for exclusive or --- src/modules/cpu/kernel/exclusive_or.hpp | 220 ++++++++++++++++++ .../rppt_tensor_logical_operations.cpp | 14 +- 2 files changed, 233 insertions(+), 1 deletion(-) diff --git a/src/modules/cpu/kernel/exclusive_or.hpp b/src/modules/cpu/kernel/exclusive_or.hpp index a740d5b3d..291a3d04c 100644 --- a/src/modules/cpu/kernel/exclusive_or.hpp +++ b/src/modules/cpu/kernel/exclusive_or.hpp @@ -30,6 +30,226 @@ SOFTWARE. For a Rpp32f precision image (pixel values from 0-1), the exclusiveOR is applied on a 0-255 range-translated approximation, of the original 0-1 decimal-range image.
The bitwise operation is applied to the char representation of the raw floating-point data in memory */ + +RppStatus exclusive_or_u8_u8_host_tensor(Rpp8u *srcPtr1, + Rpp8u *srcPtr2, + RpptDescPtr srcDescPtr, + Rpp8u *dstPtr, + RpptDescPtr dstDescPtr, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + RppLayoutParams layoutParams, + rpp::Handle& Handle) +{ + RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h}; + Rpp32u numThreads = Handle.GetNumThreads(); + + omp_set_dynamic(0); +#pragma omp parallel for num_threads(numThreads) + for(int batchCount = 0; batchCount < dstDescPtr->n; batchCount++) + { + RpptROI roi; + RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount]; + compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType); + + Rpp8u *srcPtr1Image, *srcPtr2Image, *dstPtrImage; + srcPtr1Image = srcPtr1 + batchCount * srcDescPtr->strides.nStride; + srcPtr2Image = srcPtr2 + batchCount * srcDescPtr->strides.nStride; + dstPtrImage = dstPtr + batchCount * dstDescPtr->strides.nStride; + + Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier; + + Rpp8u *srcPtr1Channel, *srcPtr2Channel, *dstPtrChannel; + srcPtr1Channel = srcPtr1Image + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier); + srcPtr2Channel = srcPtr2Image + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier); + dstPtrChannel = dstPtrImage; + + Rpp32u alignedLength = (bufferLength / 48) * 48; + Rpp32u vectorIncrement = 48; + Rpp32u vectorIncrementPerChannel = 16; + + // Bitwise OR with fused output-layout toggle (NHWC -> NCHW) + if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NCHW)) + { + Rpp8u *srcPtr1Row, *srcPtr2Row, *dstPtrRowR, *dstPtrRowG, *dstPtrRowB; + srcPtr1Row = srcPtr1Channel; + srcPtr2Row = srcPtr2Channel; + dstPtrRowR = dstPtrChannel; + dstPtrRowG = dstPtrRowR + 
dstDescPtr->strides.cStride; + dstPtrRowB = dstPtrRowG + dstDescPtr->strides.cStride; + + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp8u *srcPtr1Temp, *srcPtr2Temp, *dstPtrTempR, *dstPtrTempG, *dstPtrTempB; + srcPtr1Temp = srcPtr1Row; + srcPtr2Temp = srcPtr2Row; + dstPtrTempR = dstPtrRowR; + dstPtrTempG = dstPtrRowG; + dstPtrTempB = dstPtrRowB; + + int vectorLoopCount = 0; + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + __m128i p1[3], p2[3]; + + rpp_simd_load(rpp_load48_u8pkd3_to_u8pln3, srcPtr1Temp, p1); // simd loads + rpp_simd_load(rpp_load48_u8pkd3_to_u8pln3, srcPtr2Temp, p2); // simd loads + p1[0] = _mm_xor_si128(p1[0], p2[0]); // exclusive_or computation + p1[1] = _mm_xor_si128(p1[1], p2[1]); // exclusive_or computation + p1[2] = _mm_xor_si128(p1[2], p2[2]); // exclusive_or computation + rpp_simd_store(rpp_store48_u8pln3_to_u8pln3, dstPtrTempR, dstPtrTempG, dstPtrTempB, p1); // simd stores + + srcPtr1Temp += vectorIncrement; + srcPtr2Temp += vectorIncrement; + dstPtrTempR += vectorIncrementPerChannel; + dstPtrTempG += vectorIncrementPerChannel; + dstPtrTempB += vectorIncrementPerChannel; + } + + for (; vectorLoopCount < bufferLength; vectorLoopCount += 3) + { + *dstPtrTempR++ = srcPtr1Temp[0] ^ srcPtr2Temp[0]; + *dstPtrTempG++ = srcPtr1Temp[1] ^ srcPtr2Temp[1]; + *dstPtrTempB++ = srcPtr1Temp[2] ^ srcPtr2Temp[2]; + + srcPtr1Temp += 3; + srcPtr2Temp += 3; + } + + srcPtr1Row += srcDescPtr->strides.hStride; + srcPtr2Row += srcDescPtr->strides.hStride; + dstPtrRowR += dstDescPtr->strides.hStride; + dstPtrRowG += dstDescPtr->strides.hStride; + dstPtrRowB += dstDescPtr->strides.hStride; + } + } + + // Bitwise OR with fused output-layout toggle (NCHW -> NHWC) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NHWC)) + { + Rpp8u *srcPtr1RowR, *srcPtr1RowG, *srcPtr1RowB, *srcPtr2RowR, *srcPtr2RowG, *srcPtr2RowB, *dstPtrRow; + srcPtr1RowR = srcPtr1Channel; + 
srcPtr1RowG = srcPtr1RowR + srcDescPtr->strides.cStride; + srcPtr1RowB = srcPtr1RowG + srcDescPtr->strides.cStride; + srcPtr2RowR = srcPtr2Channel; + srcPtr2RowG = srcPtr2RowR + srcDescPtr->strides.cStride; + srcPtr2RowB = srcPtr2RowG + srcDescPtr->strides.cStride; + dstPtrRow = dstPtrChannel; + + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp8u *srcPtr1TempR, *srcPtr1TempG, *srcPtr1TempB, *srcPtr2TempR, *srcPtr2TempG, *srcPtr2TempB, *dstPtrTemp; + srcPtr1TempR = srcPtr1RowR; + srcPtr1TempG = srcPtr1RowG; + srcPtr1TempB = srcPtr1RowB; + srcPtr2TempR = srcPtr2RowR; + srcPtr2TempG = srcPtr2RowG; + srcPtr2TempB = srcPtr2RowB; + dstPtrTemp = dstPtrRow; + + int vectorLoopCount = 0; + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + __m128i p1[3], p2[3]; + + rpp_simd_load(rpp_load48_u8pln3_to_u8pln3, srcPtr1TempR, srcPtr1TempG, srcPtr1TempB, p1); // simd loads + rpp_simd_load(rpp_load48_u8pln3_to_u8pln3, srcPtr2TempR, srcPtr2TempG, srcPtr2TempB, p2); // simd loads + p1[0] = _mm_xor_si128(p1[0], p2[0]); // exclusive_or computation + p1[1] = _mm_xor_si128(p1[1], p2[1]); // exclusive_or computation + p1[2] = _mm_xor_si128(p1[2], p2[2]); // exclusive_or computation + rpp_simd_store(rpp_store48_u8pln3_to_u8pkd3, dstPtrTemp, p1); // simd stores + + srcPtr1TempR += vectorIncrementPerChannel; + srcPtr1TempG += vectorIncrementPerChannel; + srcPtr1TempB += vectorIncrementPerChannel; + srcPtr2TempR += vectorIncrementPerChannel; + srcPtr2TempG += vectorIncrementPerChannel; + srcPtr2TempB += vectorIncrementPerChannel; + dstPtrTemp += vectorIncrement; + } + + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + dstPtrTemp[0] = *srcPtr1TempR ^ *srcPtr2TempR; + dstPtrTemp[1] = *srcPtr1TempG ^ *srcPtr2TempG; + dstPtrTemp[2] = *srcPtr1TempB ^ *srcPtr2TempB; + + srcPtr1TempR++; + srcPtr1TempG++; + srcPtr1TempB++; + srcPtr2TempR++; + srcPtr2TempG++; + srcPtr2TempB++; + dstPtrTemp += 3; + } + + srcPtr1RowR += 
srcDescPtr->strides.hStride; + srcPtr1RowG += srcDescPtr->strides.hStride; + srcPtr1RowB += srcDescPtr->strides.hStride; + srcPtr2RowR += srcDescPtr->strides.hStride; + srcPtr2RowG += srcDescPtr->strides.hStride; + srcPtr2RowB += srcDescPtr->strides.hStride; + dstPtrRow += dstDescPtr->strides.hStride; + } + } + + // Bitwise OR without fused output-layout toggle (NHWC -> NHWC or NCHW -> NCHW) + else + { + alignedLength = bufferLength & ~15; + + for(int c = 0; c < layoutParams.channelParam; c++) + { + Rpp8u *srcPtr1Row, *srcPtr2Row, *dstPtrRow; + srcPtr1Row = srcPtr1Channel; + srcPtr2Row = srcPtr2Channel; + dstPtrRow = dstPtrChannel; + + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp8u *srcPtr1Temp, *srcPtr2Temp, *dstPtrTemp; + srcPtr1Temp = srcPtr1Row; + srcPtr2Temp = srcPtr2Row; + dstPtrTemp = dstPtrRow; + + int vectorLoopCount = 0; + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + __m128i p1, p2; + + p1 = _mm_loadu_si128((__m128i *)srcPtr1Temp); // simd loads + p2 = _mm_loadu_si128((__m128i *)srcPtr2Temp); // simd loads + p1 = _mm_xor_si128(p1, p2); // exclusive_or computation + _mm_storeu_si128((__m128i *)dstPtrTemp, p1); // simd stores + + srcPtr1Temp += vectorIncrementPerChannel; + srcPtr2Temp += vectorIncrementPerChannel; + dstPtrTemp += vectorIncrementPerChannel; + } + + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + *dstPtrTemp++ = *srcPtr1Temp ^ *srcPtr2Temp; + + srcPtr1Temp++; + srcPtr2Temp++; + } + + srcPtr1Row += srcDescPtr->strides.hStride; + srcPtr2Row += srcDescPtr->strides.hStride; + dstPtrRow += dstDescPtr->strides.hStride; + } + + srcPtr1Channel += srcDescPtr->strides.cStride; + srcPtr2Channel += srcDescPtr->strides.cStride; + dstPtrChannel += dstDescPtr->strides.cStride; + } + } + } + + return RPP_SUCCESS; +} + RppStatus exclusive_or_f32_f32_host_tensor(Rpp32f *srcPtr1, Rpp32f *srcPtr2, RpptDescPtr srcDescPtr, diff --git a/src/modules/rppt_tensor_logical_operations.cpp 
b/src/modules/rppt_tensor_logical_operations.cpp index 4e74eac2d..dda6e3cea 100644 --- a/src/modules/rppt_tensor_logical_operations.cpp +++ b/src/modules/rppt_tensor_logical_operations.cpp @@ -110,7 +110,19 @@ RppStatus rppt_exclusive_or_host(RppPtr_t srcPtr1, { RppLayoutParams layoutParams = get_layout_params(srcDescPtr->layout, srcDescPtr->c); - if ((srcDescPtr->dataType == RpptDataType::F16) && (dstDescPtr->dataType == RpptDataType::F16)) + if ((srcDescPtr->dataType == RpptDataType::U8) && (dstDescPtr->dataType == RpptDataType::U8)) + { + exclusive_or_u8_u8_host_tensor(static_cast(srcPtr1) + srcDescPtr->offsetInBytes, + static_cast(srcPtr2) + srcDescPtr->offsetInBytes, + srcDescPtr, + static_cast(dstPtr) + dstDescPtr->offsetInBytes, + dstDescPtr, + roiTensorPtrSrc, + roiType, + layoutParams, + rpp::deref(rppHandle)); + } + else if ((srcDescPtr->dataType == RpptDataType::F16) && (dstDescPtr->dataType == RpptDataType::F16)) { exclusive_or_f16_f16_host_tensor(reinterpret_cast(static_cast(srcPtr1) + srcDescPtr->offsetInBytes), reinterpret_cast(static_cast(srcPtr2) + srcDescPtr->offsetInBytes), From 169c59c2c749d868e33d6a7af2bbce888bf0975e Mon Sep 17 00:00:00 2001 From: Srihari-mcw Date: Tue, 10 Sep 2024 17:12:22 +0530 Subject: [PATCH 05/38] Update the code for AVX2 implementation of U8 code --- src/include/cpu/rpp_cpu_simd.hpp | 62 +++++++++++++++++++++++++ src/modules/cpu/kernel/exclusive_or.hpp | 48 +++++++++---------- 2 files changed, 86 insertions(+), 24 deletions(-) diff --git a/src/include/cpu/rpp_cpu_simd.hpp b/src/include/cpu/rpp_cpu_simd.hpp index c24ef90da..a5aa12ee5 100644 --- a/src/include/cpu/rpp_cpu_simd.hpp +++ b/src/include/cpu/rpp_cpu_simd.hpp @@ -451,6 +451,30 @@ inline void rpp_load48_u8pkd3_to_u8pln3(Rpp8u *srcPtr, __m128i *px) px[2] = _mm_shuffle_epi8(_mm_unpacklo_epi8(pxSrc[6], pxSrc[7]), pxMaskRGB); /* unpack 8 lo-pixels of pxSrc[6] and pxSrc[7] to get B01-16 */ } +inline void rpp_load96_u8pkd3_to_u8pln3(Rpp8u *srcPtr, __m256i *px) +{ + 
__m256i pxSrc[8]; + __m256i pxMask = _mm256_castsi128_si256(_mm_setr_epi8(0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11, 12, 13, 14, 15)); + pxMask = _mm256_permute2f128_si256(pxMask, pxMask, 0); + __m256i pxMaskRGB = _mm256_castsi128_si256(_mm_setr_epi8(0, 4, 8, 12, 2, 6, 10, 14, 1, 5, 9, 13, 3, 7, 11, 15)); + pxMaskRGB = _mm256_permute2f128_si256(pxMaskRGB, pxMaskRGB, 0); + pxSrc[0] = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i *)srcPtr)), _mm_loadu_si128((__m128i *)(srcPtr + 48)), 1); /* load [R01|G01|B01|R02|G02|B02|R03|G03|B03|R04|G04|B04|R05|G05|B05|R06] - Need RGB 01-04 */ + pxSrc[1] = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(srcPtr + 12))), _mm_loadu_si128((__m128i *)(srcPtr + 60)), 1); /* load [R05|G05|B05|R06|G06|B06|R07|G07|B07|R08|G08|B08|R09|G09|B09|R10] - Need RGB 05-08 */ + pxSrc[2] = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(srcPtr + 24))), _mm_loadu_si128((__m128i *)(srcPtr + 72)), 1); /* load [R09|G09|B09|R10|G10|B10|R11|G11|B11|R12|G12|B12|R13|G13|B13|R14] - Need RGB 09-12 */ + pxSrc[3] = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(srcPtr + 36))), _mm_loadu_si128((__m128i *)(srcPtr + 84)), 1); /* load [R13|G13|B13|R14|G14|B14|R15|G15|B15|R16|G16|B16|R17|G17|B17|R18] - Need RGB 13-16 */ + pxSrc[4] = _mm256_shuffle_epi8(pxSrc[0], pxMask); /* shuffle to get [R01|R02|R03|R04|G01|G02|G03|G04 || B01|B02|B03|B04|R05|G05|B05|R06] - Need R01-04, G01-04, B01-04 */ + pxSrc[5] = _mm256_shuffle_epi8(pxSrc[1], pxMask); /* shuffle to get [R05|R06|R07|R08|G05|G06|G07|G08 || B05|B06|B07|B08|R09|G09|B09|R10] - Need R05-08, G05-08, B05-08 */ + pxSrc[6] = _mm256_shuffle_epi8(pxSrc[2], pxMask); /* shuffle to get [R09|R10|R11|R12|G09|G10|G11|G12 || B09|B10|B11|B12|R13|G13|B13|R14] - Need R09-12, G09-12, B09-12 */ + pxSrc[7] = _mm256_shuffle_epi8(pxSrc[3], pxMask); /* shuffle to get [R13|R14|R15|R16|G13|G14|G15|G16 || B13|B14|B15|B16|R17|G17|B17|R18] - 
Need R13-16, G13-16, B13-16 */ + pxSrc[0] = _mm256_unpacklo_epi8(pxSrc[4], pxSrc[5]); /* unpack 8 lo-pixels of pxSrc[0] and pxSrc[1] */ + pxSrc[1] = _mm256_unpacklo_epi8(pxSrc[6], pxSrc[7]); /* unpack 8 lo-pixels of pxSrc[2] and pxSrc[3] */ + pxSrc[2] = _mm256_unpackhi_epi8(pxSrc[4], pxSrc[5]); /* unpack 8 hi-pixels of pxSrc[0] and pxSrc[1] */ + pxSrc[3] = _mm256_unpackhi_epi8(pxSrc[6], pxSrc[7]); /* unpack 8 hi-pixels of pxSrc[2] and pxSrc[3] */ + px[0] = _mm256_shuffle_epi8(_mm256_unpacklo_epi8(pxSrc[0], pxSrc[1]), pxMaskRGB); /* unpack 8 lo-pixels of pxSrc[4] and pxSrc[5] to get R01-16 */ + px[1] = _mm256_shuffle_epi8(_mm256_unpackhi_epi8(pxSrc[0], pxSrc[1]), pxMaskRGB); /* unpack 8 hi-pixels of pxSrc[4] and pxSrc[5] to get G01-16 */ + px[2] = _mm256_shuffle_epi8(_mm256_unpacklo_epi8(pxSrc[2], pxSrc[3]), pxMaskRGB); /* unpack 8 lo-pixels of pxSrc[6] and pxSrc[7] to get B01-16 */ +} + inline void rpp_store48_u8pln3_to_u8pln3(Rpp8u *dstPtrR, Rpp8u *dstPtrG, Rpp8u *dstPtrB, __m128i *px) { _mm_storeu_si128((__m128i *)dstPtrR, px[0]); /* store [R01|R02|R03|R04|R05|R06|R07|R08|R09|R10|R11|R12|R13|R14|R15|R16] */ @@ -458,6 +482,13 @@ inline void rpp_store48_u8pln3_to_u8pln3(Rpp8u *dstPtrR, Rpp8u *dstPtrG, Rpp8u * _mm_storeu_si128((__m128i *)dstPtrB, px[2]); /* store [B01|B02|B03|B04|B05|B06|B07|B08|B09|B10|B11|B12|B13|B14|B15|B16] */ } +inline void rpp_store96_u8pln3_to_u8pln3(Rpp8u *dstPtrR, Rpp8u *dstPtrG, Rpp8u *dstPtrB, __m256i *px) +{ + _mm256_storeu_si256((__m256i *)dstPtrR, px[0]); /* store [R01|R02|R03|R04|R05|R06|R07|R08|R09|R10|R11|R12|R13|R14|R15|R16] */ + _mm256_storeu_si256((__m256i *)dstPtrG, px[1]); /* store [G01|G02|G03|G04|G05|G06|G07|G08|G09|G10|G11|G12|G13|G14|G15|G16] */ + _mm256_storeu_si256((__m256i *)dstPtrB, px[2]); /* store [B01|B02|B03|B04|B05|B06|B07|B08|B09|B10|B11|B12|B13|B14|B15|B16] */ +} + inline void rpp_load48_u8pln3_to_u8pln3(Rpp8u *srcPtrR, Rpp8u *srcPtrG, Rpp8u *srcPtrB, __m128i *px) { px[0] = _mm_loadu_si128((__m128i *)srcPtrR); /* 
load [R01|R02|R03|R04|R05|R06|R07|R08|R09|R10|R11|R12|R13|R14|R15|R16] */ @@ -465,6 +496,13 @@ inline void rpp_load48_u8pln3_to_u8pln3(Rpp8u *srcPtrR, Rpp8u *srcPtrG, Rpp8u *s px[2] = _mm_loadu_si128((__m128i *)srcPtrB); /* load [B01|B02|B03|B04|B05|B06|B07|B08|B09|B10|B11|B12|B13|B14|B15|B16] */ } +inline void rpp_load96_u8pln3_to_u8pln3(Rpp8u *srcPtrR, Rpp8u *srcPtrG, Rpp8u *srcPtrB, __m256i *px) +{ + px[0] = _mm256_loadu_si256((__m256i *)srcPtrR); /* load [R01|R02|R03|R04|R05|R06|R07|R08|R09|R10|R11|R12|R13|R14|R15|R16] */ + px[1] = _mm256_loadu_si256((__m256i *)srcPtrG); /* load [G01|G02|G03|G04|G05|G06|G07|G08|G09|G10|G11|G12|G13|G14|G15|G16] */ + px[2] = _mm256_loadu_si256((__m256i *)srcPtrB); /* load [B01|B02|B03|B04|B05|B06|B07|B08|B09|B10|B11|B12|B13|B14|B15|B16] */ +} + inline void rpp_store48_u8pln3_to_u8pkd3(Rpp8u *dstPtr, __m128i *px) { __m128i pxDst[4]; @@ -480,6 +518,30 @@ inline void rpp_store48_u8pln3_to_u8pkd3(Rpp8u *dstPtr, __m128i *px) _mm_storeu_si128((__m128i *)(dstPtr + 36), _mm_shuffle_epi8(_mm_unpackhi_epi8(pxDst[3], pxDst[1]), pxMaskRGBAtoRGB)); /* store [R13|G13|B13|R14|G14|B14|R15|G15|B15|R16|G16|B16|00|00|00|00] */ } +inline void rpp_store96_u8pln3_to_u8pkd3(Rpp8u *dstPtr, __m256i *px) +{ + __m256i pxDst[8]; + __m256i pxZero = _mm256_setzero_si256(); + __m256i pxMaskRGBAtoRGB = _mm256_castsi128_si256(_mm_setr_epi8(0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 3, 7, 11, 15)); + pxMaskRGBAtoRGB = _mm256_permute2f128_si256(pxMaskRGBAtoRGB, pxMaskRGBAtoRGB, 0); + pxDst[0] = _mm256_unpacklo_epi8(px[1], pxZero); + pxDst[1] = _mm256_unpackhi_epi8(px[1], pxZero); + pxDst[2] = _mm256_unpacklo_epi8(px[0], px[2]); + pxDst[3] = _mm256_unpackhi_epi8(px[0], px[2]); + pxDst[4] = _mm256_shuffle_epi8(_mm256_unpacklo_epi8(pxDst[2], pxDst[0]), pxMaskRGBAtoRGB); + pxDst[5] = _mm256_shuffle_epi8(_mm256_unpackhi_epi8(pxDst[2], pxDst[0]), pxMaskRGBAtoRGB); + pxDst[6] = _mm256_shuffle_epi8(_mm256_unpacklo_epi8(pxDst[3], pxDst[1]), pxMaskRGBAtoRGB); + pxDst[7] = 
_mm256_shuffle_epi8(_mm256_unpackhi_epi8(pxDst[3], pxDst[1]), pxMaskRGBAtoRGB); + _mm_storeu_si128((__m128i *)dstPtr, _mm256_castsi256_si128(pxDst[4])); + _mm_storeu_si128((__m128i *)(dstPtr + 12), _mm256_castsi256_si128(pxDst[5])); + _mm_storeu_si128((__m128i *)(dstPtr + 24), _mm256_castsi256_si128(pxDst[6])); + _mm_storeu_si128((__m128i *)(dstPtr + 36), _mm256_castsi256_si128(pxDst[7])); + _mm_storeu_si128((__m128i *)(dstPtr + 48), _mm256_extractf128_si256(pxDst[4], 1)); + _mm_storeu_si128((__m128i *)(dstPtr + 60), _mm256_extractf128_si256(pxDst[5], 1)); + _mm_storeu_si128((__m128i *)(dstPtr + 72), _mm256_extractf128_si256(pxDst[6], 1)); + _mm_storeu_si128((__m128i *)(dstPtr + 84), _mm256_extractf128_si256(pxDst[7], 1)); +} + inline void rpp_load16_u8_to_f32(Rpp8u *srcPtr, __m128 *p) { __m128i px = _mm_loadu_si128((__m128i *)srcPtr); /* load pixels 0-15 */ diff --git a/src/modules/cpu/kernel/exclusive_or.hpp b/src/modules/cpu/kernel/exclusive_or.hpp index 291a3d04c..49ab36a9f 100644 --- a/src/modules/cpu/kernel/exclusive_or.hpp +++ b/src/modules/cpu/kernel/exclusive_or.hpp @@ -45,7 +45,7 @@ RppStatus exclusive_or_u8_u8_host_tensor(Rpp8u *srcPtr1, Rpp32u numThreads = Handle.GetNumThreads(); omp_set_dynamic(0); -#pragma omp parallel for num_threads(numThreads) +//#pragma omp parallel for num_threads(numThreads) for(int batchCount = 0; batchCount < dstDescPtr->n; batchCount++) { RpptROI roi; @@ -64,9 +64,9 @@ RppStatus exclusive_or_u8_u8_host_tensor(Rpp8u *srcPtr1, srcPtr2Channel = srcPtr2Image + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier); dstPtrChannel = dstPtrImage; - Rpp32u alignedLength = (bufferLength / 48) * 48; - Rpp32u vectorIncrement = 48; - Rpp32u vectorIncrementPerChannel = 16; + Rpp32u alignedLength = (bufferLength / 96) * 96; + Rpp32u vectorIncrement = 96; + Rpp32u vectorIncrementPerChannel = 32; // Bitwise OR with fused output-layout toggle (NHWC -> NCHW) if ((srcDescPtr->c == 3) && 
(srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NCHW)) @@ -90,14 +90,14 @@ RppStatus exclusive_or_u8_u8_host_tensor(Rpp8u *srcPtr1, int vectorLoopCount = 0; for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) { - __m128i p1[3], p2[3]; + __m256i p1[3], p2[3]; - rpp_simd_load(rpp_load48_u8pkd3_to_u8pln3, srcPtr1Temp, p1); // simd loads - rpp_simd_load(rpp_load48_u8pkd3_to_u8pln3, srcPtr2Temp, p2); // simd loads - p1[0] = _mm_xor_si128(p1[0], p2[0]); // exclusive_or computation - p1[1] = _mm_xor_si128(p1[1], p2[1]); // exclusive_or computation - p1[2] = _mm_xor_si128(p1[2], p2[2]); // exclusive_or computation - rpp_simd_store(rpp_store48_u8pln3_to_u8pln3, dstPtrTempR, dstPtrTempG, dstPtrTempB, p1); // simd stores + rpp_simd_load(rpp_load96_u8pkd3_to_u8pln3, srcPtr1Temp, p1); // simd loads + rpp_simd_load(rpp_load96_u8pkd3_to_u8pln3, srcPtr2Temp, p2); // simd loads + p1[0] = _mm256_xor_si256(p1[0], p2[0]); // exclusive_or computation + p1[1] = _mm256_xor_si256(p1[1], p2[1]); // exclusive_or computation + p1[2] = _mm256_xor_si256(p1[2], p2[2]); // exclusive_or computation + rpp_simd_store(rpp_store96_u8pln3_to_u8pln3, dstPtrTempR, dstPtrTempG, dstPtrTempB, p1); // simd stores srcPtr1Temp += vectorIncrement; srcPtr2Temp += vectorIncrement; @@ -150,14 +150,14 @@ RppStatus exclusive_or_u8_u8_host_tensor(Rpp8u *srcPtr1, int vectorLoopCount = 0; for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) { - __m128i p1[3], p2[3]; + __m256i p1[3], p2[3]; - rpp_simd_load(rpp_load48_u8pln3_to_u8pln3, srcPtr1TempR, srcPtr1TempG, srcPtr1TempB, p1); // simd loads - rpp_simd_load(rpp_load48_u8pln3_to_u8pln3, srcPtr2TempR, srcPtr2TempG, srcPtr2TempB, p2); // simd loads - p1[0] = _mm_xor_si128(p1[0], p2[0]); // exclusive_or computation - p1[1] = _mm_xor_si128(p1[1], p2[1]); // exclusive_or computation - p1[2] = _mm_xor_si128(p1[2], p2[2]); // exclusive_or computation - 
rpp_simd_store(rpp_store48_u8pln3_to_u8pkd3, dstPtrTemp, p1); // simd stores + rpp_simd_load(rpp_load96_u8pln3_to_u8pln3, srcPtr1TempR, srcPtr1TempG, srcPtr1TempB, p1); // simd loads + rpp_simd_load(rpp_load96_u8pln3_to_u8pln3, srcPtr2TempR, srcPtr2TempG, srcPtr2TempB, p2); // simd loads + p1[0] = _mm256_xor_si256(p1[0], p2[0]); // exclusive_or computation + p1[1] = _mm256_xor_si256(p1[1], p2[1]); // exclusive_or computation + p1[2] = _mm256_xor_si256(p1[2], p2[2]); // exclusive_or computation + rpp_simd_store(rpp_store96_u8pln3_to_u8pkd3, dstPtrTemp, p1); // simd stores srcPtr1TempR += vectorIncrementPerChannel; srcPtr1TempG += vectorIncrementPerChannel; @@ -196,7 +196,7 @@ RppStatus exclusive_or_u8_u8_host_tensor(Rpp8u *srcPtr1, // Bitwise OR without fused output-layout toggle (NHWC -> NHWC or NCHW -> NCHW) else { - alignedLength = bufferLength & ~15; + alignedLength = bufferLength & ~31; for(int c = 0; c < layoutParams.channelParam; c++) { @@ -215,12 +215,12 @@ RppStatus exclusive_or_u8_u8_host_tensor(Rpp8u *srcPtr1, int vectorLoopCount = 0; for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) { - __m128i p1, p2; + __m256i p1, p2; - p1 = _mm_loadu_si128((__m128i *)srcPtr1Temp); // simd loads - p2 = _mm_loadu_si128((__m128i *)srcPtr2Temp); // simd loads - p1 = _mm_xor_si128(p1, p2); // exclusive_or computation - _mm_storeu_si128((__m128i *)dstPtrTemp, p1); // simd stores + p1 = _mm256_loadu_si256((const __m256i *)srcPtr1Temp); // simd loads + p2 = _mm256_loadu_si256((const __m256i *)srcPtr2Temp); // simd loads + p1 = _mm256_xor_si256(p1, p2); // exclusive_or computation + _mm256_storeu_si256((__m256i *)dstPtrTemp, p1); // simd stores srcPtr1Temp += vectorIncrementPerChannel; srcPtr2Temp += vectorIncrementPerChannel; From cce836d2d6bbc49098f538f1d181b0d65d5977f0 Mon Sep 17 00:00:00 2001 From: Srihari-mcw Date: Tue, 10 Sep 2024 17:52:24 +0530 Subject: [PATCH 06/38] Uncomment pragma --- src/modules/cpu/kernel/exclusive_or.hpp | 2 +- 
1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/modules/cpu/kernel/exclusive_or.hpp b/src/modules/cpu/kernel/exclusive_or.hpp index 49ab36a9f..35e780e7f 100644 --- a/src/modules/cpu/kernel/exclusive_or.hpp +++ b/src/modules/cpu/kernel/exclusive_or.hpp @@ -45,7 +45,7 @@ RppStatus exclusive_or_u8_u8_host_tensor(Rpp8u *srcPtr1, Rpp32u numThreads = Handle.GetNumThreads(); omp_set_dynamic(0); -//#pragma omp parallel for num_threads(numThreads) +#pragma omp parallel for num_threads(numThreads) for(int batchCount = 0; batchCount < dstDescPtr->n; batchCount++) { RpptROI roi; From 5b06d4858b1d4384a85af90dd959a63df3e32f3f Mon Sep 17 00:00:00 2001 From: Srihari-mcw Date: Wed, 11 Sep 2024 06:28:16 +0530 Subject: [PATCH 07/38] Initial commit for I8 --- src/include/cpu/rpp_cpu_simd.hpp | 62 +++++++ src/modules/cpu/kernel/exclusive_or.hpp | 227 +++++++++++++++++++++++- 2 files changed, 285 insertions(+), 4 deletions(-) diff --git a/src/include/cpu/rpp_cpu_simd.hpp b/src/include/cpu/rpp_cpu_simd.hpp index a5aa12ee5..834543335 100644 --- a/src/include/cpu/rpp_cpu_simd.hpp +++ b/src/include/cpu/rpp_cpu_simd.hpp @@ -808,6 +808,30 @@ inline void rpp_load48_i8pkd3_to_u8pln3(Rpp8s *srcPtr, __m128i *px) px[2] = _mm_add_epi8(xmm_pxConvertI8, _mm_shuffle_epi8(_mm_unpacklo_epi8(pxSrc[6], pxSrc[7]), pxMaskRGB)); /* unpack 8 lo-pixels of pxSrc[6] and pxSrc[7] to get B01-16 and add 128 to get u8 from i8 */ } +inline void rpp_load96_i8pkd3_to_u8pln3(Rpp8s *srcPtr, __m256i *px) +{ + __m256i pxSrc[8]; + __m256i pxMask = _mm256_castsi128_si256(_mm_setr_epi8(0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11, 12, 13, 14, 15)); + pxMask = _mm256_permute2f128_si256(pxMask, pxMask, 0); + __m256i pxMaskRGB = _mm256_castsi128_si256(_mm_setr_epi8(0, 4, 8, 12, 2, 6, 10, 14, 1, 5, 9, 13, 3, 7, 11, 15)); + pxMaskRGB = _mm256_permute2f128_si256(pxMaskRGB, pxMaskRGB, 0); + pxSrc[0] = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i *)srcPtr)), _mm_loadu_si128((__m128i *)(srcPtr + 48)), 
1); /* load [R01|G01|B01|R02|G02|B02|R03|G03|B03|R04|G04|B04|R05|G05|B05|R06] - Need RGB 01-04 */ + pxSrc[1] = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(srcPtr + 12))), _mm_loadu_si128((__m128i *)(srcPtr + 60)), 1); /* load [R05|G05|B05|R06|G06|B06|R07|G07|B07|R08|G08|B08|R09|G09|B09|R10] - Need RGB 05-08 */ + pxSrc[2] = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(srcPtr + 24))), _mm_loadu_si128((__m128i *)(srcPtr + 72)), 1); /* load [R09|G09|B09|R10|G10|B10|R11|G11|B11|R12|G12|B12|R13|G13|B13|R14] - Need RGB 09-12 */ + pxSrc[3] = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(srcPtr + 36))), _mm_loadu_si128((__m128i *)(srcPtr + 84)), 1); /* load [R13|G13|B13|R14|G14|B14|R15|G15|B15|R16|G16|B16|R17|G17|B17|R18] - Need RGB 13-16 */ + pxSrc[4] = _mm256_shuffle_epi8(pxSrc[0], pxMask); /* shuffle to get [R01|R02|R03|R04|G01|G02|G03|G04 || B01|B02|B03|B04|R05|G05|B05|R06] - Need R01-04, G01-04, B01-04 */ + pxSrc[5] = _mm256_shuffle_epi8(pxSrc[1], pxMask); /* shuffle to get [R05|R06|R07|R08|G05|G06|G07|G08 || B05|B06|B07|B08|R09|G09|B09|R10] - Need R05-08, G05-08, B05-08 */ + pxSrc[6] = _mm256_shuffle_epi8(pxSrc[2], pxMask); /* shuffle to get [R09|R10|R11|R12|G09|G10|G11|G12 || B09|B10|B11|B12|R13|G13|B13|R14] - Need R09-12, G09-12, B09-12 */ + pxSrc[7] = _mm256_shuffle_epi8(pxSrc[3], pxMask); /* shuffle to get [R13|R14|R15|R16|G13|G14|G15|G16 || B13|B14|B15|B16|R17|G17|B17|R18] - Need R13-16, G13-16, B13-16 */ + pxSrc[0] = _mm256_unpacklo_epi8(pxSrc[4], pxSrc[5]); /* unpack 8 lo-pixels of pxSrc[0] and pxSrc[1] */ + pxSrc[1] = _mm256_unpacklo_epi8(pxSrc[6], pxSrc[7]); /* unpack 8 lo-pixels of pxSrc[2] and pxSrc[3] */ + pxSrc[2] = _mm256_unpackhi_epi8(pxSrc[4], pxSrc[5]); /* unpack 8 hi-pixels of pxSrc[0] and pxSrc[1] */ + pxSrc[3] = _mm256_unpackhi_epi8(pxSrc[6], pxSrc[7]); /* unpack 8 hi-pixels of pxSrc[2] and pxSrc[3] */ + px[0] = _mm256_add_epi8(avx_pxConvertI8, 
_mm256_shuffle_epi8(_mm256_unpacklo_epi8(pxSrc[0], pxSrc[1]), pxMaskRGB)); /* unpack 8 lo-pixels of pxSrc[4] and pxSrc[5] to get R01-16 */ + px[1] = _mm256_add_epi8(avx_pxConvertI8, _mm256_shuffle_epi8(_mm256_unpackhi_epi8(pxSrc[0], pxSrc[1]), pxMaskRGB)); /* unpack 8 hi-pixels of pxSrc[4] and pxSrc[5] to get G01-16 */ + px[2] = _mm256_add_epi8(avx_pxConvertI8, _mm256_shuffle_epi8(_mm256_unpacklo_epi8(pxSrc[2], pxSrc[3]), pxMaskRGB)); /* unpack 8 lo-pixels of pxSrc[6] and pxSrc[7] to get B01-16 */ +} + inline void rpp_store48_i8pln3_to_i8pln3(Rpp8s *dstPtrR, Rpp8s *dstPtrG, Rpp8s *dstPtrB, __m128i *px) { _mm_storeu_si128((__m128i *)dstPtrR, px[0]); /* store [R01|R02|R03|R04|R05|R06|R07|R08|R09|R10|R11|R12|R13|R14|R15|R16] */ @@ -822,6 +846,13 @@ inline void rpp_store48_u8pln3_to_i8pln3(Rpp8s *dstPtrR, Rpp8s *dstPtrG, Rpp8s * _mm_storeu_si128((__m128i *)dstPtrB, _mm_sub_epi8(px[2], xmm_pxConvertI8)); /* store [B01|B02|B03|B04|B05|B06|B07|B08|B09|B10|B11|B12|B13|B14|B15|B16] */ } +inline void rpp_store96_u8pln3_to_i8pln3(Rpp8s *dstPtrR, Rpp8s *dstPtrG, Rpp8s *dstPtrB, __m256i *px) +{ + _mm256_store_si256((__m256i *)dstPtrR, _mm256_sub_epi8(px[0], avx_pxConvertI8)); /* store [R01|R02|R03|R04|R05|R06|R07|R08|R09|R10|R11|R12|R13|R14|R15|R16] */ + _mm256_store_si256((__m256i *)dstPtrG, _mm256_sub_epi8(px[1], avx_pxConvertI8)); /* store [G01|G02|G03|G04|G05|G06|G07|G08|G09|G10|G11|G12|G13|G14|G15|G16] */ + _mm256_store_si256((__m256i *)dstPtrB, _mm256_sub_epi8(px[2], avx_pxConvertI8)); /* store [B01|B02|B03|B04|B05|B06|B07|B08|B09|B10|B11|B12|B13|B14|B15|B16] */ +} + inline void rpp_load48_i8pkd3_to_i32pln3_avx(Rpp8s *srcPtr, __m256i *p) { __m128i pxSrc[8]; @@ -862,6 +893,13 @@ inline void rpp_load48_i8pln3_to_u8pln3(Rpp8s *srcPtrR, Rpp8s *srcPtrG, Rpp8s *s px[2] = _mm_add_epi8(xmm_pxConvertI8, _mm_loadu_si128((__m128i *)srcPtrB)); /* load and convert to u8 [B01|B02|B03|B04|B05|B06|B07|B08|B09|B10|B11|B12|B13|B14|B15|B16] */ } +inline void 
rpp_load96_i8pln3_to_u8pln3(Rpp8s *srcPtrR, Rpp8s *srcPtrG, Rpp8s *srcPtrB, __m256i *px) +{ + px[0] = _mm256_add_epi8(avx_pxConvertI8, _mm256_loadu_si256((__m256i *)srcPtrR)); /* load and convert to u8 [R01|R02|R03|R04|R05|R06|R07|R08|R09|R10|R11|R12|R13|R14|R15|R16] */ + px[1] = _mm256_add_epi8(avx_pxConvertI8, _mm256_loadu_si256((__m256i *)srcPtrG)); /* load and convert to u8 [G01|G02|G03|G04|G05|G06|G07|G08|G09|G10|G11|G12|G13|G14|G15|G16] */ + px[2] = _mm256_add_epi8(avx_pxConvertI8, _mm256_loadu_si256((__m256i *)srcPtrB)); /* load and convert to u8 [B01|B02|B03|B04|B05|B06|B07|B08|B09|B10|B11|B12|B13|B14|B15|B16] */ +} + inline void rpp_store48_i8pln3_to_i8pkd3(Rpp8s *dstPtr, __m128i *px) { __m128i pxDst[4]; @@ -892,6 +930,30 @@ inline void rpp_store48_u8pln3_to_i8pkd3(Rpp8s *dstPtr, __m128i *px) _mm_storeu_si128((__m128i *)(dstPtr + 36), _mm_sub_epi8(_mm_shuffle_epi8(_mm_unpackhi_epi8(pxDst[3], pxDst[1]), pxMaskRGBAtoRGB), xmm_pxConvertI8)); /* store [R13|G13|B13|R14|G14|B14|R15|G15|B15|R16|G16|B16|00|00|00|00] */ } +inline void rpp_store96_u8pln3_to_i8pkd3(Rpp8s *dstPtr, __m256i *px) +{ + __m256i pxDst[8]; + __m256i pxZero = _mm256_setzero_si256(); + __m256i pxMaskRGBAtoRGB = _mm256_castsi128_si256(_mm_setr_epi8(0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 3, 7, 11, 15)); + pxMaskRGBAtoRGB = _mm256_permute2f128_si256(pxMaskRGBAtoRGB, pxMaskRGBAtoRGB, 0); + pxDst[0] = _mm256_unpacklo_epi8(px[1], pxZero); + pxDst[1] = _mm256_unpackhi_epi8(px[1], pxZero); + pxDst[2] = _mm256_unpacklo_epi8(px[0], px[2]); + pxDst[3] = _mm256_unpackhi_epi8(px[0], px[2]); + pxDst[4] = _mm256_sub_epi8(_mm256_shuffle_epi8(_mm256_unpacklo_epi8(pxDst[2], pxDst[0]), pxMaskRGBAtoRGB), avx_pxConvertI8); + pxDst[5] = _mm256_sub_epi8(_mm256_shuffle_epi8(_mm256_unpackhi_epi8(pxDst[2], pxDst[0]), pxMaskRGBAtoRGB), avx_pxConvertI8); + pxDst[6] = _mm256_sub_epi8(_mm256_shuffle_epi8(_mm256_unpacklo_epi8(pxDst[3], pxDst[1]), pxMaskRGBAtoRGB), avx_pxConvertI8); + pxDst[7] = 
_mm256_sub_epi8(_mm256_shuffle_epi8(_mm256_unpackhi_epi8(pxDst[3], pxDst[1]), pxMaskRGBAtoRGB), avx_pxConvertI8); + _mm_storeu_si128((__m128i *)dstPtr, _mm256_castsi256_si128(pxDst[4])); + _mm_storeu_si128((__m128i *)(dstPtr + 12), _mm256_castsi256_si128(pxDst[5])); + _mm_storeu_si128((__m128i *)(dstPtr + 24), _mm256_castsi256_si128(pxDst[6])); + _mm_storeu_si128((__m128i *)(dstPtr + 36), _mm256_castsi256_si128(pxDst[7])); + _mm_storeu_si128((__m128i *)(dstPtr + 48), _mm256_extractf128_si256(pxDst[4], 1)); + _mm_storeu_si128((__m128i *)(dstPtr + 60), _mm256_extractf128_si256(pxDst[5], 1)); + _mm_storeu_si128((__m128i *)(dstPtr + 72), _mm256_extractf128_si256(pxDst[6], 1)); + _mm_storeu_si128((__m128i *)(dstPtr + 84), _mm256_extractf128_si256(pxDst[7], 1)); +} + inline void rpp_load16_i8_to_f32(Rpp8s *srcPtr, __m128 *p) { __m128i px = _mm_loadu_si128((__m128i *)srcPtr); /* load pixels 0-15 */ diff --git a/src/modules/cpu/kernel/exclusive_or.hpp b/src/modules/cpu/kernel/exclusive_or.hpp index 35e780e7f..45b7904ba 100644 --- a/src/modules/cpu/kernel/exclusive_or.hpp +++ b/src/modules/cpu/kernel/exclusive_or.hpp @@ -68,7 +68,7 @@ RppStatus exclusive_or_u8_u8_host_tensor(Rpp8u *srcPtr1, Rpp32u vectorIncrement = 96; Rpp32u vectorIncrementPerChannel = 32; - // Bitwise OR with fused output-layout toggle (NHWC -> NCHW) + // Exclusive OR with fused output-layout toggle (NHWC -> NCHW) if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NCHW)) { Rpp8u *srcPtr1Row, *srcPtr2Row, *dstPtrRowR, *dstPtrRowG, *dstPtrRowB; @@ -124,7 +124,7 @@ RppStatus exclusive_or_u8_u8_host_tensor(Rpp8u *srcPtr1, } } - // Bitwise OR with fused output-layout toggle (NCHW -> NHWC) + // Exclusive OR with fused output-layout toggle (NCHW -> NHWC) else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NHWC)) { Rpp8u *srcPtr1RowR, *srcPtr1RowG, *srcPtr1RowB, *srcPtr2RowR, *srcPtr2RowG, 
*srcPtr2RowB, *dstPtrRow; @@ -193,7 +193,7 @@ RppStatus exclusive_or_u8_u8_host_tensor(Rpp8u *srcPtr1, } } - // Bitwise OR without fused output-layout toggle (NHWC -> NHWC or NCHW -> NCHW) + // Exclusive OR without fused output-layout toggle (NHWC -> NHWC or NCHW -> NCHW) else { alignedLength = bufferLength & ~31; @@ -743,4 +743,223 @@ RppStatus exclusive_or_f16_f16_host_tensor(Rpp16f *srcPtr1, } return RPP_SUCCESS; -} \ No newline at end of file +} + +RppStatus exclusive_or_i8_i8_host_tensor(Rpp8s *srcPtr1, + Rpp8s *srcPtr2, + RpptDescPtr srcDescPtr, + Rpp8s *dstPtr, + RpptDescPtr dstDescPtr, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + RppLayoutParams layoutParams, + rpp::Handle& Handle) +{ + RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h}; + Rpp32u numThreads = Handle.GetNumThreads(); + + omp_set_dynamic(0); +#pragma omp parallel for num_threads(numThreads) + for(int batchCount = 0; batchCount < dstDescPtr->n; batchCount++) + { + RpptROI roi; + RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount]; + compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType); + + Rpp8s *srcPtr1Image, *srcPtr2Image, *dstPtrImage; + srcPtr1Image = srcPtr1 + batchCount * srcDescPtr->strides.nStride; + srcPtr2Image = srcPtr2 + batchCount * srcDescPtr->strides.nStride; + dstPtrImage = dstPtr + batchCount * dstDescPtr->strides.nStride; + + Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier; + + Rpp8s *srcPtr1Channel, *srcPtr2Channel, *dstPtrChannel; + srcPtr1Channel = srcPtr1Image + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier); + srcPtr2Channel = srcPtr2Image + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier); + dstPtrChannel = dstPtrImage; + + Rpp32u alignedLength = (bufferLength / 96) * 96; + Rpp32u vectorIncrement = 96; + Rpp32u vectorIncrementPerChannel = 32; + + // Exclusive OR with fused output-layout 
toggle (NHWC -> NCHW) + if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NCHW)) + { + Rpp8s *srcPtr1Row, *srcPtr2Row, *dstPtrRowR, *dstPtrRowG, *dstPtrRowB; + srcPtr1Row = srcPtr1Channel; + srcPtr2Row = srcPtr2Channel; + dstPtrRowR = dstPtrChannel; + dstPtrRowG = dstPtrRowR + dstDescPtr->strides.cStride; + dstPtrRowB = dstPtrRowG + dstDescPtr->strides.cStride; + + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp8s *srcPtr1Temp, *srcPtr2Temp, *dstPtrTempR, *dstPtrTempG, *dstPtrTempB; + srcPtr1Temp = srcPtr1Row; + srcPtr2Temp = srcPtr2Row; + dstPtrTempR = dstPtrRowR; + dstPtrTempG = dstPtrRowG; + dstPtrTempB = dstPtrRowB; + + int vectorLoopCount = 0; + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + __m256i p1[3], p2[3]; + + rpp_simd_load(rpp_load96_i8pkd3_to_u8pln3, srcPtr1Temp, p1); // simd loads + rpp_simd_load(rpp_load96_i8pkd3_to_u8pln3, srcPtr2Temp, p2); // simd loads + p1[0] = _mm256_xor_si256(p1[0], p2[0]); // exclusive_or computation + p1[1] = _mm256_xor_si256(p1[1], p2[1]); // exclusive_or computation + p1[2] = _mm256_xor_si256(p1[2], p2[2]); // exclusive_or computation + rpp_simd_store(rpp_store96_u8pln3_to_i8pln3, dstPtrTempR, dstPtrTempG, dstPtrTempB, p1); // simd stores + + srcPtr1Temp += vectorIncrement; + srcPtr2Temp += vectorIncrement; + dstPtrTempR += vectorIncrementPerChannel; + dstPtrTempG += vectorIncrementPerChannel; + dstPtrTempB += vectorIncrementPerChannel; + } + + for (; vectorLoopCount < bufferLength; vectorLoopCount += 3) + { + *dstPtrTempR++ = static_cast(RPPPIXELCHECKI8(((srcPtr1Temp[0] + 128) ^ (srcPtr2Temp[0] + 128)) - 128)); + *dstPtrTempG++ = static_cast(RPPPIXELCHECKI8(((srcPtr1Temp[1] + 128) ^ (srcPtr2Temp[1] + 128)) - 128)); + *dstPtrTempB++ = static_cast(RPPPIXELCHECKI8(((srcPtr1Temp[2] + 128) ^ (srcPtr2Temp[2] + 128)) - 128)); + + srcPtr1Temp += 3; + srcPtr2Temp += 3; + } + + srcPtr1Row += srcDescPtr->strides.hStride; + srcPtr2Row 
+= srcDescPtr->strides.hStride; + dstPtrRowR += dstDescPtr->strides.hStride; + dstPtrRowG += dstDescPtr->strides.hStride; + dstPtrRowB += dstDescPtr->strides.hStride; + } + } + + // Exclusive OR with fused output-layout toggle (NCHW -> NHWC) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NHWC)) + { + Rpp8s *srcPtr1RowR, *srcPtr1RowG, *srcPtr1RowB, *srcPtr2RowR, *srcPtr2RowG, *srcPtr2RowB, *dstPtrRow; + srcPtr1RowR = srcPtr1Channel; + srcPtr1RowG = srcPtr1RowR + srcDescPtr->strides.cStride; + srcPtr1RowB = srcPtr1RowG + srcDescPtr->strides.cStride; + srcPtr2RowR = srcPtr2Channel; + srcPtr2RowG = srcPtr2RowR + srcDescPtr->strides.cStride; + srcPtr2RowB = srcPtr2RowG + srcDescPtr->strides.cStride; + dstPtrRow = dstPtrChannel; + + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp8s *srcPtr1TempR, *srcPtr1TempG, *srcPtr1TempB, *srcPtr2TempR, *srcPtr2TempG, *srcPtr2TempB, *dstPtrTemp; + srcPtr1TempR = srcPtr1RowR; + srcPtr1TempG = srcPtr1RowG; + srcPtr1TempB = srcPtr1RowB; + srcPtr2TempR = srcPtr2RowR; + srcPtr2TempG = srcPtr2RowG; + srcPtr2TempB = srcPtr2RowB; + dstPtrTemp = dstPtrRow; + + int vectorLoopCount = 0; + + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + __m256i p1[3], p2[3]; + + rpp_simd_load(rpp_load96_i8pln3_to_u8pln3, srcPtr1TempR, srcPtr1TempG, srcPtr1TempB, p1); // simd loads + rpp_simd_load(rpp_load96_i8pln3_to_u8pln3, srcPtr2TempR, srcPtr2TempG, srcPtr2TempB, p2); // simd loads + p1[0] = _mm256_xor_si256(p1[0], p2[0]); // exclusive_or computation + p1[1] = _mm256_xor_si256(p1[1], p2[1]); // exclusive_or computation + p1[2] = _mm256_xor_si256(p1[2], p2[2]); // exclusive_or computation + rpp_simd_store(rpp_store96_u8pln3_to_i8pkd3, dstPtrTemp, p1); // simd stores + + srcPtr1TempR += vectorIncrementPerChannel; + srcPtr1TempG += vectorIncrementPerChannel; + srcPtr1TempB += vectorIncrementPerChannel; + srcPtr2TempR += 
vectorIncrementPerChannel; + srcPtr2TempG += vectorIncrementPerChannel; + srcPtr2TempB += vectorIncrementPerChannel; + dstPtrTemp += vectorIncrement; + } + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + dstPtrTemp[0] = static_cast(RPPPIXELCHECKI8(((static_cast((*srcPtr1TempR + 128) ^ static_cast(*srcPtr2TempR + 128)))) - 128)); + dstPtrTemp[1] = static_cast(RPPPIXELCHECKI8(((static_cast((*srcPtr1TempG + 128) ^ static_cast(*srcPtr2TempG + 128)))) - 128)); + dstPtrTemp[2] = static_cast(RPPPIXELCHECKI8(((static_cast((*srcPtr1TempB + 128) ^ static_cast(*srcPtr2TempB + 128)))) - 128)); + + srcPtr1TempR++; + srcPtr1TempG++; + srcPtr1TempB++; + srcPtr2TempR++; + srcPtr2TempG++; + srcPtr2TempB++; + dstPtrTemp += 3; + } + + srcPtr1RowR += srcDescPtr->strides.hStride; + srcPtr1RowG += srcDescPtr->strides.hStride; + srcPtr1RowB += srcDescPtr->strides.hStride; + srcPtr2RowR += srcDescPtr->strides.hStride; + srcPtr2RowG += srcDescPtr->strides.hStride; + srcPtr2RowB += srcDescPtr->strides.hStride; + dstPtrRow += dstDescPtr->strides.hStride; + } + } + + // Exclusive OR without fused output-layout toggle (NHWC -> NHWC or NCHW -> NCHW) + else + { + alignedLength = bufferLength & ~31; + + for(int c = 0; c < layoutParams.channelParam; c++) + { + Rpp8s *srcPtr1Row, *srcPtr2Row, *dstPtrRow; + srcPtr1Row = srcPtr1Channel; + srcPtr2Row = srcPtr2Channel; + dstPtrRow = dstPtrChannel; + + for(int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp8s *srcPtr1Temp, *srcPtr2Temp, *dstPtrTemp; + srcPtr1Temp = srcPtr1Row; + srcPtr2Temp = srcPtr2Row; + dstPtrTemp = dstPtrRow; + + int vectorLoopCount = 0; + + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + __m256i p1, p2; + + p1 = _mm256_add_epi8(avx_pxConvertI8, _mm256_loadu_si256((__m256i *)srcPtr1Temp)); // simd loads + p2 = _mm256_add_epi8(avx_pxConvertI8, _mm256_loadu_si256((__m256i *)srcPtr2Temp)); // simd loads + p1 = _mm256_xor_si256(p1, p2); // exclusive_or computation + 
_mm256_storeu_si256((__m256i *)dstPtrTemp, _mm256_sub_epi8(p1, avx_pxConvertI8)); // simd stores + + srcPtr1Temp += vectorIncrementPerChannel; + srcPtr2Temp += vectorIncrementPerChannel; + dstPtrTemp += vectorIncrementPerChannel; + } + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + *dstPtrTemp++ = static_cast(RPPPIXELCHECKI8(((static_cast((*srcPtr1Temp + 128) ^ static_cast(*srcPtr2Temp + 128)))) - 128)); + + srcPtr1Temp++; + srcPtr2Temp++; + } + + srcPtr1Row += srcDescPtr->strides.hStride; + srcPtr2Row += srcDescPtr->strides.hStride; + dstPtrRow += dstDescPtr->strides.hStride; + } + + srcPtr1Channel += srcDescPtr->strides.cStride; + srcPtr2Channel += srcDescPtr->strides.cStride; + dstPtrChannel += dstDescPtr->strides.cStride; + } + } + } + + return RPP_SUCCESS; +} From 7f5df55e7388399e7b89b05e37d07ae2ff136098 Mon Sep 17 00:00:00 2001 From: Srihari-mcw Date: Wed, 11 Sep 2024 08:06:27 +0530 Subject: [PATCH 08/38] Add I8 case --- src/modules/rppt_tensor_logical_operations.cpp | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/modules/rppt_tensor_logical_operations.cpp b/src/modules/rppt_tensor_logical_operations.cpp index dda6e3cea..f1e905e7e 100644 --- a/src/modules/rppt_tensor_logical_operations.cpp +++ b/src/modules/rppt_tensor_logical_operations.cpp @@ -146,6 +146,18 @@ RppStatus rppt_exclusive_or_host(RppPtr_t srcPtr1, layoutParams, rpp::deref(rppHandle)); } + else if ((srcDescPtr->dataType == RpptDataType::I8) && (dstDescPtr->dataType == RpptDataType::I8)) + { + exclusive_or_i8_i8_host_tensor(static_cast(srcPtr1) + srcDescPtr->offsetInBytes, + static_cast(srcPtr2) + srcDescPtr->offsetInBytes, + srcDescPtr, + static_cast(dstPtr) + dstDescPtr->offsetInBytes, + dstDescPtr, + roiTensorPtrSrc, + roiType, + layoutParams, + rpp::deref(rppHandle)); + } return RPP_SUCCESS; } From 9fe0d11ffd68c7947beff478207d027b4314a53c Mon Sep 17 00:00:00 2001 From: Srihari-mcw Date: Tue, 17 Sep 2024 08:37:21 +0530 Subject: [PATCH 09/38] Fix issues with 
PKD3 to PLN3 i8 implementation --- src/include/cpu/rpp_cpu_simd.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/include/cpu/rpp_cpu_simd.hpp b/src/include/cpu/rpp_cpu_simd.hpp index 834543335..025b67ff4 100644 --- a/src/include/cpu/rpp_cpu_simd.hpp +++ b/src/include/cpu/rpp_cpu_simd.hpp @@ -848,9 +848,9 @@ inline void rpp_store48_u8pln3_to_i8pln3(Rpp8s *dstPtrR, Rpp8s *dstPtrG, Rpp8s * inline void rpp_store96_u8pln3_to_i8pln3(Rpp8s *dstPtrR, Rpp8s *dstPtrG, Rpp8s *dstPtrB, __m256i *px) { - _mm256_store_si256((__m256i *)dstPtrR, _mm256_sub_epi8(px[0], avx_pxConvertI8)); /* store [R01|R02|R03|R04|R05|R06|R07|R08|R09|R10|R11|R12|R13|R14|R15|R16] */ - _mm256_store_si256((__m256i *)dstPtrG, _mm256_sub_epi8(px[1], avx_pxConvertI8)); /* store [G01|G02|G03|G04|G05|G06|G07|G08|G09|G10|G11|G12|G13|G14|G15|G16] */ - _mm256_store_si256((__m256i *)dstPtrB, _mm256_sub_epi8(px[2], avx_pxConvertI8)); /* store [B01|B02|B03|B04|B05|B06|B07|B08|B09|B10|B11|B12|B13|B14|B15|B16] */ + _mm256_storeu_si256((__m256i *)dstPtrR, _mm256_sub_epi8(px[0], avx_pxConvertI8)); /* store [R01|R02|R03|R04|R05|R06|R07|R08|R09|R10|R11|R12|R13|R14|R15|R16] */ + _mm256_storeu_si256((__m256i *)dstPtrG, _mm256_sub_epi8(px[1], avx_pxConvertI8)); /* store [G01|G02|G03|G04|G05|G06|G07|G08|G09|G10|G11|G12|G13|G14|G15|G16] */ + _mm256_storeu_si256((__m256i *)dstPtrB, _mm256_sub_epi8(px[2], avx_pxConvertI8)); /* store [B01|B02|B03|B04|B05|B06|B07|B08|B09|B10|B11|B12|B13|B14|B15|B16] */ } inline void rpp_load48_i8pkd3_to_i32pln3_avx(Rpp8s *srcPtr, __m256i *p) From a83f3fad88982ddaaec8827dd6ee6e3c67d43900 Mon Sep 17 00:00:00 2001 From: Srihari-mcw Date: Tue, 17 Sep 2024 23:57:29 +0530 Subject: [PATCH 10/38] Initial updates based on self review --- include/rppt_tensor_logical_operations.h | 8 +- src/include/cpu/rpp_cpu_simd.hpp | 116 +++++++------- src/modules/cpu/kernel/exclusive_or.hpp | 110 ++++++------- src/modules/hip/kernel/exclusive_or.hpp | 58 +++---- 
.../rppt_tensor_logical_operations.cpp | 148 +++++++++--------- utilities/test_suite/common.py | 2 +- 6 files changed, 221 insertions(+), 221 deletions(-) diff --git a/include/rppt_tensor_logical_operations.h b/include/rppt_tensor_logical_operations.h index 8ade92dfe..03d1b17d9 100644 --- a/include/rppt_tensor_logical_operations.h +++ b/include/rppt_tensor_logical_operations.h @@ -88,12 +88,12 @@ RppStatus rppt_bitwise_and_gpu(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPtr s #ifdef GPU_SUPPORT /*! \brief Exclusive OR computation on HIP backend for a NCHW/NHWC layout tensor - * \details This function computes bitwise AND of corresponding pixels for a batch of RGB(3 channel) / greyscale(1 channel) images with an NHWC/NCHW tensor layout.
+ * \details This function computes exclusive OR of corresponding pixels for a batch of RGB(3 channel) / greyscale(1 channel) images with an NHWC/NCHW tensor layout.
* srcPtr depth ranges - Rpp8u (0 to 255), Rpp16f (0 to 1), Rpp32f (0 to 1), Rpp8s (-128 to 127). * dstPtr depth ranges - Will be same depth as srcPtr. * \image html img150x150.png Sample Input1 * \image html img150x150_2.png Sample Input2 - * \image html logical_operations_bitwise_and_img150x150.png Sample Output + * \image html logical_operations_exclusive_or_img150x150.png Sample Output * \param [in] srcPtr1 source1 tensor in HIP memory * \param [in] srcPtr2 source2 tensor in HIP memory * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) @@ -154,12 +154,12 @@ RppStatus rppt_bitwise_or_gpu(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPtr sr #endif // GPU_SUPPORT /*! \brief Exclusive OR computation on HOST backend for a NCHW/NHWC layout tensor - * \details This function computes bitwise AND of corresponding pixels for a batch of RGB(3 channel) / greyscale(1 channel) images with an NHWC/NCHW tensor layout.
+ * \details This function computes exclusive OR of corresponding pixels for a batch of RGB(3 channel) / greyscale(1 channel) images with an NHWC/NCHW tensor layout.
* srcPtr depth ranges - Rpp8u (0 to 255), Rpp16f (0 to 1), Rpp32f (0 to 1), Rpp8s (-128 to 127). * dstPtr depth ranges - Will be same depth as srcPtr. * \image html img150x150.png Sample Input1 * \image html img150x150_2.png Sample Input2 - * \image html logical_operations_bitwise_and_img150x150.png Sample Output + * \image html logical_operations_exclusive_or_img150x150.png Sample Output * \param [in] srcPtr1 source1 tensor in HOST memory * \param [in] srcPtr2 source2 tensor in HOST memory * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) diff --git a/src/include/cpu/rpp_cpu_simd.hpp b/src/include/cpu/rpp_cpu_simd.hpp index 025b67ff4..49ddc38e1 100644 --- a/src/include/cpu/rpp_cpu_simd.hpp +++ b/src/include/cpu/rpp_cpu_simd.hpp @@ -458,21 +458,21 @@ inline void rpp_load96_u8pkd3_to_u8pln3(Rpp8u *srcPtr, __m256i *px) pxMask = _mm256_permute2f128_si256(pxMask, pxMask, 0); __m256i pxMaskRGB = _mm256_castsi128_si256(_mm_setr_epi8(0, 4, 8, 12, 2, 6, 10, 14, 1, 5, 9, 13, 3, 7, 11, 15)); pxMaskRGB = _mm256_permute2f128_si256(pxMaskRGB, pxMaskRGB, 0); - pxSrc[0] = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i *)srcPtr)), _mm_loadu_si128((__m128i *)(srcPtr + 48)), 1); /* load [R01|G01|B01|R02|G02|B02|R03|G03|B03|R04|G04|B04|R05|G05|B05|R06] - Need RGB 01-04 */ - pxSrc[1] = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(srcPtr + 12))), _mm_loadu_si128((__m128i *)(srcPtr + 60)), 1); /* load [R05|G05|B05|R06|G06|B06|R07|G07|B07|R08|G08|B08|R09|G09|B09|R10] - Need RGB 05-08 */ - pxSrc[2] = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(srcPtr + 24))), _mm_loadu_si128((__m128i *)(srcPtr + 72)), 1); /* load [R09|G09|B09|R10|G10|B10|R11|G11|B11|R12|G12|B12|R13|G13|B13|R14] - Need RGB 09-12 */ - pxSrc[3] = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(srcPtr + 36))), 
_mm_loadu_si128((__m128i *)(srcPtr + 84)), 1); /* load [R13|G13|B13|R14|G14|B14|R15|G15|B15|R16|G16|B16|R17|G17|B17|R18] - Need RGB 13-16 */ - pxSrc[4] = _mm256_shuffle_epi8(pxSrc[0], pxMask); /* shuffle to get [R01|R02|R03|R04|G01|G02|G03|G04 || B01|B02|B03|B04|R05|G05|B05|R06] - Need R01-04, G01-04, B01-04 */ - pxSrc[5] = _mm256_shuffle_epi8(pxSrc[1], pxMask); /* shuffle to get [R05|R06|R07|R08|G05|G06|G07|G08 || B05|B06|B07|B08|R09|G09|B09|R10] - Need R05-08, G05-08, B05-08 */ - pxSrc[6] = _mm256_shuffle_epi8(pxSrc[2], pxMask); /* shuffle to get [R09|R10|R11|R12|G09|G10|G11|G12 || B09|B10|B11|B12|R13|G13|B13|R14] - Need R09-12, G09-12, B09-12 */ - pxSrc[7] = _mm256_shuffle_epi8(pxSrc[3], pxMask); /* shuffle to get [R13|R14|R15|R16|G13|G14|G15|G16 || B13|B14|B15|B16|R17|G17|B17|R18] - Need R13-16, G13-16, B13-16 */ - pxSrc[0] = _mm256_unpacklo_epi8(pxSrc[4], pxSrc[5]); /* unpack 8 lo-pixels of pxSrc[0] and pxSrc[1] */ - pxSrc[1] = _mm256_unpacklo_epi8(pxSrc[6], pxSrc[7]); /* unpack 8 lo-pixels of pxSrc[2] and pxSrc[3] */ - pxSrc[2] = _mm256_unpackhi_epi8(pxSrc[4], pxSrc[5]); /* unpack 8 hi-pixels of pxSrc[0] and pxSrc[1] */ - pxSrc[3] = _mm256_unpackhi_epi8(pxSrc[6], pxSrc[7]); /* unpack 8 hi-pixels of pxSrc[2] and pxSrc[3] */ - px[0] = _mm256_shuffle_epi8(_mm256_unpacklo_epi8(pxSrc[0], pxSrc[1]), pxMaskRGB); /* unpack 8 lo-pixels of pxSrc[4] and pxSrc[5] to get R01-16 */ - px[1] = _mm256_shuffle_epi8(_mm256_unpackhi_epi8(pxSrc[0], pxSrc[1]), pxMaskRGB); /* unpack 8 hi-pixels of pxSrc[4] and pxSrc[5] to get G01-16 */ - px[2] = _mm256_shuffle_epi8(_mm256_unpacklo_epi8(pxSrc[2], pxSrc[3]), pxMaskRGB); /* unpack 8 lo-pixels of pxSrc[6] and pxSrc[7] to get B01-16 */ + pxSrc[0] = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i *)srcPtr)), _mm_loadu_si128((__m128i *)(srcPtr + 48)), 1); /* load [R01|G01|B01|R02|G02|B02|R03|G03|B03|R04|G04|B04|R05|G05|B05|R06|R17|G17|B17|R18|G18|B18|R19|G19|B19|R20|G20|B20|R21|G21|B21|R22] - Need RGB 01-04, 17-20 
*/ + pxSrc[1] = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(srcPtr + 12))), _mm_loadu_si128((__m128i *)(srcPtr + 60)), 1); /* load [R05|G05|B05|R06|G06|B06|R07|G07|B07|R08|G08|B08|R09|G09|B09|R10|R21|G21|B21|R22|G22|B22|R23|G23|B23|R24|G24|B24|R25|G25|B25|R26] - Need RGB 05-08, 21-24 */ + pxSrc[2] = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(srcPtr + 24))), _mm_loadu_si128((__m128i *)(srcPtr + 72)), 1); /* load [R09|G09|B09|R10|G10|B10|R11|G11|B11|R12|G12|B12|R13|G13|B13|R14|R25|G25|B25|R26|G26|B26|R27|G27|B27|R28|G28|B28|R29|G29|B29|R30] - Need RGB 09-12, 25-28 */ + pxSrc[3] = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(srcPtr + 36))), _mm_loadu_si128((__m128i *)(srcPtr + 84)), 1); /* load [R13|G13|B13|R14|G14|B14|R15|G15|B15|R16|G16|B16|R17|G17|B17|R18|R29|G29|B29|R30|G30|B30|R31|G31|B31|R32|G32|B32|R33|G33|B33|R34] - Need RGB 13-16, 29-32 */ + pxSrc[4] = _mm256_shuffle_epi8(pxSrc[0], pxMask); /* shuffle to get [R01|R02|R03|R04|G01|G02|G03|G04 || B01|B02|B03|B04|R05|G05|B05|R06 || R17|R18|R19|R20|G17|G18|G19|G20 || B17|B18|B19|B20|R21|G21|B21|R22] - Need R01-04, G01-04, B01-04, R17-20, G17-20, B17-20 */ + pxSrc[5] = _mm256_shuffle_epi8(pxSrc[1], pxMask); /* shuffle to get [R05|R06|R07|R08|G05|G06|G07|G08 || B05|B06|B07|B08|R09|G09|B09|R10 || R21|R22|R23|R24|G21|G22|G23|G24 || B21|B22|B23|B24|R25|G25|B25|R26] - Need R05-08, G05-08, B05-08, R21-24, G21-24, B21-24 */ + pxSrc[6] = _mm256_shuffle_epi8(pxSrc[2], pxMask); /* shuffle to get [R09|R10|R11|R12|G09|G10|G11|G12 || B09|B10|B11|B12|R13|G13|B13|R14 || R25|R26|R27|R28|G25|G26|G27|G28 || B25|B26|B27|B28|R29|G29|B29|R30] - Need R09-12, G09-12, B09-12, R25-28, G25-28, B25-28 */ + pxSrc[7] = _mm256_shuffle_epi8(pxSrc[3], pxMask); /* shuffle to get [R13|R14|R15|R16|G13|G14|G15|G16 || B13|B14|B15|B16|R17|G17|B17|R18 || R29|R30|R31|R32|G29|G30|G31|G32 || B29|B30|B31|B32|R33|G33|B33|R34] - Need R13-16, G13-16, B13-16, R29-32, 
G29-32, B29-32 */ + pxSrc[0] = _mm256_unpacklo_epi8(pxSrc[4], pxSrc[5]); /* unpack 8 lo-pixels of pxSrc[4] and pxSrc[5] */ + pxSrc[1] = _mm256_unpacklo_epi8(pxSrc[6], pxSrc[7]); /* unpack 8 lo-pixels of pxSrc[6] and pxSrc[7] */ + pxSrc[2] = _mm256_unpackhi_epi8(pxSrc[4], pxSrc[5]); /* unpack 8 hi-pixels of pxSrc[4] and pxSrc[5] */ + pxSrc[3] = _mm256_unpackhi_epi8(pxSrc[6], pxSrc[7]); /* unpack 8 hi-pixels of pxSrc[6] and pxSrc[7] */ + px[0] = _mm256_shuffle_epi8(_mm256_unpacklo_epi8(pxSrc[0], pxSrc[1]), pxMaskRGB); /* unpack 8 lo-pixels of pxSrc[0] and pxSrc[1] to get R01-16 */ + px[1] = _mm256_shuffle_epi8(_mm256_unpackhi_epi8(pxSrc[0], pxSrc[1]), pxMaskRGB); /* unpack 8 hi-pixels of pxSrc[0] and pxSrc[1] to get G01-16 */ + px[2] = _mm256_shuffle_epi8(_mm256_unpacklo_epi8(pxSrc[2], pxSrc[3]), pxMaskRGB); /* unpack 8 lo-pixels of pxSrc[2] and pxSrc[3] to get B01-16 */ } inline void rpp_store48_u8pln3_to_u8pln3(Rpp8u *dstPtrR, Rpp8u *dstPtrG, Rpp8u *dstPtrB, __m128i *px) @@ -484,9 +484,9 @@ inline void rpp_store48_u8pln3_to_u8pln3(Rpp8u *dstPtrR, Rpp8u *dstPtrG, Rpp8u * inline void rpp_store96_u8pln3_to_u8pln3(Rpp8u *dstPtrR, Rpp8u *dstPtrG, Rpp8u *dstPtrB, __m256i *px) { - _mm256_storeu_si256((__m256i *)dstPtrR, px[0]); /* store [R01|R02|R03|R04|R05|R06|R07|R08|R09|R10|R11|R12|R13|R14|R15|R16] */ - _mm256_storeu_si256((__m256i *)dstPtrG, px[1]); /* store [G01|G02|G03|G04|G05|G06|G07|G08|G09|G10|G11|G12|G13|G14|G15|G16] */ - _mm256_storeu_si256((__m256i *)dstPtrB, px[2]); /* store [B01|B02|B03|B04|B05|B06|B07|B08|B09|B10|B11|B12|B13|B14|B15|B16] */ + _mm256_storeu_si256((__m256i *)dstPtrR, px[0]); /* store [R01|R02|R03|R04|R05|R06|R07|R08|R09|R10|R11|R12|R13|R14|R15|R16|R17|R18|R19|R20|R21|R22|R23|R24|R25|R26|R27|R28|R29|R30|R31|R32] */ + _mm256_storeu_si256((__m256i *)dstPtrG, px[1]); /* store [G01|G02|G03|G04|G05|G06|G07|G08|G09|G10|G11|G12|G13|G14|G15|G16|G17|G18|G19|G20|G21|G22|G23|G24|G25|G26|G27|G28|G29|G30|G31|G32] */ + _mm256_storeu_si256((__m256i 
*)dstPtrB, px[2]); /* store [B01|B02|B03|B04|B05|B06|B07|B08|B09|B10|B11|B12|B13|B14|B15|B16|B17|B18|B19|B20|B21|B22|B23|B24|B25|B26|B27|B28|B29|B30|B31|B32] */ } inline void rpp_load48_u8pln3_to_u8pln3(Rpp8u *srcPtrR, Rpp8u *srcPtrG, Rpp8u *srcPtrB, __m128i *px) @@ -498,9 +498,9 @@ inline void rpp_load48_u8pln3_to_u8pln3(Rpp8u *srcPtrR, Rpp8u *srcPtrG, Rpp8u *s inline void rpp_load96_u8pln3_to_u8pln3(Rpp8u *srcPtrR, Rpp8u *srcPtrG, Rpp8u *srcPtrB, __m256i *px) { - px[0] = _mm256_loadu_si256((__m256i *)srcPtrR); /* load [R01|R02|R03|R04|R05|R06|R07|R08|R09|R10|R11|R12|R13|R14|R15|R16] */ - px[1] = _mm256_loadu_si256((__m256i *)srcPtrG); /* load [G01|G02|G03|G04|G05|G06|G07|G08|G09|G10|G11|G12|G13|G14|G15|G16] */ - px[2] = _mm256_loadu_si256((__m256i *)srcPtrB); /* load [B01|B02|B03|B04|B05|B06|B07|B08|B09|B10|B11|B12|B13|B14|B15|B16] */ + px[0] = _mm256_loadu_si256((__m256i *)srcPtrR); /* load [R01|R02|R03|R04|R05|R06|R07|R08|R09|R10|R11|R12|R13|R14|R15|R16|R17|R18|R19|R20|R21|R22|R23|R24|R25|R26|R27|R28|R29|R30|R31|R32] */ + px[1] = _mm256_loadu_si256((__m256i *)srcPtrG); /* load [G01|G02|G03|G04|G05|G06|G07|G08|G09|G10|G11|G12|G13|G14|G15|G16|G17|G18|G19|G20|G21|G22|G23|G24|G25|G26|G27|G28|G29|G30|G31|G32] */ + px[2] = _mm256_loadu_si256((__m256i *)srcPtrB); /* load [B01|B02|B03|B04|B05|B06|B07|B08|B09|B10|B11|B12|B13|B14|B15|B16|B17|B18|B19|B20|B21|B22|B23|B24|B25|B26|B27|B28|B29|B30|B31|B32] */ } inline void rpp_store48_u8pln3_to_u8pkd3(Rpp8u *dstPtr, __m128i *px) @@ -532,14 +532,14 @@ inline void rpp_store96_u8pln3_to_u8pkd3(Rpp8u *dstPtr, __m256i *px) pxDst[5] = _mm256_shuffle_epi8(_mm256_unpackhi_epi8(pxDst[2], pxDst[0]), pxMaskRGBAtoRGB); pxDst[6] = _mm256_shuffle_epi8(_mm256_unpacklo_epi8(pxDst[3], pxDst[1]), pxMaskRGBAtoRGB); pxDst[7] = _mm256_shuffle_epi8(_mm256_unpackhi_epi8(pxDst[3], pxDst[1]), pxMaskRGBAtoRGB); - _mm_storeu_si128((__m128i *)dstPtr, _mm256_castsi256_si128(pxDst[4])); - _mm_storeu_si128((__m128i *)(dstPtr + 12), 
_mm256_castsi256_si128(pxDst[5])); - _mm_storeu_si128((__m128i *)(dstPtr + 24), _mm256_castsi256_si128(pxDst[6])); - _mm_storeu_si128((__m128i *)(dstPtr + 36), _mm256_castsi256_si128(pxDst[7])); - _mm_storeu_si128((__m128i *)(dstPtr + 48), _mm256_extractf128_si256(pxDst[4], 1)); - _mm_storeu_si128((__m128i *)(dstPtr + 60), _mm256_extractf128_si256(pxDst[5], 1)); - _mm_storeu_si128((__m128i *)(dstPtr + 72), _mm256_extractf128_si256(pxDst[6], 1)); - _mm_storeu_si128((__m128i *)(dstPtr + 84), _mm256_extractf128_si256(pxDst[7], 1)); + _mm_storeu_si128((__m128i *)dstPtr, _mm256_castsi256_si128(pxDst[4])); /* store [R01|G01|B01|R02|G02|B02|R03|G03|B03|R04|G04|B04|00|00|00|00] */ + _mm_storeu_si128((__m128i *)(dstPtr + 12), _mm256_castsi256_si128(pxDst[5])); /* store [R05|G05|B05|R06|G06|B06|R07|G07|B07|R08|G08|B08|00|00|00|00] */ + _mm_storeu_si128((__m128i *)(dstPtr + 24), _mm256_castsi256_si128(pxDst[6])); /* store [R09|G09|B09|R10|G10|B10|R11|G11|B11|R12|G12|B12|00|00|00|00] */ + _mm_storeu_si128((__m128i *)(dstPtr + 36), _mm256_castsi256_si128(pxDst[7])); /* store [R13|G13|B13|R14|G14|B14|R15|G15|B15|R16|G16|B16|00|00|00|00] */ + _mm_storeu_si128((__m128i *)(dstPtr + 48), _mm256_extractf128_si256(pxDst[4], 1)); /* store [R17|G17|B17|R18|G18|B18|R19|G19|B19|R20|G20|B20|00|00|00|00] */ + _mm_storeu_si128((__m128i *)(dstPtr + 60), _mm256_extractf128_si256(pxDst[5], 1)); /* store [R21|G21|B21|R22|G22|B22|R23|G23|B23|R24|G24|B24|00|00|00|00] */ + _mm_storeu_si128((__m128i *)(dstPtr + 72), _mm256_extractf128_si256(pxDst[6], 1)); /* store [R25|G25|B25|R26|G26|B26|R27|G27|B27|R28|G28|B28|00|00|00|00] */ + _mm_storeu_si128((__m128i *)(dstPtr + 84), _mm256_extractf128_si256(pxDst[7], 1)); /* store [R29|G29|B29|R30|G30|B30|R31|G31|B31|R32|G32|B32|00|00|00|00] */ } inline void rpp_load16_u8_to_f32(Rpp8u *srcPtr, __m128 *p) @@ -815,21 +815,21 @@ inline void rpp_load96_i8pkd3_to_u8pln3(Rpp8s *srcPtr, __m256i *px) pxMask = _mm256_permute2f128_si256(pxMask, pxMask, 0); __m256i 
pxMaskRGB = _mm256_castsi128_si256(_mm_setr_epi8(0, 4, 8, 12, 2, 6, 10, 14, 1, 5, 9, 13, 3, 7, 11, 15)); pxMaskRGB = _mm256_permute2f128_si256(pxMaskRGB, pxMaskRGB, 0); - pxSrc[0] = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i *)srcPtr)), _mm_loadu_si128((__m128i *)(srcPtr + 48)), 1); /* load [R01|G01|B01|R02|G02|B02|R03|G03|B03|R04|G04|B04|R05|G05|B05|R06] - Need RGB 01-04 */ - pxSrc[1] = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(srcPtr + 12))), _mm_loadu_si128((__m128i *)(srcPtr + 60)), 1); /* load [R05|G05|B05|R06|G06|B06|R07|G07|B07|R08|G08|B08|R09|G09|B09|R10] - Need RGB 05-08 */ - pxSrc[2] = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(srcPtr + 24))), _mm_loadu_si128((__m128i *)(srcPtr + 72)), 1); /* load [R09|G09|B09|R10|G10|B10|R11|G11|B11|R12|G12|B12|R13|G13|B13|R14] - Need RGB 09-12 */ - pxSrc[3] = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(srcPtr + 36))), _mm_loadu_si128((__m128i *)(srcPtr + 84)), 1); /* load [R13|G13|B13|R14|G14|B14|R15|G15|B15|R16|G16|B16|R17|G17|B17|R18] - Need RGB 13-16 */ - pxSrc[4] = _mm256_shuffle_epi8(pxSrc[0], pxMask); /* shuffle to get [R01|R02|R03|R04|G01|G02|G03|G04 || B01|B02|B03|B04|R05|G05|B05|R06] - Need R01-04, G01-04, B01-04 */ - pxSrc[5] = _mm256_shuffle_epi8(pxSrc[1], pxMask); /* shuffle to get [R05|R06|R07|R08|G05|G06|G07|G08 || B05|B06|B07|B08|R09|G09|B09|R10] - Need R05-08, G05-08, B05-08 */ - pxSrc[6] = _mm256_shuffle_epi8(pxSrc[2], pxMask); /* shuffle to get [R09|R10|R11|R12|G09|G10|G11|G12 || B09|B10|B11|B12|R13|G13|B13|R14] - Need R09-12, G09-12, B09-12 */ - pxSrc[7] = _mm256_shuffle_epi8(pxSrc[3], pxMask); /* shuffle to get [R13|R14|R15|R16|G13|G14|G15|G16 || B13|B14|B15|B16|R17|G17|B17|R18] - Need R13-16, G13-16, B13-16 */ - pxSrc[0] = _mm256_unpacklo_epi8(pxSrc[4], pxSrc[5]); /* unpack 8 lo-pixels of pxSrc[0] and pxSrc[1] */ - pxSrc[1] = _mm256_unpacklo_epi8(pxSrc[6], pxSrc[7]); /* unpack 
8 lo-pixels of pxSrc[2] and pxSrc[3] */ - pxSrc[2] = _mm256_unpackhi_epi8(pxSrc[4], pxSrc[5]); /* unpack 8 hi-pixels of pxSrc[0] and pxSrc[1] */ - pxSrc[3] = _mm256_unpackhi_epi8(pxSrc[6], pxSrc[7]); /* unpack 8 hi-pixels of pxSrc[2] and pxSrc[3] */ - px[0] = _mm256_add_epi8(avx_pxConvertI8, _mm256_shuffle_epi8(_mm256_unpacklo_epi8(pxSrc[0], pxSrc[1]), pxMaskRGB)); /* unpack 8 lo-pixels of pxSrc[4] and pxSrc[5] to get R01-16 */ - px[1] = _mm256_add_epi8(avx_pxConvertI8, _mm256_shuffle_epi8(_mm256_unpackhi_epi8(pxSrc[0], pxSrc[1]), pxMaskRGB)); /* unpack 8 hi-pixels of pxSrc[4] and pxSrc[5] to get G01-16 */ - px[2] = _mm256_add_epi8(avx_pxConvertI8, _mm256_shuffle_epi8(_mm256_unpacklo_epi8(pxSrc[2], pxSrc[3]), pxMaskRGB)); /* unpack 8 lo-pixels of pxSrc[6] and pxSrc[7] to get B01-16 */ + pxSrc[0] = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i *)srcPtr)), _mm_loadu_si128((__m128i *)(srcPtr + 48)), 1); /* load [R01|G01|B01|R02|G02|B02|R03|G03|B03|R04|G04|B04|R05|G05|B05|R06|R17|G17|B17|R18|G18|B18|R19|G19|B19|R20|G20|B20|R21|G21|B21|R22] - Need RGB 01-04, 17-20 */ + pxSrc[1] = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(srcPtr + 12))), _mm_loadu_si128((__m128i *)(srcPtr + 60)), 1); /* load [R05|G05|B05|R06|G06|B06|R07|G07|B07|R08|G08|B08|R09|G09|B09|R10|R21|G21|B21|R22|G22|B22|R23|G23|B23|R24|G24|B24|R25|G25|B25|R26] - Need RGB 05-08, 21-24 */ + pxSrc[2] = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(srcPtr + 24))), _mm_loadu_si128((__m128i *)(srcPtr + 72)), 1); /* load [R09|G09|B09|R10|G10|B10|R11|G11|B11|R12|G12|B12|R13|G13|B13|R14|R25|G25|B25|R26|G26|B26|R27|G27|B27|R28|G28|B28|R29|G29|B29|R30] - Need RGB 09-12, 25-28 */ + pxSrc[3] = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(srcPtr + 36))), _mm_loadu_si128((__m128i *)(srcPtr + 84)), 1); /* load 
[R13|G13|B13|R14|G14|B14|R15|G15|B15|R16|G16|B16|R17|G17|B17|R18|R29|G29|B29|R30|G30|B30|R31|G31|B31|R32|G32|B32|R33|G33|B33|R34] - Need RGB 13-16, 29-32 */ + pxSrc[4] = _mm256_shuffle_epi8(pxSrc[0], pxMask); /* shuffle to get [R01|R02|R03|R04|G01|G02|G03|G04 || B01|B02|B03|B04|R05|G05|B05|R06 || R17|R18|R19|R20|G17|G18|G19|G20 || B17|B18|B19|B20|R21|G21|B21|R22] - Need R01-04, G01-04, B01-04, R17-20, G17-20, B17-20 */ + pxSrc[5] = _mm256_shuffle_epi8(pxSrc[1], pxMask); /* shuffle to get [R05|R06|R07|R08|G05|G06|G07|G08 || B05|B06|B07|B08|R09|G09|B09|R10 || R21|R22|R23|R24|G21|G22|G23|G24 || B21|B22|B23|B24|R25|G25|B25|R26] - Need R05-08, G05-08, B05-08, R21-24, G21-24, B21-24 */ + pxSrc[6] = _mm256_shuffle_epi8(pxSrc[2], pxMask); /* shuffle to get [R09|R10|R11|R12|G09|G10|G11|G12 || B09|B10|B11|B12|R13|G13|B13|R14 || R25|R26|R27|R28|G25|G26|G27|G28 || B25|B26|B27|B28|R29|G29|B29|R30] - Need R09-12, G09-12, B09-12, R25-28, G25-28, B25-28 */ + pxSrc[7] = _mm256_shuffle_epi8(pxSrc[3], pxMask); /* shuffle to get [R13|R14|R15|R16|G13|G14|G15|G16 || B13|B14|B15|B16|R17|G17|B17|R18 || R29|R30|R31|R32|G29|G30|G31|G32 || B29|B30|B31|B32|R33|G33|B33|R34] - Need R13-16, G13-16, B13-16, R29-32, G29-32, B29-32 */ + pxSrc[0] = _mm256_unpacklo_epi8(pxSrc[4], pxSrc[5]); /* unpack 8 lo-pixels of pxSrc[4] and pxSrc[5] */ + pxSrc[1] = _mm256_unpacklo_epi8(pxSrc[6], pxSrc[7]); /* unpack 8 lo-pixels of pxSrc[6] and pxSrc[7] */ + pxSrc[2] = _mm256_unpackhi_epi8(pxSrc[4], pxSrc[5]); /* unpack 8 hi-pixels of pxSrc[4] and pxSrc[5] */ + pxSrc[3] = _mm256_unpackhi_epi8(pxSrc[6], pxSrc[7]); /* unpack 8 hi-pixels of pxSrc[6] and pxSrc[7] */ + px[0] = _mm256_add_epi8(avx_pxConvertI8, _mm256_shuffle_epi8(_mm256_unpacklo_epi8(pxSrc[0], pxSrc[1]), pxMaskRGB)); /* unpack 8 lo-pixels of pxSrc[0] and pxSrc[1] to get R01-16 */ + px[1] = _mm256_add_epi8(avx_pxConvertI8, _mm256_shuffle_epi8(_mm256_unpackhi_epi8(pxSrc[0], pxSrc[1]), pxMaskRGB)); /* unpack 8 hi-pixels of pxSrc[0] and pxSrc[1] to get 
G01-16 */ + px[2] = _mm256_add_epi8(avx_pxConvertI8, _mm256_shuffle_epi8(_mm256_unpacklo_epi8(pxSrc[2], pxSrc[3]), pxMaskRGB)); /* unpack 8 lo-pixels of pxSrc[2] and pxSrc[3] to get B01-16 */ } inline void rpp_store48_i8pln3_to_i8pln3(Rpp8s *dstPtrR, Rpp8s *dstPtrG, Rpp8s *dstPtrB, __m128i *px) @@ -848,9 +848,9 @@ inline void rpp_store48_u8pln3_to_i8pln3(Rpp8s *dstPtrR, Rpp8s *dstPtrG, Rpp8s * inline void rpp_store96_u8pln3_to_i8pln3(Rpp8s *dstPtrR, Rpp8s *dstPtrG, Rpp8s *dstPtrB, __m256i *px) { - _mm256_storeu_si256((__m256i *)dstPtrR, _mm256_sub_epi8(px[0], avx_pxConvertI8)); /* store [R01|R02|R03|R04|R05|R06|R07|R08|R09|R10|R11|R12|R13|R14|R15|R16] */ - _mm256_storeu_si256((__m256i *)dstPtrG, _mm256_sub_epi8(px[1], avx_pxConvertI8)); /* store [G01|G02|G03|G04|G05|G06|G07|G08|G09|G10|G11|G12|G13|G14|G15|G16] */ - _mm256_storeu_si256((__m256i *)dstPtrB, _mm256_sub_epi8(px[2], avx_pxConvertI8)); /* store [B01|B02|B03|B04|B05|B06|B07|B08|B09|B10|B11|B12|B13|B14|B15|B16] */ + _mm256_storeu_si256((__m256i *)dstPtrR, _mm256_sub_epi8(px[0], avx_pxConvertI8)); /* store [R01|R02|R03|R04|R05|R06|R07|R08|R09|R10|R11|R12|R13|R14|R15|R16|R17|R18|R19|R20|R21|R22|R23|R24|R25|R26|R27|R28|R29|R30|R31|R32] */ + _mm256_storeu_si256((__m256i *)dstPtrG, _mm256_sub_epi8(px[1], avx_pxConvertI8)); /* store [G01|G02|G03|G04|G05|G06|G07|G08|G09|G10|G11|G12|G13|G14|G15|G16|G17|G18|G19|G20|G21|G22|G23|G24|G25|G26|G27|G28|G29|G30|G31|G32] */ + _mm256_storeu_si256((__m256i *)dstPtrB, _mm256_sub_epi8(px[2], avx_pxConvertI8)); /* store [B01|B02|B03|B04|B05|B06|B07|B08|B09|B10|B11|B12|B13|B14|B15|B16|B17|B18|B19|B20|B21|B22|B23|B24|B25|B26|B27|B28|B29|B30|B31|B32] */ } inline void rpp_load48_i8pkd3_to_i32pln3_avx(Rpp8s *srcPtr, __m256i *p) @@ -895,9 +895,9 @@ inline void rpp_load48_i8pln3_to_u8pln3(Rpp8s *srcPtrR, Rpp8s *srcPtrG, Rpp8s *s inline void rpp_load96_i8pln3_to_u8pln3(Rpp8s *srcPtrR, Rpp8s *srcPtrG, Rpp8s *srcPtrB, __m256i *px) { - px[0] = _mm256_add_epi8(avx_pxConvertI8, 
_mm256_loadu_si256((__m256i *)srcPtrR)); /* load and convert to u8 [R01|R02|R03|R04|R05|R06|R07|R08|R09|R10|R11|R12|R13|R14|R15|R16] */ - px[1] = _mm256_add_epi8(avx_pxConvertI8, _mm256_loadu_si256((__m256i *)srcPtrG)); /* load and convert to u8 [G01|G02|G03|G04|G05|G06|G07|G08|G09|G10|G11|G12|G13|G14|G15|G16] */ - px[2] = _mm256_add_epi8(avx_pxConvertI8, _mm256_loadu_si256((__m256i *)srcPtrB)); /* load and convert to u8 [B01|B02|B03|B04|B05|B06|B07|B08|B09|B10|B11|B12|B13|B14|B15|B16] */ + px[0] = _mm256_add_epi8(avx_pxConvertI8, _mm256_loadu_si256((__m256i *)srcPtrR)); /* load and convert to u8 [R01|R02|R03|R04|R05|R06|R07|R08|R09|R10|R11|R12|R13|R14|R15|R16|R17|R18|R19|R20|R21|R22|R23|R24|R25|R26|R27|R28|R29|R30|R31|R32] */ + px[1] = _mm256_add_epi8(avx_pxConvertI8, _mm256_loadu_si256((__m256i *)srcPtrG)); /* load and convert to u8 [G01|G02|G03|G04|G05|G06|G07|G08|G09|G10|G11|G12|G13|G14|G15|G16|G17|G18|G19|G20|G21|G22|G23|G24|G25|G26|G27|G28|G29|G30|G31|G32] */ + px[2] = _mm256_add_epi8(avx_pxConvertI8, _mm256_loadu_si256((__m256i *)srcPtrB)); /* load and convert to u8 [B01|B02|B03|B04|B05|B06|B07|B08|B09|B10|B11|B12|B13|B14|B15|B16|B17|B18|B19|B20|B21|B22|B23|B24|B25|B26|B27|B28|B29|B30|B31|B32] */ } inline void rpp_store48_i8pln3_to_i8pkd3(Rpp8s *dstPtr, __m128i *px) @@ -944,14 +944,14 @@ inline void rpp_store96_u8pln3_to_i8pkd3(Rpp8s *dstPtr, __m256i *px) pxDst[5] = _mm256_sub_epi8(_mm256_shuffle_epi8(_mm256_unpackhi_epi8(pxDst[2], pxDst[0]), pxMaskRGBAtoRGB), avx_pxConvertI8); pxDst[6] = _mm256_sub_epi8(_mm256_shuffle_epi8(_mm256_unpacklo_epi8(pxDst[3], pxDst[1]), pxMaskRGBAtoRGB), avx_pxConvertI8); pxDst[7] = _mm256_sub_epi8(_mm256_shuffle_epi8(_mm256_unpackhi_epi8(pxDst[3], pxDst[1]), pxMaskRGBAtoRGB), avx_pxConvertI8); - _mm_storeu_si128((__m128i *)dstPtr, _mm256_castsi256_si128(pxDst[4])); - _mm_storeu_si128((__m128i *)(dstPtr + 12), _mm256_castsi256_si128(pxDst[5])); - _mm_storeu_si128((__m128i *)(dstPtr + 24), _mm256_castsi256_si128(pxDst[6])); - 
_mm_storeu_si128((__m128i *)(dstPtr + 36), _mm256_castsi256_si128(pxDst[7])); - _mm_storeu_si128((__m128i *)(dstPtr + 48), _mm256_extractf128_si256(pxDst[4], 1)); - _mm_storeu_si128((__m128i *)(dstPtr + 60), _mm256_extractf128_si256(pxDst[5], 1)); - _mm_storeu_si128((__m128i *)(dstPtr + 72), _mm256_extractf128_si256(pxDst[6], 1)); - _mm_storeu_si128((__m128i *)(dstPtr + 84), _mm256_extractf128_si256(pxDst[7], 1)); + _mm_storeu_si128((__m128i *)dstPtr, _mm256_castsi256_si128(pxDst[4])); /* store [R01|G01|B01|R02|G02|B02|R03|G03|B03|R04|G04|B04|00|00|00|00] */ + _mm_storeu_si128((__m128i *)(dstPtr + 12), _mm256_castsi256_si128(pxDst[5])); /* store [R05|G05|B05|R06|G06|B06|R07|G07|B07|R08|G08|B08|00|00|00|00] */ + _mm_storeu_si128((__m128i *)(dstPtr + 24), _mm256_castsi256_si128(pxDst[6])); /* store [R09|G09|B09|R10|G10|B10|R11|G11|B11|R12|G12|B12|00|00|00|00] */ + _mm_storeu_si128((__m128i *)(dstPtr + 36), _mm256_castsi256_si128(pxDst[7])); /* store [R13|G13|B13|R14|G14|B14|R15|G15|B15|R16|G16|B16|00|00|00|00] */ + _mm_storeu_si128((__m128i *)(dstPtr + 48), _mm256_extractf128_si256(pxDst[4], 1)); /* store [R17|G17|B17|R18|G18|B18|R19|G19|B19|R20|G20|B20|00|00|00|00] */ + _mm_storeu_si128((__m128i *)(dstPtr + 60), _mm256_extractf128_si256(pxDst[5], 1)); /* store [R21|G21|B21|R22|G22|B22|R23|G23|B23|R24|G24|B24|00|00|00|00] */ + _mm_storeu_si128((__m128i *)(dstPtr + 72), _mm256_extractf128_si256(pxDst[6], 1)); /* store [R25|G25|B25|R26|G26|B26|R27|G27|B27|R28|G28|B28|00|00|00|00] */ + _mm_storeu_si128((__m128i *)(dstPtr + 84), _mm256_extractf128_si256(pxDst[7], 1)); /* store [R29|G29|B29|R30|G30|B30|R31|G31|B31|R32|G32|B32|00|00|00|00] */ } inline void rpp_load16_i8_to_f32(Rpp8s *srcPtr, __m128 *p) diff --git a/src/modules/cpu/kernel/exclusive_or.hpp b/src/modules/cpu/kernel/exclusive_or.hpp index 45b7904ba..419530fbe 100644 --- a/src/modules/cpu/kernel/exclusive_or.hpp +++ b/src/modules/cpu/kernel/exclusive_or.hpp @@ -32,21 +32,21 @@ SOFTWARE. 
The bitwise operation is applied to the char representation of the raw floating-point data in memory */ RppStatus exclusive_or_u8_u8_host_tensor(Rpp8u *srcPtr1, - Rpp8u *srcPtr2, - RpptDescPtr srcDescPtr, - Rpp8u *dstPtr, - RpptDescPtr dstDescPtr, - RpptROIPtr roiTensorPtrSrc, - RpptRoiType roiType, - RppLayoutParams layoutParams, - rpp::Handle& Handle) + Rpp8u *srcPtr2, + RpptDescPtr srcDescPtr, + Rpp8u *dstPtr, + RpptDescPtr dstDescPtr, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + RppLayoutParams layoutParams, + rpp::Handle& Handle) { RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h}; Rpp32u numThreads = Handle.GetNumThreads(); omp_set_dynamic(0); #pragma omp parallel for num_threads(numThreads) - for(int batchCount = 0; batchCount < dstDescPtr->n; batchCount++) + for (int batchCount = 0; batchCount < dstDescPtr->n; batchCount++) { RpptROI roi; RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount]; @@ -78,7 +78,7 @@ RppStatus exclusive_or_u8_u8_host_tensor(Rpp8u *srcPtr1, dstPtrRowG = dstPtrRowR + dstDescPtr->strides.cStride; dstPtrRowB = dstPtrRowG + dstDescPtr->strides.cStride; - for(int i = 0; i < roi.xywhROI.roiHeight; i++) + for (int i = 0; i < roi.xywhROI.roiHeight; i++) { Rpp8u *srcPtr1Temp, *srcPtr2Temp, *dstPtrTempR, *dstPtrTempG, *dstPtrTempB; srcPtr1Temp = srcPtr1Row; @@ -136,7 +136,7 @@ RppStatus exclusive_or_u8_u8_host_tensor(Rpp8u *srcPtr1, srcPtr2RowB = srcPtr2RowG + srcDescPtr->strides.cStride; dstPtrRow = dstPtrChannel; - for(int i = 0; i < roi.xywhROI.roiHeight; i++) + for (int i = 0; i < roi.xywhROI.roiHeight; i++) { Rpp8u *srcPtr1TempR, *srcPtr1TempG, *srcPtr1TempB, *srcPtr2TempR, *srcPtr2TempG, *srcPtr2TempB, *dstPtrTemp; srcPtr1TempR = srcPtr1RowR; @@ -198,14 +198,14 @@ RppStatus exclusive_or_u8_u8_host_tensor(Rpp8u *srcPtr1, { alignedLength = bufferLength & ~31; - for(int c = 0; c < layoutParams.channelParam; c++) + for (int c = 0; c < layoutParams.channelParam; c++) { Rpp8u *srcPtr1Row, *srcPtr2Row, 
*dstPtrRow; srcPtr1Row = srcPtr1Channel; srcPtr2Row = srcPtr2Channel; dstPtrRow = dstPtrChannel; - for(int i = 0; i < roi.xywhROI.roiHeight; i++) + for (int i = 0; i < roi.xywhROI.roiHeight; i++) { Rpp8u *srcPtr1Temp, *srcPtr2Temp, *dstPtrTemp; srcPtr1Temp = srcPtr1Row; @@ -251,21 +251,21 @@ RppStatus exclusive_or_u8_u8_host_tensor(Rpp8u *srcPtr1, } RppStatus exclusive_or_f32_f32_host_tensor(Rpp32f *srcPtr1, - Rpp32f *srcPtr2, - RpptDescPtr srcDescPtr, - Rpp32f *dstPtr, - RpptDescPtr dstDescPtr, - RpptROIPtr roiTensorPtrSrc, - RpptRoiType roiType, - RppLayoutParams layoutParams, - rpp::Handle& Handle) + Rpp32f *srcPtr2, + RpptDescPtr srcDescPtr, + Rpp32f *dstPtr, + RpptDescPtr dstDescPtr, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + RppLayoutParams layoutParams, + rpp::Handle& Handle) { RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h}; Rpp32u numThreads = Handle.GetNumThreads(); omp_set_dynamic(0); #pragma omp parallel for num_threads(numThreads) - for(int batchCount = 0; batchCount < dstDescPtr->n; batchCount++) + for (int batchCount = 0; batchCount < dstDescPtr->n; batchCount++) { RpptROI roi; RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount]; @@ -299,7 +299,7 @@ RppStatus exclusive_or_f32_f32_host_tensor(Rpp32f *srcPtr1, dstPtrRowG = dstPtrRowR + dstDescPtr->strides.cStride; dstPtrRowB = dstPtrRowG + dstDescPtr->strides.cStride; - for(int i = 0; i < roi.xywhROI.roiHeight; i++) + for (int i = 0; i < roi.xywhROI.roiHeight; i++) { Rpp32f *srcPtr1Temp, *srcPtr2Temp, *dstPtrTempR, *dstPtrTempG, *dstPtrTempB; srcPtr1Temp = srcPtr1Row; @@ -361,7 +361,7 @@ RppStatus exclusive_or_f32_f32_host_tensor(Rpp32f *srcPtr1, srcPtr2RowB = srcPtr2RowG + srcDescPtr->strides.cStride; dstPtrRow = dstPtrChannel; - for(int i = 0; i < roi.xywhROI.roiHeight; i++) + for (int i = 0; i < roi.xywhROI.roiHeight; i++) { Rpp32f *srcPtr1TempR, *srcPtr1TempG, *srcPtr1TempB, *srcPtr2TempR, *srcPtr2TempG, *srcPtr2TempB, *dstPtrTemp; srcPtr1TempR = srcPtr1RowR; 
@@ -429,14 +429,14 @@ RppStatus exclusive_or_f32_f32_host_tensor(Rpp32f *srcPtr1, alignedLength = bufferLength & ~7; #endif - for(int c = 0; c < layoutParams.channelParam; c++) + for (int c = 0; c < layoutParams.channelParam; c++) { Rpp32f *srcPtr1Row, *srcPtr2Row, *dstPtrRow; srcPtr1Row = srcPtr1Channel; srcPtr2Row = srcPtr2Channel; dstPtrRow = dstPtrChannel; - for(int i = 0; i < roi.xywhROI.roiHeight; i++) + for (int i = 0; i < roi.xywhROI.roiHeight; i++) { Rpp32f *srcPtr1Temp, *srcPtr2Temp, *dstPtrTemp; srcPtr1Temp = srcPtr1Row; @@ -484,21 +484,21 @@ RppStatus exclusive_or_f32_f32_host_tensor(Rpp32f *srcPtr1, } RppStatus exclusive_or_f16_f16_host_tensor(Rpp16f *srcPtr1, - Rpp16f *srcPtr2, - RpptDescPtr srcDescPtr, - Rpp16f *dstPtr, - RpptDescPtr dstDescPtr, - RpptROIPtr roiTensorPtrSrc, - RpptRoiType roiType, - RppLayoutParams layoutParams, - rpp::Handle& Handle) + Rpp16f *srcPtr2, + RpptDescPtr srcDescPtr, + Rpp16f *dstPtr, + RpptDescPtr dstDescPtr, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + RppLayoutParams layoutParams, + rpp::Handle& Handle) { RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h}; Rpp32u numThreads = Handle.GetNumThreads(); omp_set_dynamic(0); #pragma omp parallel for num_threads(numThreads) - for(int batchCount = 0; batchCount < dstDescPtr->n; batchCount++) + for (int batchCount = 0; batchCount < dstDescPtr->n; batchCount++) { RpptROI roi; RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount]; @@ -532,7 +532,7 @@ RppStatus exclusive_or_f16_f16_host_tensor(Rpp16f *srcPtr1, dstPtrRowG = dstPtrRowR + dstDescPtr->strides.cStride; dstPtrRowB = dstPtrRowG + dstDescPtr->strides.cStride; - for(int i = 0; i < roi.xywhROI.roiHeight; i++) + for (int i = 0; i < roi.xywhROI.roiHeight; i++) { Rpp16f *srcPtr1Temp, *srcPtr2Temp, *dstPtrTempR, *dstPtrTempG, *dstPtrTempB; srcPtr1Temp = srcPtr1Row; @@ -547,7 +547,7 @@ RppStatus exclusive_or_f16_f16_host_tensor(Rpp16f *srcPtr1, { Rpp32f srcPtr1Temp_ps[24], srcPtr2Temp_ps[24]; - 
for(int cnt = 0; cnt < vectorIncrement; cnt++) + for (int cnt = 0; cnt < vectorIncrement; cnt++) { srcPtr1Temp_ps[cnt] = static_cast<Rpp32f>(srcPtr1Temp[cnt]); srcPtr2Temp_ps[cnt] = static_cast<Rpp32f>(srcPtr2Temp[cnt]); @@ -602,7 +602,7 @@ RppStatus exclusive_or_f16_f16_host_tensor(Rpp16f *srcPtr1, srcPtr2RowB = srcPtr2RowG + srcDescPtr->strides.cStride; dstPtrRow = dstPtrChannel; - for(int i = 0; i < roi.xywhROI.roiHeight; i++) + for (int i = 0; i < roi.xywhROI.roiHeight; i++) { Rpp16f *srcPtr1TempR, *srcPtr1TempG, *srcPtr1TempB, *srcPtr2TempR, *srcPtr2TempG, *srcPtr2TempB, *dstPtrTemp; srcPtr1TempR = srcPtr1RowR; @@ -619,7 +619,7 @@ RppStatus exclusive_or_f16_f16_host_tensor(Rpp16f *srcPtr1, { Rpp32f srcPtr1Temp_ps[24], srcPtr2Temp_ps[24]; - for(int cnt = 0; cnt < vectorIncrementPerChannel; cnt++) + for (int cnt = 0; cnt < vectorIncrementPerChannel; cnt++) { srcPtr1Temp_ps[cnt] = static_cast<Rpp32f>(srcPtr1TempR[cnt]); srcPtr1Temp_ps[cnt + 8] = static_cast<Rpp32f>(srcPtr1TempG[cnt]); @@ -683,14 +683,14 @@ RppStatus exclusive_or_f16_f16_host_tensor(Rpp16f *srcPtr1, alignedLength = bufferLength & ~7; #endif - for(int c = 0; c < layoutParams.channelParam; c++) + for (int c = 0; c < layoutParams.channelParam; c++) { Rpp16f *srcPtr1Row, *srcPtr2Row, *dstPtrRow; srcPtr1Row = srcPtr1Channel; srcPtr2Row = srcPtr2Channel; dstPtrRow = dstPtrChannel; - for(int i = 0; i < roi.xywhROI.roiHeight; i++) + for (int i = 0; i < roi.xywhROI.roiHeight; i++) { Rpp16f *srcPtr1Temp, *srcPtr2Temp, *dstPtrTemp; srcPtr1Temp = srcPtr1Row; @@ -703,7 +703,7 @@ RppStatus exclusive_or_f16_f16_host_tensor(Rpp16f *srcPtr1, { Rpp32f srcPtr1Temp_ps[8], srcPtr2Temp_ps[8]; - for(int cnt = 0; cnt < vectorIncrementPerChannel; cnt++) + for (int cnt = 0; cnt < vectorIncrementPerChannel; cnt++) { srcPtr1Temp_ps[cnt] = static_cast<Rpp32f>(srcPtr1Temp[cnt]); srcPtr2Temp_ps[cnt] = static_cast<Rpp32f>(srcPtr2Temp[cnt]); @@ -746,21 +746,21 @@ RppStatus exclusive_or_f16_f16_host_tensor(Rpp16f *srcPtr1, } RppStatus exclusive_or_i8_i8_host_tensor(Rpp8s 
*srcPtr1, - Rpp8s *srcPtr2, - RpptDescPtr srcDescPtr, - Rpp8s *dstPtr, - RpptDescPtr dstDescPtr, - RpptROIPtr roiTensorPtrSrc, - RpptRoiType roiType, - RppLayoutParams layoutParams, - rpp::Handle& Handle) + Rpp8s *srcPtr2, + RpptDescPtr srcDescPtr, + Rpp8s *dstPtr, + RpptDescPtr dstDescPtr, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + RppLayoutParams layoutParams, + rpp::Handle& Handle) { RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h}; Rpp32u numThreads = Handle.GetNumThreads(); omp_set_dynamic(0); #pragma omp parallel for num_threads(numThreads) - for(int batchCount = 0; batchCount < dstDescPtr->n; batchCount++) + for (int batchCount = 0; batchCount < dstDescPtr->n; batchCount++) { RpptROI roi; RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount]; @@ -792,7 +792,7 @@ RppStatus exclusive_or_i8_i8_host_tensor(Rpp8s *srcPtr1, dstPtrRowG = dstPtrRowR + dstDescPtr->strides.cStride; dstPtrRowB = dstPtrRowG + dstDescPtr->strides.cStride; - for(int i = 0; i < roi.xywhROI.roiHeight; i++) + for (int i = 0; i < roi.xywhROI.roiHeight; i++) { Rpp8s *srcPtr1Temp, *srcPtr2Temp, *dstPtrTempR, *dstPtrTempG, *dstPtrTempB; srcPtr1Temp = srcPtr1Row; @@ -850,7 +850,7 @@ RppStatus exclusive_or_i8_i8_host_tensor(Rpp8s *srcPtr1, srcPtr2RowB = srcPtr2RowG + srcDescPtr->strides.cStride; dstPtrRow = dstPtrChannel; - for(int i = 0; i < roi.xywhROI.roiHeight; i++) + for (int i = 0; i < roi.xywhROI.roiHeight; i++) { Rpp8s *srcPtr1TempR, *srcPtr1TempG, *srcPtr1TempB, *srcPtr2TempR, *srcPtr2TempG, *srcPtr2TempB, *dstPtrTemp; srcPtr1TempR = srcPtr1RowR; @@ -912,14 +912,14 @@ RppStatus exclusive_or_i8_i8_host_tensor(Rpp8s *srcPtr1, { alignedLength = bufferLength & ~31; - for(int c = 0; c < layoutParams.channelParam; c++) + for (int c = 0; c < layoutParams.channelParam; c++) { Rpp8s *srcPtr1Row, *srcPtr2Row, *dstPtrRow; srcPtr1Row = srcPtr1Channel; srcPtr2Row = srcPtr2Channel; dstPtrRow = dstPtrChannel; - for(int i = 0; i < roi.xywhROI.roiHeight; i++) + for 
(int i = 0; i < roi.xywhROI.roiHeight; i++) { Rpp8s *srcPtr1Temp, *srcPtr2Temp, *dstPtrTemp; srcPtr1Temp = srcPtr1Row; diff --git a/src/modules/hip/kernel/exclusive_or.hpp b/src/modules/hip/kernel/exclusive_or.hpp index 2a17c912d..faff8863d 100644 --- a/src/modules/hip/kernel/exclusive_or.hpp +++ b/src/modules/hip/kernel/exclusive_or.hpp @@ -29,11 +29,11 @@ __device__ void exclusive_or_hip_compute(T *srcPtr, d_float8 *src1_f8, d_float8 template <typename T> __global__ void exclusive_or_pkd_hip_tensor(T *srcPtr1, - T *srcPtr2, - uint2 srcStridesNH, - T *dstPtr, - uint2 dstStridesNH, - RpptROIPtr roiTensorPtrSrc) + T *srcPtr2, + uint2 srcStridesNH, + T *dstPtr, + uint2 dstStridesNH, + RpptROIPtr roiTensorPtrSrc) { int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; @@ -59,12 +59,12 @@ __global__ void exclusive_or_pkd_hip_tensor(T *srcPtr1, template <typename T> __global__ void exclusive_or_pln_hip_tensor(T *srcPtr1, - T *srcPtr2, - uint3 srcStridesNCH, - T *dstPtr, - uint3 dstStridesNCH, - int channelsDst, - RpptROIPtr roiTensorPtrSrc) + T *srcPtr2, + uint3 srcStridesNCH, + T *dstPtr, + uint3 dstStridesNCH, + int channelsDst, + RpptROIPtr roiTensorPtrSrc) { int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; @@ -107,11 +107,11 @@ __global__ void exclusive_or_pln_hip_tensor(T *srcPtr1, template <typename T> __global__ void exclusive_or_pkd3_pln3_hip_tensor(T *srcPtr1, - T *srcPtr2, - uint2 srcStridesNH, - T *dstPtr, - uint3 dstStridesNCH, - RpptROIPtr roiTensorPtrSrc) + T *srcPtr2, + uint2 srcStridesNH, + T *dstPtr, + uint3 dstStridesNCH, + RpptROIPtr roiTensorPtrSrc) { int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; @@ -137,11 +137,11 @@ __global__ void exclusive_or_pkd3_pln3_hip_tensor(T *srcPtr1, template <typename T> __global__ void exclusive_or_pln3_pkd3_hip_tensor(T *srcPtr1, - T *srcPtr2, - uint3 
srcStridesNCH, - T *dstPtr, - uint2 dstStridesNH, - RpptROIPtr roiTensorPtrSrc) + T *srcPtr2, + uint3 srcStridesNCH, + T *dstPtr, + uint2 dstStridesNH, + RpptROIPtr roiTensorPtrSrc) { int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; @@ -167,20 +167,20 @@ __global__ void exclusive_or_pln3_pkd3_hip_tensor(T *srcPtr1, template RppStatus hip_exec_exclusive_or_tensor(T *srcPtr1, - T *srcPtr2, - RpptDescPtr srcDescPtr, - T *dstPtr, - RpptDescPtr dstDescPtr, - RpptROIPtr roiTensorPtrSrc, - RpptRoiType roiType, - rpp::Handle& handle) + T *srcPtr2, + RpptDescPtr srcDescPtr, + T *dstPtr, + RpptDescPtr dstDescPtr, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + rpp::Handle& handle) { if (roiType == RpptRoiType::LTRB) hip_exec_roi_converison_ltrb_to_xywh(roiTensorPtrSrc, handle); int globalThreads_x = (dstDescPtr->w + 7) >> 3; int globalThreads_y = dstDescPtr->h; - int globalThreads_z = handle.GetBatchSize(); + int globalThreads_z = dstDescPtr->n; if ((srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NHWC)) { diff --git a/src/modules/rppt_tensor_logical_operations.cpp b/src/modules/rppt_tensor_logical_operations.cpp index f1e905e7e..8f0127418 100644 --- a/src/modules/rppt_tensor_logical_operations.cpp +++ b/src/modules/rppt_tensor_logical_operations.cpp @@ -100,63 +100,63 @@ RppStatus rppt_bitwise_and_host(RppPtr_t srcPtr1, /******************** exclusive OR ********************/ RppStatus rppt_exclusive_or_host(RppPtr_t srcPtr1, - RppPtr_t srcPtr2, - RpptDescPtr srcDescPtr, - RppPtr_t dstPtr, - RpptDescPtr dstDescPtr, - RpptROIPtr roiTensorPtrSrc, - RpptRoiType roiType, - rppHandle_t rppHandle) + RppPtr_t srcPtr2, + RpptDescPtr srcDescPtr, + RppPtr_t dstPtr, + RpptDescPtr dstDescPtr, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + rppHandle_t rppHandle) { RppLayoutParams layoutParams = get_layout_params(srcDescPtr->layout, srcDescPtr->c); if 
((srcDescPtr->dataType == RpptDataType::U8) && (dstDescPtr->dataType == RpptDataType::U8)) { exclusive_or_u8_u8_host_tensor(static_cast(srcPtr1) + srcDescPtr->offsetInBytes, - static_cast(srcPtr2) + srcDescPtr->offsetInBytes, - srcDescPtr, - static_cast(dstPtr) + dstDescPtr->offsetInBytes, - dstDescPtr, - roiTensorPtrSrc, - roiType, - layoutParams, - rpp::deref(rppHandle)); + static_cast(srcPtr2) + srcDescPtr->offsetInBytes, + srcDescPtr, + static_cast(dstPtr) + dstDescPtr->offsetInBytes, + dstDescPtr, + roiTensorPtrSrc, + roiType, + layoutParams, + rpp::deref(rppHandle)); } else if ((srcDescPtr->dataType == RpptDataType::F16) && (dstDescPtr->dataType == RpptDataType::F16)) { exclusive_or_f16_f16_host_tensor(reinterpret_cast(static_cast(srcPtr1) + srcDescPtr->offsetInBytes), - reinterpret_cast(static_cast(srcPtr2) + srcDescPtr->offsetInBytes), - srcDescPtr, - reinterpret_cast(static_cast(dstPtr) + dstDescPtr->offsetInBytes), - dstDescPtr, - roiTensorPtrSrc, - roiType, - layoutParams, - rpp::deref(rppHandle)); + reinterpret_cast(static_cast(srcPtr2) + srcDescPtr->offsetInBytes), + srcDescPtr, + reinterpret_cast(static_cast(dstPtr) + dstDescPtr->offsetInBytes), + dstDescPtr, + roiTensorPtrSrc, + roiType, + layoutParams, + rpp::deref(rppHandle)); } else if ((srcDescPtr->dataType == RpptDataType::F32) && (dstDescPtr->dataType == RpptDataType::F32)) { exclusive_or_f32_f32_host_tensor(reinterpret_cast(static_cast(srcPtr1) + srcDescPtr->offsetInBytes), - reinterpret_cast(static_cast(srcPtr2) + srcDescPtr->offsetInBytes), - srcDescPtr, - reinterpret_cast(static_cast(dstPtr) + dstDescPtr->offsetInBytes), - dstDescPtr, - roiTensorPtrSrc, - roiType, - layoutParams, - rpp::deref(rppHandle)); + reinterpret_cast(static_cast(srcPtr2) + srcDescPtr->offsetInBytes), + srcDescPtr, + reinterpret_cast(static_cast(dstPtr) + dstDescPtr->offsetInBytes), + dstDescPtr, + roiTensorPtrSrc, + roiType, + layoutParams, + rpp::deref(rppHandle)); } else if ((srcDescPtr->dataType == 
RpptDataType::I8) && (dstDescPtr->dataType == RpptDataType::I8)) { exclusive_or_i8_i8_host_tensor(static_cast(srcPtr1) + srcDescPtr->offsetInBytes, - static_cast(srcPtr2) + srcDescPtr->offsetInBytes, - srcDescPtr, - static_cast(dstPtr) + dstDescPtr->offsetInBytes, - dstDescPtr, - roiTensorPtrSrc, - roiType, - layoutParams, - rpp::deref(rppHandle)); + static_cast(srcPtr2) + srcDescPtr->offsetInBytes, + srcDescPtr, + static_cast(dstPtr) + dstDescPtr->offsetInBytes, + dstDescPtr, + roiTensorPtrSrc, + roiType, + layoutParams, + rpp::deref(rppHandle)); } return RPP_SUCCESS; @@ -301,59 +301,59 @@ RppStatus rppt_bitwise_and_gpu(RppPtr_t srcPtr1, /******************** exclusive XOR ********************/ RppStatus rppt_exclusive_or_gpu(RppPtr_t srcPtr1, - RppPtr_t srcPtr2, - RpptDescPtr srcDescPtr, - RppPtr_t dstPtr, - RpptDescPtr dstDescPtr, - RpptROIPtr roiTensorPtrSrc, - RpptRoiType roiType, - rppHandle_t rppHandle) + RppPtr_t srcPtr2, + RpptDescPtr srcDescPtr, + RppPtr_t dstPtr, + RpptDescPtr dstDescPtr, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + rppHandle_t rppHandle) { #ifdef HIP_COMPILE if ((srcDescPtr->dataType == RpptDataType::U8) && (dstDescPtr->dataType == RpptDataType::U8)) { hip_exec_exclusive_or_tensor(static_cast(srcPtr1) + srcDescPtr->offsetInBytes, - static_cast(srcPtr2) + srcDescPtr->offsetInBytes, - srcDescPtr, - static_cast(dstPtr) + dstDescPtr->offsetInBytes, - dstDescPtr, - roiTensorPtrSrc, - roiType, - rpp::deref(rppHandle)); + static_cast(srcPtr2) + srcDescPtr->offsetInBytes, + srcDescPtr, + static_cast(dstPtr) + dstDescPtr->offsetInBytes, + dstDescPtr, + roiTensorPtrSrc, + roiType, + rpp::deref(rppHandle)); } else if ((srcDescPtr->dataType == RpptDataType::F16) && (dstDescPtr->dataType == RpptDataType::F16)) { hip_exec_exclusive_or_tensor(reinterpret_cast(static_cast(srcPtr1) + srcDescPtr->offsetInBytes), - reinterpret_cast(static_cast(srcPtr2) + srcDescPtr->offsetInBytes), - srcDescPtr, - reinterpret_cast(static_cast(dstPtr) + 
dstDescPtr->offsetInBytes), - dstDescPtr, - roiTensorPtrSrc, - roiType, - rpp::deref(rppHandle)); + reinterpret_cast(static_cast(srcPtr2) + srcDescPtr->offsetInBytes), + srcDescPtr, + reinterpret_cast(static_cast(dstPtr) + dstDescPtr->offsetInBytes), + dstDescPtr, + roiTensorPtrSrc, + roiType, + rpp::deref(rppHandle)); } else if ((srcDescPtr->dataType == RpptDataType::F32) && (dstDescPtr->dataType == RpptDataType::F32)) { hip_exec_exclusive_or_tensor(reinterpret_cast(static_cast(srcPtr1) + srcDescPtr->offsetInBytes), - reinterpret_cast(static_cast(srcPtr2) + srcDescPtr->offsetInBytes), - srcDescPtr, - reinterpret_cast(static_cast(dstPtr) + dstDescPtr->offsetInBytes), - dstDescPtr, - roiTensorPtrSrc, - roiType, - rpp::deref(rppHandle)); + reinterpret_cast(static_cast(srcPtr2) + srcDescPtr->offsetInBytes), + srcDescPtr, + reinterpret_cast(static_cast(dstPtr) + dstDescPtr->offsetInBytes), + dstDescPtr, + roiTensorPtrSrc, + roiType, + rpp::deref(rppHandle)); } else if ((srcDescPtr->dataType == RpptDataType::I8) && (dstDescPtr->dataType == RpptDataType::I8)) { hip_exec_exclusive_or_tensor(static_cast(srcPtr1) + srcDescPtr->offsetInBytes, - static_cast(srcPtr2) + srcDescPtr->offsetInBytes, - srcDescPtr, - static_cast(dstPtr) + dstDescPtr->offsetInBytes, - dstDescPtr, - roiTensorPtrSrc, - roiType, - rpp::deref(rppHandle)); + static_cast(srcPtr2) + srcDescPtr->offsetInBytes, + srcDescPtr, + static_cast(dstPtr) + dstDescPtr->offsetInBytes, + dstDescPtr, + roiTensorPtrSrc, + roiType, + rpp::deref(rppHandle)); } return RPP_SUCCESS; diff --git a/utilities/test_suite/common.py b/utilities/test_suite/common.py index 37afe90be..feece52b6 100644 --- a/utilities/test_suite/common.py +++ b/utilities/test_suite/common.py @@ -67,7 +67,7 @@ 61: ["magnitude", "HOST", "HIP"], 63: ["phase", "HOST", "HIP"], 65: ["bitwise_and", "HOST", "HIP"], - 65: ["exclusive_or", "HOST", "HIP"], + 67: ["exclusive_or", "HOST", "HIP"], 68: ["bitwise_or", "HOST", "HIP"], 70: ["copy", "HOST", "HIP"], 79: 
["remap", "HOST", "HIP"], From 30bd007b2dc98c798a28bbcc922903deeceae9c3 Mon Sep 17 00:00:00 2001 From: Srihari-mcw Date: Wed, 18 Sep 2024 00:06:16 +0530 Subject: [PATCH 11/38] More updates --- src/modules/hip/kernel/exclusive_or.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/modules/hip/kernel/exclusive_or.hpp b/src/modules/hip/kernel/exclusive_or.hpp index faff8863d..944a9d2e8 100644 --- a/src/modules/hip/kernel/exclusive_or.hpp +++ b/src/modules/hip/kernel/exclusive_or.hpp @@ -1,8 +1,8 @@ #include #include "rpp_hip_common.hpp" -/*ExclusiveXOR is logical operation only on U8/I8 types. - For a Rpp32f precision image (pixel values from 0-1), the ExclusiveXOR is applied on a 0-255 +/* ExclusiveOR is logical operation only on U8/I8 types. + For a Rpp32f precision image (pixel values from 0-1), the ExclusiveOR is applied on a 0-255 range-translated approximation, of the original 0-1 decimal-range image. The bitwise operation is applied to the char representation of the raw floating-point data in memory */ From c782fd2fe29967ff64ab12a2102532c1775cafbd Mon Sep 17 00:00:00 2001 From: Srihari-mcw Date: Fri, 20 Sep 2024 11:57:49 +0530 Subject: [PATCH 12/38] More cleanup --- src/include/cpu/rpp_cpu_simd.hpp | 4 ++-- src/modules/hip/kernel/exclusive_or.hpp | 12 ++++++------ 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/include/cpu/rpp_cpu_simd.hpp b/src/include/cpu/rpp_cpu_simd.hpp index 49ddc38e1..b4cf0721e 100644 --- a/src/include/cpu/rpp_cpu_simd.hpp +++ b/src/include/cpu/rpp_cpu_simd.hpp @@ -539,7 +539,7 @@ inline void rpp_store96_u8pln3_to_u8pkd3(Rpp8u *dstPtr, __m256i *px) _mm_storeu_si128((__m128i *)(dstPtr + 48), _mm256_extractf128_si256(pxDst[4], 1)); /* store [R17|G17|B17|R18|G18|B18|R19|G19|B19|R20|G20|B20|00|00|00|00] */ _mm_storeu_si128((__m128i *)(dstPtr + 60), _mm256_extractf128_si256(pxDst[5], 1)); /* store [R21|G21|B21|R22|G22|B22|R23|G23|B23|R24|G24|B24|00|00|00|00] */ _mm_storeu_si128((__m128i *)(dstPtr + 
72), _mm256_extractf128_si256(pxDst[6], 1)); /* store [R25|G25|B25|R26|G26|B26|R27|G27|B27|R28|G28|B28|00|00|00|00] */ - _mm_storeu_si128((__m128i *)(dstPtr + 84), _mm256_extractf128_si256(pxDst[7], 1)); /* store [R29|G25|B25|R26|G26|B26|R27|G27|B27|R28|G28|B28|00|00|00|00] */ + _mm_storeu_si128((__m128i *)(dstPtr + 84), _mm256_extractf128_si256(pxDst[7], 1)); /* store [R29|G29|B29|R30|G30|B30|R31|G31|B31|R32|G32|B32|00|00|00|00] */ } inline void rpp_load16_u8_to_f32(Rpp8u *srcPtr, __m128 *p) @@ -951,7 +951,7 @@ inline void rpp_store96_u8pln3_to_i8pkd3(Rpp8s *dstPtr, __m256i *px) _mm_storeu_si128((__m128i *)(dstPtr + 48), _mm256_extractf128_si256(pxDst[4], 1)); /* store [R17|G17|B17|R18|G18|B18|R19|G19|B19|R20|G20|B20|00|00|00|00] */ _mm_storeu_si128((__m128i *)(dstPtr + 60), _mm256_extractf128_si256(pxDst[5], 1)); /* store [R21|G21|B21|R22|G22|B22|R23|G23|B23|R24|G24|B24|00|00|00|00] */ _mm_storeu_si128((__m128i *)(dstPtr + 72), _mm256_extractf128_si256(pxDst[6], 1)); /* store [R25|G25|B25|R26|G26|B26|R27|G27|B27|R28|G28|B28|00|00|00|00] */ - _mm_storeu_si128((__m128i *)(dstPtr + 84), _mm256_extractf128_si256(pxDst[7], 1)); /* store [R29|G25|B25|R26|G26|B26|R27|G27|B27|R28|G28|B28|00|00|00|00] */ + _mm_storeu_si128((__m128i *)(dstPtr + 84), _mm256_extractf128_si256(pxDst[7], 1)); /* store [R29|G29|B29|R30|G30|B30|R31|G31|B31|R32|G32|B32|00|00|00|00] */ } inline void rpp_load16_i8_to_f32(Rpp8s *srcPtr, __m128 *p) diff --git a/src/modules/hip/kernel/exclusive_or.hpp b/src/modules/hip/kernel/exclusive_or.hpp index 944a9d2e8..42f11e542 100644 --- a/src/modules/hip/kernel/exclusive_or.hpp +++ b/src/modules/hip/kernel/exclusive_or.hpp @@ -11,17 +11,17 @@ __device__ void exclusive_or_hip_compute(T *srcPtr, d_float8 *src1_f8, d_float8 { if constexpr ((std::is_same::value) || (std::is_same::value)) { - rpp_hip_math_multiply8_const(src1_f8, src1_f8, (float4)255); - rpp_hip_math_multiply8_const(src2_f8, src2_f8, (float4)255); + rpp_hip_math_multiply8_const(src1_f8, src1_f8, 
static_cast(255)); + rpp_hip_math_multiply8_const(src2_f8, src2_f8, static_cast(255)); rpp_hip_math_exclusiveOr8(src1_f8, src2_f8, dst_f8); - rpp_hip_math_multiply8_const(dst_f8, dst_f8, (float4)ONE_OVER_255); + rpp_hip_math_multiply8_const(dst_f8, dst_f8, static_cast(ONE_OVER_255)); } else if constexpr (std::is_same::value) { - rpp_hip_math_add8_const(src1_f8, src1_f8, (float4)128); - rpp_hip_math_add8_const(src2_f8, src2_f8, (float4)128); + rpp_hip_math_add8_const(src1_f8, src1_f8, static_cast(128)); + rpp_hip_math_add8_const(src2_f8, src2_f8, static_cast(128)); rpp_hip_math_exclusiveOr8(src1_f8, src2_f8, dst_f8); - rpp_hip_math_subtract8_const(dst_f8, dst_f8, (float4)128); + rpp_hip_math_subtract8_const(dst_f8, dst_f8, static_cast(128)); } else rpp_hip_math_exclusiveOr8(src1_f8, src2_f8, dst_f8); From 5672802b5acb819f9a0d427b11a9d8d299aff21b Mon Sep 17 00:00:00 2001 From: Srihari-mcw Date: Tue, 24 Sep 2024 07:18:24 +0530 Subject: [PATCH 13/38] Update separate code for PLN3 to PLN3 U8 --- src/modules/cpu/kernel/exclusive_or.hpp | 144 ++++++++++++++++++------ 1 file changed, 108 insertions(+), 36 deletions(-) diff --git a/src/modules/cpu/kernel/exclusive_or.hpp b/src/modules/cpu/kernel/exclusive_or.hpp index 419530fbe..8dfde9333 100644 --- a/src/modules/cpu/kernel/exclusive_or.hpp +++ b/src/modules/cpu/kernel/exclusive_or.hpp @@ -192,58 +192,130 @@ RppStatus exclusive_or_u8_u8_host_tensor(Rpp8u *srcPtr1, dstPtrRow += dstDescPtr->strides.hStride; } } + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NCHW)) + { + Rpp8u *srcPtr1RowR, *srcPtr1RowG, *srcPtr1RowB, *srcPtr2RowR, *srcPtr2RowG, *srcPtr2RowB, *dstPtrRowR, *dstPtrRowG, *dstPtrRowB; + srcPtr1RowR = srcPtr1Channel; + srcPtr1RowG = srcPtr1RowR + srcDescPtr->strides.cStride; + srcPtr1RowB = srcPtr1RowG + srcDescPtr->strides.cStride; + srcPtr2RowR = srcPtr2Channel; + srcPtr2RowG = srcPtr2RowR + srcDescPtr->strides.cStride; + srcPtr2RowB = srcPtr2RowG 
+ srcDescPtr->strides.cStride; + dstPtrRowR = dstPtrChannel; + dstPtrRowG = dstPtrRowR + dstDescPtr->strides.cStride; + dstPtrRowB = dstPtrRowG + dstDescPtr->strides.cStride; + for (int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp8u *srcPtr1TempR, *srcPtr1TempG, *srcPtr1TempB, *srcPtr2TempR, *srcPtr2TempG, *srcPtr2TempB, *dstPtrTempR, *dstPtrTempG, *dstPtrTempB; + srcPtr1TempR = srcPtr1RowR; + srcPtr1TempG = srcPtr1RowG; + srcPtr1TempB = srcPtr1RowB; + srcPtr2TempR = srcPtr2RowR; + srcPtr2TempG = srcPtr2RowG; + srcPtr2TempB = srcPtr2RowB; + dstPtrTempR = dstPtrRowR; + dstPtrTempG = dstPtrRowG; + dstPtrTempB = dstPtrRowB; + int vectorLoopCount = 0; + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + __m256i p1[3], p2[3]; + + rpp_simd_load(rpp_load96_u8pln3_to_u8pln3, srcPtr1TempR, srcPtr1TempG, srcPtr1TempB, p1); // simd loads + rpp_simd_load(rpp_load96_u8pln3_to_u8pln3, srcPtr2TempR, srcPtr2TempG, srcPtr2TempB, p2); // simd loads + p1[0] = _mm256_xor_si256(p1[0], p2[0]); // exclusive_or computation + p1[1] = _mm256_xor_si256(p1[1], p2[1]); // exclusive_or computation + p1[2] = _mm256_xor_si256(p1[2], p2[2]); // exclusive_or computation + rpp_simd_store(rpp_store96_u8pln3_to_u8pln3, dstPtrTempR, dstPtrTempG, dstPtrTempB, p1); + + srcPtr1TempR += vectorIncrementPerChannel; + srcPtr1TempG += vectorIncrementPerChannel; + srcPtr1TempB += vectorIncrementPerChannel; + srcPtr2TempR += vectorIncrementPerChannel; + srcPtr2TempG += vectorIncrementPerChannel; + srcPtr2TempB += vectorIncrementPerChannel; + dstPtrTempR += vectorIncrementPerChannel; + dstPtrTempG += vectorIncrementPerChannel; + dstPtrTempB += vectorIncrementPerChannel; + } + + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + *dstPtrTempR = *srcPtr1TempR ^ *srcPtr2TempR; + *dstPtrTempG = *srcPtr1TempG ^ *srcPtr2TempG; + *dstPtrTempB = *srcPtr1TempB ^ *srcPtr2TempB; + + srcPtr1TempR++; + srcPtr1TempG++; + srcPtr1TempB++; + srcPtr2TempR++; + 
srcPtr2TempG++; + srcPtr2TempB++; + dstPtrTempR++; + dstPtrTempG++; + dstPtrTempB++; + } + + srcPtr1RowR += srcDescPtr->strides.hStride; + srcPtr1RowG += srcDescPtr->strides.hStride; + srcPtr1RowB += srcDescPtr->strides.hStride; + srcPtr2RowR += srcDescPtr->strides.hStride; + srcPtr2RowG += srcDescPtr->strides.hStride; + srcPtr2RowB += srcDescPtr->strides.hStride; + dstPtrTempR += dstDescPtr->strides.hStride; + dstPtrTempG += dstDescPtr->strides.hStride; + dstPtrTempB += dstDescPtr->strides.hStride; + } + } // Exclusive OR without fused output-layout toggle (NHWC -> NHWC or NCHW -> NCHW) else { alignedLength = bufferLength & ~31; - for (int c = 0; c < layoutParams.channelParam; c++) + Rpp8u *srcPtr1Row, *srcPtr2Row, *dstPtrRow; + srcPtr1Row = srcPtr1Channel; + srcPtr2Row = srcPtr2Channel; + dstPtrRow = dstPtrChannel; + + for (int i = 0; i < roi.xywhROI.roiHeight; i++) { - Rpp8u *srcPtr1Row, *srcPtr2Row, *dstPtrRow; - srcPtr1Row = srcPtr1Channel; - srcPtr2Row = srcPtr2Channel; - dstPtrRow = dstPtrChannel; + Rpp8u *srcPtr1Temp, *srcPtr2Temp, *dstPtrTemp; + srcPtr1Temp = srcPtr1Row; + srcPtr2Temp = srcPtr2Row; + dstPtrTemp = dstPtrRow; - for (int i = 0; i < roi.xywhROI.roiHeight; i++) + int vectorLoopCount = 0; + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) { - Rpp8u *srcPtr1Temp, *srcPtr2Temp, *dstPtrTemp; - srcPtr1Temp = srcPtr1Row; - srcPtr2Temp = srcPtr2Row; - dstPtrTemp = dstPtrRow; - - int vectorLoopCount = 0; - for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) - { - __m256i p1, p2; + __m256i p1, p2; - p1 = _mm256_loadu_si256((const __m256i *)srcPtr1Temp); // simd loads - p2 = _mm256_loadu_si256((const __m256i *)srcPtr2Temp); // simd loads - p1 = _mm256_xor_si256(p1, p2); // exclusive_or computation - _mm256_storeu_si256((__m256i *)dstPtrTemp, p1); // simd stores - - srcPtr1Temp += vectorIncrementPerChannel; - srcPtr2Temp += vectorIncrementPerChannel; - dstPtrTemp += 
vectorIncrementPerChannel; - } + p1 = _mm256_loadu_si256((const __m256i *)srcPtr1Temp); // simd loads + p2 = _mm256_loadu_si256((const __m256i *)srcPtr2Temp); // simd loads + p1 = _mm256_xor_si256(p1, p2); // exclusive_or computation + _mm256_storeu_si256((__m256i *)dstPtrTemp, p1); // simd stores - for (; vectorLoopCount < bufferLength; vectorLoopCount++) - { - *dstPtrTemp++ = *srcPtr1Temp ^ *srcPtr2Temp; + srcPtr1Temp += vectorIncrementPerChannel; + srcPtr2Temp += vectorIncrementPerChannel; + dstPtrTemp += vectorIncrementPerChannel; + } - srcPtr1Temp++; - srcPtr2Temp++; - } + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + *dstPtrTemp++ = *srcPtr1Temp ^ *srcPtr2Temp; - srcPtr1Row += srcDescPtr->strides.hStride; - srcPtr2Row += srcDescPtr->strides.hStride; - dstPtrRow += dstDescPtr->strides.hStride; + srcPtr1Temp++; + srcPtr2Temp++; } - srcPtr1Channel += srcDescPtr->strides.cStride; - srcPtr2Channel += srcDescPtr->strides.cStride; - dstPtrChannel += dstDescPtr->strides.cStride; + srcPtr1Row += srcDescPtr->strides.hStride; + srcPtr2Row += srcDescPtr->strides.hStride; + dstPtrRow += dstDescPtr->strides.hStride; } + + srcPtr1Channel += srcDescPtr->strides.cStride; + srcPtr2Channel += srcDescPtr->strides.cStride; + dstPtrChannel += dstDescPtr->strides.cStride; } } From d52e53dad4eb19353a0d7e9819b1ab915cdd3e3e Mon Sep 17 00:00:00 2001 From: Srihari-mcw Date: Tue, 24 Sep 2024 07:29:40 +0530 Subject: [PATCH 14/38] Update separate code for PLN3 to PLN3 I8 --- src/modules/cpu/kernel/exclusive_or.hpp | 146 ++++++++++++++++++------ 1 file changed, 109 insertions(+), 37 deletions(-) diff --git a/src/modules/cpu/kernel/exclusive_or.hpp b/src/modules/cpu/kernel/exclusive_or.hpp index 8dfde9333..40158412a 100644 --- a/src/modules/cpu/kernel/exclusive_or.hpp +++ b/src/modules/cpu/kernel/exclusive_or.hpp @@ -978,58 +978,130 @@ RppStatus exclusive_or_i8_i8_host_tensor(Rpp8s *srcPtr1, dstPtrRow += dstDescPtr->strides.hStride; } } + else if ((srcDescPtr->c == 3) && 
(srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NCHW)) + { + Rpp8s *srcPtr1RowR, *srcPtr1RowG, *srcPtr1RowB, *srcPtr2RowR, *srcPtr2RowG, *srcPtr2RowB, *dstPtrRowR, *dstPtrRowG, *dstPtrRowB; + srcPtr1RowR = srcPtr1Channel; + srcPtr1RowG = srcPtr1RowR + srcDescPtr->strides.cStride; + srcPtr1RowB = srcPtr1RowG + srcDescPtr->strides.cStride; + srcPtr2RowR = srcPtr2Channel; + srcPtr2RowG = srcPtr2RowR + srcDescPtr->strides.cStride; + srcPtr2RowB = srcPtr2RowG + srcDescPtr->strides.cStride; + dstPtrRowR = dstPtrChannel; + dstPtrRowG = dstPtrRowR + dstDescPtr->strides.cStride; + dstPtrRowB = dstPtrRowG + dstDescPtr->strides.cStride; + for (int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp8u *srcPtr1TempR, *srcPtr1TempG, *srcPtr1TempB, *srcPtr2TempR, *srcPtr2TempG, *srcPtr2TempB, *dstPtrTempR, *dstPtrTempG, *dstPtrTempB; + srcPtr1TempR = srcPtr1RowR; + srcPtr1TempG = srcPtr1RowG; + srcPtr1TempB = srcPtr1RowB; + srcPtr2TempR = srcPtr2RowR; + srcPtr2TempG = srcPtr2RowG; + srcPtr2TempB = srcPtr2RowB; + dstPtrTempR = dstPtrRowR; + dstPtrTempG = dstPtrRowG; + dstPtrTempB = dstPtrRowB; + + int vectorLoopCount = 0; + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + __m256i p1[3], p2[3]; + + rpp_simd_load(rpp_load96_i8pln3_to_u8pln3, srcPtr1TempR, srcPtr1TempG, srcPtr1TempB, p1); // simd loads + rpp_simd_load(rpp_load96_i8pln3_to_u8pln3, srcPtr2TempR, srcPtr2TempG, srcPtr2TempB, p2); // simd loads + p1[0] = _mm256_xor_si256(p1[0], p2[0]); // exclusive_or computation + p1[1] = _mm256_xor_si256(p1[1], p2[1]); // exclusive_or computation + p1[2] = _mm256_xor_si256(p1[2], p2[2]); // exclusive_or computation + rpp_simd_store(rpp_store96_u8pln3_to_i8pln3, dstPtrTempR, dstPtrTempG, dstPtrTempB, p1); + + srcPtr1TempR += vectorIncrementPerChannel; + srcPtr1TempG += vectorIncrementPerChannel; + srcPtr1TempB += vectorIncrementPerChannel; + srcPtr2TempR += vectorIncrementPerChannel; + srcPtr2TempG += 
vectorIncrementPerChannel; + srcPtr2TempB += vectorIncrementPerChannel; + dstPtrTempR += vectorIncrementPerChannel; + dstPtrTempG += vectorIncrementPerChannel; + dstPtrTempB += vectorIncrementPerChannel; + } + + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + *dstPtrTempR = static_cast(RPPPIXELCHECKI8(((static_cast((*srcPtr1TempR + 128) ^ static_cast(*srcPtr2TempR + 128)))) - 128)); + *dstPtrTempG = static_cast(RPPPIXELCHECKI8(((static_cast((*srcPtr1TempG + 128) ^ static_cast(*srcPtr2TempG + 128)))) - 128)); + *dstPtrTempB = static_cast(RPPPIXELCHECKI8(((static_cast((*srcPtr1TempB + 128) ^ static_cast(*srcPtr2TempB + 128)))) - 128)); + srcPtr1TempR++; + srcPtr1TempG++; + srcPtr1TempB++; + srcPtr2TempR++; + srcPtr2TempG++; + srcPtr2TempB++; + dstPtrTempR++; + dstPtrTempG++; + dstPtrTempB++; + } + + srcPtr1RowR += srcDescPtr->strides.hStride; + srcPtr1RowG += srcDescPtr->strides.hStride; + srcPtr1RowB += srcDescPtr->strides.hStride; + srcPtr2RowR += srcDescPtr->strides.hStride; + srcPtr2RowG += srcDescPtr->strides.hStride; + srcPtr2RowB += srcDescPtr->strides.hStride; + dstPtrTempR += dstDescPtr->strides.hStride; + dstPtrTempG += dstDescPtr->strides.hStride; + dstPtrTempB += dstDescPtr->strides.hStride; + } + } // Exclusive OR without fused output-layout toggle (NHWC -> NHWC or NCHW -> NCHW) else { alignedLength = bufferLength & ~31; - for (int c = 0; c < layoutParams.channelParam; c++) - { - Rpp8s *srcPtr1Row, *srcPtr2Row, *dstPtrRow; - srcPtr1Row = srcPtr1Channel; - srcPtr2Row = srcPtr2Channel; - dstPtrRow = dstPtrChannel; - - for (int i = 0; i < roi.xywhROI.roiHeight; i++) - { - Rpp8s *srcPtr1Temp, *srcPtr2Temp, *dstPtrTemp; - srcPtr1Temp = srcPtr1Row; - srcPtr2Temp = srcPtr2Row; - dstPtrTemp = dstPtrRow; + Rpp8s *srcPtr1Row, *srcPtr2Row, *dstPtrRow; + srcPtr1Row = srcPtr1Channel; + srcPtr2Row = srcPtr2Channel; + dstPtrRow = dstPtrChannel; - int vectorLoopCount = 0; + for (int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp8s *srcPtr1Temp, 
*srcPtr2Temp, *dstPtrTemp; + srcPtr1Temp = srcPtr1Row; + srcPtr2Temp = srcPtr2Row; + dstPtrTemp = dstPtrRow; - for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) - { - __m256i p1, p2; + int vectorLoopCount = 0; - p1 = _mm256_add_epi8(avx_pxConvertI8, _mm256_loadu_si256((__m256i *)srcPtr1Temp)); // simd loads - p2 = _mm256_add_epi8(avx_pxConvertI8, _mm256_loadu_si256((__m256i *)srcPtr2Temp)); // simd loads - p1 = _mm256_xor_si256(p1, p2); // exclusive_or computation - _mm256_storeu_si256((__m256i *)dstPtrTemp, _mm256_sub_epi8(p1, avx_pxConvertI8)); // simd stores + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + __m256i p1, p2; - srcPtr1Temp += vectorIncrementPerChannel; - srcPtr2Temp += vectorIncrementPerChannel; - dstPtrTemp += vectorIncrementPerChannel; - } - for (; vectorLoopCount < bufferLength; vectorLoopCount++) - { - *dstPtrTemp++ = static_cast(RPPPIXELCHECKI8(((static_cast((*srcPtr1Temp + 128) ^ static_cast(*srcPtr2Temp + 128)))) - 128)); + p1 = _mm256_add_epi8(avx_pxConvertI8, _mm256_loadu_si256((__m256i *)srcPtr1Temp)); // simd loads + p2 = _mm256_add_epi8(avx_pxConvertI8, _mm256_loadu_si256((__m256i *)srcPtr2Temp)); // simd loads + p1 = _mm256_xor_si256(p1, p2); // exclusive_or computation + _mm256_storeu_si256((__m256i *)dstPtrTemp, _mm256_sub_epi8(p1, avx_pxConvertI8)); // simd stores - srcPtr1Temp++; - srcPtr2Temp++; - } + srcPtr1Temp += vectorIncrementPerChannel; + srcPtr2Temp += vectorIncrementPerChannel; + dstPtrTemp += vectorIncrementPerChannel; + } + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + *dstPtrTemp++ = static_cast(RPPPIXELCHECKI8(((static_cast((*srcPtr1Temp + 128) ^ static_cast(*srcPtr2Temp + 128)))) - 128)); - srcPtr1Row += srcDescPtr->strides.hStride; - srcPtr2Row += srcDescPtr->strides.hStride; - dstPtrRow += dstDescPtr->strides.hStride; + srcPtr1Temp++; + srcPtr2Temp++; } - srcPtr1Channel += srcDescPtr->strides.cStride; - srcPtr2Channel 
+= srcDescPtr->strides.cStride; - dstPtrChannel += dstDescPtr->strides.cStride; + srcPtr1Row += srcDescPtr->strides.hStride; + srcPtr2Row += srcDescPtr->strides.hStride; + dstPtrRow += dstDescPtr->strides.hStride; } + + srcPtr1Channel += srcDescPtr->strides.cStride; + srcPtr2Channel += srcDescPtr->strides.cStride; + dstPtrChannel += dstDescPtr->strides.cStride; } } From 10eddc3c702f416184c3930a5c7dbd7104cc37dd Mon Sep 17 00:00:00 2001 From: Srihari-mcw Date: Tue, 24 Sep 2024 08:55:32 +0530 Subject: [PATCH 15/38] Update separate code for PLN3 to PLN3 F32 --- src/modules/cpu/kernel/exclusive_or.hpp | 153 ++++++++++++++++++------ 1 file changed, 115 insertions(+), 38 deletions(-) diff --git a/src/modules/cpu/kernel/exclusive_or.hpp b/src/modules/cpu/kernel/exclusive_or.hpp index 40158412a..cba0dcb58 100644 --- a/src/modules/cpu/kernel/exclusive_or.hpp +++ b/src/modules/cpu/kernel/exclusive_or.hpp @@ -493,6 +493,85 @@ RppStatus exclusive_or_f32_f32_host_tensor(Rpp32f *srcPtr1, dstPtrRow += dstDescPtr->strides.hStride; } } + // Exclusive OR without fused output-layout toggle (NCHW -> NCHW for 3 channel) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NCHW)) + { + Rpp32f *srcPtr1RowR, *srcPtr1RowG, *srcPtr1RowB, *srcPtr2RowR, *srcPtr2RowG, *srcPtr2RowB, *dstPtrRowR, *dstPtrRowG, *dstPtrRowB; + srcPtr1RowR = srcPtr1Channel; + srcPtr1RowG = srcPtr1RowR + srcDescPtr->strides.cStride; + srcPtr1RowB = srcPtr1RowG + srcDescPtr->strides.cStride; + srcPtr2RowR = srcPtr2Channel; + srcPtr2RowG = srcPtr2RowR + srcDescPtr->strides.cStride; + srcPtr2RowB = srcPtr2RowG + srcDescPtr->strides.cStride; + dstPtrRowR = dstPtrChannel; + dstPtrRowG = dstPtrRowR + dstDescPtr->strides.cStride; + dstPtrRowB = dstPtrRowG + dstDescPtr->strides.cStride; + + for (int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp32f *srcPtr1TempR, *srcPtr1TempG, *srcPtr1TempB, *srcPtr2TempR, *srcPtr2TempG, *srcPtr2TempB, *dstPtrTemp; + srcPtr1TempR = srcPtr1RowR; +
srcPtr1TempG = srcPtr1RowG; + srcPtr1TempB = srcPtr1RowB; + srcPtr2TempR = srcPtr2RowR; + srcPtr2TempG = srcPtr2RowG; + srcPtr2TempB = srcPtr2RowB; + dstPtrTempR = dstPtrRowR; + dstPtrTempG = dstPtrRowG; + dstPtrTempB = dstPtrRowB; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + __m256 p1[3], p2[3]; + + rpp_simd_load(rpp_load24_f32pln3_to_f32pln3_avx, srcPtr1TempR, srcPtr1TempG, srcPtr1TempB, p1); // simd loads + rpp_simd_load(rpp_load24_f32pln3_to_f32pln3_avx, srcPtr2TempR, srcPtr2TempG, srcPtr2TempB, p2); // simd loads + p1[0] = _mm256_cvtepi32_ps(_mm256_xor_si256(_mm256_cvttps_epi32(_mm256_mul_ps(p1[0], avx_p255)), _mm256_cvttps_epi32(_mm256_mul_ps(p2[0], avx_p255)))); // exclusive_or computation + p1[1] = _mm256_cvtepi32_ps(_mm256_xor_si256(_mm256_cvttps_epi32(_mm256_mul_ps(p1[1], avx_p255)), _mm256_cvttps_epi32(_mm256_mul_ps(p2[1], avx_p255)))); // exclusive_or computation + p1[2] = _mm256_cvtepi32_ps(_mm256_xor_si256(_mm256_cvttps_epi32(_mm256_mul_ps(p1[2], avx_p255)), _mm256_cvttps_epi32(_mm256_mul_ps(p2[2], avx_p255)))); // exclusive_or computation + p1[0] = _mm256_mul_ps(p1[0], avx_p1op255); + p1[1] = _mm256_mul_ps(p1[1], avx_p1op255); + p1[2] = _mm256_mul_ps(p1[2], avx_p1op255); + rpp_simd_store(rpp_store24_f32pln3_to_f32pln3_avx, dstPtrTempR, dstPtrTempG, dstPtrTempB, p1); // simd stores + + srcPtr1TempR += vectorIncrementPerChannel; + srcPtr1TempG += vectorIncrementPerChannel; + srcPtr1TempB += vectorIncrementPerChannel; + srcPtr2TempR += vectorIncrementPerChannel; + srcPtr2TempG += vectorIncrementPerChannel; + srcPtr2TempB += vectorIncrementPerChannel; + dstPtrTempR += vectorIncrementPerChannel; + dstPtrTempG += vectorIncrementPerChannel; + dstPtrTempB += vectorIncrementPerChannel; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + *dstPtrTempR++ = RPPPIXELCHECKF32((float)((uint)(*srcPtr1TempR * 255) ^ (uint)(*srcPtr2TempR * 255)) / 255); 
+ *dstPtrTempG++ = RPPPIXELCHECKF32((float)((uint)(*srcPtr1TempG * 255) ^ (uint)(*srcPtr2TempG * 255)) / 255); + *dstPtrTempB++ = RPPPIXELCHECKF32((float)((uint)(*srcPtr1TempB * 255) ^ (uint)(*srcPtr2TempB * 255)) / 255); + + srcPtr1TempR++; + srcPtr1TempG++; + srcPtr1TempB++; + srcPtr2TempR++; + srcPtr2TempG++; + srcPtr2TempB++; + } + + srcPtr1RowR += srcDescPtr->strides.hStride; + srcPtr1RowG += srcDescPtr->strides.hStride; + srcPtr1RowB += srcDescPtr->strides.hStride; + srcPtr2RowR += srcDescPtr->strides.hStride; + srcPtr2RowG += srcDescPtr->strides.hStride; + srcPtr2RowB += srcDescPtr->strides.hStride; + dstPtrRowR += dstDescPtr->strides.hStride; + dstPtrRowG += dstDescPtr->strides.hStride; + dstPtrRowB += dstDescPtr->strides.hStride; + } + } // Exclusive OR without fused output-layout toggle (NHWC -> NHWC or NCHW -> NCHW) else @@ -501,54 +580,52 @@ RppStatus exclusive_or_f32_f32_host_tensor(Rpp32f *srcPtr1, alignedLength = bufferLength & ~7; #endif - for (int c = 0; c < layoutParams.channelParam; c++) - { - Rpp32f *srcPtr1Row, *srcPtr2Row, *dstPtrRow; - srcPtr1Row = srcPtr1Channel; - srcPtr2Row = srcPtr2Channel; - dstPtrRow = dstPtrChannel; + Rpp32f *srcPtr1Row, *srcPtr2Row, *dstPtrRow; + srcPtr1Row = srcPtr1Channel; + srcPtr2Row = srcPtr2Channel; + dstPtrRow = dstPtrChannel; - for (int i = 0; i < roi.xywhROI.roiHeight; i++) - { - Rpp32f *srcPtr1Temp, *srcPtr2Temp, *dstPtrTemp; - srcPtr1Temp = srcPtr1Row; - srcPtr2Temp = srcPtr2Row; - dstPtrTemp = dstPtrRow; + for (int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp32f *srcPtr1Temp, *srcPtr2Temp, *dstPtrTemp; + srcPtr1Temp = srcPtr1Row; + srcPtr2Temp = srcPtr2Row; + dstPtrTemp = dstPtrRow; - int vectorLoopCount = 0; + int vectorLoopCount = 0; #if __AVX2__ - for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) - { - __m256 p1[1], p2[1]; + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + __m256 p1[1], p2[1]; - 
rpp_simd_load(rpp_load8_f32_to_f32_avx, srcPtr1Temp, p1); // simd loads - rpp_simd_load(rpp_load8_f32_to_f32_avx, srcPtr2Temp, p2); // simd loads - p1[0] = _mm256_cvtepi32_ps(_mm256_xor_si256(_mm256_cvttps_epi32(_mm256_mul_ps(p1[0], avx_p255)), _mm256_cvttps_epi32(_mm256_mul_ps(p2[0], avx_p255)))); // exclusive_or computation - p1[0] = _mm256_mul_ps(p1[0], avx_p1op255); - rpp_simd_store(rpp_store8_f32_to_f32_avx, dstPtrTemp, p1); // simd stores + rpp_simd_load(rpp_load8_f32_to_f32_avx, srcPtr1Temp, p1); // simd loads + rpp_simd_load(rpp_load8_f32_to_f32_avx, srcPtr2Temp, p2); // simd loads + p1[0] = _mm256_cvtepi32_ps(_mm256_xor_si256(_mm256_cvttps_epi32(_mm256_mul_ps(p1[0], avx_p255)), _mm256_cvttps_epi32(_mm256_mul_ps(p2[0], avx_p255)))); // exclusive_or computation + p1[0] = _mm256_mul_ps(p1[0], avx_p1op255); + rpp_simd_store(rpp_store8_f32_to_f32_avx, dstPtrTemp, p1); // simd stores - srcPtr1Temp += vectorIncrementPerChannel; - srcPtr2Temp += vectorIncrementPerChannel; - dstPtrTemp += vectorIncrementPerChannel; - } + srcPtr1Temp += vectorIncrementPerChannel; + srcPtr2Temp += vectorIncrementPerChannel; + dstPtrTemp += vectorIncrementPerChannel; + } #endif - for (; vectorLoopCount < bufferLength; vectorLoopCount++) - { - *dstPtrTemp++ = RPPPIXELCHECKF32((float)((uint)(*srcPtr1Temp * 255) ^ (uint)(*srcPtr2Temp * 255)) / 255); - - srcPtr1Temp++; - srcPtr2Temp++; - } + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + *dstPtrTemp++ = RPPPIXELCHECKF32((float)((uint)(*srcPtr1Temp * 255) ^ (uint)(*srcPtr2Temp * 255)) / 255); - srcPtr1Row += srcDescPtr->strides.hStride; - srcPtr2Row += srcDescPtr->strides.hStride; - dstPtrRow += dstDescPtr->strides.hStride; + srcPtr1Temp++; + srcPtr2Temp++; } - srcPtr1Channel += srcDescPtr->strides.cStride; - srcPtr2Channel += srcDescPtr->strides.cStride; - dstPtrChannel += dstDescPtr->strides.cStride; + srcPtr1Row += srcDescPtr->strides.hStride; + srcPtr2Row += srcDescPtr->strides.hStride; + dstPtrRow += 
dstDescPtr->strides.hStride; } + + srcPtr1Channel += srcDescPtr->strides.cStride; + srcPtr2Channel += srcDescPtr->strides.cStride; + dstPtrChannel += dstDescPtr->strides.cStride; + } } From 893716437c0d7ca0f42a1f27d7c77b154159f4b4 Mon Sep 17 00:00:00 2001 From: Srihari-mcw Date: Tue, 24 Sep 2024 09:08:34 +0530 Subject: [PATCH 16/38] Fix compilation issues --- src/modules/cpu/kernel/exclusive_or.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/modules/cpu/kernel/exclusive_or.hpp b/src/modules/cpu/kernel/exclusive_or.hpp index cba0dcb58..ffac2e655 100644 --- a/src/modules/cpu/kernel/exclusive_or.hpp +++ b/src/modules/cpu/kernel/exclusive_or.hpp @@ -509,7 +509,7 @@ RppStatus exclusive_or_f32_f32_host_tensor(Rpp32f *srcPtr1, for (int i = 0; i < roi.xywhROI.roiHeight; i++) { - Rpp32f *srcPtr1TempR, *srcPtr1TempG, *srcPtr1TempB, *srcPtr2TempR, *srcPtr2TempG, *srcPtr2TempB, *dstPtrTemp; + Rpp32f *srcPtr1TempR, *srcPtr1TempG, *srcPtr1TempB, *srcPtr2TempR, *srcPtr2TempG, *srcPtr2TempB, *dstPtrTempR, *dstPtrTempG, *dstPtrTempB; srcPtr1TempR = srcPtr1RowR; srcPtr1TempG = srcPtr1RowG; srcPtr1TempB = srcPtr1RowB; @@ -1069,7 +1069,7 @@ RppStatus exclusive_or_i8_i8_host_tensor(Rpp8s *srcPtr1, dstPtrRowB = dstPtrRowG + dstDescPtr->strides.cStride; for (int i = 0; i < roi.xywhROI.roiHeight; i++) { - Rpp8u *srcPtr1TempR, *srcPtr1TempG, *srcPtr1TempB, *srcPtr2TempR, *srcPtr2TempG, *srcPtr2TempB, *dstPtrTempR, *dstPtrTempG, *dstPtrTempB; + Rpp8s *srcPtr1TempR, *srcPtr1TempG, *srcPtr1TempB, *srcPtr2TempR, *srcPtr2TempG, *srcPtr2TempB, *dstPtrTempR, *dstPtrTempG, *dstPtrTempB; srcPtr1TempR = srcPtr1RowR; srcPtr1TempG = srcPtr1RowG; srcPtr1TempB = srcPtr1RowB; From b52cef4c76942aaccd2489deb442fcbe5d00d841 Mon Sep 17 00:00:00 2001 From: Srihari-mcw Date: Tue, 24 Sep 2024 13:03:42 +0530 Subject: [PATCH 17/38] Fix accuracy issues for PLN3 to PLN3 --- src/modules/cpu/kernel/exclusive_or.hpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff 
--git a/src/modules/cpu/kernel/exclusive_or.hpp b/src/modules/cpu/kernel/exclusive_or.hpp index ffac2e655..b7a15835f 100644 --- a/src/modules/cpu/kernel/exclusive_or.hpp +++ b/src/modules/cpu/kernel/exclusive_or.hpp @@ -263,9 +263,9 @@ RppStatus exclusive_or_u8_u8_host_tensor(Rpp8u *srcPtr1, srcPtr2RowR += srcDescPtr->strides.hStride; srcPtr2RowG += srcDescPtr->strides.hStride; srcPtr2RowB += srcDescPtr->strides.hStride; - dstPtrTempR += dstDescPtr->strides.hStride; - dstPtrTempG += dstDescPtr->strides.hStride; - dstPtrTempB += dstDescPtr->strides.hStride; + dstPtrRowR += dstDescPtr->strides.hStride; + dstPtrRowG += dstDescPtr->strides.hStride; + dstPtrRowB += dstDescPtr->strides.hStride; } } // Exclusive OR without fused output-layout toggle (NHWC -> NHWC or NCHW -> NCHW) @@ -1126,9 +1126,9 @@ RppStatus exclusive_or_i8_i8_host_tensor(Rpp8s *srcPtr1, srcPtr2RowR += srcDescPtr->strides.hStride; srcPtr2RowG += srcDescPtr->strides.hStride; srcPtr2RowB += srcDescPtr->strides.hStride; - dstPtrTempR += dstDescPtr->strides.hStride; - dstPtrTempG += dstDescPtr->strides.hStride; - dstPtrTempB += dstDescPtr->strides.hStride; + dstPtrRowR += dstDescPtr->strides.hStride; + dstPtrRowG += dstDescPtr->strides.hStride; + dstPtrRowB += dstDescPtr->strides.hStride; } } // Exclusive OR without fused output-layout toggle (NHWC -> NHWC or NCHW -> NCHW) From f5470307449abb6c48fad97685ca32876522f1b0 Mon Sep 17 00:00:00 2001 From: Srihari-mcw Date: Tue, 24 Sep 2024 13:20:32 +0530 Subject: [PATCH 18/38] Add comments and formatting --- src/modules/cpu/kernel/exclusive_or.hpp | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/src/modules/cpu/kernel/exclusive_or.hpp b/src/modules/cpu/kernel/exclusive_or.hpp index b7a15835f..6f971e1a7 100644 --- a/src/modules/cpu/kernel/exclusive_or.hpp +++ b/src/modules/cpu/kernel/exclusive_or.hpp @@ -192,6 +192,8 @@ RppStatus exclusive_or_u8_u8_host_tensor(Rpp8u *srcPtr1, dstPtrRow += dstDescPtr->strides.hStride; } } + + // 
Exclusive OR without fused output-layout toggle (NCHW -> NCHW for 3 channel) else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NCHW)) { Rpp8u *srcPtr1RowR, *srcPtr1RowG, *srcPtr1RowB, *srcPtr2RowR, *srcPtr2RowG, *srcPtr2RowB, *dstPtrRowR, *dstPtrRowG, *dstPtrRowB; @@ -268,7 +270,8 @@ RppStatus exclusive_or_u8_u8_host_tensor(Rpp8u *srcPtr1, dstPtrRowB += dstDescPtr->strides.hStride; } } - // Exclusive OR without fused output-layout toggle (NHWC -> NHWC or NCHW -> NCHW) + + // Exclusive OR without fused output-layout toggle (NHWC -> NHWC or NCHW -> NCHW for 1 channel) else { alignedLength = bufferLength & ~31; @@ -493,7 +496,8 @@ RppStatus exclusive_or_f32_f32_host_tensor(Rpp32f *srcPtr1, dstPtrRow += dstDescPtr->strides.hStride; } } - // Exclusive OR with fused output-layout toggle (NCHW -> NHWC) + + // Exclusive OR without fused output-layout toggle (NCHW -> NCHW for 3 channel) else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NCHW)) { Rpp32f *srcPtr1RowR, *srcPtr1RowG, *srcPtr1RowB, *srcPtr2RowR, *srcPtr2RowG, *srcPtr2RowB, *dstPtrRowR, *dstPtrRowG, *dstPtrRowB; @@ -573,7 +577,7 @@ RppStatus exclusive_or_f32_f32_host_tensor(Rpp32f *srcPtr1, } } - // Exclusive OR without fused output-layout toggle (NHWC -> NHWC or NCHW -> NCHW) + // Exclusive OR without fused output-layout toggle (NHWC -> NHWC or NCHW -> NCHW for 1 channel) else { #if __AVX2__ @@ -1055,6 +1059,8 @@ RppStatus exclusive_or_i8_i8_host_tensor(Rpp8s *srcPtr1, dstPtrRow += dstDescPtr->strides.hStride; } } + + // Exclusive OR without fused output-layout toggle (NCHW -> NCHW for 3 channel) else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NCHW)) { Rpp8s *srcPtr1RowR, *srcPtr1RowG, *srcPtr1RowB, *srcPtr2RowR, *srcPtr2RowG, *srcPtr2RowB, *dstPtrRowR, *dstPtrRowG, *dstPtrRowB; @@ -1131,7 +1137,8 @@ RppStatus 
exclusive_or_i8_i8_host_tensor(Rpp8s *srcPtr1, dstPtrRowB += dstDescPtr->strides.hStride; } } - // Exclusive OR without fused output-layout toggle (NHWC -> NHWC or NCHW -> NCHW) + + // Exclusive OR without fused output-layout toggle (NHWC -> NHWC or NCHW -> NCHW for 1 channel) else { alignedLength = bufferLength & ~31; From 069165dde2205f3c9543ab5ad32bb366815c4b37 Mon Sep 17 00:00:00 2001 From: Srihari-mcw Date: Tue, 24 Sep 2024 13:30:12 +0530 Subject: [PATCH 19/38] Rearrange the function declarations --- include/rppt_tensor_logical_operations.h | 33 ++++++++++++------------ 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/include/rppt_tensor_logical_operations.h b/include/rppt_tensor_logical_operations.h index 03d1b17d9..fec0c3d53 100644 --- a/include/rppt_tensor_logical_operations.h +++ b/include/rppt_tensor_logical_operations.h @@ -86,28 +86,26 @@ RppStatus rppt_bitwise_and_host(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPtr RppStatus rppt_bitwise_and_gpu(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle); #endif // GPU_SUPPORT -#ifdef GPU_SUPPORT -/*! \brief Exclusive OR computation on HIP backend for a NCHW/NHWC layout tensor +/*! \brief Exclusive OR computation on HOST backend for a NCHW/NHWC layout tensor * \details This function computes exclusive OR of corresponding pixels for a batch of RGB(3 channel) / greyscale(1 channel) images with an NHWC/NCHW tensor layout.
* srcPtr depth ranges - Rpp8u (0 to 255), Rpp16f (0 to 1), Rpp32f (0 to 1), Rpp8s (-128 to 127). * dstPtr depth ranges - Will be same depth as srcPtr. * \image html img150x150.png Sample Input1 * \image html img150x150_2.png Sample Input2 * \image html logical_operations_exclusive_or_img150x150.png Sample Output - * \param [in] srcPtr1 source1 tensor in HIP memory - * \param [in] srcPtr2 source2 tensor in HIP memory + * \param [in] srcPtr1 source1 tensor in HOST memory + * \param [in] srcPtr2 source2 tensor in HOST memory * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) - * \param [out] dstPtr destination tensor in HIP memory + * \param [out] dstPtr destination tensor in HOST memory * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) - * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) - * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() + * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() * \return A \ref RppStatus enumeration. * \retval RPP_SUCCESS Successful completion. * \retval RPP_ERROR* Unsuccessful completion. 
*/ -RppStatus rppt_exclusive_or_gpu(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle); -#endif // GPU_SUPPORT +RppStatus rppt_exclusive_or_host(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle); /*! \brief Bitwise OR computation on HOST backend for a NCHW/NHWC layout tensor * \details This function computes bitwise OR of corresponding pixels for a batch of RGB(3 channel) / greyscale(1 channel) images with an NHWC/NCHW tensor layout.
@@ -153,27 +151,28 @@ RppStatus rppt_bitwise_or_host(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPtr s RppStatus rppt_bitwise_or_gpu(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle); #endif // GPU_SUPPORT -/*! \brief Exclusive OR computation on HOST backend for a NCHW/NHWC layout tensor +#ifdef GPU_SUPPORT +/*! \brief Exclusive OR computation on HIP backend for a NCHW/NHWC layout tensor * \details This function computes exclusive OR of corresponding pixels for a batch of RGB(3 channel) / greyscale(1 channel) images with an NHWC/NCHW tensor layout.
* srcPtr depth ranges - Rpp8u (0 to 255), Rpp16f (0 to 1), Rpp32f (0 to 1), Rpp8s (-128 to 127). * dstPtr depth ranges - Will be same depth as srcPtr. * \image html img150x150.png Sample Input1 * \image html img150x150_2.png Sample Input2 * \image html logical_operations_exclusive_or_img150x150.png Sample Output - * \param [in] srcPtr1 source1 tensor in HOST memory - * \param [in] srcPtr2 source2 tensor in HOST memory + * \param [in] srcPtr1 source1 tensor in HIP memory + * \param [in] srcPtr2 source2 tensor in HIP memory * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) - * \param [out] dstPtr destination tensor in HOST memory + * \param [out] dstPtr destination tensor in HIP memory * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) - * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) - * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() + * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() * \return A \ref RppStatus enumeration. * \retval RPP_SUCCESS Successful completion. * \retval RPP_ERROR* Unsuccessful completion. 
*/ -RppStatus rppt_exclusive_or_host(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle); - +RppStatus rppt_exclusive_or_gpu(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle); +#endif // GPU_SUPPORT /*! @} */ From a420d14325a8f6b5b46aa6e83d583dadf50b6f23 Mon Sep 17 00:00:00 2001 From: Srihari-mcw Date: Tue, 24 Sep 2024 13:40:49 +0530 Subject: [PATCH 20/38] Add golden outputs for exclusive or --- .../exclusive_or/exclusive_or_u8_Tensor.bin | Bin 0 -> 273600 bytes 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 utilities/test_suite/REFERENCE_OUTPUT/exclusive_or/exclusive_or_u8_Tensor.bin diff --git a/utilities/test_suite/REFERENCE_OUTPUT/exclusive_or/exclusive_or_u8_Tensor.bin b/utilities/test_suite/REFERENCE_OUTPUT/exclusive_or/exclusive_or_u8_Tensor.bin new file mode 100644 index 0000000000000000000000000000000000000000..1c6464a29976d172b913fd645460e6bdce8a5a77 GIT binary patch literal 273600 zcmeF(rKG!2R1{E@QV>MC1Qd`|;P5-=`~$pm zt!K}=)?V}CzFu6P@4jd6wP)5)wg-7Oxf{0FnGXh8Hp1v8h-F{ka}GHf*E?Gl>YJBX zINE^iUBQ@OJx1MtR|crB^eBgo5QgX-;}=@W_MR_ZG$y>t`1>KZb*r+U92{!Ht!yr6 z1YsIv_5Q-6F4l)E^CkrE2dZFS1Y?8GMylmTO83Xq_Nmt7GC*vd`&xoy2fu3HDCa7GA ztw6!BTuT{Tz6<3A{ zV^yh_i&^~2O*P9cAVtbd#oAnX zAsQ9V^gUjb!-^z>L8rT7DN^Tyy@x}kOWGnTjjvLD(KTD!xA2Mm=BhQdNrhRV!3s#i zs2UO0wzg#4R(NcV@Jgg)>a=7~bXjdw+-`i7YEO(RD0-bKalhfwTOOP#M%hsvlfiIs zl@n8yCq+9Np*G@ErvTP7J@fQWRrd4MHV+e{h!7<5<6(rcf&#>RT-og$P3^M{p_yus zLPh#)MZ{jG6>OHj=Yp*Qm#Z_^We!~dY7VA!3QS~M`cSXW3xA>Je#tkvX%9K^cln)d ztyOVVzG@T^W@s=oWNTA&S92Wcb|mClLij;vp=wLMYD>;WgWbs6(BJo^zqe z<2O1k(-3FqtXHY8TI(37yDE998E{K*qlJ19grWDR6vX6TrIx3RWyg+H<;C~L#&@I- zC1nVs&|rwVP>BbU=%PRFGK4-PxB*h!XC&xtG@`9kvb7BR^M0i7eeqS$Xz zAVB+(K_lo-odQ@-1Evmln-trhKbn6KTlyn8?@we@{zyT5S66L9e^O6J 
zL34L*Y33$|G@|evGjAgdp+6A2g&g4-5keCwY893S5r&E%$z$=7hC-;p>(4^`#vsbJGywQJMN-2qjV^_xGtl# zp|>!jJF~66xF|bz31Q=Op1lF>4g7OrtX6!?YI~U)+N&>~3*6y&+T;ORl)xZ(Z?)e4g*zqk=_LmnFX8}#R7~u z5Ws9todQ@-r)|fi?7;N}i{e6vZ(iW!oOI^=I|D8!r8%;#x4yn(NYr8O_{JwKO<3yB z2fheIw`q?qXsdkO;OWeovJ72y`UEJ*BoMQCIee7yp!eQD5smK$eq8Nr~7(0WK(^HKN znKI(joBuS+hAL3#Ez+YC5P){Y6lD2BIk@-a*qC=@Xue}(jf&BBaRg3DlKc#m88TX&MiFE9~9d^8%jxnxGg~(if${uV7gEvSqp6XZd&fo2$;v% zm=E?!TW2#1XmP(e>g$;ekwUubq4aGPWv;Uyhoji32 zU_BhSGK5x``4S!P5`&nONnrQt_xR%J_nG%e8>crbtx~$fTE0ICi0i}yuLWV><<_1v zIWa5D9vJC#`^ZgG5?~T=;#srPk{>%LzOz&OrW^4?uJDpiWmnDx+eqFJQNSM;hV;Pg zDw~uR*Roeh7v@7Fi0Nfy0fwm3xMP~mC5ZG?!}Y4|`&*6U(Ip+yMa}9APUsH5HwA)u zs(`q52%TmKH`GIt(^%NjM-t*PxO~j3Z7rZ?2`2eODSu;WvdAdEj7y%wO@3DMtJmVo zyqLeby~qkZn2W=OOL7ZpOpgna70#a(3z^n+pU|(Ika0(|v8Gj_r#dIYXADugR`%{u z_ZE!J>}=aSY&qB;+PtU0V08NM;PLGzWU|O~Di<*evZE4Oa;lkm>XkTamp$(N)G2`V z^kLd8p~~zLawOsUyurqww>=x~DF#j%tFDaHQQAbT@6Fm?X*YG#Lbogb_6Q(IkDYfD z?9=%0orka!hm**7?AjUPsTx!GgNeg}e?YB27}QM5jwT7j=W}@UuS~xTMtP8>+vP5a zNz)1jb1``+jW%`tO5t_a04--2yvQ@2XfU6y7jJ6catlnnF)!cokNu;U@n8hx@>Bul z0Ch1xZ5Agxb`SfOBesrNNb?_`%6m1jjU+aMTWS6GWccyufvKk19ZJoGLMSxwNQZ%~ z3pz_nG9;edER5jyj6v&amKz!h_g2n-K;Abd3kd|Xv9u*)B8Au1I^S8G$tl9SCp;NG z8mMeay5eDAyU{pUM`Rys)J64n3_N>L`+U4*sB3V&AnJpvM%SuV^tS0&9P_760jw9Z z4WhHNlT-u8P2wl~c4<6>ecK8czclle$&%|@CULw)^mz#vIwz&FjLTU>0r$ZLOHgbc zOXnC_;WL@;XZq-BblZ2z!4H9aJWk$w_R$|f%bnDBx!7kp^lv&vQ!f2I-mK`2UYW5! 
zI9B{3fjbu@p5?)lM$?j@K6*3KH&Q>^GuqwxxhZ3{tl-8^yXKRBrl@M{Hz<(HQw5kj zgZXr=hkg;J%}~TnI5b?j{lW8HAI&Mu{e*8-jHLDwSN9_ucY`cwSebFe2F^r%*QxZz zv*E&NhRcOXMpx%RG4v+XgDBaY%TABi#(mbWOilcgFz8h%{9OrkaS#qK2DTqL6Po0S z$Oa}n{#$;UC0W6F)9Bp3mm`HQ%IiA%nwyh{yJ8osiV{$fTZPfymbm3iiavD;U_D2% z9j>C;&;x1ZF?_-eJQ2U<{c7&l`x$ZnU(&jtkTsUEAk!G4-2`;`A!v>U)Xs0%SmGPa zjmc?oEVy>PEKuI1aIo~~P~=&l6$;Z=v9s0@zG+iVAFujmP*a9icowmxXig9=nwVY3 zV8t2A@(|j#<$!?+cK6Iu=3G6lv13K9I{(1%%5& zmJ!6#U*L}&!lAeruGK_UW;HnXzmxLa=2!nr?y-ucFwITY8uGEgd&5$ZM6U7;iUA>< zn{+@VQcbCT{(YC24BL+LzvQMTgOQR-N+lXr%nN^SGKpI z2M5hi5ewQGPWGfwDD(yH8%x;dB%3PQ7TL~g{VoVC~8xCAiM^pG&kFZA_O*G1B`OBUjrspInJ&4si$TNRo%ivH-uAd-92?9OAAV zWX~I6X@ov;HxbCG8jY`3Dd^Zt8Z=Mk^ATP1qkupIi&z<}c%BxuQ?)F)po?X6XHU!S z_jm1C>oy+p52cusso0yDqdV+N2hdk_9<)F8y+w75_6&cd_|KddD*lP2HJra|&xudU zxg1Nlj)@PWr^n_coAfvMTH*9NSY0H|P^PPnt!N7Bsc)neM$b*Rv9a;wG5+%n;D`u0 zJBKACZRH~qbnYxVixd)rRw>K3hn6L`#q(3U%Rjm z#|iGB7^}`=B&N>;g!=sjD{N_|bB(Al&S>fxa63f53iPy)*bss;S#V|g-wWst3| zi}%bu6%|uQ{IA}kuU#@~I~?*a3G7Entw)IjqJobrk*)$Ai3)7cWDxmZtbOgi>nS0+ zQTF?ufU}*GZi!YKj@=#4aFj>c(oPa;A(q^eCcN`vP0J*&+s~Hq1N>jZ#jI>p;pe59 z6>lQe#@p`%P=B}r*Dy!I+ ztyXI`Iee~Wio*X0K3q)zCB#i!Oax9_1@T<^R(CQm77v;*Zq~tkCsGVL0x_SelFwvqWoGyRWNCmfp1?@b35$ zPm-Lw!(A4m%vX`uK zTib857hm~SntVQOaZiTIBV<)wOd3EbDa?dGh(Jq#hDm`dMTw10gorOnD`!WqY)jlM=&tJO z@Zqft-|tC@`%T188^Rhv4E#3Gk^uX14|f+^c8LH<8RsBHsJymPm^<*C{K+}$`$a$k z95pO_lp(>dx)hHxTE<_QyYgQ>^aY^sp0*j9^hX$6Ghc~S@y62W0|vJ(iiS%;}M zB_|&r)FK^hzegcfFAJ&93NvZ&axb?w&$MP;G!1q()zp&<&9RbRWPb7(|M`Z>#GsmD zZ}sZXiD!C!&x|^o{r9T1$gzHJHv~RNA^aHd$^H&vc4v_$<~4kl+FzgbCbx37!WmgX zDg+KM5Rq636W0ch!Ls8vIiw}p90F!~uQbR?Ak(3r5+X|7=0UMV_l$;8g%Ewsh znWb0Vpe^?H0m}|af4^^o~IX+9A&}>{n8nzje9RG>81ffLo;ZB_p7E%TLDD8KyFl%GzGX;G~Qsh$kh0 z?S_e^fr)2Agv?Hfe5>jrjf8_8>~r$0!(4#1&?ADBjZW8|nLM1p(Od`=hN9(26k#ET zC*y1;xb%Bbd~bWSe$>4#(!MIgATQpzustGw6t?+RyeTuVG|IX;*Y(L`{O23;3cLz3 z!`d?Z$3}{W`b(<%JVysW!($#OgfyR5F;;k`34KUkdu!*lX8vNHZP8NFHc*$uMc%^1t_3T{I$DmZS*ep@J`i)z zoMOu|A(-Ki)rpvgMlL3m_Lh_yk 
zRx}2cRC|@Sc~{hgRgM|vjd?tebnZ@Z>xpt7>9tR&gkeleh&R+>)eg|dsNFkjVu)d( ziDP|bl3nG&TII0&t44ZUIhNeoo7YtBsHh{wO$!Ev{5isl&9&*T@fpzarH8{@eW%is z>x2JJL{{`Ko8)m*Y(u12-5VWx&_oHsO6$vH*QZ9v63Q4dm}^W-u9^9@aPwgpwr z2`%<9jS+E8frT9rxpj{DJ@zrx{*BRQ*^xG*#l^XWrT5t}MoW2S!v!BppZ#?-elTa< zQifYMAP@y1X~^OHH1>JpCC4ktLm|l2f5!IK*kx9KLt}>U=(ATjb_;2aWd4CT zUhZHRwi}2PYD7lr?RQkI(v+4~9O;`^5uE&8Jjf&Q8O`OI^qbCZ=bm zbpDqcfe8PBlbLB8$uGeK(q`o}0=0=XvyLr`>y>v(dGhTWls)plh~ zj){eKX(g`7<(@f}F`3UIOZwuo8|IQUw3|99AEnQrZTU1$jT)$)BTexjMN+F(%p7VjF~DT zoGlVY9j!#B&UrS>9+hELTN;s28szS0X<_EuFXg7e9;yYhuTcpz_n`AL;MyC7vGoJNOyb~T?T3(OGHd?u*)1q69%ZnUf?{yXqLAIp;ga(jw@Ms04b z_O^~UA0B@`KA7MAHoNwDVSQ;~>-+NOcW+;e3@i^0zo;zivQX)DR}j&lMGQydlR@L7UTaguEVH%_WmKA2>)-eloH+6z17NMcRAw&@j@)HgU$3mbW z+T6g#C6U@Ak;fsUe<0*^?WD9^BbKXW)@Y{FT<{c`@)3FwCU>Rs-+I9yDaqw?e1)1t{*Xb*A z6Tp#?z_ZwBI$@xBc@W95$k14d5>aD-T#cle=+Q9|rSZhEvEUd81jKi{yr+Pttf zxBB(t+Uw5g>A|zU)(>sg-i>yuF^4xxLfWi;G0xWKoZB3M39} zt#C$NfSLE=g_cOO%sL!y;PiLEPG5*|zQfRc7Am-hW?1U=?po2H2Ke6U$>-(3PIy{q=>!OWlz_T(}C^9|3w)wRww z71Rus*7Ro#r|s9K?{s&RJc~RUi`{NW->px$?I_O8N!+RNN$j-wV2+VY+1q>a{$<)+B4c(;s5-JRf?SA>qQ6R= zrdg!c^18lsXXwU;v}mUUY?gfk_Y z=(ZI)Bd;%9Ad+~|3q-1@5jd*4g%Q{VSd-d-mC=axaWNDiVeVSy6LSo!uO%tiy}y&2 zl6;le_(M?18&3Oej^;HP{w9`Cb7<4C+4l*4)+JXII+QP8WDO@Go;=2Xz9Fe8?z}d7 zyFc?}EcduN^`x%jwmSE^Ic7gUu`t$VtH`T7%`UFV`~?<#$^6mw7v{*<6Q7yzRcTR_ zd{BKMvT@L`-d2}^LK_cm22(FM`7pRJXLy@X?n-+?WjAz~}8xsCy();Z^)@n%dAVy ztVt;tO-vk!itUTP?2EqZj7`r^O;7aBObBU;v7UQChFj~q-9h_sfwXe?V|(i@)|cT$ zG}A0)=qef|KNiXWvu}fndZcM!u8wg8zLW=zco-I&2_kVnJu2U9%KgUl;rmb1Qyi+k z;y$%dHxM3VNL>z|w{Xv$ZZF(u07?wxxlX!5fd1M|M^$|06+i`00aO5 zKmZT`1ONd*01yBK00BS%5C8-K0YCr{00aO5KmZT`1ONd*01yBK00BS%5C8-K0YCr{ z00aO5KmZT`1ONd*01yBK00BS%5C8-K0YCr{00aO5KmZT`1ONd*01yBK00BS%5C8-K z0YCr{00aO5KmZT`1ONd*01yBK00BS%5C8-K0YCr{00aO5KmZT`1ONd*01yBK00BS% z5C8-K0YCr{00aO5KmZT`1ONd*01yBK00BS%5C8-K0YCr{00aO5KmZT`1ONd*01yBK z00BS%5C8-K0YCr{00aO5KmZT`1ONd*01yBK00BS%5C8-K0YCr{00aO5KmZT`1ONd* z01yBK00BS%5C8-K0YCr{00aO5KmZT`1ONd*01yBK00BS%5C8=Ja{^8VtznvdSh{^^ 
zI(<00?QohcA;!-mj2puBIv*?0w8oWn!G(Fzk7Ln`{#z*MIjVLSg32I#a2#S_BBFmH z5-biiED9ks8OJ9HB_J`}KM|ratYFeH1Zf|GHYd)E{IX8 zhJKc|L9UKLyOu!*NF&)oC&5xDUQatwUn{{tAxc9dVTieSjCW6tci)U<-4|LO!n_g0 zw1vpBfo+&ippikVl}ezIMXHuXs*z5tmHJr!^2yjsitxW2 zTS7FtLNz4S#xW2PzH49z2!ZG&{rT2SsY4!E!1iYDG5j1jDi zP|%mXMfzK3eOhJzuw)%QiC=dq*hT)mkAiWGWXVME#z`;NT{8{DyrILgr^2+~&G5TU z_+=g6iyn?kUGSEbQI?x!E>vV+3-JMjd}+hI?qZhjVUhRPVy2O%p`WR!pQ&Y(t_w;v z)=V_hihaz{Qa4UdJJ~=kMoleln5Dd(@lcL#+rnha2~z6Cy5+;X4r41tg61Nsq~a=O zU@4~|sOBIlrC}+iV=H9*%Q9>6tcDxz3L%?MS#yKcDNf@e8$gn{qpI(%(zVMJigrK3v_vnEsNd9Slump4M5nauM z0rQMO!>lorlrh7^QG=8Y)9eAWymrx}!&mUDE%x7=JY!y=%TAl*huKZx^w7;}CM>!X&;A1m{GE0Z)Gx^*pvZ7t@_LFR)##?xNjU;T`SAj>i< zjSOppYzWt`y2OR9)Ngb84QKrVXZ2KPodi?OL>;4SWl)ZWNxF_nlCfs8rEY?aZltMv ztbtOpp>DLAN^B=bMK|M*LGE26lTv%5LK~AU8}?0C<05Bhj+<$Uk5am~a*DfYzK>b1 zw@QkaYVyCFqpNXOh|UnE(ipD7Aevk}s%}cST1J>!YLHfZh)!aFNuIA+X&|^P7*ZUj zSAd|Filz{U5j=(#){o}hjSTCB1r0@n4kCH=p$B%Mc=rT_^;*k~Dw`w@nx_pKr1$G3 zj~S-4vt)HJR4AAg=Mc=V*3eA#2Tr6Z&5ed`Jomg?qAmDq5&7pv9&}ISy(`RtA|+vkGJG zA0X}@)~4ki#;G2nGr2eem8AcdV>?$_ zBlC{|u036&3b18~y-|_5Ns+lxg@t~$xka9>S+2ENo|RdaBe=i;l5cOC_b=yYubk|o z9fPPGg{BmbqLhH75C_A`j-78Fu$3Er{ zwf>UfDNw$zAM@YT{B+nU~|5*QWjV z%{Z8PWJK_2Xz&ED4F*ct58WiB7Ds**YeN*pstmorRPg{BR0tzTd9u6-ia5crjGC!F|F>PM~hX*Jbx zo5Os%^7|BxmH6;MniX})n@X}ZD9e_kWv-(|xrTPHi?~d)2-`*iX1EpJS}rQpaq`L0yjvCw@Sui zc_^iLf)l*JMc$CY$NHCZWK_){(20c8h(l6~dMsr1Xq12`B)?&p@8Dx0`;KDzj^g@{ z;`@&g`1BL{4!C)ZdUy}{!3G1pdXT((!umT96oBwbfI)c~Asq?87gi?NoEv`~yTR){v% z8Pb46b#WEd$^7VLIyU4iaxu>dw#0UrRXteo&rbN@ua}P3;5@#GsklTS8F7z z{Vu!J?o(0ni>V<=OK2G{;Zk29RR0lHM;KQ}dewdS{e86RUcZMY{d$l|F{(*1wtktb zex?mH$JR36$)L;&REB0!hGJBV#955YQ-s00jls4XXjB+zR1#!T7G{_UgXY4NQ$rP_ zeRcbxDj9mR3#Fk9BrZ;W1>E&VUf|rRM;SKH6XEL%u(QP zmX82QL|2Z*fW$o>vE?JNRbx@jii4mf|F42A|LOfmrkIMX-R~0AZ57gO9oqAFvcY}4>+I55NnjpbFj|C3tu?gyR3F`F^AMp?F42x)o^XY{18GcOMH?%u2 zqz^5ugH9(-S2arAJW-vY_;dm_{hB@h58HN)&32{Lc8%51bF$^Wu;uQMlUnzkk-(cp z+W2p1nIBQo-XQCuzWUx9ky&MpU{V?g$qP_Tgem8vD<=}Fq>w`LaZC#kAqAn3++ef3 
zAd8&G3bo9Mu*^h(mZDh{V}lbgEDQ1E)3J5>BD7FumXP11WW88%tC@=?r70_79DR0-(iSa;1TmW&iSiq*jbZ)fsRN?zV*WnT0n#mT6d{b#SM9XpfIyyGu~JrA?oa zb%VZblfG>O$fnWKy1_lP!#lXcAJ!8Q)QRZdfeUX|6!Jk@5s$H!#F-Tr{fTHE*K8dC{!i|N6?DqYmc#`TeB zjtA{t?XUQmB?Ulpf=shvkOEZG0(|8}64P8g zZV$J7+6?RAoaCQZ47=>SKhVu8Ld**TpydJP<$=%&Kg)7&%d-Dl6^}=M%Q8P`^go>= zG!{-X(IuqQBDC8wxYsmfzzo)B6WZk)Fy`Yu>KZZ%4eK_sZPX8I&=2nig%8-mdR&4Z zQ|ky0?FbF;g9Z152eya!kGb0n1VoIX1@w}trE)@wqaGLvXUS^Etd8c%k6IkQkGXv7 zb~3ZAEmuLVB3@uC_(y_obN9b_rgW`I+SNWfMLU*;inJJ zp$B`Oqe-TnQ4THrI;He3nb<9f_$`sxA7ZgT6w-glrLKu&uf3(WgP4B88RR0FR3KUw z!BnDxHPUD-V>!Vk{FZS%icxGT(afr;jLMO;CMi@J8T9HIG`hWsHZ(29RKM_OnsqFRcFf4hC) zsD0otBw!dEH0&5K>gv<(=ilxUI${~#Wo*;%n4<}-UmrH;7&_t}KH?kP?hPAwyi-N? zZx0O`^A8;i4D2K_%j9R+VKgZ}I+-h5Ldu(?tQ@tjtg=m?B`B(QuNt#WSV4V!)0FPt z5hLFcsdF@BTX`q`YZ`s~ogf?PG^M~jCI2b0&^4Lp9g)N>iNqbX#0`b?IyGA!J<|aj z|6g{TD;CUm%xF^#=oid57i`$m>;h9PSksik=LAx>7)+aJ><8%j1!&+3Or>ZV^Ac{e z3LcAME`>-o#YhH~R2r33O0^6c<3tAIL|UCbPNj_Z3_m{$T~9IJbBiBR2p*yH=U@mQ zVG181@f{)aAH!*7!|P_k^PeE-{0vbk@VAKkmvbajjKelda&d2Wbm_Er8g_IVwsRb_ zaUSt=>GX1Kced{`j~FqEs5Q18GO-#pv>E|h_j}lP`MPujMhrzb48WZGeVqqjdfkMo zSt;MiYeyVk5>K7oGZsx!CcT7DpCm7DaLSp*yPiN>-s4<4`5gZmyR_1(CrZ0M)$8jY zW-+n}7M#C~7{92o&S(Yxl1p5Z3*3+hToa1j5{uu`3O!IsZBVl2(J~&e@!hduUa+IR zW5$@K!@6L=ykN!t&5Couz;{I@cupX>jU&B{EOLUzmyN5QiwTJ%laJw1FBVX$;IMey zm~qS&2`rWgjL-yHlO#ryL~6YbHjBg$vio1TuBTY<1tkv&MNV)8G7MO&W>m$Bh&F`-> z6}J;tw3BY%GyFP~sjalm>LAGL#QRA7@@Son+&?{yB%# z_G7H2z4$}r>TN^L+n6g7+(WK)f%hbFNo#IY#GP=XR!jFOo3I@qB}Q&I=3 zJhwBPk2&sOXvKxA#(1kn`>V$ID3t|5q65G&eo8T63USDqslmF5zRLgfek75LLNbjH z1a;WDc00Iq+q?ETdiL0QbOgHf`M9;acyu{h4L}`-^&E#D3+g!JXxHs&-|g(s?`J(2 z=s4jNJ11|IzbDcwDh^c@xbIyv1vJ;NV5 zzF&{UB6~|C^9Nu2mQ?nRfNq0|d54g57oT~Pit`qq>7JNn2ZM1JLHH*Ye-1tK-^>}3 zPc-BFqQ|dg_g;wYv&+TMStPP5##2}(FoTmA%nMlc+StsBzDQiJvhPoE?+QQOJ87gL zX{Cf}Cx>arg)1h*H8cG#WG~Gq%f0qPnS*)qjq=w z3@5|HKv1HWekx2a)ypK=M?b;IJkH%B&ebB$O)tsKHQCjp!^fl3&tcfhcF5O#5K$%O zdIfpuo^k1(uC2|daM-${-*Nf!(>KJ)8LmGU>nw>AaG9NWt(CTIeFW=hs{*n|VcSXgNSjPfN_UedWW9>gjqL-n(r7x 
z^cYv<7+rFoknxU^^Ma1_110x48P9Jr?#sysZcgfHG?_zmp%X^N?c*I80hzyo!sn7w zzZZCSMdivlRV!Fc3z?P5m`w{Ap!uxES!|H9Mala;;@NlXhk}yVw0x;pqUksyzu-i! zkkrnQ)zgC25`#1|Fw~OZ1un4ElD!|FJpaQl61g%YaG{rFZm@Y_glajOW)2en5t-m2 zl6oqVZZf{e1v%?BE$c3(>>ZrQ?;x#|2uQS#UZIy~b3n+buT^z~Q{&v#e9U`lLeV2J znpdSA_MN%fRaMsUvs~XO=U&lYrOh%YPoPzeI=7BGrOXL@C!ZFT`YFYGOCWYeN`J*b zc!bZs#YDA$!F-1wnS!dG zOvSu;_=PB|-sam08w2UAB-*?DA-{xo$g5j)Xo9D-QdiW%>5n(AG6FX;(r2^yN1|GB zjNoEAg;*xVXfD-K9T^I}HRq9yT%1@@!Yyqo;Y+XS*#NcB&-_*(zi?; zM?y&Be8>~5Rjy_X);}f*I=URiY)NZ*x?$Ym}dbW&cQV{J<8D$B;ZFBfV!L*dxO` zM3DRqFPnxf9EUBP3@4C`EAF5fY1Nr_!C@nwkfoe?WUN=VemxPSqIh$z?qhiG*^W8Gx(IV-Qpx9qL=_^E;--tq|2tud0GFNa?S2z+E*kWh6 z;ukn#mwq~tVe0?%^HpNGSaR)1EV)=LiF8WQzoZ<;boh%*G@ltcezG&{@lk)DIG1?w zOYGexzclh&60UtD*-Kw=o{vi-%&sQXVbt5Hj#@N@hxtClr41V#6+Hph)a8{uLp@1< zBd#9*SWKd^)9iO2c^^J-Kaj~4euf`U|3sAg9-WutkcH}42=^-^)iy8H9V#$b-m&hXbNTR>s_%3m!lCZ?CFqAI+6=VDrqkMIe zf=sgi<^6~!mq4oEeQG z`hI4D=HNBSUebp6`KIjQ*}LO={<|NX<-@-Dy&(y`XeA>^2lJA%C$GlOR$+3P&SnM9 zW;yN#O-}kvUIw`c7WsIPOd7LHH02^hm2!l~yE6GWX4c!yE!q5O#GV4s!x}lY_>LIj z+l>>J{7HoRLErlR@Y?#|i=SNiuMm%}C5yTOI*VNEtF6!G@pgCUCNO3tM9;Xj3Ym0c zm$-KqS$-@ttV@aB)2KwE$|O*7o>Fk0A_!dLNJeAGM`9{OVJbypC}sHSBnRtd{>wS? 
z$j4A9l= zowx|Iwos3{2Jiinsk90?jB^f%W}m;K&U;VW_7t)p3G zfL=4AVI_g_GXnJiJo#oCi7Z;a7%B1}AF!9c{(YDBnyj(IJFhD|<~444hkwEVPVsB< z@_yvpF8{1{-|AM+-h9>CO82(sb}i2x$I3w&<8YZh-t{$3m)ktdVhQx>$xCd9`($6g zQSQm`KTs(}Q_5V^a$HbxpQ4K=_{wGYXe9Y+CgLhb<0!=hswIYK=lsh#a;aueYDS@J zL?bKyMi9NmXF6sk|13kW%Feyc!?r0*wK5I=ZVFlSJ=~-$-V#0KXEvU7c*S%dgNh)B z#&FO2K>J1?Xg)5eB-ErRK&_a8apUA%_yxt$Ea~T8N3yXCL=_(ivu5yLG3{o2WoYj8 zY3T9#K>YLX_GH;huIFPsITNz^T{0Jwv$a{Yvgm&tK+T@!bslDQ){r)sSs$WtFTU>Z z8_L%8+w8Zbg+heoJUk_zF221+nVUdf$@q--u*?1X8%NS2QPM|(q-lbpaq5x@)RYf2 z8RHn4y}mV7R>N7Ajn!_2qrtgN{_E?ExqX2>Icg(y_S0-9Tn70o-1|SMzb>(C3vwP% zD8#}^-J*z|BZ-`Q$;Jn&runL+VyPygk43>v+*3{6QcYd{%Q^D>FUL3(^(ZvCBsk#* zBF;?~#zi`^IacNk9{y`t>iKDmnGYyaBIs{sus$9gGVLES-|UdQMtg;<8Bfo?x3Mpt zI)@)Wj+pWm=l3FQ(hS)^rQNS1X;IGe@1&m-zcNLAWB7Ca;uZbh%L9RyzQC&Iu=v+p zt?fj`ljBA2rsJmPi^o3~j=m}C5Jr%wg@LLnlR?OU$de+r#Z2(|4(}($SkFJ;tj?dm{Ie-j{1QK9nj-ZhUSmzb zSf)isGN`5AHg+turN$$A9653trEDCQ`7)GKDV|L)Z<*!jgluh%dRvC?ibgpeQ#l?( z`VLFt!B-*9r7sn)FO_U0iFoFUd2Ns7WR>G=`CraaU@#g{Gm%ImgG4cbQt}>|_lF4M zvN-(?FYh6S(3up&_AK_)EXoT(^r?k&iG&w8Wy6>S{b=c5@K;e@UT$%fzeXIX^l0e` z+dX7Xc}I~x&wg?}Bg}a!Et@{Yb-RPL_L}+cBv<+eu8g1PM@?gR-Fe~JubFGwSrV6K zi#|?fzIvP2FI)4BCa0ZZ?(`F(NS41r6S{UkBj0e^+IjBWY0IaWrp>mY`I@>9H3KiR zJ9_?%^d819-5p|P-TwXa>v&@s>umn-(p%J3R;IJ{cgtLPJ)xr|4!v2% zHB}BR&pnE}y$6acj?YDTBp&F+?9XTo2OxPC3HAxVt?7T8{*$^1yTzQKKTBzyObYki&l*CJW;G)YTyMAQoeF`Dg? 
zmq^u}es?EA2lrDm9KR+wF8G9Eg=CThL{o(%6DF7w;=TyR&9axh<7yZrOIw|c`|=^X zhrMh<^m0zBr3Ev$pFD9^{P6tb0<7dvn-d31B|YRg}}Ki)ij7f->siy_MK<Uk?>Nuq9`BSm?mr6Lq@74U>%ngB#VvV5RMU*mUmcwEiEaPK>z_Z~ zJS>S84U!eT;;QZC8`YeNr%t>>YRhFUD4?rrWo#VdsU6^H?&laB;27%W?(F|m-@H^- zu~A&W5MRV}lJi!we3Gf58?Co8v~`r>;@~5PPzJt4(!q&rV~bzzKv+RPtgqO#!&Sc) zV)@+2vDeDF-O_r0XP&M+`^CsvI~hf_s(%&8@dccWM|Y}A%f3_Rwny7%!oE55H}ebVA zf9-tvjJr`Z&oZI{w*~i}pZ`3LmNA}_GERW{>KjMoYwSvM?@XPDLn5&sL{bOjFETYK zssg5+QmzU@t6FfIV%VUWU6D*^pJV~QACcxHHQdv4E{oho0zk50ThJHPp}Uyirm!}xWK zXQ7{Su}ymEU*3<{V;68ES2!ONBsXgo?_MvVh_Ap&&OW?l+TLQ>-Fk~S{t8Lv1)B6d z74HvTsyr@+NCmuDCG3^!FSs+wpLXvba8~F(G`>8{Iwe0S6j<6sE-K}?t&=-0V?Vq) z-6&%hX_F%Ui!ZoxvbKdGk*$Ex4+`94VaXFGy5?j~6C%qKp=?)2ZPg}hR%NJ_6R#5a zTFa3>$XnhcSX#x})JE3Vz|h*j+%w49(@s2=K|0b&*`7(8+bX!-COKQF@v2o~ri6XD zjPq@g0As5%+78wm^8NzK-VDmYqy^*`g$f9DCT`C)10&f;;<@l2k7cYBLqcJ#s`!OV z-BbCOS9!lLOK(qRno^L#@0OY{TUKSqW?yWsWMLn7FddANZw-@wsik{g%loE^d8$Ep z;a|=XbMzEt@R)4m8vS!7>t-F>MbE-{56?+G^KR)ah4CJl$xDQn+jK11d}RNJxx4I& zqges~j63Y&zKgrNLjrjt5C{ZMaCevB?(Xgo2<{qufyLc*ad)?y`!6`BzRc(6)J%0( z_4E#i_ZkDb4i=f*JEp8J#Ox^PJqiuwSjh%GhO=Dfzu#ySxUb=gY~qi|5%X&oJIDYp z`?K(+^ON}t0E75|ar6u+90c)fBrlYlqgb8S&7|GY!0u?$p(x7nNYHpV`DhkkFcoPkiY}~BLB8dadbmVS7Ux_h-D15Y zZ>5$~06%cxX%zZ246cpAFHG&a9ZhB<3YUVf`{;29;5tT^J-e;$Ue5jhJlvDQO@7dD z$k~1){eNceeg_5*9$byweg{6%CW=Id`V*dKzqO5*7zyv{u8o&vhz9O}N`qQh_5 z5#je9QSKH+{vKiNE8JL!z(Nw&aspaU2t4$q=e~LBs*HU;d5cVWk6C-|aE*I3Sa>Xy zvbd2RT6*=In1~yjMVyt5pA$foS5FrkNdBjWwy=pat(_w{g5mn^!T%9q{Ssy3g-9Zc zo$?a3yZ?S{9n6^^K<+1lANdJ4j!AGGZ9ai=yg^*LO|+$*ub@;Yu8nQ8oqi~Xaj2cF z-yf&TpKv&c5|#zriosky!rqJnT!gdbh4Xw36;|B`6~5ra9Fi1Nef-(}4Y@#vF#4mz zUnMxYnOI=PDoXwNGunSUgo-!MZJle`FJ}!wjK*G_P z-;=?BgQ4WJ`^4kKmFJe5s}kgeM)<*M@bDjkt|GdT0@{&021tPrw-9-<6nF>;UhfUv?hRk)j@<4Hf!#&lT?NBV z;?|P7#>;RfV-ULih{vK)d&}_#a~V7Tungr>y=h0HvIEleHH!EZQ2YWYc9o)R1(ZAk zN*pIi7{*WP7s~13tnVK>h27q7-CVv-Ex_i+hYyfrvx#Ce0Qp&%({WfMiEs^@uPrgi zMH?4YTc|ln=wSo%Me{RR8}PFkY_u%%XHUc;X)JVcY^*WN%>H8F05QEx=Ffi2jmebJ ziNvQqlw0*=%T*M!&1B2;B2fQt%FZUzz8bpmlZVdhjh<-w^SkrjP1Nal 
zT;E7;nOHUjB*^B!U|I5;+6};l9Efxt^Z(rxY2A-!)NEHn4-vAZfxhrp?Qr(rh*cZ} z6RQ1DrTC*r6##1;Nw`^v*jWnMT?yZw3SU@`oSqBYgGB8-C+`mUo#f=qL~S3XAkX*% zCxZYhNr2v1OlT?QNCn}*Th2#=tU2;V zAn8*ycrT57FC9NPk?iRNaFBw%8xOydi8G%KnA%5y?jsEbgSSKIZU$X=iPr>!Elz$G;Lk<83sYFXr;L$4F{xy^3o_wN_VYi9xDx7>fgls2-_PIu^p-EN?a-PUP zyhk^Fw{(#f&!Lb2Zr*xG*!s7GRd6daBr3H8XHkDU!fp(v{JgVQe7|1YD(u<-wt+TB zFZpduM$NB<%`QhB(MC?wMoun9Pftf3j>hagCLZ;LFIKLtCg2`~QFeu+&4(h-hmk@3 z$VP*~qi@-fCTx~2c#1QAnkQ}?H+T^}av6}?hm$(UUNF3Qebhp<(S$qg51xxAS&FAT zI=i1e-yS}F8AwH1ibsdV;ml?rPe7KzsbIA+>0!KaxH)1i{b|Ku8~A#L(p0R5ayn3+_%ka3zR*)Q4;&4rMeEZZQjMY5R6|;{v*c zIhlpDmW(=6%W+sKd0x(QT_|`{C-S(3b@TG#<4;$c4g4L<)X*W;;LDtq!kiPy^Tn6< ztB>SIAJK+na8wM%T{O`}3erRb8MKaUxPyGG8g(QBt1sZXH{k*jjsXdU?<*u8&Se?P zXYVQ|>~6&EECh`-lXbP>bvLojRkJLI5TDdAB-U{^`Ex7!iyGysnE5kzq*BC%2sE^+ zS~M%VHEX(L(#r*isfCOEC{aN4Mg7@3oLMZ2T}+EQlpo9yG9wVV#2>Z56u!(HInI=F zA`&~#kvvBdH-GT2)=61eY+Z>s7>^)bsydijeV&+rPUJs~zhy_*z{b(aYiGsKR4ate zXPG;se|xU49JK9^zVDB^7);g~MA92hxsZ-AmWd5bLR#6noLxVMo;`QLZaN!jwu^)p zbEv0^gqJFX7TP%HgBebf@GgDHB4P=jf@$uWnXmnDPXZ`zQi#K|2x5{^0|XFBpKVxy48X|)eKw;CO*!jeAz&KNk_d{2b?C6yo3`x#Za7NVa)r1 zMmtDG8_D}?L4860CO{qrvbP7?;Fc$IS0E0~k!xb!$ zJi5+G>hXH|yGp_OQhpsDs`q~-S`&bA|8l>PtF}O0CrGG0l)@(#k|GuxM& zqx6}o&;lVqFp45$+#PM`G(*f1L*fE+%nDP?0&m=+SaPRCQWq%Y@HFLQG~yzsVmK6b z9*PTTC504R&6FL_)snyIen;l>h0nur;MHwP&x>B`s;$n1oaMNu#*Qi9bQ;2Sk>tz2 zvo&3(TT}O&BD4MctG&$K-M3^{E%ZLYJg^3qhjq?Tx)8h5rGccW4~ zrJOIVjT6>NFc}HAl7%*xO%|QN5uZ&OxQ=|giFv+x{<3+plSnYrj=a!-*4=Z;`+eQ=_OmY)V92!o38wuWyMcIud%L-=i@JA?4dWs+Gc)XlVU#oaw zOPN|rKUs^Np^sT&iSHy$=)q0wgiq*#kMGAw=_bqQnkd;0xylYaxQZm{3BermCF~EP zSZo2mX~(qNz3j8|pvSYE;XBN%IVPzo4#xvot8q<-DJg?_Zic=4ekj(!GJ9#addG-S z?Fc~;^zn4PfhoF0^r)6^ss;JIH2<&2hZjQE~o96f6(baMr{A?>icRP5f1$m?%6%r2|3MCn8;DQv2 zt(0+fRiO+uBJ@NZ_g|llPA`l+UXMnf_P3FYmSgu62+cKPZ%+(DIdY*K`JD^(J=>2@ zeh6VP7?1lm`&f8qODQF|MZI{m@J{_w-?NF(*w1JziA30U3! 
z;Gxp}v7Ui3jP|ji`{i)Lr2wLT;sd{hSK2VvvW^xzp{wj|P{HOAw%p#C<%8M&J=UHT z>XGT5{qqpX>PBt1JSVa|b?1RUXUwT{f^j_@sZ-2;CzPp|go%B4$^E!V{dk!pfUFUO zv`&PKf%UAr&WK}3_}O(Bq^}TK(lgSy(cc1wv~8~XU)o?XyN?Iy3w`nP1N%d*+5bj*p)_ zGE#>z)PgwB0UGlMjzk;|g#n>KA^j^a62svvG zEv4&uZnt!Ovv{+!oM@;8slNjm(vH;U3kMCs=?()xf>1`Hkro0#Gk?hz^0#Ln2FJ%= z*ZvKp2O#vs;q+!nE+t4F)e2w5l7tG^4a3*=AyluzH!l#^jG9CagPKN=D4x`?}xgQ zxH+-z?B-{QW@@a^S?S7ek~k+B~7p|2B$LuuG^1%t%H1^mawl8zdH7g6iN=z3WQvq!lm%GupthDO+7 z!|p*O3?~VS8#h9d0o~1nZl;fcQbsT{#tH?3S3Z`m>CU_I{OqHX2QEMk<{a~0iolE8E1N8V%D&`tK)OX}Hk^59*5 z>AJgVCz01*B#&`Ak8L}RvoDNm<^oG;2(!MZqZvX=*$B%q!u>6TOZofL<;T;_cr#Jh zoo&FrveVu=vi&mRg7%a<;EYPyGD%H zdd#~r{Idp(hd{)~8ocfx^qv6xSwHmEXpE&`f~DM>nI`aJ5*joXbN(;V^wrQf+2lA< z+qhWBvT#TbUuZ8^V=q_xAXQDTZuk$kOWYc7y*A&7W62sn|9wHO1pmMF3o&-d_p13_O~5onuX^PA@j=w$a< zV+&eg`rAWS4bf@sqHXGa5HiX)N%v%Jr)RlWl4pKW;}ciJA4o4B#-N~Fo44)H>U*k(%;f@*> ziXGvL9)W8=!}L2RkGdcTKH6(J*~mZISs8h}df4b4J?`#5Ki(XLWiAuP-kv_xBOa6i zE^2Tds_{Al(Z`YzMo{9_qk;x{yVNus*GnDZ0#Mfh-7bRu@k8C`s-p@$bI9!rqdt!K57=e5}6DVo2=v5pYncO#P&R+liQ`2dSZ}%W>$R_l)V@Hd?xDl z%INgOZ}I%)!=V}dY!+WvhDcw|>T2;)UsvxLM)N#F=`v#lRHUktv1mxLas;Dolr*p( zzH$s)-Amro2M!5_05(-puXVY#w;38paBlrU+V) zZMw0{-+mW?u<)FZa_`0o+qZH3uTuV_WM1R&a!@qI zLEDghtFU9R*9N)XEv5c5iu@*m(+h*#6^-0E&HGD+Pqz%RcU*3_Sk`xfCTEgfH;is~ z)K)KA?~YvQU%N)9J1?LMLooc1QKsr~mhxHZ+G)|+IqbhPm_G9Zj~hfj!(<)fM`4|v z{hMRmL*uJYQ`3<73CR4~(;Vl)DqsA+1QR?O65bjgrpv4lik{=%SYdz75x#0hfFW$H ziX|)`LT)H8MthfGV}rdVWalkdM}0?+FAtZr4Kws{53GU1DjhS*4TIlmrkUD%cw%nY z%f}ds2Lubp@bZQb^7<+BA^YXGu+pKe=8^5T$%n=n?8r&{;PFp&k7|E!%$oL9s;}fy zMn3rW%4GCN?iSoIWxQ#}jeE$;bMN!x<@!}i@4cEEl*nrb#dKX%eGbiWm(Y8Q)Z~iN z^jY8XK|lEx)$WN+`jP&>7ux?`cx|t_ZSM%pu8gv8K*_J#_dxm-FQ1i5b|RrR$kx=)lPA z%iY>l?^fr^a}CdWpJW8V>y1v-XOgC?k_6I$a#4V~i?BN)lG@7=Jj32TrP?~C(gXqa2h9n8#fm!>2z&+peIl*z z1XTB57xX^ojlzBpZnw`oR8JuKEiuIQXy)xJ7VTNJZCf?nDMdZK&lpxscv4Kd)W{m5 zJt%t1j`y(3r{{-@&6~xeCylxXEw_1~=P`%p9-8AmiT*l;>x!ZCfvMp&yxlsY@hzYH z4yVim%hwxP*=NpA&)jnB#;G?ST5dffZY_Lo@#9~(46lVOUL|ag2rSnuqn7uMURPtE 
zgabN-ntI72CI+WAMphn&r~B&JI|pV5dV61|&Yx$lZl=Zu2dB=CdUsLxH?WU4*H*7v zk@8+?lsVi3`{M~KvxIH z=6|yf)MLVWnbJhl2Zb^_sS;j!wzlY#ud#YUPZ=u3qWkcZ`v{_X@oS%fzn{Q=p0C=6 zZ(61R4fCjx)3P1=+U>9(d50RMCpPV;`b9fR3AeI-&>u+;Mr9AyB|FOggKydK>1p@j z<>m6}HgM_*q~AyEbwua5ZxplhF=SK0??l0GK`Q7J?0O9Eye{GJDCv2@D*wzb{lX@F z&n9;#;J8nbdG8W&t5kVw81QV9_QdNB!?r$`wA*B{9Y=9)uWwr3etik*-8nvboQj@a z?C!5+?r!I}gif7~Oy59f?^gQH7WsM#`V z+x!`Hdb3dZu)2uy2$2og5#7K4FYQq)d=q(Vw_0chI(NPOx{0v`+dEjDpqh=v++1cZ zq(8c*yvjXXY+K^|-dgvB<-^$V4 z!d>?u?n|HO9z-MXStsgLGq>q&(hN%_yQIa_bpaq{STc=u&x;>ywQubRv757$K{+cC4m znbN_%g2ju*i39bBBV)@ac=tP0gBwAoV-C3|Zs&6WxknC}bwq<3fYli&N2~Q|@n1E?$hy zU3GKZFVA&7AMV|cJlrq!g9Ro?1(&I>p7;VDG}E3`>kiouU+{V#!U(noN5u;2MW&mvmfO%4%R%KMg2keVdk+&nCoHk3UF|Qu0n|7Bm)$c9 zf@`~jy+@scmy462=RV%ce(~@Th2*o|=*7&kjhwQho}?q(fkn#kZlcv8@XRXS%y47I zReADecE);A&PIIBb8_C}TXx)sOeB=|j#R~66aUp{zD_SaPjBAKI6qmKxgOkqE$w@f ziJn1sK9aP*Lw3D~*SipMxZ-!Z=Kb<0@b!i16O7+s15fj$yTpZah`A zc9n2{`}}Y{a&XyydOo&)8a#X9tT#`ndW!t}O0o5I;^1{@{~j>E!Q6i&+Is<4Fg5dd zd%8Wmdw#p>x4RT_*&MOi5q;U_JM!260&nXU_3&IEU{^4B_pEdAZhqx!>V<0ht77W; zC;vmt^LyCD*fPsVpTzhI{XFRTStO$WsXt(j$9kVTQY!PF_3Uwzcm95S6R>-DKJ|Rm zwX*qgGk67U4Zg03d(2CFD2uyFDmtC;-y$AZx<2384?Lw@gYgP9LPKB5cVIfs1PVhQ7Zi>B+L)~Dogh1lPDXffP`30ZD%9&Rn2KwOdlq0{3N1# zjpY3R@9}`@am^|VUsqjSeX_MD^6E*c#Kz|oocb(gChuxu1BX50g|LX4GZ1~`5 zVDJ3#;#Q;M7}I=-MB@?T$CF;&eZlzSQ05(4;R{3U5n|;A^!RFaV|y*2t1q~-V(+Ox zVzKOEv~}y?_rqk_)z&ub{(AeYIc9e%Y-gJ*2mb&n7Tsx7d8wQQ(Mq_K&fMX=z9il4 zBc2)*xV=|RKH$BWxf={U;7uChFvpm#mq^<_9-KPtzd?cRVI1!eb~;i!Tu#cmMRZR1uQ;^a+ZS1hBpEIgfG-S?jyULJ=`-$flhF5fQyzU&HEy7E1E zi9Uc0#Sa3jhVL`Z$s4Cc!M=m9m!lW6Ia&8|z7VDK0sfn{hp`Cqqj{pWim$=1`hPB2 zcFRfjsvrBBF2@44cw$IT#sSCI6I>Od9BuqeEi@9j1dyD&<&Mp@g53SR`o+iKgZcYI=a{J9(==yrt=4t))(p%0)JOLd{5tclktWT=C ztb*^Tvv=uGYOO*1v;(Z z$n;=)FQC~?@Hp;r*>4a#9NOwYe7Y{K#}1!ocejq$Und@4XHNf|9wKQ^fO7jOa?XEM zJtLM}!PiWqHVmUy_4BQ7XOBF^-kyX%?w_6SANB0|Zq8@CjJ3_4v`(L|KHXgoUP8iF 
zX**i~9lCl->iT;M%M0nQ^_^`@7;Ooi9lx%@Ua6G#Ag-?`kogK59fmcyfviQBZb`^=Tsx9mv4{hdNVhf?+1y#{@Akg%<*9n-_5I==T>T7tjc=J9w<2w&}r_LGW}*S@>OC`ek+W`B>O3&U(V}(K2bjAF^b` z=Ed~Q-(bqAd zoRXAAXz=NM^fRn(|1rC8x@>v(EjtpiegZ3fF#7P*OjFI~ryB6P3{XuA_(K}=n>3uJ z29Qvl7M;ez)xNb{!{Rg1!)}sa@qR`3Wl`T=+#D=(0hU$((?}Y!G^zx=FH$Y(O553e z=-%4uI=Q{wI(@nyx;xLi-}rmB-?9I|?&;kFU68rm`PH=Q*fuSFI8bwZ61H<6ba@ng zx|{HL@aN>9EU5%Y^cCLh`@7a)2l>RAr!CW<%DU;9g6@~%hx3ZZv&ON9*1o&?<(tsy ztL&-Aw8fpg&Arm(%c|(diLLyqo6Ewm8(6@m?_OU{Ub9u@-_)YU$of9T&@z_r0+D1N z)~Yo8jOLd0$NJlmsjTM3=+5w!yY{K4gZrh%&|(aOpDxIn7&c!K98JuOzG~Tjw@`Gj zu(t~FtNd#1@2&l&&p#rFauIU82g|CtoBzU7{XnFmj-;UoulWs|LET;7-kH)+7moO& z0dk^>nw!0&BMPZ99v;&-?fSgF^goNoGlv^1$4RT#an;a@(B6wn*gj+{c4auSY$PJ* zPhwNcpUYdHrLnp>$XG)@aai_QS=)GX?RfWE!^K_3(TV@zX6F5I#KGOaHekTbaoxeo ze-SORR(SNHym*|=l=_aXMJu%xkg}%GpRKk3=J;YScN6b-+gEQJ`%mgu&Z1YgVwZO_ zOLnqL4(fNWR}a_2$|iSfX2TLrqH^0pimMWu>dj(GYI6Er|CDMLc93NM7TNmS8&mve zyDxs{D*vgoczdvQ>84@jJh3qc_s4f60!}OdoDd@c9Hoe}x3RIV+YbwED>qqIGfhKx zIZ`aGx15gvDo#XkK>{8Hu%ZmXj}HJ%b#zhq$0L* z@oWt;zuaCpQ&st#9CR2LeG(tvwsakEwOu@a=l=|Am>JA?+G#xPYrRfkn@hp4dhZPW zj*0Uhih=y+pXFgiqlx}kuM=7DyWN4Xq1^7>^3J>8eYYipo2^&-%_qCDQ;YTEYn!*% z{t;6?;Y+ud7k`QtJiPtViWHWKs<)NBw~aZbw-b{24|gJc7b2iJ5m3hjEop>ZiUUAs>tuyb znWbRcTJ-t;d?=!#=e&EpGPvyFAuu}mp>L*YDZl1$`Y?asp>yh`YT>bb=yk87YM$k-8a)mK7p_j2Ui9L06BkxDB$toxSl-Bag=KKvhRq z%7w;_kgVwAi9Mg?o#>OVVF%1{XQZvw7-@fpWjqYQND4w)d@^6IqvCR=`f8U4D+VVA z3M(k1BljR{vn{I!aS_`&ZCq_j$i5Ymb=i==k(=r9ETUq{Vq6{;Qtx%*8dFCK3ak2i zpyE6tU`r!Q_s=Tt%yr9s{esFfA3HO*hvzSPA1)qWyRKp~?oPH}FA}yP-4|n%caTBo z;@Vlk!c}-_jB;igb?M~K=Blvvyr!DeyUXRc@HSiLPw3oS0DA&c8!!{Sga8gU1{Nj} zrMNc&ih{f<4I2Zp7=kFLFcnBlN0H0akpQ1m=&d`7@MB7R1u(FXveN!TL?hC7u(EWs zF$3Z;khAduIXKM7EnLyvJ{zJDVp}3wD%-hQssNqUEWI7k$;?n0bW}Z0`u~Vm2h{pO zv`RN=`tu7pNlST&TSYBYDNfz7pXG+%!J7D&lfcVH}v zErLI#WfYxkiEzP!r0AF;sA%*IBIs!FX3lu*j1rQ9=&S_1lt>sT$T;@)+Fq_UqFjP} zVyrwo06habCoeF@FN(M9$ju@IPs0NzCIFyerR1Q+B++sqH8;gK!gTW@Ci#KF#EH*O zLy814qeMkAaaC&I;A+qaB6@(4yg|DgAMYdBK#gLgQer>lSQy^3ioIu{`@&9xrlvx!@PkxU9YO88jTrAcPGMy!&KgOK 
zIwAZD4pMtEJbPDN1yx5wOE8`-GC46kI+7TTC=DwMFFG0@8aY2M5EBiD8=HU|1tdgC zz(GX7k4q^4a3+99KmZc+P~bB007NNC*lfMYfk+5_lm_=7mHdAEcf_tDujQigMb5wt zgn@}-<3NZ`hCoEgPmD%EfrLxV$wkf1OG7Gvj{;8t6ec4=15yf#@}grPNf_I3kid(8 zkWdKiaNn{c9~&(;0UI1GAC4p+5`&`cS9wedZBIQjECK@%J~a{(J0UkaIXtluIVldF zse!wj>wofyrk+A{0J-n>PM8K#v}Cd-9)CWg`u}xG|4Ca7uh2j$Sk6XLBuE%92`-VO zD3t(f(sO?2XLsZk{>(1nz$_p}w^VCO`lg1>Mz)l$nrEoFAP4 z1C0d8O@WO1PsPv}^if*#-RF-UHdLh=^67H!Y4&ch|88FSq%HGP8U*GAQ}Eb1;E@t@ zlH>o&O)N50pb8Z;EiX0)9v?0#A|?tBmNSVsJv#vjpD+zRGB$>ZA+01PvVjNVTXw{d zq=)0AMP#SJ6c+=rvl7xVQSh*05Hb@JvBI%(AhXiIbFdJT;1M7qQ(#~aP%@xdxXEb$ zYGVU>m?M%R@d41urQC4jtp)wwAqJ{({I;i2l3)svz>Ma^4i_eC=R*I^iYmt^_K8)@ zos-s_k6NCU=hMGO4*sv)5??t)KJfFZavzkCfkuED77BVrWC@Lx?36P5%5gQd(hzJc% zRhNaAL7WLBf$jptw_<|FL=@u2AmShvqa#Ko6(>UwB6egm^A_DlF^mqVdRB}Q}d=6v*E<-Tl=Z~iERXM;2B52qQ z0s;&q05K+Ukfxbn+&9vQ|5yVkks|5f{n?SDn9%ar;2kO8JQ;9RCB>x4kUz0=$o<>< zW9GL5Vac)Z8Q|fn@}a6T!DnDncd($9(2<`*srQczr^D^S|7pa?Q%6MxU(vZWWB9d?e(NH-kxOmXf`LHpV zwMFsFaRG3W00c=<0d`zYd>(2FGJuf2Bh5E`zPId%OV7wBB*H~S&qKw^L(7CofGV8J&L}C)%Ere@gvf@1 zL0CBs1W8}iGchRxh0!zF@%`xu>S#rb$cd$BQ9dvT*)za@VZ`{#h%d*)V+Kb0%qOVI zj8e{yR>A~VM!@v+(KSYrqyP(~iDOQQ!b^mRj!G;9KolsMyY~WLO9Y+=2jiJ39j- zD=rK)au7EhoCF&OFA|n0F%G(ilNrC@zb>2^E=a@(9j($)b&gzUhs@|mT=8hSAu+89 zcu1G4Vi(%1pPJhE<+Ht$D~i7JzYnDd8>u)Ai3A(97(1>ojfw;nkOxJ80+kOBRS1^} zhuq8dv#Y-FFFTgE>?p#8DbB(HAO~?Ug6WuP*jTBlsYuAk@fnyoSQ#jB!T1CuKrt!~ zBpgB}MhQ4H3QSbcKVcAv1px^Ymr?>9EGf(bz$S%*$HXKfg;O+RZ80;b79vU(A^uA! 
zTt|<2TOA*I0gHI8Bj?Y0s zf@kVLNlz(gYow}gPT^^1f>dincj0%U2 zM23xw0VXHlF)_3uB;@t-0wN>Ap&)y@>+6!@ag*W#xJ9rf#BqQa0`LSVhNkSU&T1YG z%34aFwdv^4(SY;}K#(};@Fi(Pl@?_9)5uZv@=D{^5^H^+2qhA4$(N5>T6zF9ep>?r zZ4+@s4q(qolbwv5U#Y=Q71L)}RdHh0a<)%HlT%%t)oM)ml7B>6s{LDbL=_VN@k_o} zQln;}`tlz?p8ypz{lDskmW7#)oRq-QL`&|S5FH1T0FNjlGL`_JBq;_C13Rs}f-n&w z3I+xR4F>}QFE$U2xG)kbHV(SJ1tEni;5!JV42`%+0=I@xT$Kjig9M~0z%3}sAuA|U z#x7ngh?60PSPW)_F=#xpew^lXSQh#?K%u_R@5V#y4AhjwiyQe@!<_Wmo~?xzIKZuyi4QiqxM=9&gcuTLA7qTwK4oj# zw);c}R(h5+hnEJK&!=W|G<#M@1Qgae6<4{`{Z;!HZ`qNMiB`y3 zqvywbb;B?3zPOlwQ`T`;_m&}31=sQe%V|ImZW3uO31v|+x)1C;AE~=#SgsXqhD>xT zl~7B(ctY*aV?W`B>xoxdqAXhz;8>{4Nvr>-Wou$-YvQULX0GpKW@KVwXQJk$@y#<= z-P6mSURvGzvmLFnwXW=kFDb^3R+gq52!Kp2m%0xM_Qn)Hr6s@7daXnuZ|ulQft7q4GZs@gg%GC%pfKD;-V{&d|pJuo%0b-IFn zkGlUtyEN4epd<12!mTXH3-^1qwsy9(vr*J_ZOgR^Z%nLeGSBI7o{SG%OA4rJ^lGa% z?YW3~zDqpWPrl#CIJpa}hSzNB+(yIA&B)nC$KFiC-OIqnSlPr#ThChE(EC4csUNcc$+~-JOUrS| zI4D_s{^0%jy{5e~imc6*yzBE1_<`?kJKC0K;4ae&y+4l`0_@ ztD)?vIFmU6kWEpHEztjSqLNckax+r$_R`Yx&@p%Zq3bB^Dx+-Xs;({dUFNG^@du9> zE&C3|cg@P4wNkc!)MauNKKixlw$z4JmBv&Sq{S5Kzhkk(g%^Rxg%_m9!>4jy*>CE- zf+AvLHCKHJ42g`7i*RzXv%+@J!?&!FmihVZTXAsmL}F%9o7-$`)IxmZY_LzlQ9?qG zx9_20(7A5tOZ3Hl&|7xI=7Ps#8QzTeg0x(siUp+PF+e)QC&_;g-1zU$orR$ zqk^itoQj3^e;l8v7wPRI+0FL&Fs47^_RE6jWhjT0E&D&x%}EoEC>kC}qdaNDU$|fi zx!AH$V0={2lTsE@{2;C@&95dSX(q#M|DNTmvV;m4!POf1GrfK8S5Kdm@)F;~)C%p( z`25R+{Jnr0|03wlt}%JTY}Y9>GBb=4KWX}tULgHutHPU@?onGL(A6OEo4 zJrO73D7kY2pWYu@;${K*RxY{;R=Uk+9HF?a_#lX#@$Fh3v2?h*^288%pyi$N_Zh>Rz5$2?>6%jYlsp zQ+r1Z94WKPFE9s32|#9cx36SD&Yhm6iTP2X}p zj@Ky7HS1JVN>|0&H6+=GhZ#qOnMH>hRRsUah_;INvySyK_4QUy2y;pf^~jFYO$u|U zNOOTdl^Eb0?X4fH>zra{;-BbQRc7Z?Vd$S?GOPCAZZi|2 z6e*5}_nv|CwTTzHxe1|`9;KNw$i|94M479D4w4``yCdTNFn%A`I9{F%`5n1fnKoM= zcb(917F)COC#|kJV$Z)Ic&|MyS24Eqd*Df~4yl;|I>5;ZT-2de-l`^}{lUhG*+iGa z)xp{e%h3hN1w@6xDGC5Gx}mUPFno5T!#YDI5?b!7)$_KFitxR#fFZ|?tlepQbgs(8zejN1=v8;H=U zn<9>Iu1tf{9Lq?yo`fw_1u-;XIW@fXv)TZ``Vi9s%@t!A#b^&X!}~Pvt3)W 
z8?3%nqq7#G)#>}bq)C0SS-!tP={#8Dwe;s>#*ZUE?WN+ViHVq_#mcd?=8m|hOJO|8l;&1&zJ+;trc%{=wo>j+FM(n1{^8a>_TPe>e)`%g`IxJ?>3$8eR(3b|8fGOMZmtv)Y#1M`?`@;>^Ao?y zJEouVd|xDK7M6v8LU}w?yqUxh<^FAv@jhr<4$o0{z2R zDpz~ycr*9l5I@@wF6Or67EbyG?gqNvrXJpgF4zv!f*WWxJ}U+bfp`82g#3gm`G=CpgCknfN+syXk&1{3dL!E$(J2 zA8YWjL`ANp$TcYEvw!aAiVoxaNQ=5UkHR49f?(6RqNv_t|K;Mql?tDk$*l9W%=Lb%8>m_tsXN*k5|X>R!x7p4GPE)H{!UTM9Rr_)92=e(Q~`NE+o=U2 zLlYFxIJqqn=45quF=V6yjr2<3v0`gz$mtn;Lq)OrWvB1$u5Yfd?&|dej3Ie>Im=Ft ztB$E#%lqHUSye`Cdf43306pPbc5IV3>`*Z7|KQmr=iI33+Gb-@k^Vlt@oRR&=b%id z=n%#1U#jU|swI&|jalZIfj=T0mAwq#1zCN~4>2u>)GLj#Z^{Yi`x7wRk}=(r(lhTr zk{7xZA2(T?I-D0cxE|e`n=!H;@OV%^HXgI^C#<;5v-@|}(rjvJwOwVqo=;|3R>gO} zQp4ahk6+RUF-{oagK6s(PPk})L2e=nq|6yRlqoRO_h*5-i zs5x0lJiS#(z(~A2eE2|27o(rvzYM)S&5T{O@QFQe*$Hf2zgwBG66*?7c3j#KYN(S~ z)p2>PL*;yaPyW<=)At4V`I!3n*o1`Ym6|Iwo4=2(wccsXxT-NsP4bTRcW&}gu8G!c zjn%EJ^dA26D<{_MPq;O-GPJ8IJ~zoS+g-aV=T~F-uikRszXfjXr5@12pn-PZf`)|M z`1HNhgo(nmuGO%Ky71pEF75uV4Us0*ts%ev+B5~4WtvHU)1!2^0Z?(FVN>AXBEtPt z=hgQ%OV%}PYbjq`==b#cY-T~Ci%YIzps8l@^M|pPj+&;np|Y*J4I&ym7M!Pv9{qPq z*6)^_T4wa_IELR$zNzRdN$Ch1VGH&=QuWVTopo~%nUdPsQlMd(+gn>(Sn1Sm>fxL0VXlaX3rSU&pZYD7s$fr*(~+fe|F)}p`k(&)kgxX!rS*8Gb+b;IDW413 z-;-%sS(sFo?<$Av@A}r{p^ILZGadWNVtVqT zYC7Uu+mdRk<3ILZ`E4wxyE(aPFtV&aX!%n0t=@<$ISE}E?zsWy%DsZSuE*w=rxu2U zI%ud$?bJT3u6hh49d_nS%xRM#clWHDm#$p8Xk)oMDA-dDli=X4?jOJk4z~A;^AC;k zvkS4b@v<|wx3CU!@QnzJ#>ArV@%~<}r!Dq523Vu0aq+3XZjQUHX#19y4*!9rUgfmJ zrJ(52sOTsf66q@gj)6!;VeyGk$$_xQNDUHII@K#a$W;Ju6`a4UAXL@-s?ScBSml_rK}`&zN%=k zA#bBT>sCgR_|t{2b-_0>6WR)X$i9?(Ww0PWBmR1?5Afv4wXVQhqy9HXv(kHm%5Nks zi~?5!m(mL3__Y_)(>?ob&)#eauShFrrT&;3YMmcsY+}3XFjNMwh@{cbhYpa7973|n z1HxAk)V(R{yyr3sRj3Wh@Vs>|R1P;Ozo{W{ygFKVx0pb-r!OQt1i z<79}6;E0&ugw)`qK+B`s7?MZN@v(^zTN{|46DHXU91#!}=7q(>y**t*NWQy|;uGOs z9>ETHT6lTMsso)B>EVA`-|)Zeu{fvg>Fw0*XNkWW`8>0a{#Ntjm&UW~3r?SVPu{CZ zC@*Z9y5y37tGXh)mEUi7wX(Cc{MNOptHpW6*RzYij{0A`Q~67A+`E#bsqCo9^MTL8 zP2M-gETy;ScjYw(2AztsRny%jhav8QGEgX}_71hI5Z|mI53h)@uA#KFj4~^0J3ag! 
zZ7Q#3nwi*8*MiW zb9-}p=Kwbs8*?{veY7+R$29x3^+rPCzT%38q+~g>fW3YJ#-LaaqEy%pYQjDp9Eutn z>EWFeV$Fm{YJebA_fVp{D=yxh80{xdLPSQeVrBh&yj@VVSoH%6i74;XNGBhBu$rz{ zcEw6;s5f3Z?7!^sMW@}HPNRkPq$fi^JiBB2w%h7?zv+ts%| zEx&fVAg`(TdKEjYrjdw4ME{vMv*-oA0ZKH-58FnF}ATyScDji+mXJIm3+&cVXQ#oFA# zO81n@0qY=U^`~FVVh#w-S9qkz1&1hE`RxjE*F}VTDCloB%)OOlO((!yZDtf=j>phOfiAQfdF31-E@Jc+afg@UsH3fot-Y<2mEI|5)x-$159N9CyP_7B zYdjJ`0lwN6emjFhZGGMCWW8Nvd|dRv@yAIrj1-ovpM|f588q615FSABa8iMXsK}wf zNFOPX0u&)1DaY`Qb%;cIdxkm3P(q@lSn;rc1O!lF%MVZcb2IX6*@nS~Pg{92+ ztD`r54syzBDgH8i{ZeOGW=7PN{_CHsu4degX)g9(?=5P%;@5G-v-5mhQR{^(#R;96 zF-b%A`S&MqLD&f3`)}~-`AJN%Yc9BuNN;*Hn!ZxP6?spM!#0)~mAtCIaw{hjT!ot@M}e4YPnsSG|#SX;V7Aju~sY6l1HPE2<9 z33iE1U{S*qQmyyUBLf&wLA0bmIS=!MP&+I#lAi1h3b5RPjwGYy!V=p%wmdhkpu#<`BDpRzxFgLory{N@-Mb;y!^Fmgaaj4V6jFvj0V%*?bhtDVlk9e~ z`+Q7iairmi(_?1m(k_<2cw$XeKa!eAOhE>RL@|6r)$K!U9eqtLYz-`|ZLF=VPTHR| zb~FRyF?%1Kr0mdQg+*F~unW|b3|cZxI&K91=zih{j}vEDE{5p|NmSxJ+_93>6@U?QC*r z?Flw@k^ZZ;*w^H+S@7{-A1%))oya6Fxm5S$5IgAvrZy!G9pIz|P0>O|;-W&LJ?!A# z&g9q-hKCz1$Z?w#1RA_OIzdHBN**Fbkw)TBKvz)|3^Wq23#OC8p}<>%!Yt*Ez|{9q z-LaPcWsmviJv&Mg+shLQE_rvw8Fs~()m?G#El3#(jE}ZeXUc=HJD~FNFs7F5Az7Rj zlgcvIv5WVrO;2E*J9@&_>3T=S@4Y?YUJmA#mMADaDkdZ%1sjkO5E=mtLod%5mRGDx zV4SC?zh;1!TcEe26HCj@$Hv4_*UZVx$aXtb$NA}pH^*KdlEISfL%a;E^-U}dEIl=o zV89z@KnaK#uXuMq9~R&-90pB>!(ydmBITl!Q`HM6&y^IPN{*6w#Lb9|BBmq}Sur~C z(TYe+2soL{1Vu_Ecp(yecH!Wf;AAC4iZU`%85)Q7bwH=Eh=>TK=!j#e1T859F@h=` z9RPt5Qxa%!EL?eCa115@1C799!G{mv;1G9jtPPSH>UH80h=DYRN_u!V}O*px6jZ0#p_jj!yN5$GS%bdqoHM#{&C6R!EAEXRPbyKF7n( z+)4YO#ZDW04ZY+0j=y=d|M8(D*?5;grjg^06Lv;t%uNzwVr1y>2uy5{Z>W!#Z=kPl zU<%SV9OjWte*JB~9MPYum$>5?DwH0wPm}y#_BfR7bt1tm+Cp7T8oEOYf!jf%5M-zb zDGeB7JDn8jYF$^J(@>Hba^?(FpVVD`mG}H^L)tAVZH4$B3|VITJWclmxauVO6XIf{ zVDKc>-Rd+w8c2Zz+lIy9kcoI$1Pq*tOiDzi%AjPzppntZIIY%-cJs#iJWRk)n z5cY>&Y?H!8leeJ`zES@#dsLM|kYQ3(MFvfQph%&Vv|PT~TpS+Y@PqGZ zlc9{r+3vxXh@>dzQ-S*q`JOTHGu-8UZkLalKFdVk-ALcd+>#aI4U++FJFr`2|9*|5 zdbGWoWIY<1j04kyWR-)lJ7^?ZcL*U4hJ_}9;)22hoc$bqWBolNiLQQR#{h5NKyUjX z7h4O{vu-A5op(9f*#sqr0C;xnK9V 
z`IA4pK<(6?QM)_BB2bPf21|+@2bcCx&`Ff0 z#r>B(QZRHSgtR(H7OJ$(Fe$LJfBz`?xITP`5<*v5h7ls87=%#q zA?$Iq>pvZC>!|hQu-<_^1XaZdP!cPK6p$DdDU%WbPWA{3_Y4mX_VKj{wsE$xG_lx! z%Gn5b=FQj7!!Fb{ID`=vC2!6oPG7z|X5gNX0RFsQ-~S@YTN)P~hF9MI7a-9pX%BfNb)?d+Y;IsZ zDOA}nSSuFjtR0JUVWjUU#5q7~P1t<1H6m741CEc2A|O;BA3O5y&4Y=GozGt;5D`An z-X1Zosi+87TN6jiQ?7yM&RYBs7Zw=Aa&&Y)k&?s$-k4%NSUB2PM#V$*9-iL)Xi)dXfa|f7p%zx&CN}N{Mn``5p>94JTVpqjzA};0xv2>MFe^ITlrX4>FwN51@heCf9Vef7wQFdeK$GK7SI#L$4B zjQi(Fcd{HLVLO&8_xOMx=e6eSQ~hG$vtr?^|FXxe?`+TfBS+1a}97?#93e(8zuyc#I%*K^}U60lh2_ zN&_Ik1`)_!Iply8rcVabn?URdCv-+q+x^HbfwX3KZml=}A3NS(PP{)|d7mtVAFZa} z+6vyBn|TRfxbVzu_R*QOjll=AqxYw}o{Tp=Biw$CDSUUV;`d!ws@1PlDPOB`ORrVB zUdOy%PtN)sSNHaK(`)_Cdm1B;|2aMX(Mb5_obb)bh2MGDmHo;`@v$%U zi7V%YFXg^H`=Pb)jp^KbFX;`S(+czx+7&#}6yNnNI&=y7bmi_}WJB;`<}S zgZ-X0{??uM)`#-Zhw?LP@}2YKE62%~u9L6q*+08Z{_M*Bt*wz$WTO8_60_H9N7|{bmbVD(BL8!aZ=o>Ql43K;l6q^m9 zh;6Bq<`imkIHf6^+8CqQ5JqW?Ahjl8Z^tWiBr4p2p}N3wT{7ryG@=iM z96&-x@USuB;X(T0ewy?Ziu6?`=ps$#B39-+R_Z)SDh+|UB#pTOAzVws-HcSojG(mp zE4BvH>RFsRZ%(b_WTo3U&q_RHAr{+;C3Z8jwvsuUsd+=;qVeq4)2ly?J$m2u^!LvD zD%}sa-TrBJ$zL)#zsqO;ip%+jko`L)|98clUlkfYNwvS;-}_K&;Op;ZD{pK?Pi#d$ znXUY6xb)Fv`MtU5r7Qm@AH_%Rj0ax4XRe%AZtSOy{D)5LC&tUq3I)A(Bl&L;NbKfAHtxlX>{ zWc;ls|2>PkiRZ*yugNz~{FhD>uN-;Ly(ixIPX6r4`Q)!S9z`vS#g#;o@{({laj2{W zR5lou4Mk;vHvg3WxdukJN)bltVp=hX<&K`l$!{X@`eskP)KP z7y@%%TJ|!8bR}Ls9k3Zr&x~Q@_)!ZycqKlQW%k?&7txf3NMtUN0DM2N70g)+=S}AC z_dou5@b^15{IkWWA zZswr{`-`>s9^k@u=8>)BnKk<-hlv+1yvN^ho_yjk{>Y8{*_pp)&-r4@{%j?_Z?y9K z^x_+%C4gtf(@!jBzPj<3y%>V;9^LuV04$!6#oLs~;>@~m=Uv(J4(!=~J&Nqdg-(2d z4O<9c&7HC30~FnG1MeazaN*&{28V z7~qR>ppSW=PvyuR%_CiE2m92I^{E~2qZ}Nd9~zVg4a=g&(C~{O!grN7667~M22%<{ z$ptJ*snd5)Z1-kI4yQmPcSKqZNwc@%f2_d?|P~3Vjofx{gL{fU79=Zs_nh4 zvHPy3ey`HbKGh@LjKf1z$QS{Y21lpKATPiWS7k`oQ}NfL8Py`l}sha=W&r6Z|Xaa6GFfyfCEQt$ClX4%vnliEyW9#!Z~xn3V`X< znuT!rUp9q)S7`%`+;QM|*+vXV{0w+B7v8g)zHceKZ!>+*TKvs!`k{^R3t-c2X2p}U z;6oPpQznCHoM>D{DzPvLnFT}UAmwghP&eRm*Why35wbThs7pxM>#}l}Wigk~pb-+J 
zPx0_w^3g5;%F#{$>Y;A@;T{TL=fLLVBp(=J0_(9}&0Sq8J3H0)cWG;N=xpoK-VX@v z1Z=7v>t-Arke3-op+^wt3rOSzsO(h`;z}~{dIByp7O=_438EMJQA@lPE8Qo`Jh&VW zF0ht4k8|B8Dp>q-dx_9$W!7|Q-E?8qdVRxaW8HB5tHH*Dv)^8fEq@qWetv4<)#>^B zrYmb^{CO+RtOa-0U}OFCmwSfm56`W>GMjv3G68HCpILIA*#Z*TuWTnCTCBZv;;gao z0>a9nS%9A#mM_dlT)7jNdWgrzU!m_5b8Vuc$~5y7ud09Y$XzFiO52!2M2*1Pm$y#R`Uo#WliBRz^o2WiK8ly-G$@4v0K@3xN4Z9Tnat?g}kx^3HZy43dF z)!5U=JUU1@*bhu#oXiLcGlT$LfykW)q0T44FU8^0V{w_`@;8GSS^m`g0L2n7ev$7) zk@sYt7rzt$I0l^Bi+L7PBICs+^R;EWjTMX4S@X3Oll6sjU%wfA`C`1jV!XO&!IxNa zfNj#0F=y6j{n45Adqx`%&#eDsxcc0H|J<1O+-d^YF+R7N{K;hXwduwyyK%tdj5n6= zj%T~gh#f={XUUAGfWsp30u@R^7}*i@!dU3d|MhWV3h)T@@!v{{-MRC?iPB?Y#z&C{ z;7u9#C69ZN`5u!z7JmYmu`b+cC*HIpcT=bhTVgdM2J`_~0fSUBul+AN>^W=6?54^Y zpquMfGaHuTuV&Ln@x*pNW}J zIzJXy7Edgdh69zniIhu6U@oKOE`t$3G1CB0()~E_An`yS`Dhye^LV?`t`3#mx0(8P z82Y`;oqdY>LjaT`qxeI^OH z?FUq1p`&QIAtdNBTxtj^eOU%^HJNldnS3*bdMlc79Vli9t??=q`5p3cm%=HkL zdke}ur;D7}Z0pGxiw&XOhR||-5x{C=+G%7E%;;avEj|M9 zM-c{5Ct{doDfqlpLOv963nrTmk-Y>#UWKEF0TiG;*aLm|!(EhPcbPkH)A!s~-P5M5 z)lS>fr@XC8aaTWe&j3@ahq-qMCYPH;DBh;huBCSuV2ASU9jfiy)thwHn)FoKwN<*+ z4sJY;C=I)k1iKoqcp-*)D*}5xNFgsoA>R+5 zA41CWo6hqVWcx~Ty@gr+qyis7uFFh`(>U9a&$Z);0j$}4M{bENyHuCYx8RkSP4KL^ zla_3b;l#?>_0K22Jv#CA{^_-s2J0`4CSDn>{$#lE%y9LY(c1H~U!EJU-FKau@+ERT zg(rj_Ff93pYt56|@>j(ZB&EFqsM99I@_RV89gkwlw24S3XL zFK_2AZRf67kIw~%BVXW)A?+<1I$<2j&m)8pqi|MJ>!;}zi1dVFKk zqh!H#X2o5!UdZs0nC zq0vG+&`UpXciZ-6{LxFwDxC_4FX0XiL6A2bCchc{^qi^Quc_CiyRBPid-t{--CDXm zz)4eYTeG_AT{ZPiyVnTMp(Wv0?qTx8EJv0<0maf-H&7whr( zdgDbqCwZm-Y_{Qq;OzR`$<>V?K0iGD_1?MlHzt$!&Bh-Z^WGS&zB2gs(%{=`o5^|B zG}l8o<077M6md4;vYo|%N0Fy++J~4QfCZ!!2jPmodsLm`C<{5tQzC{?M4u9F){rns zB#jerd<9%7AQ8zdm1UG5nZYkhL#({%u1EHl`4~w0=g)->n?K72x{lB+|GXd zfli8YpYo0#=)QE!(TpTySq!Lzpf?zUs>6XPb6GlqPz!Nl^9>kB8puAlz;@W*ct&aT}z*m!vM+v_v)FAP54 zGhJVEnc}+(B`y+)9h(baKhAOBa#g1#EWxygpb(hnir5MgwpPbTKrnN_lx!Jl765P%OD~9K=7q@@148k{ViJKzpBY!?h}F3g z6_H3?EKw89Xo`dy-05wcB>+v)lDcSdQzd(0JOA4b-iG<)Q`5DFM(Z#4eto$II1#Kr 
zFy43w+-b|+0M6_V>@R>ymzg;LkLe}vDS?kb6hPnyDe%H*mGO|gSn0e3bf)y)O8`+a zd7fB~4{Z{1EQ4;>6@w~qBz^-fna@uA1W~(omiMOS>9+Mwy1DpCAb2W5YQ|r2DS#%l z=ROXm%@8!wkjJhEAGigFaTFAmmBe2{c4da{JRgg`B_}r+54#jCH5?Bflz|K+P_M=) zUWjF0jAot>qum0^2vjlzmk~_57Dy=Y!x#Dx3IUaXM;C6H!}un)33e@&as#Yb0Apq&ak=uOB9d@|HZx9>u<29dOu_iS z4dtkcMe3p{P0q9$cS!}X$yrt#U(jNI1^zVk_Q_Sf^~Ywb_w_d(nDZV1n63i{Pi!VP z0PM!U0IQSui}mq%jIP}r?+ervj? zW+ZxKGW}%W+uwJVm5Tp5aO)SD`WHKDUTO*N_+I}Y*%YOI=N&TucIAiKAYQWn746 zT!7)cee^RnzLafK1*}>=OPV#m5eC>(JZ9Kl_GzZW}7YS6S#cIN7jR}!D zPpr<9X!5z56BB?|XQ9w_s(>lZW{7f_qH?CF;=4zxJg~{(C`s5VGi;i8oGF>1in&17 zxl0q>- zfln3l6lXX9Oo2p&H?71IY$9F)P!%nzbAV!g)#b10@Yl4*S8X}#w*1dl<7-x&4M5x` zlN$gI)3f%%Zw``K7vZ9YU_lwb%utxtnfa_vT-+so^5>%+_t#!1Q-wQ--$Z}z`s=0g zPWiP*ytgYKWN-9))!a-e?eouUAXj($SM>!59_U+8pA2&UX_?bbZ^jq?4Q_a|C;uHN z>u0$u&A9BJ;olm4uHL5KS<*VO@w=tqUfUzRlglrIa3zV*i%HOoB;e6YO^+eph*rEA z`7fdLv_S0TV1>&;xGO=Vq5wjPx3Jt}s=}F@f-6v)5vYi{bTNl6;V6o@1cgZjQVD`u z0anaUrd^8z-He82Mj|c;p)dKPulk{{h9WLS%3chEolgMWN~PV9QcMR^^WlUXG$BiY zRF1`#0uO`DsVkZU<}+ZEEE!i6v&l1D=G26Wa9x@AMQw7{apJ2J@0+u5)@fo6*oxS* zzuEC;Z7083Pkylz%-RUQ0;}!RtR-((4yhC3*bp^FNmcunV7{yRna- zufILtg!=IH_rs)n{QJksdSeAs!$PTX-wuha0@gi3bsuMw0P41 zI+H@($*JvpK-uO2u;AozS?x5il9(QlFbn3(iF<+;vA-=OhuTh zF86h1z zEsrZe5lbPA{AB6tQLyu2kc)uC0J%XQ{UHGN-JAIC8Vp1YhG4FQgD)gX-Abn4lxCzu z$%RNt9voi@Cu9LA$d?jD69h4jgyRvh0)m2&3`~sa2_@m0GVinM#1{aS=?&G1&uY94 z0EdZ1+wnO|{)YX`S7XitlgWpM7gz2Urj#rY?_gIV1d)qFjru{X7HFsNoAxT3*pqK-PRn}?Dnmbb$1PGGtX6KhVgF2e;uNpnYbS)l}{r~ z$0q{@S`~o$8mc~@Jim046Q%RmFVn`|hR0Me%6ItrK z91VW1`oxqvU!XZ9P@0q|t%y~Y3ssl%mDUQ>0apLlM+IEAyh08RTR@l3r(vgX@`WUY zd;~EYL@Q2#q{YIo`(b)Lb`N{&k2&ezaopeOvi~lC>z+F$1 ztrY2uIH@!!<+6-o5dvR^#%ANO*$VOn@(P6*LM~~hkTNY~2*#PyY$d^@($pGbdXg#u zZgN$5tLpr9;2w41w;dBN4t#t1VB_82e;xb%@!`%{z1oL|>R%lY{JrO|=ZAh>(6~~E zt4a^g9t!#P7ex5e@r{o%yg3!lib~ONP({B-&45=$bx>7JV0o*2Ne88-TE1z(zq$%v zSsPe8;9cLH*nB6v<)-2748zvDQJv|gJ(o}RT|Nn{9qES6nb8f`!W%A!UmI0;y8f07 z;^L*J&@#m+S{j;h5yiYEPsl)#Zy=<*3G$by3Ka^}5PZ zGy!XTf!4&-<_c7DL#(rwqrIA~y_%)PF8}~;0%%U=s|&K#g}Exi3YGa{+H9pFrkpxc 
zOrEb~E*8_5@>Ev})Ye7n8)D53k!ujF^bo+KT(_@&r^B9M%LAi!#|LeW^xGaA z0uG!H_dD$w_So4Og6WGw-iU=>i-g`vlFp20UW8DAXPZk<_;NHM7eUNH6U*cj$|<5e zMd1`fC}B>E>C@x??yT5E1tEQg!w_+mXI8a&D+fP6y1)AK{jcvQ{@M4nQC|3u+~m`} zY0b&*sxqz`!$Xw26E*CM^ zDwr#kw3P~#adgo7_|Q4!dQ;WLb1GFf$_*Ar>zohvJM9~F2W-mq#1Jx~NEwloYbne$F!d6Q zQVNA_f_e$>+AxYDYX92fV9? zS&bteRW6Q-<{QeQ)C7zl0YX0>;Dwf7`6cP9dCdrur-wz)g5 z{cdDeMoe>NY~!8e!eNi~58%IF9{=?E@Lvy)9%TPM zHKwPOX2hD}DP@U>jxE%j7O4sgG^Pr5*0Q&4nw|OR>ay%IscNS zGoP(CpQpZ%uRK?(GFME)l+)1{ndtKrbQ%S7kvv<@ST0jsEFmuzQ=$W>`XxeUArj>*TM@{pJU0y+nUER;tHbyAAkEvUHlvR(h#v=$RndB@bd@QrFL@OsDF8%Pvx*r#fW=-cl_g9*-8)h3ozaa0sTD(>1!F!}S{2sclh-;ZpPPv5t;~&P5bxK0FBbk# zRh*QU&Zbbs`@b>gr58FX~Qw)}ESFn-*-UT+Gtl$WdD^(AvlaMyT3y_P;7~R9A9U zSF*Jia(B#S?^w>&oy$?1&DTJeDxpd=P~}uq1x4;W8Fi5^cY%hvNRvB{n?FyVs|0K^ zXG;NSE2X%(a)tQ{dASOr+)$l{D&7C3)`r_db$9X=RP=97`>zUmxrhDs+ z4p!S7AF)&(aa0@d((Pf%_XR23@h9C3quxrU-Aa(T1OcZ>K`Q`2h)NK&427(egO-rc zGfJ!L+rGTqyYc4G*ZVIf|9UU{IR5*=i9e3~B{{V6KrXYFR9x+Ot4*P>pHw$J(Ov0=Van~A;PyIPZ8f%Th)_RFtQ*9()dw^+d9<{~*VH99)FwAHrqnmaRn(`J zH^o=9CgwD!-WUwsc*9(H4q^Yj@95`0j(q*~x%mCd_j>RC)_VW(?V<5ES}UKouYXZr zlxWQ5sLvIuEavNO07A32uji^RIhs);F4 zLzk#w2B?xcR+ zMvab-oH$f_>R`jE{dH&dww%>(I=Q>fY*(G(fhxm&jZOz^ES1I_G)8@NMnkdPe#E=s zie0JF*Wv+{unUQx^N`J}l`Amlc_b=}xy0W;@$8>RD!<=5{_$s(pQp7xOmAO)g2}E5 zxZZ=i@)Pa+3wrrTX!&4J*#IfO7gtvu+E5i(HxgJs>V3P8RafOzIvm(o#cHhasjYKw z7!7C`#!DQwnQ5vg$oDM}4o41*G){0aqoQd$Mn) zw!NI#agg(;{`#xk%gaajuXnN6lxC+@Rz#|cGk`vo#q90t66KW~C17JY3+!0(byxDY z&lUja&K0UI6ss;2DWQrrP$d+(N=3N~WHivnOY(>koNNhRwitkfC_=ys0c7Dt0CI>T z6ub}x&qrf&;BtBR#S;1DV&E(ZsQlhX4D}*_9HbHnDw743!jXAMa2^^`A_Xpy1{Wtv zmBuhG24k=L;CdZ)bh)YZS}PA39T_$~RDEJ!!w)+fPw3a3+TC!8aFf@svwgr-c}-wlGpoAZvv?%1c+|gSG`M6`p{O5QHym6tB408XQa=)0UF}(0 z>s>$W-#F^sFydWPe_}YkHWKT{0z|QjLT3 zxa)%cP;h zthU3rs=%Nj+o&WvJf|b>R-@O#9onBCl<$rIx_5T=nfSMZ<1hBCFY7LIx2;cU%;hSr zWa}=@XwK%U&*v-8=BO{FE3c%h%oYHs0v_iImFMz-I{`G17Zs7`DTp$LY$+Dlt00Pr z@O(HNIQ!(HFgX}>Hb@TmC8aDGRF*U*7lh7{Llgjx2ur1aN5*mmz;};uTpog+FH0#! 
zQ_9hh3JesOdlfR!O0ZN#9H=anan+xCEtHfQLd^8U-*Vc~ZKXbV?$C(Y-r7@u#N7=i z^{Y?rYB*tBchaEchjVSlds_^4wOAc*bW^GG(Wv&-9`)Ql!om#&kop3#-4WExC{kza z_B%1!cT;dz5sWM~Rn`O+)dZA{h1S*j)eZaC4+l4n z;i`dkRH1$dS2Y?`Jr-P5d@MLVtqHWv4_^tPi`6jT1n`x52zW%-mVI0Z}9JKVzo9WwRR-8bR;%)#5LWH zZO)2l%8qEuiL7pqtj#s4&M~ZOGb!mX&aR8e9A;e{^SwNTo4w0?`%div|HG$ux<7y1 z{^4i6XYW28;mqnTe9>L_tn+O{by=)ApQAdPskV4cY3ceV3kAxkLf|G~P;D9Ds@l?J z@?shVS;<6};1ES5M1d?O56}l7h3b%zX@g0%#6w%+LAPNt9Z>0Z3_J@cCVXkLlGv3Z zfbSk-{p6WPZ*kWebYP`42`dE|Y!DhFEO)eS@4%=&;R2qDB zYlCzK19bZWwY%ep=To#Vfptsu(6NgLw@PK!4>_1rNaT)gYxBr z#OzK=$$)%KO+eL12oQKjLqK&+Xw3+=W;nE}CaAX7w{gtBrpmLb#=B}buzD0%-%Ts- z1Q*|h)OX3$bxH&Cvw0NXIwIdTrcn7BxF1|{2h`qAZXG269;>?*8@h=#wZZKTzD;%B z9ra!vt)3mNthQEGM@LdqZfsLlbbEGGQ?_|)zHxiLac#C~{cY2VriiRs|H5joi=#2; z$D+~(uvbRl*Q+DaJECvgjyzu__jy$6*T<@lMEAAk`I_iiP55=yl?-+GRh6Y{YKU}| zrJJg<87j*+H=DS4Rej~E#?n>I#WWhS91E`m$|#E}#K>i1FhB>}AkytHsirtka{{b6 z3EYwhYmSp@mqF#A(S)v%$Z}{=Z*vWmvriO!trv2y4`p%d&0vMb1ni=(& z8Q%TDp!2MLm*bJ1?;c&08ojlud{ydL+ecHd>2lNp<)!&w{#Kj6E4?vFTY3XqybnL$ zNzCje*hq=zI1up%2n-^G!o(}Q?`VHDFcJDG1B?6(yefr z<`_sv4CFRI6u3PSb~{Y5CLC5132BRlbihHa7)%~WE>9X+07GVf@1qQ@T$)}c&A1@P zEJrcY(6n+4rA&@qj#4Z|C|1VOE5n!UUFb4A2`3)vEE*X$)1a4N~hNl7(9H zPhJSr7oXA7`h#<8ysiy|WY+~2S9zBY1T;4IwAMve^nx1)+ovI4{R#5XwJ3iE_ZIOIMI@GqB_g1KHssi#kRaHG_T4tV=#K*HRZSW$L>!&Kk)6Z zV{^aUo47Cd{m5)5X>~|y?hbkCH>F3yH?PO<-;?~cXH|4);j{YEiV`Ay$MR*}l`Gqp z&*NpUV&zJy=t_uGE=VdHEma^ZosW=iiul(Zpf60ZErn8(g0D>_R3#Cs!zeXL(DpcR zD_FV}jV{1o{`dSU4m?kTWH4a26v+S?RK*Mm>=p!?36++BrE_E;c_>P8EVU$(S&=|0 z@zyR3RByLF(BihU&VEnT4<2BdMi2wxcSxsxd0> zj!gY1yt*FM-XParkLhR#>li|{51^_?;q^VBh8|FTw{*i0xTYTk6tiXs)7}`~(T}Ve zg#laBx?Xx+H@UV?v8t02KYK(5HNp89A(bDYI+7#HD7t)kv*IVw`ROZ}K;8fCKnNuBHIOvrz zmVBi*eCny<%gNoZHXc3u@?iJuhgXyTJYW5PESm*T8(kMb6Wrb1-QB4RZ7CEe)Sc26 zsFy0mT@nZhPJ+gRyF+nzcL?qdfq(n=W6s>U>?A|T*?afBw{LrNPP!WBhPy^|jte-q zO2xNJsrCw~j|#ZR4YK`pn$?R#fJ$XHKSfqAer8W@Rv%IJI6A5%Y8qhLj$ojT`b39& z&yWAePx{G&{KlE~l_T}5SjIPwbbudX$WLO(_rMN(LEr^=H%2TI6ZT&{Dls4OG9U5N z?r_o`a? 
zCzzZs{C{zLfID;($3G(5)f3xQVjFnVvP6M*p$ZxCX>7osAQrO|i{Il@KjKktc;pWf z>N^SPjzT@bV&CFn&N%2TT!cM7!VU+sg%F)%GG2I=ec7s^n(pEr>{Ol@;OXs<@9H&L ztCkw*t2fv_i&!Y_coHEO-D$A4_>y#SHlsXXEX4SxC1@M$2^{g^@f|HfP#dO5QdMM6# zNXxei;@e>32>&GL*c^6_x%2lkP@H{tV5cg64@LfFWfT zD5{Ycoh^nf7bTR@AxjuifikI}MU~SNvc%E30+>8uT)qggj0#^yg{=T#%fZA75T-x? z*C>o`;l(!b;s9%{+}K7=LM~5KiwdGiC8JFpAi7;8vP}U79BJesRZthDvlXYjFG~J` zOMQ7be&wCs`9BY!R4O38=@3 zgeTtVze0*XZ)dYkRdFm1s8z4E>CN;ju>=W;L<@=~(1}Ng3;HUv?x-+ROxEXh4@Pz8 z2kK_q>&aF6OI;ENN8*egA}szOzDQ$oZc~L(q~5Xx;?0DE*gZ3D6pVB<+eE)q)Jw zf-=>j0`-gnH5ovWZHk9&l7r<$gy}$l?}(mromD7=NiOicq~A}e_#cXK&y}LwRpY)% z!+%M_zpKL?X6;v%&0;MzFwP5h)retZE3p@2KGN)}Qr0jrS%fLAHOT22xUUQ>XV|~DQqeomYoJuSa%mJkmj{W=@@XGYy=PSs~X-!DVcElJfaMbl+M*C9jOAxqOOL)Rg@-!DxwuS&J54q8=and0U< z7T`Ld6WL=C2xF5AVvzG?69{3D^I`#RM~a8B33~~1?-`vAR?g^8RPuNdzQN+&qF}bk zNmhhd=j3P?RJ02o>4+x)OKP8RrR;=KTA-sMGevQY{G?nSOd}_t5}PYXC>0_V3*n1} z&~;+4Dn)D!e^d=GteOkcAc`p#BxH%=as<(h!iZMRh-z+FjVioa4PB`UcqG;FVyo05 z8f9X7%ppz2&;}!HyDFku30BLGDHJ1>(vr$Sm=X}N6pXK+#pVD-4YX(mz$3Ak5x6J= zC7DKYMIl32uvAZ{!j7o3YVnw!~&gWX&4a7zWeu1GttaxyOp_GadY z|0#t0CX9S8gLRh7b&}4qQ!D$&8@1BcsuizQvjCiw^Iys4zLm>+CQm8bJ`KZqcP)y0mFkq^P_Q-!)^dm z+EHoxZYj1NWxiz&u2~W8eKF2s5gv*l=dKvnmI&{ukocA;_mSpi%jkeoP}%#a*f%Lq z+YH#9;)pv%kcY*I_lx7LOXBQM$PgKikqs)a zb~RKHC$3r%(kvC%VUpBsl+a}oJ**YcCm&U*1iTUUM-*Kw3|wt9a@h!(ROl=azKn)Y z&P*y{KozkN$`~+NH27R*B-#y=!LXz}!$_<{(<0 zAOq96MUXher@%$R0S_i$L} zU{LS09so!T4u=hohRkS2&FO|rz~hoE9g0kgs{B*j{F7?TliWTHv0>{A?kWC?J0 z)uj1x9la;|F*e#373okCb*CJDuMFW(7WuFOdcP>azBuj;0sjMy{(wh+A!f6avgk-< z0)#R#OdTJtnip0BSW}5=k;1m9Mm5P{+EqwZYM53vbhTnsn|efxBCJLUSIdbn5{1|E zM7Agb0~lzdN?eC*Y^MzHMtGYNzLp2kqL$b%o7iO-J*we3V;I@5gsVB^sD5K8wozg2uQZ z32vx-_6!U=Du*Vro(EB*4Af&pjZ#Fb8njIc(ku__H%;g_f;GrOdJLo5rQi*+kIF#_jUGVvoS z{VgiaAuZM>Io^s8cP}O89y0NMM#O_O=&dxEZ3e;)g?NZdc!r37Gtnvy3^Ta+{XeP3 zu*pR;NM$fe$1;jVvx>#Ai^MUC#xerCf<;o@q)@-)k>6!gUMiBFsiR-3VqU3YoK&!H z)pG!Af5qwM|KpY(kY`$J=h$dxTD4?Zw`N&&W;%CgzUe3ZtH&-%6+q`nbtk1p-AavngpRq3NPY`T`Mg`mIW%Bs 
zuum2c_6CU{803@aL?h|NpuoZljZi$Ta3q6B%tz_;57LS6WfH&3r~FpRc&e2COflV2 z1@~HmZ^6E8!ML{I0i4_LpSy8i`_TV9_R>`M(AEI- z(p3J$4-`K>ocVv>=;z?%W{uOd6gb@KNbD}@K_P`pftiNHO?g?^=(<0~mZV`bSgUekuZHiKVeGI;e5anzpqaVQ%AA>8Z zSt6!Y8d@iU&ZQ#de1m7X!*f1jiLbFlClbz?nEDK#{v2>3L85)2^f^or&%p$nR~)Z>=eAt*QSf*{wb8OIzx<&Xn(6$lv`F z|2%e3SM>oN>HqZ7)dH~(V804raHoD?yB2Uax?wo7ZZ@`RGQMUGB*2lr9l&pWFJO~W zr3!ANYJH}vLl&Ekw!1dh z8smC((`p30M>Rv2>Vv1vgU1X!1~ub*m6O`kD8kgG;SU~X71ms2&v|XDrQCG(c?0IqVaA-!W%U9Ef(#Pn))Oi`2rd5 zkeqNInP8I=ZG%g=Lri`VL;U2Q|24dXncOBlqb1)hXVlKxqrkku$#cxd?aj&V&A;v` zwCDq#3II(8(u@bvjfc`r1_3ZmhO&%=vB`$I4nmy!A#ZvjUvw@$?w)ty1 zyZbM{I?2!a7oPO~_guO9X}b!jsgw3k2TffkZB-v#eEZ=*YO-R^Peydnyn9-tqNRh4_ayuT5RxHXboIwU0?6>o2vAf zs1I9g^ct&=>NSb#HS(QpIXDOP0 z0Sz|mXENFepW#ZxIQ_Nh@I39ov(yIJC=zWHOG#Y2AbiNFj+Y{E-?(ib%0D?aTbKIQ$F2@jz$Z-G&7 z-mxQ&fqm!U1J}-7myV6+eVfnw*8iht>-Esqzj{>J?@~KyQ8;c>q%^968@s7%I%ul@ zMyfykv~_(n^*wY|15Bo4Eam_}GRSH=#AGteSlvTc-wkg0zsEia=5BM=F-ev#S=y$q z1H;xySy{fF8omAvg6!L9anp;K2m-39yrD^UcJRPB& zKcbQoU&f88;Ko#m5GzHYO;U-y$}wF^v7PGhPIW|wN?gBIY@eprgn{prN$6_(`c_ZC zN<+|6ThLrBfd6!T$a3rU_VCf(!2aH7z+!jcQrGDzd7fqYgfbew+Uh@3wY$}_yV1SB zHypau8a(sIW7NcZ$k3}_H*C7d=4Kp)6`hjf7$AZ6~Uq=7J$ z!k^=dGUJam=Z2kQj5$k#%l=8kf5JJu#o9ZeZ@&Umq8%Jb_Af9Q&rylblH(txCqBfW z?#I}?O}gQ>ZN$DJqueE~*&}8&EL`2f+$pm&&pE%RzPO{jx&b77vxh=c$F!3a0J_OT z{+R>Xxd6IJfAF+F)s#Q=w7QML7gn>%P~dl~Dym@0vS1@eCSKb^GA-PDbWlxE4DZo`#f zokgyy>jT)l<=EKxNT|V)e2zx|Kd*iL&x-V7YyzbA@f%n z?k5ueH8mTID`Fy+@k44w;~M24e*~ghBqEw60C!0PIzh9>FJZ*PTWqbt{LTA}y}K+u)8NUe6Y~7Q+A$S*n{i~HYNBIte|2g5 z2sFCM)U(0Xy-zy`;pigMk8W(O0UR99&M*#ct+s3|Hf_w;Zvv${q)YrGf@qdc=u`yo z?$P#A)Jtqvi|jMCgBcDG^fAj!<{~5&LMcl1-0T*c(WR=I+{?xG7)$mr0a@HA~jL2^;=q|4U=&Y@2 zu5Pd`gs`rLYORK8%tN#nAj~Tv%#*?F)1hor5!90*bh82CgZ?4|{#;|8oMT>mbO_pq+9x+OZuQg`lwU-s27M*hh3@%-Rh^Eij;O`>c$>WQ!ls?(AooP z0Q>@$We@sIw+3_;C$-5ljFaH+e_ z9psbe$tdS^%t!3aA1T&9Qg4Ejv#AKBeE3RkXth{GjVQcPE>1Wq*0KlS%A377H9J@ZSU0S=nQ3g7BoE#o<3q6T-n{6 z+g~|3T|A{M4UPGC&7Dph&hOKVoPsB2j~2bOsKt6zR&@E<@9 
z2nB4aowO+(wy9J8qH@wLaoBIXF=)0tr8l$4Hnj#C*q|QVQ=_hLk(Jt$7Y9~s!|%N; ze*Q4c^>&KWeWc@^@)wWGo;au8`-HJ^$KRmJzRa9?o-_9%M})BuK%{{T)I>N|Q#DCZ zKSZeBN6I`%b{j52v^aSO@bftrInS)gMW0drIit&1~ z(JsnhC&g$l#e6t(#{*5x5XxXg{x%G4)0=Xmwr*`sJC4e-g2dG@hqaNA40kh>Xjl~(Z$z{61ZP3uB zNZ+o+K`T&__57-{`V78yuPR>MD}DK};>nY;$1jTSzbd@*q44(4Tub`wYm8Z!IKoZE zbIyomnF|7h7|Mni%S3BSd#eMT(+DbDA0bemC}SKWX|~GKyu{lG7c<+Z8d^CX+2g5) zNf|B+RFe5?HiXUAcuN=g3nm%5CYXCC8TzIe1}5l7+txS92g_?~d#48rEsMtuGbhap zhpn5tEt|WWn?X&BUISzMm2)0#%OTCH!IiWAjjOANv;!-AHS_#`7Wt}IMNA_ljiTi= z;}o@GWb_ggbfP4T=K20C2^+;IDI=8Rl5~aQbOrs+GiO?U554|9-1)q8ZTe&;~V zzD4*-72o-BrotZfJSAGZ^dV02I8Bn0D!Y*^LrIn2&M??XH{L~=k&`uM;|-=0XeJZE z%Mq-L2{Q{~>#vCcBv~wSfVm%vQ#~TzVZXaf*{XAxNF#{~>WgkUk|! z9{0$u5122G>#fW%&aP37?27bm3v}+YO_4R1Mur#-=JZ(oteo?oJuZCuq~h6&(x-1q z?ztP7lB=Y=y(#ly@*!VRUvO@)K?Fw)~j^TrWNy^6l z^eGL7%#}yx$e-aXSm3XnA*Gw7ChMUu60a}l zZOR#Vkq%H9cnK7E%{|xdqyGhPz=dDoH^1cDy(x4M%C+UovgQQHvE?qf&t3e8JKKse z=*lmTbDtB=e2G2p7GdbRs`c_n{qByy^%{=LW2}V=^f=XHw8{}iVgoI?j*&XWsGJg% zPf1G0M1>Q)?qQ1Q5z6#1!(=X&ej$xn!dSl*w#@ z=}f#4ImK`vDZiciSLi7kP^m`2|K&JK{U}rQkO=sd*-AHCn$Ve=XPsT69a^Iw*Z_^J zY0pk6>^B1*GoL=qzyGA<&a0vaZ}RRqXWe$ox%E5qIymbJWB4Wh5OaYXQ}Iw;i9k)6 zU>%7NW6@A!@f=g$yfZwWYU=6~ReG{tEwbnblKR!z7=-KHS+-zD8 z9b5A5A3sp+T&-U8Kcq(OQsQ^0P)DFN3IhRKf#qZ!&@@Vnlr`L zF~i?U`kiM9&OFZ^XdoD*FYl+K=B1$IsiNYmAs1%GoqL8m)L0-$Pdq_gAxTL!N=gSH zT26QOczAPve4oC1MZh>oRVi3o*k4mDMqTcdvF?C9cl#i@eKTQ+s}?S=zan71z+HPt zH}q#AOsRdfYR-RXlrl8sIXLZ6xfwdN=+!gl-Ane8>shH;2qW`mFN;?!a^@X^(Ev-_ z#j~7+5SfcBf@e}hJxax(VLvZpcXF-(QrDd`6aEa>>$y-qhcVl&* zdYQR~oOjQ;N*jFkca|+frZtn#MR(6Lzk)5OLv0!IZCC+9Ef@eYZQ1j!+4JoggRjsA zTY$qYX+6$-_rCeV*YazU{<~F!$9ufDd%;C=JGtsx7&Qv%{}2?8Qk2Q5Jd1y$+J2hS zApt=5fT(vu(mzFNQvSQ&VZNOB57X6n3Gwb1Z^_XWDaFSMgnq;cY&)g(tJJ%P^qytt+z6%wJ>VD*V98&(gJYU{Lh9Er=9}O{o zO^FCW1GubCw4~;WnCbe_K>P9d4qfk>h-sXpMugl=h~&*VzKVv^!3{d>22I8kf8o4% z#WHuvs%Y5(TjmCR`ru?>+a9cQ;J9aWziksfybw^e1s$6Y>L*hayLXj4cRPoVyN120 zmm+rP(vCp5%_HQ{bkH76MuhO0HIa%FR{S(`_L^APpY1S}{&k(Q!O;otMMiv_+=WDy 
zoAFAwVQRJrHHQ_+J2Co>{)Qj2Z-6o_=zP!o%(rI=xJngro+|V@mCyN~VOQxg?HO_` zXuSZr7k&p`r_a60l7E#o{~~qJS$ZG_oCk#d2)z0`QRC&R#O-?F>*MT2ibu&RM+CJa zl;VCe*GxLsL>kXj3Fkxx$5e{eDWDIpd4ktCCTL6(w5C%2jaN<9pr-S1M01!6MXXInrxKXrEc+-h!v*{#_=_K>1MAO9y z&E;vc*(ALcvc~Fo4^>SykH)kfC*s1-KpXzDdy?gMo@L#BSNPyV`om8Z_TZc=j0NY| z0u1;)H6%TBWCFFs0!&1JOJx2T_F!#+I7QhoUEw5I^%=INL;9XrNu5M>buwq+1~qn% zCjF2-bCshwLh$SoM-hcFYx6L*bu0FeCF_tSXA)?G+!ZUFWy>7p?WYM2B}H@^m) z`RaM|mxtvCk8AJ3Z+?w2aG}tjlbA~vUr6R9r}ECGu}`LHOeAwmr1CE23oK||=4 zpmk13x)hAwG)`{{tuu@H=aF_d(Re=IU@6{wKEZGvVhpeVF<6C~EhU;SCK;?j3|8QP z$Nv*$uoOkJ1T|QTGFXW+S_67-oaRI{a4|Jsp3(%Enlf7+9bqx5;!>SA`4)6nB;~CN z>aA*tqeSstzM@;7bI-Aa7z+S+8wiFQ@#S6S3O3^P(H2QkQ1sN34m0BM&=B#^5{p(= zToN{!=4x0!9GPNjUll8#<%p+&z@u;9i;Hqzw>uk1Dm14mG@usNI5%F9&oD9(IkMD)(+MWe>6BkI~_K zsL&lW$VMuBlSbyCY4OOPDWAR5l$}G=A&5lg%{>5N*H5GCr+h|cyvB#T)=%S?g)YP? z+d&i^7A39MmOJ##Y;cSz?Q6JQEmh zat8Zs8tXKU>5#~Nn8~^s%DxrCu%F3%N@Cb2>K)?sk0K2Z{@cGwM@}>#N9&NIb?4&r zCJ}#`g6mCT^`|kKQ_))FSkuJ>pmUo3wH#rv5NQNhUi+IEuSS@zMVPH7XimifE!S)* z*?4|f=9oCdbZOb_Oa48A!KyguWBdikcl+MNStQC z&wx|ORHS89PgdI-<=Wvyt}MG5c(|L#F(;lh{3a)G>%e0TY`J;_7w~ zLkq!`ThNiokk0V~iOL0rf>EKgM!uvf(by(nuQ^>xk8Z&TBcazcYtX2EVr0mt=`^AF zFyWL1-L?U5*^Jycirb__Z*RbxkK@;b&mmOo5_F!Na$Hyuv`J8W=56X6aQSnvCAHUu zuVH7Ya<4L%S~KTbfc#C|0xZ9U=)Z@`J9_B3cbH0~_%7xGS`*$(jx`&i}!49ziw_5?<~9!$Fdr=_O&=RgEtJHiFqM62*^Iik``3 zJ>V;Qz*%g~Sa8NI#K6VF#XbFa<2ajHFEVcd+A~3=%@MA+te{u>rtZ8og~HE zP-)8u*((Uqik9t&+U0=3)xf%?u)V{iQ`(dRx`d5&*!EsT(?UqY0;FjXQn}>2vky7k zNhY%b5-}6P+3PI$maW*9_2?Z&KDS+50W-%P&)_6 zY4LLiHS1W-=W9}Sb7B@SiTl8R6n@z)?82|G^IvnXQit56Ex1ZuY{e9Q^@G2rQ@-WT zXcO01bJs9kCr=%x7?a1L`cK0(pL^YSQF`fp3GkXLKZ7)%hw40m8@j;sUL?sn)i9nH zSxjL#fr8gzRGSdcIt;vy0Bs?_8`0EzF`)Hmy3M}^X?GA*yYPQ~RcH?i-hr_mz}b({ z>?i4L6g2A;iG2#gF`3Fam7+5H7xkkQmE&YpN{aF+L2o);YbHi(HWol*4p?DG)tx~q z?-P|y(8G*H%jTa0E-{qd5iM|*#JWnB-xsT}=Ffb%#BgOCJI*x@BAb&bA195hg^nzSE00o4nip9M2YI5Jo#R`0VWZAT15VIk-iQ$)%qV->tWFtOwrx?S zYF>F{V{pQ2pDkrZFl(J9b!^#xY$C9Rf~Y@&Q<(FkB&_3gpC;%%qcE1O$=f6<03ZIj zhp9{W#qVJ^X|o7RbWuYvleW-DJl3_!lL1TjHo{!019QAe_ 
z)ovVk!;gL|fR+LSpx^q>TaJa?cs=_{PXDs07a?4|Ra8Y8Q&KgqBo0DjoQq7yv z%b(E6oziPxR3AH%>GNvXPw1V6v{Isb=3p)Rkk%te-*muuuc!Rzfl}SNVg8B$yipL_ z;Tk#Q95M1DY1kRo&4(N2!Hv_W4X~EZXg02?51sV(><>%@0-bqQpkR+NrGLqDbTYJU z2iZ(ZoDjW;P`?LJyx&Y+Fei2argjf1{m@I}t(T^Y_toFIms#^JQwN{@k?jBuxA^R% z4%AjP4E_+V&XQ z9MfFpC8ry;#cNmkX;%9fHzt}jCmPgym^CGtH6 z*LPyHbiOv1qcKxp9Y;$+gZHs4`#5$=D(f+x7rD)CYgXdj-kB6KUR)X2~;<;AhS$5)?wqg}F{G|^WOYgB^Y&fyE z_%bguRyeQ)o}~`H^fSotvxlampTdK9>D#f&S0=bG#427`7R)=^OzU3@8|(2K>GK+! z5BW0--DD?jGN%05N3^YYG*|%Ko%689y+MjVXXFVz{z1pvY)z}BufAwv5 zfpzj%tbuYSsI#XHYj%~#4(etQ8?1yC&cY*_+~IxbF@3=fD05%<{EDJ=yus6LQ1K2p zCsN`*RO(^0@#ApaPkulFp8Fbjmd?xKy_cnX;l+=k`p?5OUW97`z4U3YkxP)VQ;5!6 zxS3U$j8%YwYmknkuj;d4g_jWdw}}RK=5+24nOSt1UO8pGH@TETw+Yj#iczhHX;jDP zRU@?8VoX~T&8m}3YU2zKJm@z40seU`N{eX5!RpeJs!O4bn1JDnOfT`1pX z6w?L*d>l=48bxy$MSldLIf5JQKup#n%r=q?_MkfZ2+hAMJZTb}S*^mi$-1ASu7Q2; zauwMLme>oI-{CB^;VF5*>2ZNA(~2|Kia+xaL(T*C;>$E;_t;DCGG<+)3cmc&Tl+Co z`DP5?#TcHm(Y$Ap)NZV^5XXD{Wrx;Bx(>%X{W}-KyVruNDY3`9X?6RM8gghuw}1E8 zmTbe+Xw9iw({|-td@DI&ybIAc7kkQFz74v3LSF_Xb*vdR+oYCLMCAdx?F2Tq>NDIK zrrmzPo8J90a?~xh{z821?V$EY;jMQ5b=KaMwq9*l!y13YRtX{d>6;D=@|Gk9eTREP zs!pT#=nH1}FD?jXZ-KCf0);c2=NA?1VG<9?;%E2R3MP3hXLzp!7(MsUbc)t`<8kG; z$CdXXTHk>2Oxc+)-Z#Dm>pu3o{>}5|cW;fCFw@sTrf*;-c7bv(VR9~kim$@uKSJbP z@v@ew=9f_hmq!gOT6E4F(Av)NWUw7(Yg8pD)I|a+6$YTXf8YRmO$goA2#v}Z^X7QN zMz4P!b77Fu_=s#cJQI`9g^3@pfY%_y>XP$!$4l0F{U=*PgsX#Cj-nVSk+fS7s`VuB zZi3lH1gJIEtTmCUEzxKfrZAh@C3@ya?@gRNt-k|Tk*#Q^i%8jR9^W&}nO33&F2cE1 z9NG3f+4oq>9#NOxqswvtXWgaEbO4oK{OYgk8e;AcC1o+kcVSwzB2w{YqRRQk>8P$Q z-?84K!2wElpKs@iN6(^P`($kWKBVq6wtmX5Z6Tz&BcQK;Z)9+#pN)ocBh6VZBgjztm=h$<$EFe&jR#cAatMkTzUsLb_v#b9IE}?_u9{J zUB?iumlYSklwAGkZ|xqW^f=DQKHk_aO5#?i;%9&LA6Zf#(+w|;X`da}JTq!?wL|mV zx;T#eD2MLQ%d{q1p#cJ*+Px;-xd0&5F|Xdapxhg!+a76F`QN%61{Q)%K$JvA7DXkc zLy|~2as8R-(M)V}0ih;2i-OGD884gb^A&9g;W|QS9wiy<#DVtWfHuoUAu5v7hBI~U-D~R z@@|{=00uc79ovnI;q7bT%}f3*D=|{Tp|zWZ+f)`4Ecd1bE-#2$E(@KTebzw^Q z<|1|0CUem?ZRx=#s)L*)(H};aKOu(ibR=|qj_kGzYPCohf7Tvw*L&Ep-v3MS@!7}@ 
zcW9?|Sku+8CYOjoaPG9^_-^l-Z|ij82^(^b>*h4)jWr(21u=^i-t$}3S0^|xuJBsy zie^WvUzp=M50QNkpyv`TYaOcX2ms_!#!g;Z&wVU@`QLE&*8E<2`*XR2o7b)HUblXR z$-jy-vx_#fh?KsWDfuBw;(e;|#W9Vu17?@{v@Y#4TQThfXtpNkH$s$pq9i*PWQOKs zhspQSX73D>djmQ1yxb&&Tg-K!yoGbrmTO`Kl;yM*N>t}D zeCRMzyUv@jWAA!I>zndvjuEdP#naYaL-rm6Rtck40X4S4jpqUze|fgLc{aHuv2pDtD(jE-wk+kniy6oP}-C7cYoqMyp?rQ9Coue>F+&QJjK9 zgrt?1=HoD3XM~v@%=DF?<<~O%FMd|P{cV1f-~CW(_o3YGlh>^`d8)r6W!}daTuC>) zf|0w4HN1w>w@lN&*=u;Q4Sbt#CY!F-N2e`GsRyCfu_QMz0Z4p^IPst75d`r#(OO66S=w-3;GobSGG})FM zIX5};E>QVc{|uFLjgWB)*LU&JdK_%v5Fue3qi8wFS8~LH+Gj`3gL0t?_ZI|;hw65R zYc@OT_C|*fl{==?h?9z)QzIiMlKtfI(WBui%5crzP+dUVc0vsq)jWx-;wW4LUud31 zHqIgsxN|mzEe_TS*LkvMsIN>hUYMr4xWiStDU`nmzP=}x*|}IgG@Yi|j#$xuy`}Sp zp)!eM$oo^_+^_KE^MU<8LOWc-J74DZJP#fe&*@?*>0~PF2PJpV6!t3B95hWNOmJDo zOW%r@wp!yqt2`25)^yT06L+wY8q0ZYmg7p@CgDh+Vgp>TMw1gJeJe=bHNxCJRLVJ6 z$+5!zbK%{OUN&F7?cF@=KNTuIEx-M$!tQyl%5ReNJDkzgRAY-|-Sg=>H_~;k59?p6 zG`+Gsg=X6a2B;D8-BD5<3rYhMcT;EXq|Vrn%qWddOLVU(_P~rAApbnxcMeT|?Tv&W z(9ua*p6PiZ$h@HBup*3KRxYffFrp$op)4cXtBCBMvO+9xN-nR-s5t5_r&A(q$rI)o z?hooZ6+K|}vg8i_$QSyaE9*T^_6`1g3!c)8)PTqQ8(iTZRY@Otb1n*&IWYUFe-D*^ z8YcHPSkW~^$~IB;ELk9L9#p!|LD=U+0Ub4(<^16SvA2Fle(*>Gw?O?Zr8o3t>_&FtfL@hOTi^FT5TwM5un5Hhtd4d#y$2auw~p4jv1YT^!SZ zF9&uT)DvRZ7i8YCWr9Cc9a@*J4NxE0R~|n$BP=M^tXHn$lML=arCnp>Z&j|PfN(SF z<2yAA#6_yyXsv5nV+Y-o#G!5f6ONQvoy!rX52Cd%he%il$iDSaeOhk&!{hen0JUEw zwm(X5e=AURFHwA4BL6%`>L*6wN}A5~6rJm7nrDVJ&tg=+)tlc~o+!|-_cLmP7`H=Z zI}xe_6Dos~vV#-S!xNH&t4bXU+RYHN+PHrnZ#nwka|%U4gRqIvOplC`aH4NE)-yjp zEGNsqBGaQR%cHE!Co?C$BnReII2DN8gq8GWg>xFjaVpyw3I0Yu<4ycc6LW#i+kzAR zK`rPehwnvk-?O4Vmqjxzd9rWv1%KoUxxtxpO`zlvoA)huKeZQu%8vtOUq|a-oMO(~ z5KcQ7(DfkThO1~`hA*^9d9N|$L$ z*BA=B8!4n|n)A!x3#nSS;*6~#q~3(eIwH)S7nslQ3S3_STP%aFC1^T?o4$%PcMg+q zgi1LQdzkfy7^R2KW&;O$#4!Q!4Uc03FX0Qe%TnG+^o>DOn6B#ETtqWxg;vB)W5LQ zGp8`3JQq`dHMr{`b>CV1*V()74z^zfpZ^q${YCF_p55atuRpLpd|t%syqMo*Q20%O z;G4psZYus)`TVboc->_vwfj-3{1Z4cgXLThx@VIZuCMcAE6Itc6VaR8q?5UbuFa6{ zt-$f-P1ez4g|Y!l@7f;;t?!~*KSEkQBU&5^y6r+nbkk=!{{(5JlLZ>%syK33YyCSx 
z=kfB60C_6E(Z;X)*&eikFCn#UiIR@_>c4W;e-mFGjQWb=;% z)yE|^u33uD(NaGlYCnUlzQB|od*5LXuyRW_uxn3%!#`uof@af4+#Jou16mwSlNj&Gk1n(GSX+8Lu3?BvOA=xd5 zs4{U{53}X1D{LQ^+dnQ={7I1hM3nhhap!4{>{pV?JGAP%eA(Y9l}p3MSH{fjM@_Dc znp_z$ICIQ#k8aoBpetTnMNsH<`a(M=g4B;RqmPBRpOoHu;eFpV?2NGQ zIUer|T%ndC6%MYRcR9T7vH4wQDzkA*m$g3SPhLHQA52G5)=16brLD9D-E0;kT95FO zF5?=H#M=UR#*YqXVs<#wlGtx{9e7VP?o8J1arMlsQ+%5EG2_%ZeS&FYEY>Od=ZMBv z%VfTm$^FKuyeH^g#VTK~aB$C(`JSqO9jSU5Bl|r~)3S@ork3$xwf>zUs;j_$57E{V z{R4#AdZPT&q`?%~WOY(@c~W{l$z&_qa6Q3nC&G05zxy3`9Q_?$c|LFn$p{TZ`#}mk zV+sRNzF~O(Okz-BPEbj4a7k%sQF%yKL9u@d8UnxU==b#O;~zI1sbsJ7s~RXMnyAsp z+JU9t3n_l%RCxPSG})Oq(w)lNohj&|IQ%1L@I?my8zSYm8FTM`E^u(mk@=ac?u0k7 z!<#?o;VnH}4%#`1YMBjhoDSXHNSJ6@wM+3Mmu#7D1ri@il)m}gd*SiO(f`3~@B2=kcb$DMada?X6)?`O-Iqi)NUvjB7uNF}@usYgILmRvkJz znuuu^L-m1jj@B}__B|(Bcfk1W`Mr=zU{RME(;=8V2qyG^tkaAh;gvtdGdPI(iTJ^*#!yf8tki zcVcq-#2R{o+5ZfK?-kJ?E1rBS-iilo6%Rj_NxeX;e8o$-rs-ZAV<#M~gtiFg^oSH! zt@!S)$B(sd(4`-rkLvuC*!eWB(S2kanIYL!IlDr zxA^mAobzO#X5Dejx$jbLCscfe%J&-B=L(g_bvoZ`!k*WtOKuDKUKjJT=kd5M>|^Uv zeE&tJ+^<~K51A70OQqlCs9(Y9-ON+9$dUO~X7kwN!3$rjrzN)@`#*Z+_0Xln`c=NI zW2WLylGKNOhKH4$={4NQ4k64Ct6iGrIh6T1jJ`z&HIclYc(NASN1M?L#{Z9{vy5u9 zdHz0DTneRlTZ+3o1a}DT?(SCH-Q6XDK!A_{0TMzWxECm;xE2a+k-E{l^5p;Ych2|h z+3Q7KKJs{(Y5wL;+oXG`>d+Utk2kpzVYz*x`5+ z`h=5UQkv~*X1(%8{hSdaywMp9;xD}Ar!|I{h}p<3o|h2+mt+S@|B{_S%>;f1Z?#f^m}wYe4W^wOcm=8-DsSTQKQ z7m?qMF6ct%^mq5Cx7(LjnWn`1Yj}D)*}Lg^Q;7kocod|RRaMm_B>>9OT*{K1suY4c zLJA5}YQnmdBK8b|p^hZ};lwR5523+#;nw&2Z0-#>g$!`HLj;q@*~1`G8i&;I$w$aN z2FxxCX7y#?uMg=I2WNKKhq`6!fsGsC8M8s<#~zJ4`U5}c{$MFSKsUJEvXb$|mt_kQ z2(zu|M=Y3z>;iIJ{IL%H7%N7c4O6xOTc$o~mH` zkwU&oqvy^hYnEAYD-Q?o|8#+QWZ_=ha|*jIDNUqs!l0mTkCN>XsY9EH`UqDvmL(QW zZMc00*hObFrC`4x={iqhK1pQsRmy?w*$Bm4f%1nccE) zH*JZ%D3x9=CcYhg|G;@tEevkWlpZIO7A>C|OPgnMcgWmt$iW|PYb7>}Y3BEf4cIe{ zIr?oMjyT@KI?}g0Gq-axqb=?X*suDC4lQBmfN)L+cd4)_YjMBJGU9S zk11RCNTC}v-3LUrdnD#i1(#lF?Lh&9{>R~cTyaQy&td#~Vi_=E5s2jTYLho@Q+5F< zI)ec^-7+RndE<35*I9DJ#NF0eHq4AdNF|i%wzQ#CPEcCACCY34P`R)?|m(+Zx?nJJt z1pbzHynon--^JpE^C9k#%D1A4f<%$ 
zIkgA`^^_m$@k_`nOGrFc7FD|Qm|sCmj!;y9ijRv%!kJmk=z)wnr9e>pgI-qO_D6|O zjua5B$-1)Ru{dHVjH`}u7l)6$g$e8Dnj-9Br{4Ed4ZKkT$(#0vpWoCW2O5y7A? zQFgpsp(aT~3URjnqhi~jJR9#4Tkm39pIjT(0lY)BXBoEV=;HA_77N^wjoK9O+?9E< zD<3&cpxZ?s4PuUfQK>&Cu$W-zS!L>e$~3shIJC{!z0CmIr0dzF18-4-4k)|ZiLKid zoLZF~TV=I-q)fU6!w31|2dsj|Yy!s}LdKlEVB*nj%DO#rT5w4N7QvU^k%Twy! z+Xn+P_b@-$Q7iXB9RNMLoipbG{2hZSww0TiR|WIu#WT#fImUs-hq&hoZIHX#a0=jt zvU4w)^{hAyK?!VS(tzHN>t>AKqK-Ty3Yj1dTT%>q%~knuA%l21oq28a?h;D$%K-ht z5dA#*{xp_pVi+&>NzB8U%zqr=^-Vnu&Am-ch~{QwTG>!$G% zJ@ARAwsWLqh)!~Rbik9lmKFejih`F6puCcTtei9tK=iQ&hp@UYt8ToXg9e=ulH6pFLSsPHa!R~yN?>S80R8oO z`opU?e8WptV@rG$AMP&UUfc`B1v8_gXmEk|@}r5Gv=r;KNz!BPA>9KYcZ?8f*2DMX zh8{!@xCQt5ut6XCY%}^FGK8$&N&Kv+_gyV^fhgsJg7!;(Yo`77`_H>xae|+bYHX7k z&qzWiM0%%$y5~i^4kQula^M$A9UCh4J8DjQgbq+bOR%ytLS7FctKBPY+9P5B5;cJE zhM^w&^f|hXJG!I!lfjav?aG=63Bw-ICvd?qh#YX8*yTTt(T@+YzfUoXbT)g0T0h^9 z8n)~Rr$3j0Pn}G^{`T%8#@9-KIW=41bL<@uy+Aa(CA4)b%$W$LoG@z)%&{R=) zlrLz*kIDTe=)yK+{l1AbKAKHuog8JK%wU;Izq>fXJ~hU?GR8JB%sP$56U2*S?tgg! z{i#iO^|iG0ru;z*1F7SCLCm z3816OtD`JzLT#x>ETAo|DkZBTp(Z8AtH`S+1pvqZlq8gt6#%k)l8W+Dv|=VyR>tC- zdaO3aWDfQermhJ|!M1_n`b@zy5?xdGKpUsnciY*IE?0T7o^oa1o-JjLdAMaJ6OQ__ z^#`)l=qQwF(^MwXpxL5(6)iTUI-ag)&K!F^BBk^P8s$UZe2) zNF4ErDCwoNCNsAa%N~e)c7Wt5nsjNL;8(_yl=HNV%alaNia2~itanBPaUcuYQgPiP za9$M{ddi2M66)NQ1|3j94#*tV)vUJUje2QJ;It+Pp}02g@F6E}taV8L@A?Hz#T zzN${Ij1EXzw_Dy6?_@u44!oVupOSPuQnp#)L8!Oy+72HGVD@OMC*FR4|MV#R<5uw! 
zKW>(LXiE*W2WXrSF8_WoIwf1VBVN2l<~u+auu2fUOc*mo5xq+pb0Zz{O`!bFapA)` zEd3(x;at{(*aw3kM>Tl~)+3tQ@onwv_RoBt1GbMruYIUeOe zV|G@JwaJ{U*|Z(+X!#HuIFlQDC8b1h zxRyK?DB~3zyxd1Koqgtbcky}xi+*TR6wKDj5?C0>S_UL+PP$tV9Nb2y`AouYLm2#A zu>19|(_^l_PEs2Pl{ScR4y0OV1iBVP;73xrv?2^rDE-Noy()@3md@K`djI{)TIJeC?&pnyIZo`6Ox1*7#e`7V$Gy>g zvAhG`{CU-;C3&Akx`1WkC$r=+8xmeuvO(lqxp$^BIcBh|li3gF@a!;;=J6d7bv6&C3FMj9u^+ZE+$xg|W* zuqPE(ClD}E;4xL>*CG@#AU3xr6E~LNRuNE^lUIKNa;<1sO zxB-d1D~Y{}jHm{woy%Q~02Vtd5*vRGLpKVokl5rnCWi{DfeP};jGOb*->Zf2yU}n~ z-)?t~Y%Q{CjXU{%Zf*B8yUA>}1-d%bTws!|hg|4S+XcIV-Ccy{t@PRl1R5XIbl>kd zk*xK|{T^2RJkEbp;8c1VmHl66>vmwzdI)4U5VVal~>uo7JED8=B%7G7*ohFrCcT{W+04_&r4o3i|=VUf7Xf$D> zu^^%34*rN%`h*t*MwEe97ip>% zi0h`VKE7Rkp8MZZ*gSXUjA+5a#^FO2Ybt=1z>7?#$Zu%7WTd_EwL8LHD(T$?StU zG~DiM`p_kYdj({=HaGLO*9wjH%Rjck+=(815L;qGo})+96X@4ZWU`>v_Dl-;oUfus zsrpzrXLS#Kvo|!aR5?f+zIeyz+i{gfA^r>0pOmhJ=+Xa-(mz^eEG8o76JQ^r5!a#6 z^C0k5VC!W(=sI}(rw!`&xBc>MK)|N7$C0Yfu9E)&!0S-e^H9QLTFrA;#krHxq>IA* zSi1d4x@`jBdaU5Mr)+nq?7U5C{fy9Toz!OIj?p%i4*Y%!geeKa6pJ7=?IE*7kmzsS z0q!bW&PqG}->+0$=Vc-P9gH!5ZYI6>O4|7MLEUfi)4A&z_}iJ@ch9j~qS-G*)81|j zZHi~Di(n>I^QRO_j{%i0MT;&Ux2(&A-U>8*dX_^yS-^CFV%Q(Nw>!jqm`8q;%XFN3 zZ$JO;-rzsc?-$_pk7HLu11zhmv$npgrLL#3zPG6XzuSQ~Hn!(fTf}9@n1^_(J22^4 zl8G4O>EMl#K+K-R!kNU{nMgoO;jx~qf|Q7qu&|VXsH~WTs>&U4penDXnusZhm<_S0 z1);e!rH&^9Fpx^uO-0m*QpevdBI%K-52al&vu@ITqjD&s#l26}U)ca>WU@ME>VB z{N+jK>lnmU0{Gek^Vtpa!5jBIvHdy>b{+|P9o~5!)AJ?<{yL`XrDo=w*ni!(FUp_s zdF&~9Z1dJ{aaK#WU&W!1o2>IO3^dQ(eXiwbqUBLba#+4L;%#z-V&tR`p-5H(P;bRn^HB@onB<2IG$ zQ<74V6_?-(Tu=oRWnzBfGx4TRhU7Hskc(WWkh@z4i4eblb2 zJcuKq^vTDSOA0Oh67FNvAsuX*XgZ^7k*<*T>v-^+7}!a8=gAYqh3Ck1Ea+7V_}pjg zTT<(rh@Ll5omb)Tw-Ifx%uA-F%eJ{Mp#KS#?ns5q%6jYrd|rs>Uv6g}@m3!5)g1Ge zZi{9=6Bs|_&L5!ho=|Ie##8oGu3<{ebCo!FkT7IN%4t`{Wu4Gu{|@juks+K)55t~< zW=-lRj$5LQSfUPDqzax@aykGsZ1PoK3FKZJq<#DXVXvNiP(4jt^ZRb?_p>S3_0G`c z#@O}dz^A8sB!8d2e%imNmc1vJdaPQ%tJ<>9hIGm%EN*CJAL<$2ksvAF(3Bd z+byC$9>B+v?1uwv2Ysv?{dlo$Ao2Rgu{uAuCNHnLHm9a0v#Kt;x1|=@*ih2g)LfVu zn~)r79bhdL??!t3%E&gcZtbhM$kadmOd(sv2mV 
zO23oU_`OI&0QAhS^<6aLBoTHI)pe56aTVTvk^;T(8T)MCv>u9__C)`&$(z$_SeL$P zQ+m}-GL!vEuw_psXj(epsdVU^lK0z+ED2aXSV6b__dN9%ueDAzk^7;LZBJV#a|? z>M6c<8h7Bnpx1lF>R)Tw%tz>Z>m&5XD28X~2hRrC@!CMKZ{R1$2hRsNHqiKy{Sb-Q zKaN#tG*)>&Z)kRIE#f?o_wH*~LP5v<%n!e#|I?m)$1|(88#1iI&;?`;c zI%-0i08RrrC24t8RT))PX*E?fWhqq!IRGafqNb9VHi?Z3k&{2Ma}bk89HpKwp}iLk zFyWqI60J^XfKLgqxzaA9Evly3BRM8HC@;#RA;LX1A+V6$)m2qmT0};Il3Sh8!iK@t zg)aUHQA|2DFvH&~z`@^Jk0Vr@-sU-9=Ye4FENvK;G;xw5#ii$?8}?gr*I7#ES@`Hp z=-}UouG2WgD?HPX{y(OrZ6>8{Qs=$;mk`;T-N(1xf7i=C?-XA0H0;Q^@2k|VNV?6a z)E@BHY>5}$&Ue1sMPBcswnfJOn?%g2RiTLd-t!dx*-YQ(s-BbfKodk`nKdUQ9TrK< z7pMZKWxN;2Vg?^2q1m-Z?wbyh#V%8ZEmK6V()sPlx?emmxe+h8-b{V?ep0aI6Ycp^ zfu?WN|80|fI{v-Y{bd?L|9eHI{E|2C+voZ5H_Ig#(pkG=p{LxTzn0S}=Ze`5FzkmT z_m5HRM<|xVF1Ew2)4i_qV>rj*(8GQ7qrKsmdz~-$yU%w2R}Zs|qwHmNifj1}gIFNO8$Zb4jUj%c%im zB~)c3l<>j6q@1W8g^(7xki#8ITMBJ2Ds6Xqqj-<_G|zzktFpI#cuJ}F4FnU$a_ps1S}E7 zEpliLFd6o97%tsUoV}lfVUL((PT752{YtKJPd10*=MHbnj~{2d_dk4Q__0L)^C|7s z(f30r>AOw&=2v|ApYCk;(ykS;0h&2QLhnk{-mVqA+wQ;EY3DeG;_=~ivDbON-+6l6 z{ptvDv=2SrgB-u=!0T|o`{3Yzbvv{3OVaa8Q}fF6QXBg63WqaGGn*UpDj_A+otZ7* zxTI_m4M#P3GX)7tB`I@K86y=LbwVjqMMVQS6)7oIX%$Xcz+)*1WeGVET{#g=c`+AC zTU!F>URP0^})-}%h#v-e>QL| zQcc|ArT3)EXjGd&ESA6A>G(feJL)(&hT_Nb-QLR=U1vw|!-KZnlaAxl*5iw|;}g)% zS;x*v`@ePd?0Dz;)gje~mm>E$M7UMth-p@ypJTK8imQ4Xn#=svE!^(VKcMoFkg=eV z0+Pr%D99V%k<_H&mLyfsSK$XLs^~~PHd5p_AraG65O4s9=#iM25!n0P2PQEaCvv2e zB$ah{wax~)H(Ntz+qd^^F^gkIFJ7N~Oy7R*lm;{WPtR<6dg zp4~RD{4T_PPaFG@Hu^Ge^lk3=+p@XO`Ey_L$8K6$|MZqE-|P+%PNosh7yg_qe5Fvf z&sDR-UB0DO_d+>yPrCSQ8uw~){LOmr?ajgMn*+M@nI9jYeLIFf!c=mT=i|Wn6pPkMh$6c2fTPN@LzwINbrmE?svKcK3Sb){vC$j(R zD?aJ!JcYtuLRv3C;FnD&&t$1T}0aR7?p}3<;!l zs3o*$r8JbJj7a58$aqYN%w0$=T-C&MNG)s#T>@!zeW^4(SmH`jn!9@8M=|N3)~y$P z3m2e{y@JK7iVs&cKh7MAI$R3i9EJhZycz%rS#n1$dObHbpoh4#9i@dWnPx0`yf2kr z{F8(bFN-?T_)zA6NPbTTHb1l21nW>AbE`;4`zRa!&_HRE!WVh7RTG~w(U*B6*QHZe zbu*`Bb6?BnUf0ciNZmTC-22t@d-@x`OV)TT-EzU-L)@Q(0mg+iKGiBd)m$Fgv~|V& 
z!$thP>)x;LUod_8PWNYm_U{D6%SqCkgL9dT!^hQ|0xhRztK4N{w&k3a0NRph->pXq~Wk^ygV(Fr=n95~F#BJFR*WrA|s%Y+i&(&Xa_EYW*b22#cQH(iFwP*UH+p{j*#7-SiI?S>d&$Gu&usxZ781_ualXMpS_G)J)=k(3Z)bD4#Kc26?JDL7+wEA})^L-FP zs#X3#uYuS!l|`fIk#YHXU)f7k#VxYpzmd8h9UZU1y>DPWpSoagn?P?skndga=Pm8m zb)8pry_*g2t=jIb+Rlxt|JBjeF_zJl6V@>hx0Dx_Ca07nm$4PQYir@HZa_*UQ)(p(c`Z9h9V>QzO=Sa1SuJlBDGO40y*t+a ztfq-fz!X|NUxw%~-Sjc=_D0_1Pte|a|MG?U$S?|@*tU3+vw0KV(5N76K&9iy?r6{8 z;vAh6OJ?LH!=rh})}K*3DJUtLPBSveF#Cyi9(8Oyk!Dz!X=Z?JqlZHayKVM8{X7zr zn1{N##KvKdTT535IyZ}lzP&%{p8DH0bJ4MV)AjTnV(kt3#Z|%S{NKjG-OTOpg^1JKZr9|eEBGIpEUi_E8`|`u_?e)QR_TsA_PcQIc-{+O9{9kXrEdSj> z+^nE4qzVuDs{T9f-{GqJGMV;%wWy!MaE2jzL)82Ia^ap@%jCTj7=?3Jj2V_%eVSDt zM{XqlAWDoU;qN+hC2f5r?d$f>4>u=!f3{I4uXo=!uKhu_ejaRPwobpJRrtuD_U%yF z%|P?{VBJnv^_z~mkKm4vJ)mDbU7squzLXFBS26Ihv+X+s@&O9E>HvL(b-#haFTt(f zd!XNXJN~V!+9qOlCY*Mb{Bjhs9P|f*s-!$(Ojgc*E_U>~wvV8Fis=`7pT9i&Q*!z( zw{E_;daA1B;`g#X1*4LsnT)0rg}en7z*tS)UP8-?SXQ4*!cx{D&^sZYBmBu-{lrJf zg(i9Z;O*ym6F)&a&qkl$gyln^J=3{MZ|zds813!D6Wv&C_2^yI8J!%HB1%4U@=*wwW( zf_z;+RoyoG?(dg3OIQ1E{%o&(9DMeDc;S8C+pi6uUwp3UoUI(6L=H`2K(ncsqny6A zhV7aBor%o7zePur`O_=Emv%pXf10_tUG?Q%;kUio(|6h5-rcT#eS7e6q3!x<|CM~+ z&$&K|&7QY1xo_q2)|kEVM~IX3$y-7-JQ8J6+=-TC&K8uedenM~jHXKTA@U4<^7K#G zWvZY5nVv29aQJ0n`)2p!>e%OJxRdtbFT<5|Bh4%(xeO+C=jfITRQdKu!}@Uj#(3j? 
zYwOA|c($iw3Nie*dUUE{{8!n~&q~aXD(KJuuOFSzuW-;`2>3@^_gOdeA4kXk579!D zSxR0_N)f=##pcXQ@4|D}iJL>+`T=ZNx&|?ucQpIq=J{>v$*+d?$;!&FwY|S{_94$7 zG6g3S$mvt6m?&6Vkx01^O6#c@`3TrXhqz~QMCG$Y844a?CqCO6P6N`9Ng#;mGMNY z%G15fvJ&dpKMF*Sn1kUl#<_u3Eu4tn`_3&4)){Jj?Z~3#F38mR?Cs6!_0P49*)M$g->Mm9A{v@)Gz<7I5<1oEP3(Y{)Y*yKL=Ic=IUR~ww$k*;>*%ay#F{&^&IkNy_evq?Myn8>(Bn}9QylI(UD@M1h+ev zP^q9uoq({rBu|Jjg`Q=YGmu(KhDk@|ei-2X6Ml9tkq7So!7%F;8}Cj#Kdp{_c{=(X ziKZK`B3EzxfNf?3=Cc{sp7+(g>~A}dAbMqJ2T?7o>0qnQndh1Ek# zVR|#cyC%lG@Y1s0UDZQj16&ptd8yiA# z*E=RIGOi(($t8)EHJ%BD-U%5|Ag>x4fACJDC?3Mm`k@@sj#@wgTgKyZWFUPOf2e!^}>u>v;Ptnk; zh04E|e`ZgAEPXg#e}8&Fw1Z)iPdoa1K)H(|JQ|=|$m0Ar@OPhg7!@jioK#S#A}Dd0Dj>93ZFz8Tyn$O3zn-(Ft#7clCrw-sxs8W} zPc%=sKATnGN6pM%LwzUxeXj?Lu5h{M1Nm=ob@vRi-VJ46nq;2#;O#N>uqS&TgZp~8 zBPU2=C%~p{<}OK1%O}7gEX^XOD2x9%C{Cs(M)H`DdE@PWzgO1l*)At3J}%t7dG?Q^ zxt546P)tLX*Ho70u^53Ax0oO1;pgjv^V+S| ztf8Zv;ZBV4Ardv!)c&`$^7Hu-cJ3sB!y@QOp`~xOW-UUi8N?Q!9o_=N?Y}oE2gf&{ zt%rN;$9t0<)7>Dc<_#r8(cuq#%vgPmsrk%FynyuHKq|*b(ZD3H$R?Ng7EfRiy(U6T z?qKfcmHbk(D!4{;2%Ag6DJK6a8!JJ!`LrE-r_jUmU(a zd$;hlx4QlzwVIDk=p%zCgaL*wA^A1|VYlDT`bIF23>3IvC>4wyoj4ia`V60cJ#z3F zyZ@_V`ZE2~R>jIq-~7kC-7f{JpT>6ou3cU~Putn~Z*IBx=gyzi*YBDM7e?NFpOrzC z&*ots*yNd_hMOV79=T0Up_z96apJl*j?SSDjy}4sp@u$*#CBGI`vxS|jaQ@Tr$YsA z`%A8o#W!PB@3D1Pn7TKpx=W+%bECQwOyeoj6cTM~`rMNFU;O|+d)7<9fk?x2_jzJeJN=B2LN_=Zk7X7eTyTTAio)sva zG%+G2iYPjgI3hmIpY+}nHcAGaCyx@`{fr@QTEp==Xm))tJ?F;Br%Af6^>POVr!On` zPrhQ0KA;aiRt>K;j&5ZvZ=@lo!JC^ax2K1{KFpt8Zv6gvvNwfq(^t^8Md)S4rXM73 z8Ezhu@B5_Mz&;v|MfHvikD;d1huec1+DmIjz-#BA?Qe+X%aOgGoonyWYhQczFOf&z zkcZ#%m%fxNe_Fb{UU_phb-U2nb>7kNY~j~uPDn=4#3f=BtBV=4=_3u_t) z$*yxU@wGFKv(=7tu!>dpi%s@OF^Y(x2sIQpM?A-j%y*X@cIM4?=5NCDjt4S#kPWYg zQ%{UiPj#|(bqZc7wLT(w3zQczRdF@Z1?p)=>pB4SLNv7WjSci7=@i%`Sry48M442n zgcYcGIEe&QNX2+*`Gm=NMHK&Wbab?^x3@3@3hDt=v=tufr~$N9xOG*zjQ|2h8ghC7 zg^@``EP#JRmHYWLfBx&~^Ys1m%FWZt$%;1F(oyM+64Z;F;XN#}yiK`m%rMQ`uL2{r z@Zj76TM`mh;g|vKLhhKb|->!TZsjq3-<_@~2cS$qg1jhNXD83pBa-0IuMS8kf0Ucpx0fp$*DcmAS}zYiVUwr%`EZT>>7e+ONhb-(?L 
z-Z{xScwK$+HT~NsEC!smHJ|-^!l)oyt7~}t@M}_OTT~<1CaK;p#WFC(UdKAgQ8&rO zGs(|0G9l5=%+O3$w;R=ihBRljHr z+|+HJ*Y!crxU@d;an${P^om=^U0gb@kK||1SLj=Kknj@QFYd;Lr(}jy zcVEwLzn$1$Ir}z!_6v9XY3$io$i^-D@O#zioBG$6nR8e9$0u2zfByXQ;qS~%!?#J~ z)#>TnvKJN}RN4VwyQ=J;^vlH`EjqZx*$3*FXxpS1xO#heg@?LD=$q0&<1L-=HvPhk^8CEm=#u&n&e*m!$q63i0*lk_ z*3hzzjqud5HT7^*4iJ}Pdo0bwEl4f0!fHI{Dm&?@zv8U0o=Qg)~b=XY?Me{tLpc< zm0!Eha<=zBPKq^Pc~bivAz1A(hhXbuy6QvF-RZ-VoI>sWaC7__Z^XcN-H}4J)&$f0B@JZEK&Fv+yuG~skQbMk_ zLzt*jloT*rRyRh=J6+y9)FdIR4?6%F9Yc}_j7Pau=Nf13dxIztJX2?F*EM1_o*(3Z!CAKZHOszu!KeWrdu1r znCWgPiO*iWe6T(%zc!CA#A65CRO_QqU5$`ajA?w~P}fkCBxUvPVgGc~fZbT5{0Y z3*W&#e^~wZX+^8RbeTvGVoib@}E=C2R?jIn>=<{dDo` z=34I1WK$C)tsdGlh{KFc^up0)Of zx3i{qe_K|69WDH*UYmY1H&^-nTiK_l9vKx?V@FlP&r0F&gz6kk&l+QoazkH#?U<0{ znBdrua4qdYEuB8Q2zanZOL=0whV_~x|Bj)>596Pzp6DF}9#OhEhd6 zmjR>I`ax==BXZa_os4$%vYz2Igwo%G)$DHh_iIYkb6jai;i1rEV{*=PbBC8GTuqPG zuYEer`|tSrYIjfS2V4HS3^#EPwYckmDcIz?0b=!6KQ8i4pVjoAR(GWjNDml!7q>a3 z_nPJO`rmHz%g?<8Reo-N`{fe^yHJiDENyno&$o>zc512}EX0}^AN45%rHL>;}qYK@+!LsAGUxL?u00e zuX)KRNJT}NS4Ej0pv-Sz8W*ffkf=8?l&PW-zK%j14uanlH5C5Y%YOI${nx3i-#ckn zhlui5=!U-qzxUBCXIk(t59il8)pkr>v98wBCO|7~{ot_hke9#MPkyp~eMNusgW)F| z;q5ueEgRwWcgE>$Sqwy@wLbs}cECZju}4~1h$9Ma)j1T@HXhP75I6+4&~LTY>d`Z9 z)7Aq41M*@*N}^nc<3kIA1F}7x2a*)F92Ga5wYHtKdvuMvf&VzRfHXnl@@*K|HjG~D zm`(>oy?5An2oG*W8TV8bhm`CaRO@>xy}B<38aHZf(}jcNTcPF{5FoQ~EU$aAwskgl zSN3L)=j#kt-K5CPoM4O6eS{UWnRkq(draEU<9wq>St#e+r@S*C&($t6$8*17T7C;| z76Q1RzWcR#n^E^VyZvub%g4%Rm*xHWXufVUii`p0RD{)#t51em;IIu)#wd6;uY4=D zV-C?|pIeaV6JccOrWYFH=-=X4(;Qlqo#GiC5gVTppOzGvh(-*)?JiL8(g&2e@p*=8 z6qg#Mgu0uA(?}RSG^^{U@<5h_lq_Vv0i48G9S+oJ88ep)_2n;a(Ocl4ThV53s z^{OuH3M04W+uP;fZHBF_a%BUi_;_Nx1eaSm*RUt^dX@9zBIBPI{WY#c`PtfovXKKvXJl$60VgzNzFN79c&r6?2b zKT4n_jG?|`M9zp#Oc>O1ZF6)KkQ9*Q;-pdLj=xJD8cw30Kl)I| z%N-ofLk@Oobv{#?`0{A!mSpCF@W*qi1}s0^fT3TA2Bi^Jul-=_F*AZQVt_5dDV!DI zMcwO3!z=5g#^#If{!rEp%;_5Zo!Rg;zwKL2`&A3FwEeys8wf_yaMoGHw7 z9f`@UAwgy&oIq+W?ML_ZNW9}Ic%w;J;%V>eF>)JmxLI?!YuY93AGNv;?~LFq>&ua!(;;?5pG 
ztEL?UFi#?|4zDEw^_FJn4_>Y*t`bhnR$PU8M@JOtlF_P&k3CA2&r}K zud*Ghwi>Ll9&U6&H=7|FtVc^-u;mtG70!6!$}IZJUAs#H+A}rHhme9;tM4?F~ z%DXi)x+5#NVLTa85Hi=KInv+?$#6pzSv_UoZRWI?|Ro#osivy)XU(m zk9p7&*Snzz--uqfc#wzDP)rIqS$jy+reS0>nS_-{Ku&>&S%vGNi>JR!l)G!}LnA*{ z%>X7%ue+K7EG}O6^y2C4lbBs1Nv)!3>^)hu-D&MSXq-JMSz_sZlW26D-1K5tVERc= zkjq%N3$EV{QVs5TjTmZ%A*+G725V@#9jwr;J;#2m#RXGsw|J?8F1JKC+u&-Q+jD~< zH9tMYOoioWsTnlScCgZRtjS>%-|4>C6j7vKJ>&+H%c(wHzfrgBSdo)o;QbPx7!!RRUyY6s@4z5#YyjG(P zuGi6{*^K~#x{OyCg$T5F-J7SC97`yUHZ`5=E2u#gV(+7`E6#G-Mu4@ z@z6*Qw|iWHY$}HAiX#CbVGk7p3CZFZWXZU>*?GA+R0ZzKi?gV3vv6@U@bd)8$+!S0 zLTM;d2mr~{YOS$hsp?_4jnf~~Tfxz?D!{hvJ|I<>%P2RKT%XGj;( zXA2&%0wJy2`fb_@tl)(X9mVFI1&-iSGf=rDtiZM>+Zj}$-vQUfkG*{^@E#XXu@eMg zHww{MIx<~76h*Z;v|@C@VBK+$Mk~Y=29Gn=j&agt@6h+`FlFy_4Yb!}GfW^z)|c(> z&>hBD_xD3*b$ddJ8xzacLu?GWLRDQ0S9A6iA2|6h90}vj(!tO z`+(fMc4)a1r~=x1)i+peiK~P5=J}T)@$015aJ}n7jrQRyqr)@9xih2TN(W?x&RD5- zYf%EW+HvaIs;@;GQ)}H@90cC3$oO2TWv3_~IKG{9Kg{Y8@jL4CC zov{X|`E&L88mFaK2LIMo7(%Zj#~hAyhLzg&4!R=p%t3k1hNuUpT|(MR8atFbq(2j|~6=3bZ&D{mRUY#6D`Lo|)s zcUSt3`5Lw+a;b;XX+BU=r=j+h67c8Y^pFz>BPI8czV9jX$ctRUjO2mk9gcW98FwkA zU{yLp65<%*yYaN5W(4w9{`XU@EHaF33iWOKBF*wm94hT>8Z8Y42Gl13{<<0=7&CP!$V^+$qO;U)>f=gRx%J9hW8@8RxWT47 z+oxh62woK0p6~YjR+j9L>DwX0{vVOaH+q<6js9vqP)^i%v(eEXE=-j}SB?p~&JdCx z)iN4aKNi!T9&ewSXjl#O9Mg}4`-Gx=iU-}x&~Dv%G1v=-iF3!9*E&z%+x_F%UGCCT zY~E97){|?|mSY1hutpR*_NIE`X?En}o>X60mLIs%1yp7W?KA8dGb|pCLE$11 z195P)rV~uF4QT`^vFk$n3>Ja2Gl$BuipooR>&ER+m0shuQCY6qHo@c$A(RG8WZD!| zZVFrpWE2TxbP;4U(Zu+PL<9jv5+Qjk6TsuHQaB-b3>lT@eQxe3K?^#0I|@3CeM;Uq zB5$;#(yFG!qOr*kUZ>t&3G1q9N7un%S;K?XLm(Hggt$OYDgY^+oFvg*X+8V6abx{f z9ZiI5Vz(CFK>$Yw^k6u{w*N=dS-3U*e*b^Oh|$f)sIh=igN+SXPy&jybc%F$m)+gI zZB$TE1XL77-549)Y;<>wF3BI?%g^t+&h`2S9@q1}pXa>JId_@)%P&VNtmEI9XP3B# z{x(ZJswmT-8mI4*@cQTBqHn&1AF^M3%y{wA#oi(DbD=r?FEQ_|=hp{LpYJ=B+;V;W z#Pi30j(<*i(K)tTR5Y+pY_5seI^?3W;AS}Fc=&cUjg z{i_SPA-@yDzBrsXY2|h&?!-@rsBcFDzaI|!<`h((aQLPTdCawH+^uH9k@@iOKaQWS zxqrXwQ+D6y!i&Biu7w?(;lH08F1+WPe>UXp<&c7N;RWZ9l|MW3=hoq(>t4Sfy1%*Tn19c^ 
z=v2hZt1kJM9p9gd_>{UUTMr2sbkgcu`8mKeyMi)1Rx?Z8EeR= zfy?p41=*XYD%d1)`=@Z-Gd*;rykPXhsiD#vv-wv~m3%n&rtprGJ2kCFdk_;!8aYlO8iWog&7Zr<0rzflhMTI$^SSW9Mqpq zy1cyM^yp03%QJ2VA}TuV`RSBr-eteM(;oLe$G$)9^XJjAPnQDTTzGTyOVGpe#M>p& zFHbr@ycGTJlK;bR*>DIp-H={R-~;+&UZm z=DP2zi^txc^mudC@5@Qgw^zMipZ6}f=u`IW^W!_`A6>e7_u<)-*WUekXLCE&wz$Z= z*d^_!XJoOXRJo(9zMz78IN&Il$3`A+$3;%%BP0nOByAMAP$&IRcYlC{hbYLMTN2F& zKbk6Ok)VhZlRlQp6$ljmemo=3^T;>zsKO&LJ~_FmqTFKAB3vW_UR?#Q4nt8;?skL| zUbsL5cY>uf)PPi6uWz6uv$@v8p@7RO=Z!GO$ZL;P_?U{ntasjwQ-? zBZ0RaD20$6)0J#9gtwTm<^ReP22zPL0p$K%yL(3uJr`;w@AWy~dtYMSo$@X` z>HqQSi-Obnf6nKXo+|ix^7WTfZy#U&c6;N&o#pd)o>n|P_v6Cn4<&gH1*P5%n3!_Y ztP-c}(nRQ=a5=Lae!FB5i&Qa>a4t)bzLy}@G*QHW3t|EYv&sOPr*Rnz$rgvmmIg`V zl|)RWMUF%Wd8a9%NV;g8Iv-jYq$DM%EKSsgQDCwpl%}Dcx)Di3TT2zCj)2Ofm;LZ~ z`@`iy{>R%tz3#s$EXngOdg1*g&+GD|v_ePg60Z{IXV>ed22-JGOUfDx0d_a6d@GI~ zE=;bx5dQ2>z`#na(9y<`rtCKG8u7YZe03t>B^G_mzNgr%rD&ODfs!{)5~{G{&;x(%hM9uJX`Yvj-yKYG}X)( zRsjU4EXc@IPf_+*D9IrM;2MNEnj=O5K&+#opQDwI#2$PlKUyN*#)0*u1=I!8;Q~Ac z@KmIz0GBdeKuv?Bt4!2TBEw+^38*?67aO1&5h*7lA!%)H*4bS4`u*EirH+r@I^WFm zc<{=#__fFV7p~uad0cpK;&VYtzHgSJ4eWg8F;_b!f-={jL2w^Zwu{US=ZT}Ba(|em zz3|Pv6Y2cwwbk;Tin0n1R9kp%+obxITjjJ_&7?K+j(PbVi}p$HWu~6ozC5T2xIQ5} z0#Vbhi^p{Z8g!)}%?ojOo91vo+4;2fJ{&FF$`4x!+#fWS*cVrsi%;^?!NV@x?W#qEoR&m)&2VO}z6bu;`@Y z+b^+&Hv${x%!_W?ym;dN<9y(aud%<+`sJPSeSamm{6azT!*}P3E}VII?#z?ZXK$Z5 zbMNevb5|EP7WTK!tz3LQdgnp!f%*7)>B{5M=T|R({Pp-kc@Vu1U+xw4#yqCb0U>P^ zj{zz>+yDf z%q(0o!ct@;68Ob5G&LoqP56ZPjYuA9=;&X?Pu{$KJ=Rq|-(Rs-&0H8Tt!7d_lzP7Y z;*p={So|aM{LS271sNaSnwGiQ1=?bMdBwoRGS*cEyA1eR47p(nVg|AThdiTgkA>ep zaWwy}>2@V@g{eC*YFc;GX?7Rizpv3e>U$6$ZzPo8%QgNlOR+X=V@7&RQAKf0$e=8xMObGutzB(KC{qfP)7mj|N@}Q4m^3S@K-u3=A;ZgC#`^#O2 zuMh0M+;{l)$o|!h_?Oq>=^RHYN4w;y-@A*}FVEPO+%kJsk@n}gdFf5-ci$rKe>u@~ z%d}_O`2FK!Wj8`kzq-nvw*K|hxA?6vQ{kdm z;Uo9)kXli^dcKX{$HPv4d?J2(hv!+jeM+*t73WZ`o7GDeXh9_p5QF+^eR{t5^w+yl z>E=~bGdKxqLUc1bWXf!|DXDOJm+#r$SGqCxd7`Uu_+4RtgBh)dP*8a6L!L|C4~Mds z$%Vh&^FEusbjf|~VQ-a;Lh&YXYvzt9h(Y+1wMBD{) 
zZz_00fiMsW;>axsqxT04Mv&Mc4a}UZ$ru>3Ac17-*Z0bG`1UZS6piEb<+(-WB&K~8U9*k} zEWdr~-0Q2Su9rOjcKvqQwea&5p%XOS!QatEhf{x9y8L!M(cqELqLo2IWz=gV48T&l zjRIfU$8}-imz;7 z{;r~j=Sfo?@U6IQ~lK2QAWaO4OLP{zK!JS~JEG#ENG6Dna;=nI*?e7E}W7hlC(%swtb8KHaEQ92l z(w35BqW8Li%IgX`?NJaq&oG3%4AQINrmXRpimk_4RH(Xtl2h-A6q!*Tl|?mbwo#3` zy8)V&yzQ+98Uq5VC3+pzKvb(9`4xpas86ir*J=j3G;yiZ02&)^oVb? zi*aa#)-~O#r*5BbS{IjC3nP@ZxfeA^LlEK;7Xr9PM^;W+{dwluxKJ`tud!P_pv0O) z>JJ;y`)q8Pzsh<(_AKO2ZkMs=Yncr#9lJg2)AOsnJDsI#UBx4vcFi@qy#^j#ByWa# zx~^_cBV4vgMS@`%#>VokVgdUCiemt|9RbB*s0c@ne_L5>LkTo5$2-CcoKX^+P!idI zqF128bHZGG+ZvO5u;$LD!I9Dak^YHor1+Y&^cLUTp0MZ+XmQ72ZVS7%iIts`kPsIi zn^fByp-fI1j*m&scB4i(_>oz{s5Du8SCmf(F3ukZ6XsLGWKR=qcO{jCRibqjV=&@A z8c90XLsL)n)eEBzEdC zJJsrYG#F>epmW!xcl^*!HC&=~O1^nm2fJ#h-)d{baQMY4uOIwe+48%(zJ74Ca&Wh@ zcdNFwd0@J$Z4^217u7MLvmnH?&P9;LhH{gR0}P{Ad6b6u1a`QkcjWly`GDKpvNHg| zaftA=q~Wxr(X^OJ$C6>^9BCfRHw_gTQx=;63J=K&OpO~%Z0fE|<9i!fU28p~Lx{0a z_5KMsXHjrrNpN9{usMs{Su@yK!Y^-QK>GxNeY`R^aCbs?abAdrTL2`?zrexDZvqsy zc|`;S`2{28Nm>1e;s%eHacczdsYL5%VujNXJi#Pmab;DARF@v_s4N#qR>eeO!H~Qq zWgsW1Ahb735?$nz+!Io5mDg#E)NP5BV+xDRqIv&H=~n?M%rr_>3Z*(zW+P}BWYlj` zOXFn@@?)w=dez+cdJ|lW9`>DKO_zaTzX5?QjLAbVi=94{I)2R4Y|4XC3pHz-@y#?o zwN8Qm^~-&J=JoD&(wmbxZ_b&2d0=07(Y)w%QX|KGhGsCjPB0h=vb>*#XGMXx#YT2@ zC00r7Wkm5E|6iBg9)8QY|0RXJfnV!X7-OP0n{>w*>cf5Tg)Zf_0hzy5YO8}_PESui zjnQ1QQQbWMxnuHM@6hM|wO1|sZ)+Dj==An#YWHj}ds}r=jk! 
z^k$Fd3WwM@N3I;iF3*zp>Wr97>hhxY$}(Ns0>kf#waFB0UOij z;J`kby!b1#Y4>---ya$)H6|_9AUP#IQ4zs$z{Vm`rCLtAL5f%>LFte;Y?H@z99P_x zkMHt`{oxg@pP-6Jimp z`m0`1p~3nw zU5jt$4|LyZQ5z1)PyP!_zCA zmz(_+`vW-IiY9ZygpHRN$1BfjNluXERX_r&&_oze1S=VZk33EZJ%)g$68S*dTuHrT zsr`QBZXa@`j(QZ37_7Ib(!15Et*ELFIJHHMI*4ZUVdx#0O@`@iV{JXt zsG3?ky#rruA&t!$PR{9Vi-Xn`xTh4PW=ACFRFsGMZuRQxI+y?(~{PNA>Yl&IyBj>Uk=JXi;ryL@+d9RnsJ9hwrH+;-wYTcCHMG|;bvxzD zMUl-F@!}de5UpTdqaa~9yaWKb3lV5(1NG5h)dQ||Jy7g`G_FGl(;IJ3m-og%SAkjxf?b!TV}q+{rD$gT|Ngx4&@R%$bPUCymW=y#jm&t_ix zN}eM$E`1s=hN^%WB+}j@o5~#ri@kcj`E`}Lww5^#-t_8x?Z)`Wad!>3xk}nNXqjO- zFYL%DH3spHlO=Vfgj59=>oAjBDr41Zi!Ag^mr5tQvg70K_Vj-%6B85L15|cH?P||# z-QKB-PyhTn{ru_n(5;Pm&J1f}oWoh+T)Xo8>Debww@#h8^t5x0yswmfLqQj?u9(mDm6pC7RKGY`@Qbz4}{q zvsuSZQ_oT@bWDS@W+EUeAs{F$v4S1hSDPF`sST;YhSj<^4RseZ^;%mk=#8dSs(nM} zm)1UdS3Ryz^H(3ea}rHH)%n*}B8h{_`b;RXAFR#-;|6m1|0*m2VbXu0u$FL*c1d(I*r-9C)G9;l6jtl! z*X{@6sHy1QOoQHZ!cNopyU3 zvi7GVyIiC0m3G5>WBqrC`hOgK&3h$$VN&8^+{$v=LX#3w0tUn}Ypo?K?L`fJIiWsn zQMQik;6QKZ+^5p*Kjr5yeHa^_`a3yKYpPymRj$4nUpx6|?)&xo7aot^I=gu0(&qKk zOAlv<9&(0mO>f_NJbLcro$^~ZF5NqQYyalx-o)^B*Hi<0r~7k{>A|Kx$DIz{W(2qj z7SM+euTqJnB7%N|h5rndw+)iBh{ih0662M4LxAG3JUB&w5GGOv6CuaY&gsP`4_YTS z8bvn|qM0qu49)0qWhGev&-gN0l-Gb;K$u5VP)dI<6d@F-BM_h~?4mPmZ!>Lg*5dY! 
zdi?uRekb!&OZoeceRuzG+|>bc>SNP=j`5Vm;KPMV8npt=h|@)(z0=Ni(3PV5x}&T0B=> z1YPj^=!BpAxF>WuN_jtGepq2gMu#S(&mz^e7+~8p$=#yZRs%vAs-~FG%H$z8>#6_a znC_JC;Q1oRx0?hX7FFHiCeN6lBuJ{$B%@_+lO+M|1)TaEoLuSW_Q~bx52q(TLLQwC zDL7c}c`M-EpQyaw9(iTQj+KVLE{}ElW9$7T;$=m6{?FhKr-D9J6jq#j^Zjzc?*~Pt zPfKfOe^l;SR@ax&*1q)Z|7J4uhggar9YryKpfZ|ESyT#0;+@i5Q}fw@`)}bCp~zeb zAA6LomvE>n(!t*#!HII%n_%sQa*E%8pV*RCk_2!ma|mL~dirwGMv?$MD6hJ%psI?b zquz9^`hKM9-&izv8jL?nO~BT1;*j2$A94B^v*9Rn@Jn~?yV;6M8!v`;9A3@+Y@*eE z9b_8!4|ENo`l_pb(-}iUYEt+x0U>UXpaC{4*x3DumYub*AW~A>(HNZo4e|kn>hii- z09>uW0f*%?Jt-0?ID`NhpH3kOH8Y{jZD4YHAZ0KexFp{JP}`P}!&WAtyVBJR1#cQY3V{tb2NnT|+7QZYl z3T(}b%xzv-TKk*Y7SoGwMy78l79`omgvj)IE!C z#^5koUJAj-1C&+9a0?g%1PStzxCNNcHsZuGF_dL)F3s|vVtFpwS+YG%EB&pyUF_H& zj@ExpYwB|D>`-G=`gSa#RE89xdLS9xhP2U|9C0m?cNYj6O%$C>;NFSB?|2hMtjS|$ z`cucK9hZyPuR5B`m<4 z?XS0?rSAf1j!A0vb}T#qE}beCVh4@)RL=C`I^oJ4W(|n4mCkTNL1GLcaYicHIDOuR zN+tb1c}zjJ!lFbYh`a$Ln)GCdR`cmH(g`#XRGSd7P29L$oY*D@?-x_8$<|=zB5Kkk z2jev88M?F#t-%ab4ZrB98$2@2&ObBVzb85Ww6HuF zbL;iazy0F*N?FfD{*$G=iM=l!>z4FYtG?|c-IX{_BfEtJ=BUcca0?0X0ieQCi&*1j z1AsC=KreksHTue({}?N+zQ$&3z~eFIDgOOzsElJ9yvE{ zl4>`wdV>mq2Fs%Qe`6?{$1z)3TB3(=%aLDpV?OVOeqg-m=(;%b>G9dG*G7wWS#JI7 z>Ql40m0<*1HAq7*8w(E!PIZ%ul@AU(;S%b|)Hkmv3%-$=7#HW2WqUNlO+PM5_n;Oa zlN1Kd_LR@@;0|yTiPGW^apwtkQqE9O6Hg)QWDyK_ni*2q-f*MV6TlUD7N6lFj|y)a zP@^|RqbFOZGD(|}q0-4mY=IhfDw8{8V7(%m%ruSaTof}yg~g2?0BZL0srT`t2hzPi ziYX1bG~n3^vuMEukkA~U1Eli>U0<$I^O4luXuztYzzEzDCfrgsvh!XcJO4P^xs@fj ze~l`<6`6lK=*5%JUpIYTJa8!a&(bbG;`QCsPxsRtKc+dCCcL?r{^ES_hwCR^pA7nQ zD*EN)&=(hdKCF4l-)jEuTHT zdH(YD>E|1_pG~}fIr8=wv)N?6(qx#e2b$B`SJ_l^l~&R+9+%W;88|f1ueW2OJFR2< zmtfmi=U&xdx>8lKZ`smoJ@hW_`t_LIZwXt!b62n5SigO9<=X9&{}tTWD*ZapLu!$o z?R9)JZTeqcLrewL`(x$%Ja#E-u&BOqW%m90=_^mahMz0W{aO%&Q4Qh|PdMzCef70YbDbXiH%`G3GVLF<>=&~8kE4y%i{MvJA6&n5@zVOm3v1Ue ztY5vq6XB1t|$-f>{bT;Juh2Vm-krfYPD$WJx-*77)^DQ2A z?cDQgnsEKP;Q4tW{^*+{Kl?3cy&m}5;;zxWj_G{PjVsF+PH#Pbws>}U?tAM>QRnij z!reDt)?VoMG(hynROFZ7W4#8vN~#0mnym_YHR_mMoIy4A=c={U>Yt-)##XBolcgh% 
z9)(O@kKBB9YWVGyjXxLX-<+F#e{uWW#YZ>Z-q|X>e6r}&ll{_&gT zn{W1BxU3fWvCBLvZlx99kIJ^OwDl*#MZ{I~G$kU^P>RVBDG9% zR3XaO^gEyi{54vlS0`lIB8J`Es?yAGa%&KxF9pp4!dYA>HZQV^SC!3wV0BD7m0>;7 zYP}%US|07L40tR5fecaYf~qVL_y8B0#*bw1sJACOzs{P07{(d zI{td;kZEt40o&4G-LGZuOV#vY>PB%l=X(psnzo?bTLVK*qYxV&$W4Ugs=UMs6ukjc zsDsNj^Kg%N>a6+ch~b*m@Xc^rYRk`!-@isni{@UxxO(t9Y)ScM>BFx3(;fF$svb?Z zJ?Hd4p6k6kmj7t#)f3K_>w7mIoVoIW7VT^m z6yXvPY!T*e6Xffa7+{(cWtkZ572)aOc{n=M1HhYupeTs~0igl{S~wTA8b6K{v0YTN zi%X4rT^!z<3vY=-wx^*w_z-MsMsz^}^IM z&&>O+gl+oGfSh+qVBJA%UHq7<$@R2Jy_E=mAk2`^^x!; zSKaLx!T9Jw7r&Ui(mTq2lYQKYteA{P_F6aOFes*>8^?oE_`EdFk21v&Cs=i?2Vt`{K&E z`)g-o9$m|Pb~(7~Q`~?d!>yB#Ek~Th3 zcPTVC`)TU;J@MT;Yo8CUM}N3Dl=#H|_6_^467b8`uQ2=QM9$Ii!@ow8N``VQr!CC4 zb1k>L{JVpc2P1Xs?W%X}se9Ij%tTUEuxe9?#zL0CrgK+Ss_{0Ds3b^P9SXwnND1-E zii#-k#i}Z2X^RA!>}L`tBMD=!wIgo0EicA+wBBNftD@~duXD(h!$qI;evpk& zSdSHLBnZ)Qf{prOc>N5lMl>7}tR(0rmSztUbX4GVlMwTi60%kjGv`SU)e#Aj$nwsT z$Fg5FJiO2<&EY7ZyG6**6>eBds7l8H#5f`=)u;`KV5O^dr^4z%WEL-}Q-RnHCAKJ0 zSWx3$X-b;{wpG@oRT0-BP8bk{cZ(oK^CprukCVrnL^k>ouFRra!i`?F{$fe7m}$5`q%^ zT!=JfIey+G1Mz48_y`((%o`IR%A=IvVU`=H@10d$5L2J`sNne}kMP{YSf3-N7KiL9 zwsDf-TC&N;YC*=ZFdRCLY7o?*naj{msB(;})y{0x=V>$&rV#|I$b8jRiro$3{Waje z7U*EJa(|;>cO7?kQxeT6!$3PiEz|}Kv=7G~(gsMo7Lg6+s?lJ`yfCgl3OB$99}vWM zq`?lv*qQ?V% zhpH5&^za$AD-jwc3_BEIbO4bCP+@@C29;Bok)Acpv1Ox$4Z?n<324x;M^Da5dh;L0 zWA3efjy2vfKadd?Ifb_$T-f;U>{{%vv!>p!+!B9S94b2OS>WT=;_N$cJaAA}0;VXV zsT!lJ8LxqiZ3|NDRpwGqQb4A{IkR zk1xDCca7*9B5aqI?&$*2Hu3V;w{*637Dd@KExWoobLK8ljc{H15 z0IVkh>&SqfO0G&Gu$Km^`7Kjd4zB$PWmYOz(Pg_Dln1K$2C8^^s=2zE{2jl+RIg)L zo*#wYQ*u3A!PVS5aCl>=I)g`}k4K|lP^B+ZdmtIk$k3pMqL^7mo&3g)2kKMCv*qw? 
z8A1mf-=@ZF*P_iGtDV+0?tmX;ow)WyMEAkg1TIRi9Ij0qSpy;U@FS?g$Uy*wDFCMl z8bYW-D3RVBLdP0;Zjmg)Fj^ktf~rIRaa4(@^mk?Wxi|SnRax3T%d~%LdHAJ`PocHf zN3|Sll}rn*Yzsnyfl4p~7S5IvQy1XZlM~Q|N}|+6)q}eIf=W!p^HxU|~14Elu75P`;iF$|w5nrC30qvxQwF#4bX7B1hF zpw^y^ZZOr`PcvSRL-rk)MMp?aL@V@5@yH4t+<^R3v3&F4qGJi%v+-i<@!UJ>hu1W{JxkrR;Q`YQ%lLCp3dWrz9s*ZBB%HTI=gyRha7l|__NpMB~} zo1Aic-&;L0-!jsG0@Wh zmJ~)1OCgA4>dW(}05gPABF)@wa!~RqP5N^#i4Q{K20jhR<# zo>T7~HEJ6nca2bbCXMPw@KqD2>RC17pag~uXVgaitMDPY5vSQ^2Zx!T^ z(rwTdmf$2PBr~8~EtLu8Wh$hDvwejf?Zkt;jYLxPl=$#cJQ_j~5aD?L2&Xu0386Xy zpx%t9hRoM!Bvw}`N+*jkNur%}VOABe)=Y@?o2P*!$}kbB(idpZ6RIL`(@DSqx^#b? zKqJOkD%sI5FoP0(VU6;=S?h&n(wRn8Hjh>!7XV_Mr9zcar2}>9xwU$8Wh=uas3$c0 zV`b_(~Nv+jmBs?JGe z(NB6vl1Bk1uH3Xt#&4sslQQHv3F;>N7N_|9+mB~IhF^Ggyx`fXvj1{Q-X!PQ+tm}` zqhi3nP{N*^&N|qDtH+qrSFu@Ipy`(@X?;i{Ohq9I4oy)}$O)`c$Y6rGiORVM;aJ7v zZVDNGbaml+IbIY`n7UA`SWdu+821=Jk|5DVsLlphOBAl6aMc)zF)2KC6oIxzVKzgw zy^6nw&fiZH?D;M*NCQ?@^U+NddyV86zXiH}^H!Oq8+w}?MwM?K<4+HQ&b-N8U|22IVrJ&VhW~Q6&kB!h0Ypb;!28|M>AFd*3V|)Bhd8FYm?!&~g#|hIc-bY( z06@m>@|Ac+21T~cSh~hsrsI!X3r&vI2yJTy_f&#=EBJf=fCs9T`l=MGXv&P=kp4fA z!4mN%b5~WD|JuMmW`siYqi*jYKFpB08Z3kjecccKsYnYUnv?WcF~ zA`g%V79Up>M}TGEp^E*<>WwF$V>ucdX$F6fi!9h9H_SEa;*H0T=q~%RH@vjB+ z#sCxGDUoWqCv=1(Avs=Lp+xQgf^tqEFu*Az#w{%)FbfdRgAx(PD9d7DV5%*uim2Fx zMGw%$D{D0RDivCO$#yfOyJ?EORE@4W*(zgUx|Lj=KYAX!swp-AA{>@-++op*G>v6jBYHZ1J5y&N zL47=lI24Be8zQ#kq%q;kTK!bjba$or?o{FK<+6qE=VwYTE|p*FDmnA?>di|}k}kY( z9-=jMY^Z#no$OcBV%>=4tW8QW&4)PPE1N@i&1 z1ZbxDDaN=co^XOE1fxJ9O_~CD86IN=E(1xRVzQrniZ3eBPb~w+7h=fgMS`b>9>`NR zIff^|g(opEDMmdx#g_*VpCg=@$OTG*EtY_^wZ#>zG0U<^Ze zttqnVkkq)*3)0g50uDW5vB8G4P6?>j{^^+d05(z#6p{7LMx&b}jG3b3{v?CJEL2a9 zYIU07pm6gXoIZ)^oggu0aP&DeeNLS*j-gIstA{c4F}?aJ1NyXHRksVNM?rr;<$z=3 zoOQ#TbzyzIy0F}64rC=we5GU=FXqZ@XowO)cnyVTu zD4ylZ=Rwd;m(}DKmk8I$@+HXyIbxO48Z<=>xiiI+m9u>@S$;U~C^TG&2csjYp(z5F z;g{pjPDl;RN(fC$fCeN$GC@L#ytyI@8G;IlBB+2hez{kyru}JYINyo?D&^MZc`#TD zqe^q6p5lxns~Z5WvqXWYYs>Pm-IXA-*sdaV*w(>k;nO!qjkYFe+kQG za?xPphnIrQHB(1#DwC#nv2{_P}52*Jg`>f1p2 
z5b`K^MGe3&DK?{{xZc=@8n4sh7$GMrk8vtpmRqgXJFTWz)|?vRT03Q!iX9|Dnd$;e zrA%0Apmu!FfgL6DW$N;U8*wM#lmK*9DV(%0G~1ic*Af(QOe-^5D=k(gI}j3j3?>#y z=F5d;L`z3`0{!t>z9zE4>MCO5YFzR95em3R1TPN>$SF8s z4ly*=JMkkloe?^E^q?)aUbhpk)q_+tfutXZ>mXDF!)l}eB*iSOG_YAkd;kMuVSO72 z0or0I3fbXu0q#nHo}ylMN=e7XBh+Lf5o-Lgs=#=jP`xb2L&7n3vhl&{Ng>MVUYbJD zMEPu#e7q`u7)CnBL<4}+1(9U35E{uy#EHMGV^3CJC`&5w@+ou=`;pn|^Xgvp6jwIZ z533m^%O}VWf@;I~=>15e20`P=@_pjILu%99-?F33bw{ZrHS8{@1`#aqIMts%cH~00iC* zG3->(pHtOYMYJswYv=SCQ@Fn2=E2dr+BsYk2d&rgzx**lYx;<8TbkyQ8TM}up~b3=~#sQ>jgUdyYbm2g_-PL*$6J-Pht0{hRI z_R>eIrPt6$D$c8R2 zi|D)F|L59l({c-Kv9EHmkGNWm<230W#7HfCrcryk4l_s7VdIh0^{{q*WH(&A5+>IL zmukf+x8flzb@?_8Sps)FT?;fo@ovQh)@z6AOYjpFgkS<$=!`%_B1AP^Ml4V($C)S) z;i(iCEEMD}7-k2Ebs&K=YqfZ|q7G}M2B8x?RYAv*X{u!9cztmp6NsQXKnbZC1)tE( zn<-yLq%0f9GOIOuX1zCsq6e{d9hJuhYdo8=5vt0(%3Q(Ps5BZ=h0-JiYXlg#2vFKY zDQyBqT|DFg5&ae=0t-x=)9alutmY8=#%o))YpbSAXft@`6oxi~tL}9>xE~va?HcC~ zvnJ_`t|NwR$2w=fx6Xd2cG*<3P?&aQG7C!Tgc`L%2n%Y>9QB$hz3LhL%Hd|(a9tb6 z7|(*^+Y$ftn5(vGZn%>HTeUV^vNc>W)m`#z+VW}IbEh*b360j+HaBXue_fL+wd(ii z^y$?%oJSRF#b?$_uWnqsHgfS&Tg5q6*~#hRCwtefuiSa{db5+tk*c1Qd_`+`d+^A? zgfbQ)9H!SNV$yj?TCimcwO)JV?8n1%G?!7T+e!~{vzxiyRlU(-%Bi!NVc5+uuoDb} zu}Y2UMuW*J{7|FnY#pK&F4d1w9;c(b4dLC!2oBw0oozf!*R9izCMoAK)#7X6@f~`B zO(=+(e7Y`I3OpUGnyjprE=yHPQ{#`x_OZ@zJ1P?E%$MmacEW`(z{x}@lMNFGMIDCa zctc_jYiD`E1q1a!a>m>ehQbhn9Dj4wK}ro1hNJ*6fyGTnzSL2;iP<#uWLASyOOtOS zHdYTMsx6xU1s+!t4sUE!K9D5XD7}s?PhiVpIwfEOJcz0kQlAL6O}TelpE*qKT&ka4 zZ*Tcq&)^s_=JmTq^&8n*My)VPpFeZLwQk1fpuHGhr^M*Tbk6>0oB3KhYH8Sk)E|Tp zSqet2(!>@S<2D4gMF-PDU`$i`_8Kd9>svV}I8{ciN9up)YUQ9atKO}t&at-Av4`%f zIqB27=UTJpM&Eo}wOhg7xcc|O@T2oP_kZr>e_kspTP!@g`tkhYyR&Qg7ngos8NF0~ zb*ufuz^*52=veD$dEMMxC+%(v!xXv{02G(UjA516pH&@94IH}q&H69PVzn1{zPIY! 
z$j@sn_H#7d6_(LzH*pZ7(oTYM8r7FrHhce%y|aF5qkA7V5*&gC4-gVU0t5>ZAh^4` zYm2*EOQDp~mU^ip#fn>TcXxLv6lbMtH4-|7=EluNgXtpv2;?RiNZkVxU>H)3mJJa);UHF4JzF;2wC$EDz0y zFmzHxXjGhgSQZ8)releMaFci-iM+9l5hy%AYHwqJi!Dc>D@&LS6S)(6z;zMYh&FyI z0#7aGXnQ(uePpmXG>Qv1Z3zhgvNAF>uNl;DXy7wPQ;FF3Hygb@FlPlu%W*PH!h$+w zokx1?dL$hd2QAQCR8nw47!uGc0Ue%@WZ(~K-QnrkVUrwTm!1F#4ES@mQVWl8K-w4( z-4x@;y;}#vqZhrMo1*P|5{t{C-NQU$Gayt;F#n{7^g0Ya&5EAqYFp99#c^V@VtltA z*WQZHGvUVi1gCL#oGHcj*f66!kTzt`SFy!q$@xFsLtnX2J=A<1z<=Us(%NAouxq~5 zmbKJ|YcdR`zZbOj-CCb{crWkst6QB#C0#|8M|q!)Kh_=RCZ42y>dh`Xsf;^IDNR_P z&YwR}1EQ*`ti1=J%@;3Ev8!K7NhJ{3oAw-bSTae0EiWV@4pyDC)r-`;= zxn@ciU@t`(J9HhD za1DqN;O7F!AflTkJ)5!KJyO@&rLUs|qgpzw7lsulM!SxF>>VU;3Chy7e?_-%^Yv~) z#U|*aXJ~u2xq5a4`u33cQc*%(j5CML9iK5hJE;CW!Tzt@yj?)v4geV%L?HtrlkuYO z4W^S06PR=v*iu+M$oujqd*)|V=Vx=oxIYRvS9y+5W`s(5k{QwrmRW@N?n=y`OLuKZ z!kS6xdc*#SXK;!E_kK=aZ8Xs&d z8BYAN5c_7{e_V1nmAXFh^4D-p@4nHG#`CPQ#I?GEUemab`bq0` zC(Csg)9Pn4swd;Bzvt!tjLPj#Nd26b-XGKWK8`saX^vYR{x#iyGTZyPOLnbQ_-wxY za6<9>l=9x7_C}lWti1hvtKD*|Z95ddmm9C1i9&~jr49G^D6zXVeyk)d<$w^V2j#dv zq_)>v|G7nb3T?UCuQA<$S`g!s2S-Z*f(DSlX*gJz1;7yF0*bH`WA>9{3Fim^zXM!*0PAVS4H4r2#TYpi0GS{Znu&re5lWA+L)&Pf9pIr6$!;B9GD*cZyZO;-VK-D9xF-R ztw`RlOxS9AJlgQKxAE3R(aqi0AD$kz7Ek}N5?D1HJI$VJN%}QfHq=!)eqkmLb_Sr$ zg*iXb%bMtFu!rP|%n;prMEeX?2XXDxzBU-0RDmUlkZ!9RD%wldYwf$a@Js8Ja z;MRHNizTVGR{p(i-ITAx@hfwQE7RvAZ3k1WXG`q|y(-(SIty6iFTF-HGLADcZY!-; zO&o;XT!1bbA5kJNS%AAJz7LwrZwAf2+$q?Fx;`#~%Tc4yEe35CWNv|g<|Pp`SoRhu zO)rvsRF10!MJEa-5(@QX4K>5ig^6=gA}ClH2;d+l8fjFxlMP>!hgRyt$6E=JLV;q` zOmsPq-AisgTOSQWak4>KnTbinsqkSaLS`%{kvt2bID}GG)E|b762)Hc=ri70Pk)(s zs-YQt%~B#YsO^Cd>xG4D~Rq@D0&JkGyni~ z2f_P^*jgy@q@kYN+QHJ6;O=19Aero}@!&@FX57oM?TXGdUDSv#axxU!8!d=?jCq)b zdybB;2P82CX|q4C1cCx zh>eEU{`kGhcPqt7r#S@+--9R#qQMddqWEbo2RA+s-~XKda5wGw_J_Fj%KNMRH)gue z=M@qbYR+aAeotwhPfEnkV-ptS<7ZoMFUg)y$RsYb?N5q-?GZR%m-_w<_mG|N_bF`R zOzG3@g_OOSs==4&8{D`e6qB?R=aHbk$@IRY#$_gh&8gCBU+4v z2}Qs%Cgii)A~J!7kI0#KxgH7+PG-@1sR68K+-VEW*+cHkI;l9v|kV;!x<%k 
z50WPIZf7BYQZvioo(d;tfbx3{4n%Ec-ij~rfrm0k#{x6%=p?0FKWYfzCcqbF!sns& zASE}XAmxNGF`)1nB`5;LIU~9UxmpJyxSm@y(=1Al?38Xg)Aj9l0AV}X9 z%qSa3kBRNs)JBh(bge0j9~F)7HViE53iSCw`|y}sLUB`fco)f$y?Fc`q=R22Ti2w8 zM+E86r2oXRJ)^WQ=kdjp__*EttmXlAzAbI!ddcXyZYh&#|lP!-4f9uiQ?^d}rQ~7JQ`uTwJ`xdiIHJjIR zHcQRU3$m^=Xy0+P*Q}I#3yQD<3TT6e4Y7sybA%6Y#!3))_6h*M_M`U(Fk`Y7V-nCA zEXUR$vR8^_kP9>@$S@}lnU;mEvGnP3`XG%jiEPY`<6F{_Ex*Zn>nCpd{`Hg@FE*m4`auMT-RyG@M+t^de_ofvm2dj zn2=kje2{LSiWjFIspxfy1+tE1F!xX_fduX@DF;tK9+P|sy(me?j(+nO4Z#uXrQf;Z z$7Pc{^%9eAJZ;3REy1W60-kwNp-uw1IZn(B6gN`TwFKbM$ zH8%fDTpf$w%6zkuUo-Q?mxwi%NZ3f2U2FO*y?HG4*HqkkeacRL(w9&1%Vo)5D(|kB zCB3TL-OIeUm3L#NI&S?_^0$#&U%GH>+;nH+g8XqGYPA!3HZFP5DUrA+dp03+J}R?0 zikj%4PEBCyu8h97jEM2{3Q>+oaxIGR4nUa(P)yCd z4CA6KUk3R;cOoWvm?jY(lBCQmv0Ng)WJ0v`LzC_=;vV-k8`Fzf=|@4|yz(zku)U~J z6eT4ZprLJHA(LUGVJ5MTiqN2EaYryy@H5c$3wzQ~Tn`7D02y8USe!?{wS8F~THEd$ z+mYr(1k=h#$&GKTq*s%-n#qX`1Yx4bDlDH?P1Cd@njJ6@N!TSCx?yQ=GytbU} zeyDDIkPKBYM36+1g^-Vl6B9!(O2j+l(e_mzC>A|@P(Aze)8uZG%!D(z6Ts0Ggqw54 zw?HYr!zuig7u$v$+L!9sk>~DY{U?sb1LN5Xe;)k${ODfs&CheVAzC+lvT$NyLrK=W zzT?uvj(1-sAFO;z+#N{S=)SpJd1s?CY3JjUy~3T8!e{&GPj{y8?u{j_79}j!B~I5Q ztqk1Q=uDhzJ6XggEXkgZN$quTf1eUQ8Wh@YL;V`V?vLy5j9T|$BKWxJzl@pum{q*r zuY_Cg6sY2Yn3PlV{FCmKgxiz|XcoghFXz+`0btljyCtJBTvY8yB6&JsD-CG}WB4S5 zs*Q>+%i)P!N!y*ZWJ=XO8Ze9c_J04cZZb<4ZA9h%33NGSklPdeO0z;TG*JepP#L4;3X zP0h#!XQyjLMUM8{^GgCEnJlRUy}ZfHv7^YP>DI*&`H>wlN;+3|sQ}!_wtN?z@C-Ap zJig2XKYEc%ZjlW;&kB%@HV2zHqb#-km92@@!=V-gyq1JW2MSqG5L|#*jFV0}TzG^{ zdd`AOgi>$8#fyboY}x{M2c%<7S9rjczc&&WN>nc~VuDIwnv`BVhNp{8b{^V4EGD^XKim!`Cm4vY+ka*7Q4H$CE#o-ThL2 zYoqt>R{r7h{N&H&_f|@9EAGYHE2SqR&8Nfizvi27Pq&{e%AOC)oQ;bgOrnolkjrvT z-rubXk8!;_E1K16G3HRFnL(#Vg?f)GwI5Y$+$-06UZuT1ruKVQdRT;~Q<|{{ zNi`$lI>Z+$$wnswqZWmG$_NEF!y;wq!`eBbyCJ}KMuI^Bq82EyUy7zfFbcuyjs*n? 
z(*{ej;B`v6b;5!M`Q3596~&db7IhnuxHcni)Fv1SA|zbcw5{GR{5}yP&4iTZz^4Au zHk%MR$c!NM6_SgflL=$)1><*u@me|XFfe=uT6O?AA3F`R2oH%I#z&;xFOtKORE*Y* zLKvSbaC}Z?VFEin#m@ovV}XTqPw>jL(Llv0m}LMYNO#^28h9%RK10LVO=QNY?ZsfE z6rf}bR0>B}0ns+Z{BT?<9ZD<{nHU>l(2uY8n$U!6G%u}JD=`t0jiW0{cEuPyY6)u# z7N2lN^h5{_h4A-M2=xMpWPA|a)X-Ko(NTohw7@@cbYhnS`J-akG1@{~jWeCkP8M#j zw3y8k@;$-~^OyG3sf-@pBl_V{=%?rYE4Ox@XBV^V6x&Be-;<)Norx%ap8?r-Ma zp6kK|F@B-_VodqwOvA;P(&>T{E{H#Rg*H2_zBOCy_nN!HCcx5` zCiA1!qf)E)Dt52yO`g`7t+yNP4C)??DlN2{FJbIvBwg0z?I(o2(M$|NY+wmqCP7xt zb_iZKoT!(Z{IfLI5}KuzD|}jvwg*ey&kI1ad+{@Q$kPS2p>TZ!-&TPD8CZlYZ%`v7 zP!$|4%N;Nw>C_E(lRfUxUTER}Lhx%o0$7A(g6a> zGm>JN0|aQj*fPIDsPaw?c-vR`+;2BIf7!JC*Nyf^|xk!$CA_hVwrYKRp1R(i9 z7MUmoQd)?7zq0x3t4$s$`aoYWgCm4V zRJpf(fwlPPr z4}_9~7H)6OCnBSysJ1#@p<}LPX8&Ik{p;&mxSqbbR*=7rv6+X7%-3K29q&|ZX{|<- z|D0T8yz+GQQ&JSvl+@SP0-+RaOwH8)5|>?Ih_)NtL_sN9N6kkS|C4U8sUA8k;7)2> zQg(S=907pO!Vo2)rKhJtCt)gt(bQ4?JHCvAV$@uPWH7v5s?k$RGk{Q)4B40)b;&Q^ zJa|{xR8{Z{-J&e1sivi(NQ$cwX>5r2Hy`$YP3IVe<3dScM3E+n5)~$s3FO|{LYYl( z-@SUCP;$Sh;@-zp+Sv^qT^%c8QrzJYXl?Re6aBOO>!HDpF#tANYAUdjIXmo81insw zLQdnecZsi3Uc7kjSS4Isd?R4@!vuPps(CZloqJ6zftf}{e3tE6pFj}%{ht_-F z+Y9m$_SN>@v36&r2?#)Xny2L002m#6^l9L}?vWI&OMl0g81J8p{-1~0ZLzT)k0K$H z0E{3o;>4&%KpxT}hqmSBWQ|>oWg^Bm_|TEZqoqN5vi^6tG@#VvtVdJ*8&itG?0$qp zZvD!-ViMdm!vb7?z7ZJukjDOAcb8LDC;$0&!J4|KWB>T?aG7dzqZAceRg8s{?bJMg zkH`H?d@WXZ1b$5Qwn>v+=T{1jxxcZpiF&%WepDs1x3bjrces?)HNsTKe|7J(ID9B{ zIUxMbAtFt(EP}?cEVjIURyaWR`KBhK_^x5{>DtB%k&*r5AAg5SQQ5+BuKVKfvsNgl z*TYg0lDh7nJm z*J7!^@~GHog+|#9!?@+_QrDQiZUF9RU;9W*n%Zf^j*f@0i!rTeHwW~>1`S4TyqXBL;^CK^TyCGVq*%&*^g{Mvk`&!%!7cabhHSX4 z;=lC|{%bA=_T%$|Dm7IfAMDv#QF>qLA?EwQHlBp9n9AfB6^%Lvq5GeZ6;T#`2h-QY zvg_K~|JH;1uep3KnhOPL=xv(Z1hU-VQr+*PpBjUv&A5~ZN0VPqpVag{L` z?AU_nOFCY2;1^d`4E#G>PR=G~pMJE6R>|48q}?6JiceP%VTM7k$!jskKIWtclW<8! z-rog(vrSEVl!^}24*K``p8uNe#XIY=tbz4-!3~2vP|_HFGhHA}>`G zoftqTvwoMlqAH={X&I>KVcUOA^w0KRz5Upj@hHD0n+3%r(sJ)r885D(Cy};}2^!&f z3<)4mjxsfnf)7q5)|V&VyMNGlxBl;N-H1!Bue;w^&QNG;bkgzSWi2nUoHPjs(zt#? 
z#e$UD&PBn`dX@8ce8oL_^z_a1mk%?ZyeL?C|KM3|nW-9v z79o;*>(<-arKg3yYUavjgCLgCcV(|1KXUVQ{7b%`+daCyzd@Z;C_ z4IGIn3NzD&oA17VA7rI!Vx<{qk60d{|3SH`qU>#^j>V4r*F^tp|HiM`Pup9wqZ@1E zWvx|a*ZepMNW=$QU*tZuvPY_4XCWY@h#=iNIQd1Ojy2Nn=WgFA(d5*)fJ*cN9*SnH*|Nl9)6q_Lg_oSA2s}$s%Fj_tNX8s{@MQV z?vL*$zmMX6yiU6NAt&q3En5>cVW=J~hDJpV*QIs#f(l@46_k}ebnI?**553hsrfrx zo15FIFH#rZ=Qm}SzshJZYHGAsk%OA?V|;1IDL7G2eU<7V7FK&Vs&AYYEtG$(&(Hch zTuZnmt*9Wqsj24X&5_50W!W$EwSJAt(fQD@FhjYFf_;>A3sYw^`*JJuQx>v@o`U`k zS9)RY+q9hY^n&{P3E3HMveQ3AyTqOTu<`JWQ4jX?wbdMMW`^!jNpyUsZTiqJetd!b zJ6xq7KV)R57gkrsSHu->xAyFev&NdGm%mo?b#w{~ayI;I9892QIpY;NDqJeIRR;7a=-53&dCS@vAg*ON5({l zg+(i?hC8v+QPY5c6jY=_UHiXymVUi`xw6-{v;fC*bj20C$Cp53X`R_g#j*5YPFF`3 zCKfhMYfCFzD{gUl2|g$k#${okZ)j|&r++Cf15FJbBQrA-GkXLQ0WsInP*zmdP*+q^ z3Uve1(t_yeD5+(?rrbRm?0sKX_WWWNg+yW59Vy9y6oh!hAZHFyLVQ6kD^pt@1P{#K z%F^228Y&?tA;1eq!niF=Of9XgOfSP_ZK$E4rEhF(Xlw=HhFBYGs3(TB>YJzL}JeI7=ga6z-N=i;_8H88e^@xUQARwmZA7F;+k zlr5AS0Z-Lp?n`eGN@5ZEbB$EfqBdTm%(VRTX2xRjuI&J_Nt)d6ryr zUqMcO^1WN^1Y3Mi9y{Cz+Syy#+1S`Zq1?D1EzFFKYb&qf4H``_BOoolrntQI4E;1ZF5^&!%Jl} zx71cs!7Y($YLQ{FG11X6;Ss88N=j-m5o(4IA>n}u*36lzf(MBsJwrdJD8HnXBDrBO zD1@5}_l<{(n+p!d?dj$wCKkNBJUrHR_V#wR5IIUBgq?w@iK(Wxp^=f1@nwA3XvGHl z1qOyis)qZ8`1|`u$3}#!C@84JhAA4sP$Hujhk5yV>2(SD6$_K?Lsa#-R^}!K#x}SZ z85kNH8`~Hgn;7cpYH3;Wa#`_M+u9r18^NThWKf1?8rs?#8s=8G9qck(M%vMS0Rh3W zVc}81At525L1AHGYRbxy5fSQUaK2xk(sLUs(<|?l6u!87vxt$jV#D0fz|h9l(!$aX zw?|uA+S?hK>*;8kTElU1wX`=iv^TcI;0_O4V`FP8Ybz@&3oF&jaG9t@#`?zu#;B=A z1cZi!1p7xvs70zPslVR^K$yqEw2Lx@iZ004*pSaB2= z(%1$eD1b0CGPJX|vo@6kP|0$kgoKa?Q&WCEBkRj>nObXy`iJ6@r)vjChehI|8KUK< zt)Z=Gwpb7No1`g(@lg>^}v zGAHn`2k9?s_9%}F*6^t)DbQG~yeJwg3x~_eOG(RPv2s!v7#u19p%KLr0}=l_LQO4= zsR$^fjd2HtjjfW670U2Z=OG~r4c)*1{~!%b-C&%r@DM-Tbx)w;;3>F1qrQulnsT%7 zP7VmO|Dm6V8XzGnkCB4O%gOTcVq|z=7%W;AEhhy-2}$IF)_gZb#)bW+@h_gtM89XD!+Jsey=I>3lquD;li?v9T@^b zP7XYvn4ByYx5&zK@$>NW@yJP#ii(JQ!RLZW{E)^<6C+SYxOUW9h!RXB3FG7EGc+(Z zFyIrq)bWT?v$eG`MDPk3@?xZU_)Kk7bVL2LwB&_sjMS~;O^pi-a->|cq>>c4 
z1_I6tfy&Fm1d!$^W2wHHpFrBQv0pq;xIG5L&TGJr5;8CmqT(PCk(HFfpmEe_alK2m zU!q#7+7d8fN@;l@D=QwQypf`|zP|oSE)~f_{ihB};Fpi%du$BS6aw5LROvC90M9GP&Y9{FP^%#i32r)4%Rzh4@S3^VXQu~*Pj=m%m z#l}esWB>6JNI);IY31wVYhvhSIXwcC;^6oUCLGxNvpiOvSy4N^dZ21+Yi41ksG+5$ zrfn*7RQPO>n`E~M~lOxe8OT3gv|WH;!voVsJ@22p5mq2RS|7HNvt3% zJBkl{@cj@EC9h%PWn^q>QyXP%Z9Op= zJld6>^z_n(%94u4rsBlH+`O0l4TTd}Nk(DDTt5A?E4ybo`P2y7Ic2hwPSS1_q`|8lZhJ8`nAT zd)M$xTVK~eX-9Wf!Q0%roZal3FBr9Hh*=QAB3P`3zJiXFu$b6o+EuiMmZUs46CFRC z9!QE$y}C96R2KZ!K3Lw`_jpVQFgw2j`j%Zh)X>t>(O+4rZE0z!E=9OM-CHrZ4ZuHI zKe+%=UTpVuwH3Wg%rDRV)K^w6r@;Z^hNb$2V{96g||3-&ZRgx6uVPfIJ zFyc|-ed#M{YCjt5eU|gBf9^sEMzc?gKpr)uR@IHRRes87Je5;b#Y}d!Pqw!YHZ+a^ z|Ev;zCq?Y-Y%Wy4i+@&J{dQdjrl&_oK+Y?stE#A|p)SUONAlE5&oj!&6;=M1#mFm4F0Msa`b*j!WZY*uOK$sy(Z%2u?4N1qGXEa_3RTq7)6x*0NJb$DarZ0_Lq+OKonqp`}|RTKji18OI$raJ9;$I9w(X-{3n#-HKYg`VEY$=>$G zsio1L{+!1pJ8;?6^!mCtNvRo+9^)wr_U2`sDQW2&Xh;z(OblHrqb5cgL3eiiT>|nveJ~xlo}X@uah&gV`IYdmx*Iboje~S83`TB()`ca z-qnLSkfIDVF4JOTfA>H~&3tcrSJ%W)&-C{4_`q&UXIJm!$im#}`t017?H`jLE7~W9 zYu*NfuAR{ZB3sh_7>}FI%-B|C#Da#RzC0Ue;sRXT-{kO zPfsl^fElUmef~~!I=CYurJ}2;UsKoEb;JZ_XNB_%^Ia-~UukYuacM<$Wl3dW{^2pw zz>xOqmz8fnkH2kiEicl;Xas&$%?y=%s;Tb$y|^?sHZnCgx45*&0wp~AJ<~jk5T+mO z9Gkk3!N|!PDaiLdzmKP(Z%|TFR$5yDGH@OCa&B~w3^cb6^)K(N%rDI?PS4KI&Q4#@%>aI{wvNcjae!om z<;12wR3qetl&0T3{i#@5RGeR2Sy9qA%!*_aMIunJ%W!3-rDtXqWEQ6nl0!LIIJPLL z5B9#Fo~+GqvC_$~^|$=~KH4+fx3ajizrD7)y0o^oy0kDhzBn_uurS$FOoZkk8D2U) zD0-PPvN?W6s3J5`QCM7*ky%`rUElMC2}H#R;-tF_S5ZM>esNY>Mi)Mu8_dT1`}mBE zo|0~bW{Qq9+P?ax=I zrBv1Cn{7`WjaHUqB&g`5YvDJ zWaQ=0WakqrJ<>>ooScj#8ygQDGb@;xmK2|qo|m5=27z)yki0y&3+u+5zJ>0*EOSvQ zIZN20o_5d2583Hy1$jlqIh8Xk^d!GOA6>g;w-ArP32gPNKgpZEs>N(?O{ z#t&bg-r(co0P{c@*l3tofFK5D7AnGTj8Hfd&H;t8vhoTbgjIDF46%YL)@BfTK@CIZ z#cNJOc{y1HnFYDI#UHBr2*Hd$ep6nC>(8IV?VWwFh$KG^l7{?{{N&d!VkiQMg2Nzi zDO|sY7f#OrqT*nqC!=TPVF3ZoI8p!O0uhq23-XDJs%c5cL6Hhp((E99dGYUKrjXjg zw8Fxoye9>XwcUGoOn6%~?3dwc>pt1eQ*h!WlMp8Q@p-Uoj!GCR0OjUogYd#-m7y?t z3QlfTCUPLI>JBd}m=*|w3J9XunEq`2VuW#_pcMO~1Z+?kjG2Rp8Nou;z1&q;laXFj 
zSX@w0Sl5To44@(nKlB*v%Z(J>NTuJ}}umYI>3ouB>j zUP<}_$2Sl@5%XoR@Y;rYYSi_$!acpJ?yuxl7F7P>kkQykQ_z^Twh(+koGaF7pmo8KQdVOyeRi$PEA!=N=5p@ z#}93v839C>;%aLv^w+evReLi$w)m>zO&OsSIRThc03yynbwSQZ%fWD3la;a5kX1ai z48RBD;}f#bq2Pi7tRO;MR|P9A_oKF~tR^QjyLw;K8dy45{xSPQMs0OTQcBju;>W_J zq3=hGM3?c^*)iu6tsT(R+uYw;<`|i?cYAH z(le0&@yNj-aw34J7)23N!xCX>X+KvcRF>T>&nYa+E6D2OVB8}I zF*7o5Y!Lw^#bBJcn^;Q)EL&++4fSPB6`#tJ9z00CT~k}rSXGdfnYPkcobd2%^4EVW z(*ED*&d$zG+MmVhXuNn5;c`tk64yi4(~WiaSBcdOpx<0sPOrONSO2MKJG*#k@0gwC z3o|(xaBPl6hLRp2x>$7HOuxBX-pr&xc5&=?md-_&p|Q*|{}uh-6A5s^wtRwvVi1vU9!M}D(h94~Ux zvF%=*fr*LxYPaRi>iX!X^EMbM={ecRDds;^G}SeJ$4q~$DEV0UG4KhfQ9Q<+l?$b|CdD&U%R_1#8 z*g5;BC)Q-Xmc~z$x7^N6FK;{`W?bLiArfPp%gUVi@Do&=nVOOm|McancQqv?jaBvc zpC?x~C0xeW+1dA*h*&rG#|a6~9YVsw^fXoVH6AB?c>F%xUqLaf^5pyO-F<#;ULM%w zI1d!^t5=?Qx27U#eCl_~yGJ+oC0KD)55RnMv(1}W1`@XSi&Jk?pCqTGRMo$~SzcZq zcRQ)7y8JR+pMRXxMg&JcESR1hi;W0V*V8rD@bhtcp8D1?FeWOc{bzG#cD>BR*d%m* zb{dKjC#FX|Dlfg+y)?SNa`(nF5$W+?UDVt?%?AQxT+{rJk)0Q>QY+t-*CzMOAI7EJ zyir#1=~5ZscUSt7uQ^1-k2e?G^$3cI&{NYf3A*n3`juB`sPCPD{crmlCfBS#d>dMr zhR*zIYen+!J-XMl+SYO2eR2Qx?LBdEJVKCc>H^=y$hRrT{8-=C*28-*Dy!bjw#?7X zy-3Afh-F>IS66WjHqO*IDBEt<-1l&X1lxp6y6TxtVf04G}~9cs)sw_gBlp=bbmnkDq=#>|Xvp zhfRL;te~LiGF)h0ek4@e00yaTcJU1H^bhpcR@Tss^#};mQmpv#^I+__LD$T-diQOm za_+Z>cQ2kDr##zB-a1QuA^7uQN!&VvD(N4?E zF*I)Y{A+n*<;L@z)zzofYXkfT8%e2&TVrQ~&nr)UE=@s>jv$h}n4aPBhv{k4hw3@0 z+VH)TM{hG8UWN+^d1Zvw5l8FtQ!CGfcwTdSdR-$xB`P{xRo!^7v$?Bx=!=P|Sw=x_ zWA3~Cr>y{zWYAzyz6`O&8nwwr)N=oJIigNk^?h42}=u$TSe-w z&Pj&}8&i3gs`!C27Z4OMxrIEeEGQ~@QkA6{8(4byqhqM0Z=x(OE8DjG z_0zMTZOyIOUw^w4k98iLobIM&(TRO@=Z}1e z0)w55L!;`0uU&>q0%7iNWyi@*bDfdwx>YzGy?J`h_~gptgo~EhqK~yZ?fXyCk`tEu zCWmIb$0m@5!2#$^Z#v3H#bjJ@p4)uq;?BtU*81u4^82qRPv73V zZET%CG4OPH-c=gv#?m%)p3Svg@90|NIUyT6Ry}kQ4s+N|j!X#)M3;+E77MH)2 zT3Gii@73eCSV?!&kD1M4Om520A8hZ99(2A~+nXEfTo^0A-;q{da`+%f6zC}^LBSBK z$s+}U8ToQtii?YjpUIPsDu5Y>=I?~(9>rqf4>Fk^xcy;=6v)VoPfRKMH14j2QtBg1 zD}|KC^!8JVSRw{Nd z8jlDKXMYx;9Ey~QhMba(or!@mh8`>o4hTnl)W+Ibx={NPgGh;R6KL3Im}w=*T-ga| 
zFT*89B`FQ{1bHwp2vL)ONTb5Nx$H^cRtlABK13vJ)C9!D4E&-FfmkASD^!kurV?d@QBfqQ<8GAN#i@qT&kUr zLGwuo@KCZdGYL^~a*%L(+6a3P6NNfAC`Oaev(e+aWBpVZ^jSF?fV5!m*5!6b!7s zbg}LMEHGk%16bO9KEuXbTR&nJIi4UjE=tc>{YN#4Z!g0oEP*d6BrYk!Cm;b6;-R8p z7UTioGvN`7GKOC#CF2mG6OW)L(I*1Z;$r2l0I;EkaBz@=ooE6{h@&~dvUrrFE-_Jd z5Heng<^0x>xnc$q9Ro@iJGu$QV$H4MmpAU`+`DxdE(sxXJ{3uR2_%b~q7aryf=ED* z704yRk4Nc+N9DtSASPj80u!+SxgA2?{ViDC1B@*Fm3SobCT%_Mp+`R zi{smmvd!&T=F^Tk1Mu~|9d17?xtVkO(~bJv+{daBh1y=-3>_tBa#Q9p*qrbJ`rBQ&Ys5Bp_ckSDh`gMj>-{sW;D{C zfYEQ1C!)GKBnE*o@t<#(x|P=7`|vz2?(T!jaKQnBe7vrvk|M#6Wclg+gIp|)41O1@$bIymYP5FfLxGoPd~=m8fNKjFUr1ke81&X_2glQ;4>SS&Ws1y2i=U z8o@?~VA#idp22R}^>KAY84U?5+s_W~R$YdRPl`uM#@$5Tz&k)0f$tmb>wzm}>lLM> z5UHgTq@@t*O6nLI?BHtY%E-(~2WFu62vBf~^pErpi`LPybS%(NwJ9vi$;o+ZOw5Tv zj`Wd~-gy)l;G=I=SJ04~|Mp4oo7KHbRT9Ubp3C{bT(?gI&C@$NE!n;IZi$D5@xED4MhQSQ@^MFSz?Ep{mMH&Dx9-CdA0v z*Z0HaJ;w+RI~+23y`B|TcdjISq^j<@T|-vmXvUgwga74=q9a?@2ca<#i= zX=JJ47~}&A38P_VCLm!aAfV(FUns3l&uADsd$RQN&tb~_J8SjEv8)z(+5|uj43vl` zjh;6qc{L`0!aK0YOx>&g?Zw9NgR{fSaFsQs7r%>pneh7NjR#RlsrO#Izx5z7CNb)k zzt@XL#l=Y_-BN}s>N*ll)syWz8^0gtR+Q#!yeh1H^Xx;$u0ud{X^Fb8Yj6k^3%4MX zAp>oD%-y3P4(G>LSp=ay*+&0gKosR>nLkW zD|j2!sMscb`BYNfkn*Cgp>Y3C>cga+srrIvujAD@_5Gg3hkCiYIR%hLhXh2DB84d1 zhq=4NqBoXBEnTDh=^R)d+oOlipPrt-xQs7B8eR;CebUoZFWkh{gpkzW*yjm;UZ$2F z4i?#O-!{f4CRU{v6cqO@ZhilA`&Q$_jPgJ@Zf3npP>7qgxdU;myQy6aWNHAP zkW@~{5iCmY85!k9PaEXz8Rz4w+x|x3GF%Pu_3vGqitm>tCnUVd%zE@Jx8zA8j_5;v zc1_je)QrWP$~!k3%ad_~oMMCk!&sMwu4osKv#Fc0bu=};<8?C^e|K{entj1RZ-2@F zcj{PtDt|A+E&v{*LpT9onGP(_zHk|?nz(p{$S2A14fXYnSs8UtUgY1n{qSK{T4r5Y zL48$x#q%VG=QrMan^JKRg@t)1-86R~b9MA}^K`ec_lzc^aIiNf@H2#Lf>T3C0zwIh z=%UH!gS;pZ#LW1f5q!=rugORix-Y|ZD>0#>vf$Oz$2aDS>(eXl*Wn5Pe=4oLnO_xG zd)N5WEk7?06)^&OF}v3xc?|-5)}}VW<}Qxj?!L~}hW6KDuiF^m5poS{?06WE6Vft= zM7eO!Q~NpvM8(p&dE@#5B#&G?H7~;zS8r}rRay1$L&@#eZ{iCcq?{(cz5gmbtG3QE z|3!&=m=GBWzL1A+Xw4l{I>&2Yk+5^I zN4ZmR1O>#9Q}~lRl0kzbz4>mtuw8~LC$q6I?R8#Z(v$cbKjLq_eET~mFRiKYQ@M(r zOB9bRhCI?&YlilxH-#iPTt0L#s+py!_jX7T}_-Lh=~{o 
zV!cUO$qCsxDPyR7*ocFu$QZQuV=u$?=2iZ~_?tJ9-@e;OO}brmtGqnF`lVZiEg6DM zKniGC6V;Sz2X-+uGPiIh@{9B^H+67wxaQ*m7AMWdBhkH)LI#!6%!`8sqKO)%2$IG7#cV))xVr1c(tGtizc0JfJip&&985HS7 z;_pUA5f+Ibh#wFFjvxcO5?tzfwIuiXO^3?&)i;yxzIvMcFXB_ZH?>KOBK)lb*XBoPx7Mw6+sDkqhduNBUusS|00(#3dM!s zwQ+j?&cpvx%n2v0AUz)+fB*fb5vI15hME9PP!2_-c~0&hgO)5m;60{2|Ga`5n< z+!x-*pI=W6)M!9oJapjT#eaVjs%usAKZVw7RTaE;;Gl@f zeCUYGPtxD3%uAPy8VRCD4;?%IN08rjm2-Ozo)u`w*HArvMVEu0OAL8QR?@;)^1sL> zADR^?gt<49dhK+$vP!6WXsBPnKHt55=Sv8tCh`aO8(HgHR}>iXpGS)E{eAx5!yy5| zAxD2d;dUSQ$4nA8FWS1cJ74hBsms1$KKuVU75wk{s=0}3u2U#2p{okGD@YO3*~Q`i zBG!2fnd1o7Ut`>$H(^)uCTx@Bw!D%%%-M|s44jmkBlOc|I+nC zLBYq4o^Tr9dujXiSsqRy6R0W)X$#~QTX?t?n(r|Zdv_@#kg8u+DwUmEzOfnOT_@#kg8u+DwUmEzO zfnOT_@#kg8u+DwUmEzOfnOT`2W{{p1!_`fuW(Ep@FuxwwA8ChNgxl zFqo>V>uMY5nHs@`6ZmCjWoBzlLy`JxU0Kp1z^6v4Ot6o(}LkXlZL|s;R4~sR3)Smaev;f$|SJf?^Vq(lW9VG7=J^!u(uF zTOdX;GBwoE+WAaX<*ufh^0LCMg^BU0(b0*Ot>U^)W#!JXAq6c1BVC{a#tse*j!$-U z^z^D|ArN91v@jYYrYkP4s|(z{ni}e=@(K#FSS?VWj;@}bu7Qz}fj)?=qi1AcWsT%S zadGkRar5w?fU}d6lM{jD0LT?TT0l@3BPJy+EiW$%{4CO9!h9$WTWf-aiIJ|B2C&+~ zUFPH!mXs1ii%8012j#>?QGnc3^mHw104LN5ex{AwD!(P*j+g8$q-}AVkD4!Wa=TF$_j{=c}lcxP*+X zf+7fpg@u#_HOVRT!-owh3=AmB%K&9L%7%b52eB+n&5e!pVc&#@hbt*5E32p~?GFs} z_XSGPz+hEPEq#3>6EhqU=wtc#;kuUK&N|5r!Sl)W504EEDq>|6$A-s7#s+|2367|u z5TB3;tfsI4HwQ3=iS6hM1VW>Q0hlKW@<~d`NGm9SNU}0J`ob#6_YaJYj*X5C_7BKO zOJMjoiB^^-My8ggrg}OWN=m@Z9Htzmq@=E?5*!rl3v|>z`~1T|Fhf&QV+$Of$cf_S z_VIepMBBJsNiV6xc zGN3PMxq&fI*Z<_gN=ry#M1)bCcq?-=V?#Y16_xOipfF#heM+jPiqnDq9=<+)Vcz@p zg~2UnElW#13o`VX33pAJL+z8p{8@$gl$kQ(<#MV;B(A2`n$ee()2I6-V zmjEA{7nA_YBPxmkW_r+<46FbUcl8g=4vm1HfFC6)D#6byhz3R{Fj?aKeBfXNkthNl zPXOD^Da0$Wb42?jv{gTcVL;}Z}7#vQOdlKlL90w5!oEz!gTcyYOS1o~ELIVAoGeC4SeOG%kR@<&{rsVbomsAJ*)Ae?p~R-K`Ab~lvtm-)Cn~^9uO~mO837TN^qXQ{#Tj$d^oT5Uqg`*vbk%5Q*RrxTnw|b_Mm)41+1u?s3?4uQ&5;pZxWe^TazoK-ne;(MoTQ8tjx)ZPs$=u8Lar; zzRHa5Wt=V17DuqM;uhrL;uhqUl-Q9;LL~X<6AhNoiC%8`w~9+{vTgxq1K2JzClTK@ew!E+kS&7%3nnCLs#i7R4xvVzB*FwZ-#gB~@ixk@v|BrR|$r6ku4%NlquT$z&Gu z{_Wd$pFe*5_}SgNHLzqF9VceLPDIJ972M8=5P>|boTBE 
z3l0nk-Lp?yPgPSLXKZH4$Hy;(Lh*3(poBPt`6Vav8{1mb<16~oSxgdz!pe=xV3AmC zYEp7yR6=wPn?|9K;TQSgv&R%#{n}FP%!nkgCh<$lgG=L*k%om7mHR)fgn5(&mnmbC=OUT9u7`0_8fBZ2n1S~ z_kXnPd=@5nH5HxEAZJ$>J4Y9o>fo=XucU^zHL~OfAHv6rgdk{obgQ~{xjmhh7L&lB zQyCOCo0UZ-lk%wXv9U23NzpVWi5*9a%ceZaW@ctGNSVx-mhOV-neR)L3$vn<3No@F zl)Qp87TgecNU&?5Em+3?dvWdj9Dbk+1LU%x;ErJYyg(t3*U;DX3k-3wbMSzX6i(s( z`*ifw2}BbMBlw3ycCM&!@yB*gd}>2$Tx?V%m6b=$V$s=*Y&Mk<6`v5F5S5gaz+%%Q z6XIYgvq`y6pFX58snpo`)b!Hz&1nft|2P)*5Ey3c|8092IZn_KHwXof&I{&}4`LcV z6!%Ux1&IZ?M>OnjBb>6n;hw#Ljyt0=935Qj!$Jb|bu|e%8z6DV*>cX5w=egmr&6=C zsB}6#4*o^UVNls@20bn>Dmo^TmKT*49Y?3_*bH)3&Vy%4X5ixHm? z8xzNX)0hxX&0}Y>8Kj)Yj~+b&+O3B<4<6>^GNa=Y+tvnG=2texcS3bJ1r9KtoE#t* z#2P$&U?c@iOwD(~U?F}!NV*URqO}#yawkW$AP`klHG#at&ed%fu+F==xrPS??)BT_ ztD~!Jt6?oIRTtj^bh0%}?npxm2%#8(M zlYxJJwow!k9dqJ%l2Hb4M%=9syw!Gc!<(CFFCO7J7z|rXUbHkY#IGm|K`x;0U&= zS_c069bMf#VDiSU-L4M%1NRx~Xz%eA!fZCjQlpr3W*$9>&5Dj>L`O3jY-S{bN~6)E zqmtv|q9WtjktwkZ7VxGrSqxSVoywx-k{&+3bMxk%$4{Q+(4so(24}>EDw`6U$~Oo4 zrzK^;rnW8uv6CT6N=joy@%vAoKD>SVDT7rxyEHF|(bmw^hAPd@ZIt;RgG-8fYMLrI zGb2+E4-;c^IDl4o5E9}qZkSpEc@!6TW{89Mk(}~a2?H(IgBnU|IvRR{BSn!fKD~SO z`c+zDY-A*xMW*GFX-qnul1UWdx0KEBFGXI3`w%gaj}XsWBIYimKwi&Q{z%c$dUW)?fM6L#$eD_~}BY-(%+d95H4 z{3Ztn#DL(8cFe7S0E$BfBc?B6C@NwgwmL_7{o%`(uOB|YNlQtJq_Jr^V3(-0yxiK_ ztgIX|jls^TuB@u4re~Gsmlk$cWRuBx%=l@Z7x%tg8{kIS}Q5wG{+%kex&XEi5F6 z;*f@7fVi}x6whdJ^otMQZr_Ud`sqyy>~k#Az&D(k9i$}7qWD>6y7<>lnsp7z=rI-3bg$jV~? zp>Y&Di%gA9e*ON<+ZXTNe){(1>$mr>int}LtqAIx8c?lM$5|Vh?N}ubJ3A;KczSu; zdD}TUyC4v&t7~g~t81&vD=Ujj^9%EfYrH}n+z>yaAVB+G6#e?+w{M?6e|hunRa9O~ ze0&~@Qdd!3P}13z-&s;xQc_e|mS0p<0KY41%d0wj3+i&oj&QWGQX)XCZ|~l{`T|S9%44x1@MDlF3%WWx zd+LE^zND_Sqcgv~r>MQ7h(d1f?d>TpswplmB-0WiQ={`p%&KY%Ej}(GIyxgU`Tgs! 
z5wFvt;w$@_`E}Km^punh)wH!N%phz4XJ}>R>|$^4?BeQV1@*7>jUPXMY;SMRFDx#@ zO0F!u&9<&o=IXeDAmP%4aJ$%Jn#V-X^C%MzkKoa&AT{8E|9Np z$w^261&yJBo{m14Gz)W66KfpN(TcDWSnY(?>)$uFwzjr+mT+}#d2MC>=XVZH6oMPO z*ujX3`1avL#H-g|-z2_&l|f6OXVGF=IkiQ_#oY}pEfcL1&26*wz2n_QMNMNH6YXUU zt*t%9m1J^$DGahG>+b7m7;o*WtEAB5k|8lkdG{{y)!WzU)VzwOs)~ML1+1nlj5jlY zva`N{p|-Z3v6;CUyv!}Znj(3)xz;y#=uUskuddF|FKzw!&LPAv#fM&;Af>(jeDC9@ z*YDpaCnY7OGe|5(R%R8bqpr5Hq_=)*cD%K@xnXi}WORJu2j9k2L;u{!WK(xXS=01D zZ(VIiZ&zO+=T+lf@IJ*TkQj9E2aDb^Z8Zb^w4NCJ+ zi7+w(JAvloLhV?59v;-j&mUXcKiAhcIQSu1=ab?t`uP4~#K%wHJ|?Hfq^HLKgK0%> zRb^diQ+_Thx3a3OeWZ76pkr!gaeiZYYDEaWJTovpG!8Y3slm~)_O6`D{QQ#g`~os7 zhR#k*jE#u`mztiQ@a|b@g>yK*8CcJ;-?h< z_R0#pmlwgc@MC*=Vzj3|zoMnNzPG!gsIIy!F*PMRH7ONH!hsDqHX$lHhgvl{x3v77 z6A}+zL8uLjN=RadmDRMG8|uKWLfpy=*^$sr0S9^p5XB0K$f)S(YYU?0sxw}HiTL_C zHYS;p$%cfHnpafU+||`qKQ_|4_+zB8YqV>4YOfk_yNT#PEI7ByacC^0LSVG zd{0kvOKW8#Xs*1lh?SC*mY$mh$2ul0H7YJHJ}o(?q^7lbV0vK#$qhx+C1DZF^yJuJ zZ%=zG0k+IlJ*rhsI+kGosm-V5=Hw)%Q!b|l12-$HsBUm=eNh-Z&f3z#(%Sse%-rlW0x>={IW{sfI5-09F+0B~u(ARL z35+yP!-_L`#n+VQ#X90RR=gR8tc zz^*1H0Ch-p;mN*%`T4>AuJ+l1)%7jj`GMBnw))!2qO5dsQZkvIm6(u_o}L0Mmg%J2 zoZa@u^1MG+*4EY*wtj3au7Tnv#}SCJF(~X04D<~34-F2FPE5{B&&*;(CB$W<#f7%U zn%OWFA|>)mWCppcusXjWJGV5ypaZ7EZ2nm71|vGQy|z5o-9I?DEXFM@hY*INtfV0= z&AYa+F0#_zGdxVY5hp!ARF8dmbRFL9Ayk~1=? z1;vF7W@b@cQS;ndL+#kY%3OcP!0aaH^z<-Pl()HN`G5T26xivhX+dIVE~TwCJifHp z@e^b1hv;jrEpKeE%x9)0l4G(-dAS)GSp`&AA3sls2E83^FbfOYKR3RA|GD*Zcw}g1 z8swUWI`b$jBlH}GdU^&1y8HVV}4|QTToJbyQ{XkrKLPC zkCKs+m_a5nm>Jnr2dCX${xA~L!^>Hnd-3}g^esjl%Pg z%~1cq@c4v;=+aVY^vky&BBD}hMTL|UNu+RHA#ch~4Ry8kAP`(!Y@ll7x68``Z*65O zxi~U6H#a*wJ2N{zI|uuIdvk1H1lm8FFz{hzb!}y0W@u<;d}e6@Vdizk z&d31P@xs!=GK2cgvKC1NF%**H=l8jRwvOSx3FOxJ+|t@M0ksZDmzW@4&{|&xOs+rOCmzvV@eF=x+(h$vN!uPMBpQ z%{?yiW38{DdPPp(#MaJbudhe2tF@b*E#3nLTM}Fx2)1|?H4VA-`rh{D=Emmc+W*JjBpNi-kwhGo(}ddE_fSfV{5#N`)*TpB@JcKh4H?^j_&rh^2)ktu&dG0QAsU* zRYhQGM}}edPmfPd_fIWvh}t;%gan3#2g5MYz!2Y{pwzq~T|f(Hqet5N$Hu3(P}>W0 zU9BB$OA9me{1^#-PSp1IpK}A{l$Z}Mo&XR2!^iJPRVBS6KP8oA(UOYV2J%80Y6Lr? 
zo12}zzXJh8Bicfb#?ssf2szZWv4Z_wMTLcZefdSDElmi-$M;FewD{D{`r3RrQWKL? z<85uNvp@Ki?ZbkO1^FNO?PzegpZ~s~06!-i6J-T0y~XL#;la6;`StA|lhZ@P6a2hf z>%3S10m$)lEOJg4P+}gueO~d5RPgTBgM<^XaP>U*edfJR(3SiS1t&SJX4q(AF_=a51*^aiaw}P=a zc%rhU&{TCnUsqFVa#DIzO9ujR?BlD~5AHD;H=!)_GNGrTuxMzJUqr)YZ@|glfqcp5 z_ak9;R@OLY8w&#&Az=jzaWE=<71b4uT@5X5Q{VYk5TdKAoN@?hKq_#_iOQ_~oKjNi zERK2~kr0y>o0<)dzHwgN%naLIxr|g&<=_>Ov&I|SxHt#dJJ@@>IC+~{6Rnk%tj(w9 zOF>^58EK8peFX@_p$}i*J->H{opbNrja$#(#`LwTV>xY|_MSL%=6C-+VaLz>u{X%m zoZxC>h*d-)<&^~|yX#0SaF+$`rOnL>axniyRDNNZi$@oyEeS0`V&Eob7jeID5Fc8EL4f*w|`X%$66H^tIJD zv^BJsRU;4wet-Su+1;GXEHZRY9^82Iq0+<*6L4ks>0>8-eEp6eKXE)D7-m6OscXv! z^J8VW*NZA?R9adpiOEoD=4dlyd+XIC#HA6)=o;H>cCRh^w(z1{Uq-Cg+* zM;|`@{@dNBIk3BSD#ls&GgBS^{`1h~%h%5b1q6fu-O9;;Ku1q+3r%g*+8SbQsDG-H z{{G&x+qa`rGMg#mh-MLUa}T_Ml(MCl=Pr}*Gy6ga=B~Sq-90Ux90(Zn(A>xx2U!XD2uF-L9d#9mDsYa<{N?cXjaJ7rrkfC~&_ko@--)4=X3GY^*%j zyMQ#H5wsaXmOnMf_yl{fUZwzRcY6xSdSf1Ez~;@iDv zxfE6&jZS*@<)o*ls-YkcTIS3(Z-*M z5n6U0Zobf3cXY4|4-E?ra}5smcX0GH_H?!jgHm>=t3&wyP$MZw9i$C4ZOr((*RUc& z>$A0EXfbIK5e@A?Jun7d)0mfo4U%z_y?f!$uSvd^y?YDn<2D>Pk zn-Xn^7W$@mQ}s}%U2d)+VSBU#Y|&VRoWxi`&ewMjKfKFZ*;YSy^{hS2;SLMm@8sqT z1)^Qf9>GCcI+{whP?)tKsA++VadQZFBWQ_8V04H|`W$>*fPGPDO<0zbM@Z$WJG{{5TJ5%-^wNLf!Fzx>lF*xrv|0{c)~*&IL* zcpb&Ve|hiS>#r9;G(kus71kP_eS6bfvWl?`x_I@Xi&Kz?y`7uO{-8jAe>*3aJvy3Z z`~pyN5jH`am{}=n8SL4&KYaflRUCn+M%0%l-g(dlgTR*nHBM>iNfB6=1?;$Dc(Y<$PgIxb|QnofG610>w@m3t%NSpu% z0(<^&aG;tB9uQCy<%#6nxOr;-OqVBy97B0hwR_4rGeMO*_f;9YZyWGmMk z#>P}r!-`*Q18FAnfs)(0CO<49Wrae^af!mXVlx0DsWw)XR#ldiSJl)tz~qCQH*Vj$ zdH=@!yT5z?`TI40B{dw>7Y&UG*0`OSb0)?XDj}AEdwl%QTn^RIm2Ik93EvyC`=Ya( zHW!j}o8QzxUE9n=KtyJxup{wqN^AZ)hY$kEE6#1H%OlBy5ES4wwZW+=YwPIP7+I;| zaELiUWA5|>R$ASh%!m{xA6fw;$jJ%4P;0A>+N$!3ipr|uDwvUgK-|6a?Q6uH8(&^r zIr#c=kdiV1n%v4Z7S_f%6I1BDndygwXoUuZhWLA34)!!&nGFx|xp2{2!wh9>EjGqw zqG8T6G@oD5mVW!&+b^GXzcZfa(Yhw7y8a0!ID)e{r) z;u3wj>I9sniaB1Tv$Y0_kX2RHwa`5Q|M>O8mrox*-TC1e7d8Ttmf z>Z|U%=R2sthZX#u>hC6PxM%deT492p&(nVgcD7!UIm??%k36Nv;} 
zRdap;3E-LM#PA`_j8R-hwnqBey2j8cCh`c%43$DX_FGGzW>58!nyJ2-nGvC_xTvtG zur$B0rn;`K8seM}pP};h;r`7R*S!K%l~nXq4AlT{347i|OUF=EH`F1(e^0<}$J5t# zIbOYZ&K_-Lsd4mAaT9ge(|kA~8P&wp$oR;l*yQZjY}$?Yqm$)oHV7UYORUsMat9e@K@)YaBi zKzx7e%h%6eZ`^!z`}YfbVf4A4iu%qtd7_btg}%O0NU*<8h`(p}xexCTT{(IDHY1(@4vYaEwwcj-Ol{s z=6Up_xeC9mIC}8o`&%~>qcV~s)AAB?-*jl}nA+F^$bhI))L7T8gCsWB>GS9r3h~;Y zM5UzVga!EoO?CB`zHQEpudQ@eex9jlP{yfu6*bjVRkgGeU^*N4v{ z-o1&q?rCAA1#L$wBV#KwQ*CWS<)B^e4j-PMdG+LT*K+rv3#ZQb+FEJ`1!x|-`qz2; zfOD4wkTRUj@5iTMwp=Xil8DnYpP}=a2l_R!dIH`bu412>}}e1=L(@0Wh8^w)KLA6>~;Q@1h7Z*8q_sBi9UnwgszpV*PBePl^o4c&I*@h5i) zE~2d(j)*6k+nC{%^|jT4-QAoHlFqRbsqqW#PY;}RF-3ui)jx9n;@Qidmwx~4imVK$ zWPeIdE;)rlVUx&7cU#+f8`W%xRygH`=F*DN9|iR$2%U~XBJW`<9u;d*%rQ|8c{vnL zwfy~JVg7Pa@xoYr4PFyp*U{L}+A`JH0$KIo(B$kC0ui*!+1}MR$VQ5z;Qp3}3?8qf zYz=OWps%m10ffYte#`xm(U;LtQ2O%S9xZ~pn-4b3^X!Fl9@kDQ9y|Isiti8iTNFli z9y^c4OisQ3v8=E?zu8Jn!%C;LZt%zALaBk7wO-9SP7Z6V)rr6y7vq*!MB3^Xyjtq1 zTFmclZz#4lGsZ)YqOqa5qpf8UY7jF}?Lr`cZQRTRFcMx4yT2?=zrL)gg9jU7t>mMv z77*s+>-<|fIk%4$(@^l#Ra?s}&{;|C^xuD9x4(Ab)b(o~ey0!p;Ys?QkyTd0V3HG8 zH{T>?7uV&NcIVf(7X4V9m>K@wt81lYu+@rC5K!;L$sOLgIKU&=zW(idamQpsMZEa+>e`>UbY|UnQ8G6_-&$2&cQbwLE0v!8_WrZ& z_D8QG?tJ`ucc>qRP%91hHsVdJEG-4GN2NIc0V~xzI$m5{Qe~A-WK#1o=>SG4=zksY z?eoVEAHPL*CZ-|~etUFu142SIbUoO`DKDSA`1ikh$se0)H#&Slo#TU%OLRG432L#MOYxBhU!aNr`sBG z;-cO>SX`SJ>a5$w*At1^M^2w~b=i9@9Lud%Qd(Y9Q&(76!~_&sQMIz-M;gFc)S*I?b6$qSAUC*4NfBO_6MCfdn69X6CcnKs zzkYvyrTo4tC5Zgu<_RBHdxyZw8lu{zH2@?iE-Wozu&H(2=0^5myMu#pT!uz!T7ZXv z{&t9+=Ppe(V^^HBBhCq59P#|do!gIIetq-#)r7&7H)o%w#qAA`4hY(R{kO{(B*f1; zUkG(KCaiyNUz|;dO(7*FJmFy|o&zd?z?e6 zKmDb{`Q3ZM4(h-Ic9%&@Uaj>D_eCN9Nrr0Uygg-?L4hb z9dQIa!Lq9?{?^?mwDi~gb1f+}rq5r4Hph;b85Y`D%3E6g!+-TcxVD&{fsRFX)XLh8 znGX*he!E#W-P8K%<*PU6{epKnhwMEX&7hMHV>+SX)Yw@tGC9z-v9ejaHJYDZNsj&U zm-i|EgU2u1`<(jo#G(EF`iOH6cXxtkO^S?4_47G&$j;y0p3N>Ufys&hBW{9R_BwJ zsnl<;vRG*mSs5?CTn-Gh-+%fti5$ab@pJ@VznesUd1tHd@vE7-=AUyD4;YzGk6u1{ z>HNP3!|nH7IpR)mHN%W&lM9P#=+v~B`MW_rqHVzMr4@_T>KK 
z+i~xKKb#u%=`0)O+qaPNqNz<#c6O#!Mcw1_GBr747%}9!mOnLb#p_)w1&O{)RGibE*^XhI-w`AE7Z?7h_U%@6jk$L^X}b`uhS^*q|krU{^GcB&Lup|aer{Yi9h`C99JNN zm6AdUYI1YQNsFG@6O6MZoW6AZ$^~h82p_TMPk9Lo{dVG7NJb8mL{Dmqib_b&OYBIC z$%u(g00A-pqR6DMo;-@DCw9ceL^dK2iHRxkk@1Z3YEo`(V>;lT+00yK?eh2WHj0ap zH-o6Ja_!7X0>&ja^~RGMpTEXYe7*OcalQBlkDj`Qj-isd1s?)!$>DqL(2;Y;r6r_+ zN?Pn+Z^{8{W0QlIe!u?LWN@^~& z;Mj~iwC6XTMRw&xLVY4OF)b~D5!;vwINnGAT+k!g3305*^lmb_hUDw*AAI z=`k^#5a&eE7y!P`1Kej`W>c*_ z9y%uH6y)&ae3MIKfn2unVQxX2kRJ< zO9e10U?<}fnTfgCS@gUtdSpxLivcO z`Sy*60Hxh~LE6$tL+hlydt{tT28_q12l?%?6Xn*+tIdh0|8Y5}Eus;{Jo@$%oQX!uf9m?B^TKCNN&dqd3UCNF zgrd2IzKu`lSr0}$gUU#V$=Q7P|f)Rz(85@TxsB?bNMtW4^SOhz`9%wmz#a;f)j(8zRBM(%^BH)6W8@<^Fk zH{eDB`SF8C4{pyteS7~&YL>Z&_b%T92M!#*cJ*IjnS;l~A(@arcGlWTUs@V~!sgbh zy4Y+wV9Mz+w1~Z-fyWLQ=4hD6>e2&YsOIL3_(n{6T2$%C9FKzdSU!b8j(HhZ(o+^!)mfhq)v?+*JdbQDE+#Qp$>_nprN*)*Zc#Z! zE;P={NE+_3!cCiJltdCaC4-WZ$jGK7WoBhDDK|6kGNYqk-lu0~(Q}?=KYtVP^z;1P z$5Zj7FVEQ7odN#trivF1yLc))g@!n}T?!|d>ASf7EhecWE@Y!gX7u%!^p)jMV)EW5 zB?Nix(eKq&SCrec=Ri{VdT&N`O&D~HFA~0eyEi|7`{j$* znYZ%jkE>l>0}M4oLOpl6_=JXq{&8FzuWC`1MJg;HS~yg&Vq2QylPlYkqUlkrtUcjp z@fMOqUd`*8;a7j_&WNo%rLTWb1TG~a zU=(d^PZaEa7Bwy|x=(6qwJV;LlhzYeUtAf-xbZrbHQB$svhnso(cIm~ckeuA-gwID z$ny%@?_-8HwJqF@cg5q3jCrwB+vOvhKj&8``&%o^>eh=2Co1YH^ZWX6Ui$=f<@p3K zJPKY{|ETJVEtHjCo@^_fKuF3?DW*`0=LJm+bab#{I#L~ty@Rt8&-Gt-lseB2_omD1WMS+3(3{y8?d zv@XppH@`a3+r-Nyxh|((*&<>@uu+lV;!tVt9;xaYs4ug%l+i{YTIrOeq^RWFs_I4> zyGMwts zdHk?uVq>GcvUPlQqM@U>7Fv}(jkQIy+rt^(MzYc*1u=KwW-$Pg2tBo z?&6;%Kdai`eVH7e8fq(8o6*hf1I9l(gLD>ek9KFDG)cc6}mZdUgq^fEu6b>i|4YZOx!SPhnFmr zU6YzdLqknP%h*s)OS74?1H?J(M0Ra%N?}=VQCG$2ME=aJ_wn^#a}&ReY<~xs<-qIo zj}rjv?!)W#bXVwTSenZ6aUm~WMoXGkbk~+tS2cCDm33D%&(;@ImlPH^O^spolyJr- z$}$4qXQ$^xbo6^#=I7%pRLqUg+@rnFT&ydpF3g`4YAfk%uc)xF&@~lal-%m>ni`&% zn{KIQ=Vs-(c!69$*GtN#dzKYMHmjB==L!>6HpjXatDk3oo*5V%fAsP8jhuJ=YyYpJ z^NwrsT;F(@f`m<&A!Lw{0c7ulsC7}R;zExLts)9q1Q7&8s-AP&+G9`kxLR$GyJ}sv zD$ZZC^Ja&H3^LhErZ_-r{XOTe{Fl%3d7t-v#(iDi>xK*(CCQ{K0Hx&2_0`o4ku|Ul zi{ui(@i4Mzh9b 
z2LGiWO&l{QT^fZ!OC!@5*o+x5FnDt4$BySuWyD1B}ys&WA-o;DjezN3~4Re<)1=X#RqQV8wuDE`=;8U5j zZKijTS-}(0Q8P){KFiyI;s2bt{mYqwLCq+E$qLOUFcv?&;XH9@$TiXHl`x65J9kw` zZtQDbQeKIO01XfK9%u+fQ47o_*=SZ0hDl@5j7kN>1-g-h2B5w9QQq-))9LitCL)F= z%2^GARc^d?;nLL$w^!b}x@`G|3s-Jky?yb*AD1s*d4BQJ`*T#6j=uT!+mC;Cc&$CC ziS~p6pBn0UE%&+f&U0sv9swq;Q$up6e)>{N%hiB+;OPDP=d+X6*H$#vo%-s`C-YYp z6|LMJy{)Ml3@kL5$kG{AlPOs)9!JaN0)0aZ0zY_sosmZ2&iat^>f=Z_xn)i+A$L_V z3|4sK{?UJZ_x00d>z7@-`t#`<01G+){K2jJe>`4RxIkhH^u0Xvp>yKZK&zdZ*NGz7 z18@F(V^A5i-ygqq^+A{CBpC z=a0h&PaXf}(C?Qv7F}Pr?&A5P!q1k@UdUQ>?e_W4mhw0rjW3`iQ!xcZkYth@LLfLT zzShCBKXrcDVe_^1_5SqZucy!b`%>$Tr#BC*CD0|=i|X$^y108;6>N9I=CbAGr6m=c zcUG2`>2U4tzCP2;nJn^mXjq9FW$WKoPyyKsKC$>o0O7GWn|}Y3(1iF;n|~K z4|M+dexmoplOv;Fe%EsE0cTe!Qb;F!QnB;vAM48N0IwffUA=W%@rLacyDDmel4kBG zD<#+oQ3(nC9SWlixTWawPfI>)#sjk}>7FAOV3*G{aWX5)< ziXoZQs5RLky+#9+W0_2@;&5pOrL8?%wLM4w)|L>xX9kZ$L}(E&hJ+@fSg^S6H?O8Tr(XG^9S;`GUC& z&BNqS@`-eSUeiSqF@vU~(->kg7uyoCb4Ud29&A37LpEt-8jVKfZuO{*8aCIgYBw`z z1_wU9O{t!qK*C`4)m1esmzB(!UAlQ|#ip&b&9xP^W$~LZ==3BK5r>S3AjYL+5wRjP z4<3;c!J{Ew!ch~`38I9?^3&UwB$35Pn^Li2e*GL(0EL3byqzA+pZEL9 z?Q>8E*pXC|`IaU)rV_C=C^-R@v3Gq!P9B*n2B12P2I|N(mVgeRxa=G%*4i#qGxc7G zE(ZP@P#n|yd@_yP?{L`%y-tTt=29_D2D3U~T7%G;ol`JaL*@KOPaZv-wRZWkDwme_Fe&gEDw{5JfK-KKu4^nGl zf~%r3;5-Ijz!Xx$(_eLB(kFXPJ~{pU!P^f^6TztX9<_F5Rdi$XJ`653yr~hJpB{8z zOEZbi6a$V6M5(dp0Ogf%8A1jo$ZKJYW(Z?~z&cMW08fiLx3SHWU_?Y%SikAwMl7A!BAtNMv;sJR>D5wKboLR0a#DlW4#RRVXpZq*9H~?GCuz{jGkNPj4Nx`vGIuZfleJ{mCV- zfKQ=pQ|YSJ6+6nSb|=I}B+m>9hG$fMk%CLY@>qe??hSY~O1I0}3eJ8mpT*H?Rm%M8par1QbMv0k?OV2O z-qBE#0nZ4^sz#u&X_07pJ^_v6v(ZM?bjQKpZ{K?M_}H&+KeWE5hRsZfHG36vKV#>o zhtxGRg}|^>JgK1}3|9U*3>mw<>5Gc68DYU;@gy8xBoS$q{1~V!LG>1kUY}lF-OGe}( zh4~o;5(y8BCELbN9)5c0@E_Oi{GcA}AD$RVOo#VN&)cjvt=ULV42Cy1#m>m5P#a6Z zdiOnO!{l1RzjQ(7JauRQ?V$5q?b?g>i; zQFO@o#8fH*9+D^XWUFm_CbF?HQzlYL_-wPr54^MPR);U(Rv4uQyUP{m9qx7OX$rr} zrSJ?7JM7?lEG;SdbbYaW-jYWP3Kwxiz{ZC&Xxcmfe(UP*&z?WH|Bv>el#!mnU_xYq ze&~|Vt&|yjdLy`LNkC>?7$prw7KjCi@Veb!)bFp3jjk-)6cf8AIsuhVrsO4)bM2Ea 
z1$`EWm`+ZO!YFA>Hj^#W%e@|8`hEdeT$j=0w173=KiqFOK?2VH^c)UQjTEA-L zr;9bRdH3(!y=fLmI0cRu-=4U6_19B3?j8E!r5~pq8~k6oT8^@>+vTxav|5wX>NHAq zBEDEGrqh!MG9fE9ArsjAD%Mri*K8_}%PoxsUp1%_gy(e*lDYafDLk2uo(W5rvc$lw zq+?2DHoIkT%o^~yA)U|L*JkPkWd@l{AyatW?W1E}r3wZk(#T9MS1E;fta&p1xyi#f zuAex5&HJ zOzhUrOAnMaBShOm>ubZ}lagT{piH(=8&21mnfRn&IHbg?r3$8iDIaR>>m9M!9BPBv z=~43az5QODOlmerrG7AvKBfT%b1Kvt1%s9`h8<7=RzQ49Awk91@3Y=oW zDb=yBMuQoO=_n+YY2q2E6f)a_0xP;oOjpqOz3+y7);5)zp#qeIwX>_;sf1XNTw@4W zhrB#C3}#^Kv?_V~*jL~G;sTi)AVlKuUuubQq^DmWl*o1VT<~8Fbo+gNo1;xe*QgZ= zKt;D1P2jGVOIt}50-RjM)MZD(qq5>s;mr+j4bR?KS92gTZpOQJ{E*jOtQ-mmqvNww zdY88=)#LN)>2ysyVW@Z5>ur^au=y;bL+ zkSEa2=78J=3RNB$8|tyE84R~8n z$I#D%wo8MjJ372QZeN?nJ2>6l>G!nxy$+LHsUnd}Dp5!TkwJ^}GGHTpzYhJX+3o^0rxtNTUTr>k3 zc{%D73Yc02lja>6fxx-&(Y1%S?%Zv;^5fChQkEXPNQ#_HqV=zylVhFT0aqVj*E)K= zUQfHrF*N;QpuN{_RGJH<&^jW>#Y#yOn8-Ybgu|6H`8X~f5wkx%eP(cE?wZx#{&!o* z;CmN7y*W5M4Tt5Xa5Tf;^w5-Qnc6pCZ#CC5{9b7RftH7aqi{5X>ve}kE)Y;kT2(?$q7a30Azm?w4)acNQ& zU;pRG2urUpDPMH74O$zLDK+sJmaqU5O~FyY9@N)v6N_1pUczd($w2+<+O417sabUQ z^zm*V2Nq)-{`1hGlc&C!?CR_UjHAzCbJ@IJuU^ivIh}o-H(Q=PQp*K=IfaNtW3e;2 zZEb2JLqW^QX0su!K!^)Zh{(h%pno;jo;`XOj>D!jCTGOdhsgXndUyX2#OInMZG&Sk zM&UD(w%6h0PIhJ%4VTkymyh;!N;w<`#DuhMZcxt|*NYAQ-ihhf7$m6`dO7m)*y-Vs z<6n>Wj=UNk?d-7{eSvtqHqxh7=$?nG2pPHbAP!i0Jl3i+)vL9U1| zz_4@y;H#bdphZS!()(V$dp|w(8la=&-wh8Ajt=#Lgxg+Ez-O15Om@#tg_{l6Z{550 zo6D*P2R{aqK*+~sWRoyB9wRRvpN6ATX~Z;+K*fm~AGIUWKiFIdczkL$fiN>95fne+ z=?4;ReqVBmnoXo6vsqmHUU=#D_>4p}I!hu{B(B)WG%J}(p;#bF;TsfS-$v}qV6}Ek zzy9|(2jBhm0VHT1>l>JOInm|rboY6Ce65hl^60l;ZY(aldHvqQy9S$EB^9fAsp;r+ zB9WrtQ6&KC literal 0 HcmV?d00001 From fa6100d9aff5b6adae98044e61cf1abbc6a18a0d Mon Sep 17 00:00:00 2001 From: Srihari-mcw Date: Tue, 24 Sep 2024 14:27:23 +0530 Subject: [PATCH 21/38] Add AVX2 flags wherever necessary --- src/modules/cpu/kernel/exclusive_or.hpp | 32 ++++++++++++++++++------- 1 file changed, 24 insertions(+), 8 deletions(-) diff --git a/src/modules/cpu/kernel/exclusive_or.hpp b/src/modules/cpu/kernel/exclusive_or.hpp index 
6f971e1a7..54a6c3bfe 100644 --- a/src/modules/cpu/kernel/exclusive_or.hpp +++ b/src/modules/cpu/kernel/exclusive_or.hpp @@ -64,9 +64,11 @@ RppStatus exclusive_or_u8_u8_host_tensor(Rpp8u *srcPtr1, srcPtr2Channel = srcPtr2Image + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier); dstPtrChannel = dstPtrImage; +#if __AVX2__ Rpp32u alignedLength = (bufferLength / 96) * 96; Rpp32u vectorIncrement = 96; Rpp32u vectorIncrementPerChannel = 32; +#endif // Exclusive OR with fused output-layout toggle (NHWC -> NCHW) if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NCHW)) @@ -88,6 +90,7 @@ RppStatus exclusive_or_u8_u8_host_tensor(Rpp8u *srcPtr1, dstPtrTempB = dstPtrRowB; int vectorLoopCount = 0; +#if __AVX2__ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) { __m256i p1[3], p2[3]; @@ -105,7 +108,7 @@ RppStatus exclusive_or_u8_u8_host_tensor(Rpp8u *srcPtr1, dstPtrTempG += vectorIncrementPerChannel; dstPtrTempB += vectorIncrementPerChannel; } - +#endif for (; vectorLoopCount < bufferLength; vectorLoopCount += 3) { *dstPtrTempR++ = srcPtr1Temp[0] ^ srcPtr2Temp[0]; @@ -148,6 +151,7 @@ RppStatus exclusive_or_u8_u8_host_tensor(Rpp8u *srcPtr1, dstPtrTemp = dstPtrRow; int vectorLoopCount = 0; +#if __AVX2__ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) { __m256i p1[3], p2[3]; @@ -167,7 +171,7 @@ RppStatus exclusive_or_u8_u8_host_tensor(Rpp8u *srcPtr1, srcPtr2TempB += vectorIncrementPerChannel; dstPtrTemp += vectorIncrement; } - +#endif for (; vectorLoopCount < bufferLength; vectorLoopCount++) { dstPtrTemp[0] = *srcPtr1TempR ^ *srcPtr2TempR; @@ -220,6 +224,7 @@ RppStatus exclusive_or_u8_u8_host_tensor(Rpp8u *srcPtr1, dstPtrTempB = dstPtrRowB; int vectorLoopCount = 0; +#if __AVX2__ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) { __m256i p1[3], p2[3]; @@ -241,7 +246,7 @@ 
RppStatus exclusive_or_u8_u8_host_tensor(Rpp8u *srcPtr1, dstPtrTempG += vectorIncrementPerChannel; dstPtrTempB += vectorIncrementPerChannel; } - +#endif for (; vectorLoopCount < bufferLength; vectorLoopCount++) { *dstPtrTempR = *srcPtr1TempR ^ *srcPtr2TempR; @@ -274,7 +279,9 @@ RppStatus exclusive_or_u8_u8_host_tensor(Rpp8u *srcPtr1, // Exclusive OR without fused output-layout toggle (NHWC -> NHWC or NCHW -> NCHW for 1 channel) else { +#if __AVX2__ alignedLength = bufferLength & ~31; +#endif Rpp8u *srcPtr1Row, *srcPtr2Row, *dstPtrRow; srcPtr1Row = srcPtr1Channel; @@ -289,6 +296,7 @@ RppStatus exclusive_or_u8_u8_host_tensor(Rpp8u *srcPtr1, dstPtrTemp = dstPtrRow; int vectorLoopCount = 0; +#if __AVX2__ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) { __m256i p1, p2; @@ -302,7 +310,7 @@ RppStatus exclusive_or_u8_u8_host_tensor(Rpp8u *srcPtr1, srcPtr2Temp += vectorIncrementPerChannel; dstPtrTemp += vectorIncrementPerChannel; } - +#endif for (; vectorLoopCount < bufferLength; vectorLoopCount++) { *dstPtrTemp++ = *srcPtr1Temp ^ *srcPtr2Temp; @@ -931,9 +939,11 @@ RppStatus exclusive_or_i8_i8_host_tensor(Rpp8s *srcPtr1, srcPtr2Channel = srcPtr2Image + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier); dstPtrChannel = dstPtrImage; +#if __AVX2__ Rpp32u alignedLength = (bufferLength / 96) * 96; Rpp32u vectorIncrement = 96; Rpp32u vectorIncrementPerChannel = 32; +#endif // Exclusive OR with fused output-layout toggle (NHWC -> NCHW) if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NCHW)) @@ -955,6 +965,7 @@ RppStatus exclusive_or_i8_i8_host_tensor(Rpp8s *srcPtr1, dstPtrTempB = dstPtrRowB; int vectorLoopCount = 0; +#if __AVX2__ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) { __m256i p1[3], p2[3]; @@ -972,7 +983,7 @@ RppStatus exclusive_or_i8_i8_host_tensor(Rpp8s *srcPtr1, dstPtrTempG += 
vectorIncrementPerChannel; dstPtrTempB += vectorIncrementPerChannel; } - +#endif for (; vectorLoopCount < bufferLength; vectorLoopCount += 3) { *dstPtrTempR++ = static_cast(RPPPIXELCHECKI8(((srcPtr1Temp[0] + 128) ^ (srcPtr2Temp[0] + 128)) - 128)); @@ -1015,7 +1026,7 @@ RppStatus exclusive_or_i8_i8_host_tensor(Rpp8s *srcPtr1, dstPtrTemp = dstPtrRow; int vectorLoopCount = 0; - +#if __AVX2__ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) { __m256i p1[3], p2[3]; @@ -1035,6 +1046,7 @@ RppStatus exclusive_or_i8_i8_host_tensor(Rpp8s *srcPtr1, srcPtr2TempB += vectorIncrementPerChannel; dstPtrTemp += vectorIncrement; } +#endif for (; vectorLoopCount < bufferLength; vectorLoopCount++) { dstPtrTemp[0] = static_cast(RPPPIXELCHECKI8(((static_cast((*srcPtr1TempR + 128) ^ static_cast(*srcPtr2TempR + 128)))) - 128)); @@ -1087,6 +1099,7 @@ RppStatus exclusive_or_i8_i8_host_tensor(Rpp8s *srcPtr1, dstPtrTempB = dstPtrRowB; int vectorLoopCount = 0; +#if __AVX2__ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) { __m256i p1[3], p2[3]; @@ -1108,7 +1121,7 @@ RppStatus exclusive_or_i8_i8_host_tensor(Rpp8s *srcPtr1, dstPtrTempG += vectorIncrementPerChannel; dstPtrTempB += vectorIncrementPerChannel; } - +#endif for (; vectorLoopCount < bufferLength; vectorLoopCount++) { *dstPtrTempR = static_cast(RPPPIXELCHECKI8(((static_cast((*srcPtr1TempR + 128) ^ static_cast(*srcPtr2TempR + 128)))) - 128)); @@ -1141,7 +1154,9 @@ RppStatus exclusive_or_i8_i8_host_tensor(Rpp8s *srcPtr1, // Exclusive OR without fused output-layout toggle (NHWC -> NHWC or NCHW -> NCHW for 1 channel) else { +#if __AVX2__ alignedLength = bufferLength & ~31; +#endif Rpp8s *srcPtr1Row, *srcPtr2Row, *dstPtrRow; srcPtr1Row = srcPtr1Channel; @@ -1156,7 +1171,7 @@ RppStatus exclusive_or_i8_i8_host_tensor(Rpp8s *srcPtr1, dstPtrTemp = dstPtrRow; int vectorLoopCount = 0; - +#if __AVX2__ for (; vectorLoopCount < alignedLength; vectorLoopCount += 
vectorIncrementPerChannel) { __m256i p1, p2; @@ -1170,6 +1185,7 @@ RppStatus exclusive_or_i8_i8_host_tensor(Rpp8s *srcPtr1, srcPtr2Temp += vectorIncrementPerChannel; dstPtrTemp += vectorIncrementPerChannel; } +#endif for (; vectorLoopCount < bufferLength; vectorLoopCount++) { *dstPtrTemp++ = static_cast(RPPPIXELCHECKI8(((static_cast((*srcPtr1Temp + 128) ^ static_cast(*srcPtr2Temp + 128)))) - 128)); From 700c507d9ea663b2d70181a3888e1e6fad26cb3b Mon Sep 17 00:00:00 2001 From: Srihari-mcw Date: Tue, 24 Sep 2024 15:21:56 +0530 Subject: [PATCH 22/38] Update the code to have updated F16 load functions --- src/include/cpu/rpp_cpu_simd.hpp | 30 +++++++++++++++++ src/modules/cpu/kernel/exclusive_or.hpp | 43 ++++--------------------- 2 files changed, 37 insertions(+), 36 deletions(-) diff --git a/src/include/cpu/rpp_cpu_simd.hpp b/src/include/cpu/rpp_cpu_simd.hpp index b4cf0721e..8cd0fc592 100644 --- a/src/include/cpu/rpp_cpu_simd.hpp +++ b/src/include/cpu/rpp_cpu_simd.hpp @@ -1651,6 +1651,24 @@ inline void rpp_store24_f32pln3_to_f32pln3_avx(Rpp32f *dstPtrR, Rpp32f *dstPtrG, _mm256_storeu_ps(dstPtrB, p[2]); } +inline void rpp_load24_f16pkd3_to_f32pln3_avx(Rpp16f *srcPtr, __m256 *p) +{ + __m128 p128[8]; + p128[0] = _mm_cvtph_ps(_mm_castps_si128(_mm_loadu_ps(reinterpret_cast(srcPtr)))); + p128[1] = _mm_cvtph_ps(_mm_castps_si128(_mm_loadu_ps(reinterpret_cast(srcPtr + 3)))); + p128[2] = _mm_cvtph_ps(_mm_castps_si128(_mm_loadu_ps(reinterpret_cast(srcPtr + 6)))); + p128[3] = _mm_cvtph_ps(_mm_castps_si128(_mm_loadu_ps(reinterpret_cast(srcPtr + 9)))); + _MM_TRANSPOSE4_PS(p128[0], p128[1], p128[2], p128[3]); + p128[4] = _mm_cvtph_ps(_mm_castps_si128(_mm_loadu_ps(reinterpret_cast(srcPtr + 12)))); + p128[5] = _mm_cvtph_ps(_mm_castps_si128(_mm_loadu_ps(reinterpret_cast(srcPtr + 15)))); + p128[6] = _mm_cvtph_ps(_mm_castps_si128(_mm_loadu_ps(reinterpret_cast(srcPtr + 18)))); + p128[7] = _mm_cvtph_ps(_mm_castps_si128(_mm_loadu_ps(reinterpret_cast(srcPtr + 21)))); + 
_MM_TRANSPOSE4_PS(p128[4], p128[5], p128[6], p128[7]); + p[0] = _mm256_setr_m128(p128[0], p128[4]); + p[1] = _mm256_setr_m128(p128[1], p128[5]); + p[2] = _mm256_setr_m128(p128[2], p128[6]); +} + inline void rpp_load24_f32pkd3_to_f64pln3_avx(Rpp32f *srcPtr, __m256d *p) { __m128 p128[8]; @@ -1724,6 +1742,13 @@ inline void rpp_store24_f32pln3_to_f32pkd3_avx(Rpp32f *dstPtr, __m256 *p) _mm_storeu_ps(dstPtr + 21, p128[3]); } +inline void rpp_load24_f16pln3_to_f32pln3_avx(Rpp16f *srcPtrR, Rpp16f *srcPtrG, Rpp16f *srcPtrB, __m256 *p) +{ + p[0] = _mm256_cvtph_ps(_mm_castps_si128(_mm_loadu_ps(reinterpret_cast(srcPtrR)))); + p[1] = _mm256_cvtph_ps(_mm_castps_si128(_mm_loadu_ps(reinterpret_cast(srcPtrG)))); + p[2] = _mm256_cvtph_ps(_mm_castps_si128(_mm_loadu_ps(reinterpret_cast(srcPtrB)))); +} + inline void rpp_load24_f32pln3_to_f64pln3_avx(Rpp32f *srcPtrR, Rpp32f *srcPtrG, Rpp32f *srcPtrB, __m256d *p) { __m128 px128[6]; @@ -1784,6 +1809,11 @@ inline void rpp_store8_f32_to_f32_avx(Rpp32f *dstPtr, __m256 *p) _mm256_storeu_ps(dstPtr, p[0]); } +inline void rpp_load8_f16_to_f32_avx(Rpp16f *srcPtr, __m256 *p) +{ + p[0] = _mm256_cvtph_ps(_mm_castps_si128(_mm_loadu_ps(reinterpret_cast(srcPtr)))); +} + inline void rpp_load8_f32_to_f64_avx(Rpp32f *srcPtr, __m256d *p) { __m128 px128[2]; diff --git a/src/modules/cpu/kernel/exclusive_or.hpp b/src/modules/cpu/kernel/exclusive_or.hpp index 54a6c3bfe..9d6efea8c 100644 --- a/src/modules/cpu/kernel/exclusive_or.hpp +++ b/src/modules/cpu/kernel/exclusive_or.hpp @@ -706,18 +706,10 @@ RppStatus exclusive_or_f16_f16_host_tensor(Rpp16f *srcPtr1, #if __AVX2__ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) { - Rpp32f srcPtr1Temp_ps[24], srcPtr2Temp_ps[24]; - - for (int cnt = 0; cnt < vectorIncrement; cnt++) - { - srcPtr1Temp_ps[cnt] = static_cast(srcPtr1Temp[cnt]); - srcPtr2Temp_ps[cnt] = static_cast(srcPtr2Temp[cnt]); - } - __m256 p1[3], p2[3]; - rpp_simd_load(rpp_load24_f32pkd3_to_f32pln3_avx, srcPtr1Temp_ps, p1); // 
simd loads - rpp_simd_load(rpp_load24_f32pkd3_to_f32pln3_avx, srcPtr2Temp_ps, p2); // simd loads + rpp_simd_load(rpp_load24_f16pkd3_to_f32pln3_avx, srcPtr1Temp, p1); // simd loads + rpp_simd_load(rpp_load24_f16pkd3_to_f32pln3_avx, srcPtr2Temp, p2); // simd loads p1[0] = _mm256_cvtepi32_ps(_mm256_xor_si256(_mm256_cvttps_epi32(_mm256_mul_ps(p1[0], avx_p255)), _mm256_cvttps_epi32(_mm256_mul_ps(p2[0], avx_p255)))); // exclusive_or computation p1[1] = _mm256_cvtepi32_ps(_mm256_xor_si256(_mm256_cvttps_epi32(_mm256_mul_ps(p1[1], avx_p255)), _mm256_cvttps_epi32(_mm256_mul_ps(p2[1], avx_p255)))); // exclusive_or computation p1[2] = _mm256_cvtepi32_ps(_mm256_xor_si256(_mm256_cvttps_epi32(_mm256_mul_ps(p1[2], avx_p255)), _mm256_cvttps_epi32(_mm256_mul_ps(p2[2], avx_p255)))); // exclusive_or computation @@ -778,23 +770,10 @@ RppStatus exclusive_or_f16_f16_host_tensor(Rpp16f *srcPtr1, #if __AVX2__ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) { - Rpp32f srcPtr1Temp_ps[24], srcPtr2Temp_ps[24]; - - for (int cnt = 0; cnt < vectorIncrementPerChannel; cnt++) - { - srcPtr1Temp_ps[cnt] = static_cast(srcPtr1TempR[cnt]); - srcPtr1Temp_ps[cnt + 8] = static_cast(srcPtr1TempG[cnt]); - srcPtr1Temp_ps[cnt + 16] = static_cast(srcPtr1TempB[cnt]); - - srcPtr2Temp_ps[cnt] = static_cast(srcPtr2TempR[cnt]); - srcPtr2Temp_ps[cnt + 8] = static_cast(srcPtr2TempG[cnt]); - srcPtr2Temp_ps[cnt + 16] = static_cast(srcPtr2TempB[cnt]); - } - - __m256 p1[4], p2[4]; + __m256 p1[3], p2[3]; - rpp_simd_load(rpp_load24_f32pln3_to_f32pln3_avx, srcPtr1Temp_ps, srcPtr1Temp_ps + 8, srcPtr1Temp_ps + 16, p1); // simd loads - rpp_simd_load(rpp_load24_f32pln3_to_f32pln3_avx, srcPtr2Temp_ps, srcPtr2Temp_ps + 8, srcPtr2Temp_ps + 16, p2); // simd loads + rpp_simd_load(rpp_load24_f16pln3_to_f32pln3_avx, srcPtr1TempR, srcPtr1TempG, srcPtr1TempB, p1); // simd loads + rpp_simd_load(rpp_load24_f16pln3_to_f32pln3_avx, srcPtr2TempR, srcPtr2TempG, srcPtr2TempB, p2); // simd loads p1[0] = 
_mm256_cvtepi32_ps(_mm256_xor_si256(_mm256_cvttps_epi32(_mm256_mul_ps(p1[0], avx_p255)), _mm256_cvttps_epi32(_mm256_mul_ps(p2[0], avx_p255)))); // exclusive_or computation p1[1] = _mm256_cvtepi32_ps(_mm256_xor_si256(_mm256_cvttps_epi32(_mm256_mul_ps(p1[1], avx_p255)), _mm256_cvttps_epi32(_mm256_mul_ps(p2[1], avx_p255)))); // exclusive_or computation p1[2] = _mm256_cvtepi32_ps(_mm256_xor_si256(_mm256_cvttps_epi32(_mm256_mul_ps(p1[2], avx_p255)), _mm256_cvttps_epi32(_mm256_mul_ps(p2[2], avx_p255)))); // exclusive_or computation @@ -862,18 +841,10 @@ RppStatus exclusive_or_f16_f16_host_tensor(Rpp16f *srcPtr1, #if __AVX2__ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) { - Rpp32f srcPtr1Temp_ps[8], srcPtr2Temp_ps[8]; - - for (int cnt = 0; cnt < vectorIncrementPerChannel; cnt++) - { - srcPtr1Temp_ps[cnt] = static_cast(srcPtr1Temp[cnt]); - srcPtr2Temp_ps[cnt] = static_cast(srcPtr2Temp[cnt]); - } - __m256 p1[1], p2[1]; - rpp_simd_load(rpp_load8_f32_to_f32_avx, srcPtr1Temp_ps, p1); // simd loads - rpp_simd_load(rpp_load8_f32_to_f32_avx, srcPtr2Temp_ps, p2); // simd loads + rpp_simd_load(rpp_load8_f16_to_f32_avx, srcPtr1Temp, p1); // simd loads + rpp_simd_load(rpp_load8_f16_to_f32_avx, srcPtr2Temp, p2); // simd loads p1[0] = _mm256_cvtepi32_ps(_mm256_xor_si256(_mm256_cvttps_epi32(_mm256_mul_ps(p1[0], avx_p255)), _mm256_cvttps_epi32(_mm256_mul_ps(p2[0], avx_p255)))); // exclusive_or computation p1[0] = _mm256_mul_ps(p1[0], avx_p1op255); rpp_simd_store(rpp_store8_f32_to_f16_avx, dstPtrTemp, p1); // simd stores From 2823e4b0c748ada325f8c23cb155a8cf96530f3c Mon Sep 17 00:00:00 2001 From: Srihari-mcw Date: Tue, 24 Sep 2024 15:27:19 +0530 Subject: [PATCH 23/38] HIP Code Updates --- src/modules/hip/kernel/exclusive_or.hpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/modules/hip/kernel/exclusive_or.hpp b/src/modules/hip/kernel/exclusive_or.hpp index 42f11e542..bd5c5cd5f 100644 --- 
a/src/modules/hip/kernel/exclusive_or.hpp +++ b/src/modules/hip/kernel/exclusive_or.hpp @@ -229,7 +229,6 @@ RppStatus hip_exec_exclusive_or_tensor(T *srcPtr1, } else if ((srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NHWC)) { - globalThreads_x = (srcDescPtr->strides.hStride + 7) >> 3; hipLaunchKernelGGL(exclusive_or_pln3_pkd3_hip_tensor, dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), @@ -245,4 +244,4 @@ RppStatus hip_exec_exclusive_or_tensor(T *srcPtr1, } return RPP_SUCCESS; -} \ No newline at end of file +} From f0732b0f012bc4b60172e7de27c0ef394eddbb04 Mon Sep 17 00:00:00 2001 From: Srihari-mcw Date: Tue, 24 Sep 2024 18:22:29 +0530 Subject: [PATCH 24/38] F16 PLN3 to PLN3 Updates --- src/modules/cpu/kernel/exclusive_or.hpp | 91 ++++++++++++++++++++++++- 1 file changed, 88 insertions(+), 3 deletions(-) diff --git a/src/modules/cpu/kernel/exclusive_or.hpp b/src/modules/cpu/kernel/exclusive_or.hpp index 9d6efea8c..1cdd826cd 100644 --- a/src/modules/cpu/kernel/exclusive_or.hpp +++ b/src/modules/cpu/kernel/exclusive_or.hpp @@ -561,9 +561,9 @@ RppStatus exclusive_or_f32_f32_host_tensor(Rpp32f *srcPtr1, #endif for (; vectorLoopCount < bufferLength; vectorLoopCount++) { - *dstPtrTempR++ = RPPPIXELCHECKF32((float)((uint)(*srcPtr1TempR * 255) ^ (uint)(*srcPtr2TempR * 255)) / 255); - *dstPtrTempG++ = RPPPIXELCHECKF32((float)((uint)(*srcPtr1TempG * 255) ^ (uint)(*srcPtr2TempG * 255)) / 255); - *dstPtrTempB++ = RPPPIXELCHECKF32((float)((uint)(*srcPtr1TempB * 255) ^ (uint)(*srcPtr2TempB * 255)) / 255); + *dstPtrTempR = RPPPIXELCHECKF32((float)((uint)(*srcPtr1TempR * 255) ^ (uint)(*srcPtr2TempR * 255)) / 255); + *dstPtrTempG = RPPPIXELCHECKF32((float)((uint)(*srcPtr1TempG * 255) ^ (uint)(*srcPtr2TempG * 255)) / 255); + *dstPtrTempB = RPPPIXELCHECKF32((float)((uint)(*srcPtr1TempB * 255) ^ 
(uint)(*srcPtr2TempB * 255)) / 255); srcPtr1TempR++; srcPtr1TempG++; @@ -571,6 +571,9 @@ RppStatus exclusive_or_f32_f32_host_tensor(Rpp32f *srcPtr1, srcPtr2TempR++; srcPtr2TempG++; srcPtr2TempB++; + dstPtrTempR++; + dstPtrTempG++; + dstPtrTempB++; } srcPtr1RowR += srcDescPtr->strides.hStride; @@ -816,6 +819,88 @@ RppStatus exclusive_or_f16_f16_host_tensor(Rpp16f *srcPtr1, } } + // Exclusive OR without fused output-layout toggle (NCHW -> NCHW for 3 channel) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NCHW)) + { + Rpp16f *srcPtr1RowR, *srcPtr1RowG, *srcPtr1RowB, *srcPtr2RowR, *srcPtr2RowG, *srcPtr2RowB, *dstPtrRowR, *dstPtrRowG, *dstPtrRowB; + srcPtr1RowR = srcPtr1Channel; + srcPtr1RowG = srcPtr1RowR + srcDescPtr->strides.cStride; + srcPtr1RowB = srcPtr1RowG + srcDescPtr->strides.cStride; + srcPtr2RowR = srcPtr2Channel; + srcPtr2RowG = srcPtr2RowR + srcDescPtr->strides.cStride; + srcPtr2RowB = srcPtr2RowG + srcDescPtr->strides.cStride; + dstPtrRowR = dstPtrChannel; + dstPtrRowG = dstPtrRowR + dstDescPtr->strides.cStride; + dstPtrRowB = dstPtrRowG + dstDescPtr->strides.cStride; + for (int i = 0; i < roi.xywhROI.roiHeight; i++) + { + Rpp16f *srcPtr1TempR, *srcPtr1TempG, *srcPtr1TempB, *srcPtr2TempR, *srcPtr2TempG, *srcPtr2TempB, *dstPtrTempR, *dstPtrTempG, *dstPtrTempB; + srcPtr1TempR = srcPtr1RowR; + srcPtr1TempG = srcPtr1RowG; + srcPtr1TempB = srcPtr1RowB; + srcPtr2TempR = srcPtr2RowR; + srcPtr2TempG = srcPtr2RowG; + srcPtr2TempB = srcPtr2RowB; + dstPtrTempR = dstPtrRowR; + dstPtrTempG = dstPtrRowG; + dstPtrTempB = dstPtrRowB; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + __m256 p1[3], p2[3]; + + rpp_simd_load(rpp_load24_f16pln3_to_f32pln3_avx, srcPtr1TempR, srcPtr1TempG, srcPtr1TempB, p1); // simd loads + rpp_simd_load(rpp_load24_f16pln3_to_f32pln3_avx, srcPtr2TempR, srcPtr2TempG, srcPtr2TempB, p2); // simd 
loads + p1[0] = _mm256_cvtepi32_ps(_mm256_xor_si256(_mm256_cvttps_epi32(_mm256_mul_ps(p1[0], avx_p255)), _mm256_cvttps_epi32(_mm256_mul_ps(p2[0], avx_p255)))); // exclusive_or computation + p1[1] = _mm256_cvtepi32_ps(_mm256_xor_si256(_mm256_cvttps_epi32(_mm256_mul_ps(p1[1], avx_p255)), _mm256_cvttps_epi32(_mm256_mul_ps(p2[1], avx_p255)))); // exclusive_or computation + p1[2] = _mm256_cvtepi32_ps(_mm256_xor_si256(_mm256_cvttps_epi32(_mm256_mul_ps(p1[2], avx_p255)), _mm256_cvttps_epi32(_mm256_mul_ps(p2[2], avx_p255)))); // exclusive_or computation + p1[0] = _mm256_mul_ps(p1[0], avx_p1op255); + p1[1] = _mm256_mul_ps(p1[1], avx_p1op255); + p1[2] = _mm256_mul_ps(p1[2], avx_p1op255); + rpp_simd_store(rpp_store24_f32pln3_to_f16pln3_avx, dstPtrTempR, dstPtrTempG, dstPtrTempB, p1); + + srcPtr1TempR += vectorIncrementPerChannel; + srcPtr1TempG += vectorIncrementPerChannel; + srcPtr1TempB += vectorIncrementPerChannel; + srcPtr2TempR += vectorIncrementPerChannel; + srcPtr2TempG += vectorIncrementPerChannel; + srcPtr2TempB += vectorIncrementPerChannel; + dstPtrTempR += vectorIncrementPerChannel; + dstPtrTempG += vectorIncrementPerChannel; + dstPtrTempB += vectorIncrementPerChannel; + } +#endif + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + *dstPtrTempR = static_cast(RPPPIXELCHECKF32((float)((uint)(*srcPtr1TempR * 255) ^ (uint)(*srcPtr2TempR * 255)) / 255)); + *dstPtrTempG = static_cast(RPPPIXELCHECKF32((float)((uint)(*srcPtr1TempG * 255) ^ (uint)(*srcPtr2TempG * 255)) / 255)); + *dstPtrTempB = static_cast(RPPPIXELCHECKF32((float)((uint)(*srcPtr1TempB * 255) ^ (uint)(*srcPtr2TempB * 255)) / 255)); + + srcPtr1TempR++; + srcPtr1TempG++; + srcPtr1TempB++; + srcPtr2TempR++; + srcPtr2TempG++; + srcPtr2TempB++; + dstPtrTempR++; + dstPtrTempG++; + dstPtrTempB++; + } + + srcPtr1RowR += srcDescPtr->strides.hStride; + srcPtr1RowG += srcDescPtr->strides.hStride; + srcPtr1RowB += srcDescPtr->strides.hStride; + srcPtr2RowR += srcDescPtr->strides.hStride; + srcPtr2RowG += 
srcDescPtr->strides.hStride; + srcPtr2RowB += srcDescPtr->strides.hStride; + dstPtrRowR += dstDescPtr->strides.hStride; + dstPtrRowG += dstDescPtr->strides.hStride; + dstPtrRowB += dstDescPtr->strides.hStride; + } + } + // Exclusive OR without fused output-layout toggle (NHWC -> NHWC or NCHW -> NCHW) else { From 91311c3c914179415d1edf610346c1fd98760508 Mon Sep 17 00:00:00 2001 From: Srihari-mcw Date: Mon, 21 Oct 2024 22:25:16 +0530 Subject: [PATCH 25/38] Update outputs --- ...gical_operations_exclusive_or_img150x150.png | Bin 0 -> 24339 bytes 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 docs/data/doxygenOutputs/logical_operations_exclusive_or_img150x150.png diff --git a/docs/data/doxygenOutputs/logical_operations_exclusive_or_img150x150.png b/docs/data/doxygenOutputs/logical_operations_exclusive_or_img150x150.png new file mode 100644 index 0000000000000000000000000000000000000000..78ec2feca15d28e6f126f6501a82ae0670fb1fe4 GIT binary patch literal 24339 zcmbTdWl$VX)IPcd_uv{JxVvitL6!{=+#$HTEwD&%CwTA>ECdU(xC9RnAh_Eu5_}gD zSi(a7{NA_f-Y@sVy|<^jXQrlS>O9qb`t*6unSU$)b^uQ`)qrXM3=9AOB<9GJ=n`S%S#h6l(6-AH~PSd=wh{QR2tmVUN!NSY+7b&qZG1Q0Us=vieYpzRM`XV^eM# zpwgQ~u#4IHM&T1s)6mke|$f~?jRa4ghYU&#p8X23In%UVqI667I zxcd1A1O|bFL!x71F2E2&%rPXtjlPb(9~{ZYX5`RJbtF-CrwY?f6xKX!LW9 z%>>II(Oi+5X|O&u)IG9#S0MegygEI;A1bjtq(lhTTh?hqD#iKZ8(C8hQwt0H=D)8E zOFcZVN?m$!N>6h|tIoLl4d~8i!FM-$C!wj3!*UXfeJ9*vFl;1atIl7+#TNx;O zJ7MnoUS}fsSm_JGf-oEII}Q^8$K%&VM+PR(BkJU$w z_C(Y|6j7CBDzz*5dJ)aqr|Ar3xAtnOxf&}t>zyX#4HLc%FVUU%rTsg#{jFmI))X#% zbg0eGCyM+CJyq6Cy0gC7MjpQBMrrDqWYWQVFJ8rIH4O*4V^aMwKE3z{z_!27dXND} zO%7}^Bim|r&mv3~!Y8YpOL*pcs(Q1W?O8Q`d-S6HBeA*7qDI@3!a9FBpsY)$ui)bpu9)HxN9UP8ZT5UDmpzbfT2y>`L$xR z4gEX$O3eymNq@Y^k0VIVJ>yzLb*M9#DkXZW3U-)<&Fj0>bS;WUO!EyQn__g^df?Y% z{{T@yG&!n%m!Xs#f`{P9&4G0TtGd%)=i@;a&_Be!)rvTum9pK0N~({JVrceQE2Eg< z)zJ|By5kV8Sb4WkZZxt2%NY$TE?q_XGu6P(<;i?ITU+-rmbGt;<^KTw@b-t*q`xL> zGAwB?d7Q6br=T$6St!tdji0%1r-^Q{RT{4u6mws)SITsJUgfcpU$b=GY*?%)l~_I4 
zE0F+4$4v7W#VOVmD_U)CEM?!P=>9B%{UReW|YD8PXy zua*TQtXr&)8q{-RbGAd?mC$8wXwbE80auOU)P&|Sp6J+d+`%g@Ko?^Asb#c>|ah6*-xU89>Lw)FHU zpCE_U(4T0aU#2wz#TGPrhWn}?$yDg@(G*oOZ#?h*yB~W66}fG-sW8=Bm&cIkL)8>3 z!2qPIa!C0m=(NR}Bvtr(1=&wrhvg$PDoFXMn%-+|F{<^DgEFgV{*7<8v0_h6yBv30 zgq?R|qKq+Z2kbJWmN3oRYMzuz`btx7f-yJd3`nLxbl<96nG!tfPYf}#>5$Z(?0d+xv0dx(|+cKq3 zS);?T{Kk3s=!&05nze`oS8nwq$C;cLCxx-Y7hm#V7z zNjb)Px|OtkSKK$#rId$H4Gy`u)&8=dQI-mIGR>S`3fL|StOFk$tRd{Eupk>BuX2~s zxzn``K4Y|=BBpA|75W|gu7q^4PyUycOe@kl)?eqT4)&jvzYEP4CP%&&qq%w5 zKYUu-F)GOM)r?;FW5rSNQi*gkfVmGe!L_ z!s!gt&2HE0X?w{@ZA*fG;;i>^6>!h7{e2A^iKa+;BBn+YX`^k&*F}Nt*L@Z*?jiy> zcknkzv31@=C@P%XB4_^y5ge6}HO<;0pXEyiCer%(M&=p<5aw*`#=ol3(2zIlFaGg!I)#XHe(?0+|e3}VN zR8{FJ4%a7YB*=EcxvyZ@Z?(F1IiXWCv+Si%KPV9b1Bc}4Vrxc<6jDuqDzK;mNOPg? z{d(VW9-mo7p-V+w*mxtF2yuUY_DZnG@u_ELmR`I5L389&z8JeqeA>5wHAPSGB6Y5k zlcmtmphSCuLxX*ccpeb|yC_QCCkEFwWv7Y%Y^<-Xps3O-rJj5zUL}^ruY@*W%)+iz z^XDAewAy8Ye&qE8K!pKVR9)q(9rh1!Z+Od$g|ZyD4Rlyho%7{P)n~7DWm*c@Zx&^_h_iPM5oxxOy%d~MXYJorN z2b;!lCNtcI(WmNQW|F>9^+~_kZb~4IXnowsXbHzesetrExvNkya9X~(FHy@(z;#<< zasE;?Tz*p6gtx7*&?}arnbJPnWa*!l6yLecX`Z` z8Ae@+INY(0*B>1Jcx-LIa4z8{E%*mBfckd{=Zh=z3M;1xNgDP+h95(xdcDcl#g9fS z9L;+FSy9|If*P9s6xFWW+Z%`HrLWP`rg3gYd6{WMvHrsl4Sh98^;)gk6ZT^y)Wn)x zngU-{@7lX+DNVGTu65~Xd{o=)kC(MT-XCiM@b+)B@2CjUs1f_502|7x{rFh(>B)jc zE-oJR0(AX4$hl34WN+c(jMe$jIN5undx6R>-RkAZ-{mee$ug7*QBoZ0=hoC9V8)VG zmvCoJHXapj`;7!lXBUZUExyDr1l1Sm?})Z$S#p4ApjX+QcD5iq~Clv6r4kw`U6Yg}T^oaqo#ohgm0P}KG`koo#YXvxxm0bePE?+5Hk6o0 znP?h$@z~`|4LkCzN$)r!tyiqCfnCvXI#l$X8j9rbHR2y2?qi1}6CKhjgCEyofnNA? 
zy@wx#SDQh`_74ZbgM<{%2a9gS&USjz_}~7Iy@#gAiTQ@w^k$ecJ~)f|itZ@sA7_#V!;RyO*o1b2^-`T^wc7;W#F{A6WEM-;D7zZo+Te5PWQ?>H zWp{oG_ZWVlK{=H=8H)V;))3r1ZFO2^D9xGan^I&)^S$%aLZLRVdCN~o5HhyMR03KW zZ6i?ba!n)Izj|Xd`D7v{Ge+Ji)I*MuY+v!^IK$|c?A(Q}d4fdpJxHIX^N!qls%Da7 zX&l40Z1PF8&osuUDT*P@cQ`1uW+e1E)_V)`7?E?4MKD7I5!v#S%{rRgWxkLvWt4FS zZ}H{SaXtMqvZlq-{6-=QX%%@Uyy>!$0WQCgqXue2jPuYo{G$PPUz!sZeu$<+^#>9x|{WreLQ6Qt&eB=;(^0eAm$EAlmV2 zj6j`sI+#v|xCH3>{U=MZPH5;qfV@`{Ol$n=>BKLZhbL-c-}gLzKWQzWWnH2ooMF_E z66fCLEeZYxXN0HwrNvdY{A`#QuR5l&?lN2{^O(eyf6JVgF)%6&KC$9-DK%^?;Bi3i zR{|f7)R4`|+d0+!YewDkZWeQo2B~JU1QZsq*?ZA<-`=0ErzJL&Cx(B47OWgT6C|D& z6%*NSD|(`h^YU5KPs3+kL)OHN@}%_;1yn3zjo9f0QZmUK$ZA_+CiV?+r^&r)&WnO)b@nMaF3)sx@dwWX2(6XSwvWcA=O17| zDqI4*JRFXX8o84~4VfdVGM96$D3(1|rh*ip14?dd;j=D&)v7ELlxaa%p8`$EB^)CI zbGy<{YePSQqhZTv3S4+&d^PHw+cRj<110h>IRMz&NY?gw?pU%dVJ!!Sy^u2adoWo{ zI`iOM2Cf#EtF(e(WEd4c`I+XDy zd@?!hfb?E*Io*!m8Orz%l{E=UjQSVMr{mR$8d*xYrd}-jO}3sz%~*anC)DUb#~?y) zCp>L~Y-CaRwj?07T3I(nFI7SUXK`~;xqim3sba8sP3ekF?N!vd1V*_D1DIJ(E){boH8M$ZKWt0e@yU zJvGEBPRUIP1MP_l`$XN%3O7S))&J=$lc<=DPpI!p#+q?m<#vQRam~c$P(2^Al1^-NVE@9Q{FYL&6>A_5&h3p00s!v>y50&aU_>z_&h2s6qK zVf=E@U6B~9M=X5o$9|(Pv9ZjsXK1nzoF$j@rY8oAh5szlauPCi_XZ_q`be4;jKKw4 zkmSEWFu8!XFgZh0qNS#_SMOq5CnPzVfFN21q82Uj$d25XG@D4Tq|OJaZxCye9gzk< zsH|wihK-Sd1NW%+wf}8cfwymRL1AWfD3^fo7T5_Egb6N&g$a`Go<4&p63^H2iz5Xk z__W?s^CsnJ`nz>e{l zz@q$Y&-RCR_gV=lpDL zu_>}NSf~1tc!z9!(;;L=PN?VbR8sxumvd{&22$^N*ff5O&N)WOjP!gI{I&mdlCn@^ zms5S$gdXL#TYaVgD~*jH@4<&{Q$V;7I`O|N5nFlH;5cRgwD<0$gqfIkd!R-+kJw3I zb4`=pzmrfXAG?$3rP1bJ9oVCS&FmPDq+^5Chh%St0-#L@orI--08qOY>*%Iu364Xq zj7{3?TM%fJP<&REoWG8$&$>(TBtbE4i@_djCUJZ~5zCu6SvN1G}H7_a!L=48OjnS zXX$|!kxcasVB=FKj$bjEn8{^$lCrlS8e${|W*h!$JymslolZQ`Qzi4ZxJ_QWBDIe8 z37!n?vD~sBl4gXtrS!BMH@1;sH*XjefrI8=Cf6bn!Ure6DVSrZF4UzFv3z;`{cKSE zdPOOiFR|**cm4gWst*9=1h+o3x%0AyJQ6vhj}l1;FpLCdsNC07y7rMaw+C#Lc3rW$ z8H%)UTM#jR#`W^My|(*pGHiFGLp8M07pxYd)|JXLA@^{RX;`lJMw`UQN2i$E&s6Hn zJ3R*NFq+Ta1W`z?Hp-eKX*Sq6-Iijll$|WWS6Lew50%l$*#cIcuQiS^NB+&JtVl){ 
z4*Ctqi%b$v_4PWdtv;1D*1OAyGoZ5RBr%N2VKG7qrS`}n`c2~aBIqQF_e|Frv3q{` zV#urzs=pti8+~~(t>|}O-+Hl@zRT3_CN<1Zg|Ix~V>eTZk$)*NhmndO8tNZ)48vnj z=k0st$dO7fv{|*k`P~d+BzutTgqoZK*lI3#yyPR zG7$h+vj&JbU#mxo*MFZ*xn(cQ=S@+~m1=A6cA#k2f%Z7xy#O;wbxfIBfBW?UZ#DNs zv8JJ6m!CG%{`_pmm7vE)$X};3^}DCxll@QY(Jzt4BTQV2XgXMNAnn=Kh1x8$0b1&4zXZ#OT5Nfh$ciZ!-6V1CBasa%US&QFxt=%0ZG(3*ophQI#-&t^}sbq>==3 z?8vZutvCey-?UVi4-<87Jn_j?Old>jr;fDdCKKX+3X+n{;|lf3G$cT1j0Vf>Ah&9q zAB2@iA?j{$Qp{s1BbvbTdYgdiDd-LxbJyDmxLdpf?+{V73zm;{8_Ex^W;aAT(W#~+ zoXbYceL1iY+nb%KNl*+F?=(xcfHJ}|@7LzDFU5rLylEd88i`2wj&Gy1e%8Sb)kt_Q zM=S9{Cj^ZSiWrM53oALMrhdJ7+)IkY;GP!DjPP}EkZ0|$qqLrx)Bf6aw_ws3Q`+4C z8$aMr-HD*`wA#9}#+ZcW8aUd4>|YlG+^}<3KRmIuV)s`g&0(5A80whQEt@+zj=Xw;*ciEpI`&YT}Hd3UN;ql zOQHw#bA5dSCr6)!C*zKou+l<=<{i;cdPgH^V)ZFfznG45PA@62+bFP-P{QrFc$q(^ zoKNZMVq{DBuQFI}M7-IPZ%nsQuY%2}*!7KOj+w7-x;U?LG2h)!muhW9hy`5 zJ0^*8R$)!up-+eX|6KgyZX}18)BkyNJ@0daz2_ERo#HZe{7J1B-`;% zZ>#J2`P%R08Bl4nkMV3eW?VMgQ2tQ0OoOuu4HdUsYu-i8s35_3<&1Xa??sF6?ad3e z;B0d|QBDl;Z5A-oH=^N})iYgNopy?rk!v z_!4@}DF<1z>c*3cev+Ryi(L@vh9DLwag%z16l0X>Ue? 
z(-VK+(}0#XA+y!$-ZA{zQ;g1dyi|#no4D*LMMak)mU-zCy*~2y>Ms_oRNH$b+uw8Z zZrN-YMviX)N$2IU<&CS*4EOJpT&&ZSup@SZP^2#{wR+TREuGgqQr9N2A}d>eEE5V0 zyOFOua>h?x1?MPZabfgoKN&xQHHU!f)}YwR-mMPOpC<$b8pA!O%_SVq?2G4KKP-LI z`(}sKt+(oN{+Zr$$n$w%P2h8b8~eKUJ@lZR=e!i5>9tZjXieTn;bvBP#(>DvSoD8p zd#AG?PZur$@EnghroQSF*5Hcu_13)(j(e)yo|~fJwtIo5DBocouqJbkU~cWFhP+(5 z5ci^9JGP>lub#D;E0NB9po&^LAY5T9ZsAtGI@q+=t81#B7>gMG2Y_0R4QjNe$MoZ& zMJhH;z-HX1u#(XOsivMJ?j@El$?S8ZlvtNzYp>t=HV4N@av-J9l#a^-iU1_;FtPgX z>-M~G`ld95%jNMWY4#}4=Uv_F_c6*7?IQ0@urs37x~(*RKBOn@J}69lyuvd)7|<5) z{EGYn+tAxLfHN6S>=juQJ(KIRhcy&MLvGv6kIQ1W&YG0=eXRp3s}U@xoi*AIa6#wOZ-ca2m z)$P%+cKIw}^8Iss@uq~8+>E}LznM=j5DoLYc`*=Gq}X5t;~$-bi-r@k`s=(1oVJ;2 ze*M=C*C#|23}n0;72W4-Nup+^l)QrbN(KF7ebUYmlo9Rz=_!wUx5K!4^@IbY+&Wm_ zz~$|dz&V>b)b#8e_kC8OJN6ngT?)ApaY5Sce+VVbA;$BA=L7m~InBQ>JB%jIIYo-p?ogqt4Y?= z$)O9?Rc(aM1Qw*^Ts&@wUOLL-j>;^@OHN6ruffXk2Gk6kxupmE)F_YjQucBitmd>` zK{?ulg_+DM5HKnq$p#Bb^Fr&xglK16_AL~8lg#tTMl;eg*-ajt7lU3`rg>|O+X7|7 z2|7_hNWX82+>fWMp%RA0KcwVGG&L3ALd*G-JR_PiqTI`{CTjC{Pd`+Dh>6@ycsuYx zDi+*-UH9`Gq7n?#qS;}T^-Qt+cqehtbJd%l=k1+`v+YME_z!Tx%HL<^@S$mQYYd6I zbC`&B;HU1Vt!w1$$n|shtdO;`zG*(h(39D+q5g!0hmb6fVGoO2@Wi^q5Kclg)% ziQ-y(@6*n9r?gVQSk5TYqA~ZRV0x(~b@c(PMBHPHC>I|M#V`j z1cWAkc8aaL$a>?K4>!l^BlVrIj2*wkGTz1&qlWYSRWW* z&sxT1tC3G{{|E3&f7h04-O`qdR2gBWL`&WGSAu<~b?`U~-4q(p?AJ6>fchTqE_KSw zSAxU?nM4|8{SjnWCOy{1~gnyD#O zrIHd5kH^%h6iJH;J>3?^+Lno`p?Ajsfd-bH=LQ1*+mC3=QdLvIi?df5_oHo)sK_(S zBI0T+rWoIFtDK{4=*(#joOD&mY|cv{!1PZqskLcbvSOukTe~g@klOIJv)z<&y}7PQ z7(1y;GlkZZyF;oOd28x z4F`t;`=@hz6wT-Zd-CY}lP%0eyLqR1dr|XEiv}15K&1M)%8o{s(d^s4#Ck!tuC*EZri8A(pA!Qaf2r(wzfrM3uy$=vB=VLZi2oI;HLC3fg?;2DfGotb+W{j?;ri1Tt9iEamwjJ5+X+j z-$?Wb9T1bnaA09dZ24W`UP?u#n^BkKSyza8WmmnkM7{ z7mIpRs~w>+1A~^dC46#rh6W*8Zhn9TDL2kZ)^zAzmTK^7pKz57(W@8i^CTpTG4-TiX(Q@iH>jREkbO38cy`ayBKH_0b>~hr5WvXL*6pqnNq9V z=)*s@;Jd#85-@54-bZSz#hgzPcha4$(>Bg9qqv73q?%C4Lht@CXgJP8#um0X$cF3c zEaK_L$G=HW16AIG-iDe6$bKl8Cik!o{0CqbFfMy&aYEIx^Zd-=uI9E{lT($1YV!1w zceiUmSCvRng>@>h3-WDO-x1=osTK1u#W`hKb7jUHTiX3ZTYIh|Pj+&gg9XIJh*xqc 
zN$ZE!Gv?t>Zrqi20;zw7_{rNqfh_1 zM$aRAm+=X&p?<$*y|#Abc}7J->rqIjW<*)l@}u3v$8Ib$LJ%CmqB)g_-`okKkdTpFutVPK8L_ z9_m#U{KN37Z9Z!{u~;y~4y1*o*_Z z`EJ2~3RvC|Tp>jkn`U62DGV7mxqVG!vvF>VDODNy{SuNa05lFm)=Jitrt7&WM z9lD+r@HB=f&KaEsEh3{+TDE9Mz0r1Re_SNAE6O>HvNHD%lRW2&72O8R7M+`H34d zJqKb#;G^@h^x_}<0B7X0k$L9lG}GBYUwc09$^CrB$~G9h0HFgwzwKM>5pW6k0`}A9 zUY1k*{?}ttU7{8U6$OD(W6lW-66x)IyUSmMsl#x`Lf8KRNL`un15%l`RS^5z>9!^R z0I%q69WU(#o7jWWyOe7ghbChW1R-Ntw3F>Q`SuXikq@hmbbYLv1Qo@+A~$wa)V)Wo z8|#@lXHyP*?7v+e`ghulcRk7Z1i$-{n4Np4ECMx)4rw(lYDOO>jMAl=VmT&pFBDxg zwTN;X5Y4CQp%Sxw;^fTR`Bqb5x9__O0QkM3a3w{d05& zc7awh)8?%M)^b@qB063;smdv-c1qVzN^bhofw9G9yO3zI)_RVcM`zf@cU1S0)CYNS zSNnViUhj&9P9#YKr{O2)O_`>Fp@e`CZC9#-oh0PR8cDqy<=UUI#9K7Dj?VPXNgYqlkQtoE`k`nJOK`q^zlv|tT7hH|H<0B#c=|X*U zu1a+Bz?!4dRj*1QWs3^)JC9;p3W-{YzLj$}Ahl4dxZ{g2?^HBd4c|lt(EI~5(WJCC zMoB5PdJXnk&6pzA){?hR!$h}qrCZJ8BlGKvTbz6w((c$cG3%sWUjZ_D^5J}M#Vq(; zjfX~4iu96i>JI(_Cj`V7dza>c&g-)gp|CHAdv|JT(;U2?HRf441~GQ}B|ZlRg)KL5 z55wR4H)gsXV99DE<^zQe=Fwl0JS2t5!}RRF(rO>wLV%O=?<@It`?;I!VNG#4n3BVK zCV8Hc@N$$CQVEWm@~4#OVSI}j)sIu`#+6!k(IZB`!JVbGm=bH7!S&7zoI~$Yg%|TV zV=t`4&Y?xgMJW33%~0Vc-|)?{-!VOD^6!{Fgjcc$iEU>}(S3nsv$#l?=PK(HBGGaj zQ9WEe>s0M9Ns+{qOGK1uj~DE zklxGpYpXp~y?$9Vx7}=edvFg$|NGZtSVu#>aP?{gv^ri@)FpQ%C&O7fLdOvl(TkRX z)!SWfBA%rWHLlUXpP0Lb^DXc6_beG}$8rBo)ujjFh0l7>tEDxY0SQ_&gbjrV$5}+g z4X=WdA{=JYO7Ky&m#OHUMApkXt(sqf%haT8>k}Xy=$OFw?*Ro2K`Tj3Ic`ClEL%z2 z7S<@aZ>97-WQy&`(4DGMxKctN{j8TqJ=^?Vf!*hCprHf4A^v(XGJZe%7Wd5Eii9$b z_?DDL-cDi;BBk?<9G;h|%8P9Dt`a;F<9AgB)rLTBzLx`=n)(mSR{TTX(y3ZXh+vfE zdB)crXhsA5%m7Q^Vzsv7KJ0k3kwbi2G%~cL+MsEh<=O38v za}+sId4^$RJ#-|c&w74vcJ;S|-29bfo(LzlR!>Gg*Gs|!@SZhNNY>`=@M-3N<}G^8 zG_^@w^_pET3^pXPy@I)qXR`i!fD_eI(9+`_)-gQtPRq5jPzPmRSCL9;xMn%%#?QBA zhJ%AnRUa~?x{7$rB3yh75@t~6%ZEUGS}08za`nI))^bg0A4&Qn`Q}?^RvKY}O^%PD z5D%``4!*Tio6|Qw@2(u#1I9c*BEY|Z))I+7F6j+7oHgTl*EeJOZl%Y2L$1Wo)0MkA z%rKrW$y_y!SITp@HJSn;;_0J83CyiDqY8fTwqm5ya49X+17tp5|C+{%vuBwODgg!W zC!XzgWWHn-$`T)MG8Z`Liq-8X?NPETUG=-Rz5n_TV4n|7PqubKYWET^1kp&HN}YrR 
zbm_?Zs0Vzh$=waV^?5e^%Mz0F9Q{GB&mWck7@6C>FT0Re$En`kj$aF}pnp1O{w#1! zT<_&Z`80T3&VUJnc2sHgZ_5Mz_8k{$``YCv!{~!lmLcLWIXC!R@XWve>4&o)s*q%G z!!Wt1Y}M_7Dfz1au``xftnkn@Breayki1w6MFJq zcFPA10h64q@LP7#%!w9E3uHo zb%BBWlXQ4^w-#dOM&F3ru^=bkkE|Q|Fp=Vaw}*=*8RWWCWvmQBecY*5tb&u+g6T|! z>$C|eE_XMG$j!g?$)LOj*QCZ$Nne(lw#-45XpUdE<+W2k;R`i-;#Ldvg11}L>O?R$ zjfc!?%(l+qlj{gdFaKt2A5JMJE}fk?)+sTRx#g-XvhhFID@fLB(RDi=)jnIDJm^S{ zzC4|lV9Pb|#P8P6%xu>D2Y6PPE~c%kExE`eeb;`oO^liuV8-<)j3764MR3-8G?)#* z?B}yZT04`qb1`{E%LKd2NpKDrND=1_O45KPWwK?0EqjRt9O=0)cJD@!F4hSCD6iWg-0Xi2YK z9vna$F3K}*A5?Q3Wu}Fm>5y_($Hx?Pmg(oW?Ry%)HpqtS&TO1Z4!xL(er|E}8mFyj;Sp?x&A7gV1 z$_1_^WF0?zofo(kHY_Y&>Y1S2JkCh;!TZe9B(g_dN6_Rg)e7?FtP;^Z5<$E^q=f5# zM`%JbM8joC5fi!gqN&cKyMcflBt}K6%FYN-5`>V|oIFkm>yct56U0nXtAOI*q+YI^ zmeIUXyRd~htreZXq(j=Q2m5#9PdN_XMuOju%`j*-FzL6tpV4qFoiG1toSQN=HLngC z z$h29B6;?D1S;09MPi65y&z~s6B#mzQhO7~9N+Yc>P-Pe>ANZuD`QRaNLBg6!rTlyS zH6BY-Xo8@P@!N(A))99*i>IDy`?TMG&%|0IjI2>yG{bzg=E>IS)DP6ooNeZOUQbt{ zz5By6{N7(aGPBwJcpVp@6w;dHWCGhV-%qyme+>G`Nyvrj2`tgSVdBjUfivmkko(ZQ9O-Rh`6jTB%2ID8K+ zyC5(2$c!njUH`S8`(29{zuM?k-kV_Y;aq-f{z7-K`Ib{xJd|ytw3M0rA7EG7EB0EE z7%6m$8oe6Wl3etkd5WrjvsnH$csZcAW%Nz7h(=}HW8lzUeS_dxGta%>a=%OIim?%5 za1$JTU4^ja=pEeH7$8kKtv_oMh+fT3Tb|keI(YT!g@wjD{80)eiie-&-?qun7>|a< z8k2n<;#0eouU%FJwlxXBx6d`q{0n3>kwfotFnP{l0%DUc2H@Z3%0c)>^g;VxAw`IBEDEhoNL!M+070KF zn~Ra{KC6Ai^7h&oT(JqEvWwHJbb3^#-BtGU+{3@v($glR_3rZ-x)#q+kgmYdC25F= zY(%P1{ue=GA=~o7*r3-c2+j+R2cq7WlB`DzO$AE`7#xoix0TxkV3BR9ldcLr^IT~a zVC;TnnyFAFX}fXf*#u`^pB$rKLo1RWSy9gifND+yTqJHi&huH#6V5m_JnF506(4F! zOmYgi4>WG@vmiD-+Rvv&`5oCv)k@^>@#ZWRc@~D%&dkL0{O5DjHtc?U`m{YHsQ#m~ zFN^F0ykm$#UuvePI4H#bM6Fc3W~APZ61JeHZFOpv>|1zafA01$9YF%YxD$QMoLBPd zPkWGGl3=C+mk*J-_CJ-lccoyYb_rtF>+Lcd#@L#J_B;;{a!MrUNRP;~il#XmyL(it z;`{V7l^qaYm?qviYd$2x&i%UbhV{Neeqt$;+YyH+SH_Ot=#~C^NI&f6G!y~1Mm$H9 zCdm82BFI3M!FeHe7H;i?ZeC&_@dSkFSi-Au$`NuKIzdTKr*W>m^kim)&OZPVM56_? 
zA|F=c!a5iM9SHWT;$H`OB(~dRSRkV&KBB+*T{*#p6OKG&#ZB8}d> zN%Gx~?IZkr)MHoQ=h;X<7Hi+U(U(-I4i0sNoPX*%@LDy9=*dG>yHzc8W?Fx_mi=qx za$O=hAI>sEDO+D86)0t7HmsXG84OE%)s>oe@2}4~3e(Ntq)K#;Rt2Z~`aOJKJ$7HJH6N-fj%5B7gWzW`?4mZF8W*6d;|vNKj(GyDW>jaT z_#~H&9e$<1A3-gCu;r8eK*Y|WiWWMa3kmEVQN9j=@U`72c14dWlFuH1k_)*yMh3O^ z?jKz?qhAcPYk|?zemy>m$|p*+0RoP5m(stda+lj#P2X!;CQR#s+@~q4aJAt@$kec1 zVw%_Uf*+9_r`LaT3mX)mXzz9W!U+IAor&bL%vUzhr0D*@QyfgWk(+{uA?}6-h-pMn zC9vwn&Age_OZwMM$_;mtnzL%@Y?8xQ_K8GkU#9!T(!ZBwBK|n;dSx zA>&?d-K8~)52F*_CG%twc{%g~pWC|wyP(?+{()N*sEP-|e)_4lv~3qCHgWGn zo(9aG4ntcQlPqIYn_|hseQPtRJumVCX`0P}F2U7Dc~Ub0*R?BkXH;kP@JwS;Ips1N zb%BFOX(b_R=4gqCB&H|x7ykEbW)DBUE(+N$ZW|*;Ml%FzTiQAj)vaun4MW8-I>~zf)aAKj{@XA#6<%Dn#3?30U8|^w?E0D3AcVH-L)>Io?%GI^c zqXt|DSz5|9Bwhc4E;6&9l(zJrVSI#06bQxllhu9fb*q$3EB*v-_EBtfz5LV9c8Wg?Cdzm}Mr+FOgA4HV zN3dPi6j}hm{4SjRG5J4AUI+;q`dgjQs?o5l-`bky(l_Zt@X^FRn{}N}V-DNcw8AJL zi%aCY{tA?Ib;e{iocf4~VeWy(!fLxhck8(8&T>W7&kZ~>-jc*sL=`8IqIj{z-*Eno zQ*!w?Wpm02Tlv{lJg>nm=URdPeJRw{;xb3A-Z2XlW@aYWC@HVQ=qc$p0ArJF(bG1Q zo-DuzUI{w!NZ|4PDk={>o*BPb&ylx71i=J;lYtfO5gzAxe%0&aKK`ffQ>%HCIr}0z zk&2UXUc!!|6{8M;O8m!)(#WJOVyOq}+(4zt}2UkH&VCyu6Udh3>>Z!SFuN*TBa-1x;d6%a` zlm@J0e!9d1m!N1=EJ5(vc6_-T_iNiD2a~~9U&vB4ZGvgs?AWB`gFjd-HJIy(41-|% z(Bh(;EImy~lmG6N^Eha0Zd@@mygh!0gauAS=~ma)#yNF0X{KMP$Coo-JfD--9d57* zD=PajdsvE14>v;%Rk?)={k)LZ`Ly0`Dt01SALJ%6{9c1>FUh)-{Kq$HuDx$^kv(*)4QcaQN-o;L_D|7X$H(u5Vwcl4^PQ&&_}7 zB=#e`+Q$@hc3Neps?YqyL+VEh-P`+!k;KKr1rWi}r%Nu>#Rm;5QkhT2cx|Nzn^V5- zrZXb0DGI6_te!@t&gqnHdT{A!lNANC@uBoFir^5!+_83J=BK1rS!ai#my17Zi`W^K=U_47}sgxhCN z!ae+n+eo?f;HD8g?1*`~mC=2AEuGbWfNNc3NhhLO6QNZd+o<<&J^||7=agpYcw(|< zQs@^`^gQFaZ!*3ET#>O30qO(MVhnV12l6-$crsPT%ECl`acC2Vu8VHCrN4Oim= zg&#{F3-(H3(cUmYK~v;#-$K$S4#D!4I8{&{+t8ZB?@$-D=Y$4`@2*T?e&XlY#_ z8B)!jle2wj(2^vbV))wVZOwvDQUo(^HJ#o<#8V*B;;TyUHufR$ETdp;+*Mwnb@I?QsOAlH*LmHnA;YdAJx; z)&FOPcq-^+_q^s=@7GS}SL;!Is5bUxlDYTe?SUX1>Dzf&{25s_Ip_)(PoBEz~I*YFYA}luV zYGF!vE)k??g6nV=WKQOZ(b~t9UwN*BF80o>1v)LU88O2GBVXUgVs(8sdTTKL1HyJ+ 
z?{CdC376A}6Fv@EU?%(%D(B{vK!1^3ox43;@NT&MT6g5yFzqE}yWew=-YizOxbE7T zmJ8->b!35?P_o&1dj*6tesC_q>g;e1ibXA3yJ^AS>>kTtFZh1~tQ}M0phn(WhIpiw z=uIIkktgB{9i8TRoG4;>Xew{_u!@hgZACPiZs~H$D@e+&wwlsR{Nq34d=^WW6<^RNSLERc*?PsNkvcN)hIiO^7=V6kjEr^i-9zK|!|#P35^l9QbnQ=FwuVV9bv+XD6o>mxR=b8N6`j9&u@_lX zx}zU40)!~2%_HNX(Ca%YElAnagQ-rW*RoAXbH*I5ZhF(dEV(%(mah6AW5fIvxBRtlo2R&EWp^iG#0O7z;iU%@YN-O4RmU!b1?d~fjE!j?ngkB|IsVQ#ZQCXkTJ zbw9}Q%hzM8vt&j(^;%#2S;^O6`wza#}f?D1v0bWLA zlzB2mh{8gu%d+B381`a9y!^+&d=5JuIpgcsja@FGLcc6ijAWvlps!^sUdcOMU26X6 z_zU7rF)Fq3I9bW{g-NK|?#W#{rTo{|s!r*8$BaMVpt|;z<2bd?gZ}`s{+kWmrQN;t zo~5Y#N{0RvWJf4wv=%muaU(*^i8_#>!DV*ZJy+n*{1ijs9L#kO+AH9#{+r@?w1>6S zHJc9+8;d!}R)wO8P0ybU4IGDRKbtIjwi#RjC2^hv`1Rn)ZKUw$guWl?nhu*~Yz~_z ziF{Cz`T3PqVf&{8tFvZC&NnbFj(pA^+BVC;am}q;$784H+I6X(=1co#zPYfoNO!%u zGQ4HsOc@q5!)^gjJG1>WQ^C&;@zmB1Sv9J|X-^kjIa#?lRC22|C8T8F;^O&Ie91fi z05sd)eQ-aGJ5gEc=3TWI=> zliR_r>NdBQI->|>w}dncBySOBcpnXL6&nt*yQyRMId_2^F`MMn=|cxbtCGJwWU4}% zrllyw&y$x;6`XB;PNfv{o=*ENL?n5A1+3$FCPB@ z!D;^h;G%vZ@phTuKL=cVIq=%~vHt*S$)f4QQ@XfO87(juu@w;h^%>eU$l5mGWFIT> zhwUe-+xYf>i9Rm>*LBH>G^p-Sx}*@HwwmZamNt~Kh1@d6 zuAdp5_$Qj@^H9xsDA%H;?BP0CU=omC=tB2kxs{TF*YO9_&NJ{{><8To2Od%>&369 zYWB8mulSou)nJ;(S!<|C*L3Iw;T2F2GcmDtd6aBCLKH5X1d?Njo+ralXcF;6l zDDjq|bE)Y;WVnUmxRNA>&&_!vca0S;6`n$jO2-qU_>=zt!A1W7X+IPAaePhS57~?C zItI0-EsmYyFB@vJ+30gYWLhN_)4XOWC1o;67)gNz9k6#9Pn-C|_QUw=;y(&Uqx?ku ztS5UYra^I{d?oO!*y->~=R9Q%)%<>V^5l+a_)W^P<%PZ#j>Bj0M zbXgK%7RcB*+ifGe7-1*l%_qZO@J>As##jFU6@CtAe-u1T@lV8>mHpna@fXA15Q9gu z(hbYC!DgRbwzt!4%%PeUmfqHBniRNDhDI*3J z2aWHqwGC%b^E~C6=2Uxt7ID1(@scNPk+78ftsaK2`#Stx@nzPVZoVwh{84cQv;C`7 z)h=%BgoLuNSk;#0z&UNh04XX!+aLqM;ry$HULsjuSn0~7qLiIFbrY0%oV}e%Io0Nb zJ=nO)e*3(o9%kadJ@F5Zcy6ssE*p!pNmozYRI1gy8jY@}%NWOz#kkI$M&l_vJyO4t z_*3?b)I0#bDA9Bu*~7vYo*~t=D>?je@e{^qd_k;F;$juABle%O-|c(jkBT1}bPt7pwI+?M{9}((wbL~H1HwAB)w~`R({0vCqn6_1MYpqS zc|556$(f}Rulg#?Ssp0f82nHBE&kFT3y%K)Tf5M{8Qy6-jQ3h~wzo7_(xSo=M`^BY zqKFlV0~TdmM-Tv#s?>hTf3mm6PlA^AUlD!`Xg(*>Tg28WdE%cF_|wBS8nwJGF3FC2 
zJH-XuOB=?G3xbz&1329jkeGhciSQY94jRPe*odlCxg_E5%AAu+nobi^a&S@ieUv@P z!qn7K)+-|7iW$x-zF~sQu=Bx1Nwrq2sz#MbE@{=PCaTnpMBJwt#uJrCCr+O{Cl=kFid)&sN7go4eV@1d-=0+r!Fx+@&?GfN#3H&AT92R~Iy73mZpzB&Mp4Rub7Lr{{FuNo?QrSln zc|jH+>Ohr?ZrCG|Hy=TGkHP*T@J_!Do37{=))FL)P`hmeCg4b5NM#@}3;|Zj>s&^+ z;~hWY_PKGZUA>gP4-&lE@@m>Fx}4f%L*X({Dc(;tL znA5|_P7;fgg$T)V)m0;^hdt@auDtN2II30TloM4dMZf7UqwkKqOTihHETcK9>S1v3 zojBp=E=J>cJ5iLHRO`~@Zj>CU!m6EUb9--Kr}HL(@$bRczY}l#521WCiVbg0(&Bw= zOX0h#lM%uNOoevI1U^Uu1qNAx3Rk55-rur!jQ#<5n%l*H4m?+=>b?rrZLJ@|ekbr1 z%(s`Cy_Bgeu}O1p07&H80Uv0Ao->qo3HGR?Lch?D4)Hz)aUTfbvkYet;p0|o-h`GG zgrM~IB}%0yC`o;_w>8>HXn$34Cyp3QK6P3YD`EYbljZi6D>?Gr+W!E)i`zupV&xR2 zZ+h>I_)+^rnf@0~;va-Q3A6Eag~p!4Q_wDKtwiy`<(5FDlq=Wi=fGe1 zCkKJ_7_Gc(cLuMfPjxP%rg(?LTFjPrY?cxv@CFwa>Vh_PXk^_4tjf8IEtV_jy7%m{ z`#tKmmb!PvUjfCfUKnT8V70Q-A-BDj8(7iA3Z5YF zb`KP=6tVNf^2e4@!_cKBKyv7MqSfh^2 zI_{9j`-1`*jGH7;y;Gm2e`o&y_$7bD>)(hUVEBQcc!x=O?yUs6{{W0`WYacUvQBe7 zq`q9bgoJs48Zs6%js{U;RtmoX43pat@!HwuCHaW zHn&caL2aVm{hkCbGVHrLj7+$Z%v+WrFUn`0G+=T%(8XZ#t|RQ;#VzaJ4c=}rlg&t8FM;rrz(?C{#tWaYe%GC!TN9f z5*OfY()fSkSM4YJL)gutY5p^`(V);RHEWxDZxOq~UM(^^yQhGpw-TTXu|^%yW|3r8 z$*)?!_!;{ZctgYfG4UtCpMyURJVoPlx*u%#Rbk>7E}GJNM(7va!0mv1OcUKV+~-p*Jq; zZX>vp&bWp6;FMv5i0xkxnQ46YJXon=V@Mzci)o8WC* z;yQc**KQ&BS09Hw4ECC=+I{~3h~wRNsa(0XHsWN|uBMVSoI@qT8@mB+WDeVlvX(!k z{{XkQiuCOp;17y4F9!IcW8zq}EBUQ#bothOBwBRnd(B&#-qJBnDK+fid2R2+pNZcce0gQ@AH^EIwavtO zWvIW`FKr~$ZzkDprZ^G2!*eDKMLwRIutYRMqWDDwHE9I8>FX)t#pYD7Sbf zoRg97{t^EGf|dAF!ygeR@Q24Q3*AYo>6cJj{ii|H65<<~og&=l?P3!G0>~H4a;i|~ zKowp67ycXf_Bz!pAdgq^4zHwY7k3ca*x9R#<8vtygp59)CCq1YDy9Q@nMFH!UEd^~ zOX2Uu9}GdDd_POi9cupo**CW{X+ugsZPRX}xt*5Ha3)vX7uci>0gg@PEsx~Pi|v22 zXTZHL_B_>fy<1iIt$U~Vo5A8skBdJR^vhUoZ9EraCB7oiG*G^DH#*JN+Tw=d@E;=2 z%~+yo9hIf?Wm6`~FfRuvHk*>=_G*rqYC(ykkx(NhyCf<{kj}f174A zg-k7K7>G(2RI0f~ZQ4`il|@oAQReSyzlKeF%jyf+yg%SS9^8Cm_^Av7=wfi_3S0(V#Y$I+I5#NU+Y)$zqIe?jU%U*`LaO0?Ds@Nd2llA^cPD_ltFC zd;vGwbj>o_`u^=BxE^#DFgmP}qcVnR=VqV%Dc1Vld81P{`8>bC{{Xiq!S5INAIAC= 
zeiiU!9vQZV{{Tkt2Zi)RwzWDgkhcbT1*%O1u}O1sB%Pk-NFGBp<d8OZEZOF)07;!rzIIXS};>maf7^MdauGC+0A|^>DrHr=J8ZkFxy|f zx*04C&%0u0BXR%&l-e?Jmf=Vl_d%j~E8zyPJ|+0SS2p*zT6$Su+<74rP?(fsFo6L( z+ivfdfC(o(Ys3@8-ZTA>JQuEh&|0!Zs`w|v`sJ;@rDCy8_Y>KFYTH|0U&jn*Y)uRX zX#+Om1Wp@hA=A450D^{o*S-k&dGN~n#lH{qJFgMm--u<=G<#hVYm;<~ZRbY&L^1`5 z*%mP?3@Ls{WGX+Kvq@&WXN!hoh_BA7R#ah5D#|eB=c}uVvZwpLX46-9mdU@zI7cVR zcvFQ|E~XRM;^Lgts$Hs;BXv$tk2+1aE>&(_B(BtDbJ+eL_)6R3YG_^{*7QSVYO+ax z5SW=7Cjg6Sw<`oEX$|G!h%y2WQzyxX!e4?OFYwQeJT;-Vg>QSJYL|@-3rC+xT*Y?5)a_o944`o!;prtBg2@FNR%55st$9SUR$! zTiQWJqLPA5xVLD{q^d?Kt)cHh+{^KY!XMfT_Bg+>@sGma5BP%r0K?k;o`%;<@aKnT z)@Jba%#lL(cHd(uaN-H>W18h|AOZm+N{q3{!b_sMANVNm?M2{U3H%=TXYoVC-X@1f z&~T*qUzkXMor73w_L7iiA?Amm4syOFL)ZOs)r%IHsP7j$v6kJrCZ07WGmE|RN z&RgLx?GxgE+sjhbX8oHyHwBK9;Y~+D)g;zFA!}E0Nu}LOBW_4E?KOeaEzD6fc{ex6 z1kxSGL{(Ak9}52fXYUOBXYk3^{8#Z4#M=Fp-lufe9vrx6v|SmlE~hfxM%G$x&$e1g zB&RTH65f_w%IyAR#!`y@uFf%4sNpctr)b65@}!h+?BOe@rOKZ)o4QHgTj;I&ZxiD& zxLP!0POPsgR3_x*Cq8*f(rRtF#{HC%)!AzA-1ry5-VpeIYj>jf3&MJKi>2vzm$q@} z`c<5E_G==%(MrhzC`3~!X$HWoSO#Ku6|ROY84bMfL91LC;(&diJbX-sX%&>IW&{u$ z)0fY0n4-T8@sBaZ<@_y)jR_>B3p=RGPI^)MN$nZQCbd`7Wp7Wz%(DlZab-zj)THUU ze6OBq>q?{5sV6UjKNa>otKzTi8E5dDL-8Mnzu_Nvwaq`nKiYbEv{h?hOI;)`&OEsT zAF+mX0}Jx1vE{3g{hq!*{6+Yo@YlhA4c}dzJVPgdv>7~4;>)?@n&$p#E!@vAW`Dl& zLn%2KA-az-z$mZF^2`PoE6Z^htTb&=C+{}o^@6&5&D}nET|1}D^gn3g%2c6U;Nujj zDzIFuPOyD7)1$rDOCJ~i0KrARA$(%^9k2ME_1_s>YZl%Vz7~3XQtFoyJep{8aE4#czk2U+mZ7e}?vC=z5-|2Zy!vv9~f? 
zBylvYZRG6o{HW%6CXJj(OG}Nhs<)U?Uu}_aR}8}|!DbPmYReB!w>Lch0E+#pbGJ2} z(n`&8`D)Hz?JQn5G_QlM)Tu)h>8QBI$_@~Vj@Nfhrrq1VtopC^d;NkmUw}8B7x)$M ziurWk9Q;vB`93Ik<@E=aTPs@$beUz+w7DdSrShhmQH>4MjE{pn&kL2df**jtX>Zz( z;r{@~oepmbcxOn~^uGw`miDjVeFw^lHA{)Cq?SADkkU%CToieioQM)lkrs0`%;gpN zo=;k}T=vtGe6od`ZMDscuTAT{xgivl?5%ZX+keL172`2D{wBk`RK25kYOT2?Cf~8F z?{=H!x=#1K;+^i5xZl}h#(J;9O+Uq7v=76OJTc>M2(`YgqG~!saNJqLYjW~fHJG-# ziSv`GtND^Vm=@u7F(KFV-^QP`cf=owKk!Pg*?V4|;q;4rL*pKi;m;7grR=lZ-^XM2 z>v<(;kQtIOH8)%g^nq@#N& zU8J0(-_d=~w!1g>?Du7Tjhlg}8fy2+7Kw(6A$&szwPt3M=L~iiI4<6N`o- zH(ttKSG2iR+Ou|AEB9}9&%t=tF~(QRDpQ=j+1120tdjRJ5mDJ_l9aSoS8cspXOs9d z$Nm!isJ;(szY0Dmd^6Ft3oi%!LcNB=?N%u|+nD^sRgGu!Kvs@Kc44%cT0@KxMHE-# z`MoujX0`CxM(NX1c8p~tmF(S>&xy^m9DFgZlxoRF2}LB@vRf}ZZLYSm*(3kiwQ^yN literal 0 HcmV?d00001 From 13b315fe017371db8a690e82d69a3af1706eb4e1 Mon Sep 17 00:00:00 2001 From: Srihari-mcw Date: Fri, 27 Sep 2024 13:04:57 +0530 Subject: [PATCH 26/38] Rearrange XOR GPU function header --- include/rppt_tensor_logical_operations.h | 44 ++++++++++++------------ 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/include/rppt_tensor_logical_operations.h b/include/rppt_tensor_logical_operations.h index fec0c3d53..b998f90a4 100644 --- a/include/rppt_tensor_logical_operations.h +++ b/include/rppt_tensor_logical_operations.h @@ -107,58 +107,58 @@ RppStatus rppt_bitwise_and_gpu(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPtr s */ RppStatus rppt_exclusive_or_host(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle); -/*! \brief Bitwise OR computation on HOST backend for a NCHW/NHWC layout tensor - * \details This function computes bitwise OR of corresponding pixels for a batch of RGB(3 channel) / greyscale(1 channel) images with an NHWC/NCHW tensor layout.
+#ifdef GPU_SUPPORT +/*! \brief Exclusive OR computation on HIP backend for a NCHW/NHWC layout tensor + * \details This function computes exclusive OR of corresponding pixels for a batch of RGB(3 channel) / greyscale(1 channel) images with an NHWC/NCHW tensor layout.
* srcPtr depth ranges - Rpp8u (0 to 255), Rpp16f (0 to 1), Rpp32f (0 to 1), Rpp8s (-128 to 127). * dstPtr depth ranges - Will be same depth as srcPtr. * \image html img150x150.png Sample Input1 * \image html img150x150_2.png Sample Input2 - * \image html logical_operations_bitwise_or_img150x150.png Sample Output - * \param [in] srcPtr1 source1 tensor in HOST memory - * \param [in] srcPtr2 source2 tensor in HOST memory + * \image html logical_operations_exclusive_or_img150x150.png Sample Output + * \param [in] srcPtr1 source1 tensor in HIP memory + * \param [in] srcPtr2 source2 tensor in HIP memory * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) - * \param [out] dstPtr destination tensor in HOST memory + * \param [out] dstPtr destination tensor in HIP memory * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) - * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) - * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() + * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() * \return A \ref RppStatus enumeration. * \retval RPP_SUCCESS Successful completion. * \retval RPP_ERROR* Unsuccessful completion. 
*/ -RppStatus rppt_bitwise_or_host(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle); +RppStatus rppt_exclusive_or_gpu(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle); +#endif // GPU_SUPPORT -#ifdef GPU_SUPPORT -/*! \brief Bitwise OR computation on HIP backend for a NCHW/NHWC layout tensor +/*! \brief Bitwise OR computation on HOST backend for a NCHW/NHWC layout tensor * \details This function computes bitwise OR of corresponding pixels for a batch of RGB(3 channel) / greyscale(1 channel) images with an NHWC/NCHW tensor layout.
* srcPtr depth ranges - Rpp8u (0 to 255), Rpp16f (0 to 1), Rpp32f (0 to 1), Rpp8s (-128 to 127). * dstPtr depth ranges - Will be same depth as srcPtr. * \image html img150x150.png Sample Input1 * \image html img150x150_2.png Sample Input2 * \image html logical_operations_bitwise_or_img150x150.png Sample Output - * \param [in] srcPtr1 source1 tensor in HIP memory - * \param [in] srcPtr2 source2 tensor in HIP memory + * \param [in] srcPtr1 source1 tensor in HOST memory + * \param [in] srcPtr2 source2 tensor in HOST memory * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) - * \param [out] dstPtr destination tensor in HIP memory + * \param [out] dstPtr destination tensor in HOST memory * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) - * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) - * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() + * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() * \return A \ref RppStatus enumeration. * \retval RPP_SUCCESS Successful completion. * \retval RPP_ERROR* Unsuccessful completion. 
*/ -RppStatus rppt_bitwise_or_gpu(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle); -#endif // GPU_SUPPORT +RppStatus rppt_bitwise_or_host(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle); #ifdef GPU_SUPPORT -/*! \brief Exclusive OR computation on HIP backend for a NCHW/NHWC layout tensor - * \details This function computes exclusive OR of corresponding pixels for a batch of RGB(3 channel) / greyscale(1 channel) images with an NHWC/NCHW tensor layout.
+/*! \brief Bitwise OR computation on HIP backend for a NCHW/NHWC layout tensor + * \details This function computes bitwise OR of corresponding pixels for a batch of RGB(3 channel) / greyscale(1 channel) images with an NHWC/NCHW tensor layout.
* srcPtr depth ranges - Rpp8u (0 to 255), Rpp16f (0 to 1), Rpp32f (0 to 1), Rpp8s (-128 to 127). * dstPtr depth ranges - Will be same depth as srcPtr. * \image html img150x150.png Sample Input1 * \image html img150x150_2.png Sample Input2 - * \image html logical_operations_exclusive_or_img150x150.png Sample Output + * \image html logical_operations_bitwise_or_img150x150.png Sample Output * \param [in] srcPtr1 source1 tensor in HIP memory * \param [in] srcPtr2 source2 tensor in HIP memory * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) @@ -171,7 +171,7 @@ RppStatus rppt_bitwise_or_gpu(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPtr sr * \retval RPP_SUCCESS Successful completion. * \retval RPP_ERROR* Unsuccessful completion. */ -RppStatus rppt_exclusive_or_gpu(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle); +RppStatus rppt_bitwise_or_gpu(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle); #endif // GPU_SUPPORT /*! 
@} */ From b348dc622d65cfb47bf0c5f49b7b3553f0716ce6 Mon Sep 17 00:00:00 2001 From: Srihari-mcw Date: Fri, 27 Sep 2024 13:10:21 +0530 Subject: [PATCH 27/38] Add empty line --- include/rppt_tensor_logical_operations.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/rppt_tensor_logical_operations.h b/include/rppt_tensor_logical_operations.h index b998f90a4..730eb4fa0 100644 --- a/include/rppt_tensor_logical_operations.h +++ b/include/rppt_tensor_logical_operations.h @@ -173,6 +173,7 @@ RppStatus rppt_bitwise_or_host(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPtr s */ RppStatus rppt_bitwise_or_gpu(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle); #endif // GPU_SUPPORT + /*! @} */ From 0e183655f5fbfafe97c92c5bfc238a71bc77030d Mon Sep 17 00:00:00 2001 From: Srihari-mcw Date: Mon, 30 Sep 2024 12:36:57 +0530 Subject: [PATCH 28/38] Update aligned length --- src/modules/cpu/kernel/exclusive_or.hpp | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/src/modules/cpu/kernel/exclusive_or.hpp b/src/modules/cpu/kernel/exclusive_or.hpp index 1cdd826cd..215731eb7 100644 --- a/src/modules/cpu/kernel/exclusive_or.hpp +++ b/src/modules/cpu/kernel/exclusive_or.hpp @@ -200,6 +200,10 @@ RppStatus exclusive_or_u8_u8_host_tensor(Rpp8u *srcPtr1, // Exclusive OR without fused output-layout toggle (NCHW -> NCHW for 3 channel) else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NCHW)) { +#if __AVX2__ + alignedLength = bufferLength & ~31; +#endif + Rpp8u *srcPtr1RowR, *srcPtr1RowG, *srcPtr1RowB, *srcPtr2RowR, *srcPtr2RowG, *srcPtr2RowB, *dstPtrRowR, *dstPtrRowG, *dstPtrRowB; srcPtr1RowR = srcPtr1Channel; srcPtr1RowG = srcPtr1RowR + srcDescPtr->strides.cStride; @@ -508,6 +512,10 @@ RppStatus exclusive_or_f32_f32_host_tensor(Rpp32f *srcPtr1, // Exclusive OR without fused output-layout 
toggle (NCHW -> NCHW for 3 channel) else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NCHW)) { +#if __AVX2__ + alignedLength = bufferLength & ~7; +#endif + Rpp32f *srcPtr1RowR, *srcPtr1RowG, *srcPtr1RowB, *srcPtr2RowR, *srcPtr2RowG, *srcPtr2RowB, *dstPtrRowR, *dstPtrRowG, *dstPtrRowB; srcPtr1RowR = srcPtr1Channel; srcPtr1RowG = srcPtr1RowR + srcDescPtr->strides.cStride; @@ -822,6 +830,10 @@ RppStatus exclusive_or_f16_f16_host_tensor(Rpp16f *srcPtr1, // Exclusive OR without fused output-layout toggle (NCHW -> NCHW for 3 channel) else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NCHW)) { +#if __AVX2__ + alignedLength = bufferLength & ~7; +#endif + Rpp16f *srcPtr1RowR, *srcPtr1RowG, *srcPtr1RowB, *srcPtr2RowR, *srcPtr2RowG, *srcPtr2RowB, *dstPtrRowR, *dstPtrRowG, *dstPtrRowB; srcPtr1RowR = srcPtr1Channel; srcPtr1RowG = srcPtr1RowR + srcDescPtr->strides.cStride; @@ -1131,6 +1143,10 @@ RppStatus exclusive_or_i8_i8_host_tensor(Rpp8s *srcPtr1, // Exclusive OR without fused output-layout toggle (NCHW -> NCHW for 3 channel) else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NCHW)) { +#if __AVX2__ + alignedLength = bufferLength & ~31; +#endif + Rpp8s *srcPtr1RowR, *srcPtr1RowG, *srcPtr1RowB, *srcPtr2RowR, *srcPtr2RowG, *srcPtr2RowB, *dstPtrRowR, *dstPtrRowG, *dstPtrRowB; srcPtr1RowR = srcPtr1Channel; srcPtr1RowG = srcPtr1RowR + srcDescPtr->strides.cStride; From ad5036d51c1e3a7df6436ae4913170450a83c68a Mon Sep 17 00:00:00 2001 From: Srihari-mcw Date: Mon, 30 Sep 2024 23:11:55 +0530 Subject: [PATCH 29/38] Updates to make F16 outputs consistent with other bit depths --- src/modules/cpu/kernel/exclusive_or.hpp | 80 ++++++++++++------------- 1 file changed, 40 insertions(+), 40 deletions(-) diff --git a/src/modules/cpu/kernel/exclusive_or.hpp b/src/modules/cpu/kernel/exclusive_or.hpp index 
215731eb7..588fe394b 100644 --- a/src/modules/cpu/kernel/exclusive_or.hpp +++ b/src/modules/cpu/kernel/exclusive_or.hpp @@ -403,9 +403,9 @@ RppStatus exclusive_or_f32_f32_host_tensor(Rpp32f *srcPtr1, rpp_simd_load(rpp_load24_f32pkd3_to_f32pln3_avx, srcPtr1Temp, p1); // simd loads rpp_simd_load(rpp_load24_f32pkd3_to_f32pln3_avx, srcPtr2Temp, p2); // simd loads - p1[0] = _mm256_cvtepi32_ps(_mm256_xor_si256(_mm256_cvttps_epi32(_mm256_mul_ps(p1[0], avx_p255)), _mm256_cvttps_epi32(_mm256_mul_ps(p2[0], avx_p255)))); // exclusive_or computation - p1[1] = _mm256_cvtepi32_ps(_mm256_xor_si256(_mm256_cvttps_epi32(_mm256_mul_ps(p1[1], avx_p255)), _mm256_cvttps_epi32(_mm256_mul_ps(p2[1], avx_p255)))); // exclusive_or computation - p1[2] = _mm256_cvtepi32_ps(_mm256_xor_si256(_mm256_cvttps_epi32(_mm256_mul_ps(p1[2], avx_p255)), _mm256_cvttps_epi32(_mm256_mul_ps(p2[2], avx_p255)))); // exclusive_or computation + p1[0] = _mm256_cvtepi32_ps(_mm256_xor_si256(_mm256_cvtps_epi32(_mm256_mul_ps(p1[0], avx_p255)), _mm256_cvtps_epi32(_mm256_mul_ps(p2[0], avx_p255)))); // exclusive_or computation + p1[1] = _mm256_cvtepi32_ps(_mm256_xor_si256(_mm256_cvtps_epi32(_mm256_mul_ps(p1[1], avx_p255)), _mm256_cvtps_epi32(_mm256_mul_ps(p2[1], avx_p255)))); // exclusive_or computation + p1[2] = _mm256_cvtepi32_ps(_mm256_xor_si256(_mm256_cvtps_epi32(_mm256_mul_ps(p1[2], avx_p255)), _mm256_cvtps_epi32(_mm256_mul_ps(p2[2], avx_p255)))); // exclusive_or computation p1[0] = _mm256_mul_ps(p1[0], avx_p1op255); p1[1] = _mm256_mul_ps(p1[1], avx_p1op255); p1[2] = _mm256_mul_ps(p1[2], avx_p1op255); @@ -420,9 +420,9 @@ RppStatus exclusive_or_f32_f32_host_tensor(Rpp32f *srcPtr1, #endif for (; vectorLoopCount < bufferLength; vectorLoopCount += 3) { - *dstPtrTempR++ = RPPPIXELCHECKF32((float)((uint)(srcPtr1Temp[0] * 255) ^ (uint)(srcPtr2Temp[0] * 255)) / 255); - *dstPtrTempG++ = RPPPIXELCHECKF32((float)((uint)(srcPtr1Temp[1] * 255) ^ (uint)(srcPtr2Temp[1] * 255)) / 255); - *dstPtrTempB++ = 
RPPPIXELCHECKF32((float)((uint)(srcPtr1Temp[2] * 255) ^ (uint)(srcPtr2Temp[2] * 255)) / 255); + *dstPtrTempR++ = RPPPIXELCHECKF32((float)((uint)(std::nearbyintf)(srcPtr1Temp[0] * 255) ^ (uint)(std::nearbyintf)(srcPtr2Temp[0] * 255)) / 255); + *dstPtrTempG++ = RPPPIXELCHECKF32((float)((uint)(std::nearbyintf)(srcPtr1Temp[1] * 255) ^ (uint)(std::nearbyintf)(srcPtr2Temp[1] * 255)) / 255); + *dstPtrTempB++ = RPPPIXELCHECKF32((float)((uint)(std::nearbyintf)(srcPtr1Temp[2] * 255) ^ (uint)(std::nearbyintf)(srcPtr2Temp[2] * 255)) / 255); srcPtr1Temp += 3; srcPtr2Temp += 3; @@ -467,9 +467,9 @@ RppStatus exclusive_or_f32_f32_host_tensor(Rpp32f *srcPtr1, rpp_simd_load(rpp_load24_f32pln3_to_f32pln3_avx, srcPtr1TempR, srcPtr1TempG, srcPtr1TempB, p1); // simd loads rpp_simd_load(rpp_load24_f32pln3_to_f32pln3_avx, srcPtr2TempR, srcPtr2TempG, srcPtr2TempB, p2); // simd loads - p1[0] = _mm256_cvtepi32_ps(_mm256_xor_si256(_mm256_cvttps_epi32(_mm256_mul_ps(p1[0], avx_p255)), _mm256_cvttps_epi32(_mm256_mul_ps(p2[0], avx_p255)))); // exclusive_or computation - p1[1] = _mm256_cvtepi32_ps(_mm256_xor_si256(_mm256_cvttps_epi32(_mm256_mul_ps(p1[1], avx_p255)), _mm256_cvttps_epi32(_mm256_mul_ps(p2[1], avx_p255)))); // exclusive_or computation - p1[2] = _mm256_cvtepi32_ps(_mm256_xor_si256(_mm256_cvttps_epi32(_mm256_mul_ps(p1[2], avx_p255)), _mm256_cvttps_epi32(_mm256_mul_ps(p2[2], avx_p255)))); // exclusive_or computation + p1[0] = _mm256_cvtepi32_ps(_mm256_xor_si256(_mm256_cvtps_epi32(_mm256_mul_ps(p1[0], avx_p255)), _mm256_cvtps_epi32(_mm256_mul_ps(p2[0], avx_p255)))); // exclusive_or computation + p1[1] = _mm256_cvtepi32_ps(_mm256_xor_si256(_mm256_cvtps_epi32(_mm256_mul_ps(p1[1], avx_p255)), _mm256_cvtps_epi32(_mm256_mul_ps(p2[1], avx_p255)))); // exclusive_or computation + p1[2] = _mm256_cvtepi32_ps(_mm256_xor_si256(_mm256_cvtps_epi32(_mm256_mul_ps(p1[2], avx_p255)), _mm256_cvtps_epi32(_mm256_mul_ps(p2[2], avx_p255)))); // exclusive_or computation p1[0] = _mm256_mul_ps(p1[0], avx_p1op255); 
p1[1] = _mm256_mul_ps(p1[1], avx_p1op255); p1[2] = _mm256_mul_ps(p1[2], avx_p1op255); @@ -486,9 +486,9 @@ RppStatus exclusive_or_f32_f32_host_tensor(Rpp32f *srcPtr1, #endif for (; vectorLoopCount < bufferLength; vectorLoopCount++) { - dstPtrTemp[0] = RPPPIXELCHECKF32((float)((uint)(*srcPtr1TempR * 255) ^ (uint)(*srcPtr2TempR * 255)) / 255); - dstPtrTemp[1] = RPPPIXELCHECKF32((float)((uint)(*srcPtr1TempG * 255) ^ (uint)(*srcPtr2TempG * 255)) / 255); - dstPtrTemp[2] = RPPPIXELCHECKF32((float)((uint)(*srcPtr1TempB * 255) ^ (uint)(*srcPtr2TempB * 255)) / 255); + dstPtrTemp[0] = RPPPIXELCHECKF32((float)((uint)(std::nearbyintf)(*srcPtr1TempR * 255) ^ (uint)(std::nearbyintf)(*srcPtr2TempR * 255)) / 255); + dstPtrTemp[1] = RPPPIXELCHECKF32((float)((uint)(std::nearbyintf)(*srcPtr1TempG * 255) ^ (uint)(std::nearbyintf)(*srcPtr2TempG * 255)) / 255); + dstPtrTemp[2] = RPPPIXELCHECKF32((float)((uint)(std::nearbyintf)(*srcPtr1TempB * 255) ^ (uint)(std::nearbyintf)(*srcPtr2TempB * 255)) / 255); srcPtr1TempR++; srcPtr1TempG++; @@ -548,9 +548,9 @@ RppStatus exclusive_or_f32_f32_host_tensor(Rpp32f *srcPtr1, rpp_simd_load(rpp_load24_f32pln3_to_f32pln3_avx, srcPtr1TempR, srcPtr1TempG, srcPtr1TempB, p1); // simd loads rpp_simd_load(rpp_load24_f32pln3_to_f32pln3_avx, srcPtr2TempR, srcPtr2TempG, srcPtr2TempB, p2); // simd loads - p1[0] = _mm256_cvtepi32_ps(_mm256_xor_si256(_mm256_cvttps_epi32(_mm256_mul_ps(p1[0], avx_p255)), _mm256_cvttps_epi32(_mm256_mul_ps(p2[0], avx_p255)))); // exclusive_or computation - p1[1] = _mm256_cvtepi32_ps(_mm256_xor_si256(_mm256_cvttps_epi32(_mm256_mul_ps(p1[1], avx_p255)), _mm256_cvttps_epi32(_mm256_mul_ps(p2[1], avx_p255)))); // exclusive_or computation - p1[2] = _mm256_cvtepi32_ps(_mm256_xor_si256(_mm256_cvttps_epi32(_mm256_mul_ps(p1[2], avx_p255)), _mm256_cvttps_epi32(_mm256_mul_ps(p2[2], avx_p255)))); // exclusive_or computation + p1[0] = _mm256_cvtepi32_ps(_mm256_xor_si256(_mm256_cvtps_epi32(_mm256_mul_ps(p1[0], avx_p255)), 
_mm256_cvtps_epi32(_mm256_mul_ps(p2[0], avx_p255)))); // exclusive_or computation + p1[1] = _mm256_cvtepi32_ps(_mm256_xor_si256(_mm256_cvtps_epi32(_mm256_mul_ps(p1[1], avx_p255)), _mm256_cvtps_epi32(_mm256_mul_ps(p2[1], avx_p255)))); // exclusive_or computation + p1[2] = _mm256_cvtepi32_ps(_mm256_xor_si256(_mm256_cvtps_epi32(_mm256_mul_ps(p1[2], avx_p255)), _mm256_cvtps_epi32(_mm256_mul_ps(p2[2], avx_p255)))); // exclusive_or computation p1[0] = _mm256_mul_ps(p1[0], avx_p1op255); p1[1] = _mm256_mul_ps(p1[1], avx_p1op255); p1[2] = _mm256_mul_ps(p1[2], avx_p1op255); @@ -569,9 +569,9 @@ RppStatus exclusive_or_f32_f32_host_tensor(Rpp32f *srcPtr1, #endif for (; vectorLoopCount < bufferLength; vectorLoopCount++) { - *dstPtrTempR = RPPPIXELCHECKF32((float)((uint)(*srcPtr1TempR * 255) ^ (uint)(*srcPtr2TempR * 255)) / 255); - *dstPtrTempG = RPPPIXELCHECKF32((float)((uint)(*srcPtr1TempG * 255) ^ (uint)(*srcPtr2TempG * 255)) / 255); - *dstPtrTempB = RPPPIXELCHECKF32((float)((uint)(*srcPtr1TempB * 255) ^ (uint)(*srcPtr2TempB * 255)) / 255); + *dstPtrTempR = RPPPIXELCHECKF32((float)((uint)(std::nearbyintf)(*srcPtr1TempR * 255) ^ (uint)(std::nearbyintf)(*srcPtr2TempR * 255)) / 255); + *dstPtrTempG = RPPPIXELCHECKF32((float)((uint)(std::nearbyintf)(*srcPtr1TempG * 255) ^ (uint)(std::nearbyintf)(*srcPtr2TempG * 255)) / 255); + *dstPtrTempB = RPPPIXELCHECKF32((float)((uint)(std::nearbyintf)(*srcPtr1TempB * 255) ^ (uint)(std::nearbyintf)(*srcPtr2TempB * 255)) / 255); srcPtr1TempR++; srcPtr1TempG++; @@ -623,7 +623,7 @@ RppStatus exclusive_or_f32_f32_host_tensor(Rpp32f *srcPtr1, rpp_simd_load(rpp_load8_f32_to_f32_avx, srcPtr1Temp, p1); // simd loads rpp_simd_load(rpp_load8_f32_to_f32_avx, srcPtr2Temp, p2); // simd loads - p1[0] = _mm256_cvtepi32_ps(_mm256_xor_si256(_mm256_cvttps_epi32(_mm256_mul_ps(p1[0], avx_p255)), _mm256_cvttps_epi32(_mm256_mul_ps(p2[0], avx_p255)))); // exclusive_or computation + p1[0] = _mm256_cvtepi32_ps(_mm256_xor_si256(_mm256_cvtps_epi32(_mm256_mul_ps(p1[0], 
avx_p255)), _mm256_cvtps_epi32(_mm256_mul_ps(p2[0], avx_p255)))); // exclusive_or computation p1[0] = _mm256_mul_ps(p1[0], avx_p1op255); rpp_simd_store(rpp_store8_f32_to_f32_avx, dstPtrTemp, p1); // simd stores @@ -634,7 +634,7 @@ RppStatus exclusive_or_f32_f32_host_tensor(Rpp32f *srcPtr1, #endif for (; vectorLoopCount < bufferLength; vectorLoopCount++) { - *dstPtrTemp++ = RPPPIXELCHECKF32((float)((uint)(*srcPtr1Temp * 255) ^ (uint)(*srcPtr2Temp * 255)) / 255); + *dstPtrTemp++ = RPPPIXELCHECKF32((float)((uint)(std::nearbyintf)(*srcPtr1Temp * 255) ^ (uint)(std::nearbyintf)(*srcPtr2Temp * 255)) / 255); srcPtr1Temp++; srcPtr2Temp++; @@ -721,9 +721,9 @@ RppStatus exclusive_or_f16_f16_host_tensor(Rpp16f *srcPtr1, rpp_simd_load(rpp_load24_f16pkd3_to_f32pln3_avx, srcPtr1Temp, p1); // simd loads rpp_simd_load(rpp_load24_f16pkd3_to_f32pln3_avx, srcPtr2Temp, p2); // simd loads - p1[0] = _mm256_cvtepi32_ps(_mm256_xor_si256(_mm256_cvttps_epi32(_mm256_mul_ps(p1[0], avx_p255)), _mm256_cvttps_epi32(_mm256_mul_ps(p2[0], avx_p255)))); // exclusive_or computation - p1[1] = _mm256_cvtepi32_ps(_mm256_xor_si256(_mm256_cvttps_epi32(_mm256_mul_ps(p1[1], avx_p255)), _mm256_cvttps_epi32(_mm256_mul_ps(p2[1], avx_p255)))); // exclusive_or computation - p1[2] = _mm256_cvtepi32_ps(_mm256_xor_si256(_mm256_cvttps_epi32(_mm256_mul_ps(p1[2], avx_p255)), _mm256_cvttps_epi32(_mm256_mul_ps(p2[2], avx_p255)))); // exclusive_or computation + p1[0] = _mm256_cvtepi32_ps(_mm256_xor_si256(_mm256_cvtps_epi32(_mm256_mul_ps(p1[0], avx_p255)), _mm256_cvtps_epi32(_mm256_mul_ps(p2[0], avx_p255)))); // exclusive_or computation + p1[1] = _mm256_cvtepi32_ps(_mm256_xor_si256(_mm256_cvtps_epi32(_mm256_mul_ps(p1[1], avx_p255)), _mm256_cvtps_epi32(_mm256_mul_ps(p2[1], avx_p255)))); // exclusive_or computation + p1[2] = _mm256_cvtepi32_ps(_mm256_xor_si256(_mm256_cvtps_epi32(_mm256_mul_ps(p1[2], avx_p255)), _mm256_cvtps_epi32(_mm256_mul_ps(p2[2], avx_p255)))); // exclusive_or computation p1[0] = _mm256_mul_ps(p1[0], 
avx_p1op255); p1[1] = _mm256_mul_ps(p1[1], avx_p1op255); p1[2] = _mm256_mul_ps(p1[2], avx_p1op255); @@ -738,9 +738,9 @@ RppStatus exclusive_or_f16_f16_host_tensor(Rpp16f *srcPtr1, #endif for (; vectorLoopCount < bufferLength; vectorLoopCount += 3) { - *dstPtrTempR++ = static_cast(RPPPIXELCHECKF32((float)((uint)(srcPtr1Temp[0] * 255) ^ (uint)(srcPtr2Temp[0] * 255)) / 255)); - *dstPtrTempG++ = static_cast(RPPPIXELCHECKF32((float)((uint)(srcPtr1Temp[1] * 255) ^ (uint)(srcPtr2Temp[1] * 255)) / 255)); - *dstPtrTempB++ = static_cast(RPPPIXELCHECKF32((float)((uint)(srcPtr1Temp[2] * 255) ^ (uint)(srcPtr2Temp[2] * 255)) / 255)); + *dstPtrTempR++ = static_cast(RPPPIXELCHECKF32((float)((uint)(std::nearbyintf)(srcPtr1Temp[0] * 255) ^ (uint)(std::nearbyintf)(srcPtr2Temp[0] * 255)) / 255)); + *dstPtrTempG++ = static_cast(RPPPIXELCHECKF32((float)((uint)(std::nearbyintf)(srcPtr1Temp[1] * 255) ^ (uint)(std::nearbyintf)(srcPtr2Temp[1] * 255)) / 255)); + *dstPtrTempB++ = static_cast(RPPPIXELCHECKF32((float)((uint)(std::nearbyintf)(srcPtr1Temp[2] * 255) ^ (uint)(std::nearbyintf)(srcPtr2Temp[2] * 255)) / 255)); srcPtr1Temp += 3; srcPtr2Temp += 3; @@ -785,9 +785,9 @@ RppStatus exclusive_or_f16_f16_host_tensor(Rpp16f *srcPtr1, rpp_simd_load(rpp_load24_f16pln3_to_f32pln3_avx, srcPtr1TempR, srcPtr1TempG, srcPtr1TempB, p1); // simd loads rpp_simd_load(rpp_load24_f16pln3_to_f32pln3_avx, srcPtr2TempR, srcPtr2TempG, srcPtr2TempB, p2); // simd loads - p1[0] = _mm256_cvtepi32_ps(_mm256_xor_si256(_mm256_cvttps_epi32(_mm256_mul_ps(p1[0], avx_p255)), _mm256_cvttps_epi32(_mm256_mul_ps(p2[0], avx_p255)))); // exclusive_or computation - p1[1] = _mm256_cvtepi32_ps(_mm256_xor_si256(_mm256_cvttps_epi32(_mm256_mul_ps(p1[1], avx_p255)), _mm256_cvttps_epi32(_mm256_mul_ps(p2[1], avx_p255)))); // exclusive_or computation - p1[2] = _mm256_cvtepi32_ps(_mm256_xor_si256(_mm256_cvttps_epi32(_mm256_mul_ps(p1[2], avx_p255)), _mm256_cvttps_epi32(_mm256_mul_ps(p2[2], avx_p255)))); // exclusive_or computation + p1[0] = 
_mm256_cvtepi32_ps(_mm256_xor_si256(_mm256_cvtps_epi32(_mm256_mul_ps(p1[0], avx_p255)), _mm256_cvtps_epi32(_mm256_mul_ps(p2[0], avx_p255)))); // exclusive_or computation + p1[1] = _mm256_cvtepi32_ps(_mm256_xor_si256(_mm256_cvtps_epi32(_mm256_mul_ps(p1[1], avx_p255)), _mm256_cvtps_epi32(_mm256_mul_ps(p2[1], avx_p255)))); // exclusive_or computation + p1[2] = _mm256_cvtepi32_ps(_mm256_xor_si256(_mm256_cvtps_epi32(_mm256_mul_ps(p1[2], avx_p255)), _mm256_cvtps_epi32(_mm256_mul_ps(p2[2], avx_p255)))); // exclusive_or computation p1[0] = _mm256_mul_ps(p1[0], avx_p1op255); p1[1] = _mm256_mul_ps(p1[1], avx_p1op255); p1[2] = _mm256_mul_ps(p1[2], avx_p1op255); @@ -804,9 +804,9 @@ RppStatus exclusive_or_f16_f16_host_tensor(Rpp16f *srcPtr1, #endif for (; vectorLoopCount < bufferLength; vectorLoopCount++) { - dstPtrTemp[0] = static_cast(RPPPIXELCHECKF32((float)((uint)(*srcPtr1TempR * 255) ^ (uint)(*srcPtr2TempR * 255)) / 255)); - dstPtrTemp[1] = static_cast(RPPPIXELCHECKF32((float)((uint)(*srcPtr1TempG * 255) ^ (uint)(*srcPtr2TempG * 255)) / 255)); - dstPtrTemp[2] = static_cast(RPPPIXELCHECKF32((float)((uint)(*srcPtr1TempB * 255) ^ (uint)(*srcPtr2TempB * 255)) / 255)); + dstPtrTemp[0] = static_cast(RPPPIXELCHECKF32((float)((uint)(std::nearbyintf)(*srcPtr1TempR * 255) ^ (uint)(std::nearbyintf)(*srcPtr2TempR * 255)) / 255)); + dstPtrTemp[1] = static_cast(RPPPIXELCHECKF32((float)((uint)(std::nearbyintf)(*srcPtr1TempG * 255) ^ (uint)(std::nearbyintf)(*srcPtr2TempG * 255)) / 255)); + dstPtrTemp[2] = static_cast(RPPPIXELCHECKF32((float)((uint)(std::nearbyintf)(*srcPtr1TempB * 255) ^ (uint)(std::nearbyintf)(*srcPtr2TempB * 255)) / 255)); srcPtr1TempR++; srcPtr1TempG++; @@ -865,9 +865,9 @@ RppStatus exclusive_or_f16_f16_host_tensor(Rpp16f *srcPtr1, rpp_simd_load(rpp_load24_f16pln3_to_f32pln3_avx, srcPtr1TempR, srcPtr1TempG, srcPtr1TempB, p1); // simd loads rpp_simd_load(rpp_load24_f16pln3_to_f32pln3_avx, srcPtr2TempR, srcPtr2TempG, srcPtr2TempB, p2); // simd loads - p1[0] = 
_mm256_cvtepi32_ps(_mm256_xor_si256(_mm256_cvttps_epi32(_mm256_mul_ps(p1[0], avx_p255)), _mm256_cvttps_epi32(_mm256_mul_ps(p2[0], avx_p255)))); // exclusive_or computation - p1[1] = _mm256_cvtepi32_ps(_mm256_xor_si256(_mm256_cvttps_epi32(_mm256_mul_ps(p1[1], avx_p255)), _mm256_cvttps_epi32(_mm256_mul_ps(p2[1], avx_p255)))); // exclusive_or computation - p1[2] = _mm256_cvtepi32_ps(_mm256_xor_si256(_mm256_cvttps_epi32(_mm256_mul_ps(p1[2], avx_p255)), _mm256_cvttps_epi32(_mm256_mul_ps(p2[2], avx_p255)))); // exclusive_or computation + p1[0] = _mm256_cvtepi32_ps(_mm256_xor_si256(_mm256_cvtps_epi32(_mm256_mul_ps(p1[0], avx_p255)), _mm256_cvtps_epi32(_mm256_mul_ps(p2[0], avx_p255)))); // exclusive_or computation + p1[1] = _mm256_cvtepi32_ps(_mm256_xor_si256(_mm256_cvtps_epi32(_mm256_mul_ps(p1[1], avx_p255)), _mm256_cvtps_epi32(_mm256_mul_ps(p2[1], avx_p255)))); // exclusive_or computation + p1[2] = _mm256_cvtepi32_ps(_mm256_xor_si256(_mm256_cvtps_epi32(_mm256_mul_ps(p1[2], avx_p255)), _mm256_cvtps_epi32(_mm256_mul_ps(p2[2], avx_p255)))); // exclusive_or computation p1[0] = _mm256_mul_ps(p1[0], avx_p1op255); p1[1] = _mm256_mul_ps(p1[1], avx_p1op255); p1[2] = _mm256_mul_ps(p1[2], avx_p1op255); @@ -886,9 +886,9 @@ RppStatus exclusive_or_f16_f16_host_tensor(Rpp16f *srcPtr1, #endif for (; vectorLoopCount < bufferLength; vectorLoopCount++) { - *dstPtrTempR = static_cast(RPPPIXELCHECKF32((float)((uint)(*srcPtr1TempR * 255) ^ (uint)(*srcPtr2TempR * 255)) / 255)); - *dstPtrTempG = static_cast(RPPPIXELCHECKF32((float)((uint)(*srcPtr1TempG * 255) ^ (uint)(*srcPtr2TempG * 255)) / 255)); - *dstPtrTempB = static_cast(RPPPIXELCHECKF32((float)((uint)(*srcPtr1TempB * 255) ^ (uint)(*srcPtr2TempB * 255)) / 255)); + *dstPtrTempR = static_cast(RPPPIXELCHECKF32((float)((uint)(std::nearbyintf)(*srcPtr1TempR * 255) ^ (uint)(std::nearbyintf)(*srcPtr2TempR * 255)) / 255)); + *dstPtrTempG = static_cast(RPPPIXELCHECKF32((float)((uint)(std::nearbyintf)(*srcPtr1TempG * 255) ^ 
(uint)(std::nearbyintf)(*srcPtr2TempG * 255)) / 255)); + *dstPtrTempB = static_cast(RPPPIXELCHECKF32((float)((uint)(std::nearbyintf)(*srcPtr1TempB * 255) ^ (uint)(std::nearbyintf)(*srcPtr2TempB * 255)) / 255)); srcPtr1TempR++; srcPtr1TempG++; @@ -942,7 +942,7 @@ RppStatus exclusive_or_f16_f16_host_tensor(Rpp16f *srcPtr1, rpp_simd_load(rpp_load8_f16_to_f32_avx, srcPtr1Temp, p1); // simd loads rpp_simd_load(rpp_load8_f16_to_f32_avx, srcPtr2Temp, p2); // simd loads - p1[0] = _mm256_cvtepi32_ps(_mm256_xor_si256(_mm256_cvttps_epi32(_mm256_mul_ps(p1[0], avx_p255)), _mm256_cvttps_epi32(_mm256_mul_ps(p2[0], avx_p255)))); // exclusive_or computation + p1[0] = _mm256_cvtepi32_ps(_mm256_xor_si256(_mm256_cvtps_epi32(_mm256_mul_ps(p1[0], avx_p255)), _mm256_cvtps_epi32(_mm256_mul_ps(p2[0], avx_p255)))); // exclusive_or computation p1[0] = _mm256_mul_ps(p1[0], avx_p1op255); rpp_simd_store(rpp_store8_f32_to_f16_avx, dstPtrTemp, p1); // simd stores @@ -953,7 +953,7 @@ RppStatus exclusive_or_f16_f16_host_tensor(Rpp16f *srcPtr1, #endif for (; vectorLoopCount < bufferLength; vectorLoopCount++) { - *dstPtrTemp++ = static_cast(RPPPIXELCHECKF32((float)((uint)(*srcPtr1Temp * 255) ^ (uint)(*srcPtr2Temp * 255)) / 255)); + *dstPtrTemp++ = static_cast(RPPPIXELCHECKF32((float)((uint)(std::nearbyintf)(*srcPtr1Temp * 255) ^ (uint)(std::nearbyintf)(*srcPtr2Temp * 255)) / 255)); srcPtr1Temp++; srcPtr2Temp++; From a86e2b0355d55db9ac5895f1e3078411cb3e5e56 Mon Sep 17 00:00:00 2001 From: Srihari-mcw Date: Tue, 1 Oct 2024 15:03:13 +0530 Subject: [PATCH 30/38] Add std::nearbyintf in exclusive or hip code --- src/include/hip/rpp_hip_common.hpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/include/hip/rpp_hip_common.hpp b/src/include/hip/rpp_hip_common.hpp index 1bdf362a3..5c4a07885 100644 --- a/src/include/hip/rpp_hip_common.hpp +++ b/src/include/hip/rpp_hip_common.hpp @@ -1828,14 +1828,14 @@ __device__ __forceinline__ void rpp_hip_math_bitwiseOr8(d_float8 
*src1_f8, d_flo __device__ __forceinline__ void rpp_hip_math_exclusiveOr8(d_float8 *src1_f8, d_float8 *src2_f8, d_float8 *dst_f8) { - dst_f8->f1[0] = (float)((uchar)(src1_f8->f1[0]) ^ (uchar)(src2_f8->f1[0])); - dst_f8->f1[1] = (float)((uchar)(src1_f8->f1[1]) ^ (uchar)(src2_f8->f1[1])); - dst_f8->f1[2] = (float)((uchar)(src1_f8->f1[2]) ^ (uchar)(src2_f8->f1[2])); - dst_f8->f1[3] = (float)((uchar)(src1_f8->f1[3]) ^ (uchar)(src2_f8->f1[3])); - dst_f8->f1[4] = (float)((uchar)(src1_f8->f1[4]) ^ (uchar)(src2_f8->f1[4])); - dst_f8->f1[5] = (float)((uchar)(src1_f8->f1[5]) ^ (uchar)(src2_f8->f1[5])); - dst_f8->f1[6] = (float)((uchar)(src1_f8->f1[6]) ^ (uchar)(src2_f8->f1[6])); - dst_f8->f1[7] = (float)((uchar)(src1_f8->f1[7]) ^ (uchar)(src2_f8->f1[7])); + dst_f8->f1[0] = (float)((uchar)(std::nearbyintf)(src1_f8->f1[0]) ^ (uchar)(std::nearbyintf)(src2_f8->f1[0])); + dst_f8->f1[1] = (float)((uchar)(std::nearbyintf)(src1_f8->f1[1]) ^ (uchar)(std::nearbyintf)(src2_f8->f1[1])); + dst_f8->f1[2] = (float)((uchar)(std::nearbyintf)(src1_f8->f1[2]) ^ (uchar)(std::nearbyintf)(src2_f8->f1[2])); + dst_f8->f1[3] = (float)((uchar)(std::nearbyintf)(src1_f8->f1[3]) ^ (uchar)(std::nearbyintf)(src2_f8->f1[3])); + dst_f8->f1[4] = (float)((uchar)(std::nearbyintf)(src1_f8->f1[4]) ^ (uchar)(std::nearbyintf)(src2_f8->f1[4])); + dst_f8->f1[5] = (float)((uchar)(std::nearbyintf)(src1_f8->f1[5]) ^ (uchar)(std::nearbyintf)(src2_f8->f1[5])); + dst_f8->f1[6] = (float)((uchar)(std::nearbyintf)(src1_f8->f1[6]) ^ (uchar)(std::nearbyintf)(src2_f8->f1[6])); + dst_f8->f1[7] = (float)((uchar)(std::nearbyintf)(src1_f8->f1[7]) ^ (uchar)(std::nearbyintf)(src2_f8->f1[7])); } __device__ __forceinline__ float rpp_hip_math_inverse_sqrt1(float x) From bb3a55a3b68c7ee3c8ec1abd97b392aac9aef9f5 Mon Sep 17 00:00:00 2001 From: Srihari-mcw Date: Tue, 8 Oct 2024 12:27:02 +0530 Subject: [PATCH 31/38] Update the code to use predefined zero vectors --- src/include/cpu/rpp_cpu_simd.hpp | 10 ++++------ 1 file changed, 4 
insertions(+), 6 deletions(-) diff --git a/src/include/cpu/rpp_cpu_simd.hpp b/src/include/cpu/rpp_cpu_simd.hpp index 8cd0fc592..5f8af1bdf 100644 --- a/src/include/cpu/rpp_cpu_simd.hpp +++ b/src/include/cpu/rpp_cpu_simd.hpp @@ -521,11 +521,10 @@ inline void rpp_store48_u8pln3_to_u8pkd3(Rpp8u *dstPtr, __m128i *px) inline void rpp_store96_u8pln3_to_u8pkd3(Rpp8u *dstPtr, __m256i *px) { __m256i pxDst[8]; - __m256i pxZero = _mm256_setzero_si256(); __m256i pxMaskRGBAtoRGB = _mm256_castsi128_si256(_mm_setr_epi8(0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 3, 7, 11, 15)); pxMaskRGBAtoRGB = _mm256_permute2f128_si256(pxMaskRGBAtoRGB, pxMaskRGBAtoRGB, 0); - pxDst[0] = _mm256_unpacklo_epi8(px[1], pxZero); - pxDst[1] = _mm256_unpackhi_epi8(px[1], pxZero); + pxDst[0] = _mm256_unpacklo_epi8(px[1], avx_px0); + pxDst[1] = _mm256_unpackhi_epi8(px[1], avx_px0); pxDst[2] = _mm256_unpacklo_epi8(px[0], px[2]); pxDst[3] = _mm256_unpackhi_epi8(px[0], px[2]); pxDst[4] = _mm256_shuffle_epi8(_mm256_unpacklo_epi8(pxDst[2], pxDst[0]), pxMaskRGBAtoRGB); @@ -933,11 +932,10 @@ inline void rpp_store48_u8pln3_to_i8pkd3(Rpp8s *dstPtr, __m128i *px) inline void rpp_store96_u8pln3_to_i8pkd3(Rpp8s *dstPtr, __m256i *px) { __m256i pxDst[8]; - __m256i pxZero = _mm256_setzero_si256(); __m256i pxMaskRGBAtoRGB = _mm256_castsi128_si256(_mm_setr_epi8(0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 3, 7, 11, 15)); pxMaskRGBAtoRGB = _mm256_permute2f128_si256(pxMaskRGBAtoRGB, pxMaskRGBAtoRGB, 0); - pxDst[0] = _mm256_unpacklo_epi8(px[1], pxZero); - pxDst[1] = _mm256_unpackhi_epi8(px[1], pxZero); + pxDst[0] = _mm256_unpacklo_epi8(px[1], avx_px0); + pxDst[1] = _mm256_unpackhi_epi8(px[1], avx_px0); pxDst[2] = _mm256_unpacklo_epi8(px[0], px[2]); pxDst[3] = _mm256_unpackhi_epi8(px[0], px[2]); pxDst[4] = _mm256_sub_epi8(_mm256_shuffle_epi8(_mm256_unpacklo_epi8(pxDst[2], pxDst[0]), pxMaskRGBAtoRGB), avx_pxConvertI8); From 6f79652f42f005fa8b92819da44b2b55ea754ec6 Mon Sep 17 00:00:00 2001 From: Srihari-mcw Date: Wed, 9 Oct 2024 08:08:43 
+0530 Subject: [PATCH 32/38] Update to use existing rpp_load96_u8_avx instead of rpp_load96_u8pln3_to_u8pln3 --- src/include/cpu/rpp_cpu_simd.hpp | 7 ------- src/modules/cpu/kernel/exclusive_or.hpp | 8 ++++---- 2 files changed, 4 insertions(+), 11 deletions(-) diff --git a/src/include/cpu/rpp_cpu_simd.hpp b/src/include/cpu/rpp_cpu_simd.hpp index 5f8af1bdf..34a4b7bda 100644 --- a/src/include/cpu/rpp_cpu_simd.hpp +++ b/src/include/cpu/rpp_cpu_simd.hpp @@ -496,13 +496,6 @@ inline void rpp_load48_u8pln3_to_u8pln3(Rpp8u *srcPtrR, Rpp8u *srcPtrG, Rpp8u *s px[2] = _mm_loadu_si128((__m128i *)srcPtrB); /* load [B01|B02|B03|B04|B05|B06|B07|B08|B09|B10|B11|B12|B13|B14|B15|B16] */ } -inline void rpp_load96_u8pln3_to_u8pln3(Rpp8u *srcPtrR, Rpp8u *srcPtrG, Rpp8u *srcPtrB, __m256i *px) -{ - px[0] = _mm256_loadu_si256((__m256i *)srcPtrR); /* load [R01|R02|R03|R04|R05|R06|R07|R08|R09|R10|R11|R12|R13|R14|R15|R16|R17|R18|R19|R20|R21|R22|R23|R24|R25|R26|R27|R28|R29|R30|R31|R32] */ - px[1] = _mm256_loadu_si256((__m256i *)srcPtrG); /* load [G01|G02|G03|G04|G05|G06|G07|G08|G09|G10|G11|G12|G13|G14|G15|G16|G17|G18|G19|G20|G21|G22|G23|G24|G25|G26|G27|G28|G29|G30|G31|G32] */ - px[2] = _mm256_loadu_si256((__m256i *)srcPtrB); /* load [B01|B02|B03|B04|B05|B06|B07|B08|B09|B10|B11|B12|B13|B14|B15|B16|B17|B18|B19|B20|B21|B22|B23|B24|B25|B26|B27|B28|B29|B30|B31|B32] */ -} - inline void rpp_store48_u8pln3_to_u8pkd3(Rpp8u *dstPtr, __m128i *px) { __m128i pxDst[4]; diff --git a/src/modules/cpu/kernel/exclusive_or.hpp b/src/modules/cpu/kernel/exclusive_or.hpp index 588fe394b..5ea193d41 100644 --- a/src/modules/cpu/kernel/exclusive_or.hpp +++ b/src/modules/cpu/kernel/exclusive_or.hpp @@ -156,8 +156,8 @@ RppStatus exclusive_or_u8_u8_host_tensor(Rpp8u *srcPtr1, { __m256i p1[3], p2[3]; - rpp_simd_load(rpp_load96_u8pln3_to_u8pln3, srcPtr1TempR, srcPtr1TempG, srcPtr1TempB, p1); // simd loads - rpp_simd_load(rpp_load96_u8pln3_to_u8pln3, srcPtr2TempR, srcPtr2TempG, srcPtr2TempB, p2); // simd loads + 
rpp_simd_load(rpp_load96_u8_avx, srcPtr1TempR, srcPtr1TempG, srcPtr1TempB, p1); // simd loads + rpp_simd_load(rpp_load96_u8_avx, srcPtr2TempR, srcPtr2TempG, srcPtr2TempB, p2); // simd loads p1[0] = _mm256_xor_si256(p1[0], p2[0]); // exclusive_or computation p1[1] = _mm256_xor_si256(p1[1], p2[1]); // exclusive_or computation p1[2] = _mm256_xor_si256(p1[2], p2[2]); // exclusive_or computation @@ -233,8 +233,8 @@ RppStatus exclusive_or_u8_u8_host_tensor(Rpp8u *srcPtr1, { __m256i p1[3], p2[3]; - rpp_simd_load(rpp_load96_u8pln3_to_u8pln3, srcPtr1TempR, srcPtr1TempG, srcPtr1TempB, p1); // simd loads - rpp_simd_load(rpp_load96_u8pln3_to_u8pln3, srcPtr2TempR, srcPtr2TempG, srcPtr2TempB, p2); // simd loads + rpp_simd_load(rpp_load96_u8_avx, srcPtr1TempR, srcPtr1TempG, srcPtr1TempB, p1); // simd loads + rpp_simd_load(rpp_load96_u8_avx, srcPtr2TempR, srcPtr2TempG, srcPtr2TempB, p2); // simd loads p1[0] = _mm256_xor_si256(p1[0], p2[0]); // exclusive_or computation p1[1] = _mm256_xor_si256(p1[1], p2[1]); // exclusive_or computation p1[2] = _mm256_xor_si256(p1[2], p2[2]); // exclusive_or computation From db5a2ac46bde173fb4230892dd6ba2e35fba28aa Mon Sep 17 00:00:00 2001 From: Srihari-mcw Date: Wed, 23 Oct 2024 16:26:47 +0530 Subject: [PATCH 33/38] Update the version --- CMakeLists.txt | 2 +- include/rpp_version.h | 4 ++-- utilities/test_suite/rpp_test_suite_image.h | 1 + 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index cee0b37d3..9f7b70eb1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -29,7 +29,7 @@ endif() set(CMAKE_CXX_STANDARD 17) # RPP Version -set(VERSION "1.9.3") +set(VERSION "1.15.0") # Set Project Version and Language project(rpp VERSION ${VERSION} LANGUAGES CXX) diff --git a/include/rpp_version.h b/include/rpp_version.h index 68dae8b63..0c7575ef3 100644 --- a/include/rpp_version.h +++ b/include/rpp_version.h @@ -39,8 +39,8 @@ extern "C" { #endif // NOTE: IMPORTANT: Match the version with CMakelists.txt version 
#define RPP_VERSION_MAJOR 1 -#define RPP_VERSION_MINOR 9 -#define RPP_VERSION_PATCH 3 +#define RPP_VERSION_MINOR 15 +#define RPP_VERSION_PATCH 0 #ifdef __cplusplus } #endif diff --git a/utilities/test_suite/rpp_test_suite_image.h b/utilities/test_suite/rpp_test_suite_image.h index ef851e3db..71fbfb33b 100644 --- a/utilities/test_suite/rpp_test_suite_image.h +++ b/utilities/test_suite/rpp_test_suite_image.h @@ -104,6 +104,7 @@ std::map augmentationMap = {61, "magnitude"}, {63, "phase"}, {65, "bitwise_and"}, + {67, "exclusive_or"}, {68, "bitwise_or"}, {70, "copy"}, {79, "remap"}, From a9363ce4daed31be4cd8301e218c44dc94505766 Mon Sep 17 00:00:00 2001 From: Srihari-mcw Date: Wed, 23 Oct 2024 16:28:58 +0530 Subject: [PATCH 34/38] Update changelog --- CHANGELOG.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index fc7983d75..5e70c1b9f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,12 @@ Full documentation for RPP is available at [https://rocm.docs.amd.com/projects/rpp/en/latest](https://rocm.docs.amd.com/projects/rpp/en/latest) +## RPP 1.15.0 (Unreleased) + +### Changes + +* RPP Tensor Warp Perspective support on HOST and HIP + ## RPP 1.9.3 (unreleased) ### Changes From 97653cd64a2198291a30ab4c2f96f384741feaae Mon Sep 17 00:00:00 2001 From: Abishek <52214183+r-abishek@users.noreply.github.com> Date: Wed, 27 Nov 2024 01:51:58 -0800 Subject: [PATCH 35/38] Update CHANGELOG.md --- CHANGELOG.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0287aee92..cbbdaf390 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,13 +4,13 @@ Full documentation for RPP is available at [https://rocm.docs.amd.com/projects/r ## (Unreleased) RPP 1.9.9 -### Changes +### Changed * RPP Tensor Exclusive-Or support on HOST and HIP ## (Unreleased) RPP 1.9.4 -### Changes +### Changed * AMD Clang is now the default CXX and C compiler * RPP Tensor Box Filter support on HOST From 
0b2be7e9e3083f7e5127cf8f346e553bb13362e2 Mon Sep 17 00:00:00 2001 From: Srihari-mcw Date: Fri, 6 Dec 2024 17:06:24 +0530 Subject: [PATCH 36/38] Updates to fix more merge conflicts --- CMakeLists.txt | 4 ---- 1 file changed, 4 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index a7a8b7bd8..93527ed2c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -43,11 +43,7 @@ endif() set(CMAKE_CXX_STANDARD 17) # RPP Version -<<<<<<< HEAD set(VERSION "1.9.9") -======= -set(VERSION "1.9.5") ->>>>>>> opensource_develop_branch # Set Project Version and Language project(rpp VERSION ${VERSION} LANGUAGES CXX) From 2d9f0f25056e5321000a574180361f8588c32fec Mon Sep 17 00:00:00 2001 From: Srihari-mcw Date: Mon, 9 Dec 2024 15:38:45 +0530 Subject: [PATCH 37/38] Update version to 1.9.10 including exclusive or --- CHANGELOG.md | 7 +------ CMakeLists.txt | 2 +- include/rpp_version.h | 2 +- 3 files changed, 3 insertions(+), 8 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b0b749808..25ee4b814 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,17 +2,12 @@ Full documentation for RPP is available at [https://rocm.docs.amd.com/projects/rpp/en/latest](https://rocm.docs.amd.com/projects/rpp/en/latest) -## (Unreleased) RPP 1.9.9 - -### Changed - -* RPP Tensor Exclusive-Or support on HOST and HIP - ## (Unreleased) RPP 1.9.10 ### Changed * RPP Tensor Gaussian Filter support on HOST +* RPP Tensor Exclusive-Or support on HOST and HIP ## (Unreleased) RPP 1.9.4 diff --git a/CMakeLists.txt b/CMakeLists.txt index 93527ed2c..b6f91325d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -43,7 +43,7 @@ endif() set(CMAKE_CXX_STANDARD 17) # RPP Version -set(VERSION "1.9.9") +set(VERSION "1.9.10") # Set Project Version and Language project(rpp VERSION ${VERSION} LANGUAGES CXX) diff --git a/include/rpp_version.h b/include/rpp_version.h index 87a462929..ff042cdbf 100644 --- a/include/rpp_version.h +++ b/include/rpp_version.h @@ -40,7 +40,7 @@ extern "C" { // NOTE: IMPORTANT: Match the 
version with CMakelists.txt version #define RPP_VERSION_MAJOR 1 #define RPP_VERSION_MINOR 9 -#define RPP_VERSION_PATCH 9 +#define RPP_VERSION_PATCH 10 #ifdef __cplusplus } #endif From 40252a13c0321124c7130b7bdbc2c60d3691f139 Mon Sep 17 00:00:00 2001 From: Srihari-mcw Date: Fri, 13 Dec 2024 12:35:19 +0530 Subject: [PATCH 38/38] Remove duplicate definitions of functions --- src/include/cpu/rpp_cpu_simd.hpp | 30 ------------------------------ 1 file changed, 30 deletions(-) diff --git a/src/include/cpu/rpp_cpu_simd.hpp b/src/include/cpu/rpp_cpu_simd.hpp index 3b62da3e2..b02cd23b5 100644 --- a/src/include/cpu/rpp_cpu_simd.hpp +++ b/src/include/cpu/rpp_cpu_simd.hpp @@ -1723,24 +1723,6 @@ inline void rpp_store24_f32pln3_to_f32pln3_avx(Rpp32f *dstPtrR, Rpp32f *dstPtrG, _mm256_storeu_ps(dstPtrB, p[2]); } -inline void rpp_load24_f16pkd3_to_f32pln3_avx(Rpp16f *srcPtr, __m256 *p) -{ - __m128 p128[8]; - p128[0] = _mm_cvtph_ps(_mm_castps_si128(_mm_loadu_ps(reinterpret_cast(srcPtr)))); - p128[1] = _mm_cvtph_ps(_mm_castps_si128(_mm_loadu_ps(reinterpret_cast(srcPtr + 3)))); - p128[2] = _mm_cvtph_ps(_mm_castps_si128(_mm_loadu_ps(reinterpret_cast(srcPtr + 6)))); - p128[3] = _mm_cvtph_ps(_mm_castps_si128(_mm_loadu_ps(reinterpret_cast(srcPtr + 9)))); - _MM_TRANSPOSE4_PS(p128[0], p128[1], p128[2], p128[3]); - p128[4] = _mm_cvtph_ps(_mm_castps_si128(_mm_loadu_ps(reinterpret_cast(srcPtr + 12)))); - p128[5] = _mm_cvtph_ps(_mm_castps_si128(_mm_loadu_ps(reinterpret_cast(srcPtr + 15)))); - p128[6] = _mm_cvtph_ps(_mm_castps_si128(_mm_loadu_ps(reinterpret_cast(srcPtr + 18)))); - p128[7] = _mm_cvtph_ps(_mm_castps_si128(_mm_loadu_ps(reinterpret_cast(srcPtr + 21)))); - _MM_TRANSPOSE4_PS(p128[4], p128[5], p128[6], p128[7]); - p[0] = _mm256_setr_m128(p128[0], p128[4]); - p[1] = _mm256_setr_m128(p128[1], p128[5]); - p[2] = _mm256_setr_m128(p128[2], p128[6]); -} - inline void rpp_load24_f32pkd3_to_f64pln3_avx(Rpp32f *srcPtr, __m256d *p) { __m128 p128[8]; @@ -1821,13 +1803,6 @@ inline void 
rpp_store24_f32pln3_to_f32pkd3_avx(Rpp32f *dstPtr, __m256 *p) _mm_storeu_ps(dstPtr + 21, p128[3]); } -inline void rpp_load24_f16pln3_to_f32pln3_avx(Rpp16f *srcPtrR, Rpp16f *srcPtrG, Rpp16f *srcPtrB, __m256 *p) -{ - p[0] = _mm256_cvtph_ps(_mm_castps_si128(_mm_loadu_ps(reinterpret_cast(srcPtrR)))); - p[1] = _mm256_cvtph_ps(_mm_castps_si128(_mm_loadu_ps(reinterpret_cast(srcPtrG)))); - p[2] = _mm256_cvtph_ps(_mm_castps_si128(_mm_loadu_ps(reinterpret_cast(srcPtrB)))); -} - inline void rpp_load24_f32pln3_to_f64pln3_avx(Rpp32f *srcPtrR, Rpp32f *srcPtrG, Rpp32f *srcPtrB, __m256d *p) { __m128 px128[6]; @@ -1917,11 +1892,6 @@ inline void rpp_store8_f32_to_f32_avx(Rpp32f *dstPtr, __m256 *p) _mm256_storeu_ps(dstPtr, p[0]); } -inline void rpp_load8_f16_to_f32_avx(Rpp16f *srcPtr, __m256 *p) -{ - p[0] = _mm256_cvtph_ps(_mm_castps_si128(_mm_loadu_ps(reinterpret_cast(srcPtr)))); -} - inline void rpp_load8_f32_to_f64_avx(Rpp32f *srcPtr, __m256d *p) { __m128 px128[2];