From 56d6aea6e2b56fd443be638282344569473216ea Mon Sep 17 00:00:00 2001 From: Miles Price Date: Thu, 14 Mar 2024 14:53:59 -0700 Subject: [PATCH] feat: adding code for release v0.6.0 (beta-4) of CV-CUDA --- 3rdparty/CMakeLists.txt | 18 +- CMakeLists.txt | 4 +- CODE_OF_CONDUCT.md | 16 + CONTRIBUTING.md | 20 +- DEVELOPER_GUIDE.md | 21 +- LICENSE.md | 15 + README.md | 294 +- SECURITY.md | 15 + bench/BenchFindContours.cpp | 8 +- bench/BenchHQResize.cpp | 129 + bench/BenchMorphology.cpp | 10 +- bench/CMakeLists.txt | 4 +- bench/python/README.md | 116 + bench/python/all_ops/op_adaptivethreshold.py | 41 + bench/python/all_ops/op_averageblur.py | 32 + bench/python/all_ops/op_blurbox.py | 80 + bench/python/all_ops/op_boundingbox.py | 84 + bench/python/all_ops/op_brightnesscontrast.py | 46 + bench/python/all_ops/op_centercrop.py | 33 + bench/python/all_ops/op_composite.py | 52 + bench/python/all_ops/op_convertto.py | 35 + bench/python/all_ops/op_copymakeborder.py | 42 + bench/python/all_ops/op_customcrop.py | 30 + bench/python/all_ops/op_cvtcolor.py | 37 + bench/python/all_ops/op_findcontours.py | 109 + bench/python/all_ops/op_flip.py | 29 + bench/python/all_ops/op_gaussianblur.py | 30 + bench/python/all_ops/op_hqresize.py | 53 + bench/python/all_ops/op_inpaint.py | 56 + bench/python/all_ops/op_jointbilateral.py | 55 + bench/python/all_ops/op_laplacian.py | 30 + bench/python/all_ops/op_morphology.py | 102 + bench/python/all_ops/op_nms.py | 52 + bench/python/all_ops/op_normalize.py | 45 + bench/python/all_ops/op_randomresizedcrop.py | 44 + bench/python/all_ops/op_reformat.py | 43 + bench/python/all_ops/op_remap.py | 53 + bench/python/all_ops/op_reshape.py | 32 + bench/python/all_ops/op_resize.py | 57 + bench/python/all_ops/op_rotate.py | 31 + bench/python/all_ops/op_sift.py | 47 + bench/python/all_ops/op_threshold.py | 42 + bench/python/all_ops/op_warpaffine.py | 60 + bench/python/all_ops/op_warpperspective.py | 70 + bench/python/assets/NOTICE.md | 19 + bench/python/assets/brooklyn.jpg | 3 + bench/python/assets/brooklyn_bboxes.pt | Bin 0 -> 196520 bytes bench/python/assets/brooklyn_mask.jpg | 3 + bench/python/assets/brooklyn_nms_masks.pt | Bin 0 -> 196620 bytes bench/python/assets/brooklyn_scores.pt | Bin 0 -> 98600 bytes bench/python/assets/countour_lines.jpg | 3 + bench/python/bench_utils.py | 411 +++ bench/python/run_bench.py | 236 ++ ci/build.sh | 80 +- ci/build_docs.sh | 4 +- ci/build_samples.sh | 2 +- cmake/BuildPython.cmake | 12 +- cmake/ConfigCPack.cmake | 4 +- cmake/ConfigCUDA.cmake | 7 +- cmake/ConfigCompiler.cmake | 2 +- cmake/ConfigPython.cmake | 4 +- cmake/GetGitRevisionDescription.cmake | 3 +- cmake/InstallNVCVDev.cmake | 6 +- cmake/InstallNVCVLib.cmake | 11 +- cmake/InstallPython.cmake | 19 +- cmake/InstallTests.cmake | 2 +- docker/build20.04/Dockerfile | 90 + docker/{build => build20.04}/ccache.conf | 0 .../deadsnakes-ubuntu-ppa-focal.list | 17 + docker/{build => build22.04}/Dockerfile | 58 +- docker/build22.04/ccache.conf | 17 + .../deadsnakes-ubuntu-ppa-jammy.list | 0 docker/config | 4 +- docker/{devel => devel20.04}/Dockerfile | 32 +- docker/{devel => devel20.04}/gdbinit | 0 docker/{devel => devel20.04}/vimrc | 0 docker/devel22.04/Dockerfile | 72 + docker/devel22.04/gdbinit | 19 + docker/devel22.04/vimrc | 23 + docker/env_devel_linux.sh | 4 +- docker/samples/Dockerfile | 1 + docker/test20.04/Dockerfile | 59 + .../deadsnakes-ubuntu-ppa-focal.list | 17 + docker/{test => test22.04}/Dockerfile | 25 +- .../deadsnakes-ubuntu-ppa-jammy.list | 0 docker/update_build_image.sh | 4 +- 
docker/update_devel_image.sh | 6 +- docker/update_samples_image.sh | 1 + docker/update_test_image.sh | 4 +- docs/CMakeLists.txt | 4 +- docs/sphinx/content/cvcuda_oplist.csv | 5 +- docs/sphinx/index.rst | 3 +- docs/sphinx/installation.rst | 68 +- docs/sphinx/relnotes/v0.5.0-beta.rst | 39 +- docs/sphinx/relnotes/v0.6.0-beta.rst | 78 + .../sphinx/samples/cpp_samples/cropresize.rst | 2 +- .../samples/python_samples/classification.rst | 16 +- ...orch.rst => imagebatchdecoder_nvcodec.rst} | 40 +- ...orch.rst => imagebatchencoder_nvcodec.rst} | 22 +- ..._vpf.rst => videobatchdecoder_nvcodec.rst} | 58 +- ..._vpf.rst => videobatchencoder_nvcodec.rst} | 62 +- .../python_samples/object_detection.rst | 24 +- .../samples/python_samples/segmentation.rst | 24 +- lint/copyright_check.sh | 13 +- python/CMakeLists.txt | 21 +- python/build_wheels.sh | 84 + python/mod_cvcuda/CMakeLists.txt | 1 + python/mod_cvcuda/InterpolationType.cpp | 1 + python/mod_cvcuda/Main.cpp | 1 + python/mod_cvcuda/OpAdaptiveThreshold.cpp | 12 +- python/mod_cvcuda/OpAdvCvtColor.cpp | 6 +- python/mod_cvcuda/OpAverageBlur.cpp | 12 +- python/mod_cvcuda/OpBilateralFilter.cpp | 12 +- python/mod_cvcuda/OpBndBox.cpp | 6 +- python/mod_cvcuda/OpBoxBlur.cpp | 6 +- python/mod_cvcuda/OpBrightnessContrast.cpp | 8 +- python/mod_cvcuda/OpCenterCrop.cpp | 6 +- python/mod_cvcuda/OpChannelReorder.cpp | 6 +- python/mod_cvcuda/OpColorTwist.cpp | 6 +- python/mod_cvcuda/OpComposite.cpp | 12 +- python/mod_cvcuda/OpConv2D.cpp | 6 +- python/mod_cvcuda/OpConvertTo.cpp | 6 +- python/mod_cvcuda/OpCopyMakeBorder.cpp | 18 +- .../OpCropFlipNormalizeReformat.cpp | 6 +- python/mod_cvcuda/OpCustomCrop.cpp | 6 +- python/mod_cvcuda/OpCvtColor.cpp | 12 +- python/mod_cvcuda/OpErase.cpp | 12 +- python/mod_cvcuda/OpFindContours.cpp | 8 +- python/mod_cvcuda/OpFindHomography.cpp | 14 +- python/mod_cvcuda/OpFlip.cpp | 12 +- python/mod_cvcuda/OpGammaContrast.cpp | 6 +- python/mod_cvcuda/OpGaussian.cpp | 12 +- python/mod_cvcuda/OpGaussianNoise.cpp | 12 +- python/mod_cvcuda/OpHQResize.cpp | 761 +++++ python/mod_cvcuda/OpHistogram.cpp | 8 +- python/mod_cvcuda/OpHistogramEq.cpp | 12 +- python/mod_cvcuda/OpInpaint.cpp | 12 +- python/mod_cvcuda/OpJointBilateralFilter.cpp | 12 +- python/mod_cvcuda/OpLabel.cpp | 18 +- python/mod_cvcuda/OpLaplacian.cpp | 12 +- python/mod_cvcuda/OpMedianBlur.cpp | 12 +- python/mod_cvcuda/OpMinAreaRect.cpp | 6 +- python/mod_cvcuda/OpMinMaxLoc.cpp | 18 +- python/mod_cvcuda/OpMorphology.cpp | 17 +- python/mod_cvcuda/OpNonMaximumSuppression.cpp | 6 +- python/mod_cvcuda/OpNormalize.cpp | 12 +- python/mod_cvcuda/OpOSD.cpp | 6 +- python/mod_cvcuda/OpPadAndStack.cpp | 6 +- python/mod_cvcuda/OpPairwiseMatcher.cpp | 14 +- python/mod_cvcuda/OpPillowResize.cpp | 12 +- python/mod_cvcuda/OpRandomResizedCrop.cpp | 12 +- python/mod_cvcuda/OpReformat.cpp | 6 +- python/mod_cvcuda/OpRemap.cpp | 12 +- python/mod_cvcuda/OpResize.cpp | 12 +- python/mod_cvcuda/OpRotate.cpp | 12 +- python/mod_cvcuda/OpSIFT.cpp | 6 +- python/mod_cvcuda/OpStack.cpp | 6 +- python/mod_cvcuda/OpThreshold.cpp | 12 +- python/mod_cvcuda/OpWarpAffine.cpp | 12 +- python/mod_cvcuda/OpWarpPerspective.cpp | 12 +- python/mod_cvcuda/Operators.hpp | 1 + python/mod_nvcv/CAPI.cpp | 8 +- python/mod_nvcv/Resource.cpp | 12 +- .../mod_nvcv/include/nvcv/python/LockMode.hpp | 8 +- .../include/nvcv/python/ResourceGuard.hpp | 8 +- python/setup.py.in | 81 + samples/NOTICE.md | 15 + samples/README.md | 113 +- samples/classification/CMakeLists.txt | 10 +- samples/classification/Main.cpp | 2 +- 
samples/classification/python/main.py | 11 +- samples/common/CMakeLists.txt | 12 +- samples/common/python/nvcodec_utils.py | 641 ++++ samples/common/python/perf_utils.py | 276 +- samples/common/python/torch_utils.py | 187 -- samples/common/python/vpf_utils.py | 519 --- samples/cropandresize/CMakeLists.txt | 10 +- samples/label/python/main.py | 32 +- samples/object_detection/python/main.py | 20 +- samples/scripts/README.md | 19 +- samples/scripts/benchmark.py | 46 +- samples/scripts/benchmark_samples.sh | 104 +- samples/scripts/install_dependencies.sh | 73 +- samples/scripts/requirements.txt | 9 + samples/scripts/run_samples.sh | 87 +- samples/segmentation/python/README.md | 23 +- samples/segmentation/python/main.py | 36 +- .../segmentation/python/model_inference.py | 14 +- samples/segmentation/python/triton_client.py | 20 +- src/cvcuda/CMakeLists.txt | 1 + src/cvcuda/OpHQResize.cpp | 139 + src/cvcuda/include/cvcuda/OpErase.h | 12 +- src/cvcuda/include/cvcuda/OpHQResize.h | 406 +++ src/cvcuda/include/cvcuda/OpHQResize.hpp | 154 + src/cvcuda/include/cvcuda/OpLabel.h | 17 +- src/cvcuda/include/cvcuda/OpMorphology.h | 4 +- src/cvcuda/include/cvcuda/OpNormalize.h | 6 +- src/cvcuda/include/cvcuda/Types.h | 1 + src/cvcuda/include/cvcuda/Workspace.hpp | 7 + src/cvcuda/priv/CMakeLists.txt | 1 + src/cvcuda/priv/OpHQResize.cu | 2788 +++++++++++++++++ src/cvcuda/priv/OpHQResize.hpp | 115 + src/cvcuda/priv/OpHQResizeBatchWrap.cuh | 408 +++ src/cvcuda/priv/OpHQResizeFilter.cuh | 402 +++ .../priv/legacy/channel_reorder_var_shape.cu | 22 +- src/cvcuda/priv/legacy/cvt_color_var_shape.cu | 77 +- src/cvcuda/priv/legacy/gaussian_noise.cu | 4 +- .../priv/legacy/gaussian_noise_var_shape.cu | 2 +- src/cvcuda/priv/legacy/inpaint.cu | 2 +- .../priv/legacy/median_blur_var_shape.cu | 18 +- src/cvcuda/priv/legacy/min_area_rect.cu | 64 +- .../priv/legacy/pillow_resize_var_shape.cu | 4 +- src/cvcuda/priv/legacy/random_resized_crop.cu | 19 +- .../legacy/random_resized_crop_var_shape.cu | 17 +- src/cvcuda/priv/legacy/threshold.cu | 4 +- src/cvcuda/priv/legacy/threshold_var_shape.cu | 4 +- src/nvcv_types/Tensor.cpp | 10 + .../include/nvcv/alloc/Requirements.h | 8 +- .../include/nvcv/cuda/TensorBatchWrap.hpp | 386 +++ .../include/nvcv/cuda/TensorWrap.hpp | 37 +- src/nvcv_types/priv/IAllocator.cpp | 4 +- src/util/Compat.cpp | 2 +- src/util/Math.hpp | 13 + tests/CMakeLists.txt | 9 +- tests/cvcuda/python/cvcuda_test_python.in | 27 +- tests/cvcuda/python/test_ophqresize.py | 306 ++ tests/cvcuda/system/CMakeLists.txt | 1 + tests/cvcuda/system/ResizeUtils.cpp | 145 +- tests/cvcuda/system/ResizeUtils.hpp | 3 +- tests/cvcuda/system/TestOpChannelReorder.cpp | 264 +- tests/cvcuda/system/TestOpCvtColor.cpp | 292 +- tests/cvcuda/system/TestOpGammaContrast.cpp | 145 +- tests/cvcuda/system/TestOpGaussianNoise.cpp | 250 +- tests/cvcuda/system/TestOpHQResize.cpp | 1320 ++++++++ tests/cvcuda/system/TestOpMedianBlur.cpp | 2 +- .../cvcuda/system/TestOpRandomResizedCrop.cpp | 68 + tests/cvcuda/system/TestOpResize.cpp | 12 +- tests/cvcuda/system/TestOpThreshold.cpp | 313 +- .../cudatools_system/CMakeLists.txt | 2 + .../cudatools_system/DeviceTensorBatchWrap.cu | 152 + .../DeviceTensorBatchWrap.hpp | 42 + .../cudatools_system/TestTensorBatchWrap.cpp | 175 ++ .../python/nvcv_test_types_python.in | 27 +- tests/nvcv_types/system/TestAllocatorC.cpp | 184 ++ tests/nvcv_types/system/TestArray.cpp | 265 ++ tests/nvcv_types/system/TestConfig.cpp | 11 +- tests/nvcv_types/system/TestImageFormat.cpp | 8 + tests/nvcv_types/system/TestTensor.cpp | 200 ++ 
tests/nvcv_types/system/TestTensorLayout.cpp | 48 +- tests/nvcv_types/unit/TestMath.cpp | 24 + tools/mkop/PythonWrap.cpp | 8 +- 251 files changed, 15228 insertions(+), 1970 deletions(-) create mode 100644 bench/BenchHQResize.cpp create mode 100644 bench/python/README.md create mode 100644 bench/python/all_ops/op_adaptivethreshold.py create mode 100644 bench/python/all_ops/op_averageblur.py create mode 100644 bench/python/all_ops/op_blurbox.py create mode 100644 bench/python/all_ops/op_boundingbox.py create mode 100644 bench/python/all_ops/op_brightnesscontrast.py create mode 100644 bench/python/all_ops/op_centercrop.py create mode 100644 bench/python/all_ops/op_composite.py create mode 100644 bench/python/all_ops/op_convertto.py create mode 100644 bench/python/all_ops/op_copymakeborder.py create mode 100644 bench/python/all_ops/op_customcrop.py create mode 100644 bench/python/all_ops/op_cvtcolor.py create mode 100644 bench/python/all_ops/op_findcontours.py create mode 100644 bench/python/all_ops/op_flip.py create mode 100644 bench/python/all_ops/op_gaussianblur.py create mode 100644 bench/python/all_ops/op_hqresize.py create mode 100644 bench/python/all_ops/op_inpaint.py create mode 100644 bench/python/all_ops/op_jointbilateral.py create mode 100644 bench/python/all_ops/op_laplacian.py create mode 100644 bench/python/all_ops/op_morphology.py create mode 100644 bench/python/all_ops/op_nms.py create mode 100644 bench/python/all_ops/op_normalize.py create mode 100644 bench/python/all_ops/op_randomresizedcrop.py create mode 100644 bench/python/all_ops/op_reformat.py create mode 100644 bench/python/all_ops/op_remap.py create mode 100644 bench/python/all_ops/op_reshape.py create mode 100644 bench/python/all_ops/op_resize.py create mode 100644 bench/python/all_ops/op_rotate.py create mode 100644 bench/python/all_ops/op_sift.py create mode 100644 bench/python/all_ops/op_threshold.py create mode 100644 bench/python/all_ops/op_warpaffine.py create mode 100644 bench/python/all_ops/op_warpperspective.py create mode 100644 bench/python/assets/NOTICE.md create mode 100644 bench/python/assets/brooklyn.jpg create mode 100644 bench/python/assets/brooklyn_bboxes.pt create mode 100644 bench/python/assets/brooklyn_mask.jpg create mode 100644 bench/python/assets/brooklyn_nms_masks.pt create mode 100644 bench/python/assets/brooklyn_scores.pt create mode 100644 bench/python/assets/countour_lines.jpg create mode 100644 bench/python/bench_utils.py create mode 100644 bench/python/run_bench.py create mode 100644 docker/build20.04/Dockerfile rename docker/{build => build20.04}/ccache.conf (100%) create mode 100644 docker/build20.04/deadsnakes-ubuntu-ppa-focal.list rename docker/{build => build22.04}/Dockerfile (74%) create mode 100644 docker/build22.04/ccache.conf rename docker/{build => build22.04}/deadsnakes-ubuntu-ppa-jammy.list (100%) rename docker/{devel => devel20.04}/Dockerfile (74%) rename docker/{devel => devel20.04}/gdbinit (100%) rename docker/{devel => devel20.04}/vimrc (100%) create mode 100644 docker/devel22.04/Dockerfile create mode 100644 docker/devel22.04/gdbinit create mode 100644 docker/devel22.04/vimrc create mode 100644 docker/test20.04/Dockerfile create mode 100644 docker/test20.04/deadsnakes-ubuntu-ppa-focal.list rename docker/{test => test22.04}/Dockerfile (79%) rename docker/{test => test22.04}/deadsnakes-ubuntu-ppa-jammy.list (100%) create mode 100644 docs/sphinx/relnotes/v0.6.0-beta.rst rename docs/sphinx/samples/python_samples/commons/{imagebatchdecoder_pytorch.rst => 
imagebatchdecoder_nvcodec.rst} (52%) rename docs/sphinx/samples/python_samples/commons/{imagebatchencoder_pytorch.rst => imagebatchencoder_nvcodec.rst} (58%) rename docs/sphinx/samples/python_samples/commons/{videobatchdecoder_vpf.rst => videobatchdecoder_nvcodec.rst} (54%) rename docs/sphinx/samples/python_samples/commons/{videobatchencoder_vpf.rst => videobatchencoder_nvcodec.rst} (53%) create mode 100755 python/build_wheels.sh create mode 100644 python/mod_cvcuda/OpHQResize.cpp create mode 100644 python/setup.py.in create mode 100644 samples/common/python/nvcodec_utils.py delete mode 100644 samples/common/python/torch_utils.py create mode 100644 samples/scripts/requirements.txt create mode 100644 src/cvcuda/OpHQResize.cpp create mode 100644 src/cvcuda/include/cvcuda/OpHQResize.h create mode 100644 src/cvcuda/include/cvcuda/OpHQResize.hpp create mode 100644 src/cvcuda/priv/OpHQResize.cu create mode 100644 src/cvcuda/priv/OpHQResize.hpp create mode 100644 src/cvcuda/priv/OpHQResizeBatchWrap.cuh create mode 100644 src/cvcuda/priv/OpHQResizeFilter.cuh create mode 100644 src/nvcv_types/include/nvcv/cuda/TensorBatchWrap.hpp create mode 100644 tests/cvcuda/python/test_ophqresize.py create mode 100644 tests/cvcuda/system/TestOpHQResize.cpp create mode 100644 tests/nvcv_types/cudatools_system/DeviceTensorBatchWrap.cu create mode 100644 tests/nvcv_types/cudatools_system/DeviceTensorBatchWrap.hpp create mode 100644 tests/nvcv_types/cudatools_system/TestTensorBatchWrap.cpp diff --git a/3rdparty/CMakeLists.txt b/3rdparty/CMakeLists.txt index 51e72f3ce..0868ee168 100644 --- a/3rdparty/CMakeLists.txt +++ b/3rdparty/CMakeLists.txt @@ -15,7 +15,10 @@ set(CMAKE_FOLDER 3rdparty) -# disable all warnings +# disable all warnings when compiling objects of 3rdparty +# libraries included here. It *doesn't* affect warnings in public +# header files that are included by cvcuda code. For that, see +# the solution employed with nvbench. set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -w") set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -w") set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -w") @@ -40,9 +43,18 @@ set(DLPACK_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/dlpack" PARENT_SCOPE) set(CUOSD_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/cuOSD" PARENT_SCOPE) # NVBench -------------------------------- -set(NVBENCH_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/nvbench" PARENT_SCOPE) - if(BUILD_BENCH) set(NVBench_ENABLE_CUPTI off) + set(BUILD_SHARED_LIBS off) add_subdirectory(nvbench) + # Because nvbench::main is an object library, cmake<=3.20 doesn't treat it + # like regular libraries, and just creating a cvcuda_nvbench interface + # library that depends on it doesn't work. We need to create a static + # library and pull in the objects created by nvbench::main, as per cmake + # docs. + add_library(cvcuda_nvbench_main STATIC $<TARGET_OBJECTS:nvbench::main>) + target_link_libraries(cvcuda_nvbench_main PUBLIC nvbench::nvbench) + target_include_directories(cvcuda_nvbench_main SYSTEM INTERFACE + ${CMAKE_CURRENT_SOURCE_DIR}/nvbench) + add_library(cvcuda::nvbench::main ALIAS cvcuda_nvbench_main) endif() diff --git a/CMakeLists.txt b/CMakeLists.txt index 6256d8379..fccd9c7eb 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -23,7 +23,7 @@ endif() project(cvcuda LANGUAGES C CXX - VERSION 0.5.0 + VERSION 0.6.0 DESCRIPTION "CUDA-accelerated Computer Vision algorithms" ) diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md index 990ffc0fc..018377c20 100644 --- a/CODE_OF_CONDUCT.md +++ b/CODE_OF_CONDUCT.md @@ -1,3 +1,19 @@ + +[//]: # "SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved." +[//]: # "SPDX-License-Identifier: Apache-2.0" +[//]: # "" +[//]: # "Licensed under the Apache License, Version 2.0 (the 'License');" +[//]: # "you may not use this file except in compliance with the License." +[//]: # "You may obtain a copy of the License at" +[//]: # "http://www.apache.org/licenses/LICENSE-2.0" +[//]: # "" +[//]: # "Unless required by applicable law or agreed to in writing, software" +[//]: # "distributed under the License is distributed on an 'AS IS' BASIS" +[//]: # "WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied." +[//]: # "See the License for the specific language governing permissions and" +[//]: # "limitations under the License." + + # Contributor Code of Conduct ## Overview diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 89506b788..d21011b97 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,6 +1,22 @@ + +[//]: # "SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved." +[//]: # "SPDX-License-Identifier: Apache-2.0" +[//]: # "" +[//]: # "Licensed under the Apache License, Version 2.0 (the 'License');" +[//]: # "you may not use this file except in compliance with the License." +[//]: # "You may obtain a copy of the License at" +[//]: # "http://www.apache.org/licenses/LICENSE-2.0" +[//]: # "" +[//]: # "Unless required by applicable law or agreed to in writing, software" +[//]: # "distributed under the License is distributed on an 'AS IS' BASIS" +[//]: # "WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied." +[//]: # "See the License for the specific language governing permissions and" +[//]: # "limitations under the License." + + # Contributing to CV-CUDA -**As of release v0.5.0-beta, CV-CUDA is not accepting outside contribution.** +**As of release v0.6.0-beta, CV-CUDA is not accepting outside contribution.** Contributions to CV-CUDA fall into the following categories: @@ -12,7 +28,7 @@ Contributions to CV-CUDA fall into the following categories: 1. To propose a new feature, please file a new feature request [issue](https://github.com/CVCUDA/CV-CUDA/issues/new/choose). Describe the intended feature and discuss the design and implementation with the team and - community. NOTE: Currently, as of release v0.5.0-beta, CV-CUDA is not accepting + community. NOTE: Currently, as of release v0.6.0-beta, CV-CUDA is not accepting outside contribution. 1. To ask a general question, please sumbit a question [issue](https://github.com/CVCUDA/CV-CUDA/issues/new/choose). If you need diff --git a/DEVELOPER_GUIDE.md b/DEVELOPER_GUIDE.md index 51ac6d4e9..a5f4bec53 100644 --- a/DEVELOPER_GUIDE.md +++ b/DEVELOPER_GUIDE.md @@ -1,3 +1,19 @@ + +[//]: # "SPDX-FileCopyrightText: Copyright (c) 2023-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved." +[//]: # "SPDX-License-Identifier: Apache-2.0" +[//]: # "" +[//]: # "Licensed under the Apache License, Version 2.0 (the 'License');" +[//]: # "you may not use this file except in compliance with the License." 
+[//]: # "You may obtain a copy of the License at" +[//]: # "http://www.apache.org/licenses/LICENSE-2.0" +[//]: # "" +[//]: # "Unless required by applicable law or agreed to in writing, software" +[//]: # "distributed under the License is distributed on an 'AS IS' BASIS" +[//]: # "WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied." +[//]: # "See the License for the specific language governing permissions and" +[//]: # "limitations under the License." + + # CV-CUDA Developer Guide ## What is CV-CUDA? @@ -35,7 +51,7 @@ CV-CUDA includes: | CopyMakeBorder | Creates a border around an image | | CustomCrop | Crops an image with a given region-of-interest | | CvtColor | Converts an image from one color space to another | -| DataTypeConvert | Converts an image’s data type with optional scaling | +| DataTypeConvert | Converts an image’s data type, with optional scaling | | Erase | Erases image regions | | Find Contours | Extract closed contours from an input binary image | | FindHomography | Calculates a perspective transform from four pairs of the corresponding points | @@ -45,6 +61,7 @@ CV-CUDA includes: | Gaussian Noise | Generates a statistical noise with a normal (Gaussian) distribution | | Histogram | Provides a grayscale value distribution showing the frequency of occurrence of each gray value. | | Histogram Equalizer | Allows effective spreading out the intensity range of the image typically used to improve contrast | +| HqResize | Performs advanced resizing supporting 2D and 3D data, tensors, tensor batches, and varshape image batches (2D only). Supports nearest neighbor, linear, cubic, Gaussian and Lanczos interpolation, with optional antialiasing when down-sampling | | Inpainting | Performs inpainting by replacing a pixel by normalized weighted sum of all the known pixels in the neighborhood | | Joint Bilateral Filter | Reduces image noise while preserving strong edges based on a guidance image | | Label | Labels connected regions in an image using 4-way connectivity for foreground and 8-way for background pixels | @@ -59,7 +76,7 @@ CV-CUDA includes: | Normalize | Normalizes an image pixel’s range | | OSD (Polyline Line Text Rotated Rect Segmented Mask) | Displays an overlay on the image of different forms including polyline line text rotated rectangle segmented mask | | PadStack | Stacks several images into a tensor with border extension | -| PairwiseMatcher | Matches features computed separately (e.g. via the SIFT operator) in two images using the brute force method | +| PairwiseMatcher | Matches features computed separately (e.g. via the SIFT operator) in two images, e.g. using the brute force method | | PillowResize | Changes the size and scale of an image using python-pillow algorithm | | RandomResizedCrop | Crops a random portion of an image and resizes it to a specified size. | | Reformat | Converts a planar image into non-planar and vice versa | diff --git a/LICENSE.md b/LICENSE.md index f0b0397aa..0701ae6cf 100644 --- a/LICENSE.md +++ b/LICENSE.md @@ -1,3 +1,18 @@ + +[//]: # "SPDX-FileCopyrightText: Copyright (c) 2023-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved." +[//]: # "SPDX-License-Identifier: Apache-2.0" +[//]: # "" +[//]: # "Licensed under the Apache License, Version 2.0 (the 'License');" +[//]: # "you may not use this file except in compliance with the License." 
+[//]: # "You may obtain a copy of the License at" +[//]: # "http://www.apache.org/licenses/LICENSE-2.0" +[//]: # "" +[//]: # "Unless required by applicable law or agreed to in writing, software" +[//]: # "distributed under the License is distributed on an 'AS IS' BASIS" +[//]: # "WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied." +[//]: # "See the License for the specific language governing permissions and" +[//]: # "limitations under the License." + Apache License Version 2.0, January 2004 diff --git a/README.md b/README.md index d8fed8bca..31ae4466b 100644 --- a/README.md +++ b/README.md @@ -1,14 +1,29 @@ + +[//]: # "SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved." +[//]: # "SPDX-License-Identifier: Apache-2.0" +[//]: # "" +[//]: # "Licensed under the Apache License, Version 2.0 (the 'License');" +[//]: # "you may not use this file except in compliance with the License." +[//]: # "You may obtain a copy of the License at" +[//]: # "http://www.apache.org/licenses/LICENSE-2.0" +[//]: # "" +[//]: # "Unless required by applicable law or agreed to in writing, software" +[//]: # "distributed under the License is distributed on an 'AS IS' BASIS" +[//]: # "WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied." +[//]: # "See the License for the specific language governing permissions and" +[//]: # "limitations under the License." + # CV-CUDA [![License](https://img.shields.io/badge/License-Apache_2.0-yellogreen.svg)](https://opensource.org/licenses/Apache-2.0) -![Version](https://img.shields.io/badge/Version-v0.5.0--beta-blue) +![Version](https://img.shields.io/badge/Version-v0.6.0--beta-blue) -![Platform](https://img.shields.io/badge/Platform-linux--64_%7C_win--64_wsl2-gray) +![Platform](https://img.shields.io/badge/Platform-linux--64_%7C_win--64_wsl2%7C_aarch64-gray) -[![Cuda](https://img.shields.io/badge/CUDA-v11.7-%2376B900?logo=nvidia)](https://developer.nvidia.com/cuda-toolkit-archive) +[![CUDA](https://img.shields.io/badge/CUDA-v11.7-%2376B900?logo=nvidia)](https://developer.nvidia.com/cuda-toolkit-archive) [![GCC](https://img.shields.io/badge/GCC-v11.0-yellow)](https://gcc.gnu.org/gcc-11/changes.html) -[![Python](https://img.shields.io/badge/python-v3.8_%7c_v3.10-blue?logo=python)](https://www.python.org/) +[![Python](https://img.shields.io/badge/python-v3.7_%7c_v3.8_%7c_v3.9_%7c_v3.10%7c_v3.11-blue?logo=python)](https://www.python.org/) [![CMake](https://img.shields.io/badge/CMake-v3.20-%23008FBA?logo=cmake)](https://cmake.org/) CV-CUDA is an open-source project that enables building efficient cloud-scale @@ -18,181 +33,248 @@ efficient pre- and post-processing pipelines. CV-CUDA originated as a collaborative effort between [NVIDIA][NVIDIA Develop] and [ByteDance][ByteDance]. Refer to our [Developer Guide](DEVELOPER_GUIDE.md) for more information on the -operators available as of release v0.5.0-beta. +operators available as of release v0.6.0-beta. ## Getting Started To get a local copy up and running follow these steps. -### Pre-requisites +### Compatibility + +|CV-CUDA Build|Platform|CUDA Version|CUDA Compute Capability|Hardware Architectures|Nvidia Driver|Python Versions|Supported Compilers (build from source)|API compatibility with prebuilt binaries|OS/Linux distributions tested with prebuilt packages| +|-|-|-|-|-|-|-|-|-|-| +|x86_64_cu11|x86_64|11.7 or later|SM7 and later|Volta, Turing, Amper, Hopper, Ada Lovelace|r520 or later*** |3.8, 3.9, 3.10, 3.11|gcc>=9*
gcc>=11**|gcc>=9|Ubuntu>= 20.04
WSL2/Ubuntu>=20.04| +|x86_64_cu12|x86_64|12.2 or later|SM7 and later|Volta, Turing, Ampere, Hopper, Ada Lovelace|r520 or later***|3.8, 3.9, 3.10, 3.11|gcc>=9*
gcc>=11**|gcc>=9|Ubuntu>= 20.04
WSL2/Ubuntu>=20.04| +|aarch64_cu11 (JetPack 5.1)|aarch64|11.4|SM7 and later|Jetson AGX Orin|JetPack 5.1|3.8|gcc>=9*
gcc>=11**|gcc>=9|Jetson Linux 35.x| +|aarch64_cu12 (JetPack 6.0)|aarch64|12.2|SM7 and later|Jetson AGX Orin|JetPack 6.0 DP|3.10|gcc>=9*
gcc>=11**|gcc>=9|Jetson Linux 36.2| -- Linux distro: - - Ubuntu x86_64 >= 20.04 - - WSL2 with Ubuntu >= 20.04 (tested with 20.04) -- NVIDIA driver - - Linux: Driver version 520.56.06 or higher -- CUDA Toolkit - - Version 11.7 or above. -- GCC >= 11.0 -- Python >= 3.8 -- cmake >= 3.20 +\* partial build, no test module (see Known Limitations)
+\** full build, including test module
+\*** [samples][CV-CUDA Samples] require driver r535 or later to run and are only officially supported with CUDA 12. + +### Known limitations + +- For GCC versions lower than 11.0, C++17 support needs to be enabled when compiling CV-CUDA. +- The C++ test module cannot build with gcc<11 (requires specific C++20 features). With gcc-9 or gcc-10, please build with the option `-DBUILD_TESTS=0`. +- [CV-CUDA Samples] require driver r535 or later to run and are only officially supported with CUDA 12. +- Only one CUDA version (CUDA 11.x or CUDA 12.x) of CV-CUDA packages (Debian packages, tarballs, Python Wheels) can be installed at a time. Please uninstall all packages from a given CUDA version before installing packages from a different version. +- Test tarballs (cvcuda-tests-*.tar.xz) need to be unpacked at the root level to find existing tests. ### Installation -The following steps describe how to install CV-CUDA from pre-built install -packages. Choose the installation method that meets your environment needs. +For convenience, we provide pre-built packages for various combinations of CUDA versions, Python versions and architectures [here][CV-CUDA GitHub Releases]. +The following steps describe how to install CV-CUDA from such pre-built packages. + +We support two main alternative pathways: +- DEB or Tar archive installation (C++/CUDA Libraries, Headers, Python bindings) +- Standalone Python Wheels (containing C++/CUDA Libraries and Python bindings) + +Choose the installation method that meets your environment needs. #### Tar File Installation +- Installation of C++/CUDA libraries (cvcuda-lib*) and development headers (cvcuda-dev*): +```shell +tar -xvf cvcuda-lib-0.6.0_beta-<cu_ver>-<arch>-linux.tar.xz +tar -xvf cvcuda-dev-0.6.0_beta-<cu_ver>-<arch>-linux.tar.xz +``` +- Installation of Python bindings (cvcuda-python*) ```shell -tar -xvf nvcv-lib-0.5.0-cuda11-x86_64-linux.tar.xz -tar -xvf nvcv-dev-0.5.0-cuda11-x86_64-linux.tar.xz ``` +tar -xvf cvcuda-python-0.6.0_beta-<cu_ver>-<arch>-linux.tar.xz ``` with `<cu_ver>` the desired CUDA version, +`<py_ver>` the desired Python version and +`<arch>` the desired architecture #### DEB File Installation +- Installation of C++/CUDA libraries (cvcuda-lib*) and development headers (cvcuda-dev*): +```shell +sudo apt-get install -y ./cvcuda-lib-0.6.0_beta-<cu_ver>-<arch>-linux.deb ./cvcuda-dev-0.6.0_beta-<cu_ver>-<arch>-linux.deb +``` +- Installation of Python bindings (cvcuda-python*) ```shell -sudo apt-get install -y ./nvcv-lib-0.5.0-cuda11-x86_64-linux.deb ./nvcv-dev-0.5.0-cuda11-x86_64-linux.deb ``` +sudo apt-get install -y cvcuda-python-0.6.0_beta-<cu_ver>-<arch>-linux.deb ``` +with `<cu_ver>` the desired CUDA version, +`<py_ver>` the desired Python version and +`<arch>` the desired architecture + +#### Python Wheel File Installation + + +Download the appropriate .whl file for your computer architecture, Python and CUDA version from the release assets of the current CV-CUDA release. Release information of all CV-CUDA releases can be accessed [here][CV-CUDA GitHub Releases]. Once downloaded, execute the `pip install` command to install the Python wheel. For example: -#### Python WHL File Installation ```shell -pip install nvcv_python-0.5.0-cp38-cp38-linux_x86_64.whl +pip install cvcuda_<cu_ver>-0.6.0b0-cp<py_ver>-cp<py_ver>-linux_<arch>.whl ``` +with `<cu_ver>` the desired CUDA version, +`<py_ver>` the desired Python version and +`<arch>` the desired architecture + +Please note that the Python wheels provided are standalone: they include both the C++/CUDA libraries and the Python bindings. + ### Build from Source -Building CV-CUDA from source allows for customization and is essential for contributing to the project.
Here are detailed steps to guide you through the process: +Follow these instruction to build CV-CUDA from source: + +1. Set up your local CV-CUDA repository + + a. Install prerequisites needed to setup up the repository. + + On Ubuntu >= 20.04, install the following packages: + - git-lfs: to retrieve binary files from remote repository + + ```shell + sudo apt-get install -y git git-lfs + ``` + + b. After cloning the repository (assuming it was cloned in `~/cvcuda`), + it needs to be properly configured by running the `init_repo.sh` script only once. + + ```shell + cd ~/cvcuda + ./init_repo.sh + ``` -#### 1. Repository Setup +2. Build CV-CUDA - Before you begin, ensure you have cloned the CV-CUDA repository to your local machine. Let's assume you've cloned it into `~/cvcuda`. + a. Install the dependencies required for building CV-CUDA - - **Initialize the Repository**: - After cloning, initialize the repository to configure it correctly. This setup is required only once. + On Ubuntu >= 20.04, install the following packages: + - g++-11: compiler to be used + - cmake (>= 3.20), ninja-build (optional): manage build rules + - python3-dev: for python bindings + - libssl-dev: needed by the testsuite (MD5 hashing utilities) - ```shell - cd ~/cvcuda - ./init_repo.sh - ``` + ```shell + sudo apt-get install -y g++-11 cmake ninja-build python3-dev libssl-dev + ``` -#### 2. Install Build Dependencies + For CUDA Toolkit, any version of the 11.x or 12.x series should work. + CV-CUDA was tested with 11.7 and 12.2, thus those should be preferred. - CV-CUDA requires several dependencies to build from source. The following steps are based on Ubuntu 22.04, but similar packages can be found for other distributions. + ```shell + sudo apt-get install -y cuda-11-7 + # or + sudo apt-get install -y cuda-12-2 + ``` - - **Install Essential Packages**: - These include the compiler, build system, and necessary libraries. + b. Build the project - ```shell - sudo apt-get install -y g++-11 cmake ninja-build python3-dev libssl-dev - ``` + ```shell + ci/build.sh [release|debug] [output build tree path] [-DBUILD_TESTS=1|0] [-DPYTHON_VERSIONS='3.8;3.9;3.10;3.11'] [-DPUBLIC_API_COMPILERS='gcc-9;gcc-11;clang-11;clang-14'] + ``` - - **CUDA Toolkit**: - The CUDA Toolkit is essential for GPU acceleration. Although any 11.x version is compatible, 11.7 is recommended. + The default build type is 'release'. - ```shell - sudo apt-get install -y cuda-minimal-build-11-7 - ``` + If output build tree path isn't specified, it will be `build-rel` for release + builds, and `build-deb` for debug. -#### 3. Build Process + The library is in `build-rel/lib` and executables (tests, etc...) are in `build-rel/bin`. - Once the dependencies are in place, you can proceed to build CV-CUDA. + The `-DBUILD_TESTS` option can be used to disable/enable building the tests (enabled by default, see Known Limitations). - - **Run Build Script**: - A build script is provided to simplify the compilation process. It creates a build tree and compiles the source code. + The `-DPYTHON_VERSIONS` option can be used to select Python versions to build bindings and Wheels for. + By default, only the default system Python3 version will be selected. - ```shell - ci/build.sh - ``` + The `-DPUBLIC_API_COMPILERS` option can be used to select the compilers used to check public API compatibility. + By default, gcc-11, gcc-9, clang-11, and clang-14 is tried to be selected and checked. - This script creates a release build by default, placing output in `build-rel`. 
You can specify a debug build or a different output directory: +3. Build Documentation - ```shell - ci/build.sh [release|debug] [output build tree path] - ``` + a. Install the dependencies required for building the documentation -#### 4. Build Documentation (Optional) + On Ubuntu >= 20.04, install the following packages: + - doxygen: parse header files for reference documentation + - python3, python3-pip: to install some python packages needed + - sphinx, breathe, exhale, recommonmark, graphiviz: to render the documentation + - sphinx-rtd-theme: documenation theme used - If you need to build the documentation, additional dependencies are required: + ```shell + sudo apt-get install -y doxygen graphviz python3 python3-pip + sudo python3 -m pip install sphinx==4.5.0 breathe exhale recommonmark graphviz sphinx-rtd-theme + ``` - - **Install Documentation Dependencies**: - These tools are used to generate and format the documentation. + b. Build the documentation + ```shell + ci/build_docs.sh [build folder] + ``` - ```shell - sudo apt-get install -y doxygen graphviz python3 python3-pip - sudo python3 -m pip install sphinx==4.5.0 breathe exhale recommonmark graphviz sphinx-rtd-theme - ``` + Example: + `ci/build_docs.sh build_docs` - - **Generate Documentation**: - Use the provided script to build the documentation. +4. Build and run Samples - ```shell - ci/build_docs.sh [build folder] - ``` + For instructions on how to build samples from source and run them, see the [Samples](samples/README.md) documentation. - For example: +5. Run Tests - ```shell - ci/build_docs.sh build_docs - ``` + a. Install the dependencies required for running the tests -#### 5. Build and Run Samples (Optional) + On Ubuntu >= 20.04, install the following packages: + - python3, python3-pip: to run python bindings tests + - torch: dependencies needed by python bindings tests - CV-CUDA comes with a variety of samples to demonstrate its capabilities. + ```shell + sudo apt-get install -y python3 python3-pip + sudo python3 -m pip install pytest torch + ``` - - **See the Samples Documentation**: - Detailed instructions for building and running samples are available in the [Samples](samples/README.md) documentation. + b. Run the tests -#### 6. Running Tests + The tests are in `/bin`. You can run the script below to run all + tests at once. Here's an example when build tree is created in `build-rel` - To ensure everything is working as expected, you can run CV-CUDA's test suite. + ```shell + build-rel/bin/run_tests.sh + ``` - - **Install Test Dependencies**: - These are necessary to run the Python binding tests. +6. Package installers and Python Wheels - ```shell - sudo apt-get install -y python3 python3-pip - sudo python3 -m pip install pytest torch - ``` + a. Package installers - - **Execute Tests**: - Run the test scripts located in the build tree. + Installers can be generated using the following cpack command once you have successfully built the project - ```shell - build-rel/bin/run_tests.sh - ``` + ```shell + cd build-rel + cpack . + ``` -#### 7. Packaging + This will generate in the build directory both Debian installers and tarballs + (\*.tar.xz), needed for integration in other distros. - After a successful build, you can create installers using `cpack`. + For a fine-grained choice of what installers to generate, the full syntax is: - - **Generate Installers**: - This step produces Debian packages and tarballs, suitable for distribution or installation on other systems. + ```shell + cpack . 
-G [DEB|TXZ] + ``` - ```shell - cd build-rel - cpack . - ``` + - DEB for Debian packages + - TXZ for \*.tar.xz tarballs. - For specific installer types: + b. Python Wheels - ```shell - cpack . -G [DEB|TXZ] - ``` + By default during the `release` build, Python bindings and wheels are created for the available CUDA version and the specified Python + version(s). The wheels are stored in `build-rel/pythonX.Y/wheel` folder, where `build-rel` is the build directory + used to build the release build and `X` and `Y` are Python major and minor versions. The built wheels can be installed using pip. + For example, to install the Python wheel built for CUDA 12.x, Python 3.10 on Linux x86_64 systems: - - `DEB` for Debian packages. - - `TXZ` for `.tar.xz` tarballs. + ```shell + pip install cvcuda_cu12-0.6.0b0-cp310-cp310-linux_x86_64.whl + ``` ## Contributing CV-CUDA is an open source project. As part of the Open Source Community, we are committed to the cycle of learning, improving, and updating that makes this -community thrive. However, as of release v0.5.0-beta, CV-CUDA is not yet ready +community thrive. However, as of release v0.6.0-beta, CV-CUDA is not yet ready for external contributions. To understand the process for contributing the CV-CUDA, see our -[Contributing](CONTRIBUTING.md) page. To understand our committment to the Open +[Contributing](CONTRIBUTING.md) page. To understand our commitment to the Open Source Community, and providing an environment that both supports and respects the efforts of all contributors, please read our [Code of Conduct](CODE_OF_CONDUCT.md). @@ -254,3 +336,5 @@ CV-CUDA is developed jointly by NVIDIA and ByteDance. [NVIDIA Develop]: https://developer.nvidia.com/ [ByteDance]: https://www.bytedance.com/ +[CV-CUDA GitHub Releases]: https://github.com/CVCUDA/CV-CUDA/releases +[CV-CUDA Samples]: https://github.com/CVCUDA/CV-CUDA/blob/main/samples/README.md diff --git a/SECURITY.md b/SECURITY.md index 1bcc28963..f0b0503c6 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -1,3 +1,18 @@ + +[//]: # "SPDX-FileCopyrightText: Copyright (c) 2023-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved." +[//]: # "SPDX-License-Identifier: Apache-2.0" +[//]: # "" +[//]: # "Licensed under the Apache License, Version 2.0 (the 'License');" +[//]: # "you may not use this file except in compliance with the License." +[//]: # "You may obtain a copy of the License at" +[//]: # "http://www.apache.org/licenses/LICENSE-2.0" +[//]: # "" +[//]: # "Unless required by applicable law or agreed to in writing, software" +[//]: # "distributed under the License is distributed on an 'AS IS' BASIS" +[//]: # "WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied." +[//]: # "See the License for the specific language governing permissions and" +[//]: # "limitations under the License." 
+ # Security NVIDIA is dedicated to the security and trust of our software products and diff --git a/bench/BenchFindContours.cpp b/bench/BenchFindContours.cpp index 2beb27470..06deb9732 100644 --- a/bench/BenchFindContours.cpp +++ b/bench/BenchFindContours.cpp @@ -87,10 +87,10 @@ try CPUImage srcVec(shape.y * shape.z, 0); for (auto i = 0; i < 10; ++i) { - auto anchorX = rand() % shape.z; - auto anchorY = rand() % shape.y; - auto sizeX = rand() % (shape.z - anchorX); - auto sizeY = rand() % (shape.y - anchorY); + int anchorX = rand() % shape.z; + int anchorY = rand() % shape.y; + int sizeX = rand() % (shape.z - anchorX); + int sizeY = rand() % (shape.y - anchorY); generateRectangle(srcVec, {anchorX, anchorY}, {sizeX, sizeY}); } diff --git a/bench/BenchHQResize.cpp b/bench/BenchHQResize.cpp new file mode 100644 index 000000000..9d80963ec --- /dev/null +++ b/bench/BenchHQResize.cpp @@ -0,0 +1,129 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "BenchUtils.hpp" + +#include + +#include + +template +inline void HQResize(nvbench::state &state, nvbench::type_list) +try +{ + long3 srcShape = benchutils::GetShape<3>(state.get_string("shape")); + bool antialias = state.get_int64("antialias"); + NVCVInterpolationType interpolation = benchutils::GetInterpolationType(state.get_string("interpolation")); + bool batch = state.get_int64("batch"); + + long3 dstShape; + if (state.get_string("resizeType") == "EXPAND") + { + if (antialias) + { + state.skip("Antialias is no-op for expanding"); + return; + } + dstShape = long3{srcShape.x, srcShape.y * 2, srcShape.z * 2}; + } + else if (state.get_string("resizeType") == "CONTRACT") + { + // resize from shape to shape/2 + dstShape = long3{srcShape.x, srcShape.y / 2, srcShape.z / 2}; + } + else + { + throw std::invalid_argument("Invalid resizeType = " + state.get_string("resizeType")); + } + + nvcv::Size2D srcSize{(int)srcShape.z, (int)srcShape.y}; + nvcv::Size2D dstSize{(int)dstShape.z, (int)dstShape.y}; + + nvcv::DataType dtype{benchutils::GetDataType()}; + nvcv::ImageFormat fmt(nvcv::MemLayout::PITCH_LINEAR, dtype.dataKind(), nvcv::Swizzle::S_X000, dtype.packing()); + + state.add_global_memory_reads(srcShape.x * srcShape.y * srcShape.z * sizeof(T)); + state.add_global_memory_writes(dstShape.x * dstShape.y * dstShape.z * sizeof(T)); + + cvcuda::HQResize op; + + if (!batch) + { + HQResizeTensorShapeI inShapeDesc{ + {srcSize.h, srcSize.w}, + 2, + 1 + }; + HQResizeTensorShapeI outShapeDesc{ + {dstSize.h, dstSize.w}, + 2, + 1 + }; + cvcuda::UniqueWorkspace ws = cvcuda::AllocateWorkspace( + op.getWorkspaceRequirements(1, inShapeDesc, outShapeDesc, interpolation, interpolation, antialias)); + + // clang-format off + nvcv::Tensor src({{srcShape.x, srcShape.y, srcShape.z, 1}, "NHWC"}, dtype); + nvcv::Tensor dst({{dstShape.x, dstShape.y, dstShape.z, 1}, "NHWC"}, dtype); + // clang-format on + + 
benchutils::FillTensor(src, benchutils::RandomValues()); + + state.exec(nvbench::exec_tag::sync, [&op, &ws, &src, &dst, interpolation, antialias](nvbench::launch &launch) + { op(launch.get_stream(), ws.get(), src, dst, interpolation, interpolation, antialias); }); + } + else + { + HQResizeTensorShapeI maxShape{ + {std::max(srcSize.h, dstSize.h), std::max(srcSize.w, dstSize.w)}, + 2, + 1 + }; + cvcuda::UniqueWorkspace ws = cvcuda::AllocateWorkspace(op.getWorkspaceRequirements(1, maxShape)); + + // clang-format off + nvcv::Tensor src({{srcShape.y, srcShape.z, 1}, "HWC"}, dtype); + nvcv::Tensor dst({{dstShape.y, dstShape.z, 1}, "HWC"}, dtype); + // clang-format on + + benchutils::FillTensor(src, benchutils::RandomValues()); + nvcv::TensorBatch srcTensors(1); + nvcv::TensorBatch dstTensors(1); + srcTensors.pushBack(src); + dstTensors.pushBack(dst); + + state.exec( + nvbench::exec_tag::sync, + [&op, &ws, &srcTensors, &dstTensors, interpolation, antialias](nvbench::launch &launch) + { op(launch.get_stream(), ws.get(), srcTensors, dstTensors, interpolation, interpolation, antialias); }); + } +} + +catch (const std::exception &err) +{ + state.skip(err.what()); +} + +using HQResizeTypes = nvbench::type_list; + +NVBENCH_BENCH_TYPES(HQResize, NVBENCH_TYPE_AXES(HQResizeTypes)) + .set_type_axes_names({"InOutDataType"}) + .add_int64_axis("batch", {false}) + .add_string_axis("shape", {"1x1080x1920"}) + .add_string_axis("interpolation", {"CUBIC"}) + .add_int64_axis("antialias", {false, true}) + .add_string_axis("resizeType", {"CONTRACT"}); diff --git a/bench/BenchMorphology.cpp b/bench/BenchMorphology.cpp index 69ed2f97c..d3947e788 100644 --- a/bench/BenchMorphology.cpp +++ b/bench/BenchMorphology.cpp @@ -25,9 +25,10 @@ template inline void Morphology(nvbench::state &state, nvbench::type_list) try { - long3 shape = benchutils::GetShape<3>(state.get_string("shape")); - long varShape = state.get_int64("varShape"); - int iteration = static_cast(state.get_int64("iteration")); + long3 shape = benchutils::GetShape<3>(state.get_string("shape")); + long varShape = state.get_int64("varShape"); + int iteration = static_cast(state.get_int64("iteration")); + int2 kernelSize = nvcv::cuda::StaticCast(benchutils::GetShape<2>(state.get_string("kernelSize"))); NVCVBorderType borderType = benchutils::GetBorderType(state.get_string("border")); @@ -50,7 +51,7 @@ try morphType = NVCV_CLOSE; } - nvcv::Size2D mask{3, 3}; + nvcv::Size2D mask{kernelSize.x, kernelSize.y}; int2 anchor{-1, -1}; int bwIteration = (morphType == NVCV_OPEN || morphType == NVCV_CLOSE || iteration > 1) ? 
2 * iteration : iteration; @@ -129,5 +130,6 @@ NVBENCH_BENCH_TYPES(Morphology, NVBENCH_TYPE_AXES(MorphologyTypes)) .add_string_axis("shape", {"1x1080x1920"}) .add_int64_axis("varShape", {-1}) .add_int64_axis("iteration", {1}) + .add_string_axis("kernelSize", {"3x3"}) .add_string_axis("morphType", {"ERODE", "DILATE", "OPEN", "CLOSE"}) .add_string_axis("border", {"REPLICATE"}); diff --git a/bench/CMakeLists.txt b/bench/CMakeLists.txt index 67fd8c5f1..e82bf3da4 100644 --- a/bench/CMakeLists.txt +++ b/bench/CMakeLists.txt @@ -30,6 +30,7 @@ set(bench_sources BenchFlip.cpp BenchRotate.cpp BenchPillowResize.cpp + BenchHQResize.cpp BenchCenterCrop.cpp BenchWarpPerspective.cpp BenchWarpAffine.cpp @@ -75,8 +76,7 @@ foreach(bench_source IN LISTS bench_sources) string(REPLACE "Bench" "cvcuda_bench_" algo_name ${bench_file_name}) string(TOLOWER ${algo_name} bench_name) add_executable(${bench_name} "${bench_source}") - target_include_directories(${bench_name} PRIVATE "${CMAKE_CURRENT_LIST_DIR}") - target_link_libraries(${bench_name} PRIVATE nvbench::main PUBLIC cvcuda) + target_link_libraries(${bench_name} PRIVATE cvcuda::nvbench::main cvcuda) set_target_properties(${bench_name} PROPERTIES COMPILE_FEATURES cuda_std_17) add_dependencies(bench_all ${bench_name}) endforeach() diff --git a/bench/python/README.md b/bench/python/README.md new file mode 100644 index 000000000..b7879c8ae --- /dev/null +++ b/bench/python/README.md @@ -0,0 +1,116 @@ + +[//]: # "SPDX-FileCopyrightText: Copyright (c) 2023-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved." +[//]: # "SPDX-License-Identifier: Apache-2.0" +[//]: # "" +[//]: # "Licensed under the Apache License, Version 2.0 (the 'License');" +[//]: # "you may not use this file except in compliance with the License." +[//]: # "You may obtain a copy of the License at" +[//]: # "http://www.apache.org/licenses/LICENSE-2.0" +[//]: # "" +[//]: # "Unless required by applicable law or agreed to in writing, software" +[//]: # "distributed under the License is distributed on an 'AS IS' BASIS" +[//]: # "WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied." +[//]: # "See the License for the specific language governing permissions and" +[//]: # "limitations under the License." + + +# Python Operator Performance Benchmarking + +Using various performance benchmarking scripts that ships with CV-CUDA samples, we can measure and report the performance of various CV-CUDA operators from Python. + +The following scripts are part of the performance benchmarking tools in CV-CUDA. + +1. `samples/scripts/benchmark.py` +2. `samples/common/python/perf_utils.py` +3. `bench/python/bench_utils.py` + +We use NVIDIA NSYS internally to benchmark Python code for its CPU and GPU run-times. + + +## About the Operator Benchmarks + +Operators for which a test case has been implemented in the `all_ops` folder can be benchmarked. The following statements are true for all such test cases: + +1. All inherit from a base class called `AbstractOpBase` which allows them to expose benchmarking capabilities in a consistent manner. They all have a setup stage, a run stage and an optional visualization stage. By default, the visualization is turned off. +2. All receive the same input image. Some operators may need to read additional data. Such data is always read from the `assets` directory. +3. All run for a number of iterations (default is set to 10) and a batch size (default is set to 32). +4. The script `benchmark.py` handles overall benchmarking. 
It launches the runs, monitors them, communicates with NSYS, and saves the results of a run in a JSON file. Various settings such as using warm-up (default is set to 1 iteration) are handled here. +5. One or more benchmark runs can be compared and summarized in a table showing only the important information from the detailed JSON files. + +## Setting up the environment + +1. Follow the [Setting up the environment](../../samples/README.md#setting-up-the-environment) section of the CV-CUDA samples. Note: The step asking to install dependencies can be ignored if you are only interested in benchmarking the operators (and not the samples). + + +## Running the benchmark + +The script `run_bench.py`, together with `benchmark.py`, can be used to automatically benchmark all supported CV-CUDA operators in Python. Additionally, one or more runs can be summarized and compared in a table using the functionality provided by `bench_utils.py`. + + +### To run the operator benchmarks + +```bash +python3 samples/scripts/benchmark.py -o <OUTPUT_DIR> bench/python/run_bench.py +``` +- Where: + 1. An `OUTPUT_DIR` must be given to store various benchmark artifacts. +- Upon running it will: + 1. Ask `benchmark.py` to launch `run_bench.py`. + 2. `run_bench.py` will then find out all the operators that can be benchmarked. + 3. Run those one by one, through all the stages, such as setup, run and visualization (if enabled). + 4. Store the artifacts in the output folder. This is where the `benchmark.py` style `benchmark_mean.json` would be stored. + +Once a run is completed, one can use `bench_utils.py` to summarize it. Additionally, we can use the same script to compare multiple different runs. + +### To summarize one run only + +```bash +python3 bench/python/bench_utils.py -o <OUTPUT_DIR> -b <BENCHMARK_MEAN_JSON> -bn baseline +``` +- Where: + 1. An `OUTPUT_DIR` must be given to store the summary table as a CSV file. + 2. The first run's `benchmark_mean.json` path must be given as `b`. + 3. The display name of the first run must be given as `bn`. +- Upon running it will: + 1. Grab appropriate values from the JSON file for all the operators and put them in a table format. + 2. Save the table as a CSV file. + +The output CSV file will be stored in the `OUTPUT_DIR` with the current date and time in its name. + +NOTE: `benchmark.py` will produce additional JSON files (and visualization files if it was enabled). These files provide far more detailed information than the CSV and are usually only meant for debugging purposes. + + +### To summarize and compare multiple runs + +```bash +python3 bench/python/bench_utils.py -o <OUTPUT_DIR> -b <BASELINE_MEAN_JSON> -bn baseline \ + -c <RUN_2_MEAN_JSON> -cn run_2 \ + -c <RUN_3_MEAN_JSON> -cn run_3 +``` +- Where: + 1. An `OUTPUT_DIR` must be given to store the summary table as a CSV file. + 2. The first run's `benchmark_mean.json` path is given as `b`. + 3. The display name of the first run is given as `bn`. + 4. The second run's `benchmark_mean.json` path is given as `c`. + 5. The display name of the second run is given as `cn`. + 6. The third run's `benchmark_mean.json` path is given as `c`. + 7. The display name of the third run is given as `cn`. + 8. Options `c` and `cn` can be repeated zero or more times to cover all the runs. +- Upon running it will: + 1. Grab appropriate values from the JSON file for all the operators and put them in a table format. + 2. Save the table as a CSV file. + + +## Interpreting the results + +Upon successful completion of the `bench_utils.py` script, we get a CSV file.
+ +- If you ran it only on one run, your CSV will only have four columns - showing data only from that run: + 1. `index`: from 0 to N-1 for all the N operators benchmarked + 2. `operator name` The name of the operator + 3. `baseline run time (ms)`: The first run's time in milliseconds, averaged across M iterations (default is 10, with warm-up runs discarded) + 4. `run time params`: Any helpful parameters supplied to the operator as it ran in first run. Only lists primitive data-types. + +- If you ran it on more than one runs, your CSV file will have additional columns - comparing data of those runs with the baseline run. Additional columns, per run, would be: + 1. `run i time (ms)`: The ith run's time in milliseconds, averaged across M iterations (default is 10, with warm-up runs discarded) + 2. `run i v/s baseline speed-up`: The speed-up factor. This is calculated by dividing `run i time (ms)` by `baseline run time (ms)`. diff --git a/bench/python/all_ops/op_adaptivethreshold.py b/bench/python/all_ops/op_adaptivethreshold.py new file mode 100644 index 000000000..ddc316cc6 --- /dev/null +++ b/bench/python/all_ops/op_adaptivethreshold.py @@ -0,0 +1,41 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# NOTE: One must import PyCuda driver first, before CVCUDA or VPF otherwise +# things may throw unexpected errors. +import pycuda.driver as cuda # noqa: F401 + +from bench_utils import AbstractOpBase +import cvcuda + + +class OpAdaptiveThreshold(AbstractOpBase): + def setup(self, input): + self.maxval = 255.0 + self.adaptive_method = cvcuda.AdaptiveThresholdType.GAUSSIAN_C + self.threshold_type = cvcuda.ThresholdType.BINARY + self.block_size = 11 + self.c = 2 + self.grayscale_input = cvcuda.cvtcolor(input, cvcuda.ColorConversion.RGB2GRAY) + + def run(self, input): + return cvcuda.adaptivethreshold( + self.grayscale_input, + max_value=self.maxval, + adaptive_method=self.adaptive_method, + threshold_type=self.threshold_type, + block_size=self.block_size, + c=self.c, + ) diff --git a/bench/python/all_ops/op_averageblur.py b/bench/python/all_ops/op_averageblur.py new file mode 100644 index 000000000..cf591e3f1 --- /dev/null +++ b/bench/python/all_ops/op_averageblur.py @@ -0,0 +1,32 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +# NOTE: One must import PyCuda driver first, before CVCUDA or VPF otherwise +# things may throw unexpected errors. +import pycuda.driver as cuda # noqa: F401 + +from bench_utils import AbstractOpBase +import cvcuda + + +class OpAverageBlur(AbstractOpBase): + def setup(self, input): + self.kernel_size = (3, 3) + self.kernel_anchor = (-1, -1) + + def run(self, input): + return cvcuda.averageblur( + input, kernel_size=self.kernel_size, kernel_anchor=self.kernel_anchor + ) diff --git a/bench/python/all_ops/op_blurbox.py b/bench/python/all_ops/op_blurbox.py new file mode 100644 index 000000000..8f24740d5 --- /dev/null +++ b/bench/python/all_ops/op_blurbox.py @@ -0,0 +1,80 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# NOTE: One must import PyCuda driver first, before CVCUDA or VPF otherwise +# things may throw unexpected errors. +import pycuda.driver as cuda # noqa: F401 + +from bench_utils import AbstractOpBase +import cvcuda +import torch +from torchvision.io import read_image +import os + + +class OpBlurBox(AbstractOpBase): + def setup(self, input): + self.kernel_size = 5 + + data = read_image(os.path.join(self.assets_dir, "brooklyn.jpg")) + data = data.moveaxis(0, -1).contiguous() # From CHW to HWC + data = data.cuda(self.device_id) + data = [data.clone() for _ in range(input.shape[0])] + data = torch.stack(data) + self.input = cvcuda.as_tensor(data, "NHWC") + + bboxes = torch.load( + os.path.join(self.assets_dir, "brooklyn_bboxes.pt"), + map_location="cuda:%d" % self.device_id, + ) + bboxes = [bboxes[0].clone() for _ in range(input.shape[0])] + self.bboxes_pyt = torch.stack(bboxes) + bboxes = cvcuda.as_tensor(self.bboxes_pyt) + + scores = torch.load( + os.path.join(self.assets_dir, "brooklyn_scores.pt"), + map_location="cuda:%d" % self.device_id, + ) + scores = [scores[0].clone() for _ in range(input.shape[0])] + scores = torch.stack(scores) + scores = cvcuda.as_tensor(scores) + + self.nms_masks_pyt = torch.load( + os.path.join(self.assets_dir, "brooklyn_nms_masks.pt"), + map_location="cuda:%d" % self.device_id, + ) + + def run(self, input): + blur_boxes = [] + # Create an array of bounding boxes with render settings. 
+ for current_boxes, current_masks in zip(self.bboxes_pyt, self.nms_masks_pyt): + filtered_boxes = current_boxes[current_masks] + BlurBoxI_list = [] + + for box in filtered_boxes: + BlurBoxI_list.append( + cvcuda.BlurBoxI( + box=tuple(box), + kernelSize=self.kernel_size, + ) + ) + + blur_boxes.append(BlurBoxI_list) + + batch_blur_boxes = cvcuda.BlurBoxesI(boxes=blur_boxes) + + cvcuda.boxblur_into(self.input, self.input, batch_blur_boxes) + + return self.input diff --git a/bench/python/all_ops/op_boundingbox.py b/bench/python/all_ops/op_boundingbox.py new file mode 100644 index 000000000..5b9f1ba3d --- /dev/null +++ b/bench/python/all_ops/op_boundingbox.py @@ -0,0 +1,84 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# NOTE: One must import PyCuda driver first, before CVCUDA or VPF otherwise +# things may throw unexpected errors. +import pycuda.driver as cuda # noqa: F401 + +from bench_utils import AbstractOpBase +import cvcuda +import torch +from torchvision.io import read_image +import os + + +class OpBoundingBox(AbstractOpBase): + def setup(self, input): + self.border_color = (0, 255, 0, 255) + self.fill_color = (0, 0, 255, 0) + self.thickness = 5 + + data = read_image(os.path.join(self.assets_dir, "brooklyn.jpg")) + data = data.moveaxis(0, -1).contiguous() # From CHW to HWC + data = data.cuda(self.device_id) + data = [data.clone() for _ in range(input.shape[0])] + data = torch.stack(data) + self.input = cvcuda.as_tensor(data, "NHWC") + + bboxes = torch.load( + os.path.join(self.assets_dir, "brooklyn_bboxes.pt"), + map_location="cuda:%d" % self.device_id, + ) + bboxes = [bboxes[0].clone() for _ in range(input.shape[0])] + self.bboxes_pyt = torch.stack(bboxes) + bboxes = cvcuda.as_tensor(self.bboxes_pyt) + + scores = torch.load( + os.path.join(self.assets_dir, "brooklyn_scores.pt"), + map_location="cuda:%d" % self.device_id, + ) + scores = [scores[0].clone() for _ in range(input.shape[0])] + scores = torch.stack(scores) + scores = cvcuda.as_tensor(scores) + + self.nms_masks_pyt = torch.load( + os.path.join(self.assets_dir, "brooklyn_nms_masks.pt"), + map_location="cuda:%d" % self.device_id, + ) + + def run(self, input): + bounding_boxes = [] + # Create an array of bounding boxes with render settings. 
+ for current_boxes, current_masks in zip(self.bboxes_pyt, self.nms_masks_pyt): + filtered_boxes = current_boxes[current_masks] + BndBoxI_list = [] + + for box in filtered_boxes: + BndBoxI_list.append( + cvcuda.BndBoxI( + box=tuple(box), + thickness=self.thickness, + borderColor=self.border_color, + fillColor=self.fill_color, + ) + ) + + bounding_boxes.append(BndBoxI_list) + + batch_bounding_boxes = cvcuda.BndBoxesI(boxes=bounding_boxes) + + cvcuda.bndbox_into(self.input, self.input, batch_bounding_boxes) + + return self.input diff --git a/bench/python/all_ops/op_brightnesscontrast.py b/bench/python/all_ops/op_brightnesscontrast.py new file mode 100644 index 000000000..1cd38e679 --- /dev/null +++ b/bench/python/all_ops/op_brightnesscontrast.py @@ -0,0 +1,46 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# NOTE: One must import PyCuda driver first, before CVCUDA or VPF otherwise +# things may throw unexpected errors. +import pycuda.driver as cuda # noqa: F401 + +from bench_utils import AbstractOpBase +import cvcuda +import torch + + +class OpBrightnessContrast(AbstractOpBase): + def setup(self, input): + brightness = torch.tensor([1.2]).cuda(self.device_id) + self.brightness = cvcuda.as_tensor(brightness, "N") + + contrast = torch.tensor([0.7]).cuda(self.device_id) + self.contrast = cvcuda.as_tensor(contrast, "N") + + brightness_shift = torch.tensor([130.0]).cuda(self.device_id) + self.brightness_shift = cvcuda.as_tensor(brightness_shift, "N") + + contrast_center = torch.tensor([0.5]).cuda(self.device_id) + self.contrast_center = cvcuda.as_tensor(contrast_center, "N") + + def run(self, input): + return cvcuda.brightness_contrast( + input, + brightness=self.brightness, + contrast=self.contrast, + brightness_shift=self.brightness_shift, + contrast_center=self.contrast_center, + ) diff --git a/bench/python/all_ops/op_centercrop.py b/bench/python/all_ops/op_centercrop.py new file mode 100644 index 000000000..907c31cf0 --- /dev/null +++ b/bench/python/all_ops/op_centercrop.py @@ -0,0 +1,33 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# NOTE: One must import PyCuda driver first, before CVCUDA or VPF otherwise +# things may throw unexpected errors. 
+import pycuda.driver as cuda # noqa: F401 + +from bench_utils import AbstractOpBase +import cvcuda + + +class OpCenterCrop(AbstractOpBase): + def setup(self, input): + width, height = input.shape[2], input.shape[1] + self.crop_size = [width // 2, height // 2] + + def run(self, input): + return cvcuda.center_crop( + input, + self.crop_size, + ) diff --git a/bench/python/all_ops/op_composite.py b/bench/python/all_ops/op_composite.py new file mode 100644 index 000000000..d42e5063b --- /dev/null +++ b/bench/python/all_ops/op_composite.py @@ -0,0 +1,52 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# NOTE: One must import PyCuda driver first, before CVCUDA or VPF otherwise +# things may throw unexpected errors. +import pycuda.driver as cuda # noqa: F401 + +from bench_utils import AbstractOpBase +import cvcuda +import torch +from torchvision.io import read_image +import os + + +class OpComposite(AbstractOpBase): + def setup(self, input): + data = read_image(os.path.join(self.assets_dir, "brooklyn.jpg")) + data = data.moveaxis(0, -1).contiguous() # From CHW to HWC + data = data.cuda(self.device_id) + data = [data.clone() for _ in range(input.shape[0])] + data = torch.stack(data) + self.input = cvcuda.as_tensor(data, "NHWC") + self.blurred_input = cvcuda.gaussian( + self.input, kernel_size=(15, 15), sigma=(5, 5) + ) + + mask = read_image(os.path.join(self.assets_dir, "brooklyn_mask.jpg")) + mask = mask.moveaxis(0, -1).contiguous() # From CHW to HWC + mask = mask.cuda(self.device_id) + mask = [mask.clone() for _ in range(input.shape[0])] + mask = torch.stack(mask) + self.class_masks = cvcuda.as_tensor(mask, "NHWC") + + def run(self, input): + return cvcuda.composite( + self.input, + self.blurred_input, + self.class_masks, + 3, + ) diff --git a/bench/python/all_ops/op_convertto.py b/bench/python/all_ops/op_convertto.py new file mode 100644 index 000000000..48e4fa21c --- /dev/null +++ b/bench/python/all_ops/op_convertto.py @@ -0,0 +1,35 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# NOTE: One must import PyCuda driver first, before CVCUDA or VPF otherwise +# things may throw unexpected errors. 
+import pycuda.driver as cuda # noqa: F401 + +from bench_utils import AbstractOpBase +import cvcuda +import nvcv + + +class OpConvertTo(AbstractOpBase): + def setup(self, input): + self.target_dtype = nvcv.Type.F32 + self.offset = 10.2 + self.scale = 1 / 255.0 + + def run(self, input): + return cvcuda.convertto(input, self.target_dtype, self.offset, self.scale) + + def visualize(self): + pass diff --git a/bench/python/all_ops/op_copymakeborder.py b/bench/python/all_ops/op_copymakeborder.py new file mode 100644 index 000000000..c0bca25b6 --- /dev/null +++ b/bench/python/all_ops/op_copymakeborder.py @@ -0,0 +1,42 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# NOTE: One must import PyCuda driver first, before CVCUDA or VPF otherwise +# things may throw unexpected errors. +import pycuda.driver as cuda # noqa: F401 + +from bench_utils import AbstractOpBase +import cvcuda + + +class OpCopyMakeBorder(AbstractOpBase): + def setup(self, input): + self.border_mode = cvcuda.Border.CONSTANT + self.border_values = [255, 0, 0] # Border values for 3 channel input. + self.top = 30 + self.left = 40 + self.bottom = 50 + self.right = 60 + + def run(self, input): + return cvcuda.copymakeborder( + input, + border_mode=self.border_mode, + border_value=self.border_values, + top=self.top, + bottom=self.bottom, + left=self.left, + right=self.right, + ) diff --git a/bench/python/all_ops/op_customcrop.py b/bench/python/all_ops/op_customcrop.py new file mode 100644 index 000000000..0618a4821 --- /dev/null +++ b/bench/python/all_ops/op_customcrop.py @@ -0,0 +1,30 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# NOTE: One must import PyCuda driver first, before CVCUDA or VPF otherwise +# things may throw unexpected errors. 
+import pycuda.driver as cuda # noqa: F401 + +from bench_utils import AbstractOpBase +import cvcuda +import nvcv + + +class OpCustomCrop(AbstractOpBase): + def setup(self, input): + self.rectI = nvcv.RectI(x=30, y=40, width=420, height=390) + + def run(self, input): + return cvcuda.customcrop(input, self.rectI) diff --git a/bench/python/all_ops/op_cvtcolor.py b/bench/python/all_ops/op_cvtcolor.py new file mode 100644 index 000000000..6eafee402 --- /dev/null +++ b/bench/python/all_ops/op_cvtcolor.py @@ -0,0 +1,37 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# NOTE: One must import PyCuda driver first, before CVCUDA or VPF otherwise +# things may throw unexpected errors. +import pycuda.driver as cuda # noqa: F401 + +from bench_utils import AbstractOpBase +import cvcuda + + +class OpCvtColorRGB2GRAY(AbstractOpBase): + def setup(self, input): + pass + + def run(self, input): + return cvcuda.cvtcolor(input, cvcuda.ColorConversion.RGB2GRAY) + + +class OpCvtColorRGB2BGR(AbstractOpBase): + def setup(self, input): + pass + + def run(self, input): + return cvcuda.cvtcolor(input, cvcuda.ColorConversion.RGB2BGR) diff --git a/bench/python/all_ops/op_findcontours.py b/bench/python/all_ops/op_findcontours.py new file mode 100644 index 000000000..7fe31cab0 --- /dev/null +++ b/bench/python/all_ops/op_findcontours.py @@ -0,0 +1,109 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# NOTE: One must import PyCuda driver first, before CVCUDA or VPF otherwise +# things may throw unexpected errors. 
+import pycuda.driver as cuda # noqa: F401 + +from bench_utils import AbstractOpBase +import cvcuda +import torch +from torchvision.io import read_image +import matplotlib.pyplot as plt +import numpy as np +import os +import logging + +logger = logging.getLogger(__name__) + + +class OpFindContours(AbstractOpBase): + def setup(self, input): + grayscale_input = read_image( + os.path.join(self.assets_dir, "countour_lines.jpg") + ) + grayscale_input = grayscale_input.moveaxis( + 0, -1 + ).contiguous() # From CHW to HWC + # Binarize the grayscale_input + grayscale_input[grayscale_input <= 50] = 0 + grayscale_input[grayscale_input > 50] = 255 + + grayscale_input = [grayscale_input.clone() for _ in range(input.shape[0])] + grayscale_input = torch.stack(grayscale_input) + grayscale_input = grayscale_input.cuda(self.device_id) + self.grayscale_input = cvcuda.as_tensor(grayscale_input, "NHWC") + + def run(self, input): + return cvcuda.find_contours(self.grayscale_input) + + def visualize(self): + """ + Attempts to visualize the output produced by the operator as an image by writing it + down to the disk. May raise exceptions if visualization is not successful. + """ + output_dir = self._setup_clear_output_dir(filename_ends_with="_op_out.jpg") + # Convert the inputs and outputs to numpy arrays first. + # input shape: NHWC + # out[0] = points_info shape: NxMx2 (M == max points, 2 for x and y coordinates) + # out[1] = contours_info shape: NxC where + # (C == max contours, number of non-zero elements are number of contours) + input_npy = ( + torch.as_tensor( + self.grayscale_input.cuda(), device="cuda:%d" % self.device_id + ) + .cpu() + .numpy() + ) + points_npy = ( + torch.as_tensor(self.op_output[0].cuda(), device="cuda:%d" % self.device_id) + .cpu() + .numpy() + ) + num_contours_npy = ( + torch.as_tensor(self.op_output[1].cuda(), device="cuda:%d" % self.device_id) + .cpu() + .numpy() + ) + + # Loop over all the images... + for i, img in enumerate(input_npy): + + # Grab the information on the points and the contours of this image. + points_info = points_npy[i] + contours_info = num_contours_npy[i] + + # Keep only the non-zero entries from contours_info + contours_info = contours_info[np.nonzero(contours_info)] + # Use the num_points in contours_info to split the points_info + # Since the values in num_points are not start-stop indices of the points + # we need to use cumsum to fix it and use it inside the split function + valid_points = np.split(points_info, contours_info.cumsum()) + # Last element in valid_points is the remainder of the points so need to drop it. + all_contours = valid_points[:-1] # This list stores OpenCV style contours. + + plt.figure(figsize=(img.shape[1] / 100.0, img.shape[0] / 100.0)) + plt.gca().invert_yaxis() + + plt.plot(0, 0, color="white") + plt.plot(img.shape[1], img.shape[0], color="white") + for contour in all_contours: + x, y = contour[:, 0], contour[:, 1] + plt.plot(x, y, color="green", linewidth=2) + + # Save using PIL + out_file_name = "img_%d_op_out.jpg" % i + plt.savefig(os.path.join(output_dir, out_file_name)) + plt.close() diff --git a/bench/python/all_ops/op_flip.py b/bench/python/all_ops/op_flip.py new file mode 100644 index 000000000..962a12856 --- /dev/null +++ b/bench/python/all_ops/op_flip.py @@ -0,0 +1,29 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# NOTE: One must import PyCuda driver first, before CVCUDA or VPF otherwise +# things may throw unexpected errors. +import pycuda.driver as cuda # noqa: F401 + +from bench_utils import AbstractOpBase +import cvcuda + + +class OpFlip(AbstractOpBase): + def setup(self, input): + self.flip_code = -1 # means flipping around both axes. + + def run(self, input): + return cvcuda.flip(input, flipCode=self.flip_code) diff --git a/bench/python/all_ops/op_gaussianblur.py b/bench/python/all_ops/op_gaussianblur.py new file mode 100644 index 000000000..cd306ec93 --- /dev/null +++ b/bench/python/all_ops/op_gaussianblur.py @@ -0,0 +1,30 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# NOTE: One must import PyCuda driver first, before CVCUDA or VPF otherwise +# things may throw unexpected errors. +import pycuda.driver as cuda # noqa: F401 + +from bench_utils import AbstractOpBase +import cvcuda + + +class OpGaussianBlur(AbstractOpBase): + def setup(self, input): + self.kernel_size = (3, 3) + self.sigma = (5, 5) + + def run(self, input): + return cvcuda.gaussian(input, kernel_size=self.kernel_size, sigma=self.sigma) diff --git a/bench/python/all_ops/op_hqresize.py b/bench/python/all_ops/op_hqresize.py new file mode 100644 index 000000000..a5514ab72 --- /dev/null +++ b/bench/python/all_ops/op_hqresize.py @@ -0,0 +1,53 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# NOTE: One must import PyCuda driver first, before CVCUDA or VPF otherwise +# things may throw unexpected errors. 
+import pycuda.driver as cuda # noqa: F401 + +from bench_utils import AbstractOpBase +import cvcuda + + +class OpHqResizeDown(AbstractOpBase): + def setup(self, input): + self.resize_width = 640 + self.resize_height = 420 + + def run(self, input): + return cvcuda.hq_resize( + input, + ( + self.resize_height, + self.resize_width, + ), + interpolation=cvcuda.Interp.NEAREST, + ) + + +class OpHqResizeUp(AbstractOpBase): + def setup(self, input): + self.resize_width = 1920 + self.resize_height = 1280 + + def run(self, input): + return cvcuda.hq_resize( + input, + ( + self.resize_height, + self.resize_width, + ), + interpolation=cvcuda.Interp.LINEAR, + ) diff --git a/bench/python/all_ops/op_inpaint.py b/bench/python/all_ops/op_inpaint.py new file mode 100644 index 000000000..c2419545b --- /dev/null +++ b/bench/python/all_ops/op_inpaint.py @@ -0,0 +1,56 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# NOTE: One must import PyCuda driver first, before CVCUDA or VPF otherwise +# things may throw unexpected errors. +import pycuda.driver as cuda # noqa: F401 + +from bench_utils import AbstractOpBase +import cvcuda +import torch +from torchvision.io import read_image +import os + + +class OpInpaint(AbstractOpBase): + def setup(self, input): + data = read_image(os.path.join(self.assets_dir, "brooklyn.jpg")) + mask = read_image(os.path.join(self.assets_dir, "countour_lines.jpg")) + # Binarize the mask + mask[mask <= 50] = 0 + mask[mask > 50] = 255 + + # Add scratch marks on the top of the input data and convert it to tensor + mask3 = mask.repeat(3, 1, 1) + data[mask3 > 0] = mask3[mask3 > 0] + data = data.moveaxis(0, -1).contiguous() # From CHW to HWC + data = [data.clone() for _ in range(input.shape[0])] + data = torch.stack(data) + data = data.cuda(self.device_id) + self.data = cvcuda.as_tensor(data, "NHWC") + + mask = torch.unsqueeze(mask[0], -1) # 3 channel chw to 1 channel hwc mask + mask = [mask.clone() for _ in range(input.shape[0])] + mask = torch.stack(mask) + mask = mask.cuda(self.device_id) + self.masks = cvcuda.as_tensor(mask, "NHWC") + self.inpaint_radius = 3 + + def run(self, input): + return cvcuda.inpaint( + self.data, + self.masks, + self.inpaint_radius, + ) diff --git a/bench/python/all_ops/op_jointbilateral.py b/bench/python/all_ops/op_jointbilateral.py new file mode 100644 index 000000000..99b0cc0f7 --- /dev/null +++ b/bench/python/all_ops/op_jointbilateral.py @@ -0,0 +1,55 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# NOTE: One must import PyCuda driver first, before CVCUDA or VPF otherwise +# things may throw unexpected errors. +import pycuda.driver as cuda # noqa: F401 + +from bench_utils import AbstractOpBase +import cvcuda +import torch +from torchvision.io import read_image +import os + + +class OpJointBilateral(AbstractOpBase): + def setup(self, input): + self.diameter = 5 + self.sigma_color = 50 + self.sigma_space = 1 + + data = read_image(os.path.join(self.assets_dir, "brooklyn.jpg")) + data = data.moveaxis(0, -1).contiguous() # From CHW to HWC + data = [data.clone() for _ in range(input.shape[0])] + data = torch.stack(data) + data = data.cuda(self.device_id) + data = cvcuda.as_tensor(data, "NHWC") + self.grayscale_input = cvcuda.cvtcolor(data, cvcuda.ColorConversion.RGB2GRAY) + + mask = read_image(os.path.join(self.assets_dir, "brooklyn_mask.jpg")) + mask = mask.moveaxis(0, -1).contiguous() # From CHW to HWC + mask = [mask.clone() for _ in range(input.shape[0])] + mask = torch.stack(mask) + mask = mask.cuda(self.device_id) + self.class_masks = cvcuda.as_tensor(mask, "NHWC") + + def run(self, input): + return cvcuda.joint_bilateral_filter( + self.class_masks, + self.grayscale_input, + diameter=self.diameter, + sigma_color=self.sigma_color, + sigma_space=self.sigma_space, + ) diff --git a/bench/python/all_ops/op_laplacian.py b/bench/python/all_ops/op_laplacian.py new file mode 100644 index 000000000..ee9d4b75a --- /dev/null +++ b/bench/python/all_ops/op_laplacian.py @@ -0,0 +1,30 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# NOTE: One must import PyCuda driver first, before CVCUDA or VPF otherwise +# things may throw unexpected errors. +import pycuda.driver as cuda # noqa: F401 + +from bench_utils import AbstractOpBase +import cvcuda + + +class OpLaplacian(AbstractOpBase): + def setup(self, input): + self.kernel_size = 3 + self.scale = 2.0 + + def run(self, input): + return cvcuda.laplacian(input, ksize=self.kernel_size, scale=self.scale) diff --git a/bench/python/all_ops/op_morphology.py b/bench/python/all_ops/op_morphology.py new file mode 100644 index 000000000..f13434e05 --- /dev/null +++ b/bench/python/all_ops/op_morphology.py @@ -0,0 +1,102 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# NOTE: One must import PyCuda driver first, before CVCUDA or VPF otherwise +# things may throw unexpected errors. +import pycuda.driver as cuda # noqa: F401 + +from bench_utils import AbstractOpBase +import cvcuda +import torch + + +class MorphologyBase: + def __init__(self, device_id, input, morphology_type): + self.device_id = device_id + self.mask_size = [5, 5] + self.anchor = [-1, -1] + self.num_iterations = 3 + self.border_type = cvcuda.Border.CONSTANT + self.morphology_type = morphology_type + + # Morphology requires binary input, with mostly white foreground + threshold_value = torch.tensor([150.0] * input.shape[0]) + threshold_value = threshold_value.type(torch.float64) + threshold_value = threshold_value.cuda(self.device_id) + threshold_value = cvcuda.as_tensor(threshold_value, "N") + + maxval = torch.tensor([255.0] * input.shape[0]) + maxval = maxval.type(torch.float64) + maxval = maxval.cuda(self.device_id) + maxval = cvcuda.as_tensor(maxval, "N") + self.binary_input = cvcuda.threshold( + input, threshold_value, maxval, type=cvcuda.ThresholdType.BINARY + ) + + if self.num_iterations > 1: + self.workspace = cvcuda.Tensor(input.shape, input.dtype, "NHWC") + else: + self.workspace = None + + def __call__(self): + return cvcuda.morphology( + self.binary_input, + self.morphology_type, + maskSize=self.mask_size, + anchor=self.anchor, + workspace=self.workspace, + iteration=self.num_iterations, + border=self.border_type, + ) + + +class OpMorphologyOpen(AbstractOpBase): + def setup(self, input): + self.MorphologyBase = MorphologyBase( + self.device_id, input, cvcuda.MorphologyType.OPEN + ) + + def run(self, input): + return self.MorphologyBase() + + +class OpMorphologyClose(AbstractOpBase): + def setup(self, input): + self.MorphologyBase = MorphologyBase( + self.device_id, input, cvcuda.MorphologyType.CLOSE + ) + + def run(self, input): + return self.MorphologyBase() + + +class OpMorphologyDilate(AbstractOpBase): + def setup(self, input): + self.MorphologyBase = MorphologyBase( + self.device_id, input, cvcuda.MorphologyType.DILATE + ) + + def run(self, input): + return self.MorphologyBase() + + +class OpMorphologyErode(AbstractOpBase): + def setup(self, input): + self.MorphologyBase = MorphologyBase( + self.device_id, input, cvcuda.MorphologyType.ERODE + ) + + def run(self, input): + return self.MorphologyBase() diff --git a/bench/python/all_ops/op_nms.py b/bench/python/all_ops/op_nms.py new file mode 100644 index 000000000..dd9abfa9f --- /dev/null +++ b/bench/python/all_ops/op_nms.py @@ -0,0 +1,52 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# NOTE: One must import PyCuda driver first, before CVCUDA or VPF otherwise +# things may throw unexpected errors. +import pycuda.driver as cuda # noqa: F401 + +from bench_utils import AbstractOpBase +import cvcuda +import torch +import os + + +class OpNMS(AbstractOpBase): + def setup(self, input): + bboxes = torch.load( + os.path.join(self.assets_dir, "brooklyn_bboxes.pt"), + map_location="cuda:%d" % self.device_id, + ) + bboxes = [bboxes[0].clone() for _ in range(input.shape[0])] + bboxes = torch.stack(bboxes) + self.bboxes = cvcuda.as_tensor(bboxes) + + scores = torch.load( + os.path.join(self.assets_dir, "brooklyn_scores.pt"), + map_location="cuda:%d" % self.device_id, + ) + scores = [scores[0].clone() for _ in range(input.shape[0])] + scores = torch.stack(scores) + self.scores = cvcuda.as_tensor(scores) + self.confidence_threshold = 0.9 + self.iou_threshold = 0.2 + + def run(self, input): + return cvcuda.nms( + self.bboxes, self.scores, self.confidence_threshold, self.iou_threshold + ) + + def visualize(self): + pass diff --git a/bench/python/all_ops/op_normalize.py b/bench/python/all_ops/op_normalize.py new file mode 100644 index 000000000..a17fd296f --- /dev/null +++ b/bench/python/all_ops/op_normalize.py @@ -0,0 +1,45 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# NOTE: One must import PyCuda driver first, before CVCUDA or VPF otherwise +# things may throw unexpected errors. +import pycuda.driver as cuda # noqa: F401 + +from bench_utils import AbstractOpBase +import cvcuda +import torch + + +class OpNormalize(AbstractOpBase): + def setup(self, input): + mean_tensor = ( + torch.Tensor([0.485, 0.456, 0.406]).reshape(1, 1, 1, 3).cuda(self.device_id) + ) + self.mean_tensor = cvcuda.as_tensor(mean_tensor, "NHWC") + stddev_tensor = ( + torch.Tensor([0.229, 0.224, 0.225]).reshape(1, 1, 1, 3).cuda(self.device_id) + ) + self.stddev_tensor = cvcuda.as_tensor(stddev_tensor, "NHWC") + + def run(self, input): + return cvcuda.normalize( + input, + base=self.mean_tensor, + scale=self.stddev_tensor, + flags=cvcuda.NormalizeFlags.SCALE_IS_STDDEV, + ) + + def visualize(self): + pass diff --git a/bench/python/all_ops/op_randomresizedcrop.py b/bench/python/all_ops/op_randomresizedcrop.py new file mode 100644 index 000000000..0dc1f5c03 --- /dev/null +++ b/bench/python/all_ops/op_randomresizedcrop.py @@ -0,0 +1,44 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# NOTE: One must import PyCuda driver first, before CVCUDA or VPF otherwise +# things may throw unexpected errors. +import pycuda.driver as cuda # noqa: F401 + +from bench_utils import AbstractOpBase +import cvcuda + + +class OpRandomResizedCrop(AbstractOpBase): + def setup(self, input): + self.resized_shape = (input.shape[0], 320, 580, 3) + self.min_scale = 0.08 + self.max_scale = 1.0 + self.min_ratio = 0.75 + self.max_ratio = 1.33333333 + self.interpolation_type = cvcuda.Interp.LINEAR + self.seed = 4 + + def run(self, input): + return cvcuda.random_resized_crop( + input, + self.resized_shape, + self.min_scale, + self.max_scale, + self.min_ratio, + self.max_ratio, + self.interpolation_type, + self.seed, + ) diff --git a/bench/python/all_ops/op_reformat.py b/bench/python/all_ops/op_reformat.py new file mode 100644 index 000000000..eb4c2ddc8 --- /dev/null +++ b/bench/python/all_ops/op_reformat.py @@ -0,0 +1,43 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# NOTE: One must import PyCuda driver first, before CVCUDA or VPF otherwise +# things may throw unexpected errors. +import pycuda.driver as cuda # noqa: F401 + +from bench_utils import AbstractOpBase +import cvcuda + + +class OpReformatNCHWToNHWC(AbstractOpBase): + def setup(self, input): + self.input_nchw = cvcuda.reformat(input, "NCHW") + + def run(self, input): + return cvcuda.reformat(self.input_nchw, "NHWC") + + def visualize(self): + pass + + +class OpReformatNHWCToNCHW(AbstractOpBase): + def setup(self, input): + pass + + def run(self, input): + return cvcuda.reformat(input, "NCHW") + + def visualize(self): + pass diff --git a/bench/python/all_ops/op_remap.py b/bench/python/all_ops/op_remap.py new file mode 100644 index 000000000..31175d66e --- /dev/null +++ b/bench/python/all_ops/op_remap.py @@ -0,0 +1,53 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# NOTE: One must import PyCuda driver first, before CVCUDA or VPF otherwise +# things may throw unexpected errors. +import pycuda.driver as cuda # noqa: F401 + +from bench_utils import AbstractOpBase +import cvcuda +import numpy as np +import torch + + +class OpRemap(AbstractOpBase): + def setup(self, input): + batch_size, width, height = input.shape[0], input.shape[2], input.shape[1] + batch_map = np.stack([self.flipH(w=width, h=height) for _ in range(batch_size)]) + batch_map = torch.as_tensor(batch_map, device="cuda") + self.batch_map = cvcuda.as_tensor(batch_map, "NHWC") + self.src_interp = cvcuda.Interp.LINEAR + self.map_interp = cvcuda.Interp.LINEAR + self.map_type = cvcuda.Remap.ABSOLUTE + self.align_corners = True + self.border_type = cvcuda.Border.CONSTANT + self.border_value = np.array([], dtype=np.float32) + + def flipH(self, w, h): + mesh = np.meshgrid(np.arange(w)[::-1], np.arange(h)) + return np.stack(mesh, axis=2).astype(np.float32) + + def run(self, input): + return cvcuda.remap( + input, + self.batch_map, + self.src_interp, + self.map_interp, + self.map_type, + align_corners=self.align_corners, + border=self.border_type, + border_value=self.border_value, + ) diff --git a/bench/python/all_ops/op_reshape.py b/bench/python/all_ops/op_reshape.py new file mode 100644 index 000000000..37bc63950 --- /dev/null +++ b/bench/python/all_ops/op_reshape.py @@ -0,0 +1,32 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# NOTE: One must import PyCuda driver first, before CVCUDA or VPF otherwise +# things may throw unexpected errors. +import pycuda.driver as cuda # noqa: F401 + +from bench_utils import AbstractOpBase +import cvcuda + + +class OpReshape(AbstractOpBase): + def setup(self, input): + self.shape = input.shape[::-1] # Reverse everything out + + def run(self, input): + return cvcuda.reshape(input, shape=self.shape) + + def visualize(self): + pass diff --git a/bench/python/all_ops/op_resize.py b/bench/python/all_ops/op_resize.py new file mode 100644 index 000000000..0a3d4fcf5 --- /dev/null +++ b/bench/python/all_ops/op_resize.py @@ -0,0 +1,57 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# NOTE: One must import PyCuda driver first, before CVCUDA or VPF otherwise +# things may throw unexpected errors. +import pycuda.driver as cuda # noqa: F401 + +from bench_utils import AbstractOpBase +import cvcuda + + +class OpResizeDown(AbstractOpBase): + def setup(self, input): + self.resize_width = 640 + self.resize_height = 420 + + def run(self, input): + return cvcuda.resize( + input, + ( + input.shape[0], + self.resize_height, + self.resize_width, + input.shape[3], + ), + cvcuda.Interp.AREA, + ) + + +class OpResizeUp(AbstractOpBase): + def setup(self, input): + self.resize_width = 1920 + self.resize_height = 1280 + + def run(self, input): + return cvcuda.resize( + input, + ( + input.shape[0], + self.resize_height, + self.resize_width, + input.shape[3], + ), + cvcuda.Interp.LINEAR, + ) diff --git a/bench/python/all_ops/op_rotate.py b/bench/python/all_ops/op_rotate.py new file mode 100644 index 000000000..b7d0697ee --- /dev/null +++ b/bench/python/all_ops/op_rotate.py @@ -0,0 +1,31 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# NOTE: One must import PyCuda driver first, before CVCUDA or VPF otherwise +# things may throw unexpected errors. +import pycuda.driver as cuda # noqa: F401 + +from bench_utils import AbstractOpBase +import cvcuda + + +class OpRotate(AbstractOpBase): + def setup(self, input): + self.angle_deg = 40 + self.shift = [input.shape[2] // 4, input.shape[1] // 4] + self.interpolation_type = cvcuda.Interp.LINEAR + + def run(self, input): + return cvcuda.rotate(input, self.angle_deg, self.shift, self.interpolation_type) diff --git a/bench/python/all_ops/op_sift.py b/bench/python/all_ops/op_sift.py new file mode 100644 index 000000000..1d0e23567 --- /dev/null +++ b/bench/python/all_ops/op_sift.py @@ -0,0 +1,47 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# NOTE: One must import PyCuda driver first, before CVCUDA or VPF otherwise +# things may throw unexpected errors. +import pycuda.driver as cuda # noqa: F401 + +from bench_utils import AbstractOpBase +import cvcuda + + +class OpSIFT(AbstractOpBase): + def setup(self, input): + self.max_features = 100 + self.num_octave_layers = 3 + self.contrast_threshold = 0.04 + self.edge_threshold = 10.0 + self.init_sigma = 1.6 + self.grayscale_input = cvcuda.cvtcolor( + self.input, cvcuda.ColorConversion.RGB2GRAY + ) + + def run(self, input): + return cvcuda.sift( + self.grayscale_input, + self.max_features, + self.num_octave_layers, + self.contrast_threshold, + self.edge_threshold, + self.init_sigma, + flags=cvcuda.SIFT.USE_EXPANDED_INPUT, + ) + + def visualize(self): + pass diff --git a/bench/python/all_ops/op_threshold.py b/bench/python/all_ops/op_threshold.py new file mode 100644 index 000000000..6cd277fc3 --- /dev/null +++ b/bench/python/all_ops/op_threshold.py @@ -0,0 +1,42 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# NOTE: One must import PyCuda driver first, before CVCUDA or VPF otherwise +# things may throw unexpected errors. +import pycuda.driver as cuda # noqa: F401 + +from bench_utils import AbstractOpBase +import cvcuda +import torch + + +class OpThreshold(AbstractOpBase): + def setup(self, input): + threshold = torch.tensor([150.0] * input.shape[0]) + threshold = threshold.type(torch.float64) + threshold = threshold.cuda(self.device_id) + self.threshold = cvcuda.as_tensor(threshold, "N") + + maxval = torch.tensor([255.0] * input.shape[0]) + maxval = maxval.type(torch.float64) + maxval = maxval.cuda(self.device_id) + self.maxval = cvcuda.as_tensor(maxval, "N") + + self.threshold_type = cvcuda.ThresholdType.BINARY + + def run(self, input): + return cvcuda.threshold( + input, thresh=self.threshold, maxval=self.maxval, type=self.threshold_type + ) diff --git a/bench/python/all_ops/op_warpaffine.py b/bench/python/all_ops/op_warpaffine.py new file mode 100644 index 000000000..9a4f062b1 --- /dev/null +++ b/bench/python/all_ops/op_warpaffine.py @@ -0,0 +1,60 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# NOTE: One must import PyCuda driver first, before CVCUDA or VPF otherwise +# things may throw unexpected errors. +import pycuda.driver as cuda # noqa: F401 + +from bench_utils import AbstractOpBase +import cvcuda +import numpy as np + + +class OpWarpAffine(AbstractOpBase): + def setup(self, input): + self.xform = np.array( + [[1.26666667, 0.6, -83.33333333], [-0.33333333, 1.0, 66.66666667]] + ) + self.flags = cvcuda.Interp.LINEAR + self.border_mode = cvcuda.Border.CONSTANT + self.border_value = [] + + def run(self, input): + return cvcuda.warp_affine( + input, + self.xform, + flags=self.flags, + border_mode=self.border_mode, + border_value=self.border_value, + ) + + +class OpWarpAffineInverse(AbstractOpBase): + def setup(self, input): + self.xform = np.array( + [[1.26666667, 0.6, -83.33333333], [-0.33333333, 1.0, 66.66666667]] + ) + self.flags = cvcuda.Interp.LINEAR + self.border_mode = cvcuda.Border.CONSTANT + self.border_value = [] + + def run(self, input): + return cvcuda.warp_affine( + input, + self.xform, + flags=self.flags, + border_mode=self.border_mode, + border_value=self.border_value, + ) diff --git a/bench/python/all_ops/op_warpperspective.py b/bench/python/all_ops/op_warpperspective.py new file mode 100644 index 000000000..c73ae25d2 --- /dev/null +++ b/bench/python/all_ops/op_warpperspective.py @@ -0,0 +1,70 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# NOTE: One must import PyCuda driver first, before CVCUDA or VPF otherwise +# things may throw unexpected errors. 
+import pycuda.driver as cuda # noqa: F401 + +from bench_utils import AbstractOpBase +import cvcuda +import numpy as np + + +class OpWarpPerspective(AbstractOpBase): + def setup(self, input): + self.xform = np.array( + [ + [3.46153846e-01, 3.33031674e-01, 1.28000000e02], + [0.00000000e00, 6.92307692e-01, 0.00000000e00], + [-4.50721154e-04, 5.65610860e-04, 1.00000000e00], + ], + np.float32, + ) + self.flags = cvcuda.Interp.LINEAR + self.border_mode = cvcuda.Border.CONSTANT + self.border_value = [] + + def run(self, input): + return cvcuda.warp_perspective( + input, + self.xform, + flags=self.flags, + border_mode=self.border_mode, + border_value=self.border_value, + ) + + +class OpWarpPerspectiveInverse(AbstractOpBase): + def setup(self, input): + self.xform = np.array( + [ + [3.46153846e-01, 3.33031674e-01, 1.28000000e02], + [0.00000000e00, 6.92307692e-01, 0.00000000e00], + [-4.50721154e-04, 5.65610860e-04, 1.00000000e00], + ], + np.float32, + ) + self.flags = cvcuda.Interp.LINEAR | cvcuda.Interp.WARP_INVERSE_MAP + self.border_mode = cvcuda.Border.CONSTANT + self.border_value = [] + + def run(self, input): + return cvcuda.warp_perspective( + input, + self.xform, + flags=self.flags, + border_mode=self.border_mode, + border_value=self.border_value, + ) diff --git a/bench/python/assets/NOTICE.md b/bench/python/assets/NOTICE.md new file mode 100644 index 000000000..7dd764391 --- /dev/null +++ b/bench/python/assets/NOTICE.md @@ -0,0 +1,19 @@ + +[//]: # "SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved." +[//]: # "SPDX-License-Identifier: Apache-2.0" +[//]: # "" +[//]: # "Licensed under the Apache License, Version 2.0 (the 'License');" +[//]: # "you may not use this file except in compliance with the License." +[//]: # "You may obtain a copy of the License at" +[//]: # "http://www.apache.org/licenses/LICENSE-2.0" +[//]: # "" +[//]: # "Unless required by applicable law or agreed to in writing, software" +[//]: # "distributed under the License is distributed on an 'AS IS' BASIS" +[//]: # "WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied." +[//]: # "See the License for the specific language governing permissions and" +[//]: # "limitations under the License." 
+ + +The data files obtained from the following sources : + +- brooklyn.jpg is obtained from [pexels](https://www.pexels.com/photo/people-across-on-intersection-1486222/) under Pexels license diff --git a/bench/python/assets/brooklyn.jpg b/bench/python/assets/brooklyn.jpg new file mode 100644 index 000000000..89e67551c --- /dev/null +++ b/bench/python/assets/brooklyn.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:23e406cb60c8f9200d4bc5d95e8b9d8a8168876390f0c6b08c837b90795d4beb +size 247098 diff --git a/bench/python/assets/brooklyn_bboxes.pt b/bench/python/assets/brooklyn_bboxes.pt new file mode 100644 index 0000000000000000000000000000000000000000..3261e47208b22be52416c8639207fe1bf06569e9 GIT binary patch literal 196520 zcmZ_12Yi-A^Y1;o3rR@tz4wHYkU|M1^bVo--g^g8k)nXoR6qd>q9WKH6+{sP6-7a@ zAPNX5Ql$6ZVb6DOpZ_`Mec#WkA9Cm3Yp>banc3gY&dzSywlw8Le0=2p^IwpJN&o(1 zrVN^pH?ZHtepSYf7}=yjkaYil{x?Gpm^fzqfT30TOqw`s}cX#AjojfM@F=sV6Qr~SAJ6UX_sALm!4ZA)*TssDV|a}AW;zqbFsy$+D1|GU?D z`Ar(c{lEMCe@mcHc0Ft^N9C#wkoI!UD*45hmjIb3YwQH+EcfhjJ4|Awv0Sr3QeHO7 z1v^GUB$&Eg#8-UeruC=%YdK+`rCz#RvdzU?D#=}&DoXA{;Y;z7LiyeHlUlM$?%P6o z5U(ECol;Jk(_UYxC)?zKZ9sWG<-b$jR(9EOQYyEgQ%kfoRS)eSl=r0mBT`>Z${kx> zjI^ixFy(#ZCp%XfNQk;=3*e`hveefyT7Iz4(Mq_wZ7WmW2_@};uc`8feL-qSeRbC+ zK&Ksau24Q-j@o5XEZr$j=KXf+zO_7mNlx2WB}dBCJ?d*2OZh*vyGqX5S150yZrNDq zKg08Vl&_Uj(5WOn)OAn!XVgCqov-8=^{dI->V{3D-f{RjLBIc!-S$OkC|>%Gtpfdh zlpm-3w(PL;q=QtTydvd0C_l|-sQS@9CC%g_bhG*Xu7`ek$xy%B$D~O9rax)CKNr5f zkqT0v{zPw)@+6{G|)^X7c=L_-ZLX+SQc%NFef9VW&t7c|(OuU3r)C^Pci3 zwVC?Wamux!k?@_01HS{uE-?c~4XQn=FJAouk_$)$RKT-Yz<>~qw_UR`t>yy;4 zAa7B=gz`rEmK}mEKda9oCok!)@AF-tci;Aw>vowwV{@rDOkcKLk?$DoBOT>0yG|do zk&LHm`n>JP_xcY@(2kSGXW9>I``PZ* zN3E}f%6Rxqr`<1f6k~6n{Z9Xl{3B!r?We={4>}2X?zFq~U$%lo$x(fs@(S{YerPMn z5qn1e37r_JV}wzA-)_@4=|_1HbBJ~$r7h+4u$v#X$SApGBg`oq4BuVNE$Dd35$tsm z&s&)L*uR#B<|O?KkkinARBqTN<~n%dE%nVF8%6u&jFLfe(l#`g;Kx^*Qa^(Jg&HG$ zZenla-N6kaq8v2XP4I9Y&@0*o28cv>opFYsv z$ou}F-<$9shvb)&*MXC5(CZYZH7A3bI=PBN+0kUFa+QQCmrH7KhCjy8mYk<|N* zR@#B}{jldA*ia)Zb)rPeAN0Eivg(I^$MY{u(qtyce}sC&c&|V9@CY(Yp}pl`;{&jI z0x}tn9X&^R8tr{X{qkVebZlcRMmjuWMwEZ;txhfX>2m|lKq>2ibt?)Vr zIs#6HQQiccs-c49SNK?f-3Nk`p_C6$AyQL?$|+>K5ZaBvnP})QcK8LI2hdr7yxW5# z(eOJ}Rgl&yNg|b!*~n=Kcp8Ol3+YoMl_s^pvEj^VBf+0UB$TVli;s$7UI|3%SL^_A zHXgZU!FvVTcno^^@IH;t7<8Edo}ZBF@;ZEk^L;w+`SBUUd$-ZxNA%&gy<(qMn$`oM zbIl%NzWi2sBjq!84gEA&lfOE`eAGn#K&}t$9?Byq_oAPl+1*&lIrvYIFTw4H9y{D5 zvuunSEO(JhAU3j1{bTbbS2oB+`;^KS)I#r1*-zAQ+Y-5ch`r5#W<29Pigq_4_qMd~ zJ$jm^;w4Q-qv`MM9(3B05$ERf5h?+k4WU1~?PYMd8}^w(KWC^+>8Jy#wa0qvQ?@Dn z?51kVBgni==QA$8vsvI~Df9M0RTCL)x6O4U#@#!%F1TKa`6f{pSywbNd*1i7Blb)+I4|FFyi}&niaM~Na4A!~ydyl=N zTQOQbx7&3E^b;9s(R!QKcwFDv-=OP5`Qz|861xd9)sfUOdy4tZ2TR|iV;Ltm zs9%ho{BEyk#YhX6oy-$+8M|K8FGXKhn3w#if1A&~Qc=Q8w$!Bk2yhsbm5Sh26831} zqbC}uWGp!BFSX3K*mVtw0hgNcy#PG(;j@J~0Pa;uk6P_GxyqiDa6d4Sw6+9}{+1o9jU-e+K6!@%bn z=w~{(9Y+0^!T(fgB##>}topKDLc5_@-rMFi{Jz=p6!S=ZH2EL+3gpSV=6%W^mo@O8 zhVS|f?F1vA4dyiXUq`+$K4|qQxb5bnNZHT4k%A<58aJQbwkMf4!oa0#X1?_TrS_Q+ ztn3ea#oR$|F_^tC^UrEDoDQbmv|3i-KRe|8mUg>fv%yI9w!LOA*c?zc5Dck}om8d1 z0?+-ym=rM53rh7U@50k__6T$DA^hw9NIHelcm-dhC0^HFbg|72$9qkr4-IL54hVhS z{={5$mHDSGlzc(yTG(iJe*4A_KsNdGusW26purXR^f&D$I}vV+kxwJEIgpX^9%JFU z{m?#+?3>c&7}^_)y?urEP!1n@2{LFR*)jo-dXwz6-yzL-`P+WP&no=hg+5>9b3ar; zak~-P@z80@+t*kURR4Ffj=sWE}IKzT5)4LIQ^+if1d zC&B4*+HvE0lxj|E%WNHW$aa<~@ZDL;;afflo+|k14(^p8;a`zjwE7D?J!|iQ7`;_6 ztBDr!P)2}RhwLlh*HtWi9Pec_uF_O8K7JkfSw-X31u-8~roRzj5;%)aeuw`<_5tlA z(6125Q3dp|xBR1Wk>)q{b>+Ny5Lz)t)eO0(s^GP3uHTkC8c@P0h}x23EW`u!6+ zZz6{raO@epj3717#_KlfE_{A!Z`+|d4UJ?X=jLGkTorBFpxr}uxUPb>?%4@yKWmfy za$UDX8hhvS 
zO8eubx@m`gKeoGdb@XWMLOohuz!uX?J@9S^{&N-V$VV1~?*p-KF)>(Q27a_c>oMSb zvb+gSRW$j??JhX%yl*dgl6m1bCqI(`ojdk{PNm!*zuB$5)1;mWg3cuyXdJ3C?-+l^ zJTn1!RE9osO(fs^!C7y5QBI=4wZ^a7jX2{vRUmyGa2tI>A$G6RPTazBYYm(9G z9=p|OPyO}g80-5{vevk9^{f2>{C0Hqn`_J;9l>XZn+NP~l)K-qnAKqBvvSNt!1ZB! z)kM?Z7{+pQ>y6iVi&kn-eixgr0H&0e{j5hCz%#M{i>}!7tmFFf#!dX3f;YYEgl z&U#`dGRpzkZrBz`#~%rMF~)|2rgxbI2OzipAe1ljS1Khv@w9!gnuS>EYYU`rjS z)P?F|s2`&IA9}T%UUa0bK|I;SxT(PSJjP7485x&gXI;?74eZ&dG(7e&`4n5Iif2$j zuVU2`jQMUrV>67o7Myi_FYX-i23UggY`362NvQnL_GMV*Lm|LzfHw~qJWqJ}Q zPvDp16-hm%h)s=qqT$Tho4g@6JHat2>FNxdh*x|(dDUtcos+{2T35}6gsQ`RbH zC+F~=ZxL^4D_hhuMnfZe1{@A#ru|txfJzS>On)S|wRc z-oR>JVU1E%J&r$IUl!S&Hb@_2UA0sOs_)&Lqps=Z@`n8bJP(q7c&by>Q~37vm6!rq zZSUC5x;C?Bfm{Qh)70C<5g7IKtDnw8Iz>{Zkpk~|n+Eb6_Bj#L3xa<(M@Cr8f|4 zI?!?W+?nTYJz5>St_==#Fu9b5gXgJ?=w-hR9n$!`-(0@m&;Q%qIr zCCN17tUX$0K|h7hr_e(vh_;YyPWLO%l>TL>TFSlnCYF!;9&{ElIC`pPFJ2kxWr zn>RDgHp|DxdHZMW*Piki@cd=Q{UPENF5-O9?lcwQGg5vqJD{^hb~7*e$yWO->kKz9 zTu1J~@~G?qw_Pi6!;;(aK=I->^L2sa;e>!u@l31%mtSlGOrXMlP)}Q zK5-#>ZiCKZXtx0Abf^4LyrwF&KA7jxw30{f$53Yh*6L=DiFnSjv{T#R5|+A-Rgf?3 zOb2VzdC~}uo}lipjPVEffiHkb>Ad+ky%~k2SZw|r5r%ajEvtQyzq&ln@3E>9_?ga{ zv;nnKWiX>=3wE2YvWV^lW0Q?(H5K0b$S=@HRmJ#MMs^~LG}>s(^E0%Wsq&C=1$>q_^z*l+nGcO@7M&D zZ*#=uPy7DGi$0>WSI&DsoR6k z8J_Y7$Y2K+xlZ{|qZR&mGw7sAv5I7s)g63}17};pLtV-pZX2+^6>$vb_m|2v^0^a%}D)BRY9H= zm#dHrMA4!uMu&(RCzE4z$1 z{Bj$l;?!<+0InO$Yx*Vlf0udV72-7)k>eKo3)dVl=#OUdli4U9@d#8e#EVJ$s@8Ce|{%E%xIm!*vcdRkj$7s zFZjDB@n%NrNJdnNEYfjEN6WXm5u+tQ=JH-TV`8KZM9UfS7xRK)q`Uy0=OE9itREU6 z7a#CwH}tHBCI_T})#}Hb?~jooKI?smr%uaJUpJiXwho zgcT-%&(-)Ik00&uBaY{8HLUQPoqrdDzwP*|AQP#dhn`1M9>vI>Vd}tbD*kjMq*Vcb zxiRG$b)Pv7hIS$Sz$~W>C?f9Uv z^Uw)|9|bqw@VgIuF6MW|?Cau46&Pz3X#W~sdOb=M_!mU|6r>x)^M|aJ>mUziHohgI z`2yaKLlcd7?+(6q6r;Bh7#oEZHG^sy_H&!H>;*7#~hDL9R9rtMkUhnY%s*dp$1|DO~Cq3 zu--UmF7n`cE;Gg>pzznkHyZF(Fz=O7J44>U_Dbaody=Qsps*QVF&oZa0C5Mwa|Km^ z-s5B!9A(g>neaUu4>MR*VO+%{@m4&oEKk$Z7eLrVa5fQ(xWfAZv_Fwr*O&)WR4_PI zSFSUsg(A7R)Cy8lSeJ)jXN@FSHO4-p@%@YC5tu9n&S}Ya?uDuDqjZmqKv;NW?|1z0+72)qWdOlm>8_RHf z{su_4g!k?cBkC)wJanGL&uc=zGl&qS(SWfC!#RDK(+Dl&(KgQpddW-VEw z+VUAM^Hgm#_XU1$F?x%a_f%b^5-!_R87r_g*2O(85((Vm@n1t^vziPJ9kpjwP4K5S zc-oiV#)8LV(Bw6HgYS3o8xN~~SYrf!ZyD0PZ_~lMU+`cqt5Hx7VxB0&V|dqAVI6;! z9aZDO!&3bIJZ$ZV?W$)%?UFsJF0)_yvTS4@cn&>TqL;(Zr}lt)!~R8NV>5DCNPj-m zW3j@$Hd!0|!slo|pE*8CUev^I_^hiNz-2}7HjDBH_7Rqf10BU z(P|j`f^Hlo$V{DsU7WUSbte2)l8N}wE&?!L=h6=!{8JZac5&fStT91e*Ll>xX5Znv zi=Qrrek77vsxugO(ejCILN8)unQqS0Jefy3^@%v`*0mV3+47NY1Gn+=s4kGnSe`L? 
z@KJ|&Oh3l1uRNvywnNc-HtTCkr0BFBLAk&Dt$VW09Lhdi9&b00e{@T%qze9Z85aBj z_*@OU43!BzVs0%~SsjW=x!xK;@{6`IbB)?nfwLwKGg zn)N~}Ms|^DO{<~AnFlfs;w9O%gG*nj!KWYfJDc7}F&e+SDcXrAp3wpOk78f42S3Ak z?$*`*GJtl2_}&}(e&Eq4uZmsI1Rrao zpJk>!@^?5~lkfAu@fsjTdEyzJD38Ri-pW3&uX@QOf*{_q#dL+s7+Gz;1BbPGlX<5j zIQ$cIT)btq*#(~8qMq~4j)Kpv>2186G;cFYAC+CqU(u|9&mte^c?)lQ^ZY0_UV@z| znM8RsWnGZ6%gktwy*XcJ4E>E`toA`ZiC}GSwB=$YefZ~M*QHoN4z}6}Z9HI4uax$y zGDb@n-AQ14A23}Z(`>ZSh(1=LJPXX|#5*pJqdN6#U?DXqugLS})bs}p>(JXAY~RKC z+u$j;g=#c-?cyb2|Kb~S!&GpqA{1s*?)E!o!fA7;y^7Ql;Cun2BoQuNq7N0}`~~U{ zqQ)y=rQ>}8GU&$lH~80^-#!KJQfO%bR{j`zI0Noh1uwVrG>zY$qMa9!%YFE+#IyZy zl!&G$!v9+8xL9WveLjz89)TtwgU&C=ItARr3m`9JByx^qrBg?KfX7P84=MFw?{YY@ z52LJuyamEkRw1BiJ@zSwVMU?rPfmgQm$sIQ!-_gcW&A>40~q3zPu2%r8)`~%ejZa*O|P?OTivR?JX zhWw~kjD9ZK?^Hw3wT^tMx?^WP@{9TmD_BnbZfHA)cm-n}eg3T8LYwc~JK%FD>Cs2_(cH?vl_WoOw++Q}h^1ZDVc@YJ-)reX z*hyvStX;OzH+G<|L2E|p>7kx+F+z96zRr-RQITF1%2?eKeMQP_-2(oMOxDg1NyAU> zO#3nL(S&}27vOU+7BofogWm{wLl-etg}92F?^tQ;!IVdnH&T=L{N#J^Jp?){;U@zA zUMD`0k7W1g*7UOi_}q`$6`5CZ!I{gpoN3PUO7e|vh*h?b=d=%0@y1Mb{FW@?ag|_!(+Nf$P2C^O~(@Mk2Q$ z{Ncwv{jO=cG13BQe-PipDWA$)e(Vo6rk8#)h;rv|_cJ5-=|g;@C33lFM}n(v-*77P zN_F%<)${>xZrTO#=ltIJCY$jwf;e+~=pVI9O@DeFEw7mrQ0puF=Z08|KlnU^cB9DO z*k!x3ihdP2^aee*m|-6NR-50!?F6+F|Gz)}^G?d0zx^KNU5HSwHBpS}li>3R_>Ptf z<_A{utL0mhiIuqh!S=`}9@##|XDoB%D2D>foeMntkXe8{g84ogsk)fdBj`UFS$3vB zS-dk=+)i37vvC9#mP>gi<0+2SS%GH+xvac&>U8#$7Qnq*=hTO8EbqJ7HVaA(kWoe6 zZzIE?lf=_z)Gy?FPwHmzy+6`VX4c5(X$vrB5C|P5)4>kM-xTV`V-I5}b-R?a_|2`0 z=7PIfeVEH(rhevIRa^5R_Q1htm4L*;=A3kmy zs318K^|Wd&zk5_a4!?NZ&GeY^f>Y5 zMxay}xZWMP6@!ye@Rfw$+KSSi#4(<9@WQ9tFDjPZsy#SbMK%zV(Tw9y)Nu56*q&E? zu%Qm%uZxbPLFZ-q`7QqQ6p!5Zg3rmc`z{kBTk8p+ z87m<@iua3%_x8f#1V4BHV%=w_lZ9{ zKOvMj$56^ci92_vy*uFZSdU&F2e0!$pC|FVUHp4C`0VyIX3_t;jE^~{EAz!!yWEW9 z?Ra^`J_bf*(G;Kf)Z`ouR$q% zC)2xV`P`f#KPW=IYib~`)Akthada+FKbUd%y=g`-PJ_=~;NR^JK1csjz~^abvl1=8 z44pXIcpAQfh#)OQn;~fU5lXY@`%F;HMI;tO-w*7Z!gx#qW1nOkBq8}{=(jij=0PJK z3z|xKHY4O|-VS4AJwbF>meT%tEt*%-+!<(&$k@I15-j>LMR z$EN7y9Y)~=xP1gZW0)PBf8-+Ci)iN$^693)wcCS#l=d^AIvB3X)9QS<4WoP{_BIB; zxgV{EV}lFPdK_HVWZvJ1Z_tBrQHi+7BA&*h-u&+DT+=@8DVyH)hx@~xKGm*&Q zc$019Fccl$z6{>JEdK#V-MXS4dycQdhd=cCF(*7nyuoD`m9pQs0zUolM7lHAEF*ud z9kX~Td2=hFA3@xF0ld{>pT8|~sSZBBM*SfC;TM?+$RP(OOPIm(*mHF8D7Sa_0g@~Q zS6$?@nB0jdB=(Sq&SK=%4}UiTO3l?1)W&Cqk0M6OZ16LJnjKVs-fPHyWLtXSEzhgL zNcWx{sv4t@Zt|w8=~@4*BW~h_UZ$$9P-#v4x&&_S+xNlS1Sl@zy+jahovMYjtAOXj zkYY>X2+L_rgU2!SqPF}5{#r7wKcl=n^fswXtgtFL`xv&>oOtvg?6fs`er?frE^+DC zK#efsBXt-*)A8ebVJU4q^`oixH}gYZBrpfOc9s{XCqu0V<&T0IL-2>mMaORI>Bo>t zW%9=Rqs`^i>wq>c+7a5Vhij0RHx=tFmFoHttml-SphqLGo3^T6gq?(e+e46|NGIJH z?ZuNn;qaE&sUAu{ePn=kIjP0cRnI^#;W7|tS!dKlc4 zfZr>i6AA8)!{_8-c=Um5MC${grr^X9AIqqG0l^wa#jZa;#KKODcN zd(z87`48=9g10;M7`RObpWVFLR-VzFu=6p*oyViEOzPJ}J5xRTpil9hG0vVN!2QaM zq_yNp&u1hw5nuBJ-oOQWnEth8_4osPJjKkG0KPi>P{c1>{L>43bt{tyaIYeI3pdWM zAd+XMF{|X`54VNiT=Ftr;JsjQ`Eh!cNnB+p`VEzQvz+?mqnoEZa;w7o6|ut-v(O{A zK4v;nBIn9HkM+lpKQ|lsgn`dbI;~4xdCY^`vrQl5Q5AfigB5oqzitLkkJuH);d3f^cX?QwzZwZYy|9VsTTZv4MrO34yUB1QlBxt=s`%!4*HBd1bS-yre+|Judq?t~= z)o9J_1HOS=+zRa-ys!j*e+#(uhZTwaQhXk zJBH^j-cTEVb{6)0g3N`b^a=lj^+Ffgev{vVJ-u5*>&f7J26%gpmHh;G_91`zHCj)F z+emo0E+0{&J@vLi-$h8?06k{mC4Y{!ccJ_sHDi&?X0YHfJd9sxxfLy30R!AP*a3b{ zf%o%_g+%@@4-UI|`B%y((^@G0-!1lo!qk7@CkA|c3Q&cZ7w^b&NJPPZM0H0rn|2X2;FCqIAcD(9MEpiX>C-GkJo5x`V z=j=21yMgql19(^kBz#(p1Vyc#tH#oE_5r~4BIb~1`5uVHzXX0p(9`MQZYA`whW5O$ z|G8>DRO8qmblLHF#50_K5sTlwjM*iMIL3>NnhM}_N2sLmeIfJ%_W+90+sO|Zf`w)g zcUXjt*5^EqvyhMEs(PFi(|y}a_eE}Ha-a4av2qF4&tX$O_`6Nu8w3YGoqyI>e}T94 zIe8V6k+QY<`dzpUm6m!o_9f(13`R~x(pax%Y=nZ_v(Q=`csz@K#)8ide?p~~cDRsE 
z{b%T1Ii4?qPB{4dI&uv0)UVFW_ZaOu{9mdE@UukP;*a-4V~h1t=mdG<8D!mq+fRBT zlFOhUfaX_#=OOU>678nJ=Ni2dIz*PhUzcI_g?~<3PJBd;sxdQzN+B*Tg)^L)5{~S!u{LEUk zON`FE!)S z5cU&arFJg)drP1qVBeeh^eRUF z#!rvOr`u_UiaI<)Ot)I8Oy9$u>uFZ=k1q@OLcZa|C_e zfOqD@nD~JHCSno&z^{3{^Bwqqm7K^o;HDydn!x)r(b->&n?E^owUBXJf`9y;hkkob zl;mN-Pw*MZ8xx2`Bzsy}ht_Y}8S*W>#ejlidG8UBcsbhN&mPccp7O_-)rhvy+AQ9? zZC66qtw-E>nOBkLM&3wdXJ#$YH5adY3N5@uv~~ke-Ol;9%nQ-uS%rjSx$j_9!9O?7S1QFoo&&Xgh-uVUayp-?vkyAMI52L-Y`0W8|I`7vf z{=5c0*%PGqL!p!f4n=`b>FRy-0`l>^h&RzZ--^KJFDd853HaQYH_Fs3w3|&X$LI7Y zj`NGp@-u||+!v8jCUN99_y&%sXK05=I5{E(Skyr9%FXZ7p;Lhx18A={)-V=-J06KI zQVS?g;>^ZI^mCc~=O-LD?E5{-93M%(-CF2-b6(J`zhc4N6^y?y@cvEa;Y_}}%tSwr zAI<3%{N-0XssV^kUWgJyv2Dny_xY7O?+cD`tT#({3zDYP}=A% zd<&P>`fcbY6Zd!psUc_msfYh=;JeG<9th5+Qqos{?4dJO58-=PaC$D%^CAEG6Urlr z4?he2BI%|-WDOY$K5vA6Fu1*#rv>=o>!9|)z6_rZpI_1rKPnSHUJKWMg3li#%@A1+ zjs`KZKZYN-Z@x}{K==zWkvXoY`h+mHbZ=1;(p4X-4zv*S@y$bmn zUtzBiat}PN$;##`e7b$E2YlvYXI}W5wV~)`w!ja2+~#xI%K*=x#~yr%b9{kyVWH-0 zv{(dQuVB850G~gnl*~=@E#-;$&+kI50`ZM6;X9PL#t-P3oleSg(0qyc6zWmn@ejPs zdfa>nHJ1;)jh|7(nSY@?RoZ})#q?~Tar%ms@%YU_=w+0dORq!0*H5Sqx*6x+WyliB zA97-AuK5qovxz@1Ldq?{e{O1 z^eY>BTbYO4{@m-PHP%$D-ZD?~%}ahUZ$K}WIL2a6{|+!;IREA}^Nt%Y=ir0Wt$bg> zD2kF(o_Q*cvGpqY%pubG0XSNKZSCUySn9sTzg&3#5IX6+vxENUz|m@??lNrN;hT%G zevEeOfLJf1&uspEi=77(yLks(tjg~%W8b~t`6AEdalsg3=}WaR!g+ zIrP>5AN6UTE=E@8@ul}s{ygOki6p)bA5ZY_5_rCzT+l_7HwN=o(*McKZ9726pX_AD zVj{RVmHDDF?=HY24j`_%fY_>&&l9xPnE1mAP}k-Ay~&9)cOGdF-sB>@(jDNrJOBF` zp0U&S0J2Jd!)x0Ew&oHA+Kw+4Nxz>YCc1}H+I!eZYyh7JJ>`{|ePW0|)K>fCL+t$m z=T_Fz_Z%X8U1;M+RyJ?a?=x$!iXsX-(0>#J@o-uOXp7c<9Y z>%MzYbBU)PF5dAi)^*9Qpxnj(KT+TD)Yntq70Itv+i5otT;0K^7yE}_!lwbJcTzt9 z-2M*TmLng{`9n2iKkc}EpY_ZW4Y0Z^;Cy-P>~HlG^~1>1SdFx9b8clVPit{LW+9(V z$*+3}4U>1MXHu&V`-v~(6)xj^-{;`k6}BXP*0}bkCcaj&!16VRo;Z}I96b9>!1IN&+DL*15R&7PUQJh?&7rTm@iV0 z%U=B}xO#~5*;^=0I_L^gd>{pX>p>p8jW&-!YRhJVegXek0cDmzXbHeDVf- zWm0mEbwzC~x#_;hvSe#lG%h5 z%kZnerPpW4%iBPCCg-%bF$$`I+Z*7H++*rRQ_|1;g&adT|NRd3QbD?yujnWFiQsYw zeANe+o6zDI=(~Ap960CVM*YBdPNB*q%AFh@H=oguRPcBmm=^@@x^WjHi_JLX%Ulos zy4;AjSr-Js!8|jYsQOCq`bDg|Ir()?pV8#w4Q575|olM8{dL|4D}C zCTP1|#dXk`3*GC`e-+&Robq}^G2W-w&q4PLt*zqh%XVm=Wl}3+v(j;+zoOB zEhVzkH3{7Ofmz~Xz8?ag|A7A#+P`eK$fN2j_;qK~zmPrjGmf$OrhNb!{6W1+?8|*m zbrUUhRlce`NSvQptO+uIw$`WaQJ=tQ;!{nmua^Q|M_>y6WGW5m7jjh0IwpuGFDdm z(M34%iM>1zBo4Bl^;Rr?^qM-_&-$A+aUz)1FoJ_&zL(~r> ze)}iX0-%3|n9&XUggU?|Am5gHp;%0Jbpo8fVi)k;;q!cY8cz8XKI<}ny-c|`_c-7xdgd!e`76jj)Ex z>I_53pz>WC zqGvEu4kPbw9oCBfLENYvUO}i{LOXeqp|`+&oD_h=@leXcAI+w{0{s(Ql4-6_&`Kcj zldtG^u+-7pd741n<9DQfpZt&y;ETvP_#TJFcG1sZy?wyr@2D5S=MQ{V;&TP_S!41u zzC^y>@`!f&AfHoT@{FT?dOvo2&(70-QXUIFAEvw*Klw+vDg~EMAfE{GG!9W-Nj}iK z8I|Pe>T~cJO1^bpo({n;-iI}Xk(d1s?Szs4ag|KkgXGQK0c9@QUuegVcDHFS>~%kJ ziwoGpHG2ep0vO$!bfKH4UGgX0%!IoDXCF&aeIuDYjuh za4YiTE(>tR$tMhat%bdc*}?O>Hr;$r?PT@~H#6>Qlkf2())+)Q;{g5Ogf+PC;uS^s z$th4MHU}wB`J5o{&y3Qqz8EI>|_+U7|Lgf`5mn@KFx2)AsF0t{#O+K^iQ;= zh&%sAe}mZ<{K@m)RNARbPiL4t)DM$I<|7Y3E5J*a*}lf?_29q~zPt0dpPIL^*B0`j z`2#D6kQMmbO=XLGW|qNkU0Ls;8wpOYg1gCbgt&>Kf8W8ki+da~f1{-wIY2#MJn~cK z9P~M5Xa3}Ab@2HZdWr;}ukjhr2)YPYDV#++%y?-;&o99(@gup6{F3k>E;y7x1E-Ns zCQmL`c^b_KIR?E_G`kh6eVMWh)rI`5c5)PX)yLELh0!z{^h;M+@)-E+=#=6m|48eL zp!ASd7lY5o(0>Whiw~gpF?f6uzP_=~z@Ou13}a>{iywZr$XvoNkcqyO(%ND7dtP`N!GA6Y%sYD7ue%%;g9EB`2sK$I}Xo%)VtU%JFvu^tMEo3IOXCM+3FX5va(XE z=zl9d|A7;BR`^VXe-Cw! 
zS`N4G^HwDHh1`M41McOxi+u)hr-fFbP->`dAt&NU)OUGJb5%JNLQ6f>1*|p+T)sm& zXL%_P<8vl>T$hrUdGB9*_T_~6Jv&F8VN`k%N4`P1ANkG?urbX!zf<&Ev0r$}gC`EB zUB>-Z=(v66ufg*WH1P%VMKW}+fy>UHIS1e67#BO#k9aHT?B}_8kre>+T|WO|{BF0N ztU`RY5Nj!~&pMv za}I9{R_iYndEXb!G}Omwo!o!@8`9zKCveZLQ`&)#Zd|m3AD4I19=vvbQcwLU_E?|z z^dY#2#xMS!@?7E^>#>7w#4G-$9YdVs5Y&?Kqu)giEy*YO9qaKXANmShMick&R?fd# z3crE;zZe{M=Py=)r?IsA7P!f&dGPr%+$0d!c#_X1oDbZKmWe`A?)Jg|qZ8=warOuA z(+($v!2by7ey6)4t#`?fyUn=K;IzYuRPs1nHvCrn8APe)~c3q6tzyNQMi17|A5)5u7)ROceb#Jnv$E@D|+U5Ct}%{A{uT z&NCbB^TM8NV#G}h2k09{00=ODY9OD|&N@rit z&r{x0CG#HfKIpcDuO2FZ?;+&dC8+=?wN(!pr`!ple6f=V&WNWe=lAvmN1MR!VV7cgj_9f)2%Oh z>6^4ihCR6NPHc67u8UXn2T$F(lor&lg#C>ISDl><0FQH#@F403Fe8isKb@aEUyr7C zf0?20(at$L6FTmE+)`bKahJ-umRRKWyM12&i(~JA-?30$#k`RTg*Ey*T1+Iaalli) zNgwpg=N~X{WHIW#(o6W>Uw-t=OV)l%93{^H4aWUoTS9$U}PC~yN=URNglSIy^JVXl>$fLW5w7sN>u^u}dX)5p@kqqOn zTn0uv${qbtwBt@jS2Y^kC?Kv8j6DQ^&;Ha82cH8}Th;~RO$fNhc}eKF`|X}GO4Vds zPo>=9`66((GBfz=%ma71!DhV)<^5<`2|ja53w%C`_m;~2N1@nHJb3&u*4dl$C1=4( z&S}z~i-Ubl93=~jI|BdLxp!qV`0L_A`2O@0#zY1=P?_lY7s}}E&}A3jAU+M#pI2$hHs2r zrQn&l^Gc!kxqa~w!@&B4tw`4HUBC-Z9;IGhU~jp^wo995vdKXa#L zqH-rg7K@9IAD1~qdh;mf{E3P}pDsIO9H%#ygOqKN^crGEF3#+ByO&ExwH3r<^1M9)Vus zsF|QDF>gI&{wu~F?^EvLC-;#{T`blIoQy@P0V)D}jwMecUU_a5P}iZ8%O=dLz`Z(@H&bDh2Z6^Av4U9Q)IPMsT?WeWAI7=t`&e%x z@eN<B=>#X<{NoQDi%E^k@x28IS70{KrfmS z8H!N#v5x;!z8`CBM!rW7c=8+g{BQktIXmsnu`Oplse$EkFNcTDdX=a$837xq@9wAk zR{1l(gc9M`swWopm&#E0xC3S{{c!8Pck#cU;eT)KOFzSiZXBz-oz6JsMk#PF z8@iQsMao0S*9b$O?tTlyxDH}pF&h3k2LVnyJSzbAUEWS#T@HPPgU=6Wj|?N~a~m7? zf%##0!1c5xoP%+(8NrJFpVon1c!pQDgNF!^s=_}Q3qE5bPOB@Pk} zoe<(0{>ZZ_=USq%^E;eJybrF0a^I-4yEOdi+v4ug+@LK#gNZX=fgeBeJ5DghGKqhb zhYt77G9NT$bRK3taHoXN=tRoH$n)@Le75C0;YF+`P%glyTX!APfn+o`0LL#fhPmi-k@(GL7#9k^_$c^XpE!j-<06G~ z**B0&1h{*La^gYYyUUCDlz2ok?fnV;tK19mmGJ{tL)a(0NbhUOAIu9Hy=*q#(2wH$ z_H}y4*?n`LRynyue;uA*HBD%DsaTVNyoCFblCkqp)=;irkFZl6iGJOk3+dRIy8+b? 
z`M(4*XJE&FbMsgxbmFj#tN4YV(}N)VXLm<$p2yC8h%?N?SLEI!JWF@xya2h!(dK4( z8qCWAcUxl@4Y@_eohvL6cgil9HZ5Mzb^PaQ$d7x;h|9LchvnuEyqpWkhrImCfyuxA>H#GaG)u**A)U_UZq_j6}jRpcBG z)|N5aQ^^WAPW!|`!GE`2n+HcTsPhGURzU;X*pYMl0;_3ZBr%Yqc*-vC6feRfhYjrN z4W`e+I^3Oe`;+;^;rVvF@WIH<7o2f7Zk=EriNwNAK({mN1y(KeCj`zi89zx#EDAZ- zrTh-E3upGJNV}25H}ZJ?kb5@*R1x@+qN;fCszjwS9>eKX4b=j9R#7pu%iVnV!%okc zszjAw4^31!bX;Du^PgS5M>TMmGk_|^LuWX6=Z~E>=F{bo4pZ66?eC9O>Cg+|p1VBw z;yx4japQfFid62z;-i$u;lHj_HL*|fzQ9ki_{hI2rJv=&=MvAjSO(6M?Ia(nGWgg6 zKL1<3lKxa-T(1S^gYaiIt0E5`uBY7`?DiYv=JLe8Q-vP-U#e)v&p^4vd}X=kYM-hM zKe71FK5D+KkP9kDxf>(Sf$!X=2aboRNkpH*bSVBt2=V1~6@&fk1HWHkXCNBf%fmYT z!C$u?xW_!yQL5pG-$UC4#GjoW}^_kDQ%DFwaCn z@3O9mJ%!0`=+$!a^T@ozQvJv+W)z-LE45_}F-20LzRA{qbg9C$wck0kC9 zjJ)HCGy5a2EWYQ_|9I}vxXQSSC%$}xR&&AOOz1_)bNJ0J?mvV2#o+ED+T(U7_WQix zH=MjWZ|dccr*RkR6~XIZ#&eYH;M3hC|Dnf!PREZ<0RQ6fd&AUCZp8f>ejI*op&f2G z29MoM5BJHt%lC}W6UN0Yn}XXBssZEah$*6-1n{^o`kli)qZL&m^1WuNFpor_>C5mx znUjVo=*6vU>Ont}5waC~{e$ndq5rcxneAz}9QfNEADY}e#_x5`n&GLzpW8oN4kicS z2@YjlRv-=Or(3CR1QT7%Sn#{aoMxstNKA!|QGY zKF2dF6nW}5QO$XdM?$^k*k?oV*u|^6sg~#s|AhLj7%!bwb?QaQP}Kr^@kOik@i#cf z2JX7|C#v??i$WWX!L3a2xss>;B5*K?)lHddhaRJeQ|D4I2ERI;{4&P?!&7jM0 z@VP2-cKLPx&c9D;m+vrw{lZf4?T}r7AM1sk&L+Mcj9e6FjrUhyUFTOEzTBG;&U@Z+S68E6v;>(N*uf$0K?+eW zsu_&G>ipzf(}2$(ZIp>ZKIMpS6!1NTct#d@6wA4kROn}c)Ac+!PzpY|a~ds88sjKY zn(@BF;U4&x-puQ*!Mg(5FEdVG5i-J*fZNCHV3Q4w1#`Z;fd@C{!p}q2UE}amOX$xe z<8UU9yk~cF0{1XNuLQdDp%=wCTaMrD^81%S$K5pbDf~HHf7jH6o8L3|8sC%0(TMLtKD66c zs3teeAf)&>_awDO@AtT$KNGnUmt()rOG+pyLJqln@8Qv_#_tUW<04r(d$B$hJTd3D zJm8GLBzYB{+Nw_MyQPsq>-<0q!Y(D+m@Spn5bimsDjhv?a5wEQX0#IHSMKJTCeqa- z|3YFFqh$y>Nl=4my}h&pKi%2)t>9Z%WFM~DgHL_I$xOx}w~2!SMG9X=l@fm&0fM>v zmqxNv=3?1{@lzXckNRlhDn-a)HvBioYh4DNBJ}wT9%)4oayfa>^@&+8C*IKk`MtrM zk_!D5%rD)+=5_e9?xwY`KsL9p`Zje3lcTX4{Lf_;+DQhn%V%2;z7Iz}e?vEi@pXWc z)y@y!2{QL!#c&CaH-iy)lKIQU4cS#xb%=lZ;RnVeRqkv=-rQ8ni9r|FiN~M3P9A0? 
zIP2!Abo}Eua41r>f)4kasJb3J%mu%lo-)8`hyS_YvBTjeY6SMnO+A!XL5>YnC+bJj zi=iHztF0P1+3~$EdJL73Y834-lY__Auz-oGBjeG;m6_gPpW|TxRDtDK2D1H-j46la+>~y;)i^XJnA8@Kfraj+i}@b?$7t; zssu~D$onoI{6=R+ArH=E2Wt`Wo8m554*x{G88wZrwdyyEs=j_6uvMs*LOZ zDW9RekV|`Ty%uub_TO?p_WPQGE6MUKpGC<3Bi#o&0rHD(smjsbD&3gzfxYV99y$L` zxyyTgM|Yw;fWHS*>anZC9=`lJ&(fRn`{dWv^2q<3?g(Cy=>+|1;Kg;_8T$z#?%aX# z%HNLw=hNW(A>|GygTbe0_znU8va!Q3X7ti6|S(^gRxqJ1aOef0YIcHLd@uArV zETO+i;B|fU%MHx<(LwN6*A&w(C)U9&x9{1D__a6lahYkaTpoTa>N`Je6!>Vl=dqt@ ziTz))-DoFCs^Tv<0>8ZRs~ez4cMpfN+Zgic8X?aV{!U6`>T?r=ads2MK42r-Pvv`i z_>YrywD01&YfMx4567SGhW%CmpBuwp9C>u@s27Ic+zFhzY}cB$@EHR>m!S`D;vBu0 ze|Fgo&~fKMjv5y&|DO9qr+^##>>=hUmp6OIv;uX;6OSHCe^1#P%p3eQB+4c;3%v@y zPNsf2Mob~|QzRb4479KwiyMPp_Yr??hdsw3`L1}N^XWqe)t6mGY=(A=DDTLs{%OM-9uG|{wLseI<0<4 zoV^36y`3Gi7~&_}K)z(DPJ}0v{=7!?xh47&;v2ouON#6z0^D3C;%%3K3tsrm-T5RB z27kF4ew`n<5`czog6D20ojc}f-~Anr+Uh*#FXF&QhfArdGwq~;&z*T*6+9k{Jw$-p zov=GjWP+z|eV3z#fD2))de{-g|~aC#Y)B zt0evKM_pWPoSMXU&hD}novMqqzk z@SppGkA7IjSjwZp=b`Y|9XuWeo}IPt@xJ4KBk}Mk z{G>1*e){kHauB|g;A^YuhfRcm#~nO4d{XsNZ8~w7AjUVgI&*kU<{GI^L4K`BluG(bq+Tm$?{WO1lXN}Fk z|Ln#H$=BUIdh4P4gMX)NE&R)T`rktL0I$N??`uc-E)-;Vn^M$w-H_65gM?uQ?p4{o#tho>XQKf&RD%ctvN>bD|~ zVS*JHK4Joh8`M{%aT?3qEM;w2w zov8(Gmf{y@VOI$<*bGs1!0n;nwu=id#6RZL5$6cILMKw5h2In@;#^8y`kO|aqZ@k3 zN)h?B=^iB^nA`fWZgk@6B#TA<(~ncpF%F~ zUVXPtbAQR}1xCNi{eA)r-OUXsc7t@b!(2E_#0=r`LZYSu8Cj$0+c>d>;+@b zXV~Fv2ClZD=kMdSuR>;>RB!g<3P6q;_>l$dsPzLCBEZ8D&}qVo+~F{HlHuiyMYfH> z=aZhe2e;yalwZmKP@o99W%$oy;nUq^ItUN89r5as`0KT~7knn|yL(O+5O=5v8k2R5 z{T#DTdT_oyxVn?k)}0-sKB_wUS`EIsa~_`)%ON|JT#bJCkKTCM?a@a8QH)^px{w)f z7;@K0q#gCEgV!nG`$Brr6FlHvV*Khfy#EulQ;7d_2VCZqD!pk1zBVGeqrP(Y8TqOn z=rIGlZiqd$qh5XHw3<+BL;R`=s6PZcH6%%m$ByzPOig6oOk`G?hdqRY)8pwMH=5x8 z=Ah|{>M75-XhodE3w)}<^IWu98#?Y@{Xwdhy3764b*b;p!8HeeiKZg?cGTy_b^KR% zDzvYffKGG3-5!i5cYkR=^v+JO;%|KLd@gwJvcVryqp=_E1y-}5pT>Pi3!vABxb!pN zGo50|I$8*h^tt_RXCnQnRtIL1j4{TJnRi5m}vZ~nF37ceun;4khOXYc#%vT`7!E8 z;U7DEjwYXO8a_}U_lOQdzTB$@p1b_v?CC~SnXV0G5+1by|&d$!h0H2r9tH=?4E4wt-`iH=kFXegOEe*Rj zwiZ+no>XgJ-ETn6wM?5ggDA_R7aw=cV>&Y*DW<(<3{{j1gRg)w6GfptRez!lkk9Kz#%N+3xd$E*{pjQ=H>tX*5 zWaW3z&-=jVlZ_kkyDZ4=aP}Q`axalwABj+?M~W-W4;|>)k4Y`h&Ewy=6yGWQo~gbO zywPzB;EqW}xE1TcpLv2~KB}L_xsn6;H%|Avh#z>_orF82BYmI3emeMq>M!Y; zF2RRPzsmQg|J~tY;Nes_TqTir#`jo{k4;Y|+g7;kmqwh^5coa@Zcm1Dk)``E{)IwZ z+MM)6{Z$`M&!;B?-EUi6`62N6q;x!a9RTlx?jBctY~D*x(8rBPZwWk(eEhLLxv$Tw ze12P;ogPRIgV@KPYvn@m&4-Xf~23qPUDD?ARva?v{|N-0|63y)lj7 z`EdHskstaZxZpN5aS7FvJ=oF57(cCjzY;FgO4^E}s0q(U!@Uadx@~p={A$2&F_avB za!zou{%y){aTfh|f0B5JI3F-4I~%^b`!hQSU1mCVxz&wx*(aruxo=i>x&EllK0Xs) zqr^p@rCsg$H?CLSy@xFXH{UO$pAFg1C#rWE{0=^9C8uVW(r5V^(M1LG zKc9iGAKgEDHo3VM4?foQ+y!v9v-jR)9*K3$v*wEm#`CrCx_8K*z1)Eu-zn#r8vGqg z(xFK&_VY31+*Q1D_&qwwH@Xa;wZ&(gQNH&_a*1;_ucJpM)9QlQuYxc*IpWJqKT3!RJM3Pvay!uqWH)@17iv@9){q z^YJs?NY2uu)Y;UCo&e8BC41@TIZ0oB1GnMo>t*S#)^XcIhLD@H;r6rlC9D0|8{Zdu z@O8(!bqdRJHdLCzuhO4>YS`Vj|THT}uAeMt2-`s3sVi!}Uo zjnjzRmIExkoE)71gp09%D!PC^Oo!W-=)dN?^k>4u-pR*G_N?err38+kV_rT<;!$ylsiRrK~H;_2%1PsbcR0zNO) zFY=?OcbJ3MimzLuU*z_;A6dEfp4*cW4^iD^O0})#q4x(A*uicazov$@I_krhk826Fq`3!p7F4;Pd^BJ=} zcLlwwCa=Z{?QH~~yU>Rn+0U22&yDc;Cj8YFuMv7ROI%*)+n2=-`8CQngwO5Baa1x- z`Em4T9=kjCv(DmYsYrhh&CfPY3-Ql!)5Lh^xhJbn*3JAX@d+6}^un^=@8IvKpnd1# ze``2@oq8IR)BSn)tT-N)V!|M?qv^|TTD`ih9{STC{=SaeOPyP} ziodIdxV+7J^`~OPk~{wPC?oqu+$~t(hn@UL zIbMEa(blj6_<+S2=jMJ8ksJFdmD6_Qx&v={ly|-dKS6KAzO4c`HYq4xIcty+%7~Dt~_(8R=JPN6765^Fu9if5rDP^z#Sx#d(C9 z_SGs|e|)I?F2-XDCnK(`Zs~8fQ=HyNUz|M39w_PEv%ci72eZ4`ft!j}*?na_L$dCE z7fGJokN&&+Ua!YTL+7}Asehz+ySvJM8J(S9&R@G^J=08ojmhrRk46rBihP&z)n5sH zuJ2ay?pu9AJq_UR1k@bLR!mG!GB( 
z+!gd%oJ{sui9dImH)rE;kvdC&uZc*&*cA(z(K0(&6NAPxg@aHj?*f0es_E zH7`V-$=Bh2#8<51->FLe&zcV+F7AKklbC;g%#Kul2e^I%IagsX-%_@>BD*sDPOG$c z61`mu@7pE)l4k78D*TZf@XxDP)W|P4pE#kGw@edyULn7WerD|DTg>n73C-J(_x6$h z4!yQ}tTyZCmbr+1c3-FbMz~sseBgSqb@S^l?ty)()Q6S)COP}!9+`(n zlhz9?z{q#^IXk|(as4=)jT>-2f)wF@`;C7m=C$|W`3_0W-|?Dy-NkD^EBu_Fkjq^4 z6n4M@XBwnOqIVUwcz20owbGZ->+$d}pl^edD(M6C(8?-(zQom0{1~znnGGK)@oGRC z?{yc0_>3CtjgIM)#;H86>1(CFZ4Vcrmfg;$$g>hp(5v`qXxBIP!Db~R(udN8=zFOr z^z)c>TY52>{rq5QKOCx_@avv#zp$qI4{}554DDK~o_Oyy_T#FXRgc1FWX8NCeWNUY zz5T)(+PfScP9mS1($~xK&+*?>Nro$T4}VuBz4B67{uTR{vWl3c?;z(!{Lyb3hqaQg z(-YJa=T7cmw{-OVeb3imKQG4b_~cW3jxpYUO5eq|j7-+?RgKe6wRcx`T9&<9j+eUG z>F906QHMNv-m{m>@;yttaj5)w_pvW{XtVdx#qaa8R!e#%4YS+vU)R3hnw01UohIUhI!P*5|S;~ zzqn`Rl@fm*GjBwG>MP6xTbZx#h1;?2dyoAb>%JS!J3AWZFJ@00N8R9Y(0hr?y8|9( z&JjLm{%y~{e4_StvEO(<`Dgsk|78z|?9aE>?y3ABkJ5+r#UDN+Z)aWmbv2FR4cWu` z1+wKY;>QVwd>EYDk=^|$xjE5X(mTqRvAyyl?b*TGChqpjUoY*Y5$dm@{=V>UQr7f6`UHcY@pZ!}Z9I{&qgyd*s2*UlTvPvUtJ#5O&h4e5UtKch2xmc8Hy3&&7J@ zk9>m9+3t^h9KEsq!UyPiEu1~*bEr3d$gcgnD2k8O6Pe0x)Bl6)2!EKyO=#6boW}mF z6R2qm@{~ayUTzfs{I>q9lXN$7dn89|!7KDz8%Kq3}QU>Gy!sL*RxTQ1fxZgZz?yZ^v$a zD($YnjzG7&?=It4yf5ycoeKfRY79{8}5pQ55xU89|0m%b!|ApDc7%|nrg^H%t~3w!TV?ThyQ2(juK z2hZVSjySKikS63-*}6UaT5oIT&St$$$vfoRn0*}UvsU6fo;OZY-#=1MU3k36`?q&m z{Z+WPspx_p?;R~(BF-;Vf}4xk;r-M1l#BD-^ZhP|a{7Y)tqPw%Dfyqno~@$S`lTHUR+8(ldV=h(ymX7qu^xFPxoG@eTy4D;I5~`e!TkN4(qCd${&9wCqB@cC==*J}4`zh2UNW#@T+@8oyDD!e!tacT{agNa z_IRxO4$Hn^C;VF62A^Yof1G_B_ZFU`o@v&(4|#v^^RD?L@*|$F{CxWSTJ{2aqiyoO zc_`w%9?n*gkNeHDnBN=0IOvzJF7lxe~Ulo;`1}<#Xz|Iq;W0ICVxjgec|av`nQ*S9?$BRy8O|fmHZ#Y zuMzz?9gfdc&#`&T?=_uAIe{ET@o(G#dvkdG8G7LJ2jtVAhX0gyRX&l)JnZVaB@VST;wol? z?E7?pKhZ8H&QrHT`~8=+uXSs8coz0dSN@N-P`8DV(@FX6|CZm<4p3wta-xX+Z{NfS zsROSj!o`koU@*%i?*EcGh&*FQ^(A)0&OA3&vbeAAaQp<`&SqIn@t6EUPrfWRqKCavJ$>7I>he?n!oK~ks0nXk z9oLqhwVn8)JpBSaP`$P6 z|GkZWC;rlE*>t!x8%_p&0e&JMi)h|#f?l7e{^-ZebUPerkq!{|vNZjkURB|zI8Q(9 z!<)2*UTu{3=!X)ou1asEKU?ws|70BfP%KA}`fufLXlS)_Y`RIm{a)PWx$yTs&u*^C zzC9;hZ~j_aJf(chgD>*e)M5WTsGfds=C$zxr*NUspRwf^jiHa zBd>aU)BikMLocc%MY@6?Xog<=hM#N``i~{O61=Hv{BBCu(chnoCfVmD|GoGLY9um; zryI;`Yu$_WSvih^UNh+ihySLxpE^ev@~O@r{Q!9nVt@Z`9981)2)(KSPao931Dtc& zz&=d)qhs7v=YPImJ%i=z_>Ep|Dh|q4m;4``-KCzv{2)K+-=B+P@e^`BJ^PRxSTtFd zZ3zdKDIfF2^6XjiWC3ZgL${;}IBU&jXPl%I;n z@EPlici?N3za*=aZKwSEWxE2mLmxZB^?%at_SsjJ{MSgnHqX>%pE&=mUp^|ngWoaF z*3KXE*#yv451+sQ%ZQt#V~JnQG5;M00AKZkzq$R7VD-IxEcRsMZh z&oKT4S^44n59W*N@H*sHgI)cI`uf4~Kg^FG6Zvau z(zD8UmB(X4dZ6)sbp922R^^X~b!S)SzJFyO{$3oHe_iUsxp~;1BiY|;jHC7T2fr%i zd|}D|aQCHrZC-J|Qr;*Fy}Dlgb|st}T%|v2+6Qcyg*>Pt`LCB3L@2zPD3B28& zMpVz*$&D;imGZH2`5InNLvN6EfD_Zu!~ZtHj(cZT@>V>9b+vz%-@|^GEYfxXwA|lH zHS(hzB(qrL&DbVcp3XzR&<^|R;s-d~EDIc#Pg=j;1wXrGPL=R)G*>?C=STI+Uh-#D z&Eov#9XyO7pBwoj#FDUwYh+#E%w~FZ@fz2dUyc-K$0I2()Bn*Qg3qm>{|z!ZME>(f z$szD$75t6(`nz~Mx2Koi!s)=z)yntaKUtY{$~wW}pUF4u-ftjGr{pkkAAi8z^~LYz zrNH6p>7Vqgo_OgxS=^FPJ^dTsP5GN|AkU-uncHOb+1p*yoP1b1?ClO#3w_fj+ViP< z2n)~GVL#tv{O%|YuTnN#`5EXj?@WbLp`SgKAI85l3tom^?GOJN!G{rPx2z{V=cXG= z{Xd-lBJv&Wl(y9Gf574T^s_R2e%SoAkLMbYr~BH_L;ufAA6I_&1*qx#$hEuI4}I1+}|s@;y7du~}Q*jV$#&&te=^)=RIVLt6Fp)ZRR6h)c_cR4g6Gr&!3>nTadNJ|BvqXu2|9!g2VOEcffZy z@;_ER!T(9{tA5gjAEcUkzJ||LvWUYt9}YL+7dj%Vla1l`yGs2b|7+RDqAHy)te1_& z&)r!~@~O`MyvF!#@7zlL%=v)g3GWSkcq{u$zjt!3rAgMG-oER-C2qjqI6wCqdQIiO z%l<6&VO3V>m*)H))#&GU{LC9l{iy?A!>{sm)-u}#dZh3#)EH-`hC;d?4MS7UF+ zzCdH^^)~b~%Y*;bMfbLBr5zwviD z-)*V>mT;qEUWNSMD(TU#!{DrFcKe1Ml&{QAu35_Y@cb9_F7_2WWP|bZfAn+M(Y2GI z{Mjvy-__`~%JR$6WsUWG5BA=>?zj3!|25>7exH2CiI47>wPwGqfWu*@-H@*|?#99A zP3rrNAEOO<)=XYAZ!|-{Ew9hM-B7%eH_)!S_6%EaF=jF6J>r+l$<(yNK*(-3nS@L??1#0$z 
zKO3C-?k;mo@Y9?hp%XkD=0>)^#f=6%q`OT-^`qEl65t&CJWY*Wp%o`v6L6?#wfFSWm>d0dth z?XAaO{c0Mxy^#g?IWN*W?TGlFI%!SMbrsjqi5x^!!Nst*+oW%ld^SqkYge13OWHaM z`@Bv1dWqXk(`MhvKRFD32Yn0v0Hr9b9<--^qAQoCXvf6@FCc|u=Ldys$1 z3)Z#d^V{@y{LJS4@24L-CZEImz@_im!A;QrN;_m97w(tOMi^gJ+0Vb>XQt2Io-g3@ zh_ZYg_GG){Kz@!!+7suxd--(pIlQXJ57RW;k{n)v+d;3c_iE_B9?q3?$O`$#`eohA z@jC@hhyC2s_cruz^1t*s=&@h6mG6;vbgp^=Z|7yX@`J_Swa)B{ITzTq#M^_iG;64y z1zCUc+)x~y^(g7bDc=bGAC+~>+S1#Tv)(2BB6!&fpGRjS%{#voS7)uX&%Pms01sTbBLxhTmm5HEBm^M z=X&sKv@PZT4LiQE^6zCm;ok4X-|7kZuglhZZ*TbCA=|>bDS@LwZ(tqY0nU7*{;u%0 z0{b)M*@zw8-ukXyKEiue747oYF&M zkjrBnIBA96o}Q2OzPiTy^pai;J~trG?#ZOQT^8{u$L0e{eK?Gr+W>m)Q{rlyl3u~Q zc@8|SlMLmL?#Ukb13nKb%b$;47e1epkHg=$#pUoj^!z&XCQ$39d@y@pO>u3>e|>g! z(6^9pw0E|#_@r>ZpmE(4J}3Czg1@A%d9ay$qBZD8WB!YRK2P9h?nv+dU_aMHZz8Yg zm)^UpxV#qn(Yc?ziQnte;J`oU!re&U);v)m8A_iT`~OdAcXryZ#o#RBr5eDoz>Ov> z4bihk+}P}&;tx1Hv(yuJ|Di8Rwt+kUEmoEE7>{Gvi*b)hlVlv6{@OW|cKC@*dTaT) zE`bouGreO*98=P#u&d`m(LH(c+R=x7?92q-ZjBx|e5jF88$KUyCnolnxA~WT99(V& zS5GO+H;07ZrX!Ne#dFjnr#L4#s^tG}JNRLLo@eAm9^HGbSYv&04|+T8dOR7K{ZahN zPM$(Owf(-1J{%?sXy@#W;xqn@aV7qQ{oFG-j{jpgIW*>f9#hg+|4U!PV%k`IEqnGB z=;5#4Tt0)hD~oRm`8TvH^3+u@&gXf4NH)4zJzt5cw<}%JUx2&yjEmRP3FKB2 zK99g>HTH80`Zvq$Inns(oqVO981LVw&5iTfdS_VnqxP;V%U6fXEzoDMqlaX_xwo-$ zHleJiE<9<=4^|sqZzxDU+rF%)9^8xD%|^npSL6w4kDx5ZD~m%Av7c2xD*mHY=g zUl`BVhR0J&`f=!u(ND-i&h_~_CKz8e_#>LboiT8DB!2pc-xyxfFUuMmS7V${8KwWM zx4kFUFAu=SxbNsc$`3aB9?6E5{rg%rSicN%US*qXXp-9>9A1|Hz&y|_>FfMTeD>jo z2)R`kU)Lh*?E7cRH6@R~y(jqjHLII-H=q8PZ3R#MDYATIDd&dl%DDf%YCg%lmWbCF zUAC)HN#B`YV%zMm;urWlpwz2o?B$MdyOZzh=)+Jr8Tc@oUD*MBTt3!~0&W1#xAb0l zxl8(v@V0;U9(!P#_e6gk=6eJ39}WNe>6dx=FmkGgJ`BB=d>q5!e{b>56V+2w+*~L9 zGE`jNAo3d`-fnZd&WD?I_-`-G$K$_Nas~cF4%eXvKHunliFr40I^=wP-Wh$IIF8}y z8`;mHAA!$Z$Z-qztqir^?C&0xe&zUmUHN9}dCm8@SN(mTabEQ&_#Js=Up2opVu$>i z55#{1l8yOnf5;i15zjG2Jxx3zBZ2kxnqn-R?2MC1-WwUl8<`i6NGc^0w6}_px3&7? zCrNBfep`_U6O zbHLA-kJ_*+JCJ`fIGb3fw)eT&e%?%%)hK)4`lWxmb=falus0ji=i%rn{2ZT7E9E?e zecju}7xygD+{7whZ~PDZzHir?t9i_@d<{ZC={K)=-x*F0VOCL}ke zOUU&T`H`lozouQ_?Mi%pB|XIY>(}CqvV7q4_!6I=Nsl1+O~se$je7o%{%yM0{P;z& z4!sI}FSI}Kc};paK7Qi03p$S2hk?)4;7Q1*Q5O8zXJJnUy>B);`w%{~Q$GBwy|Z0> zc4Mu~%zly=Wgz^FdM0PHv}cgjtkC zHVMD=#czyPeu#a*N$|M8xVdf6tvum!H>=z?$*r4y`y<;4pS93MAevAA(4H8dX}%XZ z)L;kq)UR&L$!D7%Wsu6Fe`zlJIB>gTzC-q>bzeI;7wu}IeAvl-;p2Da_d)-bADws8 z-oWSSSp|0SR{7?3{U3clc=(oic3XHKIJ`KYfR9Q1%yac;Mfkm?@!Np^IrtkZ{|1j! 
z=ySeR`QCHAC+5w)^7-0ZKhAfT{kuGmaW+Dn^AVY>N5xg14?E#HzXv{Fn;%L4zbS4n z`K$?_gT4hn^!D1hj{Q8Xv;$v5Z>s#AKI2~1ck@}~*@*qT6**6Fu5b$b$GNtAJMU`( zp9dHhyW2k;VcfOikKPuak-u&h`dRGc?b&JZzun4qG-A(nhi<>Su_y8})J}Hw8FuwZ zIDC|r@1y=r;+%IW@vSM`?9BVn2M&J*$Kr;|==YKA+@A7#baZm67Qf;c^@qJWQT;tc zC6D20YiE5l9XY1RAlQaF84j|Ww^(@5_a-3BcKUC+b#HvdFNhsZqOeT zdAgzaiT%9mzqq}b{sw+l7H`7m9mt^}d>#+qj&%}fiGD5IJ~KzZ1P+fcaeIx(s6_t% zwd#p^=X3P9Ke1A}8@u;^;x|IBG#f59PPUgP={S1zMNvPUiynTAE$QbHr*f8(%U`m* z23|NJ^e=rxdNh0WL-zB|C4C%xbJji?1otZPgH1?xEyw9(<$D_!lhb3h^AG;#(9=5N zGlIUW)0lf^KbtQW{7XLqE;nUA?~bXclX{}|DS$o zdIp?-n|~wvw&u#5CpQW7{ zzw{p7=O$HlZQEpFQZZXX&(owqw(I|=w}a1tw;vUs7VWZ~%lG!qPWF6vTDs7BiiZ>~ z1`bTlj-?j^l|RY*JLuIN@WVfp9YeoG^=GHi%LcSD>Zu8jmw0Y)vR!tf{%Mu$>%Aej zMcGmGWk7PU-(~%#jhn~O$=Q+U1I1$;tp1MTnm6m)C0Vq080|b!zts~r7jmw}zp)2B z86{5RSoX|M_6PSU>wg5_b>Ytq-qXT5{W0$iynRnQ;Rb*t@~4ULl1o332tn~ zpS;*O$lQOmTdAKvqj%De&zg6J!jWIh8-aUk)zbqmeV!dn-#77Z>|TzGhH$7Iy!|sf zRljd28ls2&)+nD@;&ZF~Jmca6_gn2rE^f+FzD=@&eZ9rz<#(M=*`wrhG~DU{1%~8j zD)*!P!Oi-=nZ6Uh@uAt*#?j1w=?md(on$H;KL#HDT zJC*M}$Mbc_bN~EU^tgv`7xdcB6^0!!L>{}7@Kee8?#=pqeSWxl#<7Qwr#IF4n?wI= zJAZPB@>{^^Mfl~}#ec+^yq52)zh}Vbh2&9RT=Smf(9k`EyX(if?B;p&u#)pAdy!jH z_&k^X?=5aRaJC-%c|ZLd_`JmTrAG8I%CBMZ9jczje%YnOpRb+tmo2QQksN5ew)E!R z&7;TAs57-|jXX#Pmi^KMy$gEBWHEh<8!Z;2^NS`s>6f#4uusG9du~)dgdDQ&v)dnTd19llC6^+>CHkj$)WIhwR}fmk2HWh+hm|{jvB`Ju!a2K<~!x z{eZ>NPQ3bB^@JXNryU)<_ctroKa1C#PYF8DI{dB3p8Lm0>nX2PgLGg0;%4A9$}fTE z_j~^jMd$Qj`d~*s4Lvy8>B#Hh*n9B#Abi$M#-&G+?~e8Xuh&m*!1jFDOIs;#4iz7L zhjzas|8vw+n;m@+`Og)teYXBuRm}H2>OTUWHc58l_xO*`3eFiGS@N?Cy(#@!n6A+8 zMR8hsSjqnx=*{%Y1#o+vecOxKzfsRk_^4%EU#|XEwCrZ~Y}ipRr#G`lhVXk_2%mps zKOX^CBYxu=`ZAiI`4sQT;qzg}L2XeqSJKY_CM{fC%ICoBYW+5hm2(<0$6N@?l<{9f4juxj8$Ne4M|B&+Wx^+=?E0^;mX>@>{Tzmy-Xt_6LtF>kM+ z(XQW$Jm0LJjo?=c^8Y!z6dq)7d2_!vft#^ESTDcPbKlz^jQ846%}>SW6!!GF-v23l z4ms0={3z|&fq(i2_5E$%F!ahzBJi}8`lsfn8%JL|KN$7Y5x=p#tY;$p4?pU@=rN8C zS8x4fs&go(;lHl<<<0$aZhj?w8^RBAuJKyU{jc%fI^yB3(l0}Oo}=9|QT@yR&1Ly< z?om0L{MHsX!|^Epg7?-Wx9jp{#@Uy}6Y2@My>32eYFymr_cr?F4e|+GdfRg?q0-CR z9qXfy%}asLYx5I*-`DztZC5ylkBESKz0%_g+V1>wE8^-a9mTQl5>|OZr_h;?_;(Ik|G5`rnkN zF3L9)|9rlA_h6R(J>K_O@kGgg6Ml{h)W4VW-&bhQ^X~6HjvO2EXIyMtjFxTb2LAug ziZ7CoZ$o*Iu2g=m^WC?ZM_wv^RZonUO-^iyY86L!Ir%)#{~U5N)20s^*KO<@uF&ph z-C}bze(Lf!Ux<&zNyYRbdi$k3{U>_AdrH$Qv}-=U$hFFSQ;bYcQ%_y~=WFTxPVo6A z<$0;nGfVxP2)`O9bKv)F^yH&rFLq_vUHigcStINVUgx>ji$l_L&}%wpa*O(R zuK{OYFUx;q-iiBeHY#78T~j%G#^+de^=Z4G|H~vC;77I0`CGJ`eh;z;BEWtPUGz6qL=b9FU-h3FrN-!KR=`XhwRRrUeb5T z-mwnvWpx?tda0P^y-|L?@|ED~r0o7|O!8C_cpUW~lKrTBQ+D?4nLX~}2;~E>&dGkp zN0RK3U6svFo+wU-%Q0Rq&tA`RevQM>w=$1k&)yeS*W&Ux275cE&88;pxt$xHZ&d=6`~Wy+Zc zv+MotCL{Bb)AU?qQI8DV`Q7^0jmg=y{3I*c$)_tXP5?dPdw=)-?r^vVT#WLa^GESD zP(Hhx&9k4n7bWU)q6oX0*{!%mKRoao7K*Df<#eyMsk6jRj`@|l@G;d3;< z$Cc*g_r=$RKG$@w;3Ga~!|i*FyBCW6%kupC`2*^o!{2eAardtHy1<{>@b@Lyx4AffMfqXu>09X4D*lblHClzv=9LPg$MDMBL33faWd9l0pU1tvN4?kL@{}eB?&n9`_ zZoC}-wbmVn-Ut01^y#qwApNjjJ;TkzTc(?emFPPue|U^PZ5Ygc*$(|P^j+Pla=P_L z-SiJR@+z|b3gt7F_)G3indF>a+zmNVU`k!=dSu!K3 ziryXl1@v!y)=OWJBWf$VxEqVGQ*JKnmoYYdM~vYBcIGDaC;Y_gi)W12e(6X0%{@ls z+eSNgBc~Vj(}Z-59Il%R_p}yGm7DCAo*Rve-F=_PPPjDvrl?1sr{MQS{GXWq;1=z# zirdrgX^az#%AMf%Q}H#$_+5b>_VC2?N-;K*MJ>8R6~3*IJvHc!(?^vbB+j`JU)UMq z@z$04vnsjDT)DOORWsXOUL20U*4p*0oZj!VS0l<^E`VZ^ejI?0EA0jUV0=_I3R)z0 zy8XwUG1~PwO^%z2IzY2VWH(qnRP3(}w5UX&ykq5q1nA-p^t zJpXR`Z81o{%(r{6T7PyDWwO2bexlFF=ld%9OXwY}77l_Vmsw>GBDd?&&qp863g4G5 zpKTW=&Ry+bcVb`fneTHKcIL&rtlgB`Pc%@(e;;TTn3;SmF79)=Y^o)L>~_vH{?0YZ zd&A*}vQ^|?nO**iaTWG*HCFu#;+DS@wNk@s@%vI94;sxq`Bx$IN&ZzzAqcj(pOaAcQxxp+hLOI7&3nzy)vcC552>j9U(b) z-{kuS|Hd!qRq?aFh^%g-d`^E~Vx3H!7 
z={(t4`Csx6)!%?5|Fh|z;!QPmGzxxD_G}p4g2vui?24bt{y)lih#Xd7pY-+jq5K>@ z$*!Hh<3`xHUuO%lIf<1VcNFx|kDZK@PD*W|%nDZZRM)(U@XLpQ+-+|BfA9F?Qa;}Hbi74mRB0#$G4Qw!%RAn7) zgvGI&)`}n5?J_XyIJ&XOcsmDAanI*R{%rtDmMM3%cRlX+M*d$lZ6^0nB_8oiZqm-O zLCaCA&2ektWSDV{qw7}bOXGNP3_XBb!5-`hcCRd zP_a#RF^q3GH6*Q_yorx9>3uCYSr2_rQ8hzFczr{*XQ_KUoN1!|<$+)7`A0oxlJ|CS zB%)`Qq*eKjTeG{xAd}O+@HAF6{k3U)+Fm?!6>|8CKlvy6(bl;ZQMe+`rlqw=eXV=C zf4AEugEZc&ddE@f>x1`WeKysX9pU*`*4O{RM+4uF^IWWdn|kg~Js79YoQfTCh?o1vczCt=Z~BF`S>&zyN#@%wiEh#QZ;JcUFN-PUdIx=r%%v@i_r~7)Fu70n zh8OkIv1to-Z|nxYSiGxGrxt@^~JncUQyhn2#E+&;r zyU*d5j@^fq#oBxWxizKs)yzLD^sqZRv@1)li(PD*{;c=TrrB`{=}$ZJYeXwe#ZzJx zT8-qw+W0{ok)^1s=QHzZtT0BPt}^$2!oJ>vq~(gIQ3uhMulP%wxcgx~>s7`_*xi($ z!C6p#zinXmL^DBGb<8wlpPlUBuV>^(YN9pGAO>4O4_?Wp|D$-gI0Ux?$!EOY+{;M0 z#aQ^QxPxE1wNh>Qsi(l@zltYW4;9$Z=aa!ie%{3_>W7o}iswkPMe=EJr+<6-d$uhlA&Lh<4ub;muK(>=Y;>fuMvrtFIM^v}wR<|}A zmo&=aG|mI;{P)e>JBt$+Rpgz0*oXfVZxt`4?iwZIUa8wl@V=w>CiLMG{4AxhgZ#cq zyN=Daf+n|$4{0nWv!8i~b(hXguFJN@|C{o7H0Dd2C@yg-Ur}4}Yt!`B-Ni_Fe=z@-$J?R1%#y{YM6_`F{n^}Nho>2$Y!Itk{>t({NRyPp*EvtD}Z z?_zQKfwjp8&PPunjraIN+R(M?NfYzX8*pGlwxyBzc5z35?gjR<+D+2bLK}WAeBi z?&qRb9;j;G$!i$%W&(ILO@DOD8~GOJP%3Lh1#ukzxcja>9G43LeS7#9_Xu=`pVN%2 z@p;_lS&JVzo|3mQucnZEcg|J=a|gCn_Itap%HMx3};|SA#kK6zAm5MB@LZ{82e8&QDJE{8;*ZCqBd9 zdt?4QU+ywE-A1pzTRfF#W%+ya&#mtdNbbsGT)kPm1iz!65At>9kD2f}aML*~^!RIS zz5-^RoqV1*#Pw&zueqE(_(_vVqFt7oiH4d-K63--TjsSkdhi45k;oiH7O-foJG=%Z zZUKZXk&nAO*)`M7ihQG!ShsP)JL2Rfla5;x{7UDb>0f5SJ;`kk2vyDe)edE+(trIc zdvsqj(pYc!ng4l!c65P0e4gq%4>$G5VK-wjMr_1#Eap$X$;^=YWg*n>f!>-;axw42 zv&yYClO6$EMyh3$T9>Q+6Z9{%>JqJ(>0NvKy6*(bDp+m zvAb`AF7#y?^wX+l=`qH9A1z%gZ&DRj^fty)UuXYzmzQ}Y%l|FsUp^Cqawds|ujB%D zYajf)T0HIT+iT5ex->04m!F`!RovG_W;{35#-SwGBRx+%Tb$zixTuS-hUWFX@LfCI zk$<8)i}P`L?ZSJvjd+w^@b&_@HA%1kx9FC>Y!15|c2wbWxlf!#uRx!o2i%)X>X&QH z)b!Li&u6qRh4=kPU=%<30CPxJr#_3~JNY;lq(kYejJEi?g`}Rr$4^$Ok&Q9({BOrj zexHBrDST#Ra}2vYPL+;HQ@*MF?c)t+g{<1G^IYCAZlZelI<}=$UTebme;M znd9vy?3i3sJXm~|UhFJro?dQ^(lMBe#r{=GR@+}olM)-!RPkrjr<5ky>_`zmUIS)1|K> z3hPbu*f$%(>$!$yTMI5$FfKpj?QhFs=xg2JE;jgCM}P0Y57H66DZlqbxx)T-zf^cd z+$ZL}vT7%7^Lb|LkBbp`Pk8)|bA_GN;v`ER<2!J;8uS|thok?U5XpOaE_L5Z1zIP+ zd0Aggc-slyS5ZE8bDHq8#H~`bl39L__2^)DJpvBR$h)D+#FCe{-ScC_Z7+WnyP=2O zySwKa`D<|=w-{7&UrJvizO{2Iq3uoN=WnJxP1wc#lxijpI{a?^e7choj-7>%$h&cU z{u%tc7=AZ3I;+V`*A@4T+0Bm{XOB9c64w6r#nUCdDLlW-xc`K|!cDC#k(cxC_-y1{ z$_whbK^#X!RX=g|8vHtU76jn(v5+`jeuBn*6`kn zbhaw=_>;9VnC~@Jwl0ptdTdkv9kS&9Q_;l>3XO)PEdWoUvWH3sgd@? 
z&cirQ{lMn`m4D*^5{T0U-Dtv=#_Qk3RDIUed;5`&`(bf96%zd^zAkPftpOAJvWZ6< zQ`5btVsZ$}xtaIGokWXh>fKgBv0r-}tnJ{J@i;n0**A^xf7pQ+z@!eIJX&iGpeY5N z|AB|$Zb%#VMIDQNuD_e5J>X{tYtnH_x8vcTEGPd?d<~s>_w3i2XX>NR=7oA)9qrPA z?5o^9VJoA89p(4$yt7l?AbEy%S&4sHnz~Jc5K<&d>mKt2kQ)mlrkih(@f-IA0aN~_hM*zxj1Bdl%iy&!rh?%Lk2I<$jF}byV>yoIMq-K;`Z}%^uRM|+~qz6 z52Mfnw{uv(H9tn!{qnPtM_v6iA1)uICGW$tn*0Tq;nyvz?#F6D&h?TiSyiiIJ11#B z_Luuh+2_9(FNs%n=FdodFzrR2-Ni-6-bDr3Rr(rTquA;FNv54sOpzt!=i*nk>_1k> z@A^IR`+S#*v1PAR%62two0}g7l5=P0T8=eCUSQvD5w3of7rHA+kK~WvQ~5hYi!SEf zdtX+@=gjN($OjUqX8EGBv+Vj@%nyG_Q9Es(zLowD*Yd7pm*W3s^Ty@;H9wK#W5w%w z`{8V=mEiB*(?H*Dr$3&M3uUc*6z<|NkH=}OIA8a6TERMKH@oQrar-NKcns}qEo%R3 z_VD52-Q0D;hFO@!Ee381%Gwwc`h~swHEU*J*2st&X7@Yp#*jxibMlrK@^sI2FeVmc zk?o?Re9d9iHL;r=x0-f`->qqIbGX-+H>ZtV^65si$e*k-ZiljGYqQU{_1sMDude(D zEVMb|DB?Esy_4^=Y5LtgIo>D785|Dpoea%zIDDNOvwgMVxMD^=4(|L?e9d0(PyXHW zv%PnD(Fd;8hX=pG|MqmPcYcu1)$S#1>G#0vt-LkjG~$+H9$~mQLk)fNi6wn@KEcR_ zMEKv{`-bPEe5(V8N7JIl{Ko@mVQct2$afJV=uNeEd$`sfitUz%eb0T%8~u{HBy^+p_h%=cZr&N7 zC+<{Fod3Npe}tdx8#rB^mSyf6oPgfU_j|2t*YI!H`(b&#q+NmMH|0;Ge*~Y`!_@DK z_wzCMY$i|Aqh`xz>>q};|5ov%dTQY(#Z6`Wz7L-V`a3oIh99Y_u@?7du4AQ-K}q3X z=H*CrMISMxyN{UT4t`PhTWB=GNA z7}d&~Plh4khuVt{Ooa9K(cb3jJg>y_Zf1M-st4<;%mPS zQp2Nc#V$Cz0OIbA=c;Kx{odSp?j4oxs|Az2>!+el+RM0VLE>BcwWoUkrg)~3_=<>j z?adOKPWvJfyCpx#&itKq;M6E_cH8O~c{`P=ojR?a$~B6|I_U)Z*@W#s&>bri`M$+= z@Cr49?Qt*Uy=3{c_eD0jzu@XXc643u{V(l)8|M??Lzfwo%|SV<77UYe;mm=ozEY8v|hNbtmkZY-bC$oV^8YtDw4Pz{`N`c zIHxk!(|yF-bvK7KfT!I_bGU3M(|xuO7d=AHJjae5dWT3vjf4uwA>;OTB!>s)Mlpq^Y;{OsrCvZZSKxmcb4V!!mVTy%3z67oX&-ZZWBKIxZ+wo*#;LzMyzPws zkNg^a^ag(^dTTA(lE1h=`{|Udzh13xUogf|OHoo?>BYD5=6Awx5AhsvTVE^DNZquf z3i~wt%;6UvO&eP}2iRNrb;V7-hyQfB`Xi6VMOjDVu8C+QCv%O2i?d07jr+3P=*i>s zRyN$2?c(0pskm(+N~&LS7|qM`-uM{FAG5o$TTR}NPm6i_y^HmA!Bg~Ywm*6eaUDBZ zXU?+^*W1(M$wfZ0eRrp@7;57I7_cJ@63e5uc{~fEi3Xj;Q4yx!a7?GpSRK47V`GrXqH|rzAjG9 zy;OWpK4Ez$o{szCSJUxf^ejydMQ?&Oo0KCmV-kG}TSx8!?P||%jQsbZt>d()iSIF! 
zcQOXL(3EC+sV(`p_r0B#4pnR96Cb18CNsuxr8?`!X{6ZEU)w0(%m|(YRpP$8-Tc=P zAJb5}c>j)iC%hX+!}iE;vnRak>y1a22=DxB;G4N&3Eb+ghLh1_zvBd)PEy+yq}Ec2 zGmVl~xVTgsy5sx;<@Z+Na#$I>pGgLDeZR(kyZGyVc-PjO&ZOl>>4)#&ZeMu$ykFb< z>m={FkX$z6yNBOCz)>qbeF*;VQYL(zowfN#mU%-xd6aryC+oIwj};&;V*xogveFsp zPL^4G9C6BeS9_Q9$-X|?47pB!wWh8MO~=@$+>aL3vp;!++Mg(fr_E{64Dob((%ugE z+bKClZ;ejtYu(ZC^gwd$Y2`f}XVuy0-N;m*!Rul8?INCH0c1WFeTKPaARL~L*DcZ) zj84&ssXM^UQ47)glhthdklQEw%cIdx>xa9IavS|)t(|uFJ+ie7*7`nfN!U-X#eG(j zaeE)Xz);klZdr)zn$?vXsDFMco=f9ap^>s7Mi#g#;+-F)1(zs4S8sRauQ1m2=WFTp zdh^y|1AMM${Qi*6@V0*N`54bJ=`FUBOl_i&%p7PNA*xISn1 z;Md@MFuSF*=ce<+%wga6(7$hsZ`+DAojb}}I3w_ab1V~iFJ(ScKJodm`C^&T^@;i8 zWNY5``4n1^i_6%#9C4N5>Qw#L5q?&trvu?{Wi9BNhaEmhe*WS9WohMm(w}j6uG?f8*Zn zDdZCOTZJ{=(m9nuM(z$icZD~96ldZu?A|l-F2=|H{LI^_e|2$DzK2#fkw>E~)OwQr zyajEk0iXBv-p1lL-f*`_yYvcj*adn%mhV^cw<3QBZnsRYX8-TOetsD}?Co37XY)|r zmCM=AK7Sv7;fGn1zhpJPLY})$v?TTiw^?FkFo;ZQ`mvYmb3gY?^&HTw8!Q=)Zf|c{u0B~Uc=ivZrShNC$ysVU-}`6a=SHYhPyMGVy`NuiSAM8> z-VN1a9dx1h&S0fK&#vx61E2F>6P#XXG@gL>4LBX*^FtK3liKg>D>u{4{0A{toeQ5Y zQMz(+HS6GtO8?4xw zTd8|cT0?*D?tYYk-rlc_EjhyK-~e`GoWnhXT{}XH_e!^bYe%z(BZg;)IKGMHn&?Ql z*NNO8PGd(@Tt0m4iH{55ZDYOuDE{K!nWxgNQFr3UI5FVGXY3aZl)Y*@IND1#!KH9J zcaO?^{q|Y$W4bFHngM^qJJMb~mucs-?B^p&xxWXW+j#G@eD*!83pS)vXxXmfp%3*r zjs3nHmk-Hvw7u^#f0Iu~Bk_BDHsPt*B0Exh`@rqFaOj96&Ej-HJ@(_l{_W~q;TE_% zl)ra7+_SB-#n8-3C_BLO1NeJ)rf~^7cu!-ax45@u`032gaJp99BR(T;8EGlrd8vMH zg+9+`9r14NOoexgvg1lS>ac7da%tiF9PJndpO5psseQM-=uK;%+u^^LsQ)89-H3fZ zn>14Kd~@*8NBrFuYVR*tlIOG7MeOGHF}{n(WtR`3qbt=Hc^)pxVx@b8==@Gb z%GUB`>_*N#+1bAqm&*Qfezpz0_(#0*49_=nDxxLZ;XUzr`_qz5;+2En55-rv;qUuH ze!Ru(haUFnB71K|=TYVvA+y|rwTCgcEx+-$q`(^rhhsgsA#>NF_IJp~>u(vG^TWug zw!9m2wW}{Xd7|3tz~>W<(;D(RpJ_BZkqOuO@E_Uf``8`Jcm1sK?JS54|40^A6s(0X{D(`{ihO-5dIx$nFmR_sQ@%_BEDj z|0v_*lzfi);``!~{2))aaE`DKH2IJ}WG4tz6gTAuzU`8vkki}_Ys@hyCw zjsMsmJV*Q5!slc4W)E+^Og+uL<5Yab35c`wW?emeIBI8YKLM(RM`F49tHaJCjK?@( zexh;En&eN@?#ljKrj8agXbE~3Bji+1$9a{r{o6uY7kOK6I&q?3V?>_gEs+Ur7g{-i zG&*C{N^k*5p+A>@Fv-YF5m1{dXH2jdg{LUr;&XIPdem^Kp~1BO|`PN_r(e ztFuIQFxOlxer;Q`xSK=LtJUxJ1D5*@YHrV8P?scH^HIyX%O1YaOdy6FP7X7J^|t32 z{;1fyyPYIQ!`1M74iq~P_g`${(OFJjyR&y2s%cVsjM8jod<-yBPKKXuLG*nO&uw8p za=KPjPR>pDA>F@=dFcfGF*mt3jnif+e|2PyugWhyM=jgV77giE|FgU4}7JI-+*K-ZeW>r3=bRXE>?7WRd+N6{_k*wVe}={Dm0 zrt9x+{L)uJjr#l}BaNTq*zr5ll&NKT*>m_q##=4jF0bB{QuCT;hpBBx^dq6hzU*PK z==63>b`+`f6gR$y-n=Afp3Tr3KNb6Du^x7VYqo@Tj+X84aN6@tu{c{uUVk`0cpg2e z54ZOv#l-o*S$eO9_=&(lY_?)EDdf`S4g&I2^vhdm1@) zwT#r8J7u*WZid0{E7j8o?jGd1y3QFMskD1CvlHomoadhAeeIm?TB4sC`@Xo;|Esc7 zz5Tc1PW9Bl@m1MQTG`*}sN-pYGX~j#xa|O+V_u!^9Lx6f{6K!^gY{P@<%jB>CFMTo zNuJB|(C3D5zo(IOmpJJ&jD(3vwfq>?z?$MS?VE1(_!2%&GPAXWuYn&a|3&yeE5X-T znKXoZ-Su08JnV`%-+ioEr4M^}I)1x}%ecUEb>Q;RTGfePWuE@6m-NXmQC>WHeo`s7 zUY_qx4~MxwHRQHyew=^BT;=D|ewlE+H_qkl>b-HF*fu_U=z(4HQ%X7R>bb_A+)sVY zM3BwXj`}2YI9w0=<=FhF61PvzcO{R$@c9&4JX3t#@qS$;zi8ldJ8^ftXB0Lh#-4J`8w#=<*^>`44;?Kzb}f{yeI5)`R&N}OXtGJ==FNZ zx1Ni2^v`))qo9rWj-7nVogX~h_eRFXx%llN(&Hj>9jGs__r0~cuJQgj!F?4}Y(th; zkw)yiT}_%Dlv|Ix@5Umf&BK5?2=)y? 
ziumcPyuBT6-GIwme5VrIDfg86!$Wcn^jOMDeuTEqM*mRBW+d|{EI5jV;od^yzYhQN zCAjYdhv(7vd)U(vTgbBxhhtuT6Fsti)MNjxw;xnLy$wH2;rl7(q=o#%2jH@+{l>f1 zCgLf52oA|In1+9_zqscW==K!T3(Rdj`H>Hx-3{UMfAHU&|N2t0|H?hDyC^00kUbgq zO)Tag$k`>Qv3K194xjfney2HSvH*$}#p&rHEw>K<*9VwG&h|adb6pBQ8*1rsaJPs4 zxWW4(|Jv#48ES2Ae=wrsd-G?6ebJ2FeX-f4F+awIMom>XJyR`heLqY6HGQ64j-w~w zepmec#?JmkB+OIzXsiEj#&`J0zE5v55AZLQ^MZSW%s)HQuLju-tekQD)H7KQ&x;S) zmxgxccQ}uZj&&X_biiWOG?zY}A9>OZ8fFc)V0Q<9_eJpL(*a z1TJ(?{xq$t;P>;?(*QpIhaBsc<>UU=qr5lp|Dr5z2^;1-_~|SX`w-bh>Zx7&Geq6P z?GsCnqd_3$jg--s}KJUa^BHFK_InSZm&hy64GeNVnQv8(2z4&?WtcG`*N-#Cfh9twXCMXe(5#?|=lhu_LBcr`He=C4Dk60^sEPbzD2)B{*9}>zb!lZ9JR)I z@9VUyuKmL+l

e?|ii+;_$8`g@*8W1v$6kf4&HA^z}XJkND<0=~GMoj*CdIIluF* zr95|0K2T}De5v36WIvy!)vegmC#a`ca$0_@*;jrUxEuGpoa*_Am|2z|NuG`QH7@jY zJLd*xvbfeI2f^*h>c1`z`>nOlen!Wzbg6bsG#@_6?vE_Y*X5)1-=OqV^U!oFuov?) zw5ni zbAN4pfOS^In7CbgThWN!;ny;XMk(vLE$@wR!} zQqTQaT&><%kHmSIE6DR9PqeZ#a~JPg`0Gy63zzcNI^T`=zvUOb481R#q7eynOCL7B z>>?uJeZRX4*Q_#?gjUHHeZBT=EPgZ-G=!V4k-=fC^EL2%wC@|qse$@G(|e29?KRWm zJU^EI`38K(dG8sJ`*bsK*ui$*(_8fmx$34kd!*{BVXei97#{}9-eS9BK>HTbKkxN?p|P=)#b0g6*?BhBKCF* z5=&RH>sHRoM1?ylKYQ9;Z( zN6a~aB1V#8M$Y@K2k-yA_akRdO`q=S>gr!tS65e~w{vIXLF2Pl{PRh{jdQK{h3E443s z2wYu6KbxyR^t%&(#3Q_rE%~J{!NoXs_i=P(MA9RB#_fpq{3GM6z~8akcd_T0B_GiO ztI}xqNzQ~jPdmSx%l;S%_jbfxysGnvRsEEtV>Ur;ySVok`;lE`qkX{md(kqxha5i= zhjFf7xv!QzV*b?dZ`^D=8uB+EPa7sC+h&jY)=c)nTh-g1-{TrmBj;>M!T%ii9ueGo z!`Y4~yJw4vdJfC>^?f#+K9}_BiNC%Ny^4SMBK1dhv4!@KE#dQn>Mw`em-*`ecKE$& z`>S*{J_Dbx$^t+7@E_lU>yP2{BGRl#7Q@jdB)tefac+KZc9+ss{LpcuOJrhplX~lk z2l`Bk+us27k zu}=Odew=aV4{NUrJimlKG~nlWjCRq`{3)Y22wq=jy=xAiA4V0qnLmr(mi>IY+GH@v zAIGI!I+mGi9)fVy4&-k~RpLr>INB6t`&B|;jFI|jR zm3!Z3_>_4iKUc4t!q@whhi-Y;cU|PEJJ;&inqT=&fA7x!d?_hUg~zA(`#AoK`_$f9 zT-|J=wUT{(3aLzVKjkL;@PFnL&ARF70{!Z)z9+3iaX$B-d>h&{IK3(Vhi`S{qXc%-&l8IzI=}lw_AO`&Md0-E@7-wk@@;s;~ANV?t-HO=+^6g-%{C| z{MQrj3)R!vZ(h*<9yq$0)FZ;?Hs8X>y2$8`hFDh{&+h(vmY$dM-P{Hj`}^Nl)9+pI zyxKf_M;zVdsN?9~1@P+(HLcN)Z&^$i8@CB;)PMNvWn}dOTlzipOVB6sB>o2%_(GYGy#yu`TlF8yw!2iF35}j?sxgO3eLa4+G*ro_d^99?w;OV==;RHZ|<$!_J+s@OTVMG7l5uI}-rrGX;l8qC-LHJu*H5F1FUpQKs(U(TypfNwQe4H8 z{%yy9{EYfL^OJ-=SMztgtA6jI`+eZ{e82Cjr&njMpjX1@2hjV=v+<(Ze=Xgbz4kXg z->aS;aQYE)66cQ|e%gDj7ahpu-Rupx`lBDTX{&yAM zahf^2uRI+0vroP(t+8H&fAVL1bw&B!zG4&&*_b_B=u<6t8D7gIzlGkq^9h$@A6Jz> z4PAEUCwWl6)|I-#;a23(E`Nv=r?IP_)$8xX<=v0o)jjQJ%z`0s`+nSsJyxy;Wsm$5 za`b*FJ3Y=lcg&yAPw|OxxdLC~;qqiH9;*IWrw)g6;Wyd?zPp9$y}`!|Ih>e3pdXR9 z?p~NDLtKI9E#$8|oP4eI@Hb*3ZnZCHf`jw&vv}424X^)6tG5(i7votY9&es`U6VeR zUu7JJc^Bm+^*SGC7f(w*miPG>_Qojjbk7>aw#ir8@d^_By@%9x@%gg)TZpe)thPbk z9eh*0jo|dl{*8mQ5Id`R7Ks(7z@&Qu9+7l2u}n@1Mfw_wnCB`)f;&%GUM-e&cTXW63-E z*~;9!L9V&?()X2<4Mxt2rG@+tWvhR0H|<>%?R;7K4F1Qvzl&-2_54(yC6Al$KP$7; zd~B=rbFH&C>D6L=ch6P%Sl53}WTN(89d>QtLU|hU?*NDI(4TQm7_QdtH1_k`=xvZKo9$6 zO8QC26h42dd}IFGkJVPD{&jptKbKBQKQs%(+bY+T#_W`S1?Sh5F7Q3@`FuTXhJKvS zG1jlE(Ia2(-RU>}UJE{dL<{95CaJ*f2h;EAoP9X@RP*IQ^#%Q5pWRslkD+%qBCo42 z&JiB4K5R}K)~1j7>nMJX56y$7{Lv5k%$)z-V7&JA_jj~gBme0C^mlowGP}X*wT-_& zsFv>HuvbFmKS~wZsaDGU#ogUYTiGA{Ml)CiO|#3j)6Z)+kK(?S{75g;^-6a0BEOcG zDti@|BF(d}w9=UW@iwV%5Fc_EzIe{zdn+2dV|EVh-3=Z;r(AQN zFZt~4^E&IQvAn{B56MoH}^Y~SzTJH zorc8J`q-axh$sJj5gNw~ab~X#N&C zl*8??zpB~OuWBvjcYaHM%bg3pTKsOlcKYb)srfs~S9<^T?t-7|;AKR%-;%#u;K1d+ z$9>!-`K|PIEAJ4#PYaqS*Ri*!B#$N!D183HI90*tpM179gI43JgS%*-n=hN|`5N5vA0=za zuN@0wRiK0s_=bGC`pYWxh-m+%z~?T?$3D6be2nq`yxGlJ~@q7;0x0Ds`RA2a_pNT%IRm(zVGt4G=)nqSbqXHPQud?DEgjtEaC${ zNxo9P%D+$X7#*K{$}S&nzJ8Aq@y{FB*W0kz*6^ncA)O7{2|TZt-cE0w!16oy^^3-E zD*P4Q)205kvr2E&|E+y~i4!L)K0D%gsQrI?HBL-_QGfU^e$#$K9DNNBo!tFR-#Pum z_=qI-eV+c$P5)FXaC?JaRmm6fyITH|m*Wrm*+fo@EN!Z$iRrK8#E+zW#5J9p)=De1 zv~#+O*0zJozoWa$i(cvTGqUF@{sfo9p7|c%wakkb(pOk3ot@{!x@ZTW z{)p%QoZTJ!$-exv18Ga`>|6C$vy+#xKQ@+H*uT#v&xYAI=vLrtHMw_6T4dMK{~lg9 z`wE}l>B!cZhnahLkJ4J1p#8pqk(rpS(7M?F>@Cuf`w8yFK4k`cjCnBwKM{B0wF2cj z$fCQ1K91*~{uCEA?BW;Ed-8KErU!HQD?Zas&d>3w+S;?DZzqST;z{17J$1!Huft^x zzen9P?5ls_H{SiZ435YBiyPr-OYPnSH{F_t&ueib&v|yb&&lowzH5|uA<^R;{LZY6 z{;qI7xIsH^3c>&8>U}PoO4P0L`QKP5e@$&JNa>ZbocqScH6PP|BL(i z@cBFK=WzE!?R19QFO%YE_VjZ4nzEah`&|oukI&WXRZ=)QSpC)c*XVNcz^B&qs|)+8PYFX--3!tJ5CB^~ZVNGwkJV>Ukso)VK2F!F*N0 z=a>0!xOEde{{bKU;c%@q{P*AHb<%jRc@6xI{IaX_CTTsAk-*2i8^Tik9bf%@ua$;f z%_d9ZeD9l5?Q{xGb35)hA7m4xagYBm*E4YEyOYYt?=?I>O;%7>l26P+5yU$MXP 
zjgqVRtLvp(da}8X(2?5ihaD)wp}Sw7pZr&E*7BnNhUeFv5B`dq9=yOgls`)} zzzR-5*T_+8Eo@cH+Djw#|&HED>H=6Uz?8Q3& zi^cJcK8xemR^ao`5O^#4{vP{#TN?fqO6*@6z{er}?mcfc_EyUV^p;xxNWa87uzLfS zD$-J+&mG`aJ92HqZXRf4I;AOoD&ce7JZwh_Uxn4dA6=U~n!wZgaMjCl=})B9*|}iD zg1&Rw-tYMP)H@YlbJH^4E5zHiOXD^6>FGxER8EMr0X=E%j(Ddu?B1Q>=w$q!2oKB2 z;ZnFrz`-68ow{SD0UFReqe$9c!_ z8|bhd9A3`bU|*=e4QRq^=}Ko6!}x{&qx>}a?aIidjV0j&EAuX1*?CxNVvnenq^`sO&?i(2AeCep+9D$HT8`<24#C#|^oWa5MH7yJWv; zPlP?(j~iP%syF-OOg!fUt%!+U*{0uw!{Ok~=_e@UB z{?yL*rPI_C_v8MRjW+K(y4TW@+}Ex*Qx1^uDy zqhdY3%ev9cta~E6-1j|`SBiD1RC<;lrBi|9&+-Fp!7sgz9;Wgz{igpd^Tlt?J13BEIniSCyaD;!Z|C39%hB#LE;GO7 z7|YAxjuV%>wtm$X*RfLlwcKl|2T$6%pR&;?)E7thCu!G7Cg-I>4-d-g`yEe4-YAU- zj2Y-r|H0aco6-IA3~mhI*C?kCW$?L?^4^2g-gNtdBk~Hk=f0$RBEQ`kd72J4ujiu& zo?i-Q`&hyMV;%TI2AjosQ@^i~KlEjC84aJ8v)($&f7F!zw1&rvXy>l(mwX2+-P6=w z_`#myN9ji6mf?TByep5u-|!24U|r}01wYj9ME`!UF2sCToxh38&Ea)z`d>%$RIkb<=J%u~ce~&uS>vI2~x^{GwEyj^sy`i+~*4lgzKQSlp;R{3LqSfn^r= zm7211cV{CugNL*HI~1aXoLlin&r9A+`m1k>c43F|J&f&l@L~bP=$uaEN1Oro`kT^-rdr$OAdge1CxK_sJ{NbCZ}cFG;W4m zmV|$NMRGPzZ$EVRPtt1R8POq!yS=dhQcfr7TlkK`&m5=S3%mmLB*ct-!q2EL?AtTR z@-lV2r(eC`%-cK=vfuKa><0T2=<|tEJ(AC1c${asajzGL;Pww_lK9@-xU^ETP1@VK zwbA-Fm_BYm5C6#qav4Ko>%mE{1UV;cPM_ml!q#b#Q6OEfp7vJ1HtM(EPOJ5!h3~b~ zQO2ncdf?<#cv?^Uz2V+y^exg#^d|6lBQ0pjPhC$tav7w-zYNlFx6JrVfy;42;$Y=t ze|WTdBA>x7zHg!2X=!8nQ6=BUZ@!63N?RJocH-VbpS`=pu5D$9cRaqw(~ke2{sC>= zLcB*4c=8r}-cv~Ake40ura$WQrf+Q|CSl!re{z0Ht( zl6Fb|lmq5n{fK?v?dFj8JNHu*ps7J9N@VfS|9=ZN`K zDbB7{F)rt37iwjWEa;)v8{u;`Ih2ckd(^*^+*?V^lLq`C&D2xDuhED+r?}TrL(hMA zA8`X*YnnWeh2C{yPyd!g9?ko+l7Gwjnb+Z`q4Pg$&)9c;ku?_4I025YHiu>W`kVhX`ajgXtjK@WPM4%S z@1%U>53QF@CiiOew&q3ayq$U1PF{@+u6XY;Uu)JjVn4S`tIWq)xyitXIF5bXNk6yB zThl|YhQQIVF9zj}>06C>jaCKy;Jm+2`O?ti=HKDI2i_i<_f+034efU^9!KW`jcX(4 zdLL_`g8w#D!n{q6O7=k9*KL91>q>2KiqkNITn z{wpcvUC1kuFR48}Z)y(}{W{i3b(QgroC}@lS!6(MNd8wr<}UR3GnvP_s;3pr_>o=s zupTsFKSz$8IR?aFh-ow*QFjdr%`TV56g&qE?+)G2!$kTTSOVVzG z9bJ?5;0Hd{nW@*I%+deR%iqo5h9Y7=kTb16Ik$$j)6Djcq5@WEpC1(d$A0ZR9G$An zWBBZA1fFps7v};uYvCY1kmW4ph&yFP6mq!Vsop$&PVB?up4-RPlfd)mS>f}@tpS{g zj9Xt>N2+MpSL)rte!;Fqe=2a+)%VQQnQRbP#;$ zkoGO`YDC)6d~Bvy!_uwDvv1l$yK?ifhhv<(rX$i3^kH(^Og$0T9QJd>dklfY?f|A8 zih2%!ca7-jB%cvSx_{a|jr0A(($4Cukms%+zT`8(Pt5mA)2cLXEFO>Eiv9ZUbU6JK z_XR&i@rl11tDm*u^N3A<4?$I}|~XK(MLw3f-~kJ5W-AN;grKi5moPp z73tBuPnB6y_C^K&a_2O%oWBRZuW>R^1NZvT&PMPz_5*)fhqh0Ku*3f}+5`BXV;ozH zw;QMZkG$JmI~`-3`eg$PIfs25>(0omsu+jqS)T&8_f}u56GO93={V(ggXbZ)eX@3c z*H=YGXu5fM#@;_Ytf#>ICw;I*i&Hyfg+7DcM0Dm$6uF6^)_ge6|v-#dAdCfWz=Z|+~9n+EI z_F~qvDF2~#BW^joYhCGXi+=${mjNy)|o2xt<8qhr^@6_^^FYq6>>}1&yfwc zTHd`8=>QVisGec!D}$@Ce`y16df=-SoEvETTIFpDyJ$4KH!?2`f@_`e-yfcaojIkj zZ!`IxgRd6wIqI*1&rQ>u9#7A!%>TFtzM1}4@q3h$cWZv;M&#Aa_kQ}{y6|I!pWsyO zMttG9c~|vSd!IWckGklM(!p@|Qtip^53egx&Rr9D)tvo0BPcoVnD`&2C;EAW(AKo?rnJF?u5`!@RqMm29wJx+Kuv$`TxE2_-jbp&-eG7 z+{O4lCz5V&wRr*qZgu7T9?0Hjzd0eC%t~*PCdRp))n^E~cERD#NyHa>mz%W)QPNT5 z(8{VbJSA(R(wmIBz}>Nh9*#;UlXoSndvo~Q+Nv<9C_g#f(%)Gm%56zMC&J_KSI`oPJETLEt4{Vyx1ujzkVwa~H{4@`yYc&@((UL=YBVOnt*-F7cTxUQIM~ka zW?DMKIM(n}_f)P0ySlUf%;29Mr$1p=??LYax7*U&9oWG`(@(vN{hxF%^v3+tZPK_? z`jGV|aF&>ndr^oYBze`+PPl^`3@J?F%{n}_?hd&@saH2 zO6?4jA87=+ba2l*@PDej%gyL#9q$qLP`;D6y_)nWJD$MhxH(e}C&Rzm3O)^1e;4$~ zzuzevUyScW_*89Q=k-#5cQT)iDd;<9-P5>tJ1T3z&S)Z@V~YBl@gq-3BMy9w`bL-? 
zyJg$Ji@0a9O)*{*wYN2W+Qaw2w==UxpvwX9xj#8aeBGw?_w+2{9S(F}I2OKrTDpK8 zTZf*W#lKz7uD&qqrN67apAs2o%i!@aIMKMzFSqcyB-=uN z_hLS z45B9+y|+?Dk7g#z^AV)+qBxGtefM6Qb)W_s{+#baiYJQW*v7d3DL;Q#a^;tE-d86X zfij33dip-M7}t#5+YH8eTakXe1#V^*cFSMx2ppYUi>K-7Eaz^WM9_vESc0(s;%)4n z^hx)X=b(SGeIbX)w11h|%EzBZ=9_Ywn`6>=c_BAGL(tWhPT`~iH#@+WP;oEGo zZ@vonZ65?Age)_5@^$bJbgwohL@) zP59r_D)h1l;)rKk4BrnRpU>3W!~FWt&FZkjUxv(c?HJauc)OT^E3IGQH*~6)j^_Vt z$PR2xs&Y1xw@kHe42EB)HG6WsILsgx@hvUEp-! ze@}QEI6OH$kp9ZlgWi`MC#18KuhEPB3Y;62P6*lgzPoX3njDlKtQ{*kJRU*=j!L&T zAH(0gr?G{&+8qNYPll5b_j@^f?*JdRNq2%L;z!|X;OOP)l(fAa$$WU<#{54!4f%vW zWW4fo+0Waif0fpIe{eT^$S?r6yRjc0hwByQ--GxGe10_@;CC_-v+r9agMFXjdrk6< z{xxNXyhtA7$m@N$9;Y2^3wk5pw=}<+xeqwNe75eT)BH}pOY6cA_5Q*RkABu;_r^W& z#_Yz(WLTMP4yQ!2XWJ^jUc5&y`>YH;Pry%W@fQcAwdJYl3O~oelfhZ6XX0PrVPgJ| zXQy^Dzb9uq7V_N(J^nrl4vsg|cFks*UrpTKp62gW{2)`5%zH3n3%&Zh;H#1QEW4s_5MMX4kpK7D z_V7xillq6ii}l$K^rwQ~d8Ya5{RlYU32tpf51cHAPc`_ifPdZTVPzilmh$2S?tNG4 z0OumFesw+zy|sHLz0HrzIp7HWZ3nN%8($fi+0jk#HzpsfT{qU@R-Ak8!hap_S5C{f zOCt_{YvsdEI~YEe_-~cmtNrln-jYwje>3*<9Qs=yK2O45D{*u)l&fSn&w^8H zOZVi{@!0}CPct5M`8js6{=8ngUp;XjWO*Lm@)zY3Jq~WXSNhaC6>+n_=UYRaBlx2a z)8DU4KU+7vhJ<#ERp?sydW`b5%$R}Jrz#f1sYYQjjXTu1zQ_MOgFd$+`PnSc^Yvk7 zy1P?FI-}h|=-b=Xe+C1`qzB2E*e%%#zY+Zqmg(izqCx2#>;6CyVMFZS2J%bqmmHbO ziIg4)hh^I`Zd0rzOKI&qGV2M?*NVp&Y}}@+X>qcL^Scp{N?ujC-IRTFIPCleZvV+c zvzdAEn%q8h==-sD3a_xm9}(Fx!hBp+It}iQRP!xj>$>9UCVab3FTSoP+}+Yy-+OlH zZ?UK6rhU|ZAKZ?2kr(o_&xG2~J27j)fAS3EYo83|;c2Qr*YiG4HGT>I#;(StL-H;U z@RrF@tnF!Vp$@xwC!gYA*vmuMbq(Q4nGyO0p2tbL9P`?bcRWU=Ke&I<3O)udbxe2A zPDl8>gTMEO$MfhzbGSW=-no$pPh)@ABi$D+RND185%f78K8GDYg1s91j@_&Wai4rk zc-PPGr>DCX>)?^;5cTy=_F#{OU+wVpXy4uIQE%)I_Jn&~@O4spobRoS;6C`7%`bfb zd)xVUdW?Rxkq_xW`nMzdc~AJ*kY*f)UIU-^!{1Kucz<~IUFm;*AN)VaA0F?JF0w8( zv!8mpu-99#ll#LH-Z1=hFdtt1dwux`-|g`AWV#2PsD#He3mjgV?u_rw;-z;pPs^Pj z&QYJ2M6Dkc_**q?LEkj=84igBBp?Fs+BE{$YgZl?cRWpm+GGv|FXwBG`b@5@do z=XaiKzQj9&+rTULg|cJq9M*Pd0;eUqw90%)98|Q+Dp*LhlZw5D}!{JlN z@nbl=iGE5pSoy8Rah#^yRsWCv?<`&h=pjyeHvRskbi49lC%=M^de*fC_>FUfCE0-m zt}fMHr2 z!kih-3BxAyE-yQ_8$bB=`4vi?B){EW=6UXYtZ~JB*(HC>Hzx-9{^>yb^vU@|^giwf zF30sSIPFB7JvJjB2{*^Gi@VXQHr^lHHysVP_k-Kv7ru~v?ADQcgtOICm0X114#^<* zQpV_SCw`7O#-*!#MhELp6Zg1lYo~+!8G9CTydyub(389KGm>7ga(R9-xqM!_&)=KT z%g6J$zjrpf_z1uM2yVwZzc+vLV71)`pZ`goc9!{0=2s`>0(~D!{>(2m&fk|d+IR47 zlEDHfe-3;;gIpr7e(clYC9f;Y{)qiP35LEY1IiI_ySegrn5D7j*qOcYoy?azllO2Z z@mHwn?$lW@eg4)7=N52vTRp#n)qW$H%}RH59yb7T3}Z(QaH6&cRA>$l4^+=&JMzF` zImuW#hmq|h_`I_4d&r6lDIZVvf&zooJB|H(7(OGH={_vj8T{1;vDZh*3x1mRBhSeN z{0_sRv8ZGE^F`_O0_V4bt1p{vJ2^qxEgfpSZicUM&*Ndf9MP#_HTGaX)@5aHXFLY* zVw4-#3$1wjlY2%YGn5|+uRFl^3-w}Wcpxt^ySfwW|10ebWdHmQm)%m;o2}vN=At{s zr;*R7K|0qsc7fLu>EjIL##_^dt8H8UtG-ZwU-b-2+NDR(qs@}a^eF3QTdUG}^r0D? zK0^OwG-3btFw(u!Gm3e!H9tok_%zJldm7DA>WRGi`=le&Kjfz#qkP<^xgTT;}xnErF-CX_!XZm;yb3fce0;;G!Un82swFA6Fz5t z_YfQ(Y@R=#9*b_L%-)tI$9=-1;A(yTjT7O9d@^bHBfGh`67^c|@Dn)Gh+SHje>l(f zF-}&<>_qb;;xyj!YmwpE-r5!EmK_Jj+!V<^K_A0!ydS>3EeS`%&Y!}bER~)pjZ@En z*j=KBUvaMee_j5vN!c;!C`hqow!iOjkNfCU6q)Zw;?Gq8`9QBMaQozJ4mnSQw`Y3^;7>TbprD_keE9Ph;rngx&io(! 
z+H8099`1SX!yfv>{gnCSAg453&FSk6_!|m`mt-;5I(p~z9Q~`G+>;#&FWU1XpP+m- z`#A8qMm*igY@o*Si0)6m@~Xk}h#!6tJ?!mOa6kM5CF@V@*Vd>f-s@PW{@BMg%i~;E zE~q>*mp4vo=T4Z7XKUwYtOxf7_rz}__f~eLADbnk@;b(0sQcYVvZKnJ6NdfKPh3V_ z_TxzRSB|$|tHr;uPtop9c}^bV{q7k2tmfy~w7%?>*D?;nFwoSg6{)D6SI-OZ`8;|RIl}K?m&RWID7dmhp6A`uuzPbSa+lhj_J&gv=!v~>fzK_SmmTYN zLKS@5lYGX)=>g<2h{s?zbvG9wum`-InH*f`@emg2u6D`Y;NE2R>OPR94SgRE{k>LB zclO}-c+cCYrU1{yLB{u^(pfyCv7b6szox?Ni^S6P;g`N3A`$R^zjML* z$?^OdrJ<3pxjt1@AF|H!L`S3XM@yGe(zCN$_`AzAf^iXmb!an{&FAovld=xoE zzR}b4r(5!bxQp<4-N!EPs~?Z0m-<}``7?%EFZ!{cPe`p!&I!*oF5UT!+nA>##LbE}>zvCp-Ei z^QWEf*TIFxNe6KhW$=6~+^YqzJ7ibDuV(xf!^vT_dxA$A-xmB97t(+4Cy9rM`vJ4F zi{PubKeLO`Wu{}7+ugXAeMTCY`*z5#F&<6W#~0vhjJW6v^{W&A#!bq5H!M5dc*^OP zU2T2r>H9zFXLI)R>FS*Zzk`p)UU|QYKFimLE-IM+`Fwo+;QiT)$j!TW@Ug=0u7I=M zw0oO%B=$AWS}$su&o{v9ej$JMa%XaUyI5x$@psHmhbMj6&nJ;{5An|7_vk9$=y~{T zB0l5%qTL_JCGOQMMUPCT-|26R*M`ClZ{>HV+qX7>&zC4)BR(VWaA)xw=b7h?*wqIX z?RLoDCpUS{(R;Dxs`9YAx00X#He6oK|8cc_*&n68@Gs&Uwt;(PJfrRME7BeLTc+iw zlSlJ-=hZkKXxOH@|F!ZptIFH%% zXgIu%^UMl%^^s{0oZZeJ9eGP%&Yz+Um&r%>4>%L?8&k~JneL(NZ!}+ZFXa;R(FwHv z20r|fUzpS(hn4wa^7oQhKAwKJ^uNO2KFf~aQuD64nSHwcHihNu$mc9AUPF(*;@=4S zq&+{mx?BixZhLD@P;m))7C9D0}8{d`or|HIj!&W+~(jxdD>-$@fEZ#xA+gcL&3}1r(v0i+|zHjL7PpN0J-+jr)9&yOO zI@hdY9OCZx7Nq%|oqoi@*H2gbHjh;Aq(5$Mut>vSSDi-Owj5yTHRR|9AY6>|Q_%(V zVLIGiVEkI~(q8}%`z52(mg?WmsNF`t>+wciXnu8p&tvg9RQY(}>5tMl^tcx}m*2Qn zGB6pMUPZ6?k*p7G@qfH{?1Ua|lU_&9|19kc4+DqKfWsO7=cfN5AMf*8A0ltzzr>fs zKJ)-MyuB6ftn^}bbq716cxiD4sa!`6&DqbF;J>4tRLN*;<9Ao&FaQSkW?<03zLy3iWDPJG>b<07}e^T-C~ z(<|xS{9T`UC*@e}c7?ljtUEK4W$B&P{hz!8a6+*jHP0H-bCOB#N=M+V2F^s>TDL6D z`*B6{9Q+}(-v)9wxa*Mqk+;c4Ur*dn{p-<^cAs~cOF z=CFf1k78&1a=Yjt%)`9uiZS-pz zJiduM%H^B8y(oWjc7xyV=p670a{Qxoy?WwY@*Z|_L;7(hKF9gK5Z;G>D8K2ZFXqfQtJ*6>%!ym{3i4D z*HR7tYft6I=IhMPdGLIpc^`S6&!@K?ldbc(pRuFgT}!VT$g6R){#L=~?(|_U`*{KU z+yI|%!(S8e8lhJ^h|3FoTT$96|F81R;d4iF9Fy#)`~>>5AG6&Mz`g zOX8p7rHRem=iW+vvTo+ri%-b-p${p>eIb8GE&aP3|69ZP8`V=yPD}Fe$H#r{OVc>- zyN|se{`1%KhG`Z1=}*b@5s5d)$|ApWbM^+iSH;nw!@x z(#7;#oOyZ&{#)|;1kUV0a*vxA@$S{V#k`)wo{sbXp2|%%ueV8WFX}%sZ9?C6C+$Vf zXPv8{2cEM!l;4@vzmR+~Cj|GPyLX({C_l%_d6$fLwcz-of_?-3>zilCrnjKW6QSK$ zPjBaMX<+=%vyQ~eFE^_vUy?>f`ephNa{J!;(3sp`@qHZq{9b)=AEC^-T3!2(kCflld`#hF#FbSP{$?k| z>8_P2T$$NAVeB)QOE<~QmrEovuE0*!^)F=OEtp^bo z_o4M9)}J4;TfkW_#pi%=g(fuZdoIa z;}UZ0&JI4(I_@p;{2_YsUFo9y-}=)`-nrA^Ko@Z$Po+nj@3-Xtq4)LSubm}aIW~Va zJ-~i{6TP1E!+($`yvZM-4=>3tdMn(om%N*I(C)$TdI_9gDgSf$3Ob0-s8Ie$@pq52 zLw@pp_jTHBp8QG<_29!B`P1f0E%x-4^ekSWxl(=MseU+(OzRs(q}J1)n1PMx;a}+E z^sAm7W{i8aov$b%JqcV*wfKVeB9p@y^Z8dJc~3fsJ`8pe6n-t9H}YDQ*a@g-rE|cA z#=X5A7aEqu;ZWeUcT{?-%|beW8=Hh$lS7OpSfe0=OG`cst9qg7@R9G9}2L@nN1;8^4IW%Q;z z{0Gsup-H{;etKwUl|EnK>KJ|uS&FQN4;OefD2;Z##UMVT5qqOc`hMO` zBGhu)`6PMP;R$*bKh2%`#yQvy$*A4`|9)^e9OpWA79@*{Y-y%W2a@=tHpe&$j(LYAdWiZ$@8AQT$Jxq*p0*G$GeYx z!9$z9hc14fpS3~KH))>Th5riYes2`?E#X)LIMgw_gMHD4ALMoQHsqJy6aT}!vvRj_ zZ!JIc8@`t%`(=Ch9PVCA*oUp1BQC|aI4|`yH1D?0oQN8)W3!OI_aoS;k@w<2^zlB= z%R)ZQo%6Zj1E-HeAB&#_`W54HZ}xBUY0Z9q#QckQR$eLa=TYlM-V<&Adk?8UrWFEjJ%x{&g;s|;@`7}j0Ol~m&?qfS@~n=4c+H{uApC&pQ?PvfcSy zZQ%V&`rFR;=L->py(`>a0@ouy`dj&Q?Z|_h zzb1b8{n888hp>~D<=bm_x_gFqvqPLT`(5mJe#s~K+|m27kD*sPFT9_gH^$imK8I`Z zBX;edrBdk=^+cw!JB|MkC&C}4@e*1C5vOrJ>jrAtnmlEYhnE|~Kfh)C%9CDZZtvto zJ$Qxw{0g7nBi~9p+%w_kPDbbx{R+?E>-dv@$_!-<8j%muQRd5R5maHH#LJ|;>G^ia zpT^~<($Vlg&gu7r)5G9~6Hx1M!h`&haqq}(elqQ4ypBV6y6-LHSF{&zP|u|AM2<&-5?)`vb(PFb|%?$82$4>mW_Yt*(80__f~B&t0r~ z8QZa?C^o< zyUNA=?gRW?4(0R(<69p-e_HTAn>}04s0~P8giF;OTX2fs@`3SFVXs3!EImKk{CBBK{vq-$!qZem?mRvr4~G>^I*~U&J%4>GGf}=t_Aq(0g3l|hU(3C( zy|keB%`Vk`zvO599AaL#&OSh|FJ9wv?e-GS@fLo5cRu)eF;Bza(I|0aE_;_8c)+ty 
[... base85-encoded binary patch data ...]

literal 0
HcmV?d00001

diff --git a/bench/python/assets/brooklyn_mask.jpg b/bench/python/assets/brooklyn_mask.jpg
new file mode 100644
index 000000000..6f839400b
--- /dev/null
+++ b/bench/python/assets/brooklyn_mask.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:efe74dbed044a192d00cf4a8b44cf89620b5fff29265bc847ced7d60dcab047e
+size 44761

diff --git a/bench/python/assets/brooklyn_nms_masks.pt b/bench/python/assets/brooklyn_nms_masks.pt
new file mode 100644
index 0000000000000000000000000000000000000000..2e13cef5e672532735d1d50c4b8b14704651af54
GIT binary patch
literal 196620
zcmeI!L2IU06#(FGCNq;5=}yqLATBHD%%Coc%Ry#g5MxCcW)m_=s3Do8Uy>;lx^PwS zPbl5`YpN@Mf;;ICsQP}~@o=YehXXFk#q-2B@0`5%+~c{=fpYivw(hJ}J3Fh_e%@Ve zuby39UY?zQa&d6+{QBVe;q}?|gX6=S!}}kdoj-oGwR-yW=^t(%-CSNB{r>*Ji<{H) z>)nH^lV>kZ&yNpoPA;x5uMR%`$?kgf-g{5Bm-^RB=HZ7Y>sxKh?xV}g^WU~_yX)Ie zZY?Pf*IQ5SE&m?9I6nOO!}XnCuD-gxx>?`-&E9J72fzBqcRyR-Ti$!OK3i}9@#yIC z{QTtT=JfL7diU2?$0t`O$G$-AwYlt0RjXF5ct0kaJ8Fm zj4_f3rfYH3P~UBfmv?(c>B+agT9TSAN2v(19o7jDAV7cs0RjXFR1k1Mt}tb1BtU=w z0RjYy2)No!H|Q8i1k<%RYN+qF#ml=rqx9rkUoA<^mZMYz*$(Rj2oNAZfB*pk1S$x) zAXk{OGZG*`fB*pkMFd>!rW6-8t*@4(X3J43f^3I%0t5&U zAV7cs0Rj~ST#zeF*%=8CAV7csfg%E~cGC?yMiRkvEsh%MyKV9EZqFz^`PNrUQnTeK z6+yPcIspO%2oNAZfB=CC0xrlErtFLa2oNAZfItxeSG(y39V3Zgx)w(b_1(63dADbj zo_y=8C8^nRl!_qRVVwX00t5&UAV7dX1pyc23R8AQ0t5&UAV8pqfUDhfgN~6zFkOqI zhWc(>yu8~pN>9G^)soa~IZ8#4?XXUO009C72oNAZpn`x4a)l{7BLM;g2oNApM8MT< zxXK!5-N0tAW(xY|uO=om=^)3rEisPDGL%ey_J^yFJ#ElJIm zqf`Xh4(kL65FkK+009C7DhRkBSD3Oh5+Fc;009C;1YGT=8+42$g6UcuHPm<8;^p0* zQF`*Nua=}{%TX$VY=?CM1PBlyK!5-N0u=;YkSk2t83_;|K!5;&A_A^<(+xUC62WvW zjvDH_ZSnGM&nP|l)>lhXv*joiLAJv>0RjXF5FkK+0D%euF31(8?2H5m5FkK+KoJ2~ zyXgiUBZ*+T7Do;B-L`mnw`Y`|eCw+vso8RriXhuzod5v>1PBlyK!8960T<*7Q+7rI z1PBlyK%j_#tKD>ij*&z#U5lfJ`fgjiyxTKMPrmillGJQDN=12xx$p4kpKY#1PBl)BH(H_-JoM65lq+OsG+{w7BBDijM9^DeYGSt zTaHo@WIL=AAV7cs0RjXF5U3#Ff?Q$B&Pad&0RjXF6cKQ>n{LoCk_e`2anw-XZHt$8 zdq(NWx4v4Enk`4E2(lg42@oJafB*pk1PD|Ra6zsxWoINnfB*pk1d0f_+D$j;7)b=v zwK!_1@3zIuyFH`yjVf8AV7cs0RjXn2)H0un6fhxAV7cs0Rlw? zT{DmiD0@GM-BDews?8BXOy0N>#HTH*>aSMAlqS` z009C72oNAZfItNS7vu_4c18jO2oNAZpooC0-E@PFkwh?Ei=&46Zd<&(+cQc}zV+3T z)NDCQMUd^VPJjRb0t5&UAV8pkfD3YkDLW$p0t5&UAW%fW)o!{$$4DZWuEkM9eYY)M z-t8HsC*S&NNouwnr6S07SSLV$009C72oNApLBIvM!jzqn009C72oNYD;A%JBpkpKv zOxNP5p}yM|FYoq@(vxp}wInrLj#3e1JFF8RK!5-N0t5&Us372iTw%)2NPqwV0t5&Y z5pcDeZqPB32&QXs)KK4Ti zbpiwk5FkK+009CO1YD3SOxYO;5FkK+0D&R`u6EN6Iz|$~bS;h=>bq_6@@~&4J^9vG zOH#AtC>24r!#V*11PBlyK!5;&3IZ<36{hTr1PBlyK!89I0av@}1|1`bV7eAZ4fWl& zczL&Hl%9O+t0k$~a+Hc7+hLsm0RjXF5FkK+Km`F8sBsE)(QW0c3tP}XA1@`y0 zcb@<&LNk t>t*!cfB*jS@4qe8Hb2Y0?ajY``FHz!^PiLby}S1|-?oYpkfJqM{=Gz1?}U^MXVY6TtpD(~sTnP2RkD z^PM-dv$M00+&=Z{GnSOZew0pQE!f1NV+A(IC1%;fQP^Xh<1`uKWz4dFWUreQ-u>ErA7iwvGLzCr&!tWW)Z<-eLfz9Ia7 zpE}dWpE7gk(6OUO4<9;dqy(%HT{wvXCK+xifwm(F)ONrT1?F~ff(BykT^ncHzKoy z;3)*(P4HTRzm?cB4Nt%IptQkkDf8pc37o;=VS%C8UMbJ~QYQ-&JAHmnKeI97UjdVp|E3 zu}Rsgn5kq=35TcS4}w|{?aH>_WYMI zKz*2z`8l27nl#BKep`titFE}GgRGjXD`81xSIz+Ou_7`L(`T8qTJq;8>UY)`mHGA> zGC#kM*pUqr+bVC2`pf@31HwfNvKV2&yn|*MKmesLRnoG12+Bs2Bd;O5h{? zAAYN(7&s~e#5WSc$KYy^Ik~dA^r-rNDpTkO8LFQV8|Xe;FX7K1Ka=3P1k-zpnExFU z1P0sf$M_n?ngG||avX0<_S+nJm1>y*(xFql&+G0P`ojS1fd9eWVNdOzAL$g96sXWLeAoZd9ViraBqunexk=QmP) z)otHNl&5`k63viRlYTS_r-ElGc;XTDF^X%;66)JIr0rz?*$PP?4!02VDR5RK^Vuhe zZ9lUHqRdW>C2d20zRF-_8#&G$5@e%ia%{Z?vJv$E9YkMkwePs;AwJVmjAf`Sg>ZP> z>aK?3F_KejyToiVvAG)RsxC_Sp@paqQ%q%Qf>$H|NpSUHd*K<>3ZhR!ou=o*FhB1{ z89Renb%bv-mWUxP53qp`pb!6DV;kZj*E(ykZvFAhOc(EJ5Mqzl;rYH~kj3|dRu192 zCT%)yWl0~=XY-}RhihvOs%W8zAXHEGwRwhUBV*;gMcXr25w4e<{UMvauZ$4i{;B#- z)Zbu!@lt)+MyrKv<4NP(^F7DHUU?>56I`!!0~*-`SEBMulFeb zPx?1a_OG=<`f&b*l85jMnD6w1iTMdXiwqa?kROsdk5b`@&!X$e2zeGOpq!t0v)md- z5F5bfYCU;=)Da4Ogf}t|NAiSEGPbF`Z11M&h{1;oS0T=W z=Uff(RE=5hi?IOyoO=S>LYwA{F)xue9whecAUH~7ymBW1q>%BRPVjVrp&b?4QUjY- z%YMmEl6~5%m6(&^$l^+wpEn=zt6<9k*B*ON8O+Bqm$#lv-T~hpj0GhKx0w9UdnBGD%_7O zE_3c3&~~RiCnNcYuihux2y3Bf!hW#p3Xu;6o=`gpC)HWi|0<(ysK&ML+sk z$~4*6eIFJ1@c4e8%#S`Vu%G4hK^eHe#dCRnY>&ap>+<~ar$fwo4Y^GOm<{`PGLiPm z%-TbgADB-?%AG0f2Vc#SII&4$Uw6bXugA(M{uU`D{o7pPl<5+WGo;TKMKK@vHpMd|-0Q|2l=9&*5}(kyB1ZZy zPx5JzJ$Ck%%r{ffPS^+XUBTOEBxkW<54zayTb#)XKXkFLLu!`1S8l7N{`twiC2bM! 
zzhLY}LTGbYfQ9QLcINLL*e<8fQ_wzVtn>ROx1ElWIMM%;c2wt&)VHfsuBx&`e?s&l zoRb&#R#0QklfKRVEU~>-Vs^d6kxtTA;{_6%WZp|zCiA`bDOEMOMslQ&U|-Tp)c^8H zK7ST~g;s!-%)>GJg3fXWxY+!5Zzdav_pc#ez?|(9`U$UICAADpVJFnJ4*PCgfE{jw zYbR5;{?oyI9%m`zL(IngeaqR48ieu(K!=#O9pAo4_yE|N5N)Zc-INQ8YpVCB%lo<# zAL}FileHslWP(G`6(zWk;36q+iv4O3x4)MS5md zbjNhkM#?kAR40|UX<1Tll;AwVxz`s^&Ukb5-*M z=RR07>&WeOx;^GocTO%^w>;bTpH{AKTRB6re0S7yag#OsVgFy;bD=D|huBQpUQBqP z?~H81RRzw8;TDl8BYNomhIfB~avsls4-)xOf@wdJswzNJ)>}FQeHa8%V~D|aGe!D& zg)y5>>KZ}tUnD=)V&cPB3k-FNxxFsP(zf3$%d*c2o9x9wU43m7qZTMZu$-es}Z|oY;MyNMN%I+X_QJp4{u?VivNtOH%zT{lLKz?9c zUcb#y3TDo*uw26P@0D0rKfQW(yhUy19hUO6PVcFB0K1B~A8A@A+a47GUiPWIjulb+JOy zN7PR&nIHL;>~~ApRt`E|K|RgEUh$2sa~01lj(u%NTj&`-`d#f#DPvzL>#{GBHk%Jf z8S8h59q#EnvgV{+bl&jZk7TfLW2wLVu+(1|5g25clwmIsonI6DqQq81X`4Ay;tKa5 z(N{lpjUneY*w7dIEocKR(>_7@{_0^23*1g`Q01|vj& zIrh-G#U^_0B=UD6C(bw!dkapzt4LqGPUOY2fTN$DBl6}$=MSVVGN!DplEWK2V%tGF zPsct28wzpH&C$YUlX~fWs=d_1NFTAQMO)w*b34aYJA+yGpuE$!qq$D2sm`ohr5=m4 z1>e`WCeU*uMS>q_Q*1HuC4*ox-r2jte(0l{L|uSWB{n}o3_hR7kO%OSydsIiOOY$c z&wo|=b8bY|>&uqdf47ur=k0%Hu(hj2KG1NX9rTeMFUY+21^(FBp_c{U(}nyQuaz2A30BDcxN#DEa@Dy4f-grktiErDZ2#r zvqv9A85`bfZ$aED5MF$`yB2lcd0KVOzpb5%D zKmHJrHeB-{VvriRR@eZ2u|Qy$CVwlj?;U|NSm%~fPyJUA7x`J8=MY0(tp=g|pTVES zy8{#UsNw9Ro1A;mX3@W}&J^zmfFG;A$Oqh2VDN`O2n_ww8uftAr5|8ju=|#=^}}9vZQ$ zzsQFfd=>HxoR6E2^#Uj9sZ!oP4fCO_>9rl&^^we9a~A4^wj`Ef*%`nU=yFo)r(~V; zdxuzoSkG7)*3&Pe4TS-AZ7Rw^yR1nGyg1?L=$D zRz{-aA;dm^w8t}uq25jHP$vUFzfW+T!BT;1H?<0|UO9-DYb@(6&(cv37&t>8zC#dE^n{1hPtr1*LI&-A2|UY zV&Z+bmBA9Dq|Md>QE!07y?NWu*1n4}A8Txn&F!DP0rKj&<2QQle$aiuy+77TKHWQ; zbo%r!y0O#ubNODYf`z50a6j3;C1f?t;I>7sG1!V0$VqR_-Ai@*1^5~9C(MU+-_TRA zEx`G^;+_lR{jE7Dvn>#w(}3%@n(5I(ep6BoTCYaKj%Y@sV zmu2NP=lQ9PL%E0MaBQ{G^IC|y<_GqV{2S^_3Yfe7ST762FGxi^E|7lbS6G)3C<;$S zd@x`y{u0}Gw`QKd5b;`#t(k-v&Ml0ciSl8cy*QC$D^q9ri;zQDGn{TBE)K9p8xZFO z*xG@J%|LX?d#I<2<~#pd#1KQJ^G@z3a5mT~6}~r9vk%YXHbhP_QZ5?9+j!|cp(#VI z;O!c}Eo5|JJpYW3Lb)ZIxXvcmg~CNcIgWJ?`qqj2=IVfXAc5@ye@Ns(E9>1}fYtO;;=Vj-$9>i@v_pGd?g9}`2zxQ>Pwf}ss)AxY}RMUt>sVrW+2zeHU`RKZzC=a9NOEOa}_BvjAz^M{FLvEzE{mf z-)Nv8=)RU?bF9&9!{0D}v3A}~4)LnQQa;wj<=@Q9zIgb3SayLPFM5(=*zf$) zzvGC<1=g%7MZ6*q>YK@N|=(tgYS(n2IoOax+2ddTI7DdALx9OWbXP3^V?{w??$c@o{8+Xw{Si1 zTw)HN%4OgTSM*Zkyz{Eyp{{kg&1X*wMGjoa@v~2a)^uC$VspTmSxsKJnBxr#GLxpm zxfv&=ObN!Wa`w|99fa>hx_-dxW%p|A`zDA#(^$?d^kF!I)_I-l8##}@mYtrO?3S&2 zLu&W%!%JQHM9U`Kt@qF2cBa97es!4p((HLb^X;d3{-Iu(1rrXtWzSrEf46v~ty^~V zmQmfT_NTfsBQ_b?o8H8+xu`)_BI|kA{@k?b*$0C6aGQVF88RO_hudjAYdkl-gty&X zs26?Knai*{bZgh?yk6EuJFms3T*f}6PuuuUlz%|~a_@s2r!5Z^EFH%2r+J~CkG#n7 z`a~$NWhTFuL_ahdru4!4N-M2gho=rx~Vy%z;n zw;bl?r>&ZqJ?^Yr7n^OnWNX*l@5)%OY|id{4e!Uo(pK4Rr_JYO<9&_E>%Qd}&gTyL zU1QX5YV+F)-`=DBhP}^ad`ZTL^UmNn`ni#`emUo-?B3w&FOkpmEkXMhydN1OgJ%EB zxlH;+!L%I21GfajZGxQh$lf6P_5GaN)cb?2?i$bKt)Omn?$7J8lJ&AxY1kM4(CWW* z0nhh;6O4UPkLTx}7s@-pcz&DSAzN?HaoJyj{^>YZG`ctRLBH>LK3k;a)q4zaj<&ha zGOnj^NH89Mk=s^OYV7#@9IW@CX8-XC^nvmKD_e;E++0sAeB71KnfTl6q7_SA`8OXw zw|o9MPq_Hc@uuC)rbWDL_$tGCsx`;C9fLV#mvS3yOTV?{dbG#X2EMS6W9RJj0RGGp zdFT49Nx#0)%}?)nZuZ^&XI*S%mS(Ohx|PefZJyaAI-29eGUJH>zvF#sonn~T^LReX z1b@B_+v3+!ibE*VUq8H~3&-XVqp&&p^HWcRuAln?FI&(kl=Rdj<#e6LzzYT!%MmU+pGLpW#@Bz z&)&M-!k?Yt$`rL;m-WHTyIgx(`Tv+5t@9MNYq^nnNrGdmk)Cly3$&r5p7!xunBPqA z@xdOBeS?jn8P{=~^0jd&C4*!86JyHbS93i7rC|I}Hur%Yj|L~LJCo;^vSN|33-KMo%6hk~u4`uH}+(WnecjINv1A3=h&?kLbaK={b zm%OE+WwYPl^1f${0Zl*V*t)>z*JA>2Z|p8(Xu;>GbEy{jwGVCB7tj}9kDP4N$~IoX zv30dE>hTq>yfvX=X3pCtlmGMZBJ_XvrwM((wE$hVvaEkUcq7Mu zEy~<8;~8x427Td$Um;G?HwTkMUHa%bt9ZWcH@0-MIWD?57`qJD)!{`(Y3wqdUkYo& 
z1?ea=EqHqQR^Gpb9fQSN-{shR$7r?iI*ylJ6)JraeQ#~Y(7|)RvR}#8~ugxpICXotoWVSa(e>cw$BoU56yK8tv^_S&XJXwRX*;^f_kztB!uBi4Zv zG^=?>l#d14bsT~+MS+oB#69$R?Qne@*DR(z*?9}fEYQBHe-m=pPanPp*Anb*&3ey9 zoeBR7=ZwUDZ?A0}A4d6l`Zn_s)YDV1*Iul3$LPL?|AG02_U5ZsW7$1gGz{y0Hx=nU zzE0y9?mP2GVL!6-w3Y3*V%e9quO1d<=W0D?;TT~7{p`=ipq@JVbFYp-yjY8_$9#K> zzO?rrc)poz?7UzB*Jc+|}^iIuiTzPkR2|B#zCy^aFeFS;M|7nAdV8&o65k zdVRx(T#xylp1L!LZJDF3nUv4-;k&NR?Z@+c>r^AU#W@EL3D)P4b?AF=F5CBdJuc5o zZBbSs;hBJM=~lZLRkkw_yGh zjs3fa_dLiJ`-M!P&rH1cu{GM-m+?7^-KphFc?HYv(axSYi{o%lV_*)h6YRtK_Bz7% zuGjlq^)D`C_cz|WKg6+hseZ!)c;B?L^hq6s{t_+qhuxejtAk$lIL?jiZ`wZN2`t-F z|6m5Li{QQJSzU3SH(%7x8=S*s*jDZE>EltpgFdRq6wI%q7u#aKdRo8X79Yx7tQT$R zgncw$kEFFm{Jj=iI+o*jmSK)K8~fsE&G#`r$FX#+s7UN}bkR2bfNK)?o-DHc3)J(H z)^6iav|+BcXh0q0xl}7Xa}mmfw7#P<(1+*f`l|hCTZ-Ot@cHGMwbDd?OA5r>E#`WxPI@d`_*<%OHMksQU|-z6NBA?dd}A@+ z&ah_OhBh1u#2?iV!=7QwYterW!k%OR%ffzh0>*}y-jVNLqBBH3oKG8r`e8q(U^ceP z)Qra)Vp|dcpSch7qZ-TZkMacpvrRY5FA12#bJ3pLbXEv4KHL-uf{EhcC+J7|y_^@B04>-K5r%t=EKiG^ zaUI%FstPMPA8?ZS zxSn>+71-WfJ@=-eh!1H?9}&DY(xdD7^A3Z1y;=4-a@9$*>a9i$-^Q)}5cz>K5qlRS zP71JBUPcUi)DL`#820BkUXB=iAkY@|nEvn|1?Pjb^pX~6PqJp+el@ny(0wh&VgB11 z+qeSTQli_L&vQLil5S7g&GXq1J>~i5xV&|fe$YPy%f6|x8zy7DupfTC*gv=Z=CVR8 z+gr2Gy%e#PVb&pm%;2RdO}fYXK7qYS)rIeV(ehxa7ny$QU(Nbs}4ZXd8l z;{6f)>?Hk}J%#zOKl~(oyW*y}-|Qv!f1>{U9%78=1lSq4r|q0AGXEuY7J28Z;r&z} zF-G8>ta#@Fd+C$KS(LEfYIZxecbk?U67$R;-7f2d`EY;9clOPll%vJ(d6?&0z4TUA zcjPvt#m-oW@^J3;Ht}9BO=IOgYzv$RNfz(sz-Km&!h9RPozF!a^|Npk@&15m3O_$f zwxz5`^{TwMuED!Xz9h>sPKdwrPra2mmtD?0XH;JMmO&ibM<-(ExJ z=l2mivSDJ|v1?Ki|FaATKLMXHgf9VxbI)F?FmHRg)dxONms6BtqDsZ*CuAsCeFngH zMM?-CgR4R2zT!n@@ck z-otr`e2c2$^LwliKzybpnG@jM@GmNj@!q2`6`vn^2F8$z&o4Zb_%MAor?FOQYY$3P zS<~x#WUP@m7%DzLCsxJhcX&|o`D;4N-0~IH8)UAi73H8r+&4`XpI=a_;`0k8j>8qs zCwU2cuGSNr1LH}>=NH_n`22#2TDqo?^-ns%(*=fky289&gL+qsX9tM+BjOc+Iu)Ov zbE@L=|6Jc9Yl-c`Z$Z~VF@FN4@$PEk&$g^f#pmb!rQ-7ozm>ira(>>G>LEU?;`9GJ zI%%mGS*PX6xkAO~2a=q5(I))F38oy@2vgs#k)lcp6a5L%zwoVxiqFsa+JKxSsrdYr zI8YU`iH|Xgt5CECA}T&Vl2=IXiR3((7t_+2$5ed&s(V_3%=6x!!;!@+^YhB`k1YS= z-zq*oa*{{<+3O47)q7?Etm5-?vYfGfv}6${SEpq-s2gR(dWccQno5{JK6VB z@%cr^kiG~LOt@9?`H?Fh;fMORH}*k}yq8q*`H?qz#!toP7XwdTV>pl5Dn7p;MaAcL zn1J&tDn7rDpY)a*B?}TevG{X&$}u< zKe&X7&(G&c6`$YbLB;3CI9e(`zv~YwK0lAOXsY=9u0N>w{M;W@e16V@iqDU+SXF#} zcYaXu`MEz7srdY^Kfu4UqvG=e6DmGG9~UYyoK5Ov*~( zsv%42RPp(%MwybT0t0m3@R9S(75uphda5fz>!sh>RacLaR}56f0QF%qe^w@5vwSkQ ztN8piqf(WY3{W3tWPYx#*l0HKTUl!bq~h~ephNwrHUq@RisU+$(Px=7;@j2MPYOg) zzq7um%(qp1enOan)n`EXF!-2?&tHA|REb|017v>ozI6p3H^`h^*<4!fmWZ(f1*v{U zY@qvW^xHiZpTF8{pWFhZPfwa5!Unj~XFwnE!)n_mbV5P8o>B4ntIgz(3eZ0K(H~Dn z2I*^)%!gu~c`{5AbvkQYkth~$-J4~wh{FVAsb^hS_9_O?=MXMw$?whdxz4R05!$nQ-+fq<6 zEJff9W{u!;xaG|2yWDwx#NC^+nbpM2@6cpackAM0_q((Gp*y+@6Gl8$zdN&jzGsR@D5Xt32ajrOC4&)(UwhM8)U-5oxuTF@mgb!RKl{d48qh^9ycOe15@1 zEnR8+Iu)P4=A^Eab8`48IS&^!5ceI%(w~&~CVPRLw^e-ppG!Zv=c@SpKbJ&Vj;_BaMXR%qK5IpO8NyX=f_Z#4AH{uv~uub@h z6YP;zQ$T&Yrn-(+p6FNc`8h%PWDZgB`H$u_)yyM4#wf0)nre!x`20v%A@OH&9!$mO zM}F*T`ecI4^WL7rkkHu3i_r~O#pmZ_Ict=oC5!lQ z8R3)eZ_sz~9?zicAdxR6xGHly+4ocN`9;T&z6cXcxK;7_kt-kJhx)cR_CZd?=jUX# zk$Yu~p7EpK)$Wus_LXvv!oEoExtR}08S8h59qwDmd`8b@(Rm{z_Cp}9V#;4pn@9hY}T^pN}ILPjke$ z16+pKnFnqln64i>3whAc1N{o*W8!^lT7a1k;&_K|t*mnpJM+gZ>4)B&;~3IR&J|h3 z0>d^;v6N?rWxa_P<$N;gS@BLGgZaJ_`F<9S3JlvjmlHkHq#kRYlu_~dp&#Hrtm5+n z4=O&tGb$N_Jq8t@A9zsl`CWHV@%iyy1JAoEK0mmGiqG$k3#Q`pyF94){3<^G?0G@+ z?WZ{pDn38Pl~wWi-T6Vq=jZ;Q;`4JJRD6EU15@$&-T6Vq=jZ;Q;`3u1auuK79TzG- zKOYw=KEI03&;F+EGoHZpgNo1Z&JXa;c2ob6xR&^fR=#%t;#ak`fqICSYt~8=pHGtl z@pg;3FIk=RSoVDMt5n@;a5>7fHkf^n_b$frjm3OB!iQ2$>wRxmr8x69Ow#~Y&kiGa`Chxt*BW%oz< 
zf`Hkk8|JI{{G2xxpP%!f;`4JJRD6EU1H1}W@%fPo6`x=DL^ZF~JcCqC?I%y1v@1f+ zWYuZ%q1jE!Ln^{jW1X zeVELj$A=a&j#Pa9<3sWPqE_m|jLgp}KL7usH~g2jnaqu_&*h~s@n<*>T`MVy`knQK zP3on}ReXNRhQifmK=?5Dmx|9{ZT3`wUm62so=gbehO0s5actRDAxLxSUtk^F1f$IQX`>Z^Hh!iqFqkSMm8h@2h<}pLS<^w4O9QwobGSXckv$2 zpyVLozm#BB1>ZJhy(&Jx=n%;X3lmJZRq^?eE4V*-iTbuT_TjH8K0l{O#pm~2<(Po) zk-F#E!+mjoH9x@GT!46nW=$S|aUP1DxZ=28GGPA8C3B!S|KL=(i>yQP1%~|1L_UdN zo8UW%{GIr$`P?F-GblV@k#6cRCLEOch+I`1}qd zE+;BJzsrP*&kr=foT1|L0}m=bzdI&We142QrQ-9u{-EOX;gXsqu>o^LhP?Y6h@7=z|Vx;cC*m$Ba0 zqnEbf`JGn<4|T1}V-=n~EfhI$CCAS`5n9u2xr@yKXJ$2d;bM+AEXYinekw1UG9?(h z%GpncbZ|(Jb^QSCxmRP~H$nWF#&TxiI|4Wh+qG)?hh(HKj%Tk=jS}A z`22j%RPp(_KdAWpxF%8Y`Q7`biqFr-g^JIQ<03u4%D%wT{J7nia78*knp;@G-Wzu|#%5ohU> zItu+ITIvtG`7?{vK`(neoy(~B{OAMAwZgx5;ryuh{K$if&+pzZn2OKOc~J5BIS(p6 zKj#5X`u{(Q&!5}pl%#d3XDzaBufv@Ga{JV)4<8REiMGn?sjNPmICSiU;S;;wJbc2$ zkz>bP(gR-pCp#M6Sn~TT>w>8BUv8f|&c)Ml-`xERBkhD;jLW!xV| zA6b~1*0e&&1pFd}yQUvml-hJ^g`%-L>WHG)lH5KG8+JyS_Vq6Cr0XFP*VXvDxmCwS aQ0^25yo|q{+}u758ajFP;D3%hm;FCNa$>Lm literal 0 HcmV?d00001 diff --git a/bench/python/assets/countour_lines.jpg b/bench/python/assets/countour_lines.jpg new file mode 100644 index 000000000..8d5c50219 --- /dev/null +++ b/bench/python/assets/countour_lines.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d59100dbefed479522bd91fbeb18b57f8a2fe58797f5f1192385bb774645753 +size 26397 diff --git a/bench/python/bench_utils.py b/bench/python/bench_utils.py new file mode 100644 index 000000000..91c9511a7 --- /dev/null +++ b/bench/python/bench_utils.py @@ -0,0 +1,411 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# NOTE: One must import PyCuda driver first, before CVCUDA or VPF otherwise +# things may throw unexpected errors. +import pycuda.driver as cuda # noqa: F401 +import torch +from PIL import Image +import os +import logging +import inspect +import argparse +import importlib +from glob import glob +from pathlib import Path +from datetime import datetime +from abc import ABC, abstractmethod +import numpy as np +import json +import pandas + +logger = logging.getLogger(__name__) +logging.basicConfig( + format="[%(name)s:%(lineno)d] %(asctime)s %(levelname)-6s %(message)s", + level=logging.INFO, + datefmt="%Y-%m-%d %H:%M:%S", +) + + +class AbstractOpBase(ABC): + """ + This is an abstract base class of all the operators that can be benchmarked. + It provides basic functionality and guarantees uniformity across operator test cases. + Concrete implementation of all the abstract methods of this class must be provided by + the class inheriting from this. + """ + + def __init__(self, device_id, input, output_dir=None, should_visualize=False): + """ + Initializes a new instances of this class. + :param device_id: The GPU device id that to use by this operator. + :param input: The input tensor to run the operator on. + :param output_dir: The directory where artifacts should be stored. 
+ :param should_visualize: A flag specifying whether the output from the operator + should be visualized and written to the disk or not. + """ + self.device_id = device_id + self.input = input + self.output_dir = output_dir + self.should_visualize = should_visualize + if self.output_dir: + if not os.path.isdir(self.output_dir): + raise ValueError("A valid output_dir must be given.") + self.op_output = None + + self.assets_dir = os.path.join( + Path(os.path.abspath(__file__)).parents[0], "assets" + ) + self.setup(self.input) + + def __call__(self, input): + """ + Runs the operator on a given input. Also visualizes the output if visualization was set to True. + :param input: The input tensor to run the operator on. + :returns: True if the operator executed successfully, False otherwise. + """ + try: + self.op_output = self.run(input) + + if self.should_visualize and self.output_dir: + self.visualize() + + return True + except Exception as e: + logger.error( + "Unable to run the op %s due to error: %s" + % (self.__class__.__name__, str(e)) + ) + return False + + @abstractmethod + def setup(self, input): + """ + Performs various setup activities to set this operator before it can be run. + :param input: The input tensor to run the operator on. + """ + pass + + @abstractmethod + def run(self, input): + """ + Runs the operator and returns the result. + :param input: The input tensor to run the operator on. + :returns: The result from the operator's run. + """ + pass + + def get_params_info(self, primitive_types_only=True): + """ + Returns a dictionary with keys being the variable names initialized exclusively during the setup call + # and values being their values. Useful to log if someone wants to know what parameters were used to + initialize the operator in the setup function call. + :param primitive_types_only: Only includes attributes with primitive data-types if True. Primitive + data types are bool, str, int, float, tuple and None. + """ + primitives = (bool, str, int, float, tuple, type(None)) + + # Get all global names (e.g variables + function names) used by the setup function. + all_global_names_setup_func = set(self.setup.__code__.co_names) + + # Get all the global names (e.g variables + function names) used by the __init__ function. + all_global_names_init_func = set(self.__init__.__code__.co_names) + + # Remove the names already used by __init__ from the ones used by setup to get a list of names + # which are exclusively used by setup + all_global_names_setup_func -= all_global_names_init_func + + # Get all the variables of this class. + all_vars_info = vars(self) + all_vars_names = set(all_vars_info.keys()) + + # Figure out all global variables only by intersecting the all_vars_names with + # all_global_names_setup_func. + # That will eliminate the global function names from all_global_names_setup_func. 
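+        # Illustrative example (hypothetical operator, for clarity only): if a
+        # concrete setup() assigned self.kernel_size and also called some
+        # cvcuda module function, then "kernel_size", "cvcuda" and that
+        # function's name would all appear in setup's co_names; intersecting
+        # with the instance attribute names below keeps only "kernel_size",
+        # provided __init__ did not already use that name.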
+ vars_names_of_setup_function = all_vars_names.intersection( + all_global_names_setup_func + ) + + if primitive_types_only: + vars_info_of_setup_function = { + v: all_vars_info[v] + for v in vars_names_of_setup_function + if isinstance(all_vars_info[v], primitives) + } + else: + vars_info_of_setup_function = { + v: all_vars_info[v] for v in vars_names_of_setup_function + } + + return vars_info_of_setup_function + + def _setup_clear_output_dir(self, filename_ends_with): + output_dir = os.path.join(self.output_dir, self.__class__.__name__) + + # Clear out the output directory or create it + if not os.path.isdir(output_dir): + os.makedirs(output_dir) + else: + for file in os.listdir(output_dir): + if os.path.isfile(file) and file.endswith(filename_ends_with): + os.remove(file) + + return output_dir + + def visualize(self): + """ + Attempts to visualize the output produced by the operator as an image by writing it + down to the disk. May raise exceptions if visualization is not successful. + """ + output_dir = self._setup_clear_output_dir(filename_ends_with="_op_out.jpg") + if self.op_output is None: + raise TypeError( + "Visualization Error: Operator did not return any value as output to visualize." + ) + + op_output_npy = ( + torch.as_tensor(self.op_output.cuda(), device="cuda:%d" % self.device_id) + .cpu() + .numpy() + ) + if op_output_npy.dtype == np.uint8: + for i, npy_img in enumerate(op_output_npy): + if npy_img.shape[-1] == 1: + # Need to drop the 1 from the channels dimension if dealing with + # grayscale in PIL + npy_img = npy_img[..., 0] + out_file_name = "img_%d_op_out.jpg" % i + # Visualize as image + pil_img = Image.fromarray(npy_img) + pil_img.save(os.path.join(output_dir, out_file_name)) + + else: + raise TypeError( + "Visualization Error: Unsupported dtype for visualization: %s" + % str(op_output_npy.dtype) + ) + + +def get_benchmark_eligible_ops_info(): + """ + Prepares list of tuples : op-class-name (str) and class for all the operators that can be benchmarked. + """ + class_members = [] + + for file in glob( + os.path.join(os.path.dirname(os.path.abspath(__file__)), "all_ops", "*.py") + ): + name = os.path.splitext(os.path.basename(file))[0] + module = importlib.import_module("all_ops." + name) + all_members = inspect.getmembers(module, inspect.isclass) + op_members = [x for x in all_members if x[0].startswith("Op")] + + class_members.extend(op_members) + + return class_members + + +def summarize_runs( + baseline_run_json_path, + baseline_run_name="baseline", + compare_run_json_paths=[], + compare_run_names=[], +): + """ + Summarizes one or more benchmark runs and prepares a pandas table showing the per operator run-time + and speed-up numbers. + :param baseline_run_json_path: Path to where the benchmark.py styled JSON of the first run is stored. + :param baseline_run_name: The display name of the column representing the first run in the table. + :param compare_run_json_paths: Optional. A list of path to where the benchmark.py styled JSON of + the other runs are stored. These runs are compared with the baseline run. + :param compare_run_names: A list of display names of the column representing the comparison runs + in the table. This must be of the same length as the `compare_run_json_paths`. + :returns: A pandas table with the operator name, its run time from the baseline run and the params. + used to launch those runs. If compare runs are given, it also returns their run times and the speed-up + compared to the baseline run. 
The speedup is simply the run time of an operator from the compare run + divided by its run time from the baseline run. If an operator's run time or speedup factor is not + available, it simply puts "N/A". + """ + if os.path.isfile(baseline_run_json_path): + with open(baseline_run_json_path, "r") as f: + baseline_perf = json.loads(f.read()) + else: + raise ValueError( + "baseline_run_json_path does not exist: %s" % baseline_run_json_path + ) + + if len(compare_run_json_paths) != len(compare_run_names): + raise ValueError( + "Length mismatch between the number of given JSON paths for comparison and" + "their run names. %d v/s %d. Each JSON must have its corresponding run name." + % (len(compare_run_json_paths), len(compare_run_names)) + ) + + # Read all the comparison related JSON files, one by one, if any. + compare_perfs = {} + for compare_json_path, compare_run_name in zip( + compare_run_json_paths, compare_run_names + ): + if os.path.isfile(compare_json_path): + with open(compare_json_path, "r") as f: + compare_perfs[compare_run_name] = json.loads(f.read()) + else: + raise ValueError("compare_json_path does not exist: %s" % compare_json_path) + + results = [] + + for op in baseline_perf["mean_all_batches"]["run_bench"]: + if op.startswith("Op"): + op_name = op[2:] + + row_dict = {} + + # Fetch the time and parameters from the JSON for baseline run. + baseline_run_time = baseline_perf["mean_all_batches"]["run_bench"][op][ + "run_op" + ]["cpu_time_minus_warmup_per_item"] + + op_params = list( + baseline_perf["mean_all_batches"]["run_bench"][op]["op_params"].keys() + )[0] + + row_dict["operator name"] = op_name + row_dict["%s time (ms)" % baseline_run_name] = baseline_run_time + + if compare_perfs: + # Fetch the time from the JSON for all comparison runs. + for compare_run_name in compare_perfs: + # Check if the OP was present. + if ( + op + in compare_perfs[compare_run_name]["mean_all_batches"][ + "run_bench" + ] + ): + compare_run_time = compare_perfs[compare_run_name][ + "mean_all_batches" + ]["run_bench"][op]["run_op"]["cpu_time_minus_warmup_per_item"] + else: + compare_run_time = None + + row_dict["%s time (ms)" % compare_run_name] = ( + compare_run_time if compare_run_time else "N/A" + ) + + if baseline_run_time and compare_run_time: + speedup = round(compare_run_time / baseline_run_time, 3) + else: + speedup = "N/A" + row_dict[ + "%s v/s %s speed-up" % (compare_run_name, baseline_run_name) + ] = speedup + + row_dict["run time params"] = op_params + + results.append(row_dict) + + pandas.set_option("display.max_colwidth", 100) + + df = pandas.DataFrame.from_dict(results) + + return df + + +def main(): + """ + The main function. This will run the comparison function to compare two benchmarking runs. + """ + parser = argparse.ArgumentParser("Summarize and compare benchmarking runs.") + + parser.add_argument( + "-o", + "--output-dir", + type=str, + required=True, + help="The output directory where you want to store the result summary as a CSV file.", + ) + + parser.add_argument( + "-b", + "--baseline-json", + type=str, + required=True, + help="Path where the benchmark.py styled JSON of the baseline run is stored.", + ) + parser.add_argument( + "-bn", + "--baseline-name", + type=str, + required=True, + help="The name of the column representing the baseline run in the output table.", + ) + parser.add_argument( + "-c", + "--compare-jsons", + action="append", + required=False, + help="Optional. 
List of paths where the benchmark.py styled JSON of the comparison run are stored.", + ) + parser.add_argument( + "-cn", + "--compare-names", + action="append", + required=False, + help="Optional. List of names of the column representing the comparison runs in the output table.", + ) + + args = parser.parse_args() + + if not os.path.isdir(args.output_dir): + raise ValueError("output-dir does not exist: %s" % args.output_dir) + + if not os.path.isfile(args.baseline_json): + raise ValueError("baseline-json does not exist: %s" % args.baseline_json) + + args.compare_jsons = args.compare_jsons if args.compare_jsons else [] + args.compare_names = args.compare_names if args.compare_names else [] + + if len(args.compare_jsons) != len(args.compare_names): + raise ValueError( + "Length mismatch between the number of given JSON paths for comparison and" + "their run names. %d v/s %d. Each JSON must have its corresponding run name." + % (len(args.compare_jsons), len(args.compare_names)) + ) + + logger.info( + "Summarizing a total of %d runs. All times are in milliseconds" + % (len(args.compare_jsons) + 1) + ) + + df = summarize_runs( + baseline_run_json_path=args.baseline_json, + baseline_run_name=args.baseline_name, + compare_run_json_paths=args.compare_jsons, + compare_run_names=args.compare_names, + ) + + csv_path = os.path.join( + args.output_dir, + "summarize_runs.%s.csv" % datetime.now(), + ) + df.to_csv(csv_path) + + logger.info("Wrote comparison CSV to: %s" % csv_path) + + +if __name__ == "__main__": + # If this was called on its own, we will run the summarize_runs function to summarize + # and compare two runs. + main() diff --git a/bench/python/run_bench.py b/bench/python/run_bench.py new file mode 100644 index 000000000..ae2c69b08 --- /dev/null +++ b/bench/python/run_bench.py @@ -0,0 +1,236 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# NOTE: One must import PyCuda driver first, before CVCUDA or VPF otherwise +# things may throw unexpected errors. +import pycuda.driver as cuda # noqa: F401 +import os +import sys +import logging +import cvcuda +import torch + +from pathlib import Path + +# Bring module folders from the samples directory into our path so that +# we can import modules from it. 
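+# (Assumption: this script stays at bench/python/ inside the CV-CUDA checkout,
+# so that samples/common/python, which provides perf_utils and nvcodec_utils,
+# resolves correctly below.)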
+current_dir = Path(os.path.abspath(__file__)).parents[0] +samples_dir = os.path.join(Path(os.path.abspath(__file__)).parents[2], "samples") +common_dir = os.path.join( + samples_dir, + "common", + "python", +) +sys.path.insert(0, common_dir) + +from perf_utils import ( # noqa: E402 + CvCudaPerf, + get_default_arg_parser, + parse_validate_default_args, +) + +from nvcodec_utils import ( # noqa: E402 + ImageBatchDecoder, +) + +from bench_utils import get_benchmark_eligible_ops_info # noqa: E402 + + +def run_bench( + input_path, + output_dir, + batch_size, + target_img_height, + target_img_width, + device_id, + num_iters, + should_visualize, + ops_filter_list, + cvcuda_perf, +): + """ + Runs the per operator benchmarks. It automatically discovers eligible operators for benchmarking, + sets them up, runs them and saves the runtime numbers. benchmark.py is needed to actually perform any + timing measurements. + """ + logger = logging.getLogger("run_bench") + logger.info("Benchmarking started.") + + # Create an image batch decoder to supply us the input test data. + decoder = ImageBatchDecoder( + input_path, + batch_size, + device_id, + cuda_ctx=None, + cvcuda_perf=cvcuda_perf, + ) + + # Set up various CUDA stuff. + cuda_device = cuda.Device(device_id) + cuda_ctx = cuda_device.retain_primary_context() + cuda_ctx.push() + cvcuda_stream = cvcuda.Stream() + torch_stream = torch.cuda.ExternalStream(cvcuda_stream.handle) + + # Get a list of (class names, class types) of all the ops that can be profiled. + ops_info_list = get_benchmark_eligible_ops_info() + logger.info("Found a total of %d operators for benchmarking." % len(ops_info_list)) + + if ops_filter_list: + # Filter based on user's criteria. + ops_info_list_filtered = [] + for op_class_name, op_class in ops_info_list: + for op_filter_name in ops_filter_list: + if op_class_name.startswith(op_filter_name): + ops_info_list_filtered.append((op_class_name, op_class)) + break + + ops_info_list = ops_info_list_filtered + logger.info( + "Filtered to a total of %d operators for benchmarking." % len(ops_info_list) + ) + + if should_visualize: + logger.warning( + "Visualization is turned ON. Run-times may increase drastically due to disk I/O." + ) + + # Do everything in streams. + with cvcuda_stream, torch.cuda.stream(torch_stream): + + # Start the decoder and get a batch. + # NOTE: Currently, we will grab the first and only batch out of the decoder for + # performance benchmarking. All ops will receive this and only this batch. + decoder.start() + batch = decoder() + batch.data = cvcuda.as_tensor(batch.data.cuda(), "NHWC") + # Read input and create a batch + + for op_class_name, op_class in ops_info_list: + logger.info("Running %s..." % op_class_name) + cvcuda_perf.push_range(op_class_name) + + # Step 1: Initialize the operator... + cvcuda_perf.push_range("init_op") + try: + op_instance = op_class( + device_id=device_id, + input=batch.data, + output_dir=output_dir, + should_visualize=should_visualize, + ) + cvcuda_perf.pop_range() # For init_op + except Exception as e: + logger.error( + "Unable to init the op %s due to error: %s" + % (op_class_name, str(e)) + ) + cvcuda_perf.pop_range(delete_range=True) # Deletes the init_op range + cvcuda_perf.pop_range( + delete_range=True + ) # Deletes the op_name range, too. + continue # Continue to the next operator. + + # Step 2: Run the operator. + # Repeat for as many iterations as we wanted. + cvcuda_perf.push_range("run_op") + for i in range(num_iters): + # Start the iteration. 
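+                # (The Op<Name> > run_op > iter range nesting pushed here should
+                # correspond to the JSON path mean_all_batches > run_bench >
+                # Op<Name> > run_op > cpu_time_minus_warmup_per_item consumed by
+                # bench_utils.summarize_runs; the timing itself is aggregated by
+                # benchmark.py, not by this loop.)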
+ cvcuda_perf.push_range("iter", batch_idx=i) + + # Run the op + success = op_instance(batch.data) + torch.cuda.current_stream().synchronize() + # Finish + cvcuda_perf.pop_range(total_items=batch_size, delete_range=not success) + + # Get out of the loop if our operator invocation fails. + if not success: + break + + cvcuda_perf.pop_range(delete_range=not success) # For the run_op + + # Step 3: log the parameters used by the operator, initialized during the setup call. + if success: + cvcuda_perf.push_range("op_params") + cvcuda_perf.push_range(str(op_instance.get_params_info())) + cvcuda_perf.pop_range() + cvcuda_perf.pop_range() + + cvcuda_perf.pop_range() # For the op_name + else: + cvcuda_perf.pop_range( + delete_range=True + ) # Deletes the op_name range, too, if run_op failed + + cuda_ctx.pop() + cvcuda_perf.finalize() + logger.info("Finished run_bench.") + + +def main(): + # docs_tag: begin_parse_args + parser = get_default_arg_parser( + "Profiler for all ops of CV-CUDA.", + input_path=os.path.join(current_dir, "assets", "brooklyn.jpg"), + supports_video=False, + batch_size=32, + ) + parser.add_argument( + "-n", + "--num_iters", + default=10, + type=int, + help="The number of iterations to run the benchmarks for.", + ) + parser.add_argument( + "--visualize", + action="store_true", + default=False, + help="Flag specifying whether outputs from the operators should be visualized" + " on written on disk or not.", + ) + parser.add_argument( + "ops", + nargs="*", + help="Optional list of one or more operator names which you want to benchmark. " + "When supplied, the benchmarking will be restricted to only the operators that starts " + "with these names.", + ) + args = parse_validate_default_args(parser) + + logging.basicConfig( + format="[%(name)s:%(lineno)d] %(asctime)s %(levelname)-6s %(message)s", + level=getattr(logging, args.log_level.upper()), + datefmt="%Y-%m-%d %H:%M:%S", + ) + + cvcuda_perf = CvCudaPerf("run_bench", default_args=args) + run_bench( + args.input_path, + args.output_dir, + args.batch_size, + args.target_img_height, + args.target_img_width, + args.device_id, + args.num_iters, + args.visualize, + args.ops, + cvcuda_perf, + ) + + +if __name__ == "__main__": + main() diff --git a/ci/build.sh b/ci/build.sh index bcbbec23b..b5114d3cd 100755 --- a/ci/build.sh +++ b/ci/build.sh @@ -28,16 +28,16 @@ # SDIR is the directory where this script is located SDIR=$(dirname "$(readlink -f "$0")") -# Command line parsing =============================================== - # Defaults build_type="release" build_dir="" source_dir="$SDIR/.." 
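+# Example invocations (illustrative):
+#   ./ci/build.sh                                        # release build into build-rel/
+#   ./ci/build.sh debug                                  # debug build into build-deb/
+#   ./ci/build.sh release my_build -DBUILD_SAMPLES=ON    # extra args are passed to cmake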
+num_jobs=$(nproc) # Automatically determines the number of CPU cores +# Command line parsing if [[ $# -ge 1 ]]; then case $1 in - debug|release|profile) + debug|release) build_type=$1 if [[ $# -ge 2 ]]; then build_dir=$2 @@ -55,35 +55,28 @@ fi # Store additional cmake args user might have passed user_args="$*" -# Create build directory ============================================= - -# If build dir not explicitely defined, -if [[ -z "$build_dir" ]]; then - # Uses one derived from build type - build_dir="build-${build_type:0:3}" -fi +# Create build directory +build_dir=${build_dir:-"build-${build_type:0:3}"} mkdir -p "$build_dir" -# Set build configuration depending on build type ==================== - -# Common config +# Set build configuration cmake_args="-DBUILD_TESTS=1" -if [[ "$ENABLE_PYTHON" = '0' || "$ENABLE_PYTHON" = 'no' ]]; then +# Python build configuration +if [[ "$ENABLE_PYTHON" == '0' || "$ENABLE_PYTHON" == 'no' ]]; then cmake_args="$cmake_args -DBUILD_PYTHON=0" else - # enables python by default or when asked cmake_args="$cmake_args -DBUILD_PYTHON=1" -fi -if [ "$PYTHON_VERSIONS" ]; then - cmake_args="$cmake_args -DPYTHON_VERSIONS=$PYTHON_VERSIONS" + # Additional python versions + if [ "$PYTHON_VERSIONS" ]; then + cmake_args="$cmake_args -DPYTHON_VERSIONS=$PYTHON_VERSIONS" + fi fi + +# Specific configurations for build type case $build_type in - profile) - cmake_args="$cmake_args -DCMAKE_BUILD_TYPE=Release -DBUILD_BENCH=1" - ;; release) cmake_args="$cmake_args -DCMAKE_BUILD_TYPE=Release" ;; @@ -92,33 +85,26 @@ case $build_type in ;; esac -# Configure build toolchain =========================================== - -# Make sure we use most recent gcc-11.x -CC=${CC:=$(find /usr/bin/gcc-11* | sort -rV | head -n 1)} -CXX=${CXX:=$(find /usr/bin/g++-11* | sort -rV | head -n 1)} +# Configure build toolchain +CC=${CC:-$(find /usr/bin/gcc-11* | sort -rV | head -n 1)} +CXX=${CXX:-$(find /usr/bin/g++-11* | sort -rV | head -n 1)} +cmake_args="$cmake_args -DCMAKE_C_COMPILER=$CC -DCMAKE_CXX_COMPILER=$CXX" -cmake_args="${cmake_args} -DCMAKE_C_COMPILER=$CC -DCMAKE_CXX_COMPILER=$CXX" - -# Prefer to use ninja if found +# Use ninja if available if which ninja > /dev/null; then cmake_args="$cmake_args -G Ninja" export NINJA_STATUS="[%f/%t %r %es] " fi -# Config ccache -unset has_ccache +# Configure ccache if which ccache > /dev/null; then - has_ccache=1 -fi -if [[ $has_ccache ]]; then ccache_stats=$(pwd)/$build_dir/ccache_stats.log rm -rf "$ccache_stats" - cmake_args="${cmake_args} -DCCACHE_STATSLOG=${ccache_stats}" + cmake_args="$cmake_args -DCCACHE_STATSLOG=${ccache_stats}" fi -# config CUDA -CUDA_MAJOR=11 +# Configure CUDA +CUDA_MAJOR=${CUDA_MAJOR:-11} for nvcc_path in /usr/local/cuda-$CUDA_MAJOR/bin/nvcc /usr/local/cuda/bin/nvcc; do if [ -x "$nvcc_path" ]; then cmake_args="$cmake_args -DCMAKE_CUDA_COMPILER=$nvcc_path" @@ -126,19 +112,11 @@ for nvcc_path in /usr/local/cuda-$CUDA_MAJOR/bin/nvcc /usr/local/cuda/bin/nvcc; fi done -# Create build tree and build! =========================================== +# Create build tree and build +cmake -B "$build_dir" "$source_dir" $cmake_args $user_args +cmake --build "$build_dir" -- -j$num_jobs -# Create build tree -cmake -B "$build_dir" "$source_dir" \ - -DBUILD_TESTS=1 \ - $cmake_args \ - $user_args - -# Build CV-CUDA -cmake --build "$build_dir" --parallel 8 -- $MAKE_OPTS - -# Show ccache status, if available! 
-if [[ $has_ccache ]]; then - # Show build stats - CCACHE_STATSLOG=${ccache_stats} ccache --show-stats -V +# Show ccache status +if which ccache > /dev/null; then + ccache --show-stats -V fi diff --git a/ci/build_docs.sh b/ci/build_docs.sh index 1d4fbb378..41baeada7 100755 --- a/ci/build_docs.sh +++ b/ci/build_docs.sh @@ -25,4 +25,6 @@ if [[ $# -ge 1 ]]; then build_dir=$1 fi -./ci/build.sh $build_type $build_dir "-DBUILD_DOCS=ON -DBUILD_TESTS=OFF -DBUILD_PYTHON=ON -DPYTHON_VERSIONS=';3.9;3.10'" +# (warning): Use "$@" (with quotes) to prevent whitespace problems. +# shellcheck disable=SC2048 +./ci/build.sh $build_type $build_dir "-DBUILD_DOCS=ON -DBUILD_TESTS=OFF -DBUILD_PYTHON=ON" $* diff --git a/ci/build_samples.sh b/ci/build_samples.sh index 9b4fc80e8..27b5c383c 100755 --- a/ci/build_samples.sh +++ b/ci/build_samples.sh @@ -28,4 +28,4 @@ fi # (warning): Use "$@" (with quotes) to prevent whitespace problems. # shellcheck disable=SC2048 - ./ci/build.sh $build_type $build_dir "-DBUILD_SAMPLES=ON -DBUILD_TESTS=OFF -DBUILD_PYTHON=1" $* + ./ci/build.sh $build_type $build_dir "-DBUILD_SAMPLES=ON -DBUILD_TESTS=OFF -DBUILD_PYTHON=ON" $* diff --git a/cmake/BuildPython.cmake b/cmake/BuildPython.cmake index 38cbfb86e..cab2e7371 100644 --- a/cmake/BuildPython.cmake +++ b/cmake/BuildPython.cmake @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -58,10 +58,10 @@ endif() foreach(VER ${PYTHON_VERSIONS}) set(BASEDIR ${CMAKE_CURRENT_BINARY_DIR}/python${VER}) - ExternalProject_Add(nvcv_python${VER} + ExternalProject_Add(cvcuda_python${VER} PREFIX ${BASEDIR} SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/python - CMAKE_ARGS ${PYPROJ_COMMON_ARGS} -DPYTHON_VERSION=${VER} + CMAKE_ARGS ${PYPROJ_COMMON_ARGS} -DPYTHON_VERSION=${VER} -DBUILD_ROOT=${CMAKE_BINARY_DIR} -DPYTHON_VERSION_SHORT=${VER} BINARY_DIR ${BASEDIR}/build TMP_DIR ${BASEDIR}/tmp STAMP_DIR ${BASEDIR}/stamp @@ -70,3 +70,9 @@ foreach(VER ${PYTHON_VERSIONS}) INSTALL_COMMAND "" ) endforeach() + +if(CMAKE_BUILD_TYPE STREQUAL "Release") + foreach(PYTHON_VERSION ${PYTHON_VERSIONS}) + configure_file("${CMAKE_CURRENT_SOURCE_DIR}/python/setup.py.in" "${CMAKE_BINARY_DIR}/python${PYTHON_VERSION}/setup.py") + endforeach() +endif() diff --git a/cmake/ConfigCPack.cmake b/cmake/ConfigCPack.cmake index e5ba75c9b..e0bec6ada 100644 --- a/cmake/ConfigCPack.cmake +++ b/cmake/ConfigCPack.cmake @@ -22,8 +22,8 @@ else() endif() set(CPACK_PACKAGE_VENDOR "NVIDIA") -set(CPACK_PACKAGE_CONTACT "CV-CUDA Support ") -set(CPACK_PACKAGE_HOMEPAGE_URL "https://confluence.nvidia.com/display/CVCUDA") +set(CPACK_PACKAGE_CONTACT "https://github.com/CVCUDA/CV-CUDA/issues") +set(CPACK_PACKAGE_HOMEPAGE_URL "https://cvcuda.github.io") # ARCHIVE installer doesn't work with absolute install destination # we have to error out in this case diff --git a/cmake/ConfigCUDA.cmake b/cmake/ConfigCUDA.cmake index 8c64f160f..24bc2453c 100644 --- a/cmake/ConfigCUDA.cmake +++ b/cmake/ConfigCUDA.cmake @@ -21,10 +21,10 @@ list(GET CUDA_VERSION_LIST 2 CUDA_VERSION_PATCH) find_package(CUDAToolkit ${CUDA_VERSION_MAJOR}.${CUDA_VERSION_MINOR} REQUIRED) # CUDA version requirement: -# - to use gcc-11 (11.7) +# - to use gcc-9 (11.4) -if(CMAKE_CUDA_COMPILER_VERSION VERSION_LESS "11.7") - message(FATAL_ERROR "Minimum CUDA version supported is 
11.7") +if(CMAKE_CUDA_COMPILER_VERSION VERSION_LESS "11.4") + message(FATAL_ERROR "Minimum CUDA version supported is 11.4") endif() set(CMAKE_CUDA_STANDARD ${CMAKE_CXX_STANDARD}) @@ -38,6 +38,7 @@ if(NOT USE_CMAKE_CUDA_ARCHITECTURES) if(ENABLE_TEGRA) list(APPEND CMAKE_CUDA_ARCHITECTURES 72-real # Volta - gv11b/Tegra (Jetson AGX Xavier) + 86-real # Ampere - Jetson IGX Orin 87-real # Ampere - ga10b,ga10c/Tegra (Jetson AGX Orin) ) else() diff --git a/cmake/ConfigCompiler.cmake b/cmake/ConfigCompiler.cmake index 898a7ee83..b011ace1b 100644 --- a/cmake/ConfigCompiler.cmake +++ b/cmake/ConfigCompiler.cmake @@ -81,7 +81,7 @@ if(BUILD_TESTS) set(candidate_compilers ${PUBLIC_API_COMPILERS}) else() # If not, by default, we'll try these. - set(candidate_compilers gcc-11 gcc-9 gcc-8 clang-11 clang-14) + set(candidate_compilers gcc-11 gcc-9 clang-11 clang-14) endif() unset(valid_compilers) diff --git a/cmake/ConfigPython.cmake b/cmake/ConfigPython.cmake index ab627ed5a..83f65f8b0 100644 --- a/cmake/ConfigPython.cmake +++ b/cmake/ConfigPython.cmake @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -14,7 +14,7 @@ # limitations under the License. if(ENABLE_SANITIZERS) - message(FATAL_ERROR "NVCV python modules don't work on sanitized builds") + message(FATAL_ERROR "CV-CUDA python modules don't work on sanitized builds") endif() # Because we python as subproject, we need to create a fake Findnvcv.cmake so diff --git a/cmake/GetGitRevisionDescription.cmake b/cmake/GetGitRevisionDescription.cmake index 74839ab06..b18506492 100644 --- a/cmake/GetGitRevisionDescription.cmake +++ b/cmake/GetGitRevisionDescription.cmake @@ -177,7 +177,8 @@ endfunction() # without an express license agreement from NVIDIA CORPORATION or # its affiliates is strictly prohibited. -# Addition by rlima@nvidia.com +# Note: The function below is an addition to the original set of functions. + function(git_branch _var) if(NOT GIT_FOUND) find_package(Git QUIET) diff --git a/cmake/InstallNVCVDev.cmake b/cmake/InstallNVCVDev.cmake index 40dd24a32..5368d63e0 100644 --- a/cmake/InstallNVCVDev.cmake +++ b/cmake/InstallNVCVDev.cmake @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -19,7 +19,7 @@ set(CPACK_COMPONENT_DEV_DISPLAY_NAME "Development") set(CPACK_COMPONENT_DEV_DESCRIPTION "NVIDIA CV-CUDA C/C++ development library and headers") if(UNIX) - set(NVCV_DEV_FILE_NAME "nvcv-dev-${NVCV_VERSION_BUILD}") + set(NVCV_DEV_FILE_NAME "cvcuda-dev-${NVCV_VERSION_BUILD}") set(CPACK_DEBIAN_DEV_FILE_NAME "${NVCV_DEV_FILE_NAME}.deb") set(CPACK_ARCHIVE_DEV_FILE_NAME "${NVCV_DEV_FILE_NAME}") @@ -28,7 +28,7 @@ if(UNIX) # is the same set(CPACK_DEBIAN_DEV_PACKAGE_DEPENDS "${CPACK_DEBIAN_LIB_PACKAGE_NAME} (>= ${NVCV_VERSION_API})") - set(CPACK_DEBIAN_DEV_PACKAGE_NAME "${NVCV_PACKAGE_NAME}-dev") + set(CPACK_DEBIAN_DEV_PACKAGE_NAME "${CVCUDA_PACKAGE_NAME}-dev") # We're not adding compiler and cmake as dependencies, users can choose # whatever toolchain they want. 
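With this rename, the CPack development artifact now carries the cvcuda prefix: the package file becomes cvcuda-dev-${NVCV_VERSION_BUILD} and the Debian package name ${CVCUDA_PACKAGE_NAME}-dev, depending on the matching runtime library package (which receives the cvcuda-lib prefix in the InstallNVCVLib.cmake change below). A minimal local-install sketch, with illustrative file names since the real suffix comes from NVCV_VERSION_BUILD:

    sudo apt install ./cvcuda-lib-*.deb ./cvcuda-dev-*.deb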
diff --git a/cmake/InstallNVCVLib.cmake b/cmake/InstallNVCVLib.cmake index 4ddaa377f..7ef53adf3 100644 --- a/cmake/InstallNVCVLib.cmake +++ b/cmake/InstallNVCVLib.cmake @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -15,15 +15,14 @@ list(APPEND CPACK_COMPONENTS_ALL lib) set(CPACK_COMPONENT_LIB_DISPLAY_NAME "Runtime libraries") -set(CPACK_COMPONENT_LIB_DESCRIPTION "NVIDIA NVCV library") +set(CPACK_COMPONENT_LIB_DESCRIPTION "NVIDIA CV-CUDA library") set(CPACK_COMPONENT_LIB_REQUIRED true) -set(NVCV_PACKAGE_NAME "nvcv${NVCV_VERSION_MAJOR}") -set(NVCV_TYPES_PACKAGE_NAME "nvcv_types${NVCV_VERSION_MAJOR}") set(CVCUDA_PACKAGE_NAME "cvcuda${NVCV_VERSION_MAJOR}") +set(NVCV_TYPES_PACKAGE_NAME "nvcv_types${NVCV_VERSION_MAJOR}") if(UNIX) - set(NVCV_LIB_FILE_NAME "nvcv-lib-${NVCV_VERSION_BUILD}") + set(NVCV_LIB_FILE_NAME "cvcuda-lib-${NVCV_VERSION_BUILD}") set(CPACK_DEBIAN_LIB_FILE_NAME "${NVCV_LIB_FILE_NAME}.deb") set(CPACK_ARCHIVE_LIB_FILE_NAME "${NVCV_LIB_FILE_NAME}") @@ -36,7 +35,7 @@ if(UNIX) "${CMAKE_CURRENT_BINARY_DIR}/cpack/lib/prerm") # as per debian convention, use the library file name - set(CPACK_DEBIAN_LIB_PACKAGE_NAME "lib${NVCV_PACKAGE_NAME}") + set(CPACK_DEBIAN_LIB_PACKAGE_NAME "lib${CVCUDA_PACKAGE_NAME}") set(CPACK_DEBIAN_LIB_PACKAGE_DEPENDS "libstdc++6, libc6") diff --git a/cmake/InstallPython.cmake b/cmake/InstallPython.cmake index 2a8638f3f..977779fbc 100644 --- a/cmake/InstallPython.cmake +++ b/cmake/InstallPython.cmake @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -25,13 +25,13 @@ foreach(VER ${PYTHON_VERSIONS}) set(CPACK_COMPONENT_${PYTHON_MODULE_NAME}_DISABLED true) set(CPACK_COMPONENT_${PYTHON_MODULE_NAME}_DISPLAY_NAME "Python ${VER}") - set(CPACK_COMPONENT_${PYTHON_MODULE_NAME}_DESCRIPTION "NVIDIA NVCV python ${VER} bindings") + set(CPACK_COMPONENT_${PYTHON_MODULE_NAME}_DESCRIPTION "NVIDIA CV-CUDA python ${VER} bindings") set(CPACK_COMPONENT_${PYTHON_MODULE_NAME}_GROUP python) if(UNIX) set(CPACK_DEBIAN_${PYTHON_MODULE_NAME}_PACKAGE_NAME python${VER}-${CPACK_PACKAGE_NAME}) - set(NVCV_${PYTHON_MODULE_NAME}_FILE_NAME "nvcv-python${VER}-${NVCV_VERSION_BUILD}") + set(NVCV_${PYTHON_MODULE_NAME}_FILE_NAME "cvcuda-python${VER}-${NVCV_VERSION_BUILD}") set(CPACK_DEBIAN_${PYTHON_MODULE_NAME}_FILE_NAME "${NVCV_${PYTHON_MODULE_NAME}_FILE_NAME}.deb") set(CPACK_ARCHIVE_${PYTHON_MODULE_NAME}_FILE_NAME "${NVCV_${PYTHON_MODULE_NAME}_FILE_NAME}") @@ -50,19 +50,6 @@ foreach(VER ${PYTHON_VERSIONS}) install(CODE "include(\"${CMAKE_BINARY_DIR}/python${VER}/build/cmake_install.cmake\")" COMPONENT ${python_module_name}) - if(BUILD_TESTS) - set(CPACK_DEBIAN_TESTS_PACKAGE_DEPENDS - "${CPACK_DEBIAN_TESTS_PACKAGE_DEPENDS}, - ${CPACK_DEBIAN_${PYTHON_MODULE_NAME}_PACKAGE_NAME} (>= ${NVCV_VERSION_API})") - - # For some reason these are needed with python-3.7 - if(VER VERSION_EQUAL "3.7") - set(CPACK_DEBIAN_TESTS_PACKAGE_DEPENDS - "${CPACK_DEBIAN_TESTS_PACKAGE_DEPENDS} - , python3-typing-extensions") - endif() - endif() - list(APPEND CPACK_COMPONENTS_ALL ${python_module_name}) endforeach() diff --git a/cmake/InstallTests.cmake b/cmake/InstallTests.cmake index e896c7853..ff34de54a 100644 --- a/cmake/InstallTests.cmake +++ b/cmake/InstallTests.cmake @@ -24,7 +24,7 @@ if(UNIX) # Depend on current or any future ABI with same major version set(CPACK_DEBIAN_TESTS_PACKAGE_DEPENDS "${CPACK_DEBIAN_LIB_PACKAGE_NAME} (>= ${NVCV_VERSION_API})") # External dependencies - set(CPACK_DEBIAN_TESTS_PACKAGE_DEPENDS "${CPACK_DEBIAN_TESTS_PACKAGE_DEPENDS},libssl3") + set(CPACK_DEBIAN_TESTS_PACKAGE_DEPENDS "${CPACK_DEBIAN_TESTS_PACKAGE_DEPENDS},libssl3 | libssl1.1") set(CPACK_DEBIAN_TESTS_PACKAGE_NAME "cvcuda${PROJECT_VERSION_MAJOR}-tests") diff --git a/docker/build20.04/Dockerfile b/docker/build20.04/Dockerfile new file mode 100644 index 000000000..d689ce3ea --- /dev/null +++ b/docker/build20.04/Dockerfile @@ -0,0 +1,90 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Docker image used to build CV-CUDA on linux-x64 + +ARG VER_CUDA=? +ARG VER_UBUNTU=? + +FROM nvidia/cuda:$VER_CUDA-devel-ubuntu$VER_UBUNTU + +ARG DEBIAN_FRONTEND=noninteractive + +# Add so that tzdata don't ask for timezone info in a noninteractive installation. 
+RUN ln -sf /usr/share/zoneinfo/US/Pacific /etc/localtime + +# need to update and install in one go, or else installation might use +# stale data from server stored in docker cache, with packages that don't exist anymore. +RUN apt-get update \ + && apt-get install -y --no-install-recommends git git-lfs software-properties-common wget\ + && add-apt-repository ppa:ubuntu-toolchain-r/test \ + && apt-get update \ + && apt-get install -y --no-install-recommends gcc-11 g++-11 \ + && wget https://apt.llvm.org/llvm.sh \ + && chmod +x llvm.sh \ + && ./llvm.sh 11 && ./llvm.sh 14 && rm -f llvm.sh \ + && apt-get install -y --no-install-recommends ninja-build ccache libgtest-dev libgmock-dev shellcheck curl \ + && rm -rf /var/lib/apt/lists/* \ + && curl -L https://cmake.org/files/v3.20/cmake-3.20.1-linux-x86_64.tar.gz --output /tmp/cmake-3.20.1.tar.gz \ + && tar -xzf /tmp/cmake-3.20.1.tar.gz -C /tmp/ && cd /tmp/cmake-3.20.1-linux-x86_64/ \ + && cp bin/ share/ doc/ /usr/local/ -r && rm -rf /tmp/cmake-3.20.1* + +# Configure ccache +RUN mkdir -p /cache +COPY ccache.conf /etc/ccache.conf +ENV CCACHE_CONFIGPATH=/etc/ccache.conf +ENV PRE_COMMIT_HOME=/cache/pre-commit + +# Documentation ====================================== + +# Allow using this image in systems without proper CUDA runtime/driver support. +# We'll be using this image only for building, don't need strict CUDA checks. +ENV NVIDIA_DISABLE_REQUIRE=true + +RUN apt-get update \ + && apt-get install -y --no-install-recommends python3 python3-pip python3-pytest python3-dev doxygen \ + && rm -rf /var/lib/apt/lists/* + +# python3 is python3.8 in ubuntu20.04 +RUN python3 -m pip install pre-commit +# Needed for python documentation +RUN python3 -m pip install sphinx-rtd-theme sphinx==4.5.0 +RUN python3 -m pip install breathe exhale recommonmark graphviz +# Needed for python sphinx docs and Python wheels +RUN python3 -m pip install numpy==1.24.1 patchelf==0.17.2.1 + +# Python bindings ====================================== + +# Add deadsnakes apt repo to fetch older pythonv versions +ADD deadsnakes-ubuntu-ppa-focal.list /etc/apt/sources.list.d +RUN apt-key adv --keyserver keyserver.ubuntu.com --recv-keys BA6932366A755776 + +RUN for PYTHON_VERSION in 3.7 3.8 3.9 3.10 3.11; do \ + apt-get update \ + && apt-get install -y --no-install-recommends \ + python$PYTHON_VERSION-dev python$PYTHON_VERSION-distutils; \ + done && \ + rm -rf /var/lib/apt/lists/* + +# gcc-8 ====================================== +RUN apt-get update \ + && apt-get install -y --no-install-recommends \ + gcc-8 g++-8 \ + && rm -rf /var/lib/apt/lists/* + +# Needed for OpenSSL +RUN apt-get update \ + && apt-get install -y --no-install-recommends libssl-dev \ + && rm -rf /var/lib/apt/lists/* diff --git a/docker/build/ccache.conf b/docker/build20.04/ccache.conf similarity index 100% rename from docker/build/ccache.conf rename to docker/build20.04/ccache.conf diff --git a/docker/build20.04/deadsnakes-ubuntu-ppa-focal.list b/docker/build20.04/deadsnakes-ubuntu-ppa-focal.list new file mode 100644 index 000000000..b9cba6f58 --- /dev/null +++ b/docker/build20.04/deadsnakes-ubuntu-ppa-focal.list @@ -0,0 +1,17 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +deb https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu/ focal main +# deb-src https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu/ focal main diff --git a/docker/build/Dockerfile b/docker/build22.04/Dockerfile similarity index 74% rename from docker/build/Dockerfile rename to docker/build22.04/Dockerfile index 8fd5fb8ed..e974b5cd0 100644 --- a/docker/build/Dockerfile +++ b/docker/build22.04/Dockerfile @@ -20,21 +20,23 @@ ARG VER_UBUNTU=? FROM nvidia/cuda:$VER_CUDA-devel-ubuntu$VER_UBUNTU +ARG DEBIAN_FRONTEND=noninteractive + # need to update and install in one go, or else installation might use # stale data from server stored in docker cache, with packages that don't exist anymore. -RUN DEBIAN_FRONTEND="noninteractive" apt-get update \ +RUN apt-get update \ && apt-get install -y --no-install-recommends \ - git git-lfs \ - g++-11 \ - # need to also build with gcc-9.4.0, our minimum supported compiler for the library - gcc-9=9.4.0-5ubuntu1 cpp-9=9.4.0-5ubuntu1 gcc-9-base=9.4.0-5ubuntu1 libgcc-9-dev=9.4.0-5ubuntu1 libasan5=9.4.0-5ubuntu1 g++-9=9.4.0-5ubuntu1 libstdc++-9-dev=9.4.0-5ubuntu1 \ - # Compilers to which public headers must be compatible - clang-14 clang-11 \ - ninja-build \ - ccache \ - libgtest-dev libgmock-dev \ - pre-commit shellcheck \ - curl \ + git git-lfs \ + g++-11 \ + # need to also build with gcc-9.4.0, our minimum supported compiler for the library + gcc-9=9.4.0-5ubuntu1 cpp-9=9.4.0-5ubuntu1 gcc-9-base=9.4.0-5ubuntu1 libgcc-9-dev=9.4.0-5ubuntu1 libasan5=9.4.0-5ubuntu1 g++-9=9.4.0-5ubuntu1 libstdc++-9-dev=9.4.0-5ubuntu1 \ + # Compilers to which public headers must be compatible + clang-14 clang-11 \ + ninja-build \ + ccache \ + libgtest-dev libgmock-dev \ + pre-commit shellcheck \ + curl \ && rm -rf /var/lib/apt/lists/* \ && curl -L https://cmake.org/files/v3.20/cmake-3.20.1-linux-x86_64.tar.gz --output /tmp/cmake-3.20.1.tar.gz \ && tar -xzf /tmp/cmake-3.20.1.tar.gz -C /tmp/ && cd /tmp/cmake-3.20.1-linux-x86_64/ \ @@ -56,6 +58,8 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ python3 python3-pip python3-dev doxygen && rm -rf /var/lib/apt/lists/* RUN python3 -m pip install sphinx-rtd-theme sphinx==4.5.0 RUN python3 -m pip install breathe exhale recommonmark graphviz +# Needed for python sphinx docs and Python wheels +RUN python3 -m pip install numpy==1.24.1 patchelf==0.17.2.1 # Python bindings ====================================== @@ -66,12 +70,13 @@ RUN apt-key adv --keyserver keyserver.ubuntu.com --recv-keys BA6932366A755776 # Add so that tzdata don't ask for timezone info in a noninteractive installation. 
RUN ln -sf /usr/share/zoneinfo/US/Pacific /etc/localtime -RUN DEBIAN_FRONTEND="noninteractive" apt-get update \ +RUN apt-get update \ && apt-get install -y --no-install-recommends \ - python3.7-dev python3.7-distutils \ - python3.8-dev python3.8-distutils \ - python3.9-dev python3.9-distutils \ - python3.10-dev python3.10-distutils \ + python3.7-dev python3.7-distutils \ + python3.8-dev python3.8-distutils \ + python3.9-dev python3.9-distutils \ + python3.10-dev python3.10-distutils \ + python3.11-dev python3.11-distutils \ && rm -rf /var/lib/apt/lists/* # gcc-8 ====================================== @@ -88,15 +93,12 @@ RUN curl --fail-early -L \ -O http://mirrors.kernel.org/ubuntu/pool/main/i/isl/libisl22_0.22.1-1_amd64.deb RUN apt-get update && apt-get install -y --no-install-recommends \ - ./libmpx2_8.4.0-3ubuntu2_amd64.deb \ - ./cpp-8_8.4.0-3ubuntu2_amd64.deb \ - ./gcc-8-base_8.4.0-3ubuntu2_amd64.deb \ - ./libgcc-8-dev_8.4.0-3ubuntu2_amd64.deb \ - ./gcc-8_8.4.0-3ubuntu2_amd64.deb \ - ./g++-8_8.4.0-3ubuntu2_amd64.deb \ - ./libstdc++-8-dev_8.4.0-3ubuntu2_amd64.deb \ - ./libisl22_0.22.1-1_amd64.deb \ + ./libmpx2_8.4.0-3ubuntu2_amd64.deb \ + ./cpp-8_8.4.0-3ubuntu2_amd64.deb \ + ./gcc-8-base_8.4.0-3ubuntu2_amd64.deb \ + ./libgcc-8-dev_8.4.0-3ubuntu2_amd64.deb \ + ./gcc-8_8.4.0-3ubuntu2_amd64.deb \ + ./g++-8_8.4.0-3ubuntu2_amd64.deb \ + ./libstdc++-8-dev_8.4.0-3ubuntu2_amd64.deb \ + ./libisl22_0.22.1-1_amd64.deb \ && rm -rf /var/lib/apt/lists/* - -# Needed for python sphinx docs -RUN python3 -m pip install numpy==1.24.1 diff --git a/docker/build22.04/ccache.conf b/docker/build22.04/ccache.conf new file mode 100644 index 000000000..3ea1d6a35 --- /dev/null +++ b/docker/build22.04/ccache.conf @@ -0,0 +1,17 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +max_size = 20G +cache_dir = /cache/ccache diff --git a/docker/build/deadsnakes-ubuntu-ppa-jammy.list b/docker/build22.04/deadsnakes-ubuntu-ppa-jammy.list similarity index 100% rename from docker/build/deadsnakes-ubuntu-ppa-jammy.list rename to docker/build22.04/deadsnakes-ubuntu-ppa-jammy.list diff --git a/docker/config b/docker/config index 56cc639d5..aa84ebf0d 100644 --- a/docker/config +++ b/docker/config @@ -23,9 +23,9 @@ IMAGE_URL_BASE='' # change is done, such as removing some package, or updating # packaged versions that introduces incompatibilities. TAG_IMAGE=6 -TAG_IMAGE_SAMPLES=5.1 +TAG_IMAGE_SAMPLES=6.1 TAG_IMAGE_TEST=5 VER_CUDA=11.7.1 VER_UBUNTU=22.04 -VER_TRT=22.09 +VER_TRT=24.01 diff --git a/docker/devel/Dockerfile b/docker/devel20.04/Dockerfile similarity index 74% rename from docker/devel/Dockerfile rename to docker/devel20.04/Dockerfile index 245aa9f24..5d7fd499e 100644 --- a/docker/devel/Dockerfile +++ b/docker/devel20.04/Dockerfile @@ -20,18 +20,20 @@ ARG TAG_IMAGE=? 
FROM $BASE_IMAGE:$TAG_IMAGE +ARG DEBIAN_FRONTEND=noninteractive + # need to update and install in one go, or else installation might use # stale data from server stored in docker cache, with packages that don't exist anymore. # HACK: need to pass 'sudo' as a variable to workaround Dockerfile linter, it says # we shouldn't install sudo in a container. But we know what we're doing! -RUN HACK_SUDO=sudo && DEBIAN_FRONTEND="noninteractive" apt-get update \ +RUN HACK_SUDO=sudo && apt-get update \ && apt-get install -y --no-install-recommends \ - $HACK_SUDO \ - vim \ - gdb cgdb \ - less \ - wget curl \ + $HACK_SUDO \ + vim \ + gdb cgdb \ + less \ + wget curl \ && rm -rf /var/lib/apt/lists/* # Enable CUDA driver checks as this image will be used for running CUDA programs @@ -42,16 +44,20 @@ COPY vimrc /root/.vimrc COPY gdbinit /root/.gdbinit # For running tests inside dev container -RUN DEBIAN_FRONTEND="noninteractive" apt-get update \ +RUN apt-get update \ && apt-get install -y --no-install-recommends \ - python3-pytest \ - python3-pip \ + apt-utils fonts-dejavu \ && rm -rf /var/lib/apt/lists/* # needed by tests -RUN python3 -m pip install torch==1.13.0 torchvision cupy-cuda11x \ - && rm -rf /root/.cache/pip -RUN python3.9 -m pip install pytest torch==1.13.0 torchvision cupy-cuda11x \ - && rm -rf /root/.cache/pip +RUN for PYTHON_VERSION in 3.7 3.8 3.9 3.10 3.11; do \ + curl -sS https://bootstrap.pypa.io/get-pip.py | python$PYTHON_VERSION && \ + python$PYTHON_VERSION -m pip install --upgrade pip && \ + python$PYTHON_VERSION -m pip install --upgrade \ + pytest torch==1.13.0 numpy typing-extensions && \ + rm -rf /root/.cache/pip; \ + done + + WORKDIR /cvcuda diff --git a/docker/devel/gdbinit b/docker/devel20.04/gdbinit similarity index 100% rename from docker/devel/gdbinit rename to docker/devel20.04/gdbinit diff --git a/docker/devel/vimrc b/docker/devel20.04/vimrc similarity index 100% rename from docker/devel/vimrc rename to docker/devel20.04/vimrc diff --git a/docker/devel22.04/Dockerfile b/docker/devel22.04/Dockerfile new file mode 100644 index 000000000..55b652779 --- /dev/null +++ b/docker/devel22.04/Dockerfile @@ -0,0 +1,72 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Docker image used for development of CV-CUDA on linux-x64 + +ARG BASE_IMAGE=? +ARG TAG_IMAGE=? + +FROM $BASE_IMAGE:$TAG_IMAGE + +ARG DEBIAN_FRONTEND=noninteractive + +# need to update and install in one go, or else installation might use +# stale data from server stored in docker cache, with packages that don't exist anymore. + +# HACK: need to pass 'sudo' as a variable to workaround Dockerfile linter, it says +# we shouldn't install sudo in a container. But we know what we're doing! 
+RUN HACK_SUDO=sudo && apt-get update \ + && apt-get install -y --no-install-recommends \ + $HACK_SUDO \ + vim \ + gdb cgdb \ + less \ + wget curl \ + && rm -rf /var/lib/apt/lists/* + +# Enable CUDA driver checks as this image will be used for running CUDA programs +ENV NVIDIA_DISABLE_REQUIRE=false + +# Config files we use +COPY vimrc /root/.vimrc +COPY gdbinit /root/.gdbinit + +# For running tests inside dev container +RUN apt-get update \ + && apt-get install -y --no-install-recommends \ + apt-utils \ + python3-typing-extensions \ + python3-pytest \ + python3-pip \ + && rm -rf /var/lib/apt/lists/* + +# For running tests inside dev container +RUN apt-get update \ + && apt-get install -y --no-install-recommends \ + apt-utils fonts-dejavu \ + && rm -rf /var/lib/apt/lists/* + +# needed by tests (python3 is python3.10 in ubuntu22.04) +RUN python3 -m pip install torch==1.13.0 torchvision cupy-cuda11x \ + && rm -rf /root/.cache/pip +RUN python3.8 -m pip install torch==1.13.0 torchvision cupy-cuda11x \ + numpy sphinx-rtd-theme sphinx breathe exhale recommonmark graphviz \ + && rm -rf /root/.cache/pip +RUN python3.9 -m pip install pytest torch==1.13.0 torchvision cupy-cuda11x \ + && rm -rf /root/.cache/pip +RUN python3.11 -m pip install --upgrade pytest torch==1.13.0 cupy-cuda11x \ + && rm -rf /root/.cache/pip + +WORKDIR /cvcuda diff --git a/docker/devel22.04/gdbinit b/docker/devel22.04/gdbinit new file mode 100644 index 000000000..9ba78c2dc --- /dev/null +++ b/docker/devel22.04/gdbinit @@ -0,0 +1,19 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set disassembly-flavor intel +set print object on +set print vtbl on +set print pretty on diff --git a/docker/devel22.04/vimrc b/docker/devel22.04/vimrc new file mode 100644 index 000000000..59a3426ac --- /dev/null +++ b/docker/devel22.04/vimrc @@ -0,0 +1,23 @@ +" SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +" SPDX-License-Identifier: Apache-2.0 +" +" NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +" property and proprietary rights in and to this material, related +" documentation and any modifications thereto. Any use, reproduction, +" disclosure or distribution of this material and related documentation +" without an express license agreement from NVIDIA CORPORATION or +" its affiliates is strictly prohibited. 
+ +set nocompatible +set backspace=2 +set autoindent +set softtabstop=4 +set number +set shiftwidth=4 +set expandtab +set autowrite +set ruler +set makeprg=ninja +set fileencodings=ucs-bom,utf-8,latin1 +syntax on +color ron diff --git a/docker/env_devel_linux.sh b/docker/env_devel_linux.sh index 8bafb3110..0c16ee742 100755 --- a/docker/env_devel_linux.sh +++ b/docker/env_devel_linux.sh @@ -52,7 +52,7 @@ else echo "Git user.name and user.email not set up" echo "Please run:" echo " git config --global user.name 'Your Name'" - echo " git config --global user.email 'your_nvlogin@nvidia.com'" + echo " git config --global user.email 'Your Email'" exit 1 fi @@ -70,5 +70,5 @@ docker run --gpus=all --net=host --pull always -ti \ -v /var/tmp:/var/tmp \ -v $SDIR/..:$HOME/cvcuda \ $extra_args \ - $IMAGE_URL_BASE/devel-linux:$TAG_IMAGE \ + $IMAGE_URL_BASE/devel-linux:$VER_UBUNTU-$VER_CUDA \ /usr/bin/bash -c "mkdir -p $HOME && chown $USER:$USER $HOME && su - $USER -c \"$extra_cmds\" && su - $USER" diff --git a/docker/samples/Dockerfile b/docker/samples/Dockerfile index 0adeb99da..0a8e70f65 100644 --- a/docker/samples/Dockerfile +++ b/docker/samples/Dockerfile @@ -26,4 +26,5 @@ FROM nvcr.io/nvidia/tensorrt:$VER_TRT-py3 # by default. It is copied by the update_samples_image.sh script. Always # use update_samples_image.sh script to build any samples docker image. COPY install_dependencies.sh /workspace/ +COPY requirements.txt /workspace/ RUN sh /workspace/install_dependencies.sh diff --git a/docker/test20.04/Dockerfile b/docker/test20.04/Dockerfile new file mode 100644 index 000000000..edd979a1b --- /dev/null +++ b/docker/test20.04/Dockerfile @@ -0,0 +1,59 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Docker image used to test CV-CUDA on linux-x64 + +ARG VER_CUDA=? +ARG VER_UBUNTU=? + +FROM nvidia/cuda:$VER_CUDA-runtime-ubuntu$VER_UBUNTU + +ARG VER_CUDA=? + +# For testing python bindings ====================================== + +ARG DEBIAN_FRONTEND=noninteractive + +# Add deadsnakes apt repo to fetch older python versions +ADD deadsnakes-ubuntu-ppa-focal.list /etc/apt/sources.list.d +RUN apt-key adv --keyserver keyserver.ubuntu.com --recv-keys BA6932366A755776 + +# Add so that tzdata don't ask for timezone info in a noninteractive installation. 
+RUN ln -sf /usr/share/zoneinfo/US/Pacific /etc/localtime + +# For running python tests +RUN apt-get update \ + && apt-get install -y --no-install-recommends \ + python3.7 python3.7-distutils \ + python3.8 python3.8-distutils \ + python3.9 python3.9-distutils \ + python3.10 python3.10-distutils \ + python3.11 python3.11-distutils \ + python3-pip \ + && rm -rf /var/lib/apt/lists/* + +# It needs torch +RUN set -e \ + && for ver in 3.7 3.8 3.9 3.10 3.11; do \ + python$ver -m pip install torch numpy torchvision; \ + done \ + && rm -rf /root/.cache/pip + +# Other dependencies of python tests +# binutils: for readelf +RUN apt-get update \ + && apt-get install -y --no-install-recommends \ + binutils \ + && rm -rf /var/lib/apt/lists/* diff --git a/docker/test20.04/deadsnakes-ubuntu-ppa-focal.list b/docker/test20.04/deadsnakes-ubuntu-ppa-focal.list new file mode 100644 index 000000000..b9cba6f58 --- /dev/null +++ b/docker/test20.04/deadsnakes-ubuntu-ppa-focal.list @@ -0,0 +1,17 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +deb https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu/ focal main +# deb-src https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu/ focal main diff --git a/docker/test/Dockerfile b/docker/test22.04/Dockerfile similarity index 79% rename from docker/test/Dockerfile rename to docker/test22.04/Dockerfile index 0c55129e4..63b0d4a08 100644 --- a/docker/test/Dockerfile +++ b/docker/test22.04/Dockerfile @@ -24,6 +24,8 @@ ARG VER_CUDA=? 
# For testing python bindings ====================================== +ARG DEBIAN_FRONTEND=noninteractive + # Add deadsnakes apt repo to fetch older python versions ADD deadsnakes-ubuntu-ppa-jammy.list /etc/apt/sources.list.d RUN apt-key adv --keyserver keyserver.ubuntu.com --recv-keys BA6932366A755776 @@ -32,25 +34,26 @@ RUN apt-key adv --keyserver keyserver.ubuntu.com --recv-keys BA6932366A755776 RUN ln -sf /usr/share/zoneinfo/US/Pacific /etc/localtime # For running python tests -RUN DEBIAN_FRONTEND="noninteractive" apt-get update \ +RUN apt-get update \ && apt-get install -y --no-install-recommends \ - python3.7 python3.7-distutils \ - python3.8 python3.8-distutils \ - python3.9 python3.9-distutils \ - python3.10 \ - python3-pip \ + python3.7 python3.7-distutils \ + python3.8 python3.8-distutils \ + python3.9 python3.9-distutils \ + python3.10 python3.10-distutils \ + python3.11 python3.11-distutils \ + python3-pip \ && rm -rf /var/lib/apt/lists/* # It needs torch RUN set -e \ - && for ver in 3.7 3.8 3.9 3.10; do \ - python$ver -m pip install torch numpy torchvision; \ - done \ + && for ver in 3.7 3.8 3.9 3.10 3.11; do \ + python$ver -m pip install torch numpy torchvision; \ + done \ && rm -rf /root/.cache/pip # Other dependencies of python tests # binutils: for readelf -RUN DEBIAN_FRONTEND="noninteractive" apt-get update \ +RUN apt-get update \ && apt-get install -y --no-install-recommends \ - binutils \ + binutils \ && rm -rf /var/lib/apt/lists/* diff --git a/docker/test/deadsnakes-ubuntu-ppa-jammy.list b/docker/test22.04/deadsnakes-ubuntu-ppa-jammy.list similarity index 100% rename from docker/test/deadsnakes-ubuntu-ppa-jammy.list rename to docker/test22.04/deadsnakes-ubuntu-ppa-jammy.list diff --git a/docker/update_build_image.sh b/docker/update_build_image.sh index cd41dd057..32e4eab3f 100755 --- a/docker/update_build_image.sh +++ b/docker/update_build_image.sh @@ -35,9 +35,9 @@ cd "$SDIR" # load up configuration variables . ./config -cd build +cd build$VER_UBUNTU -image=$IMAGE_URL_BASE/build-linux:$TAG_IMAGE +image=$IMAGE_URL_BASE/build-linux:$VER_UBUNTU-$VER_CUDA docker build --network=host \ --build-arg "VER_CUDA=$VER_CUDA" \ diff --git a/docker/update_devel_image.sh b/docker/update_devel_image.sh index 13da128c3..aa7504149 100755 --- a/docker/update_devel_image.sh +++ b/docker/update_devel_image.sh @@ -33,13 +33,13 @@ cd "$SDIR" # load up configuration variables . ./config -cd devel +cd devel$VER_UBUNTU -image=$IMAGE_URL_BASE/devel-linux:$TAG_IMAGE +image=$IMAGE_URL_BASE/devel-linux:$VER_UBUNTU-$VER_CUDA docker build --network=host \ --build-arg BASE_IMAGE=$IMAGE_URL_BASE/build-linux \ - --build-arg TAG_IMAGE=$TAG_IMAGE \ + --build-arg TAG_IMAGE=$VER_UBUNTU-$VER_CUDA \ . -t $image if [[ $do_push == 1 ]]; then diff --git a/docker/update_samples_image.sh b/docker/update_samples_image.sh index 36d8208c3..6dbfc907b 100755 --- a/docker/update_samples_image.sh +++ b/docker/update_samples_image.sh @@ -32,6 +32,7 @@ cd "$SDIR" # Copy install_dependencies script from the samples folder to the samples' docker folder # so that it can be added and used inside the image. cp $SDIR/../samples/scripts/install_dependencies.sh $SDIR/samples/ +cp $SDIR/../samples/scripts/requirements.txt $SDIR/samples/ # load up configuration variables . ./config diff --git a/docker/update_test_image.sh b/docker/update_test_image.sh index d0cc51993..c69598d32 100755 --- a/docker/update_test_image.sh +++ b/docker/update_test_image.sh @@ -33,9 +33,9 @@ cd "$SDIR" # load up configuration variables . 
./config -cd test +cd test$VER_UBUNTU -image=$IMAGE_URL_BASE/test-linux-x64:$TAG_IMAGE_TEST +image=$IMAGE_URL_BASE/test-linux-x64:$VER_UBUNTU-$VER_CUDA docker build --network=host \ --build-arg "VER_CUDA=$VER_CUDA" \ diff --git a/docs/CMakeLists.txt b/docs/CMakeLists.txt index 6cc8343d1..5ad6c8902 100644 --- a/docs/CMakeLists.txt +++ b/docs/CMakeLists.txt @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -41,7 +41,7 @@ add_custom_command(OUTPUT ${DOXYGEN_INDEX_FILE} COMMAND ${DOXYGEN_EXECUTABLE} ${DOXYFILE_OUT} MAIN_DEPENDENCY ${DOXYFILE_OUT} ${DOXYFILE_IN} COMMENT "Generating doxygen xml" - DEPENDS nvcv_python${VER}) + DEPENDS cvcuda_python${VER}) add_custom_target(cvcuda_doxygen ALL DEPENDS ${DOXYGEN_INDEX_FILE}) diff --git a/docs/sphinx/content/cvcuda_oplist.csv b/docs/sphinx/content/cvcuda_oplist.csv index 53ea4e8e7..bc4aecd54 100644 --- a/docs/sphinx/content/cvcuda_oplist.csv +++ b/docs/sphinx/content/cvcuda_oplist.csv @@ -16,7 +16,7 @@ CustomCrop,Crops an image with a given region-of-interest CvtColor,Converts an image from one color space to another DataTypeConvert,Converts an image’s data type with optional scaling Erase,Erases image regions -FindContours,Extract closed contours from an input binary image +Find Contours,Extract closed contours from an input binary image FindHomography,Calculates a perspective transform from four pairs of the corresponding points Flip,Flips a 2D image around its axis GammaContrast,Adjusts image contrast @@ -24,6 +24,7 @@ Gaussian,Applies a gaussian blur filter to the image Gaussian Noise,Generates a statistical noise with a normal (Gaussian) distribution Histogram,Provides a grayscale value distribution showing the frequency of occurrence of each gray value. Histogram Equalizer,Allows effective spreading out the intensity range of the image typically used to improve contrast +HqResize,"Performs advanced resizing supporting 2D and 3D data, tensors, tensor batches, and varshape image batches (2D only). Supports nearest neighbor, linear, cubic, Gaussian and Lanczos interpolation, with optional antialiasing when down-sampling." Inpainting,Performs inpainting by replacing a pixel by normalized weighted sum of all the known pixels in the neighborhood Joint Bilateral Filter,Reduces image noise while preserving strong edges based on a guidance image Label,Labels connected regions in an image using 4-way connectivity for foreground and 8-way for background pixels @@ -38,7 +39,7 @@ Non-max Suppression,Enables selecting a single entity out of many overlapping on Normalize,Normalizes an image pixel’s range OSD (Polyline Line Text Rotated Rect Segmented Mask),Displays an overlay on the image of of different forms including polyline line text rotated rectangle segmented mask PadStack,Stacks several images into a tensor with border extension -PairwiseMatcher,Matches features computed separately (e.g. via the SIFT operator) in two images using the brute force method +PairwiseMatcher,"Matches features computed separately (e.g. via the SIFT operator) in two images, e.g. using the brute force method" PillowResize,Changes the size and scale of an image using python-pillow algorithm RandomResizedCrop,Crops a random portion of an image and resizes it to a specified size.
Reformat,Converts a planar image into non-planar and vice versa diff --git a/docs/sphinx/index.rst b/docs/sphinx/index.rst index ec3667c61..890d44262 100644 --- a/docs/sphinx/index.rst +++ b/docs/sphinx/index.rst @@ -50,7 +50,7 @@ CV-CUDA offers more than 20 Computer Vision and Image Processing operators. Find Where Are the Release Notes? ------------------ -An awesome product requires excellent support. CV-CUDA release notes can be found `here `_. +An awesome product requires excellent support. CV-CUDA release notes can be found `here `_. Where Can I Get Help? @@ -124,6 +124,7 @@ Copyright :maxdepth: 1 :hidden: + Beta.4 Beta.3 Beta.2 Beta.1 diff --git a/docs/sphinx/installation.rst b/docs/sphinx/installation.rst index c37fd42f2..5e213d536 100644 --- a/docs/sphinx/installation.rst +++ b/docs/sphinx/installation.rst @@ -1,5 +1,5 @@ .. - # SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + # SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -22,7 +22,7 @@ Installation Pre-requisites -------------- -This section describes the recommended dependencies to compile cvcuda +This section describes the recommended dependencies to install CV-CUDA. * Ubuntu >= 20.04 * CUDA driver >= 11.7 @@ -30,65 +30,67 @@ This section describes the recommended dependencies to compile cvcuda Setup ----- -The following steps describe how to install cvcuda. Choose the installation method that meets your environment needs. - -Download the cvcuda tar/deb package from `here `_ +The following steps describe how to install CV-CUDA. Choose the installation method that meets your environment needs. +You can download the CV-CUDA tar, deb or wheel packages from `here `_ * Tar File Installation -Navigate to your directory containing the cvcuda tar file. + Unzip the cvcuda runtime package: :: + + tar -xvf cvcuda-lib-x.x.x-cuda11-x86_64-linux.tar.xz + + Unzip the cvcuda developer package: :: + + tar -xvf cvcuda-dev-x.x.x-cuda11-x86_64-linux.tar.xz -Unzip the cvcuda runtime package: :: + Unzip the cvcuda python package: :: - tar -xvf nvcv-lib-x.x.x-cuda11-x86_64-linux.tar.xz + tar -xvf cvcuda-python3.*-x.x.x-cuda11-x86_64-linux.tar.xz -Unzip the cvcuda developer package: :: + [Optional] Unzip the tests. :: - tar -xvf nvcv-dev-x.x.x-cuda11-x86_64-linux.tar.xz + tar -xvf cvcuda-tests-cuda11-x86_64-linux.tar.xz -Unzip the cvcuda python package: :: - tar -xvf nvcv-python3.*-x.x.x-cuda11-x86_64-linux.tar.xz +* Debian Installation -Optionally Unzip the tests. :: + Install the runtime library. :: - tar -xvf cvcuda-tests-cuda11-x86_64-linux.tar.xz + dpkg -i cvcuda-lib-x.x.x-cuda11-x86_64-linux.deb -Optionally Unzip the tests. :: + Install the developer library. :: - tar -xvf cvcuda-tests-cuda11-x86_64-linux.tar.xz + dpkg -i cvcuda-dev-x.x.x-cuda11-x86_64-linux.deb -* Debian Local Installation + Install the python bindings :: -Navigate to your directory containing the cvcuda Debian local installer file. :: + dpkg -i cvcuda-python3.*-x.x.x-cuda11-x86_64-linux.deb -Install the runtime library. :: + [Optional] Install the tests. :: - sudo dpkg -i nvcv-lib-x.x.x-cuda11-x86_64-linux.deb + sudo dpkg -i cvcuda-tests-x.x.x-cuda11-x86_64-linux.deb -Install the developer library. 
:: - sudo dpkg -i nvcv-dev-x.x.x-cuda11-x86_64-linux.deb +* Python Wheel File Installation -Install the python bindings :: + Download the appropriate .whl file for your computer architecture, Python and CUDA version from `here `_ - sudo dpkg -i nvcv-python3.*-x.x.x-cuda11-x86_64-linux.deb + Execute the following command to install the appropriate CV-CUDA Python wheel :: -Optionally install the tests. :: + pip install cvcuda_<cu_ver>-0.6.0b0-cp<py_ver>-cp<py_ver>-linux_<arch>.whl - sudo dpkg -i cvcuda-tests-x.x.x-cuda11-x86_64-linux.deb + where <cu_ver> is the desired CUDA version, <py_ver> the desired Python version and <arch> the desired architecture. -Optionally install the samples. :: + Please note that the Python wheels provided are standalone; they include both the C++/CUDA libraries and the Python bindings. - sudo dpkg -i cvcuda-samples-x.x.x-cuda11-x86_64-linux.deb -* Verifying the Installation on Linux +* Verifying the Debian or TAR installation on Linux -To verify that cvcuda is installed and is running properly, run the tests from the install folder for tests. -Default installation path is /opt/nvidia/cvcuda0/bin. :: + To verify that CV-CUDA is installed and is running properly, run the tests from the install folder for tests. + Default installation path is /opt/nvidia/cvcuda0/bin. :: - cd /opt/nvidia/cvcuda0/bin - ./run_tests.sh + cd /opt/nvidia/cvcuda0/bin + ./run_tests.sh If CV-CUDA is properly installed and running on your Linux system, all tests will pass. diff --git a/docs/sphinx/relnotes/v0.5.0-beta.rst b/docs/sphinx/relnotes/v0.5.0-beta.rst index f79f78303..bd3633197 100644 --- a/docs/sphinx/relnotes/v0.5.0-beta.rst +++ b/docs/sphinx/relnotes/v0.5.0-beta.rst @@ -19,47 +19,42 @@ Beta.3 ====== -CV-CUDA 0.5.0 is a major release of the library providing multiple new operators, features, and fixes to multiple customer-reported issues. +CV-CUDA 0.5.0 is a comprehensive update introducing new security, compliance, and performance enhancements, alongside bug fixes and new features. Release Highlights ------------------ -CV-CUDA v0.5.0 includes the following key changes: +CV-CUDA v0.5.0 includes significant improvements: * **New Operators**: - * FindHomography: Calculates a perspective transform from four pairs of the corresponding points - * Label: Labels connected regions in an image using 4-way connectivity for foreground and 8-way for background pixels - * PairwiseMatcher: Matches features computed separately (e.g. via the SIFT operator) in two images using the brute force method - * Stack: Concatenates two input tensors into a single output tensor + - FindHomography: Calculates a perspective transform from four pairs of the corresponding points + - Label: Labels connected regions in an image using 4-way connectivity for foreground and 8-way for background pixels + - PairwiseMatcher: Matches features computed separately (e.g.
via the SIFT operator) in two images using the brute force method * **New Features**: - * Added `TensorBatch` in C++ and Python, a container type that can hold a list of non-uniformly shaped tensors - * Added `Workspace` in C++ and Python, an abstraction of memory and asynchronous resources for CV-CUDA operators - * Added better color format support in nvcv_types - * New sample application for the `Label` operator - * JetPack 5.1.2 support for L4T (Jetson Orin, L4T 35.4.1, CUDA 11.4) - * Enhanced documentation + - Implemented Python class for `TensorBatch``, a container type that can hold a list of non-uniformly shaped tensors + - Added support for RGBD image formats + - Enhanced documentation * **Bug Fixes**: - * Resolved memory leak in `NvBlurBoxes` - * Fixed segmentation fault issue in Python with certain imports - * Corrected `typestr` format issue in `__cuda_array_interface__` - * Addressed occasional hanging in `OpBoxBlur` on RGBA images + - Resolved memory leak in NvBlurBoxes + - Fixed segmentation fault issue in Python with certain imports + - Corrected typestr format issue in `__cuda_array_interface__` + - Addressed occasional hanging in OpBoxBlur on RGBA images Compatibility ------------- -* GPU Compute Capability: 7+.x -* Ubuntu x86_64: 20.04, 22.04 +* Continues to support GPU Compute Capability: 7+.x +* Compatible with Ubuntu x86_64: 20.04, 22.04 * CUDA Toolkit: 11.7+ (11.2+ for library build and run) -* L4T: 35.4.1, JetPack 5.1.2 aarch64 -* GCC: 11.0+ (9.x and 10.x for APIs with pre-built binary) -* Python: 3.8, 3.10 +* GCC: 11.0+ (9.0 and 10.0 for APIs, with pre-built binary and run) +* Python: 3.7, 3.8, 3.10 Known Issues/Limitations ------------------------ -* For GCC versions lower than 11.0, C++17 support needs to be enabled when compiling CV-CUDA. +* The release notes do not specify new known issues or limitations for this version. License ------- diff --git a/docs/sphinx/relnotes/v0.6.0-beta.rst b/docs/sphinx/relnotes/v0.6.0-beta.rst new file mode 100644 index 000000000..ca0995a67 --- /dev/null +++ b/docs/sphinx/relnotes/v0.6.0-beta.rst @@ -0,0 +1,78 @@ +.. + # SPDX-FileCopyrightText: Copyright (c) 2023-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + # SPDX-License-Identifier: Apache-2.0 + # + # Licensed under the Apache License, Version 2.0 (the "License"); + # you may not use this file except in compliance with the License. + # You may obtain a copy of the License at + # + # http://www.apache.org/licenses/LICENSE-2.0 + # + # Unless required by applicable law or agreed to in writing, software + # distributed under the License is distributed on an "AS IS" BASIS, + # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + # See the License for the specific language governing permissions and + # limitations under the License. + +.. _v0.6.0-beta: + +Beta.4 +====== + +CV-CUDA 0.6.0 is a comprehensive update introducing new packaging and documentation enhancements, along with bug fixes and new features. + +Release Highlights +------------------ + +CV-CUDA v0.6.0 includes significant improvements: + +* **New Operator**: + + * HQResize: Advanced resize operator supporting 2D and 3D data, tensors, tensor batches, and varshape image batches (2D only). Supports nearest neighbor, linear, cubic, Gaussian and Lanczos interpolation, with optional antialiasing when down-sampling. + +* **New Features**: + + * Standalone Python Wheels, including tooling and documentation to generate them. Prebuilt binaries for selected configurations. 
+ + * Homogenized package naming + + * Improved documentation of hardware/software compatibility, build and test tutorials + + * Added Python Operator benchmarking application + + * Samples updated to new codec libraries, PyNvVideoCodec and NvImageCodec + + * Support of rank 2 tensors in MedianBlur + + * Additional tests for various operators + +* **Bug Fixes**: + + * Fix name clashes with NVTX + + * Fix workspace memory allocation of complex filters + + * Fix memory fault in MinAreaRect + +Compatibility and Known Limitations +----------------------------------- + +See main README on `CV-CUDA GitHub `_. + +License +------- + +CV-CUDA is licensed under the `Apache 2.0 `_ license. + +Resources +--------- + +1. `CV-CUDA GitHub `_ +2. `CV-CUDA Increasing Throughput and Reducing Costs for AI-Based Computer Vision with CV-CUDA `_ +3. `NVIDIA Announces Microsoft, Tencent, Baidu Adopting CV-CUDA for Computer Vision AI `_ +4. `CV-CUDA helps Tencent Cloud audio and video PaaS platform achieve full-process GPU acceleration for video enhancement AI `_ + +Acknowledgements +---------------- + +CV-CUDA is developed jointly by NVIDIA and the ByteDance Machine Learning team. diff --git a/docs/sphinx/samples/cpp_samples/cropresize.rst b/docs/sphinx/samples/cpp_samples/cropresize.rst index 0ba4fd645..671733dae 100644 --- a/docs/sphinx/samples/cpp_samples/cropresize.rst +++ b/docs/sphinx/samples/cpp_samples/cropresize.rst @@ -126,7 +126,7 @@ To run the sample .. code-block:: bash - ./build/nvcv_samples_cropandresize -i -b + ./build/cvcuda_sample_cropandresize -i -b Sample Output ------------- diff --git a/docs/sphinx/samples/python_samples/classification.rst b/docs/sphinx/samples/python_samples/classification.rst index 119611627..16c23cb48 100644 --- a/docs/sphinx/samples/python_samples/classification.rst +++ b/docs/sphinx/samples/python_samples/classification.rst @@ -1,5 +1,5 @@ .. - # SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + # SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -36,11 +36,11 @@ Writing the Sample App The classification sample app has been designed to be modular in all aspects. It imports and uses various modules such as data decoders, pipeline pre and post processors and the model inference. Some of these modules are defined in the same folder as the sample whereas the rest are defined in the common scripts folder for a wider re-use. -1. Modules used by this sample app that are defined in the common folder (i.e. not specific just to this sample) are the ``ImageBatchDecoderPyTorch`` for PyTorch based image decoding and ``VideoBatchDecoderVPF`` for VPF based video decoding. +1. Modules used by this sample app that are defined in the common folder (i.e. not specific just to this sample) are the ``ImageBatchDecoder`` for nvImageCodec based image decoding and ``VideoBatchDecoder`` for PyNvVideoCodec based video decoding. 2. Modules specific to this sample (i.e. defined in the classification sample folder) are ``PreprocessorCvcuda`` and ``PostprocessorCvcuda`` for CVCUDA based pre and post processing pipelines and ``ClassificationPyTorch`` and ``ClassificationTensorRT`` for the model inference. -The first stage in our pipeline is importing all necessary python modules. 
Apart from the modules described above, this also includes modules such as torch and torchvision, torchnvjpeg, vpf and the main package of CVCUDA (i.e. nvcv) among others. Be sure to import ``pycuda.driver`` before importing any other GPU packages like torch or cvcuda to ensure a proper initialization. +The first stage in our pipeline is importing all necessary python modules. Apart from the modules described above, this also includes modules such as torch and torchvision, torchnvjpeg, vpf and the main package of CVCUDA among others. Be sure to import ``pycuda.driver`` before importing any other GPU packages like torch or cvcuda to ensure a proper initialization. .. literalinclude:: ../../../../samples/classification/python/main.py :language: python @@ -83,8 +83,8 @@ The ``run_sample`` function is the primary function that runs this sample. It se Next, we instantiate various classes to help us run the sample. These classes are: 1. ``PreprocessorCvcuda`` : A CVCUDA based pre-processing pipeline for classification. -2. ``ImageBatchDecoderPyTorch`` : A PyTorch based image decoder to read the images. -3. ``VideoBatchDecoderVPF`` : A VPF based video decoder to read the video. +2. ``ImageBatchDecoder`` : A nvImageCodec based image decoder to read the images. +3. ``VideoBatchDecoder`` : A PyNvVideoCodec based video decoder to read the video. 4. ``PostprocessorCvcuda`` : A post-processing pipeline for classification. 5. ``classificationPyTorch`` : A PyTorch based classification model to execute inference. 6. ``classificationTensorRT`` : A TensorRT based classification model to execute inference. @@ -120,8 +120,8 @@ That's it for the classification sample. To understand more about how each stage PreprocessorCvcuda PostprocessorCvcuda - ImageBatchDecoderPyTorch - VideoBatchDecoderVPF + ImageBatchDecoder + VideoBatchDecoder ClassificationPyTorch ClassificationTensorRT @@ -171,7 +171,7 @@ The top 5 classification results for the tabby_cat_tiger.jpg image is as follows user@machine:~/cvcuda/samples$ python3 classification/python/main.py -b 1 [perf_utils:85] 2023-07-27 22:27:17 WARNING perf_utils is used without benchmark.py. Benchmarking mode is turned off. - [perf_utils:89] 2023-07-27 22:27:17 INFO Using CV-CUDA version: 0.5.0-beta + [perf_utils:89] 2023-07-27 22:27:17 INFO Using CV-CUDA version: 0.6.0-beta [pipelines:35] 2023-07-27 22:27:17 INFO Using CVCUDA as preprocessor. [torch_utils:77] 2023-07-27 22:27:17 INFO Using torchnvjpeg as decoder. [pipelines:122] 2023-07-27 22:27:17 INFO Using CVCUDA as post-processor. diff --git a/docs/sphinx/samples/python_samples/commons/imagebatchdecoder_pytorch.rst b/docs/sphinx/samples/python_samples/commons/imagebatchdecoder_nvcodec.rst similarity index 52% rename from docs/sphinx/samples/python_samples/commons/imagebatchdecoder_pytorch.rst rename to docs/sphinx/samples/python_samples/commons/imagebatchdecoder_nvcodec.rst index 93ed4ad0b..edb14806e 100644 --- a/docs/sphinx/samples/python_samples/commons/imagebatchdecoder_pytorch.rst +++ b/docs/sphinx/samples/python_samples/commons/imagebatchdecoder_nvcodec.rst @@ -1,5 +1,5 @@ .. - # SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + # SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -14,56 +14,56 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-.. _imagebatchdecoder_pytorch: +.. _imagebatchdecoder_nvcodec: -Image Decoding using PyTorch +Image Decoding using nvImageCodec ==================== -The image batch decoder is responsible for parsing the input expression, reading and decoding image data. The actual decoding is done in batches using the library ``torchnvjpeg``. Although used in the semantic segmentation sample, this image decoder is generic enough to be used in other applications. The code associated with this class can be found in the ``samples/common/python/torch_utils.py`` file. +The image batch decoder is responsible for parsing the input expression, reading and decoding image data. The actual decoding is done in batches using the library `nvImageCodec `_. Although used in the semantic segmentation sample, this image decoder is generic enough to be used in other applications. The code associated with this class can be found in the ``samples/common/python/nvcodec_utils.py`` file. Before the data can be read or decoded, we must parse it (i.e figure out what kind of data it is). Depending on the ``input_path``'s value, we either read one image and create a dummy list with the data from the same image to simulate a batch or read a bunch of images from a directory. -.. literalinclude:: ../../../../../samples/common/python/torch_utils.py +.. literalinclude:: ../../../../../samples/common/python/nvcodec_utils.py :language: python :linenos: - :start-after: begin_init_imagebatchdecoder_pytorch - :end-before: end_parse_imagebatchdecoder_pytorch + :start-after: begin_init_imagebatchdecoder_nvimagecodec + :end-before: end_parse_imagebatchdecoder_nvimagecodec :dedent: Once we have a list of image file names that we can read, we will split them into batches based on the batch size. -.. literalinclude:: ../../../../../samples/common/python/torch_utils.py +.. literalinclude:: ../../../../../samples/common/python/nvcodec_utils.py :language: python :linenos: - :start-after: begin_batch_imagebatchdecoder_pytorch - :end-before: end_init_imagebatchdecoder_pytorch + :start-after: begin_batch_imagebatchdecoder_nvimagecodec + :end-before: end_init_imagebatchdecoder_nvimagecodec :dedent: That is all we need to do for the initialization. Now as soon as a call to decoder is issued, we would start reading and decoding the data. This begins with reading the data bytes in batches and returning None if there is no data left to be read. -.. literalinclude:: ../../../../../samples/common/python/torch_utils.py +.. literalinclude:: ../../../../../samples/common/python/nvcodec_utils.py :language: python :linenos: - :start-after: begin_call_imagebatchdecoder_pytorch - :end-before: end_read_imagebatchdecoder_pytorch + :start-after: begin_call_imagebatchdecoder_nvimagecodec + :end-before: end_read_imagebatchdecoder_nvimagecodec :dedent: -Once the data has been read, we use ``torchnvjpeg`` to decode it into a list of image tensors. The torchnvjpeg instance is allocated either on its first use or whenever there is a change in the batch size (i.e. last batch). Since what we get at this point is a list of images (i.e a python list of 3D tensors), we would need to convert them to a 4D tensor by stacking them up on the first dimension. +Once the data has been read, we use ``nvImageCodec`` to decode it into a list of image tensors. The nvImageCodec instance is allocated either on its first use or whenever there is a change in the batch size (i.e. last batch). 
Since what we get at this point is a list of images (i.e a python list of 3D tensors), we would need to convert them to a 4D tensor by stacking them up on the first dimension. -.. literalinclude:: ../../../../../samples/common/python/torch_utils.py +.. literalinclude:: ../../../../../samples/common/python/nvcodec_utils.py :language: python :linenos: - :start-after: begin_decode_imagebatchdecoder_pytorch - :end-before: end_decode_imagebatchdecoder_pytorch + :start-after: begin_decode_imagebatchdecoder_nvimagecodec + :end-before: end_decode_imagebatchdecoder_nvimagecodec :dedent: The final step is to pack all of this data into a special CVCUDA samples object called as ``Batch``. The ``Batch`` object helps us keep track of the data associated with the batch, the index of the batch and optionally any filename information one wants to attach (i.e. which files the data came from). -.. literalinclude:: ../../../../../samples/common/python/torch_utils.py +.. literalinclude:: ../../../../../samples/common/python/nvcodec_utils.py :language: python :linenos: - :start-after: begin_return_imagebatchdecoder_pytorch - :end-before: end_return_imagebatchdecoder_pytorch + :start-after: begin_return_imagebatchdecoder_nvimagecodec + :end-before: end_return_imagebatchdecoder_nvimagecodec :dedent: diff --git a/docs/sphinx/samples/python_samples/commons/imagebatchencoder_pytorch.rst b/docs/sphinx/samples/python_samples/commons/imagebatchencoder_nvcodec.rst similarity index 58% rename from docs/sphinx/samples/python_samples/commons/imagebatchencoder_pytorch.rst rename to docs/sphinx/samples/python_samples/commons/imagebatchencoder_nvcodec.rst index dfd25650e..3cdb507d7 100644 --- a/docs/sphinx/samples/python_samples/commons/imagebatchencoder_pytorch.rst +++ b/docs/sphinx/samples/python_samples/commons/imagebatchencoder_nvcodec.rst @@ -1,5 +1,5 @@ .. - # SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + # SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -14,28 +14,28 @@ # See the License for the specific language governing permissions and # limitations under the License. -.. _imagebatchencoder_pytorch: +.. _imagebatchencoder_nvcodec: -Image Encoding using PyTorch +Image Encoding using nvImageCodec ==================== -The image batch encoder is responsible for saving image tensors to the disk as JPG images. The actual encoding is done in batches using the ``PIL`` library. The image encoder is generic enough to be across the sample applications. The code associated with this class can be found in the ``samples/common/python/torch_utils.py`` file. +The image batch encoder is responsible for saving image tensors to the disk as JPG images. The actual encoding is done in batches using the `nvImageCodec `_ library. The image encoder is generic enough to be across the sample applications. The code associated with this class can be found in the ``samples/common/python/nvcodec_utils.py`` file. The image batch encoder is a relatively simple class. Here is how its ``__init__`` method is defined. -.. literalinclude:: ../../../../../samples/common/python/torch_utils.py +.. 
literalinclude:: ../../../../../samples/common/python/nvcodec_utils.py :language: python :linenos: - :start-after: begin_init_imagebatchencoder_pytorch - :end-before: end_init_imagebatchencoder_pytorch + :start-after: begin_init_imagebatchencoder_nvimagecodec + :end-before: end_init_imagebatchencoder_nvimagecodec :dedent: -Once the initialization is complete, we encode the images in the ``__call__`` method. Since the ``Batch`` object is passed, we have information of the data, its batch index and the original file name used to read the data. We can use this together with PyTorch's functions to detach the tensor, transfer it to the CPU and save it as PIL JPG image. +Once the initialization is complete, we encode the images in the ``__call__`` method. Since the ``Batch`` object is passed, we have information of the data, its batch index and the original file name used to read the data. -.. literalinclude:: ../../../../../samples/common/python/torch_utils.py +.. literalinclude:: ../../../../../samples/common/python/nvcodec_utils.py :language: python :linenos: - :start-after: begin_call_imagebatchencoder_pytorch - :end-before: end_call_imagebatchencoder_pytorch + :start-after: begin_call_imagebatchencoder_nvimagecodec + :end-before: end_call_imagebatchencoder_nvimagecodec :dedent: diff --git a/docs/sphinx/samples/python_samples/commons/videobatchdecoder_vpf.rst b/docs/sphinx/samples/python_samples/commons/videobatchdecoder_nvcodec.rst similarity index 54% rename from docs/sphinx/samples/python_samples/commons/videobatchdecoder_vpf.rst rename to docs/sphinx/samples/python_samples/commons/videobatchdecoder_nvcodec.rst index 8fb986029..9219f0aef 100644 --- a/docs/sphinx/samples/python_samples/commons/videobatchdecoder_vpf.rst +++ b/docs/sphinx/samples/python_samples/commons/videobatchdecoder_nvcodec.rst @@ -1,5 +1,5 @@ .. - # SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + # SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -14,18 +14,18 @@ # See the License for the specific language governing permissions and # limitations under the License. -.. _videobatchdecoder_vpf: +.. _videobatchdecoder_pyvideocodec: -Video Decoding using VPF +Video Decoding using pyNvVideoCodec ==================== -The video batch decoder is responsible for reading an MP4 video as PyTorch tensors. The actual decoding is done per frame using NVIDIA's `Video Processing Framework `_. The video decoder is generic enough to be used across the sample applications. The code associated with this class can be found in the ``samples/common/python/vpf_utils.py`` file. +The video batch decoder is responsible for reading an MP4 video as tensors. The actual decoding is done per frame using NVIDIA's PyNvVideoCodec API. The video decoder is generic enough to be used across the sample applications. The code associated with this class can be found in the ``samples/common/python/nvcodec_utils.py`` file. There are two classes responsible for the decoding work: -1. ``VideoBatchDecoderVPF`` and -2. ``nvdecoder`` +1. ``VideoBatchDecoder`` and +2. ``nvVideoDecoder`` The first class acts as a wrapper on the second class which allows us to: @@ -34,70 +34,70 @@ The first class acts as a wrapper on the second class which allows us to: 3. 
Use accelerated ops in CVCUDA to perform the necessary color conversion from NV12 to RGB after decoding the video. -VideoBatchDecoderVPF +VideoBatchDecoder ------------------ -Let's get started by understanding how this class is initialized in its ``__init__`` method. We use VPF's ``PyFFmpegDemuxer`` to read a few properties of the video. The decoder instance and CVCUDA color conversion tensors both are allocated when needed upon the first use. +Let's get started by understanding how this class is initialized in its ``__init__`` method. We use ``PyNvDemuxer`` to read a few properties of the video. The decoder instance and CVCUDA color conversion tensors both are allocated when needed upon the first use. **Note**: Due to the nature of NV12, representing it directly as a CVCUDA tensor is a bit challenging. Be sure to read through the explanation in the comments of the code shown below to understand more. -.. literalinclude:: ../../../../../samples/common/python/vpf_utils.py +.. literalinclude:: ../../../../../samples/common/python/nvcodec_utils.py :language: python :linenos: - :start-after: begin_init_videobatchdecoder_vpf - :end-before: end_init_videobatchdecoder_vpf + :start-after: begin_init_videobatchdecoder_pyvideocodec + :end-before: end_init_videobatchdecoder_pyvideocodec :dedent: Once things are defined and initialized, we would start the decoding when a call to the ``__call__`` function is made. -.. literalinclude:: ../../../../../samples/common/python/vpf_utils.py +.. literalinclude:: ../../../../../samples/common/python/nvcodec_utils.py :language: python :linenos: - :start-after: begin_call_videobatchdecoder_vpf - :end-before: end_alloc_videobatchdecoder_vpf + :start-after: begin_call_videobatchdecoder_pyvideocodec + :end-before: end_alloc_videobatchdecoder_pyvideocodec :dedent: Next, we call the ``nvdecoder`` instance to actually do the decoding and stack the image tensors up to form a 4D tensor. -.. literalinclude:: ../../../../../samples/common/python/vpf_utils.py +.. literalinclude:: ../../../../../samples/common/python/nvcodec_utils.py :language: python :linenos: - :start-after: begin_decode_videobatchdecoder_vpf - :end-before: end_decode_videobatchdecoder_vpf + :start-after: begin_decode_videobatchdecoder_pyvideocodec + :end-before: end_decode_videobatchdecoder_pyvideocodec :dedent: Once the video batch is ready, we use CVCUDA's ``cvtcolor_into`` function to convert its data from NV12 format to RGB format. We will use pre-allocated tensors to do the color conversion to avoid allocating same tensors on every batch. -.. literalinclude:: ../../../../../samples/common/python/vpf_utils.py +.. literalinclude:: ../../../../../samples/common/python/nvcodec_utils.py :language: python :linenos: - :start-after: begin_convert_videobatchdecoder_vpf - :end-before: end_convert_videobatchdecoder_vpf + :start-after: begin_convert_videobatchdecoder_pyvideocodec + :end-before: end_convert_videobatchdecoder_pyvideocodec :dedent: The final step is to pack all of this data into a special CVCUDA samples object called as ``Batch``. The ``Batch`` object helps us keep track of the data associated with the batch, the index of the batch and optionally any filename information one wants to attach (i.e. which files did the data come from). -.. literalinclude:: ../../../../../samples/common/python/vpf_utils.py +.. 
literalinclude:: ../../../../../samples/common/python/nvcodec_utils.py :language: python :linenos: - :start-after: begin_batch_videobatchdecoder_vpf - :end-before: end_batch_videobatchdecoder_vpf + :start-after: begin_batch_videobatchdecoder_pyvideocodec + :end-before: end_batch_videobatchdecoder_pyvideocodec :dedent: -nvdecoder +nvVideoDecoder ------------------ -This is a class offering hardware accelerated video decoding functionality using VPF. It reads an MP4 video file, decodes it and returns a 3D PyTorch Tensor per frame. Please consult the documentation of the `Video Processing Framework `_ to learn more about its capabilities and APIs. +This is a class offering hardware accelerated video decoding functionality using pyNvVideoCodec. It reads an MP4 video file, decodes it and returns a CUDA accessible Tensor per frame. Please consult the documentation of the pyNvVideoCodec to learn more about its capabilities and APIs. -For use in CVCUDA, this class defines the following ``decode_hw`` and ``decode_to_tensor`` functions which decode data to a Torch tensor in a given cuda stream. +For use in CVCUDA, this class defines the following functions which decode data to a tensor in a given CUDA stream. -.. literalinclude:: ../../../../../samples/common/python/vpf_utils.py +.. literalinclude:: ../../../../../samples/common/python/nvcodec_utils.py :language: python :linenos: - :start-after: begin_imp_nvdecoder - :end-before: end_imp_nvdecoder + :start-after: begin_imp_nvvideodecoder + :end-before: end_imp_nvvideodecoder :dedent: diff --git a/docs/sphinx/samples/python_samples/commons/videobatchencoder_vpf.rst b/docs/sphinx/samples/python_samples/commons/videobatchencoder_nvcodec.rst similarity index 53% rename from docs/sphinx/samples/python_samples/commons/videobatchencoder_vpf.rst rename to docs/sphinx/samples/python_samples/commons/videobatchencoder_nvcodec.rst index faa0dfbf6..96a75bf2b 100644 --- a/docs/sphinx/samples/python_samples/commons/videobatchencoder_vpf.rst +++ b/docs/sphinx/samples/python_samples/commons/videobatchencoder_nvcodec.rst @@ -1,5 +1,5 @@ .. - # SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + # SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -14,18 +14,18 @@ # See the License for the specific language governing permissions and # limitations under the License. -.. _videobatchencoder_vpf: +.. _videobatchencoder_pyvideocodec: -Video Encoding using VPF +Video Encoding using pyNvVideoCodec ==================== -The video batch encoder is responsible for writing PyTorch tensors as an MP4 video. The actual encoding is done in batches using NVIDIA's `Video Processing Framework `_. The video encoder is generic enough to be used across the sample applications. The code associated with this class can be found in the ``samples/common/python/vpf_utils.py`` file. +The video batch encoder is responsible for writing tensors as an MP4 video. The actual encoding is done in batches using NVIDIA's pyNvVideoCodec. The video encoder is generic enough to be used across the sample applications. The code associated with this class can be found in the ``samples/common/python/nvcodec_utils.py`` file. There are two classes responsible for the encoding work: -1. ``VideoBatchEncoderVPF`` and -2.
``nvVideoEncoder`` The first class acts as a wrapper on the second class which allows us to: @@ -42,72 +42,72 @@ To get started, here is how the class is initialized in its ``__init__`` method. **Note**: Due to the nature of NV12, representing it directly as a CVCUDA tensor is a bit challenging. Be sure to read through the explanation in the comments of the code shown below to understand more. -.. literalinclude:: ../../../../../samples/common/python/vpf_utils.py +.. literalinclude:: ../../../../../samples/common/python/nvcodec_utils.py :language: python :linenos: - :start-after: begin_init_videobatchencoder_vpf - :end-before: end_init_videobatchencoder_vpf + :start-after: begin_init_videobatchencoder_pyvideocodec + :end-before: end_init_videobatchencoder_pyvideocodec :dedent: Once things are defined and initialized, we would start the decoding when a call to the ``__call__`` function is made. We need to first allocate the encoder instance if it wasn't done so already. -.. literalinclude:: ../../../../../samples/common/python/vpf_utils.py +.. literalinclude:: ../../../../../samples/common/python/nvcodec_utils.py :language: python :linenos: - :start-after: begin_call_videobatchencoder_vpf - :end-before: end_alloc_videobatchdecoder_vpf + :start-after: begin_call_videobatchencoder_pyvideocodec + :end-before: end_alloc_videobatchdecoder_pyvideocodec :dedent: Next, we use CVCUDA's ``cvtcolor_into`` function to convert the batch data from RGB format to NV12 format. We allocate tensors once to do the color conversion and avoid allocating same tensors on every batch. -.. literalinclude:: ../../../../../samples/common/python/vpf_utils.py +.. literalinclude:: ../../../../../samples/common/python/nvcodec_utils.py :language: python :linenos: - :start-after: begin_alloc_cvcuda_videobatchdecoder_vpf - :end-before: end_alloc_cvcuda_videobatchdecoder_vpf + :start-after: begin_alloc_cvcuda_videobatchdecoder_pyvideocodec + :end-before: end_alloc_cvcuda_videobatchdecoder_pyvideocodec :dedent: Once the tensors are allocated, we use CVCUDA ops to perform the color conversion. -.. literalinclude:: ../../../../../samples/common/python/vpf_utils.py +.. literalinclude:: ../../../../../samples/common/python/nvcodec_utils.py :language: python :linenos: - :start-after: begin_convert_videobatchencoder_vpf - :end-before: end_convert_videobatchencoder_vpf + :start-after: begin_convert_videobatchencoder_pyvideocodec + :end-before: end_convert_videobatchencoder_pyvideocodec :dedent: -Finally, we call the ``nvencooder`` instance to actually do the encoding. +Finally, we call the ``nvVideoEncoder`` instance to actually do the encoding. -.. literalinclude:: ../../../../../samples/common/python/vpf_utils.py +.. literalinclude:: ../../../../../samples/common/python/nvcodec_utils.py :language: python :linenos: - :start-after: begin_encode_videobatchencoder_vpf - :end-before: end_encode_videobatchencoder_vpf + :start-after: begin_encode_videobatchencoder_nvvideoencoder + :end-before: end_encode_videobatchencoder_nvvideoencoder :dedent: -nvencoder +nvVideoEncoder ------------------ -This is a class offering hardware accelerated video encoding functionality using VPF. It encodes tensors and writes as an MP4 file. Please consult the documentation of the `Video Processing Framework `_ to learn more about its capabilities and APIs. +This is a class offering hardware accelerated video encoding functionality using pyNvVideoCodec. It encodes tensors and writes as an MP4 file. 
Please consult the documentation of the pyNvVideoCodec to learn more about its capabilities and APIs. -For use in CVCUDA, this class defines the following ``tensor_to_surface`` and ``encode_from_tensor`` functions which encode a Torch tensor. +For use in CVCUDA, this class defines the following ``encode_from_tensor`` functions which encode a Torch tensor. -.. literalinclude:: ../../../../../samples/common/python/vpf_utils.py +.. literalinclude:: ../../../../../samples/common/python/nvcodec_utils.py :language: python :linenos: - :start-after: begin_imp_nvencoder - :end-before: end_imp_nvencoder + :start-after: begin_imp_nvvideoencoder + :end-before: end_imp_nvvideoencoder :dedent: Finally, we use the ``av`` library to write packets to an MP4 container. We must properly flush (i.e. write any pending packets) at the end. -.. literalinclude:: ../../../../../samples/common/python/vpf_utils.py +.. literalinclude:: ../../../../../samples/common/python/nvcodec_utils.py :language: python :linenos: - :start-after: begin_writeframe_nvencoder - :end-before: end_writeframe_nvencoder + :start-after: begin_writeframe_nvvideoencoder + :end-before: end_writeframe_nvvideoencoder :dedent: diff --git a/docs/sphinx/samples/python_samples/object_detection.rst b/docs/sphinx/samples/python_samples/object_detection.rst index a2d05499a..8a882221a 100644 --- a/docs/sphinx/samples/python_samples/object_detection.rst +++ b/docs/sphinx/samples/python_samples/object_detection.rst @@ -1,5 +1,5 @@ .. - # SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + # SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -35,11 +35,11 @@ Writing the Sample App The object detection app has been designed to be modular in all aspects. It imports and uses various modules such as data decoders, encoders, pipeline pre and post processors and the model inference. Some of these modules are defined in the same folder as the sample whereas the rest are defined in the common scripts folder for a wider re-use. -1. Modules used by this sample app that are defined in the common folder (i.e. not specific just to this sample) are the ``ImageBatchDecoderPyTorch`` and ``ImageBatchEncoderPyTorch`` for PyTorch based image decoding and encoding and ``VideoBatchDecoderVPF`` and ``VideoBatchEncoderVPF`` for VPF based video decoding and encoding. +1. Modules used by this sample app that are defined in the common folder (i.e. not specific just to this sample) are the ``ImageBatchDecoder`` and ``ImageBatchEncoder`` for nvImageCodec based image decoding and encoding and ``VideoBatchDecoder`` and ``VideoBatchEncoder`` for PyNvVideoCodec based video decoding and encoding. 2. Modules specific to this sample (i.e. defined in the object_detection sample folder) are ``PreprocessorCvcuda`` and ``PostprocessorCvcuda`` for CVCUDA based pre and post processing pipelines and ``ObjectDetectionTensorRT`` and ``ObjectDetectionTensorflow`` for the model inference. -The first stage in our pipeline is importing all necessary python modules. Apart from the modules described above, this also includes modules such as torch and torchvision, torchnvjpeg, vpf and the main package of CVCUDA (i.e. nvcv) among others. Be sure to import ``pycuda.driver`` before importing any other GPU packages like torch or cvcuda to ensure a proper initialization. 
+The first stage in our pipeline is importing all necessary python modules. Apart from the modules described above, this also includes modules such as torch and torchvision, torchnvjpeg, vpf and the main package of CVCUDA among others. Be sure to import ``pycuda.driver`` before importing any other GPU packages like torch or cvcuda to ensure a proper initialization. .. literalinclude:: ../../../../samples/object_detection/python/main.py :language: python @@ -91,10 +91,10 @@ Once the streams have been defined and initialized, all the operations in the re Next, we instantiate various classes to help us run the sample. These classes are: 1. ``PreprocessorCvcuda`` : A CVCUDA based pre-processing pipeline for object detection. -2. ``ImageBatchDecoderPyTorch`` : A PyTorch based image decoder to read the images. -3. ``ImageBatchEncoderPyTorch`` : A PyTorch based image encoder to write the images. -4. ``VideoBatchDecoderVPF`` : A VPF based video decoder to read the video. -5. ``VideoBatchEncoderVPF`` : A VPF based video encoder to write the video. +2. ``ImageBatchDecoder`` : A nvImageCodec based image decoder to read the images. +3. ``ImageBatchEncoder`` : A nvImageCodec based image encoder to write the images. +4. ``VideoBatchDecoder`` : A PyNvVideoCodec based video decoder to read the video. +5. ``VideoBatchEncoder`` : A PyNvVideoCodec based video encoder to write the video. 6. ``PostProcessorCvcuda`` : A CVCUDA based post-processing pipeline for object detection. 7. ``ObjectDetectionTensorflow`` : A TensorFlow based object detection model to execute inference. 8. ``ObjectDetectionTensorRT`` : A TensorRT based object detection model to execute inference. @@ -122,10 +122,10 @@ That's it for the object detection sample. To understand more about how each sta PreprocessorCvcuda PostprocessorCvcuda - ImageBatchDecoderPyTorch - ImageBatchEncoderPyTorch - VideoBatchDecoderVPF - VideoBatchEncoderVPF + ImageBatchDecoder + ImageBatchEncoder + VideoBatchDecoder + VideoBatchEncoder ObjectDetectionTensorFlow ObjectDetectionTensorRT @@ -177,7 +177,7 @@ This sample takes as input one or more images or one video and generates the obj user@machine:~/cvcuda/samples$ python3 object_detection/python/main.py [perf_utils:85] 2023-07-27 23:15:34 WARNING perf_utils is used without benchmark.py. Benchmarking mode is turned off. - [perf_utils:89] 2023-07-27 23:15:34 INFO Using CV-CUDA version: 0.5.0-beta + [perf_utils:89] 2023-07-27 23:15:34 INFO Using CV-CUDA version: 0.6.0-beta [pipelines:30] 2023-07-27 23:15:36 INFO Using CVCUDA as preprocessor. [torch_utils:77] 2023-07-27 23:15:36 INFO Using torchnvjpeg as decoder. [torch_utils:151] 2023-07-27 23:15:36 INFO Using PyTorch/PIL as encoder. diff --git a/docs/sphinx/samples/python_samples/segmentation.rst b/docs/sphinx/samples/python_samples/segmentation.rst index 5dd4d1944..53cf5b2eb 100644 --- a/docs/sphinx/samples/python_samples/segmentation.rst +++ b/docs/sphinx/samples/python_samples/segmentation.rst @@ -1,5 +1,5 @@ .. - # SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + # SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -35,11 +35,11 @@ Writing the Sample App The segmentation sample app has been designed to be modular in all aspects. 
It imports and uses various modules such as data decoders, encoders, pipeline pre and post processors and the model inference. Some of these modules are defined in the same folder as the sample whereas the rest are defined in the common scripts folder for a wider re-use. -1. Modules used by this sample app that are defined in the common folder (i.e. not specific just to this sample) are the ``ImageBatchDecoderPyTorch`` and ``ImageBatchEncoderPyTorch`` for PyTorch based image decoding and encoding and ``VideoBatchDecoderVPF`` and ``VideoBatchEncoderVPF`` for VPF based video decoding and encoding. +1. Modules used by this sample app that are defined in the common folder (i.e. not specific just to this sample) are the ``ImageBatchDecoder`` and ``ImageBatchEncoder`` for nvImageCodec based image decoding and encoding and ``VideoBatchDecoder`` and ``VideoBatchEncoder`` for PyNvVideoCodec based video decoding and encoding. 2. Modules specific to this sample (i.e. defined in the segmentation sample folder) are ``PreprocessorCvcuda`` and ``PostprocessorCvcuda`` for CVCUDA based pre and post processing pipelines and ``SegmentationPyTorch`` and ``SegmentationTensorRT`` for the model inference. -The first stage in our pipeline is importing all necessary python modules. Apart from the modules described above, this also includes modules such as torch and torchvision, torchnvjpeg, vpf and the main package of CVCUDA (i.e. nvcv) among others. Be sure to import ``pycuda.driver`` before importing any other GPU packages like torch or cvcuda to ensure a proper initialization. +The first stage in our pipeline is importing all necessary python modules. Apart from the modules described above, this also includes modules such as torch and torchvision, torchnvjpeg, vpf and the main package of CVCUDA among others. Be sure to import ``pycuda.driver`` before importing any other GPU packages like torch or cvcuda to ensure a proper initialization. .. literalinclude:: ../../../../samples/segmentation/python/main.py :language: python @@ -83,10 +83,10 @@ The ``run_sample`` function is the primary function that runs this sample. It se Next, we instantiate various classes to help us run the sample. These classes are: 1. ``PreprocessorCvcuda`` : A CVCUDA based pre-processing pipeline for semantic segmentation. -2. ``ImageBatchDecoderPyTorch`` : A PyTorch based image decoder to read the images. -3. ``ImageBatchEncoderPyTorch`` : A PyTorch based image encoder to write the images. -4. ``VideoBatchDecoderVPF`` : A VPF based video decoder to read the video. -5. ``VideoBatchEncoderVPF`` : A VPF based video encoder to write the video. +2. ``ImageBatchDecoder`` : A nvImageCodec based image decoder to read the images. +3. ``ImageBatchEncoder`` : A nvImageCodec based image encoder to write the images. +4. ``VideoBatchDecoder`` : A PyNvVideoCodec based video decoder to read the video. +5. ``VideoBatchEncoder`` : A PyNvVideoCodec based video encoder to write the video. 6. ``PostprocessorCvcuda`` : A CVCUDA based post-processing pipeline for semantic segmentation. 7. ``SegmentationPyTorch`` : A PyTorch based semantic segmentation model to execute inference. 8. ``SegmentationTensorRT`` : A TensorRT based semantic segmentation model to execute inference. @@ -121,10 +121,10 @@ That's it for the semantic segmentation sample. 
To understand more about how eac PreprocessorCvcuda PostprocessorCvcuda - ImageBatchDecoderPyTorch - ImageBatchEncoderPyTorch - VideoBatchDecoderVPF - VideoBatchEncoderVPF + ImageBatchDecoder + ImageBatchEncoder + VideoBatchDecoder + VideoBatchEncoder SegmentationPyTorch SegmentationTensorRT @@ -182,7 +182,7 @@ This sample takes as input the one or more images or one video and generates the user@machine:~/cvcuda/samples$ python3 segmentation/python/main.py -b 5 -c __background__ -o /tmp -i assets/images/ [perf_utils:85] 2023-07-27 23:17:49 WARNING perf_utils is used without benchmark.py. Benchmarking mode is turned off. - [perf_utils:89] 2023-07-27 23:17:49 INFO Using CV-CUDA version: 0.5.0-beta + [perf_utils:89] 2023-07-27 23:17:49 INFO Using CV-CUDA version: 0.6.0-beta [pipelines:35] 2023-07-27 23:17:50 INFO Using CVCUDA as preprocessor. [torch_utils:60] 2023-07-27 23:17:50 INFO Found a total of 3 JPEG images. [torch_utils:77] 2023-07-27 23:17:50 INFO Using torchnvjpeg as decoder. diff --git a/lint/copyright_check.sh b/lint/copyright_check.sh index cd6a597ea..d46ecb7c6 100755 --- a/lint/copyright_check.sh +++ b/lint/copyright_check.sh @@ -18,17 +18,19 @@ # Check if input files have valid copyright message # Ref: https://confluence.nvidia.com/display/RP/Standardizing+on+SPDX+Identifiers -valid_license='Apache-2.0' +valid_licenses=('Apache-2.0' 'LicenseRef-NvidiaProprietary') # Detects that the line is a comment. -rgx_comment='^[[:space:]]*[[:graph:]]\+[[:space:]]\+' +# The following line detects comments in source code and mark down files. +# It can detect c++ style comments, python style comments or markdown style comments. +rgx_comment='^[[:space:]]*[[:graph:]]\+[[:space:]]\+[[:graph:]]*[[:space:]]*["]*' function get_tag() { local tag=$1 shift - local rgx="s@^\($rgx_comment\)\?$tag:[[:space:]]*\(.*\)@\2@p" + local rgx="s@^\($rgx_comment\)\?$tag:[[:space:]]*\([^\"]*\)\"*@\2@p" sed -n "$rgx" "$file" } @@ -56,8 +58,9 @@ function check_license() fi # Check if it is valid - if [[ "$license" != "$valid_license" ]]; then - error "$file" "License '$license' not valid. Must be '$valid_license'." && false + if [[ ! " ${valid_licenses[*]} " =~ [[:space:]]${license}[[:space:]] ]]; then + valid_licenses_str="${valid_licenses[*]}" + error "$file" "License '$license' not valid. Must be a value from '${valid_licenses_str//${IFS:0:1}/, }'." && false fi } diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index ea384fb18..7647d0491 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -15,7 +15,7 @@ cmake_minimum_required(VERSION 3.18) -project(nvcv_python CXX C) +project(cvcuda_python CXX C) set(CMAKE_CXX_STANDARD 20) @@ -37,6 +37,11 @@ string(REPLACE "." 
"" PYTHON_MODULE_NAME "${PYTHON_MODULE_NAME}") include(GNUInstallDirs) set(PYTHON_MODULE_FILENAME_LIST "" CACHE INTERNAL "") + +if(CMAKE_BUILD_TYPE STREQUAL "Release") + add_custom_target(wheel ALL) +endif() + function(nvcv_python_add_module) cmake_parse_arguments(ARG "SHARED;MODULE" "TARGET;OUTPUT_NAME" "SOURCES" ${ARGV}) @@ -74,9 +79,14 @@ function(nvcv_python_add_module) set(PYTHON_MODULE_FILENAME_LIST "${PYTHON_MODULE_FILENAME_LIST};${prefix}${ARG_OUTPUT_NAME}${suffix}" CACHE INTERNAL "") + if(CMAKE_BUILD_TYPE STREQUAL "Release") + add_dependencies(wheel ${ARG_TARGET}) + endif() + install(TARGETS ${ARG_TARGET} LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}/python COMPONENT ${PYTHON_MODULE_NAME} + ) endfunction() @@ -91,3 +101,10 @@ string(JOIN " " PYTHON_MODULE_FILENAME_LIST ${PYTHON_MODULE_FILENAME_LIST}) configure_file(cpack/debian_python_postinst.in cpack/postinst @ONLY) configure_file(cpack/debian_python_prerm.in cpack/prerm @ONLY) + +# Create Python wheel +if(CMAKE_BUILD_TYPE STREQUAL "Release") + add_custom_command( + TARGET wheel + COMMAND "${CMAKE_CURRENT_SOURCE_DIR}/build_wheels.sh" "${BUILD_ROOT}" ${PYTHON_VERSION_SHORT} ) +endif() diff --git a/python/build_wheels.sh b/python/build_wheels.sh new file mode 100755 index 000000000..ecc162092 --- /dev/null +++ b/python/build_wheels.sh @@ -0,0 +1,84 @@ +#!/bin/bash -e + +# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Creates the Python self contained wheels + +# Usage: build_wheels.sh [build_artifacts_dir] [python_versions] +# Note: This script is automatically called by cmake/make. The proper way to +# build python wheels is to issue the command: +# +# Do not run this script outside of cmake. + +set -e # Stops this script if any one command fails. + +if [ "$#" -lt 2 ]; then + echo "Usage: build_wheels.sh [python_versions,...]" + exit 1 +fi + +BUILD_DIR=$(realpath "$1"); shift +PY_VERSIONS=("$@") +LIB_DIR="${BUILD_DIR}/lib" + +echo "BUILD_DIR: $BUILD_DIR" +echo "Python Versions: ${PY_VERSIONS[*]}" + +for py_version in "${PY_VERSIONS[@]}" +do + py_version_flat="${py_version//./}" # Gets the non dotted version string + echo "Building Python wheels for: Python${py_version}" + + # Step 1: Create a directories to store all wheels related files for this python version + py_dir="${BUILD_DIR}/python${py_version}" + wheel_dir="${py_dir}/wheel" + mkdir -p "${wheel_dir}" + rm -rf ${wheel_dir:?}/* + mkdir -p "${wheel_dir}/cvcuda.libs" + + cd "${wheel_dir}" + + # Step 2: Copy necessary .so files under one directory + # We will copy the target of the linked file and not the symlink only. + # Also the new file-name of the .so will be the actual so-name present inside the header of the .so + # This can be retrieved by using patchelf. + # This allows us to copy .so files without knowing their versions and also making sure they still + # work after copying. 
+ # Copy the core .so files first + for so_file_name in libcvcuda.so libnvcv_types.so + do + cp -L "${LIB_DIR}/${so_file_name}" \ + "${wheel_dir}/cvcuda.libs/`patchelf --print-soname "${LIB_DIR}/${so_file_name}"`" + done + + # Copy the bindings .so files + patch them in their rpath. + # This allows the bindings to find the core .so files in a directory named cvcuda.libs only. + for so_file_path in ${LIB_DIR}/python/*.cpython-${py_version_flat}*.so + do + so_file_name=$(basename ${so_file_path}) + cp -L "${so_file_path}" \ + "${wheel_dir}/" + + patchelf --force-rpath --set-rpath '$ORIGIN'/cvcuda.libs "${wheel_dir}/${so_file_name}" + done + + # Step 3: Copy the setup.py corresponding to current python version to our wheels directory. + cp "${py_dir}/setup.py" "${wheel_dir}" + + # Step 3: Create wheel + python${py_version} setup.py bdist_wheel --dist-dir="${wheel_dir}" + +done diff --git a/python/mod_cvcuda/CMakeLists.txt b/python/mod_cvcuda/CMakeLists.txt index 5db4089ac..45ecc94e0 100644 --- a/python/mod_cvcuda/CMakeLists.txt +++ b/python/mod_cvcuda/CMakeLists.txt @@ -42,6 +42,7 @@ nvcv_python_add_module( OpBoxBlur.cpp OpBrightnessContrast.cpp OpColorTwist.cpp + OpHQResize.cpp OsdElement.cpp OpRemap.cpp RemapMapValueType.cpp diff --git a/python/mod_cvcuda/InterpolationType.cpp b/python/mod_cvcuda/InterpolationType.cpp index 7b6c0fa10..eb1c934e1 100644 --- a/python/mod_cvcuda/InterpolationType.cpp +++ b/python/mod_cvcuda/InterpolationType.cpp @@ -30,6 +30,7 @@ void ExportInterpolationType(py::module &m) .value("AREA", NVCV_INTERP_AREA, "Area-based (resampling using pixels in area) interpolation") .value("LANCZOS", NVCV_INTERP_LANCZOS, "Lanczos interpolation") .value("WARP_INVERSE_MAP", NVCV_WARP_INVERSE_MAP, "Inverse transformation") + .value("GAUSSIAN", NVCV_INTERP_GAUSSIAN, "Gaussian interpolation") .value("HAMMING", NVCV_INTERP_HAMMING, "Hamming interpolation") .value("BOX", NVCV_INTERP_BOX, "Box interpolation") .def("__or__", [](NVCVInterpolationType e1, NVCVInterpolationType e2) { return int(e1) | int(e2); }); diff --git a/python/mod_cvcuda/Main.cpp b/python/mod_cvcuda/Main.cpp index 226336f24..130d01680 100644 --- a/python/mod_cvcuda/Main.cpp +++ b/python/mod_cvcuda/Main.cpp @@ -106,6 +106,7 @@ PYBIND11_MODULE(cvcuda, m) ExportOpBoxBlur(m); ExportOpBrightnessContrast(m); ExportOpColorTwist(m); + ExportOpHQResize(m); ExportOpRemap(m); ExportOpCropFlipNormalizeReformat(m); ExportOpNonMaximumSuppression(m); diff --git a/python/mod_cvcuda/OpAdaptiveThreshold.cpp b/python/mod_cvcuda/OpAdaptiveThreshold.cpp index 6b2c17b18..30801fb5f 100644 --- a/python/mod_cvcuda/OpAdaptiveThreshold.cpp +++ b/python/mod_cvcuda/OpAdaptiveThreshold.cpp @@ -41,9 +41,9 @@ Tensor AdaptiveThresholdInto(Tensor &output, Tensor &input, double max_value, NV auto adaptiveThreshold = CreateOperator(block_size, 0); ResourceGuard guard(*pstream); - guard.add(LockMode::LOCK_READ, {input}); - guard.add(LockMode::LOCK_WRITE, {output}); - guard.add(LockMode::LOCK_WRITE, {*adaptiveThreshold}); + guard.add(LockMode::LOCK_MODE_READ, {input}); + guard.add(LockMode::LOCK_MODE_WRITE, {output}); + guard.add(LockMode::LOCK_MODE_READWRITE, {*adaptiveThreshold}); adaptiveThreshold->submit(pstream->cudaHandle(), input, output, max_value, adaptive_method, threshold_type, block_size, c); @@ -72,9 +72,9 @@ ImageBatchVarShape AdaptiveThresholdVarShapeInto(ImageBatchVarShape &output, Ima auto adaptiveThreshold = CreateOperator(max_block_size, input.capacity()); ResourceGuard guard(*pstream); - guard.add(LockMode::LOCK_READ, {input, 
max_value, block_size, c}); - guard.add(LockMode::LOCK_WRITE, {output}); - guard.add(LockMode::LOCK_WRITE, {*adaptiveThreshold}); + guard.add(LockMode::LOCK_MODE_READ, {input, max_value, block_size, c}); + guard.add(LockMode::LOCK_MODE_WRITE, {output}); + guard.add(LockMode::LOCK_MODE_READWRITE, {*adaptiveThreshold}); adaptiveThreshold->submit(pstream->cudaHandle(), input, output, max_value, adaptive_method, threshold_type, block_size, c); diff --git a/python/mod_cvcuda/OpAdvCvtColor.cpp b/python/mod_cvcuda/OpAdvCvtColor.cpp index f24337b75..c9a4eff2e 100644 --- a/python/mod_cvcuda/OpAdvCvtColor.cpp +++ b/python/mod_cvcuda/OpAdvCvtColor.cpp @@ -37,9 +37,9 @@ Tensor AdvCvtColorInto(Tensor &output, Tensor &input, NVCVColorConversionCode co auto op = CreateOperator(); ResourceGuard guard(*pstream); - guard.add(LockMode::LOCK_READ, {input}); - guard.add(LockMode::LOCK_WRITE, {output}); - guard.add(LockMode::LOCK_NONE, {*op}); + guard.add(LockMode::LOCK_MODE_READ, {input}); + guard.add(LockMode::LOCK_MODE_WRITE, {output}); + guard.add(LockMode::LOCK_MODE_NONE, {*op}); op->submit(pstream->cudaHandle(), input, output, code, spec); return std::move(output); } diff --git a/python/mod_cvcuda/OpAverageBlur.cpp b/python/mod_cvcuda/OpAverageBlur.cpp index 74070cc65..dc37c337f 100644 --- a/python/mod_cvcuda/OpAverageBlur.cpp +++ b/python/mod_cvcuda/OpAverageBlur.cpp @@ -45,9 +45,9 @@ Tensor AverageBlurInto(Tensor &output, Tensor &input, const std::tuple auto averageBlur = CreateOperator(kernelSizeArg, 0); ResourceGuard guard(*pstream); - guard.add(LockMode::LOCK_READ, {input}); - guard.add(LockMode::LOCK_WRITE, {output}); - guard.add(LockMode::LOCK_WRITE, {*averageBlur}); + guard.add(LockMode::LOCK_MODE_READ, {input}); + guard.add(LockMode::LOCK_MODE_WRITE, {output}); + guard.add(LockMode::LOCK_MODE_WRITE, {*averageBlur}); averageBlur->submit(pstream->cudaHandle(), input, output, kernelSizeArg, kernelAnchorArg, border); @@ -76,9 +76,9 @@ ImageBatchVarShape AverageBlurVarShapeInto(ImageBatchVarShape &output, ImageBatc auto averageBlur = CreateOperator(maxKernelSizeArg, input.capacity()); ResourceGuard guard(*pstream); - guard.add(LockMode::LOCK_READ, {input, kernel_size, kernel_anchor}); - guard.add(LockMode::LOCK_WRITE, {output}); - guard.add(LockMode::LOCK_WRITE, {*averageBlur}); + guard.add(LockMode::LOCK_MODE_READ, {input, kernel_size, kernel_anchor}); + guard.add(LockMode::LOCK_MODE_WRITE, {output}); + guard.add(LockMode::LOCK_MODE_READWRITE, {*averageBlur}); averageBlur->submit(pstream->cudaHandle(), input, output, kernel_size, kernel_anchor, border); diff --git a/python/mod_cvcuda/OpBilateralFilter.cpp b/python/mod_cvcuda/OpBilateralFilter.cpp index 4f5df728e..8e844351d 100644 --- a/python/mod_cvcuda/OpBilateralFilter.cpp +++ b/python/mod_cvcuda/OpBilateralFilter.cpp @@ -42,9 +42,9 @@ Tensor BilateralFilterInto(Tensor &output, Tensor &input, int diameter, float si auto bilateral_filter = CreateOperator(); ResourceGuard guard(*pstream); - guard.add(LockMode::LOCK_READ, {input}); - guard.add(LockMode::LOCK_WRITE, {output}); - guard.add(LockMode::LOCK_NONE, {*bilateral_filter}); + guard.add(LockMode::LOCK_MODE_READ, {input}); + guard.add(LockMode::LOCK_MODE_WRITE, {output}); + guard.add(LockMode::LOCK_MODE_NONE, {*bilateral_filter}); bilateral_filter->submit(pstream->cudaHandle(), input, output, diameter, sigmaColor, sigmaSpace, borderMode); @@ -71,9 +71,9 @@ ImageBatchVarShape VarShapeBilateralFilterInto(ImageBatchVarShape &output, Image auto bilateral_filter = CreateOperator(); ResourceGuard 
guard(*pstream); - guard.add(LockMode::LOCK_READ, {input, diameter, sigmaColor, sigmaSpace}); - guard.add(LockMode::LOCK_WRITE, {output}); - guard.add(LockMode::LOCK_NONE, {*bilateral_filter}); + guard.add(LockMode::LOCK_MODE_READ, {input, diameter, sigmaColor, sigmaSpace}); + guard.add(LockMode::LOCK_MODE_WRITE, {output}); + guard.add(LockMode::LOCK_MODE_NONE, {*bilateral_filter}); bilateral_filter->submit(pstream->cudaHandle(), input, output, diameter, sigmaColor, sigmaSpace, borderMode); diff --git a/python/mod_cvcuda/OpBndBox.cpp b/python/mod_cvcuda/OpBndBox.cpp index a446347f9..1551832f7 100644 --- a/python/mod_cvcuda/OpBndBox.cpp +++ b/python/mod_cvcuda/OpBndBox.cpp @@ -36,9 +36,9 @@ Tensor BndBoxInto(Tensor &output, Tensor &input, NVCVBndBoxesI bboxes, std::opti auto op = CreateOperator(); ResourceGuard guard(*pstream); - guard.add(LockMode::LOCK_READ, {input}); - guard.add(LockMode::LOCK_WRITE, {output}); - guard.add(LockMode::LOCK_NONE, {*op}); + guard.add(LockMode::LOCK_MODE_READ, {input}); + guard.add(LockMode::LOCK_MODE_WRITE, {output}); + guard.add(LockMode::LOCK_MODE_NONE, {*op}); op->submit(pstream->cudaHandle(), input, output, bboxes); diff --git a/python/mod_cvcuda/OpBoxBlur.cpp b/python/mod_cvcuda/OpBoxBlur.cpp index 747bf9740..2c1b21dab 100644 --- a/python/mod_cvcuda/OpBoxBlur.cpp +++ b/python/mod_cvcuda/OpBoxBlur.cpp @@ -36,9 +36,9 @@ Tensor BoxBlurInto(Tensor &output, Tensor &input, NVCVBlurBoxesI bboxes, std::op auto op = CreateOperator(); ResourceGuard guard(*pstream); - guard.add(LockMode::LOCK_READ, {input}); - guard.add(LockMode::LOCK_WRITE, {output}); - guard.add(LockMode::LOCK_NONE, {*op}); + guard.add(LockMode::LOCK_MODE_READ, {input}); + guard.add(LockMode::LOCK_MODE_WRITE, {output}); + guard.add(LockMode::LOCK_MODE_NONE, {*op}); op->submit(pstream->cudaHandle(), input, output, bboxes); diff --git a/python/mod_cvcuda/OpBrightnessContrast.cpp b/python/mod_cvcuda/OpBrightnessContrast.cpp index b7921850a..f0c106dd9 100644 --- a/python/mod_cvcuda/OpBrightnessContrast.cpp +++ b/python/mod_cvcuda/OpBrightnessContrast.cpp @@ -58,16 +58,16 @@ auto runGuard(Op &op, Src &src, Dst &dst, std::optional &brightness, std } ResourceGuard guard(*pstream); - guard.add(LockMode::LOCK_READ, {src}); + guard.add(LockMode::LOCK_MODE_READ, {src}); for (auto &arg : {brightness, contrast, brightnessShift, contrastCenter}) { if (arg) { - guard.add(LockMode::LOCK_READ, {*arg}); + guard.add(LockMode::LOCK_MODE_READ, {*arg}); } } - guard.add(LockMode::LOCK_WRITE, {dst}); - guard.add(LockMode::LOCK_NONE, {*op}); + guard.add(LockMode::LOCK_MODE_WRITE, {dst}); + guard.add(LockMode::LOCK_MODE_NONE, {*op}); call(*pstream, brightness ? *brightness : nvcv::Tensor{nullptr}, contrast ? *contrast : nvcv::Tensor{nullptr}, brightnessShift ? 
*brightnessShift : nvcv::Tensor{nullptr}, diff --git a/python/mod_cvcuda/OpCenterCrop.cpp b/python/mod_cvcuda/OpCenterCrop.cpp index c8eea222b..259928511 100644 --- a/python/mod_cvcuda/OpCenterCrop.cpp +++ b/python/mod_cvcuda/OpCenterCrop.cpp @@ -44,9 +44,9 @@ Tensor CenterCropInto(Tensor &output, Tensor &input, const std::tuple auto center_crop = CreateOperator(); ResourceGuard guard(*pstream); - guard.add(LockMode::LOCK_READ, {input}); - guard.add(LockMode::LOCK_WRITE, {output}); - guard.add(LockMode::LOCK_NONE, {*center_crop}); + guard.add(LockMode::LOCK_MODE_READ, {input}); + guard.add(LockMode::LOCK_MODE_WRITE, {output}); + guard.add(LockMode::LOCK_MODE_NONE, {*center_crop}); nvcv::Size2D cropSizeArg{std::get<0>(cropSize), std::get<1>(cropSize)}; diff --git a/python/mod_cvcuda/OpChannelReorder.cpp b/python/mod_cvcuda/OpChannelReorder.cpp index 8bc15732a..653dd359e 100644 --- a/python/mod_cvcuda/OpChannelReorder.cpp +++ b/python/mod_cvcuda/OpChannelReorder.cpp @@ -44,9 +44,9 @@ ImageBatchVarShape ChannelReorderVarShapeInto(ImageBatchVarShape &output, ImageB auto chReorder = CreateOperator(); ResourceGuard guard(*pstream); - guard.add(LockMode::LOCK_READ, {input, orders}); - guard.add(LockMode::LOCK_WRITE, {output}); - guard.add(LockMode::LOCK_NONE, {*chReorder}); + guard.add(LockMode::LOCK_MODE_READ, {input, orders}); + guard.add(LockMode::LOCK_MODE_WRITE, {output}); + guard.add(LockMode::LOCK_MODE_NONE, {*chReorder}); chReorder->submit(pstream->cudaHandle(), input, output, orders); diff --git a/python/mod_cvcuda/OpColorTwist.cpp b/python/mod_cvcuda/OpColorTwist.cpp index c37ee3069..54c44404e 100644 --- a/python/mod_cvcuda/OpColorTwist.cpp +++ b/python/mod_cvcuda/OpColorTwist.cpp @@ -56,9 +56,9 @@ auto runGuard(Op &op, Src &src, Dst &dst, const Tensor &twist, std::optional(); ResourceGuard guard(*pstream); - guard.add(LockMode::LOCK_READ, {foreground, background, fgMask}); - guard.add(LockMode::LOCK_WRITE, {output}); - guard.add(LockMode::LOCK_NONE, {*composite}); + guard.add(LockMode::LOCK_MODE_READ, {foreground, background, fgMask}); + guard.add(LockMode::LOCK_MODE_WRITE, {output}); + guard.add(LockMode::LOCK_MODE_NONE, {*composite}); composite->submit(pstream->cudaHandle(), foreground, background, fgMask, output); @@ -73,9 +73,9 @@ ImageBatchVarShape CompositeVarShapeInto(ImageBatchVarShape &output, ImageBatchV auto composite = CreateOperator(); ResourceGuard guard(*pstream); - guard.add(LockMode::LOCK_READ, {foreground, background, fgMask}); - guard.add(LockMode::LOCK_WRITE, {output}); - guard.add(LockMode::LOCK_NONE, {*composite}); + guard.add(LockMode::LOCK_MODE_READ, {foreground, background, fgMask}); + guard.add(LockMode::LOCK_MODE_WRITE, {output}); + guard.add(LockMode::LOCK_MODE_NONE, {*composite}); composite->submit(pstream->cudaHandle(), foreground, background, fgMask, output); diff --git a/python/mod_cvcuda/OpConv2D.cpp b/python/mod_cvcuda/OpConv2D.cpp index 8fc0ab9fd..41d6f64c2 100644 --- a/python/mod_cvcuda/OpConv2D.cpp +++ b/python/mod_cvcuda/OpConv2D.cpp @@ -44,9 +44,9 @@ ImageBatchVarShape Conv2DVarShapeInto(ImageBatchVarShape &output, ImageBatchVarS auto conv2D = CreateOperator(); ResourceGuard guard(*pstream); - guard.add(LockMode::LOCK_READ, {input, kernel, kernel_anchor}); - guard.add(LockMode::LOCK_WRITE, {output}); - guard.add(LockMode::LOCK_NONE, {*conv2D}); + guard.add(LockMode::LOCK_MODE_READ, {input, kernel, kernel_anchor}); + guard.add(LockMode::LOCK_MODE_WRITE, {output}); + guard.add(LockMode::LOCK_MODE_NONE, {*conv2D}); 
conv2D->submit(pstream->cudaHandle(), input, output, kernel, kernel_anchor, border); diff --git a/python/mod_cvcuda/OpConvertTo.cpp b/python/mod_cvcuda/OpConvertTo.cpp index f60759029..767c54fcd 100644 --- a/python/mod_cvcuda/OpConvertTo.cpp +++ b/python/mod_cvcuda/OpConvertTo.cpp @@ -36,9 +36,9 @@ Tensor ConvertToInto(Tensor &output, Tensor &input, float scale, float offset, s auto cvt = CreateOperator(); ResourceGuard guard(*pstream); - guard.add(LockMode::LOCK_READ, {input}); - guard.add(LockMode::LOCK_WRITE, {output}); - guard.add(LockMode::LOCK_NONE, {*cvt}); + guard.add(LockMode::LOCK_MODE_READ, {input}); + guard.add(LockMode::LOCK_MODE_WRITE, {output}); + guard.add(LockMode::LOCK_MODE_NONE, {*cvt}); cvt->submit(pstream->cudaHandle(), input, output, scale, offset); diff --git a/python/mod_cvcuda/OpCopyMakeBorder.cpp b/python/mod_cvcuda/OpCopyMakeBorder.cpp index 6711948be..8a3075699 100644 --- a/python/mod_cvcuda/OpCopyMakeBorder.cpp +++ b/python/mod_cvcuda/OpCopyMakeBorder.cpp @@ -55,9 +55,9 @@ Tensor CopyMakeBorderInto(Tensor &output, Tensor &input, NVCVBorderType borderMo auto copyMakeBorder = CreateOperator(); ResourceGuard guard(*pstream); - guard.add(LockMode::LOCK_READ, {input}); - guard.add(LockMode::LOCK_WRITE, {output}); - guard.add(LockMode::LOCK_NONE, {*copyMakeBorder}); + guard.add(LockMode::LOCK_MODE_READ, {input}); + guard.add(LockMode::LOCK_MODE_WRITE, {output}); + guard.add(LockMode::LOCK_MODE_NONE, {*copyMakeBorder}); copyMakeBorder->submit(pstream->cudaHandle(), input, output, top, left, borderMode, bValue); @@ -101,9 +101,9 @@ Tensor VarShapeCopyMakeBorderStackInto(Tensor &output, ImageBatchVarShape &input auto copyMakeBorder = CreateOperator(); ResourceGuard guard(*pstream); - guard.add(LockMode::LOCK_READ, {input, top, left}); - guard.add(LockMode::LOCK_WRITE, {output}); - guard.add(LockMode::LOCK_NONE, {*copyMakeBorder}); + guard.add(LockMode::LOCK_MODE_READ, {input, top, left}); + guard.add(LockMode::LOCK_MODE_WRITE, {output}); + guard.add(LockMode::LOCK_MODE_NONE, {*copyMakeBorder}); copyMakeBorder->submit(pstream->cudaHandle(), input, output, top, left, borderMode, bValue); @@ -149,9 +149,9 @@ ImageBatchVarShape VarShapeCopyMakeBorderInto(ImageBatchVarShape &output, ImageB auto copyMakeBorder = CreateOperator(); ResourceGuard guard(*pstream); - guard.add(LockMode::LOCK_READ, {input, top, left}); - guard.add(LockMode::LOCK_WRITE, {output}); - guard.add(LockMode::LOCK_NONE, {*copyMakeBorder}); + guard.add(LockMode::LOCK_MODE_READ, {input, top, left}); + guard.add(LockMode::LOCK_MODE_WRITE, {output}); + guard.add(LockMode::LOCK_MODE_NONE, {*copyMakeBorder}); copyMakeBorder->submit(pstream->cudaHandle(), input, output, top, left, borderMode, bValue); diff --git a/python/mod_cvcuda/OpCropFlipNormalizeReformat.cpp b/python/mod_cvcuda/OpCropFlipNormalizeReformat.cpp index bfae999b7..1eaacfce5 100644 --- a/python/mod_cvcuda/OpCropFlipNormalizeReformat.cpp +++ b/python/mod_cvcuda/OpCropFlipNormalizeReformat.cpp @@ -52,9 +52,9 @@ Tensor CropFlipNormalizeReformatInto(Tensor &output, ImageBatchVarShape &input, } ResourceGuard guard(*pstream); - guard.add(LockMode::LOCK_READ, {input, cropRect, flipCode, base, scale}); - guard.add(LockMode::LOCK_WRITE, {output}); - guard.add(LockMode::LOCK_NONE, {*op}); + guard.add(LockMode::LOCK_MODE_READ, {input, cropRect, flipCode, base, scale}); + guard.add(LockMode::LOCK_MODE_WRITE, {output}); + guard.add(LockMode::LOCK_MODE_NONE, {*op}); op->submit(pstream->cudaHandle(), input, output, cropRect, borderMode, borderValue, flipCode, 
base, scale, globalScale, globalShift, epsilon, *flags); diff --git a/python/mod_cvcuda/OpCustomCrop.cpp b/python/mod_cvcuda/OpCustomCrop.cpp index af6b2ff80..c448eccda 100644 --- a/python/mod_cvcuda/OpCustomCrop.cpp +++ b/python/mod_cvcuda/OpCustomCrop.cpp @@ -38,9 +38,9 @@ Tensor CustomCropInto(Tensor &output, Tensor &input, const NVCVRectI &rcCrop, st auto crop = CreateOperator(); ResourceGuard guard(*pstream); - guard.add(LockMode::LOCK_READ, {input}); - guard.add(LockMode::LOCK_WRITE, {output}); - guard.add(LockMode::LOCK_NONE, {*crop}); + guard.add(LockMode::LOCK_MODE_READ, {input}); + guard.add(LockMode::LOCK_MODE_WRITE, {output}); + guard.add(LockMode::LOCK_MODE_NONE, {*crop}); crop->submit(pstream->cudaHandle(), input, output, rcCrop); diff --git a/python/mod_cvcuda/OpCvtColor.cpp b/python/mod_cvcuda/OpCvtColor.cpp index 3b8eb883a..39118b477 100644 --- a/python/mod_cvcuda/OpCvtColor.cpp +++ b/python/mod_cvcuda/OpCvtColor.cpp @@ -45,9 +45,9 @@ Tensor CvtColorInto(Tensor &output, Tensor &input, NVCVColorConversionCode code, auto cvtColor = CreateOperator(); ResourceGuard guard(*pstream); - guard.add(LockMode::LOCK_READ, {input}); - guard.add(LockMode::LOCK_WRITE, {output}); - guard.add(LockMode::LOCK_NONE, {*cvtColor}); + guard.add(LockMode::LOCK_MODE_READWRITE, {input}); + guard.add(LockMode::LOCK_MODE_WRITE, {output}); + guard.add(LockMode::LOCK_MODE_NONE, {*cvtColor}); cvtColor->submit(pstream->cudaHandle(), input, output, code); @@ -89,9 +89,9 @@ ImageBatchVarShape CvtColorVarShapeInto(ImageBatchVarShape &output, ImageBatchVa auto cvtColor = CreateOperator(); ResourceGuard guard(*pstream); - guard.add(LockMode::LOCK_READ, {input}); - guard.add(LockMode::LOCK_WRITE, {output}); - guard.add(LockMode::LOCK_NONE, {*cvtColor}); + guard.add(LockMode::LOCK_MODE_READWRITE, {input}); + guard.add(LockMode::LOCK_MODE_READWRITE, {output}); + guard.add(LockMode::LOCK_MODE_NONE, {*cvtColor}); cvtColor->submit(pstream->cudaHandle(), input, output, code); diff --git a/python/mod_cvcuda/OpErase.cpp b/python/mod_cvcuda/OpErase.cpp index e73022967..7f7503e26 100644 --- a/python/mod_cvcuda/OpErase.cpp +++ b/python/mod_cvcuda/OpErase.cpp @@ -47,9 +47,9 @@ Tensor EraseInto(Tensor &output, Tensor &input, Tensor &anchor, Tensor &erasing, auto erase = CreateOperator((int)shape[0]); ResourceGuard guard(*pstream); - guard.add(LockMode::LOCK_READ, {input, anchor, erasing, values, imgIdx}); - guard.add(LockMode::LOCK_WRITE, {output}); - guard.add(LockMode::LOCK_WRITE, {*erase}); + guard.add(LockMode::LOCK_MODE_READ, {input, anchor, erasing, values, imgIdx}); + guard.add(LockMode::LOCK_MODE_WRITE, {output}); + guard.add(LockMode::LOCK_MODE_READWRITE, {*erase}); erase->submit(pstream->cudaHandle(), input, output, anchor, erasing, values, imgIdx, random, seed); @@ -83,9 +83,9 @@ ImageBatchVarShape EraseVarShapeInto(ImageBatchVarShape &output, ImageBatchVarSh auto erase = CreateOperator((int)shape[0]); ResourceGuard guard(*pstream); - guard.add(LockMode::LOCK_READ, {input, anchor, erasing, values, imgIdx}); - guard.add(LockMode::LOCK_WRITE, {output}); - guard.add(LockMode::LOCK_WRITE, {*erase}); + guard.add(LockMode::LOCK_MODE_READ, {input, anchor, erasing, values, imgIdx}); + guard.add(LockMode::LOCK_MODE_WRITE, {output}); + guard.add(LockMode::LOCK_MODE_READWRITE, {*erase}); erase->submit(pstream->cudaHandle(), input, output, anchor, erasing, values, imgIdx, random, seed); diff --git a/python/mod_cvcuda/OpFindContours.cpp b/python/mod_cvcuda/OpFindContours.cpp index 5202905b0..137bf645f 100644 --- 
a/python/mod_cvcuda/OpFindContours.cpp +++ b/python/mod_cvcuda/OpFindContours.cpp @@ -46,10 +46,10 @@ TupleTensor2 FindContoursInto(Tensor &points, Tensor &numPoints, Tensor &input, auto findContours = CreateOperator(size, static_cast(input.shape()[0])); ResourceGuard guard(*pstream); - guard.add(LockMode::LOCK_READ, {input}); - guard.add(LockMode::LOCK_WRITE, {points}); - guard.add(LockMode::LOCK_WRITE, {numPoints}); - guard.add(LockMode::LOCK_WRITE, {*findContours}); + guard.add(LockMode::LOCK_MODE_READ, {input}); + guard.add(LockMode::LOCK_MODE_WRITE, {points}); + guard.add(LockMode::LOCK_MODE_WRITE, {numPoints}); + guard.add(LockMode::LOCK_MODE_READWRITE, {*findContours}); findContours->submit(pstream->cudaHandle(), input, points, numPoints); diff --git a/python/mod_cvcuda/OpFindHomography.cpp b/python/mod_cvcuda/OpFindHomography.cpp index 125535981..3560cc91f 100644 --- a/python/mod_cvcuda/OpFindHomography.cpp +++ b/python/mod_cvcuda/OpFindHomography.cpp @@ -151,9 +151,10 @@ Tensor FindHomographyInto(Tensor &models, Tensor &srcPts, Tensor &dstPts, std::o auto findHomography = CreateOperatorEx(batchSize, numPoints); ResourceGuard guard(*pstream); - guard.add(LockMode::LOCK_READ, {srcPts}); - guard.add(LockMode::LOCK_READ, {dstPts}); - guard.add(LockMode::LOCK_WRITE, {models}); + guard.add(LockMode::LOCK_MODE_READ, {srcPts}); + guard.add(LockMode::LOCK_MODE_READ, {dstPts}); + guard.add(LockMode::LOCK_MODE_READWRITE, {models}); + guard.add(LockMode::LOCK_MODE_READWRITE, {*findHomography}); findHomography->submit(pstream->cudaHandle(), srcPts, dstPts, models); @@ -194,9 +195,10 @@ TensorBatch VarShapeFindHomographyInto(TensorBatch &models, TensorBatch &srcPts, auto findHomography = CreateOperatorEx(batchSize, maxNumPoints); ResourceGuard guard(*pstream); - guard.add(LockMode::LOCK_READ, {srcPts}); - guard.add(LockMode::LOCK_READ, {dstPts}); - guard.add(LockMode::LOCK_WRITE, {models}); + guard.add(LockMode::LOCK_MODE_READ, {srcPts}); + guard.add(LockMode::LOCK_MODE_READ, {dstPts}); + guard.add(LockMode::LOCK_MODE_READWRITE, {models}); + guard.add(LockMode::LOCK_MODE_READWRITE, {*findHomography}); findHomography->submit(pstream->cudaHandle(), srcPts, dstPts, models); diff --git a/python/mod_cvcuda/OpFlip.cpp b/python/mod_cvcuda/OpFlip.cpp index e63dec27b..72dce09d3 100644 --- a/python/mod_cvcuda/OpFlip.cpp +++ b/python/mod_cvcuda/OpFlip.cpp @@ -41,9 +41,9 @@ Tensor FlipInto(Tensor &output, Tensor &input, int32_t flipCode, std::optional(0); ResourceGuard guard(*pstream); - guard.add(LockMode::LOCK_READ, {input}); - guard.add(LockMode::LOCK_WRITE, {output}); - guard.add(LockMode::LOCK_NONE, {*Flip}); + guard.add(LockMode::LOCK_MODE_READ, {input}); + guard.add(LockMode::LOCK_MODE_WRITE, {output}); + guard.add(LockMode::LOCK_MODE_NONE, {*Flip}); Flip->submit(pstream->cudaHandle(), input, output, flipCode); @@ -68,9 +68,9 @@ ImageBatchVarShape FlipVarShapeInto(ImageBatchVarShape &output, ImageBatchVarSha auto flip = CreateOperator(0); ResourceGuard guard(*pstream); - guard.add(LockMode::LOCK_READ, {input, flipCode}); - guard.add(LockMode::LOCK_WRITE, {output}); - guard.add(LockMode::LOCK_NONE, {*flip}); + guard.add(LockMode::LOCK_MODE_READ, {input, flipCode}); + guard.add(LockMode::LOCK_MODE_WRITE, {output}); + guard.add(LockMode::LOCK_MODE_NONE, {*flip}); flip->submit(pstream->cudaHandle(), input, output, flipCode); diff --git a/python/mod_cvcuda/OpGammaContrast.cpp b/python/mod_cvcuda/OpGammaContrast.cpp index 85a644916..8df72480b 100644 --- a/python/mod_cvcuda/OpGammaContrast.cpp +++ 
b/python/mod_cvcuda/OpGammaContrast.cpp @@ -42,9 +42,9 @@ ImageBatchVarShape VarShapeGammaContrastInto(ImageBatchVarShape &output, ImageBa auto gamma_contrast = CreateOperator(input.capacity(), input.uniqueFormat().numChannels()); ResourceGuard guard(*pstream); - guard.add(LockMode::LOCK_READ, {input, gamma}); - guard.add(LockMode::LOCK_WRITE, {output}); - guard.add(LockMode::LOCK_WRITE, {*gamma_contrast}); + guard.add(LockMode::LOCK_MODE_READ, {input, gamma}); + guard.add(LockMode::LOCK_MODE_WRITE, {output}); + guard.add(LockMode::LOCK_MODE_READWRITE, {*gamma_contrast}); gamma_contrast->submit(pstream->cudaHandle(), input, output, gamma); diff --git a/python/mod_cvcuda/OpGaussian.cpp b/python/mod_cvcuda/OpGaussian.cpp index 89634c79a..fdf9de806 100644 --- a/python/mod_cvcuda/OpGaussian.cpp +++ b/python/mod_cvcuda/OpGaussian.cpp @@ -46,9 +46,9 @@ Tensor GaussianInto(Tensor &output, Tensor &input, const std::tuple &k auto gaussian = CreateOperator(kernelSizeArg, 0); ResourceGuard guard(*pstream); - guard.add(LockMode::LOCK_READ, {input}); - guard.add(LockMode::LOCK_WRITE, {output}); - guard.add(LockMode::LOCK_WRITE, {*gaussian}); + guard.add(LockMode::LOCK_MODE_READ, {input}); + guard.add(LockMode::LOCK_MODE_WRITE, {output}); + guard.add(LockMode::LOCK_MODE_READWRITE, {*gaussian}); gaussian->submit(pstream->cudaHandle(), input, output, kernelSizeArg, sigmaArg, border); @@ -77,9 +77,9 @@ ImageBatchVarShape VarShapeGaussianInto(ImageBatchVarShape &output, ImageBatchVa auto gaussian = CreateOperator(maxKernelSizeArg, input.capacity()); ResourceGuard guard(*pstream); - guard.add(LockMode::LOCK_READ, {input, ksize, sigma}); - guard.add(LockMode::LOCK_WRITE, {output}); - guard.add(LockMode::LOCK_WRITE, {*gaussian}); + guard.add(LockMode::LOCK_MODE_READ, {input, ksize, sigma}); + guard.add(LockMode::LOCK_MODE_WRITE, {output}); + guard.add(LockMode::LOCK_MODE_READWRITE, {*gaussian}); gaussian->submit(pstream->cudaHandle(), input, output, ksize, sigma, border); diff --git a/python/mod_cvcuda/OpGaussianNoise.cpp b/python/mod_cvcuda/OpGaussianNoise.cpp index 255572851..94280ceba 100644 --- a/python/mod_cvcuda/OpGaussianNoise.cpp +++ b/python/mod_cvcuda/OpGaussianNoise.cpp @@ -41,9 +41,9 @@ Tensor GaussianNoiseInto(Tensor &output, Tensor &input, Tensor &mu, Tensor &sigm auto gaussiannoise = CreateOperator((int)shape[0]); ResourceGuard guard(*pstream); - guard.add(LockMode::LOCK_READ, {input, mu, sigma}); - guard.add(LockMode::LOCK_WRITE, {output}); - guard.add(LockMode::LOCK_WRITE, {*gaussiannoise}); + guard.add(LockMode::LOCK_MODE_READ, {input, mu, sigma}); + guard.add(LockMode::LOCK_MODE_WRITE, {output}); + guard.add(LockMode::LOCK_MODE_READWRITE, {*gaussiannoise}); gaussiannoise->submit(pstream->cudaHandle(), input, output, mu, sigma, per_channel, seed); @@ -70,9 +70,9 @@ ImageBatchVarShape GaussianNoiseVarShapeInto(ImageBatchVarShape &output, ImageBa auto gaussiannoise = CreateOperator(input.numImages()); ResourceGuard guard(*pstream); - guard.add(LockMode::LOCK_READ, {input, mu, sigma}); - guard.add(LockMode::LOCK_WRITE, {output}); - guard.add(LockMode::LOCK_WRITE, {*gaussiannoise}); + guard.add(LockMode::LOCK_MODE_READ, {input, mu, sigma}); + guard.add(LockMode::LOCK_MODE_WRITE, {output}); + guard.add(LockMode::LOCK_MODE_READWRITE, {*gaussiannoise}); gaussiannoise->submit(pstream->cudaHandle(), input, output, mu, sigma, per_channel, seed); diff --git a/python/mod_cvcuda/OpHQResize.cpp b/python/mod_cvcuda/OpHQResize.cpp new file mode 100644 index 000000000..295771013 --- /dev/null +++ 
b/python/mod_cvcuda/OpHQResize.cpp @@ -0,0 +1,761 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "Operators.hpp" +#include "WorkspaceCache.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace cvcudapy { + +namespace { + +using Roi = pybind11::tuple; +using Rois = std::vector; + +inline void GetMinMagInterpolation(NVCVInterpolationType &minInterpolationArg, + NVCVInterpolationType &magInterpolationArg, + const std::optional &interpolation, + const std::optional &minInterpolation, + const std::optional &magInterpolation) +{ + if (interpolation) + { + if (minInterpolation || magInterpolation) + { + throw py::value_error( + "When `interpolation` is specified, the `min_interpolation` and `mag_interpolation` should not be " + "specified."); + } + minInterpolationArg = magInterpolationArg = *interpolation; + } + else + { + if (!minInterpolation || !magInterpolation) + { + throw py::value_error( + "Either `interpolation`, or both `min_interpolation` and `mag_interpolation` must be specified."); + } + minInterpolationArg = *minInterpolation; + magInterpolationArg = *magInterpolation; + } +} + +inline void ParseRoi(HQResizeRoiF &parsedRoi, const Roi &roi, int ndim) +{ + assert(ndim == 2 || ndim == 3); + auto roiSize = roi.size(); + if (roiSize != static_cast(2 * ndim)) + { + if (ndim == 2) + { + throw std::runtime_error( + "Got wrong number of ROI components. For image resize, 4 integers are expected: " + "low_height, low_width, high_height, high_width describing the bounding box for " + "the input."); + } + else + { + throw std::runtime_error( + "Got wrong number of ROI components. For volumetric data, 6 integers are expected: " + "low_depth, low_height, low_width, high_depth, high_height, high_width " + "describing the bounding box for the input."); + } + } + for (int d = 0; d < ndim; d++) + { + parsedRoi.lo[d] = roi[d].cast(); + } + for (int d = 0; d < ndim; d++) + { + parsedRoi.hi[d] = roi[ndim + d].cast(); + } +} + +class RoiHelper +{ +public: + RoiHelper(const std::optional &maybeRois, int ndim) + : m_ndim{ndim} + { + if (maybeRois) + { + auto &rois = *maybeRois; + m_rois.resize(rois.size()); + for (uint64_t i = 0; i < rois.size(); i++) + { + auto &roi = m_rois[i]; + auto &passedRoi = rois[i]; + ParseRoi(roi, passedRoi, ndim); + } + } + } + + RoiHelper(const std::optional &maybeRoi, int ndim) + : m_ndim{ndim} + { + if (maybeRoi) + { + m_rois.resize(1); + ParseRoi(m_rois[0], *maybeRoi, ndim); + } + } + + HQResizeRoisF NonOwningHandle() + { + int32_t size = m_rois.size(); + HQResizeRoiF *data = size == 0 ? 
nullptr : m_rois.data(); + return {size, m_ndim, data}; + } + +private: + int m_ndim; + std::vector m_rois; +}; + +inline HQResizeTensorShapeI TensorShape(const nvcv::TensorLayout &layout, const nvcv::TensorShape &shape, + int resizeNDim) +{ + assert(resizeNDim == 2 || resizeNDim == 3); + + char shapeArgLayout[4] = "DHW"; + HQResizeTensorShapeI tensorShape; + for (int d = 0; d < resizeNDim; d++) + { + int axis = layout.find(shapeArgLayout[d + 3 - resizeNDim]); + if (axis < 0) + { + throw std::runtime_error( + "The layout of an input tensor to the resize operator must contain HW extents in the layout (for " + "images) or DHW extents (for 3D resampling). Some extents are missing in the input tensor."); + } + tensorShape.extent[d] = shape[axis]; + } + int channelAxis = layout.find('C'); + tensorShape.numChannels = channelAxis < 0 ? 1 : shape[channelAxis]; + tensorShape.ndim = resizeNDim; + return tensorShape; +} + +class BatchShapesHelper +{ +public: + BatchShapesHelper(const nvcv::ImageBatchVarShape &batch) + { + int32_t numSamples = batch.numImages(); + m_shapes.resize(numSamples); + m_ndim = 2; + m_numChannels = batch.uniqueFormat().numChannels(); + for (int i = 0; i < numSamples; i++) + { + const auto &imgShape = batch[i].size(); + auto &shape = m_shapes[i]; + shape.extent[0] = imgShape.h; + shape.extent[1] = imgShape.w; + } + } + + BatchShapesHelper(const TensorBatch &batch) + { + int32_t numSamples = batch.numTensors(); + auto layout = batch.layout(); + bool hasDepth = layout.find('D') >= 0; + m_ndim = hasDepth ? 3 : 2; + m_numChannels = -1; + m_shapes.resize(numSamples); + for (int i = 0; i < numSamples; i++) + { + const auto &tensor = batch[i]; + m_shapes[i] = TensorShape(layout, tensor.shape(), m_ndim); + if (i == 0) + { + m_numChannels = m_shapes[i].numChannels; + } + else if (m_numChannels != m_shapes[i].numChannels) + { + m_numChannels = -1; + } + } + } + + HQResizeTensorShapesI NonOwningHandle() + { + int32_t size = m_shapes.size(); + return {size ? m_shapes.data() : nullptr, size, m_ndim, m_numChannels}; + } + +private: + int32_t m_ndim; + int32_t m_numChannels; + std::vector m_shapes; +}; + +inline Shape ResizedTensorShape(const nvcv::TensorLayout &srcLayout, const nvcv::TensorShape &srcShape, + const Shape &outShape) +{ + int resizeNDim = outShape.size(); + if (resizeNDim != 2 && resizeNDim != 3) + { + throw std::runtime_error( + "The `out_shape` must be a tuple of 2 or 3 integers (for 2D or 3D resampling respectively)."); + } + + bool hasDepth = srcLayout.find('D') >= 0; + int expectedNDim = hasDepth ? 3 : 2; + + if (expectedNDim != resizeNDim) + { + if (hasDepth) + { + throw std::runtime_error( + "The input tensor contains depth extent (`D`) in the layout. For 3D resize, please specify the resized " + "shape for 3 extents: depth, height, and width. Got 2 extents."); + } + else + { + throw std::runtime_error( + "Expected the resized shape to consists of 2 integers: for resized height and width. 
Got 3 integers."); + } + } + + char shapeArgLayout[4] = "DHW"; + int shapeArg[3]; + for (int d = 0; d < resizeNDim; d++) + { + shapeArg[d] = outShape[d].cast(); + } + + Shape resizedShape(srcShape.rank()); + for (int i = 0; i < srcShape.rank(); i++) + { + resizedShape[i] = srcShape[i]; + } + + assert(srcShape.rank() == srcLayout.rank()); + for (int d = 0; d < resizeNDim; d++) + { + int axis = srcLayout.find(shapeArgLayout[d + 3 - resizeNDim]); + if (axis < 0) + { + throw std::runtime_error( + "The layout of an input tensor to the resize operator must contain HW extents in the layout (for " + "images) or DHW extents (for 3D resampling). Some extents are missing in the input tensor."); + } + resizedShape[axis] = shapeArg[d]; + } + return resizedShape; +} + +class PyOpHQResize : public nvcvpy::Container +{ +public: + // Define a Key class to be used by the cache to fetch similar items for potential reuse. + class Key : public nvcvpy::IKey + { + public: + // the filters are generated by the operator constructor for a given device + Key(int deviceId) + : m_deviceId{deviceId} + { + } + + private: + size_t doGetHash() const override + { + return m_deviceId; + } + + bool doIsCompatible(const nvcvpy::IKey &that_) const override + { + const Key *thatKey = dynamic_cast(&that_); + return thatKey != nullptr && thatKey->m_deviceId == m_deviceId; + } + + int m_deviceId; + }; + + PyOpHQResize(int deviceId) + : m_key(deviceId) + , m_op() + { + } + + void submit(cudaStream_t stream, const Tensor &in, const Tensor &out, const NVCVInterpolationType minInterpolation, + const NVCVInterpolationType magInterpolation, bool antialias, const HQResizeRoiF *roi) + { + if (in.layout() != out.layout()) + { + throw std::runtime_error("Input and output tensors must have the same layout"); + } + + int resizeNDim = in.layout().find('D') >= 0 ? 
3 : 2; + + auto inAccess = nvcv::TensorDataAccessStridedImagePlanar::Create(in.exportData()); + if (!inAccess) + { + throw std::runtime_error("Incompatible input tensor layout"); + } + + auto outAccess = nvcv::TensorDataAccessStridedImagePlanar::Create(out.exportData()); + if (!outAccess) + { + throw std::runtime_error("Incompatible input tensor layout"); + } + + int numSamples = inAccess->numSamples(); + HQResizeTensorShapeI inShape = TensorShape(in.layout(), in.shape(), resizeNDim); + HQResizeTensorShapeI outShape = TensorShape(out.layout(), out.shape(), resizeNDim); + + auto req = m_op.getWorkspaceRequirements(numSamples, inShape, outShape, minInterpolation, magInterpolation, + antialias, roi); + auto ws = WorkspaceCache::instance().get(req, stream); + m_op(stream, ws.get(), in, out, minInterpolation, magInterpolation, antialias, roi); + } + + void submit(cudaStream_t stream, const nvcv::ImageBatchVarShape &in, const nvcv::ImageBatchVarShape &out, + const NVCVInterpolationType minInterpolation, const NVCVInterpolationType magInterpolation, + bool antialias, const HQResizeRoisF rois) + { + BatchShapesHelper inShapes(in); + BatchShapesHelper outShapes(out); + auto req + = m_op.getWorkspaceRequirements(in.numImages(), inShapes.NonOwningHandle(), outShapes.NonOwningHandle(), + minInterpolation, magInterpolation, antialias, rois); + auto ws = WorkspaceCache::instance().get(req, stream); + m_op(stream, ws.get(), in, out, minInterpolation, magInterpolation, antialias, rois); + } + + void submit(cudaStream_t stream, const TensorBatch &in, const TensorBatch &out, + const NVCVInterpolationType minInterpolation, const NVCVInterpolationType magInterpolation, + bool antialias, const HQResizeRoisF rois) + { + if (in.layout() != out.layout()) + { + throw std::runtime_error("Input and output batches must have the same layout"); + } + BatchShapesHelper inShapes(in); + BatchShapesHelper outShapes(out); + auto req + = m_op.getWorkspaceRequirements(in.numTensors(), inShapes.NonOwningHandle(), outShapes.NonOwningHandle(), + minInterpolation, magInterpolation, antialias, rois); + auto ws = WorkspaceCache::instance().get(req, stream); + m_op(stream, ws.get(), in, out, minInterpolation, magInterpolation, antialias, rois); + } + + // Required override to get the py object container. + py::object container() const override + { + return *this; + } + + // Required override to get the key as the base interface class. + const nvcvpy::IKey &key() const override + { + return m_key; + } + + static std::shared_ptr fetch(std::vector> &cache) + { + assert(!cache.empty()); + return cache[0]; + } + +private: + Key m_key; + cvcuda::HQResize m_op; +}; + +template +auto RunGuard(Op &op, Src &src, Dst &dst, Stream &stream, Call &&call) +{ + ResourceGuard guard(stream); + guard.add(LockMode::LOCK_MODE_READ, {src}); + guard.add(LockMode::LOCK_MODE_WRITE, {dst}); + guard.add(LockMode::LOCK_MODE_NONE, {*op}); + + call(); +} + +auto CreatePyOpHQResize() +{ + int deviceId; + NVCV_CHECK_THROW(cudaGetDevice(&deviceId)); + return CreateOperatorEx(deviceId); +} + +Tensor TensorHQResizeInto(Tensor &dst, Tensor &src, std::optional antialias, std::optional maybeRoi, + std::optional interpolation, + std::optional minInterpolation, + std::optional magInterpolation, std::optional pstream) +{ + Stream stream = pstream ? *pstream : Stream::Current(); + auto op = CreatePyOpHQResize(); + + bool hasDepth = src.layout().find('D') >= 0; + int resizeNDim = hasDepth ? 
3 : 2; + RoiHelper parsedRoi(maybeRoi, resizeNDim); + const HQResizeRoiF *roi = parsedRoi.NonOwningHandle().roi; + + NVCVInterpolationType minInterpolationArg, magInterpolationArg; + GetMinMagInterpolation(minInterpolationArg, magInterpolationArg, interpolation, minInterpolation, magInterpolation); + + RunGuard(op, src, dst, stream, + [&]() + { + op->submit(stream.cudaHandle(), src, dst, minInterpolationArg, magInterpolationArg, + antialias.value_or(false), roi); + }); + return dst; +} + +Tensor TensorHQResize(Tensor &src, const Shape &outShape, std::optional antialias, std::optional roi, + std::optional interpolation, + std::optional minInterpolation, + std::optional magInterpolation, std::optional pstream) +{ + auto resizedShape = ResizedTensorShape(src.layout(), src.shape(), outShape); + Tensor dst = Tensor::Create(resizedShape, src.dtype(), src.layout()); + return TensorHQResizeInto(dst, src, antialias, roi, interpolation, minInterpolation, magInterpolation, pstream); +} + +ImageBatchVarShape VarShapeHQResizeInto(ImageBatchVarShape &dst, const ImageBatchVarShape &src, + std::optional antialias, const std::optional &roi, + std::optional interpolation, + std::optional minInterpolation, + std::optional magInterpolation, + std::optional pstream) +{ + Stream stream = pstream ? *pstream : Stream::Current(); + auto op = CreatePyOpHQResize(); + + RoiHelper parsedRoi(roi, 2); + NVCVInterpolationType minInterpolationArg, magInterpolationArg; + GetMinMagInterpolation(minInterpolationArg, magInterpolationArg, interpolation, minInterpolation, magInterpolation); + + RunGuard(op, src, dst, stream, + [&]() + { + op->submit(stream.cudaHandle(), src, dst, minInterpolationArg, magInterpolationArg, + antialias.value_or(false), parsedRoi.NonOwningHandle()); + }); + return dst; +} + +ImageBatchVarShape VarShapeHQResize(ImageBatchVarShape &src, const std::vector> &outShape, + std::optional antialias, const std::optional &roi, + std::optional interpolation, + std::optional minInterpolation, + std::optional magInterpolation, + std::optional pstream) +{ + ImageBatchVarShape out = ImageBatchVarShape::Create(src.capacity()); + + int32_t numOutSizes = outShape.size(); + if (numOutSizes != src.numImages() && numOutSizes != 1) + { + throw std::runtime_error( + "The list of output shapes `out_size` must either contain a single shape to be used for all output images " + "or its length must match the number of input samples."); + } + + for (int i = 0; i < src.numImages(); ++i) + { + auto size = outShape[numOutSizes == 1 ? 0 : i]; + auto image = Image::Create({std::get<1>(size), std::get<0>(size)}, src[i].format()); + out.pushBack(image); + } + + return VarShapeHQResizeInto(out, src, antialias, roi, interpolation, minInterpolation, magInterpolation, pstream); +} + +TensorBatch TensorBatchHQResizeInto(TensorBatch &dst, const TensorBatch &src, std::optional antialias, + const std::optional &roi, std::optional interpolation, + std::optional minInterpolation, + std::optional magInterpolation, + std::optional pstream) +{ + Stream stream = pstream ? *pstream : Stream::Current(); + auto op = CreatePyOpHQResize(); + + bool hasDepth = src.layout().find('D') >= 0; + int resizeNDim = hasDepth ? 
3 : 2; + RoiHelper parsedRoi(roi, resizeNDim); + + NVCVInterpolationType minInterpolationArg, magInterpolationArg; + GetMinMagInterpolation(minInterpolationArg, magInterpolationArg, interpolation, minInterpolation, magInterpolation); + + RunGuard(op, src, dst, stream, + [&]() + { + op->submit(stream.cudaHandle(), src, dst, minInterpolationArg, magInterpolationArg, + antialias.value_or(false), parsedRoi.NonOwningHandle()); + }); + return dst; +} + +TensorBatch TensorBatchHQResize(TensorBatch &src, const std::vector &outShape, std::optional antialias, + const std::optional &roi, std::optional interpolation, + std::optional minInterpolation, + std::optional magInterpolation, std::optional pstream) +{ + TensorBatch out = TensorBatch::Create(src.numTensors()); + + int32_t numOutSizes = outShape.size(); + if (numOutSizes != src.numTensors() && numOutSizes != 1) + { + throw std::runtime_error( + "The list of output shapes `out_size` must either contain a single shape to be used for all output tensors " + "or its length must match the number of input tensors."); + } + + for (int i = 0; i < src.numTensors(); ++i) + { + auto sampleShape = outShape[numOutSizes == 1 ? 0 : i]; + const auto &inSample = src[i]; + auto resizedShape = ResizedTensorShape(inSample.layout(), inSample.shape(), sampleShape); + Tensor dst = Tensor::Create(resizedShape, src.dtype(), src.layout()); + out.pushBack(dst); + } + + return TensorBatchHQResizeInto(out, src, antialias, roi, interpolation, minInterpolation, magInterpolation, + pstream); +} + +} // namespace + +void ExportOpHQResize(py::module &m) +{ + using namespace pybind11::literals; + + m.def("hq_resize", &TensorHQResize, "src"_a, "out_size"_a, py::kw_only(), "antialias"_a = false, "roi"_a = nullptr, + "interpolation"_a = nullptr, "min_interpolation"_a = nullptr, "mag_interpolation"_a = nullptr, + "stream"_a = nullptr, R"pbdoc( + Executes the HQ Resize operation on the given cuda stream. The operator + supports resampling for 2D (images) and 3D volumetric samples. + + See also: + Refer to the CV-CUDA C API reference for the HQ Resize operator + for more details and usage examples. + + Args: + src (Tensor): Input tensor containing one or more images. + The tensor layout must match: (N)(D)HW(C). + out_size (Shape): Tuple of 2 or 3 ints describing the output shape in (D)HW layout. + antialias (bool): If set to true, an antialiasing is enabled for scaling down. + roi(Tuple): Optional bounding box describing the input's region of interest. + For 2D resampling it should be (lowH, lowW, highH, highW), + for 3D: (lowD, lowH, lowW, highD, highH, highW). + If, for some axis, the low bound is bigger than the high bound, + the image is flipped across the axis. + interpolation(Interp): Interpolation type used. Used both for scaling down and up, + cannot be specified together with (min_interpolation or mag_interpolation). + min_interpolation(Interp): Interpolation type used for scaling down. + mag_interpolation(Interp): Interpolation type used for scaling up. + stream (Stream, optional): CUDA Stream on which to perform the operation. + + Returns: + cvcuda.Tensor: The output tensor. + + Caution: + Restrictions to several arguments may apply. Check the C + API references of the CV-CUDA operator. 
+ )pbdoc"); + m.def("hq_resize", &VarShapeHQResize, "src"_a, "out_size"_a, py::kw_only(), "antialias"_a = false, + "roi"_a = nullptr, "interpolation"_a = nullptr, "min_interpolation"_a = nullptr, + "mag_interpolation"_a = nullptr, "stream"_a = nullptr, R"pbdoc( + Executes the HQ Resize operation on the given cuda stream. + + See also: + Refer to the CV-CUDA C API reference for the HQ Resize operator + for more details and usage examples. + + Args: + src (ImageBatchVarShape): Input batch of images. + out_size (Shape): Tuple of 2 ints describing the output shape in HW layout. + antialias (bool): If set to true, an antialiasing is enabled for scaling down. + roi(List[Tuple]): Optional bounding boxes describing the input's region of interest. + It should be a list of tuples. The list length must match the number + of input tensors or be 1 (so that the same ROI is used for all samples). + Each tuple must be of the form (lowH, lowW, highH, highW). + If, for some axis, the low bound is bigger than the high bound, + the image is flipped across the axis. + interpolation(Interp): Interpolation type used. Used both for scaling down and up, + cannot be specified together with (min_interpolation or mag_interpolation). + min_interpolation(Interp): Interpolation type used for scaling down. + mag_interpolation(Interp): Interpolation type used for scaling up. + stream (Stream, optional): CUDA Stream on which to perform the operation. + + Returns: + cvcuda.ImageBatchVarShape: The batch of resized images. + + Caution: + Restrictions to several arguments may apply. Check the C + API references of the CV-CUDA operator. + )pbdoc"); + m.def("hq_resize", &TensorBatchHQResize, "src"_a, "out_size"_a, py::kw_only(), "antialias"_a = false, + "roi"_a = nullptr, "interpolation"_a = nullptr, "min_interpolation"_a = nullptr, + "mag_interpolation"_a = nullptr, "stream"_a = nullptr, R"pbdoc( + Executes the HQ Resize operation on the given cuda stream. The operator + supports resampling for 2D (images) and 3D volumetric samples. + + See also: + Refer to the CV-CUDA C API reference for the HQ Resize operator + for more details and usage examples. + + Args: + src (TensorBatch): Input batch containing one or more tensors of (D)HW(C) layout. + out_size (Shape): Tuple of 2 or 3 ints describing the output shape in (D)HW layout. + antialias (bool): If set to true, an antialiasing is enabled for scaling down. + roi(List[Tuple]): Optional bounding boxes describing the input's region of interest. + It should be a list of tuples. The list length must match the number + of input tensors or be 1 (so that the same ROI is used for all samples). + Each tuple must be of the form: + * for 2D resampling: (lowH, lowW, highH, highW), + * for 3D: (lowD, lowH, lowW, highD, highH, highW). + If, for some axis, the low bound is bigger than the high bound, + the tensor is flipped across the axis. + interpolation(Interp): Interpolation type used. Used both for scaling down and up, + cannot be specified together with (min_interpolation or mag_interpolation). + min_interpolation(Interp): Interpolation type used for scaling down. + mag_interpolation(Interp): Interpolation type used for scaling up. + stream (Stream, optional): CUDA Stream on which to perform the operation. + + Returns: + cvcuda.TensorBatch: The batch of resized tensors. + + Caution: + Restrictions to several arguments may apply. Check the C + API references of the CV-CUDA operator. 
+ )pbdoc"); + m.def("hq_resize_into", &TensorHQResizeInto, "dst"_a, "src"_a, py::kw_only(), "antialias"_a = false, + "roi"_a = nullptr, "interpolation"_a = nullptr, "min_interpolation"_a = nullptr, + "mag_interpolation"_a = nullptr, "stream"_a = nullptr, R"pbdoc( + Executes the HQ Resize operation on the given cuda stream. The operator + supports resampling for 2D (images) and 3D volumetric samples. + + See also: + Refer to the CV-CUDA C API reference for the HQ Resize operator + for more details and usage examples. + + Args: + dst (Tensor): Output tensor. Its layout must match the src tensor. + The size of D, H, and W extents may be different. The dst + type must match the src's type or be float32. + src (Tensor): Input tensor containing one or more images. + The tensor layout must match: (N)(D)HW(C). + antialias (bool): If set to true, an antialiasing is enabled for scaling down. + roi(Tuple): Optional bounding box describing the input's region of interest. + For 2D resampling it should be (lowH, lowW, highH, highW), + for 3D: (lowD, lowH, lowW, highD, highH, highW). + If, for some axis, the low bound is bigger than the high bound, + the image is flipped across the axis. + interpolation(Interp): Interpolation type used. Used both for scaling down and up, + cannot be specified together with (min_interpolation or mag_interpolation). + min_interpolation(Interp): Interpolation type used for scaling down. + mag_interpolation(Interp): Interpolation type used for scaling up. + stream (Stream, optional): CUDA Stream on which to perform the operation. + + Returns: + cvcuda.Tensor: The output tensor. + + Caution: + Restrictions to several arguments may apply. Check the C + API references of the CV-CUDA operator. + )pbdoc"); + m.def("hq_resize_into", &VarShapeHQResizeInto, "dst"_a, "src"_a, py::kw_only(), "antialias"_a = false, + "roi"_a = nullptr, "interpolation"_a = nullptr, "min_interpolation"_a = nullptr, + "mag_interpolation"_a = nullptr, "stream"_a = nullptr, R"pbdoc( + Executes the HQ Resize operation on the given cuda stream. + + See also: + Refer to the CV-CUDA C API reference for the HQ Resize operator + for more details and usage examples. + + Args: + dst (ImageBatchVarShape): Output batch. The layout must match the input batch. + The size of D, H, and W extents may be different. The dst + type must match the src's type or be float32. + src (ImageBatchVarShape): Input batch of images. + antialias (bool): If set to true, an antialiasing is enabled for scaling down. + roi(List[Tuple]): Optional bounding boxes describing the input's region of interest. + It should be a list of tuples. The list length must match the number + of input tensors or be 1 (so that the same ROI is used for all samples). + Each tuple must be of the form (lowH, lowW, highH, highW). + If, for some axis, the low bound is bigger than the high bound, + the image is flipped across the axis. + interpolation(Interp): Interpolation type used. Used both for scaling down and up, + cannot be specified together with (min_interpolation or mag_interpolation). + min_interpolation(Interp): Interpolation type used for scaling down. + mag_interpolation(Interp): Interpolation type used for scaling up. + stream (Stream, optional): CUDA Stream on which to perform the operation. + + Returns: + cvcuda.ImageBatchVarShape: The batch of resized images. + + Caution: + Restrictions to several arguments may apply. Check the C + API references of the CV-CUDA operator.
+ )pbdoc"); + m.def("hq_resize_into", &TensorBatchHQResizeInto, "dst"_a, "src"_a, py::kw_only(), "antialias"_a = false, + "roi"_a = nullptr, "interpolation"_a = nullptr, "min_interpolation"_a = nullptr, + "mag_interpolation"_a = nullptr, "stream"_a = nullptr, R"pbdoc( + Executes the HQ Resize operation on the given cuda stream. The operator + supports resampling for 2D (images) and 3D volumetric samples. + + See also: + Refer to the CV-CUDA C API reference for the HQ Resize operator + for more details and usage examples. + + Args: + dst (TensorBatch): Output batch. The layout must match the input batch. + The size of D, H, and W extents may be different. The dst + type must match the src's type or be float32. + src (TensorBatch): Input batch containing one or more tensors of (D)HW(C) layout. + antialias (bool): If set to true, an antialiasing is enabled for scaling down. + roi(List[Tuple]): Optional bounding boxes describing the input's region of interest. + It should be a list of tuples. The list length must match the number + of input tensors or be 1 (so that the same ROI is used for all samples). + Each tuple must be of the form: + * for 2D resampling: (lowH, lowW, highH, highW), + * for 3D: (lowD, lowH, lowW, highD, highH, highW). + If, for some axis, the low bound is bigger than the high bound, + the tensor is flipped across the axis. + interpolation(Interp): Interpolation type used. Used both for scaling down and up, + cannot be specified together with (min_interpolation or mag_interpolation). + min_interpolation(Interp): Interpolation type used for scaling down. + mag_interpolation(Interp): Interpolation type used for scaling up. + stream (Stream, optional): CUDA Stream on which to perform the operation. + + Returns: + cvcuda.TensorBatch: The batch of resized tensors. + + Caution: + Restrictions to several arguments may apply. Check the C + API references of the CV-CUDA operator. 
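For the `*_into` variants bound above, the destination is preallocated by the caller, which is useful when output memory is reused across iterations. A hedged sketch, again assuming torch-backed buffers wrapped with `nvcv.as_tensor`:

```python
# Hedged sketch (illustrative only): write the resized result into a
# preallocated destination tensor. dst keeps the source dtype here; per the
# docstring, float32 output is the other allowed choice.
import torch
import nvcv
import cvcuda

src_buf = torch.rand(480, 640, 3, device="cuda", dtype=torch.float32)
dst_buf = torch.empty(240, 320, 3, device="cuda", dtype=torch.float32)

src = nvcv.as_tensor(src_buf, "HWC")
dst = nvcv.as_tensor(dst_buf, "HWC")

cvcuda.hq_resize_into(dst, src, antialias=True,
                      interpolation=cvcuda.Interp.CUBIC)
```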
+ )pbdoc"); +} + +} // namespace cvcudapy diff --git a/python/mod_cvcuda/OpHistogram.cpp b/python/mod_cvcuda/OpHistogram.cpp index a9aea9b70..715882433 100644 --- a/python/mod_cvcuda/OpHistogram.cpp +++ b/python/mod_cvcuda/OpHistogram.cpp @@ -45,13 +45,13 @@ Tensor HistogramInto(Tensor &histogram, Tensor &input, std::optional mas auto op = CreateOperator(); ResourceGuard guard(*pstream); - guard.add(LockMode::LOCK_READ, {input}); - guard.add(LockMode::LOCK_WRITE, {histogram}); - guard.add(LockMode::LOCK_NONE, {*op}); + guard.add(LockMode::LOCK_MODE_READ, {input}); + guard.add(LockMode::LOCK_MODE_WRITE, {histogram}); + guard.add(LockMode::LOCK_MODE_NONE, {*op}); if (mask) { - guard.add(LockMode::LOCK_READ, {*mask}); + guard.add(LockMode::LOCK_MODE_READ, {*mask}); op->submit(pstream->cudaHandle(), input, *mask, histogram); } else diff --git a/python/mod_cvcuda/OpHistogramEq.cpp b/python/mod_cvcuda/OpHistogramEq.cpp index ca13fe87b..30bf40384 100644 --- a/python/mod_cvcuda/OpHistogramEq.cpp +++ b/python/mod_cvcuda/OpHistogramEq.cpp @@ -37,9 +37,9 @@ Tensor HistogramEqInto(Tensor &output, Tensor &input, std::optional pstr auto op = CreateOperator((uint32_t)shape[0]); ResourceGuard guard(*pstream); - guard.add(LockMode::LOCK_READ, {input}); - guard.add(LockMode::LOCK_WRITE, {output}); - guard.add(LockMode::LOCK_NONE, {*op}); + guard.add(LockMode::LOCK_MODE_READ, {input}); + guard.add(LockMode::LOCK_MODE_WRITE, {output}); + guard.add(LockMode::LOCK_MODE_READWRITE, {*op}); op->submit(pstream->cudaHandle(), input, output); @@ -64,9 +64,9 @@ ImageBatchVarShape HistogramEqVarShapeInto(ImageBatchVarShape &output, ImageBatc auto op = CreateOperator((uint32_t)input.numImages()); ResourceGuard guard(*pstream); - guard.add(LockMode::LOCK_READ, {input}); - guard.add(LockMode::LOCK_WRITE, {output}); - guard.add(LockMode::LOCK_NONE, {*op}); + guard.add(LockMode::LOCK_MODE_READ, {input}); + guard.add(LockMode::LOCK_MODE_WRITE, {output}); + guard.add(LockMode::LOCK_MODE_READWRITE, {*op}); op->submit(pstream->cudaHandle(), input, output); diff --git a/python/mod_cvcuda/OpInpaint.cpp b/python/mod_cvcuda/OpInpaint.cpp index 21176e6a0..36af7a0ec 100644 --- a/python/mod_cvcuda/OpInpaint.cpp +++ b/python/mod_cvcuda/OpInpaint.cpp @@ -128,9 +128,9 @@ Tensor InpaintInto(Tensor &output, Tensor &input, Tensor &masks, double inpaintR auto inpaint = CreateOperatorEx((int)shape[0], maxShape); ResourceGuard guard(*pstream); - guard.add(LockMode::LOCK_READ, {input, masks}); - guard.add(LockMode::LOCK_WRITE, {output}); - guard.add(LockMode::LOCK_WRITE, {*inpaint}); + guard.add(LockMode::LOCK_MODE_READ, {input, masks}); + guard.add(LockMode::LOCK_MODE_WRITE, {output}); + guard.add(LockMode::LOCK_MODE_READWRITE, {*inpaint}); inpaint->submit(pstream->cudaHandle(), input, masks, output, inpaintRadius); @@ -155,9 +155,9 @@ ImageBatchVarShape InpaintVarShapeInto(ImageBatchVarShape &output, ImageBatchVar auto inpaint = CreateOperatorEx(input.numImages(), maxShape); ResourceGuard guard(*pstream); - guard.add(LockMode::LOCK_READ, {input, masks}); - guard.add(LockMode::LOCK_WRITE, {output}); - guard.add(LockMode::LOCK_WRITE, {*inpaint}); + guard.add(LockMode::LOCK_MODE_READ, {input, masks}); + guard.add(LockMode::LOCK_MODE_WRITE, {output}); + guard.add(LockMode::LOCK_MODE_READWRITE, {*inpaint}); inpaint->submit(pstream->cudaHandle(), input, masks, output, inpaintRadius); diff --git a/python/mod_cvcuda/OpJointBilateralFilter.cpp b/python/mod_cvcuda/OpJointBilateralFilter.cpp index 0298ec8f5..243054c4a 100644 --- 
a/python/mod_cvcuda/OpJointBilateralFilter.cpp +++ b/python/mod_cvcuda/OpJointBilateralFilter.cpp @@ -42,9 +42,9 @@ Tensor JointBilateralFilterInto(Tensor &output, Tensor &input, Tensor &inputColo auto joint_bilateral_filter = CreateOperator(); ResourceGuard guard(*pstream); - guard.add(LockMode::LOCK_READ, {input, inputColor}); - guard.add(LockMode::LOCK_WRITE, {output}); - guard.add(LockMode::LOCK_NONE, {*joint_bilateral_filter}); + guard.add(LockMode::LOCK_MODE_READ, {input, inputColor}); + guard.add(LockMode::LOCK_MODE_WRITE, {output}); + guard.add(LockMode::LOCK_MODE_NONE, {*joint_bilateral_filter}); joint_bilateral_filter->submit(pstream->cudaHandle(), input, inputColor, output, diameter, sigmaColor, sigmaSpace, borderMode); @@ -73,9 +73,9 @@ ImageBatchVarShape VarShapeJointBilateralFilterInto(ImageBatchVarShape &output, auto joint_bilateral_filter = CreateOperator(); ResourceGuard guard(*pstream); - guard.add(LockMode::LOCK_READ, {input, inputColor, diameter, sigmaColor, sigmaSpace}); - guard.add(LockMode::LOCK_WRITE, {output}); - guard.add(LockMode::LOCK_NONE, {*joint_bilateral_filter}); + guard.add(LockMode::LOCK_MODE_READ, {input, inputColor, diameter, sigmaColor, sigmaSpace}); + guard.add(LockMode::LOCK_MODE_WRITE, {output}); + guard.add(LockMode::LOCK_MODE_NONE, {*joint_bilateral_filter}); joint_bilateral_filter->submit(pstream->cudaHandle(), input, inputColor, output, diameter, sigmaColor, sigmaSpace, borderMode); diff --git a/python/mod_cvcuda/OpLabel.cpp b/python/mod_cvcuda/OpLabel.cpp index eb89d55ba..1d45618d8 100644 --- a/python/mod_cvcuda/OpLabel.cpp +++ b/python/mod_cvcuda/OpLabel.cpp @@ -45,33 +45,33 @@ TupleTensor3 LabelInto(Tensor &output, std::optional count, std::optiona auto op = CreateOperator(); ResourceGuard guard(*pstream); - guard.add(LockMode::LOCK_READ, {input}); - guard.add(LockMode::LOCK_WRITE, {output}); - guard.add(LockMode::LOCK_NONE, {*op}); + guard.add(LockMode::LOCK_MODE_READ, {input}); + guard.add(LockMode::LOCK_MODE_WRITE, {output}); + guard.add(LockMode::LOCK_MODE_NONE, {*op}); if (count) { - guard.add(LockMode::LOCK_WRITE, {*count}); + guard.add(LockMode::LOCK_MODE_WRITE, {*count}); } if (stats) { - guard.add(LockMode::LOCK_WRITE, {*stats}); + guard.add(LockMode::LOCK_MODE_WRITE, {*stats}); } if (bgLabel) { - guard.add(LockMode::LOCK_READ, {*bgLabel}); + guard.add(LockMode::LOCK_MODE_READ, {*bgLabel}); } if (minThresh) { - guard.add(LockMode::LOCK_READ, {*minThresh}); + guard.add(LockMode::LOCK_MODE_READ, {*minThresh}); } if (maxThresh) { - guard.add(LockMode::LOCK_READ, {*maxThresh}); + guard.add(LockMode::LOCK_MODE_READ, {*maxThresh}); } if (minSize) { - guard.add(LockMode::LOCK_READ, {*minSize}); + guard.add(LockMode::LOCK_MODE_READ, {*minSize}); } op->submit(pstream->cudaHandle(), input, output, (bgLabel ? 
*bgLabel : nvcv::Tensor{nullptr}), diff --git a/python/mod_cvcuda/OpLaplacian.cpp b/python/mod_cvcuda/OpLaplacian.cpp index c90388b52..3b5655837 100644 --- a/python/mod_cvcuda/OpLaplacian.cpp +++ b/python/mod_cvcuda/OpLaplacian.cpp @@ -43,9 +43,9 @@ Tensor LaplacianInto(Tensor &output, Tensor &input, const int &ksize, const floa auto laplacian = CreateOperator(); ResourceGuard guard(*pstream); - guard.add(LockMode::LOCK_READ, {input}); - guard.add(LockMode::LOCK_WRITE, {output}); - guard.add(LockMode::LOCK_NONE, {*laplacian}); + guard.add(LockMode::LOCK_MODE_READ, {input}); + guard.add(LockMode::LOCK_MODE_WRITE, {output}); + guard.add(LockMode::LOCK_MODE_NONE, {*laplacian}); laplacian->submit(pstream->cudaHandle(), input, output, ksize, scale, border); @@ -71,9 +71,9 @@ ImageBatchVarShape LaplacianVarShapeInto(ImageBatchVarShape &output, ImageBatchV auto laplacian = CreateOperator(); ResourceGuard guard(*pstream); - guard.add(LockMode::LOCK_READ, {input, ksize, scale}); - guard.add(LockMode::LOCK_WRITE, {output}); - guard.add(LockMode::LOCK_NONE, {*laplacian}); + guard.add(LockMode::LOCK_MODE_READ, {input, ksize, scale}); + guard.add(LockMode::LOCK_MODE_WRITE, {output}); + guard.add(LockMode::LOCK_MODE_NONE, {*laplacian}); laplacian->submit(pstream->cudaHandle(), input, output, ksize, scale, border); diff --git a/python/mod_cvcuda/OpMedianBlur.cpp b/python/mod_cvcuda/OpMedianBlur.cpp index 72a2fa9d4..2122945ba 100644 --- a/python/mod_cvcuda/OpMedianBlur.cpp +++ b/python/mod_cvcuda/OpMedianBlur.cpp @@ -42,9 +42,9 @@ Tensor MedianBlurInto(Tensor &output, Tensor &input, const std::tuple auto median_blur = CreateOperator(0); ResourceGuard guard(*pstream); - guard.add(LockMode::LOCK_READ, {input}); - guard.add(LockMode::LOCK_WRITE, {output}); - guard.add(LockMode::LOCK_NONE, {*median_blur}); + guard.add(LockMode::LOCK_MODE_READ, {input}); + guard.add(LockMode::LOCK_MODE_WRITE, {output}); + guard.add(LockMode::LOCK_MODE_NONE, {*median_blur}); nvcv::Size2D ksizeArg{std::get<0>(ksize), std::get<1>(ksize)}; @@ -71,9 +71,9 @@ ImageBatchVarShape VarShapeMedianBlurInto(ImageBatchVarShape &output, ImageBatch auto median_blur = CreateOperator(input.capacity()); ResourceGuard guard(*pstream); - guard.add(LockMode::LOCK_READ, {input, ksize}); - guard.add(LockMode::LOCK_WRITE, {output}); - guard.add(LockMode::LOCK_WRITE, {*median_blur}); + guard.add(LockMode::LOCK_MODE_READ, {input, ksize}); + guard.add(LockMode::LOCK_MODE_WRITE, {output}); + guard.add(LockMode::LOCK_MODE_READWRITE, {*median_blur}); median_blur->submit(pstream->cudaHandle(), input, output, ksize); diff --git a/python/mod_cvcuda/OpMinAreaRect.cpp b/python/mod_cvcuda/OpMinAreaRect.cpp index 9c9cdd9f0..30fcb7d6b 100644 --- a/python/mod_cvcuda/OpMinAreaRect.cpp +++ b/python/mod_cvcuda/OpMinAreaRect.cpp @@ -37,9 +37,9 @@ Tensor MinAreaRectInto(Tensor &output, Tensor &input, Tensor &numPointsInContour auto minAreaRect = CreateOperator(totalContours); ResourceGuard guard(*pstream); - guard.add(LockMode::LOCK_READ, {input, numPointsInContour}); - guard.add(LockMode::LOCK_WRITE, {output}); - guard.add(LockMode::LOCK_NONE, {*minAreaRect}); + guard.add(LockMode::LOCK_MODE_READ, {input, numPointsInContour}); + guard.add(LockMode::LOCK_MODE_WRITE, {output}); + guard.add(LockMode::LOCK_MODE_READWRITE, {*minAreaRect}); minAreaRect->submit(pstream->cudaHandle(), input, output, numPointsInContour, totalContours); diff --git a/python/mod_cvcuda/OpMinMaxLoc.cpp b/python/mod_cvcuda/OpMinMaxLoc.cpp index 94d55573b..eb1eaa80b 100644 --- 
a/python/mod_cvcuda/OpMinMaxLoc.cpp +++ b/python/mod_cvcuda/OpMinMaxLoc.cpp @@ -77,9 +77,9 @@ TupleTensor3 MinLocInto(Tensor &minVal, Tensor &minLoc, Tensor &numMin, InputCon auto op = CreateOperator(); ResourceGuard guard(*pstream); - guard.add(LockMode::LOCK_READ, {input}); - guard.add(LockMode::LOCK_WRITE, {minVal, minLoc, numMin}); - guard.add(LockMode::LOCK_NONE, {*op}); + guard.add(LockMode::LOCK_MODE_READ, {input}); + guard.add(LockMode::LOCK_MODE_WRITE, {minVal, minLoc, numMin}); + guard.add(LockMode::LOCK_MODE_NONE, {*op}); op->submit(pstream->cudaHandle(), input, minVal, minLoc, numMin, nullptr, nullptr, nullptr); @@ -110,9 +110,9 @@ TupleTensor3 MaxLocInto(Tensor &maxVal, Tensor &maxLoc, Tensor &numMax, InputCon auto op = CreateOperator(); ResourceGuard guard(*pstream); - guard.add(LockMode::LOCK_READ, {input}); - guard.add(LockMode::LOCK_WRITE, {maxVal, maxLoc, numMax}); - guard.add(LockMode::LOCK_NONE, {*op}); + guard.add(LockMode::LOCK_MODE_READ, {input}); + guard.add(LockMode::LOCK_MODE_WRITE, {maxVal, maxLoc, numMax}); + guard.add(LockMode::LOCK_MODE_NONE, {*op}); op->submit(pstream->cudaHandle(), input, nullptr, nullptr, nullptr, maxVal, maxLoc, numMax); @@ -143,9 +143,9 @@ TupleTensor6 MinMaxLocInto(Tensor &minVal, Tensor &minLoc, Tensor &numMin, Tenso auto op = CreateOperator(); ResourceGuard guard(*pstream); - guard.add(LockMode::LOCK_READ, {input}); - guard.add(LockMode::LOCK_WRITE, {minVal, minLoc, numMin, maxVal, maxLoc, numMax}); - guard.add(LockMode::LOCK_NONE, {*op}); + guard.add(LockMode::LOCK_MODE_READ, {input}); + guard.add(LockMode::LOCK_MODE_WRITE, {minVal, minLoc, numMin, maxVal, maxLoc, numMax}); + guard.add(LockMode::LOCK_MODE_NONE, {*op}); op->submit(pstream->cudaHandle(), input, minVal, minLoc, numMin, maxVal, maxLoc, numMax); diff --git a/python/mod_cvcuda/OpMorphology.cpp b/python/mod_cvcuda/OpMorphology.cpp index b8cd44954..e3e91cc63 100644 --- a/python/mod_cvcuda/OpMorphology.cpp +++ b/python/mod_cvcuda/OpMorphology.cpp @@ -44,9 +44,9 @@ Tensor MorphologyInto(Tensor &output, Tensor &input, NVCVMorphologyType morph_ty auto morphology = CreateOperator(); ResourceGuard guard(*pstream); - guard.add(LockMode::LOCK_READ, {input}); - guard.add(LockMode::LOCK_WRITE, {output}); - guard.add(LockMode::LOCK_NONE, {*morphology}); + guard.add(LockMode::LOCK_MODE_READ, {input}); + guard.add(LockMode::LOCK_MODE_WRITE, {output}); + guard.add(LockMode::LOCK_MODE_NONE, {*morphology}); nvcv::Size2D maskSizeArg{std::get<0>(maskSize), std::get<1>(maskSize)}; int2 anchorArg; @@ -55,7 +55,7 @@ Tensor MorphologyInto(Tensor &output, Tensor &input, NVCVMorphologyType morph_ty if (workspace) { - guard.add(LockMode::LOCK_READ, {*workspace}); + guard.add(LockMode::LOCK_MODE_READ, {*workspace}); morphology->submit(pstream->cudaHandle(), input, output, *workspace, morph_type, maskSizeArg, anchorArg, iteration, border); } @@ -90,14 +90,13 @@ ImageBatchVarShape MorphologyVarShapeInto(ImageBatchVarShape &output, ImageBatch auto morphology = CreateOperator(); ResourceGuard guard(*pstream); - guard.add(LockMode::LOCK_READ, {input}); - guard.add(LockMode::LOCK_READWRITE, {output, masks, anchors}); - guard.add(LockMode::LOCK_WRITE, {output}); - guard.add(LockMode::LOCK_WRITE, {*morphology}); + guard.add(LockMode::LOCK_MODE_READ, {input}); + guard.add(LockMode::LOCK_MODE_READWRITE, {output, masks, anchors}); + guard.add(LockMode::LOCK_MODE_READWRITE, {*morphology}); if (workspace) { - guard.add(LockMode::LOCK_READ, {*workspace}); + guard.add(LockMode::LOCK_MODE_READ, {*workspace}); 
morphology->submit(pstream->cudaHandle(), input, output, *workspace, morph_type, masks, anchors, iteration, borderMode); } diff --git a/python/mod_cvcuda/OpNonMaximumSuppression.cpp b/python/mod_cvcuda/OpNonMaximumSuppression.cpp index 11bdff135..2df26ca84 100644 --- a/python/mod_cvcuda/OpNonMaximumSuppression.cpp +++ b/python/mod_cvcuda/OpNonMaximumSuppression.cpp @@ -42,9 +42,9 @@ Tensor NonMaximumSuppressionInto(Tensor &dst, Tensor &src, Tensor &scores, float auto op = CreateOperator(); ResourceGuard guard(*pstream); - guard.add(LockMode::LOCK_READ, {src, scores}); - guard.add(LockMode::LOCK_WRITE, {dst}); - guard.add(LockMode::LOCK_NONE, {*op}); + guard.add(LockMode::LOCK_MODE_READ, {src, scores}); + guard.add(LockMode::LOCK_MODE_WRITE, {dst}); + guard.add(LockMode::LOCK_MODE_NONE, {*op}); op->submit(pstream->cudaHandle(), src, dst, scores, scoreThreshold, iouThreshold); diff --git a/python/mod_cvcuda/OpNormalize.cpp b/python/mod_cvcuda/OpNormalize.cpp index 147beaf61..4cdb2d392 100644 --- a/python/mod_cvcuda/OpNormalize.cpp +++ b/python/mod_cvcuda/OpNormalize.cpp @@ -54,9 +54,9 @@ Tensor NormalizeInto(Tensor &output, Tensor &input, Tensor &base, Tensor &scale, auto normalize = CreateOperator(); ResourceGuard guard(*pstream); - guard.add(LockMode::LOCK_READ, {input, base, scale}); - guard.add(LockMode::LOCK_WRITE, {output}); - guard.add(LockMode::LOCK_NONE, {*normalize}); + guard.add(LockMode::LOCK_MODE_READ, {input, base, scale}); + guard.add(LockMode::LOCK_MODE_WRITE, {output}); + guard.add(LockMode::LOCK_MODE_NONE, {*normalize}); normalize->submit(pstream->cudaHandle(), input, base, scale, output, globalScale, globalShift, epsilon, *flags); @@ -88,9 +88,9 @@ ImageBatchVarShape VarShapeNormalizeInto(ImageBatchVarShape &output, ImageBatchV auto normalize = CreateOperator(); ResourceGuard guard(*pstream); - guard.add(LockMode::LOCK_READ, {input, base, scale}); - guard.add(LockMode::LOCK_WRITE, {output}); - guard.add(LockMode::LOCK_NONE, {*normalize}); + guard.add(LockMode::LOCK_MODE_READ, {input, base, scale}); + guard.add(LockMode::LOCK_MODE_WRITE, {output}); + guard.add(LockMode::LOCK_MODE_NONE, {*normalize}); normalize->submit(pstream->cudaHandle(), input, base, scale, output, globalScale, globalShift, epsilon, *flags); diff --git a/python/mod_cvcuda/OpOSD.cpp b/python/mod_cvcuda/OpOSD.cpp index 434769af0..fa0dcd93b 100644 --- a/python/mod_cvcuda/OpOSD.cpp +++ b/python/mod_cvcuda/OpOSD.cpp @@ -36,9 +36,9 @@ Tensor OSDInto(Tensor &output, Tensor &input, NVCVElements elements, std::option auto op = CreateOperator(); ResourceGuard guard(*pstream); - guard.add(LockMode::LOCK_READ, {input}); - guard.add(LockMode::LOCK_WRITE, {output}); - guard.add(LockMode::LOCK_NONE, {*op}); + guard.add(LockMode::LOCK_MODE_READ, {input}); + guard.add(LockMode::LOCK_MODE_WRITE, {output}); + guard.add(LockMode::LOCK_MODE_NONE, {*op}); op->submit(pstream->cudaHandle(), input, output, elements); diff --git a/python/mod_cvcuda/OpPadAndStack.cpp b/python/mod_cvcuda/OpPadAndStack.cpp index 35d5c8a45..295f80589 100644 --- a/python/mod_cvcuda/OpPadAndStack.cpp +++ b/python/mod_cvcuda/OpPadAndStack.cpp @@ -38,9 +38,9 @@ Tensor PadAndStackInto(Tensor &output, ImageBatchVarShape &input, Tensor &top, T auto padstack = CreateOperator(); ResourceGuard guard(*pstream); - guard.add(LockMode::LOCK_READ, {input, top, left}); - guard.add(LockMode::LOCK_WRITE, {output}); - guard.add(LockMode::LOCK_NONE, {*padstack}); + guard.add(LockMode::LOCK_MODE_READ, {input, top, left}); + guard.add(LockMode::LOCK_MODE_WRITE, {output}); 
+ guard.add(LockMode::LOCK_MODE_NONE, {*padstack}); padstack->submit(pstream->cudaHandle(), input, output, top, left, border, borderValue); diff --git a/python/mod_cvcuda/OpPairwiseMatcher.cpp b/python/mod_cvcuda/OpPairwiseMatcher.cpp index 2b9248d76..195c1f19c 100644 --- a/python/mod_cvcuda/OpPairwiseMatcher.cpp +++ b/python/mod_cvcuda/OpPairwiseMatcher.cpp @@ -50,25 +50,25 @@ TupleTensor3 PairwiseMatcherInto(Tensor &matches, std::optional numMatch auto op = CreateOperator(algoChoice); ResourceGuard guard(*pstream); - guard.add(LockMode::LOCK_READ, {set1, set2}); - guard.add(LockMode::LOCK_WRITE, {matches}); - guard.add(LockMode::LOCK_NONE, {*op}); + guard.add(LockMode::LOCK_MODE_READ, {set1, set2}); + guard.add(LockMode::LOCK_MODE_WRITE, {matches}); + guard.add(LockMode::LOCK_MODE_NONE, {*op}); if (numSet1) { - guard.add(LockMode::LOCK_READ, {*numSet1}); + guard.add(LockMode::LOCK_MODE_READ, {*numSet1}); } if (numSet2) { - guard.add(LockMode::LOCK_READ, {*numSet2}); + guard.add(LockMode::LOCK_MODE_READ, {*numSet2}); } if (numMatches) { - guard.add(LockMode::LOCK_WRITE, {*numMatches}); + guard.add(LockMode::LOCK_MODE_WRITE, {*numMatches}); } if (distances) { - guard.add(LockMode::LOCK_WRITE, {*distances}); + guard.add(LockMode::LOCK_MODE_WRITE, {*distances}); } op->submit(pstream->cudaHandle(), set1, set2, (numSet1 ? *numSet1 : nvcv::Tensor{nullptr}), diff --git a/python/mod_cvcuda/OpPillowResize.cpp b/python/mod_cvcuda/OpPillowResize.cpp index 75a5b9088..c66231248 100644 --- a/python/mod_cvcuda/OpPillowResize.cpp +++ b/python/mod_cvcuda/OpPillowResize.cpp @@ -194,9 +194,9 @@ Tensor PillowResizeInto(Tensor &output, Tensor &input, nvcv::ImageFormat format, auto pillowResize = CreateOperatorEx(); ResourceGuard guard(*pstream); - guard.add(LockMode::LOCK_READ, {input}); - guard.add(LockMode::LOCK_WRITE, {output}); - guard.add(LockMode::LOCK_WRITE, {*pillowResize}); + guard.add(LockMode::LOCK_MODE_READ, {input}); + guard.add(LockMode::LOCK_MODE_WRITE, {output}); + guard.add(LockMode::LOCK_MODE_READWRITE, {*pillowResize}); pillowResize->submit(pstream->cudaHandle(), input, output, format, interp); @@ -223,9 +223,9 @@ ImageBatchVarShape VarShapePillowResizeInto(ImageBatchVarShape &output, ImageBat auto pillowResize = CreateOperatorEx(); ResourceGuard guard(*pstream); - guard.add(LockMode::LOCK_READ, {input}); - guard.add(LockMode::LOCK_WRITE, {output}); - guard.add(LockMode::LOCK_WRITE, {*pillowResize}); + guard.add(LockMode::LOCK_MODE_READ, {input}); + guard.add(LockMode::LOCK_MODE_WRITE, {output}); + guard.add(LockMode::LOCK_MODE_READWRITE, {*pillowResize}); pillowResize->submit(pstream->cudaHandle(), input, output, interpolation); diff --git a/python/mod_cvcuda/OpRandomResizedCrop.cpp b/python/mod_cvcuda/OpRandomResizedCrop.cpp index 1f993a1d1..da428a656 100644 --- a/python/mod_cvcuda/OpRandomResizedCrop.cpp +++ b/python/mod_cvcuda/OpRandomResizedCrop.cpp @@ -43,9 +43,9 @@ Tensor RandomResizedCropInto(Tensor &output, Tensor &input, double min_scale, do = CreateOperator(min_scale, max_scale, min_ratio, max_ratio, batchSize, seed); ResourceGuard guard(*pstream); - guard.add(LockMode::LOCK_READ, {input}); - guard.add(LockMode::LOCK_WRITE, {output}); - guard.add(LockMode::LOCK_NONE, {*randomResizedCrop}); + guard.add(LockMode::LOCK_MODE_READ, {input}); + guard.add(LockMode::LOCK_MODE_WRITE, {output}); + guard.add(LockMode::LOCK_MODE_READWRITE, {*randomResizedCrop}); randomResizedCrop->submit(pstream->cudaHandle(), input, output, interp); @@ -74,9 +74,9 @@ ImageBatchVarShape 
RandomResizedCropVarShapeInto(ImageBatchVarShape &output, Ima = CreateOperator(min_scale, max_scale, min_ratio, max_ratio, input.capacity(), seed); ResourceGuard guard(*pstream); - guard.add(LockMode::LOCK_READ, {input}); - guard.add(LockMode::LOCK_WRITE, {output}); - guard.add(LockMode::LOCK_NONE, {*randomResizedCrop}); + guard.add(LockMode::LOCK_MODE_READ, {input}); + guard.add(LockMode::LOCK_MODE_WRITE, {output}); + guard.add(LockMode::LOCK_MODE_READWRITE, {*randomResizedCrop}); randomResizedCrop->submit(pstream->cudaHandle(), input, output, interp); diff --git a/python/mod_cvcuda/OpReformat.cpp b/python/mod_cvcuda/OpReformat.cpp index ba1609e90..227ba0a0f 100644 --- a/python/mod_cvcuda/OpReformat.cpp +++ b/python/mod_cvcuda/OpReformat.cpp @@ -36,9 +36,9 @@ Tensor ReformatInto(Tensor &output, Tensor &input, std::optional pstream auto reformat = CreateOperator(); ResourceGuard guard(*pstream); - guard.add(LockMode::LOCK_READ, {input}); - guard.add(LockMode::LOCK_WRITE, {output}); - guard.add(LockMode::LOCK_NONE, {*reformat}); + guard.add(LockMode::LOCK_MODE_READ, {input}); + guard.add(LockMode::LOCK_MODE_WRITE, {output}); + guard.add(LockMode::LOCK_MODE_NONE, {*reformat}); reformat->submit(pstream->cudaHandle(), input, output); diff --git a/python/mod_cvcuda/OpRemap.cpp b/python/mod_cvcuda/OpRemap.cpp index 84e47b627..3ad42fca7 100644 --- a/python/mod_cvcuda/OpRemap.cpp +++ b/python/mod_cvcuda/OpRemap.cpp @@ -46,9 +46,9 @@ Tensor RemapInto(Tensor &dst, Tensor &src, Tensor &map, NVCVInterpolationType sr auto op = CreateOperator(); ResourceGuard guard(*pstream); - guard.add(LockMode::LOCK_READ, {src, map}); - guard.add(LockMode::LOCK_WRITE, {dst}); - guard.add(LockMode::LOCK_NONE, {*op}); + guard.add(LockMode::LOCK_MODE_READ, {src, map}); + guard.add(LockMode::LOCK_MODE_WRITE, {dst}); + guard.add(LockMode::LOCK_MODE_NONE, {*op}); op->submit(pstream->cudaHandle(), src, dst, map, srcInterp, mapInterp, mapValueType, alignCorners, borderMode, bValue); @@ -110,9 +110,9 @@ ImageBatchVarShape VarShapeRemapInto(ImageBatchVarShape &dst, ImageBatchVarShape auto op = CreateOperator(); ResourceGuard guard(*pstream); - guard.add(LockMode::LOCK_READ, {src, map}); - guard.add(LockMode::LOCK_WRITE, {dst}); - guard.add(LockMode::LOCK_NONE, {*op}); + guard.add(LockMode::LOCK_MODE_READ, {src, map}); + guard.add(LockMode::LOCK_MODE_WRITE, {dst}); + guard.add(LockMode::LOCK_MODE_NONE, {*op}); op->submit(pstream->cudaHandle(), src, dst, map, srcInterp, mapInterp, mapValueType, alignCorners, borderMode, bValue); diff --git a/python/mod_cvcuda/OpResize.cpp b/python/mod_cvcuda/OpResize.cpp index f5c32f2b2..7d42dcce7 100644 --- a/python/mod_cvcuda/OpResize.cpp +++ b/python/mod_cvcuda/OpResize.cpp @@ -39,9 +39,9 @@ Tensor ResizeInto(Tensor &output, Tensor &input, NVCVInterpolationType interp, s auto resize = CreateOperator(); ResourceGuard guard(*pstream); - guard.add(LockMode::LOCK_READ, {input}); - guard.add(LockMode::LOCK_WRITE, {output}); - guard.add(LockMode::LOCK_NONE, {*resize}); + guard.add(LockMode::LOCK_MODE_READ, {input}); + guard.add(LockMode::LOCK_MODE_WRITE, {output}); + guard.add(LockMode::LOCK_MODE_NONE, {*resize}); resize->submit(pstream->cudaHandle(), input, output, interp); @@ -66,9 +66,9 @@ ImageBatchVarShape ResizeVarShapeInto(ImageBatchVarShape &output, ImageBatchVarS auto resize = CreateOperator(); ResourceGuard guard(*pstream); - guard.add(LockMode::LOCK_READ, {input}); - guard.add(LockMode::LOCK_WRITE, {output}); - guard.add(LockMode::LOCK_NONE, {*resize}); + 
guard.add(LockMode::LOCK_MODE_READ, {input}); + guard.add(LockMode::LOCK_MODE_WRITE, {output}); + guard.add(LockMode::LOCK_MODE_NONE, {*resize}); resize->submit(pstream->cudaHandle(), input, output, interp); diff --git a/python/mod_cvcuda/OpRotate.cpp b/python/mod_cvcuda/OpRotate.cpp index ae40328c3..a12965f4e 100644 --- a/python/mod_cvcuda/OpRotate.cpp +++ b/python/mod_cvcuda/OpRotate.cpp @@ -42,9 +42,9 @@ Tensor RotateInto(Tensor &output, Tensor &input, double angleDeg, const std::tup auto rotate = CreateOperator(0); ResourceGuard guard(*pstream); - guard.add(LockMode::LOCK_READ, {input}); - guard.add(LockMode::LOCK_WRITE, {output}); - guard.add(LockMode::LOCK_WRITE, {*rotate}); + guard.add(LockMode::LOCK_MODE_READ, {input}); + guard.add(LockMode::LOCK_MODE_WRITE, {output}); + guard.add(LockMode::LOCK_MODE_READWRITE, {*rotate}); double2 shiftArg{std::get<0>(shift), std::get<1>(shift)}; @@ -72,9 +72,9 @@ ImageBatchVarShape VarShapeRotateInto(ImageBatchVarShape &output, ImageBatchVarS auto rotate = CreateOperator(input.capacity()); ResourceGuard guard(*pstream); - guard.add(LockMode::LOCK_READ, {input, angleDeg, shift}); - guard.add(LockMode::LOCK_WRITE, {output}); - guard.add(LockMode::LOCK_WRITE, {*rotate}); + guard.add(LockMode::LOCK_MODE_READ, {input, angleDeg, shift}); + guard.add(LockMode::LOCK_MODE_WRITE, {output}); + guard.add(LockMode::LOCK_MODE_READWRITE, {*rotate}); rotate->submit(pstream->cudaHandle(), input, output, angleDeg, shift, interpolation); diff --git a/python/mod_cvcuda/OpSIFT.cpp b/python/mod_cvcuda/OpSIFT.cpp index 65c44d4a6..f82fd1dad 100644 --- a/python/mod_cvcuda/OpSIFT.cpp +++ b/python/mod_cvcuda/OpSIFT.cpp @@ -162,9 +162,9 @@ TupleTensor4 SIFTInto(Tensor &featCoords, Tensor &featMetadata, Tensor &featDesc auto op = CreateOperatorEx(inShape, numOctaveLayers); ResourceGuard guard(*pstream); - guard.add(LockMode::LOCK_READ, {in}); - guard.add(LockMode::LOCK_WRITE, {featCoords, featMetadata, featDescriptors, numFeatures}); - guard.add(LockMode::LOCK_WRITE, {*op}); + guard.add(LockMode::LOCK_MODE_READ, {in}); + guard.add(LockMode::LOCK_MODE_WRITE, {featCoords, featMetadata, featDescriptors, numFeatures}); + guard.add(LockMode::LOCK_MODE_READWRITE, {*op}); op->submit(pstream->cudaHandle(), in, featCoords, featMetadata, featDescriptors, numFeatures, numOctaveLayers, contrastThreshold, edgeThreshold, initSigma, flags); diff --git a/python/mod_cvcuda/OpStack.cpp b/python/mod_cvcuda/OpStack.cpp index 41c7b891e..da815a876 100644 --- a/python/mod_cvcuda/OpStack.cpp +++ b/python/mod_cvcuda/OpStack.cpp @@ -86,9 +86,9 @@ Tensor StackIntoInternal(Tensor &output, std::vector &tensorList, std::o auto op = CreateOperator(); ResourceGuard guard(*pstream); - guard.add(LockMode::LOCK_READ, {inTensorBatch}); - guard.add(LockMode::LOCK_WRITE, {output}); - guard.add(LockMode::LOCK_NONE, {*op}); + guard.add(LockMode::LOCK_MODE_READ, {inTensorBatch}); + guard.add(LockMode::LOCK_MODE_WRITE, {output}); + guard.add(LockMode::LOCK_MODE_NONE, {*op}); op->submit(pstream->cudaHandle(), inTensorBatch, output); return std::move(output); } diff --git a/python/mod_cvcuda/OpThreshold.cpp b/python/mod_cvcuda/OpThreshold.cpp index 3eb1b211f..37c398ca5 100644 --- a/python/mod_cvcuda/OpThreshold.cpp +++ b/python/mod_cvcuda/OpThreshold.cpp @@ -41,9 +41,9 @@ Tensor ThresholdInto(Tensor &output, Tensor &input, Tensor &thresh, Tensor &maxv auto threshold = CreateOperator(type, (int)shape[0]); ResourceGuard guard(*pstream); - guard.add(LockMode::LOCK_READ, {input, thresh, maxval}); - 
guard.add(LockMode::LOCK_WRITE, {output}); - guard.add(LockMode::LOCK_WRITE, {*threshold}); + guard.add(LockMode::LOCK_MODE_READ, {input, thresh, maxval}); + guard.add(LockMode::LOCK_MODE_WRITE, {output}); + guard.add(LockMode::LOCK_MODE_READWRITE, {*threshold}); threshold->submit(pstream->cudaHandle(), input, output, thresh, maxval); @@ -68,9 +68,9 @@ ImageBatchVarShape ThresholdVarShapeInto(ImageBatchVarShape &output, ImageBatchV auto threshold = CreateOperator(type, input.numImages()); ResourceGuard guard(*pstream); - guard.add(LockMode::LOCK_READ, {input, thresh, maxval}); - guard.add(LockMode::LOCK_WRITE, {output}); - guard.add(LockMode::LOCK_WRITE, {*threshold}); + guard.add(LockMode::LOCK_MODE_READ, {input, thresh, maxval}); + guard.add(LockMode::LOCK_MODE_WRITE, {output}); + guard.add(LockMode::LOCK_MODE_READWRITE, {*threshold}); threshold->submit(pstream->cudaHandle(), input, output, thresh, maxval); diff --git a/python/mod_cvcuda/OpWarpAffine.cpp b/python/mod_cvcuda/OpWarpAffine.cpp index 9cf94a946..a07c25692 100644 --- a/python/mod_cvcuda/OpWarpAffine.cpp +++ b/python/mod_cvcuda/OpWarpAffine.cpp @@ -64,9 +64,9 @@ Tensor WarpAffineInto(Tensor &output, Tensor &input, const pyarray &xform, const auto warpAffine = CreateOperator(0); ResourceGuard guard(*pstream); - guard.add(LockMode::LOCK_READ, {input}); - guard.add(LockMode::LOCK_WRITE, {output}); - guard.add(LockMode::LOCK_NONE, {*warpAffine}); + guard.add(LockMode::LOCK_MODE_READ, {input}); + guard.add(LockMode::LOCK_MODE_WRITE, {output}); + guard.add(LockMode::LOCK_MODE_NONE, {*warpAffine}); warpAffine->submit(pstream->cudaHandle(), input, output, xformOutput, flags, borderMode, bValue); @@ -107,9 +107,9 @@ ImageBatchVarShape WarpAffineVarShapeInto(ImageBatchVarShape &output, ImageBatch auto warpAffine = CreateOperator(input.capacity()); ResourceGuard guard(*pstream); - guard.add(LockMode::LOCK_READ, {input, xform}); - guard.add(LockMode::LOCK_WRITE, {output}); - guard.add(LockMode::LOCK_WRITE, {*warpAffine}); + guard.add(LockMode::LOCK_MODE_READ, {input, xform}); + guard.add(LockMode::LOCK_MODE_WRITE, {output}); + guard.add(LockMode::LOCK_MODE_READWRITE, {*warpAffine}); warpAffine->submit(pstream->cudaHandle(), input, output, xform, flags, borderMode, bValue); diff --git a/python/mod_cvcuda/OpWarpPerspective.cpp b/python/mod_cvcuda/OpWarpPerspective.cpp index b35cbe82a..33536467f 100644 --- a/python/mod_cvcuda/OpWarpPerspective.cpp +++ b/python/mod_cvcuda/OpWarpPerspective.cpp @@ -63,9 +63,9 @@ Tensor WarpPerspectiveInto(Tensor &output, Tensor &input, const pyarray &xform, auto warpPerspective = CreateOperator(0); ResourceGuard guard(*pstream); - guard.add(LockMode::LOCK_READ, {input}); - guard.add(LockMode::LOCK_WRITE, {output}); - guard.add(LockMode::LOCK_NONE, {*warpPerspective}); + guard.add(LockMode::LOCK_MODE_READ, {input}); + guard.add(LockMode::LOCK_MODE_WRITE, {output}); + guard.add(LockMode::LOCK_MODE_NONE, {*warpPerspective}); warpPerspective->submit(pstream->cudaHandle(), input, output, xformOutput, flags, borderMode, bValue); @@ -106,9 +106,9 @@ ImageBatchVarShape WarpPerspectiveVarShapeInto(ImageBatchVarShape &output, Image auto warpPerspective = CreateOperator(input.capacity()); ResourceGuard guard(*pstream); - guard.add(LockMode::LOCK_READ, {input, xform}); - guard.add(LockMode::LOCK_WRITE, {output}); - guard.add(LockMode::LOCK_WRITE, {*warpPerspective}); + guard.add(LockMode::LOCK_MODE_READ, {input, xform}); + guard.add(LockMode::LOCK_MODE_WRITE, {output}); + guard.add(LockMode::LOCK_MODE_READWRITE, 
{*warpPerspective}); warpPerspective->submit(pstream->cudaHandle(), input, output, xform, flags, borderMode, bValue); diff --git a/python/mod_cvcuda/Operators.hpp b/python/mod_cvcuda/Operators.hpp index 2b8886b6a..b48f11fbd 100644 --- a/python/mod_cvcuda/Operators.hpp +++ b/python/mod_cvcuda/Operators.hpp @@ -81,6 +81,7 @@ void ExportOpBndBox(py::module &m); void ExportOpBoxBlur(py::module &m); void ExportOpBrightnessContrast(py::module &m); void ExportOpColorTwist(py::module &m); +void ExportOpHQResize(py::module &m); void ExportOpRemap(py::module &m); void ExportOpCropFlipNormalizeReformat(py::module &m); void ExportOpAdaptiveThreshold(py::module &m); diff --git a/python/mod_nvcv/CAPI.cpp b/python/mod_nvcv/CAPI.cpp index 6c5f9cd9d..e15f6eff8 100644 --- a/python/mod_nvcv/CAPI.cpp +++ b/python/mod_nvcv/CAPI.cpp @@ -85,19 +85,19 @@ LockMode ToLockMode(PyObject *_mode) std::string s = ToObj(_mode); if (s.empty()) { - return LockMode::LOCK_NONE; + return LockMode::LOCK_MODE_NONE; } else if (s == "r") { - return LockMode::LOCK_READ; + return LockMode::LOCK_MODE_READ; } else if (s == "w") { - return LockMode::LOCK_WRITE; + return LockMode::LOCK_MODE_WRITE; } else if (s == "rw") { - return LockMode::LOCK_READWRITE; + return LockMode::LOCK_MODE_READWRITE; } else { diff --git a/python/mod_nvcv/Resource.cpp b/python/mod_nvcv/Resource.cpp index b6b49476e..afe571569 100644 --- a/python/mod_nvcv/Resource.cpp +++ b/python/mod_nvcv/Resource.cpp @@ -59,11 +59,11 @@ void Resource::submitSignal(Stream &stream, LockMode mode) const { doBeforeSubmitSignal(stream, mode); - if (mode & LOCK_READ) + if (mode & LOCK_MODE_READ) { util::CheckThrow(cudaEventRecord(m_readEvent, stream.handle())); } - if (mode & LOCK_WRITE) + if (mode & LOCK_MODE_WRITE) { util::CheckThrow(cudaEventRecord(m_writeEvent, stream.handle())); } @@ -78,12 +78,12 @@ void Resource::submitSync(Stream &stream, LockMode mode) const void Resource::doSubmitSync(Stream &stream, LockMode mode) const { - if (mode & LOCK_WRITE) + if (mode & LOCK_MODE_WRITE) { util::CheckThrow(cudaStreamWaitEvent(stream.handle(), m_writeEvent)); util::CheckThrow(cudaStreamWaitEvent(stream.handle(), m_readEvent)); } - else if (mode & LOCK_READ) + else if (mode & LOCK_MODE_READ) { util::CheckThrow(cudaStreamWaitEvent(stream.handle(), m_writeEvent)); } @@ -102,12 +102,12 @@ void Resource::doSync(LockMode mode) const { NVCV_ASSERT(PyGILState_Check() == 0); - if (mode & LOCK_WRITE) + if (mode & LOCK_MODE_WRITE) { util::CheckThrow(cudaEventSynchronize(m_writeEvent)); util::CheckThrow(cudaEventSynchronize(m_readEvent)); } - else if (mode & LOCK_READ) + else if (mode & LOCK_MODE_READ) { util::CheckThrow(cudaEventSynchronize(m_writeEvent)); } diff --git a/python/mod_nvcv/include/nvcv/python/LockMode.hpp b/python/mod_nvcv/include/nvcv/python/LockMode.hpp index d9246c32b..571b10126 100644 --- a/python/mod_nvcv/include/nvcv/python/LockMode.hpp +++ b/python/mod_nvcv/include/nvcv/python/LockMode.hpp @@ -22,10 +22,10 @@ namespace nvcvpy { enum LockMode : uint8_t { - LOCK_NONE = 0, - LOCK_READ = 1, - LOCK_WRITE = 2, - LOCK_READWRITE = LOCK_READ | LOCK_WRITE + LOCK_MODE_NONE = 0, + LOCK_MODE_READ = 1, + LOCK_MODE_WRITE = 2, + LOCK_MODE_READWRITE = LOCK_MODE_READ | LOCK_MODE_WRITE }; } // namespace nvcvpy diff --git a/python/mod_nvcv/include/nvcv/python/ResourceGuard.hpp b/python/mod_nvcv/include/nvcv/python/ResourceGuard.hpp index 51a6be4b7..40967a84b 100644 --- a/python/mod_nvcv/include/nvcv/python/ResourceGuard.hpp +++ b/python/mod_nvcv/include/nvcv/python/ResourceGuard.hpp @@ -45,16 
+45,16 @@ class ResourceGuard py::object pyLockMode; switch (mode) { - case LockMode::LOCK_NONE: + case LockMode::LOCK_MODE_NONE: pyLockMode = py::str(""); break; - case LockMode::LOCK_READ: + case LockMode::LOCK_MODE_READ: pyLockMode = py::str("r"); break; - case LockMode::LOCK_WRITE: + case LockMode::LOCK_MODE_WRITE: pyLockMode = py::str("w"); break; - case LockMode::LOCK_READWRITE: + case LockMode::LOCK_MODE_READWRITE: pyLockMode = py::str("rw"); break; } diff --git a/python/setup.py.in b/python/setup.py.in new file mode 100644 index 000000000..c22e9d0ff --- /dev/null +++ b/python/setup.py.in @@ -0,0 +1,81 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This is a Python setuptools setup script to generate Python wheels. +# It is in a template form with placeholder fields that looks like ${}. +# This script will be automatically invoked by cmake when Python bindings are built. +# Do not invoke this outside of cmake. + + +from setuptools import setup, Extension +from setuptools.command.build_ext import build_ext + + +class NoBuildExtension(build_ext): + """ + Since CV-CUDA Python wheels are pure pre-compiled binary distribution at this point + without any Python or any other source code files and since the binaries are generated + by cmake system outside and without the knowledge of the setuptools, we must + create a dummy class to build an extension here with no source code in it and + no build steps in it to let setuptools create a platform library instead of a + pure library. Without any extensions in a setup tools project setuptools will + end up creating a purelib package. One can compile cmake/pybind11 code here + as an extension but since that part is handled outside of this file for now + we will simply create an empty extension and a corresponding build step that + actually does nothing but let setuptools know that this is a pure binary distribution. + """ + + def run(self): + return # Do nothing during build time. + + +# Define our PyPI trove classifiers for this project. Many values here are +# placeholders which will be filled in by cmake when this is built. +pypi_trove_classifiers = [ + "Development Status :: 4 - Beta", + "Environment :: GPU :: NVIDIA CUDA", + "Environment :: GPU :: NVIDIA CUDA :: ${CUDA_VERSION_MAJOR}", + "Operating System :: POSIX :: Linux", + "License :: OSI Approved :: Apache Software License", + "Programming Language :: Python :: ${PYTHON_VERSION}", + "Programming Language :: Python :: Implementation :: CPython", +] + +# Finally call the setup. 
+setup( + name="cvcuda-cu${CUDA_VERSION_MAJOR}", + description="${CMAKE_PROJECT_DESCRIPTION}", + author="NVIDIA Corporation", + url="https://github.com/CVCUDA/CV-CUDA", + version="${CMAKE_PROJECT_VERSION}${PROJECT_VERSION_SUFFIX}", + packages=[""], # Must be empty to support current CV-CUDA style distribution + package_dir={"": "."}, + package_data={ + "": ["*.so", "cvcuda.libs/*.*"] + }, # Includes the binding .so + core .so files + include_package_data=True, + install_requires=["numpy>=1.23.5"], + python_requires="==${PYTHON_VERSION}.*", + zip_safe=False, + cmdclass={ + "build_ext": NoBuildExtension, # This allows us to make it a platlib. + }, + ext_modules=[ + Extension( + name="UnusedEmptyExtension", sources=[] + ), # This allows us to make it a platlib. + ], + classifiers=pypi_trove_classifiers, +) diff --git a/samples/NOTICE.md b/samples/NOTICE.md index 854ce26b9..496ac11dc 100644 --- a/samples/NOTICE.md +++ b/samples/NOTICE.md @@ -1,3 +1,18 @@ + +[//]: # "SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved." +[//]: # "SPDX-License-Identifier: Apache-2.0" +[//]: # "" +[//]: # "Licensed under the Apache License, Version 2.0 (the 'License');" +[//]: # "you may not use this file except in compliance with the License." +[//]: # "You may obtain a copy of the License at" +[//]: # "http://www.apache.org/licenses/LICENSE-2.0" +[//]: # "" +[//]: # "Unless required by applicable law or agreed to in writing, software" +[//]: # "distributed under the License is distributed on an 'AS IS' BASIS" +[//]: # "WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied." +[//]: # "See the License for the specific language governing permissions and" +[//]: # "limitations under the License." + The sample data are obtained from the following sources : - Weimaraner.jpg image is obtained from [wikimedia](https://commons.wikimedia.org/wiki/File:Baegle_dwa.jpg) under Creative Commons Attribution-Share Alike 3.0 Unported license. diff --git a/samples/README.md b/samples/README.md index a0c6a150e..8c32d4d28 100644 --- a/samples/README.md +++ b/samples/README.md @@ -1,86 +1,107 @@ + +[//]: # "SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved." +[//]: # "SPDX-License-Identifier: Apache-2.0" +[//]: # "" +[//]: # "Licensed under the Apache License, Version 2.0 (the 'License');" +[//]: # "you may not use this file except in compliance with the License." +[//]: # "You may obtain a copy of the License at" +[//]: # "http://www.apache.org/licenses/LICENSE-2.0" +[//]: # "" +[//]: # "Unless required by applicable law or agreed to in writing, software" +[//]: # "distributed under the License is distributed on an 'AS IS' BASIS" +[//]: # "WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied." +[//]: # "See the License for the specific language governing permissions and" +[//]: # "limitations under the License." + # CV-CUDA Samples ## Description -These are some sample applications showcasing various CV-CUDA APIs. Sample applications are available in C++ and Python. +CV-CUDA samples are written to showcase the use of various CV-CUDA APIs to construct fully functional end-to-end deep learning inference pipelines. Sample applications are available in C++ and Python. 
## Pre-requisites -- Recommended linux distros: +- Recommended Linux distributions: - Ubuntu >= 20.04 (tested with 20.04 and 22.04) - WSL2 with Ubuntu >= 20.04 (tested with 20.04) -- NVIDIA driver - - Linux: Driver version 520.56.06 or higher -- TensorRT == 8.5.2.2 -- NVIDIA Video Processing Framework (https://github.com/NVIDIA/VideoProcessingFramework) - - Follow the instructions from Github (https://github.com/NVIDIA/VideoProcessingFramework/blob/master/README.md) to install it via pip. - - Note: ffmpeg is a VPF dependency. It can be built from source by following these steps (https://docs.nvidia.com/video-technologies/video-codec-sdk/12.0/ffmpeg-with-nvidia-gpu/index.html). The version of ffmpeg that comes/installs via apt-get in Ubuntu 20.04 may not be sufficient for VPF. - - Note: When installing VPF in a docker image like TensorRT, there is no need to install `libnvidia-encode` and `libnvidia-decode` as those already come preinstalled. Other docker images may require an installation of these libraries. -- NVIDIA TAO Converter == 4.0.0 +- NVIDIA driver: + - Linux: Driver version >= 535 +- NVIDIA TensorRT >= 8.6.1 +- NVIDIA nvImageCodec (https://github.com/NVIDIA/nvImageCodec) +- NVIDIA PyNvVideoCodec (https://catalog.ngc.nvidia.com/orgs/nvidia/resources/py_nvvideocodec) +- NVIDIA Video Processing Framework (only if running the Triton sample) (https://github.com/NVIDIA/VideoProcessingFramework) + - Note: When installing VPF in a docker image like TensorRT, there is no need to install `libnvidia-encode` and `libnvidia-decode` as those already come pre-installed. Other docker images may require an installation of these libraries. +- NVIDIA TAO Converter >= 4.0.0 - NVIDIA NSIGHT == 2023.2.1 (only if you wish to run the benchmarking code) -- Python Packages: - - torch == 1.13.0 - - torchvision == 0.14.0 - - torchnvjpeg (https://github.com/itsliupeng/torchnvjpeg) - - av == 10.0.0 - - pycuda == 2022.1 - - nvtx == 0.2.5 +- Additional Python packages requirements listed in the `requirements.txt` file under the `samples/scripts/` folder. + + -Setting up the following is only required if you want to setup and run the samples in a docker container: -- nvidia-docker v2.11.0 -- A working NVIDIA NGC account (visit https://ngc.nvidia.com/setup to get started using NGC) and follow through the NGC documentation here https://docs.nvidia.com/ngc/ngc-catalog-user-guide/index.html#ngc-image-prerequisites -- docker CLI logged into nvcr.io (NGC's docker registry) to be able to pull docker images. +## Setting up the environment +1. We strongly recommend working in a docker container to set things up. This would greatly simplify the process of installing dependencies, compiling and running the samples. The following is required to work in a docker container with CV-CUDA samples: + 1. nvidia-docker >= 2.11.0 + 2. A working NVIDIA NGC account (visit https://ngc.nvidia.com/setup to get started using NGC) and follow through the NGC documentation on https://docs.nvidia.com/ngc/ngc-catalog-user-guide/index.html#ngc-image-prerequisites + 3. docker CLI logged into nvcr.io (NGC's docker registry) to be able to pull docker image. (e.g. using `docker login nvcr.io`) -## Steps to compile the samples from source +2. Clone this CV-CUDA git repository. We would call the location where it is stored as `CVCUDA_ROOT`. -1. Get your CUDA and TensorRT installations ready. If you wish to install CUDA and TensorRT on your existing system you may do so by downloading those packages from NVIDIA's website. 
Or if you wish to work with in a docker container, you can use the TensorRT docker from NVIDIA NGC's catalog. It comes with CUDA and TensorRT pre-installed. Make sure you have setup NGC account properly and that your local docker installation has been logged into nvcr.io domain to be able to pull from that registry. Run the following command to start the container and continue rest of the installation steps in that container. Fill in the local_mount_path and docker_mount_path to reflect any paths on your system which you want to mount inside the container as well. This container comes with Ubuntu 20.04 with Python 3.8.10. +3. Make sure your CUDA and TensorRT installations are ready. If you wish to install CUDA and TensorRT on your existing system, you may do so by downloading those packages from NVIDIA's website. If you are using docker, use the TensorRT container from NVIDIA NGC. It comes with CUDA and TensorRT pre-installed: + 1. Run the following command to start the container and continue the rest of the steps in that container. Fill in the `CVCUDA_ROOT` with the location where you have cloned this CV-CUDA repository. This will make the samples available inside the container at the `/workspace/cvcuda_samples` path. Also fill in the `CVCUDA_INSTALL` with the location where CV-CUDA installation packages (.deb or .whl files) are stored. This container comes with Ubuntu v22.04, Python v3.10.12 and TensorRT v8.6.1. ```bash - docker run -it --gpus=all -v <local_mount_path>:<docker_mount_path> nvcr.io/nvidia/tensorrt:22.09-py3 + docker run -it --gpus=all -v <CVCUDA_ROOT>/samples:/workspace/cvcuda_samples -v <CVCUDA_INSTALL>:/workspace/cvcuda_install nvcr.io/nvidia/tensorrt:24.01-py3 ``` -2. Make sure that the other helper scripts present in the `samples/scripts` folder is executable by executing following chmod commands. +4. Make sure the scripts present in the `/workspace/cvcuda_samples/scripts` directory are executable by executing the following chmod commands: ```bash - cd samples - chmod a+x scripts/*.sh - chmod a+x scripts/*.py + cd /workspace/cvcuda_samples/ # Assuming this is where the samples are + chmod a+x ./scripts/*.sh + chmod a+x ./scripts/*.py ``` -3. Install all the dependencies required to run the samples. These are mentioned above in the prerequisites section. A convenient script to install all the dependencies is available at `scripts/install_dependencies.sh`. This script may require sudo privileges depending on your setup. +5. Install all dependencies required to build and/or run the samples. These are mentioned above in the prerequisites section. A convenient script to install all the dependencies is available at `scripts/install_dependencies.sh`. ```bash + cd /workspace/cvcuda_samples/ # Assuming this is where the samples are ./scripts/install_dependencies.sh ``` -4. Install the CV-CUDA packages. Please note that since the above container comes with Python 3.8.10, we will install nvcv-python3.8-0 package as mentioned below. If you have any other Python distributions, you would need to use the appropriate nvcv-python Debian package below. +6. Install CV-CUDA packages. If you are only interested in running the Python samples, you would be fine installing just the Python wheel. If you are interested in building the non-Python samples from source, the Debian packages are required. Since our docker container has Ubuntu 22.04, CUDA 12 and Python 3.10.12, we will install the corresponding CV-CUDA package as shown below: + 1.
Using the Python wheel (only works for the Python samples): + ```bash + cd /workspace/cvcuda_install/ # Assuming this is where the installation files are + pip install cvcuda_cu12-0.6.0b0-cp310-cp310-linux_x86_64.whl + ``` - ```bash - dpkg -i nvcv-lib-0.5.0_beta-cuda11-x86_64-linux.deb - dpkg -i nvcv-dev-0.5.0_beta-cuda11-x86_64-linux.deb - dpkg -i cvcuda-samples-0.5.0_beta-cuda11-x86_64-linux.deb - dpkg -i nvcv-python3.8-0.5.0_beta-cuda11-x86_64-linux.deb - ``` -5. Copy the samples folder to the target directory. + 2. OR using the Debian packages (required to build the non-Python samples from source, also works for the Python samples): - ```bash - cp -rf /opt/nvidia/cvcuda*/samples ~/ - cd ~/samples - ``` + ```bash + cd /workspace/cvcuda_install/ # Assuming this is where the installation files are + dpkg -i cvcuda-lib-0.6.0_beta-cuda12-x86_64-linux.deb + dpkg -i cvcuda-dev-0.6.0_beta-cuda12-x86_64-linux.deb + dpkg -i cvcuda-python3.10-0.6.0_beta-cuda12-x86_64-linux.deb + ``` + +## Build the samples from source (Not required for Python samples) -6. Build the samples (whichever sample requires a build) +1. After following the [Setting up the environment](#setting-up-the-environment) section, execute the following command to compile the samples from source. This only applies to C++ samples. Python samples do not require any compilation. ```bash - ./scripts/build_samples.sh + cd /workspace/cvcuda_samples/ # Assuming this is where the samples are + ./scripts/build_samples.sh # Writes build files in /workspace/cvcuda_samples/build ``` -7. Run all the samples on by one. The `run_samples.sh` script conveniently runs all the samples in one shot. Some samples may use the TensorRT backend to run the inference and it may require a serialization step to convert a PyTorch model into a TensorRT model. This step should take some time depending on the GPUs used but usually it is only done once during the first run of the sample. The `run_samples.sh` script is supplied to serve only as a basic test case to test the samples under most frequently used command line parameters. It does not cover all the settings and command line parameters a sample may have to offer. Please explore and run the samples individually to explore all the capabilities of the samples. +## Run the samples + +1. After following the [Setting up the environment](#setting-up-the-environment) section and, if needed, compiling the samples from source, one can run the samples manually one by one or use the `scripts/run_samples.sh` script to run all samples in one shot. Some samples use the TensorRT back-end to run inference and it may require a serialization step to convert a PyTorch model into a TensorRT model. This step should take some time depending on the GPU used but usually it is only done once during the first run of the sample. The `scripts/run_samples.sh` script is supplied to serve only as a basic test case to test the samples under most frequently used command line parameters. It does not cover all the settings and command line parameters a sample may have to offer. Please explore and run the samples individually to explore all the capabilities of the samples. ```bash + cd /workspace/cvcuda_samples/ # Assuming this is where the samples are and built samples are in /workspace/cvcuda_samples/build ./scripts/run_samples.sh ``` -## Performance Benchmarking +## Performance Benchmarking of the samples -See the [Performance Benchmarking](scripts/README.md) documentation.
+See the [Performance Benchmarking](scripts/README.md) documentation to understand how to benchmark the samples. diff --git a/samples/classification/CMakeLists.txt b/samples/classification/CMakeLists.txt index 3b27da7b5..a74715d35 100644 --- a/samples/classification/CMakeLists.txt +++ b/samples/classification/CMakeLists.txt @@ -18,13 +18,13 @@ find_package(CUDA REQUIRED) set(CMAKE_CXX_FLAGS "-Wno-deprecated-enum-enum-conversion") # tag: Build classification sample -add_executable(nvcv_samples_classification Main.cpp) -target_link_libraries(nvcv_samples_classification nvcv_types cvcuda CUDA::cudart nvcv_samples_common) +add_executable(cvcuda_sample_classification Main.cpp) +target_link_libraries(cvcuda_sample_classification nvcv_types cvcuda CUDA::cudart cvcuda_samples_common) -target_include_directories(nvcv_samples_classification +target_include_directories(cvcuda_sample_classification PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/..) -install(TARGETS nvcv_samples_classification - EXPORT nvcv_samples_classification +install(TARGETS cvcuda_sample_classification + EXPORT cvcuda_sample_classification COMPONENT samples DESTINATION samples/bin) diff --git a/samples/classification/Main.cpp b/samples/classification/Main.cpp index ba9c3620f..073c1716f 100644 --- a/samples/classification/Main.cpp +++ b/samples/classification/Main.cpp @@ -198,7 +198,7 @@ int main(int argc, char *argv[]) std::string labelPath = "./engines/imagenet-classes.txt"; uint32_t batchSize = 1; - // Parse the command line paramaters to override the default parameters + // Parse the command line parameters to override the default parameters int retval = ParseArgs(argc, argv, modelPath, imagePath, labelPath, batchSize); if (retval != 0) { diff --git a/samples/classification/python/main.py b/samples/classification/python/main.py index 5e3d53dc1..f12c95f5a 100644 --- a/samples/classification/python/main.py +++ b/samples/classification/python/main.py @@ -38,10 +38,9 @@ parse_validate_default_args, ) -from torch_utils import ImageBatchDecoderPyTorch # noqa: E402 - -from vpf_utils import ( # noqa: E402 - VideoBatchDecoderVPF, +from nvcodec_utils import ( # noqa: E402 + VideoBatchDecoder, + ImageBatchDecoder, ) from pipelines import ( # noqa: E402 @@ -92,7 +91,7 @@ def run_sample( if os.path.splitext(input_path)[1] == ".jpg" or os.path.isdir(input_path): # Treat this as data modality of images - decoder = ImageBatchDecoderPyTorch( + decoder = ImageBatchDecoder( input_path, batch_size, device_id, @@ -102,7 +101,7 @@ def run_sample( else: # Treat this as data modality of videos - decoder = VideoBatchDecoderVPF( + decoder = VideoBatchDecoder( input_path, batch_size, device_id, diff --git a/samples/common/CMakeLists.txt b/samples/common/CMakeLists.txt index 3435f6a03..a114213e8 100644 --- a/samples/common/CMakeLists.txt +++ b/samples/common/CMakeLists.txt @@ -13,15 +13,15 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-project(nvcv_samples_common LANGUAGES CXX) +project(cvcuda_samples_common LANGUAGES CXX) -add_library(nvcv_samples_common SHARED +add_library(cvcuda_samples_common SHARED TRTUtils.cpp NvDecoder.cpp) -target_compile_options(nvcv_samples_common PRIVATE -Wno-deprecated-declarations -Wno-missing-declarations) -target_link_libraries(nvcv_samples_common nvcv_types cvcuda CUDA::cudart TensorRT::nvinfer CUDA::nvjpeg) +target_compile_options(cvcuda_samples_common PRIVATE -Wno-deprecated-declarations -Wno-missing-declarations) +target_link_libraries(cvcuda_samples_common nvcv_types cvcuda CUDA::cudart TensorRT::nvinfer CUDA::nvjpeg) -install(TARGETS nvcv_samples_common - EXPORT nvcv_samples_common +install(TARGETS cvcuda_samples_common + EXPORT cvcuda_samples_common COMPONENT samples DESTINATION samples/lib) diff --git a/samples/common/python/nvcodec_utils.py b/samples/common/python/nvcodec_utils.py new file mode 100644 index 000000000..2a300d385 --- /dev/null +++ b/samples/common/python/nvcodec_utils.py @@ -0,0 +1,641 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +nvcodec_utils + +This file hosts various helpers for NV codecs. +""" + + +import os +import sys +import av
+import logging +import glob +import numpy as np +import torch +import nvcv +import cvcuda +from fractions import Fraction +import itertools +import PyNvVideoCodec as nvvc +from nvidia import nvimgcodec + +from pathlib import Path + +# Bring module folders from the samples directory into our path so that +# we can import modules from it.
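As a point of reference for how this new module is consumed, the sample entry points in this patch import the unified decoder and encoder classes from it (the exact set of names varies per sample). A representative import, mirroring the one used by the object detection sample and assuming the folder containing this file is already on `sys.path` (the samples arrange for that), looks like:

```python
# Representative import used by the samples' main.py scripts.
from nvcodec_utils import (
    VideoBatchDecoder,
    VideoBatchEncoder,
    ImageBatchDecoder,
    ImageBatchEncoder,
)
```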
+samples_dir = Path(os.path.abspath(__file__)).parents[2] # samples/ +sys.path.insert(0, os.path.join(samples_dir, "")) + +from common.python.batch import Batch # noqa: E402 + +pixel_format_to_cvcuda_code = { + nvvc.Pixel_Format.YUV444: cvcuda.ColorConversion.YUV2RGB, + nvvc.Pixel_Format.NV12: cvcuda.ColorConversion.YUV2RGB_NV12, +} + + +class AppCAI: + def __init__(self, shape, stride, typestr, gpualloc): + self.__cuda_array_interface__ = { + "shape": shape, + "strides": stride, + "data": (int(gpualloc), False), + "typestr": typestr, + "version": 3, + } + + +# docs_tag: begin_videobatchdecoder_pyvideocodec +class VideoBatchDecoder: + def __init__( + self, + input_path, + batch_size, + device_id, + cuda_ctx, + cvcuda_perf, + ): + # docs_tag: begin_init_videobatchdecoder_pyvideocodec + self.logger = logging.getLogger(__name__) + self.input_path = input_path + self.batch_size = batch_size + self.device_id = device_id + self.cuda_ctx = cuda_ctx + self.cuda_stream = cvcuda.Stream().current + self.cvcuda_perf = cvcuda_perf + self.total_decoded = 0 + self.batch_idx = 0 + self.decoder = None + self.cvcuda_RGBtensor_batch = None + nvDemux = nvvc.PyNvDemuxer(self.input_path) + self.fps = nvDemux.FrameRate() + self.logger.info("Using PyNvVideoCodec decoder version: %s" % nvvc.__version__) + # docs_tag: end_init_videobatchdecoder_pyvideocodec + + # docs_tag: begin_call_videobatchdecoder_pyvideocodec + def __call__(self): + self.cvcuda_perf.push_range("decoder.pyVideoCodec") + + # docs_tag: begin_alloc_videobatchdecoder_pyvideocodec + # Check if we need to allocate the decoder for its first use. + if self.decoder is None: + self.decoder = nvVideoDecoder( + self.input_path, self.device_id, self.cuda_ctx, self.cuda_stream + ) + # docs_tag: end_alloc_videobatchdecoder_pyvideocodec + + # docs_tag: begin_decode_videobatchdecoder_pyvideocodec + # Get the NHWC YUV tensor from the decoder + cvcuda_YUVtensor = self.decoder.get_next_frames(self.batch_size) + + # Check if we are done decoding + if cvcuda_YUVtensor is None: + self.cvcuda_perf.pop_range() + return None + + # Check the code for the color conversion based in the pixel format + cvcuda_code = pixel_format_to_cvcuda_code.get(self.decoder.pixelFormat) + if cvcuda_code is None: + raise ValueError(f"Unsupported pixel format: {self.decoder.pixelFormat}") + + # Check layout to make sure it is what we expected + if cvcuda_YUVtensor.layout != "NHWC": + raise ValueError("Unexpected tensor layout, NHWC expected.") + + # this may be different than batch size since last frames may not be a multiple of batch size + actual_batch_size = cvcuda_YUVtensor.shape[0] + + # docs_tag: end_decode_videobatchdecoder_pyvideocodec + + # docs_tag: begin_convert_videobatchdecoder_pyvideocodec + # Create a CVCUDA tensor for color conversion YUV->RGB + # Allocate only for the first time or for the last batch. + if not self.cvcuda_RGBtensor_batch or actual_batch_size != self.batch_size: + self.cvcuda_RGBtensor_batch = cvcuda.Tensor( + (actual_batch_size, self.decoder.h, self.decoder.w, 3), + nvcv.Type.U8, + nvcv.TensorLayout.NHWC, + ) + + # Convert from YUV to RGB. Conversion code is based on the pixel format. + cvcuda.cvtcolor_into(self.cvcuda_RGBtensor_batch, cvcuda_YUVtensor, cvcuda_code) + + self.total_decoded += actual_batch_size + # docs_tag: end_convert_videobatchdecoder_pyvideocodec + + # docs_tag: begin_batch_videobatchdecoder_pyvideocodec + # Create a batch instance and set its properties. 
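A brief aside on the `AppCAI` helper defined above: it simply wraps a raw device pointer in the standard `__cuda_array_interface__` protocol so that CAI-aware libraries can view that memory without copying. A small, hypothetical illustration using a PyTorch allocation as the backing buffer (the shape and byte strides are chosen only for the example):

```python
import torch

# 720x1280 single-channel uint8 buffer on the GPU; AppCAI exposes it via the
# CUDA Array Interface, and torch.as_tensor can then view the same memory.
buf = torch.empty(720 * 1280, dtype=torch.uint8, device="cuda")
cai = AppCAI((720, 1280, 1), (1280, 1, 1), "|u1", buf.data_ptr())
view = torch.as_tensor(cai, device="cuda")
assert view.shape == (720, 1280, 1)
```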
+ batch = Batch( + batch_idx=self.batch_idx, + data=self.cvcuda_RGBtensor_batch, + fileinfo=self.input_path, + ) + self.batch_idx += 1 + + self.cvcuda_perf.pop_range() + return batch + # docs_tag: end_call_videobatchdecoder_pyvideocodec + + def start(self): + pass + + def join(self): + pass + + +# docs_tag: end_videobatchdecoder_pyvideocodec + +# docs_tag: begin_imp_nvvideodecoder +class nvVideoDecoder: + def __init__(self, enc_file, device_id, cuda_ctx, stream): + """ + Create instance of HW-accelerated video decoder. + :param enc_file: Full path to the MP4 file that needs to be decoded. + :param device_id: id of video card which will be used for decoding & processing. + :param cuda_ctx: A cuda context object. + """ + self.device_id = device_id + self.cuda_ctx = cuda_ctx + self.input_path = enc_file + self.stream = stream + # Demuxer is instantiated only to collect required information about + # certain video file properties. + self.nvDemux = nvvc.PyNvDemuxer(self.input_path) + self.nvDec = nvvc.CreateDecoder( + gpuid=0, + codec=self.nvDemux.GetNvCodecId(), + cudacontext=self.cuda_ctx.handle, + cudastream=self.stream.handle, + enableasyncallocations=False, + ) + + self.w, self.h = self.nvDemux.Width(), self.nvDemux.Height() + self.pixelFormat = self.nvDec.GetPixelFormat() + # In case sample aspect ratio isn't 1:1 we will re-scale the decoded + # frame to maintain uniform 1:1 ratio across the pipeline. + sar = 8.0 / 9.0 + self.fixed_h = self.h + self.fixed_w = int(self.w * sar) + + # frame iterator + def generate_decoded_frames(self): + for packet in self.nvDemux: + for decodedFrame in self.nvDec.Decode(packet): + nvcvTensor = nvcv.as_tensor( + nvcv.as_image(decodedFrame.nvcv_image(), nvcv.Format.U8) + ) + if nvcvTensor.layout == "NCHW": + # This will re-format the NCHW tensor to a NHWC tensor which will create + # a copy in the CUDA device decoded frame will go out of scope and the + # backing memory will be available by the decoder. + yield cvcuda.reformat(nvcvTensor, "NHWC") + else: + raise ValueError("Unexpected tensor layout, NCHW expected.") + + def get_next_frames(self, N): + decoded_frames = list(itertools.islice(self.generate_decoded_frames(), N)) + if len(decoded_frames) == 0: + return None + elif len(decoded_frames) == 1: # this case we dont need stack the tensor + return decoded_frames[0] + else: + # convert from list of tensors to a single tensor (NHWC) + tensorNHWC = cvcuda.stack(decoded_frames) + return tensorNHWC + + +# docs_tag: end_imp_nvvideodecoder + +# docs_tag: begin_init_videobatchencoder_pyvideocodec +class VideoBatchEncoder: + def __init__( + self, + output_path, + fps, + device_id, + cuda_ctx, + cvcuda_perf, + ): + self.logger = logging.getLogger(__name__) + self.output_path = output_path + self.fps = fps + self.device_id = device_id + self.cuda_ctx = cuda_ctx + self.cuda_stream = cvcuda.Stream().current + self.cvcuda_perf = cvcuda_perf + + self.encoder = None + self.cvcuda_HWCtensor_batch = None + self.cvcuda_YUVtensor_batch = None + self.input_layout = "NCHW" + self.gpu_input = True + self.output_file_name = None + + self.logger.info("Using PyNvVideoCodec encoder version: %s" % nvvc.__version__) + # docs_tag: end_init_videobatchencoder_pyvideocodec + + # docs_tag: begin_call_videobatchencoder_pyvideocodec + def __call__(self, batch): + self.cvcuda_perf.push_range("encoder.pyVideoCodec") + + # Get the name of the original video file read by the decoder. We would use + # the same filename to save the output video. 
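Taken together, the decoder and encoder classes above are meant to be driven in a simple loop. Below is a minimal, illustrative pass-through sketch (no inference step in between); `cuda_ctx` and `cvcuda_perf` are assumed to be created the way the samples' `main.py` scripts do it, the file and directory names are placeholders, and the snippet assumes this module's imports (`torch`, `cvcuda`, etc.) are available:

```python
decoder = VideoBatchDecoder("in.mp4", batch_size=4, device_id=0,
                            cuda_ctx=cuda_ctx, cvcuda_perf=cvcuda_perf)
encoder = VideoBatchEncoder("./output", decoder.fps, device_id=0,
                            cuda_ctx=cuda_ctx, cvcuda_perf=cvcuda_perf)
decoder.start()
encoder.start()
while True:
    batch = decoder()  # returns None once the whole video has been decoded
    if batch is None:
        break
    # The decoder hands out an NHWC cvcuda.Tensor; the encoder expects an
    # NCHW torch.Tensor, so convert in between.
    nchw = cvcuda.reformat(batch.data, "NCHW")
    batch.data = torch.as_tensor(nchw.cuda(), device="cuda")
    encoder(batch)
decoder.join()
encoder.join()  # flushes the encoder and closes the output file
```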
+ file_name = os.path.splitext(os.path.basename(batch.fileinfo))[0] + self.output_file_name = os.path.join(self.output_path, "out_%s.mp4" % file_name) + + assert isinstance(batch.data, torch.Tensor) + + # docs_tag: begin_alloc_cvcuda_videobatchencoder_pyvideocodec + # Check if we need to allocate the encoder for its first use. + if self.encoder is None: + self.encoder = nvVideoEncoder( + self.device_id, + batch.data.shape[3], + batch.data.shape[2], + self.fps, + self.output_file_name, + self.cuda_ctx, + self.cuda_stream, + "NV12", + ) + # docs_tag: end_alloc_cvcuda_videobatchencoder_pyvideocodec + + # docs_tag: begin_convert_videobatchencoder_pyvideocodec + + # Create 2 CVCUDA tensors: reformat NCHW->NHWC and color conversion RGB->YUV + current_batch_size = batch.data.shape[0] + height, width = batch.data.shape[2], batch.data.shape[3] + + # Allocate only for the first time or for the last batch. + if ( + not self.cvcuda_HWCtensor_batch + or current_batch_size != self.cvcuda_HWCtensor_batch.shape[0] + ): + self.cvcuda_HWCtensor_batch = cvcuda.Tensor( + (current_batch_size, height, width, 3), + nvcv.Type.U8, + nvcv.TensorLayout.NHWC, + ) + self.cvcuda_YUVtensor_batch = cvcuda.Tensor( + (current_batch_size, (height // 2) * 3, width, 1), + nvcv.Type.U8, + nvcv.TensorLayout.NHWC, + ) + + # Convert RGB to NV12, in batch, before sending it over to pyVideoCodec. + # Convert to CVCUDA tensor + cvcuda_tensor = cvcuda.as_tensor(batch.data, nvcv.TensorLayout.NCHW) + + # Reformat NCHW to NHWC + cvcuda.reformat_into(self.cvcuda_HWCtensor_batch, cvcuda_tensor) + + # Color convert from RGB to YUV_NV12 + cvcuda.cvtcolor_into( + self.cvcuda_YUVtensor_batch, + self.cvcuda_HWCtensor_batch, + cvcuda.ColorConversion.RGB2YUV_NV12, + ) + + # Convert back to torch tensor we are NV12 + tensor = torch.as_tensor(self.cvcuda_YUVtensor_batch.cuda(), device="cuda") + # docs_tag: end_convert_videobatchencoder_pyvideocodec + + # docs_tag: begin_encode_videobatchencoder_pyvideocodec + # Encode frames from the batch one by one using pyVideoCodec. + for img_idx in range(tensor.shape[0]): + img = tensor[img_idx] + self.encoder.encode_from_tensor(img) + + self.cvcuda_perf.pop_range() + + def start(self): + pass + + def join(self): + self.encoder.flush() + self.logger.info("Wrote: %s" % self.output_file_name) + + +# docs_tag: end_init_videobatchencoder_pyvideocodec + +# docs_tag: begin_imp_nvvideoencoder +class nvVideoEncoder: + def __init__( + self, + device_id, + width, + height, + fps, + enc_file, + cuda_ctx, + cuda_stream, + format, + ): + """ + Create instance of HW-accelerated video encoder. + :param device_id: id of video card which will be used for encoding & processing. + :param width: encoded frame width. + :param height: encoded frame height. + :param fps: The FPS at which the encoding should happen. + :param enc_file: path to encoded video file. + :param cuda_ctx: A cuda context object + :param format: The format of the encoded video file. + (e.g. "NV12", "YUV444" see NvPyVideoEncoder docs for more info) + """ + self.device_id = device_id + self.fps = round(Fraction(fps), 6) + self.enc_file = enc_file + self.cuda_ctx = cuda_ctx + self.cuda_stream = cuda_stream + + self.pts_time = 0 + self.delta_t = 1 # Increment the packets' timestamp by this much. 
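One detail of the batch encoder above that is easy to miss: the single-channel YUV tensor it allocates has `(height // 2) * 3` rows because NV12 stores a full-resolution luma plane followed by a half-height plane of interleaved, 2x2-downscaled chroma. A tiny sketch of that arithmetic, using example dimensions only:

```python
h, w = 720, 1280                 # example RGB frame size
y_rows = h                       # Y (luma) plane, full resolution
uv_rows = h // 2                 # interleaved UV (chroma) plane, half height
assert (h // 2) * 3 == y_rows + uv_rows == 1080
# So an (N, 720, 1280, 3) RGB batch maps to an (N, 1080, 1280, 1) NV12 buffer.
```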
+ self.encoded_frame = np.ndarray(shape=(0), dtype=np.uint8) + self.container = av.open(enc_file, "w") + self.avstream = self.container.add_stream("h264", rate=self.fps) + + aligned_value = 0 + if width % 16 != 0: + aligned_value = 16 - (width % 16) + aligned_width = width + aligned_value + width = aligned_width + + self.avstream.width = width + self.avstream.height = height + + self.avstream.time_base = 1 / Fraction(self.fps) + self.surface = None + self.surf_plane = None + + self.tmpTensor = None + + self.nvEnc = nvvc.CreateEncoder( + self.avstream.width, + self.avstream.height, + format, + codec="h264", + preset="P4", + cudastream=cuda_stream.handle, + ) + + def width(self): + """ + Gets the actual video frame width from the encoder. + """ + return self.nvEnc.Width() + + def height(self): + """ + Gets the actual video frame height from the encoder. + """ + return self.nvEnc.Height() + + # docs_tag: begin_imp_nvvideoencoder + + def encode_from_tensor(self, tensor): + + # Create a CUDA array interface object wit 2 planes one for luma and CrCb for NV12 + objCAI = [] + # Need to compute the address of the Y plane and the interleaved chroma plane + data = ( + tensor.storage().data_ptr() + + tensor.storage_offset() * tensor.element_size() + ) + objCAI.append( + AppCAI( + (self.avstream.height, self.avstream.width, 1), + (self.avstream.width, 1, 1), + "|u1", + data, + ) + ) + chromaAlloc = int(data) + self.avstream.width * self.avstream.height + objCAI.append( + AppCAI( + (int(self.avstream.height / 2), int(self.avstream.width / 2), 2), + (self.avstream.width, 2, 1), + "|u1", + chromaAlloc, + ) + ) + # Encode the frame takes CUDA array interface object as input + self.encoded_frame = self.nvEnc.Encode(objCAI) + self.write_frame( + self.encoded_frame, + self.pts_time, + self.fps, + self.avstream, + self.container, + ) + self.pts_time += self.delta_t + + # docs_tag: end_imp_nvvideoencoder + + # docs_tag: begin_writeframe_nvvideoencoder + def write_frame(self, encoded_frame, pts_time, fps, stream, container): + encoded_bytes = bytearray(encoded_frame) + pkt = av.packet.Packet(encoded_bytes) + pkt.pts = pts_time + pkt.dts = pts_time + pkt.stream = stream + pkt.time_base = 1 / Fraction(fps) + container.mux(pkt) + + # docs_tag: end_writeframe_nvvideoencoder + + def flush(self): + encoded_bytes = self.nvEnc.EndEncode() + if encoded_bytes: + self.write_frame( + encoded_bytes, + self.pts_time, + self.fps, + self.avstream, + self.container, + ) + self.pts_time += self.delta_t + self.container.close() + + +# docs_tag: end_imp_nvvideoencoder + +# docs_tag: begin_imagebatchdecoder_nvimagecodec +class ImageBatchDecoder: + def __init__( + self, + input_path, + batch_size, + device_id, + cuda_ctx, + cvcuda_perf, + ): + + # docs_tag: begin_init_imagebatchdecoder_nvimagecodec + self.logger = logging.getLogger(__name__) + self.batch_size = batch_size + self.input_path = input_path + self.device_id = device_id + self.total_decoded = 0 + self.batch_idx = 0 + self.cuda_ctx = cuda_ctx + self.cuda_stream = cvcuda.Stream().current + self.cvcuda_perf = cvcuda_perf + self.decoder = nvimgcodec.Decoder(device_id=device_id) + + # docs_tag: begin_parse_imagebatchdecoder_nvimagecodec + if os.path.isfile(self.input_path): + if os.path.splitext(self.input_path)[1] == ".jpg": + # Read the input image file. + self.file_names = [self.input_path] * self.batch_size + # We will use the nvImageCodec based decoder on the GPU in case of images. 
+ # This will be allocated once during the first run or whenever a batch + # size change happens. + else: + raise ValueError("Unable to read file %s as image." % self.input_path) + + elif os.path.isdir(self.input_path): + # It is a directory. Grab file names of all JPG images. + self.file_names = glob.glob(os.path.join(self.input_path, "*.jpg")) + self.logger.info("Found a total of %d JPEG images." % len(self.file_names)) + + else: + raise ValueError( + "Unknown expression given as input_path: %s." % self.input_path + ) + + # docs_tag: end_parse_imagebatchdecoder_nvimagecodec + + # docs_tag: begin_batch_imagebatchdecoder_nvimagecodec + self.file_name_batches = [ + self.file_names[i : i + self.batch_size] # noqa: E203 + for i in range(0, len(self.file_names), self.batch_size) + ] + # docs_tag: end_batch_imagebatchdecoder_nvimagecodec + + self.max_image_size = 1024 * 1024 * 3 # Maximum possible image size. + + self.logger.info( + "Using nvImageCodec decoder version: %s" % nvimgcodec.__version__ + ) + + # docs_tag: end_init_imagebatchdecoder_nvimagecodec + + def __call__(self): + if self.total_decoded == len(self.file_names): + return None + + # docs_tag: begin_call_imagebatchdecoder_nvimagecodec + self.cvcuda_perf.push_range("decoder.nvimagecodec") + + file_name_batch = self.file_name_batches[self.batch_idx] + + data_batch = [open(path, "rb").read() for path in file_name_batch] + + # docs_tag: begin_decode_imagebatchdecoder_nvimagecodec + + tensor_list = [] + image_list = self.decoder.decode(data_batch, cuda_stream=self.cuda_stream) + + # Convert the decoded images to nvcv tensors in a list. + for i in range(len(image_list)): + tensor_list.append(cvcuda.as_tensor(image_list[i], "HWC")) + + # Stack the list of tensors to a single NHWC tensor. + cvcuda_decoded_tensor = cvcuda.stack(tensor_list) + self.total_decoded += len(tensor_list) + # docs_tag: end_decode_imagebatchdecoder_nvimagecodec + + # docs_tag: begin_return_imagebatchdecoder_nvimagecodec + batch = Batch( + batch_idx=self.batch_idx, + data=cvcuda_decoded_tensor, + fileinfo=file_name_batch, + ) + self.batch_idx += 1 + + # docs_tag: end_return_imagebatchdecoder_nvimagecodec + + self.cvcuda_perf.pop_range() + # docs_tag: end_call_imagebatchdecoder_nvimagecodec + return batch + + def start(self): + pass + + def join(self): + pass + + +# docs_tag: end_imagebatchdecoder_nvimagecodec + +# docs_tag: begin_imagebatchencoder_nvimagecodec +class ImageBatchEncoder: + def __init__( + self, + output_path, + device_id, + cvcuda_perf, + ): + # docs_tag: begin_init_imagebatchencoder_nvimagecodec + self.logger = logging.getLogger(__name__) + self.encoder = nvimgcodec.Encoder(device_id=device_id) + self.input_layout = "NHWC" + self.gpu_input = True + self.output_path = output_path + self.device_id = device_id + self.cvcuda_perf = cvcuda_perf + + self.logger.info( + "Using nvImageCodec encoder version: %s" % nvimgcodec.__version__ + ) + # docs_tag: end_init_init_imagebatchencoder_nvimagecodec + + # docs_tag: begin_call_imagebatchencoder_nvimagecodec + def __call__(self, batch): + self.cvcuda_perf.push_range("encoder.nvimagecodec") + + assert isinstance(batch.data, torch.Tensor) + + image_tensors_nchw = batch.data + # Create an empty list to store filenames + filenames = [] + chwtensor_list = [] + # Iterate through each image to prepare the filenames + for img_idx in range(image_tensors_nchw.shape[0]): + img_name = os.path.splitext(os.path.basename(batch.fileinfo[img_idx]))[0] + results_path = os.path.join(self.output_path, f"out_{img_name}.jpg") + 
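Stepping back to the image decoder defined above: the batching of file names relies on a simple slicing idiom that naturally produces a smaller final batch when the file count is not a multiple of the batch size. An illustration with made-up file names:

```python
file_names = ["a.jpg", "b.jpg", "c.jpg", "d.jpg", "e.jpg"]
batch_size = 2
file_name_batches = [
    file_names[i : i + batch_size] for i in range(0, len(file_names), batch_size)
]
# -> [['a.jpg', 'b.jpg'], ['c.jpg', 'd.jpg'], ['e.jpg']]
```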
self.logger.info(f"Preparing to save the image to: {results_path}") + # Add the filename to the list + filenames.append(results_path) + # Add the image tensor CAI to a CAI list from an NCHW tensor + # (this was a stacked tensor if N images) + chwtensor_list.append(image_tensors_nchw[img_idx].cuda()) + + # Pass the image tensors and filenames to the encoder. + self.encoder.write(filenames, chwtensor_list) + self.cvcuda_perf.pop_range() + # docs_tag: end_call_imagebatchencoder_nvimagecodec + + def start(self): + pass + + def join(self): + pass + + +# docs_tag: end_imagebatchencoder_nvimagecodec diff --git a/samples/common/python/perf_utils.py b/samples/common/python/perf_utils.py index 1563afd91..7c32a2bdc 100644 --- a/samples/common/python/perf_utils.py +++ b/samples/common/python/perf_utils.py @@ -21,12 +21,22 @@ import sys import json import logging +from datetime import datetime import argparse import subprocess from collections import deque import cvcuda import torch import nvtx +import pandas + + +logger = logging.getLogger(__name__) +logging.basicConfig( + format="[%(name)s:%(lineno)d] %(asctime)s %(levelname)-6s %(message)s", + level=logging.INFO, + datefmt="%Y-%m-%d %H:%M:%S", +) class CvCudaPerf: @@ -76,6 +86,7 @@ def __init__( self.timing_info = {} self.batch_info = {} self.inside_batch_info = [] + self.deleted_range_info = [] self.is_inside_batch = 0 self.total_batches_processed = {} # Check if the benchmark.py script was used to run this. We do so @@ -116,24 +127,37 @@ def push_range( self.stack.append((message, batch_idx)) self.stack_path = os.path.join(self.stack_path, message) - def pop_range(self, domain=None, total_items=None): + def pop_range(self, domain=None, total_items=None, delete_range=False): """ Pops a code range off of the stack for performance benchmarking. :param domain: Name of a domain under which the code range is scoped. :param total_items: The number of items processed in this range. + :param delete_range: Flag specifying whether the range should be completely deleted + instead of just popping it out. This will remove all traces of this range from + the benchmarks. Useful if the code being benchmarked fails and one wants to + remove its range in that case. """ if self.should_benchmark: # Grab the message and optional batch index from the stack. message, batch_idx = self.stack.pop() - self.timing_info[self.stack_path] = ( - 0, - 0, - ) # Placeholders for CPU and GPU times respectively. - # Actual timing information will be recorded and pulled from NSYS by a - # script like benchmark.py. - - if self.is_inside_batch > 0: + if not delete_range: + # Add only if this range was not meant for deletion. + self.timing_info[self.stack_path] = ( + 0, + 0, + ) # Placeholders for CPU and GPU times respectively. + # Actual timing information will be recorded and pulled from NSYS by a + # script like benchmark.py. + else: + # This range was meant for deletion. We did not add it to the timing_info + # but all the previously added children of this range must also be deleted. + # We will do that later in the finalize to avoid costing us time here. + # For that, we will save this stack path so that we can remove all the + # orphan nodes later. + self.deleted_range_info.append(self.stack_path) + + if self.is_inside_batch > 0 and not delete_range: self.inside_batch_info.append(self.stack_path) # Record the batch information if it was present. @@ -145,15 +169,19 @@ def pop_range(self, domain=None, total_items=None): "push a batch first by using the batch_idx in the push_range()." 
) - self.batch_info[self.stack_path] = (batch_idx, total_items) - self.is_inside_batch -= 1 + self.is_inside_batch -= 1 # Decrement this by one. - if total_items > 0: - batch_level_prefix = os.path.dirname(self.stack_path) + if not delete_range: + # Add to batch info only if this range was not meant for deletion. + self.batch_info[self.stack_path] = (batch_idx, total_items) - if batch_level_prefix not in self.total_batches_processed: - self.total_batches_processed[batch_level_prefix] = 0 - self.total_batches_processed[batch_level_prefix] += 1 + # Maintain a count of the number of items processed in various batches. + if total_items > 0: + batch_level_prefix = os.path.dirname(self.stack_path) + + if batch_level_prefix not in self.total_batches_processed: + self.total_batches_processed[batch_level_prefix] = 0 + self.total_batches_processed[batch_level_prefix] += 1 # Unwind the stack to point to the previous path(i.e. directory like expression) # e.g. one level above. @@ -174,6 +202,15 @@ def finalize(self): " item(s) still not popped." % len(self.stack) ) + # Remove the keys from the timing_info which starts with any key in the + # deleted_range_info. That makes sure that we not only delete the current + # key but also all of its previous children which were added but not deleted. + timing_info_keys = list(self.timing_info.keys()) + for key_delete in self.deleted_range_info: + for k in timing_info_keys: + if k.startswith(key_delete): + self.timing_info.pop(k, None) + # Build a dictionary containing the timing information and some metadata # about this run. # The overall structure of this would be: @@ -711,3 +748,210 @@ def parse_validate_default_args(parser): raise ValueError("target_img_width must be a value >=10.") return args + + +def summarize_runs( + baseline_run_root, + baseline_run_name="baseline", + compare_run_roots=[], + compare_run_names=[], +): + """ + Summarizes one or more benchmark runs and prepares a pandas table showing the run per sample run-time + and speed-up numbers. + :param baseline_run_root: Folder containing one sub-folder per sample in which the benchmark.py + styled JSON of the baseline run is stored. + :param baseline_run_name: The display name of the column representing the first run in the table. + :param compare_run_roots: Optional. A list of folder containing one sub-folder per sample in which the + benchmark.py styled JSON of the other runs are stored. These runs are compared with the baseline run. + :param compare_run_names: A list of display names of the column representing the comparison runs + in the table. This must be of the same length as the `compare_run_json_paths`. + :returns: A pandas table with the sample's name and its run time from the baseline run. + If compare runs are given, it also returns their run times and the speed-up + compared to the baseline run. The speedup is simply the run time of the sample from the compare run + divided by its run time from the baseline run. If an sample's run time or speedup factor is not + available, it simply puts "N/A". 
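Returning to the `delete_range` flag introduced on `pop_range` above: a hedged usage sketch of how a caller might drop the range of a failed operation from the benchmark output (`cvcuda_perf` and `run_operator` are placeholders, not names from this patch):

```python
cvcuda_perf.push_range("op.experimental")
try:
    run_operator()  # placeholder for the code being benchmarked
except Exception:
    # Remove this range (and any child ranges already recorded under it)
    # from the benchmark results, then re-raise.
    cvcuda_perf.pop_range(delete_range=True)
    raise
else:
    cvcuda_perf.pop_range()
```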
+ """ + + def _parse_json_for_time(json_data): + mean_all_batches = json_data["mean_all_batches"] + sample_name_key = list(mean_all_batches.keys())[0] + + cpu_time_minus_warmup_per_item = mean_all_batches[sample_name_key][ + "run_sample" + ]["pipeline"]["cpu_time_minus_warmup_per_item"] + + return cpu_time_minus_warmup_per_item + + baseline_perf = {} + if os.path.isdir(baseline_run_root): + for path in os.listdir(baseline_run_root): + if os.path.isdir(os.path.join(baseline_run_root, path)): + json_path = os.path.join(baseline_run_root, path, "benchmark_mean.json") + if os.path.isfile(json_path): + with open(json_path, "r") as f: + json_data = json.loads( + f.read() + ) # Storing by the name of the sample + + baseline_perf[path] = _parse_json_for_time(json_data) + else: + raise ValueError("baseline_run_root does not exist: %s" % baseline_run_root) + + if len(compare_run_roots) != len(compare_run_names): + raise ValueError( + "Length mismatch between the number of given paths for comparison and" + "their run names. %d v/s %d. Each path must have its corresponding run name." + % (len(compare_run_roots), len(compare_run_names)) + ) + + # Read all the comparison related JSON files, one by one, if any. + compare_perfs = {} + for compare_run_root, compare_run_name in zip(compare_run_roots, compare_run_names): + if os.path.isdir(compare_run_root): + compare_perfs[compare_run_name] = {} + + for path in os.listdir(compare_run_root): + if os.path.isdir(os.path.join(compare_run_root, path)): + compare_perfs[compare_run_name][path] = {} + + json_path = os.path.join( + compare_run_root, path, "benchmark_mean.json" + ) + if os.path.isfile(json_path): + with open(json_path, "r") as f: + json_data = json.loads( + f.read() + ) # Storing by the name of the sample + + compare_perfs[compare_run_name][ + path + ] = _parse_json_for_time(json_data) + else: + raise ValueError("compare_run_root does not exist: %s" % compare_run_root) + + results = [] + + for sample_name in baseline_perf: + row_dict = {} + + # Fetch the time and parameters from the JSON for baseline run. + baseline_run_time = baseline_perf[sample_name] + + row_dict["sample name"] = sample_name + row_dict["%s time (ms)" % baseline_run_name] = baseline_run_time + + if compare_perfs: + # Fetch the time from the JSON for all comparison runs. + for compare_run_name in compare_perfs: + # Check if the sample was present. + if sample_name in compare_perfs[compare_run_name]: + compare_run_time = compare_perfs[compare_run_name][sample_name] + else: + compare_run_time = None + + row_dict["%s time (ms)" % compare_run_name] = ( + compare_run_time if compare_run_time else "N/A" + ) + + if baseline_run_time and compare_run_time: + speedup = round(compare_run_time / baseline_run_time, 3) + else: + speedup = "N/A" + row_dict[ + "%s v/s %s speed-up" % (compare_run_name, baseline_run_name) + ] = speedup + + results.append(row_dict) + + df = pandas.DataFrame.from_dict(results) + + return df + + +def main(): + """ + The main function. This will run the comparison function to compare two benchmarking runs. 
+ """ + parser = argparse.ArgumentParser("Summarize and compare benchmarking runs.") + + parser.add_argument( + "-o", + "--output-dir", + type=str, + required=True, + help="The output directory where you want to store the result summary as a CSV file.", + ) + + parser.add_argument( + "-b", + "--baseline-root", + type=str, + required=True, + help="Root folder containing one sub-folder per sample in which benchmark.py styled JSONs" + " of the baseline runs of those samples are stored.", + ) + parser.add_argument( + "-bn", + "--baseline-name", + type=str, + required=True, + help="The name of the column representing the baseline run in the output table.", + ) + parser.add_argument( + "-c", + "--compare-roots", + action="append", + required=False, + help="Optional. List of folders containing one sub-folder per sample in which benchmark.py" + " styled JSONs of the comparison runs of those samples are stored.", + ) + parser.add_argument( + "-cn", + "--compare-names", + action="append", + required=False, + help="Optional. List of names of the column representing the comparison runs in the " + "output table", + ) + + args = parser.parse_args() + + if not os.path.isdir(args.output_dir): + raise ValueError("output-dir does not exist: %s" % args.output_dir) + + if not os.path.isdir(args.baseline_root): + raise ValueError("baseline-root does not exist: %s" % args.baseline_json) + + if len(args.compare_roots) != len(args.compare_names): + raise ValueError( + "Length mismatch between the number of given paths for comparison and" + "their run names. %d v/s %d. Each path must have its corresponding run name." + % (len(args.compare_roots), len(args.compare_names)) + ) + + logger.info( + "Summarizing a total of %d runs. All times are in milliseconds" + % (len(args.compare_roots) + 1) + ) + + df = summarize_runs( + baseline_run_root=args.baseline_root, + baseline_run_name=args.baseline_name, + compare_run_roots=args.compare_roots, + compare_run_names=args.compare_names, + ) + + csv_path = os.path.join( + args.output_dir, + "summarize_runs.%s.csv" % datetime.now(), + ) + df.to_csv(csv_path) + + logger.info("Wrote comparison CSV to: %s" % csv_path) + + +if __name__ == "__main__": + # If this was called on its own, we will run the summarize_runs function to summarize and + # compare two runs. + main() diff --git a/samples/common/python/torch_utils.py b/samples/common/python/torch_utils.py deleted file mode 100644 index efc3fa801..000000000 --- a/samples/common/python/torch_utils.py +++ /dev/null @@ -1,187 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import sys -import glob -import logging -import torch -import torchnvjpeg -import torchvision.transforms.functional as F - -from pathlib import Path - -# Bring module folders from the samples directory into our path so that -# we can import modules from it. 
-samples_dir = Path(os.path.abspath(__file__)).parents[2] # samples/ -sys.path.insert(0, os.path.join(samples_dir, "")) - -from common.python.batch import Batch # noqa: E402 - -# docs_tag: begin_init_imagebatchdecoder_pytorch - - -class ImageBatchDecoderPyTorch: - def __init__( - self, - input_path, - batch_size, - device_id, - cuda_ctx, - cvcuda_perf, - ): - self.logger = logging.getLogger(__name__) - self.batch_size = batch_size - self.input_path = input_path - self.device_id = device_id - self.total_decoded = 0 - self.batch_idx = 0 - self.cuda_ctx = cuda_ctx - self.cvcuda_perf = cvcuda_perf - - if os.path.isfile(self.input_path): - if os.path.splitext(self.input_path)[1] == ".jpg": - # Read the input image file. - self.file_names = [self.input_path] * self.batch_size - # We will use the torchnvjpeg based decoder on the GPU in case of images. - # This will be allocated once during the first run or whenever a batch - # size change happens. - self.decoder = None - else: - raise ValueError("Unable to read file %s as image." % self.input_path) - - elif os.path.isdir(self.input_path): - # It is a directory. Grab file names of all JPG images. - self.decoder = None - self.file_names = glob.glob(os.path.join(self.input_path, "*.jpg")) - self.logger.info("Found a total of %d JPEG images." % len(self.file_names)) - - else: - raise ValueError( - "Unknown expression given as input_path: %s." % self.input_path - ) - - # docs_tag: end_parse_imagebatchdecoder_pytorch - - # docs_tag: begin_batch_imagebatchdecoder_pytorch - self.file_name_batches = [ - self.file_names[i : i + self.batch_size] # noqa: E203 - for i in range(0, len(self.file_names), self.batch_size) - ] - - self.max_image_size = 1024 * 1024 * 3 # Maximum possible image size. - - self.logger.info("Using torchnvjpeg as decoder.") - - # docs_tag: end_init_imagebatchdecoder_pytorch - - def __call__(self): - if self.total_decoded == len(self.file_names): - return None - - # docs_tag: begin_call_imagebatchdecoder_pytorch - self.cvcuda_perf.push_range("decoder.torch") - - file_name_batch = self.file_name_batches[self.batch_idx] - effective_batch_size = len(file_name_batch) - data_batch = [open(path, "rb").read() for path in file_name_batch] - - # docs_tag: end_read_imagebatchdecoder_pytorch - - # docs_tag: begin_decode_imagebatchdecoder_pytorch - if not self.decoder or effective_batch_size != self.batch_size: - decoder = torchnvjpeg.Decoder( - device_padding=0, - host_padding=0, - gpu_huffman=True, - device_id=self.device_id, - bath_size=effective_batch_size, - max_cpu_threads=8, # this is max_cpu_threads parameter. Not used internally. - max_image_size=self.max_image_size, - stream=None, - ) - - image_tensor_list = decoder.batch_decode(data_batch) - - # Convert the list of tensors to a tensor itself. 
- image_tensors_nhwc = torch.stack(image_tensor_list) - - self.total_decoded += len(image_tensor_list) - # docs_tag: end_decode_imagebatchdecoder_pytorch - - # docs_tag: begin_return_imagebatchdecoder_pytorch - batch = Batch( - batch_idx=self.batch_idx, data=image_tensors_nhwc, fileinfo=file_name_batch - ) - self.batch_idx += 1 - - self.cvcuda_perf.pop_range() - - return batch - # docs_tag: end_return_imagebatchdecoder_pytorch - - def start(self): - pass - - def join(self): - pass - - -# docs_tag: begin_init_imagebatchencoder_pytorch -class ImageBatchEncoderPyTorch: - def __init__( - self, - output_path, - fps, - device_id, - cuda_ctx, - cvcuda_perf, - ): - self.logger = logging.getLogger(__name__) - self._encoder = None - self.input_layout = "NCHW" - self.gpu_input = True - self.output_path = output_path - self.device_id = device_id - self.cvcuda_perf = cvcuda_perf - - self.logger.info("Using PyTorch/PIL as encoder.") - # docs_tag: end_init_imagebatchencoder_pytorch - - # docs_tag: begin_call_imagebatchencoder_pytorch - def __call__(self, batch): - self.cvcuda_perf.push_range("encoder.torch") - - image_tensors_nchw = batch.data - - # Bring the image_tensors_nchw to CPU and convert it to a PIL - # image and save those. - for img_idx in range(image_tensors_nchw.shape[0]): - img_name = os.path.splitext(os.path.basename(batch.fileinfo[img_idx]))[0] - results_path = os.path.join(self.output_path, "out_%s.jpg" % img_name) - self.logger.info("Saving the overlay result to: %s" % results_path) - overlay_cpu = image_tensors_nchw[img_idx].detach().cpu() - overlay_pil = F.to_pil_image(overlay_cpu) - overlay_pil.save(results_path) - - self.cvcuda_perf.pop_range() - - # docs_tag: end_call_imagebatchencoder_pytorch - - def start(self): - pass - - def join(self): - pass diff --git a/samples/common/python/vpf_utils.py b/samples/common/python/vpf_utils.py index da626887f..d688d7944 100644 --- a/samples/common/python/vpf_utils.py +++ b/samples/common/python/vpf_utils.py @@ -39,525 +39,6 @@ samples_dir = Path(os.path.abspath(__file__)).parents[2] # samples/ sys.path.insert(0, os.path.join(samples_dir, "")) -from common.python.batch import Batch # noqa: E402 - -# docs_tag: begin_init_videobatchdecoder_vpf - - -class VideoBatchDecoderVPF: - def __init__( - self, - input_path, - batch_size, - device_id, - cuda_ctx, - cvcuda_perf, - ): - self.logger = logging.getLogger(__name__) - self.input_path = input_path - self.batch_size = batch_size - self.device_id = device_id - self.cuda_ctx = cuda_ctx - self.cvcuda_perf = cvcuda_perf - - # Demuxer is instantiated only to collect required information about - # certain video file properties. - nvDemux = nvc.PyFFmpegDemuxer(self.input_path) - self.fps = nvDemux.Framerate() - self.total_frames = nvDemux.Numframes() - self.total_decoded = 0 - self.batch_idx = 0 - - # We use VPF to do video decoding. This instance will be allocated when the first - # batch comes in. - self.decoder = None - - # We would use VPF for video encoding/decoding, and CVCUDA to do color conversions - # to and from RGB to NV12 format. These formats are required by VPF to encode/decode - # video streams. Since CVCUDA can do these conversions much faster on a batch level - # and since VPF does not work on batches, we would perform these conversions here - # in this class using CVCUDA. We would pre-allocate the memory required by these - # conversions upon the first use or whenever the batch size changes. 
This would allow - # us to use the 'into' versions of CVCUDA operators without allocating/de-allocating - # memory on every batch. We need to be mindful of the following things when dealing - # with NV12 format in CVCUDA: - # NV12 is a complex format and it is not tensor friendly so libraries use a workaround - # to put the NV12 in a "matrix" form. They put the YUV from NV12 as 3/2 height - # 1 height is Y luma that is full resolution - # 1/2 height is UV chroma that is 2x2 down-scaled - # Hence you would see YUV's H dimension 1.5 times the RGB's H dimension. - self.cvcuda_RGBtensor_batch = None - - self.logger.info("Using VPF as decoder.") - # docs_tag: end_init_videobatchdecoder_vpf - - # docs_tag: begin_call_videobatchdecoder_vpf - def __call__(self): - # Check if we have reached the end of the stream. If so, simply return None. - if self.total_decoded == self.total_frames: - return None - - self.cvcuda_perf.push_range("decoder.vpf") - - # Check if we need to allocate the decoder for its first use. - if self.decoder is None: - self.decoder = nvdecoder( - self.input_path, - self.device_id, - self.cuda_ctx, - ) - - # docs_tag: end_alloc_videobatchdecoder_vpf - - # docs_tag: begin_decode_videobatchdecoder_vpf - # If we are in the last batch size, the total frames left to decode may be - # less than equal to the batch size. - if self.total_decoded + self.batch_size > self.total_frames: - actual_batch_size = self.total_frames - self.total_decoded - else: - actual_batch_size = self.batch_size - - # Decode each frame one by one and put them in a list. - frame_list = [self.decoder.decode_to_tensor() for x in range(actual_batch_size)] - - # Convert 3D list to 4D torch tensor. - image_tensor_nhwc = torch.stack(frame_list) - # docs_tag: end_decode_videobatchdecoder_vpf - - # docs_tag: begin_convert_videobatchdecoder_vpf - # Create a CVCUDA tensor for color conversion YUV->RGB - # Allocate only for the first time or for the last batch. - if not self.cvcuda_RGBtensor_batch or actual_batch_size != self.batch_size: - self.cvcuda_RGBtensor_batch = cvcuda.Tensor( - (actual_batch_size, self.decoder.h, self.decoder.w, 3), - nvcv.Type.U8, - nvcv.TensorLayout.NHWC, - ) - - # Add the batch dim at the end to make it W,H,1 from W,H - image_tensor_nhwc = torch.unsqueeze(image_tensor_nhwc, -1) - # Make it a CVCUDA Tensor, C will be 1. - cvcuda_YUVtensor = cvcuda.as_tensor(image_tensor_nhwc, nvcv.TensorLayout.NHWC) - # Convert from YUV to RGB. This will be NHWC. - cvcuda.cvtcolor_into( - self.cvcuda_RGBtensor_batch, cvcuda_YUVtensor, self.decoder.cvcuda_code - ) - self.total_decoded += len(frame_list) - # docs_tag: end_convert_videobatchdecoder_vpf - - # docs_tag: begin_batch_videobatchdecoder_vpf - # Create a batch instance and set its properties. - batch = Batch( - batch_idx=self.batch_idx, - data=self.cvcuda_RGBtensor_batch, - fileinfo=self.input_path, - ) - self.batch_idx += 1 - - self.cvcuda_perf.pop_range() - - return batch - # docs_tag: end_batch_videobatchdecoder_vpf - - def start(self): - pass - - def join(self): - pass - - -# docs_tag: begin_init_videobatchencoder_vpf -class VideoBatchEncoderVPF: - def __init__( - self, - output_path, - fps, - device_id, - cuda_ctx, - cvcuda_perf, - ): - self.logger = logging.getLogger(__name__) - self.output_path = output_path - self.fps = fps - self.device_id = device_id - self.cuda_ctx = cuda_ctx - self.cvcuda_perf = cvcuda_perf - - # We use VPF to do video encoding. This instance will be allocated when the first - # batch comes in. 
- self.encoder = None - - # We would use VPF for video encoding/decoding, and CVCUDA to do color conversions - # to and from RGB to NV12 format. These formats are required by VPF to encode/decode - # video streams. Since CVCUDA can do these conversions much faster on a batch level - # and since VPF does not work on batches, we would perform these conversions here - # in this class using CVCUDA. We would pre-allocate the memory required by these - # conversions upon the first use or whenever the batch size changes. This would allow - # us to use the 'into' versions of CVCUDA operators without allocating/deallocating - # memory on every batch. We need to be mindful of the following things when dealing - # with NV12 format in CVCUDA: - # NV12 is a complex format and it is not tensor friendly so libraries use a workaround - # to put the NV12 in a "matrix" form. They put the YUV from NV12 as 3/2 height - # 1 height is Y luma that is full resolution - # 1/2 height is UV chroma that is 2x2 down-scaled - # Hence you would see YUV's H dimension 1.5 times the RGB's H dimension. - self.cvcuda_HWCtensor_batch = None - self.cvcuda_YUVtensor_batch = None - self.input_layout = "NCHW" - self.gpu_input = True - self.output_file_name = None - - self.logger.info("Using VPF as encoder.") - # docs_tag: end_init_videobatchencoder_vpf - - # docs_tag: begin_call_videobatchencoder_vpf - def __call__(self, batch): - self.cvcuda_perf.push_range("encoder.vpf") - - # Get the name of the original video file read by the decoder. We would use - # the same filename to save the output video. - file_name = os.path.splitext(os.path.basename(batch.fileinfo))[0] - self.output_file_name = os.path.join(self.output_path, "out_%s.mp4" % file_name) - - # Check if we need to allocate the encoder for its first use. - if self.encoder is None: - self.encoder = nvencoder( - self.device_id, - batch.data.shape[3], - batch.data.shape[2], - self.fps, - self.output_file_name, - self.cuda_ctx, - ) - - # docs_tag: end_alloc_videobatchdecoder_vpf - - # docs_tag: begin_alloc_cvcuda_videobatchdecoder_vpf - # Create 2 CVCUDA tensors: reformat NCHW->NHWC and color conversion RGB->YUV - current_batch_size = batch.data.shape[0] - height, width = batch.data.shape[2], batch.data.shape[3] - # Allocate only for the first time or for the last batch. - if ( - not self.cvcuda_HWCtensor_batch - or current_batch_size != self.cvcuda_HWCtensor_batch.shape[0] - ): - self.cvcuda_HWCtensor_batch = cvcuda.Tensor( - (current_batch_size, height, width, 3), - nvcv.Type.U8, - nvcv.TensorLayout.NHWC, - ) - self.cvcuda_YUVtensor_batch = cvcuda.Tensor( - (current_batch_size, (height // 2) * 3, width, 1), - nvcv.Type.U8, - nvcv.TensorLayout.NHWC, - ) - # docs_tag: end_alloc_cvcuda_videobatchdecoder_vpf - - # docs_tag: begin_convert_videobatchencoder_vpf - # Convert RGB to NV12, in batch, before sending it over to VPF. - # Convert to CVCUDA tensor - cvcuda_tensor = cvcuda.as_tensor(batch.data, nvcv.TensorLayout.NCHW) - # Reformat - cvcuda.reformat_into(self.cvcuda_HWCtensor_batch, cvcuda_tensor) - # Color convert from RGB to YUV_NV12 - cvcuda.cvtcolor_into( - self.cvcuda_YUVtensor_batch, - self.cvcuda_HWCtensor_batch, - cvcuda.ColorConversion.RGB2YUV_NV12, - ) - - # Convert back to torch tensor - tensor = torch.as_tensor(self.cvcuda_YUVtensor_batch.cuda(), device="cuda") - - # docs_tag: end_convert_videobatchencoder_vpf - - # docs_tag: begin_encode_videobatchencoder_vpf - # Encode frames from the batch one by one using VPF. 
- for img_idx in range(tensor.shape[0]): - img = tensor[img_idx] - self.encoder.encode_from_tensor(img) - - self.cvcuda_perf.pop_range() - - # docs_tag: end_encode_videobatchencoder_vpf - - def start(self): - pass - - def join(self): - self.encoder.flush() - self.logger.info("Wrote: %s" % self.output_file_name) - - -class nvdecoder: - def __init__( - self, - enc_file, - device_id, - cuda_ctx, - ): - """ - Create instance of HW-accelerated video decoder. - :param enc_file: Full path to the MP4 file that needs to be decoded. - :param device_id: id of video card which will be used for decoding & processing. - :param cuda_ctx: A cuda context object. - """ - self.device_id = device_id - self.cuda_ctx = cuda_ctx - # Demuxer is instantiated only to collect required information about - # certain video file properties. - nvDemux = nvc.PyFFmpegDemuxer(enc_file) - self.w, self.h = nvDemux.Width(), nvDemux.Height() - self.fps = nvDemux.Framerate() - self.total_frames = nvDemux.Numframes() - - # In case sample aspect ratio isn't 1:1 we will re-scale the decoded - # frame to maintain uniform 1:1 ratio across the pipeline. - sar = 8.0 / 9.0 - self.fixed_h = self.h - self.fixed_w = int(self.w * sar) - - self.pix_fmt = nvDemux.Format() - is_yuv420 = ( - nvc.PixelFormat.YUV420 == self.pix_fmt - or nvc.PixelFormat.NV12 == self.pix_fmt - ) - is_yuv444 = nvc.PixelFormat.YUV444 == self.pix_fmt - - # Set CVCUDA color conversion code to do YUV->RGB - self.cvcuda_code = None - if is_yuv420: - self.cvcuda_code = cvcuda.ColorConversion.YUV2RGB_NV12 - elif is_yuv444: - self.cvcuda_code = cvcuda.ColorConversion.YUV2RGB - - codec = nvDemux.Codec() - is_hevc = nvc.CudaVideoCodec.HEVC == codec - - # YUV420 or YUV444 sampling formats are supported by Nvdec - self.is_hw_dec = is_yuv420 or is_yuv444 - - # But YUV444 HW decode is supported for HEVC only - if self.is_hw_dec and is_yuv444 and not is_hevc: - self.is_hw_dec = False - - if self.is_hw_dec: - # Nvdec supports NV12 (resampled YUV420) and YUV444 formats - if self.cuda_ctx: - self.nvDec = nvc.PyNvDecoder( - input=enc_file, - context=self.cuda_ctx.handle, - stream=cvcuda.Stream.current.handle, - ) - else: - self.nvDec = nvc.PyNvDecoder( - input=enc_file, - gpu_id=self.device_id, - ) - else: - raise ValueError( - "Current combination of hardware and the video file being read does not " - "hardware accelerated decoding." - ) - - # docs_tag: begin_imp_nvdecoder - def decode_hw(self, seek_ctx=None): - """ - Decode single video frame with Nvdec, convert it to planar RGB. - """ - # Decode with HW decoder - if seek_ctx is None: - dec_surface = self.nvDec.DecodeSingleSurface() - else: - dec_surface = self.nvDec.DecodeSingleSurface(seek_ctx) - if not dec_surface or dec_surface.Empty(): - raise RuntimeError("Can not decode frame.") - - return dec_surface - - def decode_to_tensor(self, *args, **kwargs): - """ - Decode single video frame, convert it to torch.cuda.FloatTensor. - Image will be planar RGB normalized to range [0.0; 1.0]. - """ - if self.is_hw_dec: - dec_surface = self.decode_hw(*args, **kwargs) - else: - raise ValueError( - "Current combination of hardware and the video file being read does not " - "hardware accelerated decoding." 
- ) - - if not dec_surface or dec_surface.Empty(): - raise RuntimeError("Can not decode surface.") - - surf_plane = dec_surface.PlanePtr() - - img_tensor = pnvc.makefromDevicePtrUint8( - surf_plane.GpuMem(), - surf_plane.Width(), - surf_plane.Height(), - surf_plane.Pitch(), - surf_plane.ElemSize(), - ) - if img_tensor is None: - raise RuntimeError("Can not export to tensor.") - - return img_tensor - - # docs_tag: end_imp_nvdecoder - - -class nvencoder: - def __init__( - self, - device_id, - width, - height, - fps, - enc_file, - cuda_ctx, - ): - """ - Create instance of HW-accelerated video encoder. - :param device_id: id of video card which will be used for encoding & processing. - :param width: encoded frame width. - :param height: encoded frame height. - :param fps: The FPS at which the encoding should happen. - :param enc_file: path to encoded video file. - :param cuda_ctx: A cuda context object - """ - self.device_id = device_id - self.fps = round(Fraction(fps), 6) - self.enc_file = enc_file - self.cuda_ctx = cuda_ctx - - opts = { - "preset": "P5", - "tuning_info": "high_quality", - "codec": "h264", - "fps": str(self.fps), - "s": str(width) + "x" + str(height), - "bitrate": "10M", - } - - self.nvEnc = nvc.PyNvEncoder( - opts, - self.cuda_ctx.handle, - cvcuda.Stream.current.handle, - ) - self.pts_time = 0 - self.delta_t = 1 # Increment the packets' timestamp by this much. - self.encoded_frame = np.ndarray(shape=(0), dtype=np.uint8) - self.container = av.open(enc_file, "w") - self.avstream = self.container.add_stream("h264", rate=fps) - self.avstream.width = width - self.avstream.height = height - # 1/fps would be our scale. - self.avstream.time_base = 1 / Fraction(fps) - self.surface = None - self.surf_plane = None - - def width(self): - """ - Gets the actual video frame width from the encoder. - """ - return self.nvEnc.Width() - - def height(self): - """ - Gets the actual video frame height from the encoder. - """ - return self.nvEnc.Height() - - # docs_tag: begin_imp_nvencoder - def tensor_to_surface(self, img_tensor): - """ - Converts torch float tensor into a planar RGB surface. - """ - if not self.surface: - if self.cuda_ctx: - self.surface = nvc.Surface.Make( - format=nvc.PixelFormat.NV12, - width=self.width(), - height=self.height(), - context=self.cuda_ctx.handle, - ) - else: - self.surface = nvc.Surface.Make( - format=nvc.PixelFormat.NV12, - width=self.width(), - height=self.height(), - gpu_id=self.device_id, - ) - self.surf_plane = self.surface.PlanePtr() - - pnvc.TensorToDptr( - img_tensor, - self.surf_plane.GpuMem(), - self.surf_plane.Width(), - self.surf_plane.Height(), - self.surf_plane.Pitch(), - self.surf_plane.ElemSize(), - ) - - return self.surface - - def encode_from_tensor(self, tensor): - """ - Encode single video frame from torch.cuda.FloatTensor. - Tensor must have planar RGB format and be normalized to range [0.0; 1.0]. - Shape of the tensor must be (3, height, width). 
- """ - assert tensor.dim() == 3 - assert tensor.device.index == self.device_id - - dst_surface = self.tensor_to_surface(tensor) - - if dst_surface.Empty(): - raise RuntimeError("Can not convert to yuv444.") - - success = self.nvEnc.EncodeSingleSurface(dst_surface, self.encoded_frame) - - if success: - self.write_frame( - self.encoded_frame, - self.pts_time, - self.fps, - self.avstream, - self.container, - ) - self.pts_time += self.delta_t - - # docs_tag: end_imp_nvencoder - - # docs_tag: begin_writeframe_nvencoder - def write_frame(self, encoded_frame, pts_time, fps, stream, container): - encoded_bytes = bytearray(encoded_frame) - pkt = av.packet.Packet(encoded_bytes) - pkt.pts = pts_time - pkt.dts = pts_time - pkt.stream = stream - pkt.time_base = 1 / Fraction(fps) - container.mux(pkt) - - def flush(self): - packets = np.ndarray(shape=(0), dtype=np.uint8) - - success = self.nvEnc.Flush(packets) - if success: - self.write_frame( - self.encoded_frame, - self.pts_time, - self.fps, - self.avstream, - self.container, - ) - self.pts_time += self.delta_t - - # docs_tag: end_writeframe_nvencoder - - """ Streaming video version of the Video Batch Decoder using VPF. """ diff --git a/samples/cropandresize/CMakeLists.txt b/samples/cropandresize/CMakeLists.txt index 938b6bb1e..3e09936f5 100644 --- a/samples/cropandresize/CMakeLists.txt +++ b/samples/cropandresize/CMakeLists.txt @@ -19,14 +19,14 @@ set(CMAKE_CXX_FLAGS "-Wno-deprecated-enum-enum-conversion") # tag: Build crop and resize sample -add_executable(nvcv_samples_cropandresize Main.cpp) -target_link_libraries(nvcv_samples_cropandresize nvcv_types cvcuda CUDA::cudart nvcv_samples_common) +add_executable(cvcuda_sample_cropandresize Main.cpp) +target_link_libraries(cvcuda_sample_cropandresize nvcv_types cvcuda CUDA::cudart cvcuda_samples_common) -target_include_directories(nvcv_samples_cropandresize +target_include_directories(cvcuda_sample_cropandresize PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/..) 
# tag: Install binaries -install(TARGETS nvcv_samples_cropandresize - EXPORT nvcv_samples_cropandresize +install(TARGETS cvcuda_sample_cropandresize + EXPORT cvcuda_sample_cropandresize COMPONENT samples DESTINATION samples/bin) diff --git a/samples/label/python/main.py b/samples/label/python/main.py index 575a2a2a4..aeff0f85a 100644 --- a/samples/label/python/main.py +++ b/samples/label/python/main.py @@ -38,7 +38,7 @@ parse_validate_default_args, ) -from torch_utils import ImageBatchDecoderPyTorch, ImageBatchEncoderPyTorch # noqa: E402 +from nvcodec_utils import ImageBatchDecoder, ImageBatchEncoder # noqa: E402 from interop_utils import to_cpu_numpy_buffer, to_cuda_buffer # noqa: E402 # docs_tag: end_python_imports @@ -59,7 +59,7 @@ def save_batch(images, label, encoder, batch): batch : Batch object to save the images Returns: - nvcv Tensor: RGB color, random for each label + n/a """ # Function to modify filenames in the batch def modify_filenames(suffix): @@ -70,13 +70,22 @@ def modify_filenames(suffix): modified_filenames.append(modified_filename) return modified_filenames - # convert to NCHW - imagesNCHW = cvcuda.reformat(images, "NCHW") + # Check if the format is what we expect + if encoder.input_layout != "NHWC": + raise ValueError( + "Expected input layout to be 'NHWC', but found '{}'".format( + encoder.input_layout + ) + ) + + # Convert to RGB if the input is grayscale, since the encoder expects RGB + if images.shape[3] == 1: + images = cvcuda.cvtcolor(images, cvcuda.ColorConversion.GRAY2RGB) # Modify filenames with "_labels" suffix oldFileNames = batch.fileinfo batch.fileinfo = modify_filenames(label) - batch.data = torch.as_tensor(imagesNCHW.cuda()) + batch.data = torch.as_tensor(images.cuda()) encoder(batch) batch.fileinfo = oldFileNames @@ -158,14 +167,12 @@ def run_sample( # Now define the object that will handle pre-processing if os.path.splitext(input_path)[1] == ".jpg" or os.path.isdir(input_path): # Treat this as data modality of images - decoder = ImageBatchDecoderPyTorch( + decoder = ImageBatchDecoder( input_path, batch_size, device_id, cuda_ctx, cvcuda_perf ) - encoder = ImageBatchEncoderPyTorch( + encoder = ImageBatchEncoder( output_dir, - fps=0, device_id=device_id, - cuda_ctx=cuda_ctx, cvcuda_perf=cvcuda_perf, ) else: @@ -204,7 +211,9 @@ def run_sample( # 1) CVCUDA tensor --> Nothing needs to be done.
# 2) Numpy Array --> Convert to torch tensor first and then CVCUDA tensor # 3) Torch Tensor --> Convert to CVCUDA tensor - if isinstance(batch.data, torch.Tensor): + if isinstance(batch.data, cvcuda.Tensor): + cvcudaTensorNHWC = batch.data + elif isinstance(batch.data, torch.Tensor): cvcudaTensorNHWC = cvcuda.as_tensor(batch.data, "NHWC") elif isinstance(batch.data, np.ndarray): cvcudaTensorNHWC = cvcuda.as_tensor( @@ -213,11 +222,12 @@ def run_sample( ), "NHWC", ) + else: + raise ValueError("Unknown input type: %s" % type(batch.data)) # docs_tag: end_tensor_conversion # Convert to grayscale out = cvcuda.cvtcolor(cvcudaTensorNHWC, cvcuda.ColorConversion.RGB2GRAY) - save_batch(out, "grayscale", encoder, batch) # Histogram eq the image diff --git a/samples/object_detection/python/main.py b/samples/object_detection/python/main.py index 9f37d97c4..935e121a1 100644 --- a/samples/object_detection/python/main.py +++ b/samples/object_detection/python/main.py @@ -38,11 +38,11 @@ parse_validate_default_args, ) -from torch_utils import ImageBatchDecoderPyTorch, ImageBatchEncoderPyTorch # noqa: E402 - -from vpf_utils import ( # noqa: E402 - VideoBatchDecoderVPF, - VideoBatchEncoderVPF, +from nvcodec_utils import ( # noqa: E402 + VideoBatchDecoder, + VideoBatchEncoder, + ImageBatchDecoder, + ImageBatchEncoder, ) from pipelines import ( # noqa: E402 @@ -95,24 +95,22 @@ def run_sample( if os.path.splitext(input_path)[1] == ".jpg" or os.path.isdir(input_path): # Treat this as data modality of images - decoder = ImageBatchDecoderPyTorch( + decoder = ImageBatchDecoder( input_path, batch_size, device_id, cuda_ctx, cvcuda_perf ) - encoder = ImageBatchEncoderPyTorch( + encoder = ImageBatchEncoder( output_dir, - fps=0, device_id=device_id, - cuda_ctx=cuda_ctx, cvcuda_perf=cvcuda_perf, ) else: # Treat this as data modality of videos - decoder = VideoBatchDecoderVPF( + decoder = VideoBatchDecoder( input_path, batch_size, device_id, cuda_ctx, cvcuda_perf ) - encoder = VideoBatchEncoderVPF( + encoder = VideoBatchEncoder( output_dir, decoder.fps, device_id, cuda_ctx, cvcuda_perf ) diff --git a/samples/scripts/README.md b/samples/scripts/README.md index c2b279b4a..3adf38ef5 100644 --- a/samples/scripts/README.md +++ b/samples/scripts/README.md @@ -1,3 +1,18 @@ + +[//]: # "SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved." +[//]: # "SPDX-License-Identifier: Apache-2.0" +[//]: # "" +[//]: # "Licensed under the Apache License, Version 2.0 (the 'License');" +[//]: # "you may not use this file except in compliance with the License." +[//]: # "You may obtain a copy of the License at" +[//]: # "http://www.apache.org/licenses/LICENSE-2.0" +[//]: # "" +[//]: # "Unless required by applicable law or agreed to in writing, software" +[//]: # "distributed under the License is distributed on an 'AS IS' BASIS" +[//]: # "WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied." +[//]: # "See the License for the specific language governing permissions and" +[//]: # "limitations under the License." + # Performance Benchmarking CV-CUDA samples ships with the following scripts that can help track and report the performance of the Python samples. @@ -31,7 +46,7 @@ This file holds the data structures and functions most commonly used during the With these tools, the benchmarking flow involves the following two steps: -1. Annotating the code of the sample using classes and functions from the `perf_utils.py` so that it can be profiled. +1. 
Annotating the code of the sample using classes and functions from the `perf_utils.py` so that it can be profiled. This is already done for you in the CV-CUDA Python samples. Here is how you can do it for any Python code: 1. Import the necessary classes and functions first ```python from perf_utils import CvCudaPerf, get_default_arg_parser, parse_validate_default_args @@ -75,7 +90,7 @@ With these tools, the benchmarking flow involves the following two steps: # Once everything is done, we must call the finalize(). cvcuda_perf.finalize() ``` -2. Use the sample with the `benchmark.py` to launch the benchmarking. `benchmark.py` can launch any script that uses `perf_utils`'s functionality and benchmark it using NSYS. It can also launch it in a multi-CPU multi-GPU fashion to compute the throughput. +2. Launch the sample with the `benchmark.py` to do the benchmarking. `benchmark.py` can launch any script that uses `perf_utils`'s functionality and benchmark it using NSYS. It can also launch it in a multi-CPU multi-GPU fashion to compute the throughput. 1. To benchmark the object detection sample, for example, we can use the following command: diff --git a/samples/scripts/benchmark.py b/samples/scripts/benchmark.py index 3bcb62ebc..fe252d263 100644 --- a/samples/scripts/benchmark.py +++ b/samples/scripts/benchmark.py @@ -127,8 +127,8 @@ def parse_nvtx_pushpop_trace_json(json_path): thread_id = row["TID"] # Process a bit. Conversion from nano to milliseconds. - start_ms = round(start_ns / 10**6, 3) - end_ms = round(end_ns / 10**6, 3) + start_ms = round(start_ns / 10**6, 4) + end_ms = round(end_ns / 10**6, 4) parent_range_id = None if parent_range_id == "None" else parent_range_id # Save it in our dictionary at the process id and thread id level. @@ -212,11 +212,11 @@ def parse_nvtx_gpu_proj_trace_json(json_path): thread_id = row["TID"] # Process a bit. Conversion from nano to milliseconds. - cpu_start_ms = round(cpu_start_ns / 10**6, 3) - cpu_end_ms = round(cpu_end_ns / 10**6, 3) + cpu_start_ms = round(cpu_start_ns / 10**6, 4) + cpu_end_ms = round(cpu_end_ns / 10**6, 4) - gpu_start_ms = round(gpu_start_ns / 10**6, 3) - gpu_end_ms = round(gpu_end_ns / 10**6, 3) + gpu_start_ms = round(gpu_start_ns / 10**6, 4) + gpu_end_ms = round(gpu_end_ns / 10**6, 4) # Save it in our dictionary at the process id and thread id level. if process_id not in range_info: @@ -385,8 +385,8 @@ def calc_mean_ranges(all_range_info): cpu_ranges_list = mean_range_info[range_name][0] gpu_ranges_list = mean_range_info[range_name][1] - avg_cpu_time = round(sum(cpu_ranges_list) / len(cpu_ranges_list), 3) - avg_gpu_time = round(sum(gpu_ranges_list) / len(gpu_ranges_list), 3) + avg_cpu_time = round(sum(cpu_ranges_list) / len(cpu_ranges_list), 4) + avg_gpu_time = round(sum(gpu_ranges_list) / len(gpu_ranges_list), 4) mean_range_info[range_name] = (avg_cpu_time, avg_gpu_time) else: @@ -481,12 +481,12 @@ def recurse_divide_dict(input_dict, divide_by=None): for i in range(len(input_dict[key].value)): input_dict[key].value[i] /= divide_by - input_dict[key].value[i] = round(input_dict[key].value[i], 3) + input_dict[key].value[i] = round(input_dict[key].value[i], 4) else: divide_by = divide_by if divide_by else input_dict[key].len input_dict[key].value /= divide_by - input_dict[key].value = round(input_dict[key].value, 3) + input_dict[key].value = round(input_dict[key].value, 4) # Remove the MeanDictInfo object and store the value directly. 
input_dict[key] = input_dict[key].value @@ -500,11 +500,11 @@ def recurse_divide_dict(input_dict, divide_by=None): for i in range(len(input_dict[key].value)): input_dict[key][i] /= divide_by - input_dict[key][i] = round(input_dict[key][i], 3) + input_dict[key][i] = round(input_dict[key][i], 4) else: input_dict[key] /= divide_by - input_dict[key] = round(input_dict[key], 3) + input_dict[key] = round(input_dict[key], 4) def unflatten_process_benchmark_dict(benchmark_dict, warmup_batches): @@ -631,11 +631,11 @@ def unflatten_process_benchmark_dict(benchmark_dict, warmup_batches): # Computer per item. if batch_size > 0: current_dict[parts[-1]]["cpu_time_per_item"] = round( - current_dict[parts[-1]]["cpu_time"] / batch_size, 3 + current_dict[parts[-1]]["cpu_time"] / batch_size, 4 ) current_dict[parts[-1]]["gpu_time_per_item"] = round( - current_dict[parts[-1]]["gpu_time"] / batch_size, 3 + current_dict[parts[-1]]["gpu_time"] / batch_size, 4 ) # Maintain global counts of various batch level stats @@ -689,11 +689,11 @@ def unflatten_process_benchmark_dict(benchmark_dict, warmup_batches): if total_items[path] > 0: current_dict[parts[-1]]["cpu_time_per_item"] = round( current_dict[parts[-1]]["cpu_time"] / total_items[path], - 3, + 4, ) current_dict[parts[-1]]["gpu_time_per_item"] = round( current_dict[parts[-1]]["gpu_time"] / total_items[path], - 3, + 4, ) current_dict[parts[-1]]["total_items"] = total_items[path] @@ -709,11 +709,11 @@ def unflatten_process_benchmark_dict(benchmark_dict, warmup_batches): if total_items_above_level > 0: current_dict[parts[-1]]["cpu_time_per_item"] = round( current_dict[parts[-1]]["cpu_time"] / total_items_above_level, - 3, + 4, ) current_dict[parts[-1]]["gpu_time_per_item"] = round( current_dict[parts[-1]]["gpu_time"] / total_items_above_level, - 3, + 4, ) current_dict[parts[-1]]["total_items"] = total_items_above_level @@ -726,10 +726,10 @@ def unflatten_process_benchmark_dict(benchmark_dict, warmup_batches): batch_dict = batch_dicts[batch_level_prefix] batch_dict["cpu_time_minus_warmup"] = round( - (batch_dict["cpu_time"] - total_warmup_cpu_time[batch_level_prefix]), 3 + (batch_dict["cpu_time"] - total_warmup_cpu_time[batch_level_prefix]), 4 ) batch_dict["gpu_time_minus_warmup"] = round( - (batch_dict["gpu_time"] - total_warmup_gpu_time[batch_level_prefix]), 3 + (batch_dict["gpu_time"] - total_warmup_gpu_time[batch_level_prefix]), 4 ) batch_dict["cpu_time_minus_warmup_per_item"] = 0 @@ -739,12 +739,12 @@ def unflatten_process_benchmark_dict(benchmark_dict, warmup_batches): batch_dict["cpu_time_minus_warmup_per_item"] = round( batch_dict["cpu_time_minus_warmup"] / total_items_minus_warmup[batch_level_prefix], - 3, + 4, ) batch_dict["gpu_time_minus_warmup_per_item"] = round( batch_dict["gpu_time_minus_warmup"] / total_items_minus_warmup[batch_level_prefix], - 3, + 4, ) batch_dict["total_items_minus_warmup"] = total_items_minus_warmup[ @@ -1117,7 +1117,7 @@ def main(): proc_args, ), ) - logger.info("Launched process: %d on gpu: %d" % (process_idx, gpu_idx)) + logger.info("Launched process: %d. gpu-idx: %d" % (process_idx, gpu_idx)) results.append(result) # Close the pool and wait everything to finish. diff --git a/samples/scripts/benchmark_samples.sh b/samples/scripts/benchmark_samples.sh index 07851442a..7b97c3f78 100755 --- a/samples/scripts/benchmark_samples.sh +++ b/samples/scripts/benchmark_samples.sh @@ -15,7 +15,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-# Usage: benchmark_samples.sh +# Usage: benchmark_samples.sh # Performs benchmarking of all Python samples. # Since some samples may involve creation of a TensorRT model on the first run and since it takes @@ -23,62 +23,106 @@ # Only the results of the second run will be used. The model artifacts from the first run will # help us run the second run easily. -mkdir -p /tmp/benchmarking/classification -mkdir -p /tmp/benchmarking/segmentation -mkdir -p /tmp/benchmarking/detection + +set -e # Stops this script if any one command fails. + +if [ "$#" -lt 1 ]; then + echo "Usage: benchmark_samples.sh {USE_TENSORRT: True}" + exit 1 +fi + +SCRIPT_DIR="$(dirname "$(readlink -f "$0")")" +SAMPLES_ROOT="$(dirname "$SCRIPT_DIR")" # removes the scripts dir +OUTPUT_DIR="$1" +USE_TRT=${2:-True} +CLASSIFICATION_OUT_DIR="$OUTPUT_DIR/classification" +SEGMENTATION_OUT_DIR="$OUTPUT_DIR/segmentation" +DETECTION_OUT_DIR="$OUTPUT_DIR/detection" + +mkdir -p "$CLASSIFICATION_OUT_DIR" +mkdir -p "$SEGMENTATION_OUT_DIR" +mkdir -p "$DETECTION_OUT_DIR" + +echo "OUTPUT_DIR: $OUTPUT_DIR" +echo "CLASSIFICATION_OUT_DIR: $CLASSIFICATION_OUT_DIR" +echo "SEGMENTATION_OUT_DIR: $SEGMENTATION_OUT_DIR" +echo "DETECTION_OUT_DIR: $DETECTION_OUT_DIR" +if [ "$USE_TRT" = "True" ]; then + echo "Using TensorRT as the inference back-end in all the runs." + CLASSIFICATION_BACKEND="tensorrt" + SEGMENTATION_BACKEND="tensorrt" + DETECTION_BACKEND="tensorrt" +else + echo "Not using TensorRT as the inference back-end in all the runs." + CLASSIFICATION_BACKEND="pytorch" + SEGMENTATION_BACKEND="pytorch" + DETECTION_BACKEND="tensorflow" +fi # 1. The Classification sample # First dry run with 2 processes and 1 batch from start and end used as a warm-up batch. -python ./scripts/benchmark.py \ +echo "Running the classification sample (warm-up run)..." +python3 "$SCRIPT_DIR/benchmark.py" \ -np 2 \ -w 1 \ - -o /tmp/benchmarking/classification \ - ./classification/python/main.py \ + -o "$CLASSIFICATION_OUT_DIR" \ + "$SAMPLES_ROOT/classification/python/main.py" \ -b 4 \ - -i ./assets/videos/pexels-chiel-slotman-4423925-1920x1080-25fps.mp4 + -bk $CLASSIFICATION_BACKEND \ + -i "$SAMPLES_ROOT/assets/videos/pexels-chiel-slotman-4423925-1920x1080-25fps.mp4" # Second run - the actual run. -python ./scripts/benchmark.py \ +echo "Running the classification sample (actual run)..." +python3 "$SCRIPT_DIR/benchmark.py" \ -np 2 \ -w 1 \ - -o /tmp/benchmarking/classification \ - ./classification/python/main.py \ + -o "$CLASSIFICATION_OUT_DIR" \ + "$SAMPLES_ROOT/classification/python/main.py" \ -b 4 \ - -i ./assets/videos/pexels-chiel-slotman-4423925-1920x1080-25fps.mp4 + -bk $CLASSIFICATION_BACKEND \ + -i "$SAMPLES_ROOT/assets/videos/pexels-chiel-slotman-4423925-1920x1080-25fps.mp4" # 2. The Segmentation sample # First dry run with 2 processes and 1 batch from start and end used as a warm-up batch. -python ./scripts/benchmark.py \ +echo "Running the segmentation sample (warm-up run)..." +python3 "$SCRIPT_DIR/benchmark.py" \ -np 2 \ -w 1 \ - -o /tmp/benchmarking/segmentation \ - ./segmentation/python/main.py \ + -o "$SEGMENTATION_OUT_DIR" \ + "$SAMPLES_ROOT/segmentation/python/main.py" \ -b 4 \ - -i ./assets/videos/pexels-ilimdar-avgezer-7081456.mp4 + -bk $SEGMENTATION_BACKEND \ + -i "$SAMPLES_ROOT/assets/videos/pexels-ilimdar-avgezer-7081456.mp4" # Second run - the actual run. -python ./scripts/benchmark.py \ +echo "Running the segmentation sample (actual run)..." 
+python3 "$SCRIPT_DIR/benchmark.py" \ -np 2 \ -w 1 \ - -o /tmp/benchmarking/segmentation \ - ./segmentation/python/main.py \ + -o "$SEGMENTATION_OUT_DIR" \ + "$SAMPLES_ROOT/segmentation/python/main.py" \ -b 4 \ - -i ./assets/videos/pexels-ilimdar-avgezer-7081456.mp4 + -bk $SEGMENTATION_BACKEND \ + -i "$SAMPLES_ROOT/assets/videos/pexels-ilimdar-avgezer-7081456.mp4" # 3. The Object Detection sample # First dry run with 2 processes and 1 batch from start and end used as a warm-up batch. -python ./scripts/benchmark.py \ - -np 2 \ +echo "Running the detection sample (warm-up run)..." +python3 "$SCRIPT_DIR/benchmark.py" \ + -np 1 \ -w 1 \ - -o /tmp/benchmarking/detection \ - ./object_detection/python/main.py \ + -o "$DETECTION_OUT_DIR" \ + "$SAMPLES_ROOT/object_detection/python/main.py" \ -b 4 \ - -i ./assets/videos/pexels-chiel-slotman-4423925-1920x1080-25fps.mp4 + -bk $DETECTION_BACKEND \ + -i "$SAMPLES_ROOT/assets/videos/pexels-chiel-slotman-4423925-1920x1080-25fps.mp4" # Second run - the actual run. -python ./scripts/benchmark.py \ - -np 2 \ +echo "Running the detection sample (actual run)..." +python3 "$SCRIPT_DIR/benchmark.py" \ + -np 1 \ -w 1 \ - -o /tmp/benchmarking/detection \ - ./object_detection/python/main.py \ + -o "$DETECTION_OUT_DIR" \ + "$SAMPLES_ROOT/object_detection/python/main.py" \ -b 4 \ - -i ./assets/videos/pexels-chiel-slotman-4423925-1920x1080-25fps.mp4 + -bk $DETECTION_BACKEND \ + -i "$SAMPLES_ROOT/assets/videos/pexels-chiel-slotman-4423925-1920x1080-25fps.mp4" # Done. diff --git a/samples/scripts/install_dependencies.sh b/samples/scripts/install_dependencies.sh index bb3a4f24d..cd2e6fb72 100755 --- a/samples/scripts/install_dependencies.sh +++ b/samples/scripts/install_dependencies.sh @@ -18,6 +18,33 @@ # This script installs all the dependencies required to run the CVCUDA samples. # It uses the /tmp folder to download temporary data and libraries. +# SCRIPT_DIR is the directory where this script is located. +SCRIPT_DIR="$(dirname "$(readlink -f "$0")")" + +# Check CUDA version. Begin by checking if nvcc command exists. +if command -v nvcc >/dev/null 2>&1; then + # Get CUDA version from nvcc output + CUDA_VERSION=$(nvcc --version | grep "release" | awk '{print $5}') + + # Extract major version number + CUDA_MAJOR_VERSION=$(echo "$CUDA_VERSION" | cut -d. -f1) + + # Check major version to determine CUDA version + if [ "$CUDA_MAJOR_VERSION" -eq 11 ]; then + echo "CUDA 11 is installed." + elif [ "$CUDA_MAJOR_VERSION" -eq 12 ]; then + echo "CUDA 12 is installed." + else + echo "Unknown/Unsupported CUDA version." + exit 1 + fi +else + echo "CUDA is not installed." + exit 1 +fi + +set -e # Exit script if any command fails + # Install basic packages first. cd /tmp apt-get update && apt-get install -y --no-install-recommends \ @@ -41,7 +68,7 @@ update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 11 update-alternatives --set gcc /usr/bin/gcc-11 update-alternatives --set g++ /usr/bin/g++-11 -# Install python and gtest +# Install Python and gtest apt-get update && apt-get install -y --no-install-recommends \ libgtest-dev \ libgmock-dev \ @@ -50,16 +77,7 @@ apt-get update && apt-get install -y --no-install-recommends \ mlocate && updatedb \ && rm -rf /var/lib/apt/lists/* -# Install pip and all the python packages. -pip3 install --upgrade pip -pip3 install torch==1.13.0 torchvision==0.14.0 av==10.0.0 pycuda==2022.1 nvtx==0.2.5 tensorflow==2.11.1 -cd /tmp -[ ! 
-d 'torchnvjpeg' ] && git clone https://github.com/itsliupeng/torchnvjpeg.git -cd torchnvjpeg && python3 setup.py bdist_wheel && cd dist && pip3 install torchnvjpeg-0.1.0-*-linux_x86_64.whl -echo "export PATH=$PATH:/opt/tensorrt/bin" >> ~/.bashrc - -# Install VPF and its dependencies. -# 1. ffmpeg and other libraries needed for VPF. +# Install ffmpeg and other libraries needed for VPF. # Note: We are not installing either libnv-encode or decode libraries here. apt-get update && apt-get install -y --no-install-recommends \ ffmpeg \ @@ -69,10 +87,11 @@ apt-get update && apt-get install -y --no-install-recommends \ libswresample-dev \ libavutil-dev\ && rm -rf /var/lib/apt/lists/* + +# Install libssl 1.1.1 cd /tmp -[ ! -d 'VideoProcessingFramework' ] && git clone https://github.com/NVIDIA/VideoProcessingFramework.git -pip3 install /tmp/VideoProcessingFramework -pip3 install /tmp/VideoProcessingFramework/src/PytorchNvCodec +wget http://archive.ubuntu.com/ubuntu/pool/main/o/openssl/libssl1.1_1.1.0g-2ubuntu4_amd64.deb +dpkg -i libssl1.1_1.1.0g-2ubuntu4_amd64.deb # Install tao-converter which parses the .etlt model file, and generates an optimized TensorRT engine wget 'https://api.ngc.nvidia.com/v2/resources/nvidia/tao/tao-converter/versions/v4.0.0_trt8.5.1.7_x86/files/tao-converter' --directory-prefix=/usr/local/bin @@ -91,4 +110,30 @@ apt-get update && apt-get install -y \ /tmp/nsight-systems-2023.2.1_2023.2.1.122-1_amd64.deb \ && rm -rf /var/lib/apt/lists/* +echo "export PATH=$PATH:/opt/tensorrt/bin" >> ~/.bashrc + +# Upgrade pip and install all required Python packages. +pip3 install --upgrade pip +pip3 install -r "$SCRIPT_DIR/requirements.txt" + +# Install VPF +cd /tmp +[ ! -d 'VideoProcessingFramework' ] && git clone https://github.com/NVIDIA/VideoProcessingFramework.git +# HotFix: Must change the PyTorch version used by PytorchNvCodec to match the one we are using. +# Since we are using 2.2.0 we must use that. +sed -i 's/torch/torch==2.2.0/g' /tmp/VideoProcessingFramework/src/PytorchNvCodec/pyproject.toml +sed -i 's/"torch"/"torch==2.2.0"/g' /tmp/VideoProcessingFramework/src/PytorchNvCodec/setup.py +pip3 install /tmp/VideoProcessingFramework +pip3 install /tmp/VideoProcessingFramework/src/PytorchNvCodec + +# Install NvImageCodec +pip3 install nvidia-nvimgcodec-cu${CUDA_MAJOR_VERSION} +pip3 install nvidia-pyindex +pip3 install nvidia-nvjpeg-cu${CUDA_MAJOR_VERSION} + +# Install NvPyVideoCodec +cd /tmp +wget --content-disposition https://api.ngc.nvidia.com/v2/resources/nvidia/py_nvvideocodec/versions/0.0.9/zip -O py_nvvideocodec_0.0.9.zip +pip3 install py_nvvideocodec_0.0.9.zip + # Done diff --git a/samples/scripts/requirements.txt b/samples/scripts/requirements.txt new file mode 100644 index 000000000..f5a6af782 --- /dev/null +++ b/samples/scripts/requirements.txt @@ -0,0 +1,9 @@ +torch==2.2.0 +torchvision==0.17.0 +onnx==1.15.0 +av==11.0.0 +pycuda==2024.1 +nvtx==0.2.8 +tensorflow==2.15.0.post1 +pandas==2.0.3 +matplotlib==3.7.4 diff --git a/samples/scripts/run_samples.sh b/samples/scripts/run_samples.sh index 2679e5468..dea98a584 100755 --- a/samples/scripts/run_samples.sh +++ b/samples/scripts/run_samples.sh @@ -20,69 +20,90 @@ # NOTE: This script may take a long time to finish since some samples may need to create # TensorRT models as they run for the first time. 
+set -e + +export CUDA_MODULE_LOADING="LAZY" +SCRIPT_DIR="$(dirname "$(readlink -f "$0")")" +SAMPLES_DIR="$(dirname "$SCRIPT_DIR")" +CLASSIFICATION_OUT_DIR=/tmp/classification +SEGMENTATION_OUT_DIR="/tmp/segmentation" +DETECTION_OUT_DIR="/tmp/object_detection" +DISTANCE_LABEL_OUT_DIR="/tmp/distance_label" + +echo "SAMPLES_DIR: $SAMPLES_DIR" +echo "CLASSIFICATION_OUT_DIR: $CLASSIFICATION_OUT_DIR" +echo "SEGMENTATION_OUT_DIR: $SEGMENTATION_OUT_DIR" +echo "DETECTION_OUT_DIR: $DETECTION_OUT_DIR" +echo "DISTANCE_LABEL_OUT_DIR: $DISTANCE_LABEL_OUT_DIR" + # Crop and Resize Sample # Batch size 2 -LD_LIBRARY_PATH=./lib ./bin/nvcv_samples_cropandresize -i ./assets/images/ -b 2 -export CUDA_MODULE_LOADING="LAZY" +LD_LIBRARY_PATH=$SAMPLES_DIR/lib $SAMPLES_DIR/build/cropandresize/cvcuda_sample_cropandresize -i $SAMPLES_DIR/assets/images/ -b 2 -# Run the classification Python sample first. This will save the necessary TensorRT model +# Run the classification Python sample. This will save the necessary TensorRT model # and labels in the output directory. The C++ sample can then use those directly. # Run the segmentation Python sample with default settings, without any command-line args. -find /tmp/ -maxdepth 1 -type f -delete -python3 ./classification/python/main.py +rm -rf "$CLASSIFICATION_OUT_DIR" +mkdir "$CLASSIFICATION_OUT_DIR" +python3 $SAMPLES_DIR/classification/python/main.py -o "$CLASSIFICATION_OUT_DIR" # Run it on a specific image with batch size 1 with PyTorch backend. -python3 ./classification/python/main.py -i ./assets/images/tabby_tiger_cat.jpg -b 1 -bk pytorch +python3 $SAMPLES_DIR/classification/python/main.py -i $SAMPLES_DIR/assets/images/tabby_tiger_cat.jpg -b 1 -bk pytorch -o "$CLASSIFICATION_OUT_DIR" # # Run it on a specific image with batch size 4 with PyTorch backend. Uses Same image multiple times -python3 ./classification/python/main.py -i ./assets/images/tabby_tiger_cat.jpg -b 4 -bk pytorch +python3 $SAMPLES_DIR/classification/python/main.py -i $SAMPLES_DIR/assets/images/tabby_tiger_cat.jpg -b 4 -bk pytorch -o "$CLASSIFICATION_OUT_DIR" # Run it on a folder worth of images with batch size 2 with PyTorch backend. -python3 ./classification/python/main.py -i ./assets/images/ -b 2 -bk pytorch +python3 $SAMPLES_DIR/classification/python/main.py -i $SAMPLES_DIR/assets/images/ -b 2 -bk pytorch -o "$CLASSIFICATION_OUT_DIR" # Run it on a specific image with batch size 1 with TensorRT backend with saving the output in a specific directory. -mkdir /tmp/classification -python3 ./classification/python/main.py -i ./assets/images/tabby_tiger_cat.jpg -b 1 -bk tensorrt -o /tmp/classification + +python3 $SAMPLES_DIR/classification/python/main.py -i $SAMPLES_DIR/assets/images/tabby_tiger_cat.jpg -b 1 -bk tensorrt -o "$CLASSIFICATION_OUT_DIR" # Run it on a specific image with batch size 1 with TensorRT backend with saving the output in a specific directory. -python3 ./classification/python/main.py -i ./assets/images/tabby_tiger_cat.jpg -b 2 -bk tensorrt -o /tmp/classification +python3 $SAMPLES_DIR/classification/python/main.py -i $SAMPLES_DIR/assets/images/tabby_tiger_cat.jpg -b 2 -bk tensorrt -o "$CLASSIFICATION_OUT_DIR" # Run it on a video with batch size 1 with TensorRT backend with saving the output in a specific directory. 
-python3 ./classification/python/main.py -i ./assets/videos/pexels-ilimdar-avgezer-7081456.mp4 -b 1 -bk tensorrt -o /tmp/classification +python3 $SAMPLES_DIR/classification/python/main.py -i $SAMPLES_DIR/assets/videos/pexels-ilimdar-avgezer-7081456.mp4 -b 1 -bk tensorrt -o "$CLASSIFICATION_OUT_DIR" # Run the classification C++ sample. Since the Python sample was already run, we can reuse the TensorRT model # and the labels file generated by it. # Batch size 1 -LD_LIBRARY_PATH=./lib ./bin/nvcv_samples_classification -e /tmp/classification/model.1.224.224.trtmodel -i ./assets/images/tabby_tiger_cat.jpg -l /tmp/classification/labels.txt -b 1 +LD_LIBRARY_PATH=$SAMPLES_DIR/lib $SAMPLES_DIR/build/classification/cvcuda_sample_classification -e /tmp/classification/model.1.224.224.trtmodel -i $SAMPLES_DIR/assets/images/tabby_tiger_cat.jpg -l /tmp/classification/labels.txt -b 1 # Batch size 2 -LD_LIBRARY_PATH=./lib ./bin/nvcv_samples_classification -e /tmp/classification/model.2.224.224.trtmodel -i ./assets/images/tabby_tiger_cat.jpg -l /tmp/classification/labels.txt -b 2 +LD_LIBRARY_PATH=$SAMPLES_DIR/lib $SAMPLES_DIR/build/classification/cvcuda_sample_classification -e /tmp/classification/model.2.224.224.trtmodel -i $SAMPLES_DIR/assets/images/tabby_tiger_cat.jpg -l /tmp/classification/labels.txt -b 2 # Run the segmentation Python sample with default settings, without any command-line args. -find /tmp/ -maxdepth 1 -type f -delete -python3 ./segmentation/python/main.py +rm -rf "$SEGMENTATION_OUT_DIR" +mkdir "$SEGMENTATION_OUT_DIR" +python3 $SAMPLES_DIR/segmentation/python/main.py -o "$SEGMENTATION_OUT_DIR" # Run the segmentation sample with default settings for PyTorch backend. -python3 ./segmentation/python/main.py -bk pytorch +python3 $SAMPLES_DIR/segmentation/python/main.py -bk pytorch -o "$SEGMENTATION_OUT_DIR" # Run it on a single image with high batch size for the background class writing to a specific directory with PyTorch backend -python3 ./segmentation/python/main.py -i ./assets/images/tabby_tiger_cat.jpg -o /tmp -b 5 -c __background__ -bk pytorch +python3 $SAMPLES_DIR/segmentation/python/main.py -i $SAMPLES_DIR/assets/images/tabby_tiger_cat.jpg -o "$SEGMENTATION_OUT_DIR" -b 5 -c __background__ -bk pytorch # Run it on a folder worth of images with the default tensorrt backend -python3 ./segmentation/python/main.py -i ./assets/images/ -o /tmp -b 4 -c __background__ +python3 $SAMPLES_DIR/segmentation/python/main.py -i $SAMPLES_DIR/assets/images/ -o "$SEGMENTATION_OUT_DIR" -b 4 -c __background__ # Run it on a folder worth of images with PyTorch -python3 ./segmentation/python/main.py -i ./assets/images/ -o /tmp -b 5 -c __background__ -bk pytorch +python3 $SAMPLES_DIR/segmentation/python/main.py -i $SAMPLES_DIR/assets/images/ -o "$SEGMENTATION_OUT_DIR" -b 5 -c __background__ -bk pytorch # Run on a single image with custom resized input given to the sample for the dog class -python3 ./segmentation/python/main.py -i ./assets/images/Weimaraner.jpg -o /tmp -b 1 -c dog -th 512 -tw 512 +python3 $SAMPLES_DIR/segmentation/python/main.py -i $SAMPLES_DIR/assets/images/Weimaraner.jpg -o "$SEGMENTATION_OUT_DIR" -b 1 -c dog -th 512 -tw 512 # Run it on a video for class background. 
-python ./segmentation/python/main.py -i ./assets/videos/pexels-ilimdar-avgezer-7081456.mp4 -b 4 -c __background__ +python3 $SAMPLES_DIR/segmentation/python/main.py -i $SAMPLES_DIR/assets/videos/pexels-ilimdar-avgezer-7081456.mp4 -b 4 -c __background__ -o "$SEGMENTATION_OUT_DIR" # Run it on a video for class background with the PyTorch backend. -python ./segmentation/python/main.py -i ./assets/videos/pexels-ilimdar-avgezer-7081456.mp4 -b 4 -c __background__ -bk pytorch +python3 $SAMPLES_DIR/segmentation/python/main.py -i $SAMPLES_DIR/assets/videos/pexels-ilimdar-avgezer-7081456.mp4 -b 4 -c __background__ -bk pytorch -o "$SEGMENTATION_OUT_DIR" + # Run the object detection Python sample with default settings, without any command-line args. -find /tmp/ -maxdepth 1 -type f -delete -python3 ./object_detection/python/main.py +rm -rf "$DETECTION_OUT_DIR" +mkdir "$DETECTION_OUT_DIR" +python3 $SAMPLES_DIR/object_detection/python/main.py -o "$DETECTION_OUT_DIR" # Run it with batch size 1 on a single image -python3 ./object_detection/python/main.py -i ./assets/images/peoplenet.jpg -b 1 +python3 $SAMPLES_DIR/object_detection/python/main.py -i $SAMPLES_DIR/assets/images/peoplenet.jpg -b 1 -o "$DETECTION_OUT_DIR" # Run it with batch size 4 on a video -python3 ./object_detection/python/main.py -i ./assets/videos/pexels-chiel-slotman-4423925-1920x1080-25fps.mp4 -b 4 +python3 $SAMPLES_DIR/object_detection/python/main.py -i $SAMPLES_DIR/assets/videos/pexels-chiel-slotman-4423925-1920x1080-25fps.mp4 -b 4 -o "$DETECTION_OUT_DIR" # Run it with batch size 2 on a folder of images -python3 ./object_detection/python/main.py -i ./assets/images/ -b 3 +python3 $SAMPLES_DIR/object_detection/python/main.py -i $SAMPLES_DIR/assets/images/ -b 3 -o "$DETECTION_OUT_DIR" # RUn it with the TensorFlow backend -python3 ./object_detection/python/main.py -i ./assets/videos/pexels-chiel-slotman-4423925-1920x1080-25fps.mp4 -b 4 -bk tensorflow +python3 $SAMPLES_DIR/object_detection/python/main.py -i $SAMPLES_DIR/assets/videos/pexels-chiel-slotman-4423925-1920x1080-25fps.mp4 -b 4 -bk tensorflow -o "$DETECTION_OUT_DIR" + -# Run the label Python sample with default settings, without any command-line args. -find /tmp/ -maxdepth 1 -type f -delete -python3 ./label/python/main.py +# Run the distance label Python sample with default settings, without any command-line args. +rm -rf "$DISTANCE_LABEL_OUT_DIR" +mkdir "$DISTANCE_LABEL_OUT_DIR" +python3 $SAMPLES_DIR/label/python/main.py -o "$DISTANCE_LABEL_OUT_DIR" # Run it with batch size 1 on a single image -python3 ./label/python/main.py -i ./assets/images/peoplenet.jpg -b 1 +python3 $SAMPLES_DIR/label/python/main.py -i $SAMPLES_DIR/assets/images/peoplenet.jpg -b 1 -o "$DISTANCE_LABEL_OUT_DIR" diff --git a/samples/segmentation/python/README.md b/samples/segmentation/python/README.md index d35d0dc1e..d9bd7537c 100644 --- a/samples/segmentation/python/README.md +++ b/samples/segmentation/python/README.md @@ -1,3 +1,18 @@ + +[//]: # "SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved." +[//]: # "SPDX-License-Identifier: Apache-2.0" +[//]: # "" +[//]: # "Licensed under the Apache License, Version 2.0 (the 'License');" +[//]: # "you may not use this file except in compliance with the License." 
+[//]: # "You may obtain a copy of the License at" +[//]: # "http://www.apache.org/licenses/LICENSE-2.0" +[//]: # "" +[//]: # "Unless required by applicable law or agreed to in writing, software" +[//]: # "distributed under the License is distributed on an 'AS IS' BASIS" +[//]: # "WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied." +[//]: # "See the License for the specific language governing permissions and" +[//]: # "limitations under the License." + # Semantic Segmentation : Locally and using Triton ## Pre-requisites @@ -66,12 +81,12 @@ Triton has different public [Docker images](https://catalog.ngc.nvidia.com/orgs/ ./samples/scripts/install_dependencies.sh pip3 install tensorrt ``` -3. Install the CV-CUDA packages. Pre-built packages `.deb`, `.tar.xz`, `.whl` are only available on Github, so need to download from there. Otherwise, please build from source. Please note that since the above container comes with Python 3.8.10, we will install nvcv-python3.8-0 package as mentioned below. If you have any other Python distributions, you would need to use the appropriate nvcv-python packages below. +3. Install the CV-CUDA packages. Pre-built packages `.deb`, `.tar.xz`, `.whl` are only available on Github, so need to download from there. Otherwise, please build from source. Please note that since the above container comes with Python 3.8.10, we will install cvcuda-python3.8-0 package as mentioned below. If you have any other Python distributions, you would need to use the appropriate cvcuda-python packages below. ```bash - wget https://github.com/CVCUDA/CV-CUDA/releases/download/v0.3.0-beta/nvcv-lib-0.3.0_beta-cuda11-x86_64-linux.deb \ - https://github.com/CVCUDA/CV-CUDA/releases/download/v0.3.0-beta/nvcv-python3.8-0.3.0_beta-cuda11-x86_64-linux.deb \ - https://github.com/CVCUDA/CV-CUDA/releases/download/v0.3.0-beta/nvcv_python-0.3.x_beta-cp38-cp38-linux_x86_64.whl \ + wget https://github.com/CVCUDA/CV-CUDA/releases/download/v0.6.0-beta/cvcuda-lib-0.6.0_beta-cuda11-x86_64-linux.deb \ + https://github.com/CVCUDA/CV-CUDA/releases/download/v0.6.0-beta/cvcuda-python3.8-0.6.0_beta-cuda11-x86_64-linux.deb \ + https://github.com/CVCUDA/CV-CUDA/releases/download/v0.6.0-beta/cvcuda_cu11-0.6.0b0-cp310-cp310-linux_x86_64.whl \ -P /tmp/cvcuda && \ apt-get install -y /tmp/cvcuda/*.deb && \ pip3 install /tmp/cvcuda/*.whl diff --git a/samples/segmentation/python/main.py b/samples/segmentation/python/main.py index 7412e444d..02c8a9820 100644 --- a/samples/segmentation/python/main.py +++ b/samples/segmentation/python/main.py @@ -22,27 +22,27 @@ import logging import cvcuda import torch -from pathlib import Path -# Bring module folders from the samples directory into our path so that +# Bring the commons folder from the samples directory into our path so that # we can import modules from it. 
-samples_dir = Path(os.path.abspath(__file__)).parents[2] # samples/ -sys.path.insert(0, os.path.join(samples_dir, "")) +common_dir = os.path.join( + os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))), + "common", + "python", +) +sys.path.insert(0, common_dir) -from common.python.perf_utils import ( # noqa: E402 +from perf_utils import ( # noqa: E402 CvCudaPerf, get_default_arg_parser, parse_validate_default_args, ) -from common.python.torch_utils import ( # noqa: E402 - ImageBatchDecoderPyTorch, - ImageBatchEncoderPyTorch, -) - -from common.python.vpf_utils import ( # noqa: E402 - VideoBatchDecoderVPF, - VideoBatchEncoderVPF, +from nvcodec_utils import ( # noqa: E402 + VideoBatchDecoder, + VideoBatchEncoder, + ImageBatchDecoder, + ImageBatchEncoder, ) from pipelines import ( # noqa: E402 @@ -95,7 +95,7 @@ def run_sample( if os.path.splitext(input_path)[1] == ".jpg" or os.path.isdir(input_path): # Treat this as data modality of images - decoder = ImageBatchDecoderPyTorch( + decoder = ImageBatchDecoder( input_path, batch_size, device_id, @@ -103,16 +103,14 @@ def run_sample( cvcuda_perf, ) - encoder = ImageBatchEncoderPyTorch( + encoder = ImageBatchEncoder( output_dir, - fps=0, device_id=device_id, - cuda_ctx=cuda_ctx, cvcuda_perf=cvcuda_perf, ) else: # Treat this as data modality of videos - decoder = VideoBatchDecoderVPF( + decoder = VideoBatchDecoder( input_path, batch_size, device_id, @@ -120,7 +118,7 @@ def run_sample( cvcuda_perf, ) - encoder = VideoBatchEncoderVPF( + encoder = VideoBatchEncoder( output_dir, decoder.fps, device_id, diff --git a/samples/segmentation/python/model_inference.py b/samples/segmentation/python/model_inference.py index 8b271bb5f..84a3ee538 100644 --- a/samples/segmentation/python/model_inference.py +++ b/samples/segmentation/python/model_inference.py @@ -23,14 +23,16 @@ from torchvision.models import segmentation as segmentation_models import tensorrt as trt -from pathlib import Path - -# Bring module folders from the samples directory into our path so that +# Bring the commons folder from the samples directory into our path so that # we can import modules from it. 
-samples_dir = Path(os.path.abspath(__file__)).parents[2] # samples/ -sys.path.insert(0, os.path.join(samples_dir, "")) +common_dir = os.path.join( + os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))), + "common", + "python", +) +sys.path.insert(0, common_dir) -from common.python.trt_utils import ( # noqa: E402 +from trt_utils import ( # noqa: E402 convert_onnx_to_tensorrt, setup_tensort_bindings, ) diff --git a/samples/segmentation/python/triton_client.py b/samples/segmentation/python/triton_client.py index d902a2675..7802fec2d 100644 --- a/samples/segmentation/python/triton_client.py +++ b/samples/segmentation/python/triton_client.py @@ -41,14 +41,14 @@ parse_validate_default_args, ) -from common.python.torch_utils import ( # noqa: E402 - ImageBatchDecoderPyTorch, - ImageBatchEncoderPyTorch, +from common.python.nvcodec_utils import ( # noqa: E402 + VideoBatchDecoder, + VideoBatchEncoder, + ImageBatchDecoder, + ImageBatchEncoder, ) from common.python.vpf_utils import ( # noqa: E402 - VideoBatchDecoderVPF, - VideoBatchEncoderVPF, VideoBatchStreamingDecoderVPF, VideoBatchStreamingEncoderVPF, ) @@ -123,7 +123,7 @@ def run_sample( # docs_tag: begin_init_dataloader if os.path.splitext(input_path)[1] == ".jpg" or os.path.isdir(input_path): # Treat this as data modality of images - decoder = ImageBatchDecoderPyTorch( + decoder = ImageBatchDecoder( input_path, batch_size, device_id, @@ -131,11 +131,9 @@ def run_sample( cvcuda_perf, ) - encoder = ImageBatchEncoderPyTorch( + encoder = ImageBatchEncoder( output_dir, - fps=0, device_id=device_id, - cuda_ctx=cuda_ctx, cvcuda_perf=cvcuda_perf, ) else: @@ -163,7 +161,7 @@ def run_sample( decoder.decoder.fps, ) else: - decoder = VideoBatchDecoderVPF( + decoder = VideoBatchDecoder( input_path, batch_size, device_id, @@ -171,7 +169,7 @@ def run_sample( cvcuda_perf, ) - encoder = VideoBatchEncoderVPF( + encoder = VideoBatchEncoder( output_dir, decoder.fps, device_id, cuda_ctx, cvcuda_perf ) diff --git a/src/cvcuda/CMakeLists.txt b/src/cvcuda/CMakeLists.txt index 9da865081..4a21a4c56 100644 --- a/src/cvcuda/CMakeLists.txt +++ b/src/cvcuda/CMakeLists.txt @@ -33,6 +33,7 @@ set(CV_CUDA_OP_FILES OpRemap.cpp OpColorTwist.cpp OpCropFlipNormalizeReformat.cpp + OpHQResize.cpp OpNonMaximumSuppression.cpp OpReformat.cpp OpResize.cpp diff --git a/src/cvcuda/OpHQResize.cpp b/src/cvcuda/OpHQResize.cpp new file mode 100644 index 000000000..fd9c3ec28 --- /dev/null +++ b/src/cvcuda/OpHQResize.cpp @@ -0,0 +1,139 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "cvcuda/OpHQResize.h" + +#include "priv/OpHQResize.hpp" +#include "priv/SymbolVersioning.hpp" + +#include +#include +#include +#include + +namespace priv = cvcuda::priv; + +CVCUDA_DEFINE_API(0, 6, NVCVStatus, cvcudaHQResizeCreate, (NVCVOperatorHandle * handle)) +{ + return nvcv::ProtectCall( + [&] + { + if (handle == nullptr) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "Pointer to NVCVOperator handle must not be NULL"); + } + + *handle = reinterpret_cast(new priv::HQResize()); + }); +} + +CVCUDA_DEFINE_API(0, 6, NVCVStatus, cvcudaHQResizeTensorGetWorkspaceRequirements, + (NVCVOperatorHandle handle, int batchSize, const HQResizeTensorShapeI inputShape, + const HQResizeTensorShapeI outputShape, const NVCVInterpolationType minInterpolation, + const NVCVInterpolationType magInterpolation, bool antialias, const HQResizeRoiF *roi, + NVCVWorkspaceRequirements *reqOut)) +{ + if (!reqOut) + return NVCV_ERROR_INVALID_ARGUMENT; + + return nvcv::ProtectCall( + [&] + { + *reqOut = priv::ToDynamicRef(handle).getWorkspaceRequirements( + batchSize, inputShape, outputShape, minInterpolation, magInterpolation, antialias, roi); + }); +} + +CVCUDA_DEFINE_API(0, 6, NVCVStatus, cvcudaHQResizeTensorBatchGetWorkspaceRequirements, + (NVCVOperatorHandle handle, int batchSize, const HQResizeTensorShapesI inputShapes, + const HQResizeTensorShapesI outputShapes, const NVCVInterpolationType minInterpolation, + const NVCVInterpolationType magInterpolation, bool antialias, const HQResizeRoisF roi, + NVCVWorkspaceRequirements *reqOut)) +{ + if (!reqOut) + return NVCV_ERROR_INVALID_ARGUMENT; + + return nvcv::ProtectCall( + [&] + { + *reqOut = priv::ToDynamicRef(handle).getWorkspaceRequirements( + batchSize, inputShapes, outputShapes, minInterpolation, magInterpolation, antialias, roi); + }); +} + +CVCUDA_DEFINE_API(0, 6, NVCVStatus, cvcudaHQResizeGetMaxWorkspaceRequirements, + (NVCVOperatorHandle handle, int maxBatchSize, const HQResizeTensorShapeI maxShape, + NVCVWorkspaceRequirements *reqOut)) +{ + if (!reqOut) + return NVCV_ERROR_INVALID_ARGUMENT; + + return nvcv::ProtectCall( + [&] { *reqOut = priv::ToDynamicRef(handle).getWorkspaceRequirements(maxBatchSize, maxShape); }); +} + +CVCUDA_DEFINE_API(0, 6, NVCVStatus, cvcudaHQResizeSubmit, + (NVCVOperatorHandle handle, cudaStream_t stream, const NVCVWorkspace *ws, NVCVTensorHandle in, + NVCVTensorHandle out, const NVCVInterpolationType minInterpolation, + const NVCVInterpolationType magInterpolation, bool antialias, const HQResizeRoiF *roi)) +{ + if (!ws) + return NVCV_ERROR_INVALID_ARGUMENT; + + return nvcv::ProtectCall( + [&] + { + nvcv::TensorWrapHandle _in(in), _out(out); + priv::ToDynamicRef(handle)(stream, *ws, _in, _out, minInterpolation, magInterpolation, + antialias, roi); + }); +} + +CVCUDA_DEFINE_API(0, 6, NVCVStatus, cvcudaHQResizeImageBatchSubmit, + (NVCVOperatorHandle handle, cudaStream_t stream, const NVCVWorkspace *ws, NVCVImageBatchHandle in, + NVCVImageBatchHandle out, const NVCVInterpolationType minInterpolation, + const NVCVInterpolationType magInterpolation, bool antialias, const HQResizeRoisF roi)) +{ + if (!ws) + return NVCV_ERROR_INVALID_ARGUMENT; + + return nvcv::ProtectCall( + [&] + { + nvcv::ImageBatchVarShapeWrapHandle _in(in), _out(out); + priv::ToDynamicRef(handle)(stream, *ws, _in, _out, minInterpolation, magInterpolation, + antialias, roi); + }); +} + +CVCUDA_DEFINE_API(0, 6, NVCVStatus, cvcudaHQResizeTensorBatchSubmit, + (NVCVOperatorHandle handle, cudaStream_t stream, const NVCVWorkspace *ws, 
NVCVTensorBatchHandle in, + NVCVTensorBatchHandle out, const NVCVInterpolationType minInterpolation, + const NVCVInterpolationType magInterpolation, bool antialias, const HQResizeRoisF roi)) +{ + if (!ws) + return NVCV_ERROR_INVALID_ARGUMENT; + + return nvcv::ProtectCall( + [&] + { + nvcv::TensorBatchWrapHandle _in(in), _out(out); + priv::ToDynamicRef(handle)(stream, *ws, _in, _out, minInterpolation, magInterpolation, + antialias, roi); + }); +} diff --git a/src/cvcuda/include/cvcuda/OpErase.h b/src/cvcuda/include/cvcuda/OpErase.h index d64425d28..b3b9a3bb1 100644 --- a/src/cvcuda/include/cvcuda/OpErase.h +++ b/src/cvcuda/include/cvcuda/OpErase.h @@ -102,25 +102,25 @@ CVCUDA_PUBLIC NVCVStatus cvcudaEraseCreate(NVCVOperatorHandle *handle, int32_t m * * anchor Tensor * - * Must be 'N' (dim = 1) with N = number of eraing area. + * Must be 'N' (dim = 1) with N = number of erasing area. * Data Type must be 32bit Signed. * DataType must be TYPE_2S32. * * erasing Tensor * - * Must be 'N' (dim = 1) with N = number of eraing area. + * Must be 'N' (dim = 1) with N = number of erasing area. * Data Type must be 32bit Signed. * DataType must be TYPE_3S32. * * imgIdx Tensor * - * Must be 'N' (dim = 1) with N = number of eraing area. + * Must be 'N' (dim = 1) with N = number of erasing area. * Data Type must be 32bit Signed. * DataType must be TYPE_S32. * * values Tensor * - * Must be 'N' (dim = 1) with W = number of eraing area * 4. + * Must be 'N' (dim = 1) with W = number of erasing area * 4. * Data Type must be 32bit Float. * DataType must be TYPE_F32. * @@ -133,9 +133,9 @@ CVCUDA_PUBLIC NVCVStatus cvcudaEraseCreate(NVCVOperatorHandle *handle, int32_t m * * @param [out] out output tensor / image batch. * - * @param [in] anchor an array of size num_erasing_area that gives the x coordinate and y coordinate of the top left point in the eraseing areas. + * @param [in] anchor an array of size num_erasing_area that gives the x coordinate and y coordinate of the top left point in the erasing areas. * - * @param [in] eraisng an array of size num_erasing_area that gives the widths of the eraseing areas, the heights of the eraseing areas and + * @param [in] erasing an array of size num_erasing_area that gives the widths of the erasing areas, the heights of the erasing areas and * integers in range 0-15, each of whose bits indicates whether or not the corresponding channel need to be erased. * * @param [in] values an array of size num_erasing_area*4 that gives the filling value for each erase area. diff --git a/src/cvcuda/include/cvcuda/OpHQResize.h b/src/cvcuda/include/cvcuda/OpHQResize.h new file mode 100644 index 000000000..d6715e138 --- /dev/null +++ b/src/cvcuda/include/cvcuda/OpHQResize.h @@ -0,0 +1,406 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/** + * @file OpHQResize.h + * + * @brief Defines types and functions to handle the HQResize operation. + * @defgroup NVCV_C_ALGORITHM_HQ_RESIZE HQ Resize + * @{ + */ + +#ifndef CVCUDA_HQ_RESIZE_H +#define CVCUDA_HQ_RESIZE_H + +#include "Operator.h" +#include "Types.h" +#include "Workspace.h" +#include "detail/Export.h" + +#include +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" +{ +#endif + +#define NVCV_HQ_RESIZE_MAX_RESIZED_NDIM (3) + +typedef struct +{ + int32_t extent[NVCV_HQ_RESIZE_MAX_RESIZED_NDIM]; + int32_t ndim; + int32_t numChannels; +} HQResizeTensorShapeI; + +typedef struct +{ + HQResizeTensorShapeI *shape; + int32_t size; // the number of valid elements in the `shape` array + int32_t ndim; // the number of spatial extents in each `shapes` element + int32_t numChannels; // the number of innermost channels, -1 if they differ between samples +} HQResizeTensorShapesI; + +typedef struct +{ + float lo[NVCV_HQ_RESIZE_MAX_RESIZED_NDIM]; + float hi[NVCV_HQ_RESIZE_MAX_RESIZED_NDIM]; +} HQResizeRoiF; + +typedef struct +{ + int32_t size; // the number of valid elements in the `roi` array + int32_t ndim; // the number of valid extents in each `roi` element + HQResizeRoiF *roi; +} HQResizeRoisF; + +/** Constructs an instance of the HQResize operator. + * + * @param [out] handle Where the operator instance handle will be written to. + * + Must not be NULL. + * + * @retval #NVCV_ERROR_INVALID_ARGUMENT Handle is null. + * @retval #NVCV_ERROR_OUT_OF_MEMORY Not enough memory to create the operator. + * @retval #NVCV_SUCCESS Operation executed successfully. + */ +CVCUDA_PUBLIC NVCVStatus cvcudaHQResizeCreate(NVCVOperatorHandle *handle); + +/** Calculates the workspace requirements for Tensor input/output. + * + * @param [in] handle Handle to the operator. + * + Must not be NULL. + * + * @param [in] batchSize The number of samples in the tensor (the size of N extent). + * + * @param [in] inputShape The HW or DHW extents of the input tensor, the number of resized extents, + * and the number of channels. + * Supported numbers of resized extents are 2 and 3. + * For ndim = 2, a tensor of layout (N)HW(C) is expected to be processed, + * for ndim = 3, a tensor of layout (N)DHW(C) is expected to be processed. + * + * @param [in] outputShape The HW or DHW extents of the output tensor and the number of channels. + * The number of extents and channels must be the same as in inputShape. + * + * @param [in] minInterpolation The type of interpolation to be used when downsampling an extent + * (i.e. when output extent is smaller than the corresponding input extent). + * + * @param [in] magInterpolation The type of interpolation to be used when upsampling an extent + * (i.e. when output extent is bigger than the corresponding input extent). + * + * @param [in] antialias Whether to use antialiasing when downsampling. + * + * @param [in] roi Optional region of interest for the input, in (D)HW layout. + * + * @param [out] reqOut The pointer for workspace requirements struct that will be filled by the call. + * + * @retval #NVCV_ERROR_INVALID_ARGUMENT Handle is null or one of the arguments is out of range. + * @retval #NVCV_SUCCESS Operation executed successfully.
+ */ +CVCUDA_PUBLIC NVCVStatus cvcudaHQResizeTensorGetWorkspaceRequirements(NVCVOperatorHandle handle, int batchSize, + const HQResizeTensorShapeI inputShape, + const HQResizeTensorShapeI outputShape, + const NVCVInterpolationType minInterpolation, + const NVCVInterpolationType magInterpolation, + bool antialias, const HQResizeRoiF *roi, + NVCVWorkspaceRequirements *reqOut); + +/** Calculates the workspace requirements for TensorBatch/ImageBatchVarShape input/output. + * + * @param [in] handle Handle to the operator. + * + Must not be NULL. + * + * @param [in] batchSize The number of samples in the tensor batch/image batch. + * + * @param [in] inputShapes The list of shapes (HW or DHW extents) in the input batch, + * the number of channels, and the number of extents to be resampled (2 or 3). + * The number of channels can be specified once for the whole batch or each sample + * separately. + * + * @param [in] outputShapes The list of shapes (HW or DHW extents) in the output batch, + * the number of channels, and the number of extents to be resampled (2 or 3). + * The number of channels must match the number of channels in the input. + * + * @param [in] minInterpolation The type of interpolation to be used when downsampling an extent + * (i.e. when output extent is smaller than the corresponding input extent). + * + * @param [in] magInterpolation The type of interpolation to be used when upsampling an extent + * (i.e. when output extent is bigger than the corresponding input extent). + * + * @param [in] antialias Whether to use antialiasing when downsampling. + * + * @param [in] roi Optional region of interest for the input, in (D)HW layout. The roi can be described + * as a list for each sample or contain a single element to be used for all the samples + * in the batch. + * + * @param [out] reqOut The pointer for workspace requirements struct that will be filled by the call. + * + * @retval #NVCV_ERROR_INVALID_ARGUMENT Handle is null or one of the arguments is out of range. + * @retval #NVCV_SUCCESS Operation executed successfully. + */ +CVCUDA_PUBLIC NVCVStatus cvcudaHQResizeTensorBatchGetWorkspaceRequirements(NVCVOperatorHandle handle, int batchSize, + const HQResizeTensorShapesI inputShapes, + const HQResizeTensorShapesI outputShapes, + const NVCVInterpolationType minInterpolation, + const NVCVInterpolationType magInterpolation, + bool antialias, const HQResizeRoisF roi, + NVCVWorkspaceRequirements *reqOut); + +/** Calculates the upper bound for workspace requirements. The workspace that meets the returned + * requirements can be used with any call to the operator as long as: the input dimensionality + * (2 or 3) matches, the number of samples does not exceed the maxBatchSize, and all the input + * and output shapes do not exceed the maxShape in any extent (including number of channels). + * + * @param [in] handle Handle to the operator. + * + Must not be NULL. + * + * @param [in] maxBatchSize The maximal number of samples in the tensor/tensor batch/image batch. + * + * @param [in] maxShape The maximal shape of any input or output sample. The number of channels must + * be an upper bound for number of channels in any sample. + * + * @param [out] reqOut The pointer for workspace requirements struct that will be filled by the call. + * + * @retval #NVCV_ERROR_INVALID_ARGUMENT Handle is null or one of the arguments is out of range. + * @retval #NVCV_SUCCESS Operation executed successfully.
+ */ +CVCUDA_PUBLIC NVCVStatus cvcudaHQResizeGetMaxWorkspaceRequirements(NVCVOperatorHandle handle, int maxBatchSize, + const HQResizeTensorShapeI maxShape, + NVCVWorkspaceRequirements *reqOut); + +/** Executes the HQResize operation on the given cuda stream. This operation does not wait for completion. + * + * Limitations: + * + * Input, Output: + * Data Layout: NVCV_TENSOR_[N][D]HW[C] + * + * Number of channels: Positive integer + * + * Data Type | Allowed + * -------------- | ------------- + * 8bit Unsigned | Yes + * 8bit Signed | No + * 16bit Unsigned | Yes + * 16bit Signed | Yes + * 32bit Unsigned | No + * 32bit Signed | No + * 32bit Float | Yes + * 64bit Float | No + * + * Input/Output dependency + * Property | Input == Output + * -------------- | ------------- + * Data Layout | Yes + * Data Type | No (output can be the same or float32). + * Channels | Yes + * Width | No + * Height | No + * Samples | Yes + * + * + * @param [in] handle Handle to the operator. + * + Must not be NULL. + * @param [in] stream Handle to a valid CUDA stream. + * + * @param [in] workspace The workspace with memory for intermediate results. The requirements for a given input + * can be acquired with a call to `cvcudaHQResizeTensorGetWorkspaceRequirements` or + * `cvcudaHQResizeGetMaxWorkspaceRequirements`. + * + * @param [in] in The input tensor with (N)(D)HW(C) layout. + * + * @param [in] out The output tensor with the same layout, number of samples and channels as the in tensor. + * + * @param [in] minInterpolation The type of interpolation to be used when downsampling an extent + * (i.e. when output extent is smaller than the corresponding input extent). + * Supported interpolation formats are: `NVCV_INTERP_NEAREST`, `NVCV_INTERP_LINEAR`, + * `NVCV_INTERP_CUBIC`, `NVCV_INTERP_LANCZOS`, and `NVCV_INTERP_GAUSSIAN`. + * + * @param [in] magInterpolation The type of interpolation to be used when upsampling an extent + * (i.e. when output extent is bigger than the corresponding input extent). + * Supported interpolation formats are: `NVCV_INTERP_NEAREST`, `NVCV_INTERP_LINEAR`, + * `NVCV_INTERP_CUBIC`, `NVCV_INTERP_LANCZOS`, and `NVCV_INTERP_GAUSSIAN`. + * + * @param [in] antialias Whether to use antialiasing when downsampling. The value is ignored for + * `minInterpolation = NVCV_INTERP_NEAREST`. + * + * @param [in] roi Optional region of interest for the input, in (D)HW layout. + * If, for some axis, the low bound is bigger than the high bound, + * the image is flipped in that dimension. + * + * @retval #NVCV_ERROR_INVALID_ARGUMENT Some parameter is outside valid range. + * @retval #NVCV_ERROR_INTERNAL Internal error in the operator, invalid types passed in. + * @retval #NVCV_SUCCESS Operation executed successfully. + */ +CVCUDA_PUBLIC NVCVStatus cvcudaHQResizeSubmit(NVCVOperatorHandle handle, cudaStream_t stream, + const NVCVWorkspace *workspace, NVCVTensorHandle in, NVCVTensorHandle out, + const NVCVInterpolationType minInterpolation, + const NVCVInterpolationType magInterpolation, bool antialias, + const HQResizeRoiF *roi); + +/** Executes the HQResize operation on the given cuda stream. This operation does not wait for completion. 
+ * + * Limitations: + * + * Input, Output: + * Data Layout: NVCV_TENSOR_HWC + * + * Number of channels: [1, 2, 3, 4] + * + * Data Type | Allowed + * -------------- | ------------- + * 8bit Unsigned | Yes + * 8bit Signed | No + * 16bit Unsigned | Yes + * 16bit Signed | Yes + * 32bit Unsigned | No + * 32bit Signed | No + * 32bit Float | Yes + * 64bit Float | No + * + * Input/Output dependency + * Property | Input == Output + * -------------- | ------------- + * Data Layout | Yes + * Data Type | No (output can be the same or float32). + * Channels | Yes + * Width | No + * Height | No + * Samples | Yes + * + * + * @param [in] handle Handle to the operator. + * + Must not be NULL. + * @param [in] stream Handle to a valid CUDA stream. + * + * @param [in] workspace The workspace with memory for intermediate results. The requirements for a given input + * can be acquired with a call to `cvcudaHQResizeTensorBatchGetWorkspaceRequirements` or + * `cvcudaHQResizeGetMaxWorkspaceRequirements`. + * + * @param [in] in The ImageBatchVarShape batch of input samples. + * + * @param [in] out The ImageBatchVarShape batch of output samples. + * + * @param [in] minInterpolation The type of interpolation to be used when downsampling an extent + * (i.e. when output extent is smaller than the corresponding input extent). + * Supported interpolation formats are: `NVCV_INTERP_NEAREST`, `NVCV_INTERP_LINEAR`, + * `NVCV_INTERP_CUBIC`, `NVCV_INTERP_LANCZOS`, and `NVCV_INTERP_GAUSSIAN`. + * + * @param [in] magInterpolation The type of interpolation to be used when upsampling an extent + * (i.e. when output extent is bigger than the corresponding input extent). + * Supported interpolation formats are: `NVCV_INTERP_NEAREST`, `NVCV_INTERP_LINEAR`, + * `NVCV_INTERP_CUBIC`, `NVCV_INTERP_LANCZOS`, and `NVCV_INTERP_GAUSSIAN`. + * + * @param [in] antialias Whether to use antialiasing when downsampling. The value is ignored for + * `minInterpolation = NVCV_INTERP_NEAREST`. + * + * @param [in] roi Optional region of interest for the input, in (D)HW layout. The roi can be described + * as a list of elements for each sample or a list containing a single element to be used + * for all the samples in the batch. If, for some axis, the low bound is bigger than + * the high bound, the image is flipped in that dimension. + * + * @retval #NVCV_ERROR_INVALID_ARGUMENT Some parameter is outside valid range. + * @retval #NVCV_ERROR_INTERNAL Internal error in the operator, invalid types passed in. + * @retval #NVCV_SUCCESS Operation executed successfully. + */ +CVCUDA_PUBLIC NVCVStatus cvcudaHQResizeImageBatchSubmit(NVCVOperatorHandle handle, cudaStream_t stream, + const NVCVWorkspace *workspace, NVCVImageBatchHandle in, + NVCVImageBatchHandle out, + const NVCVInterpolationType minInterpolation, + const NVCVInterpolationType magInterpolation, bool antialias, + const HQResizeRoisF roi); + +/** Executes the HQResize operation on the given cuda stream. This operation does not wait for completion. + * + * Limitations: + * + * Input, Output: + * Data Layout: NVCV_TENSOR_[D]HW[C] + * + * Number of channels: Positive integer + * + * Data Type | Allowed + * -------------- | ------------- + * 8bit Unsigned | Yes + * 8bit Signed | No + * 16bit Unsigned | Yes + * 16bit Signed | Yes + * 32bit Unsigned | No + * 32bit Signed | No + * 32bit Float | Yes + * 64bit Float | No + * + * Input/Output dependency + * Property | Input == Output + * -------------- | ------------- + * Data Layout | Yes + * Data Type | No (output can be the same or float32). 
+ * Channels | Yes + * Width | No + * Height | No + * Samples | Yes + * + * + * @param [in] handle Handle to the operator. + * + Must not be NULL. + * @param [in] stream Handle to a valid CUDA stream. + * + * @param [in] workspace The workspace with memory for intermediate results. The requirements for a given input + * can be acquired with a call to `cvcudaHQResizeTensorBatchGetWorkspaceRequirements` or + * `cvcudaHQResizeGetMaxWorkspaceRequirements`. + * + * @param [in] in The TensorBatch of input samples. + * + * @param [in] out The TensorBatch batch of output samples. + * + * @param [in] minInterpolation The type of interpolation to be used when downsampling an extent + * (i.e. when output extent is smaller than the corresponding input extent). + * Supported interpolation formats are: `NVCV_INTERP_NEAREST`, `NVCV_INTERP_LINEAR`, + * `NVCV_INTERP_CUBIC`, `NVCV_INTERP_LANCZOS`, and `NVCV_INTERP_GAUSSIAN`. + * + * @param [in] magInterpolation The type of interpolation to be used when upsampling an extent + * (i.e. when output extent is bigger than the corresponding input extent). + * Supported interpolation formats are: `NVCV_INTERP_NEAREST`, `NVCV_INTERP_LINEAR`, + * `NVCV_INTERP_CUBIC`, `NVCV_INTERP_LANCZOS`, and `NVCV_INTERP_GAUSSIAN`. + * + * @param [in] antialias Whether to use antialiasing when downsampling. The value is ignored for + * `minInterpolation = NVCV_INTERP_NEAREST`. + * + * @param [in] roi Optional region of interest for the input, in (D)HW layout. The roi can be described + * as a list of elements for each sample or a list containing a single element to be used + * for all the samples in the batch. If, for some axis, the low bound is bigger than + * the high bound, the image is flipped in that dimension. + * + * @retval #NVCV_ERROR_INVALID_ARGUMENT Some parameter is outside valid range. + * @retval #NVCV_ERROR_INTERNAL Internal error in the operator, invalid types passed in. + * @retval #NVCV_SUCCESS Operation executed successfully. + */ +CVCUDA_PUBLIC NVCVStatus cvcudaHQResizeTensorBatchSubmit(NVCVOperatorHandle handle, cudaStream_t stream, + const NVCVWorkspace *workspace, NVCVTensorBatchHandle in, + NVCVTensorBatchHandle out, + const NVCVInterpolationType minInterpolation, + const NVCVInterpolationType magInterpolation, bool antialias, + const HQResizeRoisF roi); + +#ifdef __cplusplus +} +#endif + +#endif /* CVCUDA_HQ_RESIZE_H */ diff --git a/src/cvcuda/include/cvcuda/OpHQResize.hpp b/src/cvcuda/include/cvcuda/OpHQResize.hpp new file mode 100644 index 000000000..8e929bc5f --- /dev/null +++ b/src/cvcuda/include/cvcuda/OpHQResize.hpp @@ -0,0 +1,154 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file OpHQResize.hpp + * + * @brief Defines the public C++ Class for the HQResize operation. 
+ * @defgroup NVCV_CPP_ALGORITHM_HQ_RESIZE HQ Resize + * @{ + */ + +#ifndef CVCUDA_HQ_RESIZE_HPP +#define CVCUDA_HQ_RESIZE_HPP + +#include "IOperator.hpp" +#include "OpHQResize.h" +#include "Workspace.hpp" + +#include +#include +#include +#include +#include +#include + +namespace cvcuda { + +class HQResize final : public IOperator +{ +public: + explicit HQResize(); + + ~HQResize(); + + WorkspaceRequirements getWorkspaceRequirements(int batchSize, const HQResizeTensorShapeI inputShape, + const HQResizeTensorShapeI outputShape, + const NVCVInterpolationType minInterpolation, + const NVCVInterpolationType magInterpolation, bool antialias, + const HQResizeRoiF *roi = nullptr); + + WorkspaceRequirements getWorkspaceRequirements(int batchSize, HQResizeTensorShapesI inputShapes, + const HQResizeTensorShapesI outputShapes, + const NVCVInterpolationType minInterpolation, + const NVCVInterpolationType magInterpolation, bool antialias, + const HQResizeRoisF roi = {}); + + WorkspaceRequirements getWorkspaceRequirements(int maxBatchSize, const HQResizeTensorShapeI maxShape); + + void operator()(cudaStream_t stream, const Workspace &ws, const nvcv::Tensor &in, const nvcv::Tensor &out, + const NVCVInterpolationType minInterpolation, const NVCVInterpolationType magInterpolation, + bool antialias = false, const HQResizeRoiF *roi = nullptr); + + void operator()(cudaStream_t stream, const Workspace &ws, const nvcv::ImageBatch &in, const nvcv::ImageBatch &out, + const NVCVInterpolationType minInterpolation, const NVCVInterpolationType magInterpolation, + bool antialias = false, const HQResizeRoisF roi = {}); + + void operator()(cudaStream_t stream, const Workspace &ws, const nvcv::TensorBatch &in, const nvcv::TensorBatch &out, + const NVCVInterpolationType minInterpolation, const NVCVInterpolationType magInterpolation, + bool antialias = false, const HQResizeRoisF roi = {}); + + virtual NVCVOperatorHandle handle() const noexcept override; + +private: + NVCVOperatorHandle m_handle; +}; + +inline HQResize::HQResize() +{ + nvcv::detail::CheckThrow(cvcudaHQResizeCreate(&m_handle)); + assert(m_handle); +} + +inline HQResize::~HQResize() +{ + nvcvOperatorDestroy(m_handle); +} + +inline WorkspaceRequirements HQResize::getWorkspaceRequirements(int batchSize, const HQResizeTensorShapeI inputShape, + const HQResizeTensorShapeI outputShape, + const NVCVInterpolationType minInterpolation, + const NVCVInterpolationType magInterpolation, + bool antialias, const HQResizeRoiF *roi) +{ + WorkspaceRequirements req{}; + nvcv::detail::CheckThrow(cvcudaHQResizeTensorGetWorkspaceRequirements( + m_handle, batchSize, inputShape, outputShape, minInterpolation, magInterpolation, antialias, roi, &req)); + return req; +} + +inline WorkspaceRequirements HQResize::getWorkspaceRequirements(int batchSize, const HQResizeTensorShapesI inputShapes, + const HQResizeTensorShapesI outputShapes, + const NVCVInterpolationType minInterpolation, + const NVCVInterpolationType magInterpolation, + bool antialias, const HQResizeRoisF roi) +{ + WorkspaceRequirements req{}; + nvcv::detail::CheckThrow(cvcudaHQResizeTensorBatchGetWorkspaceRequirements( + m_handle, batchSize, inputShapes, outputShapes, minInterpolation, magInterpolation, antialias, roi, &req)); + return req; +} + +inline WorkspaceRequirements HQResize::getWorkspaceRequirements(int maxBatchSize, const HQResizeTensorShapeI maxShape) +{ + WorkspaceRequirements req{}; + nvcv::detail::CheckThrow(cvcudaHQResizeGetMaxWorkspaceRequirements(m_handle, maxBatchSize, maxShape, &req)); + return req; +} + 
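
Taken together, the requirement queries above and the operator() overloads defined below give the typical call sequence for the C++ wrapper. A minimal sketch, assuming a helper `AllocateWorkspace` that turns the returned requirements into a `cvcuda::Workspace` (the actual lifetime helper lives in Workspace.hpp and may differ in name); `ResizeOnce` and its parameters are local to the sketch.

```cpp
// Sketch only: typical call order for the C++ wrapper. AllocateWorkspace is an assumed
// helper that owns the host/pinned/cuda buffers described by the requirements.
#include <cvcuda/OpHQResize.hpp>
#include <cuda_runtime.h>
#include <nvcv/Tensor.hpp>

cvcuda::Workspace AllocateWorkspace(const cvcuda::WorkspaceRequirements &req); // assumed helper

void ResizeOnce(cudaStream_t stream, const nvcv::Tensor &src, const nvcv::Tensor &dst, int batchSize,
                const HQResizeTensorShapeI &srcShape, const HQResizeTensorShapeI &dstShape)
{
    cvcuda::HQResize resize;

    // Scratch memory needed for exactly this input/output pair.
    cvcuda::WorkspaceRequirements req = resize.getWorkspaceRequirements(
        batchSize, srcShape, dstShape, NVCV_INTERP_LINEAR /*when shrinking*/, NVCV_INTERP_CUBIC /*when growing*/,
        /*antialias=*/true);

    cvcuda::Workspace ws = AllocateWorkspace(req);

    // Asynchronous: the resize is enqueued on `stream`; nothing is synchronized here.
    resize(stream, ws, src, dst, NVCV_INTERP_LINEAR, NVCV_INTERP_CUBIC, /*antialias=*/true);
}
```
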
+inline void HQResize::operator()(cudaStream_t stream, const Workspace &ws, const nvcv::Tensor &in, + const nvcv::Tensor &out, const NVCVInterpolationType minInterpolation, + const NVCVInterpolationType magInterpolation, bool antialias, const HQResizeRoiF *roi) +{ + nvcv::detail::CheckThrow(cvcudaHQResizeSubmit(m_handle, stream, &ws, in.handle(), out.handle(), minInterpolation, + magInterpolation, antialias, roi)); +} + +inline void HQResize::operator()(cudaStream_t stream, const Workspace &ws, const nvcv::ImageBatch &in, + const nvcv::ImageBatch &out, const NVCVInterpolationType minInterpolation, + const NVCVInterpolationType magInterpolation, bool antialias, const HQResizeRoisF roi) +{ + nvcv::detail::CheckThrow(cvcudaHQResizeImageBatchSubmit(m_handle, stream, &ws, in.handle(), out.handle(), + minInterpolation, magInterpolation, antialias, roi)); +} + +inline void HQResize::operator()(cudaStream_t stream, const Workspace &ws, const nvcv::TensorBatch &in, + const nvcv::TensorBatch &out, const NVCVInterpolationType minInterpolation, + const NVCVInterpolationType magInterpolation, bool antialias, const HQResizeRoisF roi) +{ + nvcv::detail::CheckThrow(cvcudaHQResizeTensorBatchSubmit(m_handle, stream, &ws, in.handle(), out.handle(), + minInterpolation, magInterpolation, antialias, roi)); +} + +inline NVCVOperatorHandle HQResize::handle() const noexcept +{ + return m_handle; +} + +} // namespace cvcuda + +#endif // CVCUDA_HQ_RESIZE_HPP diff --git a/src/cvcuda/include/cvcuda/OpLabel.h b/src/cvcuda/include/cvcuda/OpLabel.h index ad0b40aa0..77f620a52 100644 --- a/src/cvcuda/include/cvcuda/OpLabel.h +++ b/src/cvcuda/include/cvcuda/OpLabel.h @@ -58,20 +58,20 @@ CVCUDA_PUBLIC NVCVStatus cvcudaLabelCreate(NVCVOperatorHandle *handle); * inside the input tensor, yielding labels in the output tensor with same rank and shape. Labels are numbers * uniquely assigned to each connected region, for example: * - * Input 0 0 0 0 Output 0 0 0 0 - * image: 1 1 0 1 labels: 4 4 0 7 - * 0 0 0 1 0 0 0 7 - * 0 1 1 1 0 7 7 7 + * Input 0 0 0 0 Output 0 0 0 0 + * image: 1 1 0 1 labels: 4 4 0 7 + * 0 0 0 1 0 0 0 7 + * 0 1 1 1 0 7 7 7 * * In the above example, three distinct regions were identified and labeled as 0, 4 and 7. Note that the region * labeled with 0 remained with the same value as the input, and label numbers 4 and 7 were assigned in * non-consecutive ordering. Some values in the input may be ignored, i.e. not labeled, using the \ref bgLabel * tensor to define those values as background, which usually is set to the value zero. For example: * - * Input 0 0 1 0 Output 0 0 2 3 Zeros in 0 0 2 0 - * image: 0 1 0 1 labels: 0 5 6 7 bgLabel: 0 5 0 7 - * 0 0 1 1 0 0 7 7 0 0 7 7 - * 0 1 1 1 0 7 7 7 0 7 7 7 + * Input 0 0 1 0 Output 0 0 2 3 Zeros in 0 0 2 0 + * image: 0 1 0 1 labels: 0 5 6 7 bgLabel: 0 5 0 7 + * 0 0 1 1 0 0 7 7 0 0 7 7 + * 0 1 1 1 0 7 7 7 0 7 7 7 * * Limitations: * @@ -106,7 +106,6 @@ CVCUDA_PUBLIC NVCVStatus cvcudaLabelCreate(NVCVOperatorHandle *handle); * 64bit Float | No * * Input/Output dependency - * * Property | Input == Output * -------------- | ------------- * Data Layout | Yes diff --git a/src/cvcuda/include/cvcuda/OpMorphology.h b/src/cvcuda/include/cvcuda/OpMorphology.h index 35c890020..3ab9bd85d 100644 --- a/src/cvcuda/include/cvcuda/OpMorphology.h +++ b/src/cvcuda/include/cvcuda/OpMorphology.h @@ -191,11 +191,11 @@ CVCUDA_PUBLIC NVCVStatus cvcudaMorphologySubmit(NVCVOperatorHandle handle, cudaS * * @param [in] morphType Type of operation to perform (Erode/Dilate). \ref NVCVMorphologyType. 
* - * @param [in] masks 1D Tensor of NVCV_DATA_TYPE_2S32 mask W/H pairs, where the 1st pair is for image 0, second for image 1, etc. + * @param [in, out] masks 1D Tensor of NVCV_DATA_TYPE_2S32 mask W/H pairs, where the 1st pair is for image 0, second for image 1, etc. * Setting values to -1,-1 will create a default 3,3 mask. * (Note after the operation the tensor values may be modified by kernel) * - * @param [in] anchors 1D Tensor of NVCV_DATA_TYPE_2S32 X/Y pairs, where the 1st pair is for image 0, second for image 1, etc + * @param [in, out] anchors 1D Tensor of NVCV_DATA_TYPE_2S32 X/Y pairs, where the 1st pair is for image 0, second for image 1, etc * Setting values to -1,-1 will anchor the kernel at the center. * (Note after the operation the tensor values may be modified by kernel) * diff --git a/src/cvcuda/include/cvcuda/OpNormalize.h b/src/cvcuda/include/cvcuda/OpNormalize.h index 2830578e2..d20eed11e 100644 --- a/src/cvcuda/include/cvcuda/OpNormalize.h +++ b/src/cvcuda/include/cvcuda/OpNormalize.h @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -82,7 +82,7 @@ CVCUDA_PUBLIC NVCVStatus cvcudaNormalizeCreate(NVCVOperatorHandle *handle); * Limitations: * * Input: - * Data Layout: [kNHWC, kHWC, kNCHW, KCHW] + * Data Layout: [kNHWC, kHWC] * Channels: [1, 3, 4] * * Data Type | Allowed @@ -97,7 +97,7 @@ CVCUDA_PUBLIC NVCVStatus cvcudaNormalizeCreate(NVCVOperatorHandle *handle); * 64bit Float | No * * Output: - * Data Layout: [kNHWC, kHWC, kNCHW, KCHW] + * Data Layout: [kNHWC, kHWC] * Channels: [1, 3, 4] * * Data Type | Allowed diff --git a/src/cvcuda/include/cvcuda/Types.h b/src/cvcuda/include/cvcuda/Types.h index 17bb8f62e..37eb2e0cf 100644 --- a/src/cvcuda/include/cvcuda/Types.h +++ b/src/cvcuda/include/cvcuda/Types.h @@ -43,6 +43,7 @@ typedef enum NVCV_INTERP_CUBIC = 2, NVCV_INTERP_AREA = 3, NVCV_INTERP_LANCZOS = 4, + NVCV_INTERP_GAUSSIAN = 5, NVCV_INTERP_MAX = 7, NVCV_WARP_INVERSE_MAP = 16, NVCV_INTERP_HAMMING = 17, diff --git a/src/cvcuda/include/cvcuda/Workspace.hpp b/src/cvcuda/include/cvcuda/Workspace.hpp index 65a9ddfd7..e878ff00e 100644 --- a/src/cvcuda/include/cvcuda/Workspace.hpp +++ b/src/cvcuda/include/cvcuda/Workspace.hpp @@ -64,6 +64,13 @@ inline NVCVWorkspaceRequirements MaxWorkspaceReq(const WorkspaceRequirements &a, return ret; } +inline void AlignUp(WorkspaceRequirements &ws) +{ + ws.hostMem.size = nvcv::detail::AlignUp(ws.hostMem.size, ws.hostMem.alignment); + ws.pinnedMem.size = nvcv::detail::AlignUp(ws.pinnedMem.size, ws.pinnedMem.alignment); + ws.cudaMem.size = nvcv::detail::AlignUp(ws.cudaMem.size, ws.cudaMem.alignment); +} + /** A helper class that manages the lifetime of resources stored in a Workspace structure. * * This class works in a way similar to unique_ptr with a custom deleter. 
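
Before the private implementation below, it is worth noting how the two small helpers added to Workspace.hpp are meant to combine: requirements from several prospective calls can be merged with `MaxWorkspaceReq` and then aligned once, so a single allocation can back all of them. A short sketch, assuming `MaxWorkspaceReq` takes two requirement structs and that both helpers live in the `cvcuda` namespace, as the surrounding header suggests.

```cpp
// Sketch only: merging the requirements of two prospective HQResize calls so that one
// workspace allocation can serve both. Namespace and arity of the helpers are assumed.
#include <cvcuda/OpHQResize.hpp>
#include <cvcuda/Workspace.hpp>

cvcuda::WorkspaceRequirements CombinedRequirements(cvcuda::HQResize &resize, int batchSize,
                                                   const HQResizeTensorShapeI &inA,
                                                   const HQResizeTensorShapeI &outA,
                                                   const HQResizeTensorShapeI &inB,
                                                   const HQResizeTensorShapeI &outB)
{
    auto reqA = resize.getWorkspaceRequirements(batchSize, inA, outA, NVCV_INTERP_LINEAR, NVCV_INTERP_LINEAR,
                                                /*antialias=*/true);
    auto reqB = resize.getWorkspaceRequirements(batchSize, inB, outB, NVCV_INTERP_LINEAR, NVCV_INTERP_LINEAR,
                                                /*antialias=*/true);

    // Per-kind (host, pinned, cuda) maximum of the two requirement sets.
    cvcuda::WorkspaceRequirements req = cvcuda::MaxWorkspaceReq(reqA, reqB);

    // Round each size up to its own alignment, using the AlignUp helper added in the hunk above.
    cvcuda::AlignUp(req);
    return req;
}
```
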
diff --git a/src/cvcuda/priv/CMakeLists.txt b/src/cvcuda/priv/CMakeLists.txt index cd3904c41..6b28a39f7 100644 --- a/src/cvcuda/priv/CMakeLists.txt +++ b/src/cvcuda/priv/CMakeLists.txt @@ -32,6 +32,7 @@ set(CV_CUDA_PRIV_OP_FILES OpRemap.cu OpColorTwist.cu OpCropFlipNormalizeReformat.cu + OpHQResize.cu OpNonMaximumSuppression.cu OpReformat.cpp OpResize.cpp diff --git a/src/cvcuda/priv/OpHQResize.cu b/src/cvcuda/priv/OpHQResize.cu new file mode 100644 index 000000000..e7b924d81 --- /dev/null +++ b/src/cvcuda/priv/OpHQResize.cu @@ -0,0 +1,2788 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "OpHQResize.hpp" +#include "cvcuda/Workspace.hpp" + +#include "OpHQResizeBatchWrap.cuh" +#include "OpHQResizeFilter.cuh" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace { + +namespace cuda = nvcv::cuda; +namespace filter = cvcuda::priv::hq_resize::filter; +namespace batch_wrapper = cvcuda::priv::hq_resize::batch_wrapper; + +template +using Vec = typename cuda::MakeType; + +template +using VecI = Vec; + +template +using VecF = Vec; + +namespace utils { + +template>> +inline std::enable_if_t>, int64_t> Volume(const T &v) +{ + int64_t vol = 1; + for (int i = 0; i < cuda::NumComponents; i++) + { + vol *= cuda::GetElement(v, i); + } + return vol; +} + +template>> +auto DivCeil(const T &a, const T &b) +{ + return (a + b - 1) / b; +} +} // namespace utils + +namespace resampling { + +template +struct SampleDesc +{ + static constexpr int kSpatialNDim = _kSpatialNDim; + + // input, output and the intermediate buffers + static constexpr int kNumBuffers = kSpatialNDim + 1; + + // shapes[0] - input shape, consecutive intermediate results shapes, + // shapes[kSpatialNDim] - output shape + VecI shapes[kNumBuffers]; + + // the number of channels in the sample, common for input, + // intermediate and output sample + int channels; + + // describes which axis to processes in a given resampling pass, e.g. + // if processingOrder.x = 2, then in the first pass the z axis + // will be resampled + VecI processingOrder; + + // resampling origin and scale in pass order, i.e. + // origin.x and scale.x describe origin and scale for resampling + // in the first pass + VecF origin, scale; + + // what type of filter to use (NN, Linear, Support based) + // in pass order (i.e. filterKind[0] refers to filter used in the first pass) + filter::FilterTypeKind filterKind[kSpatialNDim]; + + // filter description (support, coefficients etc.) + // in pass order (i.e. 
filter[0] refers to filter used in the first pass) + filter::ResamplingFilter filter[kSpatialNDim]; + + // spatial offset in the input sample based on the input ROI + // and filter support + VecI inRoiOffset; + + // describes the logical block shape, i.e. a size of a slice + // that a single gpu block will process in a given pass + VecI blockShape[kSpatialNDim]; +}; + +/** + * @brief Helper structure to indicate the static number of channels + * dynamic number of channels that may differ between samples. + */ +template +struct NumChannels +{ + constexpr int __forceinline__ __device__ operator()() const + { + return kStaticChannels; + } + + static constexpr bool kHasStaticChannels = true; + static constexpr int kStaticChannels = _kStaticChannels; +}; + +template<> +struct NumChannels<-1> +{ + int __forceinline__ __device__ operator()() const + { + return dynamicChannels; + } + + static constexpr bool kHasStaticChannels = false; + static constexpr int kStaticChannels = -1; + int dynamicChannels; +}; + +template +__forceinline__ __device__ void WithChannels(const int dynamicChannels, Cb &&cb) +{ + if constexpr (kNumStaticChannels == -1) + { + cb(NumChannels<-1>{dynamicChannels}); + } + else if constexpr (kNumStaticChannels != -1) + { + static_assert(kNumStaticChannels > 0); + cb(NumChannels{}); + } +} + +/** + * @brief Each threadblock will cover `lanes * volume(blockDim)` + * elements of the output sample. More lanes result in: + * 1. smaller grid launched (possibly reducing parallelism for small images), + * 2. better resuing of the filter's coefficients + * (they are computed once for all lanes). + * + * @return int - the number of lanes for a single threadblock + * to cover in the output image + */ +inline int GetResizeBlockLanesEnv() +{ + char *env = getenv("CVCUDA_HQ_RESIZE_BLOCK_LANES"); + if (env) + { + int lanes = atoi(env); + if (lanes < 1) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "The CVCUDA_HQ_RESIZE_BLOCK_LANES must be a positive integer"); + } + return lanes; + } + else + { + return 8; + } +} + +inline int GetResizeBlockLanes() +{ + static int lanes = GetResizeBlockLanesEnv(); + return lanes; +} + +template +struct GridHelperDevice +{ +}; + +/** + * @brief Maps cuda blockIdx to sample and bounds of the sample region + * to be processed be the threadblock for 2D resampling + */ +template<> +struct GridHelperDevice<2> +{ + GridHelperDevice(VecI<2> numBlocks) + : m_numBlocksX{numBlocks.x} + { + } + + int __forceinline__ __device__ CurrentSample() const + { + return blockIdx.y; + } + + void __forceinline__ __device__ CurrentBlock(VecI<2> &lo, VecI<2> &hi, const VecI<2> blockShape) const + + { + VecI<2> currentBlock; + { + int block = blockIdx.x; + currentBlock.x = block % m_numBlocksX; + currentBlock.y = block / m_numBlocksX; + } + lo = blockShape * currentBlock; + hi = lo + blockShape; + } + +private: + int m_numBlocksX; +}; + +/** + * @brief Maps cuda blockIdx to sample and bounds of the sample region + * to be processed be the threadblock for 3D resampling + */ +template<> +struct GridHelperDevice<3> +{ + GridHelperDevice(VecI<3> numBlocks) + : m_numBlocksX{numBlocks.x} + , m_numBlocksY{numBlocks.y} + { + } + + int __forceinline__ __device__ CurrentSample() const + { + return blockIdx.y; + } + + void __forceinline__ __device__ CurrentBlock(VecI<3> &lo, VecI<3> &hi, const VecI<3> blockShape) const + + { + VecI<3> currentBlock; + { + int block = blockIdx.x; + currentBlock.x = block % m_numBlocksX; + block = block / m_numBlocksX; + currentBlock.y = block % 
m_numBlocksY; + currentBlock.z = block / m_numBlocksY; + } + lo = blockShape * currentBlock; + hi = lo + blockShape; + } + +private: + int m_numBlocksX, m_numBlocksY; +}; + +/** + * @brief Maps the logical blocks and the number of samples into cuda grid and back. + */ +template +struct GridHelper +{ + GridHelper(VecI numBlocks, int numSamples) + : m_numBlocks{numBlocks} + , m_numSamples{numSamples} + { + } + + template + std::enable_if_t GetKernelGrid() const + { + static_assert(kSpatialNDim == 2); + return dim3(m_numBlocks.x * m_numBlocks.y, m_numSamples, 1); + } + + template + std::enable_if_t GetKernelGrid() const + { + static_assert(kSpatialNDim == 3); + return dim3(m_numBlocks.x * m_numBlocks.y * m_numBlocks.z, m_numSamples, 1); + } + + GridHelperDevice GetDeviceGridHelper() + { + return {m_numBlocks}; + } + +private: + VecI m_numBlocks; + int m_numSamples; +}; + +// The namespace contains implementation of different resampling +// methods in device code. +namespace interpolate { + +template +auto __forceinline__ __device__ GetWrapPtr(const Wrap wrap, const VecI<2> yx, const Idxs... idxs) +{ + return wrap.ptr(yx.y, yx.x, idxs...); +} + +template +auto __forceinline__ __device__ GetWrapPtr(const Wrap wrap, const VecI<3> zyx, const Idxs... idxs) +{ + return wrap.ptr(zyx.z, zyx.y, zyx.x, idxs...); +} + +template +std::enable_if_t __forceinline__ __device__ + LoadPixelLdg(const Wrap wrap, const NumChannelsT numChannels, const Idxs... idxs) +{ + using T = std::remove_const_t; + using BT = cuda::BaseType; + constexpr int kStaticChannels = NumChannelsT::kStaticChannels; + static_assert(kStaticChannels == cuda::NumElements); + + constexpr bool kSupportsLdg = kStaticChannels == 2 || kStaticChannels == 4; + + if constexpr (kSupportsLdg) + { + return __ldg(GetWrapPtr(wrap, idxs...)); + } + else if constexpr (!kSupportsLdg) + { + const BT *basePtr = reinterpret_cast(GetWrapPtr(wrap, idxs...)); + T value; +#pragma unroll + for (int c = 0; c < kStaticChannels; c++) + { + cuda::GetElement(value, c) = __ldg(basePtr + c); + } + return value; + } +} + +template +std::enable_if_t __forceinline__ __device__ + LoadPixelLdg(const Wrap wrap, const NumChannelsT numChannels, const Idxs... 
idxs) +{ + static_assert(!cuda::IsCompound); + return __ldg(GetWrapPtr(wrap, idxs...)); +} + +namespace nn { + +template +void __forceinline__ __device__ ForAllPixels(const VecI<2> lo, const VecI<2> hi, ProcessPixel &&processPixel) +{ + for (int y = lo.y + threadIdx.y; y < hi.y; y += blockDim.y) + { + for (int x = lo.x + threadIdx.x; x < hi.x; x += blockDim.x) + { + processPixel(VecI<2>{x, y}); + } + } +} + +template +void __forceinline__ __device__ ForAllPixels(const VecI<3> lo, const VecI<3> hi, ProcessPixel &&processPixel) +{ + for (int z = lo.z + threadIdx.z; z < hi.z; z += blockDim.z) + { + for (int y = lo.y + threadIdx.y; y < hi.y; y += blockDim.y) + { + for (int x = lo.x + threadIdx.x; x < hi.x; x += blockDim.x) + { + processPixel(VecI<3>{x, y, z}); + } + } + } +} + +/** + * @brief Nearest neighbor resampling + * + * @param outWrap - the wrapper for accessing output data + * @param inWrap - the wrapper for accessing input data + * @param lo - inclusive lower bound output coordinates of the block processed by the threadblock + * @param hi - exclusive upper bound output coordinates of the block processed by the threadblock + * @param origin - source coordinates corresponding to output's (0, 0) + * @param scale - step, in source coordinates, for one pixel in output coordinates + * @param inShape - shape of the input (x, y) order + * @param numChannels - the NumChannels specialization describing the number of interleaved + * channels in the input and output sample. + */ +template +void __forceinline__ __device__ Resample(const PassOutWrap outWrap, const PassInWrap inWrap, + const VecI lo, const VecI hi, + VecF origin, const VecF scale, + const VecI inShape, const NumChannelsT numChannels) +{ + using OutT = typename PassOutWrap::ValueType; + using InT = typename PassInWrap::ValueType; + // spatial extents and optional channels extent + constexpr int kNDim = kSpatialNDim + !NumChannelsT::kHasStaticChannels; + + static_assert(!NumChannelsT::kHasStaticChannels || NumChannelsT::kStaticChannels == cuda::NumElements); + static_assert(cuda::NumElements == cuda::NumElements); + static_assert(PassOutWrap::kNumDimensions == kNDim); + static_assert(PassInWrap::kNumDimensions == kNDim); + + origin += 0.5f * scale; + ForAllPixels(lo, hi, + [=](const VecI outIdxs) + { + VecI inIdxs = cuda::round(outIdxs * scale + origin); + inIdxs = cuda::clamp(inIdxs, cuda::SetAll>(0), inShape - 1); + + if constexpr (NumChannelsT::kHasStaticChannels) + { + const InT in = LoadPixelLdg(inWrap, numChannels, inIdxs); + OutT &out = *GetWrapPtr(outWrap, outIdxs); + out = cuda::SaturateCast(in); + } + else if constexpr (!NumChannelsT::kHasStaticChannels) + { + for (int c = 0; c < numChannels(); c++) + { + const InT in = LoadPixelLdg(inWrap, numChannels, inIdxs, c); + OutT &out = *GetWrapPtr(outWrap, outIdxs, c); + out = cuda::SaturateCast(in); + } + } + }); +} + +} // namespace nn + +namespace linear { + +template +void __forceinline__ __device__ Linear(const PassOutWrap outWrap, const PassInWrap inWrap, + const NumChannelsT numChannels, const VecI inIdx0, + const VecI inIdx1, const float q, const VecI outIdx) +{ + using OutT = typename PassOutWrap::ValueType; + using InT = std::remove_const_t; + // spatial extents and optional channels extent + constexpr int kNDim = kSpatialNDim + !NumChannelsT::kHasStaticChannels; + + static_assert(!NumChannelsT::kHasStaticChannels || NumChannelsT::kStaticChannels == cuda::NumElements); + static_assert(cuda::NumElements == cuda::NumElements); + static_assert(PassOutWrap::kNumDimensions == 
kNDim); + static_assert(PassInWrap::kNumDimensions == kNDim); + + if constexpr (NumChannelsT::kHasStaticChannels) + { + using FloatT = cuda::ConvertBaseTypeTo; + const FloatT a = cuda::StaticCast(LoadPixelLdg(inWrap, numChannels, inIdx0)); + const FloatT b = cuda::StaticCast(LoadPixelLdg(inWrap, numChannels, inIdx1)); + FloatT tmp = b - a; +#pragma unroll + for (int c = 0; c < NumChannelsT::kStaticChannels; c++) + { + cuda::GetElement(tmp, c) = fmaf(cuda::GetElement(tmp, c), q, cuda::GetElement(a, c)); + } + OutT &out = *GetWrapPtr(outWrap, outIdx); + out = cuda::SaturateCast(tmp); + } + else if constexpr (!NumChannelsT::kHasStaticChannels) + { + for (int c = 0; c < numChannels(); c++) + { + const float a = LoadPixelLdg(inWrap, numChannels, inIdx0, c); + const float b = LoadPixelLdg(inWrap, numChannels, inIdx1, c); + const float tmp = fmaf(b - a, q, a); + OutT &out = *GetWrapPtr(outWrap, outIdx, c); + out = cuda::SaturateCast(tmp); + } + } +} + +template +void __forceinline__ __device__ ForAllPixelsHorz(const VecI<2> lo, const VecI<2> hi, ProcessPixel &&processPixel) +{ + for (int x = lo.x + threadIdx.x; x < hi.x; x += blockDim.x) + { + for (int y = threadIdx.y + lo.y; y < hi.y; y += blockDim.y) + { + processPixel(VecI<2>{x, y}); + } + } +} + +template +void __forceinline__ __device__ ForAllPixelsHorz(const VecI<3> lo, const VecI<3> hi, ProcessPixel &&processPixel) +{ + for (int x = lo.x + threadIdx.x; x < hi.x; x += blockDim.x) + { + for (int z = threadIdx.z + lo.z; z < hi.z; z += blockDim.z) + { + for (int y = threadIdx.y + lo.y; y < hi.y; y += blockDim.y) + { + processPixel(VecI<3>{x, y, z}); + } + } + } +} + +template +void __forceinline__ __device__ ForAllPixelsVert(const VecI<2> lo, const VecI<2> hi, ProcessPixel &&processPixel) +{ + for (int y = threadIdx.y + lo.y; y < hi.y; y += blockDim.y) + { + for (int x = lo.x + threadIdx.x; x < hi.x; x += blockDim.x) + { + processPixel(VecI<2>{x, y}); + } + } +} + +template +void __forceinline__ __device__ ForAllPixelsVert(const VecI<3> lo, const VecI<3> hi, ProcessPixel &&processPixel) +{ + for (int z = threadIdx.z + lo.z; z < hi.z; z += blockDim.z) + { + for (int y = threadIdx.y + lo.y; y < hi.y; y += blockDim.y) + { + for (int x = lo.x + threadIdx.x; x < hi.x; x += blockDim.x) + { + processPixel(VecI<3>{x, y, z}); + } + } + } +} + +/** + * @brief Implements horizontal resampling + * + * @param outWrap - the wrapper for accessing output data + * @param inWrap - the wrapper for accessing input data + * @param lo - inclusive lower bound output coordinates of the block processed by the threadblock + * @param hi - exclusive upper bound output coordinates of the block processed by the threadblock + * @param srcX0 - X coordinate in the source image corresponding to output 0 + * @param scale - step, in source X, for one pixel in output X (may be negative) + * @param inShape - shape of the input (x, y[, z]) order + * @param numChannels - the NumChannels specialization describing the number of interleaved + * channels in the input and output sample. + * + * The input region of interest is defined in terms of origin/scale, which are relative to + * output (0, 0). + * The lo/hi parameters are not output RoI - they merely indicate the output slice processed + * by current block. 
+ */ +template +void __forceinline__ __device__ ResampleHorz(const PassOutWrap outWrap, const PassInWrap inWrap, + const VecI lo, const VecI hi, float srcX0, + const float scale, const VecI inShape, + const NumChannelsT numChannels) +{ + srcX0 += 0.5f * scale - 0.5f; + ForAllPixelsHorz(lo, hi, + [=](const VecI outIdx) + { + const float sx0f = outIdx.x * scale + srcX0; + const int sx0i = cuda::round(sx0f); + const float q = sx0f - sx0i; + const int sx0 = cuda::clamp(sx0i, 0, inShape.x - 1); + const int sx1 = cuda::clamp(sx0i + 1, 0, inShape.x - 1); + + VecI inIdx0 = outIdx; + VecI inIdx1 = outIdx; + inIdx0.x = sx0; + inIdx1.x = sx1; + + Linear(outWrap, inWrap, numChannels, inIdx0, inIdx1, q, outIdx); + }); +} + +/** + * @brief Implements vertical resampling + * + * @param outWrap - the wrapper for accessing output data + * @param inWrap - the wrapper for accessing input data + * @param lo - inclusive lower bound output coordinates of the block processed by the threadblock + * @param hi - exclusive upper bound output coordinates of the block processed by the threadblock + * @param srcY0 - Y coordinate in the source image corresponding to output 0 + * @param scale - step, in source Y, for one pixel in output Y (may be negative) + * @param inShape - shape of the input (x, y[, z]) order + * @param numChannels - the NumChannels specialization describing the number of interleaved + * channels in the input and output sample. + */ +template +void __forceinline__ __device__ ResampleVert(const PassOutWrap outWrap, const PassInWrap inWrap, + const VecI lo, const VecI hi, float srcY0, + const float scale, const VecI inShape, + const NumChannelsT numChannels) +{ + srcY0 += 0.5f * scale - 0.5f; + ForAllPixelsVert(lo, hi, + [=](const VecI outIdx) + { + const float sy0f = outIdx.y * scale + srcY0; + const int sy0i = cuda::round(sy0f); + const float q = sy0f - sy0i; + const int sy0 = cuda::clamp(sy0i, 0, inShape.y - 1); + const int sy1 = cuda::clamp(sy0i + 1, 0, inShape.y - 1); + + VecI inIdx0 = outIdx; + VecI inIdx1 = outIdx; + inIdx0.y = sy0; + inIdx1.y = sy1; + + Linear(outWrap, inWrap, numChannels, inIdx0, inIdx1, q, outIdx); + }); +} + +/** + * @brief Implements depthwise resampling + * + * @param outWrap - the wrapper for accessing output data + * @param inWrap - the wrapper for accessing input data + * @param lo - inclusive lower bound output coordinates of the block processed by the threadblock + * @param hi - exclusive upper bound output coordinates of the block processed by the threadblock + * @param srcZ0 - Z coordinate in the source image corresponding to output's 0 + * @param scale - step, in source Z, for one pixel in output Z (may be negative) + * @param inShape - shape of the input (x, y[, z]) order + * @param numChannels - the NumChannels specialization describing the number of interleaved + * channels in the input and output sample. 
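
All three linear variants (horizontal, vertical, and the depth pass documented above and defined just below) reduce to the same two-tap formula. A scalar, host-side sketch of that formula follows; names are local to the sketch, and since the exact `cuda::round` mode was lost from this diff, the conventional floor is used for the integer tap.

```cpp
// Illustrative only: the two-tap linear formula used by the linear ResampleHorz/Vert/Depth
// paths, written as plain host code for a single axis.
#include <algorithm>
#include <cmath>
#include <vector>

float LerpSample1D(const std::vector<float> &in, float srcOrigin, float scale, int outIdx)
{
    // Same pixel-center mapping as the kernels: the origin is shifted by 0.5*scale - 0.5.
    const float sx0f    = outIdx * scale + srcOrigin + 0.5f * scale - 0.5f;
    const int   sx0i    = static_cast<int>(std::floor(sx0f));
    const float q       = sx0f - sx0i; // fractional distance between the two taps
    const int   lastIdx = static_cast<int>(in.size()) - 1;
    const int   sx0     = std::clamp(sx0i, 0, lastIdx);     // clamp both taps at the borders,
    const int   sx1     = std::clamp(sx0i + 1, 0, lastIdx); // as the device code does
    return std::fma(in[sx1] - in[sx0], q, in[sx0]);         // a + q * (b - a), fmaf in the kernel
}
```
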
+ */ +template +void __forceinline__ __device__ ResampleDepth(const PassOutWrap outWrap, const PassInWrap inWrap, const VecI<3> lo, + const VecI<3> hi, float srcZ0, const float scale, const VecI<3> inShape, + const NumChannelsT numChannels) +{ + srcZ0 += 0.5f * scale - 0.5f; + // threadIdx.y is used to traverse Z axis + for (int z = lo.z + threadIdx.y; z < hi.z; z += blockDim.y) + { + const float sz0f = z * scale + srcZ0; + const int sz0i = cuda::round(sz0f); + const float q = sz0f - sz0i; + const int sz0 = cuda::clamp(sz0i, 0, inShape.z - 1); + const int sz1 = cuda::clamp(sz0i + 1, 0, inShape.z - 1); + + for (int y = lo.y + threadIdx.z; y < hi.y; y += blockDim.z) + { + for (int x = lo.x + threadIdx.x; x < hi.x; x += blockDim.x) + { + VecI<3> inIdx0{x, y, sz0}; + VecI<3> inIdx1{x, y, sz1}; + VecI<3> outIdx{x, y, z}; + Linear<3>(outWrap, inWrap, numChannels, inIdx0, inIdx1, q, outIdx); + } + } + } +} + +} // namespace linear + +namespace filter_support { + +constexpr int kMaxGPUFilterSupport = 8192; + +bool __forceinline__ __host__ __device__ CanComputeCoefPerThread(const int support, const int resamplingAxisBlockSize) +{ + return support * resamplingAxisBlockSize <= kMaxGPUFilterSupport; +} + +inline int RequiredSharedMemoryElements(const int support, const int resamplingAxisBlockSize) +{ + if (CanComputeCoefPerThread(support, resamplingAxisBlockSize)) + { + return support * resamplingAxisBlockSize; + } + else + { + return support; + } +} + +template +void __forceinline__ __device__ ForAllOrthogonalToHorz(const VecI<2> lo, const VecI<2> hi, ProcessPixel &&processPixel) +{ + for (int y = threadIdx.y + lo.y; y < hi.y; y += blockDim.y) + { + processPixel(VecI<2>{0, y}); + } +} + +template +void __forceinline__ __device__ ForAllOrthogonalToHorz(const VecI<3> lo, const VecI<3> hi, ProcessPixel &&processPixel) +{ + for (int z = threadIdx.z + lo.z; z < hi.z; z += blockDim.z) + { + for (int y = threadIdx.y + lo.y; y < hi.y; y += blockDim.y) + { + processPixel(VecI<3>{0, y, z}); + } + } +} + +template +void __forceinline__ __device__ ForAllOrthogonalToVert(const VecI<2> lo, const VecI<2> hi, ProcessPixel &&processPixel) +{ + for (int x = threadIdx.x + lo.x; x < hi.x; x += blockDim.x) + { + processPixel(VecI<2>{x, 0}); + } +} + +template +void __forceinline__ __device__ ForAllOrthogonalToVert(const VecI<3> lo, const VecI<3> hi, ProcessPixel &&processPixel) +{ + for (int z = threadIdx.z + lo.z; z < hi.z; z += blockDim.z) + { + for (int x = threadIdx.x + lo.x; x < hi.x; x += blockDim.x) + { + processPixel(VecI<3>{x, 0, z}); + } + } +} + +/** + * @brief Implements horizontal resampling + * + * @param outWrap - the wrapper for accessing output data + * @param inWrap - the wrapper for accessing input data + * @param lo - inclusive lower bound output coordinates of the block processed by the threadblock + * @param hi - exclusive upper bound output coordinates of the block processed by the threadblock + * @param srcX0 - X coordinate in the source image corresponding to output's 0 + * @param scale - step, in source X, for one pixel in output X (may be negative) + * @param support - size of the resampling kernel, in source pixels + * @param numChannels - the NumChannels specialization describing the number of interleaved + * channels in the input and output sample. + * + * The function fills the output in block-sized vertical spans. + * Block horizontal size is warp-aligned. 
+ * Filter coefficients are pre-calculated for each vertical span to avoid + * recalculating them for each row, and stored in a shared memory block. + * + * The function follows different code paths for static and dynamic number of channels. + * For the dynamic, the innermost loop goes over filter taps, which eliminates the need + * for thread-local memory to store intermediate sums. This allows processing arbitrary + * number of channels. + * For static number of channels, the run-time parameter `channels` is ignored and + * there's also a local temporary storage for a tap sum for each channel. This is faster, + * but requires extra registers for the intermediate sums. + */ +template +void __forceinline__ __device__ ResampleHorz(const PassOutWrap outWrap, const PassInWrap inWrap, + const VecI lo, const VecI hi, float srcX0, + const float scale, const VecI inShape, + const filter::ResamplingFilter filter, const NumChannelsT numChannels) +{ + extern __shared__ float coeffs[]; + + using OutT = typename PassOutWrap::ValueType; + using InT = std::remove_const_t; + // spatial extents and optional channels extent + constexpr int kNDim = kSpatialNDim + !NumChannelsT::kHasStaticChannels; + + static_assert(!NumChannelsT::kHasStaticChannels || NumChannelsT::kStaticChannels == cuda::NumElements); + static_assert(cuda::NumElements == cuda::NumElements); + static_assert(PassOutWrap::kNumDimensions == kNDim); + static_assert(PassInWrap::kNumDimensions == kNDim); + + const int support = filter.support(); + const float filterStep = filter.scale; + // If the support is small enough (for blockDim.x = 32 and kMaxGPUFilterSupport = 8192, it's 256), + // we can fit `support` x `blockDim.x` elements into shm, so that for each output_x mapped to input_x, + // we take into account the exact error that comes from rounding the input_x from float to integer. + // For larger supports, we just compute `support` elements common for all threads. + const bool hugeSupport = !CanComputeCoefPerThread(support, blockDim.x); + const int coeffBase = hugeSupport ? 0 : threadIdx.x; + const int coeffStride = hugeSupport ? 1 : blockDim.x; + + srcX0 += 0.5f * scale - 0.5f - filter.anchor; + + for (int j = lo.x; j < hi.x; j += blockDim.x) + { + const int x = j + threadIdx.x; + const float sx0f = x * scale + srcX0; + const int sx0 = hugeSupport ? 
cuda::round(sx0f) + : cuda::round(sx0f); + const float f = (sx0 - sx0f) * filterStep; + __syncthreads(); + if (hugeSupport) + { + for (int k = threadIdx.x + blockDim.x * threadIdx.y; k < support; k += blockDim.x * blockDim.y) + { + float flt = filter(f + k * filterStep); + coeffs[k] = flt; + } + } + else + { + for (int k = threadIdx.y; k < support; k += blockDim.y) + { + float flt = filter(f + k * filterStep); + coeffs[coeffBase + coeffStride * k] = flt; + } + } + __syncthreads(); + + if (x >= hi.x) + continue; + + float norm = 0; + for (int k = 0; k < support; k++) + { + norm += coeffs[coeffBase + coeffStride * k]; + } + norm = 1.0f / norm; + + ForAllOrthogonalToHorz( + lo, hi, + [=](VecI outIdx) + { + VecI inIdx = outIdx; + outIdx.x = x; + + if constexpr (NumChannelsT::kHasStaticChannels) + { + using FloatT = cuda::ConvertBaseTypeTo; + FloatT tmp{}; + + for (int k = 0, coeffIdx = coeffBase; k < support; k++, coeffIdx += coeffStride) + { + inIdx.x = cuda::clamp(sx0 + k, 0, inShape.x - 1); + const float flt = coeffs[coeffIdx]; + const InT px = LoadPixelLdg(inWrap, numChannels, inIdx); +#pragma unroll + for (int c = 0; c < NumChannelsT::kStaticChannels; c++) + { + cuda::GetElement(tmp, c) = fmaf(cuda::GetElement(px, c), flt, cuda::GetElement(tmp, c)); + } + } + + OutT &out = *GetWrapPtr(outWrap, outIdx); + out = cuda::SaturateCast(tmp * norm); + } + else if constexpr (!NumChannelsT::kHasStaticChannels) + { + for (int c = 0; c < numChannels(); c++) + { + float tmp = 0; + + for (int k = 0, coeffIdx = coeffBase; k < support; k++, coeffIdx += coeffStride) + { + inIdx.x = cuda::clamp(sx0 + k, 0, inShape.x - 1); + const float flt = coeffs[coeffIdx]; + const InT px = LoadPixelLdg(inWrap, numChannels, inIdx, c); + tmp = fmaf(px, flt, tmp); + } + + OutT &out = *GetWrapPtr(outWrap, outIdx, c); + out = cuda::SaturateCast(tmp * norm); + } + } + }); + } +} + +/** + * @brief Implements vertical resampling + * + * @param outWrap - the wrapper for accessing output data + * @param inWrap - the wrapper for accessing input data + * @param lo - inclusive lower bound output coordinates of the block processed by the threadblock + * @param hi - exclusive upper bound output coordinates of the block processed by the threadblock + * @param srcY0 - Y coordinate in the source image corresponding to output's 0 + * @param scale - step, in source Y, for one pixel in output Y (may be negative) + * @param support - size of the resampling kernel, in source pixels + * @param numChannels - the NumChannels specialization describing the number of interleaved + * channels in the input and output sample. + * + * The function fills the output in block-sized horizontal spans. + * Filter coefficients are pre-calculated for each horizontal span to avoid + * recalculating them for each column, and stored in a shared memory block. 
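
The same coefficient-precomputation-and-normalization scheme drives the horizontal pass above and the vertical and depth passes that follow. A scalar, host-side sketch of the per-output-pixel math; the shared-memory staging and the huge-support fallback are omitted, and all names are local to the sketch.

```cpp
// Illustrative only: one row of a support-based resampling pass, including the
// normalization by the coefficient sum that the kernels perform.
#include <algorithm>
#include <cmath>
#include <functional>
#include <vector>

std::vector<float> ResampleRow(const std::vector<float> &in, int outSize, float srcX0, float scale, int support,
                               float filterStep, float filterAnchor, const std::function<float(float)> &filter)
{
    std::vector<float> out(outSize);
    const float origin  = srcX0 + 0.5f * scale - 0.5f - filterAnchor; // same shift as in the kernels
    const int   lastIdx = static_cast<int>(in.size()) - 1;
    for (int x = 0; x < outSize; ++x)
    {
        const float sx0f = x * scale + origin;
        const int   sx0  = static_cast<int>(std::lround(sx0f));
        const float f    = (sx0 - sx0f) * filterStep; // sub-pixel phase of the filter window

        float acc = 0.f, norm = 0.f;
        for (int k = 0; k < support; ++k)
        {
            const float coef = filter(f + k * filterStep); // staged in shared memory on the GPU
            acc += coef * in[std::clamp(sx0 + k, 0, lastIdx)];
            norm += coef;
        }
        out[x] = acc / norm; // normalizing keeps constant regions constant for any filter scale
    }
    return out;
}
```
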
+ */ +template +void __forceinline__ __device__ ResampleVert(const PassOutWrap outWrap, const PassInWrap inWrap, + const VecI lo, const VecI hi, float srcY0, + const float scale, const VecI inShape, + const filter::ResamplingFilter filter, const NumChannelsT numChannels) +{ + extern __shared__ float coeffs[]; + + using OutT = typename PassOutWrap::ValueType; + using InT = std::remove_const_t; + // spatial extents and optional channels extent + constexpr int kNDim = kSpatialNDim + !NumChannelsT::kHasStaticChannels; + + static_assert(!NumChannelsT::kHasStaticChannels || NumChannelsT::kStaticChannels == cuda::NumElements); + static_assert(cuda::NumElements == cuda::NumElements); + static_assert(PassOutWrap::kNumDimensions == kNDim); + static_assert(PassInWrap::kNumDimensions == kNDim); + + const int support = filter.support(); + const float filterStep = filter.scale; + // If the support is small enough, we can fit `blockDim.y` x `support` elements into shm, so that + // for each output_y mapped to input_y, we take into account the exact error that comes from + // rounding the input_y from float to integer. For larger supports, we just compute `support` + // elements common for all threads. + const bool hugeSupport = !CanComputeCoefPerThread(support, blockDim.y); + const int coeffBase = hugeSupport ? 0 : support * threadIdx.y; + + srcY0 += 0.5f * scale - 0.5f - filter.anchor; + + for (int i = lo.y; i < hi.y; i += blockDim.y) + { + const int y = i + threadIdx.y; + const float sy0f = y * scale + srcY0; + const int sy0 = hugeSupport ? cuda::round(sy0f) + : cuda::round(sy0f); + float f = (sy0 - sy0f) * filterStep; + __syncthreads(); + // fills `support` + if (hugeSupport) + { + for (int k = threadIdx.x + blockDim.x * threadIdx.y; k < support; k += blockDim.x * blockDim.y) + { + float flt = filter(f + k * filterStep); + coeffs[k] = flt; + } + } + else + { + for (int k = threadIdx.x; k < support; k += blockDim.x) + { + float flt = filter(f + k * filterStep); + coeffs[coeffBase + k] = flt; + } + } + __syncthreads(); + + if (y >= hi.y) + continue; + + float norm = 0; + for (int k = 0; k < support; k++) + { + norm += coeffs[coeffBase + k]; + } + norm = 1.0f / norm; + + ForAllOrthogonalToVert(lo, hi, + [=](VecI outIdx) + { + VecI inIdx = outIdx; + outIdx.y = y; + + if constexpr (NumChannelsT::kHasStaticChannels) + { + using FloatT = cuda::ConvertBaseTypeTo; + FloatT tmp{}; + + for (int k = 0; k < support; k++) + { + inIdx.y = cuda::clamp(sy0 + k, 0, inShape.y - 1); + const float flt = coeffs[coeffBase + k]; + const InT px = LoadPixelLdg(inWrap, numChannels, inIdx); +#pragma unroll + for (int c = 0; c < NumChannelsT::kStaticChannels; c++) + { + cuda::GetElement(tmp, c) + = fmaf(cuda::GetElement(px, c), flt, cuda::GetElement(tmp, c)); + } + } + + OutT &out = *GetWrapPtr(outWrap, outIdx); + out = cuda::SaturateCast(tmp * norm); + } + else if constexpr (!NumChannelsT::kHasStaticChannels) + { + for (int c = 0; c < numChannels(); c++) + { + float tmp = 0; + + for (int k = 0; k < support; k++) + { + inIdx.y = cuda::clamp(sy0 + k, 0, inShape.y - 1); + const float flt = coeffs[coeffBase + k]; + const InT px = LoadPixelLdg(inWrap, numChannels, inIdx, c); + tmp = fmaf(px, flt, tmp); + } + + OutT &out = *GetWrapPtr(outWrap, outIdx, c); + out = cuda::SaturateCast(tmp * norm); + } + } + }); + } +} + +/** + * @brief Implements depth resampling + * + * @param outWrap - the wrapper for accessing output data + * @param inWrap - the wrapper for accessing input data + * @param lo - inclusive lower bound output coordinates 
of the block processed by the threadblock + * @param hi - exclusive upper bound output coordinates of the block processed by the threadblock + * @param srcZ0 - Y coordinate in the source image corresponding to output's 0 + * @param scale - step, in source Y, for one pixel in output Y (may be negative) + * @param support - size of the resampling kernel, in source pixels + * @param numChannels - the NumChannels specialization describing the number of interleaved + * channels in the input and output sample. + * + * The function fills the output in block-sized horizontal spans. + * Filter coefficients are pre-calculated for each horizontal span to avoid + * recalculating them for each column, and stored in a shared memory block. + */ +template +void __forceinline__ __device__ ResampleDepth(const PassOutWrap outWrap, const PassInWrap inWrap, const VecI<3> lo, + const VecI<3> hi, float srcZ0, const float scale, const VecI<3> inShape, + const filter::ResamplingFilter filter, const NumChannelsT numChannels) +{ + extern __shared__ float coeffs[]; + + using OutT = typename PassOutWrap::ValueType; + using InT = std::remove_const_t; + // spatial extents and optional channels extent + constexpr int kNDim = 3 + !NumChannelsT::kHasStaticChannels; + + static_assert(!NumChannelsT::kHasStaticChannels || NumChannelsT::kStaticChannels == cuda::NumElements); + static_assert(cuda::NumElements == cuda::NumElements); + static_assert(PassOutWrap::kNumDimensions == kNDim); + static_assert(PassInWrap::kNumDimensions == kNDim); + + const int support = filter.support(); + const float filterStep = filter.scale; + // If the support is small enough, we can fit `blockDim.y` x `support` elements into shm, + // so that for each output_z mapped to input_z, we take into account the exact error that + // comes from rounding the input_z from float to integer. For larger supports, we just + // compute `support` elements common for all threads. + const bool hugeSupport = !CanComputeCoefPerThread(support, blockDim.y); + const int coeffBase = hugeSupport ? 0 : support * threadIdx.y; + + srcZ0 += 0.5f * scale - 0.5f - filter.anchor; + + for (int i = lo.z; i < hi.z; i += blockDim.y) + { + // threadIdx.y is used to traverse Z axis + const int z = i + threadIdx.y; + const float sz0f = z * scale + srcZ0; + const int sz0 = hugeSupport ? 
cuda::round(sz0f) + : cuda::round(sz0f); + float f = (sz0 - sz0f) * filterStep; + __syncthreads(); + if (hugeSupport) + { + for (int k = threadIdx.x + blockDim.x * threadIdx.y; k < support; k += blockDim.x * blockDim.y) + { + float flt = filter(f + k * filterStep); + coeffs[k] = flt; + } + } + else + { + for (int k = threadIdx.x; k < support; k += blockDim.x) + { + float flt = filter(f + k * filterStep); + coeffs[coeffBase + k] = flt; + } + } + __syncthreads(); + + if (z >= hi.z) + continue; + + float norm = 0; + for (int k = 0; k < support; k++) + { + norm += coeffs[coeffBase + k]; + } + norm = 1.0f / norm; + + for (int y = threadIdx.z + lo.y; y < hi.y; y += blockDim.z) + { + for (int x = threadIdx.x + lo.x; x < hi.x; x += blockDim.x) + { + const VecI<3> outIdx{x, y, z}; + VecI<3> inIdx = outIdx; + + if constexpr (NumChannelsT::kHasStaticChannels) + { + using FloatT = cuda::ConvertBaseTypeTo; + FloatT tmp{}; + + for (int k = 0; k < support; k++) + { + inIdx.z = cuda::clamp(sz0 + k, 0, inShape.z - 1); + const float flt = coeffs[coeffBase + k]; + const InT px = LoadPixelLdg(inWrap, numChannels, inIdx); +#pragma unroll + for (int c = 0; c < NumChannelsT::kStaticChannels; c++) + { + cuda::GetElement(tmp, c) = fmaf(cuda::GetElement(px, c), flt, cuda::GetElement(tmp, c)); + } + } + + OutT &out = *GetWrapPtr(outWrap, outIdx); + out = cuda::SaturateCast(tmp * norm); + } + else if constexpr (!NumChannelsT::kHasStaticChannels) + { + for (int c = 0; c < numChannels(); c++) + { + float tmp = 0; + + for (int k = 0; k < support; k++) + { + inIdx.z = cuda::clamp(sz0 + k, 0, inShape.z - 1); + const float flt = coeffs[coeffBase + k]; + const InT px = LoadPixelLdg(inWrap, numChannels, inIdx, c); + tmp = fmaf(px, flt, tmp); + } + + OutT &out = *GetWrapPtr(outWrap, outIdx, c); + out = cuda::SaturateCast(tmp * norm); + } + } + } + } + } +} +} // namespace filter_support + +template +void __forceinline__ __device__ RunNN(const PassOutWrap outWrap, const PassInWrap inWrap, const VecI lo, + const VecI hi, int axis, const VecI inShape, + const float origin, const float scale, const NumChannelsT numChannels) +{ + auto originV = cuda::SetAll>(0.f); + auto scaleV = cuda::SetAll>(1.f); + cuda::GetElement(originV, axis) = origin; + cuda::GetElement(scaleV, axis) = scale; + nn::Resample(outWrap, inWrap, lo, hi, originV, scaleV, inShape, numChannels); +} + +template +void __forceinline__ __device__ RunLinear(const PassOutWrap outWrap, const PassInWrap inWrap, + const VecI lo, const VecI hi, int axis, + const VecI inShape, const float origin, const float scale, + const NumChannelsT numChannels) +{ + if (axis == 0) + { + linear::ResampleHorz(outWrap, inWrap, lo, hi, origin, scale, inShape, numChannels); + } + else if (axis == 1) + { + linear::ResampleVert(outWrap, inWrap, lo, hi, origin, scale, inShape, numChannels); + } + else if (axis == 2) + { + if constexpr (kSpatialNDim == 3) + { + linear::ResampleDepth(outWrap, inWrap, lo, hi, origin, scale, inShape, numChannels); + } + } +} + +template +void __forceinline__ __device__ RunFilter(const PassOutWrap outWrap, const PassInWrap inWrap, + const VecI lo, const VecI hi, int axis, + const VecI inShape, const float origin, const float scale, + const filter::ResamplingFilter filter, const NumChannelsT numChannels) +{ + if (axis == 0) + { + filter_support::ResampleHorz(outWrap, inWrap, lo, hi, origin, scale, inShape, filter, + numChannels); + } + else if (axis == 1) + { + filter_support::ResampleVert(outWrap, inWrap, lo, hi, origin, scale, inShape, filter, + numChannels); + } + 
else if (axis == 2) + { + if constexpr (kSpatialNDim == 3) + { + filter_support::ResampleDepth(outWrap, inWrap, lo, hi, origin, scale, inShape, filter, numChannels); + } + } +} +} // namespace interpolate + +template +void __forceinline__ __device__ RunResamplingPass(const SampleDesc sampleDesc, const PassOutWrap outWrap, + const PassInWrap inWrap, const VecI lo, + const VecI hi, const NumChannelsT numChannels) +{ + VecI inShape = sampleDesc.shapes[kWhichPass]; + int axis = cuda::GetElement(sampleDesc.processingOrder, kWhichPass); // vec-order: 0 = X, 1 = Y, 2 = Z + const float origin = cuda::GetElement(sampleDesc.origin, kWhichPass); + const float scale = cuda::GetElement(sampleDesc.scale, kWhichPass); + + switch (sampleDesc.filterKind[kWhichPass]) + { + case filter::FilterTypeKind::Nearest: + interpolate::RunNN(outWrap, inWrap, lo, hi, axis, inShape, origin, scale, numChannels); + break; + case filter::FilterTypeKind::Linear: + interpolate::RunLinear(outWrap, inWrap, lo, hi, axis, inShape, origin, scale, numChannels); + break; + default: + interpolate::RunFilter(outWrap, inWrap, lo, hi, axis, inShape, origin, scale, + sampleDesc.filter[kWhichPass], numChannels); + break; + } +} + +// Tensor variant (unfirom batch) +template +__global__ void SeparableResamplingKernel(const SampleDesc sampleDesc, const PassOutWrap outWrap, + const PassInWrap inWrap, const GridHelperDevice gridHelper) + +{ + constexpr bool kHasDynamicChannels = kNumStaticChannels == -1; + static_assert(PassInWrap::kNumDimensions == 1 + kSpatialNDim + kHasDynamicChannels); + static_assert(PassOutWrap::kNumDimensions == 1 + kSpatialNDim + kHasDynamicChannels); + // Get sample idx and the region of the output image that + // the current threadblock has to process + int sampleIdx = gridHelper.CurrentSample(); + VecI lo, hi; + gridHelper.CurrentBlock(lo, hi, sampleDesc.blockShape[kWhichPass]); + hi = cuda::min(hi, sampleDesc.shapes[kWhichPass + 1]); + + const auto outSampleView = batch_wrapper::tensor::GetSampleView(outWrap, sampleIdx); + const auto inSampleView = batch_wrapper::tensor::GetSampleView(inWrap, sampleIdx); + WithChannels( + sampleDesc.channels, [=](const NumChannels numChannels) + { RunResamplingPass(sampleDesc, outSampleView, inSampleView, lo, hi, numChannels); }); +} + +// Batch variant (ImageBatchVarShape, TensorBatch) +template +__global__ void SeparableResamplingKernel(const SampleDesc *__restrict__ samples, + const PassOutWrap outWrap, const PassInWrap inWrap, + const GridHelperDevice gridHelper) +{ + constexpr bool kHasDynamicChannels = kNumStaticChannels == -1; + static_assert(PassInWrap::kNumDimensions == 1 + kSpatialNDim + kHasDynamicChannels); + static_assert(PassOutWrap::kNumDimensions == 1 + kSpatialNDim + kHasDynamicChannels); + // Get sample idx and the region of the output image that + // the current threadblock has to process + const int sampleIdx = gridHelper.CurrentSample(); + const auto sampleDesc = samples[sampleIdx]; + const VecI outShape = sampleDesc.shapes[kWhichPass + 1]; + VecI lo, hi; + gridHelper.CurrentBlock(lo, hi, sampleDesc.blockShape[kWhichPass]); + + // exit early for smaller samples + if (lo.x >= outShape.x || lo.y >= outShape.y) + { + return; + } + if constexpr (kSpatialNDim == 3) + { + if (lo.z >= outShape.z) + { + return; + } + } + hi = cuda::min(hi, outShape); + + const auto outSampleView = outWrap.GetSampleView(sampleIdx); + WithChannels( + sampleDesc.channels, + [=](const NumChannels numChannels) + { + if constexpr (kWhichPass == 0) + { + const auto inSampleView = 
inWrap.GetSampleView(sampleIdx, sampleDesc.inRoiOffset); + RunResamplingPass(sampleDesc, outSampleView, inSampleView, lo, hi, numChannels); + } + else if constexpr (kWhichPass != 0) + { + const auto inSampleView = inWrap.GetSampleView(sampleIdx); + RunResamplingPass(sampleDesc, outSampleView, inSampleView, lo, hi, numChannels); + } + }); +} + +} // namespace resampling + +namespace validate { +inline auto srcDst(const nvcv::Tensor &src, const nvcv::Tensor &dst) +{ + auto srcData = src.exportData(); + auto dstData = dst.exportData(); + + if (!srcData) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Input must be cuda-accessible tensor"); + } + + if (!dstData) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Output must be cuda-accessible tensor"); + } + + using maybeTensorAccess = nvcv::Optional; + std::tuple ret; + + auto &[srcAccess, dstAccess, numSamples, numChannels, srcDtype, dstDtype] = ret; + + srcDtype = srcData->dtype(); + dstDtype = dstData->dtype(); + + srcAccess = nvcv::TensorDataAccessStridedImagePlanar::Create(*srcData); + dstAccess = nvcv::TensorDataAccessStridedImagePlanar::Create(*dstData); + NVCV_ASSERT(srcAccess && dstAccess); + + numSamples = srcAccess->numSamples(); + if (numSamples != dstAccess->numSamples()) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Incompatible input/output number of samples"); + } + + if (srcDtype.numChannels() > 1 || dstDtype.numChannels() > 1) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "The tensor channels should be an explicit part of the shape, not of the tensor type"); + } + + numChannels = srcAccess->numChannels(); + if (numChannels != dstAccess->numChannels()) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Incompatible input/output number of channels"); + } + + if (numChannels <= 0) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Number of channels must be positive"); + } + + auto numPlanes = srcAccess->numPlanes(); + if (numPlanes != dstAccess->numPlanes()) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Incompatible input/output number of planes"); + } + + if (numPlanes > 1) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Planar images are not supported"); + } + + if (srcData->layout() != dstData->layout()) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Input and output must have the same layout"); + } + + return ret; +} + +inline void srcDst(int &numSamples, int &uniqueNumChannels, nvcv::DataType &srcDtype, nvcv::DataType &dstDtype, + const nvcv::ImageBatchVarShape &src, const nvcv::ImageBatchVarShape &dst) +{ + numSamples = src.numImages(); + if (numSamples != dst.numImages()) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Incompatible input/output number of samples"); + } + + const auto &srcFormat = src.uniqueFormat(); + const auto &dstFormat = dst.uniqueFormat(); + + if (!srcFormat || !dstFormat) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "All images in a batch must have the same format (including number of channels)"); + } + + auto numPlanes = srcFormat.numPlanes(); + if (numPlanes != dstFormat.numPlanes()) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Incompatible input/output number of planes"); + } + + if (numPlanes > 1) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Planar images are not supported"); + } + + srcDtype = srcFormat.planeDataType(0); + dstDtype =
dstFormat.planeDataType(0); + + uniqueNumChannels = srcFormat.numChannels(); + if (uniqueNumChannels != dstFormat.numChannels()) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Incompatible input/output number of channels"); + } +} + +inline void srcDst(int &numSamples, int &uniqueNumChannels, nvcv::DataType &srcDtype, nvcv::DataType &dstDtype, + const nvcv::TensorBatch &src, const nvcv::TensorBatch &dst) +{ + numSamples = src.numTensors(); + if (numSamples != dst.numTensors()) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Incompatible input/output number of samples"); + } + + uniqueNumChannels = -1; + srcDtype = src.dtype(); + dstDtype = dst.dtype(); + + if (srcDtype.numChannels() > 1 || dstDtype.numChannels() > 1) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "The tensor channels should be an explicit part of the shape, not of the tensor type"); + } + + if (src.layout() != dst.layout()) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Incompatible input/output layouts"); + } + + if (src.layout() != nvcv::TENSOR_HW && src.layout() != nvcv::TENSOR_HWC && src.layout() != nvcv::TENSOR_DHW + && src.layout() != nvcv::TENSOR_DHWC) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "The tensor batch must contain [D]HW[C] samples"); + } +} + +inline void inOutNumberOfChannels(const HQResizeTensorShapeI &inShape, const HQResizeTensorShapeI &outShape) +{ + if (inShape.numChannels != outShape.numChannels) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "Incompatible input/output number of channels in one of the samples"); + } + if (inShape.numChannels <= 0) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "The number of channels must be positive"); + } +} + +inline void sameInOutNdim(const HQResizeTensorShapeI &inShape, const HQResizeTensorShapeI &outShape) +{ + if (inShape.ndim != outShape.ndim) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "Incompatible input/output number of extents to resize"); + } +} + +inline void inOutShapes(int numSamples, const HQResizeTensorShapesI &inShapes, const HQResizeTensorShapesI &outShapes) +{ + if (inShapes.ndim != outShapes.ndim) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "The dimensionality of input and output shapes does not match"); + } + + if (numSamples != inShapes.size || numSamples != outShapes.size) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Incompatible input/output number of samples"); + } + + if (inShapes.ndim != outShapes.ndim) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Incompatible input/output number of extents"); + } + + if (inShapes.numChannels != outShapes.numChannels) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Incompatible input/output number of channels"); + } + + if (inShapes.numChannels < 0) + { + for (int sampleIdx = 0; sampleIdx < numSamples; sampleIdx++) + { + inOutNumberOfChannels(inShapes.shape[sampleIdx], outShapes.shape[sampleIdx]); + } + } + else if (inShapes.numChannels == 0) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "The number of channels cannot be 0"); + } +} + +inline void roiBatch(int numSamples, int ndim, const HQResizeRoisF &rois) +{ + auto numRois = rois.size; + if (numRois != 0 && numRois != 1 && numRois != numSamples) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "The resize ROI list, if specified, must contain a single element to be used across
all " + "samples in a batch or its length must match the batch size."); + } + if (numRois != 0) + { + if (rois.ndim != ndim) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "The number of ROI extents does not match the numebr of extents in the input"); + } + } +} +} // namespace validate + +namespace shape { + +template +struct Roi +{ + Vec Size() const + { + return hi - lo; + } + + Vec lo, hi; +}; + +inline HQResizeRoiF *SampleRoi(const HQResizeRoisF &rois, int sampleIdx) +{ + if (rois.size == 0) + { + return nullptr; + } + else if (rois.size == 1) + { + return rois.roi; + } + else + { + return rois.roi + sampleIdx; + } +} + +template +inline VecI TensorShape(const HQResizeTensorShapeI &shape) +{ + VecI shapeVec; + for (int d = 0; d < kSpatialNDim; d++) + { + cuda::GetElement(shapeVec, d) = shape.extent[kSpatialNDim - d - 1]; + } + return shapeVec; +} + +template +inline VecI SampleShape(const HQResizeTensorShapesI &shapes, int sampleIdx) +{ + return TensorShape(shapes.shape[shapes.size == 1 ? 0 : sampleIdx]); +} + +template +inline VecI TensorShape(const nvcv::Tensor &tensor) +{ + static_assert(kSpatialNDim == 2 || kSpatialNDim == 3); + const auto &shape = tensor.shape(); + const auto &layout = tensor.layout(); + char shapeArgLayout[4] = "WHD"; + VecI tensorShape; + for (int d = 0; d < kSpatialNDim; d++) + { + int axis = layout.find(shapeArgLayout[d]); + if (axis < 0) + { + throw std::runtime_error( + "The layout of an input tensor to the resize operator must contain HW extents in the layout (for " + "images) or DHW extents (for 3D resampling). Some extents are missing in the input tensor."); + } + cuda::GetElement(tensorShape, d) = shape[axis]; + } + return tensorShape; +} + +template +inline VecI SampleShape(const nvcv::ImageBatchVarShape &batch, int sampleIdx) +{ + static_assert(kSpatialNDim == 2); + VecI sampleShape; + const nvcv::Image &image = batch[sampleIdx]; + const auto &imageSize = image.size(); + sampleShape.x = imageSize.w; + sampleShape.y = imageSize.h; + return sampleShape; +} + +template +inline VecI SampleShape(const nvcv::TensorBatch &batch, int sampleIdx) +{ + return TensorShape(batch[sampleIdx]); +} + +inline int TensorNumChannels(const nvcv::Tensor &tensor) +{ + const auto &shape = tensor.shape(); + const auto &layout = tensor.layout(); + int channelAxis = layout.find('C'); + if (channelAxis < 0) + { + return 1; + } + return shape[channelAxis]; +} + +inline int SampleNumChannels(const nvcv::TensorBatch &src, const nvcv::TensorBatch &dst, int sampleIdx) +{ + const auto &srcSample = src[sampleIdx]; + const auto &dstSample = dst[sampleIdx]; + int numChannels = TensorNumChannels(srcSample); + if (numChannels != TensorNumChannels(dstSample)) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Incompatible input/output number of channels"); + } + if (numChannels <= 0) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Number of channels must be positive"); + } + return numChannels; +} +} // namespace shape + +/** + * @brief Calculates optimum processing order based on input/output sizes and filter support. + * + * The sizes of intermediate storage and time taken to compute the intermediate images + * may depend on the order - i.e. if downscaling only one axis, it's beneficial to resample that + * axis first, so that intermediate image is smaller. 
+ */ +template +class ProcessingOrderCalculator + +{ +public: + static constexpr float size_bias = 3; + + ProcessingOrderCalculator(const VecI inSize, const VecI outSize, const VecI filterSupport) + : m_inSize(inSize) + , m_outSize(outSize) + , m_filterSupport(filterSupport) + { + } + + VecI operator()() + { + for (int i = 0; i < ndim; i++) cuda::GetElement(m_bestOrder, i) = i; + m_axisVisited = {}; + m_currSize = m_inSize; + m_minCost = 1e+30f; + Run(0); + return m_bestOrder; + } + +private: + // recursively check every possible order in DFS fashion + void Run(int pass, float totalCost = 0) + { + if (totalCost >= m_minCost) + return; // this branch of recursion will not yield a better result - abandon it + + if (pass == ndim) + { + m_minCost = totalCost; + m_bestOrder = m_currOrder; + } + else + { + for (int a = 0; a < ndim; a++) + { + if (cuda::GetElement(m_axisVisited, a)) + continue; + cuda::GetElement(m_axisVisited, a) = true; + cuda::GetElement(m_currOrder, pass) = a; + auto prevSize = cuda::GetElement(m_currSize, a); + cuda::GetElement(m_currSize, a) = cuda::GetElement(m_outSize, a); + + float passCost = PassCost(pass, a); + Run(pass + 1, totalCost + passCost); + + cuda::GetElement(m_currSize, a) = prevSize; + cuda::GetElement(m_axisVisited, a) = false; + } + } + } + + float PassCost(int pass, int axis) + { + // y-axis is likely to be the cheapest + float axisCost = axis == 0 ? 1.4f : axis > 1 ? 1.2f : 1.0f; + auto vol = utils::Volume(m_currSize); + float baseComputeCost = cuda::GetElement(m_filterSupport, axis) * vol; + return axisCost * baseComputeCost + vol * size_bias; + } + + const VecI m_inSize, m_outSize, m_filterSupport; + float m_minCost; + VecI m_currSize, m_bestOrder, m_currOrder, m_axisVisited; +}; + +template +inline void RunTypedSwitch(nvcv::DataType srcDtype, nvcv::DataType dstDtype, int numChannels, const Cb &cb) +{ + using uchar = unsigned char; + +#define NVCV_RUN_DYNAMIC_CHANNELS_HQ_RESIZE(SRC_TYPE_NAME, DST_TYPE_NAME, SRC_VEC, DST_VEC) \ + ((srcDtype == nvcv::TYPE_##SRC_TYPE_NAME) && (dstDtype == nvcv::TYPE_##DST_TYPE_NAME)) \ + cb(SRC_VEC{}, IntermediateBaseT{}, DST_VEC{}, std::integral_constant{}) + +#define NVCV_RUN_SINGLE_CHANNEL_HQ_RESIZE(NUM_STATIC_CHANNELS, SRC_TYPE_NAME, DST_TYPE_NAME, SRC_VEC, DST_VEC) \ + ((numChannels == NUM_STATIC_CHANNELS) && (srcDtype == nvcv::TYPE_##SRC_TYPE_NAME) \ + && (dstDtype == nvcv::TYPE_##DST_TYPE_NAME)) \ + cb(SRC_VEC{}, IntermediateBaseT{}, DST_VEC{}, std::integral_constant{}) + +#define NVCV_RUN_MULTI_STATIC_CHANNEL_HQ_RESIZE(NUM_STATIC_CHANNELS, SRC_TYPE_NAME, DST_TYPE_NAME, SRC_VEC, DST_VEC) \ + ((numChannels == NUM_STATIC_CHANNELS) \ + && (srcDtype == nvcv::TYPE_##SRC_TYPE_NAME || srcDtype == nvcv::TYPE_##NUM_STATIC_CHANNELS##SRC_TYPE_NAME) \ + && (dstDtype == nvcv::TYPE_##DST_TYPE_NAME || dstDtype == nvcv::TYPE_##NUM_STATIC_CHANNELS##DST_TYPE_NAME)) \ + cb(SRC_VEC##NUM_STATIC_CHANNELS{}, Vec{}, \ + DST_VEC##NUM_STATIC_CHANNELS{}, std::integral_constant{}) + + // clang-format off + if NVCV_RUN_SINGLE_CHANNEL_HQ_RESIZE(1, U8, U8, uchar, uchar); + else if NVCV_RUN_MULTI_STATIC_CHANNEL_HQ_RESIZE(2, U8, U8, uchar, uchar); + else if NVCV_RUN_MULTI_STATIC_CHANNEL_HQ_RESIZE(3, U8, U8, uchar, uchar); + else if NVCV_RUN_MULTI_STATIC_CHANNEL_HQ_RESIZE(4, U8, U8, uchar, uchar); + else if NVCV_RUN_DYNAMIC_CHANNELS_HQ_RESIZE(U8, U8, uchar, uchar); + + else if NVCV_RUN_SINGLE_CHANNEL_HQ_RESIZE(1, U8, F32, uchar, float); + else if NVCV_RUN_MULTI_STATIC_CHANNEL_HQ_RESIZE(2, U8, F32, uchar, float); + else if 
NVCV_RUN_MULTI_STATIC_CHANNEL_HQ_RESIZE(3, U8, F32, uchar, float); + else if NVCV_RUN_MULTI_STATIC_CHANNEL_HQ_RESIZE(4, U8, F32, uchar, float); + else if NVCV_RUN_DYNAMIC_CHANNELS_HQ_RESIZE(U8, F32, uchar, float); + + else if NVCV_RUN_SINGLE_CHANNEL_HQ_RESIZE(1, S16, S16, short, short); + else if NVCV_RUN_MULTI_STATIC_CHANNEL_HQ_RESIZE(2, S16, S16, short, short); + else if NVCV_RUN_MULTI_STATIC_CHANNEL_HQ_RESIZE(3, S16, S16, short, short); + else if NVCV_RUN_MULTI_STATIC_CHANNEL_HQ_RESIZE(4, S16, S16, short, short); + else if NVCV_RUN_DYNAMIC_CHANNELS_HQ_RESIZE(S16, S16, short, short); + + else if NVCV_RUN_SINGLE_CHANNEL_HQ_RESIZE(1, S16, F32, short, float); + else if NVCV_RUN_MULTI_STATIC_CHANNEL_HQ_RESIZE(2, S16, F32, short, float); + else if NVCV_RUN_MULTI_STATIC_CHANNEL_HQ_RESIZE(3, S16, F32, short, float); + else if NVCV_RUN_MULTI_STATIC_CHANNEL_HQ_RESIZE(4, S16, F32, short, float); + else if NVCV_RUN_DYNAMIC_CHANNELS_HQ_RESIZE(S16, F32, short, float); + + else if NVCV_RUN_SINGLE_CHANNEL_HQ_RESIZE(1, U16, U16, ushort, ushort); + else if NVCV_RUN_MULTI_STATIC_CHANNEL_HQ_RESIZE(2, U16, U16, ushort, ushort); + else if NVCV_RUN_MULTI_STATIC_CHANNEL_HQ_RESIZE(3, U16, U16, ushort, ushort); + else if NVCV_RUN_MULTI_STATIC_CHANNEL_HQ_RESIZE(4, U16, U16, ushort, ushort); + else if NVCV_RUN_DYNAMIC_CHANNELS_HQ_RESIZE(U16, U16, ushort, ushort); + + else if NVCV_RUN_SINGLE_CHANNEL_HQ_RESIZE(1, U16, F32, ushort, float); + else if NVCV_RUN_MULTI_STATIC_CHANNEL_HQ_RESIZE(2, U16, F32, ushort, float); + else if NVCV_RUN_MULTI_STATIC_CHANNEL_HQ_RESIZE(3, U16, F32, ushort, float); + else if NVCV_RUN_MULTI_STATIC_CHANNEL_HQ_RESIZE(4, U16, F32, ushort, float); + else if NVCV_RUN_DYNAMIC_CHANNELS_HQ_RESIZE(U16, F32, ushort, float); + + else if NVCV_RUN_SINGLE_CHANNEL_HQ_RESIZE(1, F32, F32, float, float); + else if NVCV_RUN_MULTI_STATIC_CHANNEL_HQ_RESIZE(2, F32, F32, float, float); + else if NVCV_RUN_MULTI_STATIC_CHANNEL_HQ_RESIZE(3, F32, F32, float, float); + else if NVCV_RUN_MULTI_STATIC_CHANNEL_HQ_RESIZE(4, F32, F32, float, float); + else if NVCV_RUN_DYNAMIC_CHANNELS_HQ_RESIZE(F32, F32, float, float); + else + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "Unsupported input/output types. The resize operator supports the " + "following types: uint8, int16, uint16, and float32. 
" + "The output type must be same as the input type or float."); + } +// clang-format on +#undef NVCV_RUN_DYNAMIC_CHANNELS_HQ_RESIZE +#undef NVCV_RUN_SINGLE_CHANNEL_HQ_RESIZE +#undef NVCV_RUN_MULTI_STATIC_CHANNEL_HQ_RESIZE +} + +template +class HQResizeRun +{ +public: + static_assert(_kSpatialNDim == 2 || _kSpatialNDim == 3, + "Currently, the resampling operator supports only 2 or 3 spatial dimensions"); + + HQResizeRun(const filter::ResamplingFiltersFactory &filtersFactory) + : m_filtersFactory{filtersFactory} + { + } + + using SampleDescT = resampling::SampleDesc<_kSpatialNDim>; + static_assert(std::is_trivially_copyable_v); + using DynamicBatchWrapMeta = batch_wrapper::dynamic::DynamicBatchWrapMeta; + + static constexpr VecI<3> kBlockDim = {32, 8, 1}; + static constexpr int kSpatialNDim = _kSpatialNDim; + // the number of buffers for intermediate results + static constexpr int kNumTmpBuffers = kSpatialNDim - 1; + // use alignment suitable for maximal supported number of static channels + static constexpr int kIntermediateAlignment = alignof(Vec); + + // Computes workspace requierements for calling the operator with tensor (uniform batch) input/output + cvcuda::WorkspaceRequirements getWorkspaceRequirements(int numSamples, const HQResizeTensorShapeI inputShape, + const HQResizeTensorShapeI outputShape, + const NVCVInterpolationType minInterpolation, + const NVCVInterpolationType magInterpolation, + const bool antialias, const HQResizeRoiF *roi) const + { + validate::inOutNumberOfChannels(inputShape, outputShape); + validate::sameInOutNdim(inputShape, outputShape); + + SampleDescT sampleDesc; + VecI srcShape = shape::TensorShape(inputShape); + VecI dstShape = shape::TensorShape(outputShape); + int numChannels = inputShape.numChannels; + auto [minFilter, magFilter] = filter::GetFilterModes(minInterpolation, magInterpolation, antialias); + SetupSampleDesc(sampleDesc, srcShape, dstShape, numChannels, roi, minFilter, magFilter); + + cvcuda::WorkspaceEstimator est; + for (int t = 0; t < kNumTmpBuffers; t++) + { + // the vectorized alignment may or may not be needed, depending on the number of channels + est.addCuda(GetPassOutputVolume(sampleDesc, t) * numSamples, kIntermediateAlignment); + } + + cvcuda::WorkspaceRequirements req{}; + req.hostMem = est.hostMem.req; + req.pinnedMem = est.pinnedMem.req; + req.cudaMem = est.cudaMem.req; + + // The allocator requries the total size of the allocation to be aligned + cvcuda::AlignUp(req); + return req; + } + + // Computes workspace requirements for calling the operator with TensorBatch/ImageBatchVarShape input/output + cvcuda::WorkspaceRequirements getWorkspaceRequirements(int numSamples, const HQResizeTensorShapesI inputShapes, + const HQResizeTensorShapesI outputShapes, + const NVCVInterpolationType minInterpolation, + const NVCVInterpolationType magInterpolation, + const bool antialias, const HQResizeRoisF rois) const + { + validate::roiBatch(numSamples, kSpatialNDim, rois); + validate::inOutShapes(numSamples, inputShapes, outputShapes); + auto [minFilter, magFilter] = filter::GetFilterModes(minInterpolation, magInterpolation, antialias); + + size_t intermediateSizes[kNumTmpBuffers]{}; + for (int sampleIdx = 0; sampleIdx < numSamples; sampleIdx++) + { + const VecI srcShape = shape::SampleShape(inputShapes, sampleIdx); + const VecI dstShape = shape::SampleShape(outputShapes, sampleIdx); + const HQResizeRoiF *sampleRoi = shape::SampleRoi(rois, sampleIdx); + int numChannels + = inputShapes.numChannels < 0 ? 
inputShapes.shape[sampleIdx].numChannels : inputShapes.numChannels; + + SampleDescT sampleDesc; + SetupSampleDesc(sampleDesc, srcShape, dstShape, numChannels, sampleRoi, minFilter, magFilter); + for (int t = 0; t < kNumTmpBuffers; t++) + { + intermediateSizes[t] += GetPassOutputVolume(sampleDesc, t); + } + } + + cvcuda::WorkspaceEstimator est; + est.addPinned(numSamples); + est.addCuda(numSamples); + + // reserve space for pointers and strides for intermediate wrappers + for (int t = 0; t < kNumTmpBuffers; t++) + { + batch_wrapper::dynamic::AddDynamicBatchWrapMeta(est, numSamples); + } + for (int t = 0; t < kNumTmpBuffers; t++) + { + // the vectorized alignment may or may not be needed, depending on the number of channels + est.addCuda(intermediateSizes[t], kIntermediateAlignment); + } + + cvcuda::WorkspaceRequirements req{}; + req.hostMem = est.hostMem.req; + req.pinnedMem = est.pinnedMem.req; + req.cudaMem = est.cudaMem.req; + // The allocator requries the total size of the allocation to be aligned + cvcuda::AlignUp(req); + + return req; + } + + // Computes upper bound for workspace requirements, i.e. the workspace that meets the computed requirements + // can be passed to the call with any type of input/output as long as there are no more than maxBatchSize + // samples that do not exceed the maxShape (in the input nor in the output). + cvcuda::WorkspaceRequirements getWorkspaceRequirements(int maxNumSamples, const HQResizeTensorShapeI maxShape) const + { + validate::inOutNumberOfChannels(maxShape, maxShape); + + cvcuda::WorkspaceEstimator est; + est.addPinned(maxNumSamples); + est.addCuda(maxNumSamples); + + // reserve space for pointers and strides for intermediate wrappers + for (int t = 0; t < kNumTmpBuffers; t++) + { + batch_wrapper::dynamic::AddDynamicBatchWrapMeta(est, maxNumSamples); + } + VecI shape = shape::TensorShape(maxShape); + for (int t = 0; t < kNumTmpBuffers; t++) + { + size_t numElements = utils::Volume(shape) * maxNumSamples * maxShape.numChannels; + est.addCuda(numElements, kIntermediateAlignment); + } + + cvcuda::WorkspaceRequirements req{}; + req.hostMem = est.hostMem.req; + req.pinnedMem = est.pinnedMem.req; + req.cudaMem = est.cudaMem.req; + // The allocator requries the total size of the allocation to be aligned + cvcuda::AlignUp(req); + + return req; + } + + void operator()(cudaStream_t stream, const cvcuda::Workspace &ws, const nvcv::Tensor &src, const nvcv::Tensor &dst, + const NVCVInterpolationType minInterpolation, const NVCVInterpolationType magInterpolation, + const bool antialias, const HQResizeRoiF *roi) const + { + auto tensorAccess = validate::srcDst(src, dst); + auto &[srcAccess, dstAccess, numSamples, numChannels, srcDtype, dstDtype] = tensorAccess; + + SampleDescT sampleDesc; + VecI srcShape = shape::TensorShape(src); + VecI dstShape = shape::TensorShape(dst); + const auto [minFilter, magFilter] = filter::GetFilterModes(minInterpolation, magInterpolation, antialias); + SetupSampleDesc(sampleDesc, srcShape, dstShape, numChannels, roi, minFilter, magFilter); + + cvcuda::WorkspaceAllocator allocator(ws); + if (ws.cudaMem.ready != nullptr) + { + NVCV_CHECK_THROW(cudaStreamWaitEvent(stream, ws.cudaMem.ready)); + } + IntermediateBaseT *intermediate[kNumTmpBuffers]; + // Get intermediate buffers + for (int t = 0; t < kNumTmpBuffers; t++) + { + intermediate[t] = allocator.getCuda(GetPassOutputVolume(sampleDesc, t) * numSamples, + kIntermediateAlignment); + } + + RunTypedSwitch( + srcDtype, dstDtype, numChannels, + [&](auto dummySrcVal, auto 
intermediateVal, auto dummyDstVal, auto numChannelsVal) + { + using InT = decltype(dummySrcVal); + using IntermediateT = decltype(intermediateVal); + using OutT = decltype(dummyDstVal); + constexpr int numStaticChannels = decltype(numChannelsVal)::value; + static_assert(numStaticChannels == -1 || numStaticChannels == cuda::NumElements); + static_assert(cuda::NumElements == cuda::NumElements); + static_assert(cuda::NumElements == cuda::NumElements); + + auto &[srcAccess, dstAccess, numSamples, numChannels, srcDtype, dstDtype] = tensorAccess; + RunPasses(sampleDesc, *dstAccess, *srcAccess, intermediate, + numSamples, ws, stream); + }); + } + + template + void operator()(cudaStream_t stream, const cvcuda::Workspace &ws, const BatchContainer &src, + const BatchContainer &dst, const NVCVInterpolationType minInterpolation, + const NVCVInterpolationType magInterpolation, const bool antialias, const HQResizeRoisF rois) const + { + int numSamples; + int uniqueNumChannels; // numChannels for ImageBatchVarShape, -1 for TensorBatch + nvcv::DataType srcDtype, dstDtype; + validate::srcDst(numSamples, uniqueNumChannels, srcDtype, dstDtype, src, dst); + validate::roiBatch(numSamples, kSpatialNDim, rois); + + const auto [minFilter, magFilter] = filter::GetFilterModes(minInterpolation, magInterpolation, antialias); + cvcuda::WorkspaceAllocator allocator(ws); + if (ws.pinnedMem.ready != nullptr) + { + NVCV_CHECK_THROW(cudaEventSynchronize(ws.pinnedMem.ready)); + } + if (ws.cudaMem.ready != nullptr) + { + NVCV_CHECK_THROW(cudaStreamWaitEvent(stream, ws.cudaMem.ready)); + } + SampleDescT *sampleDescsCpu = allocator.getPinned(numSamples); + SampleDescT *sampleDescsGpu = allocator.getCuda(numSamples); + size_t intermediateSizes[kNumTmpBuffers]{}; + for (int sampleIdx = 0; sampleIdx < numSamples; sampleIdx++) + { + const VecI srcShape = shape::SampleShape(src, sampleIdx); + const VecI dstShape = shape::SampleShape(dst, sampleIdx); + const HQResizeRoiF *sampleRoi = shape::SampleRoi(rois, sampleIdx); + int numChannels; + if constexpr (std::is_same_v) + { + numChannels = uniqueNumChannels; + } + else if constexpr (!std::is_same_v) + { + static_assert(std::is_same_v); + numChannels = shape::SampleNumChannels(src, dst, sampleIdx); + } + SampleDescT &sampleDesc = sampleDescsCpu[sampleIdx]; + SetupSampleDesc(sampleDesc, srcShape, dstShape, numChannels, sampleRoi, minFilter, magFilter); + for (int t = 0; t < kNumTmpBuffers; t++) + { + intermediateSizes[t] += GetPassOutputVolume(sampleDesc, t); + } + } + NVCV_CHECK_THROW(cudaMemcpyAsync(sampleDescsGpu, sampleDescsCpu, numSamples * sizeof(SampleDescT), + cudaMemcpyHostToDevice, stream)); + + // allocate space for pointers and strides for intermediate wrappers + DynamicBatchWrapMeta intermediateMeta[kNumTmpBuffers]; + IntermediateBaseT *intermediate[kNumTmpBuffers]; + for (int t = 0; t < kNumTmpBuffers; t++) + { + intermediateMeta[t] = batch_wrapper::dynamic::AllocateDynamicBatchWrapMeta(allocator, numSamples); + } + // allocate space for intermediate data + for (int t = 0; t < kNumTmpBuffers; t++) + { + intermediate[t] = allocator.getCuda(intermediateSizes[t], kIntermediateAlignment); + } + + RunTyped(sampleDescsCpu, sampleDescsGpu, src, dst, intermediate, intermediateMeta, numSamples, srcDtype, + dstDtype, uniqueNumChannels, ws, stream); + } + +private: + void RunTyped(const SampleDescT *sampleDescsCpu, const SampleDescT *sampleDescsGpu, + const nvcv::ImageBatchVarShape &src, const nvcv::ImageBatchVarShape &dst, + IntermediateBaseT *intermediate[kNumTmpBuffers], + const 
DynamicBatchWrapMeta intermediateMeta[kNumTmpBuffers], int numSamples, + const nvcv::DataType srcDtype, const nvcv::DataType dstDtype, int uniqueNumChannels, + const cvcuda::Workspace &ws, cudaStream_t stream) const + { + static_assert(kSpatialNDim == 2, "ImageBatchVarShape does not support 3D spatial resampling"); + + auto srcData = src.exportData(stream); + auto dstData = dst.exportData(stream); + if (!srcData) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "Input must be cuda-accessible, varshape pitch-linear image batch"); + } + + if (!dstData) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "Output must be cuda-accessible, varshape pitch-linear image batch"); + } + + RunTypedSwitch( + srcDtype, dstDtype, uniqueNumChannels, + [&](auto dummySrcVal, auto intermediateVal, auto dummyDstVal, auto numChannelsVal) + { + using InT = decltype(dummySrcVal); + using IntermediateT = decltype(intermediateVal); + using OutT = decltype(dummyDstVal); + constexpr int numStaticChannels = decltype(numChannelsVal)::value; + if constexpr (numStaticChannels == -1) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "Unsupported number of channels for ImageBatchVarShape input."); + } + else if constexpr (numStaticChannels != -1) + { + static_assert(numStaticChannels == cuda::NumElements); + static_assert(cuda::NumElements == cuda::NumElements); + static_assert(cuda::NumElements == cuda::NumElements); + RunPasses(sampleDescsCpu, sampleDescsGpu, *dstData, + *srcData, intermediate, intermediateMeta, + numSamples, ws, stream); + } + }); + } + + void RunTyped(const SampleDescT *sampleDescsCpu, const SampleDescT *sampleDescsGpu, const nvcv::TensorBatch &src, + const nvcv::TensorBatch &dst, IntermediateBaseT *intermediate[kNumTmpBuffers], + const DynamicBatchWrapMeta intermediateMeta[kNumTmpBuffers], int numSamples, + const nvcv::DataType srcDtype, const nvcv::DataType dstDtype, int uniqueNumChannels, + const cvcuda::Workspace &ws, cudaStream_t stream) const + { + // Other cointainer allow exporting data with const qualifiers + const auto srcData + = const_cast(src).exportData(stream).cast(); + const auto dstData + = const_cast(dst).exportData(stream).cast(); + + if (!srcData) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "Input must be cuda-accessible, varshape pitch-linear image batch"); + } + + if (!dstData) + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "Output must be cuda-accessible, varshape pitch-linear image batch"); + } + + uniqueNumChannels = -1; + for (int sampleIdx = 0; sampleIdx < numSamples; sampleIdx++) + { + if (sampleIdx == 0) + { + uniqueNumChannels = sampleDescsCpu[sampleIdx].channels; + } + else if (uniqueNumChannels != sampleDescsCpu[sampleIdx].channels) + { + uniqueNumChannels = -1; + break; + } + } + + RunTypedSwitch( + srcDtype, dstDtype, uniqueNumChannels, + [&](auto dummySrcVal, auto intermediateVal, auto dummyDstVal, auto numChannelsVal) + { + using InT = decltype(dummySrcVal); + using IntermediateT = decltype(intermediateVal); + using OutT = decltype(dummyDstVal); + constexpr int numStaticChannels = decltype(numChannelsVal)::value; + static_assert(numStaticChannels == -1 || numStaticChannels == cuda::NumElements); + static_assert(cuda::NumElements == cuda::NumElements); + static_assert(cuda::NumElements == cuda::NumElements); + + RunPasses(sampleDescsCpu, sampleDescsGpu, *dstData, + *srcData, intermediate, intermediateMeta, + numSamples, ws, stream); + }); + } + + template + 
std::enable_if_t RunPasses(const SampleDescT &sampleDesc, + const nvcv::TensorDataAccessStridedImagePlanar &dstAccess, + const nvcv::TensorDataAccessStridedImagePlanar &srcAccess, + IntermediateBaseT *intermediate[kNumTmpBuffers], int numSamples, + const cvcuda::Workspace &ws, cudaStream_t stream) const + { + static_assert(kSpatialNDim == 2); + constexpr bool kHasDynamicChannels = kNumStaticChannels == -1; + // sample extent, spatial extents, optional dynamic channel extent + constexpr int kWrapNDim = 1 + kSpatialNDim + kHasDynamicChannels; + using OutWrap = cuda::TensorNDWrap; + using InWrap = cuda::TensorNDWrap; + using InterWrap = cuda::TensorNDWrap; + static_assert(std::is_trivially_copyable_v); + static_assert(std::is_trivially_copyable_v); + static_assert(std::is_trivially_copyable_v); + const OutWrap outWrap = batch_wrapper::tensor::WrapTensor(dstAccess); + const InWrap inWrap = batch_wrapper::tensor::WrapTensor( + srcAccess, sampleDesc.inRoiOffset); + const InterWrap interWrap = batch_wrapper::tensor::CreateDenseWrap( + intermediate[0], sampleDesc.channels, sampleDesc.shapes[1]); + RunPass(sampleDesc, interWrap, inWrap, numSamples, stream); + RunPass(sampleDesc, outWrap, interWrap, numSamples, stream); + if (ws.cudaMem.ready != nullptr) + { + NVCV_CHECK_THROW(cudaEventRecord(ws.cudaMem.ready, stream)); + } + } + + template + std::enable_if_t RunPasses(const SampleDescT &sampleDesc, + const nvcv::TensorDataAccessStridedImagePlanar &dstAccess, + const nvcv::TensorDataAccessStridedImagePlanar &srcAccess, + IntermediateBaseT *intermediate[kNumTmpBuffers], int numSamples, + const cvcuda::Workspace &ws, cudaStream_t stream) const + { + static_assert(kSpatialNDim == 3); + constexpr bool kHasDynamicChannels = kNumStaticChannels == -1; + // sample extent, spatial extents, optional dynamic channel extent + constexpr int kWrapNDim = 1 + kSpatialNDim + kHasDynamicChannels; + using OutWrap = cuda::TensorNDWrap; + using InWrap = cuda::TensorNDWrap; + using InterWrap = cuda::TensorNDWrap; + static_assert(std::is_trivially_copyable_v); + static_assert(std::is_trivially_copyable_v); + static_assert(std::is_trivially_copyable_v); + const OutWrap outWrap = batch_wrapper::tensor::WrapTensor(dstAccess); + const InWrap inWrap = batch_wrapper::tensor::WrapTensor( + srcAccess, sampleDesc.inRoiOffset); + const InterWrap interWrap0 = batch_wrapper::tensor::CreateDenseWrap( + intermediate[0], sampleDesc.channels, sampleDesc.shapes[1]); + const InterWrap interWrap1 = batch_wrapper::tensor::CreateDenseWrap( + intermediate[1], sampleDesc.channels, sampleDesc.shapes[2]); + RunPass(sampleDesc, interWrap0, inWrap, numSamples, stream); + RunPass(sampleDesc, interWrap1, interWrap0, numSamples, stream); + RunPass(sampleDesc, outWrap, interWrap1, numSamples, stream); + if (ws.cudaMem.ready != nullptr) + { + NVCV_CHECK_THROW(cudaEventRecord(ws.cudaMem.ready, stream)); + } + } + + template + std::enable_if_t RunPasses(const SampleDescT *sampleDescsCpu, const SampleDescT *sampleDescsGpu, + const BatchDataStridedCuda &dstData, const BatchDataStridedCuda &srcData, + IntermediateBaseT *intermediate[kNumTmpBuffers], + const DynamicBatchWrapMeta intermediateMeta[kNumTmpBuffers], int numSamples, + const cvcuda::Workspace &ws, cudaStream_t stream) const + { + static_assert(kSpatialNDim == 2); + constexpr bool kHasDynamicChannels = kNumStaticChannels == -1; + // sample extent, spatial extents, optional dynamic channel extent + constexpr int kWrapNDim = 1 + kSpatialNDim + kHasDynamicChannels; + using BatchWrapOutT + = 
std::conditional_t, + batch_wrapper::ImageBatchVarShapeWrapAdapter, + batch_wrapper::TensorBatchWrapAdapter>; + using BatchWrapInT + = std::conditional_t, + batch_wrapper::ImageBatchVarShapeWrapAdapter, + batch_wrapper::TensorBatchWrapAdapter>; + using DynamicBatchWrap = batch_wrapper::dynamic::DynamicBatchWrap; + static_assert(std::is_trivially_copyable_v); + static_assert(std::is_trivially_copyable_v); + static_assert(std::is_trivially_copyable_v); + const BatchWrapOutT outWrap(dstData); + const BatchWrapInT inWrap(srcData); + const DynamicBatchWrap intermediateWrap + = batch_wrapper::dynamic::CreateDynamicBatchWrap( + 0, intermediate[0], intermediateMeta[0], sampleDescsCpu, numSamples, stream); + if (ws.pinnedMem.ready != nullptr) + { + NVCV_CHECK_THROW(cudaEventRecord(ws.pinnedMem.ready, stream)); + } + RunPass(sampleDescsCpu, sampleDescsGpu, intermediateWrap, inWrap, numSamples, stream); + RunPass(sampleDescsCpu, sampleDescsGpu, outWrap, intermediateWrap, numSamples, stream); + if (ws.cudaMem.ready != nullptr) + { + NVCV_CHECK_THROW(cudaEventRecord(ws.cudaMem.ready, stream)); + } + } + + template + std::enable_if_t RunPasses(const SampleDescT *sampleDescsCpu, const SampleDescT *sampleDescsGpu, + const nvcv::TensorBatchDataStridedCuda &dstData, + const nvcv::TensorBatchDataStridedCuda &srcData, + IntermediateBaseT *intermediate[kNumTmpBuffers], + const DynamicBatchWrapMeta intermediateMeta[kNumTmpBuffers], int numSamples, + const cvcuda::Workspace &ws, cudaStream_t stream) const + { + static_assert(kSpatialNDim == 3); + constexpr bool kHasDynamicChannels = kNumStaticChannels == -1; + // sample extent, spatial extents, optional dynamic channel extent + constexpr int kWrapNDim = 1 + kSpatialNDim + kHasDynamicChannels; + using TensorBatchWrapOutT = batch_wrapper::TensorBatchWrapAdapter; + using TensorBatchWrapInT = batch_wrapper::TensorBatchWrapAdapter; + using DynamicBatchWrap = batch_wrapper::dynamic::DynamicBatchWrap; + static_assert(std::is_trivially_copyable_v); + static_assert(std::is_trivially_copyable_v); + static_assert(std::is_trivially_copyable_v); + const TensorBatchWrapOutT outWrap(dstData); + const TensorBatchWrapInT inWrap(srcData); + const DynamicBatchWrap intermediateWrap0 + = batch_wrapper::dynamic::CreateDynamicBatchWrap( + 0, intermediate[0], intermediateMeta[0], sampleDescsCpu, numSamples, stream); + const DynamicBatchWrap intermediateWrap1 + = batch_wrapper::dynamic::CreateDynamicBatchWrap( + 1, intermediate[1], intermediateMeta[1], sampleDescsCpu, numSamples, stream); + if (ws.pinnedMem.ready != nullptr) + { + NVCV_CHECK_THROW(cudaEventRecord(ws.pinnedMem.ready, stream)); + } + RunPass(sampleDescsCpu, sampleDescsGpu, intermediateWrap0, inWrap, numSamples, stream); + RunPass(sampleDescsCpu, sampleDescsGpu, intermediateWrap1, intermediateWrap0, numSamples, + stream); + RunPass(sampleDescsCpu, sampleDescsGpu, outWrap, intermediateWrap1, numSamples, stream); + if (ws.cudaMem.ready != nullptr) + { + NVCV_CHECK_THROW(cudaEventRecord(ws.cudaMem.ready, stream)); + } + } + + template + void RunPass(const SampleDescT &sampleDesc, const PassOutWrap &outWrap, const PassInWrap &inWrap, int numSamples, + cudaStream_t stream) const + { + using GridHelperT = resampling::GridHelper; + + VecI numBlocks; + { + VecI outputShape = sampleDesc.shapes[kWhichPass + 1]; + VecI blockShape = sampleDesc.blockShape[kWhichPass]; + numBlocks = utils::DivCeil(outputShape, blockShape); + if (utils::Volume(numBlocks) == 0) + { + return; + } + } + + GridHelperT gridHelper{numBlocks, numSamples}; + 
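// The grid helper ties the CUDA launch grid to pairs of (sample index, output block region); + // inside SeparableResamplingKernel they are recovered via CurrentSample() and CurrentBlock(). +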
dim3 block(kBlockDim.x, kBlockDim.y, kBlockDim.z); + dim3 grid = gridHelper.GetKernelGrid(); + const auto devGridHelper = gridHelper.GetDeviceGridHelper(); + + int sharedMemSize = RequiredSharedMemorySize(sampleDesc, kWhichPass); + resampling::SeparableResamplingKernel + <<>>(sampleDesc, outWrap, inWrap, devGridHelper); + NVCV_CHECK_THROW(cudaGetLastError()); + } + + template + void RunPass(const SampleDescT *sampleDescsCpu, const SampleDescT *sampleDescsGpu, const PassOutWrap &outWrap, + const PassInWrap &inWrap, int numSamples, cudaStream_t stream) const + { + using GridHelperT = resampling::GridHelper; + + int maxSharedMemSize = 0; + VecI maxNumBlocks{}; + for (int sampleIdx = 0; sampleIdx < numSamples; sampleIdx++) + { + const SampleDescT &sampleDesc = sampleDescsCpu[sampleIdx]; + int sharedMemSize = RequiredSharedMemorySize(sampleDesc, kWhichPass); + maxSharedMemSize = std::max(maxSharedMemSize, sharedMemSize); + + VecI outputShape = sampleDesc.shapes[kWhichPass + 1]; + VecI blockShape = sampleDesc.blockShape[kWhichPass]; + VecI numBlocks = utils::DivCeil(outputShape, blockShape); + maxNumBlocks = cuda::max(maxNumBlocks, numBlocks); + } + + if (utils::Volume(maxNumBlocks) == 0) + { + return; + } + + GridHelperT gridHelper{maxNumBlocks, numSamples}; + dim3 block(kBlockDim.x, kBlockDim.y, kBlockDim.z); + dim3 grid = gridHelper.GetKernelGrid(); + const auto devGridHelper = gridHelper.GetDeviceGridHelper(); + + resampling::SeparableResamplingKernel + <<>>(sampleDescsGpu, outWrap, inWrap, devGridHelper); + NVCV_CHECK_THROW(cudaGetLastError()); + } + + int RequiredSharedMemorySize(const SampleDescT &sampleDesc, int whichPass) const + { + using resampling::interpolate::filter_support::RequiredSharedMemoryElements; + if (sampleDesc.filterKind[whichPass] != filter::FilterTypeKind::ShmFilter) + { + return 0; + } + int support = sampleDesc.filter[whichPass].support(); + int axis = cuda::GetElement(sampleDesc.processingOrder, whichPass); + // for depth resampling y is used as well + int resamplingAxisBlockSize = axis == 0 ? 
kBlockDim.x : kBlockDim.y; + return sizeof(IntermediateBaseT) * RequiredSharedMemoryElements(support, resamplingAxisBlockSize); + } + + void SetupSampleDesc(SampleDescT &sampleDesc, const VecI &srcShape, + const VecI &dstShape, int numChannels, const HQResizeRoiF *roi, + const filter::FilterMode &minFilter, const filter::FilterMode &magFilter) const + { + SetupSampleDescFilterShapeScale(sampleDesc, srcShape, dstShape, numChannels, minFilter, magFilter, roi); + SetupBlockLayout(sampleDesc); + } + + void SetupSampleDescFilterShapeScale(SampleDescT &sampleDesc, const VecI &inShape, + const VecI &outShape, int numChannels, + const filter::FilterMode &minFilter, const filter::FilterMode &magFilter, + const HQResizeRoiF *roi) const + { + // get user provided roi + const shape::Roi parsedRoi = ParseROI(roi, inShape); + // setup filter based on user provided filter types and the input/output size + filter::FilterTypeKind filterKinds[kSpatialNDim]; + filter::ResamplingFilter filters[kSpatialNDim]; + SetupFilters(filterKinds, filters, parsedRoi.Size(), outShape, minFilter, magFilter); + // get the ROI that is normalized (so that roiLo <= roiHi), adjusted for filter's "halo", + // and clamped to the input shape + const shape::Roi adjustedRoi = AdjustRoiForFilter(parsedRoi, inShape, filters); + VecI adjustedRoiSize = adjustedRoi.Size(); + // the processing order is a permutation that maps pass number to the axis resampled during the given pass + sampleDesc.processingOrder = SetupProcessingOrder(adjustedRoiSize, outShape, filters); + // now, use filters, roi and processingOrder to populate sample descriptor + sampleDesc.channels = numChannels; + sampleDesc.shapes[0] = inShape; + // set output shapes, scaling, roi, and relevant filters for each pass + // according to the best processingOrder of axes + { + VecI intermediateShape = adjustedRoiSize; + for (int pass = 0; pass < kSpatialNDim; pass++) + { + const int axis = cuda::GetElement(sampleDesc.processingOrder, pass); + const int axisOutShape = cuda::GetElement(outShape, axis); + const float roiStart = cuda::GetElement(parsedRoi.lo, axis); + const float roiEnd = cuda::GetElement(parsedRoi.hi, axis); + + cuda::GetElement(intermediateShape, axis) = axisOutShape; + sampleDesc.filterKind[pass] = filterKinds[axis]; + sampleDesc.filter[pass] = filters[axis]; + sampleDesc.shapes[pass + 1] = intermediateShape; + + cuda::GetElement(sampleDesc.origin, pass) = roiStart; + cuda::GetElement(sampleDesc.scale, pass) = (roiEnd - roiStart) / axisOutShape; + + // "Clamp" the axes processed in later passes to the input ROI + if (pass == 0) + { + // the first processed axis roi is handled simply with the `origin` + cuda::GetElement(sampleDesc.inRoiOffset, axis) = 0; + } + else + { + // for the axes not resampled in the first pass, we can just use an offset when accessing data + // (adjustedRoi.lo) and pretend the input shape is the adjustedRoi.Size() + cuda::GetElement(sampleDesc.shapes[0], axis) = cuda::GetElement(adjustedRoiSize, axis); + cuda::GetElement(sampleDesc.inRoiOffset, axis) = cuda::GetElement(adjustedRoi.lo, axis); + cuda::GetElement(sampleDesc.origin, pass) + -= cuda::GetElement(adjustedRoi.lo, axis); // parsedRoi.lo - adjustedRoi.lo + } + } + } + } + + /** + * @brief If the user specified the roi, it's returned with reversed dims order ((d)hw -> wh(d)), + * otherwise the input shape is used to create a whole-plane roi. + * Note that in the first case, some lo and hi may be flipped (i.e. lo[d] > hi[d]).
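+ * For example, for a 2D HW image, a user roi with lo = {10, 20} given in (h, w) order becomes + * the internal vector-order lo with lo.x = 20 and lo.y = 10.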
+ */ + shape::Roi ParseROI(const HQResizeRoiF *roi, VecI inShape) const + { + shape::Roi retRoi; + for (int dim = 0; dim < kSpatialNDim; dim++) + { + int axis = kSpatialNDim - 1 - dim; + auto axisSize = cuda::GetElement(inShape, axis); + float roiStart, roiEnd; + if (roi != nullptr) + { + roiStart = roi->lo[dim]; + roiEnd = roi->hi[dim]; + } + else + { + roiStart = 0; + roiEnd = axisSize; + } + cuda::GetElement(retRoi.lo, axis) = roiStart; + cuda::GetElement(retRoi.hi, axis) = roiEnd; + } + return retRoi; + } + + void SetupFilters(filter::FilterTypeKind filterKind[kSpatialNDim], filter::ResamplingFilter filters[kSpatialNDim], + VecF roiShape, const VecI &outShape, + const filter::FilterMode &minFilter, const filter::FilterMode &magFilter) const + { + using resampling::interpolate::filter_support::kMaxGPUFilterSupport; + static_assert(kSpatialNDim == 2 || kSpatialNDim == 3, + "Currently, the resampling operator supports only 2 or 3 spatial dimensions"); + + for (int axis = 0; axis < kSpatialNDim; axis++) + { + float inSize = std::abs(cuda::GetElement(roiShape, axis)); + float outSize = cuda::GetElement(outShape, axis); + const auto filterMode = outSize < inSize ? minFilter : magFilter; + filterKind[axis] = filter::GetFilterTypeKind(filterMode.filterType); + auto &filter = filters[axis]; + filter = filter::GetResamplingFilter(m_filtersFactory, filterMode, inSize, outSize); + + // for very small outputs, the required support may be too big for available shm + if (filter.support() > kMaxGPUFilterSupport) + { + filter.rescale(kMaxGPUFilterSupport); + } + } + } + + /** + * @brief Computes normalized ROI (i.e. so that roiLo <= roiHi), which is adjusted for filter's halo, + * converted to int and clamped to the input shape + */ + shape::Roi AdjustRoiForFilter(const shape::Roi &roi, + const VecI &inShape, + const filter::ResamplingFilter filters[kSpatialNDim]) const + { + shape::Roi ajustedRoi; + for (int axis = 0; axis < kSpatialNDim; axis++) + { + const float &axisLo = cuda::GetElement(roi.lo, axis); + const float &axisHi = cuda::GetElement(roi.hi, axis); + const auto &filter = filters[axis]; + int support = filter.numCoeffs ?
filter.support() : 1; + float adjustedAxisLo, adjustedAxisHi; + if (axisLo <= axisHi) + { + adjustedAxisLo = axisLo - filter.anchor; + adjustedAxisHi = axisHi - filter.anchor + support; + } + else + { // flipped + adjustedAxisLo = axisHi - filter.anchor; + adjustedAxisHi = axisLo - filter.anchor + support; + } + const int axisSize = cuda::GetElement(inShape, axis); + cuda::GetElement(ajustedRoi.lo, axis) + = std::max(0, std::min(axisSize, std::floor(adjustedAxisLo))); + cuda::GetElement(ajustedRoi.hi, axis) + = std::max(0, std::min(axisSize, std::ceil(adjustedAxisHi))); + } + return ajustedRoi; + } + + VecI SetupProcessingOrder(const VecI &inRoiSize, const VecI &outSize, + const filter::ResamplingFilter filters[kSpatialNDim]) const + { + VecI filterSupport; + for (int i = 0; i < kSpatialNDim; i++) + { + int support = filters[i].support(); + // NN filter has support -1, so we need the max() below + cuda::GetElement(filterSupport, i) = std::max(1, support); + } + + return ProcessingOrderCalculator(inRoiSize, outSize, filterSupport)(); + } + + int64_t GetPassOutputVolume(SampleDescT sampleDesc, int pass) const + { + return utils::Volume(sampleDesc.shapes[pass + 1]) * sampleDesc.channels; + } + + /** + * @brief Calculates block layout for a 2D sample + * + */ + template + std::enable_if_t SetupBlockLayout(SampleDescT &sampleDesc) const + { + static_assert(kSpatialNDim == 2); + int lanes = resampling::GetResizeBlockLanes(); + for (int pass = 0; pass < kSpatialNDim; pass++) + { + int resamplingAxis = cuda::GetElement(sampleDesc.processingOrder, pass); + // The threadblock is (kBlockDim.x, kBlockDim.y) for all passes. + // In horizontal pass (resamplingAxis == 0), a single block will + // process output slice of (kBlockDim.x, lanes * kBlockDim.y). + // In vertical pass (resamplingAxis == 1), each block will handle + // output slice of (kBlockDim.x * lanes, kBlockDim.y). + VecI<2> blockShape{kBlockDim.x, kBlockDim.y}; + cuda::GetElement(blockShape, 1 - resamplingAxis) *= lanes; + auto outputShape = sampleDesc.shapes[pass + 1]; + sampleDesc.blockShape[pass] = cuda::clamp(blockShape, VecI<2>{1, 1}, outputShape); + } + } + + /** + * @brief Calculates block layout for a 3D sample + */ + template + std::enable_if_t SetupBlockLayout(SampleDescT &sampleDesc) const + { + static_assert(kSpatialNDim == 3); + int lanes = resampling::GetResizeBlockLanes(); + for (int pass = 0; pass < kSpatialNDim; pass++) + { + auto outputShape = sampleDesc.shapes[pass + 1]; + int resamplingAxis = cuda::GetElement(sampleDesc.processingOrder, pass); + if (resamplingAxis < 2) + { + VecI<3> blockShape{kBlockDim.x, kBlockDim.y, kBlockDim.z * lanes}; + sampleDesc.blockShape[pass] = cuda::clamp(blockShape, VecI<3>{1, 1, 1}, outputShape); + } + else + { + assert(resamplingAxis == 2); + VecI<3> blockShape{kBlockDim.x, kBlockDim.z * lanes, kBlockDim.y}; + sampleDesc.blockShape[pass] = cuda::clamp(blockShape, VecI<3>{1, 1, 1}, outputShape); + } + } + } + + const filter::ResamplingFiltersFactory &m_filtersFactory; +}; +} // namespace + +namespace cvcuda::priv { +namespace hq_resize { + +// Implements the IHQResizeImpl interface and keeps the filters factory with initialized +// supports. The actual implementation is in a stateless HQResizeRun that is parametrized +// with the number of resampled dimensions.
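+// Typical call sequence (illustrative sketch; `impl`, `minInterp`, `magInterp` and `AllocateWorkspace` are +// placeholder names, the workspace can come from any allocation that satisfies the returned requirements): +// auto req = impl.getWorkspaceRequirements(batchSize, inShape, outShape, minInterp, magInterp, antialias, roi); +// cvcuda::Workspace ws = AllocateWorkspace(req); +// impl(stream, ws, srcTensor, dstTensor, minInterp, magInterp, antialias, roi);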
+class HQResizeImpl final : public IHQResizeImpl +{ +public: + cvcuda::WorkspaceRequirements getWorkspaceRequirements(int numSamples, const HQResizeTensorShapeI inputShape, + const HQResizeTensorShapeI outputShape, + const NVCVInterpolationType minInterpolation, + const NVCVInterpolationType magInterpolation, bool antialias, + const HQResizeRoiF *roi) const override + { + if (inputShape.ndim == 2) + { + HQResizeRun<2> resize(m_filtersFactory); + return resize.getWorkspaceRequirements(numSamples, inputShape, outputShape, minInterpolation, + magInterpolation, antialias, roi); + } + else if (inputShape.ndim == 3) + { + HQResizeRun<3> resize(m_filtersFactory); + return resize.getWorkspaceRequirements(numSamples, inputShape, outputShape, minInterpolation, + magInterpolation, antialias, roi); + } + else + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "Only 2D or 3D resize is supported. Got unexpected number of extents to resize."); + } + } + + cvcuda::WorkspaceRequirements getWorkspaceRequirements(int numSamples, const HQResizeTensorShapesI inputShapes, + const HQResizeTensorShapesI outputShapes, + const NVCVInterpolationType minInterpolation, + const NVCVInterpolationType magInterpolation, bool antialias, + const HQResizeRoisF rois) const override + { + if (inputShapes.ndim == 2) + { + HQResizeRun<2> resize(m_filtersFactory); + return resize.getWorkspaceRequirements(numSamples, inputShapes, outputShapes, minInterpolation, + magInterpolation, antialias, rois); + } + else if (inputShapes.ndim == 3) + { + HQResizeRun<3> resize(m_filtersFactory); + return resize.getWorkspaceRequirements(numSamples, inputShapes, outputShapes, minInterpolation, + magInterpolation, antialias, rois); + } + else + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "Only 2D or 3D resize is supported. Got unexpected number of extents to resize."); + } + } + + cvcuda::WorkspaceRequirements getWorkspaceRequirements(int maxBatchSize, + const HQResizeTensorShapeI maxShape) const override + { + if (maxShape.ndim == 2) + { + HQResizeRun<2> resize(m_filtersFactory); + return resize.getWorkspaceRequirements(maxBatchSize, maxShape); + } + else if (maxShape.ndim == 3) + { + HQResizeRun<3> resize(m_filtersFactory); + return resize.getWorkspaceRequirements(maxBatchSize, maxShape); + } + else + { + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, + "Only 2D or 3D resize is supported. 
Got unexpected number of extents to resize."); + } + } + + void operator()(cudaStream_t stream, const cvcuda::Workspace &ws, const nvcv::Tensor &src, const nvcv::Tensor &dst, + const NVCVInterpolationType minInterpolation, const NVCVInterpolationType magInterpolation, + bool antialias, const HQResizeRoiF *roi) override + { + if (src.layout().find('D') < 0) + { + HQResizeRun<2> resize(m_filtersFactory); + resize(stream, ws, src, dst, minInterpolation, magInterpolation, antialias, roi); + } + else + { + HQResizeRun<3> resize(m_filtersFactory); + resize(stream, ws, src, dst, minInterpolation, magInterpolation, antialias, roi); + } + } + + void operator()(cudaStream_t stream, const cvcuda::Workspace &ws, const nvcv::ImageBatchVarShape &src, + const nvcv::ImageBatchVarShape &dst, const NVCVInterpolationType minInterpolation, + const NVCVInterpolationType magInterpolation, bool antialias, const HQResizeRoisF rois) override + { + HQResizeRun<2> resize(m_filtersFactory); + resize(stream, ws, src, dst, minInterpolation, magInterpolation, antialias, rois); + } + + void operator()(cudaStream_t stream, const cvcuda::Workspace &ws, const nvcv::TensorBatch &src, + const nvcv::TensorBatch &dst, const NVCVInterpolationType minInterpolation, + const NVCVInterpolationType magInterpolation, bool antialias, const HQResizeRoisF rois) override + { + if (src.layout().find('D') < 0) + { + HQResizeRun<2> resize(m_filtersFactory); + resize(stream, ws, src, dst, minInterpolation, magInterpolation, antialias, rois); + } + else + { + HQResizeRun<3> resize(m_filtersFactory); + resize(stream, ws, src, dst, minInterpolation, magInterpolation, antialias, rois); + } + } + +private: + filter::ResamplingFiltersFactory m_filtersFactory; +}; + +} // namespace hq_resize + +// Constructor ----------------------------------------------------------------- + +HQResize::HQResize() + +{ + m_impl = std::make_unique(); +} + +// Operator -------------------------------------------------------------------- + +// Workspace esitmation for Tensor input +cvcuda::WorkspaceRequirements HQResize::getWorkspaceRequirements(int batchSize, const HQResizeTensorShapeI inputShape, + const HQResizeTensorShapeI outputShape, + const NVCVInterpolationType minInterpolation, + const NVCVInterpolationType magInterpolation, + bool antialias, const HQResizeRoiF *roi) const +{ + return m_impl->getWorkspaceRequirements(batchSize, inputShape, outputShape, minInterpolation, magInterpolation, + antialias, roi); +} + +// Workspace esitmation for ImageBatch and TensorBatch input +cvcuda::WorkspaceRequirements HQResize::getWorkspaceRequirements(int batchSize, const HQResizeTensorShapesI inputShapes, + const HQResizeTensorShapesI outputShapes, + const NVCVInterpolationType minInterpolation, + const NVCVInterpolationType magInterpolation, + bool antialias, const HQResizeRoisF rois) const +{ + return m_impl->getWorkspaceRequirements(batchSize, inputShapes, outputShapes, minInterpolation, magInterpolation, + antialias, rois); +} + +cvcuda::WorkspaceRequirements HQResize::getWorkspaceRequirements(int maxBatchSize, + const HQResizeTensorShapeI maxShape) const +{ + return m_impl->getWorkspaceRequirements(maxBatchSize, maxShape); +} + +// Tensor variant +void HQResize::operator()(cudaStream_t stream, const cvcuda::Workspace &ws, const nvcv::Tensor &src, + const nvcv::Tensor &dst, const NVCVInterpolationType minInterpolation, + const NVCVInterpolationType magInterpolation, bool antialias, const HQResizeRoiF *roi) const +{ + assert(m_impl); + m_impl->operator()(stream, ws, src, 
dst, minInterpolation, magInterpolation, antialias, roi); +} + +// ImageBatchVarShape variant +void HQResize::operator()(cudaStream_t stream, const cvcuda::Workspace &ws, const nvcv::ImageBatchVarShape &src, + const nvcv::ImageBatchVarShape &dst, const NVCVInterpolationType minInterpolation, + const NVCVInterpolationType magInterpolation, bool antialias, const HQResizeRoisF rois) const +{ + assert(m_impl); + m_impl->operator()(stream, ws, src, dst, minInterpolation, magInterpolation, antialias, rois); +} + +// TensorBatch variant +void HQResize::operator()(cudaStream_t stream, const cvcuda::Workspace &ws, const nvcv::TensorBatch &src, + const nvcv::TensorBatch &dst, const NVCVInterpolationType minInterpolation, + const NVCVInterpolationType magInterpolation, bool antialias, const HQResizeRoisF rois) const +{ + assert(m_impl); + m_impl->operator()(stream, ws, src, dst, minInterpolation, magInterpolation, antialias, rois); +} + +} // namespace cvcuda::priv diff --git a/src/cvcuda/priv/OpHQResize.hpp b/src/cvcuda/priv/OpHQResize.hpp new file mode 100644 index 000000000..85a89ee2c --- /dev/null +++ b/src/cvcuda/priv/OpHQResize.hpp @@ -0,0 +1,115 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file HQResize.hpp + * + * @brief Defines the private C++ Class for the HQResize operation. 
+ */ + +#ifndef CVCUDA_PRIV_HQ_RESIZE_HPP +#define CVCUDA_PRIV_HQ_RESIZE_HPP +#include "IOperator.hpp" +#include "cvcuda/Workspace.hpp" + +#include +#include +#include +#include + +#include + +namespace cvcuda::priv { + +namespace hq_resize { + +class IHQResizeImpl +{ +public: + virtual WorkspaceRequirements getWorkspaceRequirements(int batchSize, const HQResizeTensorShapeI inputShape, + const HQResizeTensorShapeI outputShape, + const NVCVInterpolationType minInterpolation, + const NVCVInterpolationType magInterpolation, bool antialias, + const HQResizeRoiF *roi) const = 0; + + virtual WorkspaceRequirements getWorkspaceRequirements(int batchSize, const HQResizeTensorShapesI inputShapes, + const HQResizeTensorShapesI outputShapes, + const NVCVInterpolationType minInterpolation, + const NVCVInterpolationType magInterpolation, bool antialias, + const HQResizeRoisF rois) const = 0; + + virtual WorkspaceRequirements getWorkspaceRequirements(int maxBatchSize, + const HQResizeTensorShapeI maxShape) const = 0; + + virtual void operator()(cudaStream_t stream, const Workspace &ws, const nvcv::Tensor &src, const nvcv::Tensor &dst, + const NVCVInterpolationType minInterpolation, const NVCVInterpolationType magInterpolation, + bool antialias, const HQResizeRoiF *roi) + = 0; + + virtual void operator()(cudaStream_t stream, const Workspace &ws, const nvcv::ImageBatchVarShape &src, + const nvcv::ImageBatchVarShape &dst, const NVCVInterpolationType minInterpolation, + const NVCVInterpolationType magInterpolation, bool antialias, const HQResizeRoisF roi) + = 0; + + virtual void operator()(cudaStream_t stream, const Workspace &ws, const nvcv::TensorBatch &src, + const nvcv::TensorBatch &dst, const NVCVInterpolationType minInterpolation, + const NVCVInterpolationType magInterpolation, bool antialias, const HQResizeRoisF roi) + = 0; + + virtual ~IHQResizeImpl() = default; +}; + +} // namespace hq_resize + +class HQResize final : public IOperator +{ +public: + explicit HQResize(); + + WorkspaceRequirements getWorkspaceRequirements(int batchSize, const HQResizeTensorShapeI inputShape, + const HQResizeTensorShapeI outputShape, + const NVCVInterpolationType minInterpolation, + const NVCVInterpolationType magInterpolation, bool antialias, + const HQResizeRoiF *roi) const; + + WorkspaceRequirements getWorkspaceRequirements(int batchSize, const HQResizeTensorShapesI inputShapes, + const HQResizeTensorShapesI outputShapes, + const NVCVInterpolationType minInterpolation, + const NVCVInterpolationType magInterpolation, bool antialias, + const HQResizeRoisF rois) const; + + WorkspaceRequirements getWorkspaceRequirements(int maxBatchSize, const HQResizeTensorShapeI maxShape) const; + + void operator()(cudaStream_t stream, const Workspace &ws, const nvcv::Tensor &src, const nvcv::Tensor &dst, + const NVCVInterpolationType minInterpolation, const NVCVInterpolationType magInterpolation, + bool antialias, const HQResizeRoiF *roi) const; + + void operator()(cudaStream_t stream, const Workspace &ws, const nvcv::ImageBatchVarShape &src, + const nvcv::ImageBatchVarShape &dst, const NVCVInterpolationType minInterpolation, + const NVCVInterpolationType magInterpolation, bool antialias, const HQResizeRoisF roi) const; + + void operator()(cudaStream_t stream, const Workspace &ws, const nvcv::TensorBatch &src, + const nvcv::TensorBatch &dst, const NVCVInterpolationType minInterpolation, + const NVCVInterpolationType magInterpolation, bool antialias, const HQResizeRoisF roi) const; + +private: + std::unique_ptr m_impl; +}; + +} // 
namespace cvcuda::priv + +#endif // CVCUDA_PRIV_HQ_RESIZE_HPP diff --git a/src/cvcuda/priv/OpHQResizeBatchWrap.cuh b/src/cvcuda/priv/OpHQResizeBatchWrap.cuh new file mode 100644 index 000000000..8f7b69411 --- /dev/null +++ b/src/cvcuda/priv/OpHQResizeBatchWrap.cuh @@ -0,0 +1,408 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef CVCUDA_PRIV_HQ_RESIZE_BATCH_WRAP_CUH +#define CVCUDA_PRIV_HQ_RESIZE_BATCH_WRAP_CUH + +#include "cvcuda/Workspace.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// This file contains three kinds of helpers +// 1. Helpers to wrap a contiguous batch with uniform sample stride into TensorWrap +// 2. DynamicBatchWrap class & helpers to wrap a dynamically created batch (of intermediate samples) +// with non-uniform sample strides +// 3. ImageBatchVarShapeWrap/TensorBatchWrap adapters that handle ROI on top of the usual wrappers + +namespace cvcuda::priv::hq_resize::batch_wrapper { + +namespace cuda = nvcv::cuda; + +template +using Vec = typename cuda::MakeType; + +template +using VecI = Vec; + +template +auto ComputeDenseStrides(ExtentParams... extentParams) +{ + constexpr int N = sizeof...(extentParams); + static_assert(N >= 1); + static_assert(std::conjunction_v...>); + std::array extents = {extentParams...}; + std::array strides; + strides[0] = extents[0] * sizeof(T); + for (int d = 1; d < N; d++) + { + strides[d] = strides[d - 1] * extents[d]; + } + return strides; +} + +template +auto ComputeDenseStrides(VecI<2> shape, Channels... channels) +{ + static_assert(sizeof...(channels) <= 1); + return ComputeDenseStrides(channels..., shape.x, shape.y); +} + +template +auto ComputeDenseStrides(VecI<3> shape, Channels...
channels) +{ + static_assert(sizeof...(channels) <= 1); + return ComputeDenseStrides(channels..., shape.x, shape.y, shape.z); +} + +namespace tensor { +template +auto CreateDenseWrap(cuda::BaseType *base, const std::array strides) +{ + constexpr int N = kNStrides + 1; + for (auto stride : strides) + { + NVCV_ASSERT(stride <= cuda::TypeTraits::max); + } + static_assert(2 <= N && N <= 5); + if constexpr (N == 5) + { + return cuda::TensorNDWrap(base, static_cast(strides[3]), static_cast(strides[2]), + static_cast(strides[1]), static_cast(strides[0])); + } + else if constexpr (N == 4) + { + return cuda::TensorNDWrap(base, static_cast(strides[2]), static_cast(strides[1]), + static_cast(strides[0])); + } + else if constexpr (N == 3) + { + return cuda::TensorNDWrap(base, static_cast(strides[1]), static_cast(strides[0])); + } + else if constexpr (N == 2) + { + return cuda::TensorNDWrap(base, static_cast(strides[0])); + } +} + +template +auto CreateDenseWrap(cuda::BaseType *base, int numChannels, ShapeT shape) +{ + static constexpr int kNStrides = cuda::NumElements + kHasDynamicChannels; + if constexpr (kHasDynamicChannels) + { + auto strides = ComputeDenseStrides(shape, numChannels); + return CreateDenseWrap(base, strides); + } + else if constexpr (!kHasDynamicChannels) + { + auto strides = ComputeDenseStrides(shape); + return CreateDenseWrap(base, strides); + } +} + +template +std::enable_if_t> WrapTensor( + const nvcv::TensorDataAccessStridedImagePlanar &tensorAccess, const ptrdiff_t roiOffset = 0) +{ + NVCV_ASSERT(tensorAccess.sampleStride() <= cuda::TypeTraits::max); + NVCV_ASSERT(tensorAccess.rowStride() <= cuda::TypeTraits::max); + NVCV_ASSERT(tensorAccess.colStride() <= cuda::TypeTraits::max); + + if constexpr (kHasDynamicChannels) + { + return cuda::TensorNDWrap( + tensorAccess.sampleData(0) + roiOffset, static_cast(tensorAccess.sampleStride()), + static_cast(tensorAccess.rowStride()), static_cast(tensorAccess.colStride())); + } + else + { + return cuda::TensorNDWrap(tensorAccess.sampleData(0) + roiOffset, + static_cast(tensorAccess.sampleStride()), + static_cast(tensorAccess.rowStride())); + } +} + +template +std::enable_if_t> WrapTensor( + const nvcv::TensorDataAccessStridedImagePlanar &tensorAccess, const ptrdiff_t roiOffset = 0) +{ + NVCV_ASSERT(tensorAccess.sampleStride() <= cuda::TypeTraits::max); + NVCV_ASSERT(tensorAccess.depthStride() <= cuda::TypeTraits::max); + NVCV_ASSERT(tensorAccess.rowStride() <= cuda::TypeTraits::max); + NVCV_ASSERT(tensorAccess.colStride() <= cuda::TypeTraits::max); + + if constexpr (kHasDynamicChannels) + { + return cuda::TensorNDWrap( + tensorAccess.sampleData(0) + roiOffset, static_cast(tensorAccess.sampleStride()), + static_cast(tensorAccess.depthStride()), static_cast(tensorAccess.rowStride()), + static_cast(tensorAccess.colStride())); + } + else + { + return cuda::TensorNDWrap( + tensorAccess.sampleData(0) + roiOffset, static_cast(tensorAccess.sampleStride()), + static_cast(tensorAccess.depthStride()), static_cast(tensorAccess.rowStride())); + } +} + +template +std::enable_if_t> WrapTensor( + const nvcv::TensorDataAccessStridedImagePlanar &tensorAccess, const VecI<2> &roiOffset) +{ + ptrdiff_t offset = tensorAccess.rowStride() * roiOffset.y + tensorAccess.colStride() * roiOffset.x; + return WrapTensor(tensorAccess, offset); +} + +template +std::enable_if_t> WrapTensor( + const nvcv::TensorDataAccessStridedImagePlanar &tensorAccess, const VecI<3> &roiOffset) +{ + ptrdiff_t offset = tensorAccess.depthStride() * roiOffset.z + tensorAccess.rowStride() * 
roiOffset.y + + tensorAccess.colStride() * roiOffset.x; + return WrapTensor(tensorAccess, offset); +} + +template +auto __device__ GetSampleView(const TensorWrap &batchTensorWrap, const int sampleIdx) +{ + using T = typename TensorWrap::ValueType; + static constexpr int kNumDimensions = TensorWrap::kNumDimensions; + static constexpr int kNumSampleDim = kNumDimensions - 1; // not including sample (N) dim + static constexpr int kVariableStrides = kNumSampleDim - 1; // the innermost stride is static - sizeof type + using TensorWrapT = cuda::TensorNDWrap; + static_assert(kVariableStrides == TensorWrapT::kVariableStrides); + static_assert(kVariableStrides + 1 == TensorWrap::kVariableStrides); + static_assert(1 <= kVariableStrides && kVariableStrides <= 3); + auto *basePtr = batchTensorWrap.ptr(sampleIdx); + const int *strides = batchTensorWrap.strides(); + if constexpr (kVariableStrides == 1) + { + return TensorWrapT{basePtr, strides[1]}; + } + else if constexpr (kVariableStrides == 2) + { + return TensorWrapT{basePtr, strides[1], strides[2]}; + } + else if constexpr (kVariableStrides == 3) + { + return TensorWrapT{basePtr, strides[1], strides[2], strides[3]}; + } +} + +} // namespace tensor + +namespace dynamic { +struct TensorAccessDesc +{ + static constexpr int kMaxNStrides = 3; + + unsigned char *basePtr; + int strides[kMaxNStrides]; +}; + +template +void SetupTensorAccessStrides(TensorAccessDesc &tensorAccessDesc, const std::array strides) +{ + // we ignore the last stride (sample stride), it's not needed for a single sample + // as the samples are not assumed to be uniform + static constexpr int kNSampleStrides = kNStrides - 1; + static_assert(kNSampleStrides <= TensorAccessDesc::kMaxNStrides); + for (int d = 0; d < kNSampleStrides; d++) + { + NVCV_ASSERT(strides[d] <= cuda::TypeTraits::max); + tensorAccessDesc.strides[kNSampleStrides - 1 - d] = strides[d]; + } +} + +/** + * @brief Wrapper for batch of dynamically created samples + * (here, batch of intermediate samples between resampling passes) + */ +template +struct DynamicBatchWrap +{ + using ValueType = T; + static constexpr int kNumDimensions = N; + static constexpr int kNumSampleDim = kNumDimensions - 1; // not including sample (N) dim + static constexpr int kVariableStrides = kNumSampleDim - 1; // the innermost stride is static - sizeof type + using TensorWrapT = cuda::TensorNDWrap; + static_assert(kVariableStrides == TensorWrapT::kVariableStrides); + static_assert(kVariableStrides >= 1 && kVariableStrides <= TensorAccessDesc::kMaxNStrides); + + DynamicBatchWrap(TensorAccessDesc *samples) + : m_samples{samples} + { + } + + inline __device__ TensorWrapT GetSampleView(const int sampleIdx) const + { + static_assert(1 <= kVariableStrides && kVariableStrides <= 3); + + auto sample = m_samples[sampleIdx]; + const unsigned char *basePtr = sample.basePtr; + + if constexpr (kVariableStrides == 1) + { + return TensorWrapT{basePtr, sample.strides[0]}; + } + else if constexpr (kVariableStrides == 2) + { + return TensorWrapT{basePtr, sample.strides[0], sample.strides[1]}; + } + else if constexpr (kVariableStrides == 3) + { + return TensorWrapT{basePtr, sample.strides[0], sample.strides[1], sample.strides[2]}; + } + } + +private: + TensorAccessDesc *m_samples; +}; + +struct DynamicBatchWrapMeta +{ + TensorAccessDesc *cpu; + TensorAccessDesc *gpu; +}; + +inline void AddDynamicBatchWrapMeta(WorkspaceEstimator &est, int numSamples) +{ + est.addPinned(numSamples); + est.addCuda(numSamples); +} + +inline DynamicBatchWrapMeta 
AllocateDynamicBatchWrapMeta(WorkspaceAllocator &allocator, int numSamples) +{ + DynamicBatchWrapMeta meta; + meta.cpu = allocator.getPinned(numSamples); + meta.gpu = allocator.getCuda(numSamples); + return meta; +} + +template +DynamicBatchWrap CreateDynamicBatchWrap(int pass, cuda::BaseType *intermediate, + const DynamicBatchWrapMeta tensorBatchMeta, + const SampleDescT *sampleDescsCpu, int numSamples, cudaStream_t stream) +{ + static constexpr int kSpatialNDim = SampleDescT::kSpatialNDim; + static_assert(N == 1 + kSpatialNDim + kHasDynamicChannels); + + ptrdiff_t sampleOffset = 0; + for (int sampleIdx = 0; sampleIdx < numSamples; sampleIdx++) + { + const SampleDescT &sampleDesc = sampleDescsCpu[sampleIdx]; + VecI outputShape = sampleDesc.shapes[pass + 1]; + TensorAccessDesc &tensorAccess = tensorBatchMeta.cpu[sampleIdx]; + tensorAccess.basePtr = reinterpret_cast(intermediate) + sampleOffset; + if constexpr (kHasDynamicChannels) + { + constexpr int kNStrides = kSpatialNDim + 1; + auto strides = ComputeDenseStrides(outputShape, sampleDesc.channels); + SetupTensorAccessStrides(tensorAccess, strides); + sampleOffset += strides[kNStrides - 1]; + } + else if constexpr (!kHasDynamicChannels) + { + constexpr int kNStrides = kSpatialNDim; + auto strides = ComputeDenseStrides(outputShape); + SetupTensorAccessStrides(tensorAccess, strides); + sampleOffset += strides[kNStrides - 1]; + } + } + NVCV_CHECK_THROW(cudaMemcpyAsync(tensorBatchMeta.gpu, tensorBatchMeta.cpu, numSamples * sizeof(TensorAccessDesc), + cudaMemcpyHostToDevice, stream)); + + return {tensorBatchMeta.gpu}; +} +} // namespace dynamic + +template +struct ImageBatchVarShapeWrapAdapter +{ + using ValueType = T; + static constexpr int kNumDimensions = 3; // NHW + static constexpr int kNumSampleDim = 2; // HW + static constexpr int kVariableStrides = 1; // the innermost stride is static - sizeof type + using TensorWrapT = cuda::TensorNDWrap; + static_assert(kVariableStrides == TensorWrapT::kVariableStrides); + + ImageBatchVarShapeWrapAdapter(const nvcv::ImageBatchVarShapeDataStridedCuda &batchData) + : m_batch{cuda::ImageBatchVarShapeWrap{batchData}} + { + } + + inline __device__ TensorWrapT GetSampleView(const int sampleIdx, const VecI<2> roi) const + { + return TensorWrapT{m_batch.ptr(sampleIdx, 0, roi.y, roi.x), m_batch.rowStride(sampleIdx)}; + } + + inline __device__ TensorWrapT GetSampleView(const int sampleIdx) const + { + return TensorWrapT{m_batch.ptr(sampleIdx, 0, 0, 0), m_batch.rowStride(sampleIdx)}; + } + +private: + cuda::ImageBatchVarShapeWrap m_batch; +}; + +template +struct TensorBatchWrapAdapter +{ + using ValueType = T; + static constexpr int kNumDimensions = N; + static constexpr int kNumSampleDim = kNumDimensions - 1; // not including sample (N) dim + static constexpr int kVariableStrides = kNumSampleDim - 1; + using TensorWrapT = cuda::TensorNDWrap; + using TensorBatchWrapT = cuda::TensorBatchNDWrap; + static_assert(kVariableStrides == TensorWrapT::kVariableStrides); + static_assert(kVariableStrides == TensorBatchWrapT::kVariableStrides); + + TensorBatchWrapAdapter(const nvcv::TensorBatchDataStridedCuda &batchData) + : m_batch{TensorBatchWrapT{batchData}} + { + } + + inline __device__ TensorWrapT GetSampleView(const int sampleIdx, const VecI<2> roi) const + { + return TensorWrapT{m_batch.ptr(sampleIdx, roi.y, roi.x), m_batch.strides(sampleIdx)}; + } + + inline __device__ TensorWrapT GetSampleView(const int sampleIdx, const VecI<3> roi) const + { + return TensorWrapT{m_batch.ptr(sampleIdx, roi.z, roi.y, roi.x), 
m_batch.strides(sampleIdx)}; + } + + inline __device__ TensorWrapT GetSampleView(const int sampleIdx) const + { + return m_batch.tensor(sampleIdx); + } + +private: + TensorBatchWrapT m_batch; +}; +} // namespace cvcuda::priv::hq_resize::batch_wrapper +#endif // CVCUDA_PRIV_HQ_RESIZE_BATCH_WRAP_CUH diff --git a/src/cvcuda/priv/OpHQResizeFilter.cuh b/src/cvcuda/priv/OpHQResizeFilter.cuh new file mode 100644 index 000000000..e32f5d270 --- /dev/null +++ b/src/cvcuda/priv/OpHQResizeFilter.cuh @@ -0,0 +1,402 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef CVCUDA_PRIV_HQ_RESIZE_FILTER_CUH +#define CVCUDA_PRIV_HQ_RESIZE_FILTER_CUH + +#include +#include // for NVCVInterpolationType, etc. +#include +#include +#include +#include + +#include +#include +#include +#include + +/* This file implements ResamplingFiltersFactory. + The class precomputes the coefficients for the supported filter kinds once + and stores them in the memory of the current device. + Then, the factory can be used to create ResamplingFilters + that interpolate the coefficients for a given support size. */ +namespace cvcuda::priv::hq_resize::filter { + +/** + * @brief Internally supported filters. + * + * Triangular is short for Linear + antialias (which requires precomputing an explicit filter's support). + */ +enum class FilterType : uint8_t +{ + Nearest, + Linear, + Triangular, + Gaussian, + Cubic, + Lanczos3, +}; + +/** + * @brief Internally supported kinds of filters - all the FilterTypes that + * require coefficients kept in shared memory are mapped to the same kind: + * `ShmFilter`.
+ * + */ +enum class FilterTypeKind : uint8_t +{ + Nearest, + Linear, + ShmFilter, +}; + +inline FilterTypeKind GetFilterTypeKind(FilterType filterType) +{ + FilterTypeKind filterKind; + switch (filterType) + { + case FilterType::Nearest: + filterKind = FilterTypeKind::Nearest; + break; + case FilterType::Linear: + filterKind = FilterTypeKind::Linear; + break; + default: + filterKind = FilterTypeKind::ShmFilter; + break; + } + return filterKind; +} + +struct FilterMode +{ + FilterType filterType; + bool antialias; +}; + +inline FilterMode GetFilterMode(NVCVInterpolationType interpolation, bool antialias) + +{ + FilterType filterType; + switch (interpolation) + { + case NVCV_INTERP_NEAREST: + filterType = FilterType::Nearest; + break; + case NVCV_INTERP_LINEAR: + filterType = FilterType::Linear; + break; + case NVCV_INTERP_CUBIC: + filterType = FilterType::Cubic; + break; + case NVCV_INTERP_LANCZOS: + filterType = FilterType::Lanczos3; + break; + case NVCV_INTERP_GAUSSIAN: + filterType = FilterType::Gaussian; + break; + default: + throw nvcv::Exception(nvcv::Status::ERROR_NOT_IMPLEMENTED, + "The resize operator does not support the selected interpolation method"); + } + if (antialias && filterType == FilterType::Linear) + { + filterType = FilterType::Triangular; + } + return {filterType, antialias}; +} + +inline std::tuple GetFilterModes(NVCVInterpolationType minInterpolation, + NVCVInterpolationType magInterpolation, bool antialias) +{ + std::tuple modes; + auto &[minFilter, magFilter] = modes; + minFilter = GetFilterMode(minInterpolation, antialias); + magFilter = GetFilterMode(magInterpolation, false); + return modes; +} + +struct ResamplingFilter +{ + float *coeffs; + int numCoeffs; + float anchor; // support / 2 + float scale; // (numCoeffs - 1) / support + + void rescale(float support) + { + float old_scale = scale; + scale = (numCoeffs - 1) / support; + anchor = anchor * old_scale / scale; + } + + __host__ __device__ int support() const + { + return ceilf((numCoeffs - 1) / scale); + } + + __device__ float operator()(float x) const + { + if (!(x > -1)) // negative and NaN arguments + return 0; + if (x >= numCoeffs) + return 0; + int x0 = floorf(x); + int x1 = x0 + 1; + float d = x - x0; + float f0 = x0 < 0 ? 0.0f : __ldg(coeffs + x0); + float f1 = x1 >= numCoeffs ? 
0.0f : __ldg(coeffs + x1); + return f0 + d * (f1 - f0); + } +}; + +static_assert(std::is_pod_v); + +inline float LanczosWindow(float x, float a) +{ + if (fabsf(x) >= a) + return 0.0f; + return nvcv::util::sinc(x) * nvcv::util::sinc(x / a); +} + +inline float CubicWindow(float x) +{ + x = fabsf(x); + if (x >= 2) + return 0; + + float x2 = x * x; + float x3 = x2 * x; + if (x > 1) + return -0.5f * x3 + 2.5f * x2 - 4.0f * x + 2.0f; + else + return 1.5f * x3 - 2.5f * x2 + 1.0f; +} + +template +inline void InitFilter(ResamplingFilter &filter, Function F) +{ + for (int i = 0; i < filter.numCoeffs; i++) filter.coeffs[i] = F(i); +} + +inline void InitTriangularFilter(ResamplingFilter filter) +{ + filter.coeffs[0] = 0; + filter.coeffs[1] = 1; + filter.coeffs[2] = 0; +} + +inline void InitGaussianFilter(ResamplingFilter filter) +{ + InitFilter(filter, + [&](int i) + { + float x = 4 * (i - (filter.numCoeffs - 1) * 0.5f) / (filter.numCoeffs - 1); + return expf(-x * x); + }); +} + +inline void InitLanczosFilter(ResamplingFilter filter, float a) +{ + InitFilter(filter, + [&](int i) + { + float x = 2 * a * (i - (filter.numCoeffs - 1) * 0.5f) / (filter.numCoeffs - 1); + return LanczosWindow(x, a); + }); + filter.rescale(6); // rescaling to the minimal allowed support +} + +inline void InitCubicFilter(ResamplingFilter filter) +{ + InitFilter(filter, + [&](int i) + { + float x = 4 * (i - (filter.numCoeffs - 1) * 0.5f) / (filter.numCoeffs - 1); + return CubicWindow(x); + }); + filter.rescale(4); // rescaling to the minimal allowed support +} + +class ResamplingFiltersFactory +{ +public: + enum FilterIdx + { + Idx_Triangular = 0, + Idx_Gaussian, + Idx_Lanczos3, + Idx_Cubic, + kNumFilters + }; + + static constexpr int kLanczosResolution = 32; + static constexpr int kLanczosA = 3; + + static constexpr int kTriangularSize = 3; + static constexpr int kGaussianSize = 65; + static constexpr int kCubicSize = 129; + static constexpr int kLanczosSize = (2 * kLanczosA * kLanczosResolution + 1); + + static constexpr int kTotalSize = kTriangularSize + kGaussianSize + kCubicSize + kLanczosSize; + + ResamplingFiltersFactory() + : m_deviceId{[]() + { + int deviceId; + NVCV_CHECK_THROW(cudaGetDevice(&deviceId)); + return deviceId; + }()} + + { + // Pinned memory is needed for proper synchronization of the synchronous copy + std::unique_ptr> filterDataPinned; + { + float *ptr = nullptr; + NVCV_CHECK_THROW(cudaMallocHost(&ptr, kTotalSize * sizeof(float))); + filterDataPinned = {ptr, [](void *ptr) + { + NVCV_CHECK_THROW(cudaFreeHost(ptr)); + }}; + } + { + float *ptr = nullptr; + NVCV_CHECK_THROW(cudaMalloc(&ptr, kTotalSize * sizeof(float))); + m_filterDataGpu = {ptr, [](void *ptr) + { + NVCV_CHECK_THROW(cudaFree(ptr)); + }}; + } + auto addFilter = [&](FilterIdx filterIdx, int size) + { + float *base = filterIdx == 0 ? filterDataPinned.get() + : m_filters[filterIdx - 1].coeffs + m_filters[filterIdx - 1].numCoeffs; + m_filters[filterIdx] = {base, size, 1, (size - 1) * 0.5f}; + }; + addFilter(Idx_Triangular, kTriangularSize); + InitTriangularFilter(m_filters[Idx_Triangular]); + addFilter(Idx_Gaussian, kGaussianSize); + InitGaussianFilter(m_filters[Idx_Gaussian]); + addFilter(Idx_Lanczos3, kLanczosSize); + InitLanczosFilter(m_filters[Idx_Lanczos3], kLanczosA); + addFilter(Idx_Cubic, kCubicSize); + InitCubicFilter(m_filters[Idx_Cubic]); + + // According to cuda-driver-api: For transfers from pinned host memory to device memory, + // the cudaMemcpy is synchronous with respect to the host. 
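        // Editorial note: after this copy and the pointer rebasing that follows, each
        // ResamplingFilter::coeffs points into m_filterDataGpu, i.e. it is a device pointer
        // and is only dereferenced in device code (see ResamplingFilter::operator(), which
        // reads the coefficients through __ldg).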
+ NVCV_CHECK_THROW(cudaMemcpy(m_filterDataGpu.get(), filterDataPinned.get(), kTotalSize * sizeof(float), + cudaMemcpyHostToDevice)); + // Set the pointers to the corresponding offsets in m_filterDataGpu + ptrdiff_t diff = m_filterDataGpu.get() - filterDataPinned.get(); + for (auto &f : m_filters) + { + f.coeffs += diff; + } + } + + ResamplingFilter CreateCubic(float radius = 2.0f) const noexcept + { + validateDeviceId(); + auto flt = m_filters[Idx_Cubic]; + flt.rescale(2.0f * std::max(2.0f, radius)); + return flt; + } + + ResamplingFilter CreateGaussian(float sigma) const noexcept + { + validateDeviceId(); + auto flt = m_filters[Idx_Gaussian]; + flt.rescale(std::max(1.0f, static_cast(4 * M_SQRT2) * sigma)); + return flt; + } + + ResamplingFilter CreateLanczos3(float radius = 3.0f) const noexcept + { + validateDeviceId(); + auto flt = m_filters[Idx_Lanczos3]; + flt.rescale(2.0f * std::max(3.0f, radius)); + return flt; + } + + ResamplingFilter CreateTriangular(float radius) const noexcept + { + validateDeviceId(); + auto flt = m_filters[Idx_Triangular]; + flt.rescale(std::max(1.0f, 2 * radius)); + return flt; + } + +private: + void validateDeviceId() const + { + int deviceId; + NVCV_CHECK_THROW(cudaGetDevice(&deviceId)); + if (deviceId != m_deviceId) + { + throw nvcv::Exception(nvcv::Status::ERROR_DEVICE, + "The HQ resize operator was initialized and called with different current device."); + } + } + + int m_deviceId; + std::unique_ptr> m_filterDataGpu; + ResamplingFilter m_filters[kNumFilters]; +}; + +inline ResamplingFilter GetResamplingFilter(const ResamplingFiltersFactory &filtersFactory, + const FilterMode &filterMode, const float inSize, const float outSize) +{ + bool antialias = filterMode.antialias && (outSize < inSize); + switch (filterMode.filterType) + { + case FilterType::Linear: + { + return filtersFactory.CreateTriangular(1); + } + break; + case FilterType::Triangular: + { + const float radius = antialias ? inSize / outSize : 1; + return filtersFactory.CreateTriangular(radius); + } + break; + case FilterType::Gaussian: + { + const float radius = antialias ? inSize / outSize : 1; + return filtersFactory.CreateGaussian(radius * 0.5f / M_SQRT2); + } + break; + case FilterType::Cubic: + { + const float radius = antialias ? (2 * inSize / outSize) : 2; + return filtersFactory.CreateCubic(radius); + } + break; + case FilterType::Lanczos3: + { + const float radius = antialias ? 
(3 * inSize / outSize) : 3; + return filtersFactory.CreateLanczos3(radius); + } + default: // Nearest neighbour + { + return {nullptr, 0, 0, 1}; + } + } +} + +} // namespace cvcuda::priv::hq_resize::filter +#endif // CVCUDA_PRIV_HQ_RESIZE_FILTER_CUH diff --git a/src/cvcuda/priv/legacy/channel_reorder_var_shape.cu b/src/cvcuda/priv/legacy/channel_reorder_var_shape.cu index 93688bd23..af21bd1a9 100644 --- a/src/cvcuda/priv/legacy/channel_reorder_var_shape.cu +++ b/src/cvcuda/priv/legacy/channel_reorder_var_shape.cu @@ -92,6 +92,12 @@ ErrorCode ChannelReorderVarShape::infer(const ImageBatchVarShapeDataStridedCuda return ErrorCode::INVALID_DATA_SHAPE; } + if (inData.numImages() == 0) + { + // nothing to do, move above the calling of GetLegacyDataType to avoid error: "All planes must have the same data type" + return ErrorCode::SUCCESS; + } + DataType data_type; int channels; { @@ -113,6 +119,12 @@ ErrorCode ChannelReorderVarShape::infer(const ImageBatchVarShapeDataStridedCuda return ErrorCode::INVALID_DATA_SHAPE; } + if (helpers::GetLegacyDataType(orderData.dtype()) != kCV_32S) + { + LOG_ERROR("Invalid Order tensor DataType " << helpers::GetLegacyDataType(orderData.dtype())); + return ErrorCode::INVALID_DATA_SHAPE; + } + if (orderData.layout()[0] != nvcv::LABEL_BATCH) { LOG_ERROR("Label of the first dimension of order tensor must be " << nvcv::LABEL_BATCH); @@ -144,14 +156,14 @@ ErrorCode ChannelReorderVarShape::infer(const ImageBatchVarShapeDataStridedCuda if (outFmt.numPlanes() != 1) { - LOG_ERROR("Format of input image #" << i << " must have only 1 plane"); + LOG_ERROR("Format of output image #" << i << " must have only 1 plane"); return ErrorCode::INVALID_DATA_FORMAT; } // Legacy code has this check, let's stick to it. if (inFmt.numChannels() != channels) { - LOG_ERROR("Invalid input"); + LOG_ERROR("Input channel " << inFmt.numChannels() << " differs from " << channels); return ErrorCode::INVALID_DATA_SHAPE; } @@ -185,12 +197,6 @@ ErrorCode ChannelReorderVarShape::infer(const ImageBatchVarShapeDataStridedCuda } } - if (inData.numImages() == 0) - { - // nothing to do - return ErrorCode::SUCCESS; - } - typedef void (*func_t)(const ImageBatchVarShapeDataStridedCuda &inData, const ImageBatchVarShapeDataStridedCuda &outData, const TensorDataStridedCuda &orderData, int numChannels, cudaStream_t stream); diff --git a/src/cvcuda/priv/legacy/cvt_color_var_shape.cu b/src/cvcuda/priv/legacy/cvt_color_var_shape.cu index 2dc01bbd2..0d469ca71 100644 --- a/src/cvcuda/priv/legacy/cvt_color_var_shape.cu +++ b/src/cvcuda/priv/legacy/cvt_color_var_shape.cu @@ -746,8 +746,9 @@ inline ErrorCode BGR_to_RGB(const ImageBatchVarShapeDataStridedCuda &inData, return ErrorCode::INVALID_DATA_FORMAT; } - int channels = inData.uniqueFormat().numChannels(); - DataType data_type = helpers::GetLegacyDataType(inData.uniqueFormat()); + int channels = inData.uniqueFormat().numChannels(); + DataType data_type = helpers::GetLegacyDataType(inData.uniqueFormat()); + DataType out_data_type = helpers::GetLegacyDataType(outData.uniqueFormat()); if (channels != sch) { @@ -755,6 +756,12 @@ inline ErrorCode BGR_to_RGB(const ImageBatchVarShapeDataStridedCuda &inData, return ErrorCode::INVALID_DATA_SHAPE; } + if (data_type != out_data_type) + { + LOG_ERROR("Unsupported input/output DataType " << data_type << "/" << out_data_type); + return ErrorCode::INVALID_DATA_TYPE; + } + if (!outData.uniqueFormat()) { LOG_ERROR("Images in the output batch must all have the same format"); @@ -837,8 +844,9 @@ inline ErrorCode GRAY_to_BGR(const 
ImageBatchVarShapeDataStridedCuda &inData, return ErrorCode::INVALID_DATA_FORMAT; } - int channels = inData.uniqueFormat().numChannels(); - DataType data_type = helpers::GetLegacyDataType(inData.uniqueFormat()); + int channels = inData.uniqueFormat().numChannels(); + DataType data_type = helpers::GetLegacyDataType(inData.uniqueFormat()); + DataType out_data_type = helpers::GetLegacyDataType(outData.uniqueFormat()); if (channels != 1) { @@ -846,6 +854,12 @@ inline ErrorCode GRAY_to_BGR(const ImageBatchVarShapeDataStridedCuda &inData, return ErrorCode::INVALID_DATA_SHAPE; } + if (data_type != out_data_type) + { + LOG_ERROR("Unsupported input/output DataType " << data_type << "/" << out_data_type); + return ErrorCode::INVALID_DATA_TYPE; + } + if (!outData.uniqueFormat()) { LOG_ERROR("Images in the output batch must all have the same format"); @@ -929,8 +943,9 @@ inline ErrorCode BGR_to_GRAY(const ImageBatchVarShapeDataStridedCuda &inData, return ErrorCode::INVALID_DATA_FORMAT; } - int channels = inData.uniqueFormat().numChannels(); - DataType data_type = helpers::GetLegacyDataType(inData.uniqueFormat()); + int channels = inData.uniqueFormat().numChannels(); + DataType data_type = helpers::GetLegacyDataType(inData.uniqueFormat()); + DataType out_data_type = helpers::GetLegacyDataType(outData.uniqueFormat()); if (channels != sch) { @@ -938,6 +953,12 @@ inline ErrorCode BGR_to_GRAY(const ImageBatchVarShapeDataStridedCuda &inData, return ErrorCode::INVALID_DATA_SHAPE; } + if (data_type != out_data_type) + { + LOG_ERROR("Unsupported input/output DataType " << data_type << "/" << out_data_type); + return ErrorCode::INVALID_DATA_TYPE; + } + if (!outData.uniqueFormat()) { LOG_ERROR("Images in the output batch must all have the same format"); @@ -1004,8 +1025,9 @@ inline ErrorCode BGR_to_YUV(const ImageBatchVarShapeDataStridedCuda &inData, return ErrorCode::INVALID_DATA_FORMAT; } - int channels = inData.uniqueFormat().numChannels(); - DataType data_type = helpers::GetLegacyDataType(inData.uniqueFormat()); + int channels = inData.uniqueFormat().numChannels(); + DataType data_type = helpers::GetLegacyDataType(inData.uniqueFormat()); + DataType out_data_type = helpers::GetLegacyDataType(outData.uniqueFormat()); if (channels != 3) { @@ -1013,6 +1035,12 @@ inline ErrorCode BGR_to_YUV(const ImageBatchVarShapeDataStridedCuda &inData, return ErrorCode::INVALID_DATA_SHAPE; } + if (data_type != out_data_type) + { + LOG_ERROR("Unsupported input/output DataType " << data_type << "/" << out_data_type); + return ErrorCode::INVALID_DATA_TYPE; + } + if (!outData.uniqueFormat()) { LOG_ERROR("Images in the output batch must all have the same format"); @@ -1079,8 +1107,9 @@ inline ErrorCode YUV_to_BGR(const ImageBatchVarShapeDataStridedCuda &inData, return ErrorCode::INVALID_DATA_FORMAT; } - int channels = inData.uniqueFormat().numChannels(); - DataType data_type = helpers::GetLegacyDataType(inData.uniqueFormat()); + int channels = inData.uniqueFormat().numChannels(); + DataType data_type = helpers::GetLegacyDataType(inData.uniqueFormat()); + DataType out_data_type = helpers::GetLegacyDataType(outData.uniqueFormat()); if (channels != 3) { @@ -1088,6 +1117,12 @@ inline ErrorCode YUV_to_BGR(const ImageBatchVarShapeDataStridedCuda &inData, return ErrorCode::INVALID_DATA_SHAPE; } + if (data_type != out_data_type) + { + LOG_ERROR("Unsupported input/output DataType " << data_type << "/" << out_data_type); + return ErrorCode::INVALID_DATA_TYPE; + } + if (!outData.uniqueFormat()) { LOG_ERROR("Images in the output batch must all 
have the same format"); @@ -1155,8 +1190,9 @@ inline ErrorCode BGR_to_HSV(const ImageBatchVarShapeDataStridedCuda &inData, return ErrorCode::INVALID_DATA_FORMAT; } - int channels = inData.uniqueFormat().numChannels(); - DataType data_type = helpers::GetLegacyDataType(inData.uniqueFormat()); + int channels = inData.uniqueFormat().numChannels(); + DataType data_type = helpers::GetLegacyDataType(inData.uniqueFormat()); + DataType out_data_type = helpers::GetLegacyDataType(outData.uniqueFormat()); if (channels != 3) { @@ -1164,6 +1200,12 @@ inline ErrorCode BGR_to_HSV(const ImageBatchVarShapeDataStridedCuda &inData, return ErrorCode::INVALID_DATA_SHAPE; } + if (data_type != out_data_type) + { + LOG_ERROR("Unsupported input/output DataType " << data_type << "/" << out_data_type); + return ErrorCode::INVALID_DATA_TYPE; + } + if (!outData.uniqueFormat()) { LOG_ERROR("Images in the output batch must all have the same format"); @@ -1223,8 +1265,9 @@ inline ErrorCode HSV_to_BGR(const ImageBatchVarShapeDataStridedCuda &inData, return ErrorCode::INVALID_DATA_FORMAT; } - int channels = inData.uniqueFormat().numChannels(); - DataType data_type = helpers::GetLegacyDataType(inData.uniqueFormat()); + int channels = inData.uniqueFormat().numChannels(); + DataType data_type = helpers::GetLegacyDataType(inData.uniqueFormat()); + DataType out_data_type = helpers::GetLegacyDataType(outData.uniqueFormat()); if (channels != 3) { @@ -1232,6 +1275,12 @@ inline ErrorCode HSV_to_BGR(const ImageBatchVarShapeDataStridedCuda &inData, return ErrorCode::INVALID_DATA_SHAPE; } + if (data_type != out_data_type) + { + LOG_ERROR("Unsupported input/output DataType " << data_type << "/" << out_data_type); + return ErrorCode::INVALID_DATA_TYPE; + } + if (!outData.uniqueFormat()) { LOG_ERROR("Images in the output batch must all have the same format"); diff --git a/src/cvcuda/priv/legacy/gaussian_noise.cu b/src/cvcuda/priv/legacy/gaussian_noise.cu index e9da8c573..77d09fef1 100644 --- a/src/cvcuda/priv/legacy/gaussian_noise.cu +++ b/src/cvcuda/priv/legacy/gaussian_noise.cu @@ -199,7 +199,7 @@ GaussianNoise::GaussianNoise(DataShape max_input_shape, DataShape max_output_sha if (maxBatchSize < 0) { LOG_ERROR("Invalid num of max batch size " << maxBatchSize); - throw std::runtime_error("Parameter error!"); + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Parameter error!"); } cudaError_t err = cudaMalloc((void **)&m_states, sizeof(curandState) * BLOCK * maxBatchSize); if (err != cudaSuccess) @@ -253,7 +253,7 @@ ErrorCode GaussianNoise::infer(const TensorDataStridedCuda &inData, const Tensor DataType out_data_type = GetLegacyDataType(outData.dtype()); if (in_data_type != out_data_type) { - LOG_ERROR("Invalid DataType " << out_data_type); + LOG_ERROR("DataType of input and output must be equal, but got " << in_data_type << " and " << out_data_type); return ErrorCode::INVALID_DATA_TYPE; } diff --git a/src/cvcuda/priv/legacy/gaussian_noise_var_shape.cu b/src/cvcuda/priv/legacy/gaussian_noise_var_shape.cu index 70b4f8ab2..c97515cfa 100644 --- a/src/cvcuda/priv/legacy/gaussian_noise_var_shape.cu +++ b/src/cvcuda/priv/legacy/gaussian_noise_var_shape.cu @@ -258,7 +258,7 @@ ErrorCode GaussianNoiseVarShape::infer(const ImageBatchVarShapeDataStridedCuda & DataType out_data_type = helpers::GetLegacyDataType(outData.uniqueFormat()); if (in_data_type != out_data_type) { - LOG_ERROR("Invalid DataType " << out_data_type); + LOG_ERROR("DataType of input and output must be equal, but got " << in_data_type << " and " << out_data_type); 
return ErrorCode::INVALID_DATA_TYPE; } diff --git a/src/cvcuda/priv/legacy/inpaint.cu b/src/cvcuda/priv/legacy/inpaint.cu index 077fc6643..909c2c987 100644 --- a/src/cvcuda/priv/legacy/inpaint.cu +++ b/src/cvcuda/priv/legacy/inpaint.cu @@ -634,7 +634,7 @@ ErrorCode Inpaint::infer(const TensorDataStridedCuda &inData, const TensorDataSt if (in_data_type != out_data_type) { - LOG_ERROR("Invalid DataType " << out_data_type); + LOG_ERROR("DataType of input and output must be equal, but got " << in_data_type << " and " << out_data_type); return ErrorCode::INVALID_DATA_TYPE; } diff --git a/src/cvcuda/priv/legacy/median_blur_var_shape.cu b/src/cvcuda/priv/legacy/median_blur_var_shape.cu index 0c65efbab..3abf39adb 100644 --- a/src/cvcuda/priv/legacy/median_blur_var_shape.cu +++ b/src/cvcuda/priv/legacy/median_blur_var_shape.cu @@ -91,7 +91,7 @@ __device__ T fetch(T *shared, const cuda::ImageBatchVarShapeWrapNHWC src, int */ template __global__ void median(const cuda::ImageBatchVarShapeWrapNHWC src, cuda::ImageBatchVarShapeWrapNHWC dst, - const cuda::Tensor2DWrap ksize) + cuda::Tensor1DWrap ksize) { #define fetch_(gx, gy, block_size) \ fetch(tails, src, batchIdx, h, w, channel, blockX, blockY, (gx), (gy), (block_size)) @@ -105,8 +105,10 @@ __global__ void median(const cuda::ImageBatchVarShapeWrapNHWC src, cuda::Imag int channel = blockIdx.z % dst.numChannels(); int batchIdx = blockIdx.z / dst.numChannels(); int h = src.height(batchIdx), w = src.width(batchIdx); - int kWidth = *ksize.ptr(batchIdx, 0); //kWidths[batchIdx]; - int kHeight = *ksize.ptr(batchIdx, 1); //kHeights[batchIdx]; + + int2 kernelSize = ksize[batchIdx]; + int kWidth = kernelSize.x; + int kHeight = kernelSize.y; __shared__ T tails[GENERAL_KERNEL_BLOCK * GENERAL_KERNEL_BLOCK]; if (x < w && y < h) @@ -277,7 +279,7 @@ __inline__ __device__ T placePivot(T *arr, int length) template __global__ void medianForSmallKernel(const cuda::ImageBatchVarShapeWrapNHWC src, - cuda::ImageBatchVarShapeWrapNHWC dst, const cuda::Tensor2DWrap ksize) + cuda::ImageBatchVarShapeWrapNHWC dst, cuda::Tensor1DWrap ksize) { int tx = threadIdx.x, ty = threadIdx.y; int blockX = blockIdx.x * blockDim.x; @@ -287,8 +289,10 @@ __global__ void medianForSmallKernel(const cuda::ImageBatchVarShapeWrapNHWC s int channel = blockIdx.z % dst.numChannels(); int batchIdx = blockIdx.z / dst.numChannels(); int h = src.height(batchIdx), w = src.width(batchIdx); - int kWidth = *ksize.ptr(batchIdx, 0); //kWidths[batchIdx]; - int kHeight = *ksize.ptr(batchIdx, 1); //kHeights[batchIdx]; + + int2 kernelSize = ksize[batchIdx]; + int kWidth = kernelSize.x; + int kHeight = kernelSize.y; __shared__ T tails[SMALL_KERNEL_BLOCK * SMALL_KERNEL_BLOCK]; if (x < w && y < h) @@ -350,8 +354,6 @@ void median(const ImageBatchVarShapeDataStridedCuda &in, const ImageBatchVarShap cuda::ImageBatchVarShapeWrapNHWC src(in, channels); cuda::ImageBatchVarShapeWrapNHWC dst(out, channels); - cuda::Tensor2DWrap ksizePtr(ksize); - #ifdef CUDA_DEBUG_LOG checkCudaErrors(cudaStreamSynchronize(stream)); checkCudaErrors(cudaGetLastError()); diff --git a/src/cvcuda/priv/legacy/min_area_rect.cu b/src/cvcuda/priv/legacy/min_area_rect.cu index 384c011cb..cdf463249 100644 --- a/src/cvcuda/priv/legacy/min_area_rect.cu +++ b/src/cvcuda/priv/legacy/min_area_rect.cu @@ -1,4 +1,4 @@ -/* Copyright (c) 2021-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +/* Copyright (c) 2021-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
* * SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES * SPDX-License-Identifier: Apache-2.0 @@ -42,7 +42,6 @@ void calculateRotateCoefCUDA(cuda::Tensor2DWrap rotateCoefBuf, const int template __global__ void resetRotatedPointsBuf(cuda::Tensor3DWrap rotatedPointsTensor, const int numOfDegrees) { - // int pointIdx = blockIdx.x * blockDim.x + threadIdx.x; int contourIdx = blockIdx.x; int angleIdx = threadIdx.x; if (angleIdx < numOfDegrees) @@ -124,56 +123,83 @@ template __global__ void findMinAreaAndAngle(TensorWrapper rotatedPointsTensor, cuda::Tensor2DWrap outMinAreaRectBox, const int numOfDegrees) { + // Determine the angle index from the thread's X-dimension index. int angleIdx = threadIdx.x; + + // If the angle index exceeds the number of degrees, exit the thread to avoid out-of-bounds access. if (angleIdx > numOfDegrees) { return; } + // Determine the rectangle index from the block's X-dimension index. int rectIdx = blockIdx.x; extern __shared__ int areaAngleBuf_sm[]; - areaAngleBuf_sm[2 * angleIdx] = *rotatedPointsTensor.ptr(rectIdx, angleIdx, 4); - areaAngleBuf_sm[2 * angleIdx + 1] = *rotatedPointsTensor.ptr(rectIdx, angleIdx, 5); + // Load area and angle data from the input tensor into shared memory for efficient access. + // rotatedPointsTensor is a 3D tensor with dimensions (rectIdx (N), angleIdx (0-90), 6). + areaAngleBuf_sm[2 * angleIdx] = *rotatedPointsTensor.ptr(rectIdx, angleIdx, 4); + areaAngleBuf_sm[(2 * angleIdx) + 1] = *rotatedPointsTensor.ptr(rectIdx, angleIdx, 5); + + // Synchronize threads within a block to ensure shared memory is fully populated. __syncthreads(); + // Iterate over strides, halving the stride each time, for a parallel reduction algorithm. + // Each thread in the block will compare the area of the rectangle at its current angleInxed (threadIdx.x) for (int stride = numOfDegrees / 2; stride > 0; stride >>= 1) { + // Only process elements within the current stride length. if (angleIdx < stride) { + // Pointers to the current and next elements in the shared memory for comparison. int *curAreaIdx = &areaAngleBuf_sm[2 * angleIdx]; int *nextAreaIdx = &areaAngleBuf_sm[2 * (angleIdx + stride)]; int *curAngleIdx = &areaAngleBuf_sm[2 * angleIdx + 1]; int *nextAngleIdx = &areaAngleBuf_sm[2 * (angleIdx + stride) + 1]; + + // Compare and store the minimum area and corresponding angle. if (*curAreaIdx > *nextAreaIdx) { *curAreaIdx = *nextAreaIdx; *curAngleIdx = *nextAngleIdx; } } + + // Synchronize threads within a block after each iteration. __syncthreads(); + // Handle the case when stride is odd, ensuring the first element is the minimum. if (stride % 2 == 1 && areaAngleBuf_sm[0] > areaAngleBuf_sm[2 * (stride - 1)]) { areaAngleBuf_sm[0] = areaAngleBuf_sm[2 * (stride - 1)]; areaAngleBuf_sm[1] = areaAngleBuf_sm[2 * (stride - 1) + 1]; } + + // Synchronize threads within a block after handling the odd stride case. __syncthreads(); } + + // Handle the case for odd number of degrees. if (numOfDegrees % 2 == 1 && areaAngleBuf_sm[0] > areaAngleBuf_sm[2 * (numOfDegrees - 1)]) { areaAngleBuf_sm[0] = areaAngleBuf_sm[2 * (numOfDegrees - 1)]; areaAngleBuf_sm[1] = areaAngleBuf_sm[2 * (numOfDegrees - 1) + 1]; } + + // The following calculations are performed only by the first thread in each block. 
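        // Editorial note: the corner computation below applies the standard 2D rotation
        //   x' = x * cos(theta) - y * sin(theta)
        //   y' = x * sin(theta) + y * cos(theta)
        // with theta = -minRotateAngle converted to radians, mapping the axis-aligned extents
        // found at the best angle back into the original, unrotated coordinate frame.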
if (threadIdx.x == 0) { - int minRotateAngle = areaAngleBuf_sm[1]; - float cos_coeff = cos(-minRotateAngle * PI / 180); - float sin_coeff = sin(-minRotateAngle * PI / 180); - float xmin = *rotatedPointsTensor.ptr(rectIdx, areaAngleBuf_sm[1], 0); - float ymin = *rotatedPointsTensor.ptr(rectIdx, areaAngleBuf_sm[1], 1); - float xmax = *rotatedPointsTensor.ptr(rectIdx, areaAngleBuf_sm[1], 2); - float ymax = *rotatedPointsTensor.ptr(rectIdx, areaAngleBuf_sm[1], 3); - + // Retrieve the minimum rotation angle from shared memory. + int minRotateAngle = areaAngleBuf_sm[1]; + + // Calculate the cosine and sine coefficients for the rotation and read the axis-aligned extents found for that angle. + float cos_coeff = cos(-minRotateAngle * PI / 180); + float sin_coeff = sin(-minRotateAngle * PI / 180); + float xmin = *rotatedPointsTensor.ptr(rectIdx, areaAngleBuf_sm[1], 0); + float ymin = *rotatedPointsTensor.ptr(rectIdx, areaAngleBuf_sm[1], 1); + float xmax = *rotatedPointsTensor.ptr(rectIdx, areaAngleBuf_sm[1], 2); + float ymax = *rotatedPointsTensor.ptr(rectIdx, areaAngleBuf_sm[1], 3); + + // Rotate the extents back into the original frame to obtain the rectangle corner coordinates. float tl_x = (xmin * cos_coeff) - (ymin * sin_coeff); float tl_y = (xmin * sin_coeff) + (ymin * cos_coeff); float br_x = (xmax * cos_coeff) - (ymax * sin_coeff); @@ -183,6 +209,7 @@ __global__ void findMinAreaAndAngle(TensorWrapper rotatedPointsTensor, cuda::Ten float bl_x = (xmin * cos_coeff) - (ymax * sin_coeff); float bl_y = (xmin * sin_coeff) + (ymax * cos_coeff); + // Store the transformed coordinates back into the output tensor. *outMinAreaRectBox.ptr(rectIdx, 0) = bl_x; *outMinAreaRectBox.ptr(rectIdx, 1) = bl_y; *outMinAreaRectBox.ptr(rectIdx, 2) = tl_x; @@ -205,7 +232,7 @@ void minAreaRect(const TensorDataStridedCuda &inData, void *rotatedPointsDev, cuda::Tensor3DWrap inContourPointsData(inData); int kernelPitch2 = static_cast(_MIN_AREA_EACH_ANGLE_STRID * sizeof(int)); - int kernelPitch1 = _MAX_ROTATE_DEGREES * kernelPitch2; + int kernelPitch1 = (_MAX_ROTATE_DEGREES + 1) * kernelPitch2; cuda::Tensor3DWrap rotatedPointsTensor(rotatedPointsDev, kernelPitch1, kernelPitch2); cuda::Tensor2DWrap outMinAreaRectData(outData); cuda::Tensor2DWrap pointsInContourData(numPointsInContour); @@ -217,13 +244,15 @@ void minAreaRect(const TensorDataStridedCuda &inData, void *rotatedPointsDev, dim3 block2(256); dim3 grid2(divUp(maxNumPointsInContour, block2.x), contourBatch, _MAX_ROTATE_DEGREES); - size_t smem_size = 2 * _MAX_ROTATE_DEGREES * sizeof(float); + // Shared mem should be (2 * (_MAX_ROTATE_DEGREES + 1)) * sizeof(int) since there are 2 entries per angle and the range is inclusive (0-90). + size_t smem_size = (2 * (_MAX_ROTATE_DEGREES + 1)) * sizeof(int); calculateRotateArea<<>>(inContourPointsData, rotatedPointsTensor, rotateCoeffsData, pointsInContourData); checkKernelErrors(); cudaStreamSynchronize(stream); dim3 grid3(contourBatch); + findMinAreaAndAngle<<>>(rotatedPointsTensor, outMinAreaRectData, _MAX_ROTATE_DEGREES); checkKernelErrors(); @@ -232,9 +261,10 @@ void minAreaRect(const TensorDataStridedCuda &inData, void *rotatedPointsDev, MinAreaRect::MinAreaRect(DataShape max_input_shape, DataShape max_output_shape, int maxContourNum) : mMaxContourNum(maxContourNum) { + // This needs to be _MAX_ROTATE_DEGREES + 1 since we look at 0-90 degrees inclusive.
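    // Editorial sketch of the allocation size below (assuming _MAX_ROTATE_DEGREES == 90 and
    // _MIN_AREA_EACH_ANGLE_STRID == 6, values this patch does not change): each contour then
    // needs (90 + 1) * 6 = 546 ints, i.e. 2184 bytes of rotated-point scratch.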
+ int rotatedPtsAllocSize = ((maxContourNum * (_MAX_ROTATE_DEGREES + 1) * _MIN_AREA_EACH_ANGLE_STRID) * sizeof(int)); NVCV_CHECK_THROW(cudaMalloc(&mRotateCoeffsBufDev, _MAX_ROTATE_DEGREES * 2 * sizeof(float))); - NVCV_CHECK_THROW( - cudaMalloc(&mRotatedPointsDev, maxContourNum * _MAX_ROTATE_DEGREES * _MIN_AREA_EACH_ANGLE_STRID * sizeof(int))); + NVCV_CHECK_THROW(cudaMalloc(&mRotatedPointsDev, rotatedPtsAllocSize)); } MinAreaRect::~MinAreaRect() @@ -245,7 +275,7 @@ MinAreaRect::~MinAreaRect() size_t MinAreaRect::calBufferSize(DataShape max_input_shape, DataShape max_output_shape, int maxContourNum) { - return maxContourNum * _MAX_ROTATE_DEGREES * _MIN_AREA_EACH_ANGLE_STRID * sizeof(int); + return maxContourNum * (_MAX_ROTATE_DEGREES + 1) * _MIN_AREA_EACH_ANGLE_STRID * sizeof(int); } ErrorCode MinAreaRect::infer(const TensorDataStridedCuda &inData, const TensorDataStridedCuda &outData, diff --git a/src/cvcuda/priv/legacy/pillow_resize_var_shape.cu b/src/cvcuda/priv/legacy/pillow_resize_var_shape.cu index e95cc20ec..7539e30f5 100644 --- a/src/cvcuda/priv/legacy/pillow_resize_var_shape.cu +++ b/src/cvcuda/priv/legacy/pillow_resize_var_shape.cu @@ -1,4 +1,4 @@ -/* Copyright (c) 2021-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +/* Copyright (c) 2021-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * * SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES * SPDX-License-Identifier: Apache-2.0 @@ -536,7 +536,7 @@ WorkspaceRequirements PillowResizeVarShape::getWorkspaceRequirements(DataShape m WorkspaceRequirements req{}; - int max_support = 1; //3 + int max_support = 3; // Needed for various filtes Cubic needs 2 and Lanczos needs 3. Just use worst case. size_t size = std::ceil( max_output_shape.H * (((1.0 * max_input_shape.H / max_output_shape.H + 1) * max_support * 2 + 1) * sizeof(work_type) diff --git a/src/cvcuda/priv/legacy/random_resized_crop.cu b/src/cvcuda/priv/legacy/random_resized_crop.cu index bd038f48a..7f0959624 100644 --- a/src/cvcuda/priv/legacy/random_resized_crop.cu +++ b/src/cvcuda/priv/legacy/random_resized_crop.cu @@ -375,7 +375,7 @@ RandomResizedCrop::RandomResizedCrop(DataShape max_input_shape, DataShape max_ou if (min_scale_ > max_scale_ || min_ratio_ > max_ratio_) { LOG_ERROR("Invalid Parameter: scale and ratio should be of kind (min, max)"); - throw std::runtime_error("Memory allocation error!"); + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Parameter error!"); } if (maxBatchSize > 0) { @@ -488,8 +488,9 @@ ErrorCode RandomResizedCrop::infer(const TensorDataStridedCuda &inData, const Te auto inAccess = TensorDataAccessStridedImagePlanar::Create(inData); NVCV_ASSERT(inAccess); - DataType data_type = helpers::GetLegacyDataType(inData.dtype()); - DataShape input_shape = helpers::GetLegacyDataShape(inAccess->infoShape()); + DataType in_data_type = helpers::GetLegacyDataType(inData.dtype()); + DataType out_data_type = helpers::GetLegacyDataType(outData.dtype()); + DataShape input_shape = helpers::GetLegacyDataShape(inAccess->infoShape()); int channels = input_shape.C; @@ -499,9 +500,15 @@ ErrorCode RandomResizedCrop::infer(const TensorDataStridedCuda &inData, const Te return ErrorCode::INVALID_DATA_SHAPE; } - if (!(data_type == kCV_8U || data_type == kCV_16U || data_type == kCV_16S || data_type == kCV_32F)) + if (!(in_data_type == kCV_8U || in_data_type == kCV_16U || in_data_type == kCV_16S || in_data_type == kCV_32F)) { - LOG_ERROR("Invalid DataType " << data_type); + LOG_ERROR("Invalid DataType " << in_data_type); + return 
ErrorCode::INVALID_DATA_TYPE; + } + + if (in_data_type != out_data_type) + { + LOG_ERROR("DataType of input and output must be equal, but got " << in_data_type << " and " << out_data_type); return ErrorCode::INVALID_DATA_TYPE; } @@ -559,7 +566,7 @@ ErrorCode RandomResizedCrop::infer(const TensorDataStridedCuda &inData, const Te { resize, 0 /*resize*/, resize, resize} }; - const func_t func = funcs[data_type][channels - 1]; + const func_t func = funcs[in_data_type][channels - 1]; func(inData, outData, interpolation, stream, tops_gpu, lefts_gpu, scale_x_gpu, scale_y_gpu); return SUCCESS; } diff --git a/src/cvcuda/priv/legacy/random_resized_crop_var_shape.cu b/src/cvcuda/priv/legacy/random_resized_crop_var_shape.cu index 26453a0fa..279b2c875 100644 --- a/src/cvcuda/priv/legacy/random_resized_crop_var_shape.cu +++ b/src/cvcuda/priv/legacy/random_resized_crop_var_shape.cu @@ -397,7 +397,7 @@ ErrorCode RandomResizedCropVarShape::infer(const ImageBatchVarShape &in, const I if (m_maxBatchSize <= 0 || inData.numImages() > m_maxBatchSize) { - LOG_ERROR("Invalid maximum batch size"); + LOG_ERROR("Invalid maximum batch size" << m_maxBatchSize); return ErrorCode::INVALID_PARAMETER; } @@ -432,11 +432,18 @@ ErrorCode RandomResizedCropVarShape::infer(const ImageBatchVarShape &in, const I return ErrorCode::INVALID_DATA_SHAPE; } - DataType data_type = helpers::GetLegacyDataType(inData.uniqueFormat()); + DataType in_data_type = helpers::GetLegacyDataType(inData.uniqueFormat()); + DataType out_data_type = helpers::GetLegacyDataType(outData.uniqueFormat()); - if (!(data_type == kCV_8U || data_type == kCV_16U || data_type == kCV_16S || data_type == kCV_32F)) + if (!(in_data_type == kCV_8U || in_data_type == kCV_16U || in_data_type == kCV_16S || in_data_type == kCV_32F)) { - LOG_ERROR("Invalid DataType " << data_type); + LOG_ERROR("Invalid DataType " << in_data_type); + return ErrorCode::INVALID_DATA_TYPE; + } + + if (in_data_type != out_data_type) + { + LOG_ERROR("DataType of input and output must be equal, but got " << in_data_type << " and " << out_data_type); return ErrorCode::INVALID_DATA_TYPE; } @@ -491,7 +498,7 @@ ErrorCode RandomResizedCropVarShape::infer(const ImageBatchVarShape &in, const I { resize, 0 /*resize*/, resize, resize} }; - const func_t func = funcs[data_type][channels - 1]; + const func_t func = funcs[in_data_type][channels - 1]; func(inData, outData, interpolation, stream, scale_y_gpu, scale_x_gpu, tops_gpu, lefts_gpu); return SUCCESS; } diff --git a/src/cvcuda/priv/legacy/threshold.cu b/src/cvcuda/priv/legacy/threshold.cu index 8c5b620df..fd6c85791 100644 --- a/src/cvcuda/priv/legacy/threshold.cu +++ b/src/cvcuda/priv/legacy/threshold.cu @@ -793,7 +793,7 @@ Threshold::Threshold(DataShape max_input_shape, DataShape max_output_shape, uint if (maxBatchSize < 0) { LOG_ERROR("Invalid num of max batch size " << maxBatchSize); - throw std::runtime_error("Parameter error!"); + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Parameter error!"); } m_automatic_thresh = (m_type & ~NVCV_THRESH_MASK); if (m_automatic_thresh != 0) @@ -833,7 +833,7 @@ ErrorCode Threshold::infer(const TensorDataStridedCuda &inData, const TensorData DataType out_data_type = GetLegacyDataType(outData.dtype()); if (in_data_type != out_data_type) { - LOG_ERROR("Invalid Data Type " << out_data_type); + LOG_ERROR("DataType of input and output must be equal, but got " << in_data_type << " and " << out_data_type); return ErrorCode::INVALID_DATA_TYPE; } diff --git a/src/cvcuda/priv/legacy/threshold_var_shape.cu 
b/src/cvcuda/priv/legacy/threshold_var_shape.cu index dbbebb767..deaf70db3 100644 --- a/src/cvcuda/priv/legacy/threshold_var_shape.cu +++ b/src/cvcuda/priv/legacy/threshold_var_shape.cu @@ -1003,7 +1003,7 @@ ThresholdVarShape::ThresholdVarShape(DataShape max_input_shape, DataShape max_ou if (maxBatchSize < 0) { LOG_ERROR("Invalid num of max batch size " << maxBatchSize); - throw std::runtime_error("Parameter error!"); + throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "Parameter error!"); } m_automatic_thresh = (m_type & ~NVCV_THRESH_MASK); if (m_automatic_thresh != 0) @@ -1044,7 +1044,7 @@ ErrorCode ThresholdVarShape::infer(const ImageBatchVarShapeDataStridedCuda &inDa DataType out_data_type = helpers::GetLegacyDataType(outData.uniqueFormat()); if (in_data_type != out_data_type) { - LOG_ERROR("Invalid Data Type " << out_data_type); + LOG_ERROR("DataType of input and output must be equal, but got " << in_data_type << " and " << out_data_type); return ErrorCode::INVALID_DATA_TYPE; } diff --git a/src/nvcv_types/Tensor.cpp b/src/nvcv_types/Tensor.cpp index 89b352398..146b4d3cd 100644 --- a/src/nvcv_types/Tensor.cpp +++ b/src/nvcv_types/Tensor.cpp @@ -49,6 +49,16 @@ NVCV_DEFINE_API(0, 2, NVCVStatus, nvcvTensorCalcRequirementsForImages, throw priv::Exception(NVCV_ERROR_INVALID_ARGUMENT, "Pointer to output requirements must not be NULL"); } + if (batch < 0) + { + throw priv::Exception(NVCV_ERROR_INVALID_ARGUMENT, "numImages must >= 0"); + } + + if (width < 0 || height < 0) + { + throw priv::Exception(NVCV_ERROR_INVALID_ARGUMENT, "width and height must >= 0"); + } + priv::ImageFormat fmt{format}; *reqs = priv::Tensor::CalcRequirements(batch, {width, height}, fmt, baseAlign, rowAlign); diff --git a/src/nvcv_types/include/nvcv/alloc/Requirements.h b/src/nvcv_types/include/nvcv/alloc/Requirements.h index ec8fde3ef..b1a7fbf07 100644 --- a/src/nvcv_types/include/nvcv/alloc/Requirements.h +++ b/src/nvcv_types/include/nvcv/alloc/Requirements.h @@ -64,7 +64,7 @@ typedef struct NVCVRequirementsRec * @param [out] req Requirements to be initialized to zero * + Must not be NULL * - * @retval NVCV_STATUS_INVALID_ARGUMENTS Some parameter is outside its valid range. + * @retval NVCV_ERROR_INVALID_ARGUMENT Some parameter is outside its valid range. * @retval NVCV_SUCCESS Operation completed successfully. */ NVCV_PUBLIC NVCVStatus nvcvRequirementsInit(NVCVRequirements *req); @@ -82,7 +82,7 @@ NVCV_PUBLIC NVCVStatus nvcvRequirementsInit(NVCVRequirements *req); * @param [in] req Requirements to be added. * + Must not be NULL * - * @retval NVCV_STATUS_INVALID_ARGUMENTS Some parameter is outside its valid range. + * @retval NVCV_ERROR_INVALID_ARGUMENT Some parameter is outside its valid range. * @retval NVCV_SUCCESS Operation completed successfully. */ NVCV_PUBLIC NVCVStatus nvcvRequirementsAdd(NVCVRequirements *reqSum, const NVCVRequirements *req); @@ -95,7 +95,7 @@ NVCV_PUBLIC NVCVStatus nvcvRequirementsAdd(NVCVRequirements *reqSum, const NVCVR * @param [out] size_t Calculated size in bytes. * + Must not be NULL * - * @retval NVCV_STATUS_INVALID_ARGUMENTS Some parameter is outside its valid range. + * @retval NVCV_ERROR_INVALID_ARGUMENT Some parameter is outside its valid range. * @retval NVCV_SUCCESS Operation completed successfully. 
*/ NVCV_PUBLIC NVCVStatus nvcvMemRequirementsCalcTotalSizeBytes(const NVCVMemRequirements *memReq, int64_t *sizeBytes); @@ -115,7 +115,7 @@ NVCV_PUBLIC NVCVStatus nvcvMemRequirementsCalcTotalSizeBytes(const NVCVMemRequir * * @param [in] bufAlignment Alignment of the memory buffer, in bytes. * - * @retval NVCV_STATUS_INVALID_ARGUMENTS Some parameter is outside its valid range. + * @retval NVCV_ERROR_INVALID_ARGUMENT Some parameter is outside its valid range. * @retval NVCV_SUCCESS Operation completed successfully. */ NVCV_PUBLIC NVCVStatus nvcvMemRequirementsAddBuffer(NVCVMemRequirements *memReq, int64_t bufSize, int64_t bufAlignment); diff --git a/src/nvcv_types/include/nvcv/cuda/TensorBatchWrap.hpp b/src/nvcv_types/include/nvcv/cuda/TensorBatchWrap.hpp new file mode 100644 index 000000000..ebc838658 --- /dev/null +++ b/src/nvcv_types/include/nvcv/cuda/TensorBatchWrap.hpp @@ -0,0 +1,386 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file TensorBatchWrap.hpp + * + * @brief Defines a wrapper of a tensor batch. + */ + +#ifndef NVCV_CUDA_TENSOR_BATCH_WRAP_HPP +#define NVCV_CUDA_TENSOR_BATCH_WRAP_HPP + +#include "TypeTraits.hpp" // for HasTypeTraits, etc +#include "nvcv/TensorBatchData.hpp" +#include "nvcv/cuda/TensorWrap.hpp" + +#include + +namespace nvcv::cuda { + +/** + * @defgroup NVCV_CPP_CUDATOOLS_TENSORBATCHWRAP TensorBatchWrap classes + * @{ + */ + +/** + * TensorBatchWrap class is a non-owning wrap of a batch of N-D tensors used for easy access of its elements in CUDA device. + * + * TensorBatchWrap is a wrapper of a batch of multi-dimensional tensors that can have one or more of its N dimension strides, or + * pitches, defined either at compile-time or at run-time. Each pitch in \p Strides represents the offset in bytes + * as a compile-time template parameter that will be applied from the first (slowest changing) dimension to the + * last (fastest changing) dimension of the tensor, in that order. Each dimension with run-time pitch is specified + * as -1 in the \p Strides template parameter. + * + * Template arguments: + * - T type of the values inside the tensors + * - Strides sequence of compile- or run-time pitches (-1 indicates run-time) + * - Y compile-time pitches + * - X run-time pitches + * - N dimensions, where N = X + Y + * + * For example, in the code below a wrap is defined for a batch of HWC 3D tensors where each row in H + * has a run-time row pitch (second -1), a pixel in W has a compile-time constant pitch as + * the size of the pixel type and a channel in C has also a compile-time constant pitch as + * the size of the channel type. 
+ * + * @code + * using DataType = ...; + * using ChannelType = BaseType; + * using TensorBatchWrap = TensorBatchWrap; + * TensorBatch tensorBatch = ...; + * TensorBatchWrap tensorBatchWrap(tensorBatch.data()); + * // Elements may be accessed via operator[] using an int4 argument. They can also be accessed via pointer using + * // the ptr method with up to 4 integer arguments or by accessing each TensorWrap separately with tensor(...) method. + * @endcode + * + * @sa NVCV_CPP_CUDATOOLS_TENSORBATCHWRAPS + * + * @tparam T Type (it can be const) of each element inside the tensor wrapper. + * @tparam Strides Each compile-time (use -1 for run-time) pitch in bytes from first to last dimension. + */ +template +class TensorBatchWrap; + +template +class TensorBatchWrap +{ + static_assert(HasTypeTraits, "TensorBatchWrap can only be used if T has type traits"); + +public: + // The type provided as template parameter is the value type, i.e. the type of each element inside this wrapper. + using ValueType = const T; + + static constexpr int kNumDimensions = sizeof...(Strides); + static constexpr int kVariableStrides = ((Strides == -1) + ...); + static constexpr int kConstantStrides = kNumDimensions - kVariableStrides; + + TensorBatchWrap() = default; + + /** + * Constructs a constant TensorBatchWrap by wrapping a \p data argument. + * + * @param[in] data Tensor batch data to wrap. + */ + __host__ TensorBatchWrap(const TensorBatchDataStridedCuda &data) + : TensorBatchWrap(data.cdata()) + { + } + + /** + * Constructs a constant TensorBatchWrap by wrapping a \p data argument. + * + * @param[in] data Tensor batch data to wrap. + */ + __host__ __device__ TensorBatchWrap(const NVCVTensorBatchData &data) + : m_numTensors(data.numTensors) + , m_tensors(data.buffer.strided.tensors) + { + } + + /** + * Get a read-only proxy (as pointer) of the given tensor at the given coordinates. + * + * @param[in] t Tensor index in the list. + * @param[in] c Coordinates in the given tensor; + * + * @return The const pointer to the beginning of the given coordinates. + */ + template + inline const __host__ __device__ T *ptr(int t, Coords... c) const + { + return doGetPtr(t, c...); + } + + /** + * Subscript operator for read-and-write access. + * + * @param[in] t Tensor index in the list. + * @param[in] c (N+1)-D coordinates tensor index and coords (from last to first dimension) to be accessed. + * E.g. for a 2-dimensional tensors, the coordinates would be: {tensor_id, column, row} + * + * @return Accessed reference. + */ + template>>> + inline const __host__ __device__ T &operator[](DimType c) const + { + static_assert(NumElements == kNumDimensions + 1, + "Coordinates in the subscript operator must be (N+1)-dimensional, " + "where N is a dimensionality of a single tensor in the batch."); + if constexpr (NumElements == 1) + { + return *doGetPtr(c.x); + } + if constexpr (NumElements == 2) + { + return *doGetPtr(c.x, c.y); + } + else if constexpr (NumElements == 3) + { + return *doGetPtr(c.x, c.z, c.y); + } + else if constexpr (NumElements == 4) + { + return *doGetPtr(c.x, c.w, c.z, c.y); + } + } + + /** + * @brief Constructs a read-only wrapper for the tensor on index \p t + * The list of static strides can be provided as a template parameter. + * It should be a list of N outer strides (from inner to outer). 
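+ *
+ * For example (an illustrative sketch; the wrapper variable and the concrete stride template
+ * arguments are assumptions that depend on how the wrapped batch was declared):
+ * @code
+ * // Read one value from tensor t of a batch of HWC tensors with run-time strides:
+ * auto tw = batchWrap.tensor(t);        // read-only TensorWrap for the t-th tensor
+ * auto v  = *batchWrap.ptr(t, y, x, c); // equivalent direct access through ptr()
+ * @endcode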
+ * + * @tparam Strides static strides + * @param t index of the tensor + */ + inline const __host__ __device__ auto tensor(int t) const + { + return TensorWrap(doGetPtr(t), strides(t)); + } + + /** + * @brief Returns a number of tensors in the batch. + */ + inline __host__ __device__ int32_t numTensors() const + { + return m_numTensors; + } + + /** + * @brief Returns a pointer to shape buffer of the tensor at index \p t + * + * @param t tensor index + */ + inline const __host__ __device__ int64_t *shape(int t) const + { + assert(t >= 0 && t < m_numTensors); + return m_tensors[t].shape; + } + + /** + * @brief Returns a pointer to a stride buffer of the tensor at index \p t + * + * @param t tensor index + */ + inline const __host__ __device__ int64_t *strides(int t) const + { + assert(t >= 0 && t < m_numTensors); + return m_tensors[t].stride; + } + +protected: + template + inline __host__ __device__ T *doGetPtr(int t, Args... c) const + { + static_assert(std::conjunction_v...>); + static_assert(sizeof...(Args) <= kNumDimensions); + + constexpr int kArgSize = sizeof...(Args); + constexpr int kVarSize = kArgSize < kVariableStrides ? kArgSize : kVariableStrides; + constexpr int kDimSize = kArgSize < kNumDimensions ? kArgSize : kNumDimensions; + constexpr int kStride[] = {std::forward(Strides)...}; + + // Computing offset first potentially postpones or avoids 64-bit math during addressing + int offset = 0; + if constexpr (kArgSize > 0) + { + int coords[] = {std::forward(c)...}; + const int64_t *strides = m_tensors[t].stride; + +#pragma unroll + for (int i = 0; i < kVarSize; ++i) + { + offset += coords[i] * strides[i]; + } +#pragma unroll + for (int i = kVariableStrides; i < kDimSize; ++i) + { + offset += coords[i] * kStride[i]; + } + } + + NVCVByte *dataPtr = m_tensors[t].data; + return reinterpret_cast(dataPtr + offset); + } + + int32_t m_numTensors; + NVCVTensorBatchElementStridedRec *m_tensors; +}; + +/** + * TensorBatch wrapper class specialized for non-constant value type. + * + * @tparam T Type (non-const) of each element inside the tensor batch wrapper. + * @tparam Strides Each compile-time (use -1 for run-time) pitch in bytes from first to last dimension. + */ +template +class TensorBatchWrap : public TensorBatchWrap +{ + using Base = TensorBatchWrap; + +public: + using ValueType = T; + using Base::doGetPtr; + using Base::kNumDimensions; + using Base::m_tensors; + using Base::strides; + + /** + * Constructs a TensorBatchWrap by wrapping a \p data argument. + * + * @param[in] data Tensor batch data to wrap. + */ + __host__ TensorBatchWrap(const TensorBatchDataStridedCuda &data) + : Base(data) + { + } + + /** + * Constructs a TensorBatchWrap by wrapping a \p data argument. + * + * @param[in] data Tensor batch data to wrap. + */ + __host__ __device__ TensorBatchWrap(NVCVTensorBatchData &data) + : Base(data) + { + } + + /** + * Get a read-and-write proxy (as pointer) of the given tensor at the given coordinates. + * + * @param[in] t Tensor index in the list. + * @param[in] c Coordinates in the given tensor; + * + * @return The const pointer to the beginning of the given coordinates. + */ + template + inline __host__ __device__ T *ptr(int t, Coords... c) const + { + return doGetPtr(t, c...); + } + + /** + * @brief Constructs a read-and-write wrapper for the tensor on index \p t + * The list of static strides can be provided as a template parameter. + * It should be a list of N outer strides (from inner to outer). 
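+ *
+ * A write-access sketch (the variable names below are illustrative assumptions):
+ * @code
+ * auto tw = batchWrap.tensor(t);       // read-and-write TensorWrap for the t-th tensor
+ * *batchWrap.ptr(t, y, x, c) = value;  // or write directly through ptr()
+ * @endcode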
+ * + * @tparam Strides static strides + * @param t index of the tensor + */ + inline __host__ __device__ auto tensor(int t) const + { + return TensorWrap(doGetPtr(t), strides(t)); + } + + /** + * Subscript operator for read-and-write access. + * + * @param[in] t Tensor index in the list. + * @param[in] c (N+1)-D coordinates - tensor index and coords (from inner to outer) to be accessed. + * E.g. for a 2-dimensional tensors, the coordinates would be: {tensor_id, column, row} + * + * @return Accessed reference. + */ + template>>> + inline __host__ __device__ T &operator[](DimType c) const + { + static_assert(NumElements == kNumDimensions + 1, + "Coordinates in the subscript operator must be (N+1)-dimensional, " + "where N is a dimensionality of a single tensor in the batch."); + if constexpr (NumElements == 1) + { + return *doGetPtr(c.x); + } + if constexpr (NumElements == 2) + { + return *doGetPtr(c.x, c.y); + } + else if constexpr (NumElements == 3) + { + return *doGetPtr(c.x, c.z, c.y); + } + else if constexpr (NumElements == 4) + { + return *doGetPtr(c.x, c.w, c.z, c.y); + } + } +}; + +/**@}*/ + +/** + * Specializes \ref TensorBatchWrap template classes to different dimensions. + * + * The specializations have the last dimension as the only compile-time dimension as size of T. All other + * dimensions have run-time pitch and must be provided. + * + * Template arguments: + * - T data type of each element in \ref TensorBatchWrap + * + * @sa NVCV_CPP_CUDATOOLS_TENSORBATCHWRAP + * + * @defgroup NVCV_CPP_CUDATOOLS_TENSORBATCHWRAPS TensorBatchWrap shortcuts + * @{ + */ + +template +using TensorBatch1DWrap = TensorBatchWrap; + +template +using TensorBatch2DWrap = TensorBatchWrap; + +template +using TensorBatch3DWrap = TensorBatchWrap; + +template +using TensorBatch4DWrap = TensorBatchWrap; + +template +using TensorBatch5DWrap = TensorBatchWrap; + +template +using TensorBatchNDWrap = std::conditional_t< + N == 1, TensorBatch1DWrap, + std::conditional_t, + std::conditional_t, + std::conditional_t, + std::conditional_t, void>>>>>; +/**@}*/ + +} // namespace nvcv::cuda + +#endif // NVCV_CUDA_TENSOR_BATCH_WRAP_HPP diff --git a/src/nvcv_types/include/nvcv/cuda/TensorWrap.hpp b/src/nvcv_types/include/nvcv/cuda/TensorWrap.hpp index 487b4f82c..1cc7143be 100644 --- a/src/nvcv_types/include/nvcv/cuda/TensorWrap.hpp +++ b/src/nvcv_types/include/nvcv/cuda/TensorWrap.hpp @@ -110,6 +110,23 @@ class TensorWrap static_assert(sizeof...(Args) == kVariableStrides); } + /** + * Constructs a constant TensorWrap by wrapping a const \p data pointer argument + * and copying the dyncamic strides from a given buffer. + * + * @param[in] data Pointer to the data that will be wrapped. + * @param[in] strides Pointer to stride data + */ + template + explicit __host__ __device__ TensorWrap(const DataType *data, StrideType *strides) + : m_data(reinterpret_cast(data)) + { + for (int i = 0; i < kVariableStrides; ++i) + { + m_strides[i] = strides[i]; + } + } + /** * Constructs a constant TensorWrap by wrapping an \p image argument. * @@ -278,6 +295,19 @@ class TensorWrap : public TensorWrap { } + /** + * Constructs a TensorWrap by wrapping a const \p data pointer argument + * and copying the dyncamic strides from a given buffer. + * + * @param[in] data Pointer to the data that will be wrapped. + * @param[in] strides Pointer to stride data + */ + template + explicit __host__ __device__ TensorWrap(DataType *data, StrideType *strides) + : Base(data, strides) + { + } + /** * Constructs a TensorWrap by wrapping an \p image argument. 
* @@ -385,11 +415,16 @@ using Tensor3DWrap = TensorWrap; template using Tensor4DWrap = TensorWrap; +template +using Tensor5DWrap = TensorWrap; + template using TensorNDWrap = std::conditional_t< N == 1, Tensor1DWrap, std::conditional_t, - std::conditional_t, std::conditional_t, void>>>>; + std::conditional_t, + std::conditional_t, + std::conditional_t, void>>>>>; /**@}*/ diff --git a/src/nvcv_types/priv/IAllocator.cpp b/src/nvcv_types/priv/IAllocator.cpp index 6807fe36a..dbd7f03e4 100644 --- a/src/nvcv_types/priv/IAllocator.cpp +++ b/src/nvcv_types/priv/IAllocator.cpp @@ -73,7 +73,7 @@ void *IAllocator::allocHostPinnedMem(int64_t size, int32_t align) if (util::RoundUp(size, align) != size) { throw Exception(NVCV_ERROR_INVALID_ARGUMENT, - "Host memory allocator size must be an integral multiple of alignment %d, not %ld", align, + "Pinned memory allocator size must be an integral multiple of alignment %d, not %ld", align, size); } @@ -101,7 +101,7 @@ void *IAllocator::allocCudaMem(int64_t size, int32_t align) if (util::RoundUp(size, align) != size) { throw Exception(NVCV_ERROR_INVALID_ARGUMENT, - "Host memory allocator size must be an integral multiple of alignment %d, not %ld", align, + "Device memory allocator size must be an integral multiple of alignment %d, not %ld", align, size); } diff --git a/src/util/Compat.cpp b/src/util/Compat.cpp index 753a64799..77f8f046e 100644 --- a/src/util/Compat.cpp +++ b/src/util/Compat.cpp @@ -108,7 +108,7 @@ static std::vector *g_ListMainThread = nullptr; int my_thread_atexit_impl(void (*func)(void *), void *arg, void *d) { - std::vector *list; + std::vector *list = nullptr; if (IsMainThread()) { diff --git a/src/util/Math.hpp b/src/util/Math.hpp index 5e20f1b40..c558a8ee3 100644 --- a/src/util/Math.hpp +++ b/src/util/Math.hpp @@ -22,6 +22,7 @@ #include "Metaprogramming.hpp" #include +#include #include namespace nvcv::util { @@ -123,6 +124,18 @@ NVCV_CUDA_HOST_DEVICE constexpr auto DivUpPowerOfTwo(T num, TypeIdentity den) return (num >> ILog2(den)) + !!(num & (den - 1)); } +/// @brief Calculates normalized sinc i.e. `sin(pi * x) / (pi * x)` +template>> +NVCV_CUDA_HOST_DEVICE NVCV_FORCE_INLINE T sinc(T x) +{ + static_assert(sizeof(T) >= sizeof(float)); // not analyzed for smaller floats, eps may require adjustment + constexpr T eps = sizeof(T) <= sizeof(float) ? 1e-5 : 1e-8; + x *= static_cast(M_PI); + if (std::abs(x) < eps) + return static_cast(1.0) - x * x * (static_cast(1.0) / 6); // remove singularity by using Taylor expansion + return std::sin(x) / x; +} + } // namespace nvcv::util #endif // NVCV_UTIL_MATH_HPP diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 88ceacc58..2fb4c84d7 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -16,7 +16,14 @@ project(cvcuda_tests) set(CMAKE_FOLDER tests) -# Tests require C++20 +# Tests require C++20: +# The goal for the infrastructure written for tests is to make it easy to +# - add test cases for new parameter ranges, +# - make the tested parameter set visually match what's defined in the reference doc +# - ... +# so that we can quickly check if everything we claim is being tested. +# In order to achieve this, we created "tests/common/ValueList.hpp" that implements a domain-specific embedded language making it easier to define the above. +# To make usage easier, we had to use C++20 language features. 
set(CMAKE_CXX_STANDARD 20) enable_testing() diff --git a/tests/cvcuda/python/cvcuda_test_python.in b/tests/cvcuda/python/cvcuda_test_python.in index eb648d827..f1242e371 100755 --- a/tests/cvcuda/python/cvcuda_test_python.in +++ b/tests/cvcuda/python/cvcuda_test_python.in @@ -16,25 +16,30 @@ # limitations under the License. tests_dir="@PYTHON_TEST_DIR@" -python_versions="@PYTHON_TEST_VERSIONS@" +python_versions_tentative="@PYTHON_TEST_VERSIONS@" + +python_versions="" # Verify if correct package dependencies are installed -------- pip_depends="pytest torch" -declare -a install_commands - -for ver in $python_versions; do +# Collect all python versions that are indeed installed and have proper dependencies installed +# Two behaviors: +# - default: skip Python versions that are not installed or don't have pytest and torch installed +# - if NVCV_FORCE_PYTHON is set: exit with error +for ver in $python_versions_tentative; do if ! python$ver -c "import pytest, torch" > /dev/null 2>&1; then - install_commands+=("sudo python$ver -m pip install $pip_depends") + echo "WARNING: Python version $ver not installed or missing proper dependencies" + echo "Please install Python version $ver and run the following commands before running tests: sudo python$ver -m pip install $pip_depends" + if [[ "$NVCV_FORCE_PYTHON" == 1 || "$NVCV_FORCE_PYTHON" == yes ]]; then + exit 1 #hard exit + fi + else + echo "Found Python version $ver installed with proper dependencies, adding to tests" + python_versions+="$ver " fi done -if [[ "${install_commands[*]}" ]]; then - echo "Please run the following commands before running $(basename $0): " - ( IFS=$'\n'; echo -e "${install_commands[*]}" ) - exit 1 -fi - # Run tests -------- tmpdir=$(mktemp -d) diff --git a/tests/cvcuda/python/test_ophqresize.py b/tests/cvcuda/python/test_ophqresize.py new file mode 100644 index 000000000..3a571aebd --- /dev/null +++ b/tests/cvcuda/python/test_ophqresize.py @@ -0,0 +1,306 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
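+
+# These tests exercise the cvcuda.hq_resize / cvcuda.hq_resize_into bindings for plain Tensors,
+# ImageBatchVarShape and TensorBatch inputs, covering 2D (HW/HWC/NHWC) and 3D (DHW/DHWC/NDHWC)
+# layouts with optional ROI, antialiasing, and dtype conversion (e.g. U8 -> F32).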
+ +import nvcv +import cvcuda +import pytest as t +import cvcuda_util as util +import numpy as np + +RNG = np.random.default_rng(12345) + + +def get_shape(in_shape, layout, out_size): + assert len(out_size) in (2, 3) + out_size_layout = "HW" if len(out_size) == 2 else "DHW" + assert len(out_size) == len(out_size_layout) + out_size_map = dict(zip(out_size_layout, out_size)) + assert len(in_shape) == len(layout) + return tuple( + out_size_map.get(name, extent) for name, extent in zip(layout, in_shape) + ) + + +@t.mark.parametrize( + "src_args, dst_args, interpolation_args, roi", + [ + ( + ((2, 244, 244, 3), nvcv.Type.U8, "NHWC"), + ((122, 122), nvcv.Type.U8), + (cvcuda.Interp.NEAREST, cvcuda.Interp.NEAREST, False), + None, + ), + ( + ((2, 244, 244, 3), nvcv.Type.U8, "NHWC"), + ((122, 244), nvcv.Type.U8), + (cvcuda.Interp.LINEAR, cvcuda.Interp.LINEAR, False), + None, + ), + ( + ((1, 244, 244, 2), nvcv.Type.U8, "NHWC"), + ((122, 122), nvcv.Type.F32), + (cvcuda.Interp.LINEAR, cvcuda.Interp.CUBIC, True), + (50, 10, 230, 220), + ), + ( + ((3, 101, 244, 301, 3), nvcv.Type.U16, "NDHWC"), + ((122, 54, 101), nvcv.Type.U16), + (cvcuda.Interp.GAUSSIAN, cvcuda.Interp.CUBIC, True), + None, + ), + ( + ((54, 54, 54, 4), nvcv.Type.U8, "DHWC"), + ((100, 100, 100), nvcv.Type.U8), + (cvcuda.Interp.LANCZOS, cvcuda.Interp.LINEAR, True), + (54, 0, 0, 0, 54, 54), + ), + ( + ((101, 102, 103), nvcv.Type.U8, "DHW"), + ((41, 45, 49), nvcv.Type.F32), + (cvcuda.Interp.NEAREST, cvcuda.Interp.LINEAR, False), + None, + ), + ( + ((101, 102, 103), nvcv.Type.U8, "DHW"), + ((101, 45, 49), nvcv.Type.F32), + (cvcuda.Interp.NEAREST, cvcuda.Interp.LINEAR, False), + None, + ), + ], +) +def test_op_hq_resize_api(src_args, dst_args, interpolation_args, roi): + stream = cvcuda.Stream() + src_shape, src_type, layout = src_args + assert len(layout) == len(src_shape) + dst_size, dst_type = dst_args + min_interpolation, mag_interpolation, antialias = interpolation_args + out_shape = get_shape(src_shape, layout, dst_size) + + t_src = util.create_tensor(*src_args) + + if src_type != dst_type: + t_dst = util.create_tensor(out_shape, dst_type, layout) + t_tmp = cvcuda.hq_resize_into( + t_dst, + t_src, + min_interpolation=min_interpolation, + mag_interpolation=mag_interpolation, + antialias=antialias, + stream=stream, + roi=roi, + ) + assert t_tmp is t_dst + else: + t_dst = cvcuda.hq_resize( + t_src, + dst_size, + min_interpolation=min_interpolation, + mag_interpolation=mag_interpolation, + antialias=antialias, + stream=stream, + roi=roi, + ) + assert t_dst.layout == t_src.layout + assert t_dst.dtype == dst_type + assert t_dst.shape == out_shape + + +@t.mark.parametrize( + "num_samples, src_args, dst_type, interpolation_args, roi", + [ + ( + 1, + ((512, 1024, 3), np.uint8, "HWC"), + np.uint8, + (cvcuda.Interp.LINEAR, cvcuda.Interp.LINEAR, True), + None, + ), + ( + 5, + ((122, 244, 4), np.float32, "HWC"), + np.float32, + (cvcuda.Interp.CUBIC, cvcuda.Interp.CUBIC, False), + [(100, 200, 10, 10)], + ), + ( + 3, + ((244, 122), np.uint8, "HW"), + np.float32, + (cvcuda.Interp.NEAREST, cvcuda.Interp.NEAREST, False), + [(200, 100, 10, 10)], + ), + ], +) +def test_op_hq_resize_var_shape_api( + num_samples, src_args, dst_type, interpolation_args, roi +): + stream = cvcuda.Stream() + + src_shape, src_type, layout = src_args + assert len(layout) == len(src_shape) + min_interpolation, mag_interpolation, antialias = interpolation_args + + b_src = nvcv.ImageBatchVarShape(num_samples) + out_sizes = [] + for _ in range(num_samples): + sample_size = tuple( + 
RNG.integers(1, extent + 1) + for name, extent in zip(layout, src_shape) + if name in "HW" + ) + sample_shape = get_shape(src_shape, layout, sample_size) + h_data = util.generate_data(sample_shape, src_type, rng=RNG) + image = util.to_nvcv_image(h_data) + b_src.pushback(image) + out_sizes.append( + tuple(RNG.integers(1, 2 * extent + 1) for extent in sample_size) + ) + + if src_type != dst_type: + b_dst = nvcv.ImageBatchVarShape(num_samples) + assert len(out_sizes) == num_samples + for out_size in out_sizes: + out_shape = get_shape(src_shape, layout, out_size) + h_data = util.generate_data(out_shape, dst_type, rng=RNG) + image = util.to_nvcv_image(h_data) + b_dst.pushback(image) + + b_tmp = cvcuda.hq_resize_into( + b_dst, + b_src, + min_interpolation=min_interpolation, + mag_interpolation=mag_interpolation, + antialias=antialias, + stream=stream, + roi=roi, + ) + assert b_tmp is b_dst + else: + b_dst = cvcuda.hq_resize( + b_src, + out_sizes, + min_interpolation=min_interpolation, + mag_interpolation=mag_interpolation, + antialias=antialias, + stream=stream, + roi=roi, + ) + + assert len(b_dst) == len(b_src) + assert b_dst.capacity == b_src.capacity + assert b_dst.uniqueformat == b_src.uniqueformat + assert b_dst.maxsize == tuple( + max(extent) for extent in reversed(list(zip(*out_sizes))) + ) + + +@t.mark.parametrize( + "num_samples, src_args, dst_type, interpolation_args, use_roi", + [ + ( + 7, + ((244, 244, 3), nvcv.Type.U8, "HWC"), + nvcv.Type.U8, + (cvcuda.Interp.NEAREST, cvcuda.Interp.NEAREST, False), + False, + ), + ( + 5, + ((244, 244), nvcv.Type.U8, "HW"), + nvcv.Type.F32, + (cvcuda.Interp.LINEAR, cvcuda.Interp.CUBIC, True), + True, + ), + ( + 3, + ((101, 244, 301, 3), nvcv.Type.U16, "DHWC"), + nvcv.Type.U16, + (cvcuda.Interp.GAUSSIAN, cvcuda.Interp.CUBIC, True), + True, + ), + ( + 1, + ((101, 102, 103), nvcv.Type.U8, "DHW"), + nvcv.Type.F32, + (cvcuda.Interp.NEAREST, cvcuda.Interp.LINEAR, False), + False, + ), + ], +) +def test_op_hq_resize_tensor_batch_api( + num_samples, src_args, dst_type, interpolation_args, use_roi +): + stream = cvcuda.Stream() + + src_shape, src_type, layout = src_args + assert len(layout) == len(src_shape) + min_interpolation, mag_interpolation, antialias = interpolation_args + + b_src = nvcv.TensorBatch(num_samples) + out_sizes = [] + rois = [] + for _ in range(num_samples): + sample_size = tuple( + RNG.integers(1, extent + 1) + for name, extent in zip(layout, src_shape) + if name in "DHW" + ) + sample_shape = get_shape(src_shape, layout, sample_size) + t_src = util.create_tensor(sample_shape, src_type, layout) + b_src.pushback(t_src) + out_sizes.append( + tuple(RNG.integers(1, 2 * extent + 1) for extent in sample_size) + ) + if use_roi: + roi = tuple( + RNG.integers(1, extent + 1) for _ in range(2) for extent in sample_size + ) + rois.append(roi) + + if src_type != dst_type: + b_dst = nvcv.TensorBatch(num_samples) + assert len(out_sizes) == num_samples + for out_size in out_sizes: + out_shape = get_shape(src_shape, layout, out_size) + t_dst = util.create_tensor(out_shape, dst_type, layout) + b_dst.pushback(t_dst) + + b_tmp = cvcuda.hq_resize_into( + b_dst, + b_src, + min_interpolation=min_interpolation, + mag_interpolation=mag_interpolation, + antialias=antialias, + stream=stream, + roi=None if not use_roi else rois, + ) + assert b_dst is b_tmp + else: + b_dst = cvcuda.hq_resize( + b_src, + out_sizes, + min_interpolation=min_interpolation, + mag_interpolation=mag_interpolation, + antialias=antialias, + stream=stream, + roi=None if not use_roi else rois, + 
) + assert len(b_dst) == len(b_src) + assert b_dst.capacity == b_src.capacity + assert b_dst.layout == b_src.layout + assert b_dst.ndim == b_src.ndim + assert b_dst.dtype == dst_type + for i in range(num_samples): + assert b_dst[i].shape == get_shape(src_shape, layout, out_sizes[i]) diff --git a/tests/cvcuda/system/CMakeLists.txt b/tests/cvcuda/system/CMakeLists.txt index a9e1a6483..a14060961 100644 --- a/tests/cvcuda/system/CMakeLists.txt +++ b/tests/cvcuda/system/CMakeLists.txt @@ -88,6 +88,7 @@ add_executable(cvcuda_test_system GaussianNoiseUtils.cu TestOpInpaint.cpp TestOpFindHomography.cpp + TestOpHQResize.cpp ) target_link_libraries(cvcuda_test_system diff --git a/tests/cvcuda/system/ResizeUtils.cpp b/tests/cvcuda/system/ResizeUtils.cpp index 9dea3ae34..401ed59f8 100644 --- a/tests/cvcuda/system/ResizeUtils.cpp +++ b/tests/cvcuda/system/ResizeUtils.cpp @@ -29,7 +29,8 @@ namespace nvcv::test { void Resize(std::vector &hDst, int dstRowStride, nvcv::Size2D dstSize, const std::vector &hSrc, - int srcRowStride, nvcv::Size2D srcSize, nvcv::ImageFormat fmt, NVCVInterpolationType interpolation) + int srcRowStride, nvcv::Size2D srcSize, nvcv::ImageFormat fmt, NVCVInterpolationType interpolation, + bool isVarshape) { if (interpolation == NVCV_INTERP_NEAREST || interpolation == NVCV_INTERP_LINEAR || interpolation == NVCV_INTERP_CUBIC) @@ -85,65 +86,107 @@ void Resize(std::vector &hDst, int dstRowStride, nvcv::Size2D dstSize, } else { - double invscale - = 1.f / (std::min(jScale, srcSize.w - fsx1) * std::min(iScale, srcSize.h - fsy1)); - - for (int dy = sy1; dy < sy2; ++dy) + if (!isVarshape || (iScale >= 1.0f && jScale >= 1.0f)) { - for (int dx = sx1; dx < sx2; ++dx) - if (dy >= 0 && dy < srcSize.h && dx >= 0 && dx < srcSize.w) - out = out + srcPtr[dy * srcRowStride + dx * elementsPerPixel + k] * invscale; + double invscale + = 1.f / (std::min(jScale, srcSize.w - fsx1) * std::min(iScale, srcSize.h - fsy1)); - if (sx1 > fsx1) - if (dy >= 0 && dy < srcSize.h && sx1 - 1 >= 0 && sx1 - 1 < srcSize.w) + for (int dy = sy1; dy < sy2; ++dy) + { + for (int dx = sx1; dx < sx2; ++dx) + if (dy >= 0 && dy < srcSize.h && dx >= 0 && dx < srcSize.w) + out = out + srcPtr[dy * srcRowStride + dx * elementsPerPixel + k] * invscale; + + if (sx1 > fsx1) + if (dy >= 0 && dy < srcSize.h && sx1 - 1 >= 0 && sx1 - 1 < srcSize.w) + out = out + + srcPtr[dy * srcRowStride + (sx1 - 1) * elementsPerPixel + k] + * ((sx1 - fsx1) * invscale); + + if (sx2 < fsx2) + if (dy >= 0 && dy < srcSize.h && sx2 >= 0 && sx2 < srcSize.w) + out = out + + srcPtr[dy * srcRowStride + sx2 * elementsPerPixel + k] + * ((fsx2 - sx2) * invscale); + } + + if (sy1 > fsy1) + for (int dx = sx1; dx < sx2; ++dx) + if (sy1 - 1 >= 0 && sy1 - 1 < srcSize.h && dx >= 0 && dx < srcSize.w) + out = out + + srcPtr[(sy1 - 1) * srcRowStride + dx * elementsPerPixel + k] + * ((sy1 - fsy1) * invscale); + + if (sy2 < fsy2) + for (int dx = sx1; dx < sx2; ++dx) + if (sy2 >= 0 && sy2 < srcSize.h && dx >= 0 && dx < srcSize.w) + out = out + + srcPtr[sy2 * srcRowStride + dx * elementsPerPixel + k] + * ((fsy2 - sy2) * invscale); + + if ((sy1 > fsy1) && (sx1 > fsx1)) + if (sy1 - 1 >= 0 && sy1 - 1 < srcSize.h && sx1 - 1 >= 0 && sx1 - 1 < srcSize.w) out = out - + srcPtr[dy * srcRowStride + (sx1 - 1) * elementsPerPixel + k] - * ((sx1 - fsx1) * invscale); + + srcPtr[(sy1 - 1) * srcRowStride + (sx1 - 1) * elementsPerPixel + k] + * ((sy1 - fsy1) * (sx1 - fsx1) * invscale); - if (sx2 < fsx2) - if (dy >= 0 && dy < srcSize.h && sx2 >= 0 && sx2 < srcSize.w) + if ((sy1 > fsy1) && (sx2 < 
fsx2)) + if (sy1 - 1 >= 0 && sy1 - 1 < srcSize.h && sx2 >= 0 && sx2 < srcSize.w) out = out - + srcPtr[dy * srcRowStride + sx2 * elementsPerPixel + k] - * ((fsx2 - sx2) * invscale); - } + + srcPtr[(sy1 - 1) * srcRowStride + sx2 * elementsPerPixel + k] + * ((sy1 - fsy1) * (fsx2 - sx2) * invscale); - if (sy1 > fsy1) - for (int dx = sx1; dx < sx2; ++dx) - if (sy1 - 1 >= 0 && sy1 - 1 < srcSize.h && dx >= 0 && dx < srcSize.w) + if ((sy2 < fsy2) && (sx2 < fsx2)) + if (sy2 >= 0 && sy2 < srcSize.h && sx2 >= 0 && sx2 < srcSize.w) out = out - + srcPtr[(sy1 - 1) * srcRowStride + dx * elementsPerPixel + k] - * ((sy1 - fsy1) * invscale); + + srcPtr[sy2 * srcRowStride + sx2 * elementsPerPixel + k] + * ((fsy2 - sy2) * (fsx2 - sx2) * invscale); - if (sy2 < fsy2) - for (int dx = sx1; dx < sx2; ++dx) - if (sy2 >= 0 && sy2 < srcSize.h && dx >= 0 && dx < srcSize.w) + if ((sy2 < fsy2) && (sx1 > fsx1)) + if (sy2 >= 0 && sy2 < srcSize.h && sx1 - 1 >= 0 && sx1 - 1 < srcSize.w) out = out - + srcPtr[sy2 * srcRowStride + dx * elementsPerPixel + k] - * ((fsy2 - sy2) * invscale); - - if ((sy1 > fsy1) && (sx1 > fsx1)) - if (sy1 - 1 >= 0 && sy1 - 1 < srcSize.h && sx1 - 1 >= 0 && sx1 - 1 < srcSize.w) - out = out - + srcPtr[(sy1 - 1) * srcRowStride + (sx1 - 1) * elementsPerPixel + k] - * ((sy1 - fsy1) * (sx1 - fsx1) * invscale); - - if ((sy1 > fsy1) && (sx2 < fsx2)) - if (sy1 - 1 >= 0 && sy1 - 1 < srcSize.h && sx2 >= 0 && sx2 < srcSize.w) - out = out - + srcPtr[(sy1 - 1) * srcRowStride + sx2 * elementsPerPixel + k] - * ((sy1 - fsy1) * (fsx2 - sx2) * invscale); - - if ((sy2 < fsy2) && (sx2 < fsx2)) - if (sy2 >= 0 && sy2 < srcSize.h && sx2 >= 0 && sx2 < srcSize.w) - out = out - + srcPtr[sy2 * srcRowStride + sx2 * elementsPerPixel + k] - * ((fsy2 - sy2) * (fsx2 - sx2) * invscale); - - if ((sy2 < fsy2) && (sx1 > fsx1)) - if (sy2 >= 0 && sy2 < srcSize.h && sx1 - 1 >= 0 && sx1 - 1 < srcSize.w) - out = out - + srcPtr[sy2 * srcRowStride + (sx1 - 1) * elementsPerPixel + k] - * ((fsy2 - sy2) * (sx1 - fsx1) * invscale); + + srcPtr[sy2 * srcRowStride + (sx1 - 1) * elementsPerPixel + k] + * ((fsy2 - sy2) * (sx1 - fsx1) * invscale); + } + else // zoom in for varshape + { + double iScale_inv = 1.0 / iScale; + double jScale_inv = 1.0 / jScale; + + sy1 = cuda::round(fsy1); + sx1 = cuda::round(fsx1); + float fy = (float)(float(di + 1) - float(sy1 + 1) * iScale_inv); + fy = fy <= 0 ? 0.f : fy - cuda::round(fy); + + float cbufy[2]; + cbufy[0] = 1.f - fy; + cbufy[1] = fy; + + float fx = (float)(float(dj + 1) - float(sx1 + 1) * jScale_inv); + fx = fx <= 0 ? 
0.f : fx - cuda::round(fx); + + if (sx1 < 0) + { + fx = 0, sx1 = 0; + } + if (sx1 >= srcSize.w - 1) + { + fx = 0, sx1 = srcSize.w - 2; + } + if (sy1 >= srcSize.h - 1) + { + sy1 = srcSize.h - 2; + } + + float cbufx[2]; + cbufx[0] = 1.f - fx; + cbufx[1] = fx; + out = srcPtr[sy1 * srcRowStride + sx1 * elementsPerPixel + k] * cbufx[0] * cbufy[0] + + srcPtr[(sy1 + 1) * srcRowStride + sx1 * elementsPerPixel + k] * cbufx[0] * cbufy[1] + + srcPtr[sy1 * srcRowStride + (sx1 + 1) * elementsPerPixel + k] * cbufx[1] * cbufy[0] + + srcPtr[(sy1 + 1) * srcRowStride + (sx1 + 1) * elementsPerPixel + k] * cbufx[1] + * cbufy[1]; + } } out = std::rint(std::abs(out)); diff --git a/tests/cvcuda/system/ResizeUtils.hpp b/tests/cvcuda/system/ResizeUtils.hpp index 3296f6283..d8c27f7ca 100644 --- a/tests/cvcuda/system/ResizeUtils.hpp +++ b/tests/cvcuda/system/ResizeUtils.hpp @@ -30,7 +30,8 @@ namespace nvcv::test { // support NVCV_INTERP_NEAREST/NVCV_INTERP_LINEAR/NVCV_INTERP_CUBIC/NVCV_INTERP_AREA void Resize(std::vector &hDst, int dstRowStride, nvcv::Size2D dstSize, const std::vector &hSrc, - int srcRowStride, nvcv::Size2D srcSize, nvcv::ImageFormat fmt, NVCVInterpolationType interpolation); + int srcRowStride, nvcv::Size2D srcSize, nvcv::ImageFormat fmt, NVCVInterpolationType interpolation, + bool isVarshape); // only support NVCV_INTERP_NEAREST/NVCV_INTERP_LINEAR/NVCV_INTERP_CUBIC void ResizedCrop(std::vector &hDst, int dstRowStride, nvcv::Size2D dstSize, const std::vector &hSrc, diff --git a/tests/cvcuda/system/TestOpChannelReorder.cpp b/tests/cvcuda/system/TestOpChannelReorder.cpp index e2f2eb158..011491bb4 100644 --- a/tests/cvcuda/system/TestOpChannelReorder.cpp +++ b/tests/cvcuda/system/TestOpChannelReorder.cpp @@ -27,7 +27,43 @@ namespace test = nvcv::test; -TEST(TestOpChannelReorder, smoke_test_works) +class TestOpChannelReorder : public ::testing::Test +{ +protected: + TestOpChannelReorder() {} + + ~TestOpChannelReorder() {} + + void SetUp() override + { + // clang-format off + inOrders = nvcv::Tensor( + { + {1, 4}, + "NC" + }, + nvcv::TYPE_S32); + // clang-format on + } + + void pushDefaultImages() + { + in.pushBack(nvcv::Image{ + nvcv::Size2D{4, 2}, + nvcv::FMT_RGBA8 + }); + out.pushBack(nvcv::Image{ + nvcv::Size2D{4, 2}, + nvcv::FMT_RGBA8 + }); + } + + nvcv::ImageBatchVarShape in{nvcv::ImageBatchVarShape(2)}, out{nvcv::ImageBatchVarShape(2)}; + nvcv::Tensor inOrders; + cvcuda::ChannelReorder chReorder; +}; + +TEST_F(TestOpChannelReorder, smoke_test_works) { // Let's set up input and output images nvcv::Image inImages[2] = { @@ -40,7 +76,8 @@ TEST(TestOpChannelReorder, smoke_test_works) nvcv::Image{nvcv::Size2D{4, 2}, nvcv::FMT_RGBA8} }; - nvcv::ImageBatchVarShape in(2), out(2); + in = nvcv::ImageBatchVarShape(2); + out = nvcv::ImageBatchVarShape(2); // Create the input and output varshapes in.pushBack(inImages[0]); @@ -66,7 +103,7 @@ TEST(TestOpChannelReorder, smoke_test_works) // Populate the order tensor // clang-format off - nvcv::Tensor inOrders( + inOrders = nvcv::Tensor( { {2, 4}, "NC" @@ -89,8 +126,6 @@ TEST(TestOpChannelReorder, smoke_test_works) cudaStream_t stream; ASSERT_EQ(cudaSuccess, cudaStreamCreate(&stream)); - cvcuda::ChannelReorder chReorder; - chReorder(stream, in, out, inOrders); ASSERT_EQ(cudaSuccess, cudaStreamSynchronize(stream)); @@ -110,3 +145,222 @@ TEST(TestOpChannelReorder, smoke_test_works) EXPECT_EQ(make_uchar4(4, 1, 2, 0), outImageValues[0]); EXPECT_EQ(make_uchar4(28, 10, 3, 0), outImageValues[1]); } + +TEST_F(TestOpChannelReorder, create_with_null_handle) +{ + 
EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, cvcudaChannelReorderCreate(nullptr)); +} + +TEST_F(TestOpChannelReorder, infer_different_samples) +{ + in.pushBack(nvcv::Image{ + nvcv::Size2D{4, 2}, + nvcv::FMT_RGBA8 + }); + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, nvcv::ProtectCall([&] { chReorder(NULL, in, out, inOrders); })); +} + +TEST_F(TestOpChannelReorder, infer_void_samples) +{ + EXPECT_EQ(NVCV_SUCCESS, nvcv::ProtectCall([&] { chReorder(NULL, in, out, inOrders); })); +} + +TEST_F(TestOpChannelReorder, infer_invalid_input_dataType) +{ + in.pushBack(nvcv::Image{ + nvcv::Size2D{4, 2}, + nvcv::FMT_RGBAf16 + }); + out.pushBack(nvcv::Image{ + nvcv::Size2D{4, 2}, + nvcv::FMT_RGBA8 + }); + + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, nvcv::ProtectCall([&] { chReorder(NULL, in, out, inOrders); })); +} + +TEST_F(TestOpChannelReorder, infer_invalid_output_dataType) +{ + in.pushBack(nvcv::Image{ + nvcv::Size2D{4, 2}, + nvcv::FMT_RGBA8 + }); + out.pushBack(nvcv::Image{ + nvcv::Size2D{4, 2}, + nvcv::FMT_RGBAf16 + }); + + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, nvcv::ProtectCall([&] { chReorder(NULL, in, out, inOrders); })); +} + +TEST_F(TestOpChannelReorder, infer_invalid_order_rank) +{ + pushDefaultImages(); + + // clang-format off + inOrders= nvcv::Tensor( + { + {1, 4, 4}, + "NHW" + }, + nvcv::TYPE_S32); + // clang-format on + + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, nvcv::ProtectCall([&] { chReorder(NULL, in, out, inOrders); })); +} + +TEST_F(TestOpChannelReorder, infer_invalid_order_dataType) +{ + pushDefaultImages(); + + // clang-format off + inOrders= nvcv::Tensor( + { + {1, 4}, + "NC" + }, + nvcv::TYPE_F32); + // clang-format on + + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, nvcv::ProtectCall([&] { chReorder(NULL, in, out, inOrders); })); +} + +TEST_F(TestOpChannelReorder, infer_invalid_order_first_label) +{ + pushDefaultImages(); + + // clang-format off + inOrders= nvcv::Tensor( + { + {4, 1}, + "CN" + }, + nvcv::TYPE_S32); + // clang-format on + + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, nvcv::ProtectCall([&] { chReorder(NULL, in, out, inOrders); })); +} + +TEST_F(TestOpChannelReorder, infer_invalid_order_num_samples) +{ + pushDefaultImages(); + + // clang-format off + inOrders= nvcv::Tensor( + { + {2, 4}, + "NC" + }, + nvcv::TYPE_S32); + // clang-format on + + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, nvcv::ProtectCall([&] { chReorder(NULL, in, out, inOrders); })); +} + +TEST_F(TestOpChannelReorder, infer_invalid_order_num_channels) +{ + pushDefaultImages(); + + // clang-format off + inOrders= nvcv::Tensor( + { + {1, 5}, + "NC" + }, + nvcv::TYPE_S32); + // clang-format on + + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, nvcv::ProtectCall([&] { chReorder(NULL, in, out, inOrders); })); +} + +TEST_F(TestOpChannelReorder, infer_invalid_order_small_num_channels) +{ + pushDefaultImages(); + + // clang-format off + inOrders= nvcv::Tensor( + { + {1, 3}, + "NC" + }, + nvcv::TYPE_S32); + // clang-format on + + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, nvcv::ProtectCall([&] { chReorder(NULL, in, out, inOrders); })); +} + +TEST_F(TestOpChannelReorder, infer_invalid_input_planar) +{ + in.pushBack(nvcv::Image{ + nvcv::Size2D{4, 2}, + nvcv::FMT_BGRA8p + }); + out.pushBack(nvcv::Image{ + nvcv::Size2D{4, 2}, + nvcv::FMT_BGRA8 + }); + + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, nvcv::ProtectCall([&] { chReorder(NULL, in, out, inOrders); })); +} + +TEST_F(TestOpChannelReorder, infer_invalid_output_planar) +{ + in.pushBack(nvcv::Image{ + nvcv::Size2D{4, 2}, + nvcv::FMT_BGRA8 + }); + out.pushBack(nvcv::Image{ + nvcv::Size2D{4, 2}, + 
nvcv::FMT_BGRA8p + }); + + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, nvcv::ProtectCall([&] { chReorder(NULL, in, out, inOrders); })); +} + +TEST_F(TestOpChannelReorder, infer_invalid_input_different_channels) +{ + pushDefaultImages(); + in.pushBack(nvcv::Image{ + nvcv::Size2D{4, 2}, + nvcv::FMT_BGR8 + }); + out.pushBack(nvcv::Image{ + nvcv::Size2D{4, 2}, + nvcv::FMT_BGR8 + }); + + // clang-format off + inOrders= nvcv::Tensor( + { + {2, 4}, + "NC" + }, + nvcv::TYPE_S32); + // clang-format on + + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, nvcv::ProtectCall([&] { chReorder(NULL, in, out, inOrders); })); +} + +TEST_F(TestOpChannelReorder, infer_invalid_input_different_format) +{ + pushDefaultImages(); + in.pushBack(nvcv::Image{ + nvcv::Size2D{4, 2}, + nvcv::FMT_BGRAf32 + }); + out.pushBack(nvcv::Image{ + nvcv::Size2D{4, 2}, + nvcv::FMT_BGRAf32 + }); + + // clang-format off + inOrders= nvcv::Tensor( + { + {2, 4}, + "NC" + }, + nvcv::TYPE_S32); + // clang-format on + + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, nvcv::ProtectCall([&] { chReorder(NULL, in, out, inOrders); })); +} diff --git a/tests/cvcuda/system/TestOpCvtColor.cpp b/tests/cvcuda/system/TestOpCvtColor.cpp index dd9f92cae..bb6bca005 100644 --- a/tests/cvcuda/system/TestOpCvtColor.cpp +++ b/tests/cvcuda/system/TestOpCvtColor.cpp @@ -32,16 +32,72 @@ namespace test = nvcv::test; namespace cuda = nvcv::cuda; -#define VEC_EXPECT_NEAR(vec1, vec2, delta) \ - ASSERT_EQ(vec1.size(), vec2.size()); \ - for (std::size_t idx = 0; idx < vec1.size(); ++idx) \ - { \ - EXPECT_NEAR(vec1[idx], vec2[idx], delta) << "At index " << idx; \ +#define VEC_EXPECT_NEAR(vec1, vec2, delta, dtype) \ + ASSERT_EQ(vec1.size(), vec2.size()); \ + for (std::size_t idx = 0; idx < vec1.size() / sizeof(dtype); ++idx) \ + { \ + EXPECT_NEAR(reinterpret_cast(vec1.data())[idx], reinterpret_cast(vec2.data())[idx], delta) \ + << "At index " << idx; \ } +template +void myGenerate(T *src, std::size_t size, std::default_random_engine &randEng) +{ + std::uniform_int_distribution rand(0u, 255u); + for (std::size_t idx = 0; idx < size; ++idx) + { + src[idx] = rand(randEng); + } +} + +template<> +void myGenerate(float *src, std::size_t size, std::default_random_engine &randEng) +{ + std::uniform_real_distribution rand(0.f, 1.f); + for (std::size_t idx = 0; idx < size; ++idx) + { + src[idx] = rand(randEng); + } +} + +#define NVCV_IMAGE_FORMAT_RGBS8 NVCV_DETAIL_MAKE_COLOR_FMT1(RGB, UNDEFINED, PL, SIGNED, XYZ1, ASSOCIATED, X8_Y8_Z8) +#define NVCV_IMAGE_FORMAT_BGRS8 NVCV_DETAIL_MAKE_COLOR_FMT1(RGB, UNDEFINED, PL, SIGNED, ZYX1, ASSOCIATED, X8_Y8_Z8) +#define NVCV_IMAGE_FORMAT_RGBAS8 NVCV_DETAIL_MAKE_COLOR_FMT1(RGB, UNDEFINED, PL, SIGNED, XYZW, ASSOCIATED, X8_Y8_Z8_W8) +#define NVCV_IMAGE_FORMAT_BGRAS8 NVCV_DETAIL_MAKE_COLOR_FMT1(RGB, UNDEFINED, PL, SIGNED, ZYXW, ASSOCIATED, X8_Y8_Z8_W8) + #define NVCV_IMAGE_FORMAT_Y16 NVCV_DETAIL_MAKE_YCbCr_FMT1(BT601, NONE, PL, UNSIGNED, X000, ASSOCIATED, X16) #define NVCV_IMAGE_FORMAT_BGR16 NVCV_DETAIL_MAKE_COLOR_FMT1(RGB, UNDEFINED, PL, UNSIGNED, ZYX1, ASSOCIATED, X16_Y16_Z16) #define NVCV_IMAGE_FORMAT_RGB16 NVCV_DETAIL_MAKE_COLOR_FMT1(RGB, UNDEFINED, PL, UNSIGNED, XYZ1, ASSOCIATED, X16_Y16_Z16) +#define NVCV_IMAGE_FORMAT_BGRA16 \ + NVCV_DETAIL_MAKE_COLOR_FMT1(RGB, UNDEFINED, PL, UNSIGNED, ZYXW, ASSOCIATED, X16_Y16_Z16_W16) +#define NVCV_IMAGE_FORMAT_RGBA16 \ + NVCV_DETAIL_MAKE_COLOR_FMT1(RGB, UNDEFINED, PL, UNSIGNED, XYZW, ASSOCIATED, X16_Y16_Z16_W16) +#define NVCV_IMAGE_FORMAT_YUV16 NVCV_DETAIL_MAKE_YCbCr_FMT1(BT601, NONE, PL, UNSIGNED, XYZ1, ASSOCIATED, 
X16_Y16_Z16) + +#define NVCV_IMAGE_FORMAT_YS16 NVCV_DETAIL_MAKE_YCbCr_FMT1(BT601, NONE, PL, SIGNED, X000, ASSOCIATED, X16) +#define NVCV_IMAGE_FORMAT_BGRS16 NVCV_DETAIL_MAKE_COLOR_FMT1(RGB, UNDEFINED, PL, SIGNED, ZYX1, ASSOCIATED, X16_Y16_Z16) +#define NVCV_IMAGE_FORMAT_RGBS16 NVCV_DETAIL_MAKE_COLOR_FMT1(RGB, UNDEFINED, PL, SIGNED, XYZ1, ASSOCIATED, X16_Y16_Z16) +#define NVCV_IMAGE_FORMAT_BGRAS16 \ + NVCV_DETAIL_MAKE_COLOR_FMT1(RGB, UNDEFINED, PL, SIGNED, ZYXW, ASSOCIATED, X16_Y16_Z16_W16) +#define NVCV_IMAGE_FORMAT_RGBAS16 \ + NVCV_DETAIL_MAKE_COLOR_FMT1(RGB, UNDEFINED, PL, SIGNED, XYZW, ASSOCIATED, X16_Y16_Z16_W16) + +#define NVCV_IMAGE_FORMAT_BGRS32 NVCV_DETAIL_MAKE_COLOR_FMT1(RGB, UNDEFINED, PL, SIGNED, ZYX1, ASSOCIATED, X32_Y32_Z32) +#define NVCV_IMAGE_FORMAT_RGBS32 NVCV_DETAIL_MAKE_COLOR_FMT1(RGB, UNDEFINED, PL, SIGNED, XYZ1, ASSOCIATED, X32_Y32_Z32) +#define NVCV_IMAGE_FORMAT_BGRAS32 \ + NVCV_DETAIL_MAKE_COLOR_FMT1(RGB, UNDEFINED, PL, SIGNED, ZYXW, ASSOCIATED, X32_Y32_Z32_W32) +#define NVCV_IMAGE_FORMAT_RGBAS32 \ + NVCV_DETAIL_MAKE_COLOR_FMT1(RGB, UNDEFINED, PL, SIGNED, XYZW, ASSOCIATED, X32_Y32_Z32_W32) +#define NVCV_IMAGE_FORMAT_YUVf32 NVCV_DETAIL_MAKE_YCbCr_FMT1(BT601, NONE, PL, FLOAT, XYZ1, ASSOCIATED, X32_Y32_Z32) +#define NVCV_IMAGE_FORMAT_Yf32 NVCV_DETAIL_MAKE_YCbCr_FMT1(BT601, NONE, PL, FLOAT, X000, ASSOCIATED, X32) +#define NVCV_IMAGE_FORMAT_HSVf32 NVCV_DETAIL_MAKE_COLOR_FMT1(HSV, UNDEFINED, PL, FLOAT, XYZ0, ASSOCIATED, X32_Y32_Z32) + +#define NVCV_IMAGE_FORMAT_BGRf64 NVCV_DETAIL_MAKE_COLOR_FMT1(RGB, UNDEFINED, PL, FLOAT, ZYX1, ASSOCIATED, X64_Y64_Z64) +#define NVCV_IMAGE_FORMAT_RGBf64 NVCV_DETAIL_MAKE_COLOR_FMT1(RGB, UNDEFINED, PL, FLOAT, XYZ1, ASSOCIATED, X64_Y64_Z64) +#define NVCV_IMAGE_FORMAT_BGRAf64 \ + NVCV_DETAIL_MAKE_COLOR_FMT1(RGB, UNDEFINED, PL, FLOAT, ZYXW, ASSOCIATED, X64_Y64_Z64_W64) +#define NVCV_IMAGE_FORMAT_RGBAf64 \ + NVCV_DETAIL_MAKE_COLOR_FMT1(RGB, UNDEFINED, PL, FLOAT, XYZW, ASSOCIATED, X64_Y64_Z64_W64) // clang-format off @@ -54,10 +110,42 @@ test::ValueList()}; nvcv::ImageFormat dstFormat{GetParamValue<4>()}; + NVCVDataType nvcvDataType; + ASSERT_EQ(NVCV_SUCCESS, nvcvImageFormatGetPlaneDataType(srcFormat, 0, &nvcvDataType)); + NVCVColorConversionCode src2dstCode{GetParamValue<5>()}; NVCVColorConversionCode dst2srcCode{GetParamValue<6>()}; @@ -138,12 +261,20 @@ TEST_P(OpCvtColor, correct_output) long srcBufSize = srcSampleStride * srcAccess->numSamples(); - std::vector srcVec(srcBufSize); - - std::default_random_engine randEng(0); - std::uniform_int_distribution rand(0u, 255u); - - std::generate(srcVec.begin(), srcVec.end(), [&]() { return rand(randEng); }); + std::vector srcVec(srcBufSize); + std::default_random_engine randEng(0); + switch (nvcvDataType) + { + case NVCV_DATA_TYPE_F32: + case NVCV_DATA_TYPE_2F32: + case NVCV_DATA_TYPE_3F32: + case NVCV_DATA_TYPE_4F32: + myGenerate(reinterpret_cast(srcVec.data()), srcVec.size() / sizeof(float), randEng); + break; + default: + myGenerate(reinterpret_cast(srcVec.data()), srcVec.size(), randEng); + break; + } // copy random input to device ASSERT_EQ(cudaSuccess, cudaMemcpy(srcData->basePtr(), srcVec.data(), srcBufSize, cudaMemcpyHostToDevice)); @@ -166,7 +297,18 @@ TEST_P(OpCvtColor, correct_output) // copy output back to host ASSERT_EQ(cudaSuccess, cudaMemcpy(testVec.data(), srcData->basePtr(), srcBufSize, cudaMemcpyDeviceToHost)); - VEC_EXPECT_NEAR(testVec, srcVec, maxDiff); + switch (nvcvDataType) + { + case NVCV_DATA_TYPE_F32: + case NVCV_DATA_TYPE_2F32: + case NVCV_DATA_TYPE_3F32: + case 
NVCV_DATA_TYPE_4F32: + VEC_EXPECT_NEAR(testVec, srcVec, maxDiff, float); + break; + default: + VEC_EXPECT_NEAR(testVec, srcVec, maxDiff, uint8_t); + break; + } } TEST_P(OpCvtColor, varshape_correct_output) @@ -181,6 +323,9 @@ TEST_P(OpCvtColor, varshape_correct_output) nvcv::ImageFormat srcFormat{GetParamValue<3>()}; nvcv::ImageFormat dstFormat{GetParamValue<4>()}; + NVCVDataType nvcvDataType; + ASSERT_EQ(NVCV_SUCCESS, nvcvImageFormatGetPlaneDataType(srcFormat, 0, &nvcvDataType)); + NVCVColorConversionCode src2dstCode{GetParamValue<5>()}; NVCVColorConversionCode dst2srcCode{GetParamValue<6>()}; @@ -206,7 +351,18 @@ TEST_P(OpCvtColor, varshape_correct_output) std::uniform_int_distribution udist(0, 255); srcVec[i].resize(imgSrc[i].size().h * srcRowStride); - std::generate(srcVec[i].begin(), srcVec[i].end(), [&]() { return udist(rng); }); + switch (nvcvDataType) + { + case NVCV_DATA_TYPE_F32: + case NVCV_DATA_TYPE_2F32: + case NVCV_DATA_TYPE_3F32: + case NVCV_DATA_TYPE_4F32: + myGenerate(reinterpret_cast(srcVec[i].data()), srcVec[i].size() / sizeof(float), rng); + break; + default: + myGenerate(reinterpret_cast(srcVec[i].data()), srcVec[i].size(), rng); + break; + } auto imgData = imgSrc[i].exportData(); ASSERT_NE(imgData, nvcv::NullOpt); @@ -257,8 +413,116 @@ TEST_P(OpCvtColor, varshape_correct_output) imgData->plane(0).rowStride, srcVecRowStride[i], imgSrc[i].size().h, cudaMemcpyDeviceToHost)); - VEC_EXPECT_NEAR(testVec, srcVec[i], maxDiff); + switch (nvcvDataType) + { + case NVCV_DATA_TYPE_F32: + case NVCV_DATA_TYPE_2F32: + case NVCV_DATA_TYPE_3F32: + case NVCV_DATA_TYPE_4F32: + VEC_EXPECT_NEAR(testVec, srcVec[i], maxDiff, float); + break; + default: + VEC_EXPECT_NEAR(testVec, srcVec[i], maxDiff, uint8_t); + break; + } } } +TEST(OpCvtColor_negative, create_with_null_handle) +{ + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, cvcudaCvtColorCreate(nullptr)); +} + +// clang-format off + +NVCV_TEST_SUITE_P(OpCvtColor_negative, +test::ValueList +{ + // W, H, N, inputFormat, outputFormat, in2outCode + { 8, 8, 3, NVCV_IMAGE_FORMAT_Y8, NVCV_IMAGE_FORMAT_BGRA8, NVCV_COLOR_BGR2BGRA}, // invalid input channel + { 8, 8, 3, NVCV_IMAGE_FORMAT_BGR8, NVCV_IMAGE_FORMAT_BGRAf32, NVCV_COLOR_BGR2BGRA}, // mismatch data type + { 8, 8, 3, NVCV_IMAGE_FORMAT_BGR8, NVCV_IMAGE_FORMAT_Y8, NVCV_COLOR_BGR2BGRA}, // invalid output channel + { 8, 8, 3, NVCV_IMAGE_FORMAT_BGR8, NVCV_IMAGE_FORMAT_BGR8, NVCV_COLOR_GRAY2BGR}, // invalid input channel + { 8, 8, 3, NVCV_IMAGE_FORMAT_Y8, NVCV_IMAGE_FORMAT_BGRf32, NVCV_COLOR_GRAY2BGR}, // mismatch data type + { 8, 8, 3, NVCV_IMAGE_FORMAT_Y8, NVCV_IMAGE_FORMAT_BGRA8, NVCV_COLOR_GRAY2BGR}, // invalid output channel + { 8, 8, 3, NVCV_IMAGE_FORMAT_BGRA8, NVCV_IMAGE_FORMAT_Y8, NVCV_COLOR_BGR2GRAY}, // invalid input channel + { 8, 8, 3, NVCV_IMAGE_FORMAT_BGRf32, NVCV_IMAGE_FORMAT_Y8, NVCV_COLOR_BGR2GRAY}, // mismatch data type + { 8, 8, 3, NVCV_IMAGE_FORMAT_BGR8, NVCV_IMAGE_FORMAT_BGRA8, NVCV_COLOR_BGR2GRAY}, // invalid output channel + { 8, 8, 3, NVCV_IMAGE_FORMAT_BGRA8, NVCV_IMAGE_FORMAT_YUV8, NVCV_COLOR_BGR2YUV,}, // invalid input channel + { 8, 8, 3, NVCV_IMAGE_FORMAT_BGRf32, NVCV_IMAGE_FORMAT_YUV8, NVCV_COLOR_BGR2YUV}, // mismatch data type + { 8, 8, 3, NVCV_IMAGE_FORMAT_BGR8, NVCV_IMAGE_FORMAT_BGRA8, NVCV_COLOR_BGR2YUV}, // invalid output channel + { 8, 8, 3, NVCV_IMAGE_FORMAT_BGRA8, NVCV_IMAGE_FORMAT_BGR8, NVCV_COLOR_YUV2BGR,}, // invalid input channel + { 8, 8, 3, NVCV_IMAGE_FORMAT_YUV8, NVCV_IMAGE_FORMAT_BGRf32, NVCV_COLOR_YUV2BGR}, // mismatch data type + { 8, 8, 3, 
NVCV_IMAGE_FORMAT_YUV8, NVCV_IMAGE_FORMAT_BGRA8, NVCV_COLOR_YUV2BGR}, // invalid output channel + { 8, 8, 3, NVCV_IMAGE_FORMAT_BGRA8, NVCV_IMAGE_FORMAT_HSV8, NVCV_COLOR_BGR2HSV}, // invalid input channel + { 8, 8, 3, NVCV_IMAGE_FORMAT_BGRf32, NVCV_IMAGE_FORMAT_HSV8, NVCV_COLOR_BGR2HSV}, // mismatch data type + { 8, 8, 3, NVCV_IMAGE_FORMAT_BGR8, NVCV_IMAGE_FORMAT_BGRA8, NVCV_COLOR_BGR2HSV}, // invalid output channel + { 8, 8, 3, NVCV_IMAGE_FORMAT_BGRA8, NVCV_IMAGE_FORMAT_BGR8, NVCV_COLOR_HSV2BGR}, // invalid input channel + { 8, 8, 3, NVCV_IMAGE_FORMAT_HSV8, NVCV_IMAGE_FORMAT_BGRf32, NVCV_COLOR_HSV2BGR}, // mismatch data type + { 8, 8, 3, NVCV_IMAGE_FORMAT_HSV8, NVCV_IMAGE_FORMAT_Y8, NVCV_COLOR_HSV2BGR}, // invalid output channel +}); + +// clang-format on + +TEST_P(OpCvtColor_negative, invalid_input) +{ + int width = GetParamValue<0>(); + int height = GetParamValue<1>(); + int batches = GetParamValue<2>(); + + nvcv::ImageFormat srcFormat{GetParamValue<3>()}; + nvcv::ImageFormat dstFormat{GetParamValue<4>()}; + + NVCVColorConversionCode src2dstCode{GetParamValue<5>()}; + + nvcv::Tensor srcTensor = nvcv::util::CreateTensor(batches, width, height, srcFormat); + nvcv::Tensor dstTensor = nvcv::util::CreateTensor(batches, width, height, dstFormat); + + // run operator + cvcuda::CvtColor cvtColorOp; + EXPECT_ANY_THROW(cvtColorOp(nullptr, srcTensor, dstTensor, src2dstCode)); +} + +TEST_P(OpCvtColor_negative, varshape_invalid_input) +{ + int width = GetParamValue<0>(); + int height = GetParamValue<1>(); + int batches = GetParamValue<2>(); + + nvcv::ImageFormat srcFormat{GetParamValue<3>()}; + nvcv::ImageFormat dstFormat{GetParamValue<4>()}; + + NVCVColorConversionCode src2dstCode{GetParamValue<5>()}; + + // Create input varshape + std::default_random_engine rng; + std::uniform_int_distribution udistWidth(width * 0.8, width * 1.1); + std::uniform_int_distribution udistHeight(height * 0.8, height * 1.1); + + std::vector imgSrc; + + for (int i = 0; i < batches; ++i) + { + imgSrc.emplace_back(nvcv::Size2D{udistWidth(rng), udistHeight(rng)}, srcFormat); + } + + nvcv::ImageBatchVarShape batchSrc(batches); + batchSrc.pushBack(imgSrc.begin(), imgSrc.end()); + + // Create output varshape + std::vector imgDst; + + for (int i = 0; i < batches; ++i) + { + imgDst.emplace_back(imgSrc[i].size(), dstFormat); + } + + nvcv::ImageBatchVarShape batchDst(batches); + batchDst.pushBack(imgDst.begin(), imgDst.end()); + + // run operator + cvcuda::CvtColor cvtColorOp; + EXPECT_ANY_THROW(cvtColorOp(nullptr, batchSrc, batchDst, src2dstCode)); +} + #undef VEC_EXPECT_NEAR diff --git a/tests/cvcuda/system/TestOpGammaContrast.cpp b/tests/cvcuda/system/TestOpGammaContrast.cpp index e51f20772..155291a54 100644 --- a/tests/cvcuda/system/TestOpGammaContrast.cpp +++ b/tests/cvcuda/system/TestOpGammaContrast.cpp @@ -24,6 +24,7 @@ #include #include #include +#include #include #include @@ -31,8 +32,6 @@ namespace test = nvcv::test; namespace cuda = nvcv::cuda; -// clang-format off - #define DBG_GAMMA_CONTRAST 0 static void printVec(std::vector &vec, int height, int rowPitch, int bytesPerPixel, std::string name) @@ -55,15 +54,29 @@ static void printVec(std::vector &vec, int height, int rowPitch, int by #endif } -static void GammaContrastVarShapeCpuOp(std::vector &hDst, int dstRowStride, nvcv::Size2D dstSize, const std::vector &hSrc, - int srcRowStride, nvcv::Size2D srcSize, nvcv::ImageFormat fmt, const std::vector gamma, const int imageIndex, bool perChannel) +#define VEC_EXPECT_NEAR(vec1, vec2, delta, dtype) \ + ASSERT_EQ(vec1.size(), 
vec2.size()); \ + for (std::size_t idx = 0; idx < vec1.size() / sizeof(dtype); ++idx) \ + { \ + EXPECT_NEAR(reinterpret_cast(vec1.data())[idx], reinterpret_cast(vec2.data())[idx], delta) \ + << "At index " << idx; \ + } + +namespace { + +// uint8 cpu op +template +void GammaContrastVarShapeCpuOp(std::vector &hDst, int dstRowStride, nvcv::Size2D dstSize, + const std::vector &hSrc, int srcRowStride, nvcv::Size2D srcSize, + nvcv::ImageFormat fmt, const std::vector gamma, const int imageIndex, + bool perChannel) { assert(fmt.numPlanes() == 1); int elementsPerPixel = fmt.numChannels(); - uint8_t *dstPtr = hDst.data(); - const uint8_t *srcPtr = hSrc.data(); + T *dstPtr = hDst.data(); + const T *srcPtr = hSrc.data(); for (int dst_y = 0; dst_y < dstSize.h; dst_y++) { @@ -71,16 +84,71 @@ static void GammaContrastVarShapeCpuOp(std::vector &hDst, int dstRowStr { for (int k = 0; k < elementsPerPixel; k++) { - int index = dst_y * dstRowStride + dst_x * elementsPerPixel + k; + int index = dst_y * dstRowStride + dst_x * elementsPerPixel + k; float gamma_tmp = perChannel ? gamma[imageIndex * elementsPerPixel + k] : gamma[imageIndex]; - float tmp = (srcPtr[index] + 0.0f) / 255.0f; - uint8_t out = std::rint(pow(tmp, gamma_tmp) * 255.0f); - dstPtr[index] = out; + float tmp = (srcPtr[index] + 0.0f) / 255.0f; + T out = std::rint(pow(tmp, gamma_tmp) * 255.0f); + dstPtr[index] = out; } } } } +// float cpu op +template<> +void GammaContrastVarShapeCpuOp(std::vector &hDst, int dstRowStride, nvcv::Size2D dstSize, + const std::vector &hSrc, int srcRowStride, nvcv::Size2D srcSize, + nvcv::ImageFormat fmt, const std::vector gamma, const int imageIndex, + bool perChannel) +{ + assert(fmt.numPlanes() == 1); + + int elementsPerPixel = fmt.numChannels(); + + for (int dst_y = 0; dst_y < dstSize.h; dst_y++) + { + for (int dst_x = 0; dst_x < dstSize.w; dst_x++) + { + for (int k = 0; k < elementsPerPixel; k++) + { + int index = dst_y * dstRowStride + dst_x * elementsPerPixel + k; + float gamma_tmp = perChannel ? 
gamma[imageIndex * elementsPerPixel + k] : gamma[imageIndex]; + float out = nvcv::cuda::clamp(nvcv::cuda::pow(hSrc[index], gamma_tmp), 0.f, 1.f); + hDst[index] = out; + } + } + } +} + +void GammaContrastVarShapeCpuOpWrapper(std::vector &hDst, int dstRowStride, nvcv::Size2D dstSize, + const std::vector &hSrc, int srcRowStride, nvcv::Size2D srcSize, + nvcv::ImageFormat fmt, const std::vector gamma, const int imageIndex, + bool perChannel, NVCVDataType nvcvDataType) +{ + if (nvcvDataType == NVCV_DATA_TYPE_F32 || nvcvDataType == NVCV_DATA_TYPE_2F32 || nvcvDataType == NVCV_DATA_TYPE_3F32 + || nvcvDataType == NVCV_DATA_TYPE_4F32) + { + std::vector src_tmp(hSrc.size() / sizeof(float)); + std::vector dst_tmp(hDst.size() / sizeof(float)); + size_t copySize = hSrc.size(); + memcpy(static_cast(src_tmp.data()), const_cast(static_cast(hSrc.data())), + copySize); + memcpy(static_cast(dst_tmp.data()), static_cast(hDst.data()), copySize); + GammaContrastVarShapeCpuOp(dst_tmp, dstRowStride / sizeof(float), dstSize, src_tmp, + srcRowStride / sizeof(float), srcSize, fmt, gamma, imageIndex, perChannel); + memcpy(static_cast(hDst.data()), static_cast(dst_tmp.data()), copySize); + } + else + { + GammaContrastVarShapeCpuOp(hDst, dstRowStride, dstSize, hSrc, srcRowStride, srcSize, fmt, gamma, imageIndex, + perChannel); + } +} + +} // namespace + +// clang-format off + NVCV_TEST_SUITE_P(OpGammaContrast, test::ValueList { // width, height, batches, format, Gamma, per channel @@ -97,6 +165,20 @@ NVCV_TEST_SUITE_P(OpGammaContrast, test::ValueList()}; - float gamma = GetParamValue<4>(); + NVCVDataType nvcvDataType; + ASSERT_EQ(NVCV_SUCCESS, nvcvImageFormatGetPlaneDataType(format, 0, &nvcvDataType)); + float gamma = GetParamValue<4>(); + bool isFloatTest = false; bool perChannel = GetParamValue<5>(); @@ -135,9 +220,25 @@ TEST_P(OpGammaContrast, varshape_correct_output) srcVecRowStride[i] = srcRowStride; std::uniform_int_distribution udist(0, 255); + std::uniform_real_distribution udistf(0.f, 1.f); srcVec[i].resize(imgSrc[i].size().h * srcRowStride); - std::generate(srcVec[i].begin(), srcVec[i].end(), [&]() { return udist(rng); }); + switch (nvcvDataType) + { + case NVCV_DATA_TYPE_F32: + case NVCV_DATA_TYPE_2F32: + case NVCV_DATA_TYPE_3F32: + case NVCV_DATA_TYPE_4F32: + isFloatTest = true; + for (size_t idx = 0; idx < (srcVec[i].size() / sizeof(float)); ++idx) + { + reinterpret_cast(srcVec[i].data())[idx] = udistf(rng); + } + break; + default: + std::generate(srcVec[i].begin(), srcVec[i].end(), [&]() { return udist(rng); }); + break; + } auto imgData = imgSrc[i].exportData(); ASSERT_NE(imgData, nvcv::NullOpt); @@ -228,13 +329,27 @@ TEST_P(OpGammaContrast, varshape_correct_output) std::generate(goldVec.begin(), goldVec.end(), [&]() { return 0; }); // Generate gold result - GammaContrastVarShapeCpuOp(goldVec, dstRowStride, {dstWidth, dstHeight}, srcVec[i], srcRowStride, - {srcWidth, srcHeight}, format, gammaVec, i, perChannel); + GammaContrastVarShapeCpuOpWrapper(goldVec, dstRowStride, {dstWidth, dstHeight}, srcVec[i], srcRowStride, + {srcWidth, srcHeight}, format, gammaVec, i, perChannel, nvcvDataType); printVec(goldVec, srcHeight, dstRowStride, format.numChannels(), "golden output"); printVec(testVec, srcHeight, dstRowStride, format.numChannels(), "operator output"); - EXPECT_EQ(testVec, goldVec); + if (!isFloatTest) + { + EXPECT_EQ(testVec, goldVec); + } + else + { + VEC_EXPECT_NEAR(testVec, goldVec, 1E-6F, float); + } } } + +TEST(OpGammaContrast_negative, create_with_null_handle) +{ + 
EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, cvcudaGammaContrastCreate(nullptr, 4, 4)); +} + +#undef VEC_EXPECT_NEAR diff --git a/tests/cvcuda/system/TestOpGaussianNoise.cpp b/tests/cvcuda/system/TestOpGaussianNoise.cpp index 6495061b4..a8fe5419b 100644 --- a/tests/cvcuda/system/TestOpGaussianNoise.cpp +++ b/tests/cvcuda/system/TestOpGaussianNoise.cpp @@ -19,18 +19,22 @@ #include "GaussianNoiseUtils.cuh" +#include #include #include #include #include #include #include +#include +#include #include #include #include #include +namespace { inline uint8_t cast(float value) { int v = (int)(value + (value >= 0 ? 0.5 : -0.5)); @@ -39,7 +43,7 @@ inline uint8_t cast(float value) //test for RGB8 template -static void GaussianNoise(std::vector &src, std::vector &dst, float mu, float sigma, int batch, bool per_channel) +void GaussianNoise(std::vector &src, std::vector &dst, float mu, float sigma, int batch, bool per_channel) { int mem_size = src.size(); if (!per_channel) @@ -69,6 +73,39 @@ static void GaussianNoise(std::vector &src, std::vector &dst, float mu, fl free(rand_h); } +// test for float +template<> +void GaussianNoise(std::vector &src, std::vector &dst, float mu, float sigma, int batch, bool per_channel) +{ + int mem_size = src.size(); + if (!per_channel) + mem_size /= 3; + float *rand_h = (float *)malloc(sizeof(float) * mem_size); + get_random(rand_h, per_channel, batch, mem_size); + + int img_size = src.size() / 3; + for (int i = 0; i < img_size; i++) + { + if (per_channel) + { + for (int ch = 0; ch < 3; ch++) + { + float delta = mu + rand_h[i * 3 + ch] * sigma; + dst[i * 3 + ch] = nvcv::cuda::clamp(nvcv::cuda::StaticCast(src[i * 3 + ch] + delta), 0.f, 1.f); + } + } + else + { + float delta = mu + rand_h[i] * sigma; + dst[i * 3] = nvcv::cuda::clamp(nvcv::cuda::StaticCast(src[i * 3] + delta), 0.f, 1.f); + dst[i * 3 + 1] = nvcv::cuda::clamp(nvcv::cuda::StaticCast(src[i * 3 + 1] + delta), 0.f, 1.f); + dst[i * 3 + 2] = nvcv::cuda::clamp(nvcv::cuda::StaticCast(src[i * 3 + 2] + delta), 0.f, 1.f); + } + } + free(rand_h); +} +} // namespace + // clang-format off NVCV_TEST_SUITE_P(OpGaussianNoise, nvcv::test::ValueList { @@ -81,22 +118,15 @@ NVCV_TEST_SUITE_P(OpGaussianNoise, nvcv::test::ValueList +static void tensor_correct_output_test(int batch, int height, int width, float mu, float sigma, bool per_channel) { cudaStream_t stream; EXPECT_EQ(cudaSuccess, cudaStreamCreate(&stream)); - int batch = GetParamValue<0>(); - int height = GetParamValue<1>(); - int width = GetParamValue<2>(); - float mu = GetParamValue<3>(); - float sigma = GetParamValue<4>(); - bool per_channel = GetParamValue<5>(); - - nvcv::ImageFormat fmt = nvcv::FMT_RGB8; - using datatype = uint8_t; - nvcv::Tensor imgIn = nvcv::util::CreateTensor(batch, width, height, fmt); - nvcv::Tensor imgOut = nvcv::util::CreateTensor(batch, width, height, fmt); + nvcv::ImageFormat fmt = std::is_same::value ? 
nvcv::FMT_RGB8 : nvcv::FMT_RGBf32; + nvcv::Tensor imgIn = nvcv::util::CreateTensor(batch, width, height, fmt); + nvcv::Tensor imgOut = nvcv::util::CreateTensor(batch, width, height, fmt); auto inData = imgIn.exportData(); ASSERT_NE(nullptr, inData); @@ -142,15 +172,24 @@ TEST_P(OpGaussianNoise, tensor_correct_output) cudaMemcpyHostToDevice, stream)); //Generate input - std::vector> srcVec(batch); - std::default_random_engine randEng; - int rowStride = width * fmt.planePixelStrideBytes(0); + std::vector> srcVec(batch); + std::default_random_engine randEng; + int rowStride = width * fmt.planePixelStrideBytes(0); for (int i = 0; i < batch; i++) { - std::uniform_int_distribution rand(0, 255); - srcVec[i].resize(height * rowStride / sizeof(datatype)); - std::generate(srcVec[i].begin(), srcVec[i].end(), [&]() { return rand(randEng); }); + if constexpr (std::is_same::value) + { + std::uniform_int_distribution rand(0, 255); + srcVec[i].resize(height * rowStride / sizeof(datatype)); + std::generate(srcVec[i].begin(), srcVec[i].end(), [&]() { return rand(randEng); }); + } + else + { + std::uniform_real_distribution rand(0.f, 1.f); + srcVec[i].resize(height * rowStride / sizeof(datatype)); + std::generate(srcVec[i].begin(), srcVec[i].end(), [&]() { return rand(randEng); }); + } ASSERT_EQ(cudaSuccess, cudaMemcpy2D(inAccess->sampleData(i), inAccess->rowStride(), srcVec[i].data(), rowStride, rowStride, height, cudaMemcpyHostToDevice)); } @@ -180,20 +219,35 @@ TEST_P(OpGaussianNoise, tensor_correct_output) EXPECT_EQ(cudaSuccess, cudaStreamDestroy(stream)); } -TEST_P(OpGaussianNoise, varshape_correct_shape) +TEST_P(OpGaussianNoise, tensor_correct_output) { - cudaStream_t stream; - EXPECT_EQ(cudaSuccess, cudaStreamCreate(&stream)); + int batch = GetParamValue<0>(); + int height = GetParamValue<1>(); + int width = GetParamValue<2>(); + float mu = GetParamValue<3>(); + float sigma = GetParamValue<4>(); + bool per_channel = GetParamValue<5>(); + tensor_correct_output_test(batch, height, width, mu, sigma, per_channel); +} +TEST_P(OpGaussianNoise, tensor_correct_output_float) +{ int batch = GetParamValue<0>(); int height = GetParamValue<1>(); int width = GetParamValue<2>(); float mu = GetParamValue<3>(); float sigma = GetParamValue<4>(); bool per_channel = GetParamValue<5>(); + tensor_correct_output_test(batch, height, width, mu, sigma, per_channel); +} - nvcv::ImageFormat fmt = nvcv::FMT_RGB8; - using datatype = uint8_t; +template +static void varshape_correct_output_test(int batch, int height, int width, float mu, float sigma, bool per_channel) +{ + cudaStream_t stream; + EXPECT_EQ(cudaSuccess, cudaStreamCreate(&stream)); + + nvcv::ImageFormat fmt = std::is_same::value ? 
nvcv::FMT_RGB8 : nvcv::FMT_RGBf32; // Create input and output std::default_random_engine randEng; @@ -247,10 +301,18 @@ TEST_P(OpGaussianNoise, varshape_correct_shape) int srcRowStride = srcWidth * fmt.planePixelStrideBytes(0); - std::uniform_int_distribution rand(0, 255); - - srcVec[i].resize(srcHeight * srcRowStride / sizeof(datatype)); - std::generate(srcVec[i].begin(), srcVec[i].end(), [&]() { return rand(randEng); }); + if constexpr (std::is_same::value) + { + std::uniform_int_distribution rand(0, 255); + srcVec[i].resize(srcHeight * srcRowStride / sizeof(datatype)); + std::generate(srcVec[i].begin(), srcVec[i].end(), [&]() { return rand(randEng); }); + } + else + { + std::uniform_real_distribution rand(0.f, 1.f); + srcVec[i].resize(srcHeight * srcRowStride / sizeof(datatype)); + std::generate(srcVec[i].begin(), srcVec[i].end(), [&]() { return rand(randEng); }); + } // Copy input data to the GPU ASSERT_EQ(cudaSuccess, cudaMemcpy2D(srcData->plane(0).basePtr, srcData->plane(0).rowStride, srcVec[i].data(), @@ -292,3 +354,135 @@ TEST_P(OpGaussianNoise, varshape_correct_shape) EXPECT_EQ(cudaSuccess, cudaStreamDestroy(stream)); } + +TEST_P(OpGaussianNoise, varshape_correct_shape) +{ + int batch = GetParamValue<0>(); + int height = GetParamValue<1>(); + int width = GetParamValue<2>(); + float mu = GetParamValue<3>(); + float sigma = GetParamValue<4>(); + bool per_channel = GetParamValue<5>(); + + varshape_correct_output_test(batch, height, width, mu, sigma, per_channel); +} + +TEST_P(OpGaussianNoise, varshape_correct_shape_float) +{ + int batch = GetParamValue<0>(); + int height = GetParamValue<1>(); + int width = GetParamValue<2>(); + float mu = GetParamValue<3>(); + float sigma = GetParamValue<4>(); + bool per_channel = GetParamValue<5>(); + + varshape_correct_output_test(batch, height, width, mu, sigma, per_channel); +} + +TEST(OpGaussianNoise_negative, create_with_null_handle) +{ + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, cvcudaGaussianNoiseCreate(nullptr, 10)); +} + +TEST(OpGaussianNoise_negative, create_with_negative_batch) +{ + NVCVOperatorHandle opHandle; + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, cvcudaGaussianNoiseCreate(&opHandle, -1)); +} + +TEST(OpGaussianNoise_negative, invalid_mu_sigma_layout) +{ + nvcv::Tensor imgIn( + { + {24, 24, 2}, + "HWC" + }, + nvcv::TYPE_U8); + nvcv::Tensor imgOut( + { + {24, 24, 2}, + "HWC" + }, + nvcv::TYPE_U8); + + //parameters + nvcv::Tensor muval({{2}, "N"}, nvcv::TYPE_F32); + nvcv::Tensor sigmaval({{2}, "N"}, nvcv::TYPE_F32); + + // invalid mu parameters + nvcv::Tensor invalidMuval( + { + {2, 2, 2}, + "HWC" + }, + nvcv::TYPE_F32); + nvcv::Tensor invalidSigmaval( + { + {2, 2, 2}, + "HWC" + }, + nvcv::TYPE_F32); + + // Call operator + int maxBatch = 4; + unsigned long long seed = 12345; + cvcuda::GaussianNoise GaussianNoiseOp(maxBatch); + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, + nvcv::ProtectCall([&] { GaussianNoiseOp(NULL, imgIn, imgOut, invalidMuval, sigmaval, false, seed); })); + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, + nvcv::ProtectCall([&] { GaussianNoiseOp(NULL, imgIn, imgOut, muval, invalidSigmaval, false, seed); })); +} + +// clang-format off +NVCV_TEST_SUITE_P(OpGaussianNoise_negative, nvcv::test::ValueList +{ + // in_layout, in_data_type, out_layout, out_data_type, mu_layout, mu_data_type, sigma_layout, sigma_data_type, expected_return_status + { "CHW", nvcv::TYPE_U8, "HWC", nvcv::TYPE_U8, "N", nvcv::TYPE_F32, "N", nvcv::TYPE_F32, NVCV_ERROR_INVALID_ARGUMENT}, + { "HWC", nvcv::TYPE_U8, "CHW", nvcv::TYPE_U8, "N", nvcv::TYPE_F32, "N", 
nvcv::TYPE_F32, NVCV_ERROR_INVALID_ARGUMENT}, + { "HWC", nvcv::TYPE_F64, "HWC", nvcv::TYPE_U8, "N", nvcv::TYPE_F32, "N", nvcv::TYPE_F32, NVCV_ERROR_INVALID_ARGUMENT}, + { "HWC", nvcv::TYPE_U8, "HWC", nvcv::TYPE_F64, "N", nvcv::TYPE_F32, "N", nvcv::TYPE_F32, NVCV_ERROR_INVALID_ARGUMENT}, + { "HWC", nvcv::TYPE_U32, "HWC", nvcv::TYPE_U8, "N", nvcv::TYPE_F32, "N", nvcv::TYPE_F32, NVCV_ERROR_INVALID_ARGUMENT}, + { "HWC", nvcv::TYPE_U8, "HWC", nvcv::TYPE_U32, "N", nvcv::TYPE_F32, "N", nvcv::TYPE_F32, NVCV_ERROR_INVALID_ARGUMENT}, + { "HWC", nvcv::TYPE_U8, "HWC", nvcv::TYPE_U16, "N", nvcv::TYPE_F32, "N", nvcv::TYPE_F32, NVCV_ERROR_INVALID_ARGUMENT}, + { "HWC", nvcv::TYPE_U8, "HWC", nvcv::TYPE_U8, "N", nvcv::TYPE_F64, "N", nvcv::TYPE_F32, NVCV_ERROR_INVALID_ARGUMENT}, + { "HWC", nvcv::TYPE_U8, "HWC", nvcv::TYPE_U8, "N", nvcv::TYPE_F32, "N", nvcv::TYPE_F64, NVCV_ERROR_INVALID_ARGUMENT}, +}); + +// clang-format on + +TEST_P(OpGaussianNoise_negative, infer_negative_parameter) +{ + std::string in_layout = GetParamValue<0>(); + nvcv::DataType in_data_type = GetParamValue<1>(); + std::string out_layout = GetParamValue<2>(); + nvcv::DataType out_data_type = GetParamValue<3>(); + std::string mu_layout = GetParamValue<4>(); + nvcv::DataType mu_data_type = GetParamValue<5>(); + std::string sigma_layout = GetParamValue<6>(); + nvcv::DataType sigma_data_type = GetParamValue<7>(); + NVCVStatus expected_return_status = GetParamValue<8>(); + + nvcv::Tensor imgIn( + { + {24, 24, 2}, + in_layout.c_str() + }, + in_data_type); + nvcv::Tensor imgOut( + { + {24, 24, 2}, + out_layout.c_str() + }, + out_data_type); + + //parameters + nvcv::Tensor muval({{2}, mu_layout.c_str()}, mu_data_type); + nvcv::Tensor sigmaval({{2}, sigma_layout.c_str()}, sigma_data_type); + + // Call operator + int maxBatch = 4; + unsigned long long seed = 12345; + cvcuda::GaussianNoise GaussianNoiseOp(maxBatch); + EXPECT_EQ(expected_return_status, + nvcv::ProtectCall([&] { GaussianNoiseOp(NULL, imgIn, imgOut, muval, sigmaval, false, seed); })); +} diff --git a/tests/cvcuda/system/TestOpHQResize.cpp b/tests/cvcuda/system/TestOpHQResize.cpp new file mode 100644 index 000000000..f9ce474af --- /dev/null +++ b/tests/cvcuda/system/TestOpHQResize.cpp @@ -0,0 +1,1320 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +namespace cuda = nvcv::cuda; +namespace test = nvcv::test; +namespace ttype = nvcv::test::type; +using uchar = unsigned char; + +template +using uniform_distribution + = std::conditional_t, std::uniform_int_distribution, std::uniform_real_distribution>; + +namespace baseline { + +template +void ForAll(int2 shape, Cb &&cb) +{ + for (int y = 0; y < shape.y; y++) + { + for (int x = 0; x < shape.x; x++) + { + cb(int2{x, y}); + } + } +} + +template +void ForAll(int3 shape, Cb &&cb) +{ + for (int z = 0; z < shape.z; z++) + for (int y = 0; y < shape.y; y++) + { + for (int x = 0; x < shape.x; x++) + { + cb(int3{x, y, z}); + } + } +} + +template +struct CpuSample +{ + static_assert(!cuda::IsCompound); + using ShapeT = cuda::MakeType; // WH or WHD + using StridesT = cuda::MakeType; // WHN or WHDN + + CpuSample(int64_t size, StridesT strides, int numSamples, ShapeT shape, int numChannels) + : m_data(size) + , m_strides{strides} + , m_numSamples{numSamples} + , m_shape{shape} + , m_numChannels{numChannels} + { + } + + BT &get(int sampleIdx, const ShapeT idx, int channel) + { + return *(reinterpret_cast(m_data.data() + offset(sampleIdx, idx)) + channel); + } + + uint8_t *data() + { + return m_data.data(); + } + + StridesT strides() + { + return m_strides; + } + + ShapeT shape() + { + return m_shape; + } + + int numSamples() + { + return m_numSamples; + } + + int numChannels() + { + return m_numChannels; + } + +private: + int offset(int sampleIdx, int2 idx) + { + return sampleIdx * m_strides.z + idx.y * m_strides.y + idx.x * m_strides.x; + } + + int offset(int sampleIdx, int3 idx) + { + return sampleIdx * m_strides.w + idx.z * m_strides.z + idx.y * m_strides.y + idx.x * m_strides.x; + } + + std::vector m_data; + StridesT m_strides; + int m_numSamples; + ShapeT m_shape; + int m_numChannels; +}; + +inline CpuSample GetIntermediate(int numSamples, int2 shape, int numChannels) +{ + int64_t size = sizeof(float) * numSamples * shape.y * shape.x * numChannels; + cuda::MakeType strides; + strides.x = sizeof(float) * numChannels; + strides.y = strides.x * shape.x; + strides.z = strides.y * shape.y; + return {size, strides, numSamples, shape, numChannels}; +} + +inline CpuSample GetIntermediate(int numSamples, int3 shape, int numChannels) +{ + int64_t size = sizeof(float) * numSamples * shape.z * shape.y * shape.x * numChannels; + cuda::MakeType strides; + strides.x = sizeof(float) * numChannels; + strides.y = strides.x * shape.x; + strides.z = strides.y * shape.y; + strides.w = strides.z * shape.z; + return {size, strides, numSamples, shape, numChannels}; +} + +struct FilterTriangular +{ + int size() const + { + return 3; + } + + float operator[](int k) const + { + return k == 1 ? 
1 : 0; + } +}; + +struct FilterCubic +{ + int size() const + { + return 129; + } + + float operator[](int k) const + { + float x = 4 * (k - (size() - 1) * 0.5f) / (size() - 1); + x = fabsf(x); + if (x >= 2) + return 0; + + float x2 = x * x; + float x3 = x2 * x; + if (x > 1) + return -0.5f * x3 + 2.5f * x2 - 4.0f * x + 2.0f; + else + return 1.5f * x3 - 2.5f * x2 + 1.0f; + } +}; + +struct FilterGaussian +{ + int size() const + { + return 65; + } + + float operator[](int k) const + { + float x = 4 * (k - (size() - 1) * 0.5f) / (size() - 1); + return expf(-x * x); + } +}; + +struct FilterLanczos +{ + static constexpr int kLanczosA = 3; + static constexpr int kLanczosResolution = 32; + + int size() const + { + return (2 * kLanczosA * kLanczosResolution + 1); + } + + float operator[](int k) const + { + float x = 2 * kLanczosA * (k - (size() - 1) * 0.5f) / (size() - 1); + if (fabsf(x) >= kLanczosA) + return 0.0f; + return nvcv::util::sinc(x) * nvcv::util::sinc(x / kLanczosA); + } +}; + +template +struct Filter +{ + Filter(float support) + : m_filter{} + , m_support{support} + { + } + + float support() const + { + return std::ceil(m_support); + } + + float scale() const + { + return (m_filter.size() - 1) / m_support; + } + + float anchor() const + { + return m_support / 2; + } + + float operator()(float x) const + { + if (!(x > -1)) + return 0; + if (x >= m_filter.size()) + return 0; + int x0 = std::floor(x); + int x1 = x0 + 1; + float d = x - x0; + float f0 = x0 < 0 ? 0.0f : m_filter[x0]; + float f1 = x1 >= m_filter.size() ? 0.0f : m_filter[x1]; + return f0 + d * (f1 - f0); + } + +private: + FilterType m_filter; + float m_support; +}; + +template +void RunNN(int axis, CpuSample &outTensorCpu, CpuSample &inTensorCpu) +{ + const int numSamples = inTensorCpu.numSamples(); + const int numChannels = inTensorCpu.numChannels(); + const auto inShape = inTensorCpu.shape(); + const auto outShape = outTensorCpu.shape(); + const int inSize = cuda::GetElement(inShape, axis); + const int outSize = cuda::GetElement(outShape, axis); + const float axisScale = static_cast(inSize) / outSize; + const float axisOrigin = 0.5f * axisScale; + for (int sampleIdx = 0; sampleIdx < numSamples; sampleIdx++) + { + ForAll(outShape, + [&](const cuda::MakeType outIdx) + { + auto inIdx = outIdx; + int inAxis = std::floor(cuda::GetElement(outIdx, axis) * axisScale + axisOrigin); + inAxis = inAxis < 0 ? 0 : (inAxis > inSize - 1 ? inSize - 1 : inAxis); + cuda::GetElement(inIdx, axis) = inAxis; + for (int c = 0; c < numChannels; c++) + { + outTensorCpu.get(sampleIdx, outIdx, c) + = cuda::SaturateCast(inTensorCpu.get(sampleIdx, inIdx, c)); + } + }); + } +} + +template +void RunLinear(int axis, CpuSample &outTensorCpu, CpuSample &inTensorCpu) +{ + const int numSamples = inTensorCpu.numSamples(); + const int numChannels = inTensorCpu.numChannels(); + const auto inShape = inTensorCpu.shape(); + const auto outShape = outTensorCpu.shape(); + const int inSize = cuda::GetElement(inShape, axis); + const int outSize = cuda::GetElement(outShape, axis); + const float axisScale = static_cast(inSize) / outSize; + const float axisOrigin = 0.5f * axisScale - 0.5f; + for (int sampleIdx = 0; sampleIdx < numSamples; sampleIdx++) + { + ForAll(outShape, + [&](const cuda::MakeType outIdx) + { + const float inAxis0f = cuda::GetElement(outIdx, axis) * axisScale + axisOrigin; + int inAxis0 = std::floor(inAxis0f); + int inAxis1 = inAxis0 + 1; + const float q = inAxis0f - inAxis0; + inAxis0 = inAxis0 < 0 ? 0 : (inAxis0 > inSize - 1 ? 
inSize - 1 : inAxis0); + inAxis1 = inAxis1 < 0 ? 0 : (inAxis1 > inSize - 1 ? inSize - 1 : inAxis1); + auto inIdx0 = outIdx; + auto inIdx1 = outIdx; + cuda::GetElement(inIdx0, axis) = inAxis0; + cuda::GetElement(inIdx1, axis) = inAxis1; + for (int c = 0; c < numChannels; c++) + { + const float a = inTensorCpu.get(sampleIdx, inIdx0, c); + const float b = inTensorCpu.get(sampleIdx, inIdx1, c); + const float tmp = b - a; + outTensorCpu.get(sampleIdx, outIdx, c) = cuda::SaturateCast(std::fmaf(tmp, q, a)); + } + }); + } +} + +template +void RunFilter(int axis, CpuSample &outTensorCpu, CpuSample &inTensorCpu, + const FilterT &filter) +{ + const int numSamples = inTensorCpu.numSamples(); + const int numChannels = inTensorCpu.numChannels(); + const auto inShape = inTensorCpu.shape(); + const auto outShape = outTensorCpu.shape(); + const int inSize = cuda::GetElement(inShape, axis); + const int outSize = cuda::GetElement(outShape, axis); + const int filterSupport = filter.support(); + const float filterStep = filter.scale(); + const float axisScale = static_cast(inSize) / outSize; + const float axisOrigin = 0.5f * axisScale - 0.5f - filter.anchor(); + + for (int sampleIdx = 0; sampleIdx < numSamples; sampleIdx++) + { + ForAll(outShape, + [&](const cuda::MakeType outIdx) + { + const float inAxis0f = cuda::GetElement(outIdx, axis) * axisScale + axisOrigin; + int inAxis0 = std::ceil(inAxis0f); + const float fStart = (inAxis0 - inAxis0f) * filterStep; + for (int c = 0; c < numChannels; c++) + { + float tmp = 0; + float norm = 0; + for (int k = 0; k < filterSupport; k++) + { + int inAxis = inAxis0 + k; + inAxis = inAxis < 0 ? 0 : (inAxis > inSize - 1 ? inSize - 1 : inAxis); + auto inIdx = outIdx; + cuda::GetElement(inIdx, axis) = inAxis; + const InBT inVal = inTensorCpu.get(sampleIdx, inIdx, c); + float coeff = filter(fStart + k * filterStep); + tmp = std::fmaf(inVal, coeff, tmp); + norm += coeff; + } + outTensorCpu.get(sampleIdx, outIdx, c) = cuda::SaturateCast(tmp / norm); + } + }); + } +} + +template +void RunFilter(int axis, CpuSample &outTensorCpu, CpuSample &inTensorCpu, + const NVCVInterpolationType interpolation, bool antialias) +{ + const auto inShape = inTensorCpu.shape(); + const auto outShape = outTensorCpu.shape(); + const float inSize = cuda::GetElement(inShape, axis); + const float outSize = cuda::GetElement(outShape, axis); + switch (interpolation) + { + case NVCV_INTERP_LINEAR: + { + float radius = antialias ? inSize / outSize : 1; + float support = std::max(1.0f, 2 * radius); + RunFilter(axis, outTensorCpu, inTensorCpu, Filter{support}); + } + break; + case NVCV_INTERP_CUBIC: + { + float radius = antialias ? (2 * inSize / outSize) : 2; + float support = std::max(4.0f, 2 * radius); + RunFilter(axis, outTensorCpu, inTensorCpu, Filter{support}); + } + break; + case NVCV_INTERP_GAUSSIAN: + { + float radius = antialias ? inSize / outSize : 1; + float support = std::max(1.0f, 2 * radius); + RunFilter(axis, outTensorCpu, inTensorCpu, Filter{support}); + } + break; + case NVCV_INTERP_LANCZOS: + { + float radius = antialias ? 
(3 * inSize / outSize) : 3; + float support = std::max(6.0f, 2 * radius); + RunFilter(axis, outTensorCpu, inTensorCpu, Filter{support}); + } + break; + default: + FAIL() << "Unsupported filter"; + } +} + +template +void RunPass(int axis, CpuSample &outTensorCpu, CpuSample &inTensorCpu, + const NVCVInterpolationType minInterpolation, const NVCVInterpolationType magInterpolation, bool antialias) + +{ + const auto inShape = inTensorCpu.shape(); + const auto outShape = outTensorCpu.shape(); + const int inSize = cuda::GetElement(inShape, axis); + const int outSize = cuda::GetElement(outShape, axis); + const bool isScalingDown = outSize < inSize; + antialias &= isScalingDown; + const auto interpolation = isScalingDown ? minInterpolation : magInterpolation; + switch (interpolation) + { + case NVCV_INTERP_NEAREST: + RunNN(axis, outTensorCpu, inTensorCpu); + break; + case NVCV_INTERP_LINEAR: + { + if (antialias) + { + RunFilter(axis, outTensorCpu, inTensorCpu, interpolation, antialias); + } + else + { + RunLinear(axis, outTensorCpu, inTensorCpu); + } + } + break; + default: + RunFilter(axis, outTensorCpu, inTensorCpu, interpolation, antialias); + break; + } +} + +template +void Resize(CpuSample &refTensorCpu, CpuSample &inTensorCpu, + const NVCVInterpolationType minInterpolation, const NVCVInterpolationType magInterpolation, bool antialias) +{ + int numSamples = inTensorCpu.numSamples(); + int numChannels = inTensorCpu.numChannels(); + const int2 inShape = inTensorCpu.shape(); + const int2 outShape = refTensorCpu.shape(); + const int2 interShape = {outShape.x, inShape.y}; + auto intermediateTensor = GetIntermediate(numSamples, interShape, numChannels); + RunPass(0, intermediateTensor, inTensorCpu, minInterpolation, magInterpolation, antialias); + RunPass(1, refTensorCpu, intermediateTensor, minInterpolation, magInterpolation, antialias); +} + +template +void Resize(CpuSample &refTensorCpu, CpuSample &inTensorCpu, + const NVCVInterpolationType minInterpolation, const NVCVInterpolationType magInterpolation, bool antialias) +{ + int numSamples = inTensorCpu.numSamples(); + int numChannels = inTensorCpu.numChannels(); + const int3 inShape = inTensorCpu.shape(); + const int3 outShape = refTensorCpu.shape(); + const int3 interShape0 = {outShape.x, inShape.y, inShape.z}; + const int3 interShape1 = {outShape.x, outShape.y, inShape.z}; + auto intermediateTensor0 = GetIntermediate(numSamples, interShape0, numChannels); + RunPass(0, intermediateTensor0, inTensorCpu, minInterpolation, magInterpolation, antialias); + auto intermediateTensor1 = GetIntermediate(numSamples, interShape1, numChannels); + RunPass(1, intermediateTensor1, intermediateTensor0, minInterpolation, magInterpolation, antialias); + RunPass(2, refTensorCpu, intermediateTensor1, minInterpolation, magInterpolation, antialias); +} + +template +void Compare(CpuSample &tensor, CpuSample &refTensor, bool antialias) +{ + int numSamples = tensor.numSamples(); + int numChannels = tensor.numChannels(); + const auto shape = tensor.shape(); + ASSERT_EQ(numSamples, refTensor.numSamples()); + ASSERT_EQ(numChannels, refTensor.numChannels()); + ASSERT_EQ(shape, refTensor.shape()); + double err = 0; + int64_t vol = 0; + for (int sampleIdx = 0; sampleIdx < numSamples; sampleIdx++) + { + ForAll(shape, + [&](const cuda::MakeType idx) + { + for (int c = 0; c < numChannels; c++) + { + const BT val = tensor.get(sampleIdx, idx, c); + const BT refVal = refTensor.get(sampleIdx, idx, c); + err += abs(val - refVal); + vol += 1; + + if (std::is_integral_v) // uchar -> 
uchar, short -> short, ushort -> ushort + { + ASSERT_NEAR(val, refVal, (std::is_same_v ? 1 : 10)); // uchar : short, ushort + } + else // output type is float + { + if (!std::is_integral_v) // float -> float + { + ASSERT_NEAR(val, refVal, 1e-4); + } + else // [uchar, short, ushort] -> float + { + ASSERT_NEAR(val, refVal, (std::is_same_v ? 0.1 : 6)); + } + } + } + }); + } + double mean_err = err / vol; + ASSERT_LE(mean_err, antialias ? 0.1 : 0.4); +} +} // namespace baseline + +inline void GetMaxShape(HQResizeTensorShapeI &ret, const HQResizeTensorShapeI &other) +{ + ASSERT_EQ(ret.ndim, other.ndim); + ret.numChannels = std::max(ret.numChannels, other.numChannels); + for (int d = 0; d < ret.ndim; d++) + { + ret.extent[d] = std::max(ret.extent[d], other.extent[d]); + } +} + +inline void GetMaxShape(HQResizeTensorShapeI &ret, const HQResizeTensorShapeI *shapes, int numSamples) +{ + if (numSamples > 0) + { + ret = shapes[0]; + for (int i = 1; i < numSamples; i++) + { + GetMaxShape(ret, shapes[i]); + } + } +} + +template +struct TypeAsFormatImpl +{ +}; + +template<> +struct TypeAsFormatImpl +{ + static constexpr NVCVDataType value = NVCV_DATA_TYPE_U8; +}; + +template<> +struct TypeAsFormatImpl +{ + static constexpr NVCVDataType value = NVCV_DATA_TYPE_S16; +}; + +template<> +struct TypeAsFormatImpl +{ + static constexpr NVCVDataType value = NVCV_DATA_TYPE_U16; +}; + +template<> +struct TypeAsFormatImpl +{ + static constexpr NVCVDataType value = NVCV_DATA_TYPE_F32; +}; + +template +nvcv::DataType TypeAsFormat() +{ + return nvcv::DataType{TypeAsFormatImpl::value}; +} + +template +nvcv::Tensor CreateTensorHelper(nvcv::DataType dtype, const char *layoutStr, int numSamples, Extents... extents) +{ + nvcv::TensorLayout layout{layoutStr}; + if (numSamples == 1) + { + nvcv::TensorShape shape{{extents...}, layout.last(sizeof...(extents))}; + return nvcv::Tensor{shape, dtype}; + } + else + { + nvcv::TensorShape shape{ + {numSamples, extents...}, + layout + }; + return nvcv::Tensor{shape, dtype}; + } +} + +#define NVCV_SHAPE2D(h, w) (int2{w, h}) +#define NVCV_TEST_ROW(NumSamples, InShape, OutShape, NumChannels, InT, OutT, Interpolation) \ + ttype::Types, ttype::Value, ttype::Value, ttype::Value, \ + InT, OutT, ttype::Value> + +NVCV_TYPED_TEST_SUITE( + OpHQResizeTensor2D, + // [uchar, ushort, short, float] x [same, float] x [1, 2, 3, 4, more channels] + // the input and output shapes: [x, y] -> [scale_down, scale_up] + // interpolation methods: [nn, linear, gaussian, cubic, lanczos] + ttype::Types< + NVCV_TEST_ROW(1, NVCV_SHAPE2D(769, 211), NVCV_SHAPE2D(40, 40), 1, uchar, uchar, NVCV_INTERP_NEAREST), + NVCV_TEST_ROW(2, NVCV_SHAPE2D(1024, 101), NVCV_SHAPE2D(105, 512), 1, uchar, float, NVCV_INTERP_LINEAR), + NVCV_TEST_ROW(3, NVCV_SHAPE2D(31, 244), NVCV_SHAPE2D(311, 122), 2, uchar, uchar, NVCV_INTERP_CUBIC), + NVCV_TEST_ROW(4, NVCV_SHAPE2D(41, 41), NVCV_SHAPE2D(244, 244), 2, uchar, float, NVCV_INTERP_GAUSSIAN), + NVCV_TEST_ROW(3, NVCV_SHAPE2D(769, 211), NVCV_SHAPE2D(40, 40), 3, uchar, uchar, NVCV_INTERP_LANCZOS), + NVCV_TEST_ROW(4, NVCV_SHAPE2D(1024, 101), NVCV_SHAPE2D(105, 512), 3, uchar, float, NVCV_INTERP_LANCZOS), + NVCV_TEST_ROW(3, NVCV_SHAPE2D(31, 244), NVCV_SHAPE2D(311, 122), 4, uchar, uchar, NVCV_INTERP_GAUSSIAN), + NVCV_TEST_ROW(4, NVCV_SHAPE2D(41, 41), NVCV_SHAPE2D(244, 244), 4, uchar, float, NVCV_INTERP_CUBIC), + NVCV_TEST_ROW(3, NVCV_SHAPE2D(31, 244), NVCV_SHAPE2D(311, 122), 5, uchar, uchar, NVCV_INTERP_LINEAR), + NVCV_TEST_ROW(4, NVCV_SHAPE2D(41, 41), NVCV_SHAPE2D(244, 244), 8, uchar, float, 
NVCV_INTERP_LINEAR), + + NVCV_TEST_ROW(1, NVCV_SHAPE2D(769, 211), NVCV_SHAPE2D(40, 40), 1, ushort, ushort, NVCV_INTERP_LINEAR), + NVCV_TEST_ROW(2, NVCV_SHAPE2D(1024, 101), NVCV_SHAPE2D(105, 512), 1, short, float, NVCV_INTERP_LINEAR), + NVCV_TEST_ROW(3, NVCV_SHAPE2D(31, 244), NVCV_SHAPE2D(311, 122), 2, short, short, NVCV_INTERP_GAUSSIAN), + NVCV_TEST_ROW(4, NVCV_SHAPE2D(41, 41), NVCV_SHAPE2D(244, 244), 2, ushort, float, NVCV_INTERP_CUBIC), + NVCV_TEST_ROW(3, NVCV_SHAPE2D(769, 211), NVCV_SHAPE2D(40, 40), 3, ushort, ushort, NVCV_INTERP_GAUSSIAN), + NVCV_TEST_ROW(4, NVCV_SHAPE2D(1024, 101), NVCV_SHAPE2D(105, 512), 3, short, float, NVCV_INTERP_GAUSSIAN), + NVCV_TEST_ROW(3, NVCV_SHAPE2D(31, 244), NVCV_SHAPE2D(311, 122), 4, ushort, ushort, NVCV_INTERP_LINEAR), + NVCV_TEST_ROW(3, NVCV_SHAPE2D(31, 244), NVCV_SHAPE2D(311, 122), 7, ushort, float, NVCV_INTERP_NEAREST), + + NVCV_TEST_ROW(3, NVCV_SHAPE2D(769, 211), NVCV_SHAPE2D(40, 40), 1, float, float, NVCV_INTERP_NEAREST), + NVCV_TEST_ROW(4, NVCV_SHAPE2D(1024, 101), NVCV_SHAPE2D(105, 512), 2, float, float, NVCV_INTERP_LINEAR), + NVCV_TEST_ROW(3, NVCV_SHAPE2D(31, 244), NVCV_SHAPE2D(311, 122), 3, float, float, NVCV_INTERP_CUBIC), + NVCV_TEST_ROW(4, NVCV_SHAPE2D(41, 41), NVCV_SHAPE2D(244, 244), 4, float, float, NVCV_INTERP_GAUSSIAN), + NVCV_TEST_ROW(3, NVCV_SHAPE2D(769, 211), NVCV_SHAPE2D(40, 40), 7, float, float, NVCV_INTERP_LANCZOS)>); + +template +void TestTensor(bool antialias) +{ + const int numSamples = ttype::GetValue; + const int2 inShape = ttype::GetValue; + const int2 outShape = ttype::GetValue; + const int numChannels = ttype::GetValue; + using InBT = ttype::GetType; + using OutBT = ttype::GetType; + const nvcv::DataType inDtype = TypeAsFormat(); + const nvcv::DataType outDtype = TypeAsFormat(); + const NVCVInterpolationType interpolation = ttype::GetValue; + + nvcv::Tensor inTensor = CreateTensorHelper(inDtype, "NHWC", numSamples, inShape.y, inShape.x, numChannels); + nvcv::Tensor outTensor = CreateTensorHelper(outDtype, "NHWC", numSamples, outShape.y, outShape.x, numChannels); + + auto inData = inTensor.exportData(); + auto outData = outTensor.exportData(); + ASSERT_TRUE(inData && outData); + + auto inAccess = nvcv::TensorDataAccessStridedImagePlanar::Create(*inData); + auto outAccess = nvcv::TensorDataAccessStridedImagePlanar::Create(*outData); + ASSERT_TRUE(inAccess && outAccess); + long3 inStrides{inAccess->colStride(), inAccess->rowStride(), + inAccess->sampleStride() == 0 ? inAccess->rowStride() * inShape.y : inAccess->sampleStride()}; + long3 outStrides{outAccess->colStride(), outAccess->rowStride(), + outAccess->sampleStride() == 0 ? outAccess->rowStride() * outShape.y : outAccess->sampleStride()}; + + ASSERT_EQ(inAccess->numSamples(), numSamples); + ASSERT_EQ(inAccess->numChannels(), numChannels); + ASSERT_EQ(outAccess->numChannels(), numChannels); + ASSERT_EQ(outAccess->numSamples(), numSamples); + + baseline::CpuSample inTensorCpu(inStrides.z * numSamples, inStrides, numSamples, inShape, numChannels); + baseline::CpuSample outTensorCpu(outStrides.z * numSamples, outStrides, numSamples, outShape, + numChannels); + baseline::CpuSample refTensorCpu(outStrides.z * numSamples, outStrides, numSamples, outShape, + numChannels); + + uniform_distribution rand(InBT{0}, std::is_integral_v ? 
cuda::TypeTraits::max : InBT{1}); + std::mt19937_64 rng(12345); + + for (int sampleIdx = 0; sampleIdx < numSamples; sampleIdx++) + { + for (int y = 0; y < inShape.y; y++) + { + for (int x = 0; x < inShape.x; x++) + { + for (int c = 0; c < numChannels; c++) + { + inTensorCpu.get(sampleIdx, int2{x, y}, c) = rand(rng); + } + } + } + } + + cvcuda::HQResize op; + cudaStream_t stream; + cvcuda::UniqueWorkspace ws; + { + HQResizeTensorShapeI inShapeDesc{ + {inShape.y, inShape.x}, + 2, + numChannels + }; + HQResizeTensorShapeI outShapeDesc{ + {outShape.y, outShape.x}, + 2, + numChannels + }; + ASSERT_NO_THROW(ws = cvcuda::AllocateWorkspace(op.getWorkspaceRequirements( + numSamples, inShapeDesc, outShapeDesc, interpolation, interpolation, antialias))); + } + ASSERT_EQ(cudaSuccess, cudaStreamCreate(&stream)); + ASSERT_EQ(cudaSuccess, cudaMemcpyAsync(inData->basePtr(), inTensorCpu.data(), inStrides.z * numSamples, + cudaMemcpyHostToDevice, stream)); + ASSERT_NO_THROW(op(stream, ws.get(), inTensor, outTensor, interpolation, interpolation, antialias)); + ASSERT_EQ(cudaSuccess, cudaMemcpyAsync(outTensorCpu.data(), outData->basePtr(), outStrides.z * numSamples, + cudaMemcpyDeviceToHost, stream)); + baseline::Resize(refTensorCpu, inTensorCpu, interpolation, interpolation, antialias); + ASSERT_EQ(cudaSuccess, cudaStreamSynchronize(stream)); + baseline::Compare(outTensorCpu, refTensorCpu, antialias); + ASSERT_EQ(cudaSuccess, cudaStreamDestroy(stream)); +} + +TYPED_TEST(OpHQResizeTensor2D, correct_output_no_antialias) +{ + TestTensor(false); +} + +TYPED_TEST(OpHQResizeTensor2D, correct_output_with_antialias) +{ + TestTensor(true); +} + +#define NVCV_SHAPE3D(d, h, w) (int3{w, h, d}) +NVCV_TYPED_TEST_SUITE( + OpHQResizeTensor3D, + ttype::Types< + NVCV_TEST_ROW(1, NVCV_SHAPE3D(244, 244, 244), NVCV_SHAPE3D(40, 40, 40), 1, uchar, uchar, NVCV_INTERP_NEAREST), + NVCV_TEST_ROW(2, NVCV_SHAPE3D(40, 40, 40), NVCV_SHAPE3D(244, 244, 244), 2, uchar, float, NVCV_INTERP_GAUSSIAN), + NVCV_TEST_ROW(3, NVCV_SHAPE3D(100, 100, 100), NVCV_SHAPE3D(50, 100, 100), 3, ushort, ushort, NVCV_INTERP_CUBIC), + NVCV_TEST_ROW(4, NVCV_SHAPE3D(100, 100, 100), NVCV_SHAPE3D(100, 50, 100), 4, ushort, float, NVCV_INTERP_LINEAR), + NVCV_TEST_ROW(3, NVCV_SHAPE3D(100, 100, 100), NVCV_SHAPE3D(100, 100, 50), 3, float, float, NVCV_INTERP_CUBIC), + NVCV_TEST_ROW(4, NVCV_SHAPE3D(40, 40, 40), NVCV_SHAPE3D(100, 40, 40), 5, uchar, float, NVCV_INTERP_LANCZOS), + NVCV_TEST_ROW(7, NVCV_SHAPE3D(40, 40, 40), NVCV_SHAPE3D(50, 150, 100), 3, uchar, uchar, NVCV_INTERP_CUBIC)>); + +TYPED_TEST(OpHQResizeTensor3D, correct_output_with_antialias) +{ + const int numSamples = ttype::GetValue; + const int3 inShape = ttype::GetValue; + const int3 outShape = ttype::GetValue; + const int numChannels = ttype::GetValue; + using InBT = ttype::GetType; + using OutBT = ttype::GetType; + const nvcv::DataType inDtype = TypeAsFormat(); + const nvcv::DataType outDtype = TypeAsFormat(); + const NVCVInterpolationType interpolation = ttype::GetValue; + constexpr bool antialias = true; + + nvcv::Tensor inTensor + = CreateTensorHelper(inDtype, "NDHWC", numSamples, inShape.z, inShape.y, inShape.x, numChannels); + nvcv::Tensor outTensor + = CreateTensorHelper(outDtype, "NDHWC", numSamples, outShape.z, outShape.y, outShape.x, numChannels); + + auto inData = inTensor.exportData(); + auto outData = outTensor.exportData(); + ASSERT_TRUE(inData && outData); + + auto inAccess = nvcv::TensorDataAccessStridedImagePlanar::Create(*inData); + auto outAccess = 
nvcv::TensorDataAccessStridedImagePlanar::Create(*outData); + ASSERT_TRUE(inAccess && outAccess); + long4 inStrides{inAccess->colStride(), inAccess->rowStride(), inAccess->depthStride(), + inAccess->sampleStride() == 0 ? inAccess->depthStride() * inShape.z : inAccess->sampleStride()}; + long4 outStrides{ + outAccess->colStride(), outAccess->rowStride(), outAccess->depthStride(), + outAccess->sampleStride() == 0 ? outAccess->depthStride() * outShape.z : outAccess->sampleStride()}; + + ASSERT_EQ(inAccess->numSamples(), numSamples); + ASSERT_EQ(inAccess->numChannels(), numChannels); + ASSERT_EQ(outAccess->numChannels(), numChannels); + ASSERT_EQ(outAccess->numSamples(), numSamples); + + baseline::CpuSample inTensorCpu(inStrides.w * numSamples, inStrides, numSamples, inShape, numChannels); + baseline::CpuSample outTensorCpu(outStrides.w * numSamples, outStrides, numSamples, outShape, + numChannels); + baseline::CpuSample refTensorCpu(outStrides.w * numSamples, outStrides, numSamples, outShape, + numChannels); + + uniform_distribution rand(InBT{0}, std::is_integral_v ? cuda::TypeTraits::max : InBT{1}); + std::mt19937_64 rng(12345); + + for (int sampleIdx = 0; sampleIdx < numSamples; sampleIdx++) + { + for (int z = 0; z < inShape.z; z++) + { + for (int y = 0; y < inShape.y; y++) + { + for (int x = 0; x < inShape.x; x++) + { + for (int c = 0; c < numChannels; c++) + { + inTensorCpu.get(sampleIdx, int3{x, y, z}, c) = rand(rng); + } + } + } + } + } + + cvcuda::HQResize op; + cudaStream_t stream; + cvcuda::UniqueWorkspace ws; + { + HQResizeTensorShapeI inShapeDesc{ + {inShape.z, inShape.y, inShape.x}, + 3, + numChannels + }; + HQResizeTensorShapeI outShapeDesc{ + {outShape.z, outShape.y, outShape.x}, + 3, + numChannels + }; + ASSERT_NO_THROW(ws = cvcuda::AllocateWorkspace(op.getWorkspaceRequirements( + numSamples, inShapeDesc, outShapeDesc, interpolation, interpolation, antialias))); + } + ASSERT_EQ(cudaSuccess, cudaStreamCreate(&stream)); + ASSERT_EQ(cudaSuccess, cudaMemcpyAsync(inData->basePtr(), inTensorCpu.data(), inStrides.w * numSamples, + cudaMemcpyHostToDevice, stream)); + ASSERT_NO_THROW(op(stream, ws.get(), inTensor, outTensor, interpolation, interpolation, antialias)); + ASSERT_EQ(cudaSuccess, cudaMemcpyAsync(outTensorCpu.data(), outData->basePtr(), outStrides.w * numSamples, + cudaMemcpyDeviceToHost, stream)); + baseline::Resize(refTensorCpu, inTensorCpu, interpolation, interpolation, antialias); + ASSERT_EQ(cudaSuccess, cudaStreamSynchronize(stream)); + baseline::Compare(outTensorCpu, refTensorCpu, antialias); + ASSERT_EQ(cudaSuccess, cudaStreamDestroy(stream)); +} + +#define NVCV_TEST_ROW_TB(NumChannels, InT, OutT, Antialias, MinInterpolation, MagInterpolation) \ + ttype::Types, InT, OutT, ttype::Value, ttype::Value, \ + ttype::Value> + +NVCV_TYPED_TEST_SUITE(OpHQResizeBatch, + ttype::Types); + +TYPED_TEST(OpHQResizeBatch, tensor_batch_2d_correct_output) +{ + const int numChannels = ttype::GetValue; + using InBT = ttype::GetType; + using OutBT = ttype::GetType; + const nvcv::DataType inDtype = TypeAsFormat(); + const nvcv::DataType outDtype = TypeAsFormat(); + const bool antialias = ttype::GetValue; + const NVCVInterpolationType minInterpolation = ttype::GetValue; + const NVCVInterpolationType magInterpolation = ttype::GetValue; + + constexpr int numSamples = 5; + const int varChannels[numSamples] = {4, 1, 7, 3, 5}; + + std::vector inShapes = { + {{728, 1024}, 2, numChannels > 0 ? numChannels : varChannels[0]}, + { {512, 512}, 2, numChannels > 0 ? 
numChannels : varChannels[1]}, + { {128, 256}, 2, numChannels > 0 ? numChannels : varChannels[2]}, + { {256, 128}, 2, numChannels > 0 ? numChannels : varChannels[3]}, + { {40, 40}, 2, numChannels > 0 ? numChannels : varChannels[4]} + }; + std::vector outShapes = { + {{245, 245}, 2, inShapes[0].numChannels}, + { {250, 51}, 2, inShapes[1].numChannels}, + {{243, 128}, 2, inShapes[2].numChannels}, + {{128, 256}, 2, inShapes[3].numChannels}, + {{512, 512}, 2, inShapes[4].numChannels} + }; + + ASSERT_EQ(numSamples, inShapes.size()); + ASSERT_EQ(numSamples, outShapes.size()); + + nvcv::TensorBatch inTensors(numSamples); + nvcv::TensorBatch outTensors(numSamples); + for (int sampleIdx = 0; sampleIdx < numSamples; sampleIdx++) + { + if (numChannels == 1) + { + inTensors.pushBack( + CreateTensorHelper(inDtype, "HW", 1, inShapes[sampleIdx].extent[0], inShapes[sampleIdx].extent[1])); + outTensors.pushBack( + CreateTensorHelper(outDtype, "HW", 1, outShapes[sampleIdx].extent[0], outShapes[sampleIdx].extent[1])); + } + else + { + inTensors.pushBack(CreateTensorHelper(inDtype, "HWC", 1, inShapes[sampleIdx].extent[0], + inShapes[sampleIdx].extent[1], inShapes[sampleIdx].numChannels)); + outTensors.pushBack(CreateTensorHelper(outDtype, "HWC", 1, outShapes[sampleIdx].extent[0], + outShapes[sampleIdx].extent[1], outShapes[sampleIdx].numChannels)); + } + } + + cudaStream_t stream; + ASSERT_EQ(cudaSuccess, cudaStreamCreate(&stream)); + + uniform_distribution rand(InBT{0}, std::is_integral_v ? cuda::TypeTraits::max : InBT{1}); + std::mt19937_64 rng(12345); + + std::vector> inBatchCpu; + std::vector> outBatchCpu; + std::vector> refBatchCpu; + for (int sampleIdx = 0; sampleIdx < numSamples; sampleIdx++) + { + SCOPED_TRACE(sampleIdx); + auto inData = inTensors[sampleIdx].exportData(); + auto outData = outTensors[sampleIdx].exportData(); + ASSERT_TRUE(inData && outData); + + auto inAccess = nvcv::TensorDataAccessStridedImagePlanar::Create(*inData); + auto outAccess = nvcv::TensorDataAccessStridedImagePlanar::Create(*outData); + ASSERT_TRUE(inAccess && outAccess); + + long3 inStrides{inAccess->colStride(), inAccess->rowStride(), + inAccess->sampleStride() == 0 ? inAccess->rowStride() * inShapes[sampleIdx].extent[0] + : inAccess->sampleStride()}; + long3 outStrides{outAccess->colStride(), outAccess->rowStride(), + outAccess->sampleStride() == 0 ? 
outAccess->rowStride() * outShapes[sampleIdx].extent[0] + : outAccess->sampleStride()}; + + ASSERT_EQ(inAccess->numSamples(), 1); + ASSERT_EQ(outAccess->numSamples(), 1); + ASSERT_EQ(inAccess->numChannels(), inShapes[sampleIdx].numChannels); + ASSERT_EQ(outAccess->numChannels(), outShapes[sampleIdx].numChannels); + + int2 inShape{inShapes[sampleIdx].extent[1], inShapes[sampleIdx].extent[0]}; + int2 outShape{outShapes[sampleIdx].extent[1], outShapes[sampleIdx].extent[0]}; + inBatchCpu.push_back(baseline::CpuSample{inStrides.z, inStrides, 1, inShape, inAccess->numChannels()}); + outBatchCpu.push_back( + baseline::CpuSample{outStrides.z, outStrides, 1, outShape, outAccess->numChannels()}); + refBatchCpu.push_back( + baseline::CpuSample{outStrides.z, outStrides, 1, outShape, outAccess->numChannels()}); + + auto &inTensorCpu = inBatchCpu[sampleIdx]; + for (int y = 0; y < inShape.y; y++) + { + for (int x = 0; x < inShape.x; x++) + { + for (int c = 0; c < inShapes[sampleIdx].numChannels; c++) + { + inTensorCpu.get(0, int2{x, y}, c) = rand(rng); + } + } + } + ASSERT_EQ(cudaSuccess, + cudaMemcpyAsync(inData->basePtr(), inTensorCpu.data(), inStrides.z, cudaMemcpyHostToDevice, stream)); + } + + cvcuda::HQResize op; + cvcuda::UniqueWorkspace ws; + + { + HQResizeTensorShapesI inShapeDesc{inShapes.data(), numSamples, 2, numChannels}; + HQResizeTensorShapesI outShapeDesc{outShapes.data(), numSamples, 2, numChannels}; + ASSERT_NO_THROW(ws = cvcuda::AllocateWorkspace(op.getWorkspaceRequirements( + numSamples, inShapeDesc, outShapeDesc, minInterpolation, magInterpolation, antialias))); + } + ASSERT_NO_THROW(op(stream, ws.get(), inTensors, outTensors, minInterpolation, magInterpolation, antialias)); + + for (int sampleIdx = 0; sampleIdx < numSamples; sampleIdx++) + { + SCOPED_TRACE(sampleIdx); + auto outData = outTensors[sampleIdx].exportData(); + ASSERT_TRUE(outData); + ASSERT_EQ(cudaSuccess, cudaMemcpyAsync(outBatchCpu[sampleIdx].data(), outData->basePtr(), + outBatchCpu[sampleIdx].strides().z, cudaMemcpyDeviceToHost, stream)); + } + + ASSERT_EQ(cudaSuccess, cudaStreamSynchronize(stream)); + for (int sampleIdx = 0; sampleIdx < numSamples; sampleIdx++) + { + SCOPED_TRACE(sampleIdx); + baseline::Resize(refBatchCpu[sampleIdx], inBatchCpu[sampleIdx], minInterpolation, magInterpolation, antialias); + baseline::Compare(outBatchCpu[sampleIdx], refBatchCpu[sampleIdx], antialias); + } + ASSERT_EQ(cudaSuccess, cudaStreamDestroy(stream)); +} + +TYPED_TEST(OpHQResizeBatch, tensor_batch_3d_correct_output) +{ + const int numChannels = ttype::GetValue; + using InBT = ttype::GetType; + using OutBT = ttype::GetType; + const nvcv::DataType inDtype = TypeAsFormat(); + const nvcv::DataType outDtype = TypeAsFormat(); + const bool antialias = ttype::GetValue; + const NVCVInterpolationType minInterpolation = ttype::GetValue; + const NVCVInterpolationType magInterpolation = ttype::GetValue; + + constexpr int numSamples = 5; + const int varChannels[numSamples] = {6, 2, 3, 4, 1}; + + std::vector inShapes = { + {{128, 128, 128}, 3, numChannels > 0 ? numChannels : varChannels[0]}, + { {512, 40, 40}, 3, numChannels > 0 ? numChannels : varChannels[1]}, + { {40, 512, 40}, 3, numChannels > 0 ? numChannels : varChannels[2]}, + { {40, 40, 512}, 3, numChannels > 0 ? numChannels : varChannels[3]}, + { {40, 40, 40}, 3, numChannels > 0 ? 
numChannels : varChannels[4]} + }; + std::vector outShapes = { + { {45, 64, 50}, 3, inShapes[0].numChannels}, + { {40, 40, 40}, 3, inShapes[1].numChannels}, + { {40, 40, 40}, 3, inShapes[2].numChannels}, + { {40, 40, 40}, 3, inShapes[3].numChannels}, + {{128, 128, 128}, 3, inShapes[4].numChannels} + }; + + ASSERT_EQ(numSamples, inShapes.size()); + ASSERT_EQ(numSamples, outShapes.size()); + + nvcv::TensorBatch inTensors(numSamples); + nvcv::TensorBatch outTensors(numSamples); + for (int sampleIdx = 0; sampleIdx < numSamples; sampleIdx++) + { + if (numChannels == 1) + { + inTensors.pushBack(CreateTensorHelper(inDtype, "DHW", 1, inShapes[sampleIdx].extent[0], + inShapes[sampleIdx].extent[1], inShapes[sampleIdx].extent[2])); + outTensors.pushBack(CreateTensorHelper(outDtype, "DHW", 1, outShapes[sampleIdx].extent[0], + outShapes[sampleIdx].extent[1], outShapes[sampleIdx].extent[2])); + } + else + { + inTensors.pushBack(CreateTensorHelper(inDtype, "DHWC", 1, inShapes[sampleIdx].extent[0], + inShapes[sampleIdx].extent[1], inShapes[sampleIdx].extent[2], + inShapes[sampleIdx].numChannels)); + outTensors.pushBack(CreateTensorHelper(outDtype, "DHWC", 1, outShapes[sampleIdx].extent[0], + outShapes[sampleIdx].extent[1], outShapes[sampleIdx].extent[2], + outShapes[sampleIdx].numChannels)); + } + } + + cudaStream_t stream; + ASSERT_EQ(cudaSuccess, cudaStreamCreate(&stream)); + + uniform_distribution rand(InBT{0}, std::is_integral_v ? cuda::TypeTraits::max : InBT{1}); + std::mt19937_64 rng(12345); + + std::vector> inBatchCpu; + std::vector> outBatchCpu; + std::vector> refBatchCpu; + for (int sampleIdx = 0; sampleIdx < numSamples; sampleIdx++) + { + SCOPED_TRACE(sampleIdx); + auto inData = inTensors[sampleIdx].exportData(); + auto outData = outTensors[sampleIdx].exportData(); + ASSERT_TRUE(inData && outData); + + auto inAccess = nvcv::TensorDataAccessStridedImagePlanar::Create(*inData); + auto outAccess = nvcv::TensorDataAccessStridedImagePlanar::Create(*outData); + ASSERT_TRUE(inAccess && outAccess); + + long4 inStrides{inAccess->colStride(), inAccess->rowStride(), inAccess->depthStride(), + inAccess->sampleStride() == 0 ? inAccess->depthStride() * inShapes[sampleIdx].extent[0] + : inAccess->sampleStride()}; + long4 outStrides{outAccess->colStride(), outAccess->rowStride(), outAccess->depthStride(), + outAccess->sampleStride() == 0 ? 
outAccess->depthStride() * outShapes[sampleIdx].extent[0] + : outAccess->sampleStride()}; + + ASSERT_EQ(inAccess->numSamples(), 1); + ASSERT_EQ(outAccess->numSamples(), 1); + ASSERT_EQ(inAccess->numChannels(), inShapes[sampleIdx].numChannels); + ASSERT_EQ(outAccess->numChannels(), outShapes[sampleIdx].numChannels); + + int3 inShape{inShapes[sampleIdx].extent[2], inShapes[sampleIdx].extent[1], inShapes[sampleIdx].extent[0]}; + int3 outShape{outShapes[sampleIdx].extent[2], outShapes[sampleIdx].extent[1], outShapes[sampleIdx].extent[0]}; + inBatchCpu.push_back(baseline::CpuSample{inStrides.w, inStrides, 1, inShape, inAccess->numChannels()}); + outBatchCpu.push_back( + baseline::CpuSample{outStrides.w, outStrides, 1, outShape, outAccess->numChannels()}); + refBatchCpu.push_back( + baseline::CpuSample{outStrides.w, outStrides, 1, outShape, outAccess->numChannels()}); + + auto &inTensorCpu = inBatchCpu[sampleIdx]; + for (int z = 0; z < inShape.z; z++) + { + for (int y = 0; y < inShape.y; y++) + { + for (int x = 0; x < inShape.x; x++) + { + for (int c = 0; c < inShapes[sampleIdx].numChannels; c++) + { + inTensorCpu.get(0, int3{x, y, z}, c) = rand(rng); + } + } + } + } + ASSERT_EQ(cudaSuccess, + cudaMemcpyAsync(inData->basePtr(), inTensorCpu.data(), inStrides.w, cudaMemcpyHostToDevice, stream)); + } + + cvcuda::HQResize op; + cvcuda::UniqueWorkspace ws; + + { + HQResizeTensorShapesI inShapeDesc{inShapes.data(), numSamples, 3, numChannels}; + HQResizeTensorShapesI outShapeDesc{outShapes.data(), numSamples, 3, numChannels}; + ASSERT_NO_THROW(ws = cvcuda::AllocateWorkspace(op.getWorkspaceRequirements( + numSamples, inShapeDesc, outShapeDesc, minInterpolation, magInterpolation, antialias))); + } + ASSERT_NO_THROW(op(stream, ws.get(), inTensors, outTensors, minInterpolation, magInterpolation, antialias)); + + for (int sampleIdx = 0; sampleIdx < numSamples; sampleIdx++) + { + SCOPED_TRACE(sampleIdx); + auto outData = outTensors[sampleIdx].exportData(); + ASSERT_TRUE(outData); + ASSERT_EQ(cudaSuccess, cudaMemcpyAsync(outBatchCpu[sampleIdx].data(), outData->basePtr(), + outBatchCpu[sampleIdx].strides().w, cudaMemcpyDeviceToHost, stream)); + } + + ASSERT_EQ(cudaSuccess, cudaStreamSynchronize(stream)); + for (int sampleIdx = 0; sampleIdx < numSamples; sampleIdx++) + { + SCOPED_TRACE(sampleIdx); + baseline::Resize(refBatchCpu[sampleIdx], inBatchCpu[sampleIdx], minInterpolation, magInterpolation, antialias); + baseline::Compare(outBatchCpu[sampleIdx], refBatchCpu[sampleIdx], antialias); + } + ASSERT_EQ(cudaSuccess, cudaStreamDestroy(stream)); +} + +#define NVCV_IMAGE_FORMAT_RGB16U \ + NVCV_DETAIL_MAKE_COLOR_FMT1(RGB, UNDEFINED, PL, UNSIGNED, XYZ1, ASSOCIATED, X16_Y16_Z16) +#define NVCV_TEST_ROW_IB(NumChannels, InT, InFormat, OutT, OutFormat, Antialias, MinInterpolation, MagInterpolation) \ + ttype::Types, InT, ttype::Value, OutT, ttype::Value, \ + ttype::Value, ttype::Value, ttype::Value> + +NVCV_TYPED_TEST_SUITE( + OpHQResizeImageBatch, + ttype::Types); + +template +void TestImageBatch(int numSamples, std::vector &inShapes, + std::vector &outShapes, cvcuda::UniqueWorkspace &ws, + bool allocateWorkspace = true) +{ + const int numChannels = ttype::GetValue; + using InT = ttype::GetType; + using InBT = cuda::BaseType; + using OutT = ttype::GetType; + using OutBT = cuda::BaseType; + const nvcv::ImageFormat inImgFormat{ttype::GetValue}; + const nvcv::ImageFormat outImgFormat{ttype::GetValue}; + const bool antialias = ttype::GetValue; + const NVCVInterpolationType minInterpolation = ttype::GetValue; + const 
NVCVInterpolationType magInterpolation = ttype::GetValue; + + ASSERT_GE(numChannels, 1); + ASSERT_LE(numChannels, 4); + ASSERT_EQ(sizeof(InT), inImgFormat.planePixelStrideBytes(0)); + ASSERT_EQ(sizeof(OutT), outImgFormat.planePixelStrideBytes(0)); + + ASSERT_EQ(numSamples, inShapes.size()); + ASSERT_EQ(numSamples, outShapes.size()); + + cudaStream_t stream; + ASSERT_EQ(cudaSuccess, cudaStreamCreate(&stream)); + + uniform_distribution rand(InBT{0}, std::is_integral_v ? cuda::TypeTraits::max : InBT{1}); + std::mt19937_64 rng(12345); + + std::vector imgSrc; + std::vector imgDst; + std::vector> inBatchCpu; + std::vector> outBatchCpu; + std::vector> refBatchCpu; + for (int sampleIdx = 0; sampleIdx < numSamples; sampleIdx++) + { + SCOPED_TRACE(sampleIdx); + nvcv::Size2D inImgShape{inShapes[sampleIdx].extent[1], inShapes[sampleIdx].extent[0]}; + imgSrc.emplace_back(inImgShape, inImgFormat); + nvcv::Size2D outImgShape{outShapes[sampleIdx].extent[1], outShapes[sampleIdx].extent[0]}; + imgDst.emplace_back(outImgShape, outImgFormat); + + auto inData = imgSrc[sampleIdx].exportData(); + auto outData = imgDst[sampleIdx].exportData(); + ASSERT_TRUE(inData && outData); + + long3 inStrides{sizeof(InT), inData->plane(0).rowStride, inData->plane(0).rowStride * inData->plane(0).height}; + long3 outStrides{sizeof(OutT), outData->plane(0).rowStride, + outData->plane(0).rowStride * outData->plane(0).height}; + + inBatchCpu.push_back(baseline::CpuSample{ + inStrides.z, inStrides, 1, int2{inImgShape.w, inImgShape.h}, + numChannels + }); + outBatchCpu.push_back(baseline::CpuSample{ + outStrides.z, outStrides, 1, int2{outImgShape.w, outImgShape.h}, + numChannels + }); + refBatchCpu.push_back(baseline::CpuSample{ + outStrides.z, outStrides, 1, int2{outImgShape.w, outImgShape.h}, + numChannels + }); + + auto &inTensorCpu = inBatchCpu[sampleIdx]; + for (int y = 0; y < inImgShape.h; y++) + { + for (int x = 0; x < inImgShape.w; x++) + { + for (int c = 0; c < numChannels; c++) + { + inTensorCpu.get(0, int2{x, y}, c) = rand(rng); + } + } + } + ASSERT_EQ(cudaSuccess, cudaMemcpyAsync(inData->plane(0).basePtr, inTensorCpu.data(), inStrides.z, + cudaMemcpyHostToDevice, stream)); + } + + nvcv::ImageBatchVarShape batchSrc(numSamples); + nvcv::ImageBatchVarShape batchDst(numSamples); + batchSrc.pushBack(imgSrc.begin(), imgSrc.end()); + batchDst.pushBack(imgDst.begin(), imgDst.end()); + + cvcuda::HQResize op; + if (allocateWorkspace) + { + HQResizeTensorShapesI inShapeDesc{inShapes.data(), numSamples, 2, numChannels}; + HQResizeTensorShapesI outShapeDesc{outShapes.data(), numSamples, 2, numChannels}; + ASSERT_NO_THROW(ws = cvcuda::AllocateWorkspace(op.getWorkspaceRequirements( + numSamples, inShapeDesc, outShapeDesc, minInterpolation, magInterpolation, antialias))); + } + ASSERT_NO_THROW(op(stream, ws.get(), batchSrc, batchDst, minInterpolation, magInterpolation, antialias)); + + for (int sampleIdx = 0; sampleIdx < numSamples; sampleIdx++) + { + SCOPED_TRACE(sampleIdx); + const auto outData = imgDst[sampleIdx].exportData(); + ASSERT_TRUE(outData); + ASSERT_EQ(cudaSuccess, cudaMemcpyAsync(outBatchCpu[sampleIdx].data(), outData->plane(0).basePtr, + outBatchCpu[sampleIdx].strides().z, cudaMemcpyDeviceToHost, stream)); + } + + ASSERT_EQ(cudaSuccess, cudaStreamSynchronize(stream)); + for (int sampleIdx = 0; sampleIdx < numSamples; sampleIdx++) + { + SCOPED_TRACE(sampleIdx); + baseline::Resize(refBatchCpu[sampleIdx], inBatchCpu[sampleIdx], minInterpolation, magInterpolation, antialias); + baseline::Compare(outBatchCpu[sampleIdx], 
refBatchCpu[sampleIdx], antialias); + } + ASSERT_EQ(cudaSuccess, cudaStreamDestroy(stream)); +} + +TYPED_TEST(OpHQResizeImageBatch, varbatch_2d_correct_output) +{ + const int numSamples = 4; + std::vector inShapes = {{{256, 128}}, {{40, 40}}, {{728, 1024}}, {{128, 256}}}; + std::vector outShapes = {{{128, 256}}, {{512, 512}}, {{245, 245}}, {{243, 128}}}; + cvcuda::UniqueWorkspace ws; + TestImageBatch(numSamples, inShapes, outShapes, ws); +} + +TEST(OpHQResizeImageBatch, test_multi_run_single_workspace) +{ + using FirstRun = typename NVCV_TEST_ROW_IB(1, uchar, NVCV_IMAGE_FORMAT_U8, uchar, NVCV_IMAGE_FORMAT_U8, false, + NVCV_INTERP_LINEAR, NVCV_INTERP_CUBIC); + using SecondRun = typename NVCV_TEST_ROW_IB(3, uchar3, NVCV_IMAGE_FORMAT_RGB8, float3, NVCV_IMAGE_FORMAT_RGBf32, + true, NVCV_INTERP_LANCZOS, NVCV_INTERP_LINEAR); + + const int numSamples0 = 1; + std::vector inShapes0 = { + {{128, 128}, 2, 1} + }; + std::vector outShapes0 = { + {{40, 50}, 2, 1} + }; + + const int numSamples1 = 3; + std::vector inShapes1 = { + { {50, 40}, 2, 3}, + { {64, 64}, 2, 3}, + {{128, 128}, 2, 3} + }; + std::vector outShapes1 = { + {{128, 128}, 2, 3}, + {{128, 128}, 2, 3}, + {{128, 128}, 2, 3} + }; + + HQResizeTensorShapeI maxShape; + GetMaxShape(maxShape, inShapes0.data(), numSamples0); + GetMaxShape(maxShape, outShapes0.data(), numSamples0); + GetMaxShape(maxShape, inShapes1.data(), numSamples1); + GetMaxShape(maxShape, outShapes1.data(), numSamples1); + + cvcuda::HQResize op; + cvcuda::UniqueWorkspace ws; + ASSERT_NO_THROW( + ws = cvcuda::AllocateWorkspace(op.getWorkspaceRequirements(std::max(numSamples0, numSamples1), maxShape))); + TestImageBatch(numSamples0, inShapes0, outShapes0, ws, false); + TestImageBatch(numSamples1, inShapes1, outShapes1, ws, false); +} diff --git a/tests/cvcuda/system/TestOpMedianBlur.cpp b/tests/cvcuda/system/TestOpMedianBlur.cpp index cae0b279e..aaaa331db 100644 --- a/tests/cvcuda/system/TestOpMedianBlur.cpp +++ b/tests/cvcuda/system/TestOpMedianBlur.cpp @@ -307,7 +307,7 @@ TEST_P(OpMedianBlur, varshape_correct_output) const int bytesPerPixel = 3; // Create tensor to store kernel size - nvcv::Tensor ksizeTensor(nvcv::TensorShape({numberOfImages, 2}, nvcv::TENSOR_NW), nvcv::TYPE_S32); + nvcv::Tensor ksizeTensor(nvcv::TensorShape({numberOfImages}, "N"), nvcv::TYPE_2S32); auto ksizeTensorData = ksizeTensor.exportData(); ASSERT_NE(nullptr, ksizeTensorData); diff --git a/tests/cvcuda/system/TestOpRandomResizedCrop.cpp b/tests/cvcuda/system/TestOpRandomResizedCrop.cpp index a287c1fcf..e73223a15 100644 --- a/tests/cvcuda/system/TestOpRandomResizedCrop.cpp +++ b/tests/cvcuda/system/TestOpRandomResizedCrop.cpp @@ -352,3 +352,71 @@ TEST_P(OpRandomResizedCrop, varshape_correct_output) EXPECT_THAT(mae, t::Each(t::Le(maeThreshold))); } } + +TEST(OpRandomResizedCrop_negative, createWithNullHandle) +{ + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, cvcudaRandomResizedCropCreate(nullptr, 0.2, 1.0, 0.8, 1.3, 2, 0)); +} + +TEST(OpRandomResizedCrop_negative, createWithInvalidScale) +{ + NVCVOperatorHandle opHandle; + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, cvcudaRandomResizedCropCreate(&opHandle, 1.0, 0.2, 0.8, 1.3, 2, 0)); +} + +TEST(OpRandomResizedCrop_negative, createWithInvalidRatio) +{ + NVCVOperatorHandle opHandle; + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, cvcudaRandomResizedCropCreate(&opHandle, 0.2, 1.0, 1.3, 0.8, 2, 0)); +} + +// clang-format off +NVCV_TEST_SUITE_P(OpRandomResizedCrop_negative, nvcv::test::ValueList +{ + // in_layout, in_data_type, out_layout, out_data_type, interpolation, 
expected_return_status + { "CHW", nvcv::TYPE_U8, "HWC", nvcv::TYPE_U8, NVCV_INTERP_NEAREST, NVCV_ERROR_INVALID_ARGUMENT}, + { "HWC", nvcv::TYPE_U8, "CHW", nvcv::TYPE_U8, NVCV_INTERP_NEAREST, NVCV_ERROR_INVALID_ARGUMENT}, + { "HWC", nvcv::TYPE_F64, "HWC", nvcv::TYPE_U8, NVCV_INTERP_NEAREST, NVCV_ERROR_INVALID_ARGUMENT}, + { "HWC", nvcv::TYPE_U8, "HWC", nvcv::TYPE_F64, NVCV_INTERP_NEAREST, NVCV_ERROR_INVALID_ARGUMENT}, + { "HWC", nvcv::TYPE_U32, "HWC", nvcv::TYPE_U8, NVCV_INTERP_NEAREST, NVCV_ERROR_INVALID_ARGUMENT}, + { "HWC", nvcv::TYPE_U8, "HWC", nvcv::TYPE_U32, NVCV_INTERP_NEAREST, NVCV_ERROR_INVALID_ARGUMENT}, + { "HWC", nvcv::TYPE_U8, "HWC", nvcv::TYPE_U8, NVCV_INTERP_AREA, NVCV_ERROR_INVALID_ARGUMENT}, +}); + +// clang-format on + +TEST_P(OpRandomResizedCrop_negative, infer_negative_parameter) +{ + std::string in_layout = GetParamValue<0>(); + nvcv::DataType in_data_type = GetParamValue<1>(); + std::string out_layout = GetParamValue<2>(); + nvcv::DataType out_data_type = GetParamValue<3>(); + NVCVInterpolationType interpolation = GetParamValue<4>(); + NVCVStatus expected_return_status = GetParamValue<5>(); + + double minScale = 0.08; + double maxScale = 1.0; + double minRatio = 3.0 / 4; + double maxRatio = 4.0 / 3; + + nvcv::Tensor imgSrc( + { + {24, 24, 2}, + in_layout.c_str() + }, + in_data_type); + nvcv::Tensor imgDst( + { + {24, 24, 2}, + out_layout.c_str() + }, + out_data_type); + + // Create and Call operator + int numberOfImages = 4; + uint32_t seed = 1; + + cvcuda::RandomResizedCrop randomResizedCropOp(minScale, maxScale, minRatio, maxRatio, numberOfImages, seed); + EXPECT_EQ(expected_return_status, + nvcv::ProtectCall([&] { randomResizedCropOp(NULL, imgSrc, imgDst, interpolation); })); +} diff --git a/tests/cvcuda/system/TestOpResize.cpp b/tests/cvcuda/system/TestOpResize.cpp index 3cde6b857..26140de44 100644 --- a/tests/cvcuda/system/TestOpResize.cpp +++ b/tests/cvcuda/system/TestOpResize.cpp @@ -56,8 +56,9 @@ NVCV_TEST_SUITE_P(OpResize, test::ValueList rndDstHeight(dstHeightBase * 0.8, dstHeightBase * 1.1); std::vector imgSrc, imgDst; - for (int i = 0; i < numberOfImages; ++i) + // The size of the first image is fixed: to cover area fast code path + imgSrc.emplace_back(nvcv::Size2D{srcWidthBase, srcHeightBase}, fmt); + imgDst.emplace_back(nvcv::Size2D{dstHeightBase, dstHeightBase}, fmt); + for (int i = 0; i < numberOfImages - 1; ++i) { imgSrc.emplace_back(nvcv::Size2D{rndSrcWidth(randEng), rndSrcHeight(randEng)}, fmt); imgDst.emplace_back(nvcv::Size2D{rndDstWidth(randEng), rndDstHeight(randEng)}, fmt); @@ -249,7 +253,7 @@ TEST_P(OpResize, varshape_correct_output) // Generate gold result test::Resize(goldVec, dstRowStride, {dstWidth, dstHeight}, srcVec[i], srcVecRowStride[i], {srcWidth, srcHeight}, - fmt, interpolation); + fmt, interpolation, true); // maximum absolute error std::vector mae(testVec.size()); diff --git a/tests/cvcuda/system/TestOpThreshold.cpp b/tests/cvcuda/system/TestOpThreshold.cpp index 8a0671d85..a495173c6 100644 --- a/tests/cvcuda/system/TestOpThreshold.cpp +++ b/tests/cvcuda/system/TestOpThreshold.cpp @@ -154,9 +154,10 @@ static double getThreshVal_Triangle(std::vector &src) return thresh; } +namespace { //test for uint8 template -static void Threshold(std::vector &src, std::vector &dst, double thresh, double maxval, uint32_t type) +void Threshold(std::vector &src, std::vector &dst, double thresh, double maxval, uint32_t type) { int automatic_thresh = (type & ~NVCV_THRESH_MASK); type &= NVCV_THRESH_MASK; @@ -214,17 +215,105 @@ static void 
Threshold(std::vector &src, std::vector &dst, double thresh, d } } +// test for double +template<> +void Threshold(std::vector &src, std::vector &dst, double thresh, double maxval, uint32_t type) +{ + int automatic_thresh = (type & ~NVCV_THRESH_MASK); + type &= NVCV_THRESH_MASK; + + if (automatic_thresh == (NVCV_THRESH_OTSU | NVCV_THRESH_TRIANGLE) || automatic_thresh == NVCV_THRESH_OTSU + || automatic_thresh == NVCV_THRESH_TRIANGLE) + return; + dst.assign(src.begin(), src.end()); + + int size = src.size(); + switch (type) + { + case NVCV_THRESH_BINARY: + for (int i = 0; i < size; i++) dst[i] = src[i] > thresh ? maxval : 0; + break; + case NVCV_THRESH_BINARY_INV: + for (int i = 0; i < size; i++) dst[i] = src[i] <= thresh ? maxval : 0; + break; + case NVCV_THRESH_TRUNC: + for (int i = 0; i < size; i++) dst[i] = std::min(static_cast(src[i]), thresh); + break; + case NVCV_THRESH_TOZERO: + for (int i = 0; i < size; i++) dst[i] = src[i] > thresh ? src[i] : 0; + break; + case NVCV_THRESH_TOZERO_INV: + for (int i = 0; i < size; i++) dst[i] = src[i] <= thresh ? src[i] : 0; + break; + } +} + +void ThresholdWrapper(std::vector &src, std::vector &dst, double thresh, double maxval, uint32_t type, + NVCVDataType nvcvDataType) +{ + if (nvcvDataType == NVCV_DATA_TYPE_F64) + { + std::vector src_tmp(src.size() / sizeof(double)); + std::vector dst_tmp(dst.size() / sizeof(double)); + size_t copySize = src.size(); + memcpy(static_cast(src_tmp.data()), static_cast(src.data()), copySize); + memcpy(static_cast(dst_tmp.data()), static_cast(dst.data()), copySize); + Threshold(src_tmp, dst_tmp, thresh, maxval, type); + memcpy(static_cast(dst.data()), static_cast(dst_tmp.data()), copySize); + } + else + { + Threshold(src, dst, thresh, maxval, type); + } +} + +template +void myGenerate(T *src, std::size_t size, std::default_random_engine &randEng) +{ + std::uniform_int_distribution rand(0u, 255u); + for (std::size_t idx = 0; idx < size; ++idx) + { + src[idx] = rand(randEng); + } +} + +template<> +void myGenerate(double *src, std::size_t size, std::default_random_engine &randEng) +{ + std::uniform_real_distribution rand(0., 1.); + for (std::size_t idx = 0; idx < size; ++idx) + { + src[idx] = rand(randEng); + } +} +} // namespace + // clang-format off -NVCV_TEST_SUITE_P(OpThreshold, nvcv::test::ValueList +NVCV_TEST_SUITE_P(OpThreshold, nvcv::test::ValueList { - //batch, height, width, type, thresh, maxval - { 1, 480, 360, NVCV_THRESH_BINARY, 100, 255}, - { 5, 100, 100, NVCV_THRESH_BINARY_INV, 100, 255}, - { 4, 100, 101, NVCV_THRESH_TRUNC, 100, 255}, - { 3, 360, 480, NVCV_THRESH_TOZERO, 100, 255}, - { 2, 100, 101, NVCV_THRESH_TOZERO_INV, 100, 255}, - { 1, 800, 600, NVCV_THRESH_OTSU|NVCV_THRESH_BINARY, 100, 255}, - { 3, 600, 1000, NVCV_THRESH_TRIANGLE|NVCV_THRESH_BINARY_INV, 100, 255}, + //batch, height, width, type, thresh, maxval, format, + { 1, 480, 360, NVCV_THRESH_BINARY, 100, 255, nvcv::FMT_U8}, + { 1, 480, 360, NVCV_THRESH_BINARY, -1, 255, nvcv::FMT_U8}, + { 1, 480, 360, NVCV_THRESH_BINARY, 256, 255, nvcv::FMT_U8}, + { 1, 480, 360, NVCV_THRESH_BINARY, 0.5, 255, nvcv::FMT_F64}, + { 5, 100, 100, NVCV_THRESH_BINARY_INV, 100, 255, nvcv::FMT_U8}, + { 5, 100, 100, NVCV_THRESH_BINARY_INV, -1, 255, nvcv::FMT_U8}, + { 5, 100, 100, NVCV_THRESH_BINARY_INV, 256, 255, nvcv::FMT_U8}, + { 5, 100, 100, NVCV_THRESH_BINARY_INV, 0.5, 255, nvcv::FMT_F64}, + { 4, 100, 101, NVCV_THRESH_TRUNC, 100, 255, nvcv::FMT_U8}, + { 4, 100, 101, NVCV_THRESH_TRUNC, -1, 255, nvcv::FMT_U8}, + { 4, 100, 101, NVCV_THRESH_TRUNC, 256, 255, nvcv::FMT_U8}, 
+ { 4, 100, 101, NVCV_THRESH_TRUNC, 0.5, 255, nvcv::FMT_F64}, + { 3, 360, 480, NVCV_THRESH_TOZERO, 100, 255, nvcv::FMT_U8}, + { 3, 360, 480, NVCV_THRESH_TOZERO, -1, 255, nvcv::FMT_U8}, + { 3, 360, 480, NVCV_THRESH_TOZERO, 256, 255, nvcv::FMT_U8}, + { 3, 360, 480, NVCV_THRESH_TOZERO, 0.5, 255, nvcv::FMT_F64}, + { 2, 100, 101, NVCV_THRESH_TOZERO_INV, 100, 255, nvcv::FMT_U8}, + { 2, 100, 101, NVCV_THRESH_TOZERO_INV, -1, 255, nvcv::FMT_U8}, + { 2, 100, 101, NVCV_THRESH_TOZERO_INV, 256, 255, nvcv::FMT_U8}, + { 2, 100, 101, NVCV_THRESH_TOZERO_INV, 0.5, 255, nvcv::FMT_F64}, + { 1, 800, 600, NVCV_THRESH_OTSU|NVCV_THRESH_BINARY, 100, 255, nvcv::FMT_U8}, + { 3, 600, 1000, NVCV_THRESH_TRIANGLE|NVCV_THRESH_BINARY_INV, 100, 255, nvcv::FMT_U8}, }); // clang-format on @@ -234,16 +323,19 @@ TEST_P(OpThreshold, tensor_correct_output) cudaStream_t stream; EXPECT_EQ(cudaSuccess, cudaStreamCreate(&stream)); - int batch = GetParamValue<0>(); - int height = GetParamValue<1>(); - int width = GetParamValue<2>(); - uint32_t type = GetParamValue<3>(); - double thresh = GetParamValue<4>(); - double maxval = GetParamValue<5>(); + int batch = GetParamValue<0>(); + int height = GetParamValue<1>(); + int width = GetParamValue<2>(); + uint32_t type = GetParamValue<3>(); + double thresh = GetParamValue<4>(); + double maxval = GetParamValue<5>(); + nvcv::ImageFormat fmt = GetParamValue<6>(); - nvcv::ImageFormat fmt = nvcv::FMT_U8; - nvcv::Tensor imgIn = nvcv::util::CreateTensor(batch, width, height, fmt); - nvcv::Tensor imgOut = nvcv::util::CreateTensor(batch, width, height, fmt); + NVCVDataType nvcvDataType; + ASSERT_EQ(NVCV_SUCCESS, nvcvImageFormatGetPlaneDataType(fmt, 0, &nvcvDataType)); + + nvcv::Tensor imgIn = nvcv::util::CreateTensor(batch, width, height, fmt); + nvcv::Tensor imgOut = nvcv::util::CreateTensor(batch, width, height, fmt); auto inData = imgIn.exportData(); ASSERT_NE(nullptr, inData); @@ -295,9 +387,16 @@ TEST_P(OpThreshold, tensor_correct_output) for (int i = 0; i < batch; i++) { - std::uniform_int_distribution rand(0, 255); srcVec[i].resize(height * rowStride); - std::generate(srcVec[i].begin(), srcVec[i].end(), [&]() { return rand(randEng); }); + switch (nvcvDataType) + { + case NVCV_DATA_TYPE_F64: + myGenerate(reinterpret_cast(srcVec[i].data()), srcVec[i].size() / sizeof(double), randEng); + break; + default: + myGenerate(reinterpret_cast(srcVec[i].data()), srcVec[i].size(), randEng); + break; + } ASSERT_EQ(cudaSuccess, cudaMemcpy2D(inAccess->sampleData(i), inAccess->rowStride(), srcVec[i].data(), rowStride, rowStride, height, cudaMemcpyHostToDevice)); } @@ -319,7 +418,7 @@ TEST_P(OpThreshold, tensor_correct_output) rowStride, height, cudaMemcpyDeviceToHost)); std::vector goldVec(height * rowStride); - Threshold(srcVec[i], goldVec, thresh, maxval, type); + ThresholdWrapper(srcVec[i], goldVec, thresh, maxval, type, nvcvDataType); EXPECT_EQ(goldVec, testVec); } @@ -331,14 +430,17 @@ TEST_P(OpThreshold, varshape_correct_shape) cudaStream_t stream; EXPECT_EQ(cudaSuccess, cudaStreamCreate(&stream)); - int batch = GetParamValue<0>(); - int height = GetParamValue<1>(); - int width = GetParamValue<2>(); - uint32_t type = GetParamValue<3>(); - double thresh = GetParamValue<4>(); - double maxval = GetParamValue<5>(); + int batch = GetParamValue<0>(); + int height = GetParamValue<1>(); + int width = GetParamValue<2>(); + uint32_t type = GetParamValue<3>(); + double thresh = GetParamValue<4>(); + double maxval = GetParamValue<5>(); + nvcv::ImageFormat fmt = GetParamValue<6>(); + + NVCVDataType nvcvDataType; + 
ASSERT_EQ(NVCV_SUCCESS, nvcvImageFormatGetPlaneDataType(fmt, 0, &nvcvDataType)); - nvcv::ImageFormat fmt = nvcv::FMT_U8; // Create input and output std::default_random_engine randEng; std::uniform_int_distribution rndWidth(width * 0.8, width * 1.1); @@ -391,10 +493,16 @@ TEST_P(OpThreshold, varshape_correct_shape) int srcRowStride = srcWidth * fmt.planePixelStrideBytes(0); - std::uniform_int_distribution rand(0, 255); - srcVec[i].resize(srcHeight * srcRowStride); - std::generate(srcVec[i].begin(), srcVec[i].end(), [&]() { return rand(randEng); }); + switch (nvcvDataType) + { + case NVCV_DATA_TYPE_F64: + myGenerate(reinterpret_cast(srcVec[i].data()), srcVec[i].size() / sizeof(double), randEng); + break; + default: + myGenerate(reinterpret_cast(srcVec[i].data()), srcVec[i].size(), randEng); + break; + } // Copy input data to the GPU ASSERT_EQ(cudaSuccess, cudaMemcpy2D(srcData->plane(0).basePtr, srcData->plane(0).rowStride, srcVec[i].data(), @@ -429,9 +537,148 @@ TEST_P(OpThreshold, varshape_correct_shape) dstHeight, cudaMemcpyDeviceToHost)); std::vector goldVec(dstHeight * dstRowStride); - Threshold(srcVec[i], goldVec, thresh, maxval, type); + ThresholdWrapper(srcVec[i], goldVec, thresh, maxval, type, nvcvDataType); EXPECT_EQ(goldVec, testVec); } EXPECT_EQ(cudaSuccess, cudaStreamDestroy(stream)); } + +// clang-format off +NVCV_TEST_SUITE_P(OpThreshold_Negative, nvcv::test::ValueList +{ + //batch, height, width, type, thresh, maxval, inFormat, outFormat, threshDataType, maxvalType + { 1, 224, 224, NVCV_THRESH_BINARY, 100, 255, nvcv::FMT_F16, nvcv::FMT_F16, nvcv::TYPE_F64, nvcv::TYPE_F64}, + { 1, 224, 224, NVCV_THRESH_BINARY, 100, 255, nvcv::FMT_U8, nvcv::FMT_U16, nvcv::TYPE_F64, nvcv::TYPE_F64}, + { 1, 224, 224, NVCV_THRESH_BINARY, 100, 255, nvcv::FMT_U8, nvcv::FMT_U8, nvcv::TYPE_F32, nvcv::TYPE_F64}, + { 1, 224, 224, NVCV_THRESH_BINARY, 100, 255, nvcv::FMT_U8, nvcv::FMT_U8, nvcv::TYPE_F64, nvcv::TYPE_F32}, + { 1, 224, 224, NVCV_THRESH_TRIANGLE|NVCV_THRESH_OTSU|NVCV_THRESH_BINARY, 100, 255, nvcv::FMT_U8, nvcv::FMT_U8, nvcv::TYPE_F64, nvcv::TYPE_F64}, + { 1, 224, 224, NVCV_THRESH_TRUNC|NVCV_THRESH_BINARY, 100, 255, nvcv::FMT_U8, nvcv::FMT_U8, nvcv::TYPE_F64, nvcv::TYPE_F64}, + { 1, 224, 224, NVCV_THRESH_OTSU|NVCV_THRESH_BINARY, 100, 255, nvcv::FMT_U16, nvcv::FMT_U16, nvcv::TYPE_F64, nvcv::TYPE_F64}, + { 1, 224, 224, NVCV_THRESH_OTSU|NVCV_THRESH_BINARY, 100, 255, nvcv::FMT_RGB8, nvcv::FMT_RGB8, nvcv::TYPE_F64, nvcv::TYPE_F64}, + { 1, 224, 224, NVCV_THRESH_TRIANGLE|NVCV_THRESH_BINARY, 100, 255, nvcv::FMT_U16, nvcv::FMT_U16, nvcv::TYPE_F64, nvcv::TYPE_F64}, + { 1, 224, 224, NVCV_THRESH_TRIANGLE|NVCV_THRESH_BINARY, 100, 255, nvcv::FMT_RGB8, nvcv::FMT_RGB8, nvcv::TYPE_F64, nvcv::TYPE_F64}, +}); + +// clang-format on + +TEST(OpThreshold_Negative, create_with_null_handle) +{ + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, cvcudaThresholdCreate(nullptr, NVCV_THRESH_BINARY, 5)); +} + +TEST(OpThreshold_Negative, create_with_negative_maxBatchSize) +{ + NVCVOperatorHandle thresholdHandle; + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, cvcudaThresholdCreate(&thresholdHandle, NVCV_THRESH_BINARY, -1)); +} + +TEST_P(OpThreshold_Negative, invalid_inputs) +{ + cudaStream_t stream; + EXPECT_EQ(cudaSuccess, cudaStreamCreate(&stream)); + + int batch = GetParamValue<0>(); + int height = GetParamValue<1>(); + int width = GetParamValue<2>(); + uint32_t type = GetParamValue<3>(); + double thresh = GetParamValue<4>(); + double maxval = GetParamValue<5>(); + nvcv::ImageFormat inFormat = GetParamValue<6>(); + nvcv::ImageFormat 
outFormat = GetParamValue<7>(); + nvcv::DataType threshDataType = GetParamValue<8>(); + nvcv::DataType maxvalDataType = GetParamValue<9>(); + + nvcv::Tensor imgIn = nvcv::util::CreateTensor(batch, width, height, inFormat); + nvcv::Tensor imgOut = nvcv::util::CreateTensor(batch, width, height, outFormat); + + //parameters + nvcv::Tensor threshval({{batch}, "N"}, threshDataType); + nvcv::Tensor maxvalval({{batch}, "N"}, maxvalDataType); + + auto threshData = threshval.exportData(); + auto maxvalData = maxvalval.exportData(); + + ASSERT_NE(nullptr, threshData); + ASSERT_NE(nullptr, maxvalData); + + std::vector threshVec(batch, thresh); + std::vector maxvalVec(batch, maxval); + + // Copy vectors to the GPU + ASSERT_EQ(cudaSuccess, cudaMemcpyAsync(threshData->basePtr(), threshVec.data(), threshVec.size() * sizeof(double), + cudaMemcpyHostToDevice, stream)); + ASSERT_EQ(cudaSuccess, cudaMemcpyAsync(maxvalData->basePtr(), maxvalVec.data(), maxvalVec.size() * sizeof(double), + cudaMemcpyHostToDevice, stream)); + + // Call operator + int maxBatch = 5; + cvcuda::Threshold thresholdOp(type, maxBatch); + EXPECT_ANY_THROW(thresholdOp(stream, imgIn, imgOut, threshval, maxvalval)); + + EXPECT_EQ(cudaSuccess, cudaStreamSynchronize(stream)); + EXPECT_EQ(cudaSuccess, cudaStreamDestroy(stream)); +} + +TEST_P(OpThreshold_Negative, varshape_invalid_inputs) +{ + cudaStream_t stream; + EXPECT_EQ(cudaSuccess, cudaStreamCreate(&stream)); + + int batch = GetParamValue<0>(); + int height = GetParamValue<1>(); + int width = GetParamValue<2>(); + uint32_t type = GetParamValue<3>(); + double thresh = GetParamValue<4>(); + double maxval = GetParamValue<5>(); + nvcv::ImageFormat inFormat = GetParamValue<6>(); + nvcv::ImageFormat outFormat = GetParamValue<7>(); + nvcv::DataType threshDataType = GetParamValue<8>(); + nvcv::DataType maxvalDataType = GetParamValue<9>(); + + // Create input and output + std::default_random_engine randEng; + std::uniform_int_distribution rndWidth(width * 0.8, width * 1.1); + std::uniform_int_distribution rndHeight(height * 0.8, height * 1.1); + + std::vector imgSrc, imgDst; + for (int i = 0; i < batch; ++i) + { + int rw = rndWidth(randEng); + int rh = rndHeight(randEng); + imgSrc.emplace_back(nvcv::Size2D{rw, rh}, inFormat); + imgDst.emplace_back(nvcv::Size2D{rw, rh}, outFormat); + } + + nvcv::ImageBatchVarShape batchSrc(batch); + batchSrc.pushBack(imgSrc.begin(), imgSrc.end()); + + nvcv::ImageBatchVarShape batchDst(batch); + batchDst.pushBack(imgDst.begin(), imgDst.end()); + + //parameters + nvcv::Tensor threshval({{batch}, "N"}, threshDataType); + nvcv::Tensor maxvalval({{batch}, "N"}, maxvalDataType); + + auto threshData = threshval.exportData(); + auto maxvalData = maxvalval.exportData(); + + ASSERT_NE(nullptr, threshData); + ASSERT_NE(nullptr, maxvalData); + + std::vector threshVec(batch, thresh); + std::vector maxvalVec(batch, maxval); + + // Copy vectors to the GPU + ASSERT_EQ(cudaSuccess, cudaMemcpyAsync(threshData->basePtr(), threshVec.data(), threshVec.size() * sizeof(double), + cudaMemcpyHostToDevice, stream)); + ASSERT_EQ(cudaSuccess, cudaMemcpyAsync(maxvalData->basePtr(), maxvalVec.data(), maxvalVec.size() * sizeof(double), + cudaMemcpyHostToDevice, stream)); + // Call operator + int maxBatch = 5; + cvcuda::Threshold thresholdOp(type, maxBatch); + EXPECT_ANY_THROW(thresholdOp(stream, batchSrc, batchDst, threshval, maxvalval)); + + EXPECT_EQ(cudaSuccess, cudaStreamSynchronize(stream)); + EXPECT_EQ(cudaSuccess, cudaStreamDestroy(stream)); +} diff --git 
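The negative tests above (for RandomResizedCrop and Threshold) rely on invalid formats and flag combinations being rejected at call time, either as a status code or as a thrown exception. A minimal sketch of the two assertion patterns they use, with `someOp`, `badInput` and `badOutput` as hypothetical stand-ins:

    // Sketch only: converting an operator failure into a status code for the
    // assertion, as the RandomResizedCrop negative test does. Any nvcv::Exception
    // thrown inside the lambda is returned as its NVCVStatus.
    NVCVStatus status = nvcv::ProtectCall([&] { someOp(stream, badInput, badOutput); });
    EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, status);

    // When only the fact of failure matters, the Threshold negative tests simply do:
    EXPECT_ANY_THROW(someOp(stream, badInput, badOutput));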
a/tests/nvcv_types/cudatools_system/CMakeLists.txt b/tests/nvcv_types/cudatools_system/CMakeLists.txt index fd97891b6..3779482ec 100644 --- a/tests/nvcv_types/cudatools_system/CMakeLists.txt +++ b/tests/nvcv_types/cudatools_system/CMakeLists.txt @@ -45,6 +45,8 @@ add_executable(nvcv_test_cudatools_system TestTypeTraits.cpp TestMetaprogramming.cpp TestArrayWrap.cpp + TestTensorBatchWrap.cpp + DeviceTensorBatchWrap.cu ) target_link_libraries(nvcv_test_cudatools_system diff --git a/tests/nvcv_types/cudatools_system/DeviceTensorBatchWrap.cu b/tests/nvcv_types/cudatools_system/DeviceTensorBatchWrap.cu new file mode 100644 index 000000000..fa1a3b1f1 --- /dev/null +++ b/tests/nvcv_types/cudatools_system/DeviceTensorBatchWrap.cu @@ -0,0 +1,152 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "DeviceTensorBatchWrap.hpp" + +#include // for EXPECT_EQ, etc. +#include // for operator == to allow EXPECT_EQ +#include // for StaticCast, etc. +#include + +namespace cuda = nvcv::cuda; + +template +__device__ T *tensor_ptr(TensorWrap &tensor, int *coords, Coords... vcoords) +{ + if constexpr (sizeof...(Coords) == NDIM) + { + return tensor.ptr(vcoords...); + } + else + { + return tensor_ptr(tensor, coords, vcoords..., coords[sizeof...(Coords)]); + } +} + +template +__device__ void SetThroughTensor::Set(TensorBatchWrapT wrap, int sample, int *coords, T value) +{ + auto tensor = wrap.tensor(sample); + *tensor_ptr(tensor, coords) = value; +} + +template +__device__ void SetThroughSubscript::Set(TensorBatchWrapT wrap, int sample, int *coords, T value) +{ + constexpr int NDIM = TensorBatchWrapT::kNumDimensions; + if constexpr (NDIM == 1) + { + wrap[int2{sample, coords[0]}] = value; + } + else if constexpr (NDIM == 2) + { + wrap[int3{sample, coords[1], coords[0]}] = value; + } + else if constexpr (NDIM == 3) + { + wrap[int4{sample, coords[2], coords[1], coords[0]}] = value; + } +} + +template +__device__ void SetThroughPtrHelper(TensorBatchWrapT wrap, int sample, int *coords, T value, VCoords... vcoords) +{ + constexpr int NDIM = TensorBatchWrapT::kNumDimensions; + if constexpr (sizeof...(VCoords) == 0) + { + SetThroughPtrHelper(wrap, sample, coords, value, sample, coords[0]); + } + else if constexpr (sizeof...(VCoords) < NDIM + 1) + { + SetThroughPtrHelper(wrap, sample, coords, value, vcoords..., coords[sizeof...(VCoords) - 1]); + } + else + { + *wrap.ptr(vcoords...) 
= value; + } +} + +template +__device__ void SetThroughPtr::Set(TensorBatchWrapT wrap, int sample, int *coords, T value) +{ + SetThroughPtrHelper(wrap, sample, coords, value); +} + +template +__global__ void SetReferenceKernel(TensorBatchWrapT wrap) +{ + int sample = blockIdx.x; + const int64_t *shape = wrap.shape(sample); + int id = threadIdx.x; + int64_t tensorVol = 1; + const int ndim = TensorBatchWrapT::kNumDimensions; + for (int d = 0; d < ndim; ++d) + { + tensorVol *= shape[d]; + } + for (int index = id; index < tensorVol; index += blockDim.x) + { + int coords[ndim]; + int tmp_i = index; + for (int d = ndim - 1; d >= 0; --d) + { + coords[d] = tmp_i % shape[d]; + tmp_i /= shape[d]; + } + SetValue::Set(wrap, sample, coords, cuda::SetAll(index % 255)); + } +} + +template +void SetReference(TensorBatchWrapT wrap, cudaStream_t stream) +{ + int blocks = wrap.numTensors(); + SetReferenceKernel<<>>(wrap); +} + +#define SetReferenceSpec(SET_VALUE, TENSOR_BATCH_TYPE) \ + template __device__ void SET_VALUE::Set( \ + TENSOR_BATCH_TYPE, int, int *, TENSOR_BATCH_TYPE::ValueType); \ + template void SetReference, TENSOR_BATCH_TYPE>( \ + TENSOR_BATCH_TYPE, cudaStream_t) + +#define TB_PARAMS1 uchar1, -1, 32 * sizeof(uchar1), sizeof(uchar1) +SetReferenceSpec(SetThroughTensor, cuda::TensorBatchWrap); + +#define TB_PARAMS2 double4, 8 * sizeof(double4), sizeof(double4) +SetReferenceSpec(SetThroughTensor, cuda::TensorBatchWrap); + +#define TB_PARAMS3 float3, -1, -1, 8 * sizeof(float3), sizeof(float3) +SetReferenceSpec(SetThroughTensor, cuda::TensorBatchWrap); + +#define TB_PARAMS4 uchar2, sizeof(uchar2) +SetReferenceSpec(SetThroughSubscript, cuda::TensorBatchWrap); + +#define TB_PARAMS5 int3, -1, 16 * sizeof(int3), sizeof(int3) +SetReferenceSpec(SetThroughSubscript, cuda::TensorBatchWrap); + +#define TB_PARAMS6 ushort4, -1, sizeof(ushort4) +SetReferenceSpec(SetThroughSubscript, cuda::TensorBatchWrap); + +#define TB_PARAMS7 uchar4, -1, -1, 32 * sizeof(uchar4), sizeof(uchar4) +SetReferenceSpec(SetThroughPtr, cuda::TensorBatchWrap); + +#define TB_PARAMS8 float1, -1, -1, -1, 8 * sizeof(float1), sizeof(float1) +SetReferenceSpec(SetThroughPtr, cuda::TensorBatchWrap); + +#define TB_PARAMS9 float4, sizeof(float4) +SetReferenceSpec(SetThroughPtr, cuda::TensorBatchWrap); diff --git a/tests/nvcv_types/cudatools_system/DeviceTensorBatchWrap.hpp b/tests/nvcv_types/cudatools_system/DeviceTensorBatchWrap.hpp new file mode 100644 index 000000000..2f98460b3 --- /dev/null +++ b/tests/nvcv_types/cudatools_system/DeviceTensorBatchWrap.hpp @@ -0,0 +1,42 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +namespace cuda = nvcv::cuda; + +template +void SetReference(TensorBatchWrapT wrap, cudaStream_t stream); + +template +struct SetThroughTensor +{ + static __device__ void Set(TensorBatchWrapT wrap, int sample, int *coords, T value); +}; + +template +struct SetThroughSubscript +{ + static __device__ void Set(TensorBatchWrapT wrap, int sample, int *coords, T value); +}; + +template +struct SetThroughPtr +{ + static __device__ void Set(TensorBatchWrapT wrap, int sample, int *coords, T value); +}; diff --git a/tests/nvcv_types/cudatools_system/TestTensorBatchWrap.cpp b/tests/nvcv_types/cudatools_system/TestTensorBatchWrap.cpp new file mode 100644 index 000000000..50d6fbf64 --- /dev/null +++ b/tests/nvcv_types/cudatools_system/TestTensorBatchWrap.cpp @@ -0,0 +1,175 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "DeviceTensorBatchWrap.hpp" + +#include // for NVCV_INSTANTIATE_TEST_SUITE_P, etc. +#include // for NVCV_MIXTYPED_TEST_SUITE_P, etc. +#include // for StringLiteral +#include // for Image, etc. +#include // for TensorBatch +#include // for TensorDataAccessStridedImagePlanar, etc. 
+#include // for operator == to allow EXPECT_EQ +#include + +#include +#include +#include + +namespace t = ::testing; +namespace test = nvcv::test; +namespace cuda = nvcv::cuda; +namespace ttype = nvcv::test::type; + +static constexpr int kMaxDim = 50; + +template +nvcv::Tensor GetRandomTensor(R &rg, nvcv::DataType dtype, cudaStream_t stream) +{ + std::uniform_int_distribution shape_dist(kMaxDim / 2, kMaxDim); + nvcv::TensorShape::ShapeType shapeData(NDIM); + for (auto &d : shapeData) + { + d = shape_dist(rg); + } + if (INNER_DIM != -1) + { + shapeData[NDIM - 1] = INNER_DIM; + } + auto t = nvcv::Tensor(nvcv::TensorShape(shapeData, ""), dtype); + return t; +} + +template +void VerifyTensorHelper(NVCVByte *data, const int64_t *shape, const int64_t *stride, int64_t startIndex = 0) +{ + if constexpr (N == NDIM) + { + auto gold = cuda::SetAll(startIndex % 255); + auto value = *reinterpret_cast(data); + ASSERT_EQ(value, gold); + } + else + { + int64_t indexStride = 1; + for (int i = 1; i + N < NDIM; ++i) + { + indexStride *= shape[i]; + } + for (int i = 0; i < shape[0]; ++i) + { + VerifyTensorHelper(data, shape + 1, stride + 1, startIndex + i * indexStride); + data += stride[0]; + } + } +} + +template +void VerifyTensor(const nvcv::Tensor &tensor, cudaStream_t stream) +{ + auto data = tensor.exportData().cdata(); + auto bufferSize = data.shape[0] * data.buffer.strided.strides[0]; + std::vector hostBuffer(bufferSize); + ASSERT_EQ( + cudaMemcpyAsync(hostBuffer.data(), data.buffer.strided.basePtr, bufferSize, cudaMemcpyDeviceToHost, stream), + cudaSuccess); + ASSERT_EQ(cudaStreamSynchronize(stream), cudaSuccess); + VerifyTensorHelper(hostBuffer.data(), &data.shape[0], &data.buffer.strided.strides[0]); +} + +template +struct TensorBatchWrapHelper +{ + using type = TensorBatchWrapHelper::type; +}; + +template +struct TensorBatchWrapHelper +{ + using type + = TensorBatchWrapHelper::type; +}; + +template +struct TensorBatchWrapHelper +{ + using type = cuda::TensorBatchWrap; +}; + +template +struct TensorBatchWrapHelper +{ + using type = cuda::TensorBatchWrap; +}; + +template +using TensorBatchWrapHelperT = typename TensorBatchWrapHelper::type; + +#define NVCV_TEST_ROW(NUM_TENSORS, DTYPE, TYPE, NDIM, INNER_DIM, SET_VALUE_METHOD) \ + ttype::Types, ttype::Value, TYPE, ttype::Value, ttype::Value, \ + SET_VALUE_METHOD, TYPE>> + +NVCV_TYPED_TEST_SUITE(TensorBatchWrapTensorTest, + ttype::Types); + +#undef NVCV_TEST_ROW + +TYPED_TEST(TensorBatchWrapTensorTest, correct_content) +{ + int numTensors = ttype::GetValue; + nvcv::DataType dtype{ttype::GetValue}; + using T = ttype::GetType; + constexpr int NDIM = ttype::GetValue; + constexpr int INNER_DIM = ttype::GetValue; + using SET_METHOD = ttype::GetType; + using TensorBatchWrapT = TensorBatchWrapHelperT; + + cudaStream_t stream; + ASSERT_EQ(cudaSuccess, cudaStreamCreate(&stream)); + nvcv::TensorBatch tensorBatch(numTensors); + std::vector tensors; + + std::mt19937 rg{231}; + for (int i = 0; i < tensorBatch.capacity(); ++i) + { + auto t = GetRandomTensor(rg, dtype, stream); + ASSERT_EQ(t.rank(), NDIM); + tensors.push_back(t); + } + + tensorBatch.pushBack(tensors.begin(), tensors.end()); + + auto tensorBatchData = tensorBatch.exportData(stream).cast(); + ASSERT_TRUE(tensorBatchData.hasValue()); + + auto wrap = TensorBatchWrapT(*tensorBatchData); + SetReference(wrap, stream); + ASSERT_EQ(cudaStreamSynchronize(stream), cudaSuccess); + + for (auto &tensor : tensors) + { + VerifyTensor(tensor, stream); + } +} diff --git 
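The TensorBatchWrap tests above exercise three ways of addressing one element of a batched tensor from device code. A condensed sketch for the rank-2 (HW) case, with `WrapT` standing for a cuda::TensorBatchWrap instantiation and the coordinates assumed to be in range:

    // Sketch: the three access paths covered by SetThroughTensor,
    // SetThroughSubscript and SetThroughPtr above, written out for one sample.
    template<typename WrapT>
    __device__ void WriteOne(WrapT wrap, int sample, int row, int col,
                             typename WrapT::ValueType v)
    {
        *wrap.tensor(sample).ptr(row, col) = v; // via the per-sample TensorWrap
        wrap[int3{sample, col, row}]       = v; // subscript takes the innermost coordinate first
        *wrap.ptr(sample, row, col)        = v; // direct pointer access on the batch wrap
    }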
a/tests/nvcv_types/python/nvcv_test_types_python.in b/tests/nvcv_types/python/nvcv_test_types_python.in index 2abb0fc1c..ee9d70f8c 100755 --- a/tests/nvcv_types/python/nvcv_test_types_python.in +++ b/tests/nvcv_types/python/nvcv_test_types_python.in @@ -15,25 +15,32 @@ # See the License for the specific language governing permissions and # limitations under the License. + tests_dir="@PYTHON_TEST_DIR@" -python_versions="@PYTHON_TEST_VERSIONS@" +python_versions_tentative="@PYTHON_TEST_VERSIONS@" + +python_versions="" # Verify if correct package dependencies are installed -------- pip_depends="pytest torch" -declare -a install_commands - -for ver in $python_versions; do +# Collect all python versions that are indeed installed and have proper dependencies installed +# Two behaviors: +# - default: skip Python versions that are not installed or don't have pytest and torch installed +# - if NVCV_FORCE_PYTHON is set: exit with error +for ver in $python_versions_tentative; do if ! python$ver -c "import pytest, torch" > /dev/null 2>&1; then - install_commands+=("sudo python$ver -m pip install $pip_depends") + echo "WARNING: Python version $ver not installed or missing proper dependencies" + echo "Please install Python version $ver and run the following commands before running tests: sudo python$ver -m pip install $pip_depends" + if [[ "$NVCV_FORCE_PYTHON" == 1 || "$NVCV_FORCE_PYTHON" == yes ]]; then + exit 1 #hard exit + fi + else + echo "Found Python version $ver installed with proper dependencies, adding to tests" + python_versions+="$ver " fi done -if [[ "${install_commands[*]}" ]]; then - echo "Please run the following commands before running $(basename $0): " - ( IFS=$'\n'; echo -e "${install_commands[*]}" ) - exit 1 -fi # Run tests -------- diff --git a/tests/nvcv_types/system/TestAllocatorC.cpp b/tests/nvcv_types/system/TestAllocatorC.cpp index 6b47c25eb..8d7fe0bbf 100644 --- a/tests/nvcv_types/system/TestAllocatorC.cpp +++ b/tests/nvcv_types/system/TestAllocatorC.cpp @@ -82,6 +82,10 @@ TEST(AllocatorTest, CreateAndUseCustom) ASSERT_EQ(nvcvAllocatorConstructCustom(allocators, 2, &halloc), NVCV_SUCCESS); ASSERT_NE(halloc, nullptr); + int refCount = 0; + EXPECT_EQ(nvcvAllocatorRefCount(halloc, &refCount), NVCV_SUCCESS); + EXPECT_EQ(refCount, 1); + for (int i = 0; i < 2; i++) { NVCVResourceAllocator alloc = {}; @@ -216,3 +220,183 @@ TEST(Allocator, smoke_test_custom_functors) myalloc1.cudaMem().free((void *)1, 7); EXPECT_EQ(1, devCounter); } + +TEST(AllocatorTest, smoke_user_pointer) +{ + NVCVResourceAllocator allocators[1] = {}; + + int ctx0 = 100; + + allocators[0].resType = NVCV_RESOURCE_MEM_HOST; + allocators[0].ctx = &ctx0; + allocators[0].res.mem.fnAlloc = [](void *ctx, int64_t size, int32_t align) + { + *(int *)ctx += 1; + return memalign(align, size); + }; + allocators[0].res.mem.fnFree = [](void *ctx, void *ptr, int64_t size, int32_t align) + { + *(int *)ctx += 10; + free(ptr); + }; + allocators[0].cleanup = [](void *ctx, NVCVResourceAllocator *alloc) + { + EXPECT_EQ(ctx, alloc->ctx); + int *ctx_int = static_cast(ctx); + *ctx_int = 0xDEAD; + }; + + NVCVAllocatorHandle halloc = nullptr; + ASSERT_EQ(nvcvAllocatorConstructCustom(allocators, 1, &halloc), NVCV_SUCCESS); + ASSERT_NE(halloc, nullptr); + + void *userPtr; + ASSERT_EQ(nvcvAllocatorGetUserPointer(halloc, &userPtr), NVCV_SUCCESS); + EXPECT_EQ(nullptr, userPtr); + + ASSERT_EQ(nvcvAllocatorSetUserPointer(halloc, (void *)0x123), NVCV_SUCCESS); + ASSERT_EQ(nvcvAllocatorGetUserPointer(halloc, &userPtr), NVCV_SUCCESS); + EXPECT_EQ((void 
*)0x123, userPtr); + + ASSERT_EQ(nvcvAllocatorSetUserPointer(halloc, nullptr), NVCV_SUCCESS); + ASSERT_EQ(nvcvAllocatorGetUserPointer(halloc, &userPtr), NVCV_SUCCESS); + EXPECT_EQ(nullptr, userPtr); + + int newRef = 1; + EXPECT_EQ(nvcvAllocatorDecRef(halloc, &newRef), NVCV_SUCCESS); + EXPECT_EQ(newRef, 0); +} + +TEST(AllocatorTest, invalid_arguments_api_calls) +{ + NVCVResourceAllocator allocators[2] = {}; + + allocators[0].resType = NVCV_RESOURCE_MEM_HOST; + allocators[0].res.mem.fnAlloc = [](void *ctx, int64_t size, int32_t align) + { + return memalign(align, size); + }; + allocators[0].res.mem.fnFree = [](void *ctx, void *ptr, int64_t size, int32_t align) + { + free(ptr); + }; + allocators[0].cleanup = [](void *ctx, NVCVResourceAllocator *alloc) { + }; + + allocators[1].resType = NVCV_RESOURCE_MEM_CUDA; + allocators[1].res.mem.fnAlloc = [](void *ctx, int64_t size, int32_t align) + { + void *mem; + EXPECT_EQ(cudaMalloc(&mem, size), cudaSuccess); + return mem; + }; + allocators[1].res.mem.fnFree = [](void *ctx, void *ptr, int64_t size, int32_t align) + { + EXPECT_EQ(cudaFree(ptr), cudaSuccess); + }; + allocators[1].cleanup = [](void *ctx, NVCVResourceAllocator *alloc) { + }; + + NVCVAllocatorHandle halloc = nullptr; + // 1. Pointer to output handle must not be NULL + EXPECT_EQ(nvcvAllocatorConstructCustom(allocators, 2, nullptr), NVCV_ERROR_INVALID_ARGUMENT); + ASSERT_EQ(nvcvAllocatorConstructCustom(allocators, 2, &halloc), NVCV_SUCCESS); + ASSERT_NE(halloc, nullptr); + + // 2. Pointer to output user pointer cannot be NULL + EXPECT_EQ(nvcvAllocatorGetUserPointer(halloc, nullptr), NVCV_ERROR_INVALID_ARGUMENT); + + // 3. Pointer to output buffer must not be NULL + EXPECT_EQ(nvcvAllocatorAllocHostMemory(halloc, nullptr, (1 << 10), 256), NVCV_ERROR_INVALID_ARGUMENT); + EXPECT_EQ(nvcvAllocatorAllocHostPinnedMemory(halloc, nullptr, (1 << 10), 256), NVCV_ERROR_INVALID_ARGUMENT); + EXPECT_EQ(nvcvAllocatorAllocCudaMemory(halloc, nullptr, (1 << 10), 256), NVCV_ERROR_INVALID_ARGUMENT); + + // 4. allocHostMem + void *p0 = nullptr; + EXPECT_EQ(nvcvAllocatorAllocHostMemory(halloc, &p0, -1, 256), NVCV_ERROR_INVALID_ARGUMENT); + EXPECT_EQ(nvcvAllocatorAllocHostMemory(halloc, &p0, (1 << 10), 3), NVCV_ERROR_INVALID_ARGUMENT); + EXPECT_EQ(nvcvAllocatorAllocHostMemory(halloc, &p0, 128, 256), NVCV_ERROR_INVALID_ARGUMENT); + + // 5. allocHostPinnedMem + EXPECT_EQ(nvcvAllocatorAllocHostPinnedMemory(halloc, &p0, -1, 256), NVCV_ERROR_INVALID_ARGUMENT); + EXPECT_EQ(nvcvAllocatorAllocHostPinnedMemory(halloc, &p0, (1 << 10), 3), NVCV_ERROR_INVALID_ARGUMENT); + EXPECT_EQ(nvcvAllocatorAllocHostPinnedMemory(halloc, &p0, 128, 256), NVCV_ERROR_INVALID_ARGUMENT); + + // 6. allocHostPinnedMem + EXPECT_EQ(nvcvAllocatorAllocCudaMemory(halloc, &p0, -1, 256), NVCV_ERROR_INVALID_ARGUMENT); + EXPECT_EQ(nvcvAllocatorAllocCudaMemory(halloc, &p0, (1 << 10), 3), NVCV_ERROR_INVALID_ARGUMENT); + EXPECT_EQ(nvcvAllocatorAllocCudaMemory(halloc, &p0, 128, 256), NVCV_ERROR_INVALID_ARGUMENT); + + int newRef = 1; + EXPECT_EQ(nvcvAllocatorDecRef(halloc, &newRef), NVCV_SUCCESS); + EXPECT_EQ(newRef, 0); +} + +TEST(AllocatorTest, customAllocator_constructor_negative) +{ + NVCVResourceAllocator invalidFnAllocAllocator[1] = {}; + NVCVResourceAllocator invalidFnFreeAllocator[1] = {}; + NVCVResourceAllocator duplicatedResourceTypeAllocator[2] = {}; + + // 1. 
allocation function must not be NULL + invalidFnAllocAllocator[0].resType = NVCV_RESOURCE_MEM_HOST; + invalidFnAllocAllocator[0].res.mem.fnFree = [](void *ctx, void *ptr, int64_t size, int32_t align) + { + free(ptr); + }; + invalidFnAllocAllocator[0].cleanup = [](void *ctx, NVCVResourceAllocator *alloc) { + }; + + NVCVAllocatorHandle halloc = nullptr; + + EXPECT_EQ(nvcvAllocatorConstructCustom(invalidFnAllocAllocator, 1, &halloc), NVCV_ERROR_INVALID_ARGUMENT); + + // 2. deallocation function must not be NULL + invalidFnFreeAllocator[0].resType = NVCV_RESOURCE_MEM_CUDA; + invalidFnFreeAllocator[0].res.mem.fnAlloc = [](void *ctx, int64_t size, int32_t align) + { + void *mem; + EXPECT_EQ(cudaMalloc(&mem, size), cudaSuccess); + return mem; + }; + invalidFnFreeAllocator[0].cleanup = [](void *ctx, NVCVResourceAllocator *alloc) { + }; + + EXPECT_EQ(nvcvAllocatorConstructCustom(invalidFnFreeAllocator, 1, &halloc), NVCV_ERROR_INVALID_ARGUMENT); + + // 3. duplicated resource type + duplicatedResourceTypeAllocator[0].resType = NVCV_RESOURCE_MEM_HOST; + duplicatedResourceTypeAllocator[0].res.mem.fnAlloc = [](void *ctx, int64_t size, int32_t align) + { + return memalign(align, size); + }; + duplicatedResourceTypeAllocator[0].res.mem.fnFree = [](void *ctx, void *ptr, int64_t size, int32_t align) + { + free(ptr); + }; + duplicatedResourceTypeAllocator[0].cleanup = [](void *ctx, NVCVResourceAllocator *alloc) { + }; + + duplicatedResourceTypeAllocator[1].resType = NVCV_RESOURCE_MEM_HOST; + duplicatedResourceTypeAllocator[1].res.mem.fnAlloc = [](void *ctx, int64_t size, int32_t align) + { + return memalign(align, size); + }; + duplicatedResourceTypeAllocator[1].res.mem.fnFree = [](void *ctx, void *ptr, int64_t size, int32_t align) + { + free(ptr); + }; + duplicatedResourceTypeAllocator[1].cleanup = [](void *ctx, NVCVResourceAllocator *alloc) { + }; + + EXPECT_EQ(nvcvAllocatorConstructCustom(duplicatedResourceTypeAllocator, 2, &halloc), NVCV_ERROR_INVALID_ARGUMENT); +} + +TEST(AllocatorTest, get_name) +{ + EXPECT_STREQ("NVCV_RESOURCE_MEM_CUDA", nvcvResourceTypeGetName(NVCV_RESOURCE_MEM_CUDA)); + EXPECT_STREQ("NVCV_RESOURCE_MEM_HOST", nvcvResourceTypeGetName(NVCV_RESOURCE_MEM_HOST)); + EXPECT_STREQ("NVCV_RESOURCE_MEM_HOST_PINNED", nvcvResourceTypeGetName(NVCV_RESOURCE_MEM_HOST_PINNED)); + EXPECT_STREQ("Unexpected error retrieving NVCVResourceType string representation", + nvcvResourceTypeGetName(static_cast(255))); +} diff --git a/tests/nvcv_types/system/TestArray.cpp b/tests/nvcv_types/system/TestArray.cpp index d0f9f5297..cbc67ab5d 100644 --- a/tests/nvcv_types/system/TestArray.cpp +++ b/tests/nvcv_types/system/TestArray.cpp @@ -208,3 +208,268 @@ TEST(ArrayTests, smoke_create_allocator) EXPECT_EQ(32, data->stride()); } + +TEST(ArrayTests, invalid_outputs_calcReq) +{ + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, nvcvArrayCalcRequirements(16, NVCV_DATA_TYPE_U8, 0, nullptr)); + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, + nvcvArrayCalcRequirementsWithTarget(16, NVCV_DATA_TYPE_U8, 0, NVCV_RESOURCE_MEM_HOST, nullptr)); +} + +TEST(ArrayTests, invalid_alignment_calcReq_with_target) +{ + NVCVArrayRequirements req; + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, + nvcvArrayCalcRequirementsWithTarget(16, NVCV_DATA_TYPE_U8, 7, NVCV_RESOURCE_MEM_HOST, &req)); +} + +TEST(ArrayTests, invalid_input_construct) +{ + NVCVArrayRequirements req; + NVCVArrayHandle arrayHandle; + EXPECT_EQ(NVCV_SUCCESS, nvcvArrayCalcRequirements(16, NVCV_DATA_TYPE_U8, 0, &req)); + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, nvcvArrayConstruct(nullptr, nullptr, 
&arrayHandle)); + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, nvcvArrayConstruct(&req, nullptr, nullptr)); +} + +TEST(ArrayTests, valid_construct) +{ + NVCVArrayRequirements req; + NVCVArrayHandle arrayHandle; + EXPECT_EQ(NVCV_SUCCESS, nvcvArrayCalcRequirements(16, NVCV_DATA_TYPE_U8, 0, &req)); + EXPECT_EQ(NVCV_SUCCESS, nvcvArrayConstruct(&req, nullptr, &arrayHandle)); + EXPECT_EQ(NVCV_SUCCESS, nvcvArrayDecRef(arrayHandle, nullptr)); +} + +TEST(ArrayTests, valid_construct_with_target) +{ + NVCVArrayRequirements req; + NVCVArrayHandle arrayHandle; + EXPECT_EQ(NVCV_SUCCESS, + nvcvArrayCalcRequirementsWithTarget(16, NVCV_DATA_TYPE_U8, 0, NVCV_RESOURCE_MEM_HOST_PINNED, &req)); + EXPECT_EQ(NVCV_SUCCESS, nvcvArrayConstructWithTarget(&req, nullptr, NVCV_RESOURCE_MEM_HOST_PINNED, &arrayHandle)); + EXPECT_EQ(NVCV_SUCCESS, nvcvArrayDecRef(arrayHandle, nullptr)); +} + +TEST(ArrayTests, mismatch_construct_with_target) +{ + NVCVArrayRequirements req; + NVCVArrayHandle arrayHandle; + int64_t capacity = -1, length = -1; + NVCVResourceType target; + NVCVDataType dType; + EXPECT_EQ(NVCV_SUCCESS, + nvcvArrayCalcRequirementsWithTarget(16, NVCV_DATA_TYPE_U8, 0, NVCV_RESOURCE_MEM_CUDA, &req)); + EXPECT_EQ(NVCV_SUCCESS, nvcvArrayConstructWithTarget(&req, nullptr, NVCV_RESOURCE_MEM_HOST, &arrayHandle)); + + EXPECT_EQ(NVCV_SUCCESS, nvcvArrayGetCapacity(arrayHandle, &capacity)); + EXPECT_EQ(16, capacity); + + EXPECT_EQ(NVCV_SUCCESS, nvcvArrayGetTarget(arrayHandle, &target)); + EXPECT_EQ(NVCV_RESOURCE_MEM_HOST, target); + + EXPECT_EQ(NVCV_SUCCESS, nvcvArrayGetLength(arrayHandle, &length)); + EXPECT_EQ(0, length); + + EXPECT_EQ(NVCV_SUCCESS, nvcvArrayGetDataType(arrayHandle, &dType)); + EXPECT_EQ(NVCV_DATA_TYPE_U8, dType); + EXPECT_EQ(NVCV_SUCCESS, nvcvArrayDecRef(arrayHandle, nullptr)); +} + +TEST(ArrayTests, invalid_req_construct_with_target) +{ + NVCVArrayHandle arrayHandle; + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, + nvcvArrayConstructWithTarget(nullptr, nullptr, NVCV_RESOURCE_MEM_HOST, &arrayHandle)); +} + +TEST(ArrayTests, invalid_handle_construct_with_target) +{ + NVCVArrayRequirements req; + EXPECT_EQ(NVCV_SUCCESS, + nvcvArrayCalcRequirementsWithTarget(16, NVCV_DATA_TYPE_U8, 0, NVCV_RESOURCE_MEM_HOST, &req)); + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, + nvcvArrayConstructWithTarget(&req, nullptr, NVCV_RESOURCE_MEM_HOST, nullptr)); +} + +TEST(ArrayTests, invalid_data_wrap_data_construct) +{ + NVCVArrayHandle arrayHandle; + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, nvcvArrayWrapDataConstruct(nullptr, nullptr, nullptr, &arrayHandle)); +} + +TEST(ArrayTests, invalid_handle_wrap_data_construct) +{ + NVCVArrayHandle arrayHandle; + NVCVArrayRequirements req; + NVCVArrayData arrayData; + EXPECT_EQ(NVCV_SUCCESS, nvcvArrayCalcRequirements(16, NVCV_DATA_TYPE_U8, 0, &req)); + EXPECT_EQ(NVCV_SUCCESS, nvcvArrayConstruct(&req, nullptr, &arrayHandle)); + EXPECT_EQ(NVCV_SUCCESS, nvcvArrayExportData(arrayHandle, &arrayData)); + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, nvcvArrayWrapDataConstruct(&arrayData, nullptr, nullptr, nullptr)); + EXPECT_EQ(NVCV_SUCCESS, nvcvArrayDecRef(arrayHandle, nullptr)); +} + +void arrayDataCleanUpFunc(void *ctx, const NVCVArrayData *data); + +void arrayDataCleanUpFunc(void *ctx, const NVCVArrayData *data) {} + +TEST(ArrayTests, valid_handle_wrap_data_construct) +{ + NVCVArrayHandle arrayHandle, arrayHandle2; + NVCVArrayRequirements req; + NVCVArrayData arrayData; + EXPECT_EQ(NVCV_SUCCESS, nvcvArrayCalcRequirements(16, NVCV_DATA_TYPE_U8, 0, &req)); + EXPECT_EQ(NVCV_SUCCESS, nvcvArrayConstruct(&req, nullptr, 
&arrayHandle)); + EXPECT_EQ(NVCV_SUCCESS, nvcvArrayExportData(arrayHandle, &arrayData)); + EXPECT_EQ(NVCV_SUCCESS, nvcvArrayWrapDataConstruct(&arrayData, &arrayDataCleanUpFunc, nullptr, &arrayHandle2)); + EXPECT_EQ(NVCV_SUCCESS, nvcvArrayDecRef(arrayHandle, nullptr)); + EXPECT_EQ(NVCV_SUCCESS, nvcvArrayDecRef(arrayHandle2, nullptr)); +} + +TEST(ArrayTests, null_basePtr_wrap_data_construct) +{ + NVCVArrayHandle arrayHandle2; + NVCVArrayData arrayData; + arrayData.buffer.strided.basePtr = nullptr; + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, nvcvArrayWrapDataConstruct(&arrayData, nullptr, nullptr, &arrayHandle2)); +} + +TEST(ArrayTests, valid_array_inc_ref) +{ + NVCVArrayHandle arrayHandle; + NVCVArrayRequirements req; + int refCount = -1; + EXPECT_EQ(NVCV_SUCCESS, nvcvArrayCalcRequirements(16, NVCV_DATA_TYPE_U8, 0, &req)); + EXPECT_EQ(NVCV_SUCCESS, nvcvArrayConstruct(&req, nullptr, &arrayHandle)); + + EXPECT_EQ(NVCV_SUCCESS, nvcvArrayRefCount(arrayHandle, &refCount)); + EXPECT_EQ(refCount, 1); + + EXPECT_EQ(NVCV_SUCCESS, nvcvArrayIncRef(arrayHandle, &refCount)); + EXPECT_EQ(refCount, 2); + + EXPECT_EQ(NVCV_SUCCESS, nvcvArrayDecRef(arrayHandle, &refCount)); + EXPECT_EQ(refCount, 1); + EXPECT_EQ(NVCV_SUCCESS, nvcvArrayDecRef(arrayHandle, nullptr)); +} + +TEST(ArrayTests, smoke_user_pointer) +{ + NVCVArrayHandle arrayHandle; + NVCVArrayRequirements req; + void *userPtr; + + EXPECT_EQ(NVCV_SUCCESS, nvcvArrayCalcRequirements(16, NVCV_DATA_TYPE_U8, 0, &req)); + EXPECT_EQ(NVCV_SUCCESS, nvcvArrayConstruct(&req, nullptr, &arrayHandle)); + + EXPECT_EQ(NVCV_SUCCESS, nvcvArrayGetUserPointer(arrayHandle, &userPtr)); + EXPECT_EQ(nullptr, userPtr); + + EXPECT_EQ(NVCV_SUCCESS, nvcvArraySetUserPointer(arrayHandle, reinterpret_cast(0x123ULL))); + EXPECT_EQ(NVCV_SUCCESS, nvcvArrayGetUserPointer(arrayHandle, &userPtr)); + EXPECT_EQ(reinterpret_cast(0x123ULL), userPtr); + + EXPECT_EQ(NVCV_SUCCESS, nvcvArraySetUserPointer(arrayHandle, nullptr)); + EXPECT_EQ(NVCV_SUCCESS, nvcvArrayGetUserPointer(arrayHandle, &userPtr)); + EXPECT_EQ(nullptr, userPtr); + EXPECT_EQ(NVCV_SUCCESS, nvcvArrayDecRef(arrayHandle, nullptr)); +} + +TEST(ArrayTests, invalid_out_get_user_pointer) +{ + NVCVArrayHandle arrayHandle; + NVCVArrayRequirements req; + + EXPECT_EQ(NVCV_SUCCESS, nvcvArrayCalcRequirements(16, NVCV_DATA_TYPE_U8, 0, &req)); + EXPECT_EQ(NVCV_SUCCESS, nvcvArrayConstruct(&req, nullptr, &arrayHandle)); + + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, nvcvArrayGetUserPointer(arrayHandle, nullptr)); + EXPECT_EQ(NVCV_SUCCESS, nvcvArrayDecRef(arrayHandle, nullptr)); +} + +TEST(ArrayTests, invalid_out_get_data_type) +{ + NVCVArrayHandle arrayHandle; + NVCVArrayRequirements req; + + EXPECT_EQ(NVCV_SUCCESS, nvcvArrayCalcRequirements(16, NVCV_DATA_TYPE_U8, 0, &req)); + EXPECT_EQ(NVCV_SUCCESS, nvcvArrayConstruct(&req, nullptr, &arrayHandle)); + + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, nvcvArrayGetDataType(arrayHandle, nullptr)); + EXPECT_EQ(NVCV_SUCCESS, nvcvArrayDecRef(arrayHandle, nullptr)); +} + +TEST(ArrayTests, valid_get_allocator) +{ + NVCVArrayHandle arrayHandle; + NVCVArrayRequirements req; + NVCVAllocatorHandle alloc; + + EXPECT_EQ(NVCV_SUCCESS, nvcvArrayCalcRequirements(16, NVCV_DATA_TYPE_U8, 0, &req)); + EXPECT_EQ(NVCV_SUCCESS, nvcvArrayConstruct(&req, nullptr, &arrayHandle)); + + EXPECT_EQ(NVCV_SUCCESS, nvcvArrayGetAllocator(arrayHandle, &alloc)); + EXPECT_EQ(alloc, nullptr); + EXPECT_EQ(NVCV_SUCCESS, nvcvArrayDecRef(arrayHandle, nullptr)); +} + +TEST(ArrayTests, invalid_out_get_allocator) +{ + NVCVArrayHandle arrayHandle; + 
NVCVArrayRequirements req; + + EXPECT_EQ(NVCV_SUCCESS, nvcvArrayCalcRequirements(16, NVCV_DATA_TYPE_U8, 0, &req)); + EXPECT_EQ(NVCV_SUCCESS, nvcvArrayConstruct(&req, nullptr, &arrayHandle)); + + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, nvcvArrayGetAllocator(arrayHandle, nullptr)); + EXPECT_EQ(NVCV_SUCCESS, nvcvArrayDecRef(arrayHandle, nullptr)); +} + +TEST(ArrayTests, invalid_out_export_data) +{ + NVCVArrayHandle arrayHandle; + NVCVArrayRequirements req; + + EXPECT_EQ(NVCV_SUCCESS, nvcvArrayCalcRequirements(16, NVCV_DATA_TYPE_U8, 0, &req)); + EXPECT_EQ(NVCV_SUCCESS, nvcvArrayConstruct(&req, nullptr, &arrayHandle)); + + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, nvcvArrayExportData(arrayHandle, nullptr)); + EXPECT_EQ(NVCV_SUCCESS, nvcvArrayDecRef(arrayHandle, nullptr)); +} + +TEST(ArrayTests, invalid_out_get_length) +{ + NVCVArrayHandle arrayHandle; + NVCVArrayRequirements req; + + EXPECT_EQ(NVCV_SUCCESS, nvcvArrayCalcRequirements(16, NVCV_DATA_TYPE_U8, 0, &req)); + EXPECT_EQ(NVCV_SUCCESS, nvcvArrayConstruct(&req, nullptr, &arrayHandle)); + + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, nvcvArrayGetLength(arrayHandle, nullptr)); + EXPECT_EQ(NVCV_SUCCESS, nvcvArrayDecRef(arrayHandle, nullptr)); +} + +TEST(ArrayTests, invalid_out_get_capacity) +{ + NVCVArrayHandle arrayHandle; + NVCVArrayRequirements req; + + EXPECT_EQ(NVCV_SUCCESS, nvcvArrayCalcRequirements(16, NVCV_DATA_TYPE_U8, 0, &req)); + EXPECT_EQ(NVCV_SUCCESS, nvcvArrayConstruct(&req, nullptr, &arrayHandle)); + + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, nvcvArrayGetCapacity(arrayHandle, nullptr)); + EXPECT_EQ(NVCV_SUCCESS, nvcvArrayDecRef(arrayHandle, nullptr)); +} + +TEST(ArrayTests, invalid_out_get_target) +{ + NVCVArrayHandle arrayHandle; + NVCVArrayRequirements req; + + EXPECT_EQ(NVCV_SUCCESS, nvcvArrayCalcRequirements(16, NVCV_DATA_TYPE_U8, 0, &req)); + EXPECT_EQ(NVCV_SUCCESS, nvcvArrayConstruct(&req, nullptr, &arrayHandle)); + + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, nvcvArrayGetTarget(arrayHandle, nullptr)); + EXPECT_EQ(NVCV_SUCCESS, nvcvArrayDecRef(arrayHandle, nullptr)); +} diff --git a/tests/nvcv_types/system/TestConfig.cpp b/tests/nvcv_types/system/TestConfig.cpp index 03b112aa3..ef028c5ca 100644 --- a/tests/nvcv_types/system/TestConfig.cpp +++ b/tests/nvcv_types/system/TestConfig.cpp @@ -18,6 +18,7 @@ #include "Definitions.hpp" #include +#include #include #include #include @@ -46,6 +47,10 @@ T CreateObj() { return nvcv::Tensor(nvcv::TensorShape({32, 12, 4}, nvcv::TENSOR_NONE), nvcv::TYPE_U8); } + else if constexpr (std::is_same_v) + { + return nvcv::Array(1, nvcv::TYPE_U8, 0, NVCV_RESOURCE_MEM_HOST); + } else { static_assert(sizeof(T) != 0 && "Invalid core object type"); @@ -71,13 +76,17 @@ void SetMaxCount(int32_t maxCount) { nvcv::cfg::SetMaxTensorCount(maxCount); } + else if constexpr (std::is_same_v) + { + nvcv::cfg::SetMaxArrayCount(maxCount); + } else { static_assert(sizeof(T) != 0 && "Invalid core object type"); } } -using AllCoreTypes = ttest::Types; +using AllCoreTypes = ttest::Types; template class ConfigTests : public ::testing::Test diff --git a/tests/nvcv_types/system/TestImageFormat.cpp b/tests/nvcv_types/system/TestImageFormat.cpp index 5cd8da66f..72dab22da 100644 --- a/tests/nvcv_types/system/TestImageFormat.cpp +++ b/tests/nvcv_types/system/TestImageFormat.cpp @@ -426,6 +426,12 @@ TEST(ImageFormatTests, set_extra_channel_info_image_format_none) ASSERT_EQ(NVCV_ERROR_INVALID_ARGUMENT, nvcvImageFormatSetExtraChannelInfo(&fmt, &exChannelInfo)); } +TEST(ImageFormatTests, set_extra_channel_info_null_input_ptr) +{ + 
NVCVExtraChannelInfo exChannelInfo = {2, 8, NVCV_DATA_KIND_UNSIGNED, NVCV_EXTRA_CHANNEL_POS3D}; + ASSERT_EQ(NVCV_ERROR_INVALID_ARGUMENT, nvcvImageFormatSetExtraChannelInfo(nullptr, &exChannelInfo)); +} + TEST(ImageFormatTests, set_extra_channel_info_max_min_bounds) { NVCVExtraChannelInfo exChannelInfo = {8, 8, NVCV_DATA_KIND_UNSIGNED, NVCV_EXTRA_CHANNEL_POS3D}; @@ -579,6 +585,8 @@ TEST(ImageFormatTests, check_alpha_type) fmt = NVCV_IMAGE_FORMAT_RGBA8; ASSERT_EQ(NVCV_SUCCESS, nvcvImageFormatGetAlphaType(fmt, &alphaType)); EXPECT_EQ(NVCV_ALPHA_ASSOCIATED, alphaType); + + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, nvcvImageFormatGetAlphaType(fmt, nullptr)); } TEST_P(ImageFormatTests, check_extra_channel_info) diff --git a/tests/nvcv_types/system/TestTensor.cpp b/tests/nvcv_types/system/TestTensor.cpp index 368e47692..2f45bcccb 100644 --- a/tests/nvcv_types/system/TestTensor.cpp +++ b/tests/nvcv_types/system/TestTensor.cpp @@ -458,3 +458,203 @@ TEST_P(TensorWrapParamTests, smoke_create) } } } + +class TensorTests_Negative : public ::testing::Test +{ +public: + TensorTests_Negative() {} + + ~TensorTests_Negative() {} + + void SetUp() override + { + ASSERT_EQ(NVCV_SUCCESS, nvcvTensorCalcRequirementsForImages(1, 224, 224, NVCV_IMAGE_FORMAT_RGBA8, 0, 0, &reqs)); + ASSERT_EQ(NVCV_SUCCESS, nvcvTensorConstruct(&reqs, nullptr, &handle)); + } + + void TearDown() override + { + nvcv::Tensor tensor(std::move(handle)); + } + + NVCVTensorHandle handle; + NVCVTensorRequirements reqs; +}; + +TEST_F(TensorTests_Negative, invalid_parameter_TensorCalcRequirementsForImages) +{ + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, nvcvTensorCalcRequirementsForImages(-1, 224, 224, NVCV_IMAGE_FORMAT_RGBA8, 0, + 0, &reqs)); // invalid numImages + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, + nvcvTensorCalcRequirementsForImages(5, -1, 224, NVCV_IMAGE_FORMAT_RGBA8, 0, 0, &reqs)); // invalid width + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, + nvcvTensorCalcRequirementsForImages(5, 224, -1, NVCV_IMAGE_FORMAT_RGBA8, 0, 0, &reqs)); // invalid height + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, + nvcvTensorCalcRequirementsForImages(5, 224, 224, NVCV_IMAGE_FORMAT_NONE, 0, 0, &reqs)); // invalid format + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, nvcvTensorCalcRequirementsForImages(5, 224, 224, NVCV_IMAGE_FORMAT_RGBA8, 3, + 0, &reqs)); // invalid baseAddrAlignment + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, nvcvTensorCalcRequirementsForImages(5, 224, 224, NVCV_IMAGE_FORMAT_RGBA8, 0, + 3, &reqs)); // invalid rowAddrAlignment + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, + nvcvTensorCalcRequirementsForImages(5, 224, 224, NVCV_IMAGE_FORMAT_RGBA8, 0, 0, nullptr)); // null reqs + EXPECT_EQ(NVCV_ERROR_NOT_IMPLEMENTED, + nvcvTensorCalcRequirementsForImages( + 5, 224, 224, NVCV_DETAIL_MAKE_COLOR_FMT1(RGB, UNDEFINED, BL, UNSIGNED, XYZW, ASSOCIATED, X8_Y8_Z8_W8), + 0, 0, &reqs)); // BL layout + EXPECT_EQ(NVCV_ERROR_NOT_IMPLEMENTED, + nvcvTensorCalcRequirementsForImages(5, 224, 224, NVCV_IMAGE_FORMAT_UYVY, 0, 0, + &reqs)); // Not implemented subsampled planes (422) + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, nvcvTensorCalcRequirementsForImages(5, 224, 224, NVCV_IMAGE_FORMAT_NV24, 0, + 0, &reqs)); // semi-planar image format + EXPECT_EQ( + NVCV_ERROR_INVALID_ARGUMENT, + nvcvTensorCalcRequirementsForImages( + 5, 224, 224, NVCV_DETAIL_MAKE_COLOR_FMT4(RGB, UNDEFINED, PL, UNSIGNED, XYZW, ASSOCIATED, X8, X8, X8, X32), + 0, 0, &reqs)); // planes of image format don't have the same packing +} + +TEST_F(TensorTests_Negative, invalid_parameter_TensorCalcRequirements) +{ + int64_t 
valid_wh[] = {224, 224}; + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, + nvcvTensorCalcRequirements(2, valid_wh, NVCV_DATA_TYPE_NONE, NVCV_TENSOR_LAYOUT_MAKE("HW"), 0, 0, + &reqs)); // invalid dtype + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, + nvcvTensorCalcRequirements(3, valid_wh, NVCV_DATA_TYPE_U8, NVCV_TENSOR_LAYOUT_MAKE("HW"), 0, 0, + &reqs)); // mismatch rank + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, + nvcvTensorCalcRequirements(-1, valid_wh, NVCV_DATA_TYPE_U8, NVCV_TENSOR_LAYOUT_MAKE("HW"), 0, 0, + &reqs)); // invalid rank + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, nvcvTensorCalcRequirements(2, valid_wh, NVCV_DATA_TYPE_U8, NVCV_TENSOR_NONE, + 3, 0, &reqs)); // invalid baseAddrAlignment + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, nvcvTensorCalcRequirements(2, valid_wh, NVCV_DATA_TYPE_U8, NVCV_TENSOR_NONE, + 0, 3, &reqs)); // invalid rowAddrAlignment + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, + nvcvTensorCalcRequirements(2, valid_wh, NVCV_DATA_TYPE_U8, NVCV_TENSOR_NONE, 0, 0, nullptr)); // null reqs +} + +TEST_F(TensorTests_Negative, invalid_parameter_TensorConstruct) +{ + ASSERT_EQ(NVCV_SUCCESS, nvcvTensorCalcRequirementsForImages(1, 224, 224, NVCV_IMAGE_FORMAT_RGBA8, 0, 0, &reqs)); + + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, nvcvTensorConstruct(nullptr, nullptr, &handle)); // null reqs + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, nvcvTensorConstruct(&reqs, nullptr, nullptr)); // null handle +} + +TEST_F(TensorTests_Negative, invalid_parameter_TensorWrapDataConstruct) +{ + NVCVTensorData tensorData; + NVCVTensorBufferStrided &tensorStrided = tensorData.buffer.strided; + tensorData.bufferType = NVCV_TENSOR_BUFFER_STRIDED_CUDA; + tensorData.layout = NVCV_TENSOR_NHWC; + tensorData.rank = 4; + tensorData.shape[0] = 1; + tensorData.shape[1] = 224; + tensorData.shape[2] = 224; + tensorData.shape[3] = 3; + tensorData.dtype = NVCV_DATA_TYPE_F32; + tensorStrided.strides[3] = nvcv::FMT_RGBf32.planePixelStrideBytes(0) / nvcv::FMT_RGBf32.numChannels(); + tensorStrided.strides[2] = nvcv::FMT_RGBf32.planePixelStrideBytes(0); + tensorStrided.strides[1] = 224 * nvcv::FMT_RGBf32.planePixelStrideBytes(0); + tensorStrided.strides[0] = tensorStrided.strides[1] * tensorData.shape[1]; + + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, + nvcvTensorWrapDataConstruct(nullptr, nullptr, nullptr, &handle)); // null tensorData + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, + nvcvTensorWrapDataConstruct(&tensorData, nullptr, nullptr, nullptr)); // null handle +} + +TEST_F(TensorTests_Negative, invalid_parameter_TensorGetLayout) +{ + NVCVTensorLayout layout; + + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, nvcvTensorGetLayout(nullptr, &layout)); // null handle + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, nvcvTensorGetLayout(handle, nullptr)); // null layout +} + +TEST_F(TensorTests_Negative, invalid_parameter_TensorExportData) +{ + NVCVTensorData data; + + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, nvcvTensorExportData(nullptr, &data)); // null handle + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, nvcvTensorExportData(handle, nullptr)); // null data +} + +TEST_F(TensorTests_Negative, invalid_parameter_TensorGetShape) +{ + int32_t rank = NVCV_TENSOR_MAX_RANK; + int64_t shape[NVCV_TENSOR_MAX_RANK] = {0}; + + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, nvcvTensorGetShape(nullptr, &rank, shape)); // null handle + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, nvcvTensorGetShape(handle, nullptr, shape)); // null rank +} + +TEST_F(TensorTests_Negative, invalid_parameter_TensorGetUserPointer) +{ + void *userPtr; + + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, 
nvcvTensorGetUserPointer(nullptr, &userPtr)); // null handle + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, nvcvTensorGetUserPointer(handle, nullptr)); // null rank +} + +TEST_F(TensorTests_Negative, invalid_parameter_TensorReshape) +{ + int64_t new_shape[] = {1, 224, 112, 2}; + NVCVTensorHandle outHandle; + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, + nvcvTensorReshape(nullptr, 4, new_shape, NVCV_TENSOR_NHWC, &outHandle)); // null handle + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, + nvcvTensorReshape(handle, 0, new_shape, NVCV_TENSOR_NHWC, &outHandle)); // invalid rank + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, nvcvTensorReshape(handle, NVCV_TENSOR_MAX_RANK + 1, new_shape, + NVCV_TENSOR_NHWC, &outHandle)); // invalid rank 2 + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, + nvcvTensorReshape(handle, 4, new_shape, NVCV_TENSOR_HW, &outHandle)); // mismatch layout + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, + nvcvTensorReshape(handle, 4, new_shape, NVCV_TENSOR_NHWC, nullptr)); // null out handle +} + +TEST_F(TensorTests_Negative, invalid_parameter_TensorShapePermute) +{ + NVCVTensorLayout srcLayout = NVCV_TENSOR_NHWC; + std::vector srcShape{16, 61, 23, 3}; + NVCVTensorLayout dstLayout = NVCV_TENSOR_NCHW; + std::vector outShape(srcShape.size()); + + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, + nvcvTensorShapePermute(srcLayout, nullptr, dstLayout, outShape.data())); // null srcShape + EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, + nvcvTensorShapePermute(srcLayout, srcShape.data(), dstLayout, nullptr)); // null outShape +} + +class TensorPermuteTests + : public t::TestWithParam< + std::tuple, test::Param<"srcShape", std::vector>, + test::Param<"dstLayout", NVCVTensorLayout>, test::Param<"goldShape", std::vector>>> +{ +}; + +// clang-format off +NVCV_INSTANTIATE_TEST_SUITE_P(_, TensorPermuteTests, + test::ValueList, NVCVTensorLayout, std::vector> + { + {NVCV_TENSOR_NHWC, {16, 61, 23, 3}, NVCV_TENSOR_NCHW, {16, 3, 61, 23}}, + {NVCV_TENSOR_CHW, {3, 61, 23}, NVCV_TENSOR_HWC, {61, 23, 3}}, + {NVCV_TENSOR_CFDHW, {3, 2, 6, 61, 23}, NVCV_TENSOR_FDHWC, {2, 6, 61, 23, 3}}, + {NVCV_TENSOR_CHW, {3, 61, 23}, NVCV_TENSOR_HW, {61, 23}}, + {NVCV_TENSOR_HWC, {61, 23, 3}, NVCV_TENSOR_HW, {61, 23}} + } +); + +// clang-format on + +TEST_P(TensorPermuteTests, smoke) +{ + NVCVTensorLayout srcLayout = std::get<0>(GetParam()); + std::vector srcShape = std::get<1>(GetParam()); + NVCVTensorLayout dstLayout = std::get<2>(GetParam()); + const std::vector goldShape = std::get<3>(GetParam()); + + std::vector outShape(goldShape.size()); + ASSERT_EQ(NVCV_SUCCESS, nvcvTensorShapePermute(srcLayout, srcShape.data(), dstLayout, outShape.data())); + EXPECT_EQ(outShape, goldShape); +} diff --git a/tests/nvcv_types/system/TestTensorLayout.cpp b/tests/nvcv_types/system/TestTensorLayout.cpp index 4d55597b1..d0d349fda 100644 --- a/tests/nvcv_types/system/TestTensorLayout.cpp +++ b/tests/nvcv_types/system/TestTensorLayout.cpp @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
  * SPDX-License-Identifier: Apache-2.0
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
@@ -419,3 +419,49 @@ TEST_P(TensorLayoutOStreamExecTests, works)
 
     EXPECT_STREQ(gold, ss.str().c_str());
 }
+
+TEST(TensorLayoutTests_Negative, invalid_parameter_TensorLayoutMake)
+{
+    NVCVTensorLayout outLayout;
+    EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, nvcvTensorLayoutMake("HW", nullptr));
+
+    std::string exceededDescr(NVCV_TENSOR_MAX_RANK + 1, 'Z');
+    EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT, nvcvTensorLayoutMake(exceededDescr.c_str(), &outLayout)); // exceed range
+}
+
+TEST(TensorLayoutTests_Negative, invalid_parameter_TensorLayoutMakeRange)
+{
+    NVCVTensorLayout outLayout;
+
+    std::string validDescr(NVCV_TENSOR_MAX_RANK - 1, 'Z');
+    std::string exceededDescr(NVCV_TENSOR_MAX_RANK + 1, 'Z');
+
+    EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT,
+              nvcvTensorLayoutMakeRange(exceededDescr.c_str(), exceededDescr.c_str() + validDescr.size(),
+                                        nullptr)); // null output
+    EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT,
+              nvcvTensorLayoutMakeRange(nullptr, exceededDescr.c_str() + validDescr.size(), &outLayout)); // null begin
+    EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT,
+              nvcvTensorLayoutMakeRange(exceededDescr.c_str(), nullptr, &outLayout)); // null end
+    EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT,
+              nvcvTensorLayoutMakeRange(exceededDescr.c_str(), exceededDescr.c_str() + exceededDescr.size(),
+                                        &outLayout)); // exceed range
+}
+
+TEST(TensorLayoutTests_Negative, invalid_parameter_TensorLayoutMakeFirst)
+{
+    EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT,
+              nvcvTensorLayoutMakeFirst(NVCV_TENSOR_LAYOUT_MAKE("ABCD"), 2, nullptr)); // null output
+}
+
+TEST(TensorLayoutTests_Negative, invalid_parameter_TensorLayoutMakeLast)
+{
+    EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT,
+              nvcvTensorLayoutMakeLast(NVCV_TENSOR_LAYOUT_MAKE("ABCD"), 2, nullptr)); // null output
+}
+
+TEST(TensorLayoutTests_Negative, invalid_parameter_TensorLayoutMakeSubRange)
+{
+    EXPECT_EQ(NVCV_ERROR_INVALID_ARGUMENT,
+              nvcvTensorLayoutMakeSubRange(NVCV_TENSOR_LAYOUT_MAKE("ABCD"), 0, 2, nullptr)); // null output
+}
diff --git a/tests/nvcv_types/unit/TestMath.cpp b/tests/nvcv_types/unit/TestMath.cpp
index 67bc79e48..c2bf64b62 100644
--- a/tests/nvcv_types/unit/TestMath.cpp
+++ b/tests/nvcv_types/unit/TestMath.cpp
@@ -290,3 +290,27 @@ TEST_P(MathDivUpPowerOfTwoTests, works)
 
     EXPECT_EQ(gold, util::DivUpPowerOfTwo(num, den));
 }
+
+class MathSincTests : public t::TestWithParam<std::tuple<test::Param<"value", float>, test::Param<"gold", float>>>
+{
+};
+
+// clang-format off
+NVCV_INSTANTIATE_TEST_SUITE_P(_, MathSincTests,
+    test::ValueList<float, float>
+    {
+        {0.f, 1.f},
+        {1.f, 0.f},
+        {0.5f, 2.f / static_cast<float>(M_PI)},
+        {-0.5f, 2.f / static_cast<float>(M_PI)},
+    });
+
+// clang-format on
+
+TEST_P(MathSincTests, works)
+{
+    const float value = std::get<0>(GetParam());
+    const float gold = std::get<1>(GetParam());
+
+    EXPECT_NEAR(gold, util::sinc(value), 1e-7f);
+}
diff --git a/tools/mkop/PythonWrap.cpp b/tools/mkop/PythonWrap.cpp
index c2a667053..25c67a77d 100644
--- a/tools/mkop/PythonWrap.cpp
+++ b/tools/mkop/PythonWrap.cpp
@@ -36,9 +36,11 @@ Tensor __OPNAME__Into(Tensor &output, Tensor &input, std::optional<Stream> pstre
     auto op = CreateOperator();
 
     ResourceGuard guard(*pstream);
-    guard.add(LockMode::LOCK_READ, {input});
-    guard.add(LockMode::LOCK_WRITE, {output});
-    guard.add(LockMode::LOCK_NONE, {*op});
+    guard.add(LockMode::LOCK_MODE_READ, {input});
+    guard.add(LockMode::LOCK_MODE_WRITE, {output});
+    // TODO: if the op kernel allocates resources that are accessed by the device, change this to READWRITE;
+    // if it is set to NONE, it is possible for the operator to be destroyed before the kernel is executed.
+    guard.add(LockMode::LOCK_MODE_NONE, {*op});
 
     op->submit(pstream->cudaHandle(), input, output);