diff --git a/CMakeLists.txt b/CMakeLists.txt index 94260676e..13378bf1e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -17,13 +17,13 @@ cmake_minimum_required(VERSION 3.20.1) # We need to check this variable before starting a CUDA project - otherwise it will appear # as set, with the default value pointing to the oldest supported architecture (52 as of CUDA 11.8) -if(DEFINED CMAKE_CUDA_ARCHITECTURES) +if(CMAKE_CUDA_ARCHITECTURES) set(USE_CMAKE_CUDA_ARCHITECTURES TRUE) endif() project(cvcuda LANGUAGES C CXX - VERSION 0.10.0 + VERSION 0.11.0 DESCRIPTION "CUDA-accelerated Computer Vision algorithms" ) @@ -49,6 +49,7 @@ endif() option(BUILD_TESTS "Enable testsuite" OFF) option(BUILD_PYTHON "Build python bindings" OFF) option(BUILD_BENCH "Build benchmark" OFF) +option(BUILD_DOCS "Build documentation" OFF) option(ENABLE_SANITIZER "Enabled sanitized build" OFF) # Configure build tree ====================== diff --git a/README.md b/README.md index 3e232d8bf..f71bf43f8 100644 --- a/README.md +++ b/README.md @@ -18,7 +18,7 @@ [![License](https://img.shields.io/badge/License-Apache_2.0-yellogreen.svg)](https://opensource.org/licenses/Apache-2.0) -![Version](https://img.shields.io/badge/Version-v0.10.1--beta-blue) +![Version](https://img.shields.io/badge/Version-v0.11.0--beta-blue) ![Platform](https://img.shields.io/badge/Platform-linux--64_%7C_win--64_wsl2%7C_aarch64-gray) @@ -60,10 +60,8 @@ To get a local copy up and running follow these steps. - [CV-CUDA Samples] require driver r535 or later to run and are only officially supported with CUDA 12. - Only one CUDA version (CUDA 11.x or CUDA 12.x) of CV-CUDA packages (Debian packages, tarballs, Python Wheels) can be installed at a time. Please uninstall all packages from a given CUDA version before installing packages from a different version. - Documentation built on Ubuntu 20.04 needs an up-to-date version of sphinx (`pip install --upgrade sphinx`) as well as explicitly parsing the system's default python version ` ./ci/build_docs path/to/build -DPYTHON_VERSIONS=""`. -- Python bindings installed via Debian packages and Python tests fail with Numpy 2.0. We recommend using an older version of Numpy (e.g. 1.26) until we have implemented a fix. - The Resize and RandomResizedCrop operators incorrectly interpolate pixel values near the boundary of an image or tensor when using cubic interpolation. This will be fixed in an upcoming release. -- Cache/resource management introduced in v0.10 add micro-second-level overhead to Python operator calls. Based on the performance analysis of our Python samples, we expect the production- and pipeline-level impact to be negligible. CUDA kernel and C++ call performance is not affected. We aim to investigate and reduce this overhead further in a future release.​ -- Sporadic Pybind11-deallocation crashes have been reported in long-lasting multi-threaded Python pipelines with externally allocated memory (eg wrapped Pytorch buffers). We are evaluating an upgrade of Pybind11 (currently using 2.10) as a potential fix in an upcoming release. +- The YUV(420) color conversion API (NVCV_COLOR_RGB2YUV_I420) incorrectly computes the U and V plane index​. ### Installation @@ -209,7 +207,6 @@ For instructions on how to build samples from source and run them, see the [Samp Install the dependencies required for running the tests: - python3, python3-pip: to run python bindings tests - torch: dependencies needed by python bindings tests -- numpy: known limitation: Python tests fail with numpy 2.0. We recommend using an older version (eg 1.26) until we have implemented a fix. On Ubuntu >= 20.04, install the following packages using `apt` and `pip`: ```shell diff --git a/bench/python/all_ops/op_adaptivethreshold.py b/bench/python/all_ops/op_adaptivethreshold.py index 1d7b09fbc..a38728382 100644 --- a/bench/python/all_ops/op_adaptivethreshold.py +++ b/bench/python/all_ops/op_adaptivethreshold.py @@ -23,6 +23,7 @@ class OpAdaptiveThreshold(AbstractOpBase): def setup(self, input): + super().setup(input) self.maxval = 255.0 self.adaptive_method = cvcuda.AdaptiveThresholdType.GAUSSIAN_C self.threshold_type = cvcuda.ThresholdType.BINARY diff --git a/bench/python/all_ops/op_as_image.py b/bench/python/all_ops/op_as_image.py index bae98afa5..993ed1841 100644 --- a/bench/python/all_ops/op_as_image.py +++ b/bench/python/all_ops/op_as_image.py @@ -23,6 +23,7 @@ class OpAsImageFromNVCVImage(AbstractOpBase): def setup(self, input): + super().setup(input) # dummy run that does not use cache img = nvcv.Image((128, 128), nvcv.Format.RGBA8) diff --git a/bench/python/all_ops/op_as_images.py b/bench/python/all_ops/op_as_images.py index d5022cf23..ed5132a08 100644 --- a/bench/python/all_ops/op_as_images.py +++ b/bench/python/all_ops/op_as_images.py @@ -23,6 +23,7 @@ class OpAsImagesFromNVCVImage(AbstractOpBase): def setup(self, input): + super().setup(input) # dummy run that does not use cache nvcv.ImageBatchVarShape(100) img = nvcv.Image((128, 128), nvcv.Format.RGBA8) diff --git a/bench/python/all_ops/op_averageblur.py b/bench/python/all_ops/op_averageblur.py index e6dee83eb..444a65b64 100644 --- a/bench/python/all_ops/op_averageblur.py +++ b/bench/python/all_ops/op_averageblur.py @@ -23,6 +23,7 @@ class OpAverageBlur(AbstractOpBase): def setup(self, input): + super().setup(input) self.kernel_size = (3, 3) self.kernel_anchor = (-1, -1) diff --git a/bench/python/all_ops/op_blurbox.py b/bench/python/all_ops/op_blurbox.py index ea37e00ba..ba32fccfe 100644 --- a/bench/python/all_ops/op_blurbox.py +++ b/bench/python/all_ops/op_blurbox.py @@ -26,6 +26,7 @@ class OpBlurBox(AbstractOpBase): def setup(self, input): + super().setup(input) self.kernel_size = 5 data = read_image(os.path.join(self.assets_dir, "brooklyn.jpg")) diff --git a/bench/python/all_ops/op_boundingbox.py b/bench/python/all_ops/op_boundingbox.py index cc08e9703..4faedfe37 100644 --- a/bench/python/all_ops/op_boundingbox.py +++ b/bench/python/all_ops/op_boundingbox.py @@ -26,6 +26,7 @@ class OpBoundingBox(AbstractOpBase): def setup(self, input): + super().setup(input) self.border_color = (0, 255, 0, 255) self.fill_color = (0, 0, 255, 0) self.thickness = 5 diff --git a/bench/python/all_ops/op_brightnesscontrast.py b/bench/python/all_ops/op_brightnesscontrast.py index 58d9cb833..efe825e0e 100644 --- a/bench/python/all_ops/op_brightnesscontrast.py +++ b/bench/python/all_ops/op_brightnesscontrast.py @@ -24,6 +24,7 @@ class OpBrightnessContrast(AbstractOpBase): def setup(self, input): + super().setup(input) brightness = torch.tensor([1.2]).cuda(self.device_id) self.brightness = cvcuda.as_tensor(brightness, "N") diff --git a/bench/python/all_ops/op_cache_limit.py b/bench/python/all_ops/op_cache_limit.py new file mode 100644 index 000000000..6c49c7add --- /dev/null +++ b/bench/python/all_ops/op_cache_limit.py @@ -0,0 +1,114 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import nvcv +import torch + +# NOTE: One must import PyCuda driver first, before CVCUDA or VPF otherwise +# things may throw unexpected errors. +import pycuda.driver as cuda # noqa: F401 +from bench_utils import AbstractOpBase + +# For the following setup depicted in the table, we have to repeatedly call the functions: cudaMalloc and/or +# cudaFree. +# +# --------------------------------------------------------------------- +# | shape\cache limit | small large | +# |-------------------------------------------------------------------| +# | non-random | cudaMalloc + cudaFree - (best-case) | +# | random | cudaMalloc + cudaFree cudaMalloc | +# --------------------------------------------------------------------- +# +# Due to the this table, we benchmark three scenarios: {non-random, small}, {non-random, large}, +# {random, large} + + +# Base class for cache limit benchmarks, to ensure all three classes have the same overhead, leading to +# consistent numbers. +class BaseOpCacheLimit(AbstractOpBase): + def setup(self, input, new_cache_limit, low, high): + super().setup(input) + + # make this benchmark compatible with older cvcuda/nvncv versions + if hasattr(nvcv, "set_cache_limit_inbytes"): + nvcv.set_cache_limit_inbytes(new_cache_limit) + + # We don't have access to the outer benchmark iterations (default=10), so we have to create our own + # counter. + self.max_iter_outer = 10 + self.iter_outer = 0 + + # Number of "random" tensors created per benchmarked run + self.n_tensors = 20 + self.hw = torch.randint( + low=low, high=high, size=(self.max_iter_outer, 2, self.n_tensors) + ) + + def run(self, input): + # If we exceed the outer bench iterations, we return. + # If we didn't return, we might re-use the cache, which we specifically don't want for + # "OpCacheLimitLargeAndRandom". + # For the other classes (OpCacheLimitZero, OpCacheLimitLarge), we could continue running the + # benchmarks, but then we would not get comparable numbers between all three classes + if self.iter_outer >= self.max_iter_outer: + return + + for ii in range(self.n_tensors): + shape = ( + self.hw[self.iter_outer, 0, ii].item(), + self.hw[self.iter_outer, 1, ii].item(), + 3, + ) + _ = nvcv.Tensor(shape, nvcv.Type.F32, nvcv.TensorLayout.HWC) + + self.iter_outer += 1 + return + + +# This is the {non-random, small} case. The smallest we can choose is 0, so we set the cache limit to 0 and +# effectively disable the cache +class OpCacheLimitZero(BaseOpCacheLimit): + def setup(self, input): + # Set the cache limit to 0 for this benchmark + # low=1000, high=1001 results in always creating tensor's of shape (1000,1000,3) + super().setup(input, 0, low=1000, high=1001) + + def run(self, input): + super().run(input) + + +# This is the {non-random, large} case. This is the best case scenario, always re-using the cache +class OpCacheLimitLarge(BaseOpCacheLimit): + def setup(self, input): + # Set the cache limit to the total gpu memory for this benchmark + # low=1000, high=1001 results in always creating tensor's of shape (1000,1000,3) + total = torch.cuda.mem_get_info()[1] + super().setup(input, total, low=1000, high=1001) + + def run(self, input): + super().run(input) + + +# This is the {random, large} case. This is the worst case scenario, never re-using the cache +class OpCacheLimitLargeAndRandom(BaseOpCacheLimit): + def setup(self, input): + # Set the cache limit to the total gpu memory for this benchmark + # low=1000, high=2000 results in always creating tensor's of random shape + # between [(1000,1000,3), (1999,1999,3)] + total = torch.cuda.mem_get_info()[1] + super().setup(input, total, low=1000, high=2000) + + def run(self, input): + super().run(input) diff --git a/bench/python/all_ops/op_centercrop.py b/bench/python/all_ops/op_centercrop.py index e70e964fc..9de2cb38f 100644 --- a/bench/python/all_ops/op_centercrop.py +++ b/bench/python/all_ops/op_centercrop.py @@ -23,6 +23,7 @@ class OpCenterCrop(AbstractOpBase): def setup(self, input): + super().setup(input) width, height = input.shape[2], input.shape[1] self.crop_size = [width // 2, height // 2] diff --git a/bench/python/all_ops/op_composite.py b/bench/python/all_ops/op_composite.py index 2ef027469..dc306c18e 100644 --- a/bench/python/all_ops/op_composite.py +++ b/bench/python/all_ops/op_composite.py @@ -26,6 +26,7 @@ class OpComposite(AbstractOpBase): def setup(self, input): + super().setup(input) data = read_image(os.path.join(self.assets_dir, "brooklyn.jpg")) data = data.moveaxis(0, -1).contiguous() # From CHW to HWC data = data.cuda(self.device_id) diff --git a/bench/python/all_ops/op_convertto.py b/bench/python/all_ops/op_convertto.py index 443db43e9..89bce7a21 100644 --- a/bench/python/all_ops/op_convertto.py +++ b/bench/python/all_ops/op_convertto.py @@ -24,6 +24,7 @@ class OpConvertTo(AbstractOpBase): def setup(self, input): + super().setup(input) self.target_dtype = nvcv.Type.F32 self.offset = 10.2 self.scale = 1 / 255.0 diff --git a/bench/python/all_ops/op_copymakeborder.py b/bench/python/all_ops/op_copymakeborder.py index 39c79d5e8..b8fac51f9 100644 --- a/bench/python/all_ops/op_copymakeborder.py +++ b/bench/python/all_ops/op_copymakeborder.py @@ -23,6 +23,7 @@ class OpCopyMakeBorder(AbstractOpBase): def setup(self, input): + super().setup(input) self.border_mode = cvcuda.Border.CONSTANT self.border_values = [255, 0, 0] # Border values for 3 channel RGB input. self.top = 30 diff --git a/bench/python/all_ops/op_customcrop.py b/bench/python/all_ops/op_customcrop.py index 4cb31c2fd..91f2133d6 100644 --- a/bench/python/all_ops/op_customcrop.py +++ b/bench/python/all_ops/op_customcrop.py @@ -24,6 +24,7 @@ class OpCustomCrop(AbstractOpBase): def setup(self, input): + super().setup(input) self.rectI = nvcv.RectI(x=30, y=40, width=420, height=390) def run(self, input): diff --git a/bench/python/all_ops/op_cvtcolor.py b/bench/python/all_ops/op_cvtcolor.py index 35a7322fe..a48a5a2c2 100644 --- a/bench/python/all_ops/op_cvtcolor.py +++ b/bench/python/all_ops/op_cvtcolor.py @@ -23,7 +23,7 @@ class OpCvtColorRGB2GRAY(AbstractOpBase): def setup(self, input): - pass + super().setup(input) def run(self, input): return cvcuda.cvtcolor(input, cvcuda.ColorConversion.RGB2GRAY) @@ -31,7 +31,7 @@ def run(self, input): class OpCvtColorRGB2BGR(AbstractOpBase): def setup(self, input): - pass + super().setup(input) def run(self, input): return cvcuda.cvtcolor(input, cvcuda.ColorConversion.RGB2BGR) diff --git a/bench/python/all_ops/op_flip.py b/bench/python/all_ops/op_flip.py index 0e0a1eb51..9f47c1b01 100644 --- a/bench/python/all_ops/op_flip.py +++ b/bench/python/all_ops/op_flip.py @@ -23,6 +23,7 @@ class OpFlipX(AbstractOpBase): def setup(self, input): + super().setup(input) self.flip_code = 0 # means flipping around x axis. def run(self, input): @@ -31,6 +32,7 @@ def run(self, input): class OpFlipY(AbstractOpBase): def setup(self, input): + super().setup(input) self.flip_code = 1 # means flipping around y axis. def run(self, input): @@ -39,6 +41,7 @@ def run(self, input): class OpFlipXY(AbstractOpBase): def setup(self, input): + super().setup(input) self.flip_code = -1 # means flipping around x and y axis. def run(self, input): diff --git a/bench/python/all_ops/op_gaussianblur.py b/bench/python/all_ops/op_gaussianblur.py index 45f57eda3..918caf4b0 100644 --- a/bench/python/all_ops/op_gaussianblur.py +++ b/bench/python/all_ops/op_gaussianblur.py @@ -23,6 +23,7 @@ class OpGaussianBlur(AbstractOpBase): def setup(self, input): + super().setup(input) self.kernel_size = (3, 3) self.sigma = (5, 5) diff --git a/bench/python/all_ops/op_hqresize.py b/bench/python/all_ops/op_hqresize.py index 765256873..0296eecad 100644 --- a/bench/python/all_ops/op_hqresize.py +++ b/bench/python/all_ops/op_hqresize.py @@ -23,6 +23,7 @@ class OpHqResizeDown(AbstractOpBase): def setup(self, input): + super().setup(input) self.resize_width = 640 self.resize_height = 420 @@ -39,6 +40,7 @@ def run(self, input): class OpHqResizeUp(AbstractOpBase): def setup(self, input): + super().setup(input) self.resize_width = 1920 self.resize_height = 1280 diff --git a/bench/python/all_ops/op_inpaint.py b/bench/python/all_ops/op_inpaint.py index 6fa8af553..72da50131 100644 --- a/bench/python/all_ops/op_inpaint.py +++ b/bench/python/all_ops/op_inpaint.py @@ -26,6 +26,7 @@ class OpInpaint(AbstractOpBase): def setup(self, input): + super().setup(input) data = read_image(os.path.join(self.assets_dir, "brooklyn.jpg")) mask = read_image(os.path.join(self.assets_dir, "countour_lines.jpg")) # Binarize the mask diff --git a/bench/python/all_ops/op_jointbilateral.py b/bench/python/all_ops/op_jointbilateral.py index 9b9e694e3..a2a2b74f8 100644 --- a/bench/python/all_ops/op_jointbilateral.py +++ b/bench/python/all_ops/op_jointbilateral.py @@ -26,6 +26,7 @@ class OpJointBilateral(AbstractOpBase): def setup(self, input): + super().setup(input) self.diameter = 5 self.sigma_color = 50 self.sigma_space = 1 diff --git a/bench/python/all_ops/op_laplacian.py b/bench/python/all_ops/op_laplacian.py index 829268aed..440910629 100644 --- a/bench/python/all_ops/op_laplacian.py +++ b/bench/python/all_ops/op_laplacian.py @@ -23,6 +23,7 @@ class OpLaplacian(AbstractOpBase): def setup(self, input): + super().setup(input) self.kernel_size = 3 self.scale = 2.0 diff --git a/bench/python/all_ops/op_morphology.py b/bench/python/all_ops/op_morphology.py index 10744b3d1..674ea549d 100644 --- a/bench/python/all_ops/op_morphology.py +++ b/bench/python/all_ops/op_morphology.py @@ -64,6 +64,7 @@ def __call__(self): class OpMorphologyOpen(AbstractOpBase): def setup(self, input): + super().setup(input) self.MorphologyBase = MorphologyBase( self.device_id, input, cvcuda.MorphologyType.OPEN ) @@ -74,6 +75,7 @@ def run(self, input): class OpMorphologyClose(AbstractOpBase): def setup(self, input): + super().setup(input) self.MorphologyBase = MorphologyBase( self.device_id, input, cvcuda.MorphologyType.CLOSE ) @@ -84,6 +86,7 @@ def run(self, input): class OpMorphologyDilate(AbstractOpBase): def setup(self, input): + super().setup(input) self.MorphologyBase = MorphologyBase( self.device_id, input, cvcuda.MorphologyType.DILATE ) @@ -94,6 +97,7 @@ def run(self, input): class OpMorphologyErode(AbstractOpBase): def setup(self, input): + super().setup(input) self.MorphologyBase = MorphologyBase( self.device_id, input, cvcuda.MorphologyType.ERODE ) diff --git a/bench/python/all_ops/op_nms.py b/bench/python/all_ops/op_nms.py index c73def6c5..9b72df5b4 100644 --- a/bench/python/all_ops/op_nms.py +++ b/bench/python/all_ops/op_nms.py @@ -25,6 +25,7 @@ class OpNMS(AbstractOpBase): def setup(self, input): + super().setup(input) bboxes = torch.load( os.path.join(self.assets_dir, "brooklyn_bboxes.pt"), map_location="cuda:%d" % self.device_id, diff --git a/bench/python/all_ops/op_normalize.py b/bench/python/all_ops/op_normalize.py index fa7b3eddf..7ebed99e9 100644 --- a/bench/python/all_ops/op_normalize.py +++ b/bench/python/all_ops/op_normalize.py @@ -24,6 +24,7 @@ class OpNormalize(AbstractOpBase): def setup(self, input): + super().setup(input) mean_tensor = ( torch.Tensor([0.485, 0.456, 0.406]).reshape(1, 1, 1, 3).cuda(self.device_id) ) diff --git a/bench/python/all_ops/op_randomresizedcrop.py b/bench/python/all_ops/op_randomresizedcrop.py index 7da248a42..8ea99d734 100644 --- a/bench/python/all_ops/op_randomresizedcrop.py +++ b/bench/python/all_ops/op_randomresizedcrop.py @@ -23,6 +23,7 @@ class OpRandomResizedCrop(AbstractOpBase): def setup(self, input): + super().setup(input) self.resized_shape = (input.shape[0], 320, 580, 3) self.min_scale = 0.08 self.max_scale = 1.0 diff --git a/bench/python/all_ops/op_reformat.py b/bench/python/all_ops/op_reformat.py index 016b4acb7..c1c9bfa20 100644 --- a/bench/python/all_ops/op_reformat.py +++ b/bench/python/all_ops/op_reformat.py @@ -23,6 +23,7 @@ class OpReformatNCHWToNHWC(AbstractOpBase): def setup(self, input): + super().setup(input) self.input_nchw = cvcuda.reformat(input, "NCHW") def run(self, input): @@ -34,7 +35,7 @@ def visualize(self): class OpReformatNHWCToNCHW(AbstractOpBase): def setup(self, input): - pass + super().setup(input) def run(self, input): return cvcuda.reformat(input, "NCHW") diff --git a/bench/python/all_ops/op_remap.py b/bench/python/all_ops/op_remap.py index 9e6b94d29..15fc585e1 100644 --- a/bench/python/all_ops/op_remap.py +++ b/bench/python/all_ops/op_remap.py @@ -25,6 +25,7 @@ class OpRemap(AbstractOpBase): def setup(self, input): + super().setup(input) batch_size, width, height = input.shape[0], input.shape[2], input.shape[1] batch_map = np.stack([self.flipH(w=width, h=height) for _ in range(batch_size)]) batch_map = torch.as_tensor(batch_map, device="cuda") diff --git a/bench/python/all_ops/op_reshape.py b/bench/python/all_ops/op_reshape.py index 438aa1af6..d4d01e4c1 100644 --- a/bench/python/all_ops/op_reshape.py +++ b/bench/python/all_ops/op_reshape.py @@ -23,6 +23,7 @@ class OpReshape(AbstractOpBase): def setup(self, input): + super().setup(input) self.shape = input.shape[::-1] # Reverse everything out def run(self, input): diff --git a/bench/python/all_ops/op_resize.py b/bench/python/all_ops/op_resize.py index b60abe92f..08046da53 100644 --- a/bench/python/all_ops/op_resize.py +++ b/bench/python/all_ops/op_resize.py @@ -23,6 +23,7 @@ class OpResizeDown(AbstractOpBase): def setup(self, input): + super().setup(input) self.resize_width = 640 self.resize_height = 420 @@ -41,6 +42,7 @@ def run(self, input): class OpResizeUp(AbstractOpBase): def setup(self, input): + super().setup(input) self.resize_width = 1920 self.resize_height = 1280 diff --git a/bench/python/all_ops/op_resize_crop_convert_reformat.py b/bench/python/all_ops/op_resize_crop_convert_reformat.py index 32dccfbc7..52d0083ec 100644 --- a/bench/python/all_ops/op_resize_crop_convert_reformat.py +++ b/bench/python/all_ops/op_resize_crop_convert_reformat.py @@ -24,6 +24,7 @@ class OpResizeCropConvertReformat(AbstractOpBase): def setup(self, input): + super().setup(input) resize = 256 crop = 224 delta_shape = resize - crop diff --git a/bench/python/all_ops/op_rotate.py b/bench/python/all_ops/op_rotate.py index 9681f67e1..f542c0da9 100644 --- a/bench/python/all_ops/op_rotate.py +++ b/bench/python/all_ops/op_rotate.py @@ -23,6 +23,7 @@ class OpRotate(AbstractOpBase): def setup(self, input): + super().setup(input) self.angle_deg = 40 self.shift = [input.shape[2] // 4, input.shape[1] // 4] self.interpolation_type = cvcuda.Interp.LINEAR diff --git a/bench/python/all_ops/op_sift.py b/bench/python/all_ops/op_sift.py index 724cdd2af..98e61ecae 100644 --- a/bench/python/all_ops/op_sift.py +++ b/bench/python/all_ops/op_sift.py @@ -23,6 +23,7 @@ class OpSIFT(AbstractOpBase): def setup(self, input): + super().setup(input) self.max_features = 100 self.num_octave_layers = 3 self.contrast_threshold = 0.04 diff --git a/bench/python/all_ops/op_threshold.py b/bench/python/all_ops/op_threshold.py index 70208114b..300f9c9f9 100644 --- a/bench/python/all_ops/op_threshold.py +++ b/bench/python/all_ops/op_threshold.py @@ -24,6 +24,7 @@ class OpThreshold(AbstractOpBase): def setup(self, input): + super().setup(input) threshold = torch.tensor([150.0] * input.shape[0]) threshold = threshold.type(torch.float64) threshold = threshold.cuda(self.device_id) diff --git a/bench/python/all_ops/op_warpaffine.py b/bench/python/all_ops/op_warpaffine.py index 5130e3953..a983c4aa3 100644 --- a/bench/python/all_ops/op_warpaffine.py +++ b/bench/python/all_ops/op_warpaffine.py @@ -24,6 +24,7 @@ class OpWarpAffine(AbstractOpBase): def setup(self, input): + super().setup(input) self.xform = np.array( [[1.26666667, 0.6, -83.33333333], [-0.33333333, 1.0, 66.66666667]] ) @@ -43,6 +44,7 @@ def run(self, input): class OpWarpAffineInverse(AbstractOpBase): def setup(self, input): + super().setup(input) self.xform = np.array( [[1.26666667, 0.6, -83.33333333], [-0.33333333, 1.0, 66.66666667]] ) diff --git a/bench/python/all_ops/op_warpperspective.py b/bench/python/all_ops/op_warpperspective.py index af3bced1f..528b82df6 100644 --- a/bench/python/all_ops/op_warpperspective.py +++ b/bench/python/all_ops/op_warpperspective.py @@ -24,6 +24,7 @@ class OpWarpPerspective(AbstractOpBase): def setup(self, input): + super().setup(input) self.xform = np.array( [ [3.46153846e-01, 3.33031674e-01, 1.28000000e02], @@ -48,6 +49,7 @@ def run(self, input): class OpWarpPerspectiveInverse(AbstractOpBase): def setup(self, input): + super().setup(input) self.xform = np.array( [ [3.46153846e-01, 3.33031674e-01, 1.28000000e02], diff --git a/bench/python/bench_utils.py b/bench/python/bench_utils.py index 36874890b..c0b194544 100644 --- a/bench/python/bench_utils.py +++ b/bench/python/bench_utils.py @@ -30,6 +30,7 @@ import numpy as np import json import pandas +import nvcv logger = logging.getLogger(__name__) logging.basicConfig( @@ -96,7 +97,8 @@ def setup(self, input): Performs various setup activities to set this operator before it can be run. :param input: The input tensor to run the operator on. """ - pass + if hasattr(nvcv, "clear_cache"): + nvcv.clear_cache() @abstractmethod def run(self, input): diff --git a/bench/python/run_bench.py b/bench/python/run_bench.py index 9bab6c1e7..2ca39326d 100644 --- a/bench/python/run_bench.py +++ b/bench/python/run_bench.py @@ -20,6 +20,7 @@ import sys import logging import cvcuda +import nvcv import torch from pathlib import Path @@ -134,6 +135,7 @@ def run_bench( output_dir=output_dir, should_visualize=should_visualize, ) + torch.cuda.current_stream().synchronize() cvcuda_perf.pop_range() # For init_op except Exception as e: logger.error( @@ -164,6 +166,11 @@ def run_bench( break cvcuda_perf.pop_range(delete_range=not success) # For the run_op + # reset the cache limit to not affect other operator benchmarks, in case a benchmark test + # changed it + if hasattr(nvcv, "set_cache_limit_inbytes"): + total = torch.cuda.mem_get_info()[1] + nvcv.set_cache_limit_inbytes(total // 2) # Step 3: log the parameters used by the operator, initialized during the setup call. if success: diff --git a/cmake/PrintConfig.cmake b/cmake/PrintConfig.cmake index 4c58c7e89..728dfdaa3 100644 --- a/cmake/PrintConfig.cmake +++ b/cmake/PrintConfig.cmake @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -55,6 +55,12 @@ else() message(STATUS " BUILD_BENCH : off") endif() +if(BUILD_DOCS) + message(STATUS " BUILD_DOCS : ON") +else() + message(STATUS " BUILD_DOCS : off") +endif() + if(ENABLE_TEGRA) message(STATUS " ENABLE_TEGRA : ON") else() diff --git a/docker/build20.04/Dockerfile b/docker/build20.04/Dockerfile index d99223968..8db65d0d1 100644 --- a/docker/build20.04/Dockerfile +++ b/docker/build20.04/Dockerfile @@ -17,6 +17,7 @@ ARG VER_CUDA=? ARG VER_UBUNTU=? +ARG VER_NUMPY=? FROM nvidia/cuda:$VER_CUDA-devel-ubuntu$VER_UBUNTU @@ -63,7 +64,7 @@ RUN python3 -m pip install pre-commit RUN python3 -m pip install sphinx-rtd-theme sphinx==4.5.0 RUN python3 -m pip install breathe exhale recommonmark graphviz # Needed for python sphinx docs and Python wheels -RUN python3 -m pip install numpy==1.24.1 patchelf==0.17.2.1 +RUN python3 -m pip install numpy patchelf==0.17.2.1 # Python bindings ====================================== @@ -71,7 +72,7 @@ RUN python3 -m pip install numpy==1.24.1 patchelf==0.17.2.1 ADD deadsnakes-ubuntu-ppa-focal.list /etc/apt/sources.list.d RUN apt-key adv --keyserver keyserver.ubuntu.com --recv-keys BA6932366A755776 -RUN for PYTHON_VERSION in 3.7 3.8 3.9 3.10 3.11; do \ +RUN for PYTHON_VERSION in 3.8 3.9 3.10 3.11; do \ apt-get update \ && apt-get install -y --no-install-recommends \ python$PYTHON_VERSION-dev python$PYTHON_VERSION-distutils; \ diff --git a/docker/build22.04/Dockerfile b/docker/build22.04/Dockerfile index 216860511..84cfc2453 100644 --- a/docker/build22.04/Dockerfile +++ b/docker/build22.04/Dockerfile @@ -17,6 +17,7 @@ ARG VER_CUDA=? ARG VER_UBUNTU=? +ARG VER_NUMPY=? FROM nvidia/cuda:$VER_CUDA-devel-ubuntu$VER_UBUNTU @@ -59,7 +60,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ RUN python3 -m pip install sphinx-rtd-theme sphinx==4.5.0 RUN python3 -m pip install breathe exhale recommonmark graphviz # Needed for python sphinx docs and Python wheels -RUN python3 -m pip install numpy==1.24.1 patchelf==0.17.2.1 +RUN python3 -m pip install numpy patchelf==0.17.2.1 # Python bindings ====================================== @@ -72,7 +73,6 @@ RUN ln -sf /usr/share/zoneinfo/US/Pacific /etc/localtime RUN apt-get update \ && apt-get install -y --no-install-recommends \ - python3.7-dev python3.7-distutils \ python3.8-dev python3.8-distutils \ python3.9-dev python3.9-distutils \ python3.10-dev python3.10-distutils \ diff --git a/docker/config b/docker/config index d597b477b..85e3a6f9e 100644 --- a/docker/config +++ b/docker/config @@ -29,3 +29,4 @@ TAG_IMAGE_TEST=5 VER_CUDA=11.7.1 VER_UBUNTU=20.04 VER_TRT=24.01 +VER_NUMPY=1.24.1 diff --git a/docker/devel20.04/Dockerfile b/docker/devel20.04/Dockerfile index d71475ded..0865b5014 100644 --- a/docker/devel20.04/Dockerfile +++ b/docker/devel20.04/Dockerfile @@ -17,6 +17,7 @@ ARG BASE_IMAGE=? ARG TAG_IMAGE=? +ARG VER_NUMPY=? FROM $BASE_IMAGE:$TAG_IMAGE @@ -50,14 +51,19 @@ RUN apt-get update \ && rm -rf /var/lib/apt/lists/* # needed by tests -RUN for PYTHON_VERSION in 3.7 3.8 3.9 3.10 3.11; do \ +RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python3.8 && \ + python3.8 -m pip install --upgrade pip && \ + python3.8 -m pip install --upgrade \ + pytest torch==2.4.0 numpy typing-extensions && \ + rm -rf /root/.cache/pip; + +# Debug: Print the VER_NUMPY value to ensure it is passed correctly +RUN for PYTHON_VERSION in 3.9 3.10 3.11; do \ curl -sS https://bootstrap.pypa.io/get-pip.py | python$PYTHON_VERSION && \ python$PYTHON_VERSION -m pip install --upgrade pip && \ python$PYTHON_VERSION -m pip install --upgrade \ - pytest torch==1.13.0 numpy typing-extensions && \ + pytest torch==2.4.0 numpy==$VER_NUMPY typing-extensions && \ rm -rf /root/.cache/pip; \ done - - WORKDIR /cvcuda diff --git a/docker/devel22.04/Dockerfile b/docker/devel22.04/Dockerfile index 51f9c9abe..7c660024f 100644 --- a/docker/devel22.04/Dockerfile +++ b/docker/devel22.04/Dockerfile @@ -17,6 +17,7 @@ ARG BASE_IMAGE=? ARG TAG_IMAGE=? +ARG VER_NUMPY=? FROM $BASE_IMAGE:$TAG_IMAGE @@ -59,14 +60,17 @@ RUN apt-get update \ && rm -rf /var/lib/apt/lists/* # needed by tests (python3 is python3.10 in ubuntu22.04) -RUN python3 -m pip install torch==1.13.0 torchvision cupy-cuda11x \ +RUN python3 -m pip install torch==2.4.0 torchvision cupy-cuda11x \ + numpy==$VER_NUMPY \ && rm -rf /root/.cache/pip -RUN python3.8 -m pip install torch==1.13.0 torchvision cupy-cuda11x \ - numpy sphinx-rtd-theme sphinx breathe exhale recommonmark graphviz \ +RUN python3.8 -m pip install torch==2.4.0 torchvision cupy-cuda11x \ + numpy sphinx-rtd-theme sphinx breathe exhale recommonmark \ + graphviz && rm -rf /root/.cache/pip +RUN python3.9 -m pip install pytest torch==2.4.0 torchvision cupy-cuda11x \ + numpy==$VER_NUMPY \ && rm -rf /root/.cache/pip -RUN python3.9 -m pip install pytest torch==1.13.0 torchvision cupy-cuda11x \ - && rm -rf /root/.cache/pip -RUN python3.11 -m pip install --upgrade pytest torch==1.13.0 cupy-cuda11x \ +RUN python3.11 -m pip install --upgrade pytest torch==2.4.0 cupy-cuda11x \ + numpy==$VER_NUMPY \ && rm -rf /root/.cache/pip WORKDIR /cvcuda diff --git a/docker/env_devel_linux.sh b/docker/env_devel_linux.sh index f031be358..106fa1dcd 100755 --- a/docker/env_devel_linux.sh +++ b/docker/env_devel_linux.sh @@ -70,5 +70,5 @@ docker run --gpus=all --net=host --pull always -ti \ -v /var/tmp:/var/tmp \ -v $SDIR/..:$HOME/cvcuda \ $extra_args \ - $IMAGE_URL_BASE/devel-linux:$VER_UBUNTU-$VER_CUDA \ + $IMAGE_URL_BASE/devel-linux:$VER_UBUNTU-$VER_CUDA-$VER_NUMPY \ /usr/bin/bash -c "mkdir -p $HOME && chown $USER:$USER $HOME && su - $USER -c \"$extra_cmds\" && su - $USER" diff --git a/docker/update_build_image.sh b/docker/update_build_image.sh index 2a5cb4494..28e6e9bcd 100755 --- a/docker/update_build_image.sh +++ b/docker/update_build_image.sh @@ -37,11 +37,12 @@ cd "$SDIR" cd build$VER_UBUNTU -image=$IMAGE_URL_BASE/build-linux:$VER_UBUNTU-$VER_CUDA +image=$IMAGE_URL_BASE/build-linux:$VER_UBUNTU-$VER_CUDA-$VER_NUMPY docker build --network=host \ --build-arg "VER_CUDA=$VER_CUDA" \ --build-arg "VER_UBUNTU=$VER_UBUNTU" \ + --build-arg "VER_NUMPY=$VER_NUMPY" \ . -t "$image" if [[ $do_push == 1 ]]; then diff --git a/docker/update_devel_image.sh b/docker/update_devel_image.sh index 1bc93a10b..65bd34fc6 100755 --- a/docker/update_devel_image.sh +++ b/docker/update_devel_image.sh @@ -35,11 +35,12 @@ cd "$SDIR" cd devel$VER_UBUNTU -image=$IMAGE_URL_BASE/devel-linux:$VER_UBUNTU-$VER_CUDA +image=$IMAGE_URL_BASE/devel-linux:$VER_UBUNTU-$VER_CUDA-$VER_NUMPY docker build --network=host \ - --build-arg BASE_IMAGE=$IMAGE_URL_BASE/build-linux \ - --build-arg TAG_IMAGE=$VER_UBUNTU-$VER_CUDA \ + --build-arg "BASE_IMAGE=$IMAGE_URL_BASE/build-linux" \ + --build-arg "TAG_IMAGE=$VER_UBUNTU-$VER_CUDA-$VER_NUMPY" \ + --build-arg "VER_NUMPY=$VER_NUMPY" \ . -t $image if [[ $do_push == 1 ]]; then diff --git a/docs/CMakeLists.txt b/docs/CMakeLists.txt index 5ad4f5973..a43037c50 100644 --- a/docs/CMakeLists.txt +++ b/docs/CMakeLists.txt @@ -52,8 +52,12 @@ set(C_CPP_API_RST ${SPHINX_SOURCE}/_c_cpp_api) set(PY_CVCUDA_API_RST ${SPHINX_SOURCE}/_python_api/_cvcuda_api) # Start from clean directory for rst files, otherwise build could be affected due to old files -file(REMOVE_RECURSE ${C_CPP_API_RST}/*) -file(REMOVE_RECURSE ${PY_CVCUDA_API_RST}/*) +if(EXISTS ${C_CPP_API_RST}) + file(REMOVE_RECURSE ${C_CPP_API_RST}) +endif() +if(EXISTS ${PY_CVCUDA_API_RST}) + file(REMOVE_RECURSE ${PY_CVCUDA_API_RST}) +endif() # Generate rst files for groups from doxygen index.xml add_custom_target(cvcuda_groups ALL python3 ${SPHINX_SOURCE}/generate_groups.py ${C_CPP_API_RST} ${DOXYGEN_OUTPUT_DIR}/xml @@ -74,6 +78,7 @@ add_custom_command(OUTPUT ${SPHINX_INDEX_FILE} DEPENDS ${SPHINX_SOURCE}/index.rst cvcuda_doxygen cvcuda_groups + cvcuda_python_docs MAIN_DEPENDENCY ${SPHINX_SOURCE}/conf.py COMMENT "Generating documentation with Sphinx") diff --git a/docs/sphinx/gen_py_doc_rsts.py b/docs/sphinx/gen_py_doc_rsts.py deleted file mode 100644 index 320f97600..000000000 --- a/docs/sphinx/gen_py_doc_rsts.py +++ /dev/null @@ -1,231 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import re -import sys -from typing import List, Tuple - - -def exports_enum(s: str) -> bool: - return s.lstrip().startswith("py::enum_<") - - -def get_name_of_enum(s: str) -> str: - """Name of enum is first string in line""" - return re.findall('"([^"]*)"', s)[0] - - -def exports_class(s: str) -> bool: - return s.lstrip().startswith("py::class_<") - - -def get_name_of_class_if_documented(s: str) -> Tuple[bool, str]: - """ - If a class has only one strings in line, it has no documentation to be exported. - If it has more than one string, it has doc and first string is the title of the class - """ - found_strings = re.findall('"([^"]*)"', s) # get all strings - if len(found_strings) > 1: - return True, found_strings[0] - else: - return False, "" - - -def exports_def(s: str) -> bool: - return s.lstrip().startswith("m.def(") - - -def get_name_of_def(s: str) -> str: - """Name of def is first string in line""" - return re.findall('"([^"]*)"', s)[0] - - -def has_exports(file_path: str, export_calls: List[str]) -> bool: - with open(file_path, "r") as file_str: - file_str_read = file_str.read() - for call in export_calls: - if call in file_str_read: - export_calls.remove(call) - return True - return False - - -def create_rst_text(template_file: str, name: str, module: str, members: str) -> str: - with open(template_file, "r") as f: - rst_text = f.read() - rst_text = rst_text.replace("@OperatorName@", name) - rst_text = rst_text.replace("@=@", "=" * len(name)) - rst_text = rst_text.replace("@Module@", module) - rst_text = rst_text.replace("@MemberFunctions@", members) - return rst_text - - -def create_cvcuda_operator_rst_files( - cvcuda_path: str, outdir: str, python_cvcuda_root: str, export_calls: List[str] -) -> None: - # search for template rst file - template_rst_file_path = os.path.join( - cvcuda_path, "docs", "sphinx", "_python_api", "template.rst" - ) - if not os.path.isfile(template_rst_file_path): - raise FileNotFoundError(f"File {template_rst_file_path} not found") - - # iterate through all files - for i in sorted(os.listdir(python_cvcuda_root)): - op_file_path = os.path.join(python_cvcuda_root, i) - # Only work on .cpp files that export operators - if ( - os.path.isfile(op_file_path) - and i.endswith(".cpp") - and i != "Main.cpp" - and has_exports(op_file_path, export_calls) - ): - - # Get operator name form .cpp file: remove prefix "Op" and file type - operator_name = os.path.splitext(i)[0] - operator_name = operator_name[len("Op") :] # noqa: E203 - - # Look for functions to add to documentation - # search for all lines that start with "m.def(" (stripping leading white spaces) - # then pick first string of that line, this is the name of the python function to be exported - exports = set() - with open(op_file_path, "r") as fp: - for line in fp: - if exports_def(line): - exports.add(get_name_of_def(line)) - if len(exports) == 0: - raise RuntimeError(f"No exports found in file {op_file_path}") - exports_str = ", ".join(exports) - - # Create text to put into rst file - starting from a template - rst_text = create_rst_text( - template_rst_file_path, operator_name, "cvcuda", exports_str - ) - - # Write rst file: outdir/_op_.rst - outfile = os.path.join(outdir, f"_op_{operator_name.lower()}.rst") - with open(outfile, "w") as f: - f.write(rst_text) - return - - -def create_cvcuda_non_operator_rst_files( - cvcuda_path: str, outdir: str, python_cvcuda_root: str, export_calls: List[str] -) -> None: - # search for template rst file - template_rst_file_path = os.path.join( - cvcuda_path, "docs", "sphinx", "_python_api", "template.rst" - ) - if not os.path.isfile(template_rst_file_path): - raise FileNotFoundError(f"File {template_rst_file_path} not found") - - for i in sorted(os.listdir(python_cvcuda_root)): - nonop_file_path = os.path.join(python_cvcuda_root, i) - # Only work on .cpp files that something different than operators - if ( - os.path.isfile(nonop_file_path) - and i.endswith(".cpp") - and i != "Main.cpp" - and has_exports(nonop_file_path, export_calls) - ): - # Look for functions to add to documentation - # Search for all lines that start with "py::enum_<" or "py::class_<" - with open(nonop_file_path, "r") as fp: - for line in fp: - if exports_enum(line): - export = get_name_of_enum(line) - elif exports_class(line): - has_doc, name = get_name_of_class_if_documented(line) - if has_doc: - export = name - else: - continue - else: - continue - - # Create text to put into rst file - starting from a template - rst_text = create_rst_text( - template_rst_file_path, export, "cvcuda", export - ) - - # Write rst file: outdir/_aux_.rst - outfile = os.path.join(outdir, f"_aux_{export.lower()}.rst") - with open(outfile, "w") as f: - f.write(rst_text) - return - - -def export_found(s: str) -> bool: - return s.lstrip().startswith("Export") - - -def get_export_fun_name(s: str) -> str: - return s.lstrip().split("(", 1)[0] - - -def exporting_nonops(s: str) -> bool: - """Everything after that command exports auxiliary operator entities - (non-operators)""" - return s.lstrip().startswith("// doctag: Non-Operators") - - -def exporting_ops(s: str) -> bool: - """Everything after that command exports operators""" - return s.lstrip().startswith("// doctag: Operators") - - -def get_exported_cvcuda(path_to_main: str): - export_nonop = [] # list for non operators - export_op = [] # list for operators - exports = None - with open(path_to_main, "r") as fp: - for line in fp: - if export_found(line): - # remove everything after first "(" - name = get_export_fun_name(line) - try: - exports.append(name) - except AttributeError: - print( - "No comment '// doctag: Non-Operators' or '// doctag: Operators' was found in " - f"{path_to_main} prior to 'Export*(m);'-routines." - ) - sys.exit() - elif exporting_nonops(line): - exports = export_nonop - elif exporting_ops(line): - exports = export_op - assert len(export_nonop) > 0 and len(export_op) > 0 - return export_nonop, export_op - - -def generate_py_doc_rsts_cvcuda(cvcuda_path: str, outdir: str): - python_cvcuda_root = os.path.join(cvcuda_path, "python", "mod_cvcuda") - export_nonop, export_op = get_exported_cvcuda( - os.path.join(python_cvcuda_root, "Main.cpp") - ) - create_cvcuda_operator_rst_files(cvcuda_path, outdir, python_cvcuda_root, export_op) - create_cvcuda_non_operator_rst_files( - cvcuda_path, outdir, python_cvcuda_root, export_nonop - ) - return - - -if __name__ == "__main__": - outdir = sys.argv[1] # path/to/cvcuda/docs/sphinx/_python_api/_cvcuda_api - cvcuda_path = sys.argv[2] # path/to/cvcuda - os.makedirs(outdir, exist_ok=True) - generate_py_doc_rsts_cvcuda(cvcuda_path, outdir) diff --git a/docs/sphinx/index.rst b/docs/sphinx/index.rst index f228f771e..66f03f3f8 100644 --- a/docs/sphinx/index.rst +++ b/docs/sphinx/index.rst @@ -123,7 +123,7 @@ Copyright :maxdepth: 1 :hidden: - v0.10.1-beta + v0.11.0-beta v0.10.0-beta v0.9.0-beta v0.8.0-beta diff --git a/docs/sphinx/modules/c_algos.rst b/docs/sphinx/modules/c_algos.rst index 9164df2c5..8588d7c04 100644 --- a/docs/sphinx/modules/c_algos.rst +++ b/docs/sphinx/modules/c_algos.rst @@ -14,10 +14,10 @@ # See the License for the specific language governing permissions and # limitations under the License. -Algorithms -========== +OPERATORS +========= -CV-CUDA Algorithms +CV-CUDA Operators .. toctree:: :glob: diff --git a/docs/sphinx/modules/c_modules.rst b/docs/sphinx/modules/c_modules.rst index 6d0949a65..80cd189a5 100644 --- a/docs/sphinx/modules/c_modules.rst +++ b/docs/sphinx/modules/c_modules.rst @@ -1,5 +1,5 @@ .. - # SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + # SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -21,4 +21,4 @@ C API Core Utilities - Algorithms + Operators diff --git a/docs/sphinx/modules/cpp_algos.rst b/docs/sphinx/modules/cpp_algos.rst index c10ae2051..8588d7c04 100644 --- a/docs/sphinx/modules/cpp_algos.rst +++ b/docs/sphinx/modules/cpp_algos.rst @@ -14,10 +14,10 @@ # See the License for the specific language governing permissions and # limitations under the License. -ALGORITHMS -========== +OPERATORS +========= -CV-CUDA Algorithms +CV-CUDA Operators .. toctree:: :glob: diff --git a/docs/sphinx/modules/cpp_modules.rst b/docs/sphinx/modules/cpp_modules.rst index 413d08a8d..4251c0235 100644 --- a/docs/sphinx/modules/cpp_modules.rst +++ b/docs/sphinx/modules/cpp_modules.rst @@ -21,5 +21,5 @@ C++ API Core Utilities - Algorithms + Operators CUDA Tools diff --git a/docs/sphinx/modules/python_modules.rst b/docs/sphinx/modules/python_modules.rst index b76f8420b..667313cc3 100644 --- a/docs/sphinx/modules/python_modules.rst +++ b/docs/sphinx/modules/python_modules.rst @@ -22,4 +22,5 @@ Python API :maxdepth: 3 Core - Algorithms + Operators + Operator's auxiliary entities diff --git a/docs/sphinx/modules/python_algos.rst b/docs/sphinx/modules/python_ops.rst similarity index 88% rename from docs/sphinx/modules/python_algos.rst rename to docs/sphinx/modules/python_ops.rst index bdeb5d6c3..7769d72dd 100644 --- a/docs/sphinx/modules/python_algos.rst +++ b/docs/sphinx/modules/python_ops.rst @@ -14,13 +14,12 @@ # See the License for the specific language governing permissions and # limitations under the License. -ALGORITHMS -========== +OPERATORS +========= -Algorithms for the NVIDIA® CV-CUDA library. +Operators for the NVIDIA® CV-CUDA library. .. toctree:: :glob: ../_python_api/_cvcuda_api/_op_* - ../_python_api/_cvcuda_api/_aux_* diff --git a/docs/sphinx/modules/python_ops_aux.rst b/docs/sphinx/modules/python_ops_aux.rst new file mode 100644 index 000000000..7f6c3ff8b --- /dev/null +++ b/docs/sphinx/modules/python_ops_aux.rst @@ -0,0 +1,25 @@ +.. + # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + # SPDX-License-Identifier: Apache-2.0 + # + # Licensed under the Apache License, Version 2.0 (the "License"); + # you may not use this file except in compliance with the License. + # You may obtain a copy of the License at + # + # http://www.apache.org/licenses/LICENSE-2.0 + # + # Unless required by applicable law or agreed to in writing, software + # distributed under the License is distributed on an "AS IS" BASIS, + # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + # See the License for the specific language governing permissions and + # limitations under the License. + +OPERATOR'S AUXILIARY ENTITIES +============================= + +Operator's auxiliary entities for the NVIDIA® CV-CUDA library. + +.. toctree:: + :glob: + + ../_python_api/_cvcuda_api/_aux_* diff --git a/docs/sphinx/relnotes/v0.10.0-beta.rst b/docs/sphinx/relnotes/v0.10.0-beta.rst index 7f446dd77..17488f9dc 100644 --- a/docs/sphinx/relnotes/v0.10.0-beta.rst +++ b/docs/sphinx/relnotes/v0.10.0-beta.rst @@ -26,14 +26,14 @@ CV-CUDA v0.10.0 includes a critical bug fix (cache growth management) alongside * **New Features**: - * Added mechanism to limit and manage cache memory consumption (includes new "Best Practices" documentation). + * Added mechanism to limit and manage cache memory consumption (includes new "Best Practices" documentation) [1]_. * Performance improvements of color conversion operators (e.g., 2x faster RGB2YUV). * Refactored codebase to allow independent build of NVCV library (data structures). * **Bug Fixes**: - * Fixed unbounded cache memory consumption issue. - * Improved management of Python-created object lifetimes, decoupled from cache management. + * Fixed unbounded cache memory consumption issue [1]_. + * Improved management of Python-created object lifetimes, decoupled from cache management [1]_. * Fixed potential crash in Resize operator's linear and nearest neighbor interpolation from non-aligned vectorized writes. * Fixed Python CvtColor operator to correctly handle NV12 and NV21 outputs. * Fixed Resize and RandomResizedCrop linear interpolation weight for border rows and columns. @@ -68,3 +68,5 @@ Acknowledgements ---------------- CV-CUDA is developed jointly by NVIDIA and the ByteDance Machine Learning team. + +.. [1] These fixes and features add micro-second-level overhead to Python operator calls. Based on the performance analysis of our Python samples, we expect the production- and pipeline-level impact to be negligible. CUDA kernel and C++ call performance is not affected. We aim to investigate and reduce this overhead further in a future release.​ diff --git a/docs/sphinx/relnotes/v0.11.0-beta.rst b/docs/sphinx/relnotes/v0.11.0-beta.rst new file mode 100644 index 000000000..98b4631d4 --- /dev/null +++ b/docs/sphinx/relnotes/v0.11.0-beta.rst @@ -0,0 +1,84 @@ +.. + # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + # SPDX-License-Identifier: Apache-2.0 + # + # Licensed under the Apache License, Version 2.0 (the "License"); + # you may not use this file except in compliance with the License. + # You may obtain a copy of the License at + # + # http://www.apache.org/licenses/LICENSE-2.0 + # + # Unless required by applicable law or agreed to in writing, software + # distributed under the License is distributed on an "AS IS" BASIS, + # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + # See the License for the specific language governing permissions and + # limitations under the License. + +.. _v0.11.0-beta: + +v0.11.0-beta +============ + +Release Highlights +------------------ + +CV-CUDA v0.11.0 includes critical bug fixes alongside the following changes:​ + +* **New Features**:​ + + * Enable NVCV to be built as static library​ + * Improve Python doc generation and structure ​ + +* **Bug Fixes**:​ + + * Update pybind11 2.10.0 to 2.13.1. Fixes rare race conditions with Python garbage collector, adds compatibility with numpy2​ + +Compatibility and Known Limitations +----------------------------------- + +* **Pre-existing limitations**: + + * The CvtColor operator incorrectly computes the data location of the second chromaticity channel for conversions that involve YUV(420) semi-planar formats. This issue persists through the current release and we intend to address this bug in CV-CUDA v0.12. We do not recommend using these formats.​ + + * Known affected formats:​ + - NVCV_COLOR_YUV2RGB_I420​ + - NVCV_COLOR_RGB2YUV_I420​ + - NVCV_COLOR_YUV2BGR_I420​ + - NVCV_COLOR_BGR2YUV_I420​ + - NVCV_COLOR_YUV2RGBA_I420​ + - NVCV_COLOR_RGBA2YUV_I420​ + - NVCV_COLOR_YUV2BGRA_I420​ + - NVCV_COLOR_BGRA2YUV_I420​ + - NVCV_COLOR_RGB2YUV_I420​ + - NVCV_COLOR_YUV2RGB_YV12​ + - NVCV_COLOR_RGB2YUV_YV12​ + - NVCV_COLOR_YUV2BGR_YV12​ + - NVCV_COLOR_BGR2YUV_YV12​ + - NVCV_COLOR_YUV2RGBA_YV12​ + - NVCV_COLOR_RGBA2YUV_YV12​ + - NVCV_COLOR_YUV2BGRA_YV12​ + - NVCV_COLOR_BGRA2YUV_YV12​ + - NVCV_COLOR_RGB2YUV_YV12​ + - NVCV_COLOR_YUV2GRAY_420​ + +For the full list, see main README on `CV-CUDA GitHub `_. + +License +------- + +CV-CUDA is licensed under the `Apache 2.0 `_ license. + +Resources +--------- + +1. `CV-CUDA GitHub `_ +2. `CV-CUDA Increasing Throughput and Reducing Costs for AI-Based Computer Vision with CV-CUDA `_ +3. `NVIDIA Announces Microsoft, Tencent, Baidu Adopting CV-CUDA for Computer Vision AI `_ +4. `CV-CUDA helps Tencent Cloud audio and video PaaS platform achieve full-process GPU acceleration for video enhancement AI `_ + +Acknowledgements +---------------- + +CV-CUDA is developed jointly by NVIDIA and the ByteDance Machine Learning team. + +.. [1] These fixes and features add micro-second-level overhead to Python operator calls. Based on the performance analysis of our Python samples, we expect the production- and pipeline-level impact to be negligible. CUDA kernel and C++ call performance is not affected. We aim to investigate and reduce this overhead further in a future release.​ diff --git a/python/mod_cvcuda/CvtColorUtil.cpp b/python/mod_cvcuda/CvtColorUtil.cpp index 5d4df214e..5f80c08bf 100644 --- a/python/mod_cvcuda/CvtColorUtil.cpp +++ b/python/mod_cvcuda/CvtColorUtil.cpp @@ -296,7 +296,7 @@ int64_t GetOutputHeight(int64_t height, NVCVColorConversionCode code) } } -nvcv::TensorShape GetOutputTensorShape(nvcv::TensorShape inputShape, nvcv::ImageFormat outputFormat, +nvcv::TensorShape GetOutputTensorShape(const nvcv::TensorShape &inputShape, nvcv::ImageFormat outputFormat, NVCVColorConversionCode code) { if (inputShape.rank() < 3 || inputShape.rank() > 4) diff --git a/python/mod_cvcuda/CvtColorUtil.hpp b/python/mod_cvcuda/CvtColorUtil.hpp index caf277375..17203d1ed 100644 --- a/python/mod_cvcuda/CvtColorUtil.hpp +++ b/python/mod_cvcuda/CvtColorUtil.hpp @@ -26,7 +26,7 @@ nvcv::ImageFormat GetOutputFormat(nvcv::DataType in, NVCVColorConversionCode cod int64_t GetOutputHeight(int64_t inputHeight, NVCVColorConversionCode code); -nvcv::TensorShape GetOutputTensorShape(nvcv::TensorShape inputShape, nvcv::ImageFormat outputFormat, +nvcv::TensorShape GetOutputTensorShape(const nvcv::TensorShape &inputShape, nvcv::ImageFormat outputFormat, NVCVColorConversionCode code); #endif // NVCV_COLOR_CONVERSION_UTIL_HPP diff --git a/python/mod_nvcv/Cache.cpp b/python/mod_nvcv/Cache.cpp index ecc854c8f..fa62b5e70 100644 --- a/python/mod_nvcv/Cache.cpp +++ b/python/mod_nvcv/Cache.cpp @@ -85,6 +85,7 @@ struct Cache::Impl std::mutex mtx; Items items; int64_t cache_limit_inbytes; + int64_t current_size_inbytes = 0; }; Cache::Cache() @@ -102,12 +103,15 @@ void Cache::add(CacheItem &item) return; } - if (item.GetSizeInBytes() + doCurrentSizeInBytes() > doGetCacheLimit()) + if (item.GetSizeInBytes() + doGetCurrentSizeInBytes() > doGetCacheLimit()) { - savedItems = std::move(pimpl->items); + // we clear the cache: all pimpl->items will be dtor'ed at the end of scope of savedItems and cache size will be reset to 0 + savedItems = std::move(pimpl->items); + pimpl->current_size_inbytes = 0; } pimpl->items.emplace(&item.key(), item.shared_from_this()); + pimpl->current_size_inbytes += item.GetSizeInBytes(); } } @@ -138,6 +142,7 @@ void Cache::removeAllNotInUseMatching(const IKey &key) if (!it->second->isInUse()) { holdItemsUntilMtxUnlocked.push_back(it->second); + pimpl->current_size_inbytes -= it->second->GetSizeInBytes(); pimpl->items.erase(it++); } else @@ -202,11 +207,12 @@ std::shared_ptr Cache::fetchOne(const IKey &key) const void Cache::clear() { - Items savedItems; - std::unique_lock lk(pimpl->mtx); - savedItems = std::move(pimpl->items); - lk.unlock(); - savedItems.clear(); + Items savedItems; + { + std::unique_lock lk(pimpl->mtx); + savedItems = std::move(pimpl->items); + pimpl->current_size_inbytes = 0; + } } size_t Cache::size() const @@ -236,9 +242,11 @@ void Cache::setCacheLimit(int64_t new_cache_limit_inbytes) Items savedItems; { std::unique_lock lk(pimpl->mtx); - if (doCurrentSizeInBytes() > new_cache_limit_inbytes) + if (doGetCurrentSizeInBytes() > new_cache_limit_inbytes) { - savedItems = std::move(pimpl->items); + // we clear the cache: all pimpl->items will be dtor'ed at the end of scope of savedItems and cache size will be reset to 0 + savedItems = std::move(pimpl->items); + pimpl->current_size_inbytes = 0; } pimpl->cache_limit_inbytes = new_cache_limit_inbytes; } @@ -258,19 +266,12 @@ int64_t Cache::doGetCacheLimit() const int64_t Cache::getCurrentSizeInBytes() { std::unique_lock lk(pimpl->mtx); - return doCurrentSizeInBytes(); + return doGetCurrentSizeInBytes(); } -int64_t Cache::doCurrentSizeInBytes() const +int64_t Cache::doGetCurrentSizeInBytes() const { - int64_t current_size_inbytes = 0; - - for (auto it = pimpl->items.begin(); it != pimpl->items.end(); ++it) - { - current_size_inbytes += it->second->GetSizeInBytes(); - } - - return current_size_inbytes; + return pimpl->current_size_inbytes; } void Cache::doIterateThroughItems(const std::function &fn) const diff --git a/python/mod_nvcv/Cache.hpp b/python/mod_nvcv/Cache.hpp index db29a2b39..55cd60a5f 100644 --- a/python/mod_nvcv/Cache.hpp +++ b/python/mod_nvcv/Cache.hpp @@ -135,7 +135,7 @@ class PYBIND11_EXPORT Cache Cache(); void doIterateThroughItems(const std::function &fn) const; - int64_t doCurrentSizeInBytes() const; + int64_t doGetCurrentSizeInBytes() const; int64_t doGetCacheLimit() const; }; diff --git a/python/setup.py.in b/python/setup.py.in index b88d9bc74..c22e9d0ff 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -66,7 +66,7 @@ setup( "": ["*.so", "cvcuda.libs/*.*"] }, # Includes the binding .so + core .so files include_package_data=True, - install_requires=["numpy>=1.23.5,<2.0.0"], + install_requires=["numpy>=1.23.5"], python_requires="==${PYTHON_VERSION}.*", zip_safe=False, cmdclass={ diff --git a/samples/scripts/run_samples.sh b/samples/scripts/run_samples.sh index f7f17b5b2..e207fd303 100755 --- a/samples/scripts/run_samples.sh +++ b/samples/scripts/run_samples.sh @@ -136,3 +136,4 @@ python3 $SAMPLES_DIR/label/python/main.py -o "$LABEL_RUN_DIR" # Run it with batch size 1 on a single image LABEL_RUN_DIR=$(create_output_dir "$LABEL_OUT_DIR") python3 $SAMPLES_DIR/label/python/main.py -i $SAMPLES_DIR/assets/images/peoplenet.jpg -b 1 -o "$LABEL_RUN_DIR" + diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index c4b4001d9..a4a5696f5 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -21,5 +21,8 @@ if(LTO_ENABLED) set(CMAKE_INTERPROCEDURAL_OPTIMIZATION on) endif() +# Setting NVCV lib to be a shared lib +set(NVCV_BUILD_SHARED_LIBS ON) + add_subdirectory(nvcv) add_subdirectory(cvcuda) diff --git a/src/cvcuda/priv/legacy/CvCudaLegacyHelpers.cpp b/src/cvcuda/priv/legacy/CvCudaLegacyHelpers.cpp index 6bac8da17..edcf1f0c6 100644 --- a/src/cvcuda/priv/legacy/CvCudaLegacyHelpers.cpp +++ b/src/cvcuda/priv/legacy/CvCudaLegacyHelpers.cpp @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -284,6 +284,8 @@ NVCVStatus TranslateError(legacy::cuda_op::ErrorCode err) switch (err) { + case ErrorCode::SUCCESS: + return NVCV_SUCCESS; case ErrorCode::INVALID_PARAMETER: case ErrorCode::INVALID_DATA_FORMAT: case ErrorCode::INVALID_DATA_SHAPE: diff --git a/src/nvcv/CMakeLists.txt b/src/nvcv/CMakeLists.txt index 7dd2a0674..f5f544ff0 100644 --- a/src/nvcv/CMakeLists.txt +++ b/src/nvcv/CMakeLists.txt @@ -17,14 +17,24 @@ cmake_minimum_required(VERSION 3.20.1) project(nvcv LANGUAGES C CXX - VERSION 0.10.1 + VERSION 0.11.0 DESCRIPTION "NVCV is NVIDIA Computer Vision library" ) # Used when creating special builds set(PROJECT_VERSION_SUFFIX "-beta") -option(NVCV_ENABLE_INSTALL "Enables creation of NVCV installers using cpack" ON) +include(CMakeDependentOption) + +option(NVCV_BUILD_SHARED_LIBS "Builds NVCV as shared library" ON) + +if(NVCV_ENABLE_INSTALL AND NOT NVCV_BUILD_SHARED_LIBS) + message(FATAL_ERROR "NVCV installers can only be enabled when building NVCV as a shared library") +endif() + +# Enables install targets by default if building shared libs. User can disable it. +# If building static libs, forcibly disables install targets. +cmake_dependent_option(NVCV_ENABLE_INSTALL "Enables creation of NVCV installers using cpack" ON NVCV_BUILD_SHARED_LIBS OFF) # Configure build tree ====================== @@ -33,8 +43,5 @@ list(PREPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake") include(ConfigVersion) include(ConfigBuildTree) -# NVCV currently supports only shared build -set(CMAKE_POSITION_INDEPENDENT_CODE on) - add_subdirectory(util) add_subdirectory(src) diff --git a/src/nvcv/cmake/ConfigBuildTree.cmake b/src/nvcv/cmake/ConfigBuildTree.cmake index ab8082ba2..9b23c6565 100644 --- a/src/nvcv/cmake/ConfigBuildTree.cmake +++ b/src/nvcv/cmake/ConfigBuildTree.cmake @@ -22,6 +22,14 @@ endif() set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin) set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) +# PIC must be enabled if building a shared lib +if(NVCV_BUILD_SHARED_LIBS) + set(BUILD_SHARED_LIBS ON) + set(CMAKE_POSITION_INDEPENDENT_CODE on) +else() + set(BUILD_SHARED_LIBS OFF) +endif() + include(GNUInstallDirs) set(CMAKE_INSTALL_LIBDIR "lib/${CMAKE_LIBRARY_ARCHITECTURE}") diff --git a/src/nvcv/src/CMakeLists.txt b/src/nvcv/src/CMakeLists.txt index 5253b21d3..fef54d274 100644 --- a/src/nvcv/src/CMakeLists.txt +++ b/src/nvcv/src/CMakeLists.txt @@ -16,7 +16,7 @@ # nvcv_types private implementation add_subdirectory(priv) -add_library(nvcv_types SHARED +add_library(nvcv_types Config.cpp Version.cpp Status.cpp @@ -49,8 +49,14 @@ target_include_directories(nvcv_types ) configure_version(nvcv_types NVCV nvcv ${PROJECT_VERSION}) + configure_symbol_versioning(nvcv_types NVCV nvcv_types) -setup_dso(nvcv_types ${PROJECT_VERSION}) + +if(BUILD_SHARED_LIBS) + setup_dso(nvcv_types ${PROJECT_VERSION}) +else() + target_compile_definitions(nvcv_types PRIVATE -DNVCV_STATIC=1) +endif() target_compile_definitions(nvcv_types PRIVATE -DNVCV_EXPORTING=1) diff --git a/src/nvcv/src/include/nvcv/Export.h b/src/nvcv/src/include/nvcv/Export.h index 611bcf4f7..53581b0fd 100644 --- a/src/nvcv/src/include/nvcv/Export.h +++ b/src/nvcv/src/include/nvcv/Export.h @@ -34,7 +34,11 @@ # endif #else # if __GNUC__ >= 4 -# define NVCV_PUBLIC __attribute__((visibility("default"))) +# ifdef NVCV_STATIC +# define NVCV_PUBLIC __attribute__((visibility("hidden"))) +# else +# define NVCV_PUBLIC __attribute__((visibility("default"))) +# endif # else # define NVCV_PUBLIC # endif diff --git a/src/nvcv/util/CMakeLists.txt b/src/nvcv/util/CMakeLists.txt index 748cc9843..1287b2ff4 100644 --- a/src/nvcv/util/CMakeLists.txt +++ b/src/nvcv/util/CMakeLists.txt @@ -16,7 +16,7 @@ find_package(CUDAToolkit REQUIRED) # nvcv_util_sanitizer --------------------------------- -add_library(nvcv_util_sanitizer +add_library(nvcv_util_sanitizer STATIC SanitizerOptions.c ) @@ -176,7 +176,7 @@ target_include_directories(nvcv_util_symver ) # nvcv_util --------------------------------- -add_library(nvcv_util +add_library(nvcv_util STATIC Assert.cpp CheckError.cpp String.cpp diff --git a/tests/cvcuda/python/test_opmatch.py b/tests/cvcuda/python/test_opmatch.py index 2d29fd812..81133fe93 100644 --- a/tests/cvcuda/python/test_opmatch.py +++ b/tests/cvcuda/python/test_opmatch.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2023-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -39,9 +39,21 @@ def distance(p1, p2, norm_type): if norm_type == cvcuda.Norm.HAMMING: return sum([bin(c1 ^ c2).count("1") for c1, c2 in zip(p1, p2)]) elif norm_type == cvcuda.Norm.L1: - return sum([abs(ref.absdiff(c1, c2)) for c1, c2 in zip(p1, p2)]) + return sum( + [ + abs(ref.absdiff(c1.astype(np.float64), c2.astype(np.float64))) + for c1, c2 in zip(p1, p2) + ] + ) elif norm_type == cvcuda.Norm.L2: - return np.sqrt(sum([ref.absdiff(c1, c2) ** 2 for c1, c2 in zip(p1, p2)])) + return np.sqrt( + sum( + [ + ref.absdiff(c1.astype(np.float64), c2.astype(np.float64)) ** 2 + for c1, c2 in zip(p1, p2) + ] + ) + ) def brute_force_matcher(batch_set1, batch_set2, cross_check, norm_type): batch_matches = [] diff --git a/tests/cvcuda/system/TestOpHQResize.cpp b/tests/cvcuda/system/TestOpHQResize.cpp index c5953b528..39e634015 100644 --- a/tests/cvcuda/system/TestOpHQResize.cpp +++ b/tests/cvcuda/system/TestOpHQResize.cpp @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -45,12 +46,29 @@ using uniform_distribution namespace baseline { +template +struct Roi +{ + using ShapeT = cuda::MakeType; + ShapeT origin; + ShapeT shape; +}; + +template +Roi FullRoi(typename Roi::ShapeT shape) +{ + Roi roi; + roi.origin = decltype(roi.origin){0}; + roi.shape = shape; + return roi; +} + template -void ForAll(int2 shape, Cb &&cb) +void ForAllInRoi(Roi<2> roi, Cb &&cb) { - for (int y = 0; y < shape.y; y++) + for (int y = roi.origin.y; y < roi.origin.y + roi.shape.y; y++) { - for (int x = 0; x < shape.x; x++) + for (int x = roi.origin.x; x < roi.origin.x + roi.shape.x; x++) { cb(int2{x, y}); } @@ -58,16 +76,18 @@ void ForAll(int2 shape, Cb &&cb) } template -void ForAll(int3 shape, Cb &&cb) +void ForAllInRoi(Roi<3> roi, Cb &&cb) { - for (int z = 0; z < shape.z; z++) - for (int y = 0; y < shape.y; y++) + for (int z = roi.origin.z; z < roi.origin.z + roi.shape.z; z++) + { + for (int y = roi.origin.y; y < roi.origin.y + roi.shape.y; y++) { - for (int x = 0; x < shape.x; x++) + for (int x = roi.origin.x; x < roi.origin.x + roi.shape.x; x++) { cb(int3{x, y, z}); } } + } } template @@ -88,7 +108,7 @@ struct CpuSample BT &get(int sampleIdx, const ShapeT idx, int channel) { - return *(reinterpret_cast(m_data.data() + offset(sampleIdx, idx)) + channel); + return *(reinterpret_cast(data() + offset(sampleIdx, idx)) + channel); } uint8_t *data() @@ -268,7 +288,8 @@ struct Filter }; template -void RunNN(int axis, CpuSample &outTensorCpu, CpuSample &inTensorCpu) +void RunNN(int axis, CpuSample &outTensorCpu, CpuSample &inTensorCpu, + Roi roi) { const int numSamples = inTensorCpu.numSamples(); const int numChannels = inTensorCpu.numChannels(); @@ -280,24 +301,25 @@ void RunNN(int axis, CpuSample &outTensorCpu, CpuSample outIdx) - { - auto inIdx = outIdx; - int inAxis = std::floor(cuda::GetElement(outIdx, axis) * axisScale + axisOrigin); - inAxis = inAxis < 0 ? 0 : (inAxis > inSize - 1 ? inSize - 1 : inAxis); - cuda::GetElement(inIdx, axis) = inAxis; - for (int c = 0; c < numChannels; c++) - { - outTensorCpu.get(sampleIdx, outIdx, c) - = cuda::SaturateCast(inTensorCpu.get(sampleIdx, inIdx, c)); - } - }); + ForAllInRoi(roi, + [&](const cuda::MakeType outIdx) + { + auto inIdx = outIdx; + int inAxis = std::floor(cuda::GetElement(outIdx, axis) * axisScale + axisOrigin); + inAxis = inAxis < 0 ? 0 : (inAxis > inSize - 1 ? inSize - 1 : inAxis); + cuda::GetElement(inIdx, axis) = inAxis; + for (int c = 0; c < numChannels; c++) + { + outTensorCpu.get(sampleIdx, outIdx, c) + = cuda::SaturateCast(inTensorCpu.get(sampleIdx, inIdx, c)); + } + }); } } template -void RunLinear(int axis, CpuSample &outTensorCpu, CpuSample &inTensorCpu) +void RunLinear(int axis, CpuSample &outTensorCpu, CpuSample &inTensorCpu, + Roi roi) { const int numSamples = inTensorCpu.numSamples(); const int numChannels = inTensorCpu.numChannels(); @@ -309,33 +331,33 @@ void RunLinear(int axis, CpuSample &outTensorCpu, CpuSample const float axisOrigin = 0.5f * axisScale - 0.5f; for (int sampleIdx = 0; sampleIdx < numSamples; sampleIdx++) { - ForAll(outShape, - [&](const cuda::MakeType outIdx) - { - const float inAxis0f = cuda::GetElement(outIdx, axis) * axisScale + axisOrigin; - int inAxis0 = std::floor(inAxis0f); - int inAxis1 = inAxis0 + 1; - const float q = inAxis0f - inAxis0; - inAxis0 = inAxis0 < 0 ? 0 : (inAxis0 > inSize - 1 ? inSize - 1 : inAxis0); - inAxis1 = inAxis1 < 0 ? 0 : (inAxis1 > inSize - 1 ? inSize - 1 : inAxis1); - auto inIdx0 = outIdx; - auto inIdx1 = outIdx; - cuda::GetElement(inIdx0, axis) = inAxis0; - cuda::GetElement(inIdx1, axis) = inAxis1; - for (int c = 0; c < numChannels; c++) - { - const float a = inTensorCpu.get(sampleIdx, inIdx0, c); - const float b = inTensorCpu.get(sampleIdx, inIdx1, c); - const float tmp = b - a; - outTensorCpu.get(sampleIdx, outIdx, c) = cuda::SaturateCast(std::fmaf(tmp, q, a)); - } - }); + ForAllInRoi(roi, + [&](const cuda::MakeType outIdx) + { + const float inAxis0f = cuda::GetElement(outIdx, axis) * axisScale + axisOrigin; + int inAxis0 = std::floor(inAxis0f); + int inAxis1 = inAxis0 + 1; + const float q = inAxis0f - inAxis0; + inAxis0 = inAxis0 < 0 ? 0 : (inAxis0 > inSize - 1 ? inSize - 1 : inAxis0); + inAxis1 = inAxis1 < 0 ? 0 : (inAxis1 > inSize - 1 ? inSize - 1 : inAxis1); + auto inIdx0 = outIdx; + auto inIdx1 = outIdx; + cuda::GetElement(inIdx0, axis) = inAxis0; + cuda::GetElement(inIdx1, axis) = inAxis1; + for (int c = 0; c < numChannels; c++) + { + const float a = inTensorCpu.get(sampleIdx, inIdx0, c); + const float b = inTensorCpu.get(sampleIdx, inIdx1, c); + const float tmp = b - a; + outTensorCpu.get(sampleIdx, outIdx, c) = cuda::SaturateCast(std::fmaf(tmp, q, a)); + } + }); } } template void RunFilter(int axis, CpuSample &outTensorCpu, CpuSample &inTensorCpu, - const FilterT &filter) + const FilterT &filter, Roi roi) { const int numSamples = inTensorCpu.numSamples(); const int numChannels = inTensorCpu.numChannels(); @@ -350,36 +372,36 @@ void RunFilter(int axis, CpuSample &outTensorCpu, CpuSample for (int sampleIdx = 0; sampleIdx < numSamples; sampleIdx++) { - ForAll(outShape, - [&](const cuda::MakeType outIdx) - { - const float inAxis0f = cuda::GetElement(outIdx, axis) * axisScale + axisOrigin; - int inAxis0 = std::ceil(inAxis0f); - const float fStart = (inAxis0 - inAxis0f) * filterStep; - for (int c = 0; c < numChannels; c++) - { - float tmp = 0; - float norm = 0; - for (int k = 0; k < filterSupport; k++) - { - int inAxis = inAxis0 + k; - inAxis = inAxis < 0 ? 0 : (inAxis > inSize - 1 ? inSize - 1 : inAxis); - auto inIdx = outIdx; - cuda::GetElement(inIdx, axis) = inAxis; - const InBT inVal = inTensorCpu.get(sampleIdx, inIdx, c); - float coeff = filter(fStart + k * filterStep); - tmp = std::fmaf(inVal, coeff, tmp); - norm += coeff; - } - outTensorCpu.get(sampleIdx, outIdx, c) = cuda::SaturateCast(tmp / norm); - } - }); + ForAllInRoi(roi, + [&](const cuda::MakeType outIdx) + { + const float inAxis0f = cuda::GetElement(outIdx, axis) * axisScale + axisOrigin; + int inAxis0 = std::ceil(inAxis0f); + const float fStart = (inAxis0 - inAxis0f) * filterStep; + for (int c = 0; c < numChannels; c++) + { + float tmp = 0; + float norm = 0; + for (int k = 0; k < filterSupport; k++) + { + int inAxis = inAxis0 + k; + inAxis = inAxis < 0 ? 0 : (inAxis > inSize - 1 ? inSize - 1 : inAxis); + auto inIdx = outIdx; + cuda::GetElement(inIdx, axis) = inAxis; + const InBT inVal = inTensorCpu.get(sampleIdx, inIdx, c); + float coeff = filter(fStart + k * filterStep); + tmp = std::fmaf(inVal, coeff, tmp); + norm += coeff; + } + outTensorCpu.get(sampleIdx, outIdx, c) = cuda::SaturateCast(tmp / norm); + } + }); } } template void RunFilter(int axis, CpuSample &outTensorCpu, CpuSample &inTensorCpu, - const NVCVInterpolationType interpolation, bool antialias) + const NVCVInterpolationType interpolation, bool antialias, Roi roi) { const auto inShape = inTensorCpu.shape(); const auto outShape = outTensorCpu.shape(); @@ -391,28 +413,28 @@ void RunFilter(int axis, CpuSample &outTensorCpu, CpuSample { float radius = antialias ? inSize / outSize : 1; float support = std::max(1.0f, 2 * radius); - RunFilter(axis, outTensorCpu, inTensorCpu, Filter{support}); + RunFilter(axis, outTensorCpu, inTensorCpu, Filter{support}, roi); } break; case NVCV_INTERP_CUBIC: { float radius = antialias ? (2 * inSize / outSize) : 2; float support = std::max(4.0f, 2 * radius); - RunFilter(axis, outTensorCpu, inTensorCpu, Filter{support}); + RunFilter(axis, outTensorCpu, inTensorCpu, Filter{support}, roi); } break; case NVCV_INTERP_GAUSSIAN: { float radius = antialias ? inSize / outSize : 1; float support = std::max(1.0f, 2 * radius); - RunFilter(axis, outTensorCpu, inTensorCpu, Filter{support}); + RunFilter(axis, outTensorCpu, inTensorCpu, Filter{support}, roi); } break; case NVCV_INTERP_LANCZOS: { float radius = antialias ? (3 * inSize / outSize) : 3; float support = std::max(6.0f, 2 * radius); - RunFilter(axis, outTensorCpu, inTensorCpu, Filter{support}); + RunFilter(axis, outTensorCpu, inTensorCpu, Filter{support}, roi); } break; default: @@ -422,7 +444,8 @@ void RunFilter(int axis, CpuSample &outTensorCpu, CpuSample template void RunPass(int axis, CpuSample &outTensorCpu, CpuSample &inTensorCpu, - const NVCVInterpolationType minInterpolation, const NVCVInterpolationType magInterpolation, bool antialias) + const NVCVInterpolationType minInterpolation, const NVCVInterpolationType magInterpolation, bool antialias, + Roi roi) { const auto inShape = inTensorCpu.shape(); @@ -435,63 +458,80 @@ void RunPass(int axis, CpuSample &outTensorCpu, CpuSample void Resize(CpuSample &refTensorCpu, CpuSample &inTensorCpu, - const NVCVInterpolationType minInterpolation, const NVCVInterpolationType magInterpolation, bool antialias) + const NVCVInterpolationType minInterpolation, const NVCVInterpolationType magInterpolation, bool antialias, + std::optional> inRoiArg = std::nullopt, std::optional> outRoiArg = std::nullopt) { - int numSamples = inTensorCpu.numSamples(); - int numChannels = inTensorCpu.numChannels(); - const int2 inShape = inTensorCpu.shape(); - const int2 outShape = refTensorCpu.shape(); - const int2 interShape = {outShape.x, inShape.y}; - auto intermediateTensor = GetIntermediate(numSamples, interShape, numChannels); - RunPass(0, intermediateTensor, inTensorCpu, minInterpolation, magInterpolation, antialias); - RunPass(1, refTensorCpu, intermediateTensor, minInterpolation, magInterpolation, antialias); + int numSamples = inTensorCpu.numSamples(); + int numChannels = inTensorCpu.numChannels(); + const int2 inShape = inTensorCpu.shape(); + const int2 outShape = refTensorCpu.shape(); + const int2 interShape = {outShape.x, inShape.y}; + Roi<2> inRoi = inRoiArg.value_or(FullRoi<2>(inShape)); + Roi<2> outRoi = outRoiArg.value_or(FullRoi<2>(outShape)); + auto interRoi = Roi<2>(int2(outRoi.origin.x, inRoi.origin.y), int2(outRoi.shape.x, inRoi.shape.y)); + + auto intermediateTensor = GetIntermediate(numSamples, interShape, numChannels); + RunPass(0, intermediateTensor, inTensorCpu, minInterpolation, magInterpolation, antialias, interRoi); + RunPass(1, refTensorCpu, intermediateTensor, minInterpolation, magInterpolation, antialias, outRoi); } template void Resize(CpuSample &refTensorCpu, CpuSample &inTensorCpu, - const NVCVInterpolationType minInterpolation, const NVCVInterpolationType magInterpolation, bool antialias) + const NVCVInterpolationType minInterpolation, const NVCVInterpolationType magInterpolation, bool antialias, + std::optional> inRoiArg = std::nullopt, std::optional> outRoiArg = std::nullopt) { - int numSamples = inTensorCpu.numSamples(); - int numChannels = inTensorCpu.numChannels(); - const int3 inShape = inTensorCpu.shape(); - const int3 outShape = refTensorCpu.shape(); - const int3 interShape0 = {outShape.x, inShape.y, inShape.z}; - const int3 interShape1 = {outShape.x, outShape.y, inShape.z}; - auto intermediateTensor0 = GetIntermediate(numSamples, interShape0, numChannels); - RunPass(0, intermediateTensor0, inTensorCpu, minInterpolation, magInterpolation, antialias); + int numSamples = inTensorCpu.numSamples(); + int numChannels = inTensorCpu.numChannels(); + const int3 inShape = inTensorCpu.shape(); + const int3 outShape = refTensorCpu.shape(); + const int3 interShape0 = {outShape.x, inShape.y, inShape.z}; + const int3 interShape1 = {outShape.x, outShape.y, inShape.z}; + Roi<3> inRoi = inRoiArg.value_or(FullRoi<3>(inShape)); + Roi<3> outRoi = outRoiArg.value_or(FullRoi<3>(outShape)); + auto interRoi0 = Roi<3>{ + int3{outRoi.origin.x, inRoi.origin.y, inRoi.origin.z}, + int3{ outRoi.shape.x, inRoi.shape.y, inRoi.shape.z} + }; + auto interRoi1 = Roi<3>(int3{outRoi.origin.x, outRoi.origin.y, inRoi.origin.z}, + int3{outRoi.shape.x, outRoi.shape.y, inRoi.shape.z}); + + auto intermediateTensor0 = GetIntermediate(numSamples, interShape0, numChannels); + RunPass(0, intermediateTensor0, inTensorCpu, minInterpolation, magInterpolation, antialias, interRoi0); auto intermediateTensor1 = GetIntermediate(numSamples, interShape1, numChannels); - RunPass(1, intermediateTensor1, intermediateTensor0, minInterpolation, magInterpolation, antialias); - RunPass(2, refTensorCpu, intermediateTensor1, minInterpolation, magInterpolation, antialias); + RunPass(1, intermediateTensor1, intermediateTensor0, minInterpolation, magInterpolation, antialias, interRoi1); + RunPass(2, refTensorCpu, intermediateTensor1, minInterpolation, magInterpolation, antialias, outRoi); } template -void Compare(CpuSample &tensor, CpuSample &refTensor, bool antialias) +void Compare(CpuSample &tensor, CpuSample &refTensor, bool antialias, + std::optional> roi_arg = std::nullopt) { int numSamples = tensor.numSamples(); int numChannels = tensor.numChannels(); const auto shape = tensor.shape(); + const auto roi = roi_arg.value_or(FullRoi(shape)); ASSERT_EQ(numSamples, refTensor.numSamples()); ASSERT_EQ(numChannels, refTensor.numChannels()); ASSERT_EQ(shape, refTensor.shape()); @@ -499,33 +539,33 @@ void Compare(CpuSample &tensor, CpuSample &r int64_t vol = 0; for (int sampleIdx = 0; sampleIdx < numSamples; sampleIdx++) { - ForAll(shape, - [&](const cuda::MakeType idx) - { - for (int c = 0; c < numChannels; c++) - { - const BT val = tensor.get(sampleIdx, idx, c); - const BT refVal = refTensor.get(sampleIdx, idx, c); - err += abs(val - refVal); - vol += 1; - - if (std::is_integral_v) // uchar -> uchar, short -> short, ushort -> ushort - { - ASSERT_NEAR(val, refVal, (std::is_same_v ? 1 : 10)); // uchar : short, ushort - } - else // output type is float - { - if (!std::is_integral_v) // float -> float - { - ASSERT_NEAR(val, refVal, 1e-4); - } - else // [uchar, short, ushort] -> float - { - ASSERT_NEAR(val, refVal, (std::is_same_v ? 0.1 : 6)); - } - } - } - }); + ForAllInRoi(roi, + [&](const cuda::MakeType idx) + { + for (int c = 0; c < numChannels; c++) + { + const BT val = tensor.get(sampleIdx, idx, c); + const BT refVal = refTensor.get(sampleIdx, idx, c); + err += abs(val - refVal); + vol += 1; + + if (std::is_integral_v) // uchar -> uchar, short -> short, ushort -> ushort + { + ASSERT_NEAR(val, refVal, (std::is_same_v ? 1 : 10)); // uchar : short, ushort + } + else // output type is float + { + if (!std::is_integral_v) // float -> float + { + ASSERT_NEAR(val, refVal, 1e-4); + } + else // [uchar, short, ushort] -> float + { + ASSERT_NEAR(val, refVal, (std::is_same_v ? 0.1 : 6)); + } + } + } + }); } double mean_err = err / vol; ASSERT_LE(mean_err, antialias ? 0.1 : 0.4); @@ -662,6 +702,18 @@ void TestTensor(bool antialias) nvcv::Tensor inTensor = CreateTensorHelper(inDtype, "NHWC", numSamples, inShape.y, inShape.x, numChannels); nvcv::Tensor outTensor = CreateTensorHelper(outDtype, "NHWC", numSamples, outShape.y, outShape.x, numChannels); + baseline::Roi<2> inRoi = baseline::FullRoi<2>(inShape); + baseline::Roi<2> outRoi = baseline::FullRoi<2>(outShape); + if (inShape.x * inShape.y > 1 << 23) + { + inRoi.shape = cuda::min(inShape, int2{1 << 12, 1 << 11}); + inRoi.origin = inShape - inRoi.shape; + + double2 scale = cuda::StaticCast(outShape) / cuda::StaticCast(inShape); + outRoi.shape = cuda::StaticCast(scale * cuda::StaticCast(inRoi.shape)); + outRoi.origin = cuda::StaticCast(scale * cuda::StaticCast(inRoi.origin)); + } + auto inData = inTensor.exportData(); auto outData = outTensor.exportData(); ASSERT_TRUE(inData && outData); @@ -690,16 +742,14 @@ void TestTensor(bool antialias) for (int sampleIdx = 0; sampleIdx < numSamples; sampleIdx++) { - for (int y = 0; y < inShape.y; y++) - { - for (int x = 0; x < inShape.x; x++) - { - for (int c = 0; c < numChannels; c++) - { - inTensorCpu.get(sampleIdx, int2{x, y}, c) = rand(rng); - } - } - } + baseline::ForAllInRoi(inRoi, + [&](int2 idx) + { + for (int c = 0; c < numChannels; c++) + { + inTensorCpu.get(sampleIdx, idx, c) = rand(rng); + } + }); } cvcuda::HQResize op; @@ -725,9 +775,9 @@ void TestTensor(bool antialias) ASSERT_NO_THROW(op(stream, ws.get(), inTensor, outTensor, interpolation, interpolation, antialias)); ASSERT_EQ(cudaSuccess, cudaMemcpyAsync(outTensorCpu.data(), outData->basePtr(), outStrides.z * numSamples, cudaMemcpyDeviceToHost, stream)); - baseline::Resize(refTensorCpu, inTensorCpu, interpolation, interpolation, antialias); + baseline::Resize(refTensorCpu, inTensorCpu, interpolation, interpolation, antialias, {inRoi}, {outRoi}); ASSERT_EQ(cudaSuccess, cudaStreamSynchronize(stream)); - baseline::Compare(outTensorCpu, refTensorCpu, antialias); + baseline::Compare(outTensorCpu, refTensorCpu, antialias, {outRoi}); ASSERT_EQ(cudaSuccess, cudaStreamDestroy(stream)); } @@ -752,7 +802,7 @@ NVCV_TYPED_TEST_SUITE( NVCV_TEST_ROW(3, NVCV_SHAPE3D(100, 100, 100), NVCV_SHAPE3D(100, 100, 50), 3, float, float, NVCV_INTERP_CUBIC), NVCV_TEST_ROW(4, NVCV_SHAPE3D(40, 40, 40), NVCV_SHAPE3D(100, 40, 40), 5, uchar, float, NVCV_INTERP_LANCZOS), NVCV_TEST_ROW(7, NVCV_SHAPE3D(40, 40, 40), NVCV_SHAPE3D(50, 150, 100), 3, uchar, uchar, NVCV_INTERP_CUBIC), - NVCV_TEST_ROW(3, NVCV_SHAPE3D(1 << 10, 1 << 9, 1 << 9), NVCV_SHAPE3D(50, 150, 100), 3, uchar, uchar, + NVCV_TEST_ROW(3, NVCV_SHAPE3D(1 << 10, 1 << 9, 1 << 9), NVCV_SHAPE3D(100, 150, 100), 3, uchar, uchar, NVCV_INTERP_CUBIC)>); TYPED_TEST(OpHQResizeTensor3D, correct_output_with_antialias) @@ -773,6 +823,18 @@ TYPED_TEST(OpHQResizeTensor3D, correct_output_with_antialias) nvcv::Tensor outTensor = CreateTensorHelper(outDtype, "NDHWC", numSamples, outShape.z, outShape.y, outShape.x, numChannels); + baseline::Roi<3> inRoi = baseline::FullRoi<3>(inShape); + baseline::Roi<3> outRoi = baseline::FullRoi<3>(outShape); + if (inShape.x * inShape.y * inShape.z > 1 << 22) + { + inRoi.shape = cuda::min(inShape, int3{1 << 8, 1 << 7, 1 << 7}); + inRoi.origin = inShape - inRoi.shape; + + double3 scale = cuda::StaticCast(outShape) / cuda::StaticCast(inShape); + outRoi.shape = cuda::StaticCast(scale * cuda::StaticCast(inRoi.shape)); + outRoi.origin = cuda::StaticCast(scale * cuda::StaticCast(inRoi.origin)); + } + auto inData = inTensor.exportData(); auto outData = outTensor.exportData(); ASSERT_TRUE(inData && outData); @@ -802,19 +864,14 @@ TYPED_TEST(OpHQResizeTensor3D, correct_output_with_antialias) for (int sampleIdx = 0; sampleIdx < numSamples; sampleIdx++) { - for (int z = 0; z < inShape.z; z++) - { - for (int y = 0; y < inShape.y; y++) - { - for (int x = 0; x < inShape.x; x++) - { - for (int c = 0; c < numChannels; c++) - { - inTensorCpu.get(sampleIdx, int3{x, y, z}, c) = rand(rng); - } - } - } - } + baseline::ForAllInRoi(inRoi, + [&](int3 idx) + { + for (int c = 0; c < numChannels; c++) + { + inTensorCpu.get(sampleIdx, idx, c) = rand(rng); + } + }); } cvcuda::HQResize op; @@ -840,9 +897,9 @@ TYPED_TEST(OpHQResizeTensor3D, correct_output_with_antialias) ASSERT_NO_THROW(op(stream, ws.get(), inTensor, outTensor, interpolation, interpolation, antialias)); ASSERT_EQ(cudaSuccess, cudaMemcpyAsync(outTensorCpu.data(), outData->basePtr(), outStrides.w * numSamples, cudaMemcpyDeviceToHost, stream)); - baseline::Resize(refTensorCpu, inTensorCpu, interpolation, interpolation, antialias); + baseline::Resize(refTensorCpu, inTensorCpu, interpolation, interpolation, antialias, {inRoi}, {outRoi}); ASSERT_EQ(cudaSuccess, cudaStreamSynchronize(stream)); - baseline::Compare(outTensorCpu, refTensorCpu, antialias); + baseline::Compare(outTensorCpu, refTensorCpu, antialias, {outRoi}); ASSERT_EQ(cudaSuccess, cudaStreamDestroy(stream)); } @@ -906,6 +963,28 @@ TYPED_TEST(OpHQResizeBatch, tensor_batch_2d_correct_output) {{512, 512}, 2, inShapes[4].numChannels} }; + std::vector> inRois(numSamples); + std::vector> outRois(numSamples); + + for (int s = 0; s < numSamples; s++) + { + int2 inShape{inShapes[s].extent[1], inShapes[s].extent[0]}; + int2 outShape{outShapes[s].extent[1], outShapes[s].extent[0]}; + + inRois[s] = baseline::FullRoi<2>(inShape); + outRois[s] = baseline::FullRoi<2>(outShape); + + if (inShape.x * inShape.y > 1 << 23) + { + inRois[s].shape = cuda::min(inShape, int2{1 << 12, 1 << 11}); + inRois[s].origin = inShape - inRois[s].shape; + + double2 scale = cuda::StaticCast(outShape) / cuda::StaticCast(inShape); + outRois[s].shape = cuda::StaticCast(scale * cuda::StaticCast(inRois[s].shape)); + outRois[s].origin = cuda::StaticCast(scale * cuda::StaticCast(inRois[s].origin)); + } + } + ASSERT_EQ(numSamples, inShapes.size()); ASSERT_EQ(numSamples, outShapes.size()); @@ -969,17 +1048,17 @@ TYPED_TEST(OpHQResizeBatch, tensor_batch_2d_correct_output) refBatchCpu.push_back( baseline::CpuSample{outStrides.z, outStrides, 1, outShape, outAccess->numChannels()}); - auto &inTensorCpu = inBatchCpu[sampleIdx]; - for (int y = 0; y < inShape.y; y++) - { - for (int x = 0; x < inShape.x; x++) - { - for (int c = 0; c < inShapes[sampleIdx].numChannels; c++) - { - inTensorCpu.get(0, int2{x, y}, c) = rand(rng); - } - } - } + const auto &inRoi = inRois[sampleIdx]; + auto &inTensorCpu = inBatchCpu[sampleIdx]; + baseline::ForAllInRoi(inRoi, + [&](int2 idx) + { + for (int c = 0; c < inShapes[sampleIdx].numChannels; c++) + { + inTensorCpu.get(0, idx, c) = rand(rng); + } + }); + ASSERT_EQ(cudaSuccess, cudaMemcpyAsync(inData->basePtr(), inTensorCpu.data(), inStrides.z, cudaMemcpyHostToDevice, stream)); } @@ -1008,8 +1087,9 @@ TYPED_TEST(OpHQResizeBatch, tensor_batch_2d_correct_output) for (int sampleIdx = 0; sampleIdx < numSamples; sampleIdx++) { SCOPED_TRACE(sampleIdx); - baseline::Resize(refBatchCpu[sampleIdx], inBatchCpu[sampleIdx], minInterpolation, magInterpolation, antialias); - baseline::Compare(outBatchCpu[sampleIdx], refBatchCpu[sampleIdx], antialias); + baseline::Resize(refBatchCpu[sampleIdx], inBatchCpu[sampleIdx], minInterpolation, magInterpolation, antialias, + {inRois[sampleIdx]}, {outRois[sampleIdx]}); + baseline::Compare(outBatchCpu[sampleIdx], refBatchCpu[sampleIdx], antialias, {outRois[sampleIdx]}); } ASSERT_EQ(cudaSuccess, cudaStreamDestroy(stream)); } diff --git a/tests/cvcuda/system/TestOpOSD.cpp b/tests/cvcuda/system/TestOpOSD.cpp index 620a8448b..3b5785562 100644 --- a/tests/cvcuda/system/TestOpOSD.cpp +++ b/tests/cvcuda/system/TestOpOSD.cpp @@ -419,6 +419,7 @@ NVCV_TEST_SUITE_P(OpOSD, test::ValueList>> elementVec; + + std::vector testStrings{ + // valid + "Hello!", "\u00E9", "\u20AC", "\U0001F600", + // invalid + "\xC0\x80", "\xc2\x00", // second bytes + "\xde\x82\xa0", "\xe2\x00\xa0", "\xe2\x82\x00", "\xe0\x9f\xa0", + "\xed\xbf\xa0", // three bytes + "\xf4\x90\x84\x9e", "\xf0\x9d\x04\x9e", "\xf0\x80\x84\x9e", "\xf4\xbf\x84\x9e", "\xf0\x9d\x84\x0e", + "\xf5\xc0\x84\x9e", "\xf3\xc0\x84\x9e" // four bytes + }; + + std::vector> textVec; + for (auto testStr : testStrings) + { + std::shared_ptr element; + NVCVText text = NVCVText(testStr.c_str(), 5 * randl(1, 10), DEFAULT_OSD_FONT, + NVCVPointI({randl(0, inW - 1), randl(0, inH - 1)}), + NVCVColorRGBA({(unsigned char)randl(0, 255), (unsigned char)randl(0, 255), + (unsigned char)randl(0, 255), (unsigned char)randl(0, 255)}), + NVCVColorRGBA({(unsigned char)randl(0, 255), (unsigned char)randl(0, 255), + (unsigned char)randl(0, 255), (unsigned char)randl(0, 255)})); + element = std::make_shared(type, &text); + textVec.push_back(element); + } + + elementVec.push_back(textVec); + + std::shared_ptr ctx = std::make_shared(elementVec); + + nvcv::Tensor imgIn = nvcv::util::CreateTensor(inN, inW, inH, format); + nvcv::Tensor imgOut = nvcv::util::CreateTensor(inN, inW, inH, format); + + auto input = imgIn.exportData(); + auto output = imgOut.exportData(); + + ASSERT_NE(input, nullptr); + ASSERT_NE(output, nullptr); + + auto inAccess = nvcv::TensorDataAccessStridedImagePlanar::Create(*input); + ASSERT_TRUE(inAccess); + + auto outAccess = nvcv::TensorDataAccessStridedImagePlanar::Create(*output); + ASSERT_TRUE(outAccess); + + long inSampleStride = inAccess->numRows() * inAccess->rowStride(); + long outSampleStride = outAccess->numRows() * outAccess->rowStride(); + + int inBufSize = inSampleStride * inAccess->numSamples(); + int outBufSize = outSampleStride * outAccess->numSamples(); + + EXPECT_EQ(cudaSuccess, cudaMemset(input->basePtr(), 0xFF, inSampleStride * inAccess->numSamples())); + EXPECT_EQ(cudaSuccess, cudaMemset(output->basePtr(), 0xFF, outSampleStride * outAccess->numSamples())); + + EXPECT_NO_THROW(op(stream, imgIn, imgOut, (NVCVElements)ctx.get())); + + // check cdata + std::vector test(outBufSize); + std::vector testIn(inBufSize); + + EXPECT_EQ(cudaSuccess, cudaStreamSynchronize(stream)); + EXPECT_EQ(cudaSuccess, cudaMemcpy(testIn.data(), input->basePtr(), inBufSize, cudaMemcpyDeviceToHost)); + EXPECT_EQ(cudaSuccess, cudaMemcpy(test.data(), output->basePtr(), outBufSize, cudaMemcpyDeviceToHost)); + + std::vector gold(outBufSize); + setGoldBuffer(gold, format, *inAccess, input->basePtr(), ctx, stream); + + EXPECT_EQ(gold, test); + + EXPECT_EQ(cudaSuccess, cudaStreamDestroy(stream)); +} diff --git a/tests/nvcv_types/CMakeLists.txt b/tests/nvcv_types/CMakeLists.txt index 3a0221a21..566f2d088 100644 --- a/tests/nvcv_types/CMakeLists.txt +++ b/tests/nvcv_types/CMakeLists.txt @@ -44,19 +44,27 @@ include(ExternalProject) file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/../../src/nvcv DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/standalone/) +set(NVCV_STANDALONE_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/standalone) + +set(NVCV_STANDALONE_CMAKE_ARGS + -DCMAKE_RUNTIME_OUTPUT_DIRECTORY=${CMAKE_RUNTIME_OUTPUT_DIRECTORY} + -DNVCV_DIR=${CMAKE_CURRENT_BINARY_DIR}/standalone/nvcv + -DNVCV_ENABLE_INSTALL=OFF + -DEXPOSE_CODE=OFF + -DWARNINGS_AS_ERRORS=${WARNINGS_AS_ERRORS} + -DENABLE_TEGRA=${ENABLE_TEGRA} + -DENABLE_COMPAT_OLD_GLIBC=${ENABLE_COMPAT_OLD_GLIBC} +) + ExternalProject_Add( nvcv_standalone - SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/standalone - BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}/standalone + SOURCE_DIR ${NVCV_STANDALONE_SOURCE_DIR} + PREFIX ${CMAKE_CURRENT_BINARY_DIR}/standalone/shared INSTALL_COMMAND "" CMAKE_ARGS - -DCMAKE_RUNTIME_OUTPUT_DIRECTORY=${CMAKE_RUNTIME_OUTPUT_DIRECTORY} - -DNVCV_DIR=${CMAKE_CURRENT_BINARY_DIR}/standalone/nvcv - -DNVCV_ENABLE_INSTALL=OFF - -DEXPOSE_CODE=OFF - -DWARNINGS_AS_ERRORS=${WARNINGS_AS_ERRORS} - -DENABLE_TEGRA=${ENABLE_TEGRA} - -DENABLE_COMPAT_OLD_GLIBC=${ENABLE_COMPAT_OLD_GLIBC} + ${NVCV_STANDALONE_CMAKE_ARGS} + -DNVCV_BUILD_SHARED_LIBS=ON + -DNVCV_EXE=nvcv_test_standalone ) add_executable(nvcv_test_standalone IMPORTED) @@ -66,3 +74,24 @@ set_target_properties(nvcv_test_standalone PROPERTIES add_dependencies(nvcv_test_standalone nvcv_standalone) nvcv_add_test(${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/nvcv_test_standalone nvcv) + +# Test NVCV can be build statically via the standalone test + +ExternalProject_Add( + nvcv_standalone_static + SOURCE_DIR ${NVCV_STANDALONE_SOURCE_DIR} + PREFIX ${CMAKE_CURRENT_BINARY_DIR}/standalone/static + INSTALL_COMMAND "" + CMAKE_ARGS + ${NVCV_STANDALONE_CMAKE_ARGS} + -DNVCV_BUILD_SHARED_LIBS=OFF + -DNVCV_EXE=nvcv_test_standalone_static +) + +add_executable(nvcv_test_standalone_static IMPORTED) +set_target_properties(nvcv_test_standalone_static PROPERTIES + IMPORTED_LOCATION ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/nvcv_test_standalone_static +) +add_dependencies(nvcv_test_standalone_static nvcv_standalone_static) + +nvcv_add_test(${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/nvcv_test_standalone_static nvcv) diff --git a/tests/nvcv_types/cudatools_unit/TestLegacyHelpers.cpp b/tests/nvcv_types/cudatools_unit/TestLegacyHelpers.cpp index 93b350682..87cd1af4f 100644 --- a/tests/nvcv_types/cudatools_unit/TestLegacyHelpers.cpp +++ b/tests/nvcv_types/cudatools_unit/TestLegacyHelpers.cpp @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 * * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual @@ -64,6 +64,7 @@ NVCV_TEST_SUITE_P(CheckLegacyFormatHelpersInvalid, test::ValueList(); @@ -87,6 +88,7 @@ NVCV_TEST_SUITE_P(CheckLegacyHelpersDataType, test::ValueList(); @@ -95,3 +97,136 @@ TEST_P(CheckLegacyHelpersDataType, check_conversion_to_legacy_data_type) EXPECT_EQ(expect, helpers::GetLegacyDataType(bpp, kind)); } + +// clang-format off +NVCV_TEST_SUITE_P(CheckLegacyHelpersDataTypeInvalid, test::ValueList +{ + {8, nvcv::DataKind::FLOAT}, + {64, nvcv::DataKind::SIGNED}, + {64, nvcv::DataKind::UNSIGNED}, + {32, nvcv::DataKind::COMPLEX}, + {32, nvcv::DataKind::UNSPECIFIED} +}); + +// clang-format on + +TEST_P(CheckLegacyHelpersDataTypeInvalid, check_conversion_to_legacy_data_type_invalid) +{ + int32_t bpp = GetParamValue<0>(); + nvcv::DataKind kind = GetParamValue<1>(); + EXPECT_THROW(helpers::GetLegacyDataType(bpp, kind), nvcv::Exception); +} + +TEST(CheckLegacyHelpersDataFormat, check_image_batch_invalid_different_fmt) +{ + nvcv::ImageBatchVarShape imgBatch(2); + imgBatch.pushBack(nvcv::Image{ + nvcv::Size2D{24, 24}, + nvcv::FMT_NV12 + }); + imgBatch.pushBack(nvcv::Image{ + nvcv::Size2D{24, 24}, + nvcv::FMT_U8 + }); + EXPECT_THROW(helpers::GetLegacyDataFormat(imgBatch), nvcv::Exception); +} + +// clang-format off +NVCV_TEST_SUITE_P(CheckLegacyHelpersDataFormat, test::ValueList +{ + {legOp::DataFormat::kNCHW, 2, nvcv::FMT_RGB8p}, + {legOp::DataFormat::kCHW, 1, nvcv::FMT_RGB8p}, + {legOp::DataFormat::kNHWC, 2, nvcv::FMT_RGB8}, + {legOp::DataFormat::kHWC, 1, nvcv::FMT_RGB8}, +}); + +// clang-format on + +TEST_P(CheckLegacyHelpersDataFormat, check_image_batch_conversion) +{ + legOp::DataFormat expect = GetParamValue<0>(); + int32_t batchSize = GetParamValue<1>(); + nvcv::ImageFormat fmt = GetParamValue<2>(); + + nvcv::ImageBatchVarShape imgBatch(batchSize); + for (auto i = 0; i < batchSize; ++i) + { + imgBatch.pushBack(nvcv::Image{ + nvcv::Size2D{24, 24}, + fmt + }); + } + + EXPECT_EQ(helpers::GetLegacyDataFormat(imgBatch), expect); +} + +TEST_P(CheckLegacyHelpersDataFormat, check_image_batch_conversion_exported) +{ + legOp::DataFormat expect = GetParamValue<0>(); + int32_t batchSize = GetParamValue<1>(); + nvcv::ImageFormat fmt = GetParamValue<2>(); + + cudaStream_t stream; + ASSERT_EQ(cudaSuccess, cudaStreamCreate(&stream)); + + nvcv::ImageBatchVarShape imgBatch(batchSize); + for (auto i = 0; i < batchSize; ++i) + { + imgBatch.pushBack(nvcv::Image{ + nvcv::Size2D{24, 24}, + fmt + }); + } + + auto exportedData = imgBatch.exportData(stream); + + EXPECT_EQ(helpers::GetLegacyDataFormat(exportedData.value()), expect); + + ASSERT_EQ(cudaSuccess, cudaStreamSynchronize(stream)); + ASSERT_EQ(cudaSuccess, cudaStreamDestroy(stream)); +} + +// clang-format off +NVCV_TEST_SUITE_P(CheckLegacyTranslateError, test::ValueList +{ + {NVCV_SUCCESS, legOp::ErrorCode::SUCCESS}, + {NVCV_ERROR_INVALID_ARGUMENT, legOp::ErrorCode::INVALID_PARAMETER}, + {NVCV_ERROR_INVALID_ARGUMENT, legOp::ErrorCode::INVALID_DATA_FORMAT}, + {NVCV_ERROR_INVALID_ARGUMENT, legOp::ErrorCode::INVALID_DATA_SHAPE}, + {NVCV_ERROR_INVALID_ARGUMENT, legOp::ErrorCode::INVALID_DATA_TYPE} +}); + +// clang-format on + +TEST_P(CheckLegacyTranslateError, check_error_conversion) +{ + NVCVStatus expect = GetParamValue<0>(); + legOp::ErrorCode err = GetParamValue<1>(); + EXPECT_EQ(nvcv::util::TranslateError(err), expect); +} + +// clang-format off +NVCV_TEST_SUITE_P(CheckLegacyToString, test::ValueList +{ + {legOp::ErrorCode::SUCCESS, "SUCCESS", "Operation executed successfully"}, + {legOp::ErrorCode::INVALID_PARAMETER, "INVALID_PARAMETER", "Some parameter is outside its acceptable range"}, + {legOp::ErrorCode::INVALID_DATA_FORMAT, "INVALID_DATA_FORMAT", "Data format is outside its acceptable range"}, + {legOp::ErrorCode::INVALID_DATA_SHAPE, "INVALID_DATA_SHAPE", "Tensor shape is outside its acceptable range"}, + {legOp::ErrorCode::INVALID_DATA_TYPE, "INVALID_DATA_TYPE", "Data type is outside its acceptable range"} +}); + +// clang-format on + +TEST_P(CheckLegacyToString, check_error_to_string_conversion) +{ + legOp::ErrorCode err = GetParamValue<0>(); + std::string expectedErrorName = GetParamValue<1>(); + std::string expectedDescr = GetParamValue<2>(); + + char bufferDesc[256]; + const char *bufferDescPtr = bufferDesc; + const char *buffer = nvcv::util::ToString(err, &bufferDescPtr); + + EXPECT_STREQ(bufferDescPtr, expectedDescr.c_str()); + EXPECT_STREQ(buffer, expectedErrorName.c_str()); +} diff --git a/tests/nvcv_types/standalone/CMakeLists.txt b/tests/nvcv_types/standalone/CMakeLists.txt index cee910fc0..8fb087cdd 100644 --- a/tests/nvcv_types/standalone/CMakeLists.txt +++ b/tests/nvcv_types/standalone/CMakeLists.txt @@ -29,11 +29,15 @@ endif() add_subdirectory(${NVCV_DIR} nvcv_dir) -add_executable(nvcv_test_standalone +if(NOT NVCV_EXE) + message(FATAL_ERROR "NVCV_EXE is empty! NVCV executable must be given.") +endif() + +add_executable(${NVCV_EXE} TestNVCVStandalone.cpp ) -target_link_libraries(nvcv_test_standalone +target_link_libraries(${NVCV_EXE} PUBLIC nvcv_types GTest::gtest_main