diff --git a/.github/workflows/linux-ov.yml b/.github/workflows/linux-ov.yml index 6fbd1f1..fb03816 100644 --- a/.github/workflows/linux-ov.yml +++ b/.github/workflows/linux-ov.yml @@ -3,7 +3,6 @@ name: Build (Linux-OV) on: push: paths: - - 'common/**' - 'vsov/**' - '.github/workflows/linux-ov.yml' workflow_dispatch: @@ -18,7 +17,7 @@ jobs: steps: - name: Checkout repo - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 @@ -27,17 +26,17 @@ jobs: - name: Cache protobuf id: cache-protobuf - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: vsov/protobuf/install key: ${{ runner.os }}-vsov-protobuf-v1 - name: Checkout protobuf - uses: actions/checkout@v3 + uses: actions/checkout@v4 if: steps.cache-protobuf.outputs.cache-hit != 'true' with: repository: protocolbuffers/protobuf - # follows protobuf in https://github.com/openvinotoolkit/openvino/tree/2023.2.0/thirdparty/protobuf + # follows protobuf in https://github.com/openvinotoolkit/openvino/tree/2024.0.0/thirdparty/protobuf # if you change this, remember to bump the version of the cache key. ref: fe271ab76f2ad2b2b28c10443865d2af21e27e0e fetch-depth: 1 @@ -60,19 +59,19 @@ jobs: - name: Cache onnx id: cache-onnx - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: vsov/onnx/install key: ${{ runner.os }}-vsov-onnx-v1 - name: Checkout onnx if: steps.cache-onnx.outputs.cache-hit != 'true' - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: repository: onnx/onnx - # follows onnx in https://github.com/openvinotoolkit/openvino/tree/2023.2.0/thirdparty/onnx + # follows onnx in https://github.com/openvinotoolkit/openvino/tree/2024.0.0/thirdparty/onnx # if you change this, remember to bump the version of the cache key. - ref: 1014f41f17ecc778d63e760a994579d96ba471ff + ref: b86cc54efce19530fb953e4b21f57e6b3888534c fetch-depth: 1 path: vsov/onnx @@ -104,7 +103,7 @@ jobs: - name: Setup OpenVINO run: | - curl -L -o ov.tgz https://storage.openvinotoolkit.org/repositories/openvino/packages/2023.2/linux/l_openvino_toolkit_ubuntu22_2023.2.0.13089.cfd42bd2cb0_x86_64.tgz + curl -L -o ov.tgz https://storage.openvinotoolkit.org/repositories/openvino/packages/2024.0/linux/l_openvino_toolkit_ubuntu22_2024.0.0.14509.34caeefd078_x86_64.tgz tar -xf ov.tgz mv l_openvino_* openvino -v @@ -113,7 +112,6 @@ jobs: -D CMAKE_BUILD_TYPE=Release -D CMAKE_CXX_FLAGS="-Wall -ffast-math -march=x86-64-v3" -D VAPOURSYNTH_INCLUDE_DIRECTORY="`pwd`/vapoursynth/include" - -D InferenceEngine_DIR=openvino/runtime/cmake -D OpenVINO_DIR=openvino/runtime/cmake -D ENABLE_VISUALIZATION=ON -D WIN32_SHARED_OPENVINO=ON @@ -135,7 +133,7 @@ jobs: run: git describe --tags --long - name: Upload - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: VSOV-Linux-x64 path: vsov/artifact diff --git a/.github/workflows/linux-trt.yml b/.github/workflows/linux-trt.yml index 1efce0b..c97322b 100644 --- a/.github/workflows/linux-trt.yml +++ b/.github/workflows/linux-trt.yml @@ -17,7 +17,7 @@ jobs: steps: - name: Checkout repo - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 @@ -30,20 +30,16 @@ jobs: unzip -q vs.zip mv vapoursynth*/ vapoursynth - - name: Setup CUDA + - name: Setup CUDA and TensorRT run: | wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb sudo dpkg -i cuda-keyring_1.1-1_all.deb sudo apt-get update - sudo apt-get install -y cuda-nvcc-12-1 cuda-cudart-dev-12-1 libcublas-dev-12-1 libcudnn8=8.9.0.131-1+cuda12.1 
libcudnn8-dev=8.9.0.131-1+cuda12.1 + sudo apt-get install -y cuda-nvcc-12-4 cuda-cudart-dev-12-4 libnvinfer-dev=10.0.1.6-1+cuda12.4 libnvinfer-headers-dev=10.0.1.6-1+cuda12.4 echo "PATH=/usr/local/cuda/bin${PATH:+:${PATH}}" >> $GITHUB_ENV echo "CUDA_PATH=/usr/local/cuda" >> $GITHUB_ENV echo "LD_LIBRARY_PATH=/usr/local/cuda/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}" >> $GITHUB_ENV - - name: Setup tensorrt - run: | - sudo apt-get install -y libnvinfer-dev=8.6.1.6-1+cuda12.0 libnvinfer-headers-dev=8.6.1.6-1+cuda12.0 - - name: Configure run: cmake -S . -B build -G Ninja -LA -D CMAKE_BUILD_TYPE=Release @@ -66,8 +62,9 @@ jobs: run: git describe --tags --long - name: Upload - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: VSTRT-Linux-x64 path: vstrt/artifact + overwrite: true diff --git a/.github/workflows/windows-cuda-dependency.yml b/.github/workflows/windows-cuda-dependency.yml index feba4fd..a53e389 100644 --- a/.github/workflows/windows-cuda-dependency.yml +++ b/.github/workflows/windows-cuda-dependency.yml @@ -28,63 +28,41 @@ jobs: shell: bash steps: - # Note: unfortunately, cuDNN download requires registration and according to its license - # agreements, we can only redistribute it along with an application that uses it, so we - # have to use a private repository to hold a copy of the library. - name: Download cuDNN inference library - run: | - # https://stackoverflow.com/questions/20396329/how-to-download-github-release-from-private-repo-using-command-line/35688093#35688093 - - tag=cudnn-8.6.0-bin - name=cudnn.7z - - AUTH="Authorization: token ${{ secrets.REPO_TOKEN }}" - response=$(curl -sH "$AUTH" https://api.github.com/repos/AmusementClub/cuda/releases/tags/$tag) - eval $(echo "$response" | grep -C3 "name.:.\+$NAME" | grep -w id | tr : = | tr -cd '[[:alnum:]]=') - [ "$id" ] || { echo "Error: Failed to get asset id, response: $response" | awk 'length($0)<100' >&2; exit 1; } - ASSET="https://api.github.com/repos/AmusementClub/cuda/releases/assets/$id" - curl -LJO -H "$AUTH" -H 'Accept: application/octet-stream' "$ASSET" + run: curl -LJ https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/windows-x86_64/cudnn-windows-x86_64-8.9.7.29_cuda12-archive.zip -o cudnn.zip - name: Extract cuDNN library - run: 7z x cudnn.7z + run: unzip cudnn.zip - name: Move cuDNN library - run: mv cudnn/bin vsmlrt-cuda -v + run: | + mkdir -p vsmlrt-cuda + mv cudnn-windows-*/bin/*.dll vsmlrt-cuda/ -v + rm vsmlrt-cuda/cudnn_*_train*.dll -v - # Same licensing issue with the tensorrt libraries. 
- name: Download TensorRT library run: | - # https://stackoverflow.com/questions/20396329/how-to-download-github-release-from-private-repo-using-command-line/35688093#35688093 - - tag=tensorrt-8.5.1-bin - name=TensorRT.7z - - AUTH="Authorization: token ${{ secrets.REPO_TOKEN }}" - response=$(curl -sH "$AUTH" https://api.github.com/repos/AmusementClub/cuda/releases/tags/$tag) - eval $(echo "$response" | grep -C3 "name.:.\+$NAME" | grep -w id | tr : = | tr -cd '[[:alnum:]]=') - [ "$id" ] || { echo "Error: Failed to get asset id, response: $response" | awk 'length($0)<100' >&2; exit 1; } - ASSET="https://api.github.com/repos/AmusementClub/cuda/releases/assets/$id" - curl -LJO -H "$AUTH" -H 'Accept: application/octet-stream' "$ASSET" - - mv "$name" trt.7z + curl -L -o trt.zip https://developer.download.nvidia.com/compute/machine-learning/tensorrt/10.0.1/zip/TensorRT-10.0.1.6.Windows10.win10.cuda-12.4.zip - name: Extract TensorRT library - run: 7z x trt.7z + run: | + unzip trt.zip + mv TensorRT-*/ TensorRT/ - name: Move TensorRT library - run: mv TensorRT/*.dll vsmlrt-cuda -v + run: mv TensorRT/lib/*.dll vsmlrt-cuda -v - name: Download CUDA Libraries shell: cmd run: | - curl -s -o cuda_installer.exe -L https://developer.download.nvidia.com/compute/cuda/11.8.0/network_installers/cuda_11.8.0_windows_network.exe - cuda_installer.exe -s cudart_11.8 cublas_11.8 cufft_11.8 cupti_11.8 nvrtc_11.8 + curl -s -o cuda_installer.exe -L https://developer.download.nvidia.com/compute/cuda/12.4.1/network_installers/cuda_12.4.1_windows_network.exe + cuda_installer.exe -s cudart_12.4 cublas_12.4 cufft_12.4 cupti_12.4 nvrtc_12.4 - name: Move CUDA Libraries shell: cmd run: | - move "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.8\extras\CUPTI\lib64\cupti*.dll" vsmlrt-cuda - move "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.8\bin\*.dll" vsmlrt-cuda + move "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\extras\CUPTI\lib64\cupti*.dll" vsmlrt-cuda + move "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\bin\*.dll" vsmlrt-cuda del vsmlrt-cuda\cudart32*.dll - name: Setup VC commands @@ -99,7 +77,7 @@ jobs: while true; do changed=false for dll in *.[dD][lL][lL]; do - for dep in $(dumpbin -dependents "$dll" | grep -o -i '\(vc\|msvc\)[a-z0-9_-]*\.dll'); do + for dep in $(dumpbin -dependents "$dll" | grep -o -i '\<\(vc\|msvc\)[a-z0-9_-]*\.dll'); do echo "finding $dep for $dll" if ! 
test -f ./"$dep"; then changed=true @@ -113,42 +91,17 @@ jobs: $changed || break done - - name: Cache zlib - id: cache-zlib - uses: actions/cache@v3 - with: - path: ${{ github.workspace }}/zlib-release - key: ${{ runner.os }}-zlib-1.2.12 - - - name: Setup msbuild - if: steps.cache-zlib.outputs.cache-hit != 'true' - uses: microsoft/setup-msbuild@v1 - - - name: Compile zlib - if: steps.cache-zlib.outputs.cache-hit != 'true' - shell: cmd - run: | - curl -s -o zlib.tar.gz -LJO https://github.com/madler/zlib/archive/refs/tags/v1.2.12.tar.gz - tar -xf zlib.tar.gz - cd zlib-1.2.12\contrib\vstudio\vc14 - python -c "f=open('zlibvc.vcxproj','r+b');s=f.read();s=s.replace(b'MultiThreadedDLL',b'MultiThreaded');start=s.index(b'');end_str=b'\n';end=s.index(end_str,start);s=s[:start]+s[end+len(end_str):];f.seek(0);f.write(s);f.truncate();f.close()" - msbuild zlibvc.sln -target:zlibvc /p:Configuration=ReleaseWithoutAsm /p:Platform=x64 /p:PlatformToolset=v143 -maxCpuCount - mkdir %GITHUB_WORKSPACE%\zlib-release - move x64\ZlibDllReleaseWithoutAsm\zlibwapi.dll %GITHUB_WORKSPACE%\zlib-release - - - name: Copy zlib - run: cp -v zlib-release/zlibwapi.dll vsmlrt-cuda - - name: Compress run: | 7z a -t7z -mx=3 vsmlrt-cuda.7z vsmlrt-cuda - name: Upload - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: vsmlrt-cuda path: vsmlrt-cuda.7z retention-days: 1 + compression-level: 0 - name: Rename release asset run: | diff --git a/.github/workflows/windows-ncnn.yml b/.github/workflows/windows-ncnn.yml index 74c93c9..31a425b 100644 --- a/.github/workflows/windows-ncnn.yml +++ b/.github/workflows/windows-ncnn.yml @@ -39,7 +39,7 @@ jobs: steps: - name: Checkout repo - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 @@ -48,13 +48,13 @@ jobs: - name: Cache protobuf id: cache-protobuf - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: vsncnn/protobuf/install key: ${{ runner.os }}-vsncnn-protobuf-v3.16.0 - name: Checkout protobuf - uses: actions/checkout@v3 + uses: actions/checkout@v4 if: steps.cache-protobuf.outputs.cache-hit != 'true' with: repository: protocolbuffers/protobuf @@ -81,14 +81,14 @@ jobs: - name: Cache onnx id: cache-onnx - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: vsncnn/onnx/install key: ${{ runner.os }}-vsncnn-onnx-v1.12.0-protobuf-v3.16.0 - name: Checkout onnx if: steps.cache-onnx.outputs.cache-hit != 'true' - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: repository: onnx/onnx ref: v1.12.0 @@ -161,7 +161,7 @@ jobs: copy install\bin\vsncnn.dll artifact\ - name: Upload - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: VSNCNN-GPU-Windows-x64 path: vsncnn/artifact diff --git a/.github/workflows/windows-ort.yml b/.github/workflows/windows-ort.yml index 8261e5d..0aafede 100644 --- a/.github/workflows/windows-ort.yml +++ b/.github/workflows/windows-ort.yml @@ -29,7 +29,7 @@ jobs: steps: - name: Checkout repo - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 @@ -41,17 +41,17 @@ jobs: - name: Cache protobuf id: cache-protobuf - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: vsort/protobuf/install key: ${{ runner.os }}-vsort-protobuf-v4 - name: Checkout protobuf - uses: actions/checkout@v3 + uses: actions/checkout@v4 if: steps.cache-protobuf.outputs.cache-hit != 'true' with: repository: protocolbuffers/protobuf - # follows protobuf in https://github.com/AmusementClub/onnxruntime/blob/master/cmake/external/onnxruntime_external_deps.cmake#L161 + # 
follows protobuf in https://github.com/AmusementClub/onnxruntime/blob/master/cmake/external/onnxruntime_external_deps.cmake#L203 # if you change this, remember to bump the version of the cache key. ref: v3.21.12 fetch-depth: 1 @@ -73,19 +73,19 @@ jobs: - name: Cache onnx id: cache-onnx - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: vsort/onnx/install - key: ${{ runner.os }}-vsort-onnx-v4 + key: ${{ runner.os }}-vsort-onnx-v5 - name: Checkout onnx if: steps.cache-onnx.outputs.cache-hit != 'true' - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: repository: onnx/onnx # follows onnx in https://github.com/AmusementClub/onnxruntime/tree/master/cmake/external # if you change this, remember to bump the version of the cache key. - ref: a0d77f18516d2da7468a96b0de3b737266f23176 + ref: 990217f043af7222348ca8f0301e17fa7b841781 fetch-depth: 1 path: vsort/onnx @@ -116,21 +116,21 @@ jobs: - name: Download ONNX Runtime Precompilation run: | - curl -s -o ortgpu.zip -LJO https://github.com/AmusementClub/onnxruntime/releases/download/orttraining_rc2-5943-g73584f936-230528-0922/onnxruntime-gpu-win64.zip + curl -s -o ortgpu.zip -LJO https://github.com/AmusementClub/onnxruntime/releases/download/orttraining_rc2-7983-g9001c69b84-240419-0832/onnxruntime-gpu-win64.zip unzip -q ortgpu.zip - name: Cache CUDA id: cache-cuda - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA - key: ${{ runner.os }}-cuda-11.8.0 + key: ${{ runner.os }}-cuda-12.4.0 - name: Setup CUDA if: steps.cache-cuda.outputs.cache-hit != 'true' run: | - curl -s -o cuda_installer.exe -L https://developer.download.nvidia.com/compute/cuda/11.8.0/network_installers/cuda_11.8.0_windows_network.exe - cuda_installer.exe -s nvcc_11.8 cudart_11.8 + curl -s -o cuda_installer.exe -L https://developer.download.nvidia.com/compute/cuda/12.4.0/network_installers/cuda_12.4.0_windows_network.exe + cuda_installer.exe -s nvcc_12.4 cudart_12.4 - name: Configure run: cmake -S . 
-B build -G Ninja -LA @@ -139,10 +139,10 @@ jobs: -D VAPOURSYNTH_INCLUDE_DIRECTORY=vapoursynth\include -D protobuf_DIR=protobuf\install\cmake -D ONNX_DIR=onnx\install\lib\cmake\ONNX - -D ONNX_RUNTIME_API_DIRECTORY=onnxruntime-gpu\include\onnxruntime\core\session + -D ONNX_RUNTIME_API_DIRECTORY=onnxruntime-gpu\include\onnxruntime -D ONNX_RUNTIME_LIB_DIRECTORY=onnxruntime-gpu\lib -D ENABLE_CUDA=1 - -D CUDAToolkit_ROOT="C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.8" + -D CUDAToolkit_ROOT="C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" -D ENABLE_DML=1 -D CMAKE_CXX_STANDARD=20 @@ -161,12 +161,12 @@ jobs: - name: Download DirectML Library # follows DirectML in https://github.com/AmusementClub/onnxruntime/blob/master/cmake/external/dml.cmake#L44 run: | - curl -s -o directml.nupkg -LJO https://www.nuget.org/api/v2/package/Microsoft.AI.DirectML/1.12.0 + curl -s -o directml.nupkg -LJO https://www.nuget.org/api/v2/package/Microsoft.AI.DirectML/1.14.1 unzip -q directml.nupkg -d dml copy dml\bin\x64-win\DirectML.dll artifact\vsort\ - name: Upload - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: VSORT-Windows-x64 path: vsort/artifact @@ -199,7 +199,7 @@ jobs: - name: Create script shell: bash - run: echo "import vapoursynth as vs;from vapoursynth import core;import sys;print(core.ort, file=sys.stderr);core.std.BlankClip(format=vs.RGBS).ort.Model(r\"waifu2x\\upconv_7_anime_style_art_rgb\\scale2.0x_model.onnx\", builtin=True).resize.Bicubic(format=vs.YUV420P10, matrix_s='709').set_output()" > test.vpy + run: echo "import vapoursynth as vs;from vapoursynth import core;import sys;print(core.ort, file=sys.stderr);print(core.ort.Version(),file=sys.stderr);core.std.BlankClip(format=vs.RGBS).ort.Model(r\"waifu2x\\upconv_7_anime_style_art_rgb\\scale2.0x_model.onnx\", builtin=True).resize.Bicubic(format=vs.YUV420P10, matrix_s='709').set_output()" > test.vpy - name: Run vspipe shell: bash @@ -221,8 +221,40 @@ jobs: shell: bash run: | set -ex - vs_portable/vspipe -i test.vpy - - vs_portable/vspipe --y4m -p -e 9 test.vpy - | vs_portable/x265 --log-file x265.log --log-file-level info --y4m -D 10 --preset ultrafast -o out.hevc - + vs_portable/vspipe -i test_fp16.vpy - + vs_portable/vspipe --y4m -p -e 9 test_fp16.vpy - | vs_portable/x265 --log-file x265.log --log-file-level info --y4m -D 10 --preset ultrafast -o out.hevc - + ls -l out.hevc x265.log + cat x265.log + grep -F 'encoded 10 frames' x265.log || exit 2 + grep -i 'error' x265.log && exit 1 + exit 0 + + - name: Create script (fp16 input) + shell: bash + run: echo "import vapoursynth as vs;from vapoursynth import core;import sys;print(core.ort, file=sys.stderr);flt=core.std.BlankClip(format=vs.RGBH).ort.Model(r\"waifu2x\\upconv_7_anime_style_art_rgb\\scale2.0x_model.onnx\", builtin=True, fp16=True);print(flt,file=sys.stderr);flt.resize.Bicubic(format=vs.YUV420P10, matrix_s='709').set_output()" > test_fp16_input.vpy + + - name: Run vspipe (fp16 input) + shell: bash + run: | + set -ex + vs_portable/vspipe -i test_fp16_input.vpy - + vs_portable/vspipe --y4m -p -e 9 test_fp16_input.vpy - | vs_portable/x265 --log-file x265.log --log-file-level info --y4m -D 10 --preset ultrafast -o out.hevc - + ls -l out.hevc x265.log + cat x265.log + grep -F 'encoded 10 frames' x265.log || exit 2 + grep -i 'error' x265.log && exit 1 + exit 0 + + - name: Create script (fp16 output) + shell: bash + run: echo "import vapoursynth as vs;from vapoursynth import core;import sys;print(core.ort, 
file=sys.stderr);flt=core.std.BlankClip(format=vs.RGBS).ort.Model(r\"waifu2x\\upconv_7_anime_style_art_rgb\\scale2.0x_model.onnx\", builtin=True, fp16=True, output_format=1);print(flt,file=sys.stderr);flt.resize.Bicubic(format=vs.YUV420P10, matrix_s='709').set_output()" > test_fp16_output.vpy + + - name: Run vspipe (fp16 output) + shell: bash + run: | + set -ex + vs_portable/vspipe -i test_fp16_output.vpy - + vs_portable/vspipe --y4m -p -e 9 test_fp16_output.vpy - | vs_portable/x265 --log-file x265.log --log-file-level info --y4m -D 10 --preset ultrafast -o out.hevc - ls -l out.hevc x265.log cat x265.log grep -F 'encoded 10 frames' x265.log || exit 2 diff --git a/.github/workflows/windows-ov.yml b/.github/workflows/windows-ov.yml index 3dfae3d..a47fc91 100644 --- a/.github/workflows/windows-ov.yml +++ b/.github/workflows/windows-ov.yml @@ -39,7 +39,7 @@ jobs: steps: - name: Checkout repo - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 @@ -51,19 +51,19 @@ jobs: - name: Cache protobuf id: cache-protobuf - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: vsov/protobuf/install - key: ${{ runner.os }}-vsov-protobuf-v1 + key: ${{ runner.os }}-vsov-protobuf-v2 - name: Checkout protobuf - uses: actions/checkout@v3 + uses: actions/checkout@v4 if: steps.cache-protobuf.outputs.cache-hit != 'true' with: repository: protocolbuffers/protobuf - # follows protobuf in https://github.com/AmusementClub/openvino/tree/master/thirdparty/protobuf + # follows protobuf in https://github.com/openvinotoolkit/openvino/tree/2024.0.0/thirdparty/protobuf # if you change this, remember to bump the version of the cache key. - ref: 6c6b0778b70f35f93c2f0dee30e5d12ad2a83eea + ref: fe271ab76f2ad2b2b28c10443865d2af21e27e0e fetch-depth: 1 path: vsov/protobuf @@ -83,19 +83,19 @@ jobs: - name: Cache onnx id: cache-onnx - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: vsov/onnx/install - key: ${{ runner.os }}-vsov-onnx-v2 + key: ${{ runner.os }}-vsov-onnx-v3 - name: Checkout onnx if: steps.cache-onnx.outputs.cache-hit != 'true' - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: repository: onnx/onnx - # follows onnx in https://github.com/AmusementClub/openvino/tree/master/thirdparty/onnx + # follows onnx in https://github.com/openvinotoolkit/openvino/tree/2024.0.0/thirdparty/onnx # if you change this, remember to bump the version of the cache key. - ref: f7ee1ac60d06abe8e26c9b6bbe1e3db5286b614b + ref: b86cc54efce19530fb953e4b21f57e6b3888534c fetch-depth: 1 path: vsov/onnx @@ -124,23 +124,16 @@ jobs: unzip -q vs.zip mv vapoursynth-*/ vapoursynth/ - - name: Download OpenVINO Runtime Precompilation - shell: bash + - name: Setup OpenVINO run: | - rev="${{github.event.inputs.ov_tag || inputs.ov_tag || 'latest'}}" - if [ "$rev" == "latest" ]; then - url="https://github.com/AmusementClub/openvino/releases/latest/download/openvino-gpu-win64.zip" - else - url="https://github.com/AmusementClub/openvino/releases/download/$rev/openvino-gpu-win64.zip" - fi - curl -s -o openvino.zip -LJO "$url" - unzip -q openvino.zip + curl -L -o ov.zip https://storage.openvinotoolkit.org/repositories/openvino/packages/2024.0/windows/w_openvino_toolkit_windows_2024.0.0.14509.34caeefd078_x86_64.zip + unzip ov.zip + rename w_openvino_toolkit_windows_2024.0.0.14509.34caeefd078_x86_64 openvino - name: Configure run: cmake -S . 
-B build -G Ninja -D CMAKE_BUILD_TYPE=Release -D CMAKE_INTERPROCEDURAL_OPTIMIZATION=ON -D CMAKE_MSVC_RUNTIME_LIBRARY=MultiThreaded - -D InferenceEngine_DIR=openvino/runtime/cmake -D OpenVINO_DIR=openvino/runtime/cmake -D VAPOURSYNTH_INCLUDE_DIRECTORY="%cd%\vapoursynth\include" -D ENABLE_VISUALIZATION=ON @@ -156,12 +149,12 @@ jobs: cmake --install build --prefix install mkdir artifact mkdir artifact\vsov - copy openvino\runtime\3rdparty\tbb\bin\tbb.dll artifact\vsov\ + copy openvino\runtime\3rdparty\tbb\bin\tbb12.dll artifact\vsov\ copy install\bin\vsov.dll artifact\ xcopy openvino\runtime\bin\intel64\Release\* artifact\vsov\ /s - name: Upload - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: VSOV-Windows-x64 path: vsov/artifact diff --git a/.github/workflows/windows-release.yml b/.github/workflows/windows-release.yml index 64318d3..07687b3 100644 --- a/.github/workflows/windows-release.yml +++ b/.github/workflows/windows-release.yml @@ -66,7 +66,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout repo - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Compress scirpts.7z run: | @@ -74,7 +74,7 @@ jobs: 7za a -t7z -bb3 -mx=9 ../scripts.${{ github.event.inputs.tag }}.7z . - name: Upload scripts release - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: Scripts path: scripts @@ -116,11 +116,12 @@ jobs: 7za a -t7z -bb3 -mx=9 ../models.7z . - name: Upload model release - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: Models path: release retention-days: 1 + compression-level: 0 - name: Download External Models run: | @@ -147,11 +148,44 @@ jobs: 7za a -t7z -bb3 -mx=9 ../ext-models.7z . - name: Upload external model release - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: External-Models path: release retention-days: 1 + compression-level: 0 + + - name: Download Contributed Models + run: | + rm -rf release + set -ex + mkdir -p release/models + cd release + pushd models + for tag in $(echo "${{ github.event.inputs.contrib-model-tags }}" | tr ',' ' '); do + echo "Handling tag $tag" + curl -s https://api.github.com/repos/AmusementClub/vs-mlrt/releases/tags/"$tag" > release.json + for url in $(cat release.json | jq '.assets | .[] | .url ' | tr -d '"'); do + echo "Downloading $url" + curl -o dl.7z -LJ -H 'Accept: application/octet-stream' "$url" + # later release should overwrite earlier ones + 7za x -y dl.7z + done + #test -f "dl.7z" # contrib-models might be empty. + rm -f dl.7z release.json + done + popd + ls -lR + du -sh + 7za a -t7z -bb3 -mx=9 ../contrib-models.7z . 
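Editorial aside (not part of the patch): the contributed-models step above drives the GitHub releases API from bash with curl and jq. A rough Python sketch of that download loop, shown only to clarify what the step does — the repository name and the `Accept: application/octet-stream` header come from the workflow, the function itself is hypothetical:

```python
import json
import subprocess
import urllib.request

def download_release_assets(tag: str, repo: str = "AmusementClub/vs-mlrt") -> None:
    # same endpoint the workflow queries with curl
    url = f"https://api.github.com/repos/{repo}/releases/tags/{tag}"
    with urllib.request.urlopen(url) as resp:
        release = json.load(resp)
    for asset in release.get("assets", []):
        # asset["url"] has to be fetched with the octet-stream Accept header
        req = urllib.request.Request(asset["url"], headers={"Accept": "application/octet-stream"})
        with urllib.request.urlopen(req) as resp, open("dl.7z", "wb") as f:
            f.write(resp.read())
        # later releases overwrite earlier ones, matching the workflow's `7za x -y`
        subprocess.run(["7za", "x", "-y", "dl.7z"], check=True)
```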
+ + - name: Upload contrib model release + uses: actions/upload-artifact@v4 + with: + name: Contrib-Models + path: release + retention-days: 1 + compression-level: 0 - name: Download Contributed Models run: | @@ -212,43 +246,43 @@ jobs: steps: - name: Download artifact for scripts - uses: actions/download-artifact@v3 + uses: actions/download-artifact@v4 with: name: Scripts path: scripts-release - name: Download artifact for models - uses: actions/download-artifact@v3 + uses: actions/download-artifact@v4 with: name: Models path: models-release - name: Download artifact for vsov - uses: actions/download-artifact@v3 + uses: actions/download-artifact@v4 with: name: VSOV-Windows-x64 path: vsov-release - name: Download artifact for vsort - uses: actions/download-artifact@v3 + uses: actions/download-artifact@v4 with: name: VSORT-Windows-x64 path: vsort-release - name: Download artifact for vstrt - uses: actions/download-artifact@v3 + uses: actions/download-artifact@v4 with: name: VSTRT-Windows-x64 path: vstrt-release - name: Download artifact for vsncnn - uses: actions/download-artifact@v3 + uses: actions/download-artifact@v4 with: name: VSNCNN-GPU-Windows-x64 path: vsncnn-release - name: Download artifact for cuda dependencies - uses: actions/download-artifact@v3 + uses: actions/download-artifact@v4 with: name: vsmlrt-cuda path: cuda-release @@ -267,12 +301,13 @@ jobs: 7za a -t7z -bb3 -mx=9 ../vsmlrt-windows-x64-cpu.7z . - name: Upload CPU-only release - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 if: false with: name: vsmlrt-cpu-release path: vsmlrt-windows-x64-cpu.7z retention-days: 1 + compression-level: 0 - name: Rename release asset run: mv vsmlrt-windows-x64-cpu.7z vsmlrt-windows-x64-cpu.${{ github.event.inputs.tag }}.7z @@ -300,18 +335,19 @@ jobs: ls -lR 7za a -t7z -bb3 -mx=9 ../vsmlrt-windows-x64-generic-gpu.7z . - - name: Upload non-CUDA GPU release - uses: actions/upload-artifact@v3 + - name: Upload generic GPU release + uses: actions/upload-artifact@v4 if: false with: name: vsmlrt-generic-gpu-release path: vsmlrt-windows-x64-generic-gpu.7z retention-days: 1 + compression-level: 0 - - name: Rename release asset for non-CUDA GPU release + - name: Rename release asset for generic GPU release run: mv vsmlrt-windows-x64-generic-gpu.7z vsmlrt-windows-x64-generic-gpu.${{ github.event.inputs.tag }}.7z - - name: Release non-CUDA GPU + - name: Release generic GPU uses: softprops/action-gh-release@v1 with: tag_name: ${{ github.event.inputs.tag }} @@ -342,12 +378,13 @@ jobs: 7za a -t7z -bb3 -mx=9 ../vsmlrt-windows-x64-cuda.7z . - name: Upload CUDA release - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 if: false with: name: vsmlrt-cuda-release path: vsmlrt-windows-x64-cuda.7z retention-days: 1 + compression-level: 0 - name: Rename release asset run: mv vsmlrt-windows-x64-cuda.7z vsmlrt-windows-x64-cuda.${{ github.event.inputs.tag }}.7z @@ -364,7 +401,7 @@ jobs: # Update nightly tag. 
- name: Checkout repo if: github.event.inputs.tag == 'nightly' - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 - name: Overwrite tag diff --git a/.github/workflows/windows-trt.yml b/.github/workflows/windows-trt.yml index 735e6eb..5105ab1 100644 --- a/.github/workflows/windows-trt.yml +++ b/.github/workflows/windows-trt.yml @@ -31,7 +31,7 @@ jobs: steps: - name: Checkout repo - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 @@ -43,24 +43,23 @@ jobs: - name: Cache CUDA id: cache-cuda - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA - key: ${{ runner.os }}-vstrt-cuda-11.8.0 + key: ${{ runner.os }}-vstrt-cuda-12.4.1 + save-always: true - name: Setup CUDA if: steps.cache-cuda.outputs.cache-hit != 'true' run: | - curl -s -o cuda_installer.exe -L https://developer.download.nvidia.com/compute/cuda/11.8.0/network_installers/cuda_11.8.0_windows_network.exe - cuda_installer.exe -s nvcc_11.8 cudart_11.8 nvprof_11.8 cuda_profiler_api_11.8 + curl -s -o cuda_installer.exe -L https://developer.download.nvidia.com/compute/cuda/12.4.1/network_installers/cuda_12.4.1_windows_network.exe + cuda_installer.exe -s nvcc_12.4 cudart_12.4 cuda_profiler_api_12.4 - - name: Checkout tensorrt - uses: actions/checkout@v3 - with: - repository: AmusementClub/cuda - token: ${{ secrets.REPO_TOKEN }} - ref: tensorrt-8.5.1 - path: tensorrt + - name: Download TensorRT + run: | + curl -L -o trt.zip https://developer.download.nvidia.com/compute/machine-learning/tensorrt/10.0.1/zip/TensorRT-10.0.1.6.Windows10.win10.cuda-12.4.zip + unzip trt.zip + mv TensorRT-*/ tensorrt/ - name: Download VapourSynth headers run: | @@ -72,10 +71,11 @@ jobs: run: cmake -S . -B build -G Ninja -LA -D CMAKE_BUILD_TYPE=Release -D CMAKE_MSVC_RUNTIME_LIBRARY=MultiThreaded - -D CUDAToolkit_ROOT="C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.8" + -D CUDAToolkit_ROOT="C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" -D VAPOURSYNTH_INCLUDE_DIRECTORY="%cd%\vapoursynth\include" - -D TENSORRT_HOME="%cd%\..\tensorrt\tensorrt" + -D TENSORRT_HOME="%cd%\tensorrt" -D USE_NVINFER_PLUGIN=ON + -D TENSORRT_LIBRARY_SUFFIX="_10" - name: Build run: cmake --build build --config Release --verbose @@ -84,15 +84,17 @@ jobs: run: cmake --install build --prefix install - name: Checkout TensorRT OSS - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: repository: NVIDIA/TensorRT - ref: 8.5.1 + ref: release/10.0 fetch-depth: 1 path: tensorrt-oss - name: Override trtexec CMake file run: | + cp -f -r -v tensorrt/samples ../tensorrt-oss + mv trtexec/CMakeLists.txt ../tensorrt-oss/samples/trtexec mv trtexec/*.cpp ../tensorrt-oss/samples/trtexec mv trtexec/*.manifest ../tensorrt-oss/samples/trtexec @@ -101,8 +103,9 @@ jobs: run: cmake -S ../tensorrt-oss/samples/trtexec -B build_trtexec -G Ninja -D CMAKE_BUILD_TYPE=Release -D CMAKE_MSVC_RUNTIME_LIBRARY=MultiThreaded - -D CUDAToolkit_ROOT="C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.8" - -D TENSORRT_HOME="%cd%\..\tensorrt\tensorrt" + -D CUDAToolkit_ROOT="C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" + -D TENSORRT_HOME="%cd%\tensorrt" + -D TENSORRT_LIBRARY_SUFFIX="_10" - name: Build trtexec run: cmake --build build_trtexec --verbose @@ -121,7 +124,7 @@ jobs: run: git describe --tags --long - name: Upload - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: VSTRT-Windows-x64 path: vstrt/artifact diff --git a/common/convert_float_to_float16.cpp 
b/common/convert_float_to_float16.cpp index 0cd0294..8a05651 100644 --- a/common/convert_float_to_float16.cpp +++ b/common/convert_float_to_float16.cpp @@ -6,7 +6,6 @@ #include #include #include -#include #include #include #include @@ -19,11 +18,7 @@ #include - -void convert_float_to_float16( - ONNX_NAMESPACE::ModelProto & model, - bool force_fp16_initializers -) noexcept; +#include "convert_float_to_float16.h" namespace { @@ -298,12 +293,14 @@ static ONNX_NAMESPACE::ValueInfoProto make_value_info_from_tensor( void convert_float_to_float16( ONNX_NAMESPACE::ModelProto & model, - bool force_fp16_initializers - // , bool keep_io_types = True - // , bool disable_shape_infer = True - // , const std::optional> op_block_list = DEFAULT_OP_BLOCK_LIST - // , const std::optional> op_block_list = {} - , const std::unordered_set & op_block_list + bool force_fp16_initializers, + // bool keep_io_types = True, + // bool disable_shape_infer = True, + // const std::optional> op_block_list = DEFAULT_OP_BLOCK_LIST, + // const std::optional> op_block_list = {}, + const std::unordered_set & op_block_list, + bool cast_input, + bool cast_output ) noexcept { std::vector value_info_list {}; @@ -312,97 +309,101 @@ void convert_float_to_float16( std::unordered_map name_mapping {}; std::unordered_set graph_io_to_skip {}; - const std::vector fp32_inputs = [&]() { - std::vector ret {}; + if (cast_input) { + const std::vector fp32_inputs = [&]() { + std::vector ret {}; - for (const auto & n : model.graph().input()) { - if (n.type().tensor_type().elem_type() == ONNX_NAMESPACE::TensorProto::FLOAT) { - ret.emplace_back(n.name()); + for (const auto & n : model.graph().input()) { + if (n.type().tensor_type().elem_type() == ONNX_NAMESPACE::TensorProto::FLOAT) { + ret.emplace_back(n.name()); + } } - } - return ret; - }(); - - for (const auto & n : model.graph().input()) { - if (auto idx = std::find(std::cbegin(fp32_inputs), std::cend(fp32_inputs), n.name()); - idx != std::cend(fp32_inputs) - ) { - const auto i = idx - std::cbegin(fp32_inputs); - std::string node_name = "graph_input_cast_" + std::to_string(i); - name_mapping.emplace(n.name(), node_name); - graph_io_to_skip.emplace(n.name()); - - auto * new_value_info = model.mutable_graph()->mutable_value_info()->Add(); - new_value_info->CopyFrom(n); - new_value_info->set_name(node_name); - new_value_info->mutable_type()->mutable_tensor_type()->set_elem_type( - ONNX_NAMESPACE::TensorProto::FLOAT16 - ); - // add Cast node (from tensor(float) to tensor(float16) after graph input - for (auto & node : *model.mutable_graph()->mutable_node()) { - for (auto & input : *node.mutable_input()) { - if (input == n.name()) { - input = node_name; + return ret; + }(); + + for (const auto & n : model.graph().input()) { + if (auto idx = std::find(std::cbegin(fp32_inputs), std::cend(fp32_inputs), n.name()); + idx != std::cend(fp32_inputs) + ) { + const auto i = idx - std::cbegin(fp32_inputs); + std::string node_name = "graph_input_cast_" + std::to_string(i); + name_mapping.emplace(n.name(), node_name); + graph_io_to_skip.emplace(n.name()); + + auto * new_value_info = model.mutable_graph()->mutable_value_info()->Add(); + new_value_info->CopyFrom(n); + new_value_info->set_name(node_name); + new_value_info->mutable_type()->mutable_tensor_type()->set_elem_type( + ONNX_NAMESPACE::TensorProto::FLOAT16 + ); + // add Cast node (from tensor(float) to tensor(float16) after graph input + for (auto & node : *model.mutable_graph()->mutable_node()) { + for (auto & input : *node.mutable_input()) { + if (input == 
n.name()) { + input = node_name; + } } } + auto new_node = make_node( + "Cast", {n.name()}, {node_name}, node_name, + "to", ONNX_NAMESPACE::TensorProto::FLOAT16 + ); + model.mutable_graph()->mutable_node()->Add(); + for (int i = model.graph().node_size() - 2; i >= 0; --i) { + model.mutable_graph()->mutable_node()->SwapElements(i, i + 1); + } + *model.mutable_graph()->mutable_node(0) = std::move(new_node); + value_info_list.emplace_back(*new_value_info); + io_casts.emplace(std::move(node_name)); } - auto new_node = make_node( - "Cast", {n.name()}, {node_name}, node_name, - "to", ONNX_NAMESPACE::TensorProto::FLOAT16 - ); - model.mutable_graph()->mutable_node()->Add(); - for (int i = model.graph().node_size() - 2; i >= 0; --i) { - model.mutable_graph()->mutable_node()->SwapElements(i, i + 1); - } - *model.mutable_graph()->mutable_node(0) = std::move(new_node); - value_info_list.emplace_back(*new_value_info); - io_casts.emplace(std::move(node_name)); } } - const std::vector fp32_outputs = [&]() { - std::vector ret {}; + if (cast_output) { + const std::vector fp32_outputs = [&]() { + std::vector ret {}; - for (const auto & n : model.graph().output()) { - if (n.type().tensor_type().elem_type() == ONNX_NAMESPACE::TensorProto::FLOAT) { - ret.emplace_back(n.name()); + for (const auto & n : model.graph().output()) { + if (n.type().tensor_type().elem_type() == ONNX_NAMESPACE::TensorProto::FLOAT) { + ret.emplace_back(n.name()); + } } - } - return ret; - }(); - - for (const auto & n : model.graph().output()) { - if (auto idx = std::find(std::cbegin(fp32_outputs), std::cend(fp32_outputs), n.name()); - idx != std::cend(fp32_outputs) - ) { - const auto i = idx - std::cbegin(fp32_outputs); - std::string node_name = "graph_output_cast_" + std::to_string(i); - name_mapping.emplace(n.name(), node_name); - graph_io_to_skip.emplace(n.name()); - - auto * new_value_info = model.mutable_graph()->mutable_value_info()->Add(); - new_value_info->CopyFrom(n); - new_value_info->set_name(node_name); - new_value_info->mutable_type()->mutable_tensor_type()->set_elem_type( - ONNX_NAMESPACE::TensorProto::FLOAT16 - ); - // add Cast node (from tensor(float16) to tensor(float) before graph output - for (auto & node : *model.mutable_graph()->mutable_node()) { - for (auto & output : *node.mutable_output()) { - if (output == n.name()) { - output = node_name; + return ret; + }(); + + for (const auto & n : model.graph().output()) { + if (auto idx = std::find(std::cbegin(fp32_outputs), std::cend(fp32_outputs), n.name()); + idx != std::cend(fp32_outputs) + ) { + const auto i = idx - std::cbegin(fp32_outputs); + std::string node_name = "graph_output_cast_" + std::to_string(i); + name_mapping.emplace(n.name(), node_name); + graph_io_to_skip.emplace(n.name()); + + auto * new_value_info = model.mutable_graph()->mutable_value_info()->Add(); + new_value_info->CopyFrom(n); + new_value_info->set_name(node_name); + new_value_info->mutable_type()->mutable_tensor_type()->set_elem_type( + ONNX_NAMESPACE::TensorProto::FLOAT16 + ); + // add Cast node (from tensor(float16) to tensor(float) before graph output + for (auto & node : *model.mutable_graph()->mutable_node()) { + for (auto & output : *node.mutable_output()) { + if (output == n.name()) { + output = node_name; + } } } + auto new_node = make_node( + "Cast", {node_name}, {n.name()}, node_name, + "to", ONNX_NAMESPACE::TensorProto::FLOAT + ); + model.mutable_graph()->mutable_node()->Add(std::move(new_node)); + value_info_list.emplace_back(*new_value_info); + 
io_casts.emplace(std::move(node_name)); } - auto new_node = make_node( - "Cast", {node_name}, {n.name()}, node_name, - "to", ONNX_NAMESPACE::TensorProto::FLOAT - ); - model.mutable_graph()->mutable_node()->Add(std::move(new_node)); - value_info_list.emplace_back(*new_value_info); - io_casts.emplace(std::move(node_name)); } } diff --git a/common/convert_float_to_float16.h b/common/convert_float_to_float16.h new file mode 100644 index 0000000..1c41b4a --- /dev/null +++ b/common/convert_float_to_float16.h @@ -0,0 +1,21 @@ +#ifndef CONVERT_FLOAT_TO_FLOAT16_H +#define CONVERT_FLOAT_TO_FLOAT16_H + +#include +#include + +#include + +void convert_float_to_float16( + ONNX_NAMESPACE::ModelProto & model, + bool force_fp16_initializers, + // bool keep_io_types = True, + // bool disable_shape_infer = True, + // const std::optional> op_block_list = DEFAULT_OP_BLOCK_LIST, + // const std::optional> op_block_list = {}, + const std::unordered_set & op_block_list, + bool cast_input = true, + bool cast_output = true +) noexcept; + +#endif diff --git a/common/onnx_utils.cpp b/common/onnx_utils.cpp index 9c22898..5de7b67 100644 --- a/common/onnx_utils.cpp +++ b/common/onnx_utils.cpp @@ -8,6 +8,8 @@ #include #include +#include "onnx_utils.h" + using namespace std::string_literals; diff --git a/common/onnx_utils.h b/common/onnx_utils.h new file mode 100644 index 0000000..7041ab7 --- /dev/null +++ b/common/onnx_utils.h @@ -0,0 +1,18 @@ +#ifndef ONNX_UTILS_H +#define ONNX_UTILS_H + +#include +#include +#include +#include + +#include + +std::variant loadONNX( + const std::string_view & path, + int64_t tile_w, + int64_t tile_h, + bool path_is_serialization +) noexcept; + +#endif diff --git a/scripts/vsmlrt.py b/scripts/vsmlrt.py index ecccb61..011c2b6 100644 --- a/scripts/vsmlrt.py +++ b/scripts/vsmlrt.py @@ -1,4 +1,4 @@ -__version__ = "3.15.55" +__version__ = "3.20.11" __all__ = [ "Backend", "BackendV2", @@ -9,6 +9,8 @@ "CUGAN", "RIFE", "RIFEModel", "RIFEMerge", "SAFA", "SAFAModel", "SAFAAdaptiveMode", + "SCUNet", "SCUNetModel", + "SwinIR", "SwinIRModel", "inference" ] @@ -18,6 +20,8 @@ from fractions import Fraction import math import os +import os.path +import platform import subprocess import sys import tempfile @@ -75,6 +79,18 @@ class ORT_CUDA: basic performance tuning: set fp16 = True (on RTX GPUs) + + Semantics of `fp16`: + Enabling `fp16` will use a built-in quantization that converts a fp32 onnx to a fp16 onnx. + If the input video is of half-precision floating-point format, + the generated fp16 onnx will use fp16 input. + The output format can be controlled by the `output_format` option (0 = fp32, 1 = fp16). + + Disabling `fp16` will not use the built-in quantization. + However, if the onnx file itself uses fp16 for computation, + the actual computation will be done in fp16. + In this case, the input video format should match the input format of the onnx, + and the output format is inferred from the onnx. 
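Editorial aside: a minimal sketch of the fp16 / output_format semantics described in this docstring, mirroring the CI test scripts added to windows-ort.yml later in this diff (the model path and `builtin=True` are taken from those scripts):

```python
import vapoursynth as vs
from vapoursynth import core

model = r"waifu2x\upconv_7_anime_style_art_rgb\scale2.0x_model.onnx"

# fp32 (RGBS) input, built-in fp32 -> fp16 conversion, fp32 output (output_format defaults to 0)
flt = core.std.BlankClip(format=vs.RGBS).ort.Model(model, builtin=True, fp16=True)

# fp16 (RGBH) input: the converted onnx consumes half precision directly
flt_fp16_in = core.std.BlankClip(format=vs.RGBH).ort.Model(model, builtin=True, fp16=True)

# fp16 output requested explicitly
flt_fp16_out = core.std.BlankClip(format=vs.RGBS).ort.Model(model, builtin=True, fp16=True, output_format=1)
```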
""" device_id: int = 0 @@ -84,6 +100,9 @@ class ORT_CUDA: fp16: bool = False use_cuda_graph: bool = False # preview, not supported by all models fp16_blacklist_ops: typing.Optional[typing.Sequence[str]] = None + prefer_nhwc: bool = False + output_format: int = 0 # 0: fp32, 1: fp16 + tf32: bool = False # internal backend attributes supports_onnx_serialization: bool = True @@ -122,17 +141,17 @@ class TRT: opt_shapes: typing.Optional[typing.Tuple[int, int]] = None fp16: bool = False device_id: int = 0 - workspace: typing.Optional[int] = 128 + workspace: typing.Optional[int] = None verbose: bool = False use_cuda_graph: bool = False num_streams: int = 1 use_cublas: bool = False # cuBLAS + cuBLASLt static_shape: bool = True - tf32: bool = True + tf32: bool = False log: bool = True # as of TensorRT 8.4, it can be turned off without performance penalty in most cases - use_cudnn: bool = True + use_cudnn: bool = False # changed to False since vsmlrt.vpy 3.16 use_edge_mask_convolutions: bool = True use_jit_convolutions: bool = True heuristic: bool = False # only supported on Ampere+ with TensorRT 8.5+ @@ -141,8 +160,12 @@ class TRT: faster_dynamic_shapes: bool = True force_fp16: bool = False builder_optimization_level: int = 3 + max_aux_streams: typing.Optional[int] = None + short_path: typing.Optional[bool] = None # True on Windows by default, False otherwise + bf16: bool = False custom_env: typing.Dict[str, str] = field(default_factory=lambda: {}) custom_args: typing.List[str] = field(default_factory=lambda: []) + engine_folder: typing.Optional[str] = None # internal backend attributes supports_onnx_serialization: bool = False @@ -207,12 +230,21 @@ class MIGX: fast_math: bool = True exhaustive_tune: bool = False + short_path: typing.Optional[bool] = None # True on Windows by default, False otherwise custom_env: typing.Dict[str, str] = field(default_factory=lambda: {}) custom_args: typing.List[str] = field(default_factory=lambda: []) # internal backend attributes supports_onnx_serialization: bool = False + @dataclass(frozen=False) + class OV_NPU: + """ backend for intel npus + """ + + # internal backend attributes + supports_onnx_serialization: bool = True + backendT = typing.Union[ Backend.OV_CPU, @@ -222,7 +254,8 @@ class MIGX: Backend.OV_GPU, Backend.NCNN_VK, Backend.ORT_DML, - Backend.MIGX + Backend.MIGX, + Backend.OV_NPU, ] @@ -271,7 +304,7 @@ def Waifu2x( raise ValueError(f'{func_name}: "scale" must be 1, 2 or 4') if not isinstance(model, int) or model not in Waifu2xModel.__members__.values(): - raise ValueError(f'{func_name}: "model" must be in [0, 9]') + raise ValueError(f'{func_name}: invalid "model"') if model == 0 and noise == 0: raise ValueError( @@ -433,7 +466,7 @@ def DPIR( raise ValueError(f"{func_name}: only constant format 16/32 bit float input supported") if not isinstance(model, int) or model not in DPIRModel.__members__.values(): - raise ValueError(f'{func_name}: "model" must be 0, 1, 2 or 3') + raise ValueError(f'{func_name}: invalid "model"') if model in [0, 2] and clip.format.color_family != vs.GRAY: raise ValueError(f'{func_name}: "clip" must be of GRAY color family') @@ -976,8 +1009,11 @@ def RIFEMerge( "/Reciprocal:fp32,/Reciprocal_1:fp32," "/Mul:fp32,/Mul_1:fp32,/Mul_8:fp32,/Mul_10:fp32," "/Sub_5:fp32,/Sub_6:fp32," + # generated by TensorRT's onnx parser "ONNXTRT_Broadcast_236:fp32,ONNXTRT_Broadcast_238:fp32," - "ONNXTRT_Broadcast_273:fp32,ONNXTRT_Broadcast_275:fp32" + "ONNXTRT_Broadcast_273:fp32,ONNXTRT_Broadcast_275:fp32," + # TensorRT 9.0 or later + 
"ONNXTRT_Broadcast_*:fp32" ) ]) @@ -1147,6 +1183,14 @@ def handler(n: int, f: vs.VideoFrame) -> vs.VideoNode: else: return res else: + if not hasattr(core, 'akarin') or \ + not hasattr(core.akarin, 'PropExpr') or \ + not hasattr(core.akarin, 'PickFrames'): + raise RuntimeError( + 'fractional multi requires plugin akarin ' + '(https://github.com/AkarinVS/vapoursynth-plugin/releases)' + ', version v0.96g or later.') + if clip.fps_num == 0 or clip.fps_den == 0: src_fps = Fraction(1) else: @@ -1169,6 +1213,7 @@ def left_func(n: int) -> vs.VideoNode: left_clip = core.std.FrameEval(temp, left_func) def right_func(n: int) -> vs.VideoNode: + # no out of range access because of function filter_sc return clip[dst_duration * n // src_duration + 1] right_clip = core.std.FrameEval(temp, right_func) @@ -1181,14 +1226,14 @@ def timepoint_func(n: int) -> vs.VideoNode: return temp_gray.std.BlankClip(color=tp, keep=True) tp_clip = core.std.FrameEval(temp_gray, timepoint_func) - output = RIFEMerge( + output0 = RIFEMerge( clipa=left_clip, clipb=right_clip, mask=tp_clip, scale=scale, tiles=tiles, tilesize=tilesize, overlap=overlap, model=model, backend=backend, ensemble=ensemble, _implementation=_implementation ) - left0 = bits_as(left_clip, output) + left0 = bits_as(left_clip, output0) def filter_sc(n: int, f: vs.VideoFrame) -> vs.VideoNode: current_time = dst_duration * n @@ -1200,9 +1245,9 @@ def filter_sc(n: int, f: vs.VideoFrame) -> vs.VideoNode: ): return left0 else: - return output + return output0 - res = core.std.FrameEval(output, filter_sc, left0) + res = core.std.FrameEval(output0, filter_sc, left0) else: if not hasattr(core, 'akarin') or \ not hasattr(core.akarin, 'PropExpr') or \ @@ -1373,6 +1418,210 @@ def SAFA( return clip +@enum.unique +class SCUNetModel(enum.IntEnum): + scunet_color_15 = 0 + scunet_color_25 = 1 + scunet_color_50 = 2 + scunet_color_real_psnr = 3 + scunet_color_real_gan = 4 + scunet_gray_15 = 5 + scunet_gray_25 = 6 + scunet_gray_50 = 7 + + +def SCUNet( + clip: vs.VideoNode, + tiles: typing.Optional[typing.Union[int, typing.Tuple[int, int]]] = None, + tilesize: typing.Optional[typing.Union[int, typing.Tuple[int, int]]] = None, + overlap: typing.Optional[typing.Union[int, typing.Tuple[int, int]]] = None, + model: SCUNetModel = SCUNetModel.scunet_color_real_psnr, + backend: backendT = Backend.OV_CPU() +) -> vs.VideoNode: + """ Practical Blind Denoising via Swin-Conv-UNet and Data Synthesis + + Unlike vs-scunet v1.0.0, the default model is set to scunet_color_real_psnr due to the color shift. 
+ """ + + func_name = "vsmlrt.SCUNet" + + if not isinstance(clip, vs.VideoNode): + raise TypeError(f'{func_name}: "clip" must be a clip!') + + if clip.format.sample_type != vs.FLOAT or clip.format.bits_per_sample not in [16, 32]: + raise ValueError(f"{func_name}: only constant format 16/32 bit float input supported") + + if not isinstance(model, int) or model not in SCUNetModel.__members__.values(): + raise ValueError(f'{func_name}: invalid "model"') + + if model in range(5) and clip.format.color_family != vs.RGB: + raise ValueError(f'{func_name}: "clip" must be of RGB color family') + elif model in range(5, 8) and clip.format.color_family != vs.GRAY: + raise ValueError(f'{func_name}: "clip" must be of GRAY color family') + + if overlap is None: + overlap_w = overlap_h = 16 + elif isinstance(overlap, int): + overlap_w = overlap_h = overlap + else: + overlap_w, overlap_h = overlap + + multiple = 1 + + (tile_w, tile_h), (overlap_w, overlap_h) = calc_tilesize( + tiles=tiles, tilesize=tilesize, + width=clip.width, height=clip.height, + multiple=multiple, + overlap_w=overlap_w, overlap_h=overlap_h + ) + + if tile_w % multiple != 0 or tile_h % multiple != 0: + raise ValueError( + f'{func_name}: tile size must be divisible by {multiple} ({tile_w}, {tile_h})' + ) + + backend = init_backend( + backend=backend, + trt_opt_shapes=(tile_w, tile_h) + ) + + network_path = os.path.join( + models_path, + "scunet", + f"{tuple(SCUNetModel.__members__)[model]}.onnx" + ) + + clip = inference_with_fallback( + clips=[clip], network_path=network_path, + overlap=(overlap_w, overlap_h), tilesize=(tile_w, tile_h), + backend=backend + ) + + return clip + + +@enum.unique +class SwinIRModel(enum.IntEnum): + lightweightSR_DIV2K_s64w8_SwinIR_S_x2 = 0 + lightweightSR_DIV2K_s64w8_SwinIR_S_x3 = 1 + lightweightSR_DIV2K_s64w8_SwinIR_S_x4 = 2 + realSR_BSRGAN_DFOWMFC_s64w8_SwinIR_L_x4_GAN = 3 + # unused + realSR_BSRGAN_DFOWMFC_s64w8_SwinIR_L_x4_PSNR = 5 + classicalSR_DF2K_s64w8_SwinIR_M_x2 = 6 + classicalSR_DF2K_s64w8_SwinIR_M_x3 = 7 + classicalSR_DF2K_s64w8_SwinIR_M_x4 = 8 + classicalSR_DF2K_s64w8_SwinIR_M_x8 = 9 + realSR_BSRGAN_DFO_s64w8_SwinIR_M_x2_GAN = 10 + realSR_BSRGAN_DFO_s64w8_SwinIR_M_x2_PSNR = 11 + realSR_BSRGAN_DFO_s64w8_SwinIR_M_x4_GAN = 12 + realSR_BSRGAN_DFO_s64w8_SwinIR_M_x4_PSNR = 13 + grayDN_DFWB_s128w8_SwinIR_M_noise15 = 14 + grayDN_DFWB_s128w8_SwinIR_M_noise25 = 15 + grayDN_DFWB_s128w8_SwinIR_M_noise50 = 16 + colorDN_DFWB_s128w8_SwinIR_M_noise15 = 17 + colorDN_DFWB_s128w8_SwinIR_M_noise25 = 18 + colorDN_DFWB_s128w8_SwinIR_M_noise50 = 19 + CAR_DFWB_s126w7_SwinIR_M_jpeg10 = 20 + CAR_DFWB_s126w7_SwinIR_M_jpeg20 = 21 + CAR_DFWB_s126w7_SwinIR_M_jpeg30 = 22 + CAR_DFWB_s126w7_SwinIR_M_jpeg40 = 23 + colorCAR_DFWB_s126w7_SwinIR_M_jpeg10 = 24 + colorCAR_DFWB_s126w7_SwinIR_M_jpeg20 = 25 + colorCAR_DFWB_s126w7_SwinIR_M_jpeg30 = 26 + colorCAR_DFWB_s126w7_SwinIR_M_jpeg40 = 27 + + +def SwinIR( + clip: vs.VideoNode, + tiles: typing.Optional[typing.Union[int, typing.Tuple[int, int]]] = None, + tilesize: typing.Optional[typing.Union[int, typing.Tuple[int, int]]] = None, + overlap: typing.Optional[typing.Union[int, typing.Tuple[int, int]]] = None, + model: SwinIRModel = SwinIRModel.lightweightSR_DIV2K_s64w8_SwinIR_S_x2, + backend: backendT = Backend.OV_CPU() +) -> vs.VideoNode: + """ SwinIR: Image Restoration Using Swin Transformer """ + + func_name = "vsmlrt.SwinIR" + + if not isinstance(clip, vs.VideoNode): + raise TypeError(f'{func_name}: "clip" must be a clip!') + + if clip.format.sample_type != vs.FLOAT or 
clip.format.bits_per_sample not in [16, 32]: + raise ValueError(f"{func_name}: only constant format 16/32 bit float input supported") + + if not isinstance(model, int) or model not in SwinIRModel.__members__.values(): + raise ValueError(f'{func_name}: invalid "model"') + + if model in range(14, 17) or model in range(20, 24): + if clip.format.color_family != vs.GRAY: + raise ValueError(f'{func_name}: "clip" must be of GRAY color family') + elif clip.format.color_family != vs.RGB: + raise ValueError(f'{func_name}: "clip" must be of RGB color family') + + if overlap is None: + overlap_w = overlap_h = 16 + elif isinstance(overlap, int): + overlap_w = overlap_h = overlap + else: + overlap_w, overlap_h = overlap + + multiple = 1 + + (tile_w, tile_h), (overlap_w, overlap_h) = calc_tilesize( + tiles=tiles, tilesize=tilesize, + width=clip.width, height=clip.height, + multiple=multiple, + overlap_w=overlap_w, overlap_h=overlap_h + ) + + if tile_w % multiple != 0 or tile_h % multiple != 0: + raise ValueError( + f'{func_name}: tile size must be divisible by {multiple} ({tile_w}, {tile_h})' + ) + + backend = init_backend( + backend=backend, + trt_opt_shapes=(tile_w, tile_h) + ) + + if model < 4: + model_name = tuple(SwinIRModel.__members__)[model] + else: + model_name = tuple(SwinIRModel.__members__)[model - 1] + + model_name = model_name.replace("SwinIR_", "SwinIR-") + + if model in range(3): + model_name = f"002_{model_name}" + elif model in (3, 5): + model_name = f"003_{model_name}" + elif model in range(6, 10): + model_name = f"001_{model_name}" + elif model in range(10, 14): + model_name = f"003_{model_name}" + elif model in range(14, 17): + model_name = f"004_{model_name}" + elif model in range(17, 20): + model_name = f"005_{model_name}" + elif model in range(20, 28): + model_name = f"006_{model_name}" + + network_path = os.path.join( + models_path, + "swinir", + f"{model_name}.onnx" + ) + + clip = inference_with_fallback( + clips=[clip], network_path=network_path, + overlap=(overlap_w, overlap_h), tilesize=(tile_w, tile_h), + backend=backend + ) + + return clip + + def get_engine_path( network_path: str, min_shapes: typing.Tuple[int, int], @@ -1386,7 +1635,12 @@ def get_engine_path( tf32: bool, use_cudnn: bool, input_format: int, - output_format: int + output_format: int, + builder_optimization_level: int, + max_aux_streams: typing.Optional[int], + short_path: typing.Optional[bool], + bf16: bool, + engine_folder: typing.Optional[str] ) -> str: with open(network_path, "rb") as file: @@ -1401,30 +1655,42 @@ def get_engine_path( device_name = f"device{device_id}" if static_shape: - shape_str = f".{opt_shapes[0]}x{opt_shapes[1]}" + shape_str = f"{opt_shapes[0]}x{opt_shapes[1]}" else: shape_str = ( - f".min{min_shapes[0]}x{min_shapes[1]}" + f"min{min_shapes[0]}x{min_shapes[1]}" f"_opt{opt_shapes[0]}x{opt_shapes[1]}" f"_max{max_shapes[0]}x{max_shapes[1]}" ) - return ( - network_path + + identity = ( shape_str + ("_fp16" if fp16 else "") + - ("_no-tf32" if not tf32 else "") + + ("_tf32" if tf32 else "") + + ("_bf16" if bf16 else "") + (f"_workspace{workspace}" if workspace is not None else "") + + f"_opt{builder_optimization_level}" + + (f"_max-aux-streams{max_aux_streams}" if max_aux_streams is not None else "") + f"_trt-{trt_version}" + ("_cublas" if use_cublas else "") + ("_cudnn" if use_cudnn else "") + "_I-" + ("fp32" if input_format == 0 else "fp16") + "_O-" + ("fp32" if output_format == 0 else "fp16") + f"_{device_name}" + - f"_{checksum:x}" + - ".engine" + f"_{checksum:x}" ) + dirname, basename 
= os.path.split(network_path) + + if engine_folder is not None: + os.makedirs(engine_folder, exist_ok=True) + dirname = engine_folder + + if short_path or (short_path is None and platform.system() == "Windows"): + return os.path.join(dirname, f"{zlib.crc32((basename + identity).encode()):x}.engine") + else: + return f"{os.path.join(dirname, basename)}.{identity}.engine" + def trtexec( network_path: str, @@ -1433,12 +1699,12 @@ def trtexec( max_shapes: typing.Tuple[int, int], fp16: bool, device_id: int, - workspace: typing.Optional[int] = 128, + workspace: typing.Optional[int] = None, verbose: bool = False, use_cuda_graph: bool = False, use_cublas: bool = False, static_shape: bool = True, - tf32: bool = True, + tf32: bool = False, log: bool = False, use_cudnn: bool = True, use_edge_mask_convolutions: bool = True, @@ -1451,11 +1717,15 @@ def trtexec( faster_dynamic_shapes: bool = True, force_fp16: bool = False, builder_optimization_level: int = 3, + max_aux_streams: typing.Optional[int] = None, + short_path: typing.Optional[bool] = None, + bf16: bool = False, custom_env: typing.Dict[str, str] = {}, - custom_args: typing.List[str] = [] + custom_args: typing.List[str] = [], + engine_folder: typing.Optional[str] = None ) -> str: - # tensort runtime version, e.g. 8401 => 8.4.1 + # tensort runtime version trt_version = parse_trt_version(int(core.trt.Version()["tensorrt_version"])) if isinstance(opt_shapes, int): @@ -1467,6 +1737,7 @@ def trtexec( if force_fp16: fp16 = True tf32 = False + bf16 = False engine_path = get_engine_path( network_path=network_path, @@ -1481,19 +1752,26 @@ def trtexec( tf32=tf32, use_cudnn=use_cudnn, input_format=input_format, - output_format=output_format + output_format=output_format, + builder_optimization_level=builder_optimization_level, + max_aux_streams=max_aux_streams, + short_path=short_path, + bf16=bf16, + engine_folder=engine_folder, ) if os.access(engine_path, mode=os.R_OK): return engine_path - alter_engine_path = os.path.join( - tempfile.gettempdir(), - os.path.splitdrive(engine_path)[1][1:] - ) + # do not consider alternative path when the engine_folder is given + if engine_folder is None: + alter_engine_path = os.path.join( + tempfile.gettempdir(), + os.path.splitdrive(engine_path)[1][1:] + ) - if os.access(alter_engine_path, mode=os.R_OK): - return alter_engine_path + if os.access(alter_engine_path, mode=os.R_OK): + return alter_engine_path try: # test writability @@ -1501,12 +1779,16 @@ def trtexec( pass os.remove(engine_path) except PermissionError: - print(f"{engine_path} not writable", file=sys.stderr) - engine_path = alter_engine_path - dirname = os.path.dirname(engine_path) - if not os.path.exists(dirname): - os.makedirs(dirname) - print(f"change engine path to {engine_path}", file=sys.stderr) + if engine_folder is None: + print(f"{engine_path} is not writable", file=sys.stderr) + engine_path = alter_engine_path + dirname = os.path.dirname(engine_path) + if not os.path.exists(dirname): + os.makedirs(dirname) + print(f"change engine path to {engine_path}", file=sys.stderr) + else: + # do not consider alternative path when the engine_folder is given + raise PermissionError(f"{engine_path} is not writable") args = [ trtexec_path, @@ -1537,17 +1819,38 @@ def trtexec( if verbose: args.append("--verbose") - disabled_tactic_sources = [] - if not use_cublas: - disabled_tactic_sources.extend(["-CUBLAS", "-CUBLAS_LT"]) - if not use_cudnn: - disabled_tactic_sources.append("-CUDNN") - if not use_edge_mask_convolutions and trt_version >= (8, 4, 1): - 
disabled_tactic_sources.append("-EDGE_MASK_CONVOLUTIONS") - if not use_jit_convolutions and trt_version >= (8, 5, 0): - disabled_tactic_sources.append("-JIT_CONVOLUTIONS") - if disabled_tactic_sources: - args.append(f"--tacticSources={','.join(disabled_tactic_sources)}") + preview_features = [] + if (use_cublas or use_cudnn) and (8, 6, 0) <= trt_version < (10, 0, 0): + preview_features.append("-disableExternalTacticSourcesForCore0805") + + if preview_features and trt_version >= (8, 5, 0): + args.append(f"--preview={','.join(preview_features)}") + + tactic_sources = [] + + if use_cublas: + tactic_sources.extend(["+CUBLAS", "+CUBLAS_LT"]) + else: + tactic_sources.extend(["-CUBLAS", "-CUBLAS_LT"]) + + if use_cudnn: + tactic_sources.append("+CUDNN") + else: + tactic_sources.append("-CUDNN") + + if trt_version >= (8, 4, 1): + if use_edge_mask_convolutions: + tactic_sources.append("+EDGE_MASK_CONVOLUTIONS") + else: + tactic_sources.append("-EDGE_MASK_CONVOLUTIONS") + + if trt_version >= (8, 5, 0): + if use_jit_convolutions: + tactic_sources.append("+JIT_CONVOLUTIONS") + else: + tactic_sources.append("-JIT_CONVOLUTIONS") + + args.append(f"--tacticSources={','.join(tactic_sources)}") if use_cuda_graph: args.extend(( @@ -1555,7 +1858,7 @@ def trtexec( "--noDataTransfers" )) else: - if trt_version >= 8600: + if trt_version >= (8, 6, 0): args.append("--skipInference") else: args.append("--buildOnly") @@ -1590,6 +1893,13 @@ def trtexec( if trt_version >= (8, 6, 0): args.append(f"--builderOptimizationLevel={builder_optimization_level}") + if max_aux_streams is not None: + args.append(f"--maxAuxStreams={max_aux_streams}") + + if trt_version >= (9, 0, 0): + if bf16: + args.append("--bf16") + args.extend(custom_args) if log: @@ -1598,7 +1908,7 @@ def trtexec( if prev_env_value is not None and len(prev_env_value) > 0: # env_key has been set, no extra action - env = {env_key: prev_env_value} + env = {env_key: prev_env_value, "CUDA_MODULE_LOADING": "LAZY"} env.update(**custom_env) subprocess.run(args, env=env, check=True, stdout=sys.stderr) else: @@ -1609,7 +1919,7 @@ def trtexec( f"trtexec_{time_str}.log" ) - env = {env_key: log_filename} + env = {env_key: log_filename, "CUDA_MODULE_LOADING": "LAZY"} env.update(**custom_env) completed_process = subprocess.run(args, env=env, check=False, stdout=sys.stderr) @@ -1639,7 +1949,8 @@ def get_mxr_path( fp16: bool, fast_math: bool, exhaustive_tune: bool, - device_id: int + device_id: int, + short_path: typing.Optional[bool] ) -> str: with open(network_path, "rb") as file: @@ -1665,7 +1976,11 @@ def get_mxr_path( f"_{checksum:x}" ) - return f"{network_path}.{identity}.mxr" + if short_path or (short_path is None and platform.system() == "Windows"): + dirname, basename = os.path.split(network_path) + return os.path.join(dirname, f"{zlib.crc32((basename + identity).encode()):x}.mxr") + else: + return f"{network_path}.{identity}.mxr" def migraphx_driver( @@ -1677,6 +1992,7 @@ def migraphx_driver( exhaustive_tune: bool, device_id: int, input_name: str = "input", + short_path: typing.Optional[bool] = None, custom_env: typing.Dict[str, str] = {}, custom_args: typing.List[str] = [] ) -> str: @@ -1690,7 +2006,8 @@ def migraphx_driver( fp16=fp16, fast_math=fast_math, exhaustive_tune=exhaustive_tune, - device_id=device_id + device_id=device_id, + short_path=short_path ) if os.access(mxr_path, mode=os.R_OK): @@ -1805,6 +2122,8 @@ def init_backend( backend = Backend.ORT_DML() elif backend is Backend.MIGX: # type: ignore backend = Backend.MIGX() + elif backend is Backend.OV_NPU: + 
backend = Backend.OV_NPU() backend = copy.deepcopy(backend) @@ -1866,6 +2185,19 @@ def _inference( fp16_blacklist_ops=backend.fp16_blacklist_ops ) elif isinstance(backend, Backend.ORT_CUDA): + kwargs = dict() + + version_list = core.ort.Version().get("onnxruntime_version", b"0.0.0").split(b'.') + if len(version_list) != 3: + version = (0, 0, 0) + else: + version = tuple(map(int, version_list)) + + if version >= (1, 18, 0): + kwargs["prefer_nhwc"] = backend.prefer_nhwc + kwargs["output_format"] = backend.output_format + kwargs["tf32"] = backend.tf32 + clip = core.ort.Model( clips, network_path, overlap=overlap, tilesize=tilesize, @@ -1877,34 +2209,66 @@ def _inference( fp16=backend.fp16, path_is_serialization=path_is_serialization, use_cuda_graph=backend.use_cuda_graph, - fp16_blacklist_ops=backend.fp16_blacklist_ops + fp16_blacklist_ops=backend.fp16_blacklist_ops, + **kwargs ) elif isinstance(backend, Backend.OV_CPU): - config = lambda: dict( - CPU_THROUGHPUT_STREAMS=backend.num_streams, - CPU_BIND_THREAD="YES" if backend.bind_thread else "NO", - CPU_THREADS_NUM=backend.num_threads, - ENFORCE_BF16="YES" if backend.bf16 else "NO" - ) + version = tuple(map(int, core.ov.Version().get("openvino_version", b"0.0.0").split(b'-')[0].split(b'.'))) + + if version >= (2024, 0, 0): + config_dict = dict( + NUM_STREAMS=backend.num_streams, + INFERENCE_NUM_THREADS=backend.num_threads, + ENABLE_CPU_PINNING="YES" if backend.bind_thread else "NO" + ) + if backend.fp16: + config_dict["INFERENCE_PRECISION_HINT"] = "f16" + elif backend.bf16: + config_dict["INFERENCE_PRECISION_HINT"] = "bf16" + else: + config_dict["INFERENCE_PRECISION_HINT"] = "f32" + + config = lambda: config_dict + else: + config = lambda: dict( + CPU_THROUGHPUT_STREAMS=backend.num_streams, + CPU_BIND_THREAD="YES" if backend.bind_thread else "NO", + CPU_THREADS_NUM=backend.num_threads, + ENFORCE_BF16="YES" if backend.bf16 else "NO" + ) clip = core.ov.Model( clips, network_path, overlap=overlap, tilesize=tilesize, device="CPU", builtin=False, - fp16=backend.fp16, + fp16=False, # use ov's internal quantization config=config, path_is_serialization=path_is_serialization, - fp16_blacklist_ops=backend.fp16_blacklist_ops + fp16_blacklist_ops=backend.fp16_blacklist_ops # disabled since fp16 = False ) elif isinstance(backend, Backend.OV_GPU): - config = lambda: dict( - GPU_THROUGHPUT_STREAMS=backend.num_streams - ) + version = tuple(map(int, core.ov.Version().get("openvino_version", b"0.0.0").split(b'-')[0].split(b'.'))) + + if version >= (2024, 0, 0): + config_dict = dict( + NUM_STREAMS=backend.num_streams, + ) + if backend.fp16: + config_dict["INFERENCE_PRECISION_HINT"] = "f16" + else: + config_dict["INFERENCE_PRECISION_HINT"] = "f32" + + config = lambda: config_dict + else: + config = lambda: dict( + GPU_THROUGHPUT_STREAMS=backend.num_streams + ) + clip = core.ov.Model( clips, network_path, overlap=overlap, tilesize=tilesize, device=f"GPU.{backend.device_id}", builtin=False, - fp16=backend.fp16, + fp16=False, # use ov's internal quantization config=config, path_is_serialization=path_is_serialization, fp16_blacklist_ops=backend.fp16_blacklist_ops @@ -1945,8 +2309,12 @@ def _inference( faster_dynamic_shapes=backend.faster_dynamic_shapes, force_fp16=backend.force_fp16, builder_optimization_level=backend.builder_optimization_level, + max_aux_streams=backend.max_aux_streams, + short_path=backend.short_path, + bf16=backend.bf16, custom_env=backend.custom_env, - custom_args=backend.custom_args + custom_args=backend.custom_args, + 
engine_folder=backend.engine_folder, ) clip = core.trt.Model( clips, engine_path, @@ -1986,6 +2354,7 @@ def _inference( exhaustive_tune=backend.exhaustive_tune, device_id=backend.device_id, input_name=input_name, + short_path=backend.short_path, custom_env=backend.custom_env, custom_args=backend.custom_args ) @@ -1995,6 +2364,14 @@ def _inference( tilesize=tilesize, device_id=backend.device_id ) + elif isinstance(backend, Backend.OV_NPU): + clip = core.ov.Model( + clips, network_path, + overlap=overlap, tilesize=tilesize, + device="NPU", builtin=False, + fp16=False, # use ov's internal quantization + path_is_serialization=path_is_serialization, + ) else: raise TypeError(f'unknown backend {backend}') @@ -2101,9 +2478,9 @@ class BackendV2: def TRT(*, num_streams: int = 1, fp16: bool = False, - tf32: bool = True, + tf32: bool = False, output_format: int = 0, # 0: fp32, 1: fp16 - workspace: typing.Optional[int] = 128, + workspace: typing.Optional[int] = None, use_cuda_graph: bool = False, static_shape: bool = True, min_shapes: typing.Tuple[int, int] = (0, 0), @@ -2111,7 +2488,7 @@ def TRT(*, max_shapes: typing.Optional[typing.Tuple[int, int]] = None, force_fp16: bool = False, use_cublas: bool = False, - use_cudnn: bool = True, + use_cudnn: bool = False, device_id: int = 0, **kwargs ) -> Backend.TRT: @@ -2224,18 +2601,25 @@ def MIGX(*, **kwargs ) + @staticmethod + def OV_NPU(**kwargs + ) -> Backend.OV_NPU: + return Backend.OV_NPU( + **kwargs + ) + def fmtc_resample(clip: vs.VideoNode, **kwargs) -> vs.VideoNode: clip_org = clip if clip.format.sample_type == vs.FLOAT and clip.format.bits_per_sample != 32: format = clip.format.replace(core=core, bits_per_sample=32) - clip = core.resize.Point(clip, format=format) + clip = core.resize.Point(clip, format=format.id) clip = core.fmtc.resample(clip, **kwargs) if clip.format.bits_per_sample != clip_org.format.bits_per_sample: - clip = core.resize.Point(clip, format=clip_org.format) + clip = core.resize.Point(clip, format=clip_org.format.id) return clip diff --git a/vsncnn/vs_ncnn.cpp b/vsncnn/vs_ncnn.cpp index 8b06821..cabe182 100644 --- a/vsncnn/vs_ncnn.cpp +++ b/vsncnn/vs_ncnn.cpp @@ -22,17 +22,12 @@ #include #include -#include "config.h" // generated by cmake #include -#include "onnx2ncnn.hpp" +#include "../common/onnx_utils.h" +#include "onnx2ncnn.hpp" -extern std::variant loadONNX( - const std::string_view & path, - int64_t tile_w, - int64_t tile_h, - bool path_is_serialization -) noexcept; +#include "config.h" // generated by cmake static const VSPlugin * myself = nullptr; diff --git a/vsort/CMakeLists.txt b/vsort/CMakeLists.txt index 85dcae3..c85ddc1 100644 --- a/vsort/CMakeLists.txt +++ b/vsort/CMakeLists.txt @@ -39,7 +39,14 @@ if (CMAKE_CXX_STANDARD GREATER_EQUAL 20) set_target_properties(vsort PROPERTIES CXX_STANDARD 20) endif() -target_link_libraries(vsort PRIVATE onnx onnxruntime) +# https://github.com/onnx/onnx/commit/21bff4e55dcefecc069c679115baae6b00caa0d5 +if (ONNX_VERSION VERSION_LESS 1.16.0) + target_link_libraries(vsort PRIVATE onnx) +else() + target_link_libraries(vsort PRIVATE ONNX::onnx) +endif() + +target_link_libraries(vsort PRIVATE onnxruntime) if (ENABLE_CUDA) find_package(CUDAToolkit REQUIRED) diff --git a/vsort/vs_onnxruntime.cpp b/vsort/vs_onnxruntime.cpp index f543afd..290c1d2 100644 --- a/vsort/vs_onnxruntime.cpp +++ b/vsort/vs_onnxruntime.cpp @@ -1,11 +1,9 @@ #include #include #include -#include #include #include #include -#include #include #include #include @@ -27,31 +25,20 @@ using namespace std::chrono_literals; #define 
NOMINMAX #include +#include #ifdef ENABLE_CUDA #include #endif // ENABLE_CUDA #ifdef ENABLE_DML -// include/onnxruntime/core/providers/dml/dml_provider_factory.h -#include <../providers/dml/dml_provider_factory.h> +#include #endif // ENABLE_DML -#include "config.h" - +#include "../common/convert_float_to_float16.h" +#include "../common/onnx_utils.h" -extern std::variant loadONNX( - const std::string_view & path, - int64_t tile_w, - int64_t tile_h, - bool path_is_serialization -) noexcept; - -extern void convert_float_to_float16( - ONNX_NAMESPACE::ModelProto & model, - bool force_fp16_initializers, - const std::unordered_set & op_block_list -) noexcept; +#include "config.h" #ifdef ENABLE_COREML @@ -87,6 +74,7 @@ static std::mutex capture_lock; // rename GridSample to com.microsoft::GridSample // onnxruntime has support for CUDA-accelerated GridSample only in its own opset domain static void rename(ONNX_NAMESPACE::ModelProto & model) { +#if ORT_API_VERSION < 18 constexpr auto ms_domain = "com.microsoft"; bool has_ms_opset = false; @@ -109,6 +97,7 @@ static void rename(ONNX_NAMESPACE::ModelProto & model) { *node.mutable_domain() = ms_domain; } } +#endif // ORT_API_VERSION < 18 } @@ -176,6 +165,19 @@ static std::variant> getShape( return std::get>(maybe_shape); } +static size_t getNumBytes(int32_t type) { + using namespace ONNX_NAMESPACE; + + switch (type) { + case TensorProto::FLOAT: + return 4; + case TensorProto::FLOAT16: + return 2; + default: + return 0; + } +} + static int numPlanes( const std::vector & vis @@ -197,8 +199,12 @@ static std::optional checkNodes( ) noexcept { for (const auto & vi : vis) { - if (vi->format->sampleType != stFloat || vi->format->bitsPerSample != 32) { - return "expects clip with type fp32"; + if (vi->format->sampleType != stFloat) { + return "expects clip with floating-point type"; + } + + if (vi->format->bitsPerSample != 32 && vi->format->bitsPerSample != 16) { + return "expects clip with type fp32 or fp16"; } if (vi->width != vis[0]->width || vi->height != vis[0]->height) { @@ -234,8 +240,8 @@ static std::optional checkIOInfo( ONNXTensorElementDataType element_type; checkError(ortapi->GetTensorElementType(tensor_info, &element_type)); - if (element_type != ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT) { - return set_error("expects network IO with type fp32"); + if (element_type != ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT && element_type != ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16) { + return set_error("expects network IO with type fp32 or fp16"); } size_t num_dims; @@ -341,6 +347,23 @@ static std::optional checkNodesAndNetwork( return set_error("tile size larger than clip dimension"); } + OrtTypeInfo * output_type_info; + checkError(ortapi->SessionGetOutputTypeInfo(session, 0, &output_type_info)); + + const OrtTensorTypeAndShapeInfo * output_tensor_info; + checkError(ortapi->CastTypeInfoToTensorInfo(output_type_info, &output_tensor_info)); + + auto network_out_dims = std::get>(getShape(output_tensor_info)); + + auto network_out_height = network_out_dims[2]; + auto network_out_width = network_out_dims[3]; + + if (network_out_height % network_in_height != 0 || network_out_width % network_in_width != 0) { + return set_error("output dimensions must be divisible by input dimensions"); + } + + ortapi->ReleaseTypeInfo(output_type_info); + ortapi->ReleaseTypeInfo(input_type_info); return {}; @@ -351,16 +374,17 @@ static void setDimensions( const std::array & input_shape, const std::array & output_shape, VSCore * core, - const VSAPI * vsapi + const VSAPI * vsapi, + int32_t 
onnx_output_type ) noexcept { vi->height *= output_shape[2] / input_shape[2]; vi->width *= output_shape[3] / input_shape[3]; if (output_shape[1] == 1) { - vi->format = vsapi->registerFormat(cmGray, stFloat, 32, 0, 0, core); + vi->format = vsapi->registerFormat(cmGray, stFloat, 8 * getNumBytes(onnx_output_type), 0, 0, core); } else if (output_shape[1] == 3) { - vi->format = vsapi->registerFormat(cmRGB, stFloat, 32, 0, 0, core); + vi->format = vsapi->registerFormat(cmRGB, stFloat, 8 * getNumBytes(onnx_output_type), 0, 0, core); } } @@ -565,9 +589,23 @@ static const VSFrameRef *VS_CC vsOrtGetFrame( return nullptr; }; + OrtRunOptions * run_options {}; + #ifdef ENABLE_CUDA if (d->backend == Backend::CUDA) { checkCUDAError(cudaSetDevice(d->device_id)); + +#if ORT_API_VERSION >= 16 + checkError(ortapi->CreateRunOptions(&run_options)); + if (run_options == nullptr) { + return set_error("create run_options failed"); + } + checkError(ortapi->AddRunConfigEntry( + run_options, + kOrtRunOptionsConfigDisableSynchronizeExecutionProviders, + "1" + )); +#endif // ORT_API_VERSION >= 16 } #endif // ENABLE_CUDA @@ -627,9 +665,9 @@ static const VSFrameRef *VS_CC vsOrtGetFrame( resource.stream )); - // OrtCUDAProviderOptionsV2 disallows using custom user stream - // and the inference is executed on a private non-blocking stream +#if ORT_API_VERSION < 16 checkCUDAError(cudaStreamSynchronize(resource.stream)); +#endif // ORT_API_VERSION < 16 } #endif // ENABLE_CUDA @@ -644,17 +682,17 @@ static const VSFrameRef *VS_CC vsOrtGetFrame( // note that this applies only to stream capture from the ort library // this fails when another plugin also uses global-mode stream capture std::lock_guard _ { capture_lock }; - checkError(ortapi->RunWithBinding(resource.session, nullptr, resource.binding)); + checkError(ortapi->RunWithBinding(resource.session, run_options, resource.binding)); // onnxruntime replays the graph itself in CUDAExecutionProvider::OnRunEnd } else #endif // ENABLE_CUDA if (d->backend == Backend::CPU || d->backend == Backend::CUDA) { - checkError(ortapi->RunWithBinding(resource.session, nullptr, resource.binding)); + checkError(ortapi->RunWithBinding(resource.session, run_options, resource.binding)); } else { checkError(ortapi->Run( resource.session, - nullptr, + run_options, &resource.input_name, &resource.input_tensor, 1, @@ -735,6 +773,10 @@ static const VSFrameRef *VS_CC vsOrtGetFrame( y = std::min(y + step_h, src_height - src_tile_h); } + if (run_options) { + ortapi->ReleaseRunOptions(run_options); + } + d->release(ticket); for (const auto & frame : src_frames) { @@ -908,6 +950,18 @@ static void VS_CC vsOrtCreate( if (error) { cudnn_benchmark = true; } + +#if ORT_API_VERSION >= 17 + bool prefer_nhwc = !!(vsapi->propGetInt(in, "prefer_nhwc", 0, &error)); + if (error) { + prefer_nhwc = false; + } +#endif // ORT_API_VERSION >= 17 + + bool tf32 = !!(vsapi->propGetInt(in, "tf32", 0, &error)); + if (error) { + tf32 = false; + } #endif // ENABLE_CUDA if (auto err = ortInit(); err.has_value()) { @@ -929,6 +983,14 @@ static void VS_CC vsOrtCreate( use_cuda_graph = false; } + int output_format = int64ToIntS(vsapi->propGetInt(in, "output_format", 0, &error)); + if (error) { + output_format = 0; + } + if (output_format != 0 && output_format != 1) { + return set_error("\"output_format\" must be 0 or 1"); + } + std::string_view path_view; std::string path; if (path_is_serialization) { @@ -976,11 +1038,26 @@ static void VS_CC vsOrtCreate( fp16_blacklist_ops.emplace(vsapi->propGetData(in, "fp16_blacklist_ops", i, 
nullptr)); } } - convert_float_to_float16(onnx_model, false, fp16_blacklist_ops); + convert_float_to_float16( + onnx_model, + false, + fp16_blacklist_ops, + in_vis.front()->format->bytesPerSample == 4, + output_format == 0 + ); } rename(onnx_model); + auto onnx_input_type = onnx_model.graph().input()[0].type().tensor_type().elem_type(); + auto onnx_output_type = onnx_model.graph().output()[0].type().tensor_type().elem_type(); + + if (onnx_input_type == ONNX_NAMESPACE::TensorProto::FLOAT && in_vis.front()->format->bitsPerSample != 32) { + return set_error("the onnx requires input to be of type fp32"); + } else if (onnx_input_type == ONNX_NAMESPACE::TensorProto::FLOAT16 && in_vis.front()->format->bitsPerSample != 16) { + return set_error("the onnx requires input to be of type fp16"); + } + std::string onnx_data = onnx_model.SerializeAsString(); if (std::size(onnx_data) == 0) { return set_error("proto serialization failed"); @@ -1041,6 +1118,8 @@ static void VS_CC vsOrtCreate( // TODO: other providers #ifdef ENABLE_CUDA if (d->backend == Backend::CUDA) { + checkCUDAError(cudaStreamCreateWithFlags(&resource.stream, cudaStreamNonBlocking)); + OrtCUDAProviderOptionsV2 * cuda_options; checkError(ortapi->CreateCUDAProviderOptions(&cuda_options)); #ifdef _MSC_VER @@ -1061,7 +1140,11 @@ static void VS_CC vsOrtCreate( "cudnn_conv_algo_search", "cudnn_conv_use_max_workspace", "arena_extend_strategy", - "enable_cuda_graph" + "enable_cuda_graph", +#if ORT_API_VERSION >= 17 + "prefer_nhwc", + "use_tf32", +#endif // ORT_API_VERSION >= 17 }; auto device_id_str = std::to_string(d->device_id); const char * values [] { @@ -1069,7 +1152,11 @@ static void VS_CC vsOrtCreate( "EXHAUSTIVE", "1", "kSameAsRequested", - "0" + "0", +#if ORT_API_VERSION >= 17 + "0", + "0", +#endif // ORT_API_VERSION >= 17 }; if (!cudnn_benchmark) { values[1] = "HEURISTIC"; @@ -1080,8 +1167,24 @@ static void VS_CC vsOrtCreate( } else { resource.require_replay = false; } +#if ORT_API_VERSION >= 17 + if (prefer_nhwc) { + values[5] = "1"; + } + if (tf32) { + values[6] = "1"; + } +#endif // ORT_API_VERSION >= 17 checkError(ortapi->UpdateCUDAProviderOptions(cuda_options, keys, values, std::size(keys))); +#if ORT_API_VERSION >= 16 + checkError(ortapi->UpdateCUDAProviderOptionsWithValue( + cuda_options, + "user_compute_stream", + resource.stream + )); +#endif // ORT_API_VERSION >= 16 + checkError(ortapi->SessionOptionsAppendExecutionProvider_CUDA_V2(session_options, cuda_options)); ortapi->ReleaseCUDAProviderOptions(cuda_options); @@ -1122,14 +1225,12 @@ static void VS_CC vsOrtCreate( #ifdef ENABLE_CUDA if (d->backend == Backend::CUDA) { - checkCUDAError(cudaStreamCreateWithFlags(&resource.stream, cudaStreamNonBlocking)); - resource.input.size = ( input_shape[0] * input_shape[1] * input_shape[2] * input_shape[3] - ) * sizeof(float); + ) * getNumBytes(onnx_input_type); checkCUDAError(cudaMallocHost( &resource.input.h_data, resource.input.size, @@ -1141,7 +1242,8 @@ static void VS_CC vsOrtCreate( memory_info, resource.input.d_data, resource.input.size, std::data(input_shape), std::size(input_shape), - ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT, &resource.input_tensor + static_cast(onnx_input_type), + &resource.input_tensor )); } else #endif // ENALBE_CUDA @@ -1149,7 +1251,7 @@ static void VS_CC vsOrtCreate( checkError(ortapi->CreateTensorAsOrtValue( cpu_allocator, std::data(input_shape), std::size(input_shape), - ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT, + static_cast(onnx_input_type), &resource.input_tensor )); } @@ -1165,7 +1267,7 @@ static void VS_CC 
vsOrtCreate( output_shape[1] * output_shape[2] * output_shape[3] - ) * sizeof(float); + ) * getNumBytes(onnx_output_type); checkCUDAError(cudaMallocHost(&resource.output.h_data, resource.output.size)); checkCUDAError(cudaMalloc(&resource.output.d_data, resource.output.size)); @@ -1174,7 +1276,8 @@ static void VS_CC vsOrtCreate( memory_info, resource.output.d_data, resource.output.size, std::data(output_shape), std::size(output_shape), - ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT, &resource.output_tensor + static_cast(onnx_output_type), + &resource.output_tensor )); } else #endif // ENABLE_CUDA @@ -1182,7 +1285,7 @@ static void VS_CC vsOrtCreate( checkError(ortapi->CreateTensorAsOrtValue( cpu_allocator, std::data(output_shape), std::size(output_shape), - ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT, + static_cast(onnx_output_type), &resource.output_tensor )); } @@ -1204,7 +1307,7 @@ static void VS_CC vsOrtCreate( } if (i == 0) { - setDimensions(d->out_vi, input_shape, output_shape, core, vsapi); + setDimensions(d->out_vi, input_shape, output_shape, core, vsapi, onnx_output_type); } d->resources.push_back(resource); @@ -1249,6 +1352,9 @@ VS_EXTERNAL_API(void) VapourSynthPluginInit( "path_is_serialization:int:opt;" "use_cuda_graph:int:opt;" "fp16_blacklist_ops:data[]:opt;" + "prefer_nhwc:int:opt;" + "output_format:int:opt;" + "tf32:int:opt;" , vsOrtCreate, nullptr, plugin @@ -1258,10 +1364,26 @@ VS_EXTERNAL_API(void) VapourSynthPluginInit( vsapi->propSetData(out, "version", VERSION, -1, paReplace); vsapi->propSetData( - out, "onnxruntime_version", + out, "onnxruntime_api_version_build", std::to_string(ORT_API_VERSION).c_str(), -1, paReplace ); + if (auto err = ortInit(); err.has_value()) { + vsapi->logMessage(mtWarning, err.value().c_str()); + } else { + if (auto p = OrtGetApiBase(); p) { + vsapi->propSetData( + out, "onnxruntime_version", + p->GetVersionString(), -1, paReplace + ); + } + + vsapi->propSetData( + out, "onnxruntime_build_info", + ortapi->GetBuildInfoString(), -1, paReplace + ); + } + #ifdef ENABLE_CUDA vsapi->propSetData( out, "cuda_runtime_version", diff --git a/vsort/win32.cpp b/vsort/win32.cpp index 8741167..cca5e18 100644 --- a/vsort/win32.cpp +++ b/vsort/win32.cpp @@ -22,7 +22,6 @@ static std::vector cudaDlls { L"cudart64", L"cublasLt64", L"cublas64", L"cufft64", - L"zlibwapi", // cuDNN version 8.3.0+ depends on zlib as a shared library dependency L"cudnn_ops_infer64", L"cudnn_cnn_infer64", L"cudnn_adv_infer64", L"cudnn64", L"cupti64", }; diff --git a/vsov/CMakeLists.txt b/vsov/CMakeLists.txt index f003806..e6f033c 100644 --- a/vsov/CMakeLists.txt +++ b/vsov/CMakeLists.txt @@ -7,7 +7,6 @@ set(ENABLE_VISUALIZATION OFF CACHE BOOL "Enable support for network visualizatio set(WIN32_SHARED_OPENVINO OFF CACHE BOOL "Build for win32 with shared openvino library") find_package(OpenVINO REQUIRED CONFIG) -find_package(InferenceEngine REQUIRED CONFIG) add_library(vsov SHARED vs_openvino.cpp @@ -24,20 +23,21 @@ if(WIN32_SHARED_OPENVINO) target_compile_definitions(vsov PRIVATE WIN32_SHARED_OPENVINO) endif() -target_include_directories(vsov PRIVATE - ${VAPOURSYNTH_INCLUDE_DIRECTORY} - ${ONNX_INCLUDE_DIRS} - ${InferenceEngine_INCLUDE_DIRS} -) - if(WIN32_SHARED_OPENVINO) find_package(protobuf REQUIRED CONFIG) find_package(ONNX REQUIRED CONFIG) - target_link_libraries(vsov PRIVATE ${InferenceEngine_LIBRARIES} onnx) + target_link_libraries(vsov PRIVATE onnx) else() - target_link_libraries(vsov PRIVATE ${InferenceEngine_LIBRARIES} openvino::onnx) + target_link_libraries(vsov PRIVATE openvino::frontend::onnx) 
endif() +target_include_directories(vsov PRIVATE + ${VAPOURSYNTH_INCLUDE_DIRECTORY} + ${ONNX_INCLUDE_DIRS} +) + +target_link_libraries(vsov PRIVATE openvino::runtime) + set_target_properties(vsov PROPERTIES CXX_EXTENSIONS OFF CXX_STANDARD 17 diff --git a/vsov/vs_openvino.cpp b/vsov/vs_openvino.cpp index f186be4..ce6dfe3 100644 --- a/vsov/vs_openvino.cpp +++ b/vsov/vs_openvino.cpp @@ -20,28 +20,17 @@ #include #include -#include +#include #include #ifdef ENABLE_VISUALIZATION #include #endif // ENABLE_VISUALIZATION -#include "config.h" - - -extern std::variant loadONNX( - const std::string_view & path, - int64_t tile_w, - int64_t tile_h, - bool path_is_serialization -) noexcept; +#include "../common/convert_float_to_float16.h" +#include "../common/onnx_utils.h" -extern void convert_float_to_float16( - ONNX_NAMESPACE::ModelProto & model, - bool force_fp16_initializers, - const std::unordered_set & op_block_list -) noexcept; +#include "config.h" using namespace std::string_literals; @@ -50,16 +39,16 @@ static const VSPlugin * myself = nullptr; static std::array getShape( - const InferenceEngine::ExecutableNetwork & network, + const ov::CompiledModel & network, bool input ) { - InferenceEngine::SizeVector dims; + ov::Shape dims; if (input) { - dims = network.GetInputsInfo().cbegin()->second->getTensorDesc().getDims(); + dims = network.input().get_shape(); } else { - dims = network.GetOutputsInfo().cbegin()->second->getTensorDesc().getDims(); + dims = network.output().get_shape(); } std::array ret; @@ -112,21 +101,19 @@ static std::optional checkNodes( } -template [[nodiscard]] static std::optional checkIOInfo( - const T & info, + const ov::Output & info, bool is_output ) { - if (info->getPrecision() != InferenceEngine::Precision::FP32) { + if (info.get_element_type() != ov::element::f32) { return "expects network IO with type fp32"; } - const auto & desc = info->getTensorDesc(); - if (desc.getLayout() != InferenceEngine::Layout::NCHW) { - return "expects network IO with layout NCHW"; - } - const auto & dims = desc.getDims(); + // if (ov::layout::get_layout(info) != ov::Layout("NCHW")) { + // return "expects network IO with layout NCHW"; + // } + const auto & dims = info.get_shape(); if (dims.size() != 4) { return "expects network with 4-D IO"; } @@ -148,27 +135,23 @@ static std::optional checkIOInfo( [[nodiscard]] static std::optional checkNetwork( - const InferenceEngine::CNNNetwork & network + const std::shared_ptr & network ) { - const auto & inputs_info = network.getInputsInfo(); - - if (auto num_inputs = std::size(inputs_info); num_inputs != 1) { + if (auto num_inputs = std::size(network->inputs()); num_inputs != 1) { return "network input count must be 1, got " + std::to_string(num_inputs); } - const auto & input_info = inputs_info.cbegin()->second; + const auto & input_info = network->input(); if (auto err = checkIOInfo(input_info, false); err.has_value()) { return err.value(); } - const auto & outputs_info = network.getOutputsInfo(); - - if (auto num_outputs = std::size(outputs_info); num_outputs != 1) { + if (auto num_outputs = std::size(network->outputs()); num_outputs != 1) { return "network output count must be 1, got " + std::to_string(num_outputs); } - const auto & output_info = outputs_info.cbegin()->second; + const auto & output_info = network->output(); if (auto err = checkIOInfo(output_info, true); err.has_value()) { return err.value(); } @@ -179,12 +162,12 @@ static std::optional checkNetwork( [[nodiscard]] static std::optional checkNodesAndNetwork( - const 
InferenceEngine::ExecutableNetwork & network, + const ov::CompiledModel & network, const std::vector & vis ) { const auto & network_in_dims = ( - network.GetInputsInfo().cbegin()->second->getTensorDesc().getDims() + network.input().get_tensor().get_shape() ); int network_in_channels = static_cast(network_in_dims[1]); @@ -205,15 +188,16 @@ static std::optional checkNodesAndNetwork( } + static void setDimensions( std::unique_ptr & vi, - const InferenceEngine::ExecutableNetwork & network, + const ov::CompiledModel & network, VSCore * core, const VSAPI * vsapi ) { - auto in_dims = network.GetInputsInfo().cbegin()->second->getTensorDesc().getDims(); - auto out_dims = network.GetOutputsInfo().cbegin()->second->getTensorDesc().getDims(); + const auto & in_dims = network.input().get_shape(); + const auto & out_dims = network.output().get_shape(); vi->height *= out_dims[2] / in_dims[2]; vi->width *= out_dims[3] / in_dims[3]; @@ -226,13 +210,13 @@ static void setDimensions( } -static std::variant> getConfig( +static std::variant getConfig( VSFuncRef * config_func, VSCore * core, const VSAPI * vsapi ) { - std::map config; + ov::AnyMap config; if (config_func == nullptr) { return config; @@ -285,13 +269,10 @@ struct OVData { int overlap_w, overlap_h; - InferenceEngine::Core core; - InferenceEngine::ExecutableNetwork executable_network; - std::unordered_map infer_requests; + ov::Core core; + ov::CompiledModel executable_network; + std::unordered_map infer_requests; std::shared_mutex infer_requests_lock; - - std::string input_name; - std::string output_name; }; @@ -396,7 +377,7 @@ static const VSFrameRef *VS_CC vsOvGetFrame( auto thread_id = std::this_thread::get_id(); bool initialized = true; - InferenceEngine::InferRequest * infer_request; + ov::InferRequest * infer_request; d->infer_requests_lock.lock_shared(); try { @@ -409,9 +390,9 @@ static const VSFrameRef *VS_CC vsOvGetFrame( if (!initialized) { std::lock_guard _ { d->infer_requests_lock }; try { - d->infer_requests.emplace(thread_id, d->executable_network.CreateInferRequest()); - } catch (const InferenceEngine::Exception& e) { - return set_error("[IE exception] Create inference request: "s + e.what()); + d->infer_requests.emplace(thread_id, d->executable_network.create_infer_request()); + } catch (const ov::Exception & e) { + return set_error("[OV exception] Create inference request: "s + e.what()); } catch (const std::exception& e) { return set_error("[Standard exception] Create inference request: "s + e.what()); } @@ -429,11 +410,7 @@ static const VSFrameRef *VS_CC vsOvGetFrame( int x_crop_end = (x == src_width - src_tile_w) ? 
0 : d->overlap_w; { - InferenceEngine::Blob::Ptr input = infer_request->GetBlob(d->input_name); - - auto minput = input->as(); - auto minputHolder = minput->wmap(); - uint8_t * input_buffer = minputHolder.as(); + auto input_buffer = (uint8_t *) infer_request->get_input_tensor().data(); for (const auto & _src_ptr : src_ptrs) { const uint8_t * src_ptr { _src_ptr + @@ -451,19 +428,15 @@ static const VSFrameRef *VS_CC vsOvGetFrame( } try { - infer_request->Infer(); - } catch (const InferenceEngine::Exception & e) { - return set_error("[IE exception] Create inference request: "s + e.what()); + infer_request->infer(); + } catch (const ov::Exception & e) { + return set_error("[OV exception] Create inference request: "s + e.what()); } catch (const std::exception& e) { return set_error("[Standard exception] Create inference request: "s + e.what()); } { - InferenceEngine::Blob::CPtr output = infer_request->GetBlob(d->output_name); - - auto moutput = output->as(); - auto moutputHolder = moutput->rmap(); - const uint8_t * output_buffer = moutputHolder.as(); + auto output_buffer = (const uint8_t *) infer_request->get_output_tensor().data(); for (int plane = 0; plane < dst_planes; ++plane) { uint8_t * dst_ptr = (dst_ptrs[plane] + @@ -533,11 +506,11 @@ static void VS_CC vsOvCreate( ) { std::unique_ptr d = nullptr; - + try { d = std::make_unique(); - } catch (const InferenceEngine::Exception& e) { - vsapi->setError(out, ("[IE exception] Initialize inference engine: "s + e.what()).c_str()); + } catch (const ov::Exception& e) { + vsapi->setError(out, ("[OV exception] Initialize inference engine: "s + e.what()).c_str()); return ; } catch (const std::exception& e) { vsapi->setError(out, ("[Standard exception] Initialize inference engine: "s + e.what()).c_str()); @@ -675,12 +648,11 @@ static void VS_CC vsOvCreate( } { - InferenceEngine::CNNNetwork network; + std::shared_ptr network; try { - auto empty = InferenceEngine::Blob::CPtr(); - network = d->core.ReadNetwork(onnx_data, empty); - } catch (const InferenceEngine::Exception& e) { - return set_error("[IE exception] ReadNetwork(): "s + e.what()); + network = d->core.read_model(onnx_data, ov::Tensor()); + } catch (const ov::Exception& e) { + return set_error("[OV exception] ReadNetwork(): "s + e.what()); } catch (const std::exception& e) { return set_error("[Standard exception] ReadNetwork(): "s + e.what()); } @@ -689,10 +661,8 @@ static void VS_CC vsOvCreate( return set_error(err.value()); } - auto function = network.getFunction(); // mutable - try { - ov::pass::ConstantFolding().run_on_model(function); + ov::pass::ConstantFolding().run_on_model(network); } catch (const ov::Exception & e) { return set_error(e.what()); } @@ -701,7 +671,7 @@ static void VS_CC vsOvCreate( const char * dot_path = vsapi->propGetData(in, "dot_path", 0, &error); if (!error) { try { - ov::pass::VisualizeTree(dot_path, nullptr, true).run_on_model(function); + ov::pass::VisualizeTree(dot_path, nullptr, true).run_on_model(network); } catch (const ov::Exception & e) { return set_error(e.what()); } @@ -714,11 +684,11 @@ static void VS_CC vsOvCreate( if (std::holds_alternative(config_ret)) { return set_error(std::get(config_ret)); } - auto & config = std::get>(config_ret); + auto & config = std::get(config_ret); try { - d->executable_network = d->core.LoadNetwork(network, device, config); - } catch (const InferenceEngine::Exception & e) { + d->executable_network = d->core.compile_model(network, device, config); + } catch (const ov::Exception & e) { return set_error(e.what()); } @@ -728,9 
+698,6 @@ static void VS_CC vsOvCreate( setDimensions(d->out_vi, d->executable_network, core, vsapi); - d->input_name = d->executable_network.GetInputsInfo().cbegin()->first; - d->output_name = d->executable_network.GetOutputsInfo().cbegin()->first; - VSCoreInfo core_info; vsapi->getCoreInfo2(core, &core_info); d->infer_requests.reserve(core_info.numThreads); @@ -780,8 +747,10 @@ VS_EXTERNAL_API(void) VapourSynthPluginInit( vsapi->propSetData(out, "version", VERSION, -1, paReplace); std::ostringstream ostream; - ostream << IE_VERSION_MAJOR << '.' << IE_VERSION_MINOR << '.' << IE_VERSION_PATCH; - vsapi->propSetData(out, "inference_engine_version", ostream.str().c_str(), -1, paReplace); + ostream << OPENVINO_VERSION_MAJOR << '.' << OPENVINO_VERSION_MINOR << '.' << OPENVINO_VERSION_PATCH; + vsapi->propSetData(out, "openvino_version_build", ostream.str().c_str(), -1, paReplace); + + vsapi->propSetData(out, "openvino_version", ov::get_openvino_version().buildNumber, -1, paReplace); vsapi->propSetData( out, "onnx_version", @@ -798,13 +767,13 @@ VS_EXTERNAL_API(void) VapourSynthPluginInit( auto availableDevices = [](const VSMap *, VSMap * out, void *, VSCore *, const VSAPI *vsapi) { try { - auto core = InferenceEngine::Core(); - auto devices = core.GetAvailableDevices(); + auto core = ov::Core(); + auto devices = core.get_available_devices(); for (const auto & device : devices) { vsapi->propSetData(out, "devices", device.c_str(), -1, paAppend); } - } catch (const InferenceEngine::Exception& e) { - vsapi->setError(out, ("[IE exception] Initialize inference engine: "s + e.what()).c_str()); + } catch (const ov::Exception& e) { + vsapi->setError(out, ("[OV exception] Initialize inference engine: "s + e.what()).c_str()); } catch (const std::exception& e) { vsapi->setError(out, ("[Standard exception] Initialize inference engine: "s + e.what()).c_str()); } diff --git a/vsov/win32.cpp b/vsov/win32.cpp index a6572d7..e0b8d4d 100644 --- a/vsov/win32.cpp +++ b/vsov/win32.cpp @@ -14,10 +14,10 @@ namespace { std::vector dlls = { // This list must be sorted by dependency. 
#ifdef WIN32_SHARED_OPENVINO - L"tbb.dll", + L"tbb12.dll", L"openvino.dll", // must be the last #else // WIN32_SHARED_OPENVINO - L"tbb.dll", // must be the last + L"tbb12.dll", // must be the last #endif // WIN32_SHARED_OPENVINO }; diff --git a/vstrt/CMakeLists.txt b/vstrt/CMakeLists.txt index 7bddf7e..212a48f 100644 --- a/vstrt/CMakeLists.txt +++ b/vstrt/CMakeLists.txt @@ -8,7 +8,7 @@ set(VAPOURSYNTH_INCLUDE_DIRECTORY "" CACHE PATH "Path to VapourSynth headers") set(TENSORRT_HOME "" CACHE PATH "Path to TensorRT") option(USE_NVINFER_PLUGIN "Initialize nvinfer_plugin" FALSE) option(USE_NVINFER_PLUGIN_STATIC "Use static nvinfer_plugin" FALSE) -set(CUDNN_HOME "" CACHE PATH "Path to cuDNN") +set(TENSORRT_LIBRARY_SUFFIX "" CACHE STRING "TensorRT library suffix") FIND_PACKAGE(CUDAToolkit REQUIRED) @@ -31,28 +31,21 @@ set_target_properties(vstrt PROPERTIES ) target_link_directories(vstrt PRIVATE ${TENSORRT_HOME}/lib) -target_link_libraries(vstrt PRIVATE CUDA::cudart_static nvinfer) +target_link_libraries(vstrt PRIVATE CUDA::cudart_static "nvinfer${TENSORRT_LIBRARY_SUFFIX}") if (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC") - target_link_options(vstrt PRIVATE "/DELAYLOAD:nvinfer.dll" "delayimp.lib") + target_link_options(vstrt PRIVATE "/DELAYLOAD:nvinfer${TENSORRT_LIBRARY_SUFFIX}.dll" "delayimp.lib") endif() if (USE_NVINFER_PLUGIN) add_definitions(-DUSE_NVINFER_PLUGIN) if (USE_NVINFER_PLUGIN_STATIC) - target_link_libraries(vstrt PRIVATE nvinfer_plugin_static) - - find_library(CUDNN_LIB cudnn HINTS ${CUDNN_ROOT_DIR} PATH_SUFFIXES lib) - target_link_libraries(vstrt PRIVATE ${CUDNN_LIB} CUDA::cublas) - - if (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC") - target_link_options(vstrt PRIVATE "/DELAYLOAD:cublas64_11.dll" "/DELAYLOAD:cudnn64_8.dll") - endif() + target_link_libraries(vstrt PRIVATE "nvinfer_plugin_static${TENSORRT_LIBRARY_SUFFIX}") else() - target_link_libraries(vstrt PRIVATE nvinfer_plugin) + target_link_libraries(vstrt PRIVATE "nvinfer_plugin${TENSORRT_LIBRARY_SUFFIX}") if (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC") - target_link_options(vstrt PRIVATE "/DELAYLOAD:nvinfer_plugin.dll") + target_link_options(vstrt PRIVATE "/DELAYLOAD:nvinfer_plugin${TENSORRT_LIBRARY_SUFFIX}.dll") endif() endif() endif() diff --git a/vstrt/trt_utils.h b/vstrt/trt_utils.h index 3a2e3c8..3c0feb9 100644 --- a/vstrt/trt_utils.h +++ b/vstrt/trt_utils.h @@ -7,7 +7,6 @@ #include #include #include -#include #include #include @@ -255,6 +254,16 @@ size_t getBytesPerSample(nvinfer1::DataType type) noexcept { return 1; case nvinfer1::DataType::kUINT8: return 1; +#if (NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR) * 10 + NV_TENSORRT_PATCH >= 861 + case nvinfer1::DataType::kFP8: + return 1; +#endif // (NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR) * 10 + NV_TENSORRT_PATCH >= 861 +#if NV_TENSORRT_MAJOR >= 9 + case nvinfer1::DataType::kBF16: + return 2; + case nvinfer1::DataType::kINT64: + return 8; +#endif // NV_TENSORRT_MAJOR >= 9 default: return 0; } @@ -566,11 +575,20 @@ int getSampleType(nvinfer1::DataType type) noexcept { switch (type) { case nvinfer1::DataType::kFLOAT: case nvinfer1::DataType::kHALF: +#if (NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR) * 10 + NV_TENSORRT_PATCH >= 861 + case nvinfer1::DataType::kFP8: +#endif // (NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR) * 10 + NV_TENSORRT_PATCH >= 861 +#if NV_TENSORRT_MAJOR >= 9 + case nvinfer1::DataType::kBF16: +#endif // NV_TENSORRT_MAJOR >= 9 return 1; case nvinfer1::DataType::kINT8: case nvinfer1::DataType::kINT32: case nvinfer1::DataType::kBOOL: case nvinfer1::DataType::kUINT8: +#if 
NV_TENSORRT_MAJOR >= 9 + case nvinfer1::DataType::kINT64: +#endif // NV_TENSORRT_MAJOR >= 9 return 0; default: return -1; diff --git a/vstrt/trtexec/CMakeLists.txt b/vstrt/trtexec/CMakeLists.txt index 0b77c94..482ba83 100644 --- a/vstrt/trtexec/CMakeLists.txt +++ b/vstrt/trtexec/CMakeLists.txt @@ -3,6 +3,7 @@ cmake_minimum_required(VERSION 3.20) project(trtexec LANGUAGES CXX) set(TENSORRT_HOME "" CACHE PATH "Path to TensorRT") +set(TENSORRT_LIBRARY_SUFFIX "" CACHE STRING "TensorRT library suffix") find_package(CUDAToolkit REQUIRED) @@ -10,16 +11,21 @@ add_executable(trtexec $<$: trtexec.manifest> trtexec.cpp logfile.cpp + ../common/bfloat16.cpp ../common/logger.cpp + ../common/sampleDevice.cpp ../common/sampleEngines.cpp ../common/sampleInference.cpp ../common/sampleOptions.cpp ../common/sampleReporting.cpp ../common/sampleUtils.cpp + ../utils/fileLock.cpp + ../utils/timingCache.cpp ) target_include_directories(trtexec PRIVATE ../common + .. ${CUDAToolkit_INCLUDE_DIRS} ${TENSORRT_HOME}/include ) @@ -31,10 +37,9 @@ target_link_directories(trtexec PRIVATE target_link_libraries(trtexec PRIVATE CUDA::cudart_static - nvinfer - nvinfer_plugin - nvparsers - nvonnxparser + "nvinfer${TENSORRT_LIBRARY_SUFFIX}" + "nvinfer_plugin${TENSORRT_LIBRARY_SUFFIX}" + "nvonnxparser${TENSORRT_LIBRARY_SUFFIX}" ) install(TARGETS trtexec RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}) diff --git a/vstrt/vs_tensorrt.cpp b/vstrt/vs_tensorrt.cpp index b16402b..0f22832 100644 --- a/vstrt/vs_tensorrt.cpp +++ b/vstrt/vs_tensorrt.cpp @@ -366,9 +366,12 @@ static void VS_CC vsTrtCreate( d->logger.set_verbosity(static_cast(verbosity)); #ifdef USE_NVINFER_PLUGIN + // related to https://github.com/AmusementClub/vs-mlrt/discussions/65, for unknown reason +#if !(NV_TENSORRT_MAJOR == 9 && defined(_WIN32)) if (!initLibNvInferPlugins(&d->logger, "")) { vsapi->logMessage(mtWarning, "vsTrt: Initialize TensorRT plugins failed"); } +#endif #endif std::ifstream engine_stream { @@ -446,7 +449,17 @@ static void VS_CC vsTrtCreate( auto input_type = d->engines[0]->getBindingDataType(0); #endif // NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85 - auto input_sample_type = getSampleType(input_type) == 0 ? stInteger : stFloat; + VSSampleType input_sample_type; + { + auto sample_type = getSampleType(input_type); + if (sample_type == 0) { + input_sample_type = stInteger; + } else if (sample_type == 1) { + input_sample_type = stFloat; + } else { + return set_error("unknown input sample type"); + } + } auto input_bits_per_sample = getBytesPerSample(input_type) * 8; if (auto err = checkNodes(in_vis, input_sample_type, input_bits_per_sample); err.has_value()) { @@ -462,7 +475,17 @@ static void VS_CC vsTrtCreate( auto output_type = d->engines[0]->getBindingDataType(1); #endif // NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85 - auto output_sample_type = getSampleType(output_type) == 0 ? 
stInteger : stFloat; + VSSampleType output_sample_type; + { + auto sample_type = getSampleType(output_type); + if (sample_type == 0) { + output_sample_type = stInteger; + } else if (sample_type == 1) { + output_sample_type = stFloat; + } else { + return set_error("unknown output sample type"); + } + } auto output_bits_per_sample = getBytesPerSample(output_type) * 8; setDimensions( @@ -489,13 +512,22 @@ VS_EXTERNAL_API(void) VapourSynthPluginInit( VAPOURSYNTH_API_VERSION, 1, plugin ); + // TRT 9 for windows does not export getInferLibVersion() +#if NV_TENSORRT_MAJOR == 9 && defined(_WIN32) + auto test = getPluginRegistry(); + + if (test == nullptr) { + std::fprintf(stderr, "vstrt: TensorRT failed to load.\n"); + return; + } +#else // NV_TENSORRT_MAJOR == 9 && defined(_WIN32) int ver = getInferLibVersion(); // must ensure this is the first nvinfer function called #ifdef _WIN32 if (ver == 0) { // a sentinel value, see dummy function in win32.cpp. std::fprintf(stderr, "vstrt: TensorRT failed to load.\n"); return; } -#endif +#endif // _WIN32 if (ver != NV_TENSORRT_VERSION) { #if NV_TENSORRT_MAJOR >= 10 std::fprintf( @@ -513,6 +545,7 @@ VS_EXTERNAL_API(void) VapourSynthPluginInit( ); #endif // NV_TENSORRT_MAJOR >= 10 } +#endif // NV_TENSORRT_MAJOR == 9 && defined(_WIN32) myself = plugin; @@ -535,7 +568,12 @@ VS_EXTERNAL_API(void) VapourSynthPluginInit( vsapi->propSetData( out, "tensorrt_version", - std::to_string(getInferLibVersion()).c_str(), -1, paReplace +#if NV_TENSORRT_MAJOR == 9 && defined(_WIN32) + std::to_string(NV_TENSORRT_VERSION).c_str(), +#else + std::to_string(getInferLibVersion()).c_str(), +#endif + -1, paReplace ); vsapi->propSetData( diff --git a/vstrt/win32.cpp b/vstrt/win32.cpp index a0cfcfc..b984803 100644 --- a/vstrt/win32.cpp +++ b/vstrt/win32.cpp @@ -10,22 +10,32 @@ #include +#include + +#if NV_TENSORRT_VERSION >= 100100 +#define CONCAT_VERSION(name, version) (name "_" #version ".dll") +#endif // NV_TENSORRT_VERSION >= 100100 + namespace { std::vector dlls = { // This list must be sorted by dependency. - L"zlibwapi.dll", // cuDNN version 8.3.0+ depends on zlib as a shared library dependency - L"cudnn_ops_infer64_8.dll", - L"cudnn_cnn_infer64_8.dll", - L"cudnn64_8.dll", +#if NV_TENSORRT_VERSION >= 100100 +#ifdef USE_NVINFER_PLUGIN + // nvinfer_plugin dependencies + CONCAT_VERSION(L"nvinfer", NV_TENSORRT_MAJOR), + CONCAT_VERSION(L"nvinfer_plugin", NV_TENSORRT_MAJOR), +#endif // USE_NVINFER_PLUGIN + // Finally, nvinfer again. + CONCAT_VERSION(L"nvinfer", NV_TENSORRT_MAJOR), // must be the last +#else // NV_TENSORRT_VERSION >= 100100 #ifdef USE_NVINFER_PLUGIN // nvinfer_plugin dependencies L"nvinfer.dll", - L"cublasLt64_11.dll", - L"cublas64_11.dll", L"nvinfer_plugin.dll", -#endif +#endif // USE_NVINFER_PLUGIN // Finally, nvinfer again. L"nvinfer.dll", // must be the last +#endif // NV_TENSORRT_VERSION >= 100100 }; namespace fs = std::filesystem; @@ -64,7 +74,11 @@ FARPROC loadDLLs() { return (FARPROC)h; } +#if NV_TENSORRT_MAJOR == 9 && defined(_WIN32) +static void * dummy() { // mimic getPluginRegistry +#else static int dummy() { // mimic getInferLibVersion +#endif return 0; }
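
The get_engine_path() changes in vsmlrt.py above extend the cached engine's identity string (tf32/bf16 flags, builder optimization level, aux-stream count) and add two placement options: engine_folder redirects the cache directory, and short_path, which defaults to on for Windows, folds the long identity into a CRC32 hex digest so the filename stays well under MAX_PATH. Below is a minimal, self-contained sketch of that naming scheme; the helper name make_engine_path, its reduced parameter list and the example identity string are illustrative only.

    import os
    import platform
    import typing
    import zlib

    def make_engine_path(
        network_path: str,
        identity: str,
        short_path: typing.Optional[bool] = None,
        engine_folder: typing.Optional[str] = None,
    ) -> str:
        # split the ONNX path so the cache can optionally be redirected
        dirname, basename = os.path.split(network_path)

        if engine_folder is not None:
            os.makedirs(engine_folder, exist_ok=True)
            dirname = engine_folder

        # on Windows the full identity easily exceeds MAX_PATH, so it is
        # hashed into a CRC32 hex digest unless short_path is set to False
        if short_path or (short_path is None and platform.system() == "Windows"):
            digest = zlib.crc32((basename + identity).encode())
            return os.path.join(dirname, f"{digest:x}.engine")

        return f"{os.path.join(dirname, basename)}.{identity}.engine"

    # e.g. a static-shape fp16 build for device 0
    print(make_engine_path("waifu2x.onnx", "1920x1080_fp16_opt3_trt-10.0.1_device0"))

Because the digest covers both the ONNX basename and the full identity string, changing any build flag still yields a distinct cache file even in short-path mode.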
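
The trtexec invocation in vsmlrt.py now emits an explicit +/- entry for every tactic source instead of only listing the disabled ones, and requests the -disableExternalTacticSourcesForCore0805 preview flag when cuBLAS or cuDNN tactics are wanted on TensorRT 8.6 through 9.x (TensorRT 10 drops those external sources). A condensed sketch of that flag construction follows, assuming trt_version has already been parsed into a tuple; the helper name tactic_args is illustrative.

    import typing

    def tactic_args(
        trt_version: typing.Tuple[int, int, int],
        use_cublas: bool = False,
        use_cudnn: bool = False,
        use_edge_mask_convolutions: bool = True,
        use_jit_convolutions: bool = True,
    ) -> typing.List[str]:
        args = []

        # TRT 8.6 hides external cuBLAS/cuDNN tactics behind a preview feature;
        # prefixing it with '-' turns the feature off and re-enables them
        if (use_cublas or use_cudnn) and (8, 6, 0) <= trt_version < (10, 0, 0):
            args.append("--preview=-disableExternalTacticSourcesForCore0805")

        sources = ["+CUBLAS", "+CUBLAS_LT"] if use_cublas else ["-CUBLAS", "-CUBLAS_LT"]
        sources.append("+CUDNN" if use_cudnn else "-CUDNN")
        if trt_version >= (8, 4, 1):
            sources.append(("+" if use_edge_mask_convolutions else "-") + "EDGE_MASK_CONVOLUTIONS")
        if trt_version >= (8, 5, 0):
            sources.append(("+" if use_jit_convolutions else "-") + "JIT_CONVOLUTIONS")
        args.append("--tacticSources=" + ",".join(sources))

        return args

    # e.g. a TensorRT 8.6.1 build with cuBLAS tactics enabled
    print(tactic_args((8, 6, 1), use_cublas=True))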
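
For the OV_CPU and OV_GPU backends, the wrapper above now reads core.ov.Version() and, from OpenVINO 2024.0 on, replaces the removed CPU_THROUGHPUT_STREAMS / CPU_BIND_THREAD / CPU_THREADS_NUM / ENFORCE_BF16 properties with NUM_STREAMS, INFERENCE_NUM_THREADS, ENABLE_CPU_PINNING and an INFERENCE_PRECISION_HINT, delegating fp16/bf16 handling to OpenVINO itself (the plugin is then called with fp16=False). A reduced sketch of the CPU-side selection; the helper name ov_cpu_config and its argument defaults are illustrative.

    import typing

    def ov_cpu_config(
        version: typing.Tuple[int, int, int],
        num_streams: int = 1,
        num_threads: int = 0,
        bind_thread: bool = True,
        fp16: bool = False,
        bf16: bool = False,
    ) -> typing.Dict[str, typing.Any]:
        if version >= (2024, 0, 0):
            config: typing.Dict[str, typing.Any] = dict(
                NUM_STREAMS=num_streams,
                INFERENCE_NUM_THREADS=num_threads,
                ENABLE_CPU_PINNING="YES" if bind_thread else "NO",
            )
            # precision is hinted to OpenVINO instead of converting the ONNX to fp16
            if fp16:
                config["INFERENCE_PRECISION_HINT"] = "f16"
            elif bf16:
                config["INFERENCE_PRECISION_HINT"] = "bf16"
            else:
                config["INFERENCE_PRECISION_HINT"] = "f32"
            return config

        # property names accepted by releases before 2024.0
        return dict(
            CPU_THROUGHPUT_STREAMS=num_streams,
            CPU_BIND_THREAD="YES" if bind_thread else "NO",
            CPU_THREADS_NUM=num_threads,
            ENFORCE_BF16="YES" if bf16 else "NO",
        )

    print(ov_cpu_config((2024, 0, 0), num_streams=2, fp16=True))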
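
The Backend.ORT_CUDA branch above forwards prefer_nhwc, output_format and tf32 to core.ort.Model only when the reported onnxruntime version is at least 1.18.0, presumably so that older vsort builds keep working with the unchanged call. A sketch of that version probe, assuming a bytes-valued property mapping like the one core.ort.Version() returns in the patch; the b"0.0.0" fallback mirrors the patch, the extra ValueError handling does not.

    import typing

    def parse_ort_version(props: typing.Dict[str, bytes]) -> typing.Tuple[int, ...]:
        raw = props.get("onnxruntime_version", b"0.0.0").split(b".")
        if len(raw) != 3:
            return (0, 0, 0)
        try:
            return tuple(int(part) for part in raw)
        except ValueError:
            # e.g. pre-release suffixes; treat as unknown/old
            return (0, 0, 0)

    kwargs: typing.Dict[str, typing.Any] = {}
    if parse_ort_version({"onnxruntime_version": b"1.18.1"}) >= (1, 18, 0):
        # these keyword arguments are only accepted by newer plugin builds
        kwargs.update(prefer_nhwc=False, output_format=0, tf32=False)
    print(kwargs)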