diff --git a/.github/workflows/linux-ov.yml b/.github/workflows/linux-ov.yml
index 6fbd1f1..fb03816 100644
--- a/.github/workflows/linux-ov.yml
+++ b/.github/workflows/linux-ov.yml
@@ -3,7 +3,6 @@ name: Build (Linux-OV)
on:
push:
paths:
- - 'common/**'
- 'vsov/**'
- '.github/workflows/linux-ov.yml'
workflow_dispatch:
@@ -18,7 +17,7 @@ jobs:
steps:
- name: Checkout repo
- uses: actions/checkout@v3
+ uses: actions/checkout@v4
with:
fetch-depth: 0
@@ -27,17 +26,17 @@ jobs:
- name: Cache protobuf
id: cache-protobuf
- uses: actions/cache@v3
+ uses: actions/cache@v4
with:
path: vsov/protobuf/install
key: ${{ runner.os }}-vsov-protobuf-v1
- name: Checkout protobuf
- uses: actions/checkout@v3
+ uses: actions/checkout@v4
if: steps.cache-protobuf.outputs.cache-hit != 'true'
with:
repository: protocolbuffers/protobuf
- # follows protobuf in https://github.com/openvinotoolkit/openvino/tree/2023.2.0/thirdparty/protobuf
+ # follows protobuf in https://github.com/openvinotoolkit/openvino/tree/2024.0.0/thirdparty/protobuf
# if you change this, remember to bump the version of the cache key.
ref: fe271ab76f2ad2b2b28c10443865d2af21e27e0e
fetch-depth: 1
@@ -60,19 +59,19 @@ jobs:
- name: Cache onnx
id: cache-onnx
- uses: actions/cache@v3
+ uses: actions/cache@v4
with:
path: vsov/onnx/install
key: ${{ runner.os }}-vsov-onnx-v1
- name: Checkout onnx
if: steps.cache-onnx.outputs.cache-hit != 'true'
- uses: actions/checkout@v3
+ uses: actions/checkout@v4
with:
repository: onnx/onnx
- # follows onnx in https://github.com/openvinotoolkit/openvino/tree/2023.2.0/thirdparty/onnx
+ # follows onnx in https://github.com/openvinotoolkit/openvino/tree/2024.0.0/thirdparty/onnx
# if you change this, remember to bump the version of the cache key.
- ref: 1014f41f17ecc778d63e760a994579d96ba471ff
+ ref: b86cc54efce19530fb953e4b21f57e6b3888534c
fetch-depth: 1
path: vsov/onnx
@@ -104,7 +103,7 @@ jobs:
- name: Setup OpenVINO
run: |
- curl -L -o ov.tgz https://storage.openvinotoolkit.org/repositories/openvino/packages/2023.2/linux/l_openvino_toolkit_ubuntu22_2023.2.0.13089.cfd42bd2cb0_x86_64.tgz
+ curl -L -o ov.tgz https://storage.openvinotoolkit.org/repositories/openvino/packages/2024.0/linux/l_openvino_toolkit_ubuntu22_2024.0.0.14509.34caeefd078_x86_64.tgz
tar -xf ov.tgz
mv l_openvino_* openvino -v
@@ -113,7 +112,6 @@ jobs:
-D CMAKE_BUILD_TYPE=Release
-D CMAKE_CXX_FLAGS="-Wall -ffast-math -march=x86-64-v3"
-D VAPOURSYNTH_INCLUDE_DIRECTORY="`pwd`/vapoursynth/include"
- -D InferenceEngine_DIR=openvino/runtime/cmake
-D OpenVINO_DIR=openvino/runtime/cmake
-D ENABLE_VISUALIZATION=ON
-D WIN32_SHARED_OPENVINO=ON
@@ -135,7 +133,7 @@ jobs:
run: git describe --tags --long
- name: Upload
- uses: actions/upload-artifact@v3
+ uses: actions/upload-artifact@v4
with:
name: VSOV-Linux-x64
path: vsov/artifact
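
The protobuf and onnx commits pinned above are meant to track the submodules of the OpenVINO tag being built against, and the cache keys must be bumped whenever they change. A small stand-alone helper for re-checking those pins, assuming the GitHub contents API (which reports submodule entries together with their pinned sha); the repository and paths mirror the comments in the workflow:

import json
import urllib.request

def submodule_pins(repo, path, ref):
    """Return {path: sha} for every submodule found under `path` at `ref`."""
    url = f"https://api.github.com/repos/{repo}/contents/{path}?ref={ref}"
    with urllib.request.urlopen(url) as resp:
        entries = json.load(resp)
    if isinstance(entries, dict):  # `path` itself is a single entry
        entries = [entries]
    return {e["path"]: e["sha"] for e in entries if e.get("type") == "submodule"}

# compare against the refs pinned in the workflow above
print(submodule_pins("openvinotoolkit/openvino", "thirdparty/protobuf", "2024.0.0"))
print(submodule_pins("openvinotoolkit/openvino", "thirdparty/onnx", "2024.0.0"))
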
diff --git a/.github/workflows/linux-trt.yml b/.github/workflows/linux-trt.yml
index 1efce0b..c97322b 100644
--- a/.github/workflows/linux-trt.yml
+++ b/.github/workflows/linux-trt.yml
@@ -17,7 +17,7 @@ jobs:
steps:
- name: Checkout repo
- uses: actions/checkout@v3
+ uses: actions/checkout@v4
with:
fetch-depth: 0
@@ -30,20 +30,16 @@ jobs:
unzip -q vs.zip
mv vapoursynth*/ vapoursynth
- - name: Setup CUDA
+ - name: Setup CUDA and TensorRT
run: |
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
sudo dpkg -i cuda-keyring_1.1-1_all.deb
sudo apt-get update
- sudo apt-get install -y cuda-nvcc-12-1 cuda-cudart-dev-12-1 libcublas-dev-12-1 libcudnn8=8.9.0.131-1+cuda12.1 libcudnn8-dev=8.9.0.131-1+cuda12.1
+ sudo apt-get install -y cuda-nvcc-12-4 cuda-cudart-dev-12-4 libnvinfer-dev=10.0.1.6-1+cuda12.4 libnvinfer-headers-dev=10.0.1.6-1+cuda12.4
echo "PATH=/usr/local/cuda/bin${PATH:+:${PATH}}" >> $GITHUB_ENV
echo "CUDA_PATH=/usr/local/cuda" >> $GITHUB_ENV
echo "LD_LIBRARY_PATH=/usr/local/cuda/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}" >> $GITHUB_ENV
- - name: Setup tensorrt
- run: |
- sudo apt-get install -y libnvinfer-dev=8.6.1.6-1+cuda12.0 libnvinfer-headers-dev=8.6.1.6-1+cuda12.0
-
- name: Configure
run: cmake -S . -B build -G Ninja -LA
-D CMAKE_BUILD_TYPE=Release
@@ -66,8 +62,9 @@ jobs:
run: git describe --tags --long
- name: Upload
- uses: actions/upload-artifact@v3
+ uses: actions/upload-artifact@v4
with:
name: VSTRT-Linux-x64
path: vstrt/artifact
+ overwrite: true
diff --git a/.github/workflows/windows-cuda-dependency.yml b/.github/workflows/windows-cuda-dependency.yml
index feba4fd..a53e389 100644
--- a/.github/workflows/windows-cuda-dependency.yml
+++ b/.github/workflows/windows-cuda-dependency.yml
@@ -28,63 +28,41 @@ jobs:
shell: bash
steps:
- # Note: unfortunately, cuDNN download requires registration and according to its license
- # agreements, we can only redistribute it along with an application that uses it, so we
- # have to use a private repository to hold a copy of the library.
- name: Download cuDNN inference library
- run: |
- # https://stackoverflow.com/questions/20396329/how-to-download-github-release-from-private-repo-using-command-line/35688093#35688093
-
- tag=cudnn-8.6.0-bin
- name=cudnn.7z
-
- AUTH="Authorization: token ${{ secrets.REPO_TOKEN }}"
- response=$(curl -sH "$AUTH" https://api.github.com/repos/AmusementClub/cuda/releases/tags/$tag)
- eval $(echo "$response" | grep -C3 "name.:.\+$NAME" | grep -w id | tr : = | tr -cd '[[:alnum:]]=')
- [ "$id" ] || { echo "Error: Failed to get asset id, response: $response" | awk 'length($0)<100' >&2; exit 1; }
- ASSET="https://api.github.com/repos/AmusementClub/cuda/releases/assets/$id"
- curl -LJO -H "$AUTH" -H 'Accept: application/octet-stream' "$ASSET"
+ run: curl -LJ https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/windows-x86_64/cudnn-windows-x86_64-8.9.7.29_cuda12-archive.zip -o cudnn.zip
- name: Extract cuDNN library
- run: 7z x cudnn.7z
+ run: unzip cudnn.zip
- name: Move cuDNN library
- run: mv cudnn/bin vsmlrt-cuda -v
+ run: |
+ mkdir -p vsmlrt-cuda
+ mv cudnn-windows-*/bin/*.dll vsmlrt-cuda/ -v
+ rm vsmlrt-cuda/cudnn_*_train*.dll -v
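
The cuDNN step now pulls the public redist archive and keeps only the inference DLLs, dropping the cudnn_*_train* libraries. A rough Python equivalent of those three steps for local experimentation; the URL is the one used above, while the paths and function name are illustrative:

import fnmatch
import shutil
import urllib.request
import zipfile
from pathlib import Path

CUDNN_URL = (
    "https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/"
    "windows-x86_64/cudnn-windows-x86_64-8.9.7.29_cuda12-archive.zip"
)

def fetch_cudnn_inference_dlls(dest):
    dest = Path(dest)
    dest.mkdir(parents=True, exist_ok=True)
    archive, _ = urllib.request.urlretrieve(CUDNN_URL, "cudnn.zip")
    with zipfile.ZipFile(archive) as zf:
        zf.extractall("cudnn_tmp")
    for dll in Path("cudnn_tmp").glob("cudnn-windows-*/bin/*.dll"):
        # training libraries are not needed at inference time
        if fnmatch.fnmatch(dll.name, "cudnn_*_train*.dll"):
            continue
        shutil.copy2(dll, dest / dll.name)

if __name__ == "__main__":
    fetch_cudnn_inference_dlls("vsmlrt-cuda")
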
- # Same licensing issue with the tensorrt libraries.
- name: Download TensorRT library
run: |
- # https://stackoverflow.com/questions/20396329/how-to-download-github-release-from-private-repo-using-command-line/35688093#35688093
-
- tag=tensorrt-8.5.1-bin
- name=TensorRT.7z
-
- AUTH="Authorization: token ${{ secrets.REPO_TOKEN }}"
- response=$(curl -sH "$AUTH" https://api.github.com/repos/AmusementClub/cuda/releases/tags/$tag)
- eval $(echo "$response" | grep -C3 "name.:.\+$NAME" | grep -w id | tr : = | tr -cd '[[:alnum:]]=')
- [ "$id" ] || { echo "Error: Failed to get asset id, response: $response" | awk 'length($0)<100' >&2; exit 1; }
- ASSET="https://api.github.com/repos/AmusementClub/cuda/releases/assets/$id"
- curl -LJO -H "$AUTH" -H 'Accept: application/octet-stream' "$ASSET"
-
- mv "$name" trt.7z
+ curl -L -o trt.zip https://developer.download.nvidia.com/compute/machine-learning/tensorrt/10.0.1/zip/TensorRT-10.0.1.6.Windows10.win10.cuda-12.4.zip
- name: Extract TensorRT library
- run: 7z x trt.7z
+ run: |
+ unzip trt.zip
+ mv TensorRT-*/ TensorRT/
- name: Move TensorRT library
- run: mv TensorRT/*.dll vsmlrt-cuda -v
+ run: mv TensorRT/lib/*.dll vsmlrt-cuda -v
- name: Download CUDA Libraries
shell: cmd
run: |
- curl -s -o cuda_installer.exe -L https://developer.download.nvidia.com/compute/cuda/11.8.0/network_installers/cuda_11.8.0_windows_network.exe
- cuda_installer.exe -s cudart_11.8 cublas_11.8 cufft_11.8 cupti_11.8 nvrtc_11.8
+ curl -s -o cuda_installer.exe -L https://developer.download.nvidia.com/compute/cuda/12.4.1/network_installers/cuda_12.4.1_windows_network.exe
+ cuda_installer.exe -s cudart_12.4 cublas_12.4 cufft_12.4 cupti_12.4 nvrtc_12.4
- name: Move CUDA Libraries
shell: cmd
run: |
- move "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.8\extras\CUPTI\lib64\cupti*.dll" vsmlrt-cuda
- move "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.8\bin\*.dll" vsmlrt-cuda
+ move "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\extras\CUPTI\lib64\cupti*.dll" vsmlrt-cuda
+ move "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\bin\*.dll" vsmlrt-cuda
del vsmlrt-cuda\cudart32*.dll
- name: Setup VC commands
@@ -99,7 +77,7 @@ jobs:
while true; do
changed=false
for dll in *.[dD][lL][lL]; do
- for dep in $(dumpbin -dependents "$dll" | grep -o -i '\(vc\|msvc\)[a-z0-9_-]*\.dll'); do
+ for dep in $(dumpbin -dependents "$dll" | grep -o -i '\<\(vc\|msvc\)[a-z0-9_-]*\.dll'); do
echo "finding $dep for $dll"
if ! test -f ./"$dep"; then
changed=true
@@ -113,42 +91,17 @@ jobs:
$changed || break
done
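
The loop above walks DLL dependencies with dumpbin until it reaches a fixed point, so that any VC/MSVC runtime DLLs referenced but not yet present get picked up. The same search sketched in Python, assuming dumpbin.exe is on PATH (e.g. from a VC developer prompt) and the DLLs sit in the working directory; the copy step that follows in the workflow is outside this hunk and omitted here:

import re
import subprocess
from pathlib import Path

def vc_runtime_deps(dll):
    """VC/MSVC runtime DLLs reported by `dumpbin /dependents` for one DLL."""
    out = subprocess.run(
        ["dumpbin", "/dependents", str(dll)],
        capture_output=True, text=True, check=True,
    ).stdout
    return {m.lower() for m in re.findall(r"\b(?:vc|msvc)[a-z0-9_-]*\.dll", out, re.I)}

def missing_runtime_dlls(directory):
    """Repeat the scan until no DLL reports a dependency that is absent."""
    directory = Path(directory)
    missing = set()
    while True:
        changed = False
        for dll in directory.glob("*.dll"):
            for dep in vc_runtime_deps(dll):
                if not (directory / dep).exists() and dep not in missing:
                    missing.add(dep)
                    changed = True
        if not changed:
            return missing

if __name__ == "__main__":
    print(missing_runtime_dlls("."))
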
- - name: Cache zlib
- id: cache-zlib
- uses: actions/cache@v3
- with:
- path: ${{ github.workspace }}/zlib-release
- key: ${{ runner.os }}-zlib-1.2.12
-
- - name: Setup msbuild
- if: steps.cache-zlib.outputs.cache-hit != 'true'
- uses: microsoft/setup-msbuild@v1
-
- - name: Compile zlib
- if: steps.cache-zlib.outputs.cache-hit != 'true'
- shell: cmd
- run: |
- curl -s -o zlib.tar.gz -LJO https://github.com/madler/zlib/archive/refs/tags/v1.2.12.tar.gz
- tar -xf zlib.tar.gz
- cd zlib-1.2.12\contrib\vstudio\vc14
- python -c "f=open('zlibvc.vcxproj','r+b');s=f.read();s=s.replace(b'MultiThreadedDLL',b'MultiThreaded');start=s.index(b'');end_str=b'\n';end=s.index(end_str,start);s=s[:start]+s[end+len(end_str):];f.seek(0);f.write(s);f.truncate();f.close()"
- msbuild zlibvc.sln -target:zlibvc /p:Configuration=ReleaseWithoutAsm /p:Platform=x64 /p:PlatformToolset=v143 -maxCpuCount
- mkdir %GITHUB_WORKSPACE%\zlib-release
- move x64\ZlibDllReleaseWithoutAsm\zlibwapi.dll %GITHUB_WORKSPACE%\zlib-release
-
- - name: Copy zlib
- run: cp -v zlib-release/zlibwapi.dll vsmlrt-cuda
-
- name: Compress
run: |
7z a -t7z -mx=3 vsmlrt-cuda.7z vsmlrt-cuda
- name: Upload
- uses: actions/upload-artifact@v3
+ uses: actions/upload-artifact@v4
with:
name: vsmlrt-cuda
path: vsmlrt-cuda.7z
retention-days: 1
+ compression-level: 0
- name: Rename release asset
run: |
diff --git a/.github/workflows/windows-ncnn.yml b/.github/workflows/windows-ncnn.yml
index 74c93c9..31a425b 100644
--- a/.github/workflows/windows-ncnn.yml
+++ b/.github/workflows/windows-ncnn.yml
@@ -39,7 +39,7 @@ jobs:
steps:
- name: Checkout repo
- uses: actions/checkout@v3
+ uses: actions/checkout@v4
with:
fetch-depth: 0
@@ -48,13 +48,13 @@ jobs:
- name: Cache protobuf
id: cache-protobuf
- uses: actions/cache@v3
+ uses: actions/cache@v4
with:
path: vsncnn/protobuf/install
key: ${{ runner.os }}-vsncnn-protobuf-v3.16.0
- name: Checkout protobuf
- uses: actions/checkout@v3
+ uses: actions/checkout@v4
if: steps.cache-protobuf.outputs.cache-hit != 'true'
with:
repository: protocolbuffers/protobuf
@@ -81,14 +81,14 @@ jobs:
- name: Cache onnx
id: cache-onnx
- uses: actions/cache@v3
+ uses: actions/cache@v4
with:
path: vsncnn/onnx/install
key: ${{ runner.os }}-vsncnn-onnx-v1.12.0-protobuf-v3.16.0
- name: Checkout onnx
if: steps.cache-onnx.outputs.cache-hit != 'true'
- uses: actions/checkout@v3
+ uses: actions/checkout@v4
with:
repository: onnx/onnx
ref: v1.12.0
@@ -161,7 +161,7 @@ jobs:
copy install\bin\vsncnn.dll artifact\
- name: Upload
- uses: actions/upload-artifact@v3
+ uses: actions/upload-artifact@v4
with:
name: VSNCNN-GPU-Windows-x64
path: vsncnn/artifact
diff --git a/.github/workflows/windows-ort.yml b/.github/workflows/windows-ort.yml
index 8261e5d..0aafede 100644
--- a/.github/workflows/windows-ort.yml
+++ b/.github/workflows/windows-ort.yml
@@ -29,7 +29,7 @@ jobs:
steps:
- name: Checkout repo
- uses: actions/checkout@v3
+ uses: actions/checkout@v4
with:
fetch-depth: 0
@@ -41,17 +41,17 @@ jobs:
- name: Cache protobuf
id: cache-protobuf
- uses: actions/cache@v3
+ uses: actions/cache@v4
with:
path: vsort/protobuf/install
key: ${{ runner.os }}-vsort-protobuf-v4
- name: Checkout protobuf
- uses: actions/checkout@v3
+ uses: actions/checkout@v4
if: steps.cache-protobuf.outputs.cache-hit != 'true'
with:
repository: protocolbuffers/protobuf
- # follows protobuf in https://github.com/AmusementClub/onnxruntime/blob/master/cmake/external/onnxruntime_external_deps.cmake#L161
+ # follows protobuf in https://github.com/AmusementClub/onnxruntime/blob/master/cmake/external/onnxruntime_external_deps.cmake#L203
# if you change this, remember to bump the version of the cache key.
ref: v3.21.12
fetch-depth: 1
@@ -73,19 +73,19 @@ jobs:
- name: Cache onnx
id: cache-onnx
- uses: actions/cache@v3
+ uses: actions/cache@v4
with:
path: vsort/onnx/install
- key: ${{ runner.os }}-vsort-onnx-v4
+ key: ${{ runner.os }}-vsort-onnx-v5
- name: Checkout onnx
if: steps.cache-onnx.outputs.cache-hit != 'true'
- uses: actions/checkout@v3
+ uses: actions/checkout@v4
with:
repository: onnx/onnx
# follows onnx in https://github.com/AmusementClub/onnxruntime/tree/master/cmake/external
# if you change this, remember to bump the version of the cache key.
- ref: a0d77f18516d2da7468a96b0de3b737266f23176
+ ref: 990217f043af7222348ca8f0301e17fa7b841781
fetch-depth: 1
path: vsort/onnx
@@ -116,21 +116,21 @@ jobs:
- name: Download ONNX Runtime Precompilation
run: |
- curl -s -o ortgpu.zip -LJO https://github.com/AmusementClub/onnxruntime/releases/download/orttraining_rc2-5943-g73584f936-230528-0922/onnxruntime-gpu-win64.zip
+ curl -s -o ortgpu.zip -LJO https://github.com/AmusementClub/onnxruntime/releases/download/orttraining_rc2-7983-g9001c69b84-240419-0832/onnxruntime-gpu-win64.zip
unzip -q ortgpu.zip
- name: Cache CUDA
id: cache-cuda
- uses: actions/cache@v3
+ uses: actions/cache@v4
with:
path: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA
- key: ${{ runner.os }}-cuda-11.8.0
+ key: ${{ runner.os }}-cuda-12.4.0
- name: Setup CUDA
if: steps.cache-cuda.outputs.cache-hit != 'true'
run: |
- curl -s -o cuda_installer.exe -L https://developer.download.nvidia.com/compute/cuda/11.8.0/network_installers/cuda_11.8.0_windows_network.exe
- cuda_installer.exe -s nvcc_11.8 cudart_11.8
+ curl -s -o cuda_installer.exe -L https://developer.download.nvidia.com/compute/cuda/12.4.0/network_installers/cuda_12.4.0_windows_network.exe
+ cuda_installer.exe -s nvcc_12.4 cudart_12.4
- name: Configure
run: cmake -S . -B build -G Ninja -LA
@@ -139,10 +139,10 @@ jobs:
-D VAPOURSYNTH_INCLUDE_DIRECTORY=vapoursynth\include
-D protobuf_DIR=protobuf\install\cmake
-D ONNX_DIR=onnx\install\lib\cmake\ONNX
- -D ONNX_RUNTIME_API_DIRECTORY=onnxruntime-gpu\include\onnxruntime\core\session
+ -D ONNX_RUNTIME_API_DIRECTORY=onnxruntime-gpu\include\onnxruntime
-D ONNX_RUNTIME_LIB_DIRECTORY=onnxruntime-gpu\lib
-D ENABLE_CUDA=1
- -D CUDAToolkit_ROOT="C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.8"
+ -D CUDAToolkit_ROOT="C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4"
-D ENABLE_DML=1
-D CMAKE_CXX_STANDARD=20
@@ -161,12 +161,12 @@ jobs:
- name: Download DirectML Library
# follows DirectML in https://github.com/AmusementClub/onnxruntime/blob/master/cmake/external/dml.cmake#L44
run: |
- curl -s -o directml.nupkg -LJO https://www.nuget.org/api/v2/package/Microsoft.AI.DirectML/1.12.0
+ curl -s -o directml.nupkg -LJO https://www.nuget.org/api/v2/package/Microsoft.AI.DirectML/1.14.1
unzip -q directml.nupkg -d dml
copy dml\bin\x64-win\DirectML.dll artifact\vsort\
- name: Upload
- uses: actions/upload-artifact@v3
+ uses: actions/upload-artifact@v4
with:
name: VSORT-Windows-x64
path: vsort/artifact
@@ -199,7 +199,7 @@ jobs:
- name: Create script
shell: bash
- run: echo "import vapoursynth as vs;from vapoursynth import core;import sys;print(core.ort, file=sys.stderr);core.std.BlankClip(format=vs.RGBS).ort.Model(r\"waifu2x\\upconv_7_anime_style_art_rgb\\scale2.0x_model.onnx\", builtin=True).resize.Bicubic(format=vs.YUV420P10, matrix_s='709').set_output()" > test.vpy
+ run: echo "import vapoursynth as vs;from vapoursynth import core;import sys;print(core.ort, file=sys.stderr);print(core.ort.Version(),file=sys.stderr);core.std.BlankClip(format=vs.RGBS).ort.Model(r\"waifu2x\\upconv_7_anime_style_art_rgb\\scale2.0x_model.onnx\", builtin=True).resize.Bicubic(format=vs.YUV420P10, matrix_s='709').set_output()" > test.vpy
- name: Run vspipe
shell: bash
@@ -221,8 +221,40 @@ jobs:
shell: bash
run: |
set -ex
- vs_portable/vspipe -i test.vpy -
- vs_portable/vspipe --y4m -p -e 9 test.vpy - | vs_portable/x265 --log-file x265.log --log-file-level info --y4m -D 10 --preset ultrafast -o out.hevc -
+ vs_portable/vspipe -i test_fp16.vpy -
+ vs_portable/vspipe --y4m -p -e 9 test_fp16.vpy - | vs_portable/x265 --log-file x265.log --log-file-level info --y4m -D 10 --preset ultrafast -o out.hevc -
+ ls -l out.hevc x265.log
+ cat x265.log
+ grep -F 'encoded 10 frames' x265.log || exit 2
+ grep -i 'error' x265.log && exit 1
+ exit 0
+
+ - name: Create script (fp16 input)
+ shell: bash
+ run: echo "import vapoursynth as vs;from vapoursynth import core;import sys;print(core.ort, file=sys.stderr);flt=core.std.BlankClip(format=vs.RGBH).ort.Model(r\"waifu2x\\upconv_7_anime_style_art_rgb\\scale2.0x_model.onnx\", builtin=True, fp16=True);print(flt,file=sys.stderr);flt.resize.Bicubic(format=vs.YUV420P10, matrix_s='709').set_output()" > test_fp16_input.vpy
+
+ - name: Run vspipe (fp16 input)
+ shell: bash
+ run: |
+ set -ex
+ vs_portable/vspipe -i test_fp16_input.vpy -
+ vs_portable/vspipe --y4m -p -e 9 test_fp16_input.vpy - | vs_portable/x265 --log-file x265.log --log-file-level info --y4m -D 10 --preset ultrafast -o out.hevc -
+ ls -l out.hevc x265.log
+ cat x265.log
+ grep -F 'encoded 10 frames' x265.log || exit 2
+ grep -i 'error' x265.log && exit 1
+ exit 0
+
+ - name: Create script (fp16 output)
+ shell: bash
+ run: echo "import vapoursynth as vs;from vapoursynth import core;import sys;print(core.ort, file=sys.stderr);flt=core.std.BlankClip(format=vs.RGBS).ort.Model(r\"waifu2x\\upconv_7_anime_style_art_rgb\\scale2.0x_model.onnx\", builtin=True, fp16=True, output_format=1);print(flt,file=sys.stderr);flt.resize.Bicubic(format=vs.YUV420P10, matrix_s='709').set_output()" > test_fp16_output.vpy
+
+ - name: Run vspipe (fp16 output)
+ shell: bash
+ run: |
+ set -ex
+ vs_portable/vspipe -i test_fp16_output.vpy -
+ vs_portable/vspipe --y4m -p -e 9 test_fp16_output.vpy - | vs_portable/x265 --log-file x265.log --log-file-level info --y4m -D 10 --preset ultrafast -o out.hevc -
ls -l out.hevc x265.log
cat x265.log
grep -F 'encoded 10 frames' x265.log || exit 2
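
For reference, the fp16-input one-liner created above is easier to read when expanded into an ordinary VapourSynth script; this is the same script reformatted, with the model path and options unchanged:

import sys
import vapoursynth as vs
from vapoursynth import core

print(core.ort, file=sys.stderr)

flt = core.std.BlankClip(format=vs.RGBH).ort.Model(
    r"waifu2x\upconv_7_anime_style_art_rgb\scale2.0x_model.onnx",
    builtin=True,
    fp16=True,
)
print(flt, file=sys.stderr)

flt.resize.Bicubic(format=vs.YUV420P10, matrix_s="709").set_output()
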
diff --git a/.github/workflows/windows-ov.yml b/.github/workflows/windows-ov.yml
index 3dfae3d..a47fc91 100644
--- a/.github/workflows/windows-ov.yml
+++ b/.github/workflows/windows-ov.yml
@@ -39,7 +39,7 @@ jobs:
steps:
- name: Checkout repo
- uses: actions/checkout@v3
+ uses: actions/checkout@v4
with:
fetch-depth: 0
@@ -51,19 +51,19 @@ jobs:
- name: Cache protobuf
id: cache-protobuf
- uses: actions/cache@v3
+ uses: actions/cache@v4
with:
path: vsov/protobuf/install
- key: ${{ runner.os }}-vsov-protobuf-v1
+ key: ${{ runner.os }}-vsov-protobuf-v2
- name: Checkout protobuf
- uses: actions/checkout@v3
+ uses: actions/checkout@v4
if: steps.cache-protobuf.outputs.cache-hit != 'true'
with:
repository: protocolbuffers/protobuf
- # follows protobuf in https://github.com/AmusementClub/openvino/tree/master/thirdparty/protobuf
+ # follows protobuf in https://github.com/openvinotoolkit/openvino/tree/2024.0.0/thirdparty/protobuf
# if you change this, remember to bump the version of the cache key.
- ref: 6c6b0778b70f35f93c2f0dee30e5d12ad2a83eea
+ ref: fe271ab76f2ad2b2b28c10443865d2af21e27e0e
fetch-depth: 1
path: vsov/protobuf
@@ -83,19 +83,19 @@ jobs:
- name: Cache onnx
id: cache-onnx
- uses: actions/cache@v3
+ uses: actions/cache@v4
with:
path: vsov/onnx/install
- key: ${{ runner.os }}-vsov-onnx-v2
+ key: ${{ runner.os }}-vsov-onnx-v3
- name: Checkout onnx
if: steps.cache-onnx.outputs.cache-hit != 'true'
- uses: actions/checkout@v3
+ uses: actions/checkout@v4
with:
repository: onnx/onnx
- # follows onnx in https://github.com/AmusementClub/openvino/tree/master/thirdparty/onnx
+ # follows onnx in https://github.com/openvinotoolkit/openvino/tree/2024.0.0/thirdparty/onnx
# if you change this, remember to bump the version of the cache key.
- ref: f7ee1ac60d06abe8e26c9b6bbe1e3db5286b614b
+ ref: b86cc54efce19530fb953e4b21f57e6b3888534c
fetch-depth: 1
path: vsov/onnx
@@ -124,23 +124,16 @@ jobs:
unzip -q vs.zip
mv vapoursynth-*/ vapoursynth/
- - name: Download OpenVINO Runtime Precompilation
- shell: bash
+ - name: Setup OpenVINO
run: |
- rev="${{github.event.inputs.ov_tag || inputs.ov_tag || 'latest'}}"
- if [ "$rev" == "latest" ]; then
- url="https://github.com/AmusementClub/openvino/releases/latest/download/openvino-gpu-win64.zip"
- else
- url="https://github.com/AmusementClub/openvino/releases/download/$rev/openvino-gpu-win64.zip"
- fi
- curl -s -o openvino.zip -LJO "$url"
- unzip -q openvino.zip
+ curl -L -o ov.zip https://storage.openvinotoolkit.org/repositories/openvino/packages/2024.0/windows/w_openvino_toolkit_windows_2024.0.0.14509.34caeefd078_x86_64.zip
+ unzip ov.zip
+ rename w_openvino_toolkit_windows_2024.0.0.14509.34caeefd078_x86_64 openvino
- name: Configure
run: cmake -S . -B build -G Ninja -D CMAKE_BUILD_TYPE=Release
-D CMAKE_INTERPROCEDURAL_OPTIMIZATION=ON
-D CMAKE_MSVC_RUNTIME_LIBRARY=MultiThreaded
- -D InferenceEngine_DIR=openvino/runtime/cmake
-D OpenVINO_DIR=openvino/runtime/cmake
-D VAPOURSYNTH_INCLUDE_DIRECTORY="%cd%\vapoursynth\include"
-D ENABLE_VISUALIZATION=ON
@@ -156,12 +149,12 @@ jobs:
cmake --install build --prefix install
mkdir artifact
mkdir artifact\vsov
- copy openvino\runtime\3rdparty\tbb\bin\tbb.dll artifact\vsov\
+ copy openvino\runtime\3rdparty\tbb\bin\tbb12.dll artifact\vsov\
copy install\bin\vsov.dll artifact\
xcopy openvino\runtime\bin\intel64\Release\* artifact\vsov\ /s
- name: Upload
- uses: actions/upload-artifact@v3
+ uses: actions/upload-artifact@v4
with:
name: VSOV-Windows-x64
path: vsov/artifact
diff --git a/.github/workflows/windows-release.yml b/.github/workflows/windows-release.yml
index 64318d3..07687b3 100644
--- a/.github/workflows/windows-release.yml
+++ b/.github/workflows/windows-release.yml
@@ -66,7 +66,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Checkout repo
- uses: actions/checkout@v3
+ uses: actions/checkout@v4
- name: Compress scirpts.7z
run: |
@@ -74,7 +74,7 @@ jobs:
7za a -t7z -bb3 -mx=9 ../scripts.${{ github.event.inputs.tag }}.7z .
- name: Upload scripts release
- uses: actions/upload-artifact@v3
+ uses: actions/upload-artifact@v4
with:
name: Scripts
path: scripts
@@ -116,11 +116,12 @@ jobs:
7za a -t7z -bb3 -mx=9 ../models.7z .
- name: Upload model release
- uses: actions/upload-artifact@v3
+ uses: actions/upload-artifact@v4
with:
name: Models
path: release
retention-days: 1
+ compression-level: 0
- name: Download External Models
run: |
@@ -147,11 +148,44 @@ jobs:
7za a -t7z -bb3 -mx=9 ../ext-models.7z .
- name: Upload external model release
- uses: actions/upload-artifact@v3
+ uses: actions/upload-artifact@v4
with:
name: External-Models
path: release
retention-days: 1
+ compression-level: 0
+
+ - name: Download Contributed Models
+ run: |
+ rm -rf release
+ set -ex
+ mkdir -p release/models
+ cd release
+ pushd models
+ for tag in $(echo "${{ github.event.inputs.contrib-model-tags }}" | tr ',' ' '); do
+ echo "Handling tag $tag"
+ curl -s https://api.github.com/repos/AmusementClub/vs-mlrt/releases/tags/"$tag" > release.json
+ for url in $(cat release.json | jq '.assets | .[] | .url ' | tr -d '"'); do
+ echo "Downloading $url"
+ curl -o dl.7z -LJ -H 'Accept: application/octet-stream' "$url"
+ # later release should overwrite earlier ones
+ 7za x -y dl.7z
+ done
+ #test -f "dl.7z" # contrib-models might be empty.
+ rm -f dl.7z release.json
+ done
+ popd
+ ls -lR
+ du -sh
+ 7za a -t7z -bb3 -mx=9 ../contrib-models.7z .
+
+ - name: Upload contrib model release
+ uses: actions/upload-artifact@v4
+ with:
+ name: Contrib-Models
+ path: release
+ retention-days: 1
+ compression-level: 0
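
The added download step enumerates every asset of each contributed-model tag through the GitHub API and unpacks the archives in order, so later releases overwrite earlier ones. A Python sketch of the same loop; the tag names are placeholders for the `contrib-model-tags` workflow input, and extraction is still delegated to 7za:

import json
import subprocess
import urllib.request

API = "https://api.github.com/repos/AmusementClub/vs-mlrt/releases/tags/{tag}"

def download_release_assets(tag):
    with urllib.request.urlopen(API.format(tag=tag)) as resp:
        release = json.load(resp)
    for asset in release.get("assets", []):
        req = urllib.request.Request(
            asset["url"], headers={"Accept": "application/octet-stream"}
        )
        with urllib.request.urlopen(req) as resp, open("dl.7z", "wb") as f:
            f.write(resp.read())
        # later archives overwrite files from earlier ones, as in the workflow
        subprocess.run(["7za", "x", "-y", "dl.7z"], check=True)

for tag in "contrib-model-tag-a,contrib-model-tag-b".split(","):  # placeholder tags
    download_release_assets(tag.strip())
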
- name: Download Contributed Models
run: |
@@ -212,43 +246,43 @@ jobs:
steps:
- name: Download artifact for scripts
- uses: actions/download-artifact@v3
+ uses: actions/download-artifact@v4
with:
name: Scripts
path: scripts-release
- name: Download artifact for models
- uses: actions/download-artifact@v3
+ uses: actions/download-artifact@v4
with:
name: Models
path: models-release
- name: Download artifact for vsov
- uses: actions/download-artifact@v3
+ uses: actions/download-artifact@v4
with:
name: VSOV-Windows-x64
path: vsov-release
- name: Download artifact for vsort
- uses: actions/download-artifact@v3
+ uses: actions/download-artifact@v4
with:
name: VSORT-Windows-x64
path: vsort-release
- name: Download artifact for vstrt
- uses: actions/download-artifact@v3
+ uses: actions/download-artifact@v4
with:
name: VSTRT-Windows-x64
path: vstrt-release
- name: Download artifact for vsncnn
- uses: actions/download-artifact@v3
+ uses: actions/download-artifact@v4
with:
name: VSNCNN-GPU-Windows-x64
path: vsncnn-release
- name: Download artifact for cuda dependencies
- uses: actions/download-artifact@v3
+ uses: actions/download-artifact@v4
with:
name: vsmlrt-cuda
path: cuda-release
@@ -267,12 +301,13 @@ jobs:
7za a -t7z -bb3 -mx=9 ../vsmlrt-windows-x64-cpu.7z .
- name: Upload CPU-only release
- uses: actions/upload-artifact@v3
+ uses: actions/upload-artifact@v4
if: false
with:
name: vsmlrt-cpu-release
path: vsmlrt-windows-x64-cpu.7z
retention-days: 1
+ compression-level: 0
- name: Rename release asset
run: mv vsmlrt-windows-x64-cpu.7z vsmlrt-windows-x64-cpu.${{ github.event.inputs.tag }}.7z
@@ -300,18 +335,19 @@ jobs:
ls -lR
7za a -t7z -bb3 -mx=9 ../vsmlrt-windows-x64-generic-gpu.7z .
- - name: Upload non-CUDA GPU release
- uses: actions/upload-artifact@v3
+ - name: Upload generic GPU release
+ uses: actions/upload-artifact@v4
if: false
with:
name: vsmlrt-generic-gpu-release
path: vsmlrt-windows-x64-generic-gpu.7z
retention-days: 1
+ compression-level: 0
- - name: Rename release asset for non-CUDA GPU release
+ - name: Rename release asset for generic GPU release
run: mv vsmlrt-windows-x64-generic-gpu.7z vsmlrt-windows-x64-generic-gpu.${{ github.event.inputs.tag }}.7z
- - name: Release non-CUDA GPU
+ - name: Release generic GPU
uses: softprops/action-gh-release@v1
with:
tag_name: ${{ github.event.inputs.tag }}
@@ -342,12 +378,13 @@ jobs:
7za a -t7z -bb3 -mx=9 ../vsmlrt-windows-x64-cuda.7z .
- name: Upload CUDA release
- uses: actions/upload-artifact@v3
+ uses: actions/upload-artifact@v4
if: false
with:
name: vsmlrt-cuda-release
path: vsmlrt-windows-x64-cuda.7z
retention-days: 1
+ compression-level: 0
- name: Rename release asset
run: mv vsmlrt-windows-x64-cuda.7z vsmlrt-windows-x64-cuda.${{ github.event.inputs.tag }}.7z
@@ -364,7 +401,7 @@ jobs:
# Update nightly tag.
- name: Checkout repo
if: github.event.inputs.tag == 'nightly'
- uses: actions/checkout@v3
+ uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Overwrite tag
diff --git a/.github/workflows/windows-trt.yml b/.github/workflows/windows-trt.yml
index 735e6eb..5105ab1 100644
--- a/.github/workflows/windows-trt.yml
+++ b/.github/workflows/windows-trt.yml
@@ -31,7 +31,7 @@ jobs:
steps:
- name: Checkout repo
- uses: actions/checkout@v3
+ uses: actions/checkout@v4
with:
fetch-depth: 0
@@ -43,24 +43,23 @@ jobs:
- name: Cache CUDA
id: cache-cuda
- uses: actions/cache@v3
+ uses: actions/cache@v4
with:
path: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA
- key: ${{ runner.os }}-vstrt-cuda-11.8.0
+ key: ${{ runner.os }}-vstrt-cuda-12.4.1
+ save-always: true
- name: Setup CUDA
if: steps.cache-cuda.outputs.cache-hit != 'true'
run: |
- curl -s -o cuda_installer.exe -L https://developer.download.nvidia.com/compute/cuda/11.8.0/network_installers/cuda_11.8.0_windows_network.exe
- cuda_installer.exe -s nvcc_11.8 cudart_11.8 nvprof_11.8 cuda_profiler_api_11.8
+ curl -s -o cuda_installer.exe -L https://developer.download.nvidia.com/compute/cuda/12.4.1/network_installers/cuda_12.4.1_windows_network.exe
+ cuda_installer.exe -s nvcc_12.4 cudart_12.4 cuda_profiler_api_12.4
- - name: Checkout tensorrt
- uses: actions/checkout@v3
- with:
- repository: AmusementClub/cuda
- token: ${{ secrets.REPO_TOKEN }}
- ref: tensorrt-8.5.1
- path: tensorrt
+ - name: Download TensorRT
+ run: |
+ curl -L -o trt.zip https://developer.download.nvidia.com/compute/machine-learning/tensorrt/10.0.1/zip/TensorRT-10.0.1.6.Windows10.win10.cuda-12.4.zip
+ unzip trt.zip
+ mv TensorRT-*/ tensorrt/
- name: Download VapourSynth headers
run: |
@@ -72,10 +71,11 @@ jobs:
run: cmake -S . -B build -G Ninja -LA
-D CMAKE_BUILD_TYPE=Release
-D CMAKE_MSVC_RUNTIME_LIBRARY=MultiThreaded
- -D CUDAToolkit_ROOT="C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.8"
+ -D CUDAToolkit_ROOT="C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4"
-D VAPOURSYNTH_INCLUDE_DIRECTORY="%cd%\vapoursynth\include"
- -D TENSORRT_HOME="%cd%\..\tensorrt\tensorrt"
+ -D TENSORRT_HOME="%cd%\tensorrt"
-D USE_NVINFER_PLUGIN=ON
+ -D TENSORRT_LIBRARY_SUFFIX="_10"
- name: Build
run: cmake --build build --config Release --verbose
@@ -84,15 +84,17 @@ jobs:
run: cmake --install build --prefix install
- name: Checkout TensorRT OSS
- uses: actions/checkout@v3
+ uses: actions/checkout@v4
with:
repository: NVIDIA/TensorRT
- ref: 8.5.1
+ ref: release/10.0
fetch-depth: 1
path: tensorrt-oss
- name: Override trtexec CMake file
run: |
+ cp -f -r -v tensorrt/samples ../tensorrt-oss
+
mv trtexec/CMakeLists.txt ../tensorrt-oss/samples/trtexec
mv trtexec/*.cpp ../tensorrt-oss/samples/trtexec
mv trtexec/*.manifest ../tensorrt-oss/samples/trtexec
@@ -101,8 +103,9 @@ jobs:
run: cmake -S ../tensorrt-oss/samples/trtexec -B build_trtexec -G Ninja
-D CMAKE_BUILD_TYPE=Release
-D CMAKE_MSVC_RUNTIME_LIBRARY=MultiThreaded
- -D CUDAToolkit_ROOT="C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.8"
- -D TENSORRT_HOME="%cd%\..\tensorrt\tensorrt"
+ -D CUDAToolkit_ROOT="C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4"
+ -D TENSORRT_HOME="%cd%\tensorrt"
+ -D TENSORRT_LIBRARY_SUFFIX="_10"
- name: Build trtexec
run: cmake --build build_trtexec --verbose
@@ -121,7 +124,7 @@ jobs:
run: git describe --tags --long
- name: Upload
- uses: actions/upload-artifact@v3
+ uses: actions/upload-artifact@v4
with:
name: VSTRT-Windows-x64
path: vstrt/artifact
diff --git a/common/convert_float_to_float16.cpp b/common/convert_float_to_float16.cpp
index 0cd0294..8a05651 100644
--- a/common/convert_float_to_float16.cpp
+++ b/common/convert_float_to_float16.cpp
@@ -6,7 +6,6 @@
#include
#include
#include
-#include
#include
#include
#include
@@ -19,11 +18,7 @@
#include
-
-void convert_float_to_float16(
- ONNX_NAMESPACE::ModelProto & model,
- bool force_fp16_initializers
-) noexcept;
+#include "convert_float_to_float16.h"
namespace {
@@ -298,12 +293,14 @@ static ONNX_NAMESPACE::ValueInfoProto make_value_info_from_tensor(
void convert_float_to_float16(
ONNX_NAMESPACE::ModelProto & model,
- bool force_fp16_initializers
- // , bool keep_io_types = True
- // , bool disable_shape_infer = True
- // , const std::optional<std::unordered_set<std::string>> op_block_list = DEFAULT_OP_BLOCK_LIST
- // , const std::optional<std::unordered_set<std::string>> op_block_list = {}
- , const std::unordered_set<std::string> & op_block_list
+ bool force_fp16_initializers,
+ // bool keep_io_types = True,
+ // bool disable_shape_infer = True,
+ // const std::optional<std::unordered_set<std::string>> op_block_list = DEFAULT_OP_BLOCK_LIST,
+ // const std::optional<std::unordered_set<std::string>> op_block_list = {},
+ const std::unordered_set<std::string> & op_block_list,
+ bool cast_input,
+ bool cast_output
) noexcept {
std::vector<ONNX_NAMESPACE::ValueInfoProto> value_info_list {};
@@ -312,97 +309,101 @@ void convert_float_to_float16(
std::unordered_map<std::string, std::string> name_mapping {};
std::unordered_set<std::string> graph_io_to_skip {};
- const std::vector<std::string> fp32_inputs = [&]() {
- std::vector<std::string> ret {};
+ if (cast_input) {
+ const std::vector<std::string> fp32_inputs = [&]() {
+ std::vector<std::string> ret {};
- for (const auto & n : model.graph().input()) {
- if (n.type().tensor_type().elem_type() == ONNX_NAMESPACE::TensorProto::FLOAT) {
- ret.emplace_back(n.name());
+ for (const auto & n : model.graph().input()) {
+ if (n.type().tensor_type().elem_type() == ONNX_NAMESPACE::TensorProto::FLOAT) {
+ ret.emplace_back(n.name());
+ }
}
- }
- return ret;
- }();
-
- for (const auto & n : model.graph().input()) {
- if (auto idx = std::find(std::cbegin(fp32_inputs), std::cend(fp32_inputs), n.name());
- idx != std::cend(fp32_inputs)
- ) {
- const auto i = idx - std::cbegin(fp32_inputs);
- std::string node_name = "graph_input_cast_" + std::to_string(i);
- name_mapping.emplace(n.name(), node_name);
- graph_io_to_skip.emplace(n.name());
-
- auto * new_value_info = model.mutable_graph()->mutable_value_info()->Add();
- new_value_info->CopyFrom(n);
- new_value_info->set_name(node_name);
- new_value_info->mutable_type()->mutable_tensor_type()->set_elem_type(
- ONNX_NAMESPACE::TensorProto::FLOAT16
- );
- // add Cast node (from tensor(float) to tensor(float16) after graph input
- for (auto & node : *model.mutable_graph()->mutable_node()) {
- for (auto & input : *node.mutable_input()) {
- if (input == n.name()) {
- input = node_name;
+ return ret;
+ }();
+
+ for (const auto & n : model.graph().input()) {
+ if (auto idx = std::find(std::cbegin(fp32_inputs), std::cend(fp32_inputs), n.name());
+ idx != std::cend(fp32_inputs)
+ ) {
+ const auto i = idx - std::cbegin(fp32_inputs);
+ std::string node_name = "graph_input_cast_" + std::to_string(i);
+ name_mapping.emplace(n.name(), node_name);
+ graph_io_to_skip.emplace(n.name());
+
+ auto * new_value_info = model.mutable_graph()->mutable_value_info()->Add();
+ new_value_info->CopyFrom(n);
+ new_value_info->set_name(node_name);
+ new_value_info->mutable_type()->mutable_tensor_type()->set_elem_type(
+ ONNX_NAMESPACE::TensorProto::FLOAT16
+ );
+ // add Cast node (from tensor(float) to tensor(float16) after graph input
+ for (auto & node : *model.mutable_graph()->mutable_node()) {
+ for (auto & input : *node.mutable_input()) {
+ if (input == n.name()) {
+ input = node_name;
+ }
}
}
+ auto new_node = make_node(
+ "Cast", {n.name()}, {node_name}, node_name,
+ "to", ONNX_NAMESPACE::TensorProto::FLOAT16
+ );
+ model.mutable_graph()->mutable_node()->Add();
+ for (int i = model.graph().node_size() - 2; i >= 0; --i) {
+ model.mutable_graph()->mutable_node()->SwapElements(i, i + 1);
+ }
+ *model.mutable_graph()->mutable_node(0) = std::move(new_node);
+ value_info_list.emplace_back(*new_value_info);
+ io_casts.emplace(std::move(node_name));
}
- auto new_node = make_node(
- "Cast", {n.name()}, {node_name}, node_name,
- "to", ONNX_NAMESPACE::TensorProto::FLOAT16
- );
- model.mutable_graph()->mutable_node()->Add();
- for (int i = model.graph().node_size() - 2; i >= 0; --i) {
- model.mutable_graph()->mutable_node()->SwapElements(i, i + 1);
- }
- *model.mutable_graph()->mutable_node(0) = std::move(new_node);
- value_info_list.emplace_back(*new_value_info);
- io_casts.emplace(std::move(node_name));
}
}
- const std::vector<std::string> fp32_outputs = [&]() {
- std::vector<std::string> ret {};
+ if (cast_output) {
+ const std::vector<std::string> fp32_outputs = [&]() {
+ std::vector<std::string> ret {};
- for (const auto & n : model.graph().output()) {
- if (n.type().tensor_type().elem_type() == ONNX_NAMESPACE::TensorProto::FLOAT) {
- ret.emplace_back(n.name());
+ for (const auto & n : model.graph().output()) {
+ if (n.type().tensor_type().elem_type() == ONNX_NAMESPACE::TensorProto::FLOAT) {
+ ret.emplace_back(n.name());
+ }
}
- }
- return ret;
- }();
-
- for (const auto & n : model.graph().output()) {
- if (auto idx = std::find(std::cbegin(fp32_outputs), std::cend(fp32_outputs), n.name());
- idx != std::cend(fp32_outputs)
- ) {
- const auto i = idx - std::cbegin(fp32_outputs);
- std::string node_name = "graph_output_cast_" + std::to_string(i);
- name_mapping.emplace(n.name(), node_name);
- graph_io_to_skip.emplace(n.name());
-
- auto * new_value_info = model.mutable_graph()->mutable_value_info()->Add();
- new_value_info->CopyFrom(n);
- new_value_info->set_name(node_name);
- new_value_info->mutable_type()->mutable_tensor_type()->set_elem_type(
- ONNX_NAMESPACE::TensorProto::FLOAT16
- );
- // add Cast node (from tensor(float16) to tensor(float) before graph output
- for (auto & node : *model.mutable_graph()->mutable_node()) {
- for (auto & output : *node.mutable_output()) {
- if (output == n.name()) {
- output = node_name;
+ return ret;
+ }();
+
+ for (const auto & n : model.graph().output()) {
+ if (auto idx = std::find(std::cbegin(fp32_outputs), std::cend(fp32_outputs), n.name());
+ idx != std::cend(fp32_outputs)
+ ) {
+ const auto i = idx - std::cbegin(fp32_outputs);
+ std::string node_name = "graph_output_cast_" + std::to_string(i);
+ name_mapping.emplace(n.name(), node_name);
+ graph_io_to_skip.emplace(n.name());
+
+ auto * new_value_info = model.mutable_graph()->mutable_value_info()->Add();
+ new_value_info->CopyFrom(n);
+ new_value_info->set_name(node_name);
+ new_value_info->mutable_type()->mutable_tensor_type()->set_elem_type(
+ ONNX_NAMESPACE::TensorProto::FLOAT16
+ );
+ // add Cast node (from tensor(float16) to tensor(float) before graph output
+ for (auto & node : *model.mutable_graph()->mutable_node()) {
+ for (auto & output : *node.mutable_output()) {
+ if (output == n.name()) {
+ output = node_name;
+ }
}
}
+ auto new_node = make_node(
+ "Cast", {node_name}, {n.name()}, node_name,
+ "to", ONNX_NAMESPACE::TensorProto::FLOAT
+ );
+ model.mutable_graph()->mutable_node()->Add(std::move(new_node));
+ value_info_list.emplace_back(*new_value_info);
+ io_casts.emplace(std::move(node_name));
}
- auto new_node = make_node(
- "Cast", {node_name}, {n.name()}, node_name,
- "to", ONNX_NAMESPACE::TensorProto::FLOAT
- );
- model.mutable_graph()->mutable_node()->Add(std::move(new_node));
- value_info_list.emplace_back(*new_value_info);
- io_casts.emplace(std::move(node_name));
}
}
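
The refactor above only wraps the existing graph-input and graph-output cast insertion in the new `cast_input` / `cast_output` guards. For orientation, the input branch corresponds roughly to the following Python-onnx sketch (a hypothetical helper, not part of the repository), which rewires fp32 graph inputs through a prepended fp16 Cast node:

import onnx
from onnx import TensorProto, helper

def cast_fp32_inputs_to_fp16(model: onnx.ModelProto) -> None:
    graph = model.graph
    for i, inp in enumerate(graph.input):
        if inp.type.tensor_type.elem_type != TensorProto.FLOAT:
            continue
        cast_name = f"graph_input_cast_{i}"
        # consumers of the fp32 graph input now read the casted fp16 tensor
        for node in graph.node:
            for j, name in enumerate(node.input):
                if name == inp.name:
                    node.input[j] = cast_name
        # record the fp16 tensor and prepend the Cast node to the graph
        graph.value_info.append(
            helper.make_tensor_value_info(cast_name, TensorProto.FLOAT16, None)
        )
        graph.node.insert(0, helper.make_node(
            "Cast", [inp.name], [cast_name], name=cast_name, to=TensorProto.FLOAT16
        ))
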
diff --git a/common/convert_float_to_float16.h b/common/convert_float_to_float16.h
new file mode 100644
index 0000000..1c41b4a
--- /dev/null
+++ b/common/convert_float_to_float16.h
@@ -0,0 +1,21 @@
+#ifndef CONVERT_FLOAT_TO_FLOAT16_H
+#define CONVERT_FLOAT_TO_FLOAT16_H
+
+#include <string>
+#include <unordered_set>
+
+#include <onnx/onnx_pb.h>
+
+void convert_float_to_float16(
+ ONNX_NAMESPACE::ModelProto & model,
+ bool force_fp16_initializers,
+ // bool keep_io_types = True,
+ // bool disable_shape_infer = True,
+ // const std::optional<std::unordered_set<std::string>> op_block_list = DEFAULT_OP_BLOCK_LIST,
+ // const std::optional<std::unordered_set<std::string>> op_block_list = {},
+ const std::unordered_set<std::string> & op_block_list,
+ bool cast_input = true,
+ bool cast_output = true
+) noexcept;
+
+#endif
diff --git a/common/onnx_utils.cpp b/common/onnx_utils.cpp
index 9c22898..5de7b67 100644
--- a/common/onnx_utils.cpp
+++ b/common/onnx_utils.cpp
@@ -8,6 +8,8 @@
#include
#include
+#include "onnx_utils.h"
+
using namespace std::string_literals;
diff --git a/common/onnx_utils.h b/common/onnx_utils.h
new file mode 100644
index 0000000..7041ab7
--- /dev/null
+++ b/common/onnx_utils.h
@@ -0,0 +1,18 @@
+#ifndef ONNX_UTILS_H
+#define ONNX_UTILS_H
+
+#include <cstdint>
+#include <string>
+#include <string_view>
+#include <variant>
+
+#include <onnx/onnx_pb.h>
+
+std::variant<std::string, ONNX_NAMESPACE::ModelProto> loadONNX(
+ const std::string_view & path,
+ int64_t tile_w,
+ int64_t tile_h,
+ bool path_is_serialization
+) noexcept;
+
+#endif
diff --git a/scripts/vsmlrt.py b/scripts/vsmlrt.py
index ecccb61..011c2b6 100644
--- a/scripts/vsmlrt.py
+++ b/scripts/vsmlrt.py
@@ -1,4 +1,4 @@
-__version__ = "3.15.55"
+__version__ = "3.20.11"
__all__ = [
"Backend", "BackendV2",
@@ -9,6 +9,8 @@
"CUGAN",
"RIFE", "RIFEModel", "RIFEMerge",
"SAFA", "SAFAModel", "SAFAAdaptiveMode",
+ "SCUNet", "SCUNetModel",
+ "SwinIR", "SwinIRModel",
"inference"
]
@@ -18,6 +20,8 @@
from fractions import Fraction
import math
import os
+import os.path
+import platform
import subprocess
import sys
import tempfile
@@ -75,6 +79,18 @@ class ORT_CUDA:
basic performance tuning:
set fp16 = True (on RTX GPUs)
+
+ Semantics of `fp16`:
+ Enabling `fp16` will use a built-in quantization that converts a fp32 onnx to a fp16 onnx.
+ If the input video is of half-precision floating-point format,
+ the generated fp16 onnx will use fp16 input.
+ The output format can be controlled by the `output_format` option (0 = fp32, 1 = fp16).
+
+ Disabling `fp16` will not use the built-in quantization.
+ However, if the onnx file itself uses fp16 for computation,
+ the actual computation will be done in fp16.
+ In this case, the input video format should match the input format of the onnx,
+ and the output format is inferred from the onnx.
"""
device_id: int = 0
@@ -84,6 +100,9 @@ class ORT_CUDA:
fp16: bool = False
use_cuda_graph: bool = False # preview, not supported by all models
fp16_blacklist_ops: typing.Optional[typing.Sequence[str]] = None
+ prefer_nhwc: bool = False
+ output_format: int = 0 # 0: fp32, 1: fp16
+ tf32: bool = False
# internal backend attributes
supports_onnx_serialization: bool = True
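
A short usage sketch of the fp16 semantics described in the docstring above, with a half-precision input clip and fp16 output requested via the new `output_format` option; the BlankClip stands in for a real source:

import vapoursynth as vs
from vapoursynth import core
import vsmlrt

src = core.std.BlankClip(format=vs.RGBH)  # half-precision input clip
flt = vsmlrt.Waifu2x(
    src, noise=-1, scale=2,
    backend=vsmlrt.Backend.ORT_CUDA(fp16=True, output_format=1),  # 1 = fp16 output
)
flt.set_output()
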
@@ -122,17 +141,17 @@ class TRT:
opt_shapes: typing.Optional[typing.Tuple[int, int]] = None
fp16: bool = False
device_id: int = 0
- workspace: typing.Optional[int] = 128
+ workspace: typing.Optional[int] = None
verbose: bool = False
use_cuda_graph: bool = False
num_streams: int = 1
use_cublas: bool = False # cuBLAS + cuBLASLt
static_shape: bool = True
- tf32: bool = True
+ tf32: bool = False
log: bool = True
# as of TensorRT 8.4, it can be turned off without performance penalty in most cases
- use_cudnn: bool = True
+ use_cudnn: bool = False # changed to False since vsmlrt.vpy 3.16
use_edge_mask_convolutions: bool = True
use_jit_convolutions: bool = True
heuristic: bool = False # only supported on Ampere+ with TensorRT 8.5+
@@ -141,8 +160,12 @@ class TRT:
faster_dynamic_shapes: bool = True
force_fp16: bool = False
builder_optimization_level: int = 3
+ max_aux_streams: typing.Optional[int] = None
+ short_path: typing.Optional[bool] = None # True on Windows by default, False otherwise
+ bf16: bool = False
custom_env: typing.Dict[str, str] = field(default_factory=lambda: {})
custom_args: typing.List[str] = field(default_factory=lambda: [])
+ engine_folder: typing.Optional[str] = None
# internal backend attributes
supports_onnx_serialization: bool = False
@@ -207,12 +230,21 @@ class MIGX:
fast_math: bool = True
exhaustive_tune: bool = False
+ short_path: typing.Optional[bool] = None # True on Windows by default, False otherwise
custom_env: typing.Dict[str, str] = field(default_factory=lambda: {})
custom_args: typing.List[str] = field(default_factory=lambda: [])
# internal backend attributes
supports_onnx_serialization: bool = False
+ @dataclass(frozen=False)
+ class OV_NPU:
+ """ backend for intel npus
+ """
+
+ # internal backend attributes
+ supports_onnx_serialization: bool = True
+
backendT = typing.Union[
Backend.OV_CPU,
@@ -222,7 +254,8 @@ class MIGX:
Backend.OV_GPU,
Backend.NCNN_VK,
Backend.ORT_DML,
- Backend.MIGX
+ Backend.MIGX,
+ Backend.OV_NPU,
]
@@ -271,7 +304,7 @@ def Waifu2x(
raise ValueError(f'{func_name}: "scale" must be 1, 2 or 4')
if not isinstance(model, int) or model not in Waifu2xModel.__members__.values():
- raise ValueError(f'{func_name}: "model" must be in [0, 9]')
+ raise ValueError(f'{func_name}: invalid "model"')
if model == 0 and noise == 0:
raise ValueError(
@@ -433,7 +466,7 @@ def DPIR(
raise ValueError(f"{func_name}: only constant format 16/32 bit float input supported")
if not isinstance(model, int) or model not in DPIRModel.__members__.values():
- raise ValueError(f'{func_name}: "model" must be 0, 1, 2 or 3')
+ raise ValueError(f'{func_name}: invalid "model"')
if model in [0, 2] and clip.format.color_family != vs.GRAY:
raise ValueError(f'{func_name}: "clip" must be of GRAY color family')
@@ -976,8 +1009,11 @@ def RIFEMerge(
"/Reciprocal:fp32,/Reciprocal_1:fp32,"
"/Mul:fp32,/Mul_1:fp32,/Mul_8:fp32,/Mul_10:fp32,"
"/Sub_5:fp32,/Sub_6:fp32,"
+ # generated by TensorRT's onnx parser
"ONNXTRT_Broadcast_236:fp32,ONNXTRT_Broadcast_238:fp32,"
- "ONNXTRT_Broadcast_273:fp32,ONNXTRT_Broadcast_275:fp32"
+ "ONNXTRT_Broadcast_273:fp32,ONNXTRT_Broadcast_275:fp32,"
+ # TensorRT 9.0 or later
+ "ONNXTRT_Broadcast_*:fp32"
)
])
@@ -1147,6 +1183,14 @@ def handler(n: int, f: vs.VideoFrame) -> vs.VideoNode:
else:
return res
else:
+ if not hasattr(core, 'akarin') or \
+ not hasattr(core.akarin, 'PropExpr') or \
+ not hasattr(core.akarin, 'PickFrames'):
+ raise RuntimeError(
+ 'fractional multi requires plugin akarin '
+ '(https://github.com/AkarinVS/vapoursynth-plugin/releases)'
+ ', version v0.96g or later.')
+
if clip.fps_num == 0 or clip.fps_den == 0:
src_fps = Fraction(1)
else:
@@ -1169,6 +1213,7 @@ def left_func(n: int) -> vs.VideoNode:
left_clip = core.std.FrameEval(temp, left_func)
def right_func(n: int) -> vs.VideoNode:
+ # no out of range access because of function filter_sc
return clip[dst_duration * n // src_duration + 1]
right_clip = core.std.FrameEval(temp, right_func)
@@ -1181,14 +1226,14 @@ def timepoint_func(n: int) -> vs.VideoNode:
return temp_gray.std.BlankClip(color=tp, keep=True)
tp_clip = core.std.FrameEval(temp_gray, timepoint_func)
- output = RIFEMerge(
+ output0 = RIFEMerge(
clipa=left_clip, clipb=right_clip, mask=tp_clip,
scale=scale, tiles=tiles, tilesize=tilesize, overlap=overlap,
model=model, backend=backend, ensemble=ensemble,
_implementation=_implementation
)
- left0 = bits_as(left_clip, output)
+ left0 = bits_as(left_clip, output0)
def filter_sc(n: int, f: vs.VideoFrame) -> vs.VideoNode:
current_time = dst_duration * n
@@ -1200,9 +1245,9 @@ def filter_sc(n: int, f: vs.VideoFrame) -> vs.VideoNode:
):
return left0
else:
- return output
+ return output0
- res = core.std.FrameEval(output, filter_sc, left0)
+ res = core.std.FrameEval(output0, filter_sc, left0)
else:
if not hasattr(core, 'akarin') or \
not hasattr(core.akarin, 'PropExpr') or \
@@ -1373,6 +1418,210 @@ def SAFA(
return clip
+@enum.unique
+class SCUNetModel(enum.IntEnum):
+ scunet_color_15 = 0
+ scunet_color_25 = 1
+ scunet_color_50 = 2
+ scunet_color_real_psnr = 3
+ scunet_color_real_gan = 4
+ scunet_gray_15 = 5
+ scunet_gray_25 = 6
+ scunet_gray_50 = 7
+
+
+def SCUNet(
+ clip: vs.VideoNode,
+ tiles: typing.Optional[typing.Union[int, typing.Tuple[int, int]]] = None,
+ tilesize: typing.Optional[typing.Union[int, typing.Tuple[int, int]]] = None,
+ overlap: typing.Optional[typing.Union[int, typing.Tuple[int, int]]] = None,
+ model: SCUNetModel = SCUNetModel.scunet_color_real_psnr,
+ backend: backendT = Backend.OV_CPU()
+) -> vs.VideoNode:
+ """ Practical Blind Denoising via Swin-Conv-UNet and Data Synthesis
+
+ Unlike vs-scunet v1.0.0, the default model is set to scunet_color_real_psnr due to the color shift.
+ """
+
+ func_name = "vsmlrt.SCUNet"
+
+ if not isinstance(clip, vs.VideoNode):
+ raise TypeError(f'{func_name}: "clip" must be a clip!')
+
+ if clip.format.sample_type != vs.FLOAT or clip.format.bits_per_sample not in [16, 32]:
+ raise ValueError(f"{func_name}: only constant format 16/32 bit float input supported")
+
+ if not isinstance(model, int) or model not in SCUNetModel.__members__.values():
+ raise ValueError(f'{func_name}: invalid "model"')
+
+ if model in range(5) and clip.format.color_family != vs.RGB:
+ raise ValueError(f'{func_name}: "clip" must be of RGB color family')
+ elif model in range(5, 8) and clip.format.color_family != vs.GRAY:
+ raise ValueError(f'{func_name}: "clip" must be of GRAY color family')
+
+ if overlap is None:
+ overlap_w = overlap_h = 16
+ elif isinstance(overlap, int):
+ overlap_w = overlap_h = overlap
+ else:
+ overlap_w, overlap_h = overlap
+
+ multiple = 1
+
+ (tile_w, tile_h), (overlap_w, overlap_h) = calc_tilesize(
+ tiles=tiles, tilesize=tilesize,
+ width=clip.width, height=clip.height,
+ multiple=multiple,
+ overlap_w=overlap_w, overlap_h=overlap_h
+ )
+
+ if tile_w % multiple != 0 or tile_h % multiple != 0:
+ raise ValueError(
+ f'{func_name}: tile size must be divisible by {multiple} ({tile_w}, {tile_h})'
+ )
+
+ backend = init_backend(
+ backend=backend,
+ trt_opt_shapes=(tile_w, tile_h)
+ )
+
+ network_path = os.path.join(
+ models_path,
+ "scunet",
+ f"{tuple(SCUNetModel.__members__)[model]}.onnx"
+ )
+
+ clip = inference_with_fallback(
+ clips=[clip], network_path=network_path,
+ overlap=(overlap_w, overlap_h), tilesize=(tile_w, tile_h),
+ backend=backend
+ )
+
+ return clip
+
+
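
A minimal usage sketch for the wrapper defined above; `src` must be a float RGB clip for the color models (GRAY for the scunet_gray_* ones), and the backend choice is illustrative:

import vapoursynth as vs
from vapoursynth import core
import vsmlrt

src = core.std.BlankClip(format=vs.RGBS)  # placeholder for a real source
flt = vsmlrt.SCUNet(
    src,
    model=vsmlrt.SCUNetModel.scunet_color_real_psnr,
    backend=vsmlrt.Backend.OV_CPU(),
)
flt.set_output()
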
+@enum.unique
+class SwinIRModel(enum.IntEnum):
+ lightweightSR_DIV2K_s64w8_SwinIR_S_x2 = 0
+ lightweightSR_DIV2K_s64w8_SwinIR_S_x3 = 1
+ lightweightSR_DIV2K_s64w8_SwinIR_S_x4 = 2
+ realSR_BSRGAN_DFOWMFC_s64w8_SwinIR_L_x4_GAN = 3
+ # unused
+ realSR_BSRGAN_DFOWMFC_s64w8_SwinIR_L_x4_PSNR = 5
+ classicalSR_DF2K_s64w8_SwinIR_M_x2 = 6
+ classicalSR_DF2K_s64w8_SwinIR_M_x3 = 7
+ classicalSR_DF2K_s64w8_SwinIR_M_x4 = 8
+ classicalSR_DF2K_s64w8_SwinIR_M_x8 = 9
+ realSR_BSRGAN_DFO_s64w8_SwinIR_M_x2_GAN = 10
+ realSR_BSRGAN_DFO_s64w8_SwinIR_M_x2_PSNR = 11
+ realSR_BSRGAN_DFO_s64w8_SwinIR_M_x4_GAN = 12
+ realSR_BSRGAN_DFO_s64w8_SwinIR_M_x4_PSNR = 13
+ grayDN_DFWB_s128w8_SwinIR_M_noise15 = 14
+ grayDN_DFWB_s128w8_SwinIR_M_noise25 = 15
+ grayDN_DFWB_s128w8_SwinIR_M_noise50 = 16
+ colorDN_DFWB_s128w8_SwinIR_M_noise15 = 17
+ colorDN_DFWB_s128w8_SwinIR_M_noise25 = 18
+ colorDN_DFWB_s128w8_SwinIR_M_noise50 = 19
+ CAR_DFWB_s126w7_SwinIR_M_jpeg10 = 20
+ CAR_DFWB_s126w7_SwinIR_M_jpeg20 = 21
+ CAR_DFWB_s126w7_SwinIR_M_jpeg30 = 22
+ CAR_DFWB_s126w7_SwinIR_M_jpeg40 = 23
+ colorCAR_DFWB_s126w7_SwinIR_M_jpeg10 = 24
+ colorCAR_DFWB_s126w7_SwinIR_M_jpeg20 = 25
+ colorCAR_DFWB_s126w7_SwinIR_M_jpeg30 = 26
+ colorCAR_DFWB_s126w7_SwinIR_M_jpeg40 = 27
+
+
+def SwinIR(
+ clip: vs.VideoNode,
+ tiles: typing.Optional[typing.Union[int, typing.Tuple[int, int]]] = None,
+ tilesize: typing.Optional[typing.Union[int, typing.Tuple[int, int]]] = None,
+ overlap: typing.Optional[typing.Union[int, typing.Tuple[int, int]]] = None,
+ model: SwinIRModel = SwinIRModel.lightweightSR_DIV2K_s64w8_SwinIR_S_x2,
+ backend: backendT = Backend.OV_CPU()
+) -> vs.VideoNode:
+ """ SwinIR: Image Restoration Using Swin Transformer """
+
+ func_name = "vsmlrt.SwinIR"
+
+ if not isinstance(clip, vs.VideoNode):
+ raise TypeError(f'{func_name}: "clip" must be a clip!')
+
+ if clip.format.sample_type != vs.FLOAT or clip.format.bits_per_sample not in [16, 32]:
+ raise ValueError(f"{func_name}: only constant format 16/32 bit float input supported")
+
+ if not isinstance(model, int) or model not in SwinIRModel.__members__.values():
+ raise ValueError(f'{func_name}: invalid "model"')
+
+ if model in range(14, 17) or model in range(20, 24):
+ if clip.format.color_family != vs.GRAY:
+ raise ValueError(f'{func_name}: "clip" must be of GRAY color family')
+ elif clip.format.color_family != vs.RGB:
+ raise ValueError(f'{func_name}: "clip" must be of RGB color family')
+
+ if overlap is None:
+ overlap_w = overlap_h = 16
+ elif isinstance(overlap, int):
+ overlap_w = overlap_h = overlap
+ else:
+ overlap_w, overlap_h = overlap
+
+ multiple = 1
+
+ (tile_w, tile_h), (overlap_w, overlap_h) = calc_tilesize(
+ tiles=tiles, tilesize=tilesize,
+ width=clip.width, height=clip.height,
+ multiple=multiple,
+ overlap_w=overlap_w, overlap_h=overlap_h
+ )
+
+ if tile_w % multiple != 0 or tile_h % multiple != 0:
+ raise ValueError(
+ f'{func_name}: tile size must be divisible by {multiple} ({tile_w}, {tile_h})'
+ )
+
+ backend = init_backend(
+ backend=backend,
+ trt_opt_shapes=(tile_w, tile_h)
+ )
+
+ if model < 4:
+ model_name = tuple(SwinIRModel.__members__)[model]
+ else:
+ model_name = tuple(SwinIRModel.__members__)[model - 1]
+
+ model_name = model_name.replace("SwinIR_", "SwinIR-")
+
+ if model in range(3):
+ model_name = f"002_{model_name}"
+ elif model in (3, 5):
+ model_name = f"003_{model_name}"
+ elif model in range(6, 10):
+ model_name = f"001_{model_name}"
+ elif model in range(10, 14):
+ model_name = f"003_{model_name}"
+ elif model in range(14, 17):
+ model_name = f"004_{model_name}"
+ elif model in range(17, 20):
+ model_name = f"005_{model_name}"
+ elif model in range(20, 28):
+ model_name = f"006_{model_name}"
+
+ network_path = os.path.join(
+ models_path,
+ "swinir",
+ f"{model_name}.onnx"
+ )
+
+ clip = inference_with_fallback(
+ clips=[clip], network_path=network_path,
+ overlap=(overlap_w, overlap_h), tilesize=(tile_w, tile_h),
+ backend=backend
+ )
+
+ return clip
+
+
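
A matching usage sketch for the SwinIR wrapper above; the gray denoising/CAR models expect GRAY input, everything else RGB, and the numeric 001_–006_ prefixes are added internally when resolving the on-disk onnx file name. Backend choice is illustrative:

import vapoursynth as vs
from vapoursynth import core
import vsmlrt

src = core.std.BlankClip(format=vs.RGBS)  # placeholder for a real source
flt = vsmlrt.SwinIR(
    src,
    model=vsmlrt.SwinIRModel.realSR_BSRGAN_DFOWMFC_s64w8_SwinIR_L_x4_GAN,
    backend=vsmlrt.Backend.OV_CPU(),
)
flt.set_output()
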
def get_engine_path(
network_path: str,
min_shapes: typing.Tuple[int, int],
@@ -1386,7 +1635,12 @@ def get_engine_path(
tf32: bool,
use_cudnn: bool,
input_format: int,
- output_format: int
+ output_format: int,
+ builder_optimization_level: int,
+ max_aux_streams: typing.Optional[int],
+ short_path: typing.Optional[bool],
+ bf16: bool,
+ engine_folder: typing.Optional[str]
) -> str:
with open(network_path, "rb") as file:
@@ -1401,30 +1655,42 @@ def get_engine_path(
device_name = f"device{device_id}"
if static_shape:
- shape_str = f".{opt_shapes[0]}x{opt_shapes[1]}"
+ shape_str = f"{opt_shapes[0]}x{opt_shapes[1]}"
else:
shape_str = (
- f".min{min_shapes[0]}x{min_shapes[1]}"
+ f"min{min_shapes[0]}x{min_shapes[1]}"
f"_opt{opt_shapes[0]}x{opt_shapes[1]}"
f"_max{max_shapes[0]}x{max_shapes[1]}"
)
- return (
- network_path +
+ identity = (
shape_str +
("_fp16" if fp16 else "") +
- ("_no-tf32" if not tf32 else "") +
+ ("_tf32" if tf32 else "") +
+ ("_bf16" if bf16 else "") +
(f"_workspace{workspace}" if workspace is not None else "") +
+ f"_opt{builder_optimization_level}" +
+ (f"_max-aux-streams{max_aux_streams}" if max_aux_streams is not None else "") +
f"_trt-{trt_version}" +
("_cublas" if use_cublas else "") +
("_cudnn" if use_cudnn else "") +
"_I-" + ("fp32" if input_format == 0 else "fp16") +
"_O-" + ("fp32" if output_format == 0 else "fp16") +
f"_{device_name}" +
- f"_{checksum:x}" +
- ".engine"
+ f"_{checksum:x}"
)
+ dirname, basename = os.path.split(network_path)
+
+ if engine_folder is not None:
+ os.makedirs(engine_folder, exist_ok=True)
+ dirname = engine_folder
+
+ if short_path or (short_path is None and platform.system() == "Windows"):
+ return os.path.join(dirname, f"{zlib.crc32((basename + identity).encode()):x}.engine")
+ else:
+ return f"{os.path.join(dirname, basename)}.{identity}.engine"
+
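
Engine files are now named by one of two schemes selected by `short_path`, which defaults to True on Windows to stay clear of path-length limits: either the full identity string is appended to the onnx path, or basename plus identity is hashed with crc32. A self-contained sketch of that final step; the identity string below is a made-up example, not the exact format produced by get_engine_path:

import os
import zlib

def engine_name(network_path, identity, short_path):
    dirname, basename = os.path.split(network_path)
    if short_path:
        return os.path.join(dirname, f"{zlib.crc32((basename + identity).encode()):x}.engine")
    return f"{os.path.join(dirname, basename)}.{identity}.engine"

print(engine_name(r"models\dpir\drunet_color.onnx", "1920x1080_fp16_trt-10.0.1", True))
print(engine_name(r"models\dpir\drunet_color.onnx", "1920x1080_fp16_trt-10.0.1", False))
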
def trtexec(
network_path: str,
@@ -1433,12 +1699,12 @@ def trtexec(
max_shapes: typing.Tuple[int, int],
fp16: bool,
device_id: int,
- workspace: typing.Optional[int] = 128,
+ workspace: typing.Optional[int] = None,
verbose: bool = False,
use_cuda_graph: bool = False,
use_cublas: bool = False,
static_shape: bool = True,
- tf32: bool = True,
+ tf32: bool = False,
log: bool = False,
use_cudnn: bool = True,
use_edge_mask_convolutions: bool = True,
@@ -1451,11 +1717,15 @@ def trtexec(
faster_dynamic_shapes: bool = True,
force_fp16: bool = False,
builder_optimization_level: int = 3,
+ max_aux_streams: typing.Optional[int] = None,
+ short_path: typing.Optional[bool] = None,
+ bf16: bool = False,
custom_env: typing.Dict[str, str] = {},
- custom_args: typing.List[str] = []
+ custom_args: typing.List[str] = [],
+ engine_folder: typing.Optional[str] = None
) -> str:
- # tensort runtime version, e.g. 8401 => 8.4.1
+ # tensorrt runtime version
trt_version = parse_trt_version(int(core.trt.Version()["tensorrt_version"]))
if isinstance(opt_shapes, int):
@@ -1467,6 +1737,7 @@ def trtexec(
if force_fp16:
fp16 = True
tf32 = False
+ bf16 = False
engine_path = get_engine_path(
network_path=network_path,
@@ -1481,19 +1752,26 @@ def trtexec(
tf32=tf32,
use_cudnn=use_cudnn,
input_format=input_format,
- output_format=output_format
+ output_format=output_format,
+ builder_optimization_level=builder_optimization_level,
+ max_aux_streams=max_aux_streams,
+ short_path=short_path,
+ bf16=bf16,
+ engine_folder=engine_folder,
)
if os.access(engine_path, mode=os.R_OK):
return engine_path
- alter_engine_path = os.path.join(
- tempfile.gettempdir(),
- os.path.splitdrive(engine_path)[1][1:]
- )
+ # do not consider alternative path when the engine_folder is given
+ if engine_folder is None:
+ alter_engine_path = os.path.join(
+ tempfile.gettempdir(),
+ os.path.splitdrive(engine_path)[1][1:]
+ )
- if os.access(alter_engine_path, mode=os.R_OK):
- return alter_engine_path
+ if os.access(alter_engine_path, mode=os.R_OK):
+ return alter_engine_path
try:
# test writability
@@ -1501,12 +1779,16 @@ def trtexec(
pass
os.remove(engine_path)
except PermissionError:
- print(f"{engine_path} not writable", file=sys.stderr)
- engine_path = alter_engine_path
- dirname = os.path.dirname(engine_path)
- if not os.path.exists(dirname):
- os.makedirs(dirname)
- print(f"change engine path to {engine_path}", file=sys.stderr)
+ if engine_folder is None:
+ print(f"{engine_path} is not writable", file=sys.stderr)
+ engine_path = alter_engine_path
+ dirname = os.path.dirname(engine_path)
+ if not os.path.exists(dirname):
+ os.makedirs(dirname)
+ print(f"change engine path to {engine_path}", file=sys.stderr)
+ else:
+ # do not consider alternative path when the engine_folder is given
+ raise PermissionError(f"{engine_path} is not writable")
args = [
trtexec_path,
@@ -1537,17 +1819,38 @@ def trtexec(
if verbose:
args.append("--verbose")
- disabled_tactic_sources = []
- if not use_cublas:
- disabled_tactic_sources.extend(["-CUBLAS", "-CUBLAS_LT"])
- if not use_cudnn:
- disabled_tactic_sources.append("-CUDNN")
- if not use_edge_mask_convolutions and trt_version >= (8, 4, 1):
- disabled_tactic_sources.append("-EDGE_MASK_CONVOLUTIONS")
- if not use_jit_convolutions and trt_version >= (8, 5, 0):
- disabled_tactic_sources.append("-JIT_CONVOLUTIONS")
- if disabled_tactic_sources:
- args.append(f"--tacticSources={','.join(disabled_tactic_sources)}")
+ preview_features = []
+ if (use_cublas or use_cudnn) and (8, 6, 0) <= trt_version < (10, 0, 0):
+ preview_features.append("-disableExternalTacticSourcesForCore0805")
+
+ if preview_features and trt_version >= (8, 5, 0):
+ args.append(f"--preview={','.join(preview_features)}")
+
+ tactic_sources = []
+
+ if use_cublas:
+ tactic_sources.extend(["+CUBLAS", "+CUBLAS_LT"])
+ else:
+ tactic_sources.extend(["-CUBLAS", "-CUBLAS_LT"])
+
+ if use_cudnn:
+ tactic_sources.append("+CUDNN")
+ else:
+ tactic_sources.append("-CUDNN")
+
+ if trt_version >= (8, 4, 1):
+ if use_edge_mask_convolutions:
+ tactic_sources.append("+EDGE_MASK_CONVOLUTIONS")
+ else:
+ tactic_sources.append("-EDGE_MASK_CONVOLUTIONS")
+
+ if trt_version >= (8, 5, 0):
+ if use_jit_convolutions:
+ tactic_sources.append("+JIT_CONVOLUTIONS")
+ else:
+ tactic_sources.append("-JIT_CONVOLUTIONS")
+
+ args.append(f"--tacticSources={','.join(tactic_sources)}")
if use_cuda_graph:
args.extend((
@@ -1555,7 +1858,7 @@ def trtexec(
"--noDataTransfers"
))
else:
- if trt_version >= 8600:
+ if trt_version >= (8, 6, 0):
args.append("--skipInference")
else:
args.append("--buildOnly")
@@ -1590,6 +1893,13 @@ def trtexec(
if trt_version >= (8, 6, 0):
args.append(f"--builderOptimizationLevel={builder_optimization_level}")
+ if max_aux_streams is not None:
+ args.append(f"--maxAuxStreams={max_aux_streams}")
+
+ if trt_version >= (9, 0, 0):
+ if bf16:
+ args.append("--bf16")
+
args.extend(custom_args)
if log:
@@ -1598,7 +1908,7 @@ def trtexec(
if prev_env_value is not None and len(prev_env_value) > 0:
# env_key has been set, no extra action
- env = {env_key: prev_env_value}
+ env = {env_key: prev_env_value, "CUDA_MODULE_LOADING": "LAZY"}
env.update(**custom_env)
subprocess.run(args, env=env, check=True, stdout=sys.stderr)
else:
@@ -1609,7 +1919,7 @@ def trtexec(
f"trtexec_{time_str}.log"
)
- env = {env_key: log_filename}
+ env = {env_key: log_filename, "CUDA_MODULE_LOADING": "LAZY"}
env.update(**custom_env)
completed_process = subprocess.run(args, env=env, check=False, stdout=sys.stderr)
@@ -1639,7 +1949,8 @@ def get_mxr_path(
fp16: bool,
fast_math: bool,
exhaustive_tune: bool,
- device_id: int
+ device_id: int,
+ short_path: typing.Optional[bool]
) -> str:
with open(network_path, "rb") as file:
@@ -1665,7 +1976,11 @@ def get_mxr_path(
f"_{checksum:x}"
)
- return f"{network_path}.{identity}.mxr"
+ if short_path or (short_path is None and platform.system() == "Windows"):
+ dirname, basename = os.path.split(network_path)
+ return os.path.join(dirname, f"{zlib.crc32((basename + identity).encode()):x}.mxr")
+ else:
+ return f"{network_path}.{identity}.mxr"
def migraphx_driver(
@@ -1677,6 +1992,7 @@ def migraphx_driver(
exhaustive_tune: bool,
device_id: int,
input_name: str = "input",
+ short_path: typing.Optional[bool] = None,
custom_env: typing.Dict[str, str] = {},
custom_args: typing.List[str] = []
) -> str:
@@ -1690,7 +2006,8 @@ def migraphx_driver(
fp16=fp16,
fast_math=fast_math,
exhaustive_tune=exhaustive_tune,
- device_id=device_id
+ device_id=device_id,
+ short_path=short_path
)
if os.access(mxr_path, mode=os.R_OK):
@@ -1805,6 +2122,8 @@ def init_backend(
backend = Backend.ORT_DML()
elif backend is Backend.MIGX: # type: ignore
backend = Backend.MIGX()
+ elif backend is Backend.OV_NPU:
+ backend = Backend.OV_NPU()
backend = copy.deepcopy(backend)
@@ -1866,6 +2185,19 @@ def _inference(
fp16_blacklist_ops=backend.fp16_blacklist_ops
)
elif isinstance(backend, Backend.ORT_CUDA):
+ kwargs = dict()
+
+ version_list = core.ort.Version().get("onnxruntime_version", b"0.0.0").split(b'.')
+ if len(version_list) != 3:
+ version = (0, 0, 0)
+ else:
+ version = tuple(map(int, version_list))
+
+ if version >= (1, 18, 0):
+ kwargs["prefer_nhwc"] = backend.prefer_nhwc
+ kwargs["output_format"] = backend.output_format
+ kwargs["tf32"] = backend.tf32
+
clip = core.ort.Model(
clips, network_path,
overlap=overlap, tilesize=tilesize,
@@ -1877,34 +2209,66 @@ def _inference(
fp16=backend.fp16,
path_is_serialization=path_is_serialization,
use_cuda_graph=backend.use_cuda_graph,
- fp16_blacklist_ops=backend.fp16_blacklist_ops
+ fp16_blacklist_ops=backend.fp16_blacklist_ops,
+ **kwargs
)
elif isinstance(backend, Backend.OV_CPU):
- config = lambda: dict(
- CPU_THROUGHPUT_STREAMS=backend.num_streams,
- CPU_BIND_THREAD="YES" if backend.bind_thread else "NO",
- CPU_THREADS_NUM=backend.num_threads,
- ENFORCE_BF16="YES" if backend.bf16 else "NO"
- )
+ version = tuple(map(int, core.ov.Version().get("openvino_version", b"0.0.0").split(b'-')[0].split(b'.')))
+
+ if version >= (2024, 0, 0):
+ config_dict = dict(
+ NUM_STREAMS=backend.num_streams,
+ INFERENCE_NUM_THREADS=backend.num_threads,
+ ENABLE_CPU_PINNING="YES" if backend.bind_thread else "NO"
+ )
+ if backend.fp16:
+ config_dict["INFERENCE_PRECISION_HINT"] = "f16"
+ elif backend.bf16:
+ config_dict["INFERENCE_PRECISION_HINT"] = "bf16"
+ else:
+ config_dict["INFERENCE_PRECISION_HINT"] = "f32"
+
+ config = lambda: config_dict
+ else:
+ config = lambda: dict(
+ CPU_THROUGHPUT_STREAMS=backend.num_streams,
+ CPU_BIND_THREAD="YES" if backend.bind_thread else "NO",
+ CPU_THREADS_NUM=backend.num_threads,
+ ENFORCE_BF16="YES" if backend.bf16 else "NO"
+ )
clip = core.ov.Model(
clips, network_path,
overlap=overlap, tilesize=tilesize,
device="CPU", builtin=False,
- fp16=backend.fp16,
+ fp16=False, # use ov's internal quantization
config=config,
path_is_serialization=path_is_serialization,
- fp16_blacklist_ops=backend.fp16_blacklist_ops
+ fp16_blacklist_ops=backend.fp16_blacklist_ops # disabled since fp16 = False
)
elif isinstance(backend, Backend.OV_GPU):
- config = lambda: dict(
- GPU_THROUGHPUT_STREAMS=backend.num_streams
- )
+ version = tuple(map(int, core.ov.Version().get("openvino_version", b"0.0.0").split(b'-')[0].split(b'.')))
+
+ if version >= (2024, 0, 0):
+ config_dict = dict(
+ NUM_STREAMS=backend.num_streams,
+ )
+ if backend.fp16:
+ config_dict["INFERENCE_PRECISION_HINT"] = "f16"
+ else:
+ config_dict["INFERENCE_PRECISION_HINT"] = "f32"
+
+ config = lambda: config_dict
+ else:
+ config = lambda: dict(
+ GPU_THROUGHPUT_STREAMS=backend.num_streams
+ )
+
clip = core.ov.Model(
clips, network_path,
overlap=overlap, tilesize=tilesize,
device=f"GPU.{backend.device_id}", builtin=False,
- fp16=backend.fp16,
+ fp16=False, # use ov's internal quantization
config=config,
path_is_serialization=path_is_serialization,
fp16_blacklist_ops=backend.fp16_blacklist_ops
@@ -1945,8 +2309,12 @@ def _inference(
faster_dynamic_shapes=backend.faster_dynamic_shapes,
force_fp16=backend.force_fp16,
builder_optimization_level=backend.builder_optimization_level,
+ max_aux_streams=backend.max_aux_streams,
+ short_path=backend.short_path,
+ bf16=backend.bf16,
custom_env=backend.custom_env,
- custom_args=backend.custom_args
+ custom_args=backend.custom_args,
+ engine_folder=backend.engine_folder,
)
clip = core.trt.Model(
clips, engine_path,
@@ -1986,6 +2354,7 @@ def _inference(
exhaustive_tune=backend.exhaustive_tune,
device_id=backend.device_id,
input_name=input_name,
+ short_path=backend.short_path,
custom_env=backend.custom_env,
custom_args=backend.custom_args
)
@@ -1995,6 +2364,14 @@ def _inference(
tilesize=tilesize,
device_id=backend.device_id
)
+ elif isinstance(backend, Backend.OV_NPU):
+ clip = core.ov.Model(
+ clips, network_path,
+ overlap=overlap, tilesize=tilesize,
+ device="NPU", builtin=False,
+ fp16=False, # use ov's internal quantization
+ path_is_serialization=path_is_serialization,
+ )
else:
raise TypeError(f'unknown backend {backend}')
@@ -2101,9 +2478,9 @@ class BackendV2:
def TRT(*,
num_streams: int = 1,
fp16: bool = False,
- tf32: bool = True,
+ tf32: bool = False,
output_format: int = 0, # 0: fp32, 1: fp16
- workspace: typing.Optional[int] = 128,
+ workspace: typing.Optional[int] = None,
use_cuda_graph: bool = False,
static_shape: bool = True,
min_shapes: typing.Tuple[int, int] = (0, 0),
@@ -2111,7 +2488,7 @@ def TRT(*,
max_shapes: typing.Optional[typing.Tuple[int, int]] = None,
force_fp16: bool = False,
use_cublas: bool = False,
- use_cudnn: bool = True,
+ use_cudnn: bool = False,
device_id: int = 0,
**kwargs
) -> Backend.TRT:
@@ -2224,18 +2601,25 @@ def MIGX(*,
**kwargs
)
+ @staticmethod
+ def OV_NPU(**kwargs
+ ) -> Backend.OV_NPU:
+ return Backend.OV_NPU(
+ **kwargs
+ )
+
def fmtc_resample(clip: vs.VideoNode, **kwargs) -> vs.VideoNode:
clip_org = clip
if clip.format.sample_type == vs.FLOAT and clip.format.bits_per_sample != 32:
format = clip.format.replace(core=core, bits_per_sample=32)
- clip = core.resize.Point(clip, format=format)
+ clip = core.resize.Point(clip, format=format.id)
clip = core.fmtc.resample(clip, **kwargs)
if clip.format.bits_per_sample != clip_org.format.bits_per_sample:
- clip = core.resize.Point(clip, format=clip_org.format)
+ clip = core.resize.Point(clip, format=clip_org.format.id)
return clip
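
Note on the engine cache naming introduced above: when short_path is enabled (the default on Windows, to keep paths within MAX_PATH), the engine file name collapses to the CRC32 of the original basename plus the option-identity string; otherwise the full identity is appended, and engine_folder redirects the cache directory. A minimal standalone sketch of that scheme follows; the identity argument and folder handling are simplified assumptions, not the exact vsmlrt.py helper.

    import os
    import platform
    import zlib

    def engine_cache_path(network_path: str, identity: str,
                          short_path: bool = None,
                          engine_folder: str = None) -> str:
        dirname, basename = os.path.split(network_path)
        if engine_folder is not None:
            os.makedirs(engine_folder, exist_ok=True)
            dirname = engine_folder
        # Windows defaults to the short form to keep paths under MAX_PATH
        if short_path or (short_path is None and platform.system() == "Windows"):
            return os.path.join(dirname, f"{zlib.crc32((basename + identity).encode()):x}.engine")
        return f"{os.path.join(dirname, basename)}.{identity}.engine"

    # e.g. engine_cache_path("/models/rife.onnx", "opt1920x1080_fp16")
    #   -> "/models/rife.onnx.opt1920x1080_fp16.engine" on Linux
    #   -> "/models/<crc32 hex>.engine" on Windows
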
diff --git a/vsncnn/vs_ncnn.cpp b/vsncnn/vs_ncnn.cpp
index 8b06821..cabe182 100644
--- a/vsncnn/vs_ncnn.cpp
+++ b/vsncnn/vs_ncnn.cpp
@@ -22,17 +22,12 @@
#include
#include
-#include "config.h" // generated by cmake
#include
-#include "onnx2ncnn.hpp"
+#include "../common/onnx_utils.h"
+#include "onnx2ncnn.hpp"
-extern std::variant loadONNX(
- const std::string_view & path,
- int64_t tile_w,
- int64_t tile_h,
- bool path_is_serialization
-) noexcept;
+#include "config.h" // generated by cmake
static const VSPlugin * myself = nullptr;
diff --git a/vsort/CMakeLists.txt b/vsort/CMakeLists.txt
index 85dcae3..c85ddc1 100644
--- a/vsort/CMakeLists.txt
+++ b/vsort/CMakeLists.txt
@@ -39,7 +39,14 @@ if (CMAKE_CXX_STANDARD GREATER_EQUAL 20)
set_target_properties(vsort PROPERTIES CXX_STANDARD 20)
endif()
-target_link_libraries(vsort PRIVATE onnx onnxruntime)
+# https://github.com/onnx/onnx/commit/21bff4e55dcefecc069c679115baae6b00caa0d5
+if (ONNX_VERSION VERSION_LESS 1.16.0)
+ target_link_libraries(vsort PRIVATE onnx)
+else()
+ target_link_libraries(vsort PRIVATE ONNX::onnx)
+endif()
+
+target_link_libraries(vsort PRIVATE onnxruntime)
if (ENABLE_CUDA)
find_package(CUDAToolkit REQUIRED)
diff --git a/vsort/vs_onnxruntime.cpp b/vsort/vs_onnxruntime.cpp
index f543afd..290c1d2 100644
--- a/vsort/vs_onnxruntime.cpp
+++ b/vsort/vs_onnxruntime.cpp
@@ -1,11 +1,9 @@
#include
#include
#include
-#include
#include
#include
#include
-#include
#include
#include
#include
@@ -27,31 +25,20 @@ using namespace std::chrono_literals;
#define NOMINMAX
#include
+#include
#ifdef ENABLE_CUDA
#include
#endif // ENABLE_CUDA
#ifdef ENABLE_DML
-// include/onnxruntime/core/providers/dml/dml_provider_factory.h
-#include <../providers/dml/dml_provider_factory.h>
+#include
#endif // ENABLE_DML
-#include "config.h"
-
+#include "../common/convert_float_to_float16.h"
+#include "../common/onnx_utils.h"
-extern std::variant loadONNX(
- const std::string_view & path,
- int64_t tile_w,
- int64_t tile_h,
- bool path_is_serialization
-) noexcept;
-
-extern void convert_float_to_float16(
- ONNX_NAMESPACE::ModelProto & model,
- bool force_fp16_initializers,
- const std::unordered_set & op_block_list
-) noexcept;
+#include "config.h"
#ifdef ENABLE_COREML
@@ -87,6 +74,7 @@ static std::mutex capture_lock;
// rename GridSample to com.microsoft::GridSample
// onnxruntime has support for CUDA-accelerated GridSample only in its own opset domain
static void rename(ONNX_NAMESPACE::ModelProto & model) {
+#if ORT_API_VERSION < 18
constexpr auto ms_domain = "com.microsoft";
bool has_ms_opset = false;
@@ -109,6 +97,7 @@ static void rename(ONNX_NAMESPACE::ModelProto & model) {
*node.mutable_domain() = ms_domain;
}
}
+#endif // ORT_API_VERSION < 18
}
@@ -176,6 +165,19 @@ static std::variant> getShape(
return std::get>(maybe_shape);
}
+static size_t getNumBytes(int32_t type) {
+ using namespace ONNX_NAMESPACE;
+
+ switch (type) {
+ case TensorProto::FLOAT:
+ return 4;
+ case TensorProto::FLOAT16:
+ return 2;
+ default:
+ return 0;
+ }
+}
+
static int numPlanes(
const std::vector & vis
@@ -197,8 +199,12 @@ static std::optional checkNodes(
) noexcept {
for (const auto & vi : vis) {
- if (vi->format->sampleType != stFloat || vi->format->bitsPerSample != 32) {
- return "expects clip with type fp32";
+ if (vi->format->sampleType != stFloat) {
+ return "expects clip with floating-point type";
+ }
+
+ if (vi->format->bitsPerSample != 32 && vi->format->bitsPerSample != 16) {
+ return "expects clip with type fp32 or fp16";
}
if (vi->width != vis[0]->width || vi->height != vis[0]->height) {
@@ -234,8 +240,8 @@ static std::optional checkIOInfo(
ONNXTensorElementDataType element_type;
checkError(ortapi->GetTensorElementType(tensor_info, &element_type));
- if (element_type != ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT) {
- return set_error("expects network IO with type fp32");
+ if (element_type != ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT && element_type != ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16) {
+ return set_error("expects network IO with type fp32 or fp16");
}
size_t num_dims;
@@ -341,6 +347,23 @@ static std::optional checkNodesAndNetwork(
return set_error("tile size larger than clip dimension");
}
+ OrtTypeInfo * output_type_info;
+ checkError(ortapi->SessionGetOutputTypeInfo(session, 0, &output_type_info));
+
+ const OrtTensorTypeAndShapeInfo * output_tensor_info;
+ checkError(ortapi->CastTypeInfoToTensorInfo(output_type_info, &output_tensor_info));
+
+ auto network_out_dims = std::get>(getShape(output_tensor_info));
+
+ auto network_out_height = network_out_dims[2];
+ auto network_out_width = network_out_dims[3];
+
+ if (network_out_height % network_in_height != 0 || network_out_width % network_in_width != 0) {
+ return set_error("output dimensions must be divisible by input dimensions");
+ }
+
+ ortapi->ReleaseTypeInfo(output_type_info);
+
ortapi->ReleaseTypeInfo(input_type_info);
return {};
@@ -351,16 +374,17 @@ static void setDimensions(
const std::array & input_shape,
const std::array & output_shape,
VSCore * core,
- const VSAPI * vsapi
+ const VSAPI * vsapi,
+ int32_t onnx_output_type
) noexcept {
vi->height *= output_shape[2] / input_shape[2];
vi->width *= output_shape[3] / input_shape[3];
if (output_shape[1] == 1) {
- vi->format = vsapi->registerFormat(cmGray, stFloat, 32, 0, 0, core);
+ vi->format = vsapi->registerFormat(cmGray, stFloat, 8 * getNumBytes(onnx_output_type), 0, 0, core);
} else if (output_shape[1] == 3) {
- vi->format = vsapi->registerFormat(cmRGB, stFloat, 32, 0, 0, core);
+ vi->format = vsapi->registerFormat(cmRGB, stFloat, 8 * getNumBytes(onnx_output_type), 0, 0, core);
}
}
@@ -565,9 +589,23 @@ static const VSFrameRef *VS_CC vsOrtGetFrame(
return nullptr;
};
+ OrtRunOptions * run_options {};
+
#ifdef ENABLE_CUDA
if (d->backend == Backend::CUDA) {
checkCUDAError(cudaSetDevice(d->device_id));
+
+#if ORT_API_VERSION >= 16
+ checkError(ortapi->CreateRunOptions(&run_options));
+ if (run_options == nullptr) {
+ return set_error("create run_options failed");
+ }
+ checkError(ortapi->AddRunConfigEntry(
+ run_options,
+ kOrtRunOptionsConfigDisableSynchronizeExecutionProviders,
+ "1"
+ ));
+#endif // ORT_API_VERSION >= 16
}
#endif // ENABLE_CUDA
@@ -627,9 +665,9 @@ static const VSFrameRef *VS_CC vsOrtGetFrame(
resource.stream
));
- // OrtCUDAProviderOptionsV2 disallows using custom user stream
- // and the inference is executed on a private non-blocking stream
+#if ORT_API_VERSION < 16
checkCUDAError(cudaStreamSynchronize(resource.stream));
+#endif // ORT_API_VERSION < 16
}
#endif // ENABLE_CUDA
@@ -644,17 +682,17 @@ static const VSFrameRef *VS_CC vsOrtGetFrame(
// note that this applies only to stream capture from the ort library
// this fails when another plugin also uses global-mode stream capture
std::lock_guard _ { capture_lock };
- checkError(ortapi->RunWithBinding(resource.session, nullptr, resource.binding));
+ checkError(ortapi->RunWithBinding(resource.session, run_options, resource.binding));
// onnxruntime replays the graph itself in CUDAExecutionProvider::OnRunEnd
} else
#endif // ENABLE_CUDA
if (d->backend == Backend::CPU || d->backend == Backend::CUDA) {
- checkError(ortapi->RunWithBinding(resource.session, nullptr, resource.binding));
+ checkError(ortapi->RunWithBinding(resource.session, run_options, resource.binding));
} else {
checkError(ortapi->Run(
resource.session,
- nullptr,
+ run_options,
&resource.input_name,
&resource.input_tensor,
1,
@@ -735,6 +773,10 @@ static const VSFrameRef *VS_CC vsOrtGetFrame(
y = std::min(y + step_h, src_height - src_tile_h);
}
+ if (run_options) {
+ ortapi->ReleaseRunOptions(run_options);
+ }
+
d->release(ticket);
for (const auto & frame : src_frames) {
@@ -908,6 +950,18 @@ static void VS_CC vsOrtCreate(
if (error) {
cudnn_benchmark = true;
}
+
+#if ORT_API_VERSION >= 17
+ bool prefer_nhwc = !!(vsapi->propGetInt(in, "prefer_nhwc", 0, &error));
+ if (error) {
+ prefer_nhwc = false;
+ }
+#endif // ORT_API_VERSION >= 17
+
+ bool tf32 = !!(vsapi->propGetInt(in, "tf32", 0, &error));
+ if (error) {
+ tf32 = false;
+ }
#endif // ENABLE_CUDA
if (auto err = ortInit(); err.has_value()) {
@@ -929,6 +983,14 @@ static void VS_CC vsOrtCreate(
use_cuda_graph = false;
}
+ int output_format = int64ToIntS(vsapi->propGetInt(in, "output_format", 0, &error));
+ if (error) {
+ output_format = 0;
+ }
+ if (output_format != 0 && output_format != 1) {
+ return set_error("\"output_format\" must be 0 or 1");
+ }
+
std::string_view path_view;
std::string path;
if (path_is_serialization) {
@@ -976,11 +1038,26 @@ static void VS_CC vsOrtCreate(
fp16_blacklist_ops.emplace(vsapi->propGetData(in, "fp16_blacklist_ops", i, nullptr));
}
}
- convert_float_to_float16(onnx_model, false, fp16_blacklist_ops);
+ convert_float_to_float16(
+ onnx_model,
+ false,
+ fp16_blacklist_ops,
+ in_vis.front()->format->bytesPerSample == 4,
+ output_format == 0
+ );
}
rename(onnx_model);
+ auto onnx_input_type = onnx_model.graph().input()[0].type().tensor_type().elem_type();
+ auto onnx_output_type = onnx_model.graph().output()[0].type().tensor_type().elem_type();
+
+ if (onnx_input_type == ONNX_NAMESPACE::TensorProto::FLOAT && in_vis.front()->format->bitsPerSample != 32) {
+ return set_error("the onnx requires input to be of type fp32");
+ } else if (onnx_input_type == ONNX_NAMESPACE::TensorProto::FLOAT16 && in_vis.front()->format->bitsPerSample != 16) {
+ return set_error("the onnx requires input to be of type fp16");
+ }
+
std::string onnx_data = onnx_model.SerializeAsString();
if (std::size(onnx_data) == 0) {
return set_error("proto serialization failed");
@@ -1041,6 +1118,8 @@ static void VS_CC vsOrtCreate(
// TODO: other providers
#ifdef ENABLE_CUDA
if (d->backend == Backend::CUDA) {
+ checkCUDAError(cudaStreamCreateWithFlags(&resource.stream, cudaStreamNonBlocking));
+
OrtCUDAProviderOptionsV2 * cuda_options;
checkError(ortapi->CreateCUDAProviderOptions(&cuda_options));
#ifdef _MSC_VER
@@ -1061,7 +1140,11 @@ static void VS_CC vsOrtCreate(
"cudnn_conv_algo_search",
"cudnn_conv_use_max_workspace",
"arena_extend_strategy",
- "enable_cuda_graph"
+ "enable_cuda_graph",
+#if ORT_API_VERSION >= 17
+ "prefer_nhwc",
+ "use_tf32",
+#endif // ORT_API_VERSION >= 17
};
auto device_id_str = std::to_string(d->device_id);
const char * values [] {
@@ -1069,7 +1152,11 @@ static void VS_CC vsOrtCreate(
"EXHAUSTIVE",
"1",
"kSameAsRequested",
- "0"
+ "0",
+#if ORT_API_VERSION >= 17
+ "0",
+ "0",
+#endif // ORT_API_VERSION >= 17
};
if (!cudnn_benchmark) {
values[1] = "HEURISTIC";
@@ -1080,8 +1167,24 @@ static void VS_CC vsOrtCreate(
} else {
resource.require_replay = false;
}
+#if ORT_API_VERSION >= 17
+ if (prefer_nhwc) {
+ values[5] = "1";
+ }
+ if (tf32) {
+ values[6] = "1";
+ }
+#endif // ORT_API_VERSION >= 17
checkError(ortapi->UpdateCUDAProviderOptions(cuda_options, keys, values, std::size(keys)));
+#if ORT_API_VERSION >= 16
+ checkError(ortapi->UpdateCUDAProviderOptionsWithValue(
+ cuda_options,
+ "user_compute_stream",
+ resource.stream
+ ));
+#endif // ORT_API_VERSION >= 16
+
checkError(ortapi->SessionOptionsAppendExecutionProvider_CUDA_V2(session_options, cuda_options));
ortapi->ReleaseCUDAProviderOptions(cuda_options);
@@ -1122,14 +1225,12 @@ static void VS_CC vsOrtCreate(
#ifdef ENABLE_CUDA
if (d->backend == Backend::CUDA) {
- checkCUDAError(cudaStreamCreateWithFlags(&resource.stream, cudaStreamNonBlocking));
-
resource.input.size = (
input_shape[0] *
input_shape[1] *
input_shape[2] *
input_shape[3]
- ) * sizeof(float);
+ ) * getNumBytes(onnx_input_type);
checkCUDAError(cudaMallocHost(
&resource.input.h_data, resource.input.size,
@@ -1141,7 +1242,8 @@ static void VS_CC vsOrtCreate(
memory_info,
resource.input.d_data, resource.input.size,
std::data(input_shape), std::size(input_shape),
- ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT, &resource.input_tensor
+ static_cast(onnx_input_type),
+ &resource.input_tensor
));
} else
#endif // ENALBE_CUDA
@@ -1149,7 +1251,7 @@ static void VS_CC vsOrtCreate(
checkError(ortapi->CreateTensorAsOrtValue(
cpu_allocator,
std::data(input_shape), std::size(input_shape),
- ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT,
+ static_cast(onnx_input_type),
&resource.input_tensor
));
}
@@ -1165,7 +1267,7 @@ static void VS_CC vsOrtCreate(
output_shape[1] *
output_shape[2] *
output_shape[3]
- ) * sizeof(float);
+ ) * getNumBytes(onnx_output_type);
checkCUDAError(cudaMallocHost(&resource.output.h_data, resource.output.size));
checkCUDAError(cudaMalloc(&resource.output.d_data, resource.output.size));
@@ -1174,7 +1276,8 @@ static void VS_CC vsOrtCreate(
memory_info,
resource.output.d_data, resource.output.size,
std::data(output_shape), std::size(output_shape),
- ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT, &resource.output_tensor
+ static_cast(onnx_output_type),
+ &resource.output_tensor
));
} else
#endif // ENABLE_CUDA
@@ -1182,7 +1285,7 @@ static void VS_CC vsOrtCreate(
checkError(ortapi->CreateTensorAsOrtValue(
cpu_allocator,
std::data(output_shape), std::size(output_shape),
- ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT,
+ static_cast(onnx_output_type),
&resource.output_tensor
));
}
@@ -1204,7 +1307,7 @@ static void VS_CC vsOrtCreate(
}
if (i == 0) {
- setDimensions(d->out_vi, input_shape, output_shape, core, vsapi);
+ setDimensions(d->out_vi, input_shape, output_shape, core, vsapi, onnx_output_type);
}
d->resources.push_back(resource);
@@ -1249,6 +1352,9 @@ VS_EXTERNAL_API(void) VapourSynthPluginInit(
"path_is_serialization:int:opt;"
"use_cuda_graph:int:opt;"
"fp16_blacklist_ops:data[]:opt;"
+ "prefer_nhwc:int:opt;"
+ "output_format:int:opt;"
+ "tf32:int:opt;"
, vsOrtCreate,
nullptr,
plugin
@@ -1258,10 +1364,26 @@ VS_EXTERNAL_API(void) VapourSynthPluginInit(
vsapi->propSetData(out, "version", VERSION, -1, paReplace);
vsapi->propSetData(
- out, "onnxruntime_version",
+ out, "onnxruntime_api_version_build",
std::to_string(ORT_API_VERSION).c_str(), -1, paReplace
);
+ if (auto err = ortInit(); err.has_value()) {
+ vsapi->logMessage(mtWarning, err.value().c_str());
+ } else {
+ if (auto p = OrtGetApiBase(); p) {
+ vsapi->propSetData(
+ out, "onnxruntime_version",
+ p->GetVersionString(), -1, paReplace
+ );
+ }
+
+ vsapi->propSetData(
+ out, "onnxruntime_build_info",
+ ortapi->GetBuildInfoString(), -1, paReplace
+ );
+ }
+
#ifdef ENABLE_CUDA
vsapi->propSetData(
out, "cuda_runtime_version",
diff --git a/vsort/win32.cpp b/vsort/win32.cpp
index 8741167..cca5e18 100644
--- a/vsort/win32.cpp
+++ b/vsort/win32.cpp
@@ -22,7 +22,6 @@ static std::vector cudaDlls {
L"cudart64",
L"cublasLt64", L"cublas64",
L"cufft64",
- L"zlibwapi", // cuDNN version 8.3.0+ depends on zlib as a shared library dependency
L"cudnn_ops_infer64", L"cudnn_cnn_infer64", L"cudnn_adv_infer64", L"cudnn64",
L"cupti64",
};
diff --git a/vsov/CMakeLists.txt b/vsov/CMakeLists.txt
index f003806..e6f033c 100644
--- a/vsov/CMakeLists.txt
+++ b/vsov/CMakeLists.txt
@@ -7,7 +7,6 @@ set(ENABLE_VISUALIZATION OFF CACHE BOOL "Enable support for network visualizatio
set(WIN32_SHARED_OPENVINO OFF CACHE BOOL "Build for win32 with shared openvino library")
find_package(OpenVINO REQUIRED CONFIG)
-find_package(InferenceEngine REQUIRED CONFIG)
add_library(vsov SHARED
vs_openvino.cpp
@@ -24,20 +23,21 @@ if(WIN32_SHARED_OPENVINO)
target_compile_definitions(vsov PRIVATE WIN32_SHARED_OPENVINO)
endif()
-target_include_directories(vsov PRIVATE
- ${VAPOURSYNTH_INCLUDE_DIRECTORY}
- ${ONNX_INCLUDE_DIRS}
- ${InferenceEngine_INCLUDE_DIRS}
-)
-
if(WIN32_SHARED_OPENVINO)
find_package(protobuf REQUIRED CONFIG)
find_package(ONNX REQUIRED CONFIG)
- target_link_libraries(vsov PRIVATE ${InferenceEngine_LIBRARIES} onnx)
+ target_link_libraries(vsov PRIVATE onnx)
else()
- target_link_libraries(vsov PRIVATE ${InferenceEngine_LIBRARIES} openvino::onnx)
+ target_link_libraries(vsov PRIVATE openvino::frontend::onnx)
endif()
+target_include_directories(vsov PRIVATE
+ ${VAPOURSYNTH_INCLUDE_DIRECTORY}
+ ${ONNX_INCLUDE_DIRS}
+)
+
+target_link_libraries(vsov PRIVATE openvino::runtime)
+
set_target_properties(vsov PROPERTIES
CXX_EXTENSIONS OFF
CXX_STANDARD 17
diff --git a/vsov/vs_openvino.cpp b/vsov/vs_openvino.cpp
index f186be4..ce6dfe3 100644
--- a/vsov/vs_openvino.cpp
+++ b/vsov/vs_openvino.cpp
@@ -20,28 +20,17 @@
#include
#include
-#include
+#include
#include
#ifdef ENABLE_VISUALIZATION
#include
#endif // ENABLE_VISUALIZATION
-#include "config.h"
-
-
-extern std::variant loadONNX(
- const std::string_view & path,
- int64_t tile_w,
- int64_t tile_h,
- bool path_is_serialization
-) noexcept;
+#include "../common/convert_float_to_float16.h"
+#include "../common/onnx_utils.h"
-extern void convert_float_to_float16(
- ONNX_NAMESPACE::ModelProto & model,
- bool force_fp16_initializers,
- const std::unordered_set & op_block_list
-) noexcept;
+#include "config.h"
using namespace std::string_literals;
@@ -50,16 +39,16 @@ static const VSPlugin * myself = nullptr;
static std::array getShape(
- const InferenceEngine::ExecutableNetwork & network,
+ const ov::CompiledModel & network,
bool input
) {
- InferenceEngine::SizeVector dims;
+ ov::Shape dims;
if (input) {
- dims = network.GetInputsInfo().cbegin()->second->getTensorDesc().getDims();
+ dims = network.input().get_shape();
} else {
- dims = network.GetOutputsInfo().cbegin()->second->getTensorDesc().getDims();
+ dims = network.output().get_shape();
}
std::array ret;
@@ -112,21 +101,19 @@ static std::optional checkNodes(
}
-template
[[nodiscard]]
static std::optional checkIOInfo(
- const T & info,
+ const ov::Output & info,
bool is_output
) {
- if (info->getPrecision() != InferenceEngine::Precision::FP32) {
+ if (info.get_element_type() != ov::element::f32) {
return "expects network IO with type fp32";
}
- const auto & desc = info->getTensorDesc();
- if (desc.getLayout() != InferenceEngine::Layout::NCHW) {
- return "expects network IO with layout NCHW";
- }
- const auto & dims = desc.getDims();
+ // if (ov::layout::get_layout(info) != ov::Layout("NCHW")) {
+ // return "expects network IO with layout NCHW";
+ // }
+ const auto & dims = info.get_shape();
if (dims.size() != 4) {
return "expects network with 4-D IO";
}
@@ -148,27 +135,23 @@ static std::optional checkIOInfo(
[[nodiscard]]
static std::optional checkNetwork(
- const InferenceEngine::CNNNetwork & network
+ const std::shared_ptr & network
) {
- const auto & inputs_info = network.getInputsInfo();
-
- if (auto num_inputs = std::size(inputs_info); num_inputs != 1) {
+ if (auto num_inputs = std::size(network->inputs()); num_inputs != 1) {
return "network input count must be 1, got " + std::to_string(num_inputs);
}
- const auto & input_info = inputs_info.cbegin()->second;
+ const auto & input_info = network->input();
if (auto err = checkIOInfo(input_info, false); err.has_value()) {
return err.value();
}
- const auto & outputs_info = network.getOutputsInfo();
-
- if (auto num_outputs = std::size(outputs_info); num_outputs != 1) {
+ if (auto num_outputs = std::size(network->outputs()); num_outputs != 1) {
return "network output count must be 1, got " + std::to_string(num_outputs);
}
- const auto & output_info = outputs_info.cbegin()->second;
+ const auto & output_info = network->output();
if (auto err = checkIOInfo(output_info, true); err.has_value()) {
return err.value();
}
@@ -179,12 +162,12 @@ static std::optional checkNetwork(
[[nodiscard]]
static std::optional checkNodesAndNetwork(
- const InferenceEngine::ExecutableNetwork & network,
+ const ov::CompiledModel & network,
const std::vector & vis
) {
const auto & network_in_dims = (
- network.GetInputsInfo().cbegin()->second->getTensorDesc().getDims()
+ network.input().get_tensor().get_shape()
);
int network_in_channels = static_cast(network_in_dims[1]);
@@ -205,15 +188,16 @@ static std::optional checkNodesAndNetwork(
}
+
static void setDimensions(
std::unique_ptr & vi,
- const InferenceEngine::ExecutableNetwork & network,
+ const ov::CompiledModel & network,
VSCore * core,
const VSAPI * vsapi
) {
- auto in_dims = network.GetInputsInfo().cbegin()->second->getTensorDesc().getDims();
- auto out_dims = network.GetOutputsInfo().cbegin()->second->getTensorDesc().getDims();
+ const auto & in_dims = network.input().get_shape();
+ const auto & out_dims = network.output().get_shape();
vi->height *= out_dims[2] / in_dims[2];
vi->width *= out_dims[3] / in_dims[3];
@@ -226,13 +210,13 @@ static void setDimensions(
}
-static std::variant> getConfig(
+static std::variant getConfig(
VSFuncRef * config_func,
VSCore * core,
const VSAPI * vsapi
) {
- std::map config;
+ ov::AnyMap config;
if (config_func == nullptr) {
return config;
@@ -285,13 +269,10 @@ struct OVData {
int overlap_w, overlap_h;
- InferenceEngine::Core core;
- InferenceEngine::ExecutableNetwork executable_network;
- std::unordered_map infer_requests;
+ ov::Core core;
+ ov::CompiledModel executable_network;
+ std::unordered_map infer_requests;
std::shared_mutex infer_requests_lock;
-
- std::string input_name;
- std::string output_name;
};
@@ -396,7 +377,7 @@ static const VSFrameRef *VS_CC vsOvGetFrame(
auto thread_id = std::this_thread::get_id();
bool initialized = true;
- InferenceEngine::InferRequest * infer_request;
+ ov::InferRequest * infer_request;
d->infer_requests_lock.lock_shared();
try {
@@ -409,9 +390,9 @@ static const VSFrameRef *VS_CC vsOvGetFrame(
if (!initialized) {
std::lock_guard _ { d->infer_requests_lock };
try {
- d->infer_requests.emplace(thread_id, d->executable_network.CreateInferRequest());
- } catch (const InferenceEngine::Exception& e) {
- return set_error("[IE exception] Create inference request: "s + e.what());
+ d->infer_requests.emplace(thread_id, d->executable_network.create_infer_request());
+ } catch (const ov::Exception & e) {
+ return set_error("[OV exception] Create inference request: "s + e.what());
} catch (const std::exception& e) {
return set_error("[Standard exception] Create inference request: "s + e.what());
}
@@ -429,11 +410,7 @@ static const VSFrameRef *VS_CC vsOvGetFrame(
int x_crop_end = (x == src_width - src_tile_w) ? 0 : d->overlap_w;
{
- InferenceEngine::Blob::Ptr input = infer_request->GetBlob(d->input_name);
-
- auto minput = input->as();
- auto minputHolder = minput->wmap();
- uint8_t * input_buffer = minputHolder.as();
+ auto input_buffer = (uint8_t *) infer_request->get_input_tensor().data();
for (const auto & _src_ptr : src_ptrs) {
const uint8_t * src_ptr { _src_ptr +
@@ -451,19 +428,15 @@ static const VSFrameRef *VS_CC vsOvGetFrame(
}
try {
- infer_request->Infer();
- } catch (const InferenceEngine::Exception & e) {
- return set_error("[IE exception] Create inference request: "s + e.what());
+ infer_request->infer();
+ } catch (const ov::Exception & e) {
+ return set_error("[OV exception] Create inference request: "s + e.what());
} catch (const std::exception& e) {
return set_error("[Standard exception] Create inference request: "s + e.what());
}
{
- InferenceEngine::Blob::CPtr output = infer_request->GetBlob(d->output_name);
-
- auto moutput = output->as();
- auto moutputHolder = moutput->rmap();
- const uint8_t * output_buffer = moutputHolder.as();
+ auto output_buffer = (const uint8_t *) infer_request->get_output_tensor().data();
for (int plane = 0; plane < dst_planes; ++plane) {
uint8_t * dst_ptr = (dst_ptrs[plane] +
@@ -533,11 +506,11 @@ static void VS_CC vsOvCreate(
) {
std::unique_ptr d = nullptr;
-
+
try {
d = std::make_unique();
- } catch (const InferenceEngine::Exception& e) {
- vsapi->setError(out, ("[IE exception] Initialize inference engine: "s + e.what()).c_str());
+ } catch (const ov::Exception& e) {
+ vsapi->setError(out, ("[OV exception] Initialize inference engine: "s + e.what()).c_str());
return ;
} catch (const std::exception& e) {
vsapi->setError(out, ("[Standard exception] Initialize inference engine: "s + e.what()).c_str());
@@ -675,12 +648,11 @@ static void VS_CC vsOvCreate(
}
{
- InferenceEngine::CNNNetwork network;
+ std::shared_ptr network;
try {
- auto empty = InferenceEngine::Blob::CPtr();
- network = d->core.ReadNetwork(onnx_data, empty);
- } catch (const InferenceEngine::Exception& e) {
- return set_error("[IE exception] ReadNetwork(): "s + e.what());
+ network = d->core.read_model(onnx_data, ov::Tensor());
+ } catch (const ov::Exception& e) {
+ return set_error("[OV exception] ReadNetwork(): "s + e.what());
} catch (const std::exception& e) {
return set_error("[Standard exception] ReadNetwork(): "s + e.what());
}
@@ -689,10 +661,8 @@ static void VS_CC vsOvCreate(
return set_error(err.value());
}
- auto function = network.getFunction(); // mutable
-
try {
- ov::pass::ConstantFolding().run_on_model(function);
+ ov::pass::ConstantFolding().run_on_model(network);
} catch (const ov::Exception & e) {
return set_error(e.what());
}
@@ -701,7 +671,7 @@ static void VS_CC vsOvCreate(
const char * dot_path = vsapi->propGetData(in, "dot_path", 0, &error);
if (!error) {
try {
- ov::pass::VisualizeTree(dot_path, nullptr, true).run_on_model(function);
+ ov::pass::VisualizeTree(dot_path, nullptr, true).run_on_model(network);
} catch (const ov::Exception & e) {
return set_error(e.what());
}
@@ -714,11 +684,11 @@ static void VS_CC vsOvCreate(
if (std::holds_alternative(config_ret)) {
return set_error(std::get(config_ret));
}
- auto & config = std::get>(config_ret);
+ auto & config = std::get(config_ret);
try {
- d->executable_network = d->core.LoadNetwork(network, device, config);
- } catch (const InferenceEngine::Exception & e) {
+ d->executable_network = d->core.compile_model(network, device, config);
+ } catch (const ov::Exception & e) {
return set_error(e.what());
}
@@ -728,9 +698,6 @@ static void VS_CC vsOvCreate(
setDimensions(d->out_vi, d->executable_network, core, vsapi);
- d->input_name = d->executable_network.GetInputsInfo().cbegin()->first;
- d->output_name = d->executable_network.GetOutputsInfo().cbegin()->first;
-
VSCoreInfo core_info;
vsapi->getCoreInfo2(core, &core_info);
d->infer_requests.reserve(core_info.numThreads);
@@ -780,8 +747,10 @@ VS_EXTERNAL_API(void) VapourSynthPluginInit(
vsapi->propSetData(out, "version", VERSION, -1, paReplace);
std::ostringstream ostream;
- ostream << IE_VERSION_MAJOR << '.' << IE_VERSION_MINOR << '.' << IE_VERSION_PATCH;
- vsapi->propSetData(out, "inference_engine_version", ostream.str().c_str(), -1, paReplace);
+ ostream << OPENVINO_VERSION_MAJOR << '.' << OPENVINO_VERSION_MINOR << '.' << OPENVINO_VERSION_PATCH;
+ vsapi->propSetData(out, "openvino_version_build", ostream.str().c_str(), -1, paReplace);
+
+ vsapi->propSetData(out, "openvino_version", ov::get_openvino_version().buildNumber, -1, paReplace);
vsapi->propSetData(
out, "onnx_version",
@@ -798,13 +767,13 @@ VS_EXTERNAL_API(void) VapourSynthPluginInit(
auto availableDevices = [](const VSMap *, VSMap * out, void *, VSCore *, const VSAPI *vsapi) {
try {
- auto core = InferenceEngine::Core();
- auto devices = core.GetAvailableDevices();
+ auto core = ov::Core();
+ auto devices = core.get_available_devices();
for (const auto & device : devices) {
vsapi->propSetData(out, "devices", device.c_str(), -1, paAppend);
}
- } catch (const InferenceEngine::Exception& e) {
- vsapi->setError(out, ("[IE exception] Initialize inference engine: "s + e.what()).c_str());
+ } catch (const ov::Exception& e) {
+ vsapi->setError(out, ("[OV exception] Initialize inference engine: "s + e.what()).c_str());
} catch (const std::exception& e) {
vsapi->setError(out, ("[Standard exception] Initialize inference engine: "s + e.what()).c_str());
}
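
The OpenVINO plugin now targets the ov 2.x API (ov::Core, ov::CompiledModel, ov::AnyMap), and on the Python side the CPU configuration keys shown earlier are selected by runtime version. A condensed sketch of that selection follows; the function and parameter names are chosen for illustration.

    from typing import Tuple

    def ov_cpu_config(version: Tuple[int, int, int], num_streams: int,
                      num_threads: int, bind_thread: bool,
                      fp16: bool, bf16: bool) -> dict:
        if version >= (2024, 0, 0):
            # new-style property names passed to compile_model
            precision = "f16" if fp16 else ("bf16" if bf16 else "f32")
            return {
                "NUM_STREAMS": num_streams,
                "INFERENCE_NUM_THREADS": num_threads,
                "ENABLE_CPU_PINNING": "YES" if bind_thread else "NO",
                "INFERENCE_PRECISION_HINT": precision,
            }
        # legacy Inference Engine keys for older runtimes
        return {
            "CPU_THROUGHPUT_STREAMS": num_streams,
            "CPU_BIND_THREAD": "YES" if bind_thread else "NO",
            "CPU_THREADS_NUM": num_threads,
            "ENFORCE_BF16": "YES" if bf16 else "NO",
        }
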
diff --git a/vsov/win32.cpp b/vsov/win32.cpp
index a6572d7..e0b8d4d 100644
--- a/vsov/win32.cpp
+++ b/vsov/win32.cpp
@@ -14,10 +14,10 @@ namespace {
std::vector dlls = {
// This list must be sorted by dependency.
#ifdef WIN32_SHARED_OPENVINO
- L"tbb.dll",
+ L"tbb12.dll",
L"openvino.dll", // must be the last
#else // WIN32_SHARED_OPENVINO
- L"tbb.dll", // must be the last
+ L"tbb12.dll", // must be the last
#endif // WIN32_SHARED_OPENVINO
};
diff --git a/vstrt/CMakeLists.txt b/vstrt/CMakeLists.txt
index 7bddf7e..212a48f 100644
--- a/vstrt/CMakeLists.txt
+++ b/vstrt/CMakeLists.txt
@@ -8,7 +8,7 @@ set(VAPOURSYNTH_INCLUDE_DIRECTORY "" CACHE PATH "Path to VapourSynth headers")
set(TENSORRT_HOME "" CACHE PATH "Path to TensorRT")
option(USE_NVINFER_PLUGIN "Initialize nvinfer_plugin" FALSE)
option(USE_NVINFER_PLUGIN_STATIC "Use static nvinfer_plugin" FALSE)
-set(CUDNN_HOME "" CACHE PATH "Path to cuDNN")
+set(TENSORRT_LIBRARY_SUFFIX "" CACHE STRING "TensorRT library suffix")
FIND_PACKAGE(CUDAToolkit REQUIRED)
@@ -31,28 +31,21 @@ set_target_properties(vstrt PROPERTIES
)
target_link_directories(vstrt PRIVATE ${TENSORRT_HOME}/lib)
-target_link_libraries(vstrt PRIVATE CUDA::cudart_static nvinfer)
+target_link_libraries(vstrt PRIVATE CUDA::cudart_static "nvinfer${TENSORRT_LIBRARY_SUFFIX}")
if (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
- target_link_options(vstrt PRIVATE "/DELAYLOAD:nvinfer.dll" "delayimp.lib")
+ target_link_options(vstrt PRIVATE "/DELAYLOAD:nvinfer${TENSORRT_LIBRARY_SUFFIX}.dll" "delayimp.lib")
endif()
if (USE_NVINFER_PLUGIN)
add_definitions(-DUSE_NVINFER_PLUGIN)
if (USE_NVINFER_PLUGIN_STATIC)
- target_link_libraries(vstrt PRIVATE nvinfer_plugin_static)
-
- find_library(CUDNN_LIB cudnn HINTS ${CUDNN_ROOT_DIR} PATH_SUFFIXES lib)
- target_link_libraries(vstrt PRIVATE ${CUDNN_LIB} CUDA::cublas)
-
- if (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
- target_link_options(vstrt PRIVATE "/DELAYLOAD:cublas64_11.dll" "/DELAYLOAD:cudnn64_8.dll")
- endif()
+ target_link_libraries(vstrt PRIVATE "nvinfer_plugin_static${TENSORRT_LIBRARY_SUFFIX}")
else()
- target_link_libraries(vstrt PRIVATE nvinfer_plugin)
+ target_link_libraries(vstrt PRIVATE "nvinfer_plugin${TENSORRT_LIBRARY_SUFFIX}")
if (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
- target_link_options(vstrt PRIVATE "/DELAYLOAD:nvinfer_plugin.dll")
+ target_link_options(vstrt PRIVATE "/DELAYLOAD:nvinfer_plugin${TENSORRT_LIBRARY_SUFFIX}.dll")
endif()
endif()
endif()
diff --git a/vstrt/trt_utils.h b/vstrt/trt_utils.h
index 3a2e3c8..3c0feb9 100644
--- a/vstrt/trt_utils.h
+++ b/vstrt/trt_utils.h
@@ -7,7 +7,6 @@
#include
#include
#include
-#include
#include
#include
@@ -255,6 +254,16 @@ size_t getBytesPerSample(nvinfer1::DataType type) noexcept {
return 1;
case nvinfer1::DataType::kUINT8:
return 1;
+#if (NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR) * 10 + NV_TENSORRT_PATCH >= 861
+ case nvinfer1::DataType::kFP8:
+ return 1;
+#endif // (NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR) * 10 + NV_TENSORRT_PATCH >= 861
+#if NV_TENSORRT_MAJOR >= 9
+ case nvinfer1::DataType::kBF16:
+ return 2;
+ case nvinfer1::DataType::kINT64:
+ return 8;
+#endif // NV_TENSORRT_MAJOR >= 9
default:
return 0;
}
@@ -566,11 +575,20 @@ int getSampleType(nvinfer1::DataType type) noexcept {
switch (type) {
case nvinfer1::DataType::kFLOAT:
case nvinfer1::DataType::kHALF:
+#if (NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR) * 10 + NV_TENSORRT_PATCH >= 861
+ case nvinfer1::DataType::kFP8:
+#endif // (NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR) * 10 + NV_TENSORRT_PATCH >= 861
+#if NV_TENSORRT_MAJOR >= 9
+ case nvinfer1::DataType::kBF16:
+#endif // NV_TENSORRT_MAJOR >= 9
return 1;
case nvinfer1::DataType::kINT8:
case nvinfer1::DataType::kINT32:
case nvinfer1::DataType::kBOOL:
case nvinfer1::DataType::kUINT8:
+#if NV_TENSORRT_MAJOR >= 9
+ case nvinfer1::DataType::kINT64:
+#endif // NV_TENSORRT_MAJOR >= 9
return 0;
default:
return -1;
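
As a summary of what the updated trt_utils.h now handles, including the TensorRT 8.6.1+ and 9+ additions, here is an informal Python table of per-sample sizes and the VapourSynth sample category each type maps to; the keys are descriptive names, not the actual nvinfer1::DataType enumerators.

    TRT_DTYPE_INFO = {
        # name: (bytes per sample, VapourSynth sample type)
        "FLOAT": (4, "float"),
        "HALF":  (2, "float"),
        "BF16":  (2, "float"),  # TensorRT >= 9
        "FP8":   (1, "float"),  # TensorRT >= 8.6.1
        "INT8":  (1, "int"),
        "INT32": (4, "int"),
        "INT64": (8, "int"),    # TensorRT >= 9
        "BOOL":  (1, "int"),
        "UINT8": (1, "int"),
    }
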
diff --git a/vstrt/trtexec/CMakeLists.txt b/vstrt/trtexec/CMakeLists.txt
index 0b77c94..482ba83 100644
--- a/vstrt/trtexec/CMakeLists.txt
+++ b/vstrt/trtexec/CMakeLists.txt
@@ -3,6 +3,7 @@ cmake_minimum_required(VERSION 3.20)
project(trtexec LANGUAGES CXX)
set(TENSORRT_HOME "" CACHE PATH "Path to TensorRT")
+set(TENSORRT_LIBRARY_SUFFIX "" CACHE STRING "TensorRT library suffix")
find_package(CUDAToolkit REQUIRED)
@@ -10,16 +11,21 @@ add_executable(trtexec
$<$: trtexec.manifest>
trtexec.cpp
logfile.cpp
+ ../common/bfloat16.cpp
../common/logger.cpp
+ ../common/sampleDevice.cpp
../common/sampleEngines.cpp
../common/sampleInference.cpp
../common/sampleOptions.cpp
../common/sampleReporting.cpp
../common/sampleUtils.cpp
+ ../utils/fileLock.cpp
+ ../utils/timingCache.cpp
)
target_include_directories(trtexec PRIVATE
../common
+ ..
${CUDAToolkit_INCLUDE_DIRS}
${TENSORRT_HOME}/include
)
@@ -31,10 +37,9 @@ target_link_directories(trtexec PRIVATE
target_link_libraries(trtexec PRIVATE
CUDA::cudart_static
- nvinfer
- nvinfer_plugin
- nvparsers
- nvonnxparser
+ "nvinfer${TENSORRT_LIBRARY_SUFFIX}"
+ "nvinfer_plugin${TENSORRT_LIBRARY_SUFFIX}"
+ "nvonnxparser${TENSORRT_LIBRARY_SUFFIX}"
)
install(TARGETS trtexec RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
diff --git a/vstrt/vs_tensorrt.cpp b/vstrt/vs_tensorrt.cpp
index b16402b..0f22832 100644
--- a/vstrt/vs_tensorrt.cpp
+++ b/vstrt/vs_tensorrt.cpp
@@ -366,9 +366,12 @@ static void VS_CC vsTrtCreate(
d->logger.set_verbosity(static_cast(verbosity));
#ifdef USE_NVINFER_PLUGIN
+ // related to https://github.com/AmusementClub/vs-mlrt/discussions/65; the root cause is unknown
+#if !(NV_TENSORRT_MAJOR == 9 && defined(_WIN32))
if (!initLibNvInferPlugins(&d->logger, "")) {
vsapi->logMessage(mtWarning, "vsTrt: Initialize TensorRT plugins failed");
}
+#endif
#endif
std::ifstream engine_stream {
@@ -446,7 +449,17 @@ static void VS_CC vsTrtCreate(
auto input_type = d->engines[0]->getBindingDataType(0);
#endif // NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85
- auto input_sample_type = getSampleType(input_type) == 0 ? stInteger : stFloat;
+ VSSampleType input_sample_type;
+ {
+ auto sample_type = getSampleType(input_type);
+ if (sample_type == 0) {
+ input_sample_type = stInteger;
+ } else if (sample_type == 1) {
+ input_sample_type = stFloat;
+ } else {
+ return set_error("unknown input sample type");
+ }
+ }
auto input_bits_per_sample = getBytesPerSample(input_type) * 8;
if (auto err = checkNodes(in_vis, input_sample_type, input_bits_per_sample); err.has_value()) {
@@ -462,7 +475,17 @@ static void VS_CC vsTrtCreate(
auto output_type = d->engines[0]->getBindingDataType(1);
#endif // NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85
- auto output_sample_type = getSampleType(output_type) == 0 ? stInteger : stFloat;
+ VSSampleType output_sample_type;
+ {
+ auto sample_type = getSampleType(output_type);
+ if (sample_type == 0) {
+ output_sample_type = stInteger;
+ } else if (sample_type == 1) {
+ output_sample_type = stFloat;
+ } else {
+ return set_error("unknown output sample type");
+ }
+ }
auto output_bits_per_sample = getBytesPerSample(output_type) * 8;
setDimensions(
@@ -489,13 +512,22 @@ VS_EXTERNAL_API(void) VapourSynthPluginInit(
VAPOURSYNTH_API_VERSION, 1, plugin
);
+ // TRT 9 for windows does not export getInferLibVersion()
+#if NV_TENSORRT_MAJOR == 9 && defined(_WIN32)
+ auto test = getPluginRegistry();
+
+ if (test == nullptr) {
+ std::fprintf(stderr, "vstrt: TensorRT failed to load.\n");
+ return;
+ }
+#else // NV_TENSORRT_MAJOR == 9 && defined(_WIN32)
int ver = getInferLibVersion(); // must ensure this is the first nvinfer function called
#ifdef _WIN32
if (ver == 0) { // a sentinel value, see dummy function in win32.cpp.
std::fprintf(stderr, "vstrt: TensorRT failed to load.\n");
return;
}
-#endif
+#endif // _WIN32
if (ver != NV_TENSORRT_VERSION) {
#if NV_TENSORRT_MAJOR >= 10
std::fprintf(
@@ -513,6 +545,7 @@ VS_EXTERNAL_API(void) VapourSynthPluginInit(
);
#endif // NV_TENSORRT_MAJOR >= 10
}
+#endif // NV_TENSORRT_MAJOR == 9 && defined(_WIN32)
myself = plugin;
@@ -535,7 +568,12 @@ VS_EXTERNAL_API(void) VapourSynthPluginInit(
vsapi->propSetData(
out, "tensorrt_version",
- std::to_string(getInferLibVersion()).c_str(), -1, paReplace
+#if NV_TENSORRT_MAJOR == 9 && defined(_WIN32)
+ std::to_string(NV_TENSORRT_VERSION).c_str(),
+#else
+ std::to_string(getInferLibVersion()).c_str(),
+#endif
+ -1, paReplace
);
vsapi->propSetData(
diff --git a/vstrt/win32.cpp b/vstrt/win32.cpp
index a0cfcfc..b984803 100644
--- a/vstrt/win32.cpp
+++ b/vstrt/win32.cpp
@@ -10,22 +10,32 @@
#include
+#include
+
+#if NV_TENSORRT_VERSION >= 100100
+// two-level expansion: the version argument (e.g. NV_TENSORRT_MAJOR) must be
+// macro-expanded before it is stringized into the DLL name
+#define CONCAT_VERSION_IMPL(name, version) (name "_" #version ".dll")
+#define CONCAT_VERSION(name, version) CONCAT_VERSION_IMPL(name, version)
+#endif // NV_TENSORRT_VERSION >= 100100
+
namespace {
std::vector dlls = {
// This list must be sorted by dependency.
- L"zlibwapi.dll", // cuDNN version 8.3.0+ depends on zlib as a shared library dependency
- L"cudnn_ops_infer64_8.dll",
- L"cudnn_cnn_infer64_8.dll",
- L"cudnn64_8.dll",
+#if NV_TENSORRT_VERSION >= 100100
+#ifdef USE_NVINFER_PLUGIN
+ // nvinfer_plugin dependencies
+ CONCAT_VERSION(L"nvinfer", NV_TENSORRT_MAJOR),
+ CONCAT_VERSION(L"nvinfer_plugin", NV_TENSORRT_MAJOR),
+#endif // USE_NVINFER_PLUGIN
+ // Finally, nvinfer again.
+ CONCAT_VERSION(L"nvinfer", NV_TENSORRT_MAJOR), // must be the last
+#else // NV_TENSORRT_VERSION >= 100100
#ifdef USE_NVINFER_PLUGIN
// nvinfer_plugin dependencies
L"nvinfer.dll",
- L"cublasLt64_11.dll",
- L"cublas64_11.dll",
L"nvinfer_plugin.dll",
-#endif
+#endif // USE_NVINFER_PLUGIN
// Finally, nvinfer again.
L"nvinfer.dll", // must be the last
+#endif // NV_TENSORRT_VERSION >= 100100
};
namespace fs = std::filesystem;
@@ -64,7 +74,11 @@ FARPROC loadDLLs() {
return (FARPROC)h;
}
+#if NV_TENSORRT_MAJOR == 9 && defined(_WIN32)
+static void * dummy() { // mimic getPluginRegistry
+#else
static int dummy() { // mimic getInferLibVersion
+#endif
return 0;
}
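
As a recap of the Windows preload change in vstrt/win32.cpp: for TensorRT >= 10.1 the patch switches to version-suffixed DLL names (e.g. nvinfer_10.dll), keeps the plain names for older releases, and nvinfer must remain last in the list. The small helper below is illustrative only (not part of the patch) and reproduces the resulting list.

    from typing import List

    def tensorrt_preload_dlls(version: tuple, use_nvinfer_plugin: bool) -> List[str]:
        major = version[0]
        suffix = f"_{major}" if version >= (10, 1, 0) else ""
        dlls: List[str] = []
        if use_nvinfer_plugin:
            # nvinfer_plugin and its dependency first
            dlls += [f"nvinfer{suffix}.dll", f"nvinfer_plugin{suffix}.dll"]
        dlls.append(f"nvinfer{suffix}.dll")  # must be the last
        return dlls

    # tensorrt_preload_dlls((10, 1, 0), True)
    #   -> ['nvinfer_10.dll', 'nvinfer_plugin_10.dll', 'nvinfer_10.dll']
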