diff --git a/.github/workflows/linux-ov.yml b/.github/workflows/linux-ov.yml
index 6fbd1f1..fb03816 100644
--- a/.github/workflows/linux-ov.yml
+++ b/.github/workflows/linux-ov.yml
@@ -3,7 +3,6 @@ name: Build (Linux-OV)
on:
push:
paths:
- - 'common/**'
- 'vsov/**'
- '.github/workflows/linux-ov.yml'
workflow_dispatch:
@@ -18,7 +17,7 @@ jobs:
steps:
- name: Checkout repo
- uses: actions/checkout@v3
+ uses: actions/checkout@v4
with:
fetch-depth: 0
@@ -27,17 +26,17 @@ jobs:
- name: Cache protobuf
id: cache-protobuf
- uses: actions/cache@v3
+ uses: actions/cache@v4
with:
path: vsov/protobuf/install
key: ${{ runner.os }}-vsov-protobuf-v1
- name: Checkout protobuf
- uses: actions/checkout@v3
+ uses: actions/checkout@v4
if: steps.cache-protobuf.outputs.cache-hit != 'true'
with:
repository: protocolbuffers/protobuf
- # follows protobuf in https://github.com/openvinotoolkit/openvino/tree/2023.2.0/thirdparty/protobuf
+ # follows protobuf in https://github.com/openvinotoolkit/openvino/tree/2024.0.0/thirdparty/protobuf
# if you change this, remember to bump the version of the cache key.
ref: fe271ab76f2ad2b2b28c10443865d2af21e27e0e
fetch-depth: 1
@@ -60,19 +59,19 @@ jobs:
- name: Cache onnx
id: cache-onnx
- uses: actions/cache@v3
+ uses: actions/cache@v4
with:
path: vsov/onnx/install
key: ${{ runner.os }}-vsov-onnx-v1
- name: Checkout onnx
if: steps.cache-onnx.outputs.cache-hit != 'true'
- uses: actions/checkout@v3
+ uses: actions/checkout@v4
with:
repository: onnx/onnx
- # follows onnx in https://github.com/openvinotoolkit/openvino/tree/2023.2.0/thirdparty/onnx
+ # follows onnx in https://github.com/openvinotoolkit/openvino/tree/2024.0.0/thirdparty/onnx
# if you change this, remember to bump the version of the cache key.
- ref: 1014f41f17ecc778d63e760a994579d96ba471ff
+ ref: b86cc54efce19530fb953e4b21f57e6b3888534c
fetch-depth: 1
path: vsov/onnx
@@ -104,7 +103,7 @@ jobs:
- name: Setup OpenVINO
run: |
- curl -L -o ov.tgz https://storage.openvinotoolkit.org/repositories/openvino/packages/2023.2/linux/l_openvino_toolkit_ubuntu22_2023.2.0.13089.cfd42bd2cb0_x86_64.tgz
+ curl -L -o ov.tgz https://storage.openvinotoolkit.org/repositories/openvino/packages/2024.0/linux/l_openvino_toolkit_ubuntu22_2024.0.0.14509.34caeefd078_x86_64.tgz
tar -xf ov.tgz
mv l_openvino_* openvino -v
@@ -113,7 +112,6 @@ jobs:
-D CMAKE_BUILD_TYPE=Release
-D CMAKE_CXX_FLAGS="-Wall -ffast-math -march=x86-64-v3"
-D VAPOURSYNTH_INCLUDE_DIRECTORY="`pwd`/vapoursynth/include"
- -D InferenceEngine_DIR=openvino/runtime/cmake
-D OpenVINO_DIR=openvino/runtime/cmake
-D ENABLE_VISUALIZATION=ON
-D WIN32_SHARED_OPENVINO=ON
@@ -135,7 +133,7 @@ jobs:
run: git describe --tags --long
- name: Upload
- uses: actions/upload-artifact@v3
+ uses: actions/upload-artifact@v4
with:
name: VSOV-Linux-x64
path: vsov/artifact
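
The protobuf and onnx commits pinned above are meant to track the submodules of the OpenVINO tag being built against, and the cache keys must be bumped whenever they change. A small stand-alone helper for re-checking those pins, assuming the GitHub contents API (which reports submodule entries together with their pinned sha); the repository and paths mirror the comments in the workflow:

import json
import urllib.request

def submodule_pins(repo, path, ref):
    """Return {path: sha} for every submodule found under `path` at `ref`."""
    url = f"https://api.github.com/repos/{repo}/contents/{path}?ref={ref}"
    with urllib.request.urlopen(url) as resp:
        entries = json.load(resp)
    if isinstance(entries, dict):  # `path` itself is a single entry
        entries = [entries]
    return {e["path"]: e["sha"] for e in entries if e.get("type") == "submodule"}

# compare against the refs pinned in the workflow above
print(submodule_pins("openvinotoolkit/openvino", "thirdparty/protobuf", "2024.0.0"))
print(submodule_pins("openvinotoolkit/openvino", "thirdparty/onnx", "2024.0.0"))
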
diff --git a/.github/workflows/linux-trt.yml b/.github/workflows/linux-trt.yml
index 1efce0b..c97322b 100644
--- a/.github/workflows/linux-trt.yml
+++ b/.github/workflows/linux-trt.yml
@@ -17,7 +17,7 @@ jobs:
steps:
- name: Checkout repo
- uses: actions/checkout@v3
+ uses: actions/checkout@v4
with:
fetch-depth: 0
@@ -30,20 +30,16 @@ jobs:
unzip -q vs.zip
mv vapoursynth*/ vapoursynth
- - name: Setup CUDA
+ - name: Setup CUDA and TensorRT
run: |
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
sudo dpkg -i cuda-keyring_1.1-1_all.deb
sudo apt-get update
- sudo apt-get install -y cuda-nvcc-12-1 cuda-cudart-dev-12-1 libcublas-dev-12-1 libcudnn8=8.9.0.131-1+cuda12.1 libcudnn8-dev=8.9.0.131-1+cuda12.1
+ sudo apt-get install -y cuda-nvcc-12-4 cuda-cudart-dev-12-4 libnvinfer-dev=10.0.1.6-1+cuda12.4 libnvinfer-headers-dev=10.0.1.6-1+cuda12.4
echo "PATH=/usr/local/cuda/bin${PATH:+:${PATH}}" >> $GITHUB_ENV
echo "CUDA_PATH=/usr/local/cuda" >> $GITHUB_ENV
echo "LD_LIBRARY_PATH=/usr/local/cuda/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}" >> $GITHUB_ENV
- - name: Setup tensorrt
- run: |
- sudo apt-get install -y libnvinfer-dev=8.6.1.6-1+cuda12.0 libnvinfer-headers-dev=8.6.1.6-1+cuda12.0
-
- name: Configure
run: cmake -S . -B build -G Ninja -LA
-D CMAKE_BUILD_TYPE=Release
@@ -66,8 +62,9 @@ jobs:
run: git describe --tags --long
- name: Upload
- uses: actions/upload-artifact@v3
+ uses: actions/upload-artifact@v4
with:
name: VSTRT-Linux-x64
path: vstrt/artifact
+ overwrite: true
diff --git a/.github/workflows/windows-cuda-dependency.yml b/.github/workflows/windows-cuda-dependency.yml
index feba4fd..a53e389 100644
--- a/.github/workflows/windows-cuda-dependency.yml
+++ b/.github/workflows/windows-cuda-dependency.yml
@@ -28,63 +28,41 @@ jobs:
shell: bash
steps:
- # Note: unfortunately, cuDNN download requires registration and according to its license
- # agreements, we can only redistribute it along with an application that uses it, so we
- # have to use a private repository to hold a copy of the library.
- name: Download cuDNN inference library
- run: |
- # https://stackoverflow.com/questions/20396329/how-to-download-github-release-from-private-repo-using-command-line/35688093#35688093
-
- tag=cudnn-8.6.0-bin
- name=cudnn.7z
-
- AUTH="Authorization: token ${{ secrets.REPO_TOKEN }}"
- response=$(curl -sH "$AUTH" https://api.github.com/repos/AmusementClub/cuda/releases/tags/$tag)
- eval $(echo "$response" | grep -C3 "name.:.\+$NAME" | grep -w id | tr : = | tr -cd '[[:alnum:]]=')
- [ "$id" ] || { echo "Error: Failed to get asset id, response: $response" | awk 'length($0)<100' >&2; exit 1; }
- ASSET="https://api.github.com/repos/AmusementClub/cuda/releases/assets/$id"
- curl -LJO -H "$AUTH" -H 'Accept: application/octet-stream' "$ASSET"
+ run: curl -LJ https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/windows-x86_64/cudnn-windows-x86_64-8.9.7.29_cuda12-archive.zip -o cudnn.zip
- name: Extract cuDNN library
- run: 7z x cudnn.7z
+ run: unzip cudnn.zip
- name: Move cuDNN library
- run: mv cudnn/bin vsmlrt-cuda -v
+ run: |
+ mkdir -p vsmlrt-cuda
+ mv cudnn-windows-*/bin/*.dll vsmlrt-cuda/ -v
+ rm vsmlrt-cuda/cudnn_*_train*.dll -v
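
The cuDNN step now pulls the public redist archive and keeps only the inference DLLs, dropping the cudnn_*_train* libraries. A rough Python equivalent of those three steps for local experimentation; the URL is the one used above, while the paths and function name are illustrative:

import fnmatch
import shutil
import urllib.request
import zipfile
from pathlib import Path

CUDNN_URL = (
    "https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/"
    "windows-x86_64/cudnn-windows-x86_64-8.9.7.29_cuda12-archive.zip"
)

def fetch_cudnn_inference_dlls(dest):
    dest = Path(dest)
    dest.mkdir(parents=True, exist_ok=True)
    archive, _ = urllib.request.urlretrieve(CUDNN_URL, "cudnn.zip")
    with zipfile.ZipFile(archive) as zf:
        zf.extractall("cudnn_tmp")
    for dll in Path("cudnn_tmp").glob("cudnn-windows-*/bin/*.dll"):
        # training libraries are not needed at inference time
        if fnmatch.fnmatch(dll.name, "cudnn_*_train*.dll"):
            continue
        shutil.copy2(dll, dest / dll.name)

if __name__ == "__main__":
    fetch_cudnn_inference_dlls("vsmlrt-cuda")
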
- # Same licensing issue with the tensorrt libraries.
- name: Download TensorRT library
run: |
- # https://stackoverflow.com/questions/20396329/how-to-download-github-release-from-private-repo-using-command-line/35688093#35688093
-
- tag=tensorrt-8.5.1-bin
- name=TensorRT.7z
-
- AUTH="Authorization: token ${{ secrets.REPO_TOKEN }}"
- response=$(curl -sH "$AUTH" https://api.github.com/repos/AmusementClub/cuda/releases/tags/$tag)
- eval $(echo "$response" | grep -C3 "name.:.\+$NAME" | grep -w id | tr : = | tr -cd '[[:alnum:]]=')
- [ "$id" ] || { echo "Error: Failed to get asset id, response: $response" | awk 'length($0)<100' >&2; exit 1; }
- ASSET="https://api.github.com/repos/AmusementClub/cuda/releases/assets/$id"
- curl -LJO -H "$AUTH" -H 'Accept: application/octet-stream' "$ASSET"
-
- mv "$name" trt.7z
+ curl -L -o trt.zip https://developer.download.nvidia.com/compute/machine-learning/tensorrt/10.0.1/zip/TensorRT-10.0.1.6.Windows10.win10.cuda-12.4.zip
- name: Extract TensorRT library
- run: 7z x trt.7z
+ run: |
+ unzip trt.zip
+ mv TensorRT-*/ TensorRT/
- name: Move TensorRT library
- run: mv TensorRT/*.dll vsmlrt-cuda -v
+ run: mv TensorRT/lib/*.dll vsmlrt-cuda -v
- name: Download CUDA Libraries
shell: cmd
run: |
- curl -s -o cuda_installer.exe -L https://developer.download.nvidia.com/compute/cuda/11.8.0/network_installers/cuda_11.8.0_windows_network.exe
- cuda_installer.exe -s cudart_11.8 cublas_11.8 cufft_11.8 cupti_11.8 nvrtc_11.8
+ curl -s -o cuda_installer.exe -L https://developer.download.nvidia.com/compute/cuda/12.4.1/network_installers/cuda_12.4.1_windows_network.exe
+ cuda_installer.exe -s cudart_12.4 cublas_12.4 cufft_12.4 cupti_12.4 nvrtc_12.4
- name: Move CUDA Libraries
shell: cmd
run: |
- move "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.8\extras\CUPTI\lib64\cupti*.dll" vsmlrt-cuda
- move "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.8\bin\*.dll" vsmlrt-cuda
+ move "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\extras\CUPTI\lib64\cupti*.dll" vsmlrt-cuda
+ move "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\bin\*.dll" vsmlrt-cuda
del vsmlrt-cuda\cudart32*.dll
- name: Setup VC commands
@@ -99,7 +77,7 @@ jobs:
while true; do
changed=false
for dll in *.[dD][lL][lL]; do
- for dep in $(dumpbin -dependents "$dll" | grep -o -i '\(vc\|msvc\)[a-z0-9_-]*\.dll'); do
+ for dep in $(dumpbin -dependents "$dll" | grep -o -i '\<\(vc\|msvc\)[a-z0-9_-]*\.dll'); do
echo "finding $dep for $dll"
if ! test -f ./"$dep"; then
changed=true
@@ -113,42 +91,17 @@ jobs:
$changed || break
done
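
The loop above walks DLL dependencies with dumpbin until it reaches a fixed point, so that any VC/MSVC runtime DLLs referenced but not yet present get picked up. The same search sketched in Python, assuming dumpbin.exe is on PATH (e.g. from a VC developer prompt) and the DLLs sit in the working directory; the copy step that follows in the workflow is outside this hunk and omitted here:

import re
import subprocess
from pathlib import Path

def vc_runtime_deps(dll):
    """VC/MSVC runtime DLLs reported by `dumpbin /dependents` for one DLL."""
    out = subprocess.run(
        ["dumpbin", "/dependents", str(dll)],
        capture_output=True, text=True, check=True,
    ).stdout
    return {m.lower() for m in re.findall(r"\b(?:vc|msvc)[a-z0-9_-]*\.dll", out, re.I)}

def missing_runtime_dlls(directory):
    """Repeat the scan until no DLL reports a dependency that is absent."""
    directory = Path(directory)
    missing = set()
    while True:
        changed = False
        for dll in directory.glob("*.dll"):
            for dep in vc_runtime_deps(dll):
                if not (directory / dep).exists() and dep not in missing:
                    missing.add(dep)
                    changed = True
        if not changed:
            return missing

if __name__ == "__main__":
    print(missing_runtime_dlls("."))
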
- - name: Cache zlib
- id: cache-zlib
- uses: actions/cache@v3
- with:
- path: ${{ github.workspace }}/zlib-release
- key: ${{ runner.os }}-zlib-1.2.12
-
- - name: Setup msbuild
- if: steps.cache-zlib.outputs.cache-hit != 'true'
- uses: microsoft/setup-msbuild@v1
-
- - name: Compile zlib
- if: steps.cache-zlib.outputs.cache-hit != 'true'
- shell: cmd
- run: |
- curl -s -o zlib.tar.gz -LJO https://github.com/madler/zlib/archive/refs/tags/v1.2.12.tar.gz
- tar -xf zlib.tar.gz
- cd zlib-1.2.12\contrib\vstudio\vc14
- python -c "f=open('zlibvc.vcxproj','r+b');s=f.read();s=s.replace(b'MultiThreadedDLL',b'MultiThreaded');start=s.index(b'');end_str=b'\n';end=s.index(end_str,start);s=s[:start]+s[end+len(end_str):];f.seek(0);f.write(s);f.truncate();f.close()"
- msbuild zlibvc.sln -target:zlibvc /p:Configuration=ReleaseWithoutAsm /p:Platform=x64 /p:PlatformToolset=v143 -maxCpuCount
- mkdir %GITHUB_WORKSPACE%\zlib-release
- move x64\ZlibDllReleaseWithoutAsm\zlibwapi.dll %GITHUB_WORKSPACE%\zlib-release
-
- - name: Copy zlib
- run: cp -v zlib-release/zlibwapi.dll vsmlrt-cuda
-
- name: Compress
run: |
7z a -t7z -mx=3 vsmlrt-cuda.7z vsmlrt-cuda
- name: Upload
- uses: actions/upload-artifact@v3
+ uses: actions/upload-artifact@v4
with:
name: vsmlrt-cuda
path: vsmlrt-cuda.7z
retention-days: 1
+ compression-level: 0
- name: Rename release asset
run: |
diff --git a/.github/workflows/windows-ncnn.yml b/.github/workflows/windows-ncnn.yml
index 74c93c9..31a425b 100644
--- a/.github/workflows/windows-ncnn.yml
+++ b/.github/workflows/windows-ncnn.yml
@@ -39,7 +39,7 @@ jobs:
steps:
- name: Checkout repo
- uses: actions/checkout@v3
+ uses: actions/checkout@v4
with:
fetch-depth: 0
@@ -48,13 +48,13 @@ jobs:
- name: Cache protobuf
id: cache-protobuf
- uses: actions/cache@v3
+ uses: actions/cache@v4
with:
path: vsncnn/protobuf/install
key: ${{ runner.os }}-vsncnn-protobuf-v3.16.0
- name: Checkout protobuf
- uses: actions/checkout@v3
+ uses: actions/checkout@v4
if: steps.cache-protobuf.outputs.cache-hit != 'true'
with:
repository: protocolbuffers/protobuf
@@ -81,14 +81,14 @@ jobs:
- name: Cache onnx
id: cache-onnx
- uses: actions/cache@v3
+ uses: actions/cache@v4
with:
path: vsncnn/onnx/install
key: ${{ runner.os }}-vsncnn-onnx-v1.12.0-protobuf-v3.16.0
- name: Checkout onnx
if: steps.cache-onnx.outputs.cache-hit != 'true'
- uses: actions/checkout@v3
+ uses: actions/checkout@v4
with:
repository: onnx/onnx
ref: v1.12.0
@@ -161,7 +161,7 @@ jobs:
copy install\bin\vsncnn.dll artifact\
- name: Upload
- uses: actions/upload-artifact@v3
+ uses: actions/upload-artifact@v4
with:
name: VSNCNN-GPU-Windows-x64
path: vsncnn/artifact
diff --git a/.github/workflows/windows-ort.yml b/.github/workflows/windows-ort.yml
index 8261e5d..0aafede 100644
--- a/.github/workflows/windows-ort.yml
+++ b/.github/workflows/windows-ort.yml
@@ -29,7 +29,7 @@ jobs:
steps:
- name: Checkout repo
- uses: actions/checkout@v3
+ uses: actions/checkout@v4
with:
fetch-depth: 0
@@ -41,17 +41,17 @@ jobs:
- name: Cache protobuf
id: cache-protobuf
- uses: actions/cache@v3
+ uses: actions/cache@v4
with:
path: vsort/protobuf/install
key: ${{ runner.os }}-vsort-protobuf-v4
- name: Checkout protobuf
- uses: actions/checkout@v3
+ uses: actions/checkout@v4
if: steps.cache-protobuf.outputs.cache-hit != 'true'
with:
repository: protocolbuffers/protobuf
- # follows protobuf in https://github.com/AmusementClub/onnxruntime/blob/master/cmake/external/onnxruntime_external_deps.cmake#L161
+ # follows protobuf in https://github.com/AmusementClub/onnxruntime/blob/master/cmake/external/onnxruntime_external_deps.cmake#L203
# if you change this, remember to bump the version of the cache key.
ref: v3.21.12
fetch-depth: 1
@@ -73,19 +73,19 @@ jobs:
- name: Cache onnx
id: cache-onnx
- uses: actions/cache@v3
+ uses: actions/cache@v4
with:
path: vsort/onnx/install
- key: ${{ runner.os }}-vsort-onnx-v4
+ key: ${{ runner.os }}-vsort-onnx-v5
- name: Checkout onnx
if: steps.cache-onnx.outputs.cache-hit != 'true'
- uses: actions/checkout@v3
+ uses: actions/checkout@v4
with:
repository: onnx/onnx
# follows onnx in https://github.com/AmusementClub/onnxruntime/tree/master/cmake/external
# if you change this, remember to bump the version of the cache key.
- ref: a0d77f18516d2da7468a96b0de3b737266f23176
+ ref: 990217f043af7222348ca8f0301e17fa7b841781
fetch-depth: 1
path: vsort/onnx
@@ -116,21 +116,21 @@ jobs:
- name: Download ONNX Runtime Precompilation
run: |
- curl -s -o ortgpu.zip -LJO https://github.com/AmusementClub/onnxruntime/releases/download/orttraining_rc2-5943-g73584f936-230528-0922/onnxruntime-gpu-win64.zip
+ curl -s -o ortgpu.zip -LJO https://github.com/AmusementClub/onnxruntime/releases/download/orttraining_rc2-7983-g9001c69b84-240419-0832/onnxruntime-gpu-win64.zip
unzip -q ortgpu.zip
- name: Cache CUDA
id: cache-cuda
- uses: actions/cache@v3
+ uses: actions/cache@v4
with:
path: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA
- key: ${{ runner.os }}-cuda-11.8.0
+ key: ${{ runner.os }}-cuda-12.4.0
- name: Setup CUDA
if: steps.cache-cuda.outputs.cache-hit != 'true'
run: |
- curl -s -o cuda_installer.exe -L https://developer.download.nvidia.com/compute/cuda/11.8.0/network_installers/cuda_11.8.0_windows_network.exe
- cuda_installer.exe -s nvcc_11.8 cudart_11.8
+ curl -s -o cuda_installer.exe -L https://developer.download.nvidia.com/compute/cuda/12.4.0/network_installers/cuda_12.4.0_windows_network.exe
+ cuda_installer.exe -s nvcc_12.4 cudart_12.4
- name: Configure
run: cmake -S . -B build -G Ninja -LA
@@ -139,10 +139,10 @@ jobs:
-D VAPOURSYNTH_INCLUDE_DIRECTORY=vapoursynth\include
-D protobuf_DIR=protobuf\install\cmake
-D ONNX_DIR=onnx\install\lib\cmake\ONNX
- -D ONNX_RUNTIME_API_DIRECTORY=onnxruntime-gpu\include\onnxruntime\core\session
+ -D ONNX_RUNTIME_API_DIRECTORY=onnxruntime-gpu\include\onnxruntime
-D ONNX_RUNTIME_LIB_DIRECTORY=onnxruntime-gpu\lib
-D ENABLE_CUDA=1
- -D CUDAToolkit_ROOT="C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.8"
+ -D CUDAToolkit_ROOT="C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4"
-D ENABLE_DML=1
-D CMAKE_CXX_STANDARD=20
@@ -161,12 +161,12 @@ jobs:
- name: Download DirectML Library
# follows DirectML in https://github.com/AmusementClub/onnxruntime/blob/master/cmake/external/dml.cmake#L44
run: |
- curl -s -o directml.nupkg -LJO https://www.nuget.org/api/v2/package/Microsoft.AI.DirectML/1.12.0
+ curl -s -o directml.nupkg -LJO https://www.nuget.org/api/v2/package/Microsoft.AI.DirectML/1.14.1
unzip -q directml.nupkg -d dml
copy dml\bin\x64-win\DirectML.dll artifact\vsort\
- name: Upload
- uses: actions/upload-artifact@v3
+ uses: actions/upload-artifact@v4
with:
name: VSORT-Windows-x64
path: vsort/artifact
@@ -199,7 +199,7 @@ jobs:
- name: Create script
shell: bash
- run: echo "import vapoursynth as vs;from vapoursynth import core;import sys;print(core.ort, file=sys.stderr);core.std.BlankClip(format=vs.RGBS).ort.Model(r\"waifu2x\\upconv_7_anime_style_art_rgb\\scale2.0x_model.onnx\", builtin=True).resize.Bicubic(format=vs.YUV420P10, matrix_s='709').set_output()" > test.vpy
+ run: echo "import vapoursynth as vs;from vapoursynth import core;import sys;print(core.ort, file=sys.stderr);print(core.ort.Version(),file=sys.stderr);core.std.BlankClip(format=vs.RGBS).ort.Model(r\"waifu2x\\upconv_7_anime_style_art_rgb\\scale2.0x_model.onnx\", builtin=True).resize.Bicubic(format=vs.YUV420P10, matrix_s='709').set_output()" > test.vpy
- name: Run vspipe
shell: bash
@@ -221,8 +221,40 @@ jobs:
shell: bash
run: |
set -ex
- vs_portable/vspipe -i test.vpy -
- vs_portable/vspipe --y4m -p -e 9 test.vpy - | vs_portable/x265 --log-file x265.log --log-file-level info --y4m -D 10 --preset ultrafast -o out.hevc -
+ vs_portable/vspipe -i test_fp16.vpy -
+ vs_portable/vspipe --y4m -p -e 9 test_fp16.vpy - | vs_portable/x265 --log-file x265.log --log-file-level info --y4m -D 10 --preset ultrafast -o out.hevc -
+ ls -l out.hevc x265.log
+ cat x265.log
+ grep -F 'encoded 10 frames' x265.log || exit 2
+ grep -i 'error' x265.log && exit 1
+ exit 0
+
+ - name: Create script (fp16 input)
+ shell: bash
+ run: echo "import vapoursynth as vs;from vapoursynth import core;import sys;print(core.ort, file=sys.stderr);flt=core.std.BlankClip(format=vs.RGBH).ort.Model(r\"waifu2x\\upconv_7_anime_style_art_rgb\\scale2.0x_model.onnx\", builtin=True, fp16=True);print(flt,file=sys.stderr);flt.resize.Bicubic(format=vs.YUV420P10, matrix_s='709').set_output()" > test_fp16_input.vpy
+
+ - name: Run vspipe (fp16 input)
+ shell: bash
+ run: |
+ set -ex
+ vs_portable/vspipe -i test_fp16_input.vpy -
+ vs_portable/vspipe --y4m -p -e 9 test_fp16_input.vpy - | vs_portable/x265 --log-file x265.log --log-file-level info --y4m -D 10 --preset ultrafast -o out.hevc -
+ ls -l out.hevc x265.log
+ cat x265.log
+ grep -F 'encoded 10 frames' x265.log || exit 2
+ grep -i 'error' x265.log && exit 1
+ exit 0
+
+ - name: Create script (fp16 output)
+ shell: bash
+ run: echo "import vapoursynth as vs;from vapoursynth import core;import sys;print(core.ort, file=sys.stderr);flt=core.std.BlankClip(format=vs.RGBS).ort.Model(r\"waifu2x\\upconv_7_anime_style_art_rgb\\scale2.0x_model.onnx\", builtin=True, fp16=True, output_format=1);print(flt,file=sys.stderr);flt.resize.Bicubic(format=vs.YUV420P10, matrix_s='709').set_output()" > test_fp16_output.vpy
+
+ - name: Run vspipe (fp16 output)
+ shell: bash
+ run: |
+ set -ex
+ vs_portable/vspipe -i test_fp16_output.vpy -
+ vs_portable/vspipe --y4m -p -e 9 test_fp16_output.vpy - | vs_portable/x265 --log-file x265.log --log-file-level info --y4m -D 10 --preset ultrafast -o out.hevc -
ls -l out.hevc x265.log
cat x265.log
grep -F 'encoded 10 frames' x265.log || exit 2
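
For reference, the fp16-input one-liner created above is easier to read when expanded into an ordinary VapourSynth script; this is the same script reformatted, with the model path and options unchanged:

import sys
import vapoursynth as vs
from vapoursynth import core

print(core.ort, file=sys.stderr)

flt = core.std.BlankClip(format=vs.RGBH).ort.Model(
    r"waifu2x\upconv_7_anime_style_art_rgb\scale2.0x_model.onnx",
    builtin=True,
    fp16=True,
)
print(flt, file=sys.stderr)

flt.resize.Bicubic(format=vs.YUV420P10, matrix_s="709").set_output()
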
diff --git a/.github/workflows/windows-ov.yml b/.github/workflows/windows-ov.yml
index 3dfae3d..a47fc91 100644
--- a/.github/workflows/windows-ov.yml
+++ b/.github/workflows/windows-ov.yml
@@ -39,7 +39,7 @@ jobs:
steps:
- name: Checkout repo
- uses: actions/checkout@v3
+ uses: actions/checkout@v4
with:
fetch-depth: 0
@@ -51,19 +51,19 @@ jobs:
- name: Cache protobuf
id: cache-protobuf
- uses: actions/cache@v3
+ uses: actions/cache@v4
with:
path: vsov/protobuf/install
- key: ${{ runner.os }}-vsov-protobuf-v1
+ key: ${{ runner.os }}-vsov-protobuf-v2
- name: Checkout protobuf
- uses: actions/checkout@v3
+ uses: actions/checkout@v4
if: steps.cache-protobuf.outputs.cache-hit != 'true'
with:
repository: protocolbuffers/protobuf
- # follows protobuf in https://github.com/AmusementClub/openvino/tree/master/thirdparty/protobuf
+ # follows protobuf in https://github.com/openvinotoolkit/openvino/tree/2024.0.0/thirdparty/protobuf
# if you change this, remember to bump the version of the cache key.
- ref: 6c6b0778b70f35f93c2f0dee30e5d12ad2a83eea
+ ref: fe271ab76f2ad2b2b28c10443865d2af21e27e0e
fetch-depth: 1
path: vsov/protobuf
@@ -83,19 +83,19 @@ jobs:
- name: Cache onnx
id: cache-onnx
- uses: actions/cache@v3
+ uses: actions/cache@v4
with:
path: vsov/onnx/install
- key: ${{ runner.os }}-vsov-onnx-v2
+ key: ${{ runner.os }}-vsov-onnx-v3
- name: Checkout onnx
if: steps.cache-onnx.outputs.cache-hit != 'true'
- uses: actions/checkout@v3
+ uses: actions/checkout@v4
with:
repository: onnx/onnx
- # follows onnx in https://github.com/AmusementClub/openvino/tree/master/thirdparty/onnx
+ # follows onnx in https://github.com/openvinotoolkit/openvino/tree/2024.0.0/thirdparty/onnx
# if you change this, remember to bump the version of the cache key.
- ref: f7ee1ac60d06abe8e26c9b6bbe1e3db5286b614b
+ ref: b86cc54efce19530fb953e4b21f57e6b3888534c
fetch-depth: 1
path: vsov/onnx
@@ -124,23 +124,16 @@ jobs:
unzip -q vs.zip
mv vapoursynth-*/ vapoursynth/
- - name: Download OpenVINO Runtime Precompilation
- shell: bash
+ - name: Setup OpenVINO
run: |
- rev="${{github.event.inputs.ov_tag || inputs.ov_tag || 'latest'}}"
- if [ "$rev" == "latest" ]; then
- url="https://github.com/AmusementClub/openvino/releases/latest/download/openvino-gpu-win64.zip"
- else
- url="https://github.com/AmusementClub/openvino/releases/download/$rev/openvino-gpu-win64.zip"
- fi
- curl -s -o openvino.zip -LJO "$url"
- unzip -q openvino.zip
+ curl -L -o ov.zip https://storage.openvinotoolkit.org/repositories/openvino/packages/2024.0/windows/w_openvino_toolkit_windows_2024.0.0.14509.34caeefd078_x86_64.zip
+ unzip ov.zip
+ rename w_openvino_toolkit_windows_2024.0.0.14509.34caeefd078_x86_64 openvino
- name: Configure
run: cmake -S . -B build -G Ninja -D CMAKE_BUILD_TYPE=Release
-D CMAKE_INTERPROCEDURAL_OPTIMIZATION=ON
-D CMAKE_MSVC_RUNTIME_LIBRARY=MultiThreaded
- -D InferenceEngine_DIR=openvino/runtime/cmake
-D OpenVINO_DIR=openvino/runtime/cmake
-D VAPOURSYNTH_INCLUDE_DIRECTORY="%cd%\vapoursynth\include"
-D ENABLE_VISUALIZATION=ON
@@ -156,12 +149,12 @@ jobs:
cmake --install build --prefix install
mkdir artifact
mkdir artifact\vsov
- copy openvino\runtime\3rdparty\tbb\bin\tbb.dll artifact\vsov\
+ copy openvino\runtime\3rdparty\tbb\bin\tbb12.dll artifact\vsov\
copy install\bin\vsov.dll artifact\
xcopy openvino\runtime\bin\intel64\Release\* artifact\vsov\ /s
- name: Upload
- uses: actions/upload-artifact@v3
+ uses: actions/upload-artifact@v4
with:
name: VSOV-Windows-x64
path: vsov/artifact
diff --git a/.github/workflows/windows-release.yml b/.github/workflows/windows-release.yml
index 64318d3..07687b3 100644
--- a/.github/workflows/windows-release.yml
+++ b/.github/workflows/windows-release.yml
@@ -66,7 +66,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Checkout repo
- uses: actions/checkout@v3
+ uses: actions/checkout@v4
- name: Compress scirpts.7z
run: |
@@ -74,7 +74,7 @@ jobs:
7za a -t7z -bb3 -mx=9 ../scripts.${{ github.event.inputs.tag }}.7z .
- name: Upload scripts release
- uses: actions/upload-artifact@v3
+ uses: actions/upload-artifact@v4
with:
name: Scripts
path: scripts
@@ -116,11 +116,12 @@ jobs:
7za a -t7z -bb3 -mx=9 ../models.7z .
- name: Upload model release
- uses: actions/upload-artifact@v3
+ uses: actions/upload-artifact@v4
with:
name: Models
path: release
retention-days: 1
+ compression-level: 0
- name: Download External Models
run: |
@@ -147,11 +148,44 @@ jobs:
7za a -t7z -bb3 -mx=9 ../ext-models.7z .
- name: Upload external model release
- uses: actions/upload-artifact@v3
+ uses: actions/upload-artifact@v4
with:
name: External-Models
path: release
retention-days: 1
+ compression-level: 0
+
+ - name: Download Contributed Models
+ run: |
+ rm -rf release
+ set -ex
+ mkdir -p release/models
+ cd release
+ pushd models
+ for tag in $(echo "${{ github.event.inputs.contrib-model-tags }}" | tr ',' ' '); do
+ echo "Handling tag $tag"
+ curl -s https://api.github.com/repos/AmusementClub/vs-mlrt/releases/tags/"$tag" > release.json
+ for url in $(cat release.json | jq '.assets | .[] | .url ' | tr -d '"'); do
+ echo "Downloading $url"
+ curl -o dl.7z -LJ -H 'Accept: application/octet-stream' "$url"
+ # later release should overwrite earlier ones
+ 7za x -y dl.7z
+ done
+ #test -f "dl.7z" # contrib-models might be empty.
+ rm -f dl.7z release.json
+ done
+ popd
+ ls -lR
+ du -sh
+ 7za a -t7z -bb3 -mx=9 ../contrib-models.7z .
+
+ - name: Upload contrib model release
+ uses: actions/upload-artifact@v4
+ with:
+ name: Contrib-Models
+ path: release
+ retention-days: 1
+ compression-level: 0
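
The added download step enumerates every asset of each contributed-model tag through the GitHub API and unpacks the archives in order, so later releases overwrite earlier ones. A Python sketch of the same loop; the tag names are placeholders for the `contrib-model-tags` workflow input, and extraction is still delegated to 7za:

import json
import subprocess
import urllib.request

API = "https://api.github.com/repos/AmusementClub/vs-mlrt/releases/tags/{tag}"

def download_release_assets(tag):
    with urllib.request.urlopen(API.format(tag=tag)) as resp:
        release = json.load(resp)
    for asset in release.get("assets", []):
        req = urllib.request.Request(
            asset["url"], headers={"Accept": "application/octet-stream"}
        )
        with urllib.request.urlopen(req) as resp, open("dl.7z", "wb") as f:
            f.write(resp.read())
        # later archives overwrite files from earlier ones, as in the workflow
        subprocess.run(["7za", "x", "-y", "dl.7z"], check=True)

for tag in "contrib-model-tag-a,contrib-model-tag-b".split(","):  # placeholder tags
    download_release_assets(tag.strip())
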
- name: Download Contributed Models
run: |
@@ -212,43 +246,43 @@ jobs:
steps:
- name: Download artifact for scripts
- uses: actions/download-artifact@v3
+ uses: actions/download-artifact@v4
with:
name: Scripts
path: scripts-release
- name: Download artifact for models
- uses: actions/download-artifact@v3
+ uses: actions/download-artifact@v4
with:
name: Models
path: models-release
- name: Download artifact for vsov
- uses: actions/download-artifact@v3
+ uses: actions/download-artifact@v4
with:
name: VSOV-Windows-x64
path: vsov-release
- name: Download artifact for vsort
- uses: actions/download-artifact@v3
+ uses: actions/download-artifact@v4
with:
name: VSORT-Windows-x64
path: vsort-release
- name: Download artifact for vstrt
- uses: actions/download-artifact@v3
+ uses: actions/download-artifact@v4
with:
name: VSTRT-Windows-x64
path: vstrt-release
- name: Download artifact for vsncnn
- uses: actions/download-artifact@v3
+ uses: actions/download-artifact@v4
with:
name: VSNCNN-GPU-Windows-x64
path: vsncnn-release
- name: Download artifact for cuda dependencies
- uses: actions/download-artifact@v3
+ uses: actions/download-artifact@v4
with:
name: vsmlrt-cuda
path: cuda-release
@@ -267,12 +301,13 @@ jobs:
7za a -t7z -bb3 -mx=9 ../vsmlrt-windows-x64-cpu.7z .
- name: Upload CPU-only release
- uses: actions/upload-artifact@v3
+ uses: actions/upload-artifact@v4
if: false
with:
name: vsmlrt-cpu-release
path: vsmlrt-windows-x64-cpu.7z
retention-days: 1
+ compression-level: 0
- name: Rename release asset
run: mv vsmlrt-windows-x64-cpu.7z vsmlrt-windows-x64-cpu.${{ github.event.inputs.tag }}.7z
@@ -300,18 +335,19 @@ jobs:
ls -lR
7za a -t7z -bb3 -mx=9 ../vsmlrt-windows-x64-generic-gpu.7z .
- - name: Upload non-CUDA GPU release
- uses: actions/upload-artifact@v3
+ - name: Upload generic GPU release
+ uses: actions/upload-artifact@v4
if: false
with:
name: vsmlrt-generic-gpu-release
path: vsmlrt-windows-x64-generic-gpu.7z
retention-days: 1
+ compression-level: 0
- - name: Rename release asset for non-CUDA GPU release
+ - name: Rename release asset for generic GPU release
run: mv vsmlrt-windows-x64-generic-gpu.7z vsmlrt-windows-x64-generic-gpu.${{ github.event.inputs.tag }}.7z
- - name: Release non-CUDA GPU
+ - name: Release generic GPU
uses: softprops/action-gh-release@v1
with:
tag_name: ${{ github.event.inputs.tag }}
@@ -342,12 +378,13 @@ jobs:
7za a -t7z -bb3 -mx=9 ../vsmlrt-windows-x64-cuda.7z .
- name: Upload CUDA release
- uses: actions/upload-artifact@v3
+ uses: actions/upload-artifact@v4
if: false
with:
name: vsmlrt-cuda-release
path: vsmlrt-windows-x64-cuda.7z
retention-days: 1
+ compression-level: 0
- name: Rename release asset
run: mv vsmlrt-windows-x64-cuda.7z vsmlrt-windows-x64-cuda.${{ github.event.inputs.tag }}.7z
@@ -364,7 +401,7 @@ jobs:
# Update nightly tag.
- name: Checkout repo
if: github.event.inputs.tag == 'nightly'
- uses: actions/checkout@v3
+ uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Overwrite tag
diff --git a/.github/workflows/windows-trt.yml b/.github/workflows/windows-trt.yml
index 735e6eb..5105ab1 100644
--- a/.github/workflows/windows-trt.yml
+++ b/.github/workflows/windows-trt.yml
@@ -31,7 +31,7 @@ jobs:
steps:
- name: Checkout repo
- uses: actions/checkout@v3
+ uses: actions/checkout@v4
with:
fetch-depth: 0
@@ -43,24 +43,23 @@ jobs:
- name: Cache CUDA
id: cache-cuda
- uses: actions/cache@v3
+ uses: actions/cache@v4
with:
path: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA
- key: ${{ runner.os }}-vstrt-cuda-11.8.0
+ key: ${{ runner.os }}-vstrt-cuda-12.4.1
+ save-always: true
- name: Setup CUDA
if: steps.cache-cuda.outputs.cache-hit != 'true'
run: |
- curl -s -o cuda_installer.exe -L https://developer.download.nvidia.com/compute/cuda/11.8.0/network_installers/cuda_11.8.0_windows_network.exe
- cuda_installer.exe -s nvcc_11.8 cudart_11.8 nvprof_11.8 cuda_profiler_api_11.8
+ curl -s -o cuda_installer.exe -L https://developer.download.nvidia.com/compute/cuda/12.4.1/network_installers/cuda_12.4.1_windows_network.exe
+ cuda_installer.exe -s nvcc_12.4 cudart_12.4 cuda_profiler_api_12.4
- - name: Checkout tensorrt
- uses: actions/checkout@v3
- with:
- repository: AmusementClub/cuda
- token: ${{ secrets.REPO_TOKEN }}
- ref: tensorrt-8.5.1
- path: tensorrt
+ - name: Download TensorRT
+ run: |
+ curl -L -o trt.zip https://developer.download.nvidia.com/compute/machine-learning/tensorrt/10.0.1/zip/TensorRT-10.0.1.6.Windows10.win10.cuda-12.4.zip
+ unzip trt.zip
+ mv TensorRT-*/ tensorrt/
- name: Download VapourSynth headers
run: |
@@ -72,10 +71,11 @@ jobs:
run: cmake -S . -B build -G Ninja -LA
-D CMAKE_BUILD_TYPE=Release
-D CMAKE_MSVC_RUNTIME_LIBRARY=MultiThreaded
- -D CUDAToolkit_ROOT="C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.8"
+ -D CUDAToolkit_ROOT="C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4"
-D VAPOURSYNTH_INCLUDE_DIRECTORY="%cd%\vapoursynth\include"
- -D TENSORRT_HOME="%cd%\..\tensorrt\tensorrt"
+ -D TENSORRT_HOME="%cd%\tensorrt"
-D USE_NVINFER_PLUGIN=ON
+ -D TENSORRT_LIBRARY_SUFFIX="_10"
- name: Build
run: cmake --build build --config Release --verbose
@@ -84,15 +84,17 @@ jobs:
run: cmake --install build --prefix install
- name: Checkout TensorRT OSS
- uses: actions/checkout@v3
+ uses: actions/checkout@v4
with:
repository: NVIDIA/TensorRT
- ref: 8.5.1
+ ref: release/10.0
fetch-depth: 1
path: tensorrt-oss
- name: Override trtexec CMake file
run: |
+ cp -f -r -v tensorrt/samples ../tensorrt-oss
+
mv trtexec/CMakeLists.txt ../tensorrt-oss/samples/trtexec
mv trtexec/*.cpp ../tensorrt-oss/samples/trtexec
mv trtexec/*.manifest ../tensorrt-oss/samples/trtexec
@@ -101,8 +103,9 @@ jobs:
run: cmake -S ../tensorrt-oss/samples/trtexec -B build_trtexec -G Ninja
-D CMAKE_BUILD_TYPE=Release
-D CMAKE_MSVC_RUNTIME_LIBRARY=MultiThreaded
- -D CUDAToolkit_ROOT="C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.8"
- -D TENSORRT_HOME="%cd%\..\tensorrt\tensorrt"
+ -D CUDAToolkit_ROOT="C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4"
+ -D TENSORRT_HOME="%cd%\tensorrt"
+ -D TENSORRT_LIBRARY_SUFFIX="_10"
- name: Build trtexec
run: cmake --build build_trtexec --verbose
@@ -121,7 +124,7 @@ jobs:
run: git describe --tags --long
- name: Upload
- uses: actions/upload-artifact@v3
+ uses: actions/upload-artifact@v4
with:
name: VSTRT-Windows-x64
path: vstrt/artifact
diff --git a/common/convert_float_to_float16.cpp b/common/convert_float_to_float16.cpp
index 0cd0294..8a05651 100644
--- a/common/convert_float_to_float16.cpp
+++ b/common/convert_float_to_float16.cpp
@@ -6,7 +6,6 @@
#include
#include
#include
-#include
#include
#include
#include
@@ -19,11 +18,7 @@
#include
-
-void convert_float_to_float16(
- ONNX_NAMESPACE::ModelProto & model,
- bool force_fp16_initializers
-) noexcept;
+#include "convert_float_to_float16.h"
namespace {
@@ -298,12 +293,14 @@ static ONNX_NAMESPACE::ValueInfoProto make_value_info_from_tensor(
void convert_float_to_float16(
ONNX_NAMESPACE::ModelProto & model,
- bool force_fp16_initializers
- // , bool keep_io_types = True
- // , bool disable_shape_infer = True
- // , const std::optional<std::unordered_set<std::string>> op_block_list = DEFAULT_OP_BLOCK_LIST
- // , const std::optional<std::unordered_set<std::string>> op_block_list = {}
- , const std::unordered_set<std::string> & op_block_list
+ bool force_fp16_initializers,
+ // bool keep_io_types = True,
+ // bool disable_shape_infer = True,
+ // const std::optional<std::unordered_set<std::string>> op_block_list = DEFAULT_OP_BLOCK_LIST,
+ // const std::optional<std::unordered_set<std::string>> op_block_list = {},
+ const std::unordered_set<std::string> & op_block_list,
+ bool cast_input,
+ bool cast_output
) noexcept {
std::vector<ONNX_NAMESPACE::ValueInfoProto> value_info_list {};
@@ -312,97 +309,101 @@ void convert_float_to_float16(
std::unordered_map<std::string, std::string> name_mapping {};
std::unordered_set<std::string> graph_io_to_skip {};
- const std::vector<std::string> fp32_inputs = [&]() {
- std::vector<std::string> ret {};
+ if (cast_input) {
+ const std::vector<std::string> fp32_inputs = [&]() {
+ std::vector<std::string> ret {};
- for (const auto & n : model.graph().input()) {
- if (n.type().tensor_type().elem_type() == ONNX_NAMESPACE::TensorProto::FLOAT) {
- ret.emplace_back(n.name());
+ for (const auto & n : model.graph().input()) {
+ if (n.type().tensor_type().elem_type() == ONNX_NAMESPACE::TensorProto::FLOAT) {
+ ret.emplace_back(n.name());
+ }
}
- }
- return ret;
- }();
-
- for (const auto & n : model.graph().input()) {
- if (auto idx = std::find(std::cbegin(fp32_inputs), std::cend(fp32_inputs), n.name());
- idx != std::cend(fp32_inputs)
- ) {
- const auto i = idx - std::cbegin(fp32_inputs);
- std::string node_name = "graph_input_cast_" + std::to_string(i);
- name_mapping.emplace(n.name(), node_name);
- graph_io_to_skip.emplace(n.name());
-
- auto * new_value_info = model.mutable_graph()->mutable_value_info()->Add();
- new_value_info->CopyFrom(n);
- new_value_info->set_name(node_name);
- new_value_info->mutable_type()->mutable_tensor_type()->set_elem_type(
- ONNX_NAMESPACE::TensorProto::FLOAT16
- );
- // add Cast node (from tensor(float) to tensor(float16) after graph input
- for (auto & node : *model.mutable_graph()->mutable_node()) {
- for (auto & input : *node.mutable_input()) {
- if (input == n.name()) {
- input = node_name;
+ return ret;
+ }();
+
+ for (const auto & n : model.graph().input()) {
+ if (auto idx = std::find(std::cbegin(fp32_inputs), std::cend(fp32_inputs), n.name());
+ idx != std::cend(fp32_inputs)
+ ) {
+ const auto i = idx - std::cbegin(fp32_inputs);
+ std::string node_name = "graph_input_cast_" + std::to_string(i);
+ name_mapping.emplace(n.name(), node_name);
+ graph_io_to_skip.emplace(n.name());
+
+ auto * new_value_info = model.mutable_graph()->mutable_value_info()->Add();
+ new_value_info->CopyFrom(n);
+ new_value_info->set_name(node_name);
+ new_value_info->mutable_type()->mutable_tensor_type()->set_elem_type(
+ ONNX_NAMESPACE::TensorProto::FLOAT16
+ );
+ // add Cast node (from tensor(float) to tensor(float16) after graph input
+ for (auto & node : *model.mutable_graph()->mutable_node()) {
+ for (auto & input : *node.mutable_input()) {
+ if (input == n.name()) {
+ input = node_name;
+ }
}
}
+ auto new_node = make_node(
+ "Cast", {n.name()}, {node_name}, node_name,
+ "to", ONNX_NAMESPACE::TensorProto::FLOAT16
+ );
+ model.mutable_graph()->mutable_node()->Add();
+ for (int i = model.graph().node_size() - 2; i >= 0; --i) {
+ model.mutable_graph()->mutable_node()->SwapElements(i, i + 1);
+ }
+ *model.mutable_graph()->mutable_node(0) = std::move(new_node);
+ value_info_list.emplace_back(*new_value_info);
+ io_casts.emplace(std::move(node_name));
}
- auto new_node = make_node(
- "Cast", {n.name()}, {node_name}, node_name,
- "to", ONNX_NAMESPACE::TensorProto::FLOAT16
- );
- model.mutable_graph()->mutable_node()->Add();
- for (int i = model.graph().node_size() - 2; i >= 0; --i) {
- model.mutable_graph()->mutable_node()->SwapElements(i, i + 1);
- }
- *model.mutable_graph()->mutable_node(0) = std::move(new_node);
- value_info_list.emplace_back(*new_value_info);
- io_casts.emplace(std::move(node_name));
}
}
- const std::vector<std::string> fp32_outputs = [&]() {
- std::vector<std::string> ret {};
+ if (cast_output) {
+ const std::vector<std::string> fp32_outputs = [&]() {
+ std::vector<std::string> ret {};
- for (const auto & n : model.graph().output()) {
- if (n.type().tensor_type().elem_type() == ONNX_NAMESPACE::TensorProto::FLOAT) {
- ret.emplace_back(n.name());
+ for (const auto & n : model.graph().output()) {
+ if (n.type().tensor_type().elem_type() == ONNX_NAMESPACE::TensorProto::FLOAT) {
+ ret.emplace_back(n.name());
+ }
}
- }
- return ret;
- }();
-
- for (const auto & n : model.graph().output()) {
- if (auto idx = std::find(std::cbegin(fp32_outputs), std::cend(fp32_outputs), n.name());
- idx != std::cend(fp32_outputs)
- ) {
- const auto i = idx - std::cbegin(fp32_outputs);
- std::string node_name = "graph_output_cast_" + std::to_string(i);
- name_mapping.emplace(n.name(), node_name);
- graph_io_to_skip.emplace(n.name());
-
- auto * new_value_info = model.mutable_graph()->mutable_value_info()->Add();
- new_value_info->CopyFrom(n);
- new_value_info->set_name(node_name);
- new_value_info->mutable_type()->mutable_tensor_type()->set_elem_type(
- ONNX_NAMESPACE::TensorProto::FLOAT16
- );
- // add Cast node (from tensor(float16) to tensor(float) before graph output
- for (auto & node : *model.mutable_graph()->mutable_node()) {
- for (auto & output : *node.mutable_output()) {
- if (output == n.name()) {
- output = node_name;
+ return ret;
+ }();
+
+ for (const auto & n : model.graph().output()) {
+ if (auto idx = std::find(std::cbegin(fp32_outputs), std::cend(fp32_outputs), n.name());
+ idx != std::cend(fp32_outputs)
+ ) {
+ const auto i = idx - std::cbegin(fp32_outputs);
+ std::string node_name = "graph_output_cast_" + std::to_string(i);
+ name_mapping.emplace(n.name(), node_name);
+ graph_io_to_skip.emplace(n.name());
+
+ auto * new_value_info = model.mutable_graph()->mutable_value_info()->Add();
+ new_value_info->CopyFrom(n);
+ new_value_info->set_name(node_name);
+ new_value_info->mutable_type()->mutable_tensor_type()->set_elem_type(
+ ONNX_NAMESPACE::TensorProto::FLOAT16
+ );
+ // add Cast node (from tensor(float16) to tensor(float) before graph output
+ for (auto & node : *model.mutable_graph()->mutable_node()) {
+ for (auto & output : *node.mutable_output()) {
+ if (output == n.name()) {
+ output = node_name;
+ }
}
}
+ auto new_node = make_node(
+ "Cast", {node_name}, {n.name()}, node_name,
+ "to", ONNX_NAMESPACE::TensorProto::FLOAT
+ );
+ model.mutable_graph()->mutable_node()->Add(std::move(new_node));
+ value_info_list.emplace_back(*new_value_info);
+ io_casts.emplace(std::move(node_name));
}
- auto new_node = make_node(
- "Cast", {node_name}, {n.name()}, node_name,
- "to", ONNX_NAMESPACE::TensorProto::FLOAT
- );
- model.mutable_graph()->mutable_node()->Add(std::move(new_node));
- value_info_list.emplace_back(*new_value_info);
- io_casts.emplace(std::move(node_name));
}
}
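
The refactor above only wraps the existing graph-input and graph-output cast insertion in the new `cast_input` / `cast_output` guards. For orientation, the input branch corresponds roughly to the following Python-onnx sketch (a hypothetical helper, not part of the repository), which rewires fp32 graph inputs through a prepended fp16 Cast node:

import onnx
from onnx import TensorProto, helper

def cast_fp32_inputs_to_fp16(model: onnx.ModelProto) -> None:
    graph = model.graph
    for i, inp in enumerate(graph.input):
        if inp.type.tensor_type.elem_type != TensorProto.FLOAT:
            continue
        cast_name = f"graph_input_cast_{i}"
        # consumers of the fp32 graph input now read the casted fp16 tensor
        for node in graph.node:
            for j, name in enumerate(node.input):
                if name == inp.name:
                    node.input[j] = cast_name
        # record the fp16 tensor and prepend the Cast node to the graph
        graph.value_info.append(
            helper.make_tensor_value_info(cast_name, TensorProto.FLOAT16, None)
        )
        graph.node.insert(0, helper.make_node(
            "Cast", [inp.name], [cast_name], name=cast_name, to=TensorProto.FLOAT16
        ))
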
diff --git a/common/convert_float_to_float16.h b/common/convert_float_to_float16.h
new file mode 100644
index 0000000..1c41b4a
--- /dev/null
+++ b/common/convert_float_to_float16.h
@@ -0,0 +1,21 @@
+#ifndef CONVERT_FLOAT_TO_FLOAT16_H
+#define CONVERT_FLOAT_TO_FLOAT16_H
+
+#include <string>
+#include <unordered_set>
+
+#include <onnx/onnx_pb.h>
+
+void convert_float_to_float16(
+ ONNX_NAMESPACE::ModelProto & model,
+ bool force_fp16_initializers,
+ // bool keep_io_types = True,
+ // bool disable_shape_infer = True,
+ // const std::optional<std::unordered_set<std::string>> op_block_list = DEFAULT_OP_BLOCK_LIST,
+ // const std::optional<std::unordered_set<std::string>> op_block_list = {},
+ const std::unordered_set<std::string> & op_block_list,
+ bool cast_input = true,
+ bool cast_output = true
+) noexcept;
+
+#endif
diff --git a/common/onnx_utils.cpp b/common/onnx_utils.cpp
index 9c22898..5de7b67 100644
--- a/common/onnx_utils.cpp
+++ b/common/onnx_utils.cpp
@@ -8,6 +8,8 @@
#include
#include
+#include "onnx_utils.h"
+
using namespace std::string_literals;
diff --git a/common/onnx_utils.h b/common/onnx_utils.h
new file mode 100644
index 0000000..7041ab7
--- /dev/null
+++ b/common/onnx_utils.h
@@ -0,0 +1,18 @@
+#ifndef ONNX_UTILS_H
+#define ONNX_UTILS_H
+
+#include <cstdint>
+#include <string>
+#include <string_view>
+#include <variant>
+
+#include <onnx/onnx_pb.h>
+
+std::variant<std::string, ONNX_NAMESPACE::ModelProto> loadONNX(
+ const std::string_view & path,
+ int64_t tile_w,
+ int64_t tile_h,
+ bool path_is_serialization
+) noexcept;
+
+#endif
diff --git a/scripts/vsmlrt.py b/scripts/vsmlrt.py
index ecccb61..011c2b6 100644
--- a/scripts/vsmlrt.py
+++ b/scripts/vsmlrt.py
@@ -1,4 +1,4 @@
-__version__ = "3.15.55"
+__version__ = "3.20.11"
__all__ = [
"Backend", "BackendV2",
@@ -9,6 +9,8 @@
"CUGAN",
"RIFE", "RIFEModel", "RIFEMerge",
"SAFA", "SAFAModel", "SAFAAdaptiveMode",
+ "SCUNet", "SCUNetModel",
+ "SwinIR", "SwinIRModel",
"inference"
]
@@ -18,6 +20,8 @@
from fractions import Fraction
import math
import os
+import os.path
+import platform
import subprocess
import sys
import tempfile
@@ -75,6 +79,18 @@ class ORT_CUDA:
basic performance tuning:
set fp16 = True (on RTX GPUs)
+
+ Semantics of `fp16`:
+ Enabling `fp16` will use a built-in quantization that converts a fp32 onnx to a fp16 onnx.
+ If the input video is of half-precision floating-point format,
+ the generated fp16 onnx will use fp16 input.
+ The output format can be controlled by the `output_format` option (0 = fp32, 1 = fp16).
+
+ Disabling `fp16` will not use the built-in quantization.
+ However, if the onnx file itself uses fp16 for computation,
+ the actual computation will be done in fp16.
+ In this case, the input video format should match the input format of the onnx,
+ and the output format is inferred from the onnx.
"""
device_id: int = 0
@@ -84,6 +100,9 @@ class ORT_CUDA:
fp16: bool = False
use_cuda_graph: bool = False # preview, not supported by all models
fp16_blacklist_ops: typing.Optional[typing.Sequence[str]] = None
+ prefer_nhwc: bool = False
+ output_format: int = 0 # 0: fp32, 1: fp16
+ tf32: bool = False
# internal backend attributes
supports_onnx_serialization: bool = True
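
A short usage sketch of the fp16 semantics described in the docstring above, with a half-precision input clip and fp16 output requested via the new `output_format` option; the BlankClip stands in for a real source:

import vapoursynth as vs
from vapoursynth import core
import vsmlrt

src = core.std.BlankClip(format=vs.RGBH)  # half-precision input clip
flt = vsmlrt.Waifu2x(
    src, noise=-1, scale=2,
    backend=vsmlrt.Backend.ORT_CUDA(fp16=True, output_format=1),  # 1 = fp16 output
)
flt.set_output()
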
@@ -122,17 +141,17 @@ class TRT:
opt_shapes: typing.Optional[typing.Tuple[int, int]] = None
fp16: bool = False
device_id: int = 0
- workspace: typing.Optional[int] = 128
+ workspace: typing.Optional[int] = None
verbose: bool = False
use_cuda_graph: bool = False
num_streams: int = 1
use_cublas: bool = False # cuBLAS + cuBLASLt
static_shape: bool = True
- tf32: bool = True
+ tf32: bool = False
log: bool = True
# as of TensorRT 8.4, it can be turned off without performance penalty in most cases
- use_cudnn: bool = True
+ use_cudnn: bool = False # changed to False since vsmlrt.vpy 3.16
use_edge_mask_convolutions: bool = True
use_jit_convolutions: bool = True
heuristic: bool = False # only supported on Ampere+ with TensorRT 8.5+
@@ -141,8 +160,12 @@ class TRT:
faster_dynamic_shapes: bool = True
force_fp16: bool = False
builder_optimization_level: int = 3
+ max_aux_streams: typing.Optional[int] = None
+ short_path: typing.Optional[bool] = None # True on Windows by default, False otherwise
+ bf16: bool = False
custom_env: typing.Dict[str, str] = field(default_factory=lambda: {})
custom_args: typing.List[str] = field(default_factory=lambda: [])
+ engine_folder: typing.Optional[str] = None
# internal backend attributes
supports_onnx_serialization: bool = False
@@ -207,12 +230,21 @@ class MIGX:
fast_math: bool = True
exhaustive_tune: bool = False
+ short_path: typing.Optional[bool] = None # True on Windows by default, False otherwise
custom_env: typing.Dict[str, str] = field(default_factory=lambda: {})
custom_args: typing.List[str] = field(default_factory=lambda: [])
# internal backend attributes
supports_onnx_serialization: bool = False
+ @dataclass(frozen=False)
+ class OV_NPU:
+ """ backend for intel npus
+ """
+
+ # internal backend attributes
+ supports_onnx_serialization: bool = True
+
backendT = typing.Union[
Backend.OV_CPU,
@@ -222,7 +254,8 @@ class MIGX:
Backend.OV_GPU,
Backend.NCNN_VK,
Backend.ORT_DML,
- Backend.MIGX
+ Backend.MIGX,
+ Backend.OV_NPU,
]
@@ -271,7 +304,7 @@ def Waifu2x(
raise ValueError(f'{func_name}: "scale" must be 1, 2 or 4')
if not isinstance(model, int) or model not in Waifu2xModel.__members__.values():
- raise ValueError(f'{func_name}: "model" must be in [0, 9]')
+ raise ValueError(f'{func_name}: invalid "model"')
if model == 0 and noise == 0:
raise ValueError(
@@ -433,7 +466,7 @@ def DPIR(
raise ValueError(f"{func_name}: only constant format 16/32 bit float input supported")
if not isinstance(model, int) or model not in DPIRModel.__members__.values():
- raise ValueError(f'{func_name}: "model" must be 0, 1, 2 or 3')
+ raise ValueError(f'{func_name}: invalid "model"')
if model in [0, 2] and clip.format.color_family != vs.GRAY:
raise ValueError(f'{func_name}: "clip" must be of GRAY color family')
@@ -976,8 +1009,11 @@ def RIFEMerge(
"/Reciprocal:fp32,/Reciprocal_1:fp32,"
"/Mul:fp32,/Mul_1:fp32,/Mul_8:fp32,/Mul_10:fp32,"
"/Sub_5:fp32,/Sub_6:fp32,"
+ # generated by TensorRT's onnx parser
"ONNXTRT_Broadcast_236:fp32,ONNXTRT_Broadcast_238:fp32,"
- "ONNXTRT_Broadcast_273:fp32,ONNXTRT_Broadcast_275:fp32"
+ "ONNXTRT_Broadcast_273:fp32,ONNXTRT_Broadcast_275:fp32,"
+ # TensorRT 9.0 or later
+ "ONNXTRT_Broadcast_*:fp32"
)
])
@@ -1147,6 +1183,14 @@ def handler(n: int, f: vs.VideoFrame) -> vs.VideoNode:
else:
return res
else:
+ if not hasattr(core, 'akarin') or \
+ not hasattr(core.akarin, 'PropExpr') or \
+ not hasattr(core.akarin, 'PickFrames'):
+ raise RuntimeError(
+ 'fractional multi requires plugin akarin '
+ '(https://github.com/AkarinVS/vapoursynth-plugin/releases)'
+ ', version v0.96g or later.')
+
if clip.fps_num == 0 or clip.fps_den == 0:
src_fps = Fraction(1)
else:
@@ -1169,6 +1213,7 @@ def left_func(n: int) -> vs.VideoNode:
left_clip = core.std.FrameEval(temp, left_func)
def right_func(n: int) -> vs.VideoNode:
+ # no out of range access because of function filter_sc
return clip[dst_duration * n // src_duration + 1]
right_clip = core.std.FrameEval(temp, right_func)
@@ -1181,14 +1226,14 @@ def timepoint_func(n: int) -> vs.VideoNode:
return temp_gray.std.BlankClip(color=tp, keep=True)
tp_clip = core.std.FrameEval(temp_gray, timepoint_func)
- output = RIFEMerge(
+ output0 = RIFEMerge(
clipa=left_clip, clipb=right_clip, mask=tp_clip,
scale=scale, tiles=tiles, tilesize=tilesize, overlap=overlap,
model=model, backend=backend, ensemble=ensemble,
_implementation=_implementation
)
- left0 = bits_as(left_clip, output)
+ left0 = bits_as(left_clip, output0)
def filter_sc(n: int, f: vs.VideoFrame) -> vs.VideoNode:
current_time = dst_duration * n
@@ -1200,9 +1245,9 @@ def filter_sc(n: int, f: vs.VideoFrame) -> vs.VideoNode:
):
return left0
else:
- return output
+ return output0
- res = core.std.FrameEval(output, filter_sc, left0)
+ res = core.std.FrameEval(output0, filter_sc, left0)
else:
if not hasattr(core, 'akarin') or \
not hasattr(core.akarin, 'PropExpr') or \
@@ -1373,6 +1418,210 @@ def SAFA(
return clip
+@enum.unique
+class SCUNetModel(enum.IntEnum):
+ scunet_color_15 = 0
+ scunet_color_25 = 1
+ scunet_color_50 = 2
+ scunet_color_real_psnr = 3
+ scunet_color_real_gan = 4
+ scunet_gray_15 = 5
+ scunet_gray_25 = 6
+ scunet_gray_50 = 7
+
+
+def SCUNet(
+ clip: vs.VideoNode,
+ tiles: typing.Optional[typing.Union[int, typing.Tuple[int, int]]] = None,
+ tilesize: typing.Optional[typing.Union[int, typing.Tuple[int, int]]] = None,
+ overlap: typing.Optional[typing.Union[int, typing.Tuple[int, int]]] = None,
+ model: SCUNetModel = SCUNetModel.scunet_color_real_psnr,
+ backend: backendT = Backend.OV_CPU()
+) -> vs.VideoNode:
+ """ Practical Blind Denoising via Swin-Conv-UNet and Data Synthesis
+
+ Unlike vs-scunet v1.0.0, the default model is set to scunet_color_real_psnr due to the color shift.
+ """
+
+ func_name = "vsmlrt.SCUNet"
+
+ if not isinstance(clip, vs.VideoNode):
+ raise TypeError(f'{func_name}: "clip" must be a clip!')
+
+ if clip.format.sample_type != vs.FLOAT or clip.format.bits_per_sample not in [16, 32]:
+ raise ValueError(f"{func_name}: only constant format 16/32 bit float input supported")
+
+ if not isinstance(model, int) or model not in SCUNetModel.__members__.values():
+ raise ValueError(f'{func_name}: invalid "model"')
+
+ if model in range(5) and clip.format.color_family != vs.RGB:
+ raise ValueError(f'{func_name}: "clip" must be of RGB color family')
+ elif model in range(5, 8) and clip.format.color_family != vs.GRAY:
+ raise ValueError(f'{func_name}: "clip" must be of GRAY color family')
+
+ if overlap is None:
+ overlap_w = overlap_h = 16
+ elif isinstance(overlap, int):
+ overlap_w = overlap_h = overlap
+ else:
+ overlap_w, overlap_h = overlap
+
+ multiple = 1
+
+ (tile_w, tile_h), (overlap_w, overlap_h) = calc_tilesize(
+ tiles=tiles, tilesize=tilesize,
+ width=clip.width, height=clip.height,
+ multiple=multiple,
+ overlap_w=overlap_w, overlap_h=overlap_h
+ )
+
+ if tile_w % multiple != 0 or tile_h % multiple != 0:
+ raise ValueError(
+ f'{func_name}: tile size must be divisible by {multiple} ({tile_w}, {tile_h})'
+ )
+
+ backend = init_backend(
+ backend=backend,
+ trt_opt_shapes=(tile_w, tile_h)
+ )
+
+ network_path = os.path.join(
+ models_path,
+ "scunet",
+ f"{tuple(SCUNetModel.__members__)[model]}.onnx"
+ )
+
+ clip = inference_with_fallback(
+ clips=[clip], network_path=network_path,
+ overlap=(overlap_w, overlap_h), tilesize=(tile_w, tile_h),
+ backend=backend
+ )
+
+ return clip
+
+
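
A minimal usage sketch for the wrapper defined above; `src` must be a float RGB clip for the color models (GRAY for the scunet_gray_* ones), and the backend choice is illustrative:

import vapoursynth as vs
from vapoursynth import core
import vsmlrt

src = core.std.BlankClip(format=vs.RGBS)  # placeholder for a real source
flt = vsmlrt.SCUNet(
    src,
    model=vsmlrt.SCUNetModel.scunet_color_real_psnr,
    backend=vsmlrt.Backend.OV_CPU(),
)
flt.set_output()
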
+@enum.unique
+class SwinIRModel(enum.IntEnum):
+ lightweightSR_DIV2K_s64w8_SwinIR_S_x2 = 0
+ lightweightSR_DIV2K_s64w8_SwinIR_S_x3 = 1
+ lightweightSR_DIV2K_s64w8_SwinIR_S_x4 = 2
+ realSR_BSRGAN_DFOWMFC_s64w8_SwinIR_L_x4_GAN = 3
+ # unused
+ realSR_BSRGAN_DFOWMFC_s64w8_SwinIR_L_x4_PSNR = 5
+ classicalSR_DF2K_s64w8_SwinIR_M_x2 = 6
+ classicalSR_DF2K_s64w8_SwinIR_M_x3 = 7
+ classicalSR_DF2K_s64w8_SwinIR_M_x4 = 8
+ classicalSR_DF2K_s64w8_SwinIR_M_x8 = 9
+ realSR_BSRGAN_DFO_s64w8_SwinIR_M_x2_GAN = 10
+ realSR_BSRGAN_DFO_s64w8_SwinIR_M_x2_PSNR = 11
+ realSR_BSRGAN_DFO_s64w8_SwinIR_M_x4_GAN = 12
+ realSR_BSRGAN_DFO_s64w8_SwinIR_M_x4_PSNR = 13
+ grayDN_DFWB_s128w8_SwinIR_M_noise15 = 14
+ grayDN_DFWB_s128w8_SwinIR_M_noise25 = 15
+ grayDN_DFWB_s128w8_SwinIR_M_noise50 = 16
+ colorDN_DFWB_s128w8_SwinIR_M_noise15 = 17
+ colorDN_DFWB_s128w8_SwinIR_M_noise25 = 18
+ colorDN_DFWB_s128w8_SwinIR_M_noise50 = 19
+ CAR_DFWB_s126w7_SwinIR_M_jpeg10 = 20
+ CAR_DFWB_s126w7_SwinIR_M_jpeg20 = 21
+ CAR_DFWB_s126w7_SwinIR_M_jpeg30 = 22
+ CAR_DFWB_s126w7_SwinIR_M_jpeg40 = 23
+ colorCAR_DFWB_s126w7_SwinIR_M_jpeg10 = 24
+ colorCAR_DFWB_s126w7_SwinIR_M_jpeg20 = 25
+ colorCAR_DFWB_s126w7_SwinIR_M_jpeg30 = 26
+ colorCAR_DFWB_s126w7_SwinIR_M_jpeg40 = 27
+
+
+def SwinIR(
+ clip: vs.VideoNode,
+ tiles: typing.Optional[typing.Union[int, typing.Tuple[int, int]]] = None,
+ tilesize: typing.Optional[typing.Union[int, typing.Tuple[int, int]]] = None,
+ overlap: typing.Optional[typing.Union[int, typing.Tuple[int, int]]] = None,
+ model: SwinIRModel = SwinIRModel.lightweightSR_DIV2K_s64w8_SwinIR_S_x2,
+ backend: backendT = Backend.OV_CPU()
+) -> vs.VideoNode:
+ """ SwinIR: Image Restoration Using Swin Transformer """
+
+ func_name = "vsmlrt.SwinIR"
+
+ if not isinstance(clip, vs.VideoNode):
+ raise TypeError(f'{func_name}: "clip" must be a clip!')
+
+ if clip.format.sample_type != vs.FLOAT or clip.format.bits_per_sample not in [16, 32]:
+ raise ValueError(f"{func_name}: only constant format 16/32 bit float input supported")
+
+ if not isinstance(model, int) or model not in SwinIRModel.__members__.values():
+ raise ValueError(f'{func_name}: invalid "model"')
+
+ if model in range(14, 17) or model in range(20, 24):
+ if clip.format.color_family != vs.GRAY:
+ raise ValueError(f'{func_name}: "clip" must be of GRAY color family')
+ elif clip.format.color_family != vs.RGB:
+ raise ValueError(f'{func_name}: "clip" must be of RGB color family')
+
+ if overlap is None:
+ overlap_w = overlap_h = 16
+ elif isinstance(overlap, int):
+ overlap_w = overlap_h = overlap
+ else:
+ overlap_w, overlap_h = overlap
+
+ multiple = 1
+
+ (tile_w, tile_h), (overlap_w, overlap_h) = calc_tilesize(
+ tiles=tiles, tilesize=tilesize,
+ width=clip.width, height=clip.height,
+ multiple=multiple,
+ overlap_w=overlap_w, overlap_h=overlap_h
+ )
+
+ if tile_w % multiple != 0 or tile_h % multiple != 0:
+ raise ValueError(
+ f'{func_name}: tile size must be divisible by {multiple} ({tile_w}, {tile_h})'
+ )
+
+ backend = init_backend(
+ backend=backend,
+ trt_opt_shapes=(tile_w, tile_h)
+ )
+
+ if model < 4:
+ model_name = tuple(SwinIRModel.__members__)[model]
+ else:
+ model_name = tuple(SwinIRModel.__members__)[model - 1]
+
+ model_name = model_name.replace("SwinIR_", "SwinIR-")
+
+ if model in range(3):
+ model_name = f"002_{model_name}"
+ elif model in (3, 5):
+ model_name = f"003_{model_name}"
+ elif model in range(6, 10):
+ model_name = f"001_{model_name}"
+ elif model in range(10, 14):
+ model_name = f"003_{model_name}"
+ elif model in range(14, 17):
+ model_name = f"004_{model_name}"
+ elif model in range(17, 20):
+ model_name = f"005_{model_name}"
+ elif model in range(20, 28):
+ model_name = f"006_{model_name}"
+
+ network_path = os.path.join(
+ models_path,
+ "swinir",
+ f"{model_name}.onnx"
+ )
+
+ clip = inference_with_fallback(
+ clips=[clip], network_path=network_path,
+ overlap=(overlap_w, overlap_h), tilesize=(tile_w, tile_h),
+ backend=backend
+ )
+
+ return clip
+
+
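
A matching usage sketch for the SwinIR wrapper above; the gray denoising/CAR models expect GRAY input, everything else RGB, and the numeric 001_–006_ prefixes are added internally when resolving the on-disk onnx file name. Backend choice is illustrative:

import vapoursynth as vs
from vapoursynth import core
import vsmlrt

src = core.std.BlankClip(format=vs.RGBS)  # placeholder for a real source
flt = vsmlrt.SwinIR(
    src,
    model=vsmlrt.SwinIRModel.realSR_BSRGAN_DFOWMFC_s64w8_SwinIR_L_x4_GAN,
    backend=vsmlrt.Backend.OV_CPU(),
)
flt.set_output()
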
def get_engine_path(
network_path: str,
min_shapes: typing.Tuple[int, int],
@@ -1386,7 +1635,12 @@ def get_engine_path(
tf32: bool,
use_cudnn: bool,
input_format: int,
- output_format: int
+ output_format: int,
+ builder_optimization_level: int,
+ max_aux_streams: typing.Optional[int],
+ short_path: typing.Optional[bool],
+ bf16: bool,
+ engine_folder: typing.Optional[str]
) -> str:
with open(network_path, "rb") as file:
@@ -1401,30 +1655,42 @@ def get_engine_path(
device_name = f"device{device_id}"
if static_shape:
- shape_str = f".{opt_shapes[0]}x{opt_shapes[1]}"
+ shape_str = f"{opt_shapes[0]}x{opt_shapes[1]}"
else:
shape_str = (
- f".min{min_shapes[0]}x{min_shapes[1]}"
+ f"min{min_shapes[0]}x{min_shapes[1]}"
f"_opt{opt_shapes[0]}x{opt_shapes[1]}"
f"_max{max_shapes[0]}x{max_shapes[1]}"
)
- return (
- network_path +
+ identity = (
shape_str +
("_fp16" if fp16 else "") +
- ("_no-tf32" if not tf32 else "") +
+ ("_tf32" if tf32 else "") +
+ ("_bf16" if bf16 else "") +
(f"_workspace{workspace}" if workspace is not None else "") +
+ f"_opt{builder_optimization_level}" +
+ (f"_max-aux-streams{max_aux_streams}" if max_aux_streams is not None else "") +
f"_trt-{trt_version}" +
("_cublas" if use_cublas else "") +
("_cudnn" if use_cudnn else "") +
"_I-" + ("fp32" if input_format == 0 else "fp16") +
"_O-" + ("fp32" if output_format == 0 else "fp16") +
f"_{device_name}" +
- f"_{checksum:x}" +
- ".engine"
+ f"_{checksum:x}"
)
+ dirname, basename = os.path.split(network_path)
+
+ if engine_folder is not None:
+ os.makedirs(engine_folder, exist_ok=True)
+ dirname = engine_folder
+
+ if short_path or (short_path is None and platform.system() == "Windows"):
+ return os.path.join(dirname, f"{zlib.crc32((basename + identity).encode()):x}.engine")
+ else:
+ return f"{os.path.join(dirname, basename)}.{identity}.engine"
+
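
Engine files are now named by one of two schemes selected by `short_path`, which defaults to True on Windows to stay clear of path-length limits: either the full identity string is appended to the onnx path, or basename plus identity is hashed with crc32. A self-contained sketch of that final step; the identity string below is a made-up example, not the exact format produced by get_engine_path:

import os
import zlib

def engine_name(network_path, identity, short_path):
    dirname, basename = os.path.split(network_path)
    if short_path:
        return os.path.join(dirname, f"{zlib.crc32((basename + identity).encode()):x}.engine")
    return f"{os.path.join(dirname, basename)}.{identity}.engine"

print(engine_name(r"models\dpir\drunet_color.onnx", "1920x1080_fp16_trt-10.0.1", True))
print(engine_name(r"models\dpir\drunet_color.onnx", "1920x1080_fp16_trt-10.0.1", False))
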
def trtexec(
network_path: str,
@@ -1433,12 +1699,12 @@ def trtexec(
max_shapes: typing.Tuple[int, int],
fp16: bool,
device_id: int,
- workspace: typing.Optional[int] = 128,
+ workspace: typing.Optional[int] = None,
verbose: bool = False,
use_cuda_graph: bool = False,
use_cublas: bool = False,
static_shape: bool = True,
- tf32: bool = True,
+ tf32: bool = False,
log: bool = False,
use_cudnn: bool = True,
use_edge_mask_convolutions: bool = True,
@@ -1451,11 +1717,15 @@ def trtexec(
faster_dynamic_shapes: bool = True,
force_fp16: bool = False,
builder_optimization_level: int = 3,
+ max_aux_streams: typing.Optional[int] = None,
+ short_path: typing.Optional[bool] = None,
+ bf16: bool = False,
custom_env: typing.Dict[str, str] = {},
- custom_args: typing.List[str] = []
+ custom_args: typing.List[str] = [],
+ engine_folder: typing.Optional[str] = None
) -> str:
- # tensort runtime version, e.g. 8401 => 8.4.1
+ # tensorrt runtime version
trt_version = parse_trt_version(int(core.trt.Version()["tensorrt_version"]))
if isinstance(opt_shapes, int):
@@ -1467,6 +1737,7 @@ def trtexec(
if force_fp16:
fp16 = True
tf32 = False
+ bf16 = False
engine_path = get_engine_path(
network_path=network_path,
@@ -1481,19 +1752,26 @@ def trtexec(
tf32=tf32,
use_cudnn=use_cudnn,
input_format=input_format,
- output_format=output_format
+ output_format=output_format,
+ builder_optimization_level=builder_optimization_level,
+ max_aux_streams=max_aux_streams,
+ short_path=short_path,
+ bf16=bf16,
+ engine_folder=engine_folder,
)
if os.access(engine_path, mode=os.R_OK):
return engine_path
- alter_engine_path = os.path.join(
- tempfile.gettempdir(),
- os.path.splitdrive(engine_path)[1][1:]
- )
+ # do not consider alternative path when the engine_folder is given
+ if engine_folder is None:
+ alter_engine_path = os.path.join(
+ tempfile.gettempdir(),
+ os.path.splitdrive(engine_path)[1][1:]
+ )
- if os.access(alter_engine_path, mode=os.R_OK):
- return alter_engine_path
+ if os.access(alter_engine_path, mode=os.R_OK):
+ return alter_engine_path
try:
# test writability
@@ -1501,12 +1779,16 @@ def trtexec(
pass
os.remove(engine_path)
except PermissionError:
- print(f"{engine_path} not writable", file=sys.stderr)
- engine_path = alter_engine_path
- dirname = os.path.dirname(engine_path)
- if not os.path.exists(dirname):
- os.makedirs(dirname)
- print(f"change engine path to {engine_path}", file=sys.stderr)
+ if engine_folder is None:
+ print(f"{engine_path} is not writable", file=sys.stderr)
+ engine_path = alter_engine_path
+ dirname = os.path.dirname(engine_path)
+ if not os.path.exists(dirname):
+ os.makedirs(dirname)
+ print(f"change engine path to {engine_path}", file=sys.stderr)
+ else:
+ # do not consider alternative path when the engine_folder is given
+ raise PermissionError(f"{engine_path} is not writable")
args = [
trtexec_path,
@@ -1537,17 +1819,38 @@ def trtexec(
if verbose:
args.append("--verbose")
- disabled_tactic_sources = []
- if not use_cublas:
- disabled_tactic_sources.extend(["-CUBLAS", "-CUBLAS_LT"])
- if not use_cudnn:
- disabled_tactic_sources.append("-CUDNN")
- if not use_edge_mask_convolutions and trt_version >= (8, 4, 1):
- disabled_tactic_sources.append("-EDGE_MASK_CONVOLUTIONS")
- if not use_jit_convolutions and trt_version >= (8, 5, 0):
- disabled_tactic_sources.append("-JIT_CONVOLUTIONS")
- if disabled_tactic_sources:
- args.append(f"--tacticSources={','.join(disabled_tactic_sources)}")
+ preview_features = []
+ if (use_cublas or use_cudnn) and (8, 6, 0) <= trt_version < (10, 0, 0):
+ preview_features.append("-disableExternalTacticSourcesForCore0805")
+
+ if preview_features and trt_version >= (8, 5, 0):
+ args.append(f"--preview={','.join(preview_features)}")
+
+ tactic_sources = []
+
+ if use_cublas:
+ tactic_sources.extend(["+CUBLAS", "+CUBLAS_LT"])
+ else:
+ tactic_sources.extend(["-CUBLAS", "-CUBLAS_LT"])
+
+ if use_cudnn:
+ tactic_sources.append("+CUDNN")
+ else:
+ tactic_sources.append("-CUDNN")
+
+ if trt_version >= (8, 4, 1):
+ if use_edge_mask_convolutions:
+ tactic_sources.append("+EDGE_MASK_CONVOLUTIONS")
+ else:
+ tactic_sources.append("-EDGE_MASK_CONVOLUTIONS")
+
+ if trt_version >= (8, 5, 0):
+ if use_jit_convolutions:
+ tactic_sources.append("+JIT_CONVOLUTIONS")
+ else:
+ tactic_sources.append("-JIT_CONVOLUTIONS")
+
+ args.append(f"--tacticSources={','.join(tactic_sources)}")
if use_cuda_graph:
args.extend((
@@ -1555,7 +1858,7 @@ def trtexec(
"--noDataTransfers"
))
else:
- if trt_version >= 8600:
+ if trt_version >= (8, 6, 0):
args.append("--skipInference")
else:
args.append("--buildOnly")
@@ -1590,6 +1893,13 @@ def trtexec(
if trt_version >= (8, 6, 0):
args.append(f"--builderOptimizationLevel={builder_optimization_level}")
+ if max_aux_streams is not None:
+ args.append(f"--maxAuxStreams={max_aux_streams}")
+
+ if trt_version >= (9, 0, 0):
+ if bf16:
+ args.append("--bf16")
+
args.extend(custom_args)
if log:
@@ -1598,7 +1908,7 @@ def trtexec(
if prev_env_value is not None and len(prev_env_value) > 0:
# env_key has been set, no extra action
- env = {env_key: prev_env_value}
+ env = {env_key: prev_env_value, "CUDA_MODULE_LOADING": "LAZY"}
env.update(**custom_env)
subprocess.run(args, env=env, check=True, stdout=sys.stderr)
else:
@@ -1609,7 +1919,7 @@ def trtexec(
f"trtexec_{time_str}.log"
)
- env = {env_key: log_filename}
+ env = {env_key: log_filename, "CUDA_MODULE_LOADING": "LAZY"}
env.update(**custom_env)
completed_process = subprocess.run(args, env=env, check=False, stdout=sys.stderr)
@@ -1639,7 +1949,8 @@ def get_mxr_path(
fp16: bool,
fast_math: bool,
exhaustive_tune: bool,
- device_id: int
+ device_id: int,
+ short_path: typing.Optional[bool]
) -> str:
with open(network_path, "rb") as file:
@@ -1665,7 +1976,11 @@ def get_mxr_path(
f"_{checksum:x}"
)
- return f"{network_path}.{identity}.mxr"
+ if short_path or (short_path is None and platform.system() == "Windows"):
+ dirname, basename = os.path.split(network_path)
+ return os.path.join(dirname, f"{zlib.crc32((basename + identity).encode()):x}.mxr")
+ else:
+ return f"{network_path}.{identity}.mxr"
def migraphx_driver(
@@ -1677,6 +1992,7 @@ def migraphx_driver(
exhaustive_tune: bool,
device_id: int,
input_name: str = "input",
+ short_path: typing.Optional[bool] = None,
custom_env: typing.Dict[str, str] = {},
custom_args: typing.List[str] = []
) -> str:
@@ -1690,7 +2006,8 @@ def migraphx_driver(
fp16=fp16,
fast_math=fast_math,
exhaustive_tune=exhaustive_tune,
- device_id=device_id
+ device_id=device_id,
+ short_path=short_path
)
if os.access(mxr_path, mode=os.R_OK):
@@ -1805,6 +2122,8 @@ def init_backend(
backend = Backend.ORT_DML()
elif backend is Backend.MIGX: # type: ignore
backend = Backend.MIGX()
+ elif backend is Backend.OV_NPU:
+ backend = Backend.OV_NPU()
backend = copy.deepcopy(backend)
@@ -1866,6 +2185,19 @@ def _inference(
fp16_blacklist_ops=backend.fp16_blacklist_ops
)
elif isinstance(backend, Backend.ORT_CUDA):
+ kwargs = dict()
+
+ version_list = core.ort.Version().get("onnxruntime_version", b"0.0.0").split(b'.')
+ if len(version_list) != 3:
+ version = (0, 0, 0)
+ else:
+ version = tuple(map(int, version_list))
+
+ if version >= (1, 18, 0):
+ kwargs["prefer_nhwc"] = backend.prefer_nhwc
+ kwargs["output_format"] = backend.output_format
+ kwargs["tf32"] = backend.tf32
+
clip = core.ort.Model(
clips, network_path,
overlap=overlap, tilesize=tilesize,
@@ -1877,34 +2209,66 @@ def _inference(
fp16=backend.fp16,
path_is_serialization=path_is_serialization,
use_cuda_graph=backend.use_cuda_graph,
- fp16_blacklist_ops=backend.fp16_blacklist_ops
+ fp16_blacklist_ops=backend.fp16_blacklist_ops,
+ **kwargs
)
elif isinstance(backend, Backend.OV_CPU):
- config = lambda: dict(
- CPU_THROUGHPUT_STREAMS=backend.num_streams,
- CPU_BIND_THREAD="YES" if backend.bind_thread else "NO",
- CPU_THREADS_NUM=backend.num_threads,
- ENFORCE_BF16="YES" if backend.bf16 else "NO"
- )
+ version = tuple(map(int, core.ov.Version().get("openvino_version", b"0.0.0").split(b'-')[0].split(b'.')))
+
+ if version >= (2024, 0, 0):
+ config_dict = dict(
+ NUM_STREAMS=backend.num_streams,
+ INFERENCE_NUM_THREADS=backend.num_threads,
+ ENABLE_CPU_PINNING="YES" if backend.bind_thread else "NO"
+ )
+ if backend.fp16:
+ config_dict["INFERENCE_PRECISION_HINT"] = "f16"
+ elif backend.bf16:
+ config_dict["INFERENCE_PRECISION_HINT"] = "bf16"
+ else:
+ config_dict["INFERENCE_PRECISION_HINT"] = "f32"
+
+ config = lambda: config_dict
+ else:
+ config = lambda: dict(
+ CPU_THROUGHPUT_STREAMS=backend.num_streams,
+ CPU_BIND_THREAD="YES" if backend.bind_thread else "NO",
+ CPU_THREADS_NUM=backend.num_threads,
+ ENFORCE_BF16="YES" if backend.bf16 else "NO"
+ )
clip = core.ov.Model(
clips, network_path,
overlap=overlap, tilesize=tilesize,
device="CPU", builtin=False,
- fp16=backend.fp16,
+ fp16=False, # use ov's internal quantization
config=config,
path_is_serialization=path_is_serialization,
- fp16_blacklist_ops=backend.fp16_blacklist_ops
+ fp16_blacklist_ops=backend.fp16_blacklist_ops # disabled since fp16 = False
)
elif isinstance(backend, Backend.OV_GPU):
- config = lambda: dict(
- GPU_THROUGHPUT_STREAMS=backend.num_streams
- )
+ version = tuple(map(int, core.ov.Version().get("openvino_version", b"0.0.0").split(b'-')[0].split(b'.')))
+
+ if version >= (2024, 0, 0):
+ config_dict = dict(
+ NUM_STREAMS=backend.num_streams,
+ )
+ if backend.fp16:
+ config_dict["INFERENCE_PRECISION_HINT"] = "f16"
+ else:
+ config_dict["INFERENCE_PRECISION_HINT"] = "f32"
+
+ config = lambda: config_dict
+ else:
+ config = lambda: dict(
+ GPU_THROUGHPUT_STREAMS=backend.num_streams
+ )
+
clip = core.ov.Model(
clips, network_path,
overlap=overlap, tilesize=tilesize,
device=f"GPU.{backend.device_id}", builtin=False,
- fp16=backend.fp16,
+ fp16=False, # use ov's internal quantization
config=config,
path_is_serialization=path_is_serialization,
fp16_blacklist_ops=backend.fp16_blacklist_ops
@@ -1945,8 +2309,12 @@ def _inference(
faster_dynamic_shapes=backend.faster_dynamic_shapes,
force_fp16=backend.force_fp16,
builder_optimization_level=backend.builder_optimization_level,
+ max_aux_streams=backend.max_aux_streams,
+ short_path=backend.short_path,
+ bf16=backend.bf16,
custom_env=backend.custom_env,
- custom_args=backend.custom_args
+ custom_args=backend.custom_args,
+ engine_folder=backend.engine_folder,
)
clip = core.trt.Model(
clips, engine_path,
@@ -1986,6 +2354,7 @@ def _inference(
exhaustive_tune=backend.exhaustive_tune,
device_id=backend.device_id,
input_name=input_name,
+ short_path=backend.short_path,
custom_env=backend.custom_env,
custom_args=backend.custom_args
)
@@ -1995,6 +2364,14 @@ def _inference(
tilesize=tilesize,
device_id=backend.device_id
)
+ elif isinstance(backend, Backend.OV_NPU):
+ clip = core.ov.Model(
+ clips, network_path,
+ overlap=overlap, tilesize=tilesize,
+ device="NPU", builtin=False,
+ fp16=False, # use ov's internal quantization
+ path_is_serialization=path_is_serialization,
+ )
else:
raise TypeError(f'unknown backend {backend}')
@@ -2101,9 +2478,9 @@ class BackendV2:
def TRT(*,
num_streams: int = 1,
fp16: bool = False,
- tf32: bool = True,
+ tf32: bool = False,
output_format: int = 0, # 0: fp32, 1: fp16
- workspace: typing.Optional[int] = 128,
+ workspace: typing.Optional[int] = None,
use_cuda_graph: bool = False,
static_shape: bool = True,
min_shapes: typing.Tuple[int, int] = (0, 0),
@@ -2111,7 +2488,7 @@ def TRT(*,
max_shapes: typing.Optional[typing.Tuple[int, int]] = None,
force_fp16: bool = False,
use_cublas: bool = False,
- use_cudnn: bool = True,
+ use_cudnn: bool = False,
device_id: int = 0,
**kwargs
) -> Backend.TRT:
@@ -2224,18 +2601,25 @@ def MIGX(*,
**kwargs
)
+ @staticmethod
+ def OV_NPU(**kwargs
+ ) -> Backend.OV_NPU:
+ return Backend.OV_NPU(
+ **kwargs
+ )
+
def fmtc_resample(clip: vs.VideoNode, **kwargs) -> vs.VideoNode:
clip_org = clip
if clip.format.sample_type == vs.FLOAT and clip.format.bits_per_sample != 32:
format = clip.format.replace(core=core, bits_per_sample=32)
- clip = core.resize.Point(clip, format=format)
+ clip = core.resize.Point(clip, format=format.id)
clip = core.fmtc.resample(clip, **kwargs)
if clip.format.bits_per_sample != clip_org.format.bits_per_sample:
- clip = core.resize.Point(clip, format=clip_org.format)
+ clip = core.resize.Point(clip, format=clip_org.format.id)
return clip
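
Note on the engine cache naming introduced above: when short_path is enabled (the default on Windows, to keep paths within MAX_PATH), the engine file name collapses to the CRC32 of the original basename plus the option-identity string; otherwise the full identity is appended, and engine_folder redirects the cache directory. A minimal standalone sketch of that scheme follows; the identity argument and folder handling are simplified assumptions, not the exact vsmlrt.py helper.

    import os
    import platform
    import zlib

    def engine_cache_path(network_path: str, identity: str,
                          short_path: bool = None,
                          engine_folder: str = None) -> str:
        dirname, basename = os.path.split(network_path)
        if engine_folder is not None:
            os.makedirs(engine_folder, exist_ok=True)
            dirname = engine_folder
        # Windows defaults to the short form to keep paths under MAX_PATH
        if short_path or (short_path is None and platform.system() == "Windows"):
            return os.path.join(dirname, f"{zlib.crc32((basename + identity).encode()):x}.engine")
        return f"{os.path.join(dirname, basename)}.{identity}.engine"

    # e.g. engine_cache_path("/models/rife.onnx", "opt1920x1080_fp16")
    #   -> "/models/rife.onnx.opt1920x1080_fp16.engine" on Linux
    #   -> "/models/<crc32 hex>.engine" on Windows
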
diff --git a/vsncnn/vs_ncnn.cpp b/vsncnn/vs_ncnn.cpp
index 8b06821..cabe182 100644
--- a/vsncnn/vs_ncnn.cpp
+++ b/vsncnn/vs_ncnn.cpp
@@ -22,17 +22,12 @@
#include
#include
-#include "config.h" // generated by cmake
#include
-#include "onnx2ncnn.hpp"
+#include "../common/onnx_utils.h"
+#include "onnx2ncnn.hpp"
-extern std::variant loadONNX(
- const std::string_view & path,
- int64_t tile_w,
- int64_t tile_h,
- bool path_is_serialization
-) noexcept;
+#include "config.h" // generated by cmake
static const VSPlugin * myself = nullptr;
diff --git a/vsort/CMakeLists.txt b/vsort/CMakeLists.txt
index 85dcae3..c85ddc1 100644
--- a/vsort/CMakeLists.txt
+++ b/vsort/CMakeLists.txt
@@ -39,7 +39,14 @@ if (CMAKE_CXX_STANDARD GREATER_EQUAL 20)
set_target_properties(vsort PROPERTIES CXX_STANDARD 20)
endif()
-target_link_libraries(vsort PRIVATE onnx onnxruntime)
+# https://github.com/onnx/onnx/commit/21bff4e55dcefecc069c679115baae6b00caa0d5
+if (ONNX_VERSION VERSION_LESS 1.16.0)
+ target_link_libraries(vsort PRIVATE onnx)
+else()
+ target_link_libraries(vsort PRIVATE ONNX::onnx)
+endif()
+
+target_link_libraries(vsort PRIVATE onnxruntime)
if (ENABLE_CUDA)
find_package(CUDAToolkit REQUIRED)
diff --git a/vsort/vs_onnxruntime.cpp b/vsort/vs_onnxruntime.cpp
index f543afd..290c1d2 100644
--- a/vsort/vs_onnxruntime.cpp
+++ b/vsort/vs_onnxruntime.cpp
@@ -1,11 +1,9 @@
#include
#include
#include
-#include
#include
#include
#include
-#include
#include
#include
#include
@@ -27,31 +25,20 @@ using namespace std::chrono_literals;
#define NOMINMAX
#include
+#include
#ifdef ENABLE_CUDA
#include
#endif // ENABLE_CUDA
#ifdef ENABLE_DML
-// include/onnxruntime/core/providers/dml/dml_provider_factory.h
-#include <../providers/dml/dml_provider_factory.h>
+#include
#endif // ENABLE_DML
-#include "config.h"
-
+#include "../common/convert_float_to_float16.h"
+#include "../common/onnx_utils.h"
-extern std::variant loadONNX(
- const std::string_view & path,
- int64_t tile_w,
- int64_t tile_h,
- bool path_is_serialization
-) noexcept;
-
-extern void convert_float_to_float16(
- ONNX_NAMESPACE::ModelProto & model,
- bool force_fp16_initializers,
- const std::unordered_set & op_block_list
-) noexcept;
+#include "config.h"
#ifdef ENABLE_COREML
@@ -87,6 +74,7 @@ static std::mutex capture_lock;
// rename GridSample to com.microsoft::GridSample
// onnxruntime has support for CUDA-accelerated GridSample only in its own opset domain
static void rename(ONNX_NAMESPACE::ModelProto & model) {
+#if ORT_API_VERSION < 18
constexpr auto ms_domain = "com.microsoft";
bool has_ms_opset = false;
@@ -109,6 +97,7 @@ static void rename(ONNX_NAMESPACE::ModelProto & model) {
*node.mutable_domain() = ms_domain;
}
}
+#endif // ORT_API_VERSION < 18
}
@@ -176,6 +165,19 @@ static std::variant> getShape(
return std::get>(maybe_shape);
}
+static size_t getNumBytes(int32_t type) {
+ using namespace ONNX_NAMESPACE;
+
+ switch (type) {
+ case TensorProto::FLOAT:
+ return 4;
+ case TensorProto::FLOAT16:
+ return 2;
+ default:
+ return 0;
+ }
+}
+
static int numPlanes(
const std::vector & vis
@@ -197,8 +199,12 @@ static std::optional checkNodes(
) noexcept {
for (const auto & vi : vis) {
- if (vi->format->sampleType != stFloat || vi->format->bitsPerSample != 32) {
- return "expects clip with type fp32";
+ if (vi->format->sampleType != stFloat) {
+ return "expects clip with floating-point type";
+ }
+
+ if (vi->format->bitsPerSample != 32 && vi->format->bitsPerSample != 16) {
+ return "expects clip with type fp32 or fp16";
}
if (vi->width != vis[0]->width || vi->height != vis[0]->height) {
@@ -234,8 +240,8 @@ static std::optional checkIOInfo(
ONNXTensorElementDataType element_type;
checkError(ortapi->GetTensorElementType(tensor_info, &element_type));
- if (element_type != ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT) {
- return set_error("expects network IO with type fp32");
+ if (element_type != ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT && element_type != ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16) {
+ return set_error("expects network IO with type fp32 or fp16");
}
size_t num_dims;
@@ -341,6 +347,23 @@ static std::optional checkNodesAndNetwork(
return set_error("tile size larger than clip dimension");
}
+ OrtTypeInfo * output_type_info;
+ checkError(ortapi->SessionGetOutputTypeInfo(session, 0, &output_type_info));
+
+ const OrtTensorTypeAndShapeInfo * output_tensor_info;
+ checkError(ortapi->CastTypeInfoToTensorInfo(output_type_info, &output_tensor_info));
+
+ auto network_out_dims = std::get>(getShape(output_tensor_info));
+
+ auto network_out_height = network_out_dims[2];
+ auto network_out_width = network_out_dims[3];
+
+ if (network_out_height % network_in_height != 0 || network_out_width % network_in_width != 0) {
+ return set_error("output dimensions must be divisible by input dimensions");
+ }
+
+ ortapi->ReleaseTypeInfo(output_type_info);
+
ortapi->ReleaseTypeInfo(input_type_info);
return {};
@@ -351,16 +374,17 @@ static void setDimensions(
const std::array & input_shape,
const std::array & output_shape,
VSCore * core,
- const VSAPI * vsapi
+ const VSAPI * vsapi,
+ int32_t onnx_output_type
) noexcept {
vi->height *= output_shape[2] / input_shape[2];
vi->width *= output_shape[3] / input_shape[3];
if (output_shape[1] == 1) {
- vi->format = vsapi->registerFormat(cmGray, stFloat, 32, 0, 0, core);
+ vi->format = vsapi->registerFormat(cmGray, stFloat, 8 * getNumBytes(onnx_output_type), 0, 0, core);
} else if (output_shape[1] == 3) {
- vi->format = vsapi->registerFormat(cmRGB, stFloat, 32, 0, 0, core);
+ vi->format = vsapi->registerFormat(cmRGB, stFloat, 8 * getNumBytes(onnx_output_type), 0, 0, core);
}
}
@@ -565,9 +589,23 @@ static const VSFrameRef *VS_CC vsOrtGetFrame(
return nullptr;
};
+ OrtRunOptions * run_options {};
+
#ifdef ENABLE_CUDA
if (d->backend == Backend::CUDA) {
checkCUDAError(cudaSetDevice(d->device_id));
+
+#if ORT_API_VERSION >= 16
+ checkError(ortapi->CreateRunOptions(&run_options));
+ if (run_options == nullptr) {
+ return set_error("create run_options failed");
+ }
+ checkError(ortapi->AddRunConfigEntry(
+ run_options,
+ kOrtRunOptionsConfigDisableSynchronizeExecutionProviders,
+ "1"
+ ));
+#endif // ORT_API_VERSION >= 16
}
#endif // ENABLE_CUDA
@@ -627,9 +665,9 @@ static const VSFrameRef *VS_CC vsOrtGetFrame(
resource.stream
));
- // OrtCUDAProviderOptionsV2 disallows using custom user stream
- // and the inference is executed on a private non-blocking stream
+#if ORT_API_VERSION < 16
checkCUDAError(cudaStreamSynchronize(resource.stream));
+#endif // ORT_API_VERSION < 16
}
#endif // ENABLE_CUDA
@@ -644,17 +682,17 @@ static const VSFrameRef *VS_CC vsOrtGetFrame(
// note that this applies only to stream capture from the ort library
// this fails when another plugin also uses global-mode stream capture
std::lock_guard _ { capture_lock };
- checkError(ortapi->RunWithBinding(resource.session, nullptr, resource.binding));
+ checkError(ortapi->RunWithBinding(resource.session, run_options, resource.binding));
// onnxruntime replays the graph itself in CUDAExecutionProvider::OnRunEnd
} else
#endif // ENABLE_CUDA
if (d->backend == Backend::CPU || d->backend == Backend::CUDA) {
- checkError(ortapi->RunWithBinding(resource.session, nullptr, resource.binding));
+ checkError(ortapi->RunWithBinding(resource.session, run_options, resource.binding));
} else {
checkError(ortapi->Run(
resource.session,
- nullptr,
+ run_options,
&resource.input_name,
&resource.input_tensor,
1,
@@ -735,6 +773,10 @@ static const VSFrameRef *VS_CC vsOrtGetFrame(
y = std::min(y + step_h, src_height - src_tile_h);
}
+ if (run_options) {
+ ortapi->ReleaseRunOptions(run_options);
+ }
+
d->release(ticket);
for (const auto & frame : src_frames) {
@@ -908,6 +950,18 @@ static void VS_CC vsOrtCreate(
if (error) {
cudnn_benchmark = true;
}
+
+#if ORT_API_VERSION >= 17
+ bool prefer_nhwc = !!(vsapi->propGetInt(in, "prefer_nhwc", 0, &error));
+ if (error) {
+ prefer_nhwc = false;
+ }
+#endif // ORT_API_VERSION >= 17
+
+ bool tf32 = !!(vsapi->propGetInt(in, "tf32", 0, &error));
+ if (error) {
+ tf32 = false;
+ }
#endif // ENABLE_CUDA
if (auto err = ortInit(); err.has_value()) {
@@ -929,6 +983,14 @@ static void VS_CC vsOrtCreate(
use_cuda_graph = false;
}
+ int output_format = int64ToIntS(vsapi->propGetInt(in, "output_format", 0, &error));
+ if (error) {
+ output_format = 0;
+ }
+ if (output_format != 0 && output_format != 1) {
+ return set_error("\"output_format\" must be 0 or 1");
+ }
+
std::string_view path_view;
std::string path;
if (path_is_serialization) {
@@ -976,11 +1038,26 @@ static void VS_CC vsOrtCreate(
fp16_blacklist_ops.emplace(vsapi->propGetData(in, "fp16_blacklist_ops", i, nullptr));
}
}
- convert_float_to_float16(onnx_model, false, fp16_blacklist_ops);
+ convert_float_to_float16(
+ onnx_model,
+ false,
+ fp16_blacklist_ops,
+ in_vis.front()->format->bytesPerSample == 4,
+ output_format == 0
+ );
}
rename(onnx_model);
+ auto onnx_input_type = onnx_model.graph().input()[0].type().tensor_type().elem_type();
+ auto onnx_output_type = onnx_model.graph().output()[0].type().tensor_type().elem_type();
+
+ if (onnx_input_type == ONNX_NAMESPACE::TensorProto::FLOAT && in_vis.front()->format->bitsPerSample != 32) {
+ return set_error("the onnx requires input to be of type fp32");
+ } else if (onnx_input_type == ONNX_NAMESPACE::TensorProto::FLOAT16 && in_vis.front()->format->bitsPerSample != 16) {
+ return set_error("the onnx requires input to be of type fp16");
+ }
+
std::string onnx_data = onnx_model.SerializeAsString();
if (std::size(onnx_data) == 0) {
return set_error("proto serialization failed");
@@ -1041,6 +1118,8 @@ static void VS_CC vsOrtCreate(
// TODO: other providers
#ifdef ENABLE_CUDA
if (d->backend == Backend::CUDA) {
+ checkCUDAError(cudaStreamCreateWithFlags(&resource.stream, cudaStreamNonBlocking));
+
OrtCUDAProviderOptionsV2 * cuda_options;
checkError(ortapi->CreateCUDAProviderOptions(&cuda_options));
#ifdef _MSC_VER
@@ -1061,7 +1140,11 @@ static void VS_CC vsOrtCreate(
"cudnn_conv_algo_search",
"cudnn_conv_use_max_workspace",
"arena_extend_strategy",
- "enable_cuda_graph"
+ "enable_cuda_graph",
+#if ORT_API_VERSION >= 17
+ "prefer_nhwc",
+ "use_tf32",
+#endif // ORT_API_VERSION >= 17
};
auto device_id_str = std::to_string(d->device_id);
const char * values [] {
@@ -1069,7 +1152,11 @@ static void VS_CC vsOrtCreate(
"EXHAUSTIVE",
"1",
"kSameAsRequested",
- "0"
+ "0",
+#if ORT_API_VERSION >= 17
+ "0",
+ "0",
+#endif // ORT_API_VERSION >= 17
};
if (!cudnn_benchmark) {
values[1] = "HEURISTIC";
@@ -1080,8 +1167,24 @@ static void VS_CC vsOrtCreate(
} else {
resource.require_replay = false;
}
+#if ORT_API_VERSION >= 17
+ if (prefer_nhwc) {
+ values[5] = "1";
+ }
+ if (tf32) {
+ values[6] = "1";
+ }
+#endif // ORT_API_VERSION >= 17
checkError(ortapi->UpdateCUDAProviderOptions(cuda_options, keys, values, std::size(keys)));
+#if ORT_API_VERSION >= 16
+ checkError(ortapi->UpdateCUDAProviderOptionsWithValue(
+ cuda_options,
+ "user_compute_stream",
+ resource.stream
+ ));
+#endif // ORT_API_VERSION >= 16
+
checkError(ortapi->SessionOptionsAppendExecutionProvider_CUDA_V2(session_options, cuda_options));
ortapi->ReleaseCUDAProviderOptions(cuda_options);
@@ -1122,14 +1225,12 @@ static void VS_CC vsOrtCreate(
#ifdef ENABLE_CUDA
if (d->backend == Backend::CUDA) {
- checkCUDAError(cudaStreamCreateWithFlags(&resource.stream, cudaStreamNonBlocking));
-
resource.input.size = (
input_shape[0] *
input_shape[1] *
input_shape[2] *
input_shape[3]
- ) * sizeof(float);
+ ) * getNumBytes(onnx_input_type);
checkCUDAError(cudaMallocHost(
&resource.input.h_data, resource.input.size,
@@ -1141,7 +1242,8 @@ static void VS_CC vsOrtCreate(
memory_info,
resource.input.d_data, resource.input.size,
std::data(input_shape), std::size(input_shape),
- ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT, &resource.input_tensor
+ static_cast(onnx_input_type),
+ &resource.input_tensor
));
} else
#endif // ENALBE_CUDA
@@ -1149,7 +1251,7 @@ static void VS_CC vsOrtCreate(
checkError(ortapi->CreateTensorAsOrtValue(
cpu_allocator,
std::data(input_shape), std::size(input_shape),
- ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT,
+ static_cast(onnx_input_type),
&resource.input_tensor
));
}
@@ -1165,7 +1267,7 @@ static void VS_CC vsOrtCreate(
output_shape[1] *
output_shape[2] *
output_shape[3]
- ) * sizeof(float);
+ ) * getNumBytes(onnx_output_type);
checkCUDAError(cudaMallocHost(&resource.output.h_data, resource.output.size));
checkCUDAError(cudaMalloc(&resource.output.d_data, resource.output.size));
@@ -1174,7 +1276,8 @@ static void VS_CC vsOrtCreate(
memory_info,
resource.output.d_data, resource.output.size,
std::data(output_shape), std::size(output_shape),
- ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT, &resource.output_tensor
+ static_cast(onnx_output_type),
+ &resource.output_tensor
));
} else
#endif // ENABLE_CUDA
@@ -1182,7 +1285,7 @@ static void VS_CC vsOrtCreate(
checkError(ortapi->CreateTensorAsOrtValue(
cpu_allocator,
std::data(output_shape), std::size(output_shape),
- ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT,
+ static_cast(onnx_output_type),
&resource.output_tensor
));
}
@@ -1204,7 +1307,7 @@ static void VS_CC vsOrtCreate(
}
if (i == 0) {
- setDimensions(d->out_vi, input_shape, output_shape, core, vsapi);
+ setDimensions(d->out_vi, input_shape, output_shape, core, vsapi, onnx_output_type);
}
d->resources.push_back(resource);
@@ -1249,6 +1352,9 @@ VS_EXTERNAL_API(void) VapourSynthPluginInit(
"path_is_serialization:int:opt;"
"use_cuda_graph:int:opt;"
"fp16_blacklist_ops:data[]:opt;"
+ "prefer_nhwc:int:opt;"
+ "output_format:int:opt;"
+ "tf32:int:opt;"
, vsOrtCreate,
nullptr,
plugin
@@ -1258,10 +1364,26 @@ VS_EXTERNAL_API(void) VapourSynthPluginInit(
vsapi->propSetData(out, "version", VERSION, -1, paReplace);
vsapi->propSetData(
- out, "onnxruntime_version",
+ out, "onnxruntime_api_version_build",
std::to_string(ORT_API_VERSION).c_str(), -1, paReplace
);
+ if (auto err = ortInit(); err.has_value()) {
+ vsapi->logMessage(mtWarning, err.value().c_str());
+ } else {
+ if (auto p = OrtGetApiBase(); p) {
+ vsapi->propSetData(
+ out, "onnxruntime_version",
+ p->GetVersionString(), -1, paReplace
+ );
+ }
+
+ vsapi->propSetData(
+ out, "onnxruntime_build_info",
+ ortapi->GetBuildInfoString(), -1, paReplace
+ );
+ }
+
#ifdef ENABLE_CUDA
vsapi->propSetData(
out, "cuda_runtime_version",
diff --git a/vsort/win32.cpp b/vsort/win32.cpp
index 8741167..cca5e18 100644
--- a/vsort/win32.cpp
+++ b/vsort/win32.cpp
@@ -22,7 +22,6 @@ static std::vector cudaDlls {
L"cudart64",
L"cublasLt64", L"cublas64",
L"cufft64",
- L"zlibwapi", // cuDNN version 8.3.0+ depends on zlib as a shared library dependency
L"cudnn_ops_infer64", L"cudnn_cnn_infer64", L"cudnn_adv_infer64", L"cudnn64",
L"cupti64",
};
diff --git a/vsov/CMakeLists.txt b/vsov/CMakeLists.txt
index f003806..e6f033c 100644
--- a/vsov/CMakeLists.txt
+++ b/vsov/CMakeLists.txt
@@ -7,7 +7,6 @@ set(ENABLE_VISUALIZATION OFF CACHE BOOL "Enable support for network visualizatio
set(WIN32_SHARED_OPENVINO OFF CACHE BOOL "Build for win32 with shared openvino library")
find_package(OpenVINO REQUIRED CONFIG)
-find_package(InferenceEngine REQUIRED CONFIG)
add_library(vsov SHARED
vs_openvino.cpp
@@ -24,20 +23,21 @@ if(WIN32_SHARED_OPENVINO)
target_compile_definitions(vsov PRIVATE WIN32_SHARED_OPENVINO)
endif()
-target_include_directories(vsov PRIVATE
- ${VAPOURSYNTH_INCLUDE_DIRECTORY}
- ${ONNX_INCLUDE_DIRS}
- ${InferenceEngine_INCLUDE_DIRS}
-)
-
if(WIN32_SHARED_OPENVINO)
find_package(protobuf REQUIRED CONFIG)
find_package(ONNX REQUIRED CONFIG)
- target_link_libraries(vsov PRIVATE ${InferenceEngine_LIBRARIES} onnx)
+ target_link_libraries(vsov PRIVATE onnx)
else()
- target_link_libraries(vsov PRIVATE ${InferenceEngine_LIBRARIES} openvino::onnx)
+ target_link_libraries(vsov PRIVATE openvino::frontend::onnx)
endif()
+target_include_directories(vsov PRIVATE
+ ${VAPOURSYNTH_INCLUDE_DIRECTORY}
+ ${ONNX_INCLUDE_DIRS}
+)
+
+target_link_libraries(vsov PRIVATE openvino::runtime)
+
set_target_properties(vsov PROPERTIES
CXX_EXTENSIONS OFF
CXX_STANDARD 17
diff --git a/vsov/vs_openvino.cpp b/vsov/vs_openvino.cpp
index f186be4..ce6dfe3 100644
--- a/vsov/vs_openvino.cpp
+++ b/vsov/vs_openvino.cpp
@@ -20,28 +20,17 @@
#include
#include
-#include
+#include
#include
#ifdef ENABLE_VISUALIZATION
#include
#endif // ENABLE_VISUALIZATION
-#include "config.h"
-
-
-extern std::variant loadONNX(
- const std::string_view & path,
- int64_t tile_w,
- int64_t tile_h,
- bool path_is_serialization
-) noexcept;
+#include "../common/convert_float_to_float16.h"
+#include "../common/onnx_utils.h"
-extern void convert_float_to_float16(
- ONNX_NAMESPACE::ModelProto & model,
- bool force_fp16_initializers,
- const std::unordered_set & op_block_list
-) noexcept;
+#include "config.h"
using namespace std::string_literals;
@@ -50,16 +39,16 @@ static const VSPlugin * myself = nullptr;
static std::array getShape(
- const InferenceEngine::ExecutableNetwork & network,
+ const ov::CompiledModel & network,
bool input
) {
- InferenceEngine::SizeVector dims;
+ ov::Shape dims;
if (input) {
- dims = network.GetInputsInfo().cbegin()->second->getTensorDesc().getDims();
+ dims = network.input().get_shape();
} else {
- dims = network.GetOutputsInfo().cbegin()->second->getTensorDesc().getDims();
+ dims = network.output().get_shape();
}
std::array ret;
@@ -112,21 +101,19 @@ static std::optional checkNodes(
}
-template
[[nodiscard]]
static std::optional checkIOInfo(
- const T & info,
+ const ov::Output & info,
bool is_output
) {
- if (info->getPrecision() != InferenceEngine::Precision::FP32) {
+ if (info.get_element_type() != ov::element::f32) {
return "expects network IO with type fp32";
}
- const auto & desc = info->getTensorDesc();
- if (desc.getLayout() != InferenceEngine::Layout::NCHW) {
- return "expects network IO with layout NCHW";
- }
- const auto & dims = desc.getDims();
+ // if (ov::layout::get_layout(info) != ov::Layout("NCHW")) {
+ // return "expects network IO with layout NCHW";
+ // }
+ const auto & dims = info.get_shape();
if (dims.size() != 4) {
return "expects network with 4-D IO";
}
@@ -148,27 +135,23 @@ static std::optional checkIOInfo(
[[nodiscard]]
static std::optional checkNetwork(
- const InferenceEngine::CNNNetwork & network
+ const std::shared_ptr & network
) {
- const auto & inputs_info = network.getInputsInfo();
-
- if (auto num_inputs = std::size(inputs_info); num_inputs != 1) {
+ if (auto num_inputs = std::size(network->inputs()); num_inputs != 1) {
return "network input count must be 1, got " + std::to_string(num_inputs);
}
- const auto & input_info = inputs_info.cbegin()->second;
+ const auto & input_info = network->input();
if (auto err = checkIOInfo(input_info, false); err.has_value()) {
return err.value();
}
- const auto & outputs_info = network.getOutputsInfo();
-
- if (auto num_outputs = std::size(outputs_info); num_outputs != 1) {
+ if (auto num_outputs = std::size(network->outputs()); num_outputs != 1) {
return "network output count must be 1, got " + std::to_string(num_outputs);
}
- const auto & output_info = outputs_info.cbegin()->second;
+ const auto & output_info = network->output();
if (auto err = checkIOInfo(output_info, true); err.has_value()) {
return err.value();
}
@@ -179,12 +162,12 @@ static std::optional checkNetwork(
[[nodiscard]]
static std::optional checkNodesAndNetwork(
- const InferenceEngine::ExecutableNetwork & network,
+ const ov::CompiledModel & network,
const std::vector & vis
) {
const auto & network_in_dims = (
- network.GetInputsInfo().cbegin()->second->getTensorDesc().getDims()
+ network.input().get_tensor().get_shape()
);
int network_in_channels = static_cast(network_in_dims[1]);
@@ -205,15 +188,16 @@ static std::optional checkNodesAndNetwork(
}
+
static void setDimensions(
std::unique_ptr & vi,
- const InferenceEngine::ExecutableNetwork & network,
+ const ov::CompiledModel & network,
VSCore * core,
const VSAPI * vsapi
) {
- auto in_dims = network.GetInputsInfo().cbegin()->second->getTensorDesc().getDims();
- auto out_dims = network.GetOutputsInfo().cbegin()->second->getTensorDesc().getDims();
+ const auto & in_dims = network.input().get_shape();
+ const auto & out_dims = network.output().get_shape();
vi->height *= out_dims[2] / in_dims[2];
vi->width *= out_dims[3] / in_dims[3];
@@ -226,13 +210,13 @@ static void setDimensions(
}
-static std::variant> getConfig(
+static std::variant getConfig(
VSFuncRef * config_func,
VSCore * core,
const VSAPI * vsapi
) {
- std::map config;
+ ov::AnyMap config;
if (config_func == nullptr) {
return config;
@@ -285,13 +269,10 @@ struct OVData {
int overlap_w, overlap_h;
- InferenceEngine::Core core;
- InferenceEngine::ExecutableNetwork executable_network;
- std::unordered_map infer_requests;
+ ov::Core core;
+ ov::CompiledModel executable_network;
+ std::unordered_map infer_requests;
std::shared_mutex infer_requests_lock;
-
- std::string input_name;
- std::string output_name;
};
@@ -396,7 +377,7 @@ static const VSFrameRef *VS_CC vsOvGetFrame(
auto thread_id = std::this_thread::get_id();
bool initialized = true;
- InferenceEngine::InferRequest * infer_request;
+ ov::InferRequest * infer_request;
d->infer_requests_lock.lock_shared();
try {
@@ -409,9 +390,9 @@ static const VSFrameRef *VS_CC vsOvGetFrame(
if (!initialized) {
std::lock_guard _ { d->infer_requests_lock };
try {
- d->infer_requests.emplace(thread_id, d->executable_network.CreateInferRequest());
- } catch (const InferenceEngine::Exception& e) {
- return set_error("[IE exception] Create inference request: "s + e.what());
+ d->infer_requests.emplace(thread_id, d->executable_network.create_infer_request());
+ } catch (const ov::Exception & e) {
+ return set_error("[OV exception] Create inference request: "s + e.what());
} catch (const std::exception& e) {
return set_error("[Standard exception] Create inference request: "s + e.what());
}
@@ -429,11 +410,7 @@ static const VSFrameRef *VS_CC vsOvGetFrame(
int x_crop_end = (x == src_width - src_tile_w) ? 0 : d->overlap_w;
{
- InferenceEngine::Blob::Ptr input = infer_request->GetBlob(d->input_name);
-
- auto minput = input->as();
- auto minputHolder = minput->wmap();
- uint8_t * input_buffer = minputHolder.as();
+ auto input_buffer = (uint8_t *) infer_request->get_input_tensor().data();
for (const auto & _src_ptr : src_ptrs) {
const uint8_t * src_ptr { _src_ptr +
@@ -451,19 +428,15 @@ static const VSFrameRef *VS_CC vsOvGetFrame(
}
try {
- infer_request->Infer();
- } catch (const InferenceEngine::Exception & e) {
- return set_error("[IE exception] Create inference request: "s + e.what());
+ infer_request->infer();
+ } catch (const ov::Exception & e) {
+ return set_error("[OV exception] Create inference request: "s + e.what());
} catch (const std::exception& e) {
return set_error("[Standard exception] Create inference request: "s + e.what());
}
{
- InferenceEngine::Blob::CPtr output = infer_request->GetBlob(d->output_name);
-
- auto moutput = output->as();
- auto moutputHolder = moutput->rmap();
- const uint8_t * output_buffer = moutputHolder.as();
+ auto output_buffer = (const uint8_t *) infer_request->get_output_tensor().data();
for (int plane = 0; plane < dst_planes; ++plane) {
uint8_t * dst_ptr = (dst_ptrs[plane] +
@@ -533,11 +506,11 @@ static void VS_CC vsOvCreate(
) {
std::unique_ptr d = nullptr;
-
+
try {
d = std::make_unique();
- } catch (const InferenceEngine::Exception& e) {
- vsapi->setError(out, ("[IE exception] Initialize inference engine: "s + e.what()).c_str());
+ } catch (const ov::Exception& e) {
+ vsapi->setError(out, ("[OV exception] Initialize inference engine: "s + e.what()).c_str());
return ;
} catch (const std::exception& e) {
vsapi->setError(out, ("[Standard exception] Initialize inference engine: "s + e.what()).c_str());
@@ -675,12 +648,11 @@ static void VS_CC vsOvCreate(
}
{
- InferenceEngine::CNNNetwork network;
+ std::shared_ptr network;
try {
- auto empty = InferenceEngine::Blob::CPtr();
- network = d->core.ReadNetwork(onnx_data, empty);
- } catch (const InferenceEngine::Exception& e) {
- return set_error("[IE exception] ReadNetwork(): "s + e.what());
+ network = d->core.read_model(onnx_data, ov::Tensor());
+ } catch (const ov::Exception& e) {
+ return set_error("[OV exception] ReadNetwork(): "s + e.what());
} catch (const std::exception& e) {
return set_error("[Standard exception] ReadNetwork(): "s + e.what());
}
@@ -689,10 +661,8 @@ static void VS_CC vsOvCreate(
return set_error(err.value());
}
- auto function = network.getFunction(); // mutable
-
try {
- ov::pass::ConstantFolding().run_on_model(function);
+ ov::pass::ConstantFolding().run_on_model(network);
} catch (const ov::Exception & e) {
return set_error(e.what());
}
@@ -701,7 +671,7 @@ static void VS_CC vsOvCreate(
const char * dot_path = vsapi->propGetData(in, "dot_path", 0, &error);
if (!error) {
try {
- ov::pass::VisualizeTree(dot_path, nullptr, true).run_on_model(function);
+ ov::pass::VisualizeTree(dot_path, nullptr, true).run_on_model(network);
} catch (const ov::Exception & e) {
return set_error(e.what());
}
@@ -714,11 +684,11 @@ static void VS_CC vsOvCreate(
if (std::holds_alternative(config_ret)) {
return set_error(std::get(config_ret));
}
- auto & config = std::get>(config_ret);
+ auto & config = std::get(config_ret);
try {
- d->executable_network = d->core.LoadNetwork(network, device, config);
- } catch (const InferenceEngine::Exception & e) {
+ d->executable_network = d->core.compile_model(network, device, config);
+ } catch (const ov::Exception & e) {
return set_error(e.what());
}
@@ -728,9 +698,6 @@ static void VS_CC vsOvCreate(
setDimensions(d->out_vi, d->executable_network, core, vsapi);
- d->input_name = d->executable_network.GetInputsInfo().cbegin()->first;
- d->output_name = d->executable_network.GetOutputsInfo().cbegin()->first;
-
VSCoreInfo core_info;
vsapi->getCoreInfo2(core, &core_info);
d->infer_requests.reserve(core_info.numThreads);
@@ -780,8 +747,10 @@ VS_EXTERNAL_API(void) VapourSynthPluginInit(
vsapi->propSetData(out, "version", VERSION, -1, paReplace);
std::ostringstream ostream;
- ostream << IE_VERSION_MAJOR << '.' << IE_VERSION_MINOR << '.' << IE_VERSION_PATCH;
- vsapi->propSetData(out, "inference_engine_version", ostream.str().c_str(), -1, paReplace);
+ ostream << OPENVINO_VERSION_MAJOR << '.' << OPENVINO_VERSION_MINOR << '.' << OPENVINO_VERSION_PATCH;
+ vsapi->propSetData(out, "openvino_version_build", ostream.str().c_str(), -1, paReplace);
+
+ vsapi->propSetData(out, "openvino_version", ov::get_openvino_version().buildNumber, -1, paReplace);
vsapi->propSetData(
out, "onnx_version",
@@ -798,13 +767,13 @@ VS_EXTERNAL_API(void) VapourSynthPluginInit(
auto availableDevices = [](const VSMap *, VSMap * out, void *, VSCore *, const VSAPI *vsapi) {
try {
- auto core = InferenceEngine::Core();
- auto devices = core.GetAvailableDevices();
+ auto core = ov::Core();
+ auto devices = core.get_available_devices();
for (const auto & device : devices) {
vsapi->propSetData(out, "devices", device.c_str(), -1, paAppend);
}
- } catch (const InferenceEngine::Exception& e) {
- vsapi->setError(out, ("[IE exception] Initialize inference engine: "s + e.what()).c_str());
+ } catch (const ov::Exception& e) {
+ vsapi->setError(out, ("[OV exception] Initialize inference engine: "s + e.what()).c_str());
} catch (const std::exception& e) {
vsapi->setError(out, ("[Standard exception] Initialize inference engine: "s + e.what()).c_str());
}
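
The OpenVINO plugin now targets the ov 2.x API (ov::Core, ov::CompiledModel, ov::AnyMap), and on the Python side the CPU configuration keys shown earlier are selected by runtime version. A condensed sketch of that selection follows; the function and parameter names are chosen for illustration.

    from typing import Tuple

    def ov_cpu_config(version: Tuple[int, int, int], num_streams: int,
                      num_threads: int, bind_thread: bool,
                      fp16: bool, bf16: bool) -> dict:
        if version >= (2024, 0, 0):
            # new-style property names passed to compile_model
            precision = "f16" if fp16 else ("bf16" if bf16 else "f32")
            return {
                "NUM_STREAMS": num_streams,
                "INFERENCE_NUM_THREADS": num_threads,
                "ENABLE_CPU_PINNING": "YES" if bind_thread else "NO",
                "INFERENCE_PRECISION_HINT": precision,
            }
        # legacy Inference Engine keys for older runtimes
        return {
            "CPU_THROUGHPUT_STREAMS": num_streams,
            "CPU_BIND_THREAD": "YES" if bind_thread else "NO",
            "CPU_THREADS_NUM": num_threads,
            "ENFORCE_BF16": "YES" if bf16 else "NO",
        }
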
diff --git a/vsov/win32.cpp b/vsov/win32.cpp
index a6572d7..e0b8d4d 100644
--- a/vsov/win32.cpp
+++ b/vsov/win32.cpp
@@ -14,10 +14,10 @@ namespace {
std::vector dlls = {
// This list must be sorted by dependency.
#ifdef WIN32_SHARED_OPENVINO
- L"tbb.dll",
+ L"tbb12.dll",
L"openvino.dll", // must be the last
#else // WIN32_SHARED_OPENVINO
- L"tbb.dll", // must be the last
+ L"tbb12.dll", // must be the last
#endif // WIN32_SHARED_OPENVINO
};
diff --git a/vstrt/CMakeLists.txt b/vstrt/CMakeLists.txt
index 7bddf7e..212a48f 100644
--- a/vstrt/CMakeLists.txt
+++ b/vstrt/CMakeLists.txt
@@ -8,7 +8,7 @@ set(VAPOURSYNTH_INCLUDE_DIRECTORY "" CACHE PATH "Path to VapourSynth headers")
set(TENSORRT_HOME "" CACHE PATH "Path to TensorRT")
option(USE_NVINFER_PLUGIN "Initialize nvinfer_plugin" FALSE)
option(USE_NVINFER_PLUGIN_STATIC "Use static nvinfer_plugin" FALSE)
-set(CUDNN_HOME "" CACHE PATH "Path to cuDNN")
+set(TENSORRT_LIBRARY_SUFFIX "" CACHE STRING "TensorRT library suffix")
FIND_PACKAGE(CUDAToolkit REQUIRED)
@@ -31,28 +31,21 @@ set_target_properties(vstrt PROPERTIES
)
target_link_directories(vstrt PRIVATE ${TENSORRT_HOME}/lib)
-target_link_libraries(vstrt PRIVATE CUDA::cudart_static nvinfer)
+target_link_libraries(vstrt PRIVATE CUDA::cudart_static "nvinfer${TENSORRT_LIBRARY_SUFFIX}")
if (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
- target_link_options(vstrt PRIVATE "/DELAYLOAD:nvinfer.dll" "delayimp.lib")
+ target_link_options(vstrt PRIVATE "/DELAYLOAD:nvinfer${TENSORRT_LIBRARY_SUFFIX}.dll" "delayimp.lib")
endif()
if (USE_NVINFER_PLUGIN)
add_definitions(-DUSE_NVINFER_PLUGIN)
if (USE_NVINFER_PLUGIN_STATIC)
- target_link_libraries(vstrt PRIVATE nvinfer_plugin_static)
-
- find_library(CUDNN_LIB cudnn HINTS ${CUDNN_ROOT_DIR} PATH_SUFFIXES lib)
- target_link_libraries(vstrt PRIVATE ${CUDNN_LIB} CUDA::cublas)
-
- if (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
- target_link_options(vstrt PRIVATE "/DELAYLOAD:cublas64_11.dll" "/DELAYLOAD:cudnn64_8.dll")
- endif()
+ target_link_libraries(vstrt PRIVATE "nvinfer_plugin_static${TENSORRT_LIBRARY_SUFFIX}")
else()
- target_link_libraries(vstrt PRIVATE nvinfer_plugin)
+ target_link_libraries(vstrt PRIVATE "nvinfer_plugin${TENSORRT_LIBRARY_SUFFIX}")
if (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
- target_link_options(vstrt PRIVATE "/DELAYLOAD:nvinfer_plugin.dll")
+ target_link_options(vstrt PRIVATE "/DELAYLOAD:nvinfer_plugin${TENSORRT_LIBRARY_SUFFIX}.dll")
endif()
endif()
endif()
diff --git a/vstrt/trt_utils.h b/vstrt/trt_utils.h
index 3a2e3c8..3c0feb9 100644
--- a/vstrt/trt_utils.h
+++ b/vstrt/trt_utils.h
@@ -7,7 +7,6 @@
#include
#include
#include
-#include
#include
#include
@@ -255,6 +254,16 @@ size_t getBytesPerSample(nvinfer1::DataType type) noexcept {
return 1;
case nvinfer1::DataType::kUINT8:
return 1;
+#if (NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR) * 10 + NV_TENSORRT_PATCH >= 861
+ case nvinfer1::DataType::kFP8:
+ return 1;
+#endif // (NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR) * 10 + NV_TENSORRT_PATCH >= 861
+#if NV_TENSORRT_MAJOR >= 9
+ case nvinfer1::DataType::kBF16:
+ return 2;
+ case nvinfer1::DataType::kINT64:
+ return 8;
+#endif // NV_TENSORRT_MAJOR >= 9
default:
return 0;
}
@@ -566,11 +575,20 @@ int getSampleType(nvinfer1::DataType type) noexcept {
switch (type) {
case nvinfer1::DataType::kFLOAT:
case nvinfer1::DataType::kHALF:
+#if (NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR) * 10 + NV_TENSORRT_PATCH >= 861
+ case nvinfer1::DataType::kFP8:
+#endif // (NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR) * 10 + NV_TENSORRT_PATCH >= 861
+#if NV_TENSORRT_MAJOR >= 9
+ case nvinfer1::DataType::kBF16:
+#endif // NV_TENSORRT_MAJOR >= 9
return 1;
case nvinfer1::DataType::kINT8:
case nvinfer1::DataType::kINT32:
case nvinfer1::DataType::kBOOL:
case nvinfer1::DataType::kUINT8:
+#if NV_TENSORRT_MAJOR >= 9
+ case nvinfer1::DataType::kINT64:
+#endif // NV_TENSORRT_MAJOR >= 9
return 0;
default:
return -1;
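
As a summary of what the updated trt_utils.h now handles, including the TensorRT 8.6.1+ and 9+ additions, here is an informal Python table of per-sample sizes and the VapourSynth sample category each type maps to; the keys are descriptive names, not the actual nvinfer1::DataType enumerators.

    TRT_DTYPE_INFO = {
        # name: (bytes per sample, VapourSynth sample type)
        "FLOAT": (4, "float"),
        "HALF":  (2, "float"),
        "BF16":  (2, "float"),  # TensorRT >= 9
        "FP8":   (1, "float"),  # TensorRT >= 8.6.1
        "INT8":  (1, "int"),
        "INT32": (4, "int"),
        "INT64": (8, "int"),    # TensorRT >= 9
        "BOOL":  (1, "int"),
        "UINT8": (1, "int"),
    }
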
diff --git a/vstrt/trtexec/CMakeLists.txt b/vstrt/trtexec/CMakeLists.txt
index 0b77c94..482ba83 100644
--- a/vstrt/trtexec/CMakeLists.txt
+++ b/vstrt/trtexec/CMakeLists.txt
@@ -3,6 +3,7 @@ cmake_minimum_required(VERSION 3.20)
project(trtexec LANGUAGES CXX)
set(TENSORRT_HOME "" CACHE PATH "Path to TensorRT")
+set(TENSORRT_LIBRARY_SUFFIX "" CACHE STRING "TensorRT library suffix")
find_package(CUDAToolkit REQUIRED)
@@ -10,16 +11,21 @@ add_executable(trtexec
$<$: trtexec.manifest>
trtexec.cpp
logfile.cpp
+ ../common/bfloat16.cpp
../common/logger.cpp
+ ../common/sampleDevice.cpp
../common/sampleEngines.cpp
../common/sampleInference.cpp
../common/sampleOptions.cpp
../common/sampleReporting.cpp
../common/sampleUtils.cpp
+ ../utils/fileLock.cpp
+ ../utils/timingCache.cpp
)
target_include_directories(trtexec PRIVATE
../common
+ ..
${CUDAToolkit_INCLUDE_DIRS}
${TENSORRT_HOME}/include
)
@@ -31,10 +37,9 @@ target_link_directories(trtexec PRIVATE
target_link_libraries(trtexec PRIVATE
CUDA::cudart_static
- nvinfer
- nvinfer_plugin
- nvparsers
- nvonnxparser
+ "nvinfer${TENSORRT_LIBRARY_SUFFIX}"
+ "nvinfer_plugin${TENSORRT_LIBRARY_SUFFIX}"
+ "nvonnxparser${TENSORRT_LIBRARY_SUFFIX}"
)
install(TARGETS trtexec RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
diff --git a/vstrt/vs_tensorrt.cpp b/vstrt/vs_tensorrt.cpp
index b16402b..0f22832 100644
--- a/vstrt/vs_tensorrt.cpp
+++ b/vstrt/vs_tensorrt.cpp
@@ -366,9 +366,12 @@ static void VS_CC vsTrtCreate(
d->logger.set_verbosity(static_cast(verbosity));
#ifdef USE_NVINFER_PLUGIN
+ // related to https://github.com/AmusementClub/vs-mlrt/discussions/65; the root cause is unknown
+#if !(NV_TENSORRT_MAJOR == 9 && defined(_WIN32))
if (!initLibNvInferPlugins(&d->logger, "")) {
vsapi->logMessage(mtWarning, "vsTrt: Initialize TensorRT plugins failed");
}
+#endif
#endif
std::ifstream engine_stream {
@@ -446,7 +449,17 @@ static void VS_CC vsTrtCreate(
auto input_type = d->engines[0]->getBindingDataType(0);
#endif // NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85
- auto input_sample_type = getSampleType(input_type) == 0 ? stInteger : stFloat;
+ VSSampleType input_sample_type;
+ {
+ auto sample_type = getSampleType(input_type);
+ if (sample_type == 0) {
+ input_sample_type = stInteger;
+ } else if (sample_type == 1) {
+ input_sample_type = stFloat;
+ } else {
+ return set_error("unknown input sample type");
+ }
+ }
auto input_bits_per_sample = getBytesPerSample(input_type) * 8;
if (auto err = checkNodes(in_vis, input_sample_type, input_bits_per_sample); err.has_value()) {
@@ -462,7 +475,17 @@ static void VS_CC vsTrtCreate(
auto output_type = d->engines[0]->getBindingDataType(1);
#endif // NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85
- auto output_sample_type = getSampleType(output_type) == 0 ? stInteger : stFloat;
+ VSSampleType output_sample_type;
+ {
+ auto sample_type = getSampleType(output_type);
+ if (sample_type == 0) {
+ output_sample_type = stInteger;
+ } else if (sample_type == 1) {
+ output_sample_type = stFloat;
+ } else {
+ return set_error("unknown output sample type");
+ }
+ }
auto output_bits_per_sample = getBytesPerSample(output_type) * 8;
setDimensions(
@@ -489,13 +512,22 @@ VS_EXTERNAL_API(void) VapourSynthPluginInit(
VAPOURSYNTH_API_VERSION, 1, plugin
);
+ // TRT 9 for windows does not export getInferLibVersion()
+#if NV_TENSORRT_MAJOR == 9 && defined(_WIN32)
+ auto test = getPluginRegistry();
+
+ if (test == nullptr) {
+ std::fprintf(stderr, "vstrt: TensorRT failed to load.\n");
+ return;
+ }
+#else // NV_TENSORRT_MAJOR == 9 && defined(_WIN32)
int ver = getInferLibVersion(); // must ensure this is the first nvinfer function called
#ifdef _WIN32
if (ver == 0) { // a sentinel value, see dummy function in win32.cpp.
std::fprintf(stderr, "vstrt: TensorRT failed to load.\n");
return;
}
-#endif
+#endif // _WIN32
if (ver != NV_TENSORRT_VERSION) {
#if NV_TENSORRT_MAJOR >= 10
std::fprintf(
@@ -513,6 +545,7 @@ VS_EXTERNAL_API(void) VapourSynthPluginInit(
);
#endif // NV_TENSORRT_MAJOR >= 10
}
+#endif // NV_TENSORRT_MAJOR == 9 && defined(_WIN32)
myself = plugin;
@@ -535,7 +568,12 @@ VS_EXTERNAL_API(void) VapourSynthPluginInit(
vsapi->propSetData(
out, "tensorrt_version",
- std::to_string(getInferLibVersion()).c_str(), -1, paReplace
+#if NV_TENSORRT_MAJOR == 9 && defined(_WIN32)
+ std::to_string(NV_TENSORRT_VERSION).c_str(),
+#else
+ std::to_string(getInferLibVersion()).c_str(),
+#endif
+ -1, paReplace
);
vsapi->propSetData(
diff --git a/vstrt/win32.cpp b/vstrt/win32.cpp
index a0cfcfc..b984803 100644
--- a/vstrt/win32.cpp
+++ b/vstrt/win32.cpp
@@ -10,22 +10,32 @@
#include
+#include
+
+#if NV_TENSORRT_VERSION >= 100100
+// two-level expansion: the version argument (e.g. NV_TENSORRT_MAJOR) must be
+// macro-expanded before it is stringized into the DLL name
+#define CONCAT_VERSION_IMPL(name, version) (name "_" #version ".dll")
+#define CONCAT_VERSION(name, version) CONCAT_VERSION_IMPL(name, version)
+#endif // NV_TENSORRT_VERSION >= 100100
+
namespace {
std::vector dlls = {
// This list must be sorted by dependency.
- L"zlibwapi.dll", // cuDNN version 8.3.0+ depends on zlib as a shared library dependency
- L"cudnn_ops_infer64_8.dll",
- L"cudnn_cnn_infer64_8.dll",
- L"cudnn64_8.dll",
+#if NV_TENSORRT_VERSION >= 100100
+#ifdef USE_NVINFER_PLUGIN
+ // nvinfer_plugin dependencies
+ CONCAT_VERSION(L"nvinfer", NV_TENSORRT_MAJOR),
+ CONCAT_VERSION(L"nvinfer_plugin", NV_TENSORRT_MAJOR),
+#endif // USE_NVINFER_PLUGIN
+ // Finally, nvinfer again.
+ CONCAT_VERSION(L"nvinfer", NV_TENSORRT_MAJOR), // must be the last
+#else // NV_TENSORRT_VERSION >= 100100
#ifdef USE_NVINFER_PLUGIN
// nvinfer_plugin dependencies
L"nvinfer.dll",
- L"cublasLt64_11.dll",
- L"cublas64_11.dll",
L"nvinfer_plugin.dll",
-#endif
+#endif // USE_NVINFER_PLUGIN
// Finally, nvinfer again.
L"nvinfer.dll", // must be the last
+#endif // NV_TENSORRT_VERSION >= 100100
};
namespace fs = std::filesystem;
@@ -64,7 +74,11 @@ FARPROC loadDLLs() {
return (FARPROC)h;
}
+#if NV_TENSORRT_MAJOR == 9 && defined(_WIN32)
+static void * dummy() { // mimic getPluginRegistry
+#else
static int dummy() { // mimic getInferLibVersion
+#endif
return 0;
}
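
As a recap of the Windows preload change in vstrt/win32.cpp: for TensorRT >= 10.1 the patch switches to version-suffixed DLL names (e.g. nvinfer_10.dll), keeps the plain names for older releases, and nvinfer must remain last in the list. The small helper below is illustrative only (not part of the patch) and reproduces the resulting list.

    from typing import List

    def tensorrt_preload_dlls(version: tuple, use_nvinfer_plugin: bool) -> List[str]:
        major = version[0]
        suffix = f"_{major}" if version >= (10, 1, 0) else ""
        dlls: List[str] = []
        if use_nvinfer_plugin:
            # nvinfer_plugin and its dependency first
            dlls += [f"nvinfer{suffix}.dll", f"nvinfer_plugin{suffix}.dll"]
        dlls.append(f"nvinfer{suffix}.dll")  # must be the last
        return dlls

    # tensorrt_preload_dlls((10, 1, 0), True)
    #   -> ['nvinfer_10.dll', 'nvinfer_plugin_10.dll', 'nvinfer_10.dll']
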