diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt
index 85f0dba..43f6f90 100644
--- a/benchmarks/CMakeLists.txt
+++ b/benchmarks/CMakeLists.txt
@@ -20,3 +20,4 @@ add_subdirectory(memory)
 add_subdirectory(overhead)
 add_subdirectory(reduction)
 add_subdirectory(subgroup)
+add_subdirectory(vmt)
diff --git a/benchmarks/vmt/CMakeLists.txt b/benchmarks/vmt/CMakeLists.txt
new file mode 100644
index 0000000..cc9f522
--- /dev/null
+++ b/benchmarks/vmt/CMakeLists.txt
@@ -0,0 +1,46 @@
+# Copyright 2023 Nod Inc.
+# Copyright 2023 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#-------------------------------------------------------------------------------
+# vmt (RDNA3)
+#-------------------------------------------------------------------------------
+
+uvkc_glsl_shader_permutation(
+  NAME
+    vmt_i8_shader_rdna3
+  SRC
+    "vmt_i8.glsl"
+  PERMUTATION
+    "N0=[1|2|4]"
+    "K0=[16|32|64|128]"
+    "WG_X=64"
+    "WG_Y=[1|2|4]"
+  GLSLC_ARGS
+    "--target-env=vulkan1.1"
+)
+
+uvkc_cc_binary(
+  NAME
+    vmt_rdna3
+  SRCS
+    "vmt_main.cc"
+  DEPS
+    ::vmt_i8_shader_rdna3
+    benchmark::benchmark
+    uvkc::benchmark::core
+    uvkc::benchmark::main
+  COPTS
+    -DUVKC_RDNA3
+)
diff --git a/benchmarks/vmt/README.md b/benchmarks/vmt/README.md
new file mode 100644
index 0000000..bce58dc
--- /dev/null
+++ b/benchmarks/vmt/README.md
@@ -0,0 +1,4 @@
+# vmt benchmark
+
+This directory contains microbenchmarks for evaluating different strategies to
+implement vector times transposed-matrix multiplication.
diff --git a/benchmarks/vmt/vmt_i8.glsl b/benchmarks/vmt/vmt_i8.glsl
new file mode 100644
index 0000000..269655d
--- /dev/null
+++ b/benchmarks/vmt/vmt_i8.glsl
@@ -0,0 +1,121 @@
+// Copyright 2023 Nod Inc.
+// Copyright 2023 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#version 450 core
+#extension GL_KHR_shader_subgroup_arithmetic : enable
+#pragma use_vulkan_memory_model
+
+#extension GL_EXT_scalar_block_layout : enable
+#extension GL_EXT_control_flow_attributes : enable
+#extension GL_EXT_shader_explicit_arithmetic_types : enable
+#extension GL_EXT_shader_explicit_arithmetic_types_int8 : require
+#extension GL_EXT_shader_explicit_arithmetic_types_int16 : require
+#extension GL_EXT_shader_explicit_arithmetic_types_int32 : require
+
+#extension GL_KHR_shader_subgroup_basic : enable
+
+// Multiplies vector `inputA` of length `K` by matrix `inputB`
+// of size `K x N`.
+// We use `N0` and `K0` to denote the tile sizes for N and K,
+// respectively.
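+// For example, with K0 = 32 every invocation loads two packed i32vec4 values
+// (2 x 16 = 32 int8 elements) from the vector and from its assigned row on
+// each step of the loop over K, before the subgroup-level reduction.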
+//
+// We assign `N0` rows for each subgroup to process.
+// Subgroups load a batch of values from the vector and the row,
+// calculate the inner product of the two, and accumulate the results at the
+// subgroup level.
+// Each workgroup produces `N0` * `WG_Y` results. We assume that WG_X is the
+// same as the subgroup size to simplify the implementation. This is a shortcut
+// that should be fixed.
+
+layout(binding = 0) buffer InputA { i32vec4 x[]; } inputA;   // Input vector.
+layout(binding = 1) buffer InputB { i32vec4 x[]; } inputB;   // Input matrix.
+layout(binding = 2) buffer Output { int32_t x[]; } outputO;  // Output vector.
+
+// These are constants defined at compile time.
+layout(local_size_x = WG_X, local_size_y = WG_Y, local_size_z = 1) in;
+
+layout(constant_id = 0) const uint N = 1;
+layout(constant_id = 1) const uint K = 1;
+
+// We process 16-element i8 vectors along the K dimension, and treat i32vec4
+// (packed format) as the effective element type.
+const uint VECTORIZE_K = 16;
+const uint K_VEC = K / VECTORIZE_K;
+const uint K0_VEC = K0 / VECTORIZE_K;
+
+const uint strideB = K_VEC;  // Stride of the `inputB` matrix.
+
+// Each subgroup processes a total of N0 rows, therefore
+// each workgroup processes N0 * WG_Y rows.
+const uint WG_ROWS = N0 * WG_Y;
+
+// Offset between elements accessed by a thread in a workgroup.
+const uint WG_K_STRIDE = WG_X * K0_VEC;
+
+/// Returns the index of `X[i, j]`, where `X` is a 2D matrix of stride `stride`.
+uint coordToOffset(uint i, uint j, uint stride) { return stride * i + j; }
+
+int32_t sdot(int32_t lhsPacked, int32_t rhsPacked) {
+  i8vec4 lhs = unpack8(lhsPacked);
+  i8vec4 rhs = unpack8(rhsPacked);
+  i16vec4 mul = i16vec4(lhs) * i16vec4(rhs);
+  return int32_t(mul.x) + int32_t(mul.y) + int32_t(mul.z) + int32_t(mul.w);
+}
+
+int32_t sdot4(i32vec4 lhs, i32vec4 rhs) {
+  return sdot(lhs.x, rhs.x) + sdot(lhs.y, rhs.y) + sdot(lhs.z, rhs.z) +
+         sdot(lhs.w, rhs.w);
+}
+
+void main() {
+  const uvec2 wgID = gl_WorkGroupID.xy;
+  const uvec2 localID = gl_LocalInvocationID.xy;
+  const uint threadID = gl_SubgroupInvocationID;
+
+  // The start row of the tile processed by this workgroup.
+  const uint startRow = wgID.x * WG_ROWS;
+
+  for (uint y = 0; y < N0; ++y) {
+    uint r = startRow + y * WG_Y + localID.y;
+    int32_t laneResult = 0;
+    i32vec4 tileA[K0_VEC];
+    i32vec4 tileB[K0_VEC];
+
+    for (uint k = 0; k < K_VEC; k += WG_K_STRIDE) {
+      // Prefetch LHS and RHS to reduce latency.
+      [[unroll]] for (uint kk = 0; kk < K0_VEC; ++kk) {
+        uint gk = k + kk * WG_X + threadID;
+        tileB[kk] = inputB.x[coordToOffset(r, gk, strideB)];
+        tileA[kk] = inputA.x[gk];
+      }
+
+      [[unroll]] for (uint kk = 0; kk < K0_VEC; ++kk) {
+        i32vec4 lhs = tileA[kk];
+        i32vec4 rhs = tileB[kk];
+        laneResult += sdot4(lhs, rhs);
+      }
+    }
+
+    // Final reduction within one subgroup.
+    int32_t sgResult = subgroupAdd(laneResult);
+    if (subgroupElect()) {
+      outputO.x[r] = sgResult;
+    }
+  }
+
+  // Assert that the subgroup and workgroup sizes match.
+  // This simplifies the code but doesn't have to be true on all targets.
+  if (threadID != localID.x)
+    outputO.x[0] = -1;
+}
diff --git a/benchmarks/vmt/vmt_main.cc b/benchmarks/vmt/vmt_main.cc
new file mode 100644
index 0000000..a1d881e
--- /dev/null
+++ b/benchmarks/vmt/vmt_main.cc
@@ -0,0 +1,383 @@
+// Copyright 2023 Nod Inc.
+// Copyright 2020-2023 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <chrono>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "absl/strings/str_cat.h"
+#include "absl/strings/str_format.h"
+#include "absl/types/span.h"
+#include "benchmark/benchmark.h"
+#include "uvkc/base/log.h"
+#include "uvkc/benchmark/data_type_util.h"
+#include "uvkc/benchmark/main.h"
+#include "uvkc/benchmark/status_util.h"
+#include "uvkc/benchmark/vulkan_buffer_util.h"
+#include "uvkc/benchmark/vulkan_context.h"
+#include "uvkc/vulkan/device.h"
+#include "uvkc/vulkan/pipeline.h"
+
+using namespace uvkc::benchmark;
+using ::uvkc::benchmark::LatencyMeasureMode;
+using ::uvkc::vulkan::Pipeline;
+
+static const char kBenchmarkName[] = "vmt";
+
+struct ShaderCode {
+  const char *name;                 // Shader case name
+  absl::Span<const uint32_t> code;  // SPIR-V code
+  int N0;
+  int K0;
+  int wg_size_x;
+  int wg_size_y;
+  DataType input_type;   // LHS & RHS element type
+  DataType output_type;  // Output/result element type
+};
+
+#define SHADER_I8(N0, K0, X, Y)                                               \
+  ShaderCode {                                                                \
+    "Tile[" #N0 "x" #K0 "]", vmt_i8::N0_##N0##_K0_##K0##_WG_X_##X##_WG_Y_##Y, \
+        N0, K0, X, Y, DataType::i8, DataType::i32                            \
+  }
+
+#define WORKGROUP_TILE_N_I8(X, Y, N0)                                        \
+  SHADER_I8(N0, 16, X, Y), SHADER_I8(N0, 32, X, Y), SHADER_I8(N0, 64, X, Y), \
+      SHADER_I8(N0, 128, X, Y)
+
+#if defined(UVKC_RDNA3)
+
+namespace vmt_i8 {
+#include "vmt_i8_shader_rdna3_spirv_permutation.inc"
+}
+
+static ShaderCode kShaderCodeCases[] = {
+    WORKGROUP_TILE_N_I8(64, 1, 1), WORKGROUP_TILE_N_I8(64, 1, 2),
+    WORKGROUP_TILE_N_I8(64, 1, 4), WORKGROUP_TILE_N_I8(64, 2, 2),
+    WORKGROUP_TILE_N_I8(64, 2, 4), WORKGROUP_TILE_N_I8(64, 4, 4),
+};
+
+#else
+#error "unsupported GPU architecture/strategy"
+#endif
+
+/// Fills the 2D matrix with values produced by the |generator| function.
+template <typename GeneratorFn>
+static void FillBuffer(DataType data_type, void *raw_buffer, size_t num_bytes,
+                       unsigned dim_1, unsigned dim_2, GeneratorFn generator) {
+  auto fill = [&](auto traits) {
+    using Traits = decltype(traits);
+    using StorageType = typename Traits::storage_type;
+    using RuntimeType = typename Traits::runtime_type;
+    auto buffer = absl::MakeSpan(static_cast<StorageType *>(raw_buffer),
+                                 num_bytes / GetSize(data_type));
+
+    for (int i = 0; i < dim_1; ++i) {
+      for (int j = 0; j < dim_2; ++j) {
+        buffer[j + i * dim_2] =
+            static_cast<StorageType>(RuntimeType(generator(i, j)));
+      }
+    }
+  };
+
+  InvokeWithTraits(data_type, fill);
+}
+
+/// Checks that the output vector calculated by the shader contains the same
+/// values as a host-side vecmat over the inputs defined by |lhs| and |rhs|.
+template <DataType OutputType, DataType InputType, typename Generator1Fn,
+          typename Generator2Fn>
+static void CheckOutput(const ShaderCode &shader, void *raw_buffer,
+                        size_t num_bytes, unsigned N, unsigned K,
+                        Generator1Fn lhs, Generator2Fn rhs) {
+  using OutputTraits = DataTypeTraits<OutputType>;
+  using OutputStorageType = typename OutputTraits::storage_type;
+  using OutputRuntimeType = typename OutputTraits::runtime_type;
+  using InputTraits = DataTypeTraits<InputType>;
+  using InputRuntimeType = typename InputTraits::runtime_type;
+
+  auto output =
+      absl::MakeConstSpan(static_cast<const OutputStorageType *>(raw_buffer),
+                          num_bytes / GetSize(OutputType));
+  for (int j = 0; j < N; ++j) {
+    OutputRuntimeType acc(0.0f);
+    for (int k = 0; k < K; ++k) {
+      acc += OutputRuntimeType(InputRuntimeType(lhs(0, k))) *
+             OutputRuntimeType(InputRuntimeType(rhs(j, k)));
+    }
+
+    OutputRuntimeType gpuValue(output[j]);
+    BM_CHECK_EQ(gpuValue, acc)
+        << "destination buffer element (" << j << ")"
+        << " has incorrect value: expected to be " << acc << " but found "
+        << gpuValue << "\n\t^ In shader: " << shader.name << ", "
+        << GetName(shader.input_type) << "->" << GetName(shader.output_type);
+  }
+}
+
+static void Vmt(::benchmark::State &state, ::uvkc::vulkan::Device *device,
+                const ::uvkc::benchmark::LatencyMeasure *latency_measure,
+                const ShaderCode &shader, int N, int K) {
+  //===-------------------------------------------------------------------===/
+  // Create shader module, pipeline, and descriptor sets
+  //===-------------------------------------------------------------------===/
+
+  BM_CHECK_OK_AND_ASSIGN(
+      auto shader_module,
+      device->CreateShaderModule(shader.code.data(), shader.code.size()));
+
+  ::uvkc::vulkan::Pipeline::SpecConstant spec_constant[2] = {
+      {/*id=*/0, Pipeline::SpecConstant::Type::s32, N},
+      {/*id=*/1, Pipeline::SpecConstant::Type::s32, K},
+  };
+  BM_CHECK_OK_AND_ASSIGN(
+      auto pipeline,
+      device->CreatePipeline(*shader_module, "main", spec_constant));
+
+  BM_CHECK_OK_AND_ASSIGN(auto descriptor_pool,
+                         device->CreateDescriptorPool(*shader_module));
+  BM_CHECK_OK_AND_ASSIGN(auto layout_set_map,
+                         descriptor_pool->AllocateDescriptorSets(
+                             shader_module->descriptor_set_layouts()));
+
+  //===-------------------------------------------------------------------===/
+  // Create buffers
+  //===-------------------------------------------------------------------===/
+  DataType input_type = shader.input_type;
+  DataType output_type = shader.output_type;
+  const size_t src0_size = K * GetSize(input_type);
+  const size_t src1_size = K * N * GetSize(input_type);
+  const size_t dst_size = N * GetSize(output_type);
+
+  BM_CHECK_OK_AND_ASSIGN(
+      auto src0_buffer,
+      device->CreateBuffer(
+          VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
+          VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, src0_size));
+  BM_CHECK_OK_AND_ASSIGN(
+      auto src1_buffer,
+      device->CreateBuffer(
+          VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
+          VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, src1_size));
+  BM_CHECK_OK_AND_ASSIGN(
+      auto dst_buffer,
+      device->CreateBuffer(
+          VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
+          VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, dst_size));
+
+  //===-------------------------------------------------------------------===/
+  // Set source buffer data
+  //===-------------------------------------------------------------------===/
+  auto getLhs = [K](int i, int j) {
+    float v = ((float)((i * K + j) % 5) - 1.0f) / 2.0f;
+    return v;
+  };
+  auto getRhs = [K](int i, int j) {
+    float v = ((float)((i * K + j) % 7) - 1.0f) / 2.0f;
+    return v;
+  };
+
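+  // For reference: getLhs cycles through {-0.5, 0, 0.5, 1, 1.5} and getRhs
+  // through {-0.5, 0, 0.5, 1, 1.5, 2, 2.5}. Casting to int8 truncates these
+  // toward zero, and CheckOutput applies the same conversion, so the host
+  // reference matches the GPU result and the i32 accumulator stays far from
+  // overflow for the benchmarked sizes.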
+  BM_CHECK_OK(::uvkc::benchmark::SetDeviceBufferViaStagingBuffer(
+      device, src0_buffer.get(), src0_size, [&](void *ptr, size_t num_bytes) {
+        FillBuffer(input_type, ptr, num_bytes, 1, K, getLhs);
+      }));
+
+  // In vmt, the RHS input is transposed, which makes the matrix
+  // column-major.
+  BM_CHECK_OK(::uvkc::benchmark::SetDeviceBufferViaStagingBuffer(
+      device, src1_buffer.get(), src1_size, [&](void *ptr, size_t num_bytes) {
+        FillBuffer(input_type, ptr, num_bytes, N, K, getRhs);
+      }));
+
+  //===-------------------------------------------------------------------===/
+  // Clear the output buffer data set by the previous benchmark run
+  //===-------------------------------------------------------------------===/
+
+  BM_CHECK_OK(::uvkc::benchmark::SetDeviceBufferViaStagingBuffer(
+      device, dst_buffer.get(), dst_size, [&](void *ptr, size_t num_bytes) {
+        FillBuffer(output_type, ptr, num_bytes, 1, N,
+                   [](int, int) { return 0.0f; });
+      }));
+
+  //===-------------------------------------------------------------------===/
+  // Dispatch
+  //===-------------------------------------------------------------------===/
+  std::vector<::uvkc::vulkan::Device::BoundBuffer> bound_buffers = {
+      {src0_buffer.get(), /*set=*/0, /*binding=*/0},
+      {src1_buffer.get(), /*set=*/0, /*binding=*/1},
+      {dst_buffer.get(), /*set=*/0, /*binding=*/2},
+  };
+  BM_CHECK_OK(device->AttachBufferToDescriptor(
+      *shader_module, layout_set_map,
+      {bound_buffers.data(), bound_buffers.size()}));
+
+  BM_CHECK_EQ(shader_module->descriptor_set_layouts().size(), 1)
+      << "unexpected number of descriptor sets (" << shader.name << ")";
+  auto descriptor_set_layout = shader_module->descriptor_set_layouts().front();
+
+  std::vector<::uvkc::vulkan::CommandBuffer::BoundDescriptorSet>
+      bound_descriptor_sets(1);
+  bound_descriptor_sets[0].index = 0;
+  bound_descriptor_sets[0].set = layout_set_map.at(descriptor_set_layout);
+  BM_CHECK_OK_AND_ASSIGN(auto dispatch_cmdbuf, device->AllocateCommandBuffer());
+
+  BM_CHECK_OK(dispatch_cmdbuf->Begin());
+  dispatch_cmdbuf->BindPipelineAndDescriptorSets(
+      *pipeline, {bound_descriptor_sets.data(), bound_descriptor_sets.size()});
+  // Each subgroup processes N0 rows and each workgroup has WG_Y subgroups,
+  // so every workgroup covers N0 * WG_Y rows.
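+  // For example, N = 4096 with N0 = 2 and WG_Y = 2 launches 1024 workgroups.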
+  dispatch_cmdbuf->Dispatch(N / (shader.N0 * shader.wg_size_y), 1, 1);
+  BM_CHECK_OK(dispatch_cmdbuf->End());
+  BM_CHECK_OK(device->QueueSubmitAndWait(*dispatch_cmdbuf));
+
+  //===-------------------------------------------------------------------===/
+  // Verify destination buffer data
+  //===-------------------------------------------------------------------===/
+
+  if (output_type == DataType::i32) {
+    BM_CHECK_OK(::uvkc::benchmark::GetDeviceBufferViaStagingBuffer(
+        device, dst_buffer.get(), dst_size, [&](void *ptr, size_t num_bytes) {
+          if (input_type == DataType::i8) {
+            CheckOutput<DataType::i32, DataType::i8>(shader, ptr, num_bytes, N,
+                                                     K, getLhs, getRhs);
+          } else {
+            BM_CHECK(false) << "Unhandled input type";
+          }
+        }));
+  } else {
+    BM_CHECK(false) << "Unhandled output type";
+  }
+
+  //===-------------------------------------------------------------------===/
+  // Benchmarking
+  //===-------------------------------------------------------------------===/
+
+  std::unique_ptr<::uvkc::vulkan::TimestampQueryPool> query_pool;
+  bool use_timestamp =
+      latency_measure->mode == LatencyMeasureMode::kGpuTimestamp;
+  if (use_timestamp) {
+    BM_CHECK_OK_AND_ASSIGN(query_pool, device->CreateTimestampQueryPool(2));
+  }
+
+  BM_CHECK_OK_AND_ASSIGN(auto cmdbuf, device->AllocateCommandBuffer());
+  for (auto _ : state) {
+    BM_CHECK_OK(cmdbuf->Begin());
+    if (use_timestamp) cmdbuf->ResetQueryPool(*query_pool);
+
+    cmdbuf->BindPipelineAndDescriptorSets(
+        *pipeline,
+        {bound_descriptor_sets.data(), bound_descriptor_sets.size()});
+
+    if (use_timestamp) {
+      cmdbuf->WriteTimestamp(*query_pool, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, 0);
+    }
+
+    cmdbuf->Dispatch(N / (shader.N0 * shader.wg_size_y), 1, 1);
+
+    if (use_timestamp) {
+      cmdbuf->WriteTimestamp(*query_pool, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT,
+                             1);
+    }
+
+    BM_CHECK_OK(cmdbuf->End());
+
+    auto start_time = std::chrono::high_resolution_clock::now();
+    BM_CHECK_OK(device->QueueSubmitAndWait(*cmdbuf));
+    auto end_time = std::chrono::high_resolution_clock::now();
+    auto elapsed_seconds =
+        std::chrono::duration_cast<std::chrono::duration<double>>(end_time -
+                                                                  start_time);
+
+    switch (latency_measure->mode) {
+      case LatencyMeasureMode::kSystemDispatch: {
+        state.SetIterationTime(elapsed_seconds.count() -
+                               latency_measure->overhead_seconds);
+      } break;
+      case LatencyMeasureMode::kSystemSubmit: {
+        state.SetIterationTime(elapsed_seconds.count());
+      } break;
+      case LatencyMeasureMode::kGpuTimestamp: {
+        BM_CHECK_OK_AND_ASSIGN(
+            double timestamp_seconds,
+            query_pool->CalculateElapsedSecondsBetween(0, 1));
+        state.SetIterationTime(timestamp_seconds);
+      } break;
+    }
+
+    BM_CHECK_OK(cmdbuf->Reset());
+  }
+
+  double numOperation =
+      double(N) * double(K) + double(K) + double(N) * sizeof(int32_t);
+  state.counters["Bytes"] =
+      ::benchmark::Counter(numOperation,
+                           ::benchmark::Counter::kIsIterationInvariant |
+                               ::benchmark::Counter::kIsRate,
+                           ::benchmark::Counter::kIs1000);
+
+  // Reset the command pool to release all command buffers in the benchmarking
+  // loop to avoid exhausting GPU resources.
+  BM_CHECK_OK(device->ResetCommandPool());
+}
+
+// Returns true iff |a| is a multiple of |b|.
+static bool isMultipleOf(int a, int b) { return a >= b && a % b == 0; }
+
+namespace uvkc::benchmark {
+
+absl::StatusOr<std::unique_ptr<VulkanContext>> CreateVulkanContext() {
+  return CreateDefaultVulkanContext(kBenchmarkName);
+}
+
+bool RegisterVulkanOverheadBenchmark(
+    const vulkan::Driver::PhysicalDeviceInfo &physical_device,
+    vulkan::Device *device, double *overhead_seconds) {
+  return false;
+}
+
+void RegisterVulkanBenchmarks(
+    const vulkan::Driver::PhysicalDeviceInfo &physical_device,
+    vulkan::Device *device, const LatencyMeasure *latency_measure) {
+  const char *gpu_name = physical_device.v10_properties.deviceName;
+
+  for (const int sz : {4096, 8192, 16384}) {
+    const int N = sz;
+    const int K = sz;
+    for (const ShaderCode &shader : kShaderCodeCases) {
+      std::string vecmat_size = absl::StrCat(N, "x", K);
+      std::string tiling_scheme = absl::StrCat(shader.N0, "x", shader.K0);
+      BM_CHECK(isMultipleOf(N, shader.N0))
+          << "Incompatible tiling scheme: " << tiling_scheme;
+      BM_CHECK(isMultipleOf(K, shader.K0))
+          << "Incompatible tiling scheme: " << tiling_scheme;
+      BM_CHECK(isMultipleOf(shader.K0, 4))
+          << "Incompatible tiling scheme: " << tiling_scheme;
+
+      std::string workgroup_size =
+          absl::StrCat(shader.wg_size_x, "x", shader.wg_size_y, "x1");
+      std::string type_info = absl::StrCat(GetName(shader.input_type), "->",
+                                           GetName(shader.output_type));
+      std::string test_name =
+          absl::StrCat(gpu_name, "/vmt[", vecmat_size, "]/", type_info, "/",
+                       shader.name, "/Workgroup[", workgroup_size, "]");
+      ::benchmark::RegisterBenchmark(test_name.c_str(), Vmt, device,
+                                     latency_measure, shader, N, K)
+          ->UseManualTime()
+          ->Unit(::benchmark::kMicrosecond);
+    }
+  }
+}
+
+}  // namespace uvkc::benchmark