diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt
index 85f0dba..43f6f90 100644
--- a/benchmarks/CMakeLists.txt
+++ b/benchmarks/CMakeLists.txt
@@ -20,3 +20,4 @@ add_subdirectory(memory)
 add_subdirectory(overhead)
 add_subdirectory(reduction)
 add_subdirectory(subgroup)
+add_subdirectory(vmt)
diff --git a/benchmarks/vmt/CMakeLists.txt b/benchmarks/vmt/CMakeLists.txt
new file mode 100644
index 0000000..cc9f522
--- /dev/null
+++ b/benchmarks/vmt/CMakeLists.txt
@@ -0,0 +1,46 @@
+# Copyright 2023 Nod Inc.
+# Copyright 2023 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#-------------------------------------------------------------------------------
+# vmt (RDNA3)
+#-------------------------------------------------------------------------------
+
+uvkc_glsl_shader_permutation(
+  NAME
+    vmt_i8_shader_rdna3
+  SRC
+    "vmt_i8.glsl"
+  PERMUTATION
+    "N0=[1|2|4]"
+    "K0=[16|32|64|128]"
+    "WG_X=64"
+    "WG_Y=[1|2|4]"
+  GLSLC_ARGS
+    "--target-env=vulkan1.1"
+)
+
+uvkc_cc_binary(
+  NAME
+    vmt_rdna3
+  SRCS
+    "vmt_main.cc"
+  DEPS
+    ::vmt_i8_shader_rdna3
+    benchmark::benchmark
+    uvkc::benchmark::core
+    uvkc::benchmark::main
+  COPTS
+    -DUVKC_RDNA3
+)
diff --git a/benchmarks/vmt/README.md b/benchmarks/vmt/README.md
new file mode 100644
index 0000000..bce58dc
--- /dev/null
+++ b/benchmarks/vmt/README.md
@@ -0,0 +1,4 @@
+# vmt benchmark
+
+This directory contains microbenchmarks for evaluating different strategies to
+implement vector times transposed-matrix multiplication.
diff --git a/benchmarks/vmt/vmt_i8.glsl b/benchmarks/vmt/vmt_i8.glsl
new file mode 100644
index 0000000..269655d
--- /dev/null
+++ b/benchmarks/vmt/vmt_i8.glsl
@@ -0,0 +1,121 @@
+// Copyright 2023 Nod Inc.
+// Copyright 2023 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#version 450 core
+#extension GL_KHR_shader_subgroup_arithmetic : enable
+#pragma use_vulkan_memory_model
+
+#extension GL_EXT_scalar_block_layout : enable
+#extension GL_EXT_control_flow_attributes : enable
+#extension GL_EXT_shader_explicit_arithmetic_types : enable
+#extension GL_EXT_shader_explicit_arithmetic_types_int8 : require
+#extension GL_EXT_shader_explicit_arithmetic_types_int16 : require
+#extension GL_EXT_shader_explicit_arithmetic_types_int32 : require
+
+#extension GL_KHR_shader_subgroup_basic : enable
+
+// Multiplies vector `inputA` of length `K` by matrix `inputB`
+// of size `K x N`.
+// We use `N0` and `K0` to denote the tile sizes for N and K,
+// respectively.
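+// For example, with K0 = 32 every invocation loads two packed i32vec4 values
+// (2 x 16 = 32 int8 elements) from the vector and from its assigned row on
+// each step of the loop over K, before the subgroup-level reduction.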
+//
+// We assign `N0` rows for each subgroup to process.
+// Subgroups load a batch of values from the vector and the row,
+// calculate the inner product of the two, and accumulate the results at the
+// subgroup level.
+// Each workgroup produces `N0` * `WG_Y` results. We assume that WG_X is the
+// same as the subgroup size to simplify the implementation. This is a shortcut
+// that should be fixed.
+
+layout(binding = 0) buffer InputA { i32vec4 x[]; } inputA;   // Input vector.
+layout(binding = 1) buffer InputB { i32vec4 x[]; } inputB;   // Input matrix.
+layout(binding = 2) buffer Output { int32_t x[]; } outputO;  // Output vector.
+
+// These are constants defined at compile time.
+layout(local_size_x = WG_X, local_size_y = WG_Y, local_size_z = 1) in;
+
+layout(constant_id = 0) const uint N = 1;
+layout(constant_id = 1) const uint K = 1;
+
+// We process 16-element i8 vectors along the K dimension, and treat i32vec4
+// (packed format) as the effective element type.
+const uint VECTORIZE_K = 16;
+const uint K_VEC = K / VECTORIZE_K;
+const uint K0_VEC = K0 / VECTORIZE_K;
+
+const uint strideB = K_VEC;  // Stride of the `inputB` matrix.
+
+// Each subgroup processes a total of N0 rows, therefore
+// each workgroup processes N0 * WG_Y rows.
+const uint WG_ROWS = N0 * WG_Y;
+
+// Offset between elements accessed by a thread in a workgroup.
+const uint WG_K_STRIDE = WG_X * K0_VEC;
+
+/// Returns the index of `X[i, j]`, where `X` is a 2D matrix of stride `stride`.
+uint coordToOffset(uint i, uint j, uint stride) { return stride * i + j; }
+
+int32_t sdot(int32_t lhsPacked, int32_t rhsPacked) {
+  i8vec4 lhs = unpack8(lhsPacked);
+  i8vec4 rhs = unpack8(rhsPacked);
+  i16vec4 mul = i16vec4(lhs) * i16vec4(rhs);
+  return int32_t(mul.x) + int32_t(mul.y) + int32_t(mul.z) + int32_t(mul.w);
+}
+
+int32_t sdot4(i32vec4 lhs, i32vec4 rhs) {
+  return sdot(lhs.x, rhs.x) + sdot(lhs.y, rhs.y) + sdot(lhs.z, rhs.z) +
+         sdot(lhs.w, rhs.w);
+}
+
+void main() {
+  const uvec2 wgID = gl_WorkGroupID.xy;
+  const uvec2 localID = gl_LocalInvocationID.xy;
+  const uint threadID = gl_SubgroupInvocationID;
+
+  // The start row of the tile processed by this workgroup.
+  const uint startRow = wgID.x * WG_ROWS;
+
+  for (uint y = 0; y < N0; ++y) {
+    uint r = startRow + y * WG_Y + localID.y;
+    int32_t laneResult = 0;
+    i32vec4 tileA[K0_VEC];
+    i32vec4 tileB[K0_VEC];
+
+    for (uint k = 0; k < K_VEC; k += WG_K_STRIDE) {
+      // Prefetch LHS and RHS to reduce latency.
+      [[unroll]] for (uint kk = 0; kk < K0_VEC; ++kk) {
+        uint gk = k + kk * WG_X + threadID;
+        tileB[kk] = inputB.x[coordToOffset(r, gk, strideB)];
+        tileA[kk] = inputA.x[gk];
+      }
+
+      [[unroll]] for (uint kk = 0; kk < K0_VEC; ++kk) {
+        i32vec4 lhs = tileA[kk];
+        i32vec4 rhs = tileB[kk];
+        laneResult += sdot4(lhs, rhs);
+      }
+    }
+
+    // Final reduction within one subgroup.
+    int32_t sgResult = subgroupAdd(laneResult);
+    if (subgroupElect()) {
+      outputO.x[r] = sgResult;
+    }
+  }
+
+  // Assert that the subgroup and workgroup sizes match.
+  // This simplifies the code but doesn't have to be true on all targets.
+  if (threadID != localID.x)
+    outputO.x[0] = -1;
+}
diff --git a/benchmarks/vmt/vmt_main.cc b/benchmarks/vmt/vmt_main.cc
new file mode 100644
index 0000000..a1d881e
--- /dev/null
+++ b/benchmarks/vmt/vmt_main.cc
@@ -0,0 +1,383 @@
+// Copyright 2023 Nod Inc.
+// Copyright 2020-2023 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <chrono>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "absl/strings/str_cat.h"
+#include "absl/strings/str_format.h"
+#include "absl/types/span.h"
+#include "benchmark/benchmark.h"
+#include "uvkc/base/log.h"
+#include "uvkc/benchmark/data_type_util.h"
+#include "uvkc/benchmark/main.h"
+#include "uvkc/benchmark/status_util.h"
+#include "uvkc/benchmark/vulkan_buffer_util.h"
+#include "uvkc/benchmark/vulkan_context.h"
+#include "uvkc/vulkan/device.h"
+#include "uvkc/vulkan/pipeline.h"
+
+using namespace uvkc::benchmark;
+using ::uvkc::benchmark::LatencyMeasureMode;
+using ::uvkc::vulkan::Pipeline;
+
+static const char kBenchmarkName[] = "vmt";
+
+struct ShaderCode {
+  const char *name;                 // Shader case name
+  absl::Span<const uint32_t> code;  // SPIR-V code
+  int N0;
+  int K0;
+  int wg_size_x;
+  int wg_size_y;
+  DataType input_type;   // LHS & RHS element type
+  DataType output_type;  // Output/result element type
+};
+
+#define SHADER_I8(N0, K0, X, Y)                                               \
+  ShaderCode {                                                                \
+    "Tile[" #N0 "x" #K0 "]", vmt_i8::N0_##N0##_K0_##K0##_WG_X_##X##_WG_Y_##Y, \
+        N0, K0, X, Y, DataType::i8, DataType::i32                            \
+  }
+
+#define WORKGROUP_TILE_N_I8(X, Y, N0)                                        \
+  SHADER_I8(N0, 16, X, Y), SHADER_I8(N0, 32, X, Y), SHADER_I8(N0, 64, X, Y), \
+      SHADER_I8(N0, 128, X, Y)
+
+#if defined(UVKC_RDNA3)
+
+namespace vmt_i8 {
+#include "vmt_i8_shader_rdna3_spirv_permutation.inc"
+}
+
+static ShaderCode kShaderCodeCases[] = {
+    WORKGROUP_TILE_N_I8(64, 1, 1), WORKGROUP_TILE_N_I8(64, 1, 2),
+    WORKGROUP_TILE_N_I8(64, 1, 4), WORKGROUP_TILE_N_I8(64, 2, 2),
+    WORKGROUP_TILE_N_I8(64, 2, 4), WORKGROUP_TILE_N_I8(64, 4, 4),
+};
+
+#else
+#error "unsupported GPU architecture/strategy"
+#endif
+
+/// Fills the 2D matrix with values produced by the |generator| function.
+template <typename GeneratorFn>
+static void FillBuffer(DataType data_type, void *raw_buffer, size_t num_bytes,
+                       unsigned dim_1, unsigned dim_2, GeneratorFn generator) {
+  auto fill = [&](auto traits) {
+    using Traits = decltype(traits);
+    using StorageType = typename Traits::storage_type;
+    using RuntimeType = typename Traits::runtime_type;
+    auto buffer = absl::MakeSpan(static_cast<StorageType *>(raw_buffer),
+                                 num_bytes / GetSize(data_type));
+
+    for (int i = 0; i < dim_1; ++i) {
+      for (int j = 0; j < dim_2; ++j) {
+        buffer[j + i * dim_2] =
+            static_cast<StorageType>(RuntimeType(generator(i, j)));
+      }
+    }
+  };
+
+  InvokeWithTraits(data_type, fill);
+}
+
+/// Checks that the output vector calculated by the shader contains the same
+/// values as a host-side vecmat over the inputs defined by |lhs| and |rhs|.
+template <DataType OutputType, DataType InputType, typename Generator1Fn,
+          typename Generator2Fn>
+static void CheckOutput(const ShaderCode &shader, void *raw_buffer,
+                        size_t num_bytes, unsigned N, unsigned K,
+                        Generator1Fn lhs, Generator2Fn rhs) {
+  using OutputTraits = DataTypeTraits<OutputType>;
+  using OutputStorageType = typename OutputTraits::storage_type;
+  using OutputRuntimeType = typename OutputTraits::runtime_type;
+  using InputTraits = DataTypeTraits<InputType>;
+  using InputRuntimeType = typename InputTraits::runtime_type;
+
+  auto output =
+      absl::MakeConstSpan(static_cast<const OutputStorageType *>(raw_buffer),
+                          num_bytes / GetSize(OutputType));
+  for (int j = 0; j < N; ++j) {
+    OutputRuntimeType acc(0.0f);
+    for (int k = 0; k < K; ++k) {
+      acc += OutputRuntimeType(InputRuntimeType(lhs(0, k))) *
+             OutputRuntimeType(InputRuntimeType(rhs(j, k)));
+    }
+
+    OutputRuntimeType gpuValue(output[j]);
+    BM_CHECK_EQ(gpuValue, acc)
+        << "destination buffer element (" << j << ")"
+        << " has incorrect value: expected to be " << acc << " but found "
+        << gpuValue << "\n\t^ In shader: " << shader.name << ", "
+        << GetName(shader.input_type) << "->" << GetName(shader.output_type);
+  }
+}
+
+static void Vmt(::benchmark::State &state, ::uvkc::vulkan::Device *device,
+                const ::uvkc::benchmark::LatencyMeasure *latency_measure,
+                const ShaderCode &shader, int N, int K) {
+  //===-------------------------------------------------------------------===/
+  // Create shader module, pipeline, and descriptor sets
+  //===-------------------------------------------------------------------===/
+
+  BM_CHECK_OK_AND_ASSIGN(
+      auto shader_module,
+      device->CreateShaderModule(shader.code.data(), shader.code.size()));
+
+  ::uvkc::vulkan::Pipeline::SpecConstant spec_constant[2] = {
+      {/*id=*/0, Pipeline::SpecConstant::Type::s32, N},
+      {/*id=*/1, Pipeline::SpecConstant::Type::s32, K},
+  };
+  BM_CHECK_OK_AND_ASSIGN(
+      auto pipeline,
+      device->CreatePipeline(*shader_module, "main", spec_constant));
+
+  BM_CHECK_OK_AND_ASSIGN(auto descriptor_pool,
+                         device->CreateDescriptorPool(*shader_module));
+  BM_CHECK_OK_AND_ASSIGN(auto layout_set_map,
+                         descriptor_pool->AllocateDescriptorSets(
+                             shader_module->descriptor_set_layouts()));
+
+  //===-------------------------------------------------------------------===/
+  // Create buffers
+  //===-------------------------------------------------------------------===/
+  DataType input_type = shader.input_type;
+  DataType output_type = shader.output_type;
+  const size_t src0_size = K * GetSize(input_type);
+  const size_t src1_size = K * N * GetSize(input_type);
+  const size_t dst_size = N * GetSize(output_type);
+
+  BM_CHECK_OK_AND_ASSIGN(
+      auto src0_buffer,
+      device->CreateBuffer(
+          VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
+          VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, src0_size));
+  BM_CHECK_OK_AND_ASSIGN(
+      auto src1_buffer,
+      device->CreateBuffer(
+          VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
+          VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, src1_size));
+  BM_CHECK_OK_AND_ASSIGN(
+      auto dst_buffer,
+      device->CreateBuffer(
+          VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
+          VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, dst_size));
+
+  //===-------------------------------------------------------------------===/
+  // Set source buffer data
+  //===-------------------------------------------------------------------===/
+  auto getLhs = [K](int i, int j) {
+    float v = ((float)((i * K + j) % 5) - 1.0f) / 2.0f;
+    return v;
+  };
+  auto getRhs = [K](int i, int j) {
+    float v = ((float)((i * K + j) % 7) - 1.0f) / 2.0f;
+    return v;
+  };
+
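+  // For reference: getLhs cycles through {-0.5, 0, 0.5, 1, 1.5} and getRhs
+  // through {-0.5, 0, 0.5, 1, 1.5, 2, 2.5}. Casting to int8 truncates these
+  // toward zero, and CheckOutput applies the same conversion, so the host
+  // reference matches the GPU result and the i32 accumulator stays far from
+  // overflow for the benchmarked sizes.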
+  BM_CHECK_OK(::uvkc::benchmark::SetDeviceBufferViaStagingBuffer(
+      device, src0_buffer.get(), src0_size, [&](void *ptr, size_t num_bytes) {
+        FillBuffer(input_type, ptr, num_bytes, 1, K, getLhs);
+      }));
+
+  // In vmt, the RHS input is transposed, which makes the matrix
+  // column-major.
+  BM_CHECK_OK(::uvkc::benchmark::SetDeviceBufferViaStagingBuffer(
+      device, src1_buffer.get(), src1_size, [&](void *ptr, size_t num_bytes) {
+        FillBuffer(input_type, ptr, num_bytes, N, K, getRhs);
+      }));
+
+  //===-------------------------------------------------------------------===/
+  // Clear the output buffer data set by the previous benchmark run
+  //===-------------------------------------------------------------------===/
+
+  BM_CHECK_OK(::uvkc::benchmark::SetDeviceBufferViaStagingBuffer(
+      device, dst_buffer.get(), dst_size, [&](void *ptr, size_t num_bytes) {
+        FillBuffer(output_type, ptr, num_bytes, 1, N,
+                   [](int, int) { return 0.0f; });
+      }));
+
+  //===-------------------------------------------------------------------===/
+  // Dispatch
+  //===-------------------------------------------------------------------===/
+  std::vector<::uvkc::vulkan::Device::BoundBuffer> bound_buffers = {
+      {src0_buffer.get(), /*set=*/0, /*binding=*/0},
+      {src1_buffer.get(), /*set=*/0, /*binding=*/1},
+      {dst_buffer.get(), /*set=*/0, /*binding=*/2},
+  };
+  BM_CHECK_OK(device->AttachBufferToDescriptor(
+      *shader_module, layout_set_map,
+      {bound_buffers.data(), bound_buffers.size()}));
+
+  BM_CHECK_EQ(shader_module->descriptor_set_layouts().size(), 1)
+      << "unexpected number of descriptor sets (" << shader.name << ")";
+  auto descriptor_set_layout = shader_module->descriptor_set_layouts().front();
+
+  std::vector<::uvkc::vulkan::CommandBuffer::BoundDescriptorSet>
+      bound_descriptor_sets(1);
+  bound_descriptor_sets[0].index = 0;
+  bound_descriptor_sets[0].set = layout_set_map.at(descriptor_set_layout);
+  BM_CHECK_OK_AND_ASSIGN(auto dispatch_cmdbuf, device->AllocateCommandBuffer());
+
+  BM_CHECK_OK(dispatch_cmdbuf->Begin());
+  dispatch_cmdbuf->BindPipelineAndDescriptorSets(
+      *pipeline, {bound_descriptor_sets.data(), bound_descriptor_sets.size()});
+  // Each subgroup processes N0 rows and each workgroup has WG_Y subgroups,
+  // so every workgroup covers N0 * WG_Y rows.
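+  // For example, N = 4096 with N0 = 2 and WG_Y = 2 launches 1024 workgroups.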
+  dispatch_cmdbuf->Dispatch(N / (shader.N0 * shader.wg_size_y), 1, 1);
+  BM_CHECK_OK(dispatch_cmdbuf->End());
+  BM_CHECK_OK(device->QueueSubmitAndWait(*dispatch_cmdbuf));
+
+  //===-------------------------------------------------------------------===/
+  // Verify destination buffer data
+  //===-------------------------------------------------------------------===/
+
+  if (output_type == DataType::i32) {
+    BM_CHECK_OK(::uvkc::benchmark::GetDeviceBufferViaStagingBuffer(
+        device, dst_buffer.get(), dst_size, [&](void *ptr, size_t num_bytes) {
+          if (input_type == DataType::i8) {
+            CheckOutput<DataType::i32, DataType::i8>(shader, ptr, num_bytes, N,
+                                                     K, getLhs, getRhs);
+          } else {
+            BM_CHECK(false) << "Unhandled input type";
+          }
+        }));
+  } else {
+    BM_CHECK(false) << "Unhandled output type";
+  }
+
+  //===-------------------------------------------------------------------===/
+  // Benchmarking
+  //===-------------------------------------------------------------------===/
+
+  std::unique_ptr<::uvkc::vulkan::TimestampQueryPool> query_pool;
+  bool use_timestamp =
+      latency_measure->mode == LatencyMeasureMode::kGpuTimestamp;
+  if (use_timestamp) {
+    BM_CHECK_OK_AND_ASSIGN(query_pool, device->CreateTimestampQueryPool(2));
+  }
+
+  BM_CHECK_OK_AND_ASSIGN(auto cmdbuf, device->AllocateCommandBuffer());
+  for (auto _ : state) {
+    BM_CHECK_OK(cmdbuf->Begin());
+    if (use_timestamp) cmdbuf->ResetQueryPool(*query_pool);
+
+    cmdbuf->BindPipelineAndDescriptorSets(
+        *pipeline,
+        {bound_descriptor_sets.data(), bound_descriptor_sets.size()});
+
+    if (use_timestamp) {
+      cmdbuf->WriteTimestamp(*query_pool, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, 0);
+    }
+
+    cmdbuf->Dispatch(N / (shader.N0 * shader.wg_size_y), 1, 1);
+
+    if (use_timestamp) {
+      cmdbuf->WriteTimestamp(*query_pool, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT,
+                             1);
+    }
+
+    BM_CHECK_OK(cmdbuf->End());
+
+    auto start_time = std::chrono::high_resolution_clock::now();
+    BM_CHECK_OK(device->QueueSubmitAndWait(*cmdbuf));
+    auto end_time = std::chrono::high_resolution_clock::now();
+    auto elapsed_seconds =
+        std::chrono::duration_cast<std::chrono::duration<double>>(end_time -
+                                                                  start_time);
+
+    switch (latency_measure->mode) {
+      case LatencyMeasureMode::kSystemDispatch: {
+        state.SetIterationTime(elapsed_seconds.count() -
+                               latency_measure->overhead_seconds);
+      } break;
+      case LatencyMeasureMode::kSystemSubmit: {
+        state.SetIterationTime(elapsed_seconds.count());
+      } break;
+      case LatencyMeasureMode::kGpuTimestamp: {
+        BM_CHECK_OK_AND_ASSIGN(
+            double timestamp_seconds,
+            query_pool->CalculateElapsedSecondsBetween(0, 1));
+        state.SetIterationTime(timestamp_seconds);
+      } break;
+    }
+
+    BM_CHECK_OK(cmdbuf->Reset());
+  }
+
+  double numOperation =
+      double(N) * double(K) + double(K) + double(N) * sizeof(int32_t);
+  state.counters["Bytes"] =
+      ::benchmark::Counter(numOperation,
+                           ::benchmark::Counter::kIsIterationInvariant |
+                               ::benchmark::Counter::kIsRate,
+                           ::benchmark::Counter::kIs1000);
+
+  // Reset the command pool to release all command buffers in the benchmarking
+  // loop to avoid exhausting GPU resources.
+  BM_CHECK_OK(device->ResetCommandPool());
+}
+
+// Returns true iff |a| is a multiple of |b|.
+static bool isMultipleOf(int a, int b) { return a >= b && a % b == 0; }
+
+namespace uvkc::benchmark {
+
+absl::StatusOr<std::unique_ptr<VulkanContext>> CreateVulkanContext() {
+  return CreateDefaultVulkanContext(kBenchmarkName);
+}
+
+bool RegisterVulkanOverheadBenchmark(
+    const vulkan::Driver::PhysicalDeviceInfo &physical_device,
+    vulkan::Device *device, double *overhead_seconds) {
+  return false;
+}
+
+void RegisterVulkanBenchmarks(
+    const vulkan::Driver::PhysicalDeviceInfo &physical_device,
+    vulkan::Device *device, const LatencyMeasure *latency_measure) {
+  const char *gpu_name = physical_device.v10_properties.deviceName;
+
+  for (const int sz : {4096, 8192, 16384}) {
+    const int N = sz;
+    const int K = sz;
+    for (const ShaderCode &shader : kShaderCodeCases) {
+      std::string vecmat_size = absl::StrCat(N, "x", K);
+      std::string tiling_scheme = absl::StrCat(shader.N0, "x", shader.K0);
+      BM_CHECK(isMultipleOf(N, shader.N0))
+          << "Incompatible tiling scheme: " << tiling_scheme;
+      BM_CHECK(isMultipleOf(K, shader.K0))
+          << "Incompatible tiling scheme: " << tiling_scheme;
+      BM_CHECK(isMultipleOf(shader.K0, 4))
+          << "Incompatible tiling scheme: " << tiling_scheme;
+
+      std::string workgroup_size =
+          absl::StrCat(shader.wg_size_x, "x", shader.wg_size_y, "x1");
+      std::string type_info = absl::StrCat(GetName(shader.input_type), "->",
+                                           GetName(shader.output_type));
+      std::string test_name =
+          absl::StrCat(gpu_name, "/vmt[", vecmat_size, "]/", type_info, "/",
+                       shader.name, "/Workgroup[", workgroup_size, "]");
+      ::benchmark::RegisterBenchmark(test_name.c_str(), Vmt, device,
+                                     latency_measure, shader, N, K)
+          ->UseManualTime()
+          ->Unit(::benchmark::kMicrosecond);
+    }
+  }
+}
+
+}  // namespace uvkc::benchmark