shader_recompiler: Improvements to buffer addressing implementation. s…
diegolix29 committed Jan 11, 2025
1 parent 6806e20 commit f0cd244
Showing 5 changed files with 178 additions and 101 deletions.
96 changes: 55 additions & 41 deletions src/shader_recompiler/frontend/translate/vector_memory.cpp
@@ -164,8 +164,8 @@ void Translator::EmitVectorMemory(const GcnInst& inst) {
 }
 
 void Translator::BUFFER_LOAD(u32 num_dwords, bool is_typed, const GcnInst& inst) {
-    const auto& mtbuf = inst.control.mtbuf;
-    const bool is_ring = mtbuf.glc && mtbuf.slc;
+    const auto& mubuf = inst.control.mubuf;
+    const bool is_ring = mubuf.glc && mubuf.slc;
     const IR::VectorReg vaddr{inst.src[0].code};
     const IR::ScalarReg sharp{inst.src[2].code * 4};
     const IR::Value soffset{GetSrc(inst.src[3])};
@@ -178,22 +178,23 @@ void Translator::BUFFER_LOAD(u32 num_dwords, bool is_typed, const GcnInst& inst)
         if (is_ring) {
             return ir.CompositeConstruct(ir.GetVectorReg(vaddr), soffset);
         }
-        if (mtbuf.idxen && mtbuf.offen) {
+        if (mubuf.idxen && mubuf.offen) {
             return ir.CompositeConstruct(ir.GetVectorReg(vaddr), ir.GetVectorReg(vaddr + 1));
         }
-        if (mtbuf.idxen || mtbuf.offen) {
+        if (mubuf.idxen || mubuf.offen) {
             return ir.GetVectorReg(vaddr);
         }
         return {};
     }();
 
     IR::BufferInstInfo buffer_info{};
-    buffer_info.index_enable.Assign(mtbuf.idxen);
-    buffer_info.offset_enable.Assign(mtbuf.offen);
-    buffer_info.inst_offset.Assign(mtbuf.offset);
-    buffer_info.globally_coherent.Assign(mtbuf.glc);
-    buffer_info.system_coherent.Assign(mtbuf.slc);
+    buffer_info.index_enable.Assign(mubuf.idxen);
+    buffer_info.offset_enable.Assign(mubuf.offen);
+    buffer_info.inst_offset.Assign(mubuf.offset);
+    buffer_info.globally_coherent.Assign(mubuf.glc);
+    buffer_info.system_coherent.Assign(mubuf.slc);
     if (is_typed) {
+        const auto& mtbuf = inst.control.mtbuf;
         const auto dmft = static_cast<AmdGpu::DataFormat>(mtbuf.dfmt);
         const auto nfmt = static_cast<AmdGpu::NumberFormat>(mtbuf.nfmt);
         ASSERT(nfmt == AmdGpu::NumberFormat::Float &&
@@ -220,32 +221,38 @@ void Translator::BUFFER_LOAD_FORMAT(u32 num_dwords, const GcnInst& inst) {
     const auto& mubuf = inst.control.mubuf;
     const IR::VectorReg vaddr{inst.src[0].code};
     const IR::ScalarReg sharp{inst.src[2].code * 4};
-    ASSERT_MSG(!mubuf.offen && mubuf.offset == 0, "Offsets for image buffers are not supported");
     const IR::Value address = [&] -> IR::Value {
-        if (mubuf.idxen) {
+        if (mubuf.idxen && mubuf.offen) {
+            return ir.CompositeConstruct(ir.GetVectorReg(vaddr), ir.GetVectorReg(vaddr + 1));
+        }
+        if (mubuf.idxen || mubuf.offen) {
             return ir.GetVectorReg(vaddr);
         }
         return {};
     }();
     const IR::Value soffset{GetSrc(inst.src[3])};
     ASSERT_MSG(soffset.IsImmediate() && soffset.U32() == 0, "Non immediate offset not supported");
 
-    IR::BufferInstInfo info{};
-    info.index_enable.Assign(mubuf.idxen);
+    IR::BufferInstInfo buffer_info{};
+    buffer_info.index_enable.Assign(mubuf.idxen);
+    buffer_info.offset_enable.Assign(mubuf.offen);
+    buffer_info.inst_offset.Assign(mubuf.offset);
+    buffer_info.globally_coherent.Assign(mubuf.glc);
+    buffer_info.system_coherent.Assign(mubuf.slc);
 
     const IR::Value handle =
         ir.CompositeConstruct(ir.GetScalarReg(sharp), ir.GetScalarReg(sharp + 1),
                               ir.GetScalarReg(sharp + 2), ir.GetScalarReg(sharp + 3));
-    const IR::Value value = ir.LoadBufferFormat(handle, address, info);
+    const IR::Value value = ir.LoadBufferFormat(handle, address, buffer_info);
     const IR::VectorReg dst_reg{inst.src[1].code};
     for (u32 i = 0; i < num_dwords; i++) {
         ir.SetVectorReg(dst_reg + i, IR::F32{ir.CompositeExtract(value, i)});
     }
 }
 
 void Translator::BUFFER_STORE(u32 num_dwords, bool is_typed, const GcnInst& inst) {
-    const auto& mtbuf = inst.control.mtbuf;
-    const bool is_ring = mtbuf.glc && mtbuf.slc;
+    const auto& mubuf = inst.control.mubuf;
+    const bool is_ring = mubuf.glc && mubuf.slc;
     const IR::VectorReg vaddr{inst.src[0].code};
     const IR::ScalarReg sharp{inst.src[2].code * 4};
     const IR::Value soffset{GetSrc(inst.src[3])};
@@ -259,22 +266,23 @@ void Translator::BUFFER_STORE(u32 num_dwords, bool is_typed, const GcnInst& inst
         if (is_ring) {
             return ir.CompositeConstruct(ir.GetVectorReg(vaddr), soffset);
         }
-        if (mtbuf.idxen && mtbuf.offen) {
+        if (mubuf.idxen && mubuf.offen) {
             return ir.CompositeConstruct(ir.GetVectorReg(vaddr), ir.GetVectorReg(vaddr + 1));
         }
-        if (mtbuf.idxen || mtbuf.offen) {
+        if (mubuf.idxen || mubuf.offen) {
             return ir.GetVectorReg(vaddr);
         }
         return {};
     }();
 
     IR::BufferInstInfo buffer_info{};
-    buffer_info.index_enable.Assign(mtbuf.idxen);
-    buffer_info.offset_enable.Assign(mtbuf.offen);
-    buffer_info.inst_offset.Assign(mtbuf.offset);
-    buffer_info.globally_coherent.Assign(mtbuf.glc);
-    buffer_info.system_coherent.Assign(mtbuf.slc);
+    buffer_info.index_enable.Assign(mubuf.idxen);
+    buffer_info.offset_enable.Assign(mubuf.offen);
+    buffer_info.inst_offset.Assign(mubuf.offset);
+    buffer_info.globally_coherent.Assign(mubuf.glc);
+    buffer_info.system_coherent.Assign(mubuf.slc);
     if (is_typed) {
+        const auto& mtbuf = inst.control.mtbuf;
         const auto dmft = static_cast<AmdGpu::DataFormat>(mtbuf.dfmt);
         const auto nfmt = static_cast<AmdGpu::NumberFormat>(mtbuf.nfmt);
         ASSERT(nfmt == AmdGpu::NumberFormat::Float &&
@@ -321,8 +329,12 @@ void Translator::BUFFER_STORE_FORMAT(u32 num_dwords, const GcnInst& inst) {
     const IR::Value soffset{GetSrc(inst.src[3])};
     ASSERT_MSG(soffset.IsImmediate() && soffset.U32() == 0, "Non immediate offset not supported");
 
-    IR::BufferInstInfo info{};
-    info.index_enable.Assign(mubuf.idxen);
+    IR::BufferInstInfo buffer_info{};
+    buffer_info.index_enable.Assign(mubuf.idxen);
+    buffer_info.offset_enable.Assign(mubuf.offen);
+    buffer_info.inst_offset.Assign(mubuf.offset);
+    buffer_info.globally_coherent.Assign(mubuf.glc);
+    buffer_info.system_coherent.Assign(mubuf.slc);
 
     const IR::VectorReg src_reg{inst.src[1].code};

@@ -338,7 +350,7 @@ void Translator::BUFFER_STORE_FORMAT(u32 num_dwords, const GcnInst& inst) {
     const IR::Value handle =
         ir.CompositeConstruct(ir.GetScalarReg(sharp), ir.GetScalarReg(sharp + 1),
                               ir.GetScalarReg(sharp + 2), ir.GetScalarReg(sharp + 3));
-    ir.StoreBufferFormat(handle, address, value, info);
+    ir.StoreBufferFormat(handle, address, value, buffer_info);
 }
 
 void Translator::BUFFER_ATOMIC(AtomicOp op, const GcnInst& inst) {
@@ -358,10 +370,12 @@ void Translator::BUFFER_ATOMIC(AtomicOp op, const GcnInst& inst) {
     const IR::U32 soffset{GetSrc(inst.src[3])};
     ASSERT_MSG(soffset.IsImmediate() && soffset.U32() == 0, "Non immediate offset not supported");
 
-    IR::BufferInstInfo info{};
-    info.index_enable.Assign(mubuf.idxen);
-    info.inst_offset.Assign(mubuf.offset);
-    info.offset_enable.Assign(mubuf.offen);
+    IR::BufferInstInfo buffer_info{};
+    buffer_info.index_enable.Assign(mubuf.idxen);
+    buffer_info.offset_enable.Assign(mubuf.offen);
+    buffer_info.inst_offset.Assign(mubuf.offset);
+    buffer_info.globally_coherent.Assign(mubuf.glc);
+    buffer_info.system_coherent.Assign(mubuf.slc);
 
     IR::Value vdata_val = ir.GetVectorReg<Shader::IR::U32>(vdata);
     const IR::Value handle =
@@ -371,27 +385,27 @@ void Translator::BUFFER_ATOMIC(AtomicOp op, const GcnInst& inst) {
     const IR::Value original_val = [&] {
         switch (op) {
         case AtomicOp::Swap:
-            return ir.BufferAtomicSwap(handle, address, vdata_val, info);
+            return ir.BufferAtomicSwap(handle, address, vdata_val, buffer_info);
         case AtomicOp::Add:
-            return ir.BufferAtomicIAdd(handle, address, vdata_val, info);
+            return ir.BufferAtomicIAdd(handle, address, vdata_val, buffer_info);
         case AtomicOp::Smin:
-            return ir.BufferAtomicIMin(handle, address, vdata_val, true, info);
+            return ir.BufferAtomicIMin(handle, address, vdata_val, true, buffer_info);
         case AtomicOp::Umin:
-            return ir.BufferAtomicIMin(handle, address, vdata_val, false, info);
+            return ir.BufferAtomicIMin(handle, address, vdata_val, false, buffer_info);
         case AtomicOp::Smax:
-            return ir.BufferAtomicIMax(handle, address, vdata_val, true, info);
+            return ir.BufferAtomicIMax(handle, address, vdata_val, true, buffer_info);
         case AtomicOp::Umax:
-            return ir.BufferAtomicIMax(handle, address, vdata_val, false, info);
+            return ir.BufferAtomicIMax(handle, address, vdata_val, false, buffer_info);
         case AtomicOp::And:
-            return ir.BufferAtomicAnd(handle, address, vdata_val, info);
+            return ir.BufferAtomicAnd(handle, address, vdata_val, buffer_info);
         case AtomicOp::Or:
-            return ir.BufferAtomicOr(handle, address, vdata_val, info);
+            return ir.BufferAtomicOr(handle, address, vdata_val, buffer_info);
         case AtomicOp::Xor:
-            return ir.BufferAtomicXor(handle, address, vdata_val, info);
+            return ir.BufferAtomicXor(handle, address, vdata_val, buffer_info);
         case AtomicOp::Inc:
-            return ir.BufferAtomicInc(handle, address, vdata_val, info);
+            return ir.BufferAtomicInc(handle, address, vdata_val, buffer_info);
         case AtomicOp::Dec:
-            return ir.BufferAtomicDec(handle, address, vdata_val, info);
+            return ir.BufferAtomicDec(handle, address, vdata_val, buffer_info);
         default:
             UNREACHABLE();
         }
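Note: the address lambda repeated in the translator functions above encodes the GCN buffer addressing modes: with both idxen and offen set, two consecutive VGPRs carry index and offset; with only one set, a single VGPR carries it; with neither, the address reduces to the immediate inst_offset. A minimal C++ sketch of that selection, using simplified stand-in types rather than the emulator's IR classes:

#include <optional>

// Simplified stand-ins for illustration; not the emulator's actual types.
struct Control {
    bool idxen;
    bool offen;
};

struct AddressRegs {
    std::optional<int> index_vgpr;
    std::optional<int> offset_vgpr;
};

// Mirrors the lambda above: idxen+offen -> {vaddr, vaddr+1},
// exactly one flag -> {vaddr}, neither -> immediate-only addressing.
AddressRegs SelectAddressRegs(const Control& c, int vaddr) {
    if (c.idxen && c.offen) {
        return {vaddr, vaddr + 1};
    }
    if (c.idxen) {
        return {vaddr, std::nullopt};
    }
    if (c.offen) {
        return {std::nullopt, vaddr};
    }
    return {std::nullopt, std::nullopt};
}

Which role a lone VGPR plays (index versus offset) is decided later by the index_enable/offset_enable flags recorded in buffer_info.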
14 changes: 10 additions & 4 deletions src/shader_recompiler/ir/passes/constant_propagation_pass.cpp
@@ -222,9 +222,15 @@ void FoldMul(IR::Block& block, IR::Inst& inst) {
         return;
     }
     const IR::Value rhs{inst.Arg(1)};
-    if (rhs.IsImmediate() && Arg<T>(rhs) == 0) {
-        inst.ReplaceUsesWithAndRemove(IR::Value(0u));
-        return;
+    if (rhs.IsImmediate()) {
+        if (Arg<T>(rhs) == 0) {
+            inst.ReplaceUsesWithAndRemove(IR::Value(0u));
+            return;
+        }
+        if (Arg<T>(rhs) == 1) {
+            inst.ReplaceUsesWithAndRemove(inst.Arg(0));
+            return;
+        }
     }
 }

@@ -491,4 +497,4 @@ void ConstantPropagationPass(IR::BlockList& program) {
     }
 }
 
-} // namespace Shader::Optimization
\ No newline at end of file
+} // namespace Shader::Optimization
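Note: the FoldMul change above extends the existing multiply-by-zero fold with the multiplicative identity: an immediate right operand of 0 replaces the instruction with the constant 0, and an immediate 1 forwards the left operand unchanged. A hedged sketch of the same rule on a toy value type (illustrative only, not the project's IR):

#include <cstdint>
#include <optional>

// Toy stand-in for an IR value: either an immediate or an SSA reference.
struct Value {
    bool is_immediate;
    uint32_t imm;
    int ssa_id;
};

// Returns the folded replacement for lhs * rhs, or nullopt to keep the multiply.
std::optional<Value> TryFoldMul(const Value& lhs, const Value& rhs) {
    if (!rhs.is_immediate) {
        return std::nullopt;
    }
    if (rhs.imm == 0) {
        return Value{true, 0, -1}; // x * 0 == 0
    }
    if (rhs.imm == 1) {
        return lhs; // x * 1 == x
    }
    return std::nullopt;
}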
105 changes: 64 additions & 41 deletions src/shader_recompiler/ir/passes/resource_tracking_pass.cpp
@@ -483,64 +483,89 @@ void PatchDataRingAccess(IR::Block& block, IR::Inst& inst, Info& info, Descripto
     inst.SetArg(1, ir.Imm32(binding));
 }
 
+IR::U32 CalculateBufferAddress(IR::IREmitter& ir, const IR::Inst& inst, const Info& info,
+                               const AmdGpu::Buffer& buffer, u32 stride) {
+    const auto inst_info = inst.Flags<IR::BufferInstInfo>();
+
+    // index = (inst_idxen ? vgpr_index : 0) + (const_add_tid_enable ? thread_id[5:0] : 0)
+    IR::U32 index = ir.Imm32(0U);
+    if (inst_info.index_enable) {
+        const IR::U32 vgpr_index{inst_info.offset_enable
+                                     ? IR::U32{ir.CompositeExtract(inst.Arg(1), 0)}
+                                     : IR::U32{inst.Arg(1)}};
+        index = ir.IAdd(index, vgpr_index);
+    }
+    if (buffer.add_tid_enable) {
+        /*ASSERT_MSG(info.l_stage == LogicalStage::Compute,
+                   "Thread ID buffer addressing is not supported outside of compute.");
+        const IR::U32 thread_id{ir.LaneId()};
+        index = ir.IAdd(index, thread_id);*/
+    }
+    // offset = (inst_offen ? vgpr_offset : 0) + inst_offset
+    IR::U32 offset = ir.Imm32(inst_info.inst_offset.Value());
+    if (inst_info.offset_enable) {
+        const IR::U32 vgpr_offset = inst_info.index_enable
+                                        ? IR::U32{ir.CompositeExtract(inst.Arg(1), 1)}
+                                        : IR::U32{inst.Arg(1)};
+        offset = ir.IAdd(offset, vgpr_offset);
+    }
+    const IR::U32 const_stride = ir.Imm32(stride);
+    IR::U32 buffer_offset;
+    if (buffer.swizzle_enable) {
+        const IR::U32 const_index_stride = ir.Imm32(buffer.index_stride);
+        const IR::U32 const_element_size = ir.Imm32(buffer.element_size);
+        // index_msb = index / const_index_stride
+        const IR::U32 index_msb{ir.IDiv(index, const_index_stride)};
+        // index_lsb = index % const_index_stride
+        const IR::U32 index_lsb{ir.IMod(index, const_index_stride)};
+        // offset_msb = offset / const_element_size
+        const IR::U32 offset_msb{ir.IDiv(offset, const_element_size)};
+        // offset_lsb = offset % const_element_size
+        const IR::U32 offset_lsb{ir.IMod(offset, const_element_size)};
+        // buffer_offset =
+        //     (index_msb * const_stride + offset_msb * const_element_size) * const_index_stride
+        //     + index_lsb * const_element_size + offset_lsb
+        const IR::U32 buffer_offset_msb = ir.IMul(
+            ir.IAdd(ir.IMul(index_msb, const_stride), ir.IMul(offset_msb, const_element_size)),
+            const_index_stride);
+        const IR::U32 buffer_offset_lsb =
+            ir.IAdd(ir.IMul(index_lsb, const_element_size), offset_lsb);
+        buffer_offset = ir.IAdd(buffer_offset_msb, buffer_offset_lsb);
+    } else {
+        // buffer_offset = index * const_stride + offset
+        buffer_offset = ir.IAdd(ir.IMul(index, const_stride), offset);
+    }
+    return buffer_offset;
+}

 void PatchBufferArgs(IR::Block& block, IR::Inst& inst, Info& info) {
     const auto handle = inst.Arg(0);
     const auto buffer_res = info.buffers[handle.U32()];
     const auto buffer = buffer_res.GetSharp(info);
 
-    ASSERT(!buffer.add_tid_enable);
-
     // Address of constant buffer reads can be calculated at IR emission time.
     if (inst.GetOpcode() == IR::Opcode::ReadConstBuffer) {
         return;
     }
 
     IR::IREmitter ir{block, IR::Block::InstructionList::s_iterator_to(inst)};
-    const auto inst_info = inst.Flags<IR::BufferInstInfo>();
-
-    const IR::U32 index_stride = ir.Imm32(buffer.index_stride);
-    const IR::U32 element_size = ir.Imm32(buffer.element_size);
-
-    // Compute address of the buffer using the stride.
-    IR::U32 address = ir.Imm32(inst_info.inst_offset.Value());
-    if (inst_info.index_enable) {
-        const IR::U32 index = inst_info.offset_enable ? IR::U32{ir.CompositeExtract(inst.Arg(1), 0)}
-                                                      : IR::U32{inst.Arg(1)};
-        if (buffer.swizzle_enable) {
-            const IR::U32 stride_index_stride =
-                ir.Imm32(static_cast<u32>(buffer.stride * buffer.index_stride));
-            const IR::U32 index_msb = ir.IDiv(index, index_stride);
-            const IR::U32 index_lsb = ir.IMod(index, index_stride);
-            address = ir.IAdd(address, ir.IAdd(ir.IMul(index_msb, stride_index_stride),
-                                               ir.IMul(index_lsb, element_size)));
-        } else {
-            address = ir.IAdd(address, ir.IMul(index, ir.Imm32(buffer.GetStride())));
-        }
-    }
-    if (inst_info.offset_enable) {
-        const IR::U32 offset = inst_info.index_enable ? IR::U32{ir.CompositeExtract(inst.Arg(1), 1)}
-                                                      : IR::U32{inst.Arg(1)};
-        if (buffer.swizzle_enable) {
-            const IR::U32 element_size_index_stride =
-                ir.Imm32(buffer.element_size * buffer.index_stride);
-            const IR::U32 offset_msb = ir.IDiv(offset, element_size);
-            const IR::U32 offset_lsb = ir.IMod(offset, element_size);
-            address = ir.IAdd(address,
-                              ir.IAdd(ir.IMul(offset_msb, element_size_index_stride), offset_lsb));
-        } else {
-            address = ir.IAdd(address, offset);
-        }
-    }
-    inst.SetArg(1, address);
+    inst.SetArg(1, CalculateBufferAddress(ir, inst, info, buffer, buffer.stride));
 }
 
 void PatchTextureBufferArgs(IR::Block& block, IR::Inst& inst, Info& info) {
     const auto handle = inst.Arg(0);
     const auto buffer_res = info.texture_buffers[handle.U32()];
     const auto buffer = buffer_res.GetSharp(info);
 
-    ASSERT(!buffer.swizzle_enable && !buffer.add_tid_enable);
+    // Only linear addressing with index is supported currently, since we cannot yet
+    // address with sub-texel granularity.
+    const auto inst_info = inst.Flags<IR::BufferInstInfo>();
+    ASSERT_MSG(!buffer.swizzle_enable && !inst_info.offset_enable && inst_info.inst_offset == 0,
+               "Unsupported texture buffer address mode.");
 
     IR::IREmitter ir{block, IR::Block::InstructionList::s_iterator_to(inst)};
+    // Stride of 1 to get an index into formatted data. See above addressing limitations.
+    inst.SetArg(1, CalculateBufferAddress(ir, inst, info, buffer, 1U));
 
     if (inst.GetOpcode() == IR::Opcode::LoadBufferFormatF32) {
         const auto inst_info = inst.Flags<IR::BufferInstInfo>();
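Note: the swizzle branch of CalculateBufferAddress above splits both index and offset into most/least-significant parts by index_stride and element_size, then recombines them so elements interleave within index_stride-sized tiles; texture buffers reuse the same helper with a stride of 1 to produce a texel index. A small self-checking restatement of the formula with assumed sharp values (stride 16, index_stride 8, element_size 4; chosen for illustration, not taken from the commit):

#include <cassert>
#include <cstdint>

// Scalar re-statement of the swizzled buffer_offset formula above.
uint32_t SwizzledOffset(uint32_t index, uint32_t offset, uint32_t stride,
                        uint32_t index_stride, uint32_t element_size) {
    const uint32_t index_msb = index / index_stride;
    const uint32_t index_lsb = index % index_stride;
    const uint32_t offset_msb = offset / element_size;
    const uint32_t offset_lsb = offset % element_size;
    return (index_msb * stride + offset_msb * element_size) * index_stride +
           index_lsb * element_size + offset_lsb;
}

int main() {
    // index 10 splits into msb 1, lsb 2; offset 6 into msb 1, lsb 2.
    // (1*16 + 1*4) * 8 + 2*4 + 2 = 170.
    assert(SwizzledOffset(10, 6, 16, 8, 4) == 170);
    // With swizzling disabled the linear formula applies: 10*16 + 6 = 166.
    assert(10u * 16u + 6u == 166u);
    return 0;
}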
@@ -779,7 +804,6 @@ void PatchImageArgs(IR::Block& block, IR::Inst& inst, Info& info) {
     if (inst.GetOpcode() == IR::Opcode::ImageRead) {
         auto texel = ir.ImageRead(handle, coords, lod, ms, inst_info);
         if (is_storage) {
-            // Storage image requires shader swizzle.
         }
         const auto converted =
             ApplyReadNumberConversionVec4(ir, texel, image.GetNumberConversion());
@@ -792,7 +816,6 @@
 
     auto texel = inst.Arg(4);
     if (is_storage) {
-        // Storage image requires shader swizzle.
     }
     const auto converted =
         ApplyWriteNumberConversionVec4(ir, texel, image.GetNumberConversion());
(Diffs for the remaining two changed files were not loaded.)
