Skip to content

Commit

Permalink
GPU processor refactoring (#1787)
Browse files Browse the repository at this point in the history
* coroutine code prettification

* asc queues submission refactoring

* better asc ring context handling

* final touches and review notes

* even more simplification for context saving
  • Loading branch information
psucien authored Dec 14, 2024
1 parent af26c94 commit 0fd1ab6
Show file tree
Hide file tree
Showing 12 changed files with 234 additions and 146 deletions.
3 changes: 3 additions & 0 deletions src/common/debug.h
Original file line number Diff line number Diff line change
Expand Up @@ -57,3 +57,6 @@ enum MarkersPalette : int {
tracy::SourceLocationData{nullptr, name, TracyFile, (uint32_t)TracyLine, 0};

#define FRAME_END FrameMark

// Fiber instrumentation hooks: thin aliases that forward to the Tracy
// fiber enter/leave macros (TracyFiberEnter / TracyFiberLeave).
#define FIBER_ENTER(name) TracyFiberEnter(name)
#define FIBER_EXIT TracyFiberLeave
68 changes: 44 additions & 24 deletions src/core/debug_state.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -142,41 +142,61 @@ void DebugStateImpl::PushQueueDump(QueueDump dump) {
frame.queues.push_back(std::move(dump));
}

void DebugStateImpl::PushRegsDump(uintptr_t base_addr, uintptr_t header_addr,
const AmdGpu::Liverpool::Regs& regs, bool is_compute) {
std::scoped_lock lock{frame_dump_list_mutex};
std::optional<RegDump*> DebugStateImpl::GetRegDump(uintptr_t base_addr, uintptr_t header_addr) {
const auto it = waiting_reg_dumps.find(header_addr);
if (it == waiting_reg_dumps.end()) {
return;
return std::nullopt;
}
auto& frame = *it->second;
waiting_reg_dumps.erase(it);
waiting_reg_dumps_dbg.erase(waiting_reg_dumps_dbg.find(header_addr));
auto& dump = frame.regs[header_addr - base_addr];
dump.regs = regs;
if (is_compute) {
dump.is_compute = true;
const auto& cs = dump.regs.cs_program;
dump.cs_data = PipelineComputerProgramDump{
.cs_program = cs,
.code = std::vector<u32>{cs.Code().begin(), cs.Code().end()},
};
} else {
for (int i = 0; i < RegDump::MaxShaderStages; i++) {
if (regs.stage_enable.IsStageEnabled(i)) {
auto stage = regs.ProgramForStage(i);
if (stage->address_lo != 0) {
auto code = stage->Code();
dump.stages[i] = PipelineShaderProgramDump{
.user_data = *stage,
.code = std::vector<u32>{code.begin(), code.end()},
};
}
return &frame.regs[header_addr - base_addr];
}

/// Records a graphics register snapshot into the register dump obtained from
/// GetRegDump(base_addr, header_addr), together with a copy of the shader
/// code of every enabled stage that has a non-zero program address.
///
/// The whole operation runs under `frame_dump_list_mutex`. If GetRegDump
/// yields no dump, the call is a no-op.
///
/// @param base_addr   Forwarded to GetRegDump.
/// @param header_addr Forwarded to GetRegDump.
/// @param regs        Register state to copy into the dump.
void DebugStateImpl::PushRegsDump(uintptr_t base_addr, uintptr_t header_addr,
                                  const AmdGpu::Liverpool::Regs& regs) {
    std::scoped_lock lock{frame_dump_list_mutex};

    const auto pending = GetRegDump(base_addr, header_addr);
    if (!pending.has_value()) {
        return;
    }

    RegDump& target = **pending;
    target.regs = regs;

    // Stage data is read back from the copy stored in the dump, not from `regs`.
    for (int stage_idx = 0; stage_idx < RegDump::MaxShaderStages; stage_idx++) {
        if (!target.regs.stage_enable.IsStageEnabled(stage_idx)) {
            continue;
        }

        auto program = target.regs.ProgramForStage(stage_idx);
        if (program->address_lo == 0) {
            // Enabled stage without a program address: nothing to capture.
            continue;
        }

        auto binary = program->Code();
        target.stages[stage_idx] = PipelineShaderProgramDump{
            .user_data = *program,
            .code = std::vector<u32>{binary.begin(), binary.end()},
        };
    }
}

void DebugStateImpl::PushRegsDumpCompute(uintptr_t base_addr, uintptr_t header_addr,
const CsState& cs_state) {
std::scoped_lock lock{frame_dump_list_mutex};

auto dump = GetRegDump(base_addr, header_addr);
if (!dump) {
return;
}

(*dump)->is_compute = true;
auto& cs = (*dump)->regs.cs_program;
cs = cs_state;

(*dump)->cs_data = PipelineComputerProgramDump{
.cs_program = cs,
.code = std::vector<u32>{cs.Code().begin(), cs.Code().end()},
};
}

void DebugStateImpl::CollectShader(const std::string& name, Shader::LogicalStage l_stage,
vk::ShaderModule module, std::span<const u32> spv,
std::span<const u32> raw_code, std::span<const u32> patch_spv,
Expand Down
8 changes: 6 additions & 2 deletions src/core/debug_state.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@
#include <queue>

#include "common/types.h"
#include "video_core/amdgpu/liverpool.h"
#include "video_core/renderer_vulkan/vk_graphics_pipeline.h"

#ifdef _WIN32
Expand Down Expand Up @@ -204,12 +203,17 @@ class DebugStateImpl {
void PushQueueDump(QueueDump dump);

void PushRegsDump(uintptr_t base_addr, uintptr_t header_addr,
const AmdGpu::Liverpool::Regs& regs, bool is_compute = false);
const AmdGpu::Liverpool::Regs& regs);
using CsState = AmdGpu::Liverpool::ComputeProgram;
void PushRegsDumpCompute(uintptr_t base_addr, uintptr_t header_addr, const CsState& cs_state);

void CollectShader(const std::string& name, Shader::LogicalStage l_stage,
vk::ShaderModule module, std::span<const u32> spv,
std::span<const u32> raw_code, std::span<const u32> patch_spv,
bool is_patched);

private:
std::optional<RegDump*> GetRegDump(uintptr_t base_addr, uintptr_t header_addr);
};
} // namespace DebugStateType

Expand Down
36 changes: 20 additions & 16 deletions src/core/libraries/gnmdriver/gnmdriver.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -296,17 +296,12 @@ static_assert(CtxInitSequence400.size() == 0x61);
// If `submitDone` is issued, we need to block submissions until the GPU is idle
static u32 submission_lock{};
std::condition_variable cv_lock{};
static std::mutex m_submission{};
std::mutex m_submission{};
static u64 frames_submitted{}; // frame counter
static bool send_init_packet{true}; // initialize HW state before first game's submit in a frame
static int sdk_version{0};

struct AscQueueInfo {
VAddr map_addr;
u32* read_addr;
u32 ring_size_dw;
};
static Common::SlotVector<AscQueueInfo> asc_queues{};
static u32 asc_next_offs_dw[Liverpool::NumComputeRings];
static constexpr VAddr tessellation_factors_ring_addr = Core::SYSTEM_RESERVED_MAX - 0xFFFFFFF;
static constexpr u32 tessellation_offchip_buffer_size = 0x800000u;

Expand Down Expand Up @@ -506,11 +501,19 @@ void PS4_SYSV_ABI sceGnmDingDong(u32 gnm_vqid, u32 next_offs_dw) {
}

auto vqid = gnm_vqid - 1;
auto& asc_queue = asc_queues[{vqid}];
const auto* acb_ptr = reinterpret_cast<const u32*>(asc_queue.map_addr + *asc_queue.read_addr);
const auto acb_size = next_offs_dw ? (next_offs_dw << 2u) - *asc_queue.read_addr
: (asc_queue.ring_size_dw << 2u) - *asc_queue.read_addr;
const std::span acb_span{acb_ptr, acb_size >> 2u};
auto& asc_queue = liverpool->asc_queues[{vqid}];

const auto& offs_dw = asc_next_offs_dw[vqid];

if (next_offs_dw < offs_dw) {
ASSERT_MSG(next_offs_dw == 0, "ACB submission is split at the end of ring buffer");
}

const auto* acb_ptr = reinterpret_cast<const u32*>(asc_queue.map_addr) + offs_dw;
const auto acb_size_dw = (next_offs_dw ? next_offs_dw : asc_queue.ring_size_dw) - offs_dw;
const std::span acb_span{acb_ptr, acb_size_dw};

asc_next_offs_dw[vqid] = next_offs_dw;

if (DebugState.DumpingCurrentFrame()) {
static auto last_frame_num = -1LL;
Expand Down Expand Up @@ -545,9 +548,6 @@ void PS4_SYSV_ABI sceGnmDingDong(u32 gnm_vqid, u32 next_offs_dw) {
});
}
liverpool->SubmitAsc(gnm_vqid, acb_span);

*asc_queue.read_addr += acb_size;
*asc_queue.read_addr %= asc_queue.ring_size_dw * 4;
}

void PS4_SYSV_ABI sceGnmDingDongForWorkload(u32 gnm_vqid, u32 next_offs_dw, u64 workload_id) {
Expand Down Expand Up @@ -1266,12 +1266,16 @@ int PS4_SYSV_ABI sceGnmMapComputeQueue(u32 pipe_id, u32 queue_id, VAddr ring_bas
return ORBIS_GNM_ERROR_COMPUTEQUEUE_INVALID_READ_PTR_ADDR;
}

auto vqid = asc_queues.insert(VAddr(ring_base_addr), read_ptr_addr, ring_size_dw);
const auto vqid =
liverpool->asc_queues.insert(VAddr(ring_base_addr), read_ptr_addr, ring_size_dw, pipe_id);
// We need to offset index as `dingDong` assumes it to be from the range [1..64]
const auto gnm_vqid = vqid.index + 1;
LOG_INFO(Lib_GnmDriver, "ASC pipe {} queue {} mapped to vqueue {}", pipe_id, queue_id,
gnm_vqid);

const auto& queue = liverpool->asc_queues[vqid];
*queue.read_addr = 0u;

return gnm_vqid;
}

Expand Down
2 changes: 1 addition & 1 deletion src/shader_recompiler/backend/spirv/spirv_emit_context.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -294,7 +294,7 @@ void EmitContext::DefineInputs() {
});
// Note that we pass index rather than Id
input_params[attrib.semantic] = SpirvAttribute{
.id = rate_idx,
.id = {rate_idx},
.pointer_type = input_u32,
.component_type = U32[1],
.num_components = std::min<u16>(attrib.num_elements, num_components),
Expand Down
Loading

0 comments on commit 0fd1ab6

Please sign in to comment.